├── .dockerignore
├── .gitignore
├── CMakeLists.txt
├── Dockerfile
├── LICENSE
├── cmake
│   ├── GreylockConfig.cmake.in
│   └── locate_library.cmake
├── conf
│   └── greylock.conf
├── debian
│   ├── changelog
│   ├── compat
│   ├── control
│   ├── copyright
│   ├── dirs
│   ├── docs
│   ├── greylock-dev.install
│   ├── greylock.install
│   └── rules
├── greylock-bf.spec
├── include
│   └── greylock
│       ├── database.hpp
│       ├── error.hpp
│       ├── id.hpp
│       ├── intersection.hpp
│       ├── iterator.hpp
│       ├── json.hpp
│       ├── jsonvalue.hpp
│       ├── types.hpp
│       └── utils.hpp
└── src
    ├── CMakeLists.txt
    ├── check.cpp
    ├── compact.cpp
    ├── exception.cpp
    ├── list.cpp
    ├── merge.cpp
    ├── meta.cpp
    └── server.cpp
/.dockerignore: -------------------------------------------------------------------------------- 1 | build 2 | tags 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.a 2 | *.o 3 | *.py[co] 4 | *.so 5 | *.so.* 6 | *.tar.gz 7 | .*.sw* 8 | *~ 9 | CMakeCache.txt 10 | CMakeFiles 11 | build 12 | cmake_install.cmake 13 | install_manifest.txt 14 | tags 15 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 2.6) 2 | project (greylock) 3 | 4 | FILE (READ "${CMAKE_CURRENT_SOURCE_DIR}/debian/changelog" DEBCHANGELOG) 5 | 6 | string(REGEX MATCH "([0-9]+\\.[0-9]+\\.[0-9]+)" DEBFULLVERSION "${DEBCHANGELOG}") 7 | STRING (REGEX MATCH "([0-9]+\\.[0-9]+)" GREYLOCK_MAJOR_VERSION "${DEBFULLVERSION}") 8 | SET(GREYLOCK_FULL_VERSION ${DEBFULLVERSION}) 9 | 10 | set(CMAKE_CXX_FLAGS "-g -std=c++0x -W -Wall -Wextra -fstack-protector-all") 11 | 12 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/") 13 | 14 | find_package(Boost REQUIRED COMPONENTS system program_options filesystem thread) 15 | find_package(Ribosome REQUIRED) 16 | 17 | 
INCLUDE(cmake/locate_library.cmake) 18 | 19 | LOCATE_LIBRARY(JEMALLOC "jemalloc/jemalloc.h" "jemalloc") 20 | LOCATE_LIBRARY(MSGPACK "msgpack.hpp" "msgpack") 21 | LOCATE_LIBRARY(THEVOID "thevoid/server.hpp" "thevoid") 22 | LOCATE_LIBRARY(SWARM "swarm/http_request.hpp" "swarm") 23 | LOCATE_LIBRARY(ROCKSDB "rocksdb/db.h" "rocksdb") 24 | 25 | FILE(GLOB headers 26 | "${CMAKE_CURRENT_SOURCE_DIR}/include/greylock/*.hpp" 27 | "${CMAKE_CURRENT_SOURCE_DIR}/include/greylock/*.h" 28 | 29 | ) 30 | install(FILES ${headers} DESTINATION include/greylock) 31 | 32 | configure_file(cmake/GreylockConfig.cmake.in "${PROJECT_BINARY_DIR}/cmake/GreylockConfig.cmake" @ONLY) 33 | install(FILES "${PROJECT_BINARY_DIR}/cmake/GreylockConfig.cmake" DESTINATION share/greylock/cmake) 34 | 35 | include_directories(${PROJECT_SOURCE_DIR}/include 36 | ${Boost_INCLUDE_DIRS} 37 | ${MSGPACK_INCLUDE_DIRS} 38 | ${RIBOSOME_INCLUDE_DIRS} 39 | ${ROCKSDB_INCLUDE_DIRS} 40 | ${SWARM_INCLUDE_DIRS} 41 | ${THEVOID_INCLUDE_DIRS} 42 | ) 43 | 44 | add_subdirectory(src) 45 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM reverbrain/xenial-dev 2 | 3 | #RUN echo "deb http://repo.reverbrain.com/trusty/ current/amd64/" > /etc/apt/sources.list.d/reverbrain.list && \ 4 | # echo "deb http://repo.reverbrain.com/trusty/ current/all/" >> /etc/apt/sources.list.d/reverbrain.list && \ 5 | # apt-get install -y curl tzdata && \ 6 | # cp -f /usr/share/zoneinfo/posix/W-SU /etc/localtime && \ 7 | # curl http://repo.reverbrain.com/REVERBRAIN.GPG | apt-key add - && \ 8 | # apt-get update && \ 9 | # apt-get upgrade -y && \ 10 | # apt-get install -y git g++ liblz4-dev libsnappy-dev zlib1g-dev libbz2-dev libzstd-dev libgflags-dev libjemalloc-dev && \ 11 | # apt-get install -y cmake debhelper cdbs devscripts && \ 12 | # apt-get install -y libboost-system-dev libboost-filesystem-dev 
libboost-program-options-dev && \ 13 | # apt-get install -y libmsgpack-dev libswarm3-dev libthevoid3-dev ribosome-dev && \ 14 | # git config --global user.email "zbr@ioremap.net" && \ 15 | # git config --global user.name "Evgeniy Polyakov" 16 | 17 | #RUN cd /tmp && \ 18 | # git clone https://github.com/facebook/rocksdb && \ 19 | # cd rocksdb && \ 20 | # PORTABLE=1 make shared_lib && \ 21 | # make INSTALL_PATH=/usr install-shared && \ 22 | # echo "Rocksdb package has been updated and installed" 23 | 24 | RUN cd /tmp && \ 25 | rm -rf ribosome && \ 26 | git clone https://github.com/reverbrain/ribosome && \ 27 | cd ribosome && \ 28 | git branch -v && \ 29 | dpkg-buildpackage -b && \ 30 | dpkg -i ../ribosome*.deb && \ 31 | echo "Ribosome package has been updated and installed" && \ 32 | 33 | cd /tmp && \ 34 | rm -rf greylock && \ 35 | git clone https://github.com/reverbrain/greylock && \ 36 | cd greylock && \ 37 | git branch -v && \ 38 | dpkg-buildpackage -b && \ 39 | dpkg -i ../greylock_*.deb ../greylock-dev_*.deb && \ 40 | echo "Greylock package has been updated and installed" && \ 41 | rm -rf /var/lib/apt/lists/* 42 | 43 | EXPOSE 8080 8181 8111 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. 
By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. 
For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 
83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 
117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 
146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 
216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 
246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. 
If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 
309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 
336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 
360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. 
If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 
428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | {one line to give the program's name and a brief idea of what it does.} 635 | Copyright (C) {year} {name of author} 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | {project} Copyright (C) {year} {fullname} 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 
675 | -------------------------------------------------------------------------------- /cmake/GreylockConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # - Config file for the Elliptics package 2 | # It defines the following variables 3 | # GREYLOCK_INCLUDE_DIRS - include directories for Elliptics 4 | # GREYLOCK_LIBRARY_DIRS - library directories 5 | # GREYLOCK_LIBRARIES - libraries to link against 6 | 7 | get_filename_component(GREYLOCK_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) 8 | 9 | set(GREYLOCK_INCLUDE_DIRS @INSTALL_INCLUDE_DIR@ 10 | @LZ4_INCLUDE_DIRS@ 11 | @MSGPACK_INCLUDE_DIRS@ 12 | @ROCKSDB_INCLUDE_DIRS@ 13 | @SWARM_INCLUDE_DIRS@ 14 | @THEVOID_INCLUDE_DIRS@ 15 | ) 16 | 17 | set(GREYLOCK_LIBRARY_DIRS 18 | @LZ4_LIBRARY_DIRS@ 19 | @MSGPACK_LIBRARY_DIRS@ 20 | @ROCKSDB_LIBRARY_DIRS@ 21 | @SWARM_LIBRARY_DIRS@ 22 | @THEVOID_LIBRARY_DIRS@ 23 | ) 24 | 25 | set(GREYLOCK_LIBRARIES 26 | @LZ4_LIBRARIES@ 27 | @MSGPACK_LIBRARIES@ 28 | @ROCKSDB_LIBRARIES@ 29 | @SWARM_LIBRARIES@ 30 | @THEVOID_LIBRARIES@ 31 | greylock 32 | ) 33 | -------------------------------------------------------------------------------- /cmake/locate_library.cmake: -------------------------------------------------------------------------------- 1 | FUNCTION(LOCATE_LIBRARY VARIABLE HEADER LIBRARY) 2 | IF(${VARIABLE}_INCLUDE_DIRS AND ${VARIABLE}_LIBRARY_DIRS) 3 | RETURN() 4 | ENDIF() 5 | FIND_PATH(${VARIABLE}_INCLUDE_DIRS NAMES ${HEADER} PATH_SUFFIXES ${ARGN}) 6 | message("header: ${HEADER}, arguments: ${ARGN} ==> ${${VARIABLE}_INCLUDE_DIRS}") 7 | FIND_LIBRARY(${VARIABLE}_LIBRARIES NAMES ${LIBRARY} PATH_SUFFIXES ${ARGN}) 8 | message("library: ${LIBRARY}, arguments: ${ARGN} ==> ${${VARIABLE}_LIBRARIES}") 9 | 10 | STRING(TOLOWER ${VARIABLE} LIBRARY_NAME) 11 | 12 | IF(NOT ${VARIABLE}_INCLUDE_DIRS OR NOT ${VARIABLE}_LIBRARIES) 13 | MESSAGE(FATAL_ERROR "${LIBRARY_NAME} development files are required to build.") 14 | ELSE() 15 | MESSAGE(STATUS "Found 
${LIBRARY_NAME}: ${${VARIABLE}_LIBRARIES} - ${${VARIABLE}_INCLUDE_DIRS}") 16 | ENDIF() 17 | ENDFUNCTION() 18 | 19 | FUNCTION(LOCATE_HEADERS VARIABLE HEADER) 20 | IF(${VARIABLE}_INCLUDE_DIRS) 21 | RETURN() 22 | ENDIF() 23 | 24 | FIND_PATH(${VARIABLE}_INCLUDE_DIRS NAMES ${HEADER} PATH_SUFFIXES ${ARGN}) 25 | message("header: ${HEADER}, arguments: ${ARGN} ==> ${${VARIABLE}_INCLUDE_DIRS}") 26 | 27 | IF(NOT ${VARIABLE}_INCLUDE_DIRS) 28 | MESSAGE(FATAL_ERROR "${LIBRARY_NAME} development files (headers) are required to build.") 29 | ENDIF() 30 | 31 | MESSAGE(STATUS "Found ${HEADER}: ${${VARIABLE}_INCLUDE_DIRS}") 32 | ENDFUNCTION() 33 | -------------------------------------------------------------------------------- /conf/greylock.conf: -------------------------------------------------------------------------------- 1 | { 2 | "endpoints": [ 3 | "0.0.0.0:8181" 4 | ], 5 | "backlog": 512, 6 | "threads": 10, 7 | "buffer_size": 65536, 8 | "logger": { 9 | "level": "info", 10 | "frontends": [ 11 | { 12 | "formatter": { 13 | "type": "string", 14 | "pattern": "%(timestamp)s %(request_id)s/%(lwp)s/%(pid)s %(severity)s: %(message)s, %(...L)s" 15 | }, 16 | "sink": { 17 | "type": "files", 18 | "path": "/dev/stdout", 19 | "path1": "greylock.log", 20 | "autoflush": true, 21 | "rotation": { "move": 0 } 22 | } 23 | } 24 | ] 25 | }, 26 | "daemon": { 27 | "fork": false, 28 | "uid": 1000 29 | }, 30 | "monitor-port": 21235, 31 | "request_header": "X-Request", 32 | "trace_header": "X-Trace", 33 | "application": { 34 | "rocksdb.docs": { 35 | "read_only": false, 36 | "bulk_upload": false, 37 | "path": "/mnt/disk/search/lj/rocksdb.docs" 38 | }, 39 | "rocksdb.indexes": { 40 | "read_only": false, 41 | "bulk_upload": false, 42 | "path": "/mnt/disk/search/lj/rocksdb.indexes" 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /debian/changelog: -------------------------------------------------------------------------------- 1 | greylock (1.1.0) 
unstable; urgency=low 2 | 3 | * Added date/time search 4 | * Added exact phrase search 5 | * Added negation support 6 | * Added pagination support 7 | 8 | -- Evgeniy Polyakov Tue, 09 Aug 2016 01:24:04 +0400 9 | 10 | greylock (1.0.0) unstable; urgency=low 11 | 12 | * Rewrite greylock search engine to use local rocksdb storage. It is not distributed search so far. 13 | 14 | -- Evgeniy Polyakov Thu, 28 Jul 2016 08:59:06 +0400 15 | 16 | -------------------------------------------------------------------------------- /debian/compat: -------------------------------------------------------------------------------- 1 | 7 2 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: greylock 2 | Section: net 3 | Priority: optional 4 | Maintainer: Evgeniy Polyakov 5 | Build-Depends: 6 | cdbs, 7 | cmake (>= 2.6), 8 | debhelper (>= 7.0.50~), 9 | ribosome-dev (>= 0.2.8), 10 | libboost-dev, 11 | libboost-system-dev, 12 | libboost-program-options-dev, 13 | libboost-filesystem-dev, 14 | libjemalloc-dev, 15 | libmsgpack-dev, 16 | liblz4-dev, 17 | libswarm3-dev, 18 | libthevoid3-dev, 19 | zlib1g-dev, 20 | libbz2-dev, 21 | libsnappy-dev 22 | Standards-Version: 3.8.0 23 | Homepage: http://www.reverbrain.com/ 24 | Vcs-Git: git://github.com/reverbrain/greylock.git 25 | Vcs-Browser: https://github.com/reverbrain/greylock 26 | 27 | Package: greylock 28 | Architecture: any 29 | Depends: ${shlibs:Depends}, ${misc:Depends} 30 | Description: Greylock is a local searching/indexing engine 31 | 32 | Package: greylock-dev 33 | Architecture: any 34 | Depends: ${shlibs:Depends}, ${misc:Depends}, 35 | ribosome-dev (>= 0.2.8), 36 | libboost-dev, 37 | libboost-system-dev, 38 | libboost-program-options-dev, 39 | libboost-filesystem-dev, 40 | libjemalloc-dev, 41 | libmsgpack-dev, 42 | liblz4-dev, 43 | libswarm3-dev, 44 | libthevoid3-dev, 45 | zlib1g-dev, 46 | libbz2-dev, 
47 | libsnappy-dev 48 | Description: Development files for greylock search engine 49 | -------------------------------------------------------------------------------- /debian/copyright: -------------------------------------------------------------------------------- 1 | Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ 2 | Upstream-Name: greylock 3 | Upstream-Contact: Evgeniy Polyakov 4 | Source: https://github.com/reverbrain/greylock 5 | 6 | Files: * 7 | Copyright: (C) 2015+ Evgeniy Polyakov 8 | License: GPL-3.0 9 | -------------------------------------------------------------------------------- /debian/dirs: -------------------------------------------------------------------------------- 1 | usr/bin 2 | usr/sbin 3 | -------------------------------------------------------------------------------- /debian/docs: -------------------------------------------------------------------------------- 1 | conf/ 2 | -------------------------------------------------------------------------------- /debian/greylock-dev.install: -------------------------------------------------------------------------------- 1 | usr/include/greylock/* 2 | usr/share/greylock/* 3 | usr/lib/libgreylock.so 4 | -------------------------------------------------------------------------------- /debian/greylock.install: -------------------------------------------------------------------------------- 1 | usr/bin/greylock_* 2 | usr/lib/libgreylock.so.* 3 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | include /usr/share/cdbs/1/rules/debhelper.mk 4 | include /usr/share/cdbs/1/class/cmake.mk 5 | 6 | DEB_CMAKE_EXTRA_FLAGS= 7 | DEB_DH_SHLIBDEPS_ARGS_ALL= --dpkg-shlibdeps-params=--ignore-missing-info 8 | 9 | install/greylock-dev:: 10 | 11 | -------------------------------------------------------------------------------- 
/greylock-bf.spec: -------------------------------------------------------------------------------- 1 | Summary: Greylock is an embedded search engine 2 | Name: greylock 3 | Version: 1.1.0 4 | Release: 1%{?dist}.1 5 | 6 | License: GPLv3 7 | Group: System Environment/Libraries 8 | URL: http://reverbrain.com/ 9 | Source0: %{name}-%{version}.tar.bz2 10 | BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) 11 | 12 | 13 | BuildRequires: ribosome-devel 14 | BuildRequires: libswarm3-devel, libthevoid3-devel 15 | BuildRequires: boost-devel, boost-system, boost-program-options, boost-filesystem 16 | BuildRequires: jemalloc-devel, msgpack-devel, lz4-devel 17 | BuildRequires: cmake >= 2.6 18 | 19 | %description 20 | Greylock is an embedded search engine which is aimed at index size and performace. 21 | Index of 200k livejournal.com entries (200Mb of uncompressed data) takes about 450Mb, 22 | index includes: full-text and per-author search indexes, original content, stemmed and original content. 23 | 24 | %package devel 25 | Summary: Development files for %{name} 26 | Group: Development/Libraries 27 | Requires: %{name} = %{version}-%{release} 28 | 29 | 30 | %description devel 31 | Greylock is an embedded search engine which is aimed at index size and performace. 32 | 33 | This package contains libraries, header files and developer documentation 34 | needed for developing software which uses greylock utils. 35 | 36 | %prep 37 | %setup -q 38 | 39 | %build 40 | export LDFLAGS="-Wl,-z,defs" 41 | export DESTDIR="%{buildroot}" 42 | %{cmake} . 
43 | make %{?_smp_mflags} 44 | 45 | %install 46 | rm -rf %{buildroot} 47 | make install DESTDIR="%{buildroot}" 48 | 49 | %post -p /sbin/ldconfig 50 | %postun -p /sbin/ldconfig 51 | 52 | %clean 53 | rm -rf %{buildroot} 54 | 55 | %files 56 | %defattr(-,root,root,-) 57 | %{_bindir}/greylock_* 58 | %{_libdir}/libgreylock.so.* 59 | %doc conf/ 60 | 61 | 62 | %files devel 63 | %defattr(-,root,root,-) 64 | %{_includedir}/* 65 | %{_datadir}/greylock/cmake/* 66 | %{_libdir}/libgreylock.so 67 | 68 | %changelog 69 | * Tue Aug 09 2016 Evgeniy Polyakov - 1.1.0 70 | - Added date/time search 71 | - Added exact phrase search 72 | - Added negation support 73 | - Added pagination support 74 | 75 | * Thu Jul 28 2016 Evgeniy Polyakov - 1.0.0 76 | - Rewrite greylock search engine to use local rocksdb storage. It is not distributed search so far. 77 | 78 | -------------------------------------------------------------------------------- /include/greylock/database.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "greylock/error.hpp" 4 | #include "greylock/id.hpp" 5 | #include "greylock/utils.hpp" 6 | 7 | #include 8 | 9 | #pragma GCC diagnostic push 10 | #pragma GCC diagnostic ignored "-Wunused-parameter" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #pragma GCC diagnostic pop 22 | 23 | #include 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | namespace ioremap { namespace greylock { 32 | 33 | struct options { 34 | size_t tokens_shard_size = 3600 * 1 * 24; 35 | 36 | int max_threads = 8; 37 | 38 | int bits_per_key = 10; // bloom filter parameter 39 | 40 | long lru_cache_size = 100 * 1024 * 1024; // 100 MB of uncompressed data cache 41 | 42 | long sync_metadata_timeout = 60000; // 60 seconds 43 | 44 | // mininmum size of the token which will go into separate index, 45 | // if token size is 
smaller, it will be combined into 2 indexes 46 | // with the previous and next tokens. 47 | // This options greatly speeds up requests with small words (like [to be or not to be]), 48 | // but heavily increases index size. 49 | unsigned int ngram_index_size = 0; 50 | 51 | enum { 52 | default_column = 0, 53 | documents_column, 54 | document_ids_column, 55 | token_shards_column, 56 | indexes_column, 57 | meta_column, 58 | __column_size, 59 | }; 60 | 61 | std::vector column_names; 62 | std::string metadata_key; 63 | 64 | options(): metadata_key("greylock.meta.key") { 65 | column_names.resize(__column_size); 66 | column_names[default_column] = rocksdb::kDefaultColumnFamilyName; 67 | column_names[documents_column] = "documents"; 68 | column_names[document_ids_column] = "document_ids"; 69 | column_names[token_shards_column] = "token_shards"; 70 | column_names[indexes_column] = "indexes"; 71 | column_names[meta_column] = "meta"; 72 | } 73 | 74 | std::string column_name(int cnum) const { 75 | if (cnum < 0 || cnum >= __column_size) 76 | return ""; 77 | 78 | return column_names[cnum]; 79 | } 80 | }; 81 | 82 | class metadata { 83 | public: 84 | metadata() : m_dirty(false), m_seq(0) {} 85 | 86 | bool dirty() const { 87 | return m_dirty; 88 | } 89 | void clear_dirty() { 90 | m_dirty = false; 91 | } 92 | 93 | long get_sequence() { 94 | m_dirty = true; 95 | return m_seq++; 96 | } 97 | 98 | void set_sequence(long seq) { 99 | m_dirty = true; 100 | m_seq = seq; 101 | } 102 | 103 | enum { 104 | serialize_version_2 = 2, 105 | }; 106 | 107 | template 108 | void msgpack_pack(msgpack::packer &o) const { 109 | o.pack_array(metadata::serialize_version_2); 110 | o.pack((int)metadata::serialize_version_2); 111 | o.pack(m_seq.load()); 112 | } 113 | 114 | void msgpack_unpack(msgpack::object o) { 115 | if (o.type != msgpack::type::ARRAY) { 116 | std::ostringstream ss; 117 | ss << "could not unpack metadata, object type is " << o.type << 118 | ", must be array (" << msgpack::type::ARRAY << ")"; 
119 | throw std::runtime_error(ss.str()); 120 | } 121 | 122 | int version; 123 | long seq; 124 | 125 | msgpack::object *p = o.via.array.ptr; 126 | p[0].convert(&version); 127 | 128 | if (version != (int)o.via.array.size) { 129 | std::ostringstream ss; 130 | ss << "could not unpack document, invalid version: " << version << ", array size: " << o.via.array.size; 131 | throw std::runtime_error(ss.str()); 132 | } 133 | 134 | switch (version) { 135 | case metadata::serialize_version_2: 136 | p[1].convert(&seq); 137 | m_seq.store(seq); 138 | break; 139 | default: { 140 | std::ostringstream ss; 141 | ss << "could not unpack metadata, invalid version " << version; 142 | throw std::runtime_error(ss.str()); 143 | } 144 | } 145 | } 146 | 147 | private: 148 | bool m_dirty; 149 | std::atomic_long m_seq; 150 | }; 151 | 152 | struct document_for_index { 153 | id_t indexed_id; 154 | MSGPACK_DEFINE(indexed_id); 155 | 156 | bool operator<(const document_for_index &other) const { 157 | return indexed_id < other.indexed_id; 158 | } 159 | }; 160 | 161 | namespace { 162 | static const uint32_t disk_cookie = 0x45589560; 163 | } 164 | 165 | struct disk_index { 166 | typedef document_for_index value_type; 167 | typedef document_for_index& reference; 168 | typedef document_for_index* pointer; 169 | 170 | std::vector ids; 171 | 172 | template 173 | void msgpack_pack(msgpack::packer &o) const { 174 | o.pack_array(2); 175 | o.pack(disk_cookie); 176 | o.pack(ids); 177 | } 178 | 179 | void msgpack_unpack(msgpack::object o) { 180 | if (o.type != msgpack::type::ARRAY) { 181 | std::ostringstream ss; 182 | ss << "could not unpack disk index, object type is " << o.type << 183 | ", must be array (" << msgpack::type::ARRAY << ")"; 184 | throw std::runtime_error(ss.str()); 185 | } 186 | 187 | uint32_t cookie; 188 | 189 | msgpack::object *p = o.via.array.ptr; 190 | p[0].convert(&cookie); 191 | 192 | if (cookie != disk_cookie) { 193 | std::ostringstream ss; 194 | ss << "could not unpack disk index, cookie 
mismatch: " << std::hex << cookie << 195 | ", must be: " << std::hex << disk_cookie; 196 | throw std::runtime_error(ss.str()); 197 | } 198 | 199 | p[1].convert(&ids); 200 | } 201 | }; 202 | 203 | struct disk_token { 204 | std::vector shards; 205 | MSGPACK_DEFINE(shards); 206 | 207 | disk_token() {} 208 | disk_token(const std::set &s): shards(s.begin(), s.end()) {} 209 | disk_token(const std::vector &s): shards(s) {} 210 | }; 211 | 212 | class indexes_merge_operator : public rocksdb::MergeOperator { 213 | public: 214 | virtual const char* Name() const override { 215 | return "indexes_merge_operator"; 216 | } 217 | 218 | bool merge_indexes(const rocksdb::Slice& key, const rocksdb::Slice* old_value, 219 | const std::deque& operand_list, 220 | std::string* new_value, 221 | rocksdb::Logger *logger) const { 222 | 223 | disk_index index; 224 | greylock::error_info err; 225 | std::set unique_index; 226 | size_t ocount = 0; 227 | 228 | if (old_value) { 229 | err = deserialize(index, old_value->data(), old_value->size()); 230 | if (err) { 231 | rocksdb::Error(logger, "merge: key: %s, index deserialize failed: %s [%d]", 232 | key.ToString().c_str(), err.message().c_str(), err.code()); 233 | return false; 234 | } 235 | 236 | unique_index.insert(index.ids.begin(), index.ids.end()); 237 | ocount = unique_index.size(); 238 | } 239 | 240 | for (const auto& value : operand_list) { 241 | msgpack::unpacked msg; 242 | msgpack::unpack(&msg, value.data(), value.size()); 243 | 244 | try { 245 | msgpack::object o = msg.get(); 246 | 247 | if (o.type != msgpack::type::ARRAY) { 248 | document_for_index did; 249 | o.convert(&did); 250 | unique_index.emplace(did); 251 | continue; 252 | } 253 | 254 | disk_index idx; 255 | o.convert(&idx); 256 | 257 | unique_index.insert(idx.ids.begin(), idx.ids.end()); 258 | } catch (const std::exception &e) { 259 | rocksdb::Error(logger, "merge: key: %s, document deserialize failed: %s", 260 | key.ToString().c_str(), e.what()); 261 | return false; 262 | } 263 
| } 264 | 265 | index.ids.clear(); 266 | index.ids.insert(index.ids.end(), unique_index.begin(), unique_index.end()); 267 | *new_value = serialize(index); 268 | 269 | if (new_value->size() > 1024 * 1024) { 270 | size_t osize = 0; 271 | if (old_value) 272 | osize = old_value->size(); 273 | rocksdb::Info(logger, "index_merge: key: %s, size: %ld -> %ld, counts: %ld -> %ld", 274 | key.ToString().c_str(), osize, new_value->size(), ocount, index.ids.size()); 275 | } 276 | 277 | return true; 278 | } 279 | 280 | virtual bool FullMerge(const rocksdb::Slice& key, const rocksdb::Slice* old_value, 281 | const std::deque& operand_list, 282 | std::string* new_value, 283 | rocksdb::Logger *logger) const override { 284 | return merge_indexes(key, old_value, operand_list, new_value, logger); 285 | } 286 | 287 | virtual bool PartialMerge(const rocksdb::Slice& key, 288 | const rocksdb::Slice& left_operand, const rocksdb::Slice& right_operand, 289 | std::string* new_value, 290 | rocksdb::Logger* logger) const { 291 | #if 0 292 | auto dump = [](const rocksdb::Slice &v) { 293 | std::ostringstream ss; 294 | 295 | msgpack::unpacked msg; 296 | msgpack::unpack(&msg, v.data(), v.size()); 297 | 298 | ss << msg.get(); 299 | return ss.str(); 300 | }; 301 | 302 | printf("partial merge: key: %s, left: %s, right: %s\n", 303 | key.ToString().c_str(), dump(left_operand).c_str(), dump(right_operand).c_str()); 304 | #endif 305 | (void) key; 306 | (void) left_operand; 307 | (void) right_operand; 308 | (void) new_value; 309 | (void) logger; 310 | 311 | return false; 312 | } 313 | }; 314 | 315 | class token_shards_merge_operator : public rocksdb::MergeOperator { 316 | public: 317 | virtual const char* Name() const override { 318 | return "token_shards_merge_operator"; 319 | } 320 | 321 | bool merge_token_shards(const rocksdb::Slice& key, const rocksdb::Slice* old_value, 322 | const std::deque& operand_list, 323 | std::string* new_value, 324 | rocksdb::Logger *logger) const { 325 | 326 | disk_token dt; 
327 | std::set shards; 328 | greylock::error_info err; 329 | 330 | if (old_value) { 331 | err = deserialize(dt, old_value->data(), old_value->size()); 332 | if (err) { 333 | rocksdb::Error(logger, "merge: key: %s, disk_token deserialize failed: %s [%d]", 334 | key.ToString().c_str(), err.message().c_str(), err.code()); 335 | return false; 336 | } 337 | 338 | shards.insert(dt.shards.begin(), dt.shards.end()); 339 | } 340 | 341 | for (const auto& value : operand_list) { 342 | disk_token s; 343 | err = deserialize(s, value.data(), value.size()); 344 | if (err) { 345 | rocksdb::Error(logger, "merge: key: %s, disk_token operand deserialize failed: %s [%d]", 346 | key.ToString().c_str(), err.message().c_str(), err.code()); 347 | return false; 348 | } 349 | 350 | shards.insert(s.shards.begin(), s.shards.end()); 351 | } 352 | 353 | dt.shards = std::vector(shards.begin(), shards.end()); 354 | *new_value = serialize(dt); 355 | 356 | if (new_value->size() > 1024 * 1024) { 357 | size_t osize = 0; 358 | if (old_value) { 359 | osize = old_value->size(); 360 | } 361 | 362 | rocksdb::Warn(logger, "shard_merge: key: %s, size: %ld -> %ld", 363 | key.ToString().c_str(), osize, new_value->size()); 364 | } 365 | 366 | return true; 367 | } 368 | 369 | virtual bool FullMerge(const rocksdb::Slice& key, const rocksdb::Slice* old_value, 370 | const std::deque& operand_list, 371 | std::string* new_value, 372 | rocksdb::Logger *logger) const override { 373 | return merge_token_shards(key, old_value, operand_list, new_value, logger); 374 | } 375 | 376 | virtual bool PartialMerge(const rocksdb::Slice& key, 377 | const rocksdb::Slice& left_operand, const rocksdb::Slice& right_operand, 378 | std::string* new_value, 379 | rocksdb::Logger* logger) const { 380 | #if 0 381 | auto dump = [](const rocksdb::Slice &v) { 382 | std::ostringstream ss; 383 | 384 | msgpack::unpacked msg; 385 | msgpack::unpack(&msg, v.data(), v.size()); 386 | 387 | ss << msg.get(); 388 | return ss.str(); 389 | }; 390 | 391 | 
printf("partial merge: key: %s, left: %s, right: %s\n", 392 | key.ToString().c_str(), dump(left_operand).c_str(), dump(right_operand).c_str()); 393 | #endif 394 | (void) key; 395 | (void) left_operand; 396 | (void) right_operand; 397 | (void) new_value; 398 | (void) logger; 399 | 400 | return false; 401 | } 402 | }; 403 | 404 | class database { 405 | public: 406 | ~database() { 407 | if (!m_ro) { 408 | m_expiration_timer.stop(); 409 | sync_metadata(NULL); 410 | } 411 | } 412 | 413 | const greylock::options &options() const { 414 | return m_opts; 415 | } 416 | greylock::metadata &metadata() { 417 | return m_meta; 418 | } 419 | 420 | rocksdb::ColumnFamilyHandle *cfhandle(int c) { 421 | return m_handles[c]; 422 | } 423 | 424 | void compact() { 425 | if (m_db) { 426 | for (auto h: m_handles) { 427 | struct rocksdb::CompactRangeOptions opts; 428 | opts.change_level = true; 429 | opts.target_level = 0; 430 | m_db->CompactRange(opts, h, NULL, NULL); 431 | } 432 | } 433 | } 434 | 435 | void compact(size_t c, const rocksdb::Slice &start, const rocksdb::Slice &end) { 436 | if (m_db && c < m_handles.size()) { 437 | const rocksdb::Slice *b = NULL; 438 | const rocksdb::Slice *e = NULL; 439 | 440 | if (start != rocksdb::Slice()) { 441 | b = &start; 442 | } 443 | if (end != rocksdb::Slice()) { 444 | e = &end; 445 | } 446 | 447 | struct rocksdb::CompactRangeOptions opts; 448 | opts.change_level = true; 449 | opts.target_level = 0; 450 | m_db->CompactRange(opts, cfhandle(c), b, e); 451 | } 452 | } 453 | 454 | greylock::error_info sync_metadata(rocksdb::WriteBatch *batch) { 455 | if (m_ro) { 456 | return greylock::create_error(-EROFS, "read-only database"); 457 | } 458 | 459 | if (!m_db) { 460 | return greylock::create_error(-EINVAL, "database is not opened"); 461 | } 462 | 463 | if (!m_meta.dirty()) 464 | return greylock::error_info(); 465 | 466 | std::string meta_serialized = serialize(m_meta); 467 | 468 | rocksdb::Status s; 469 | if (batch) { 470 | 
batch->Put(m_handles[options::meta_column], rocksdb::Slice(m_opts.metadata_key), rocksdb::Slice(meta_serialized)); 471 | } else { 472 | s = m_db->Put(rocksdb::WriteOptions(), m_handles[options::meta_column], 473 | rocksdb::Slice(m_opts.metadata_key), rocksdb::Slice(meta_serialized)); 474 | } 475 | 476 | if (!s.ok()) { 477 | return greylock::create_error(-s.code(), "could not write metadata key: %s, error: %s", 478 | m_opts.metadata_key.c_str(), s.ToString().c_str()); 479 | } 480 | 481 | m_meta.clear_dirty(); 482 | return greylock::error_info(); 483 | } 484 | 485 | greylock::error_info open_read_only(const std::string &path) { 486 | return open(path, true, false); 487 | } 488 | greylock::error_info open_read_write(const std::string &path) { 489 | return open(path, false, false); 490 | } 491 | 492 | greylock::error_info open(const std::string &path, bool ro, bool bulk) { 493 | if (m_db) { 494 | return greylock::create_error(-EINVAL, "database is already opened"); 495 | } 496 | 497 | rocksdb::Options dbo; 498 | dbo.max_open_files = 1000; 499 | //dbo.disableDataSync = true; 500 | dbo.IncreaseParallelism(m_opts.max_threads); 501 | 502 | dbo.max_bytes_for_level_base = 1024 * 1024 * 1024 * 100UL; 503 | //dbo.write_buffer_size = 1024 * 1024 * 1024UL; 504 | //dbo.max_write_buffer_number = 10; 505 | //dbo.min_write_buffer_number_to_merge = 4; 506 | 507 | dbo.compression = rocksdb::kZSTDNotFinalCompression; 508 | dbo.num_levels = 10; 509 | #if 0 510 | dbo.compression_per_level = 511 | std::vector({ 512 | rocksdb::kZSTDNotFinalCompression, 513 | rocksdb::kZSTDNotFinalCompression, 514 | rocksdb::kZSTDNotFinalCompression, 515 | rocksdb::kZSTDNotFinalCompression, 516 | rocksdb::kZSTDNotFinalCompression, 517 | }); 518 | #endif 519 | dbo.compression_opts = rocksdb::CompressionOptions(-14, 5, 0, 0); 520 | 521 | dbo.create_if_missing = true; 522 | dbo.create_missing_column_families = true; 523 | 524 | if (!ro && bulk) { 525 | dbo.PrepareForBulkLoad(); 526 | } 527 | 528 | 
dbo.statistics = rocksdb::CreateDBStatistics(); 529 | dbo.stats_dump_period_sec = 60; 530 | 531 | rocksdb::BlockBasedTableOptions table_options; 532 | table_options.block_cache = rocksdb::NewLRUCache(m_opts.lru_cache_size); 533 | table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(m_opts.bits_per_key, true)); 534 | dbo.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options)); 535 | 536 | rocksdb::DB *db; 537 | rocksdb::Status s; 538 | 539 | rocksdb::ColumnFamilyOptions cfo(dbo); 540 | 541 | std::vector column_families; 542 | 543 | for (size_t i = 0; i < options().column_names.size(); ++i) { 544 | auto cname = options().column_names[i]; 545 | 546 | cfo.merge_operator.reset(); 547 | 548 | if (i == greylock::options::token_shards_column) { 549 | cfo.merge_operator.reset(new token_shards_merge_operator); 550 | } 551 | if (i == greylock::options::indexes_column) { 552 | cfo.merge_operator.reset(new indexes_merge_operator); 553 | } 554 | 555 | column_families.push_back(rocksdb::ColumnFamilyDescriptor(cname, cfo)); 556 | } 557 | 558 | if (ro) { 559 | s = rocksdb::DB::OpenForReadOnly(dbo, path, column_families, &m_handles, &db); 560 | } else { 561 | s = rocksdb::DB::Open(dbo, path, column_families, &m_handles, &db); 562 | } 563 | if (!s.ok()) { 564 | return greylock::create_error(-s.code(), "failed to open rocksdb database: '%s', read-only: %d, error: %s", 565 | path.c_str(), ro, s.ToString().c_str()); 566 | } 567 | m_db.reset(db); 568 | m_ro = ro; 569 | 570 | std::string meta; 571 | s = m_db->Get(rocksdb::ReadOptions(), m_handles[options::meta_column], rocksdb::Slice(m_opts.metadata_key), &meta); 572 | if (!s.ok() && !s.IsNotFound()) { 573 | return greylock::create_error(-s.code(), "could not read key: %s, error: %s", 574 | m_opts.metadata_key.c_str(), s.ToString().c_str()); 575 | } 576 | 577 | if (s.ok()) { 578 | auto err = deserialize(m_meta, meta.data(), meta.size()); 579 | if (err) 580 | return greylock::create_error(err.code(), "metadata 
deserialization failed, key: %s, error: %s", 581 | m_opts.metadata_key.c_str(), err.message().c_str()); 582 | } 583 | 584 | if (m_opts.sync_metadata_timeout > 0 && !ro) { 585 | sync_metadata_callback(); 586 | } 587 | 588 | return greylock::error_info(); 589 | } 590 | 591 | std::vector get_shards(const std::string &key) { 592 | disk_token dt; 593 | if (!m_db) { 594 | return dt.shards; 595 | } 596 | 597 | std::string ser_shards; 598 | auto err = read(options::token_shards_column, key, &ser_shards); 599 | if (err) 600 | return dt.shards; 601 | 602 | err = deserialize(dt, ser_shards.data(), ser_shards.size()); 603 | if (err) 604 | return dt.shards; 605 | 606 | return dt.shards; 607 | } 608 | 609 | rocksdb::Iterator *iterator(int column, const rocksdb::ReadOptions &ro) { 610 | return m_db->NewIterator(ro, m_handles[column]); 611 | } 612 | 613 | greylock::error_info read(int column, const std::string &key, std::string *ret) { 614 | if (!m_db) { 615 | return greylock::create_error(-EINVAL, "database is not opened"); 616 | } 617 | 618 | auto s = m_db->Get(rocksdb::ReadOptions(), m_handles[column], rocksdb::Slice(key), ret); 619 | if (!s.ok()) { 620 | return greylock::create_error(-s.code(), "could not read key: %s, error: %s", key.c_str(), s.ToString().c_str()); 621 | } 622 | return greylock::error_info(); 623 | } 624 | 625 | greylock::error_info write(rocksdb::WriteBatch *batch) { 626 | if (!m_db) { 627 | return greylock::create_error(-EINVAL, "database is not opened"); 628 | } 629 | 630 | if (m_ro) { 631 | return greylock::create_error(-EROFS, "read-only database"); 632 | } 633 | 634 | auto wo = rocksdb::WriteOptions(); 635 | 636 | auto s = m_db->Write(wo, batch); 637 | if (!s.ok()) { 638 | return greylock::create_error(-s.code(), "could not write batch: %s", s.ToString().c_str()); 639 | } 640 | 641 | return greylock::error_info(); 642 | } 643 | 644 | greylock::error_info write(int column, const std::string &key, const std::string &value) { 645 | if (!m_db) { 646 | 
return greylock::create_error(-EINVAL, "database is not opened"); 647 | } 648 | 649 | if (m_ro) { 650 | return greylock::create_error(-EROFS, "read-only database"); 651 | } 652 | 653 | auto wo = rocksdb::WriteOptions(); 654 | 655 | auto s = m_db->Merge(wo, m_handles[column], rocksdb::Slice(key), rocksdb::Slice(value)); 656 | if (!s.ok()) { 657 | return greylock::create_error(-s.code(), "could not write batch: %s", s.ToString().c_str()); 658 | } 659 | 660 | return greylock::error_info(); 661 | } 662 | 663 | private: 664 | bool m_ro = false; 665 | std::vector m_handles; 666 | std::unique_ptr m_db; 667 | greylock::options m_opts; 668 | greylock::metadata m_meta; 669 | 670 | ribosome::expiration m_expiration_timer; 671 | 672 | void sync_metadata_callback() { 673 | sync_metadata(NULL); 674 | 675 | auto expires_at = std::chrono::system_clock::now() + std::chrono::milliseconds(m_opts.sync_metadata_timeout); 676 | m_expiration_timer.insert(expires_at, std::bind(&database::sync_metadata_callback, this)); 677 | } 678 | }; 679 | 680 | }} // namespace ioremap::greylock 681 | -------------------------------------------------------------------------------- /include/greylock/error.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace ioremap { namespace greylock { 7 | 8 | class error : public std::exception 9 | { 10 | public: 11 | // err must be negative value 12 | explicit error(int err, const std::string &message) throw(); 13 | ~error() throw() {} 14 | 15 | int error_code() const; 16 | 17 | virtual const char *what() const throw(); 18 | 19 | std::string error_message() const throw(); 20 | 21 | private: 22 | int m_errno; 23 | std::string m_message; 24 | }; 25 | 26 | class not_found_error : public error 27 | { 28 | public: 29 | explicit not_found_error(const std::string &message) throw(); 30 | }; 31 | 32 | class timeout_error : public error 33 | { 34 | public: 35 | explicit 
timeout_error(const std::string &message) throw(); 36 | }; 37 | 38 | class no_such_address_error : public error 39 | { 40 | public: 41 | explicit no_such_address_error(const std::string &message) throw(); 42 | }; 43 | 44 | class error_info 45 | { 46 | public: 47 | inline error_info() : m_code(0) {} 48 | inline error_info(int code, const std::string &&message) 49 | : m_code(code), m_message(message) {} 50 | inline error_info(int code, const std::string &message) 51 | : m_code(code), m_message(message) {} 52 | inline ~error_info() {} 53 | 54 | inline int code() const { return m_code; } 55 | inline const std::string &message() const { return m_message; } 56 | inline operator bool() const { return m_code != 0; } 57 | inline bool operator !() const { return !operator bool(); } 58 | operator int() const = delete; // disable implicit cast to int 59 | 60 | void throw_error() const; 61 | private: 62 | int m_code; 63 | std::string m_message; 64 | }; 65 | 66 | // err must be negative value 67 | void throw_error(int err, const char *format, ...) 68 | __attribute__ ((format (printf, 2, 3))); 69 | 70 | // err must be negative value 71 | error_info create_error(int err, const char *format, ...) 
72 | __attribute__ ((format (printf, 2, 3))); 73 | 74 | }} /* namespace ioremap::greylock */ 75 | -------------------------------------------------------------------------------- /include/greylock/id.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace ioremap { namespace greylock { 11 | 12 | namespace { 13 | static const uint32_t start_date = 0; 14 | static const uint32_t date_div = 3600 * 24; 15 | } 16 | 17 | struct id_t { 18 | uint64_t timestamp = 0; 19 | 20 | MSGPACK_DEFINE(timestamp); 21 | 22 | void set_timestamp(long tsec, long aux) { 23 | tsec = (tsec - start_date) / date_div; 24 | 25 | timestamp = tsec << 32; 26 | timestamp |= aux & ((1UL << 32) - 1); 27 | } 28 | 29 | void get_timestamp(long *tsec, long *aux) const { 30 | *tsec = (timestamp >> 32) * date_div + start_date; 31 | *aux = timestamp & ((1UL << 32) - 1); 32 | } 33 | 34 | bool operator<(const id_t &other) const { 35 | return timestamp < other.timestamp; 36 | } 37 | bool operator>(const id_t &other) const { 38 | return timestamp > other.timestamp; 39 | } 40 | 41 | bool operator==(const id_t &other) const { 42 | return (timestamp == other.timestamp); 43 | } 44 | bool operator!=(const id_t &other) const { 45 | return !operator==(other); 46 | } 47 | 48 | std::string to_string() const { 49 | char buf[64]; 50 | size_t sz = snprintf(buf, sizeof(buf), "%016lx", timestamp); 51 | return std::string(buf, sz); 52 | } 53 | 54 | id_t(): timestamp(0) { 55 | } 56 | 57 | id_t(const id_t &other) { 58 | timestamp = other.timestamp; 59 | } 60 | 61 | id_t(const char *str) { 62 | if (!str) { 63 | id_t(); 64 | return; 65 | } 66 | 67 | timestamp = strtoull(str, NULL, 16); 68 | } 69 | 70 | void set_next_id(const id_t &other) { 71 | timestamp = other.timestamp + 1; 72 | } 73 | 74 | }; 75 | 76 | }} // namespace ioremap::greylock 77 | 
-------------------------------------------------------------------------------- /include/greylock/intersection.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __INDEXES_INTERSECTION_HPP 2 | #define __INDEXES_INTERSECTION_HPP 3 | 4 | #include "greylock/iterator.hpp" 5 | #include "greylock/types.hpp" 6 | 7 | namespace ioremap { namespace greylock { 8 | 9 | struct single_doc_result { 10 | document doc; 11 | 12 | float relevance = 0; 13 | }; 14 | 15 | struct search_result { 16 | bool completed = true; 17 | 18 | // This will contain a cookie which must be used for the next intersection request, 19 | // if current request is not complete. This may happen when client has requested limited 20 | // maximum number of keys in reply and there are more keys. 21 | id_t next_document_id; 22 | long max_number_of_documents = ~0UL; 23 | 24 | // array of documents which contain all requested indexes 25 | std::vector docs; 26 | }; 27 | 28 | // check whether given result matches query, may also set or change some result parameters like relevance field 29 | typedef std::function check_result_function_t; 30 | 31 | struct mailbox_query { 32 | std::string mbox; 33 | greylock::indexes idx; 34 | 35 | greylock::error_info parse_error; 36 | 37 | mailbox_query(const greylock::options &options, const rapidjson::Value &doc) { 38 | const rapidjson::Value &query_and = greylock::get_object(doc, "query"); 39 | if (query_and.IsObject()) { 40 | auto ireq = indexes::get_indexes(options, query_and); 41 | idx.merge_query(ireq); 42 | } 43 | 44 | const rapidjson::Value &query_exact = greylock::get_object(doc, "exact"); 45 | if (query_exact.IsObject()) { 46 | auto ireq = indexes::get_indexes(options, query_exact); 47 | 48 | // merge these indexes into intersection set, 49 | // since exact phrase match implies document contains all tokens 50 | idx.merge_exact(ireq); 51 | } 52 | 53 | const rapidjson::Value &query_negation = greylock::get_object(doc, 
"negation"); 54 | if (query_negation.IsObject()) { 55 | auto ireq = indexes::get_indexes(options, query_negation); 56 | // do not merge these indexes into intersection set, put them into own container 57 | idx.merge_negation(ireq); 58 | } 59 | 60 | if (idx.attributes.empty()) { 61 | parse_error = greylock::create_error(-ENOENT, 62 | "search: mailbox: %s, there are no queries suitable for search", mbox.c_str()); 63 | return; 64 | } 65 | } 66 | }; 67 | 68 | struct intersection_query { 69 | id_t range_start, range_end; 70 | 71 | std::vector se; 72 | 73 | id_t next_document_id; 74 | size_t max_number = LONG_MAX; 75 | 76 | std::string to_string() const { 77 | std::ostringstream ss; 78 | 79 | ss << "[ "; 80 | for (const auto &ent: se) { 81 | ss << "mailbox: " << ent.mbox << ", indexes: " << ent.idx.to_string() << "| "; 82 | } 83 | ss << "]"; 84 | 85 | return ss.str(); 86 | } 87 | }; 88 | 89 | template 90 | class intersector { 91 | public: 92 | intersector(DBT &db_docs, DBT &db_indexes) : m_db_docs(db_docs), m_db_indexes(db_indexes) {} 93 | 94 | search_result intersect(const intersection_query &iq) const { 95 | return intersect(iq, [&] (single_doc_result &) -> bool { 96 | return true; 97 | }); 98 | } 99 | 100 | // search for intersections between all @indexes 101 | // starting with the key @start, returning at most @num entries 102 | // 103 | // after @intersect() completes, it sets @start to the next key to start searching from 104 | // user should not change that token, otherwise @intersect() may skip some entries or 105 | // return duplicates. 106 | // 107 | // if number of returned entries is less than requested number @num or if @start has been set to empty string 108 | // after call to this function returns, then intersection is completed. 109 | // 110 | // @search_result.completed will be set to true in this case. 
111 | search_result intersect(const intersection_query &iq, check_result_function_t check) const { 112 | search_result res; 113 | #ifdef STDOUT_DEBUG 114 | auto dump_vector = [] (const std::vector &sh) -> std::string { 115 | std::ostringstream ss; 116 | for (size_t i = 0; i < sh.size(); ++i) { 117 | ss << sh[i]; 118 | if (i != sh.size() - 1) 119 | ss << " "; 120 | } 121 | 122 | return ss.str(); 123 | }; 124 | 125 | #endif 126 | 127 | 128 | std::vector common_shards; 129 | bool init = true; 130 | for (const auto &ent: iq.se) { 131 | for (const auto &attr: ent.idx.attributes) { 132 | for (const auto &t: attr.tokens) { 133 | std::string shard_key = document::generate_shard_key(m_db_indexes.options(), ent.mbox, attr.name, t.name); 134 | auto shards = m_db_indexes.get_shards(shard_key); 135 | #ifdef STDOUT_DEBUG 136 | printf("common_shards: %s, key: %s, shards: %s\n", 137 | dump_vector(common_shards).c_str(), shard_key.c_str(), 138 | dump_vector(shards).c_str()); 139 | #endif 140 | // one index is empty, intersection will be empty, return early 141 | if (shards.size() == 0) { 142 | return res; 143 | } 144 | 145 | if (init) { 146 | common_shards = shards; 147 | init = false; 148 | } else { 149 | std::vector intersection; 150 | std::set_intersection(common_shards.begin(), common_shards.end(), 151 | shards.begin(), shards.end(), 152 | std::back_inserter(intersection)); 153 | common_shards = intersection; 154 | } 155 | 156 | // intersection is empty, return early 157 | if (common_shards.size() == 0) { 158 | return res; 159 | } 160 | } 161 | } 162 | } 163 | 164 | struct iter { 165 | greylock::index_iterator begin, end; 166 | 167 | iter(DBT &db, const std::string &mbox, const std::string &attr, const std::string &token, 168 | const std::vector &shards) : 169 | begin(greylock::index_iterator::begin(db, mbox, attr, token, shards)), 170 | end(greylock::index_iterator::end(db, mbox, attr, token)) 171 | { 172 | } 173 | }; 174 | 175 | // contains vector of iterators pointing to the 
requested indexes 176 | // iterator always points to the smallest document ID not yet pushed into resulting structure (or to client) 177 | // or discarded (if other index iterators point to larger document IDs) 178 | std::vector idata; 179 | std::vector inegation; 180 | 181 | for (const auto &ent: iq.se) { 182 | for (const auto &attr: ent.idx.attributes) { 183 | for (const auto &t: attr.tokens) { 184 | iter itr(m_db_indexes, ent.mbox, attr.name, t.name, common_shards); 185 | 186 | if (iq.next_document_id != 0) { 187 | itr.begin.rewind_to_index(iq.next_document_id); 188 | } else { 189 | itr.begin.rewind_to_index(iq.range_start); 190 | } 191 | 192 | idata.emplace_back(itr); 193 | } 194 | } 195 | 196 | for (const auto &attr: ent.idx.negation) { 197 | for (const auto &t: attr.tokens) { 198 | std::string shard_key = document::generate_shard_key(m_db_indexes.options(), ent.mbox, attr.name, t.name); 199 | auto shards = m_db_indexes.get_shards(shard_key); 200 | #ifdef STDOUT_DEBUG 201 | printf("negation: key: %s, shards: %s\n", 202 | shard_key.c_str(), 203 | dump_vector(shards).c_str()); 204 | #endif 205 | 206 | iter itr(m_db_indexes, ent.mbox, attr.name, t.name, shards); 207 | inegation.emplace_back(itr); 208 | } 209 | } 210 | } 211 | 212 | while (true) { 213 | // contains indexes within @idata array of iterators, 214 | // each iterator contains the same and smallest to the known moment reference to the document (i.e. 
document ID) 215 | // 216 | // if checking @idata array yelds smaller document ID than that in iterators referenced in @pos, 217 | // then we clear @pos and starts pushing the new smallest iterator indexes 218 | // 219 | // we could break out of the @idata processing, increase the smallest pointing iterator and start over, 220 | // but we optimize @idata processing - if there are other iterators in @idata which equal to the smallest 221 | // iterator value (document ID), we put them into @pos 222 | // Since @pos doesn't contain all indexes (its size doesn't equal to the size of @idata), we will increase 223 | // all iterators where we have found the smallest document ID, hopefully they will point to the new document ID, 224 | // which might be the same for all iterator among @idata and thus we will push this document ID to the result 225 | // structure returned to the client 226 | // 227 | // Here is an example: 228 | // 229 | // 1. @idata iterators 0 1 2 3 230 | // ------------------------- 231 | // document ids d0 d2 d3 d3 232 | // d2 d3 d4 d4 233 | // d3 d4 d5 d5 234 | // d4 - - - 235 | // d5 - - - 236 | // 237 | // We start from the top of this table, i.e. row after 'document ids' string 238 | // @pos will contain following values during iteration over @idata iterators 239 | // 0 - select the first value 240 | // 0 - skip iterator 1 (d2 document id) since its value is greater than that 0'th iterator value (d0) 241 | // 0 - skip iterator 2 242 | // 0 - skip iterator 3 243 | // 244 | // @pos contains only 0 index, it is not equal to the size of @idata (4), thus we have to increase 0'th iterator 245 | // discarding its first value 246 | // 247 | // 2. 
@idata iterators 0 1 2 3 248 | // ------------------------- 249 | // document ids d2 d2 d3 d3 250 | // d3 d3 d4 d4 251 | // d4 d4 d5 d5 252 | // d5 - - - 253 | // @pos: 254 | // 0 - select the first iterator 255 | // 0 1 - 1'th iterator value equals to the value of the 0'th iterator, append it to the array 256 | // 0 1 - 2'th iterator value (d3) is greater than that of the 0'th iterator (d2) 257 | // 0 1 - the same as above 258 | // since size of the @pos is not equal to the size of @idata we increment all iterators which are indexed in @pos 259 | // 260 | // 3. @idata iterators 0 1 2 3 261 | // ------------------------- 262 | // document ids d3 d3 d3 d3 263 | // d4 d4 d4 d4 264 | // d5 - d5 d5 265 | // @pos will contain all 4 indexes, since all iterator's value are the same (d3) 266 | // We will increment all iterators and push d3 into resulting array which will be returned to the client, 267 | // since size of the @pos array equals to the @idata size 268 | // 269 | // 4. @idata iterators 0 1 2 3 270 | // ------------------------- 271 | // document ids d4 d4 d4 d4 272 | // d5 - d5 d5 273 | // We put d4 into resulting array and increment all iterators as above 274 | // 275 | // 5. @idata iterators 0 1 2 3 276 | // ------------------------- 277 | // document ids d5 - d5 d5 278 | // 279 | // @pos: 280 | // 0 - select the first iterator 281 | // Stop processing, since 1'th iterator is empty. 282 | // This means no further iteration checks can contain all 4 the same value, 283 | // thus it is not possible to find any other document with higher ID 284 | // which will contain all 4 requested indexes. 285 | // 286 | // 6. 
Return [d3, d4] values to the client 287 | std::vector pos; 288 | 289 | id_t next_id; 290 | 291 | int current = -1; 292 | for (auto &itr: idata) { 293 | auto &it = itr.begin; 294 | auto &e = itr.end; 295 | ++current; 296 | 297 | if (it == e) { 298 | res.completed = true; 299 | break; 300 | } 301 | 302 | if (it->indexed_id > iq.range_end) { 303 | res.completed = true; 304 | break; 305 | } 306 | 307 | res.completed = false; 308 | res.next_document_id.set_next_id(it->indexed_id); 309 | 310 | if (pos.size() == 0) { 311 | pos.push_back(current); 312 | continue; 313 | } 314 | 315 | auto &min_it = idata[pos[0]].begin; 316 | #if 0 317 | BH_LOG(m_bp.logger(), INDEXES_LOG_INFO, "intersection: min-index: %s, id: %s, it-index: %s, id: %s", 318 | idata[pos[0]].idx.start_key().str(), min_it->str(), 319 | idata_it->idx.start_key().str(), it->str()); 320 | #endif 321 | if (it->indexed_id == min_it->indexed_id) { 322 | pos.push_back(current); 323 | continue; 324 | } 325 | 326 | next_id = std::max(it->indexed_id, min_it->indexed_id); 327 | res.next_document_id.set_next_id(next_id); 328 | 329 | pos.clear(); 330 | break; 331 | } 332 | 333 | // this can only happen if one of the iterators has been finished, 334 | // which means number of found positions will not be equal to the number 335 | // of indexes to intersect, and thus there is no more data to push into result. 336 | // Just break out of the processing loop - nothing can be added anymore. 
337 | if (res.completed) { 338 | break; 339 | } 340 | 341 | // number of entries with the same document ID doesn't match number of indexes, 342 | // this means some index doesn't have this docuement and thus it has to be skipped 343 | // and iteration check process has to be started over 344 | if (pos.size() != idata.size()) { 345 | for (auto &it: idata) { 346 | auto &min_it = it.begin; 347 | 348 | min_it.rewind_to_index(next_id); 349 | } 350 | 351 | continue; 352 | } 353 | 354 | auto &min_it = idata[pos.front()].begin; 355 | id_t indexed_id = min_it->indexed_id; 356 | 357 | bool negation_match = false; 358 | for (auto &neg: inegation) { 359 | auto &it = neg.begin; 360 | it.rewind_to_index(indexed_id); 361 | if (it != neg.end) { 362 | if (it->indexed_id == indexed_id) { 363 | negation_match = true; 364 | break; 365 | } 366 | } 367 | } 368 | 369 | auto increment_all_iterators = [&] () { 370 | for (auto it = pos.begin(); it != pos.end(); ++it) { 371 | auto &idata_iter = idata[*it].begin; 372 | ++idata_iter; 373 | } 374 | }; 375 | 376 | if (negation_match) { 377 | increment_all_iterators(); 378 | continue; 379 | } 380 | 381 | single_doc_result rs; 382 | auto err = min_it.document(m_db_docs, &rs.doc); 383 | if (err) { 384 | #if 0 385 | printf("could not read document id: %ld, err: %s [%d]\n", 386 | min_it->indexed_id, err.message().c_str(), err.code()); 387 | #endif 388 | increment_all_iterators(); 389 | continue; 390 | } 391 | rs.doc.indexed_id = indexed_id; 392 | 393 | // increment all iterators 394 | increment_all_iterators(); 395 | 396 | if (!check(rs)) { 397 | continue; 398 | } 399 | 400 | res.docs.emplace_back(rs); 401 | if (res.docs.size() == iq.max_number) 402 | break; 403 | } 404 | 405 | return res; 406 | } 407 | private: 408 | DBT &m_db_docs; 409 | DBT &m_db_indexes; 410 | }; 411 | 412 | }} // namespace ioremap::greylock 413 | 414 | #endif // __INDEXES_INTERSECTION_HPP 415 | -------------------------------------------------------------------------------- 
/include/greylock/iterator.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "greylock/types.hpp" 4 | 5 | #include 6 | 7 | //#define STDOUT_DEBUG 8 | #ifdef STDOUT_DEBUG 9 | #define dprintf(fmt, a...) printf(fmt, ##a) 10 | #else 11 | #define dprintf(fmt, ...) 12 | #endif 13 | 14 | namespace ioremap { namespace greylock { 15 | 16 | template 17 | class index_iterator { 18 | private: 19 | disk_index m_current; 20 | typename decltype(m_current.ids)::iterator m_idx_current, m_idx_end; 21 | public: 22 | typedef index_iterator self_type; 23 | typedef disk_index::value_type value_type; 24 | typedef typename decltype(m_current.ids)::iterator::reference reference; 25 | typedef typename decltype(m_current.ids)::iterator::pointer pointer; 26 | typedef std::forward_iterator_tag iterator_category; 27 | typedef std::ptrdiff_t difference_type; 28 | 29 | static index_iterator begin(DBT &db, const std::string &mbox, const std::string &attr, const std::string &token) { 30 | std::string index_base = document::generate_index_base(db.options(), mbox, attr, token); 31 | std::vector shards(db.get_shards(document::generate_shard_key(db.options(), mbox, attr, token))); 32 | if (shards.size() == 0) { 33 | return end(db, index_base); 34 | } 35 | 36 | return index_iterator(db, index_base, shards); 37 | } 38 | static index_iterator begin(DBT &db, const std::string &mbox, const std::string &attr, const std::string &token, 39 | const std::vector &shards) { 40 | std::string index_base = document::generate_index_base(db.options(), mbox, attr, token); 41 | if (shards.size() == 0) { 42 | return end(db, index_base); 43 | } 44 | 45 | return index_iterator(db, index_base, shards); 46 | } 47 | 48 | static index_iterator end(DBT &db, const std::string &base) { 49 | return index_iterator(db, base); 50 | } 51 | static index_iterator end(DBT &db, const std::string &mbox, const std::string &attr, const std::string &token) { 52 | std::string 
index_base = document::generate_index_base(db.options(), mbox, attr, token); 53 | return index_iterator(db, index_base); 54 | } 55 | 56 | index_iterator(const index_iterator &src): m_db(src.m_db) { 57 | m_current = src.m_current; 58 | if (src.m_idx_current == src.m_idx_end) { 59 | m_idx_current = m_idx_end = m_current.ids.end(); 60 | } else { 61 | typename decltype(src.m_current.ids)::const_iterator sib = src.m_current.ids.begin(); 62 | typename decltype(src.m_current.ids)::const_iterator sic = src.m_idx_current; 63 | 64 | auto diff = std::distance(sib, sic); 65 | dprintf("src: %s, diff: %ld\n", src.to_string().c_str(), diff); 66 | 67 | m_idx_current = std::next(m_current.ids.begin(), diff); 68 | m_idx_end = m_current.ids.end(); 69 | } 70 | 71 | m_base = src.m_base; 72 | m_shards = src.m_shards; 73 | m_shards_idx = src.m_shards_idx; 74 | } 75 | 76 | self_type &operator++() { 77 | ++m_idx_current; 78 | if (m_idx_current == m_idx_end) { 79 | load_next(); 80 | } 81 | return *this; 82 | } 83 | 84 | self_type &rewind_to_index(const id_t &idx) { 85 | size_t rewind_shard = document::generate_shard_number(m_db.options(), idx); 86 | dprintf("rewind: %s, idx: %s, rewind_shard: %ld\n", to_string().c_str(), idx.to_string().c_str(), rewind_shard); 87 | 88 | auto rewind_shard_it = std::lower_bound(m_shards.begin(), m_shards.end(), rewind_shard); 89 | if (rewind_shard_it == m_shards.end()) { 90 | set_shard_index(-1); 91 | dprintf("could not increase iterator: %s\n", to_string().c_str()); 92 | return *this; 93 | } 94 | 95 | int rewind_shard_idx = std::distance(m_shards.begin(), rewind_shard_it); 96 | if (rewind_shard_idx != m_shards_idx - 1) { 97 | set_shard_index(rewind_shard_idx); 98 | load_next(); 99 | } 100 | 101 | if (m_shards_idx >= 0) { 102 | document_for_index did; 103 | did.indexed_id = idx; 104 | 105 | do { 106 | m_idx_current = std::lower_bound(m_idx_current, m_idx_end, did); 107 | if (m_idx_current == m_idx_end) { 108 | load_next(); 109 | if (m_shards_idx < 0) 110 | 
break; 111 | } 112 | 113 | } while (m_idx_current->indexed_id < idx); 114 | } 115 | 116 | dprintf("increased iterator: %s\n", to_string().c_str()); 117 | return *this; 118 | } 119 | 120 | reference operator*() { 121 | return *m_idx_current; 122 | } 123 | pointer operator->() { 124 | return &(*m_idx_current); 125 | } 126 | 127 | error_info document(DBT &db, document *doc) { 128 | std::string doc_data; 129 | auto err = db.read(greylock::options::documents_column, m_idx_current->indexed_id.to_string(), &doc_data); 130 | if (err) 131 | return err; 132 | 133 | deserialize(*doc, doc_data.data(), doc_data.size()); 134 | return greylock::error_info(); 135 | } 136 | 137 | std::string to_string() const { 138 | auto dump_shards = [&]() -> std::string { 139 | std::ostringstream out; 140 | for (size_t i = 0; i < m_shards.size(); ++i) { 141 | out << m_shards[i]; 142 | if (i != m_shards.size() - 1) 143 | out << " "; 144 | } 145 | return out.str(); 146 | }; 147 | std::ostringstream ss; 148 | ss << "base: " << m_base << 149 | ", next_shard_idx: " << m_shards_idx << 150 | ", shards: [" << dump_shards() << "] " << 151 | ", ids_size: " << m_current.ids.size() << 152 | ", current_is_end: " << (m_idx_current == m_idx_end) << 153 | ", indexed_id: " << ((m_idx_current == m_idx_end) ? 
"none" : m_idx_current->indexed_id.to_string()); 154 | return ss.str(); 155 | } 156 | 157 | bool operator==(const self_type& rhs) { 158 | if (m_base != rhs.m_base) 159 | return false; 160 | if (m_shards.size() != rhs.m_shards.size()) 161 | return false; 162 | if (m_shards != rhs.m_shards) 163 | return false; 164 | if (m_shards_idx != rhs.m_shards_idx) 165 | return false; 166 | 167 | if ((m_idx_current == m_idx_end) && (rhs.m_idx_current == rhs.m_idx_end)) 168 | return true; 169 | 170 | if (m_idx_current->indexed_id != rhs.m_idx_current->indexed_id) 171 | return false; 172 | 173 | return true; 174 | } 175 | bool operator!=(const self_type& rhs) { 176 | return !operator==(rhs); 177 | } 178 | 179 | private: 180 | DBT &m_db; 181 | std::string m_base; 182 | std::vector m_shards; 183 | int m_shards_idx = -1; 184 | 185 | index_iterator(DBT &db, const std::string &base): m_db(db), m_base(base) { 186 | } 187 | 188 | index_iterator(DBT &db, const std::string &base, const std::vector shards): m_db(db), m_base(base), m_shards(shards) { 189 | set_shard_index(0); 190 | load_next(); 191 | } 192 | 193 | void set_shard_index(int idx) { 194 | m_shards_idx = idx; 195 | if (idx < 0) { 196 | m_shards.clear(); 197 | 198 | m_current.ids.clear(); 199 | m_idx_current = m_current.ids.begin(); 200 | m_idx_end = m_current.ids.end(); 201 | } 202 | } 203 | 204 | void load_next() { 205 | do { 206 | load_next_one(); 207 | } while (m_shards_idx >= 0 && m_current.ids.empty()); 208 | } 209 | 210 | void load_next_one() { 211 | dprintf("loading: %s\n", to_string().c_str()); 212 | m_current.ids.clear(); 213 | m_idx_current = m_current.ids.begin(); 214 | m_idx_end = m_current.ids.end(); 215 | 216 | if (m_shards_idx < 0 || m_shards_idx >= (int)m_shards.size()) { 217 | set_shard_index(-1); 218 | return; 219 | } 220 | 221 | std::string key = document::generate_index_key_shard_number(m_base, m_shards[m_shards_idx]); 222 | std::string data; 223 | auto err = m_db.read(greylock::options::indexes_column, key, 
&data); 224 | if (err) { 225 | set_shard_index(-1); 226 | return; 227 | } 228 | 229 | try { 230 | deserialize(m_current, data.data(), data.size()); 231 | 232 | m_idx_current = m_current.ids.begin(); 233 | m_idx_end = m_current.ids.end(); 234 | } catch (...) { 235 | set_shard_index(-1); 236 | return; 237 | } 238 | 239 | set_shard_index(m_shards_idx + 1); 240 | dprintf("loaded: %s\n", to_string().c_str()); 241 | } 242 | }; 243 | }} // namespace ioremap::greylock 244 | -------------------------------------------------------------------------------- /include/greylock/json.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __INDEXES_JSON_HPP 2 | #define __INDEXES_JSON_HPP 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace ioremap { namespace greylock { 9 | 10 | static inline const char *get_string(const rapidjson::Value &entry, const char *name, const char *def = NULL) { 11 | if (entry.HasMember(name)) { 12 | const rapidjson::Value &v = entry[name]; 13 | if (v.IsString()) { 14 | return v.GetString(); 15 | } 16 | } 17 | 18 | return def; 19 | } 20 | 21 | static inline int64_t get_int64(const rapidjson::Value &entry, const char *name, int64_t def = -1) { 22 | if (entry.HasMember(name)) { 23 | const rapidjson::Value &v = entry[name]; 24 | if (v.IsInt()) { 25 | return v.GetInt(); 26 | } 27 | if (v.IsUint()) { 28 | return v.GetUint(); 29 | } 30 | if (v.IsInt64()) { 31 | return v.GetInt64(); 32 | } 33 | if (v.IsUint()) { 34 | return v.GetUint64(); 35 | } 36 | } 37 | 38 | return def; 39 | } 40 | 41 | static inline const rapidjson::Value &get_object(const rapidjson::Value &entry, const char *name, 42 | const rapidjson::Value &def = rapidjson::Value()) { 43 | if (entry.HasMember(name)) { 44 | const rapidjson::Value &v = entry[name]; 45 | 46 | if (v.IsObject()) 47 | return v; 48 | } 49 | 50 | return def; 51 | } 52 | 53 | static inline const rapidjson::Value &get_array(const rapidjson::Value &entry, const char *name, 54 | const 
rapidjson::Value &def = rapidjson::Value()) { 55 | if (entry.HasMember(name)) { 56 | const rapidjson::Value &v = entry[name]; 57 | 58 | if (v.IsArray()) 59 | return v; 60 | } 61 | 62 | return def; 63 | } 64 | 65 | static inline bool get_bool(const rapidjson::Value &entry, const char *name, bool def = true) { 66 | if (entry.HasMember(name)) { 67 | const rapidjson::Value &v = entry[name]; 68 | 69 | if (v.IsBool()) 70 | return v.GetBool(); 71 | } 72 | 73 | return def; 74 | } 75 | 76 | }} // namespace ioremap::greylock 77 | 78 | #endif // __INDEXES_JSON_HPP 79 | -------------------------------------------------------------------------------- /include/greylock/jsonvalue.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include 10 | 11 | namespace ioremap { namespace greylock { 12 | 13 | class JsonValue : public rapidjson::Value 14 | { 15 | public: 16 | JsonValue() { 17 | SetObject(); 18 | } 19 | 20 | ~JsonValue() { 21 | } 22 | 23 | static void set_time(rapidjson::Value &obj, rapidjson::Document::AllocatorType &alloc, long tsec, long usec) { 24 | char str[64]; 25 | struct tm tm; 26 | 27 | localtime_r((time_t *)&tsec, &tm); 28 | strftime(str, sizeof(str), "%F %Z %R:%S", &tm); 29 | 30 | char time_str[128]; 31 | snprintf(time_str, sizeof(time_str), "%s.%06lu", str, usec); 32 | 33 | obj.SetObject(); 34 | 35 | rapidjson::Value tobj(time_str, strlen(time_str), alloc); 36 | obj.AddMember("time", tobj, alloc); 37 | 38 | std::string raw_time = std::to_string(tsec) + "." 
+ std::to_string(usec); 39 | rapidjson::Value tobj_raw(raw_time.c_str(), raw_time.size(), alloc); 40 | obj.AddMember("time-raw", tobj_raw, alloc); 41 | } 42 | 43 | std::string ToString() const { 44 | rapidjson::StringBuffer buffer; 45 | rapidjson::PrettyWriter writer(buffer); 46 | 47 | Accept(writer); 48 | buffer.Put('\n'); 49 | 50 | return std::string(buffer.GetString(), buffer.Size()); 51 | } 52 | 53 | rapidjson::MemoryPoolAllocator<> &GetAllocator() { 54 | return m_allocator; 55 | } 56 | 57 | private: 58 | rapidjson::MemoryPoolAllocator<> m_allocator; 59 | }; 60 | 61 | 62 | }} // namespace ioremap::greylock 63 | -------------------------------------------------------------------------------- /include/greylock/types.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "greylock/database.hpp" 4 | #include "greylock/json.hpp" 5 | #include "greylock/id.hpp" 6 | 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | namespace ioremap { namespace greylock { 21 | 22 | typedef int pos_t; 23 | 24 | struct token { 25 | std::string name; 26 | std::vector positions; 27 | 28 | std::string shard_key; 29 | std::set shards; 30 | 31 | token(const std::string &name): name(name) {} 32 | void insert_position(pos_t pos) { 33 | positions.push_back(pos); 34 | } 35 | void insert_positions(const std::vector &pos) { 36 | positions.insert(positions.end(), pos.begin(), pos.end()); 37 | } 38 | 39 | std::string key; 40 | }; 41 | 42 | struct attribute { 43 | std::string name; 44 | std::vector tokens; 45 | 46 | attribute(const std::string &name): name(name) {} 47 | void insert(const std::string &tname, pos_t pos) { 48 | auto it = std::find_if(tokens.begin(), tokens.end(), [&](const token &t) { 49 | return t.name == tname; 50 | }); 51 | if (it == tokens.end()) { 52 | token t(tname); 53 | t.insert_position(pos); 54 | 
tokens.emplace_back(t); 55 | return; 56 | } 57 | 58 | it->insert_position(pos); 59 | } 60 | 61 | void insert(const std::string &tname, const std::vector &positions) { 62 | auto it = std::find_if(tokens.begin(), tokens.end(), [&](const token &t) { 63 | return t.name == tname; 64 | }); 65 | if (it == tokens.end()) { 66 | token t(tname); 67 | t.insert_positions(positions); 68 | tokens.emplace_back(t); 69 | return; 70 | } 71 | 72 | it->insert_positions(positions); 73 | } 74 | }; 75 | 76 | struct indexes { 77 | std::vector attributes; 78 | std::vector exact; 79 | std::vector negation; 80 | 81 | std::vector merge(const std::vector &our, const std::vector &other) const { 82 | std::map attrs; 83 | 84 | auto merge_one = [&] (const std::vector &v) { 85 | for (auto &a: v) { 86 | if (a.tokens.empty()) 87 | continue; 88 | 89 | auto it = attrs.find(a.name); 90 | if (it == attrs.end()) { 91 | attrs.insert(std::make_pair(a.name, a)); 92 | } else { 93 | for (auto &t: a.tokens) { 94 | it->second.insert(t.name, t.positions); 95 | } 96 | } 97 | } 98 | }; 99 | 100 | merge_one(our); 101 | merge_one(other); 102 | 103 | std::vector ret; 104 | ret.reserve(attrs.size()); 105 | for (auto &p: attrs) { 106 | ret.push_back(p.second); 107 | } 108 | return ret; 109 | } 110 | 111 | void merge_query(const indexes &other) { 112 | attributes = merge(attributes, other.attributes); 113 | } 114 | 115 | void merge_exact(const indexes &other) { 116 | exact = merge(exact, other.attributes); 117 | } 118 | 119 | void merge_negation(const indexes &other) { 120 | negation = merge(negation, other.attributes); 121 | } 122 | 123 | std::string to_string() const { 124 | std::ostringstream ss; 125 | 126 | auto dump_attributes = [] (const std::vector &v) { 127 | return dump_vector(v, [] (const attribute &a) -> std::string { 128 | std::ostringstream ss; 129 | ss << a.name; 130 | if (a.tokens.size()) { 131 | ss << "{"; 132 | for (size_t i = 0; i < a.tokens.size(); ++i) { 133 | auto &token = a.tokens[i]; 134 | ss << 
token.name; 135 | if (i != a.tokens.size() - 1) 136 | ss << " "; 137 | } 138 | ss << "}"; 139 | } 140 | return ss.str(); 141 | }); 142 | }; 143 | 144 | ss << "negation: [" << dump_attributes(negation) << "] " << 145 | "exact: [" << dump_attributes(exact) << "] " << 146 | "query: [" << dump_attributes(attributes) << "] "; 147 | return ss.str(); 148 | } 149 | 150 | static indexes get_indexes(const greylock::options &options, const rapidjson::Value &idxs) { 151 | indexes ireq; 152 | 153 | if (!idxs.IsObject()) 154 | return ireq; 155 | 156 | ribosome::split spl; 157 | for (rapidjson::Value::ConstMemberIterator it = idxs.MemberBegin(), idxs_end = idxs.MemberEnd(); it != idxs_end; ++it) { 158 | const char *aname = it->name.GetString(); 159 | const rapidjson::Value &avalue = it->value; 160 | 161 | if (!avalue.IsString()) 162 | continue; 163 | 164 | greylock::attribute a(aname); 165 | 166 | std::vector indexes = 167 | spl.convert_split_words(avalue.GetString(), avalue.GetStringLength()); 168 | for (size_t pos = 0; pos < indexes.size(); ++pos) { 169 | auto &idx = indexes[pos]; 170 | if (idx.size() >= options.ngram_index_size) { 171 | a.insert(ribosome::lconvert::to_string(idx), pos); 172 | } else { 173 | if (pos > 0) { 174 | auto &prev = indexes[pos - 1]; 175 | a.insert(ribosome::lconvert::to_string(prev + idx), pos); 176 | } 177 | 178 | if (pos < indexes.size() - 1) { 179 | auto &next = indexes[pos + 1]; 180 | a.insert(ribosome::lconvert::to_string(idx + next), pos); 181 | } 182 | } 183 | } 184 | 185 | ireq.attributes.emplace_back(a); 186 | } 187 | 188 | return ireq; 189 | } 190 | 191 | }; 192 | 193 | struct content { 194 | std::string content; 195 | std::string title; 196 | std::vector links; 197 | std::vector images; 198 | 199 | MSGPACK_DEFINE(content, title, links, images); 200 | }; 201 | 202 | struct document { 203 | id_t indexed_id; 204 | 205 | enum { 206 | serialize_version_7 = 7, 207 | }; 208 | 209 | std::string mbox; 210 | 211 | bool is_comment = false; 212 | 213 | 
std::string author; 214 | std::string id; 215 | 216 | content ctx; 217 | 218 | indexes idx; 219 | 220 | template 221 | void msgpack_pack(msgpack::packer &o) const { 222 | o.pack_array(document::serialize_version_7); 223 | o.pack((int)document::serialize_version_7); 224 | o.pack(is_comment); 225 | o.pack(author); 226 | o.pack(ctx); 227 | o.pack(id); 228 | o.pack(indexed_id); 229 | o.pack(0); // unused 230 | } 231 | 232 | void msgpack_unpack(msgpack::object o) { 233 | if (o.type != msgpack::type::ARRAY) { 234 | std::ostringstream ss; 235 | ss << "could not unpack document, object type is " << o.type << 236 | ", must be array (" << msgpack::type::ARRAY << ")"; 237 | throw std::runtime_error(ss.str()); 238 | } 239 | 240 | int version; 241 | 242 | msgpack::object *p = o.via.array.ptr; 243 | p[0].convert(&version); 244 | 245 | if (version != (int)o.via.array.size) { 246 | std::ostringstream ss; 247 | ss << "could not unpack document, invalid version: " << version << ", array size: " << o.via.array.size; 248 | throw std::runtime_error(ss.str()); 249 | } 250 | 251 | switch (version) { 252 | case document::serialize_version_7: 253 | p[1].convert(&is_comment); 254 | p[2].convert(&author); 255 | p[3].convert(&ctx); 256 | p[4].convert(&id); 257 | p[5].convert(&indexed_id); 258 | //p[6].convert(); unused 259 | break; 260 | default: { 261 | std::ostringstream ss; 262 | ss << "could not unpack document, invalid version " << version; 263 | throw std::runtime_error(ss.str()); 264 | } 265 | } 266 | } 267 | 268 | void assign_id(const char *cid, long seq, long tsec, long tnsec) { 269 | id.assign(cid); 270 | (void) tnsec; 271 | indexed_id.set_timestamp(tsec, seq); 272 | } 273 | 274 | void generate_token_keys(const options &options) { 275 | size_t shard_number = generate_shard_number(options, indexed_id); 276 | 277 | for (auto &attr: idx.attributes) { 278 | for (auto &t: attr.tokens) { 279 | std::string index_base = generate_index_base(options, mbox, attr.name, t.name); 280 | t.key = 
generate_index_key_shard_number(index_base, shard_number); 281 | t.shard_key = generate_shard_key(options, mbox, attr.name, t.name); 282 | 283 | t.shards.insert(shard_number); 284 | } 285 | } 286 | } 287 | 288 | static size_t generate_shard_number(const options &options, const id_t &indexed_id) { 289 | long tsec, tnsec; 290 | indexed_id.get_timestamp(&tsec, &tnsec); 291 | return tsec / options.tokens_shard_size; 292 | } 293 | 294 | static std::string generate_index_base(const options &options, 295 | const std::string &mbox, const std::string &attr, const std::string &token) { 296 | (void) options; 297 | char ckey[mbox.size() + attr.size() + token.size() + 5]; 298 | size_t csize = snprintf(ckey, sizeof(ckey), "%s.%s.%s", 299 | mbox.c_str(), attr.c_str(), token.c_str()); 300 | 301 | return std::string(ckey, csize); 302 | } 303 | 304 | static std::string generate_index_key_shard_number(const std::string &base, size_t sn) { 305 | char ckey[base.size() + 19]; 306 | size_t csize = snprintf(ckey, sizeof(ckey), "%016lx.%s", sn, base.c_str()); 307 | 308 | return std::string(ckey, csize); 309 | } 310 | static std::string generate_index_key(const options &options, const std::string &base, const id_t &indexed_id) { 311 | size_t shard_number = generate_shard_number(options, indexed_id); 312 | return generate_index_key_shard_number(base, shard_number); 313 | } 314 | static std::string generate_index_key(const options &options, 315 | const std::string &mbox, const std::string &attr, const std::string &token, 316 | const id_t &indexed_id) { 317 | std::string base = generate_index_base(options, mbox, attr, token); 318 | return generate_index_key(options, base, indexed_id); 319 | } 320 | 321 | static std::string generate_shard_key(const options &options, 322 | const std::string &mbox, const std::string &attr, const std::string &token) { 323 | return generate_index_base(options, mbox, attr, token); 324 | } 325 | }; 326 | 327 | }} // namespace ioremap::greylock 328 | 
-------------------------------------------------------------------------------- /include/greylock/utils.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "greylock/error.hpp" 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | namespace ioremap { namespace greylock { 12 | 13 | template 14 | std::string dump_vector(const std::vector &vec) { 15 | std::ostringstream ss; 16 | for (size_t i = 0; i < vec.size(); ++i) { 17 | ss << vec[i]; 18 | if (i != vec.size() - 1) 19 | ss << " "; 20 | } 21 | 22 | return ss.str(); 23 | } 24 | 25 | template 26 | std::string dump_vector(const std::vector &vec, std::function convert) { 27 | std::ostringstream ss; 28 | for (size_t i = 0; i < vec.size(); ++i) { 29 | ss << convert(vec[i]); 30 | if (i != vec.size() - 1) 31 | ss << " "; 32 | } 33 | 34 | return ss.str(); 35 | } 36 | 37 | template 38 | greylock::error_info deserialize(T &t, const char *data, size_t size) { 39 | msgpack::unpacked msg; 40 | try { 41 | msgpack::unpack(&msg, data, size); 42 | 43 | msg.get().convert(&t); 44 | } catch (const std::exception &e) { 45 | std::ostringstream ss; 46 | ss << msg.get(); 47 | return greylock::create_error(-EINVAL, "could not unpack data, size: %ld, value: %s, error: %s", 48 | size, ss.str().c_str(), e.what()); 49 | } 50 | 51 | return greylock::error_info(); 52 | } 53 | 54 | template 55 | std::string serialize(const T &t) { 56 | std::stringstream buffer; 57 | msgpack::pack(buffer, t); 58 | buffer.seekg(0); 59 | return buffer.str(); 60 | } 61 | 62 | }} // namesapce ioremap::greylock 63 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(greylock SHARED exception.cpp) 2 | set_target_properties(greylock PROPERTIES 3 | VERSION ${GREYLOCK_MAJOR_VERSION} 4 | SOVERSION ${GREYLOCK_FULL_VERSION} 5 | LINKER_LANGUAGE CXX 6 
| ) 7 | target_link_libraries(greylock 8 | ${Boost_LIBRARIES} 9 | ${JEMALLOC_LIBRARIES} 10 | ${MSGPACK_LIBRARIES} 11 | ${RIBOSOME_LIBRARIES} 12 | ${ROCKSDB_LIBRARIES} 13 | ${SWARM_LIBRARIES} 14 | ${THEVOID_LIBRARIES} 15 | pthread 16 | ) 17 | add_executable(greylock_server server.cpp) 18 | target_link_libraries(greylock_server 19 | greylock 20 | ) 21 | 22 | add_executable(greylock_list list.cpp) 23 | target_link_libraries(greylock_list 24 | greylock 25 | ) 26 | add_executable(greylock_meta meta.cpp) 27 | target_link_libraries(greylock_meta 28 | greylock 29 | ) 30 | add_executable(greylock_check check.cpp) 31 | target_link_libraries(greylock_check 32 | greylock 33 | ) 34 | add_executable(greylock_compact compact.cpp) 35 | target_link_libraries(greylock_compact 36 | greylock 37 | ) 38 | 39 | add_executable(greylock_merge merge.cpp) 40 | target_link_libraries(greylock_merge 41 | greylock 42 | ) 43 | 44 | install(TARGETS greylock 45 | LIBRARY DESTINATION lib${LIB_SUFFIX} 46 | ARCHIVE DESTINATION lib${LIB_SUFFIX} 47 | BUNDLE DESTINATION library 48 | ) 49 | install(TARGETS greylock_server greylock_meta greylock_check greylock_compact greylock_merge 50 | RUNTIME DESTINATION bin COMPONENT runtime 51 | ) 52 | 53 | -------------------------------------------------------------------------------- /src/check.cpp: -------------------------------------------------------------------------------- 1 | #include "greylock/database.hpp" 2 | #include "greylock/types.hpp" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include 10 | 11 | using namespace ioremap; 12 | 13 | static inline const char *print_time(long tsec, long tnsec) 14 | { 15 | char str[64]; 16 | struct tm tm; 17 | 18 | static __thread char __dnet_print_time[128]; 19 | 20 | localtime_r((time_t *)&tsec, &tm); 21 | strftime(str, sizeof(str), "%F %R:%S", &tm); 22 | 23 | snprintf(__dnet_print_time, sizeof(__dnet_print_time), "%s.%06llu", str, (long long unsigned) tnsec / 1000); 24 | return __dnet_print_time; 25 | } 26 | 
27 | class checker { 28 | public: 29 | checker(long print_interval) : m_print_interval(print_interval) { 30 | } 31 | 32 | void check(int column, const std::string &input) { 33 | std::unique_ptr dbu(new greylock::database()); 34 | auto err = dbu->open_read_only(input); 35 | if (err) { 36 | ribosome::throw_error(err.code(), "could not open input database: %s: %s", 37 | input.c_str(), err.message().c_str()); 38 | } 39 | 40 | printf("Input database %s has been opened\n", input.c_str()); 41 | 42 | rocksdb::ReadOptions ro; 43 | rocksdb::Iterator *it = dbu->iterator(column, ro); 44 | it->SeekToFirst(); 45 | 46 | printf("Input database %s has been positioned\n", input.c_str()); 47 | 48 | if (!it->Valid()) { 49 | auto s = it->status(); 50 | ribosome::throw_error(-s.code(), "iterator from database %s is not valid: %s [%d]", 51 | input.c_str(), s.ToString().c_str(), s.code()); 52 | } 53 | 54 | size_t prev_shard_number = 0; 55 | size_t shard_number = 0; 56 | size_t prev_documents = 0; 57 | size_t documents = 0; 58 | size_t shard_documents = 0; 59 | 60 | ribosome::timer tm, last_print; 61 | greylock::document doc; 62 | 63 | auto print_stats = [&] () -> char * { 64 | struct timespec ts; 65 | clock_gettime(CLOCK_REALTIME, &ts); 66 | 67 | static char tmp[1024]; 68 | 69 | snprintf(tmp, sizeof(tmp), 70 | "%s: %ld seconds: documents: %ld, speed: %.2f [%.2f] docs/s, " 71 | "shard: %ld, docs: %ld, id: %s, doc: %s", 72 | print_time(ts.tv_sec, ts.tv_nsec), 73 | tm.elapsed() / 1000, 74 | documents, 75 | (float)documents * 1000.0 / (float)tm.elapsed(), 76 | (float)(documents - prev_documents) * 1000.0 / (float)last_print.elapsed(), 77 | prev_shard_number, shard_documents, 78 | doc.indexed_id.to_string().c_str(), doc.id.c_str()); 79 | 80 | prev_documents = documents; 81 | last_print.restart(); 82 | return tmp; 83 | }; 84 | 85 | for (; it->Valid(); it->Next()) { 86 | auto sl = it->value(); 87 | 88 | auto gerr = deserialize(doc, sl.data(), sl.size()); 89 | if (gerr) { 90 | 
ribosome::throw_error(err.code(), "could not deserialize document, key: %s, size: %ld, error: %s [%d]", 91 | it->key().ToString().c_str(), sl.size(), gerr.message().c_str(), gerr.code()); 92 | } 93 | 94 | shard_number = greylock::document::generate_shard_number(greylock::options(), doc.indexed_id); 95 | if (shard_number > 10000) { 96 | printf("shard_number: %ld [%lx], id: %s, doc: %s\n", 97 | shard_number, shard_number, doc.indexed_id.to_string().c_str(), 98 | doc.id.c_str()); 99 | } 100 | 101 | if (shard_number < prev_shard_number) { 102 | printf("shard_number: %ld -> %ld, id: %s, doc: %s, error: shard number decreased\n", 103 | prev_shard_number, shard_number, doc.indexed_id.to_string().c_str(), 104 | doc.id.c_str()); 105 | } 106 | 107 | if ((last_print.elapsed() > m_print_interval) || (prev_shard_number != shard_number)) { 108 | std::cout << print_stats() << std::endl; 109 | } 110 | 111 | if (prev_shard_number != shard_number) { 112 | shard_documents = 0; 113 | } 114 | 115 | documents++; 116 | shard_documents++; 117 | 118 | prev_shard_number = shard_number; 119 | } 120 | std::cout << print_stats() << std::endl; 121 | } 122 | 123 | private: 124 | long m_print_interval; 125 | }; 126 | 127 | int main(int argc, char *argv[]) 128 | { 129 | namespace bpo = boost::program_options; 130 | 131 | bpo::options_description generic("Merge options"); 132 | 133 | std::string input; 134 | std::string column; 135 | long print_interval; 136 | generic.add_options() 137 | ("help", "This help message") 138 | ("column", bpo::value(&column)->required(), "Column name to check") 139 | ("input", bpo::value(&input)->required(), "Input rocksdb database") 140 | ("print-interval", bpo::value(&print_interval)->default_value(1000), "Period to dump merge stats (in milliseconds)") 141 | ; 142 | 143 | bpo::options_description cmdline_options; 144 | cmdline_options.add(generic); 145 | 146 | bpo::variables_map vm; 147 | 148 | try { 149 | bpo::store(bpo::command_line_parser(argc, 
argv).options(cmdline_options).run(), vm); 150 | 151 | if (vm.count("help")) { 152 | std::cout << generic << std::endl; 153 | return 0; 154 | } 155 | 156 | bpo::notify(vm); 157 | } catch (const std::exception &e) { 158 | std::cerr << "Invalid options: " << e.what() << "\n" << generic << std::endl; 159 | return -1; 160 | } 161 | 162 | greylock::options opt; 163 | auto it = std::find(opt.column_names.begin(), opt.column_names.end(), column); 164 | if (it == opt.column_names.end()) { 165 | std::cerr << "Invalig column " << column << ", supported columns: " << greylock::dump_vector(opt.column_names) << std::endl; 166 | return -EINVAL; 167 | } 168 | 169 | auto column_id = std::distance(opt.column_names.begin(), it); 170 | 171 | try { 172 | checker c(print_interval); 173 | c.check(column_id, input); 174 | } catch (const std::exception &e) { 175 | std::cerr << "Exception: " << e.what() << std::endl; 176 | return -1; 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /src/compact.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "greylock/database.hpp" 4 | #include "greylock/types.hpp" 5 | 6 | #include 7 | 8 | #include 9 | 10 | using namespace ioremap; 11 | 12 | static inline const char *print_time(long tsec, long tnsec) 13 | { 14 | char str[64]; 15 | struct tm tm; 16 | 17 | static __thread char __dnet_print_time[128]; 18 | 19 | localtime_r((time_t *)&tsec, &tm); 20 | strftime(str, sizeof(str), "%F %R:%S", &tm); 21 | 22 | snprintf(__dnet_print_time, sizeof(__dnet_print_time), "%s.%06llu", str, (long long unsigned) tnsec / 1000); 23 | return __dnet_print_time; 24 | } 25 | 26 | int main(int argc, char *argv[]) 27 | { 28 | namespace bpo = boost::program_options; 29 | 30 | bpo::options_description generic("Database compact options"); 31 | generic.add_options() 32 | ("help", "this help message") 33 | ; 34 | 35 | 36 | std::string dpath; 37 | long csize_mb; 38 | 
std::string cname; 39 | bpo::options_description gr("Compaction options"); 40 | gr.add_options() 41 | ("path", bpo::value(&dpath)->required(), "path to rocksdb database") 42 | ("column", bpo::value(&cname)->required(), "Column name to compact") 43 | ("size", bpo::value(&csize_mb)->default_value(1024), "Number of MBs to compact in one chunk") 44 | ; 45 | 46 | bpo::options_description cmdline_options; 47 | cmdline_options.add(generic).add(gr); 48 | 49 | bpo::variables_map vm; 50 | 51 | try { 52 | bpo::store(bpo::command_line_parser(argc, argv).options(cmdline_options).run(), vm); 53 | 54 | if (vm.count("help")) { 55 | std::cout << cmdline_options << std::endl; 56 | return 0; 57 | } 58 | 59 | bpo::notify(vm); 60 | } catch (const std::exception &e) { 61 | std::cerr << "Invalid options: " << e.what() << "\n" << cmdline_options << std::endl; 62 | return -1; 63 | } 64 | 65 | greylock::options opt; 66 | auto it = std::find(opt.column_names.begin(), opt.column_names.end(), cname); 67 | if (it == opt.column_names.end()) { 68 | std::cerr << "Invalig column " << cname << ", supported columns: " << greylock::dump_vector(opt.column_names) << std::endl; 69 | return -EINVAL; 70 | } 71 | 72 | auto column_id = std::distance(opt.column_names.begin(), it); 73 | 74 | #define SECONDS(x) ((x) / 1000.) 
75 | 76 | try { 77 | ribosome::timer tm; 78 | 79 | greylock::database db; 80 | auto err = db.open_read_write(dpath); 81 | if (err) { 82 | std::cerr << "could not open database: " << err.message(); 83 | return err.code(); 84 | } 85 | long open_time = tm.elapsed(); 86 | printf("%.2fs : %.2fs: database has been opened\n", SECONDS(tm.elapsed()), SECONDS(open_time)); 87 | 88 | rocksdb::ReadOptions ro; 89 | auto it = db.iterator(column_id, ro); 90 | it->SeekToFirst(); 91 | long position_time = tm.elapsed() - open_time; 92 | printf("%.2fs : %.2fs: database has been positioned\n", SECONDS(tm.elapsed()), SECONDS(position_time)); 93 | 94 | if (!it->Valid()) { 95 | auto s = it->status(); 96 | fprintf(stderr, "iterator is not valid: %s [%d]", s.ToString().c_str(), s.code()); 97 | return -s.code(); 98 | } 99 | 100 | long compact_size = csize_mb * 1024 * 1024; 101 | 102 | long compaction_start_time = tm.elapsed(); 103 | while (it->Valid()) { 104 | long compaction_tmp_start_time = tm.elapsed(); 105 | 106 | long current_size = 0; 107 | rocksdb::Slice start, end; 108 | 109 | start = it->key(); 110 | while (it->Valid() && current_size < compact_size) { 111 | current_size += it->value().size(); 112 | end = it->key(); 113 | 114 | it->Next(); 115 | } 116 | 117 | db.compact(column_id, start, end); 118 | long compaction_time = tm.elapsed() - compaction_tmp_start_time; 119 | 120 | printf("%.2fs : %.2fs: compaction: start: %s, end: %s, size: %.2f MB\n", 121 | SECONDS(tm.elapsed()), SECONDS(compaction_time), 122 | start.ToString().c_str(), end.ToString().c_str(), 123 | current_size / (1024. 
* 1024.)); 124 | } 125 | 126 | if (!it->Valid()) { 127 | auto s = it->status(); 128 | if (s.code() != 0) { 129 | fprintf(stderr, "iterator has become invalid during iteration: %s [%d]", s.ToString().c_str(), s.code()); 130 | return -s.code(); 131 | } 132 | } 133 | 134 | long compaction_time = tm.elapsed() - compaction_start_time; 135 | 136 | printf("%.2fs : %.2fs: database has been compacted\n", SECONDS(tm.elapsed()), SECONDS(compaction_time)); 137 | } catch (const std::exception &e) { 138 | std::cerr << "Exception: " << e.what() << std::endl; 139 | } 140 | } 141 | 142 | -------------------------------------------------------------------------------- /src/exception.cpp: -------------------------------------------------------------------------------- 1 | #include "greylock/error.hpp" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | namespace ioremap { namespace greylock { 11 | 12 | error::error(int code, const std::string &message) throw() : m_errno(code), m_message(message) 13 | { 14 | } 15 | 16 | int error::error_code() const 17 | { 18 | return m_errno; 19 | } 20 | 21 | const char *error::what() const throw() 22 | { 23 | return m_message.c_str(); 24 | } 25 | 26 | std::string error::error_message() const throw() 27 | { 28 | return m_message; 29 | } 30 | 31 | not_found_error::not_found_error(const std::string &message) throw() 32 | : error(-ENOENT, message) 33 | { 34 | } 35 | 36 | timeout_error::timeout_error(const std::string &message) throw() 37 | : error(-ETIMEDOUT, message) 38 | { 39 | } 40 | 41 | no_such_address_error::no_such_address_error(const std::string &message) throw() 42 | : error(-ENXIO, message) 43 | { 44 | } 45 | 46 | void error_info::throw_error() const 47 | { 48 | switch (m_code) { 49 | case -ENOENT: 50 | throw not_found_error(m_message); 51 | break; 52 | case -ETIMEDOUT: 53 | throw timeout_error(m_message); 54 | break; 55 | case -ENOMEM: 56 | throw std::bad_alloc(); 57 | break; 58 | case -ENXIO: 59 | throw 
no_such_address_error(m_message); 60 | break; 61 | case 0: 62 | // Do nothing, it's not an error 63 | break; 64 | default: 65 | throw error(m_code, m_message); 66 | break; 67 | } 68 | } 69 | 70 | static error_info create_info(int err, const char *id, const char *format, va_list args) 71 | { 72 | if (err == -ENOMEM) 73 | return error_info(err, std::string()); 74 | 75 | std::ostringstream message; 76 | char buffer[1024]; 77 | const size_t buffer_size = sizeof(buffer); 78 | if (id) { 79 | message << id << ": "; 80 | } 81 | vsnprintf(buffer, buffer_size, format, args); 82 | buffer[buffer_size - 1] = '\0'; 83 | message << buffer << ": " << strerror(-err) << ": " << err; 84 | return error_info(err, message.str()); 85 | } 86 | 87 | void throw_error(int err, const char *format, ...) 88 | { 89 | va_list args; 90 | va_start(args, format); 91 | error_info error = create_info(err, 0, format, args); 92 | va_end(args); 93 | error.throw_error(); 94 | } 95 | 96 | error_info create_error(int err, const char *format, ...) 
97 | { 98 | va_list args; 99 | va_start(args, format); 100 | error_info error = create_info(err, 0, format, args); 101 | va_end(args); 102 | return error; 103 | } 104 | 105 | }} // namespace ioremap::greylock 106 | -------------------------------------------------------------------------------- /src/list.cpp: -------------------------------------------------------------------------------- 1 | #include "greylock/database.hpp" 2 | #include "greylock/types.hpp" 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include 9 | 10 | using namespace ioremap; 11 | 12 | static void list(const std::string &input, int column) { 13 | std::unique_ptr dbu(new greylock::database()); 14 | auto err = dbu->open_read_only(input); 15 | if (err) { 16 | ribosome::throw_error(err.code(), "could not open input database: %s: %s", 17 | input.c_str(), err.message().c_str()); 18 | } 19 | 20 | auto it = dbu->iterator(column, rocksdb::ReadOptions()); 21 | it->SeekToFirst(); 22 | 23 | if (!it->Valid()) { 24 | auto s = it->status(); 25 | ribosome::throw_error(-s.code(), "iterator from database %s is not valid: %s [%d]", 26 | input.c_str(), s.ToString().c_str(), s.code()); 27 | } 28 | 29 | long data_size = 0; 30 | long keys = 0; 31 | for (; it->Valid(); it->Next()) { 32 | keys++; 33 | data_size += it->value().size(); 34 | 35 | printf("merge: column: %s [%d], keys: %ld, total data size: %ld, key: %s, size: %ld\n", 36 | dbu->options().column_names[column].c_str(), column, keys, data_size, 37 | it->key().ToString().c_str(), it->value().size()); 38 | } 39 | 40 | } 41 | 42 | int main(int argc, char *argv[]) 43 | { 44 | namespace bpo = boost::program_options; 45 | 46 | bpo::options_description generic("List options"); 47 | 48 | std::string input; 49 | std::string column; 50 | generic.add_options() 51 | ("help", "This help message") 52 | ("column", bpo::value(&column)->required(), "Column name to merge") 53 | ("rocksdb", bpo::value(&input)->required(), "Input rocksdb database") 54 | ; 55 | 56 | 
bpo::options_description cmdline_options; 57 | cmdline_options.add(generic); 58 | 59 | bpo::variables_map vm; 60 | 61 | try { 62 | bpo::store(bpo::command_line_parser(argc, argv).options(cmdline_options).run(), vm); 63 | 64 | if (vm.count("help")) { 65 | std::cout << generic << std::endl; 66 | return 0; 67 | } 68 | 69 | bpo::notify(vm); 70 | } catch (const std::exception &e) { 71 | std::cerr << "Invalid options: " << e.what() << "\n" << generic << std::endl; 72 | return -1; 73 | } 74 | 75 | greylock::options opt; 76 | auto it = std::find(opt.column_names.begin(), opt.column_names.end(), column); 77 | if (it == opt.column_names.end()) { 78 | std::cerr << "Invalig column " << column << ", supported columns: " << greylock::dump_vector(opt.column_names) << std::endl; 79 | return -EINVAL; 80 | } 81 | 82 | auto column_id = std::distance(opt.column_names.begin(), it); 83 | 84 | try { 85 | list(input, column_id); 86 | } catch (const std::exception &e) { 87 | std::cerr << "Exception: " << e.what() << std::endl; 88 | return -1; 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/merge.cpp: -------------------------------------------------------------------------------- 1 | #include "greylock/database.hpp" 2 | #include "greylock/types.hpp" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include 10 | 11 | using namespace ioremap; 12 | 13 | static inline const char *print_time(long tsec, long tnsec) 14 | { 15 | char str[64]; 16 | struct tm tm; 17 | 18 | static __thread char __dnet_print_time[128]; 19 | 20 | localtime_r((time_t *)&tsec, &tm); 21 | strftime(str, sizeof(str), "%F %R:%S", &tm); 22 | 23 | snprintf(__dnet_print_time, sizeof(__dnet_print_time), "%s.%06llu", str, (long long unsigned) tnsec / 1000); 24 | return __dnet_print_time; 25 | } 26 | 27 | class merger { 28 | public: 29 | merger(long print_interval) : m_print_interval(print_interval) { 30 | } 31 | 32 | void merge(int column, const std::string &output, 
const std::vector &inputs, bool compact) { 33 | ribosome::timer tm; 34 | 35 | greylock::database odb; 36 | auto err = odb.open_read_write(output); 37 | if (err) { 38 | ribosome::throw_error(err.code(), "could not open output database: %s: %s", 39 | output.c_str(), err.message().c_str()); 40 | } 41 | 42 | printf("Output database %s has been opened\n", output.c_str()); 43 | 44 | std::vector> dbs; 45 | std::vector its; 46 | rocksdb::ReadOptions ro; 47 | 48 | for (auto &path: inputs) { 49 | std::unique_ptr dbu(new greylock::database()); 50 | err = dbu->open_read_only(path); 51 | if (err) { 52 | ribosome::throw_error(err.code(), "could not open input database: %s: %s", 53 | path.c_str(), err.message().c_str()); 54 | } 55 | 56 | printf("Input database %s has been opened\n", path.c_str()); 57 | 58 | auto it = dbu->iterator(column, ro); 59 | it->SeekToFirst(); 60 | 61 | printf("Input database %s has been positioned\n", path.c_str()); 62 | 63 | if (!it->Valid()) { 64 | auto s = it->status(); 65 | ribosome::throw_error(-s.code(), "iterator from database %s is not valid: %s [%d]", 66 | path.c_str(), s.ToString().c_str(), s.code()); 67 | } 68 | 69 | its.emplace_back(it); 70 | dbs.emplace_back(std::move(dbu)); 71 | } 72 | 73 | auto cmp = rocksdb::BytewiseComparator(); 74 | 75 | long data_size = 0; 76 | long written_keys = 0; 77 | std::string first_key, last_key; 78 | long prev_written_keys = 0; 79 | long prev_data_size = 0; 80 | 81 | ribosome::timer merge_tm; 82 | 83 | auto print_stats = [&] () { 84 | struct timespec ts; 85 | clock_gettime(CLOCK_REALTIME, &ts); 86 | 87 | float kspeed = (float)written_keys * 1000.0 / (float)merge_tm.elapsed(); 88 | float kspeed_moment = (float)(written_keys - prev_written_keys) * 1000.0 / (float)tm.elapsed(); 89 | 90 | float dspeed = (float)data_size * 1000.0 / (float)merge_tm.elapsed() / (1024.0 * 1024.0); 91 | float dspeed_moment = (float)(data_size - prev_data_size) * 1000.0 / (float)tm.elapsed() / (1024.0 * 1024.0); 92 | 93 | printf("%s: 
column: %s [%d], written keys: %ld, speed: %.2f [%.2f] keys/s, " 94 | "written data size: %.2f MBs, speed: %.2f [%.2f] MB/s, " 95 | "first_key: %s, last_key: %s\n", 96 | print_time(ts.tv_sec, ts.tv_nsec), 97 | odb.options().column_names[column].c_str(), column, 98 | written_keys, kspeed, kspeed_moment, 99 | (float)data_size / (1024.0 * 1024.0), dspeed, dspeed_moment, 100 | first_key.c_str(), last_key.c_str()); 101 | 102 | prev_written_keys = written_keys; 103 | prev_data_size = data_size; 104 | tm.restart(); 105 | }; 106 | 107 | while (true) { 108 | rocksdb::Slice key; 109 | std::vector positions; 110 | std::vector to_remove; 111 | 112 | for (size_t pos = 0; pos < its.size(); ++pos) { 113 | auto &it = its[pos]; 114 | if (!it->Valid()) { 115 | to_remove.push_back(pos); 116 | continue; 117 | } 118 | 119 | if (key.size() == 0) { 120 | key = it->key(); 121 | positions.push_back(pos); 122 | continue; 123 | } 124 | 125 | int cval = cmp->Compare(it->key(), key); 126 | if (cval < 0) { 127 | key = it->key(); 128 | positions.clear(); 129 | positions.push_back(pos); 130 | continue; 131 | } 132 | 133 | if (cval > 0) { 134 | continue; 135 | } 136 | 137 | positions.push_back(pos); 138 | } 139 | 140 | if (key.size() == 0) 141 | break; 142 | 143 | rocksdb::WriteBatch batch; 144 | 145 | long ds = 0; 146 | for (auto pos: positions) { 147 | auto &it = its[pos]; 148 | 149 | if ((column == greylock::options::token_shards_column) || (column == greylock::options::indexes_column)) { 150 | batch.Merge(odb.cfhandle(column), key, it->value()); 151 | } else { 152 | batch.Put(odb.cfhandle(column), key, it->value()); 153 | } 154 | ds += it->value().size(); 155 | } 156 | 157 | err = odb.write(&batch); 158 | if (err) { 159 | ribosome::throw_error(err.code(), "key: %s, inputs: %s, could not write batch of %ld elements: %s", 160 | key.ToString().c_str(), greylock::dump_vector(positions).c_str(), 161 | positions.size(), err.message().c_str()); 162 | } 163 | 164 | if (written_keys == 0) { 165 | 
first_key = key.ToString(); 166 | } 167 | 168 | written_keys++; 169 | data_size += ds; 170 | last_key = key.ToString(); 171 | 172 | for (auto pos: positions) { 173 | auto &it = its[pos]; 174 | it->Next(); 175 | } 176 | 177 | for (auto it = to_remove.rbegin(); it != to_remove.rend(); ++it) { 178 | printf("Input file %s has been processed\n", inputs[*it].c_str()); 179 | its.erase(its.begin() + (*it)); 180 | } 181 | 182 | if (tm.elapsed() > m_print_interval) { 183 | print_stats(); 184 | } 185 | } 186 | 187 | print_stats(); 188 | 189 | if (compact) { 190 | struct timespec ts; 191 | 192 | clock_gettime(CLOCK_REALTIME, &ts); 193 | printf("%s: starting compaction\n", print_time(ts.tv_sec, ts.tv_nsec)); 194 | tm.restart(); 195 | 196 | odb.compact(); 197 | clock_gettime(CLOCK_REALTIME, &ts); 198 | printf("%s: compaction 1 took %.1f seconds\n", print_time(ts.tv_sec, ts.tv_nsec), tm.restart() / 1000.0); 199 | 200 | odb.compact(); 201 | clock_gettime(CLOCK_REALTIME, &ts); 202 | printf("%s: compaction 2 took %.1f seconds\n", print_time(ts.tv_sec, ts.tv_nsec), tm.restart() / 1000.0); 203 | } 204 | } 205 | private: 206 | long m_print_interval; 207 | }; 208 | 209 | int main(int argc, char *argv[]) 210 | { 211 | namespace bpo = boost::program_options; 212 | 213 | bpo::options_description generic("Merge options"); 214 | 215 | std::string output; 216 | std::vector inputs; 217 | int thread_num; 218 | std::string column; 219 | long print_interval; 220 | generic.add_options() 221 | ("help", "This help message") 222 | ("column", bpo::value(&column)->required(), "Column name to merge") 223 | ("compact", "Whether to compact output database or not") 224 | ("input", bpo::value>(&inputs)->required()->composing(), "Input rocksdb database") 225 | ("output", bpo::value(&output)->required(), "Output rocksdb database") 226 | ("threads", bpo::value(&thread_num)->default_value(8), "Number of merge threads") 227 | ("print-interval", bpo::value(&print_interval)->default_value(10000), "Period to dump 
merge stats (in milliseconds)") 228 | ; 229 | 230 | bpo::options_description cmdline_options; 231 | cmdline_options.add(generic); 232 | 233 | bpo::variables_map vm; 234 | 235 | try { 236 | bpo::store(bpo::command_line_parser(argc, argv).options(cmdline_options).run(), vm); 237 | 238 | if (vm.count("help")) { 239 | std::cout << generic << std::endl; 240 | return 0; 241 | } 242 | 243 | bpo::notify(vm); 244 | } catch (const std::exception &e) { 245 | std::cerr << "Invalid options: " << e.what() << "\n" << generic << std::endl; 246 | return -1; 247 | } 248 | 249 | greylock::options opt; 250 | auto it = std::find(opt.column_names.begin(), opt.column_names.end(), column); 251 | if (it == opt.column_names.end()) { 252 | std::cerr << "Invalid column " << column << ", supported columns: " << greylock::dump_vector(opt.column_names) << std::endl; 253 | return -EINVAL; 254 | } 255 | 256 | auto column_id = std::distance(opt.column_names.begin(), it); 257 | 258 | try { 259 | merger m(print_interval); 260 | m.merge(column_id, output, inputs, vm.count("compact") != 0); 261 | } catch (const std::exception &e) { 262 | std::cerr << "Exception: " << e.what() << std::endl; 263 | return -1; 264 | } 265 | } 266 | -------------------------------------------------------------------------------- /src/meta.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "greylock/database.hpp" 4 | #include "greylock/types.hpp" 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | using namespace ioremap; 13 | 14 | static inline const char *print_time(long tsec, long tnsec) 15 | { 16 | char str[64]; 17 | struct tm tm; 18 | 19 | static __thread char __dnet_print_time[128]; 20 | 21 | localtime_r((time_t *)&tsec, &tm); 22 | strftime(str, sizeof(str), "%F %R:%S", &tm); 23 | 24 | snprintf(__dnet_print_time, sizeof(__dnet_print_time), "%s.%06llu", str, (long long unsigned) tnsec / 1000); 25 | return __dnet_print_time; 26 | } 27 | 
28 | int main(int argc, char *argv[]) 29 | { 30 | namespace bpo = boost::program_options; 31 | 32 | bpo::options_description generic("Index metadata reader options"); 33 | generic.add_options() 34 | ("help", "this help message") 35 | ; 36 | 37 | 38 | std::string dpath, ipath; 39 | std::string iname; 40 | bool dump = false; 41 | std::string id_str; 42 | std::string save_prefix; 43 | bpo::options_description gr("Greylock index options"); 44 | gr.add_options() 45 | ("index", bpo::value(&iname), "index name, format: mailbox.attribute.index") 46 | ("id", bpo::value(&id_str), "read document with this indexed ID, format: ts") 47 | ("save", bpo::value(&save_prefix), "save index data into this directory") 48 | ("rocksdb.docs", bpo::value(&dpath), 49 | "path to rocksdb containing documents, " 50 | "will be opened in read-only mode, safe to be called if different process is already using it") 51 | ("rocksdb.indexes", bpo::value(&ipath)->required(), 52 | "path to rocksdb containing indexes, " 53 | "will be opened in read-only mode, safe to be called if different process is already using it") 54 | ("dump", "dump document data to stdout") 55 | ; 56 | 57 | bpo::options_description cmdline_options; 58 | cmdline_options.add(generic).add(gr); 59 | 60 | bpo::variables_map vm; 61 | 62 | try { 63 | bpo::store(bpo::command_line_parser(argc, argv).options(cmdline_options).run(), vm); 64 | 65 | if (vm.count("help")) { 66 | std::cout << cmdline_options << std::endl; 67 | return 0; 68 | } 69 | 70 | bpo::notify(vm); 71 | } catch (const std::exception &e) { 72 | std::cerr << "Invalid options: " << e.what() << "\n" << cmdline_options << std::endl; 73 | return -1; 74 | } 75 | 76 | if (vm.count("dump")) { 77 | dump = true; 78 | } 79 | 80 | if (dump || vm.count("id")) { 81 | if (dpath.empty()) { 82 | std::cerr << "You must provide documents database when using dump or id option\n" << cmdline_options << std::endl; 83 | return -1; 84 | } 85 | } 86 | 87 | try { 88 | greylock::database db; 89 | auto 
err = db.open_read_only(ipath); 90 | if (err) { 91 | std::cerr << "could not open database: " << err.message(); 92 | return err.code(); 93 | } 94 | 95 | greylock::database db_docs; 96 | if (dpath.size()) { 97 | auto err = db_docs.open_read_only(dpath); 98 | if (err) { 99 | std::cerr << "could not open database: " << err.message(); 100 | return err.code(); 101 | } 102 | } 103 | 104 | ribosome::timer tm; 105 | 106 | auto print_index = [&](const greylock::id_t &id) -> std::string { 107 | long tsec, aux; 108 | id.get_timestamp(&tsec, &aux); 109 | 110 | std::ostringstream ss; 111 | ss << id.to_string() << 112 | ", raw_ts: " << id.timestamp << 113 | ", aux: " << aux << 114 | ", ts: " << print_time(tsec, 0); 115 | return ss.str(); 116 | }; 117 | 118 | auto print_doc = [&](const greylock::document &doc) -> std::string { 119 | std::ostringstream ss; 120 | 121 | ss << "id: " << doc.id << ", author: " << doc.author; 122 | 123 | ss << "\n content: " << doc.ctx.content; 124 | ss << "\n title: " << doc.ctx.title; 125 | ss << "\n links: " << greylock::dump_vector(doc.ctx.links); 126 | ss << "\n images: " << greylock::dump_vector(doc.ctx.images); 127 | 128 | return ss.str(); 129 | }; 130 | 131 | if (vm.count("index")) { 132 | std::vector cmp; 133 | size_t pos = 0; 134 | for (int i = 0; i < 2; ++i) { 135 | size_t dot = iname.find('.', pos); 136 | if (dot == std::string::npos) { 137 | std::cerr << "invalid index name " << iname << ", must be mailbox.attribute.index" << std::endl; 138 | return -1; 139 | } 140 | 141 | cmp.push_back(iname.substr(pos, dot - pos)); 142 | pos = dot + 1; 143 | } 144 | cmp.push_back(iname.substr(pos)); 145 | 146 | if (save_prefix.size()) { 147 | boost::system::error_code ec; 148 | std::string dname = save_prefix + "/" + iname; 149 | boost::filesystem::create_directories(dname, ec); 150 | if (ec && ec != boost::system::errc::file_exists) { 151 | fprintf(stderr, "could not create directory %s: %s [%d]\n", 152 | dname.c_str(), ec.message().c_str(), 
ec.value()); 153 | return -ec.value(); 154 | } 155 | 156 | save_prefix = dname; 157 | } 158 | 159 | const std::string &mbox = cmp[0]; 160 | const std::string &attr = cmp[1]; 161 | const std::string &token = cmp[2]; 162 | 163 | std::string index_base = greylock::document::generate_index_base(db.options(), mbox, attr, token); 164 | std::string skey = greylock::document::generate_shard_key(db.options(), mbox, attr, token); 165 | std::vector shards(db.get_shards(skey)); 166 | 167 | if (save_prefix.size()) { 168 | std::ofstream sout(save_prefix + "/shards.bin", std::ios::trunc); 169 | std::string sdata; 170 | auto err = db.read(greylock::options::token_shards_column, skey, &sdata); 171 | if (err) { 172 | fprintf(stderr, "could not read shards %s: %s [%d]\n", 173 | skey.c_str(), err.message().c_str(), err.code()); 174 | return err.code(); 175 | } 176 | 177 | sout.write(sdata.data(), sdata.size()); 178 | } 179 | 180 | std::set sidx; 181 | 182 | std::cout << "Number of shards: " << shards.size() << ", shards: " << greylock::dump_vector(shards) << std::endl; 183 | for (auto shard_number: shards) { 184 | std::string ikey = greylock::document::generate_index_key_shard_number(index_base, shard_number); 185 | std::string idata; 186 | auto err = db.read(greylock::options::indexes_column, ikey, &idata); 187 | if (err) { 188 | fprintf(stderr, "could not read index %s: %s [%d]\n", 189 | ikey.c_str(), err.message().c_str(), err.code()); 190 | return err.code(); 191 | } 192 | 193 | if (save_prefix.size()) { 194 | std::ofstream sout(save_prefix + "/idx_shard." 
+ std::to_string(shard_number), std::ios::trunc); 195 | sout.write(idata.data(), idata.size()); 196 | } 197 | 198 | 199 | greylock::disk_index idx; 200 | err = greylock::deserialize(idx, idata.data(), idata.size()); 201 | if (err) { 202 | fprintf(stderr, "could not deserialize index %s, size: %ld: %s [%d]\n", 203 | ikey.c_str(), idata.size(), err.message().c_str(), err.code()); 204 | return err.code(); 205 | } 206 | 207 | std::cout << "shard: " << shard_number << ", indexes: " << idx.ids.size() << std::endl; 208 | sidx.insert(idx.ids.begin(), idx.ids.end()); 209 | 210 | for (auto &id: idx.ids) { 211 | std::cout << "indexed_id: " << print_index(id.indexed_id); 212 | if (dump) { 213 | greylock::document doc; 214 | 215 | std::string doc_data; 216 | std::string dkey = id.indexed_id.to_string(); 217 | auto err = db_docs.read(greylock::options::documents_column, dkey, &doc_data); 218 | if (err) { 219 | fprintf(stderr, "could not read document %s: %s [%d]\n", 220 | dkey.c_str(), err.message().c_str(), err.code()); 221 | return err.code(); 222 | } 223 | 224 | err = greylock::deserialize(doc, doc_data.data(), doc_data.size()); 225 | if (err) { 226 | fprintf(stderr, "could not deserialize document %s, size: %ld: %s [%d]\n", 227 | dkey.c_str(), doc_data.size(), err.message().c_str(), err.code()); 228 | return err.code(); 229 | } 230 | 231 | std::cout << ", doc: " << print_doc(doc); 232 | } 233 | 234 | std::cout << std::endl; 235 | } 236 | } 237 | 238 | if (save_prefix.size()) { 239 | greylock::disk_index idx; 240 | idx.ids.insert(idx.ids.begin(), sidx.begin(), sidx.end()); 241 | 242 | std::ofstream sout(save_prefix + "/idx_merged.bin", std::ios::trunc); 243 | std::string mdata = serialize(idx); 244 | sout.write(mdata.data(), mdata.size()); 245 | } 246 | 247 | } 248 | 249 | if (vm.count("id")) { 250 | greylock::id_t indexed_id(id_str.c_str()); 251 | 252 | std::string doc_data; 253 | auto err = db_docs.read(greylock::options::documents_column, indexed_id.to_string(), 
&doc_data); 254 | if (err) { 255 | std::cout << "could not read document with indexed_id: " << id_str << 256 | ", error: " << err.message() << std::endl; 257 | return err.code(); 258 | } 259 | 260 | greylock::document doc; 261 | err = greylock::deserialize(doc, doc_data.data(), doc_data.size()); 262 | if (err) { 263 | std::cout << "could not deserialize document with indexed_id: " << id_str << 264 | ", data_size: " << doc_data.size() << 265 | ", error: " << err.message() << std::endl; 266 | return err.code(); 267 | } 268 | 269 | std::cout << "indexed_id: " << print_index(doc.indexed_id) << 270 | ", doc: " << print_doc(doc) << std::endl; 271 | } 272 | 273 | printf("Operation took %.2f seconds\n", tm.elapsed() / 1000.); 274 | } catch (const std::exception &e) { 275 | std::cerr << "Exception: " << e.what() << std::endl; 276 | } 277 | } 278 | -------------------------------------------------------------------------------- /src/server.cpp: -------------------------------------------------------------------------------- 1 | #include "greylock/database.hpp" 2 | #include "greylock/error.hpp" 3 | #include "greylock/json.hpp" 4 | #include "greylock/jsonvalue.hpp" 5 | #include "greylock/intersection.hpp" 6 | #include "greylock/types.hpp" 7 | #include "greylock/utils.hpp" 8 | 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | 21 | #include 22 | 23 | #include 24 | #include 25 | #include 26 | 27 | #define ILOG(level, a...) BH_LOG(logger(), level, ##a) 28 | #define ILOG_ERROR(a...) ILOG(SWARM_LOG_ERROR, ##a) 29 | #define ILOG_WARNING(a...) ILOG(SWARM_LOG_WARNING, ##a) 30 | #define ILOG_INFO(a...) ILOG(SWARM_LOG_INFO, ##a) 31 | #define ILOG_NOTICE(a...) ILOG(SWARM_LOG_NOTICE, ##a) 32 | #define ILOG_DEBUG(a...) 
ILOG(SWARM_LOG_DEBUG, ##a) 33 | 34 | using namespace ioremap; 35 | 36 | template 37 | struct simple_request_stream_error : public thevoid::simple_request_stream { 38 | void send_error(int status, int error, const char *fmt, ...) { 39 | va_list args; 40 | va_start(args, fmt); 41 | 42 | char buffer[1024]; 43 | int sz = vsnprintf(buffer, sizeof(buffer), fmt, args); 44 | 45 | BH_LOG(this->server()->logger(), SWARM_LOG_ERROR, "%s: %d", buffer, error); 46 | 47 | greylock::JsonValue val; 48 | rapidjson::Value ev(rapidjson::kObjectType); 49 | 50 | 51 | rapidjson::Value esv(buffer, sz, val.GetAllocator()); 52 | ev.AddMember("message", esv, val.GetAllocator()); 53 | ev.AddMember("code", error, val.GetAllocator()); 54 | val.AddMember("error", ev, val.GetAllocator()); 55 | 56 | va_end(args); 57 | 58 | std::string data = val.ToString(); 59 | 60 | thevoid::http_response http_reply; 61 | http_reply.set_code(status); 62 | http_reply.headers().set_content_length(data.size()); 63 | http_reply.headers().set_content_type("text/json"); 64 | 65 | this->send_reply(std::move(http_reply), std::move(data)); 66 | } 67 | }; 68 | 69 | class http_server : public thevoid::server 70 | { 71 | public: 72 | virtual ~http_server() { 73 | } 74 | 75 | virtual bool initialize(const rapidjson::Value &config) { 76 | if (!rocksdb_init(config)) 77 | return false; 78 | 79 | on( 80 | options::exact_match("/ping"), 81 | options::methods("GET") 82 | ); 83 | 84 | on( 85 | options::exact_match("/compact"), 86 | options::methods("POST", "PUT") 87 | ); 88 | 89 | on( 90 | options::exact_match("/index"), 91 | options::methods("POST", "PUT") 92 | ); 93 | 94 | on( 95 | options::exact_match("/search"), 96 | options::methods("POST", "PUT") 97 | ); 98 | 99 | return true; 100 | } 101 | 102 | struct on_ping : public simple_request_stream_error { 103 | virtual void on_request(const thevoid::http_request &req, const boost::asio::const_buffer &buffer) { 104 | (void) buffer; 105 | (void) req; 106 | 107 | 
this->send_reply(thevoid::http_response::ok); 108 | } 109 | }; 110 | 111 | struct on_compact : public simple_request_stream_error { 112 | virtual void on_request(const thevoid::http_request &req, const boost::asio::const_buffer &buffer) { 113 | (void) req; 114 | (void) buffer; 115 | 116 | server()->db_docs().compact(); 117 | server()->db_indexes().compact(); 118 | this->send_reply(thevoid::http_response::ok); 119 | } 120 | }; 121 | 122 | struct on_search : public simple_request_stream_error { 123 | bool check_negation(const std::vector &tokens, const std::vector &content) { 124 | for (const auto &t: tokens) { 125 | for (const auto &word: content) { 126 | if (t.name == word) { 127 | return true; 128 | } 129 | } 130 | } 131 | 132 | return false; 133 | } 134 | 135 | bool check_exact(const std::vector &tokens, const std::vector &content) { 136 | auto check_token_positions = [] (const greylock::token &token, 137 | const std::vector &content, size_t content_offset) -> bool { 138 | for (size_t pos: token.positions) { 139 | size_t offset = content_offset + pos; 140 | if (offset >= content.size()) { 141 | return false; 142 | } 143 | 144 | if (token.name != content[offset]) { 145 | return false; 146 | } 147 | } 148 | 149 | return true; 150 | }; 151 | 152 | for (size_t content_offset = 0; content_offset < content.size(); ++content_offset) { 153 | bool match = true; 154 | 155 | for (const auto &token: tokens) { 156 | match = check_token_positions(token, content, content_offset); 157 | if (!match) 158 | break; 159 | } 160 | 161 | if (match) 162 | return true; 163 | } 164 | 165 | return false; 166 | } 167 | 168 | std::vector split_content(const std::string &content) { 169 | std::vector ret; 170 | 171 | ribosome::html_parser html; 172 | html.feed_text(content); 173 | 174 | ribosome::split spl; 175 | for (auto &t: html.tokens()) { 176 | ribosome::lstring lt = ribosome::lconvert::from_utf8(t); 177 | auto lower_request = ribosome::lconvert::to_lower(lt); 178 | 179 | auto all_words = 
spl.convert_split_words(lower_request, ".:,"); 180 | for (auto &word: all_words) { 181 | ret.emplace_back(ribosome::lconvert::to_string(word)); 182 | } 183 | } 184 | 185 | return ret; 186 | } 187 | 188 | // returns true if record has to be accepted, false - if record must be dropped 189 | bool check_result(const greylock::intersection_query &iq, greylock::single_doc_result &sd) { 190 | const greylock::document &doc = sd.doc; 191 | 192 | for (const auto &ent: iq.se) { 193 | for (const auto &attr: ent.idx.exact) { 194 | bool match; 195 | 196 | if (attr.name.find("title") != std::string::npos) { 197 | match = check_exact(attr.tokens, split_content(doc.ctx.title)); 198 | } else { 199 | match = check_exact(attr.tokens, split_content(doc.ctx.content)); 200 | } 201 | 202 | if (!match) 203 | return false; 204 | } 205 | } 206 | 207 | return true; 208 | } 209 | 210 | virtual void on_request(const thevoid::http_request &req, const boost::asio::const_buffer &buffer) { 211 | (void) req; 212 | 213 | ribosome::timer search_tm; 214 | 215 | // this is needed to put ending zero-byte, otherwise rapidjson parser will explode 216 | std::string data(const_cast(boost::asio::buffer_cast(buffer)), 217 | boost::asio::buffer_size(buffer)); 218 | 219 | rapidjson::Document doc; 220 | doc.Parse<0>(data.c_str()); 221 | 222 | if (doc.HasParseError()) { 223 | send_error(swarm::http_response::bad_request, -EINVAL, 224 | "search: could not parse document: %s, error offset: %d", 225 | doc.GetParseError(), doc.GetErrorOffset()); 226 | return; 227 | } 228 | if (!doc.IsObject()) { 229 | send_error(swarm::http_response::bad_request, -EINVAL, "search: document must be object"); 230 | return; 231 | } 232 | 233 | greylock::intersection_query iq; 234 | 235 | const auto &paging = greylock::get_object(doc, "paging"); 236 | if (paging.IsObject()) { 237 | iq.next_document_id = greylock::id_t(greylock::get_string(paging, "next_document_id")); 238 | iq.max_number = greylock::get_int64(paging, "max_number", 
LONG_MAX); 239 | } 240 | 241 | long sec_start = 0, sec_end = LONG_MAX; 242 | const auto &time = greylock::get_object(doc, "time"); 243 | if (time.IsObject()) { 244 | sec_start = greylock::get_int64(time, "start", sec_start); 245 | sec_end = greylock::get_int64(time, "end", sec_end); 246 | } 247 | iq.range_start.set_timestamp(sec_start, 0); 248 | iq.range_end.set_timestamp(sec_end, 0); 249 | 250 | 251 | std::vector se; 252 | const auto &request = greylock::get_object(doc, "request"); 253 | if (!request.IsObject()) { 254 | send_error(swarm::http_response::bad_request, -EINVAL, "search: document must contain 'request' object"); 255 | return; 256 | } 257 | 258 | for (auto it = request.MemberBegin(), jse_end = request.MemberEnd(); it != jse_end; ++it) { 259 | if (!it->value.IsObject()) { 260 | send_error(swarm::http_response::bad_request, -EINVAL, 261 | "search: mailbox query '%s' must contain object", 262 | it->name.GetString()); 263 | return; 264 | } 265 | 266 | greylock::mailbox_query q(server()->db_indexes().options(), it->value); 267 | if (q.parse_error) { 268 | send_error(swarm::http_response::bad_request, q.parse_error.code(), 269 | "search: could not parse mailbox query: %s", 270 | q.parse_error.message().c_str()); 271 | return; 272 | } 273 | 274 | q.mbox.assign(it->name.GetString(), it->name.GetStringLength()); 275 | 276 | iq.se.emplace_back(std::move(q)); 277 | } 278 | 279 | greylock::search_result result; 280 | greylock::intersector inter(server()->db_docs(), server()->db_indexes()); 281 | result = inter.intersect(iq, std::bind(&on_search::check_result, this, std::ref(iq), std::placeholders::_1)); 282 | 283 | send_search_result(result); 284 | 285 | ILOG_INFO("search: query: %s, next_document_id: %s -> %s, indexes: %ld/%ld, completed: %d, duration: %d ms", 286 | iq.to_string().c_str(), 287 | iq.next_document_id.to_string().c_str(), result.next_document_id.to_string().c_str(), 288 | result.docs.size(), iq.max_number, 289 | result.completed, 
search_tm.elapsed()); 290 | } 291 | 292 | void pack_string_array(rapidjson::Value &parent, rapidjson::Document::AllocatorType &allocator, 293 | const char *name, const std::vector &data) { 294 | rapidjson::Value arr(rapidjson::kArrayType); 295 | for (const auto &s: data) { 296 | rapidjson::Value v(s.c_str(), s.size(), allocator); 297 | arr.PushBack(v, allocator); 298 | } 299 | 300 | parent.AddMember(name, arr, allocator); 301 | } 302 | 303 | template 304 | void pack_simple_array(rapidjson::Value &parent, rapidjson::Document::AllocatorType &allocator, 305 | const char *name, const std::vector &data) { 306 | rapidjson::Value arr(rapidjson::kArrayType); 307 | for (const auto &s: data) { 308 | arr.PushBack(s, allocator); 309 | } 310 | 311 | parent.AddMember(name, arr, allocator); 312 | } 313 | 314 | void send_search_result(const greylock::search_result &result) { 315 | greylock::JsonValue ret; 316 | auto &allocator = ret.GetAllocator(); 317 | 318 | rapidjson::Value ids(rapidjson::kArrayType); 319 | for (auto it = result.docs.begin(), end = result.docs.end(); it != end; ++it) { 320 | rapidjson::Value key(rapidjson::kObjectType); 321 | 322 | const greylock::document &doc = it->doc; 323 | 324 | rapidjson::Value idv(doc.id.c_str(), doc.id.size(), allocator); 325 | key.AddMember("id", idv, allocator); 326 | 327 | std::string id_str = doc.indexed_id.to_string(); 328 | rapidjson::Value indv(id_str.c_str(), id_str.size(), allocator); 329 | key.AddMember("indexed_id", indv, allocator); 330 | 331 | rapidjson::Value av(doc.author.c_str(), doc.author.size(), allocator); 332 | key.AddMember("author", av, allocator); 333 | 334 | rapidjson::Value cv(rapidjson::kObjectType); 335 | 336 | rapidjson::Value csv(doc.ctx.content.c_str(), doc.ctx.content.size(), allocator); 337 | cv.AddMember("content", csv, allocator); 338 | 339 | rapidjson::Value tsv(doc.ctx.title.c_str(), doc.ctx.title.size(), allocator); 340 | cv.AddMember("title", tsv, allocator); 341 | 342 | pack_string_array(cv, 
allocator, "links", doc.ctx.links); 343 | pack_string_array(cv, allocator, "images", doc.ctx.images); 344 | key.AddMember("content", cv, allocator); 345 | 346 | key.AddMember("relevance", it->relevance, allocator); 347 | 348 | long tsec, tnsec; 349 | doc.indexed_id.get_timestamp(&tsec, &tnsec); 350 | rapidjson::Value ts(rapidjson::kObjectType); 351 | ts.AddMember("tsec", tsec, allocator); 352 | ts.AddMember("tnsec", tnsec, allocator); 353 | key.AddMember("timestamp", ts, allocator); 354 | 355 | ids.PushBack(key, allocator); 356 | } 357 | 358 | ret.AddMember("ids", ids, allocator); 359 | ret.AddMember("completed", result.completed, allocator); 360 | 361 | std::string next_id_str = result.next_document_id.to_string(); 362 | rapidjson::Value nidv(next_id_str.c_str(), next_id_str.size(), allocator); 363 | ret.AddMember("next_document_id", nidv, allocator); 364 | 365 | std::string data = ret.ToString(); 366 | 367 | thevoid::http_response reply; 368 | reply.set_code(swarm::http_response::ok); 369 | reply.headers().set_content_type("text/json; charset=utf-8"); 370 | reply.headers().set_content_length(data.size()); 371 | 372 | this->send_reply(std::move(reply), std::move(data)); 373 | } 374 | }; 375 | 376 | struct on_index : public simple_request_stream_error { 377 | greylock::error_info process_one_document(greylock::document &doc) { 378 | doc.generate_token_keys(server()->db_indexes().options()); 379 | 380 | rocksdb::WriteBatch docs_batch, indexes_batch; 381 | 382 | std::string doc_serialized = serialize(doc); 383 | rocksdb::Slice doc_value(doc_serialized); 384 | 385 | greylock::document_for_index did; 386 | did.indexed_id = doc.indexed_id; 387 | std::string sdid = serialize(did); 388 | 389 | size_t indexes = 0; 390 | for (const auto &attr: doc.idx.attributes) { 391 | for (const auto &t: attr.tokens) { 392 | indexes_batch.Merge(rocksdb::Slice(t.key), rocksdb::Slice(sdid)); 393 | 394 | greylock::disk_token dt(t.shards); 395 | std::string dts = serialize(dt); 396 | 397 | 
indexes_batch.Merge(rocksdb::Slice(t.shard_key), rocksdb::Slice(dts)); 398 | 399 | indexes++; 400 | } 401 | } 402 | 403 | // we must have a copy, since otherwise batch will cache stall pointer to rvalue 404 | std::string dkey = doc.indexed_id.to_string(); 405 | docs_batch.Put(server()->db_docs().cfhandle(greylock::options::documents_column), rocksdb::Slice(dkey), doc_value); 406 | 407 | std::string doc_indexed_id_serialized = serialize(doc.indexed_id); 408 | docs_batch.Put(server()->db_docs().cfhandle(greylock::options::document_ids_column), 409 | rocksdb::Slice(doc.id), rocksdb::Slice(doc_indexed_id_serialized)); 410 | 411 | 412 | auto err = server()->db_docs().write(&docs_batch); 413 | if (err) { 414 | return greylock::create_error(err.code(), "could not write docs batch, mbox: %s, id: %s, error: %s", 415 | doc.mbox.c_str(), doc.id.c_str(), err.message().c_str()); 416 | } 417 | 418 | err = server()->db_indexes().write(&indexes_batch); 419 | if (err) { 420 | return greylock::create_error(err.code(), "could not write indexes batch, mbox: %s, id: %s, error: %s", 421 | doc.mbox.c_str(), doc.id.c_str(), err.message().c_str()); 422 | } 423 | 424 | ILOG_INFO("index: successfully indexed document: mbox: %s, id: %s, " 425 | "indexed_id: %s, indexes: %ld, serialized_doc_size: %ld", 426 | doc.mbox.c_str(), doc.id.c_str(), 427 | doc.indexed_id.to_string().c_str(), indexes, doc_value.size()); 428 | return greylock::error_info(); 429 | } 430 | 431 | template 432 | std::vector get_numeric_vector(const rapidjson::Value &data, const char *name) { 433 | std::vector ret; 434 | const auto &arr = greylock::get_array(data, name); 435 | if (!arr.IsArray()) 436 | return ret; 437 | 438 | for (auto it = arr.Begin(), end = arr.End(); it != end; it++) { 439 | if (it->IsNumber()) 440 | ret.push_back((T)it->GetDouble()); 441 | } 442 | 443 | return ret; 444 | } 445 | 446 | std::vector get_string_vector(const rapidjson::Value &ctx, const char *name) { 447 | std::vector ret; 448 | 449 | const 
auto &a = greylock::get_array(ctx, name); 450 | if (!a.IsArray()) 451 | return ret; 452 | 453 | for (auto it = a.Begin(), end = a.End(); it != end; ++it) { 454 | if (it->IsString()) 455 | ret.push_back(std::string(it->GetString(), it->GetStringLength())); 456 | } 457 | 458 | return ret; 459 | } 460 | greylock::error_info parse_content(const rapidjson::Value &ctx, greylock::document &doc) { 461 | doc.ctx.content = greylock::get_string(ctx, "content", ""); 462 | doc.ctx.title = greylock::get_string(ctx, "title", ""); 463 | doc.ctx.links = get_string_vector(ctx, "links"); 464 | doc.ctx.images = get_string_vector(ctx, "images"); 465 | 466 | return greylock::error_info(); 467 | } 468 | 469 | greylock::error_info parse_docs(const std::string &mbox, const rapidjson::Value &docs) { 470 | greylock::error_info err = greylock::create_error(-ENOENT, 471 | "parse_docs: mbox: %s: could not parse document, there are no valid index entries", mbox.c_str()); 472 | 473 | for (auto it = docs.Begin(), id_end = docs.End(); it != id_end; ++it) { 474 | if (!it->IsObject()) { 475 | return greylock::create_error(-EINVAL, "docs entries must be objects"); 476 | } 477 | 478 | const char *id = greylock::get_string(*it, "id"); 479 | const char *author = greylock::get_string(*it, "author"); 480 | if (!id) { 481 | return greylock::create_error(-EINVAL, "id must be string"); 482 | } 483 | 484 | struct timespec ts; 485 | clock_gettime(CLOCK_REALTIME, &ts); 486 | 487 | long tsec, tnsec; 488 | const rapidjson::Value ×tamp = greylock::get_object(*it, "timestamp"); 489 | if (timestamp.IsObject()) { 490 | tsec = greylock::get_int64(timestamp, "tsec", ts.tv_sec); 491 | tnsec = greylock::get_int64(timestamp, "tnsec", ts.tv_nsec); 492 | } else { 493 | tsec = ts.tv_sec; 494 | tnsec = ts.tv_nsec; 495 | } 496 | 497 | 498 | greylock::document doc; 499 | doc.mbox = mbox; 500 | doc.assign_id(id, std::hash{}(id), tsec, tnsec); 501 | 502 | if (author) { 503 | doc.author.assign(author); 504 | } 505 | 506 | const 
rapidjson::Value &ctx = greylock::get_object(*it, "content"); 507 | if (ctx.IsObject()) { 508 | err = parse_content(ctx, doc); 509 | if (err) 510 | return err; 511 | } 512 | 513 | const rapidjson::Value &idxs = greylock::get_object(*it, "index"); 514 | if (!idxs.IsObject()) { 515 | return greylock::create_error(-EINVAL, "docs/index must be array"); 516 | } 517 | 518 | doc.idx = greylock::indexes::get_indexes(server()->db_indexes().options(), idxs); 519 | 520 | err = process_one_document(doc); 521 | if (err) 522 | return err; 523 | } 524 | 525 | return err; 526 | } 527 | 528 | virtual void on_request(const thevoid::http_request &req, const boost::asio::const_buffer &buffer) { 529 | (void) req; 530 | ribosome::timer index_tm; 531 | 532 | // this is needed to put ending zero-byte, otherwise rapidjson parser will explode 533 | std::string data(const_cast(boost::asio::buffer_cast(buffer)), 534 | boost::asio::buffer_size(buffer)); 535 | 536 | rapidjson::Document doc; 537 | doc.Parse<0>(data.c_str()); 538 | 539 | if (doc.HasParseError()) { 540 | send_error(swarm::http_response::bad_request, -EINVAL, 541 | "index: could not parse document: %s, error offset: %d", 542 | doc.GetParseError(), doc.GetErrorOffset()); 543 | return; 544 | } 545 | 546 | if (!doc.IsObject()) { 547 | send_error(swarm::http_response::bad_request, -EINVAL, "index: document must be object, its type: %d", 548 | doc.GetType()); 549 | return; 550 | } 551 | 552 | const char *mbox = greylock::get_string(doc, "mailbox"); 553 | if (!mbox) { 554 | send_error(swarm::http_response::bad_request, -ENOENT, "index: 'mailbox' must be a string"); 555 | this->send_reply(swarm::http_response::bad_request); 556 | return; 557 | } 558 | 559 | const rapidjson::Value &docs = greylock::get_array(doc, "docs"); 560 | if (!docs.IsArray()) { 561 | send_error(swarm::http_response::bad_request, -ENOENT, "index: mailbox: %s, 'docs' must be array", mbox); 562 | return; 563 | } 564 | 565 | greylock::error_info err = parse_docs(mbox, 
docs); 566 | if (err) { 567 | send_error(swarm::http_response::bad_request, err.code(), 568 | "index: mailbox: %s, keys: %d: insertion error: %s", 569 | mbox, docs.Size(), err.message()); 570 | return; 571 | } 572 | 573 | ILOG_INFO("index: mailbox: %s, keys: %d: insertion completed, index duration: %d ms", 574 | mbox, docs.Size(), index_tm.elapsed()); 575 | this->send_reply(thevoid::http_response::ok); 576 | } 577 | }; 578 | 579 | greylock::database &db_docs() { 580 | return m_db_docs; 581 | } 582 | greylock::database &db_indexes() { 583 | return m_db_indexes; 584 | } 585 | 586 | private: 587 | greylock::database m_db_docs, m_db_indexes; 588 | 589 | bool rocksdb_init(const rapidjson::Value &config) { 590 | const auto &rdbconf = greylock::get_object(config, "rocksdb.docs"); 591 | if (!rdbconf.IsObject()) { 592 | ILOG_ERROR("there is no 'rocksdb.docs' object in config"); 593 | return false; 594 | } 595 | 596 | const auto &riconf = greylock::get_object(config, "rocksdb.indexes"); 597 | if (!riconf.IsObject()) { 598 | ILOG_ERROR("there is no 'rocksdb.indexes' object in config"); 599 | return false; 600 | } 601 | 602 | if (!rocksdb_config_parse(rdbconf, &m_db_docs)) 603 | return false; 604 | 605 | if (!rocksdb_config_parse(riconf, &m_db_indexes)) 606 | return false; 607 | 608 | return true; 609 | } 610 | 611 | bool rocksdb_config_parse(const rapidjson::Value &config, greylock::database *db) { 612 | const char *path = greylock::get_string(config, "path"); 613 | if (!path) { 614 | ILOG_ERROR("there is no 'path' string in rocksdb config"); 615 | return false; 616 | } 617 | bool ro = greylock::get_bool(config, "read_only", false); 618 | bool bulk = greylock::get_bool(config, "bulk_upload", false); 619 | 620 | auto err = db->open(path, ro, bulk); 621 | if (err) { 622 | ILOG_ERROR("could not open database: %s [%d]", err.message().c_str(), err.code()); 623 | return false; 624 | } 625 | 626 | return true; 627 | } 628 | }; 629 | 630 | int main(int argc, char **argv) 631 | { 632 
	ioremap::ribosome::set_locale("en_US.UTF8");

	// stop on INT/TERM, reload on HUP, explicitly ignore USR1/USR2
	ioremap::thevoid::register_signal_handler(SIGINT, ioremap::thevoid::handle_stop_signal);
	ioremap::thevoid::register_signal_handler(SIGTERM, ioremap::thevoid::handle_stop_signal);
	ioremap::thevoid::register_signal_handler(SIGHUP, ioremap::thevoid::handle_reload_signal);
	ioremap::thevoid::register_signal_handler(SIGUSR1, ioremap::thevoid::handle_ignore_signal);
	ioremap::thevoid::register_signal_handler(SIGUSR2, ioremap::thevoid::handle_ignore_signal);

	ioremap::thevoid::run_signal_thread();

	// NOTE(review): create_server's template argument (the http server class
	// defined above, outside this chunk) appears to have been stripped by
	// extraction — confirm against the original source before building
	auto server = ioremap::thevoid::create_server();
	// run() blocks until the server is stopped; its return value is the exit code
	int err = server->run(argc, argv);

	ioremap::thevoid::stop_signal_thread();

	return err;
}