├── .dockerignore
├── .gitignore
├── CMakeLists.txt
├── Dockerfile
├── LICENSE
├── cmake
│   ├── GreylockConfig.cmake.in
│   └── locate_library.cmake
├── conf
│   └── greylock.conf
├── debian
│   ├── changelog
│   ├── compat
│   ├── control
│   ├── copyright
│   ├── dirs
│   ├── docs
│   ├── greylock-dev.install
│   ├── greylock.install
│   └── rules
├── greylock-bf.spec
├── include
│   └── greylock
│       ├── database.hpp
│       ├── error.hpp
│       ├── id.hpp
│       ├── intersection.hpp
│       ├── iterator.hpp
│       ├── json.hpp
│       ├── jsonvalue.hpp
│       ├── types.hpp
│       └── utils.hpp
└── src
    ├── CMakeLists.txt
    ├── check.cpp
    ├── compact.cpp
    ├── exception.cpp
    ├── list.cpp
    ├── merge.cpp
    ├── meta.cpp
    └── server.cpp
/.dockerignore: -------------------------------------------------------------------------------- 1 | build 2 | tags 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.a 2 | *.o 3 | *.py[co] 4 | *.so 5 | *.so.* 6 | *.tar.gz 7 | .*.sw* 8 | *~ 9 | CMakeCache.txt 10 | CMakeFiles 11 | build 12 | cmake_install.cmake 13 | install_manifest.txt 14 | tags 15 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 2.6) 2 | project (greylock) 3 | 4 | FILE (READ "${CMAKE_CURRENT_SOURCE_DIR}/debian/changelog" DEBCHANGELOG) 5 | 6 | string(REGEX MATCH "([0-9]+\\.[0-9]+\\.[0-9]+)" DEBFULLVERSION "${DEBCHANGELOG}") 7 | STRING (REGEX MATCH "([0-9]+\\.[0-9]+)" GREYLOCK_MAJOR_VERSION "${DEBFULLVERSION}") 8 | SET(GREYLOCK_FULL_VERSION ${DEBFULLVERSION}) 9 | 10 | set(CMAKE_CXX_FLAGS "-g -std=c++0x -W -Wall -Wextra -fstack-protector-all") 11 | 12 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/") 13 | 14 | find_package(Boost REQUIRED COMPONENTS system program_options filesystem thread) 15 | find_package(Ribosome REQUIRED) 16 | 17 | 
INCLUDE(cmake/locate_library.cmake) 18 | 19 | LOCATE_LIBRARY(JEMALLOC "jemalloc/jemalloc.h" "jemalloc") 20 | LOCATE_LIBRARY(MSGPACK "msgpack.hpp" "msgpack") 21 | LOCATE_LIBRARY(THEVOID "thevoid/server.hpp" "thevoid") 22 | LOCATE_LIBRARY(SWARM "swarm/http_request.hpp" "swarm") 23 | LOCATE_LIBRARY(ROCKSDB "rocksdb/db.h" "rocksdb") 24 | 25 | FILE(GLOB headers 26 | "${CMAKE_CURRENT_SOURCE_DIR}/include/greylock/*.hpp" 27 | "${CMAKE_CURRENT_SOURCE_DIR}/include/greylock/*.h" 28 | 29 | ) 30 | install(FILES ${headers} DESTINATION include/greylock) 31 | 32 | configure_file(cmake/GreylockConfig.cmake.in "${PROJECT_BINARY_DIR}/cmake/GreylockConfig.cmake" @ONLY) 33 | install(FILES "${PROJECT_BINARY_DIR}/cmake/GreylockConfig.cmake" DESTINATION share/greylock/cmake) 34 | 35 | include_directories(${PROJECT_SOURCE_DIR}/include 36 | ${Boost_INCLUDE_DIRS} 37 | ${MSGPACK_INCLUDE_DIRS} 38 | ${RIBOSOME_INCLUDE_DIRS} 39 | ${ROCKSDB_INCLUDE_DIRS} 40 | ${SWARM_INCLUDE_DIRS} 41 | ${THEVOID_INCLUDE_DIRS} 42 | ) 43 | 44 | add_subdirectory(src) 45 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM reverbrain/xenial-dev 2 | 3 | #RUN echo "deb http://repo.reverbrain.com/trusty/ current/amd64/" > /etc/apt/sources.list.d/reverbrain.list && \ 4 | # echo "deb http://repo.reverbrain.com/trusty/ current/all/" >> /etc/apt/sources.list.d/reverbrain.list && \ 5 | # apt-get install -y curl tzdata && \ 6 | # cp -f /usr/share/zoneinfo/posix/W-SU /etc/localtime && \ 7 | # curl http://repo.reverbrain.com/REVERBRAIN.GPG | apt-key add - && \ 8 | # apt-get update && \ 9 | # apt-get upgrade -y && \ 10 | # apt-get install -y git g++ liblz4-dev libsnappy-dev zlib1g-dev libbz2-dev libzstd-dev libgflags-dev libjemalloc-dev && \ 11 | # apt-get install -y cmake debhelper cdbs devscripts && \ 12 | # apt-get install -y libboost-system-dev libboost-filesystem-dev 
libboost-program-options-dev && \ 13 | # apt-get install -y libmsgpack-dev libswarm3-dev libthevoid3-dev ribosome-dev && \ 14 | # git config --global user.email "zbr@ioremap.net" && \ 15 | # git config --global user.name "Evgeniy Polyakov" 16 | 17 | #RUN cd /tmp && \ 18 | # git clone https://github.com/facebook/rocksdb && \ 19 | # cd rocksdb && \ 20 | # PORTABLE=1 make shared_lib && \ 21 | # make INSTALL_PATH=/usr install-shared && \ 22 | # echo "Rocksdb package has been updated and installed" 23 | 24 | RUN cd /tmp && \ 25 | rm -rf ribosome && \ 26 | git clone https://github.com/reverbrain/ribosome && \ 27 | cd ribosome && \ 28 | git branch -v && \ 29 | dpkg-buildpackage -b && \ 30 | dpkg -i ../ribosome*.deb && \ 31 | echo "Ribosome package has been updated and installed" && \ 32 | 33 | cd /tmp && \ 34 | rm -rf greylock && \ 35 | git clone https://github.com/reverbrain/greylock && \ 36 | cd greylock && \ 37 | git branch -v && \ 38 | dpkg-buildpackage -b && \ 39 | dpkg -i ../greylock_*.deb ../greylock-dev_*.deb && \ 40 | echo "Greylock package has been updated and installed" && \ 41 | rm -rf /var/lib/apt/lists/* 42 | 43 | EXPOSE 8080 8181 8111 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. 
By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. 
For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 
83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 
117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 
146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 
180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 
216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 
246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. 
If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 
309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 
336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 
360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. 
If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 
428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | {one line to give the program's name and a brief idea of what it does.} 635 | Copyright (C) {year} {name of author} 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see <https://www.gnu.org/licenses/>. 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | {project} Copyright (C) {year} {fullname} 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | <https://www.gnu.org/licenses/>. 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 
675 | -------------------------------------------------------------------------------- /cmake/GreylockConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # - Config file for the Elliptics package 2 | # It defines the following variables 3 | # GREYLOCK_INCLUDE_DIRS - include directories for Elliptics 4 | # GREYLOCK_LIBRARY_DIRS - library directories 5 | # GREYLOCK_LIBRARIES - libraries to link against 6 | 7 | get_filename_component(GREYLOCK_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) 8 | 9 | set(GREYLOCK_INCLUDE_DIRS @INSTALL_INCLUDE_DIR@ 10 | @LZ4_INCLUDE_DIRS@ 11 | @MSGPACK_INCLUDE_DIRS@ 12 | @ROCKSDB_INCLUDE_DIRS@ 13 | @SWARM_INCLUDE_DIRS@ 14 | @THEVOID_INCLUDE_DIRS@ 15 | ) 16 | 17 | set(GREYLOCK_LIBRARY_DIRS 18 | @LZ4_LIBRARY_DIRS@ 19 | @MSGPACK_LIBRARY_DIRS@ 20 | @ROCKSDB_LIBRARY_DIRS@ 21 | @SWARM_LIBRARY_DIRS@ 22 | @THEVOID_LIBRARY_DIRS@ 23 | ) 24 | 25 | set(GREYLOCK_LIBRARIES 26 | @LZ4_LIBRARIES@ 27 | @MSGPACK_LIBRARIES@ 28 | @ROCKSDB_LIBRARIES@ 29 | @SWARM_LIBRARIES@ 30 | @THEVOID_LIBRARIES@ 31 | greylock 32 | ) 33 | -------------------------------------------------------------------------------- /cmake/locate_library.cmake: -------------------------------------------------------------------------------- 1 | FUNCTION(LOCATE_LIBRARY VARIABLE HEADER LIBRARY) 2 | IF(${VARIABLE}_INCLUDE_DIRS AND ${VARIABLE}_LIBRARY_DIRS) 3 | RETURN() 4 | ENDIF() 5 | FIND_PATH(${VARIABLE}_INCLUDE_DIRS NAMES ${HEADER} PATH_SUFFIXES ${ARGN}) 6 | message("header: ${HEADER}, arguments: ${ARGN} ==> ${${VARIABLE}_INCLUDE_DIRS}") 7 | FIND_LIBRARY(${VARIABLE}_LIBRARIES NAMES ${LIBRARY} PATH_SUFFIXES ${ARGN}) 8 | message("library: ${LIBRARY}, arguments: ${ARGN} ==> ${${VARIABLE}_LIBRARIES}") 9 | 10 | STRING(TOLOWER ${VARIABLE} LIBRARY_NAME) 11 | 12 | IF(NOT ${VARIABLE}_INCLUDE_DIRS OR NOT ${VARIABLE}_LIBRARIES) 13 | MESSAGE(FATAL_ERROR "${LIBRARY_NAME} development files are required to build.") 14 | ELSE() 15 | MESSAGE(STATUS "Found 
${LIBRARY_NAME}: ${${VARIABLE}_LIBRARIES} - ${${VARIABLE}_INCLUDE_DIRS}") 16 | ENDIF() 17 | ENDFUNCTION() 18 | 19 | FUNCTION(LOCATE_HEADERS VARIABLE HEADER) 20 | IF(${VARIABLE}_INCLUDE_DIRS) 21 | RETURN() 22 | ENDIF() 23 | 24 | FIND_PATH(${VARIABLE}_INCLUDE_DIRS NAMES ${HEADER} PATH_SUFFIXES ${ARGN}) 25 | message("header: ${HEADER}, arguments: ${ARGN} ==> ${${VARIABLE}_INCLUDE_DIRS}") 26 | 27 | IF(NOT ${VARIABLE}_INCLUDE_DIRS) 28 | MESSAGE(FATAL_ERROR "${LIBRARY_NAME} development files (headers) are required to build.") 29 | ENDIF() 30 | 31 | MESSAGE(STATUS "Found ${HEADER}: ${${VARIABLE}_INCLUDE_DIRS}") 32 | ENDFUNCTION() 33 | -------------------------------------------------------------------------------- /conf/greylock.conf: -------------------------------------------------------------------------------- 1 | { 2 | "endpoints": [ 3 | "0.0.0.0:8181" 4 | ], 5 | "backlog": 512, 6 | "threads": 10, 7 | "buffer_size": 65536, 8 | "logger": { 9 | "level": "info", 10 | "frontends": [ 11 | { 12 | "formatter": { 13 | "type": "string", 14 | "pattern": "%(timestamp)s %(request_id)s/%(lwp)s/%(pid)s %(severity)s: %(message)s, %(...L)s" 15 | }, 16 | "sink": { 17 | "type": "files", 18 | "path": "/dev/stdout", 19 | "path1": "greylock.log", 20 | "autoflush": true, 21 | "rotation": { "move": 0 } 22 | } 23 | } 24 | ] 25 | }, 26 | "daemon": { 27 | "fork": false, 28 | "uid": 1000 29 | }, 30 | "monitor-port": 21235, 31 | "request_header": "X-Request", 32 | "trace_header": "X-Trace", 33 | "application": { 34 | "rocksdb.docs": { 35 | "read_only": false, 36 | "bulk_upload": false, 37 | "path": "/mnt/disk/search/lj/rocksdb.docs" 38 | }, 39 | "rocksdb.indexes": { 40 | "read_only": false, 41 | "bulk_upload": false, 42 | "path": "/mnt/disk/search/lj/rocksdb.indexes" 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /debian/changelog: -------------------------------------------------------------------------------- 1 | greylock (1.1.0) 
unstable; urgency=low 2 | 3 | * Added date/time search 4 | * Added exact phrase search 5 | * Added negation support 6 | * Added pagination support 7 | 8 | -- Evgeniy Polyakov Tue, 09 Aug 2016 01:24:04 +0400 9 | 10 | greylock (1.0.0) unstable; urgency=low 11 | 12 | * Rewrite greylock search engine to use local rocksdb storage. It is not distributed search so far. 13 | 14 | -- Evgeniy Polyakov Thu, 28 Jul 2016 08:59:06 +0400 15 | 16 | -------------------------------------------------------------------------------- /debian/compat: -------------------------------------------------------------------------------- 1 | 7 2 | -------------------------------------------------------------------------------- /debian/control: -------------------------------------------------------------------------------- 1 | Source: greylock 2 | Section: net 3 | Priority: optional 4 | Maintainer: Evgeniy Polyakov 5 | Build-Depends: 6 | cdbs, 7 | cmake (>= 2.6), 8 | debhelper (>= 7.0.50~), 9 | ribosome-dev (>= 0.2.8), 10 | libboost-dev, 11 | libboost-system-dev, 12 | libboost-program-options-dev, 13 | libboost-filesystem-dev, 14 | libjemalloc-dev, 15 | libmsgpack-dev, 16 | liblz4-dev, 17 | libswarm3-dev, 18 | libthevoid3-dev, 19 | zlib1g-dev, 20 | libbz2-dev, 21 | libsnappy-dev 22 | Standards-Version: 3.8.0 23 | Homepage: http://www.reverbrain.com/ 24 | Vcs-Git: git://github.com/reverbrain/greylock.git 25 | Vcs-Browser: https://github.com/reverbrain/greylock 26 | 27 | Package: greylock 28 | Architecture: any 29 | Depends: ${shlibs:Depends}, ${misc:Depends} 30 | Description: Greylock is a local searching/indexing engine 31 | 32 | Package: greylock-dev 33 | Architecture: any 34 | Depends: ${shlibs:Depends}, ${misc:Depends}, 35 | ribosome-dev (>= 0.2.8), 36 | libboost-dev, 37 | libboost-system-dev, 38 | libboost-program-options-dev, 39 | libboost-filesystem-dev, 40 | libjemalloc-dev, 41 | libmsgpack-dev, 42 | liblz4-dev, 43 | libswarm3-dev, 44 | libthevoid3-dev, 45 | zlib1g-dev, 46 | libbz2-dev, 
47 | libsnappy-dev 48 | Description: Development files for greylock search engine 49 | -------------------------------------------------------------------------------- /debian/copyright: -------------------------------------------------------------------------------- 1 | Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ 2 | Upstream-Name: greylock 3 | Upstream-Contact: Evgeniy Polyakov 4 | Source: https://github.com/reverbrain/greylock 5 | 6 | Files: * 7 | Copyright: (C) 2015+ Evgeniy Polyakov 8 | License: GPL-3.0 9 | -------------------------------------------------------------------------------- /debian/dirs: -------------------------------------------------------------------------------- 1 | usr/bin 2 | usr/sbin 3 | -------------------------------------------------------------------------------- /debian/docs: -------------------------------------------------------------------------------- 1 | conf/ 2 | -------------------------------------------------------------------------------- /debian/greylock-dev.install: -------------------------------------------------------------------------------- 1 | usr/include/greylock/* 2 | usr/share/greylock/* 3 | usr/lib/libgreylock.so 4 | -------------------------------------------------------------------------------- /debian/greylock.install: -------------------------------------------------------------------------------- 1 | usr/bin/greylock_* 2 | usr/lib/libgreylock.so.* 3 | -------------------------------------------------------------------------------- /debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | 3 | include /usr/share/cdbs/1/rules/debhelper.mk 4 | include /usr/share/cdbs/1/class/cmake.mk 5 | 6 | DEB_CMAKE_EXTRA_FLAGS= 7 | DEB_DH_SHLIBDEPS_ARGS_ALL= --dpkg-shlibdeps-params=--ignore-missing-info 8 | 9 | install/greylock-dev:: 10 | 11 | -------------------------------------------------------------------------------- 
/greylock-bf.spec: -------------------------------------------------------------------------------- 1 | Summary: Greylock is an embedded search engine 2 | Name: greylock 3 | Version: 1.1.0 4 | Release: 1%{?dist}.1 5 | 6 | License: GPLv3 7 | Group: System Environment/Libraries 8 | URL: http://reverbrain.com/ 9 | Source0: %{name}-%{version}.tar.bz2 10 | BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) 11 | 12 | 13 | BuildRequires: ribosome-devel 14 | BuildRequires: libswarm3-devel, libthevoid3-devel 15 | BuildRequires: boost-devel, boost-system, boost-program-options, boost-filesystem 16 | BuildRequires: jemalloc-devel, msgpack-devel, lz4-devel 17 | BuildRequires: cmake >= 2.6 18 | 19 | %description 20 | Greylock is an embedded search engine which is aimed at index size and performace. 21 | Index of 200k livejournal.com entries (200Mb of uncompressed data) takes about 450Mb, 22 | index includes: full-text and per-author search indexes, original content, stemmed and original content. 23 | 24 | %package devel 25 | Summary: Development files for %{name} 26 | Group: Development/Libraries 27 | Requires: %{name} = %{version}-%{release} 28 | 29 | 30 | %description devel 31 | Greylock is an embedded search engine which is aimed at index size and performace. 32 | 33 | This package contains libraries, header files and developer documentation 34 | needed for developing software which uses greylock utils. 35 | 36 | %prep 37 | %setup -q 38 | 39 | %build 40 | export LDFLAGS="-Wl,-z,defs" 41 | export DESTDIR="%{buildroot}" 42 | %{cmake} . 
43 | make %{?_smp_mflags} 44 | 45 | %install 46 | rm -rf %{buildroot} 47 | make install DESTDIR="%{buildroot}" 48 | 49 | %post -p /sbin/ldconfig 50 | %postun -p /sbin/ldconfig 51 | 52 | %clean 53 | rm -rf %{buildroot} 54 | 55 | %files 56 | %defattr(-,root,root,-) 57 | %{_bindir}/greylock_* 58 | %{_libdir}/libgreylock.so.* 59 | %doc conf/ 60 | 61 | 62 | %files devel 63 | %defattr(-,root,root,-) 64 | %{_includedir}/* 65 | %{_datadir}/greylock/cmake/* 66 | %{_libdir}/libgreylock.so 67 | 68 | %changelog 69 | * Tue Aug 09 2016 Evgeniy Polyakov - 1.1.0 70 | - Added date/time search 71 | - Added exact phrase search 72 | - Added negation support 73 | - Added pagination support 74 | 75 | * Thu Jul 28 2016 Evgeniy Polyakov - 1.0.0 76 | - Rewrite greylock search engine to use local rocksdb storage. It is not distributed search so far. 77 | 78 | -------------------------------------------------------------------------------- /include/greylock/database.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "greylock/error.hpp" 4 | #include "greylock/id.hpp" 5 | #include "greylock/utils.hpp" 6 | 7 | #include 8 | 9 | #pragma GCC diagnostic push 10 | #pragma GCC diagnostic ignored "-Wunused-parameter" 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #pragma GCC diagnostic pop 22 | 23 | #include 24 | 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | 31 | namespace ioremap { namespace greylock { 32 | 33 | struct options { 34 | size_t tokens_shard_size = 3600 * 1 * 24; 35 | 36 | int max_threads = 8; 37 | 38 | int bits_per_key = 10; // bloom filter parameter 39 | 40 | long lru_cache_size = 100 * 1024 * 1024; // 100 MB of uncompressed data cache 41 | 42 | long sync_metadata_timeout = 60000; // 60 seconds 43 | 44 | // mininmum size of the token which will go into separate index, 45 | // if token size is 
smaller, it will be combined into 2 indexes 46 | // with the previous and next tokens. 47 | // This options greatly speeds up requests with small words (like [to be or not to be]), 48 | // but heavily increases index size. 49 | unsigned int ngram_index_size = 0; 50 | 51 | enum { 52 | default_column = 0, 53 | documents_column, 54 | document_ids_column, 55 | token_shards_column, 56 | indexes_column, 57 | meta_column, 58 | __column_size, 59 | }; 60 | 61 | std::vector column_names; 62 | std::string metadata_key; 63 | 64 | options(): metadata_key("greylock.meta.key") { 65 | column_names.resize(__column_size); 66 | column_names[default_column] = rocksdb::kDefaultColumnFamilyName; 67 | column_names[documents_column] = "documents"; 68 | column_names[document_ids_column] = "document_ids"; 69 | column_names[token_shards_column] = "token_shards"; 70 | column_names[indexes_column] = "indexes"; 71 | column_names[meta_column] = "meta"; 72 | } 73 | 74 | std::string column_name(int cnum) const { 75 | if (cnum < 0 || cnum >= __column_size) 76 | return ""; 77 | 78 | return column_names[cnum]; 79 | } 80 | }; 81 | 82 | class metadata { 83 | public: 84 | metadata() : m_dirty(false), m_seq(0) {} 85 | 86 | bool dirty() const { 87 | return m_dirty; 88 | } 89 | void clear_dirty() { 90 | m_dirty = false; 91 | } 92 | 93 | long get_sequence() { 94 | m_dirty = true; 95 | return m_seq++; 96 | } 97 | 98 | void set_sequence(long seq) { 99 | m_dirty = true; 100 | m_seq = seq; 101 | } 102 | 103 | enum { 104 | serialize_version_2 = 2, 105 | }; 106 | 107 | template 108 | void msgpack_pack(msgpack::packer &o) const { 109 | o.pack_array(metadata::serialize_version_2); 110 | o.pack((int)metadata::serialize_version_2); 111 | o.pack(m_seq.load()); 112 | } 113 | 114 | void msgpack_unpack(msgpack::object o) { 115 | if (o.type != msgpack::type::ARRAY) { 116 | std::ostringstream ss; 117 | ss << "could not unpack metadata, object type is " << o.type << 118 | ", must be array (" << msgpack::type::ARRAY << ")"; 
119 | throw std::runtime_error(ss.str()); 120 | } 121 | 122 | int version; 123 | long seq; 124 | 125 | msgpack::object *p = o.via.array.ptr; 126 | p[0].convert(&version); 127 | 128 | if (version != (int)o.via.array.size) { 129 | std::ostringstream ss; 130 | ss << "could not unpack document, invalid version: " << version << ", array size: " << o.via.array.size; 131 | throw std::runtime_error(ss.str()); 132 | } 133 | 134 | switch (version) { 135 | case metadata::serialize_version_2: 136 | p[1].convert(&seq); 137 | m_seq.store(seq); 138 | break; 139 | default: { 140 | std::ostringstream ss; 141 | ss << "could not unpack metadata, invalid version " << version; 142 | throw std::runtime_error(ss.str()); 143 | } 144 | } 145 | } 146 | 147 | private: 148 | bool m_dirty; 149 | std::atomic_long m_seq; 150 | }; 151 | 152 | struct document_for_index { 153 | id_t indexed_id; 154 | MSGPACK_DEFINE(indexed_id); 155 | 156 | bool operator<(const document_for_index &other) const { 157 | return indexed_id < other.indexed_id; 158 | } 159 | }; 160 | 161 | namespace { 162 | static const uint32_t disk_cookie = 0x45589560; 163 | } 164 | 165 | struct disk_index { 166 | typedef document_for_index value_type; 167 | typedef document_for_index& reference; 168 | typedef document_for_index* pointer; 169 | 170 | std::vector ids; 171 | 172 | template 173 | void msgpack_pack(msgpack::packer &o) const { 174 | o.pack_array(2); 175 | o.pack(disk_cookie); 176 | o.pack(ids); 177 | } 178 | 179 | void msgpack_unpack(msgpack::object o) { 180 | if (o.type != msgpack::type::ARRAY) { 181 | std::ostringstream ss; 182 | ss << "could not unpack disk index, object type is " << o.type << 183 | ", must be array (" << msgpack::type::ARRAY << ")"; 184 | throw std::runtime_error(ss.str()); 185 | } 186 | 187 | uint32_t cookie; 188 | 189 | msgpack::object *p = o.via.array.ptr; 190 | p[0].convert(&cookie); 191 | 192 | if (cookie != disk_cookie) { 193 | std::ostringstream ss; 194 | ss << "could not unpack disk index, cookie 
mismatch: " << std::hex << cookie << 195 | ", must be: " << std::hex << disk_cookie; 196 | throw std::runtime_error(ss.str()); 197 | } 198 | 199 | p[1].convert(&ids); 200 | } 201 | }; 202 | 203 | struct disk_token { 204 | std::vector shards; 205 | MSGPACK_DEFINE(shards); 206 | 207 | disk_token() {} 208 | disk_token(const std::set &s): shards(s.begin(), s.end()) {} 209 | disk_token(const std::vector &s): shards(s) {} 210 | }; 211 | 212 | class indexes_merge_operator : public rocksdb::MergeOperator { 213 | public: 214 | virtual const char* Name() const override { 215 | return "indexes_merge_operator"; 216 | } 217 | 218 | bool merge_indexes(const rocksdb::Slice& key, const rocksdb::Slice* old_value, 219 | const std::deque& operand_list, 220 | std::string* new_value, 221 | rocksdb::Logger *logger) const { 222 | 223 | disk_index index; 224 | greylock::error_info err; 225 | std::set unique_index; 226 | size_t ocount = 0; 227 | 228 | if (old_value) { 229 | err = deserialize(index, old_value->data(), old_value->size()); 230 | if (err) { 231 | rocksdb::Error(logger, "merge: key: %s, index deserialize failed: %s [%d]", 232 | key.ToString().c_str(), err.message().c_str(), err.code()); 233 | return false; 234 | } 235 | 236 | unique_index.insert(index.ids.begin(), index.ids.end()); 237 | ocount = unique_index.size(); 238 | } 239 | 240 | for (const auto& value : operand_list) { 241 | msgpack::unpacked msg; 242 | msgpack::unpack(&msg, value.data(), value.size()); 243 | 244 | try { 245 | msgpack::object o = msg.get(); 246 | 247 | if (o.type != msgpack::type::ARRAY) { 248 | document_for_index did; 249 | o.convert(&did); 250 | unique_index.emplace(did); 251 | continue; 252 | } 253 | 254 | disk_index idx; 255 | o.convert(&idx); 256 | 257 | unique_index.insert(idx.ids.begin(), idx.ids.end()); 258 | } catch (const std::exception &e) { 259 | rocksdb::Error(logger, "merge: key: %s, document deserialize failed: %s", 260 | key.ToString().c_str(), e.what()); 261 | return false; 262 | } 263 
| } 264 | 265 | index.ids.clear(); 266 | index.ids.insert(index.ids.end(), unique_index.begin(), unique_index.end()); 267 | *new_value = serialize(index); 268 | 269 | if (new_value->size() > 1024 * 1024) { 270 | size_t osize = 0; 271 | if (old_value) 272 | osize = old_value->size(); 273 | rocksdb::Info(logger, "index_merge: key: %s, size: %ld -> %ld, counts: %ld -> %ld", 274 | key.ToString().c_str(), osize, new_value->size(), ocount, index.ids.size()); 275 | } 276 | 277 | return true; 278 | } 279 | 280 | virtual bool FullMerge(const rocksdb::Slice& key, const rocksdb::Slice* old_value, 281 | const std::deque& operand_list, 282 | std::string* new_value, 283 | rocksdb::Logger *logger) const override { 284 | return merge_indexes(key, old_value, operand_list, new_value, logger); 285 | } 286 | 287 | virtual bool PartialMerge(const rocksdb::Slice& key, 288 | const rocksdb::Slice& left_operand, const rocksdb::Slice& right_operand, 289 | std::string* new_value, 290 | rocksdb::Logger* logger) const { 291 | #if 0 292 | auto dump = [](const rocksdb::Slice &v) { 293 | std::ostringstream ss; 294 | 295 | msgpack::unpacked msg; 296 | msgpack::unpack(&msg, v.data(), v.size()); 297 | 298 | ss << msg.get(); 299 | return ss.str(); 300 | }; 301 | 302 | printf("partial merge: key: %s, left: %s, right: %s\n", 303 | key.ToString().c_str(), dump(left_operand).c_str(), dump(right_operand).c_str()); 304 | #endif 305 | (void) key; 306 | (void) left_operand; 307 | (void) right_operand; 308 | (void) new_value; 309 | (void) logger; 310 | 311 | return false; 312 | } 313 | }; 314 | 315 | class token_shards_merge_operator : public rocksdb::MergeOperator { 316 | public: 317 | virtual const char* Name() const override { 318 | return "token_shards_merge_operator"; 319 | } 320 | 321 | bool merge_token_shards(const rocksdb::Slice& key, const rocksdb::Slice* old_value, 322 | const std::deque& operand_list, 323 | std::string* new_value, 324 | rocksdb::Logger *logger) const { 325 | 326 | disk_token dt; 
327 | std::set shards; 328 | greylock::error_info err; 329 | 330 | if (old_value) { 331 | err = deserialize(dt, old_value->data(), old_value->size()); 332 | if (err) { 333 | rocksdb::Error(logger, "merge: key: %s, disk_token deserialize failed: %s [%d]", 334 | key.ToString().c_str(), err.message().c_str(), err.code()); 335 | return false; 336 | } 337 | 338 | shards.insert(dt.shards.begin(), dt.shards.end()); 339 | } 340 | 341 | for (const auto& value : operand_list) { 342 | disk_token s; 343 | err = deserialize(s, value.data(), value.size()); 344 | if (err) { 345 | rocksdb::Error(logger, "merge: key: %s, disk_token operand deserialize failed: %s [%d]", 346 | key.ToString().c_str(), err.message().c_str(), err.code()); 347 | return false; 348 | } 349 | 350 | shards.insert(s.shards.begin(), s.shards.end()); 351 | } 352 | 353 | dt.shards = std::vector(shards.begin(), shards.end()); 354 | *new_value = serialize(dt); 355 | 356 | if (new_value->size() > 1024 * 1024) { 357 | size_t osize = 0; 358 | if (old_value) { 359 | osize = old_value->size(); 360 | } 361 | 362 | rocksdb::Warn(logger, "shard_merge: key: %s, size: %ld -> %ld", 363 | key.ToString().c_str(), osize, new_value->size()); 364 | } 365 | 366 | return true; 367 | } 368 | 369 | virtual bool FullMerge(const rocksdb::Slice& key, const rocksdb::Slice* old_value, 370 | const std::deque& operand_list, 371 | std::string* new_value, 372 | rocksdb::Logger *logger) const override { 373 | return merge_token_shards(key, old_value, operand_list, new_value, logger); 374 | } 375 | 376 | virtual bool PartialMerge(const rocksdb::Slice& key, 377 | const rocksdb::Slice& left_operand, const rocksdb::Slice& right_operand, 378 | std::string* new_value, 379 | rocksdb::Logger* logger) const { 380 | #if 0 381 | auto dump = [](const rocksdb::Slice &v) { 382 | std::ostringstream ss; 383 | 384 | msgpack::unpacked msg; 385 | msgpack::unpack(&msg, v.data(), v.size()); 386 | 387 | ss << msg.get(); 388 | return ss.str(); 389 | }; 390 | 391 | 
printf("partial merge: key: %s, left: %s, right: %s\n", 392 | key.ToString().c_str(), dump(left_operand).c_str(), dump(right_operand).c_str()); 393 | #endif 394 | (void) key; 395 | (void) left_operand; 396 | (void) right_operand; 397 | (void) new_value; 398 | (void) logger; 399 | 400 | return false; 401 | } 402 | }; 403 | 404 | class database { 405 | public: 406 | ~database() { 407 | if (!m_ro) { 408 | m_expiration_timer.stop(); 409 | sync_metadata(NULL); 410 | } 411 | } 412 | 413 | const greylock::options &options() const { 414 | return m_opts; 415 | } 416 | greylock::metadata &metadata() { 417 | return m_meta; 418 | } 419 | 420 | rocksdb::ColumnFamilyHandle *cfhandle(int c) { 421 | return m_handles[c]; 422 | } 423 | 424 | void compact() { 425 | if (m_db) { 426 | for (auto h: m_handles) { 427 | struct rocksdb::CompactRangeOptions opts; 428 | opts.change_level = true; 429 | opts.target_level = 0; 430 | m_db->CompactRange(opts, h, NULL, NULL); 431 | } 432 | } 433 | } 434 | 435 | void compact(size_t c, const rocksdb::Slice &start, const rocksdb::Slice &end) { 436 | if (m_db && c < m_handles.size()) { 437 | const rocksdb::Slice *b = NULL; 438 | const rocksdb::Slice *e = NULL; 439 | 440 | if (start != rocksdb::Slice()) { 441 | b = &start; 442 | } 443 | if (end != rocksdb::Slice()) { 444 | e = &end; 445 | } 446 | 447 | struct rocksdb::CompactRangeOptions opts; 448 | opts.change_level = true; 449 | opts.target_level = 0; 450 | m_db->CompactRange(opts, cfhandle(c), b, e); 451 | } 452 | } 453 | 454 | greylock::error_info sync_metadata(rocksdb::WriteBatch *batch) { 455 | if (m_ro) { 456 | return greylock::create_error(-EROFS, "read-only database"); 457 | } 458 | 459 | if (!m_db) { 460 | return greylock::create_error(-EINVAL, "database is not opened"); 461 | } 462 | 463 | if (!m_meta.dirty()) 464 | return greylock::error_info(); 465 | 466 | std::string meta_serialized = serialize(m_meta); 467 | 468 | rocksdb::Status s; 469 | if (batch) { 470 | 
batch->Put(m_handles[options::meta_column], rocksdb::Slice(m_opts.metadata_key), rocksdb::Slice(meta_serialized)); 471 | } else { 472 | s = m_db->Put(rocksdb::WriteOptions(), m_handles[options::meta_column], 473 | rocksdb::Slice(m_opts.metadata_key), rocksdb::Slice(meta_serialized)); 474 | } 475 | 476 | if (!s.ok()) { 477 | return greylock::create_error(-s.code(), "could not write metadata key: %s, error: %s", 478 | m_opts.metadata_key.c_str(), s.ToString().c_str()); 479 | } 480 | 481 | m_meta.clear_dirty(); 482 | return greylock::error_info(); 483 | } 484 | 485 | greylock::error_info open_read_only(const std::string &path) { 486 | return open(path, true, false); 487 | } 488 | greylock::error_info open_read_write(const std::string &path) { 489 | return open(path, false, false); 490 | } 491 | 492 | greylock::error_info open(const std::string &path, bool ro, bool bulk) { 493 | if (m_db) { 494 | return greylock::create_error(-EINVAL, "database is already opened"); 495 | } 496 | 497 | rocksdb::Options dbo; 498 | dbo.max_open_files = 1000; 499 | //dbo.disableDataSync = true; 500 | dbo.IncreaseParallelism(m_opts.max_threads); 501 | 502 | dbo.max_bytes_for_level_base = 1024 * 1024 * 1024 * 100UL; 503 | //dbo.write_buffer_size = 1024 * 1024 * 1024UL; 504 | //dbo.max_write_buffer_number = 10; 505 | //dbo.min_write_buffer_number_to_merge = 4; 506 | 507 | dbo.compression = rocksdb::kZSTDNotFinalCompression; 508 | dbo.num_levels = 10; 509 | #if 0 510 | dbo.compression_per_level = 511 | std::vector({ 512 | rocksdb::kZSTDNotFinalCompression, 513 | rocksdb::kZSTDNotFinalCompression, 514 | rocksdb::kZSTDNotFinalCompression, 515 | rocksdb::kZSTDNotFinalCompression, 516 | rocksdb::kZSTDNotFinalCompression, 517 | }); 518 | #endif 519 | dbo.compression_opts = rocksdb::CompressionOptions(-14, 5, 0, 0); 520 | 521 | dbo.create_if_missing = true; 522 | dbo.create_missing_column_families = true; 523 | 524 | if (!ro && bulk) { 525 | dbo.PrepareForBulkLoad(); 526 | } 527 | 528 | 
dbo.statistics = rocksdb::CreateDBStatistics(); 529 | dbo.stats_dump_period_sec = 60; 530 | 531 | rocksdb::BlockBasedTableOptions table_options; 532 | table_options.block_cache = rocksdb::NewLRUCache(m_opts.lru_cache_size); 533 | table_options.filter_policy.reset(rocksdb::NewBloomFilterPolicy(m_opts.bits_per_key, true)); 534 | dbo.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options)); 535 | 536 | rocksdb::DB *db; 537 | rocksdb::Status s; 538 | 539 | rocksdb::ColumnFamilyOptions cfo(dbo); 540 | 541 | std::vector column_families; 542 | 543 | for (size_t i = 0; i < options().column_names.size(); ++i) { 544 | auto cname = options().column_names[i]; 545 | 546 | cfo.merge_operator.reset(); 547 | 548 | if (i == greylock::options::token_shards_column) { 549 | cfo.merge_operator.reset(new token_shards_merge_operator); 550 | } 551 | if (i == greylock::options::indexes_column) { 552 | cfo.merge_operator.reset(new indexes_merge_operator); 553 | } 554 | 555 | column_families.push_back(rocksdb::ColumnFamilyDescriptor(cname, cfo)); 556 | } 557 | 558 | if (ro) { 559 | s = rocksdb::DB::OpenForReadOnly(dbo, path, column_families, &m_handles, &db); 560 | } else { 561 | s = rocksdb::DB::Open(dbo, path, column_families, &m_handles, &db); 562 | } 563 | if (!s.ok()) { 564 | return greylock::create_error(-s.code(), "failed to open rocksdb database: '%s', read-only: %d, error: %s", 565 | path.c_str(), ro, s.ToString().c_str()); 566 | } 567 | m_db.reset(db); 568 | m_ro = ro; 569 | 570 | std::string meta; 571 | s = m_db->Get(rocksdb::ReadOptions(), m_handles[options::meta_column], rocksdb::Slice(m_opts.metadata_key), &meta); 572 | if (!s.ok() && !s.IsNotFound()) { 573 | return greylock::create_error(-s.code(), "could not read key: %s, error: %s", 574 | m_opts.metadata_key.c_str(), s.ToString().c_str()); 575 | } 576 | 577 | if (s.ok()) { 578 | auto err = deserialize(m_meta, meta.data(), meta.size()); 579 | if (err) 580 | return greylock::create_error(err.code(), "metadata 
deserialization failed, key: %s, error: %s", 581 | m_opts.metadata_key.c_str(), err.message().c_str()); 582 | } 583 | 584 | if (m_opts.sync_metadata_timeout > 0 && !ro) { 585 | sync_metadata_callback(); 586 | } 587 | 588 | return greylock::error_info(); 589 | } 590 | 591 | std::vector get_shards(const std::string &key) { 592 | disk_token dt; 593 | if (!m_db) { 594 | return dt.shards; 595 | } 596 | 597 | std::string ser_shards; 598 | auto err = read(options::token_shards_column, key, &ser_shards); 599 | if (err) 600 | return dt.shards; 601 | 602 | err = deserialize(dt, ser_shards.data(), ser_shards.size()); 603 | if (err) 604 | return dt.shards; 605 | 606 | return dt.shards; 607 | } 608 | 609 | rocksdb::Iterator *iterator(int column, const rocksdb::ReadOptions &ro) { 610 | return m_db->NewIterator(ro, m_handles[column]); 611 | } 612 | 613 | greylock::error_info read(int column, const std::string &key, std::string *ret) { 614 | if (!m_db) { 615 | return greylock::create_error(-EINVAL, "database is not opened"); 616 | } 617 | 618 | auto s = m_db->Get(rocksdb::ReadOptions(), m_handles[column], rocksdb::Slice(key), ret); 619 | if (!s.ok()) { 620 | return greylock::create_error(-s.code(), "could not read key: %s, error: %s", key.c_str(), s.ToString().c_str()); 621 | } 622 | return greylock::error_info(); 623 | } 624 | 625 | greylock::error_info write(rocksdb::WriteBatch *batch) { 626 | if (!m_db) { 627 | return greylock::create_error(-EINVAL, "database is not opened"); 628 | } 629 | 630 | if (m_ro) { 631 | return greylock::create_error(-EROFS, "read-only database"); 632 | } 633 | 634 | auto wo = rocksdb::WriteOptions(); 635 | 636 | auto s = m_db->Write(wo, batch); 637 | if (!s.ok()) { 638 | return greylock::create_error(-s.code(), "could not write batch: %s", s.ToString().c_str()); 639 | } 640 | 641 | return greylock::error_info(); 642 | } 643 | 644 | greylock::error_info write(int column, const std::string &key, const std::string &value) { 645 | if (!m_db) { 646 | 
return greylock::create_error(-EINVAL, "database is not opened"); 647 | } 648 | 649 | if (m_ro) { 650 | return greylock::create_error(-EROFS, "read-only database"); 651 | } 652 | 653 | auto wo = rocksdb::WriteOptions(); 654 | 655 | auto s = m_db->Merge(wo, m_handles[column], rocksdb::Slice(key), rocksdb::Slice(value)); 656 | if (!s.ok()) { 657 | return greylock::create_error(-s.code(), "could not write batch: %s", s.ToString().c_str()); 658 | } 659 | 660 | return greylock::error_info(); 661 | } 662 | 663 | private: 664 | bool m_ro = false; 665 | std::vector m_handles; 666 | std::unique_ptr m_db; 667 | greylock::options m_opts; 668 | greylock::metadata m_meta; 669 | 670 | ribosome::expiration m_expiration_timer; 671 | 672 | void sync_metadata_callback() { 673 | sync_metadata(NULL); 674 | 675 | auto expires_at = std::chrono::system_clock::now() + std::chrono::milliseconds(m_opts.sync_metadata_timeout); 676 | m_expiration_timer.insert(expires_at, std::bind(&database::sync_metadata_callback, this)); 677 | } 678 | }; 679 | 680 | }} // namespace ioremap::greylock 681 | -------------------------------------------------------------------------------- /include/greylock/error.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | namespace ioremap { namespace greylock { 7 | 8 | class error : public std::exception 9 | { 10 | public: 11 | // err must be negative value 12 | explicit error(int err, const std::string &message) throw(); 13 | ~error() throw() {} 14 | 15 | int error_code() const; 16 | 17 | virtual const char *what() const throw(); 18 | 19 | std::string error_message() const throw(); 20 | 21 | private: 22 | int m_errno; 23 | std::string m_message; 24 | }; 25 | 26 | class not_found_error : public error 27 | { 28 | public: 29 | explicit not_found_error(const std::string &message) throw(); 30 | }; 31 | 32 | class timeout_error : public error 33 | { 34 | public: 35 | explicit 
timeout_error(const std::string &message) throw(); 36 | }; 37 | 38 | class no_such_address_error : public error 39 | { 40 | public: 41 | explicit no_such_address_error(const std::string &message) throw(); 42 | }; 43 | 44 | class error_info 45 | { 46 | public: 47 | inline error_info() : m_code(0) {} 48 | inline error_info(int code, const std::string &&message) 49 | : m_code(code), m_message(message) {} 50 | inline error_info(int code, const std::string &message) 51 | : m_code(code), m_message(message) {} 52 | inline ~error_info() {} 53 | 54 | inline int code() const { return m_code; } 55 | inline const std::string &message() const { return m_message; } 56 | inline operator bool() const { return m_code != 0; } 57 | inline bool operator !() const { return !operator bool(); } 58 | operator int() const = delete; // disable implicit cast to int 59 | 60 | void throw_error() const; 61 | private: 62 | int m_code; 63 | std::string m_message; 64 | }; 65 | 66 | // err must be negative value 67 | void throw_error(int err, const char *format, ...) 68 | __attribute__ ((format (printf, 2, 3))); 69 | 70 | // err must be negative value 71 | error_info create_error(int err, const char *format, ...) 
72 | __attribute__ ((format (printf, 2, 3))); 73 | 74 | }} /* namespace ioremap::greylock */ 75 | -------------------------------------------------------------------------------- /include/greylock/id.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace ioremap { namespace greylock { 11 | 12 | namespace { 13 | static const uint32_t start_date = 0; 14 | static const uint32_t date_div = 3600 * 24; 15 | } 16 | 17 | struct id_t { 18 | uint64_t timestamp = 0; 19 | 20 | MSGPACK_DEFINE(timestamp); 21 | 22 | void set_timestamp(long tsec, long aux) { 23 | tsec = (tsec - start_date) / date_div; 24 | 25 | timestamp = tsec << 32; 26 | timestamp |= aux & ((1UL << 32) - 1); 27 | } 28 | 29 | void get_timestamp(long *tsec, long *aux) const { 30 | *tsec = (timestamp >> 32) * date_div + start_date; 31 | *aux = timestamp & ((1UL << 32) - 1); 32 | } 33 | 34 | bool operator<(const id_t &other) const { 35 | return timestamp < other.timestamp; 36 | } 37 | bool operator>(const id_t &other) const { 38 | return timestamp > other.timestamp; 39 | } 40 | 41 | bool operator==(const id_t &other) const { 42 | return (timestamp == other.timestamp); 43 | } 44 | bool operator!=(const id_t &other) const { 45 | return !operator==(other); 46 | } 47 | 48 | std::string to_string() const { 49 | char buf[64]; 50 | size_t sz = snprintf(buf, sizeof(buf), "%016lx", timestamp); 51 | return std::string(buf, sz); 52 | } 53 | 54 | id_t(): timestamp(0) { 55 | } 56 | 57 | id_t(const id_t &other) { 58 | timestamp = other.timestamp; 59 | } 60 | 61 | id_t(const char *str) { 62 | if (!str) { 63 | id_t(); 64 | return; 65 | } 66 | 67 | timestamp = strtoull(str, NULL, 16); 68 | } 69 | 70 | void set_next_id(const id_t &other) { 71 | timestamp = other.timestamp + 1; 72 | } 73 | 74 | }; 75 | 76 | }} // namespace ioremap::greylock 77 | 
-------------------------------------------------------------------------------- /include/greylock/intersection.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __INDEXES_INTERSECTION_HPP 2 | #define __INDEXES_INTERSECTION_HPP 3 | 4 | #include "greylock/iterator.hpp" 5 | #include "greylock/types.hpp" 6 | 7 | namespace ioremap { namespace greylock { 8 | 9 | struct single_doc_result { 10 | document doc; 11 | 12 | float relevance = 0; 13 | }; 14 | 15 | struct search_result { 16 | bool completed = true; 17 | 18 | // This will contain a cookie which must be used for the next intersection request, 19 | // if current request is not complete. This may happen when client has requested limited 20 | // maximum number of keys in reply and there are more keys. 21 | id_t next_document_id; 22 | long max_number_of_documents = ~0UL; 23 | 24 | // array of documents which contain all requested indexes 25 | std::vector docs; 26 | }; 27 | 28 | // check whether given result matches query, may also set or change some result parameters like relevance field 29 | typedef std::function check_result_function_t; 30 | 31 | struct mailbox_query { 32 | std::string mbox; 33 | greylock::indexes idx; 34 | 35 | greylock::error_info parse_error; 36 | 37 | mailbox_query(const greylock::options &options, const rapidjson::Value &doc) { 38 | const rapidjson::Value &query_and = greylock::get_object(doc, "query"); 39 | if (query_and.IsObject()) { 40 | auto ireq = indexes::get_indexes(options, query_and); 41 | idx.merge_query(ireq); 42 | } 43 | 44 | const rapidjson::Value &query_exact = greylock::get_object(doc, "exact"); 45 | if (query_exact.IsObject()) { 46 | auto ireq = indexes::get_indexes(options, query_exact); 47 | 48 | // merge these indexes into intersection set, 49 | // since exact phrase match implies document contains all tokens 50 | idx.merge_exact(ireq); 51 | } 52 | 53 | const rapidjson::Value &query_negation = greylock::get_object(doc, 
"negation"); 54 | if (query_negation.IsObject()) { 55 | auto ireq = indexes::get_indexes(options, query_negation); 56 | // do not merge these indexes into intersection set, put them into own container 57 | idx.merge_negation(ireq); 58 | } 59 | 60 | if (idx.attributes.empty()) { 61 | parse_error = greylock::create_error(-ENOENT, 62 | "search: mailbox: %s, there are no queries suitable for search", mbox.c_str()); 63 | return; 64 | } 65 | } 66 | }; 67 | 68 | struct intersection_query { 69 | id_t range_start, range_end; 70 | 71 | std::vector se; 72 | 73 | id_t next_document_id; 74 | size_t max_number = LONG_MAX; 75 | 76 | std::string to_string() const { 77 | std::ostringstream ss; 78 | 79 | ss << "[ "; 80 | for (const auto &ent: se) { 81 | ss << "mailbox: " << ent.mbox << ", indexes: " << ent.idx.to_string() << "| "; 82 | } 83 | ss << "]"; 84 | 85 | return ss.str(); 86 | } 87 | }; 88 | 89 | template 90 | class intersector { 91 | public: 92 | intersector(DBT &db_docs, DBT &db_indexes) : m_db_docs(db_docs), m_db_indexes(db_indexes) {} 93 | 94 | search_result intersect(const intersection_query &iq) const { 95 | return intersect(iq, [&] (single_doc_result &) -> bool { 96 | return true; 97 | }); 98 | } 99 | 100 | // search for intersections between all @indexes 101 | // starting with the key @start, returning at most @num entries 102 | // 103 | // after @intersect() completes, it sets @start to the next key to start searching from 104 | // user should not change that token, otherwise @intersect() may skip some entries or 105 | // return duplicates. 106 | // 107 | // if number of returned entries is less than requested number @num or if @start has been set to empty string 108 | // after call to this function returns, then intersection is completed. 109 | // 110 | // @search_result.completed will be set to true in this case. 
111 | search_result intersect(const intersection_query &iq, check_result_function_t check) const { 112 | search_result res; 113 | #ifdef STDOUT_DEBUG 114 | auto dump_vector = [] (const std::vector &sh) -> std::string { 115 | std::ostringstream ss; 116 | for (size_t i = 0; i < sh.size(); ++i) { 117 | ss << sh[i]; 118 | if (i != sh.size() - 1) 119 | ss << " "; 120 | } 121 | 122 | return ss.str(); 123 | }; 124 | 125 | #endif 126 | 127 | 128 | std::vector common_shards; 129 | bool init = true; 130 | for (const auto &ent: iq.se) { 131 | for (const auto &attr: ent.idx.attributes) { 132 | for (const auto &t: attr.tokens) { 133 | std::string shard_key = document::generate_shard_key(m_db_indexes.options(), ent.mbox, attr.name, t.name); 134 | auto shards = m_db_indexes.get_shards(shard_key); 135 | #ifdef STDOUT_DEBUG 136 | printf("common_shards: %s, key: %s, shards: %s\n", 137 | dump_vector(common_shards).c_str(), shard_key.c_str(), 138 | dump_vector(shards).c_str()); 139 | #endif 140 | // one index is empty, intersection will be empty, return early 141 | if (shards.size() == 0) { 142 | return res; 143 | } 144 | 145 | if (init) { 146 | common_shards = shards; 147 | init = false; 148 | } else { 149 | std::vector intersection; 150 | std::set_intersection(common_shards.begin(), common_shards.end(), 151 | shards.begin(), shards.end(), 152 | std::back_inserter(intersection)); 153 | common_shards = intersection; 154 | } 155 | 156 | // intersection is empty, return early 157 | if (common_shards.size() == 0) { 158 | return res; 159 | } 160 | } 161 | } 162 | } 163 | 164 | struct iter { 165 | greylock::index_iterator begin, end; 166 | 167 | iter(DBT &db, const std::string &mbox, const std::string &attr, const std::string &token, 168 | const std::vector &shards) : 169 | begin(greylock::index_iterator::begin(db, mbox, attr, token, shards)), 170 | end(greylock::index_iterator::end(db, mbox, attr, token)) 171 | { 172 | } 173 | }; 174 | 175 | // contains vector of iterators pointing to the 
requested indexes 176 | // iterator always points to the smallest document ID not yet pushed into resulting structure (or to client) 177 | // or discarded (if other index iterators point to larger document IDs) 178 | std::vector idata; 179 | std::vector inegation; 180 | 181 | for (const auto &ent: iq.se) { 182 | for (const auto &attr: ent.idx.attributes) { 183 | for (const auto &t: attr.tokens) { 184 | iter itr(m_db_indexes, ent.mbox, attr.name, t.name, common_shards); 185 | 186 | if (iq.next_document_id != 0) { 187 | itr.begin.rewind_to_index(iq.next_document_id); 188 | } else { 189 | itr.begin.rewind_to_index(iq.range_start); 190 | } 191 | 192 | idata.emplace_back(itr); 193 | } 194 | } 195 | 196 | for (const auto &attr: ent.idx.negation) { 197 | for (const auto &t: attr.tokens) { 198 | std::string shard_key = document::generate_shard_key(m_db_indexes.options(), ent.mbox, attr.name, t.name); 199 | auto shards = m_db_indexes.get_shards(shard_key); 200 | #ifdef STDOUT_DEBUG 201 | printf("negation: key: %s, shards: %s\n", 202 | shard_key.c_str(), 203 | dump_vector(shards).c_str()); 204 | #endif 205 | 206 | iter itr(m_db_indexes, ent.mbox, attr.name, t.name, shards); 207 | inegation.emplace_back(itr); 208 | } 209 | } 210 | } 211 | 212 | while (true) { 213 | // contains indexes within @idata array of iterators, 214 | // each iterator contains the same and smallest to the known moment reference to the document (i.e. 
document ID) 215 | // 216 | // if checking @idata array yelds smaller document ID than that in iterators referenced in @pos, 217 | // then we clear @pos and starts pushing the new smallest iterator indexes 218 | // 219 | // we could break out of the @idata processing, increase the smallest pointing iterator and start over, 220 | // but we optimize @idata processing - if there are other iterators in @idata which equal to the smallest 221 | // iterator value (document ID), we put them into @pos 222 | // Since @pos doesn't contain all indexes (its size doesn't equal to the size of @idata), we will increase 223 | // all iterators where we have found the smallest document ID, hopefully they will point to the new document ID, 224 | // which might be the same for all iterator among @idata and thus we will push this document ID to the result 225 | // structure returned to the client 226 | // 227 | // Here is an example: 228 | // 229 | // 1. @idata iterators 0 1 2 3 230 | // ------------------------- 231 | // document ids d0 d2 d3 d3 232 | // d2 d3 d4 d4 233 | // d3 d4 d5 d5 234 | // d4 - - - 235 | // d5 - - - 236 | // 237 | // We start from the top of this table, i.e. row after 'document ids' string 238 | // @pos will contain following values during iteration over @idata iterators 239 | // 0 - select the first value 240 | // 0 - skip iterator 1 (d2 document id) since its value is greater than that 0'th iterator value (d0) 241 | // 0 - skip iterator 2 242 | // 0 - skip iterator 3 243 | // 244 | // @pos contains only 0 index, it is not equal to the size of @idata (4), thus we have to increase 0'th iterator 245 | // discarding its first value 246 | // 247 | // 2. 
@idata iterators 0 1 2 3 248 | // ------------------------- 249 | // document ids d2 d2 d3 d3 250 | // d3 d3 d4 d4 251 | // d4 d4 d5 d5 252 | // d5 - - - 253 | // @pos: 254 | // 0 - select the first iterator 255 | // 0 1 - 1'th iterator value equals to the value of the 0'th iterator, append it to the array 256 | // 0 1 - 2'th iterator value (d3) is greater than that of the 0'th iterator (d2) 257 | // 0 1 - the same as above 258 | // since size of the @pos is not equal to the size of @idata we increment all iterators which are indexed in @pos 259 | // 260 | // 3. @idata iterators 0 1 2 3 261 | // ------------------------- 262 | // document ids d3 d3 d3 d3 263 | // d4 d4 d4 d4 264 | // d5 - d5 d5 265 | // @pos will contain all 4 indexes, since all iterator's value are the same (d3) 266 | // We will increment all iterators and push d3 into resulting array which will be returned to the client, 267 | // since size of the @pos array equals to the @idata size 268 | // 269 | // 4. @idata iterators 0 1 2 3 270 | // ------------------------- 271 | // document ids d4 d4 d4 d4 272 | // d5 - d5 d5 273 | // We put d4 into resulting array and increment all iterators as above 274 | // 275 | // 5. @idata iterators 0 1 2 3 276 | // ------------------------- 277 | // document ids d5 - d5 d5 278 | // 279 | // @pos: 280 | // 0 - select the first iterator 281 | // Stop processing, since 1'th iterator is empty. 282 | // This means no further iteration checks can contain all 4 the same value, 283 | // thus it is not possible to find any other document with higher ID 284 | // which will contain all 4 requested indexes. 285 | // 286 | // 6. 
Return [d3, d4] values to the client 287 | std::vector pos; 288 | 289 | id_t next_id; 290 | 291 | int current = -1; 292 | for (auto &itr: idata) { 293 | auto &it = itr.begin; 294 | auto &e = itr.end; 295 | ++current; 296 | 297 | if (it == e) { 298 | res.completed = true; 299 | break; 300 | } 301 | 302 | if (it->indexed_id > iq.range_end) { 303 | res.completed = true; 304 | break; 305 | } 306 | 307 | res.completed = false; 308 | res.next_document_id.set_next_id(it->indexed_id); 309 | 310 | if (pos.size() == 0) { 311 | pos.push_back(current); 312 | continue; 313 | } 314 | 315 | auto &min_it = idata[pos[0]].begin; 316 | #if 0 317 | BH_LOG(m_bp.logger(), INDEXES_LOG_INFO, "intersection: min-index: %s, id: %s, it-index: %s, id: %s", 318 | idata[pos[0]].idx.start_key().str(), min_it->str(), 319 | idata_it->idx.start_key().str(), it->str()); 320 | #endif 321 | if (it->indexed_id == min_it->indexed_id) { 322 | pos.push_back(current); 323 | continue; 324 | } 325 | 326 | next_id = std::max(it->indexed_id, min_it->indexed_id); 327 | res.next_document_id.set_next_id(next_id); 328 | 329 | pos.clear(); 330 | break; 331 | } 332 | 333 | // this can only happen if one of the iterators has been finished, 334 | // which means number of found positions will not be equal to the number 335 | // of indexes to intersect, and thus there is no more data to push into result. 336 | // Just break out of the processing loop - nothing can be added anymore. 
337 | if (res.completed) { 338 | break; 339 | } 340 | 341 | // number of entries with the same document ID doesn't match number of indexes, 342 | // this means some index doesn't have this docuement and thus it has to be skipped 343 | // and iteration check process has to be started over 344 | if (pos.size() != idata.size()) { 345 | for (auto &it: idata) { 346 | auto &min_it = it.begin; 347 | 348 | min_it.rewind_to_index(next_id); 349 | } 350 | 351 | continue; 352 | } 353 | 354 | auto &min_it = idata[pos.front()].begin; 355 | id_t indexed_id = min_it->indexed_id; 356 | 357 | bool negation_match = false; 358 | for (auto &neg: inegation) { 359 | auto &it = neg.begin; 360 | it.rewind_to_index(indexed_id); 361 | if (it != neg.end) { 362 | if (it->indexed_id == indexed_id) { 363 | negation_match = true; 364 | break; 365 | } 366 | } 367 | } 368 | 369 | auto increment_all_iterators = [&] () { 370 | for (auto it = pos.begin(); it != pos.end(); ++it) { 371 | auto &idata_iter = idata[*it].begin; 372 | ++idata_iter; 373 | } 374 | }; 375 | 376 | if (negation_match) { 377 | increment_all_iterators(); 378 | continue; 379 | } 380 | 381 | single_doc_result rs; 382 | auto err = min_it.document(m_db_docs, &rs.doc); 383 | if (err) { 384 | #if 0 385 | printf("could not read document id: %ld, err: %s [%d]\n", 386 | min_it->indexed_id, err.message().c_str(), err.code()); 387 | #endif 388 | increment_all_iterators(); 389 | continue; 390 | } 391 | rs.doc.indexed_id = indexed_id; 392 | 393 | // increment all iterators 394 | increment_all_iterators(); 395 | 396 | if (!check(rs)) { 397 | continue; 398 | } 399 | 400 | res.docs.emplace_back(rs); 401 | if (res.docs.size() == iq.max_number) 402 | break; 403 | } 404 | 405 | return res; 406 | } 407 | private: 408 | DBT &m_db_docs; 409 | DBT &m_db_indexes; 410 | }; 411 | 412 | }} // namespace ioremap::greylock 413 | 414 | #endif // __INDEXES_INTERSECTION_HPP 415 | -------------------------------------------------------------------------------- 
/include/greylock/iterator.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "greylock/types.hpp" 4 | 5 | #include 6 | 7 | //#define STDOUT_DEBUG 8 | #ifdef STDOUT_DEBUG 9 | #define dprintf(fmt, a...) printf(fmt, ##a) 10 | #else 11 | #define dprintf(fmt, ...) 12 | #endif 13 | 14 | namespace ioremap { namespace greylock { 15 | 16 | template 17 | class index_iterator { 18 | private: 19 | disk_index m_current; 20 | typename decltype(m_current.ids)::iterator m_idx_current, m_idx_end; 21 | public: 22 | typedef index_iterator self_type; 23 | typedef disk_index::value_type value_type; 24 | typedef typename decltype(m_current.ids)::iterator::reference reference; 25 | typedef typename decltype(m_current.ids)::iterator::pointer pointer; 26 | typedef std::forward_iterator_tag iterator_category; 27 | typedef std::ptrdiff_t difference_type; 28 | 29 | static index_iterator begin(DBT &db, const std::string &mbox, const std::string &attr, const std::string &token) { 30 | std::string index_base = document::generate_index_base(db.options(), mbox, attr, token); 31 | std::vector shards(db.get_shards(document::generate_shard_key(db.options(), mbox, attr, token))); 32 | if (shards.size() == 0) { 33 | return end(db, index_base); 34 | } 35 | 36 | return index_iterator(db, index_base, shards); 37 | } 38 | static index_iterator begin(DBT &db, const std::string &mbox, const std::string &attr, const std::string &token, 39 | const std::vector &shards) { 40 | std::string index_base = document::generate_index_base(db.options(), mbox, attr, token); 41 | if (shards.size() == 0) { 42 | return end(db, index_base); 43 | } 44 | 45 | return index_iterator(db, index_base, shards); 46 | } 47 | 48 | static index_iterator end(DBT &db, const std::string &base) { 49 | return index_iterator(db, base); 50 | } 51 | static index_iterator end(DBT &db, const std::string &mbox, const std::string &attr, const std::string &token) { 52 | std::string 
index_base = document::generate_index_base(db.options(), mbox, attr, token); 53 | return index_iterator(db, index_base); 54 | } 55 | 56 | index_iterator(const index_iterator &src): m_db(src.m_db) { 57 | m_current = src.m_current; 58 | if (src.m_idx_current == src.m_idx_end) { 59 | m_idx_current = m_idx_end = m_current.ids.end(); 60 | } else { 61 | typename decltype(src.m_current.ids)::const_iterator sib = src.m_current.ids.begin(); 62 | typename decltype(src.m_current.ids)::const_iterator sic = src.m_idx_current; 63 | 64 | auto diff = std::distance(sib, sic); 65 | dprintf("src: %s, diff: %ld\n", src.to_string().c_str(), diff); 66 | 67 | m_idx_current = std::next(m_current.ids.begin(), diff); 68 | m_idx_end = m_current.ids.end(); 69 | } 70 | 71 | m_base = src.m_base; 72 | m_shards = src.m_shards; 73 | m_shards_idx = src.m_shards_idx; 74 | } 75 | 76 | self_type &operator++() { 77 | ++m_idx_current; 78 | if (m_idx_current == m_idx_end) { 79 | load_next(); 80 | } 81 | return *this; 82 | } 83 | 84 | self_type &rewind_to_index(const id_t &idx) { 85 | size_t rewind_shard = document::generate_shard_number(m_db.options(), idx); 86 | dprintf("rewind: %s, idx: %s, rewind_shard: %ld\n", to_string().c_str(), idx.to_string().c_str(), rewind_shard); 87 | 88 | auto rewind_shard_it = std::lower_bound(m_shards.begin(), m_shards.end(), rewind_shard); 89 | if (rewind_shard_it == m_shards.end()) { 90 | set_shard_index(-1); 91 | dprintf("could not increase iterator: %s\n", to_string().c_str()); 92 | return *this; 93 | } 94 | 95 | int rewind_shard_idx = std::distance(m_shards.begin(), rewind_shard_it); 96 | if (rewind_shard_idx != m_shards_idx - 1) { 97 | set_shard_index(rewind_shard_idx); 98 | load_next(); 99 | } 100 | 101 | if (m_shards_idx >= 0) { 102 | document_for_index did; 103 | did.indexed_id = idx; 104 | 105 | do { 106 | m_idx_current = std::lower_bound(m_idx_current, m_idx_end, did); 107 | if (m_idx_current == m_idx_end) { 108 | load_next(); 109 | if (m_shards_idx < 0) 110 | 
break; 111 | } 112 | 113 | } while (m_idx_current->indexed_id < idx); 114 | } 115 | 116 | dprintf("increased iterator: %s\n", to_string().c_str()); 117 | return *this; 118 | } 119 | 120 | reference operator*() { 121 | return *m_idx_current; 122 | } 123 | pointer operator->() { 124 | return &(*m_idx_current); 125 | } 126 | 127 | error_info document(DBT &db, document *doc) { 128 | std::string doc_data; 129 | auto err = db.read(greylock::options::documents_column, m_idx_current->indexed_id.to_string(), &doc_data); 130 | if (err) 131 | return err; 132 | 133 | deserialize(*doc, doc_data.data(), doc_data.size()); 134 | return greylock::error_info(); 135 | } 136 | 137 | std::string to_string() const { 138 | auto dump_shards = [&]() -> std::string { 139 | std::ostringstream out; 140 | for (size_t i = 0; i < m_shards.size(); ++i) { 141 | out << m_shards[i]; 142 | if (i != m_shards.size() - 1) 143 | out << " "; 144 | } 145 | return out.str(); 146 | }; 147 | std::ostringstream ss; 148 | ss << "base: " << m_base << 149 | ", next_shard_idx: " << m_shards_idx << 150 | ", shards: [" << dump_shards() << "] " << 151 | ", ids_size: " << m_current.ids.size() << 152 | ", current_is_end: " << (m_idx_current == m_idx_end) << 153 | ", indexed_id: " << ((m_idx_current == m_idx_end) ? 
"none" : m_idx_current->indexed_id.to_string()); 154 | return ss.str(); 155 | } 156 | 157 | bool operator==(const self_type& rhs) { 158 | if (m_base != rhs.m_base) 159 | return false; 160 | if (m_shards.size() != rhs.m_shards.size()) 161 | return false; 162 | if (m_shards != rhs.m_shards) 163 | return false; 164 | if (m_shards_idx != rhs.m_shards_idx) 165 | return false; 166 | 167 | if ((m_idx_current == m_idx_end) && (rhs.m_idx_current == rhs.m_idx_end)) 168 | return true; 169 | 170 | if (m_idx_current->indexed_id != rhs.m_idx_current->indexed_id) 171 | return false; 172 | 173 | return true; 174 | } 175 | bool operator!=(const self_type& rhs) { 176 | return !operator==(rhs); 177 | } 178 | 179 | private: 180 | DBT &m_db; 181 | std::string m_base; 182 | std::vector m_shards; 183 | int m_shards_idx = -1; 184 | 185 | index_iterator(DBT &db, const std::string &base): m_db(db), m_base(base) { 186 | } 187 | 188 | index_iterator(DBT &db, const std::string &base, const std::vector shards): m_db(db), m_base(base), m_shards(shards) { 189 | set_shard_index(0); 190 | load_next(); 191 | } 192 | 193 | void set_shard_index(int idx) { 194 | m_shards_idx = idx; 195 | if (idx < 0) { 196 | m_shards.clear(); 197 | 198 | m_current.ids.clear(); 199 | m_idx_current = m_current.ids.begin(); 200 | m_idx_end = m_current.ids.end(); 201 | } 202 | } 203 | 204 | void load_next() { 205 | do { 206 | load_next_one(); 207 | } while (m_shards_idx >= 0 && m_current.ids.empty()); 208 | } 209 | 210 | void load_next_one() { 211 | dprintf("loading: %s\n", to_string().c_str()); 212 | m_current.ids.clear(); 213 | m_idx_current = m_current.ids.begin(); 214 | m_idx_end = m_current.ids.end(); 215 | 216 | if (m_shards_idx < 0 || m_shards_idx >= (int)m_shards.size()) { 217 | set_shard_index(-1); 218 | return; 219 | } 220 | 221 | std::string key = document::generate_index_key_shard_number(m_base, m_shards[m_shards_idx]); 222 | std::string data; 223 | auto err = m_db.read(greylock::options::indexes_column, key, 
&data); 224 | if (err) { 225 | set_shard_index(-1); 226 | return; 227 | } 228 | 229 | try { 230 | deserialize(m_current, data.data(), data.size()); 231 | 232 | m_idx_current = m_current.ids.begin(); 233 | m_idx_end = m_current.ids.end(); 234 | } catch (...) { 235 | set_shard_index(-1); 236 | return; 237 | } 238 | 239 | set_shard_index(m_shards_idx + 1); 240 | dprintf("loaded: %s\n", to_string().c_str()); 241 | } 242 | }; 243 | }} // namespace ioremap::greylock 244 | -------------------------------------------------------------------------------- /include/greylock/json.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __INDEXES_JSON_HPP 2 | #define __INDEXES_JSON_HPP 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace ioremap { namespace greylock { 9 | 10 | static inline const char *get_string(const rapidjson::Value &entry, const char *name, const char *def = NULL) { 11 | if (entry.HasMember(name)) { 12 | const rapidjson::Value &v = entry[name]; 13 | if (v.IsString()) { 14 | return v.GetString(); 15 | } 16 | } 17 | 18 | return def; 19 | } 20 | 21 | static inline int64_t get_int64(const rapidjson::Value &entry, const char *name, int64_t def = -1) { 22 | if (entry.HasMember(name)) { 23 | const rapidjson::Value &v = entry[name]; 24 | if (v.IsInt()) { 25 | return v.GetInt(); 26 | } 27 | if (v.IsUint()) { 28 | return v.GetUint(); 29 | } 30 | if (v.IsInt64()) { 31 | return v.GetInt64(); 32 | } 33 | if (v.IsUint()) { 34 | return v.GetUint64(); 35 | } 36 | } 37 | 38 | return def; 39 | } 40 | 41 | static inline const rapidjson::Value &get_object(const rapidjson::Value &entry, const char *name, 42 | const rapidjson::Value &def = rapidjson::Value()) { 43 | if (entry.HasMember(name)) { 44 | const rapidjson::Value &v = entry[name]; 45 | 46 | if (v.IsObject()) 47 | return v; 48 | } 49 | 50 | return def; 51 | } 52 | 53 | static inline const rapidjson::Value &get_array(const rapidjson::Value &entry, const char *name, 54 | const 
rapidjson::Value &def = rapidjson::Value()) { 55 | if (entry.HasMember(name)) { 56 | const rapidjson::Value &v = entry[name]; 57 | 58 | if (v.IsArray()) 59 | return v; 60 | } 61 | 62 | return def; 63 | } 64 | 65 | static inline bool get_bool(const rapidjson::Value &entry, const char *name, bool def = true) { 66 | if (entry.HasMember(name)) { 67 | const rapidjson::Value &v = entry[name]; 68 | 69 | if (v.IsBool()) 70 | return v.GetBool(); 71 | } 72 | 73 | return def; 74 | } 75 | 76 | }} // namespace ioremap::greylock 77 | 78 | #endif // __INDEXES_JSON_HPP 79 | -------------------------------------------------------------------------------- /include/greylock/jsonvalue.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include 10 | 11 | namespace ioremap { namespace greylock { 12 | 13 | class JsonValue : public rapidjson::Value 14 | { 15 | public: 16 | JsonValue() { 17 | SetObject(); 18 | } 19 | 20 | ~JsonValue() { 21 | } 22 | 23 | static void set_time(rapidjson::Value &obj, rapidjson::Document::AllocatorType &alloc, long tsec, long usec) { 24 | char str[64]; 25 | struct tm tm; 26 | 27 | localtime_r((time_t *)&tsec, &tm); 28 | strftime(str, sizeof(str), "%F %Z %R:%S", &tm); 29 | 30 | char time_str[128]; 31 | snprintf(time_str, sizeof(time_str), "%s.%06lu", str, usec); 32 | 33 | obj.SetObject(); 34 | 35 | rapidjson::Value tobj(time_str, strlen(time_str), alloc); 36 | obj.AddMember("time", tobj, alloc); 37 | 38 | std::string raw_time = std::to_string(tsec) + "." 
+ std::to_string(usec); 39 | rapidjson::Value tobj_raw(raw_time.c_str(), raw_time.size(), alloc); 40 | obj.AddMember("time-raw", tobj_raw, alloc); 41 | } 42 | 43 | std::string ToString() const { 44 | rapidjson::StringBuffer buffer; 45 | rapidjson::PrettyWriter writer(buffer); 46 | 47 | Accept(writer); 48 | buffer.Put('\n'); 49 | 50 | return std::string(buffer.GetString(), buffer.Size()); 51 | } 52 | 53 | rapidjson::MemoryPoolAllocator<> &GetAllocator() { 54 | return m_allocator; 55 | } 56 | 57 | private: 58 | rapidjson::MemoryPoolAllocator<> m_allocator; 59 | }; 60 | 61 | 62 | }} // namespace ioremap::greylock 63 | -------------------------------------------------------------------------------- /include/greylock/types.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "greylock/database.hpp" 4 | #include "greylock/json.hpp" 5 | #include "greylock/id.hpp" 6 | 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include 19 | 20 | namespace ioremap { namespace greylock { 21 | 22 | typedef int pos_t; 23 | 24 | struct token { 25 | std::string name; 26 | std::vector positions; 27 | 28 | std::string shard_key; 29 | std::set shards; 30 | 31 | token(const std::string &name): name(name) {} 32 | void insert_position(pos_t pos) { 33 | positions.push_back(pos); 34 | } 35 | void insert_positions(const std::vector &pos) { 36 | positions.insert(positions.end(), pos.begin(), pos.end()); 37 | } 38 | 39 | std::string key; 40 | }; 41 | 42 | struct attribute { 43 | std::string name; 44 | std::vector tokens; 45 | 46 | attribute(const std::string &name): name(name) {} 47 | void insert(const std::string &tname, pos_t pos) { 48 | auto it = std::find_if(tokens.begin(), tokens.end(), [&](const token &t) { 49 | return t.name == tname; 50 | }); 51 | if (it == tokens.end()) { 52 | token t(tname); 53 | t.insert_position(pos); 54 | 
tokens.emplace_back(t); 55 | return; 56 | } 57 | 58 | it->insert_position(pos); 59 | } 60 | 61 | void insert(const std::string &tname, const std::vector &positions) { 62 | auto it = std::find_if(tokens.begin(), tokens.end(), [&](const token &t) { 63 | return t.name == tname; 64 | }); 65 | if (it == tokens.end()) { 66 | token t(tname); 67 | t.insert_positions(positions); 68 | tokens.emplace_back(t); 69 | return; 70 | } 71 | 72 | it->insert_positions(positions); 73 | } 74 | }; 75 | 76 | struct indexes { 77 | std::vector attributes; 78 | std::vector exact; 79 | std::vector negation; 80 | 81 | std::vector merge(const std::vector &our, const std::vector &other) const { 82 | std::map attrs; 83 | 84 | auto merge_one = [&] (const std::vector &v) { 85 | for (auto &a: v) { 86 | if (a.tokens.empty()) 87 | continue; 88 | 89 | auto it = attrs.find(a.name); 90 | if (it == attrs.end()) { 91 | attrs.insert(std::make_pair(a.name, a)); 92 | } else { 93 | for (auto &t: a.tokens) { 94 | it->second.insert(t.name, t.positions); 95 | } 96 | } 97 | } 98 | }; 99 | 100 | merge_one(our); 101 | merge_one(other); 102 | 103 | std::vector ret; 104 | ret.reserve(attrs.size()); 105 | for (auto &p: attrs) { 106 | ret.push_back(p.second); 107 | } 108 | return ret; 109 | } 110 | 111 | void merge_query(const indexes &other) { 112 | attributes = merge(attributes, other.attributes); 113 | } 114 | 115 | void merge_exact(const indexes &other) { 116 | exact = merge(exact, other.attributes); 117 | } 118 | 119 | void merge_negation(const indexes &other) { 120 | negation = merge(negation, other.attributes); 121 | } 122 | 123 | std::string to_string() const { 124 | std::ostringstream ss; 125 | 126 | auto dump_attributes = [] (const std::vector &v) { 127 | return dump_vector(v, [] (const attribute &a) -> std::string { 128 | std::ostringstream ss; 129 | ss << a.name; 130 | if (a.tokens.size()) { 131 | ss << "{"; 132 | for (size_t i = 0; i < a.tokens.size(); ++i) { 133 | auto &token = a.tokens[i]; 134 | ss << 
token.name; 135 | if (i != a.tokens.size() - 1) 136 | ss << " "; 137 | } 138 | ss << "}"; 139 | } 140 | return ss.str(); 141 | }); 142 | }; 143 | 144 | ss << "negation: [" << dump_attributes(negation) << "] " << 145 | "exact: [" << dump_attributes(exact) << "] " << 146 | "query: [" << dump_attributes(attributes) << "] "; 147 | return ss.str(); 148 | } 149 | 150 | static indexes get_indexes(const greylock::options &options, const rapidjson::Value &idxs) { 151 | indexes ireq; 152 | 153 | if (!idxs.IsObject()) 154 | return ireq; 155 | 156 | ribosome::split spl; 157 | for (rapidjson::Value::ConstMemberIterator it = idxs.MemberBegin(), idxs_end = idxs.MemberEnd(); it != idxs_end; ++it) { 158 | const char *aname = it->name.GetString(); 159 | const rapidjson::Value &avalue = it->value; 160 | 161 | if (!avalue.IsString()) 162 | continue; 163 | 164 | greylock::attribute a(aname); 165 | 166 | std::vector indexes = 167 | spl.convert_split_words(avalue.GetString(), avalue.GetStringLength()); 168 | for (size_t pos = 0; pos < indexes.size(); ++pos) { 169 | auto &idx = indexes[pos]; 170 | if (idx.size() >= options.ngram_index_size) { 171 | a.insert(ribosome::lconvert::to_string(idx), pos); 172 | } else { 173 | if (pos > 0) { 174 | auto &prev = indexes[pos - 1]; 175 | a.insert(ribosome::lconvert::to_string(prev + idx), pos); 176 | } 177 | 178 | if (pos < indexes.size() - 1) { 179 | auto &next = indexes[pos + 1]; 180 | a.insert(ribosome::lconvert::to_string(idx + next), pos); 181 | } 182 | } 183 | } 184 | 185 | ireq.attributes.emplace_back(a); 186 | } 187 | 188 | return ireq; 189 | } 190 | 191 | }; 192 | 193 | struct content { 194 | std::string content; 195 | std::string title; 196 | std::vector links; 197 | std::vector images; 198 | 199 | MSGPACK_DEFINE(content, title, links, images); 200 | }; 201 | 202 | struct document { 203 | id_t indexed_id; 204 | 205 | enum { 206 | serialize_version_7 = 7, 207 | }; 208 | 209 | std::string mbox; 210 | 211 | bool is_comment = false; 212 | 213 | 
std::string author; 214 | std::string id; 215 | 216 | content ctx; 217 | 218 | indexes idx; 219 | 220 | template 221 | void msgpack_pack(msgpack::packer &o) const { 222 | o.pack_array(document::serialize_version_7); 223 | o.pack((int)document::serialize_version_7); 224 | o.pack(is_comment); 225 | o.pack(author); 226 | o.pack(ctx); 227 | o.pack(id); 228 | o.pack(indexed_id); 229 | o.pack(0); // unused 230 | } 231 | 232 | void msgpack_unpack(msgpack::object o) { 233 | if (o.type != msgpack::type::ARRAY) { 234 | std::ostringstream ss; 235 | ss << "could not unpack document, object type is " << o.type << 236 | ", must be array (" << msgpack::type::ARRAY << ")"; 237 | throw std::runtime_error(ss.str()); 238 | } 239 | 240 | int version; 241 | 242 | msgpack::object *p = o.via.array.ptr; 243 | p[0].convert(&version); 244 | 245 | if (version != (int)o.via.array.size) { 246 | std::ostringstream ss; 247 | ss << "could not unpack document, invalid version: " << version << ", array size: " << o.via.array.size; 248 | throw std::runtime_error(ss.str()); 249 | } 250 | 251 | switch (version) { 252 | case document::serialize_version_7: 253 | p[1].convert(&is_comment); 254 | p[2].convert(&author); 255 | p[3].convert(&ctx); 256 | p[4].convert(&id); 257 | p[5].convert(&indexed_id); 258 | //p[6].convert(); unused 259 | break; 260 | default: { 261 | std::ostringstream ss; 262 | ss << "could not unpack document, invalid version " << version; 263 | throw std::runtime_error(ss.str()); 264 | } 265 | } 266 | } 267 | 268 | void assign_id(const char *cid, long seq, long tsec, long tnsec) { 269 | id.assign(cid); 270 | (void) tnsec; 271 | indexed_id.set_timestamp(tsec, seq); 272 | } 273 | 274 | void generate_token_keys(const options &options) { 275 | size_t shard_number = generate_shard_number(options, indexed_id); 276 | 277 | for (auto &attr: idx.attributes) { 278 | for (auto &t: attr.tokens) { 279 | std::string index_base = generate_index_base(options, mbox, attr.name, t.name); 280 | t.key = 
generate_index_key_shard_number(index_base, shard_number); 281 | t.shard_key = generate_shard_key(options, mbox, attr.name, t.name); 282 | 283 | t.shards.insert(shard_number); 284 | } 285 | } 286 | } 287 | 288 | static size_t generate_shard_number(const options &options, const id_t &indexed_id) { 289 | long tsec, tnsec; 290 | indexed_id.get_timestamp(&tsec, &tnsec); 291 | return tsec / options.tokens_shard_size; 292 | } 293 | 294 | static std::string generate_index_base(const options &options, 295 | const std::string &mbox, const std::string &attr, const std::string &token) { 296 | (void) options; 297 | char ckey[mbox.size() + attr.size() + token.size() + 5]; 298 | size_t csize = snprintf(ckey, sizeof(ckey), "%s.%s.%s", 299 | mbox.c_str(), attr.c_str(), token.c_str()); 300 | 301 | return std::string(ckey, csize); 302 | } 303 | 304 | static std::string generate_index_key_shard_number(const std::string &base, size_t sn) { 305 | char ckey[base.size() + 19]; 306 | size_t csize = snprintf(ckey, sizeof(ckey), "%016lx.%s", sn, base.c_str()); 307 | 308 | return std::string(ckey, csize); 309 | } 310 | static std::string generate_index_key(const options &options, const std::string &base, const id_t &indexed_id) { 311 | size_t shard_number = generate_shard_number(options, indexed_id); 312 | return generate_index_key_shard_number(base, shard_number); 313 | } 314 | static std::string generate_index_key(const options &options, 315 | const std::string &mbox, const std::string &attr, const std::string &token, 316 | const id_t &indexed_id) { 317 | std::string base = generate_index_base(options, mbox, attr, token); 318 | return generate_index_key(options, base, indexed_id); 319 | } 320 | 321 | static std::string generate_shard_key(const options &options, 322 | const std::string &mbox, const std::string &attr, const std::string &token) { 323 | return generate_index_base(options, mbox, attr, token); 324 | } 325 | }; 326 | 327 | }} // namespace ioremap::greylock 328 | 
-------------------------------------------------------------------------------- /include/greylock/utils.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "greylock/error.hpp" 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | 11 | namespace ioremap { namespace greylock { 12 | 13 | template 14 | std::string dump_vector(const std::vector &vec) { 15 | std::ostringstream ss; 16 | for (size_t i = 0; i < vec.size(); ++i) { 17 | ss << vec[i]; 18 | if (i != vec.size() - 1) 19 | ss << " "; 20 | } 21 | 22 | return ss.str(); 23 | } 24 | 25 | template 26 | std::string dump_vector(const std::vector &vec, std::function convert) { 27 | std::ostringstream ss; 28 | for (size_t i = 0; i < vec.size(); ++i) { 29 | ss << convert(vec[i]); 30 | if (i != vec.size() - 1) 31 | ss << " "; 32 | } 33 | 34 | return ss.str(); 35 | } 36 | 37 | template 38 | greylock::error_info deserialize(T &t, const char *data, size_t size) { 39 | msgpack::unpacked msg; 40 | try { 41 | msgpack::unpack(&msg, data, size); 42 | 43 | msg.get().convert(&t); 44 | } catch (const std::exception &e) { 45 | std::ostringstream ss; 46 | ss << msg.get(); 47 | return greylock::create_error(-EINVAL, "could not unpack data, size: %ld, value: %s, error: %s", 48 | size, ss.str().c_str(), e.what()); 49 | } 50 | 51 | return greylock::error_info(); 52 | } 53 | 54 | template 55 | std::string serialize(const T &t) { 56 | std::stringstream buffer; 57 | msgpack::pack(buffer, t); 58 | buffer.seekg(0); 59 | return buffer.str(); 60 | } 61 | 62 | }} // namesapce ioremap::greylock 63 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(greylock SHARED exception.cpp) 2 | set_target_properties(greylock PROPERTIES 3 | VERSION ${GREYLOCK_MAJOR_VERSION} 4 | SOVERSION ${GREYLOCK_FULL_VERSION} 5 | LINKER_LANGUAGE CXX 6 
| ) 7 | target_link_libraries(greylock 8 | ${Boost_LIBRARIES} 9 | ${JEMALLOC_LIBRARIES} 10 | ${MSGPACK_LIBRARIES} 11 | ${RIBOSOME_LIBRARIES} 12 | ${ROCKSDB_LIBRARIES} 13 | ${SWARM_LIBRARIES} 14 | ${THEVOID_LIBRARIES} 15 | pthread 16 | ) 17 | add_executable(greylock_server server.cpp) 18 | target_link_libraries(greylock_server 19 | greylock 20 | ) 21 | 22 | add_executable(greylock_list list.cpp) 23 | target_link_libraries(greylock_list 24 | greylock 25 | ) 26 | add_executable(greylock_meta meta.cpp) 27 | target_link_libraries(greylock_meta 28 | greylock 29 | ) 30 | add_executable(greylock_check check.cpp) 31 | target_link_libraries(greylock_check 32 | greylock 33 | ) 34 | add_executable(greylock_compact compact.cpp) 35 | target_link_libraries(greylock_compact 36 | greylock 37 | ) 38 | 39 | add_executable(greylock_merge merge.cpp) 40 | target_link_libraries(greylock_merge 41 | greylock 42 | ) 43 | 44 | install(TARGETS greylock 45 | LIBRARY DESTINATION lib${LIB_SUFFIX} 46 | ARCHIVE DESTINATION lib${LIB_SUFFIX} 47 | BUNDLE DESTINATION library 48 | ) 49 | install(TARGETS greylock_server greylock_meta greylock_check greylock_compact greylock_merge 50 | RUNTIME DESTINATION bin COMPONENT runtime 51 | ) 52 | 53 | -------------------------------------------------------------------------------- /src/check.cpp: -------------------------------------------------------------------------------- 1 | #include "greylock/database.hpp" 2 | #include "greylock/types.hpp" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include 10 | 11 | using namespace ioremap; 12 | 13 | static inline const char *print_time(long tsec, long tnsec) 14 | { 15 | char str[64]; 16 | struct tm tm; 17 | 18 | static __thread char __dnet_print_time[128]; 19 | 20 | localtime_r((time_t *)&tsec, &tm); 21 | strftime(str, sizeof(str), "%F %R:%S", &tm); 22 | 23 | snprintf(__dnet_print_time, sizeof(__dnet_print_time), "%s.%06llu", str, (long long unsigned) tnsec / 1000); 24 | return __dnet_print_time; 25 | } 26 | 
27 | class checker { 28 | public: 29 | checker(long print_interval) : m_print_interval(print_interval) { 30 | } 31 | 32 | void check(int column, const std::string &input) { 33 | std::unique_ptr dbu(new greylock::database()); 34 | auto err = dbu->open_read_only(input); 35 | if (err) { 36 | ribosome::throw_error(err.code(), "could not open input database: %s: %s", 37 | input.c_str(), err.message().c_str()); 38 | } 39 | 40 | printf("Input database %s has been opened\n", input.c_str()); 41 | 42 | rocksdb::ReadOptions ro; 43 | rocksdb::Iterator *it = dbu->iterator(column, ro); 44 | it->SeekToFirst(); 45 | 46 | printf("Input database %s has been positioned\n", input.c_str()); 47 | 48 | if (!it->Valid()) { 49 | auto s = it->status(); 50 | ribosome::throw_error(-s.code(), "iterator from database %s is not valid: %s [%d]", 51 | input.c_str(), s.ToString().c_str(), s.code()); 52 | } 53 | 54 | size_t prev_shard_number = 0; 55 | size_t shard_number = 0; 56 | size_t prev_documents = 0; 57 | size_t documents = 0; 58 | size_t shard_documents = 0; 59 | 60 | ribosome::timer tm, last_print; 61 | greylock::document doc; 62 | 63 | auto print_stats = [&] () -> char * { 64 | struct timespec ts; 65 | clock_gettime(CLOCK_REALTIME, &ts); 66 | 67 | static char tmp[1024]; 68 | 69 | snprintf(tmp, sizeof(tmp), 70 | "%s: %ld seconds: documents: %ld, speed: %.2f [%.2f] docs/s, " 71 | "shard: %ld, docs: %ld, id: %s, doc: %s", 72 | print_time(ts.tv_sec, ts.tv_nsec), 73 | tm.elapsed() / 1000, 74 | documents, 75 | (float)documents * 1000.0 / (float)tm.elapsed(), 76 | (float)(documents - prev_documents) * 1000.0 / (float)last_print.elapsed(), 77 | prev_shard_number, shard_documents, 78 | doc.indexed_id.to_string().c_str(), doc.id.c_str()); 79 | 80 | prev_documents = documents; 81 | last_print.restart(); 82 | return tmp; 83 | }; 84 | 85 | for (; it->Valid(); it->Next()) { 86 | auto sl = it->value(); 87 | 88 | auto gerr = deserialize(doc, sl.data(), sl.size()); 89 | if (gerr) { 90 | 
ribosome::throw_error(err.code(), "could not deserialize document, key: %s, size: %ld, error: %s [%d]", 91 | it->key().ToString().c_str(), sl.size(), gerr.message().c_str(), gerr.code()); 92 | } 93 | 94 | shard_number = greylock::document::generate_shard_number(greylock::options(), doc.indexed_id); 95 | if (shard_number > 10000) { 96 | printf("shard_number: %ld [%lx], id: %s, doc: %s\n", 97 | shard_number, shard_number, doc.indexed_id.to_string().c_str(), 98 | doc.id.c_str()); 99 | } 100 | 101 | if (shard_number < prev_shard_number) { 102 | printf("shard_number: %ld -> %ld, id: %s, doc: %s, error: shard number decreased\n", 103 | prev_shard_number, shard_number, doc.indexed_id.to_string().c_str(), 104 | doc.id.c_str()); 105 | } 106 | 107 | if ((last_print.elapsed() > m_print_interval) || (prev_shard_number != shard_number)) { 108 | std::cout << print_stats() << std::endl; 109 | } 110 | 111 | if (prev_shard_number != shard_number) { 112 | shard_documents = 0; 113 | } 114 | 115 | documents++; 116 | shard_documents++; 117 | 118 | prev_shard_number = shard_number; 119 | } 120 | std::cout << print_stats() << std::endl; 121 | } 122 | 123 | private: 124 | long m_print_interval; 125 | }; 126 | 127 | int main(int argc, char *argv[]) 128 | { 129 | namespace bpo = boost::program_options; 130 | 131 | bpo::options_description generic("Merge options"); 132 | 133 | std::string input; 134 | std::string column; 135 | long print_interval; 136 | generic.add_options() 137 | ("help", "This help message") 138 | ("column", bpo::value(&column)->required(), "Column name to check") 139 | ("input", bpo::value(&input)->required(), "Input rocksdb database") 140 | ("print-interval", bpo::value(&print_interval)->default_value(1000), "Period to dump merge stats (in milliseconds)") 141 | ; 142 | 143 | bpo::options_description cmdline_options; 144 | cmdline_options.add(generic); 145 | 146 | bpo::variables_map vm; 147 | 148 | try { 149 | bpo::store(bpo::command_line_parser(argc, 
argv).options(cmdline_options).run(), vm); 150 | 151 | if (vm.count("help")) { 152 | std::cout << generic << std::endl; 153 | return 0; 154 | } 155 | 156 | bpo::notify(vm); 157 | } catch (const std::exception &e) { 158 | std::cerr << "Invalid options: " << e.what() << "\n" << generic << std::endl; 159 | return -1; 160 | } 161 | 162 | greylock::options opt; 163 | auto it = std::find(opt.column_names.begin(), opt.column_names.end(), column); 164 | if (it == opt.column_names.end()) { 165 | std::cerr << "Invalig column " << column << ", supported columns: " << greylock::dump_vector(opt.column_names) << std::endl; 166 | return -EINVAL; 167 | } 168 | 169 | auto column_id = std::distance(opt.column_names.begin(), it); 170 | 171 | try { 172 | checker c(print_interval); 173 | c.check(column_id, input); 174 | } catch (const std::exception &e) { 175 | std::cerr << "Exception: " << e.what() << std::endl; 176 | return -1; 177 | } 178 | } 179 | -------------------------------------------------------------------------------- /src/compact.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "greylock/database.hpp" 4 | #include "greylock/types.hpp" 5 | 6 | #include 7 | 8 | #include 9 | 10 | using namespace ioremap; 11 | 12 | static inline const char *print_time(long tsec, long tnsec) 13 | { 14 | char str[64]; 15 | struct tm tm; 16 | 17 | static __thread char __dnet_print_time[128]; 18 | 19 | localtime_r((time_t *)&tsec, &tm); 20 | strftime(str, sizeof(str), "%F %R:%S", &tm); 21 | 22 | snprintf(__dnet_print_time, sizeof(__dnet_print_time), "%s.%06llu", str, (long long unsigned) tnsec / 1000); 23 | return __dnet_print_time; 24 | } 25 | 26 | int main(int argc, char *argv[]) 27 | { 28 | namespace bpo = boost::program_options; 29 | 30 | bpo::options_description generic("Database compact options"); 31 | generic.add_options() 32 | ("help", "this help message") 33 | ; 34 | 35 | 36 | std::string dpath; 37 | long csize_mb; 38 | 
std::string cname; 39 | bpo::options_description gr("Compaction options"); 40 | gr.add_options() 41 | ("path", bpo::value(&dpath)->required(), "path to rocksdb database") 42 | ("column", bpo::value(&cname)->required(), "Column name to compact") 43 | ("size", bpo::value(&csize_mb)->default_value(1024), "Number of MBs to compact in one chunk") 44 | ; 45 | 46 | bpo::options_description cmdline_options; 47 | cmdline_options.add(generic).add(gr); 48 | 49 | bpo::variables_map vm; 50 | 51 | try { 52 | bpo::store(bpo::command_line_parser(argc, argv).options(cmdline_options).run(), vm); 53 | 54 | if (vm.count("help")) { 55 | std::cout << cmdline_options << std::endl; 56 | return 0; 57 | } 58 | 59 | bpo::notify(vm); 60 | } catch (const std::exception &e) { 61 | std::cerr << "Invalid options: " << e.what() << "\n" << cmdline_options << std::endl; 62 | return -1; 63 | } 64 | 65 | greylock::options opt; 66 | auto it = std::find(opt.column_names.begin(), opt.column_names.end(), cname); 67 | if (it == opt.column_names.end()) { 68 | std::cerr << "Invalig column " << cname << ", supported columns: " << greylock::dump_vector(opt.column_names) << std::endl; 69 | return -EINVAL; 70 | } 71 | 72 | auto column_id = std::distance(opt.column_names.begin(), it); 73 | 74 | #define SECONDS(x) ((x) / 1000.) 
75 | 76 | try { 77 | ribosome::timer tm; 78 | 79 | greylock::database db; 80 | auto err = db.open_read_write(dpath); 81 | if (err) { 82 | std::cerr << "could not open database: " << err.message(); 83 | return err.code(); 84 | } 85 | long open_time = tm.elapsed(); 86 | printf("%.2fs : %.2fs: database has been opened\n", SECONDS(tm.elapsed()), SECONDS(open_time)); 87 | 88 | rocksdb::ReadOptions ro; 89 | auto it = db.iterator(column_id, ro); 90 | it->SeekToFirst(); 91 | long position_time = tm.elapsed() - open_time; 92 | printf("%.2fs : %.2fs: database has been positioned\n", SECONDS(tm.elapsed()), SECONDS(position_time)); 93 | 94 | if (!it->Valid()) { 95 | auto s = it->status(); 96 | fprintf(stderr, "iterator is not valid: %s [%d]", s.ToString().c_str(), s.code()); 97 | return -s.code(); 98 | } 99 | 100 | long compact_size = csize_mb * 1024 * 1024; 101 | 102 | long compaction_start_time = tm.elapsed(); 103 | while (it->Valid()) { 104 | long compaction_tmp_start_time = tm.elapsed(); 105 | 106 | long current_size = 0; 107 | rocksdb::Slice start, end; 108 | 109 | start = it->key(); 110 | while (it->Valid() && current_size < compact_size) { 111 | current_size += it->value().size(); 112 | end = it->key(); 113 | 114 | it->Next(); 115 | } 116 | 117 | db.compact(column_id, start, end); 118 | long compaction_time = tm.elapsed() - compaction_tmp_start_time; 119 | 120 | printf("%.2fs : %.2fs: compaction: start: %s, end: %s, size: %.2f MB\n", 121 | SECONDS(tm.elapsed()), SECONDS(compaction_time), 122 | start.ToString().c_str(), end.ToString().c_str(), 123 | current_size / (1024. 
* 1024.)); 124 | } 125 | 126 | if (!it->Valid()) { 127 | auto s = it->status(); 128 | if (s.code() != 0) { 129 | fprintf(stderr, "iterator has become invalid during iteration: %s [%d]", s.ToString().c_str(), s.code()); 130 | return -s.code(); 131 | } 132 | } 133 | 134 | long compaction_time = tm.elapsed() - compaction_start_time; 135 | 136 | printf("%.2fs : %.2fs: database has been compacted\n", SECONDS(tm.elapsed()), SECONDS(compaction_time)); 137 | } catch (const std::exception &e) { 138 | std::cerr << "Exception: " << e.what() << std::endl; 139 | } 140 | } 141 | 142 | -------------------------------------------------------------------------------- /src/exception.cpp: -------------------------------------------------------------------------------- 1 | #include "greylock/error.hpp" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | namespace ioremap { namespace greylock { 11 | 12 | error::error(int code, const std::string &message) throw() : m_errno(code), m_message(message) 13 | { 14 | } 15 | 16 | int error::error_code() const 17 | { 18 | return m_errno; 19 | } 20 | 21 | const char *error::what() const throw() 22 | { 23 | return m_message.c_str(); 24 | } 25 | 26 | std::string error::error_message() const throw() 27 | { 28 | return m_message; 29 | } 30 | 31 | not_found_error::not_found_error(const std::string &message) throw() 32 | : error(-ENOENT, message) 33 | { 34 | } 35 | 36 | timeout_error::timeout_error(const std::string &message) throw() 37 | : error(-ETIMEDOUT, message) 38 | { 39 | } 40 | 41 | no_such_address_error::no_such_address_error(const std::string &message) throw() 42 | : error(-ENXIO, message) 43 | { 44 | } 45 | 46 | void error_info::throw_error() const 47 | { 48 | switch (m_code) { 49 | case -ENOENT: 50 | throw not_found_error(m_message); 51 | break; 52 | case -ETIMEDOUT: 53 | throw timeout_error(m_message); 54 | break; 55 | case -ENOMEM: 56 | throw std::bad_alloc(); 57 | break; 58 | case -ENXIO: 59 | throw 
no_such_address_error(m_message); 60 | break; 61 | case 0: 62 | // Do nothing, it's not an error 63 | break; 64 | default: 65 | throw error(m_code, m_message); 66 | break; 67 | } 68 | } 69 | 70 | static error_info create_info(int err, const char *id, const char *format, va_list args) 71 | { 72 | if (err == -ENOMEM) 73 | return error_info(err, std::string()); 74 | 75 | std::ostringstream message; 76 | char buffer[1024]; 77 | const size_t buffer_size = sizeof(buffer); 78 | if (id) { 79 | message << id << ": "; 80 | } 81 | vsnprintf(buffer, buffer_size, format, args); 82 | buffer[buffer_size - 1] = '\0'; 83 | message << buffer << ": " << strerror(-err) << ": " << err; 84 | return error_info(err, message.str()); 85 | } 86 | 87 | void throw_error(int err, const char *format, ...) 88 | { 89 | va_list args; 90 | va_start(args, format); 91 | error_info error = create_info(err, 0, format, args); 92 | va_end(args); 93 | error.throw_error(); 94 | } 95 | 96 | error_info create_error(int err, const char *format, ...) 
97 | { 98 | va_list args; 99 | va_start(args, format); 100 | error_info error = create_info(err, 0, format, args); 101 | va_end(args); 102 | return error; 103 | } 104 | 105 | }} // namespace ioremap::greylock 106 | -------------------------------------------------------------------------------- /src/list.cpp: -------------------------------------------------------------------------------- 1 | #include "greylock/database.hpp" 2 | #include "greylock/types.hpp" 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include 9 | 10 | using namespace ioremap; 11 | 12 | static void list(const std::string &input, int column) { 13 | std::unique_ptr dbu(new greylock::database()); 14 | auto err = dbu->open_read_only(input); 15 | if (err) { 16 | ribosome::throw_error(err.code(), "could not open input database: %s: %s", 17 | input.c_str(), err.message().c_str()); 18 | } 19 | 20 | auto it = dbu->iterator(column, rocksdb::ReadOptions()); 21 | it->SeekToFirst(); 22 | 23 | if (!it->Valid()) { 24 | auto s = it->status(); 25 | ribosome::throw_error(-s.code(), "iterator from database %s is not valid: %s [%d]", 26 | input.c_str(), s.ToString().c_str(), s.code()); 27 | } 28 | 29 | long data_size = 0; 30 | long keys = 0; 31 | for (; it->Valid(); it->Next()) { 32 | keys++; 33 | data_size += it->value().size(); 34 | 35 | printf("merge: column: %s [%d], keys: %ld, total data size: %ld, key: %s, size: %ld\n", 36 | dbu->options().column_names[column].c_str(), column, keys, data_size, 37 | it->key().ToString().c_str(), it->value().size()); 38 | } 39 | 40 | } 41 | 42 | int main(int argc, char *argv[]) 43 | { 44 | namespace bpo = boost::program_options; 45 | 46 | bpo::options_description generic("List options"); 47 | 48 | std::string input; 49 | std::string column; 50 | generic.add_options() 51 | ("help", "This help message") 52 | ("column", bpo::value(&column)->required(), "Column name to merge") 53 | ("rocksdb", bpo::value(&input)->required(), "Input rocksdb database") 54 | ; 55 | 56 | 
bpo::options_description cmdline_options; 57 | cmdline_options.add(generic); 58 | 59 | bpo::variables_map vm; 60 | 61 | try { 62 | bpo::store(bpo::command_line_parser(argc, argv).options(cmdline_options).run(), vm); 63 | 64 | if (vm.count("help")) { 65 | std::cout << generic << std::endl; 66 | return 0; 67 | } 68 | 69 | bpo::notify(vm); 70 | } catch (const std::exception &e) { 71 | std::cerr << "Invalid options: " << e.what() << "\n" << generic << std::endl; 72 | return -1; 73 | } 74 | 75 | greylock::options opt; 76 | auto it = std::find(opt.column_names.begin(), opt.column_names.end(), column); 77 | if (it == opt.column_names.end()) { 78 | std::cerr << "Invalig column " << column << ", supported columns: " << greylock::dump_vector(opt.column_names) << std::endl; 79 | return -EINVAL; 80 | } 81 | 82 | auto column_id = std::distance(opt.column_names.begin(), it); 83 | 84 | try { 85 | list(input, column_id); 86 | } catch (const std::exception &e) { 87 | std::cerr << "Exception: " << e.what() << std::endl; 88 | return -1; 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/merge.cpp: -------------------------------------------------------------------------------- 1 | #include "greylock/database.hpp" 2 | #include "greylock/types.hpp" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | #include 10 | 11 | using namespace ioremap; 12 | 13 | static inline const char *print_time(long tsec, long tnsec) 14 | { 15 | char str[64]; 16 | struct tm tm; 17 | 18 | static __thread char __dnet_print_time[128]; 19 | 20 | localtime_r((time_t *)&tsec, &tm); 21 | strftime(str, sizeof(str), "%F %R:%S", &tm); 22 | 23 | snprintf(__dnet_print_time, sizeof(__dnet_print_time), "%s.%06llu", str, (long long unsigned) tnsec / 1000); 24 | return __dnet_print_time; 25 | } 26 | 27 | class merger { 28 | public: 29 | merger(long print_interval) : m_print_interval(print_interval) { 30 | } 31 | 32 | void merge(int column, const std::string &output, 
const std::vector &inputs, bool compact) { 33 | ribosome::timer tm; 34 | 35 | greylock::database odb; 36 | auto err = odb.open_read_write(output); 37 | if (err) { 38 | ribosome::throw_error(err.code(), "could not open output database: %s: %s", 39 | output.c_str(), err.message().c_str()); 40 | } 41 | 42 | printf("Output database %s has been opened\n", output.c_str()); 43 | 44 | std::vector> dbs; 45 | std::vector its; 46 | rocksdb::ReadOptions ro; 47 | 48 | for (auto &path: inputs) { 49 | std::unique_ptr dbu(new greylock::database()); 50 | err = dbu->open_read_only(path); 51 | if (err) { 52 | ribosome::throw_error(err.code(), "could not open input database: %s: %s", 53 | path.c_str(), err.message().c_str()); 54 | } 55 | 56 | printf("Input database %s has been opened\n", path.c_str()); 57 | 58 | auto it = dbu->iterator(column, ro); 59 | it->SeekToFirst(); 60 | 61 | printf("Input database %s has been positioned\n", path.c_str()); 62 | 63 | if (!it->Valid()) { 64 | auto s = it->status(); 65 | ribosome::throw_error(-s.code(), "iterator from database %s is not valid: %s [%d]", 66 | path.c_str(), s.ToString().c_str(), s.code()); 67 | } 68 | 69 | its.emplace_back(it); 70 | dbs.emplace_back(std::move(dbu)); 71 | } 72 | 73 | auto cmp = rocksdb::BytewiseComparator(); 74 | 75 | long data_size = 0; 76 | long written_keys = 0; 77 | std::string first_key, last_key; 78 | long prev_written_keys = 0; 79 | long prev_data_size = 0; 80 | 81 | ribosome::timer merge_tm; 82 | 83 | auto print_stats = [&] () { 84 | struct timespec ts; 85 | clock_gettime(CLOCK_REALTIME, &ts); 86 | 87 | float kspeed = (float)written_keys * 1000.0 / (float)merge_tm.elapsed(); 88 | float kspeed_moment = (float)(written_keys - prev_written_keys) * 1000.0 / (float)tm.elapsed(); 89 | 90 | float dspeed = (float)data_size * 1000.0 / (float)merge_tm.elapsed() / (1024.0 * 1024.0); 91 | float dspeed_moment = (float)(data_size - prev_data_size) * 1000.0 / (float)tm.elapsed() / (1024.0 * 1024.0); 92 | 93 | printf("%s: 
column: %s [%d], written keys: %ld, speed: %.2f [%.2f] keys/s, " 94 | "written data size: %.2f MBs, speed: %.2f [%.2f] MB/s, " 95 | "first_key: %s, last_key: %s\n", 96 | print_time(ts.tv_sec, ts.tv_nsec), 97 | odb.options().column_names[column].c_str(), column, 98 | written_keys, kspeed, kspeed_moment, 99 | (float)data_size / (1024.0 * 1024.0), dspeed, dspeed_moment, 100 | first_key.c_str(), last_key.c_str()); 101 | 102 | prev_written_keys = written_keys; 103 | prev_data_size = data_size; 104 | tm.restart(); 105 | }; 106 | 107 | while (true) { 108 | rocksdb::Slice key; 109 | std::vector positions; 110 | std::vector to_remove; 111 | 112 | for (size_t pos = 0; pos < its.size(); ++pos) { 113 | auto &it = its[pos]; 114 | if (!it->Valid()) { 115 | to_remove.push_back(pos); 116 | continue; 117 | } 118 | 119 | if (key.size() == 0) { 120 | key = it->key(); 121 | positions.push_back(pos); 122 | continue; 123 | } 124 | 125 | int cval = cmp->Compare(it->key(), key); 126 | if (cval < 0) { 127 | key = it->key(); 128 | positions.clear(); 129 | positions.push_back(pos); 130 | continue; 131 | } 132 | 133 | if (cval > 0) { 134 | continue; 135 | } 136 | 137 | positions.push_back(pos); 138 | } 139 | 140 | if (key.size() == 0) 141 | break; 142 | 143 | rocksdb::WriteBatch batch; 144 | 145 | long ds = 0; 146 | for (auto pos: positions) { 147 | auto &it = its[pos]; 148 | 149 | if ((column == greylock::options::token_shards_column) || (column == greylock::options::indexes_column)) { 150 | batch.Merge(odb.cfhandle(column), key, it->value()); 151 | } else { 152 | batch.Put(odb.cfhandle(column), key, it->value()); 153 | } 154 | ds += it->value().size(); 155 | } 156 | 157 | err = odb.write(&batch); 158 | if (err) { 159 | ribosome::throw_error(err.code(), "key: %s, inputs: %s, could not write batch of %ld elements: %s", 160 | key.ToString().c_str(), greylock::dump_vector(positions).c_str(), 161 | positions.size(), err.message().c_str()); 162 | } 163 | 164 | if (written_keys == 0) { 165 | 
first_key = key.ToString(); 166 | } 167 | 168 | written_keys++; 169 | data_size += ds; 170 | last_key = key.ToString(); 171 | 172 | for (auto pos: positions) { 173 | auto &it = its[pos]; 174 | it->Next(); 175 | } 176 | 177 | for (auto it = to_remove.rbegin(); it != to_remove.rend(); ++it) { 178 | printf("Input file %s has been processed\n", inputs[*it].c_str()); 179 | its.erase(its.begin() + (*it)); 180 | } 181 | 182 | if (tm.elapsed() > m_print_interval) { 183 | print_stats(); 184 | } 185 | } 186 | 187 | print_stats(); 188 | 189 | if (compact) { 190 | struct timespec ts; 191 | 192 | clock_gettime(CLOCK_REALTIME, &ts); 193 | printf("%s: starting compaction\n", print_time(ts.tv_sec, ts.tv_nsec)); 194 | tm.restart(); 195 | 196 | odb.compact(); 197 | clock_gettime(CLOCK_REALTIME, &ts); 198 | printf("%s: compaction 1 took %.1f seconds\n", print_time(ts.tv_sec, ts.tv_nsec), tm.restart() / 1000.0); 199 | 200 | odb.compact(); 201 | clock_gettime(CLOCK_REALTIME, &ts); 202 | printf("%s: compaction 2 took %.1f seconds\n", print_time(ts.tv_sec, ts.tv_nsec), tm.restart() / 1000.0); 203 | } 204 | } 205 | private: 206 | long m_print_interval; 207 | }; 208 | 209 | int main(int argc, char *argv[]) 210 | { 211 | namespace bpo = boost::program_options; 212 | 213 | bpo::options_description generic("Merge options"); 214 | 215 | std::string output; 216 | std::vector inputs; 217 | int thread_num; 218 | std::string column; 219 | long print_interval; 220 | generic.add_options() 221 | ("help", "This help message") 222 | ("column", bpo::value(&column)->required(), "Column name to merge") 223 | ("compact", "Whether to compact output database or not") 224 | ("input", bpo::value>(&inputs)->required()->composing(), "Input rocksdb database") 225 | ("output", bpo::value(&output)->required(), "Output rocksdb database") 226 | ("threads", bpo::value(&thread_num)->default_value(8), "Number of merge threads") 227 | ("print-interval", bpo::value(&print_interval)->default_value(10000), "Period to dump 
merge stats (in milliseconds)") 228 | ; 229 | 230 | bpo::options_description cmdline_options; 231 | cmdline_options.add(generic); 232 | 233 | bpo::variables_map vm; 234 | 235 | try { 236 | bpo::store(bpo::command_line_parser(argc, argv).options(cmdline_options).run(), vm); 237 | 238 | if (vm.count("help")) { 239 | std::cout << generic << std::endl; 240 | return 0; 241 | } 242 | 243 | bpo::notify(vm); 244 | } catch (const std::exception &e) { 245 | std::cerr << "Invalid options: " << e.what() << "\n" << generic << std::endl; 246 | return -1; 247 | } 248 | 249 | greylock::options opt; 250 | auto it = std::find(opt.column_names.begin(), opt.column_names.end(), column); 251 | if (it == opt.column_names.end()) { 252 | std::cerr << "Invalid column " << column << ", supported columns: " << greylock::dump_vector(opt.column_names) << std::endl; 253 | return -EINVAL; 254 | } 255 | 256 | auto column_id = std::distance(opt.column_names.begin(), it); 257 | 258 | try { 259 | merger m(print_interval); 260 | m.merge(column_id, output, inputs, vm.count("compact") != 0); 261 | } catch (const std::exception &e) { 262 | std::cerr << "Exception: " << e.what() << std::endl; 263 | return -1; 264 | } 265 | } 266 | -------------------------------------------------------------------------------- /src/meta.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "greylock/database.hpp" 4 | #include "greylock/types.hpp" 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | using namespace ioremap; 13 | 14 | static inline const char *print_time(long tsec, long tnsec) 15 | { 16 | char str[64]; 17 | struct tm tm; 18 | 19 | static __thread char __dnet_print_time[128]; 20 | 21 | localtime_r((time_t *)&tsec, &tm); 22 | strftime(str, sizeof(str), "%F %R:%S", &tm); 23 | 24 | snprintf(__dnet_print_time, sizeof(__dnet_print_time), "%s.%06llu", str, (long long unsigned) tnsec / 1000); 25 | return __dnet_print_time; 26 | } 27 | 
28 | int main(int argc, char *argv[]) 29 | { 30 | namespace bpo = boost::program_options; 31 | 32 | bpo::options_description generic("Index metadata reader options"); 33 | generic.add_options() 34 | ("help", "this help message") 35 | ; 36 | 37 | 38 | std::string dpath, ipath; 39 | std::string iname; 40 | bool dump = false; 41 | std::string id_str; 42 | std::string save_prefix; 43 | bpo::options_description gr("Greylock index options"); 44 | gr.add_options() 45 | ("index", bpo::value(&iname), "index name, format: mailbox.attribute.index") 46 | ("id", bpo::value(&id_str), "read document with this indexed ID, format: ts") 47 | ("save", bpo::value(&save_prefix), "save index data into this directory") 48 | ("rocksdb.docs", bpo::value(&dpath), 49 | "path to rocksdb containing documents, " 50 | "will be opened in read-only mode, safe to be called if different process is already using it") 51 | ("rocksdb.indexes", bpo::value(&ipath)->required(), 52 | "path to rocksdb containing indexes, " 53 | "will be opened in read-only mode, safe to be called if different process is already using it") 54 | ("dump", "dump document data to stdout") 55 | ; 56 | 57 | bpo::options_description cmdline_options; 58 | cmdline_options.add(generic).add(gr); 59 | 60 | bpo::variables_map vm; 61 | 62 | try { 63 | bpo::store(bpo::command_line_parser(argc, argv).options(cmdline_options).run(), vm); 64 | 65 | if (vm.count("help")) { 66 | std::cout << cmdline_options << std::endl; 67 | return 0; 68 | } 69 | 70 | bpo::notify(vm); 71 | } catch (const std::exception &e) { 72 | std::cerr << "Invalid options: " << e.what() << "\n" << cmdline_options << std::endl; 73 | return -1; 74 | } 75 | 76 | if (vm.count("dump")) { 77 | dump = true; 78 | } 79 | 80 | if (dump || vm.count("id")) { 81 | if (dpath.empty()) { 82 | std::cerr << "You must provide documents database when using dump or id option\n" << cmdline_options << std::endl; 83 | return -1; 84 | } 85 | } 86 | 87 | try { 88 | greylock::database db; 89 | auto 
err = db.open_read_only(ipath); 90 | if (err) { 91 | std::cerr << "could not open database: " << err.message(); 92 | return err.code(); 93 | } 94 | 95 | greylock::database db_docs; 96 | if (dpath.size()) { 97 | auto err = db_docs.open_read_only(dpath); 98 | if (err) { 99 | std::cerr << "could not open database: " << err.message(); 100 | return err.code(); 101 | } 102 | } 103 | 104 | ribosome::timer tm; 105 | 106 | auto print_index = [&](const greylock::id_t &id) -> std::string { 107 | long tsec, aux; 108 | id.get_timestamp(&tsec, &aux); 109 | 110 | std::ostringstream ss; 111 | ss << id.to_string() << 112 | ", raw_ts: " << id.timestamp << 113 | ", aux: " << aux << 114 | ", ts: " << print_time(tsec, 0); 115 | return ss.str(); 116 | }; 117 | 118 | auto print_doc = [&](const greylock::document &doc) -> std::string { 119 | std::ostringstream ss; 120 | 121 | ss << "id: " << doc.id << ", author: " << doc.author; 122 | 123 | ss << "\n content: " << doc.ctx.content; 124 | ss << "\n title: " << doc.ctx.title; 125 | ss << "\n links: " << greylock::dump_vector(doc.ctx.links); 126 | ss << "\n images: " << greylock::dump_vector(doc.ctx.images); 127 | 128 | return ss.str(); 129 | }; 130 | 131 | if (vm.count("index")) { 132 | std::vector cmp; 133 | size_t pos = 0; 134 | for (int i = 0; i < 2; ++i) { 135 | size_t dot = iname.find('.', pos); 136 | if (dot == std::string::npos) { 137 | std::cerr << "invalid index name " << iname << ", must be mailbox.attribute.index" << std::endl; 138 | return -1; 139 | } 140 | 141 | cmp.push_back(iname.substr(pos, dot - pos)); 142 | pos = dot + 1; 143 | } 144 | cmp.push_back(iname.substr(pos)); 145 | 146 | if (save_prefix.size()) { 147 | boost::system::error_code ec; 148 | std::string dname = save_prefix + "/" + iname; 149 | boost::filesystem::create_directories(dname, ec); 150 | if (ec && ec != boost::system::errc::file_exists) { 151 | fprintf(stderr, "could not create directory %s: %s [%d]\n", 152 | dname.c_str(), ec.message().c_str(), 
ec.value()); 153 | return -ec.value(); 154 | } 155 | 156 | save_prefix = dname; 157 | } 158 | 159 | const std::string &mbox = cmp[0]; 160 | const std::string &attr = cmp[1]; 161 | const std::string &token = cmp[2]; 162 | 163 | std::string index_base = greylock::document::generate_index_base(db.options(), mbox, attr, token); 164 | std::string skey = greylock::document::generate_shard_key(db.options(), mbox, attr, token); 165 | std::vector shards(db.get_shards(skey)); 166 | 167 | if (save_prefix.size()) { 168 | std::ofstream sout(save_prefix + "/shards.bin", std::ios::trunc); 169 | std::string sdata; 170 | auto err = db.read(greylock::options::token_shards_column, skey, &sdata); 171 | if (err) { 172 | fprintf(stderr, "could not read shards %s: %s [%d]\n", 173 | skey.c_str(), err.message().c_str(), err.code()); 174 | return err.code(); 175 | } 176 | 177 | sout.write(sdata.data(), sdata.size()); 178 | } 179 | 180 | std::set sidx; 181 | 182 | std::cout << "Number of shards: " << shards.size() << ", shards: " << greylock::dump_vector(shards) << std::endl; 183 | for (auto shard_number: shards) { 184 | std::string ikey = greylock::document::generate_index_key_shard_number(index_base, shard_number); 185 | std::string idata; 186 | auto err = db.read(greylock::options::indexes_column, ikey, &idata); 187 | if (err) { 188 | fprintf(stderr, "could not read index %s: %s [%d]\n", 189 | ikey.c_str(), err.message().c_str(), err.code()); 190 | return err.code(); 191 | } 192 | 193 | if (save_prefix.size()) { 194 | std::ofstream sout(save_prefix + "/idx_shard." 
+ std::to_string(shard_number), std::ios::trunc); 195 | sout.write(idata.data(), idata.size()); 196 | } 197 | 198 | 199 | greylock::disk_index idx; 200 | err = greylock::deserialize(idx, idata.data(), idata.size()); 201 | if (err) { 202 | fprintf(stderr, "could not deserialize index %s, size: %ld: %s [%d]\n", 203 | ikey.c_str(), idata.size(), err.message().c_str(), err.code()); 204 | return err.code(); 205 | } 206 | 207 | std::cout << "shard: " << shard_number << ", indexes: " << idx.ids.size() << std::endl; 208 | sidx.insert(idx.ids.begin(), idx.ids.end()); 209 | 210 | for (auto &id: idx.ids) { 211 | std::cout << "indexed_id: " << print_index(id.indexed_id); 212 | if (dump) { 213 | greylock::document doc; 214 | 215 | std::string doc_data; 216 | std::string dkey = id.indexed_id.to_string(); 217 | auto err = db_docs.read(greylock::options::documents_column, dkey, &doc_data); 218 | if (err) { 219 | fprintf(stderr, "could not read document %s: %s [%d]\n", 220 | dkey.c_str(), err.message().c_str(), err.code()); 221 | return err.code(); 222 | } 223 | 224 | err = greylock::deserialize(doc, doc_data.data(), doc_data.size()); 225 | if (err) { 226 | fprintf(stderr, "could not deserialize document %s, size: %ld: %s [%d]\n", 227 | dkey.c_str(), doc_data.size(), err.message().c_str(), err.code()); 228 | return err.code(); 229 | } 230 | 231 | std::cout << ", doc: " << print_doc(doc); 232 | } 233 | 234 | std::cout << std::endl; 235 | } 236 | } 237 | 238 | if (save_prefix.size()) { 239 | greylock::disk_index idx; 240 | idx.ids.insert(idx.ids.begin(), sidx.begin(), sidx.end()); 241 | 242 | std::ofstream sout(save_prefix + "/idx_merged.bin", std::ios::trunc); 243 | std::string mdata = serialize(idx); 244 | sout.write(mdata.data(), mdata.size()); 245 | } 246 | 247 | } 248 | 249 | if (vm.count("id")) { 250 | greylock::id_t indexed_id(id_str.c_str()); 251 | 252 | std::string doc_data; 253 | auto err = db_docs.read(greylock::options::documents_column, indexed_id.to_string(), 
&doc_data); 254 | if (err) { 255 | std::cout << "could not read document with indexed_id: " << id_str << 256 | ", error: " << err.message() << std::endl; 257 | return err.code(); 258 | } 259 | 260 | greylock::document doc; 261 | err = greylock::deserialize(doc, doc_data.data(), doc_data.size()); 262 | if (err) { 263 | std::cout << "could not deserialize document with indexed_id: " << id_str << 264 | ", data_size: " << doc_data.size() << 265 | ", error: " << err.message() << std::endl; 266 | return err.code(); 267 | } 268 | 269 | std::cout << "indexed_id: " << print_index(doc.indexed_id) << 270 | ", doc: " << print_doc(doc) << std::endl; 271 | } 272 | 273 | printf("Operation took %.2f seconds\n", tm.elapsed() / 1000.); 274 | } catch (const std::exception &e) { 275 | std::cerr << "Exception: " << e.what() << std::endl; 276 | } 277 | } 278 | -------------------------------------------------------------------------------- /src/server.cpp: -------------------------------------------------------------------------------- 1 | #include "greylock/database.hpp" 2 | #include "greylock/error.hpp" 3 | #include "greylock/json.hpp" 4 | #include "greylock/jsonvalue.hpp" 5 | #include "greylock/intersection.hpp" 6 | #include "greylock/types.hpp" 7 | #include "greylock/utils.hpp" 8 | 9 | #include 10 | #include 11 | 12 | #include 13 | #include 14 | 15 | #include 16 | #include 17 | #include 18 | 19 | #include 20 | 21 | #include 22 | 23 | #include 24 | #include 25 | #include 26 | 27 | #define ILOG(level, a...) BH_LOG(logger(), level, ##a) 28 | #define ILOG_ERROR(a...) ILOG(SWARM_LOG_ERROR, ##a) 29 | #define ILOG_WARNING(a...) ILOG(SWARM_LOG_WARNING, ##a) 30 | #define ILOG_INFO(a...) ILOG(SWARM_LOG_INFO, ##a) 31 | #define ILOG_NOTICE(a...) ILOG(SWARM_LOG_NOTICE, ##a) 32 | #define ILOG_DEBUG(a...) 
ILOG(SWARM_LOG_DEBUG, ##a) 33 | 34 | using namespace ioremap; 35 | 36 | template 37 | struct simple_request_stream_error : public thevoid::simple_request_stream { 38 | void send_error(int status, int error, const char *fmt, ...) { 39 | va_list args; 40 | va_start(args, fmt); 41 | 42 | char buffer[1024]; 43 | int sz = vsnprintf(buffer, sizeof(buffer), fmt, args); 44 | 45 | BH_LOG(this->server()->logger(), SWARM_LOG_ERROR, "%s: %d", buffer, error); 46 | 47 | greylock::JsonValue val; 48 | rapidjson::Value ev(rapidjson::kObjectType); 49 | 50 | 51 | rapidjson::Value esv(buffer, sz, val.GetAllocator()); 52 | ev.AddMember("message", esv, val.GetAllocator()); 53 | ev.AddMember("code", error, val.GetAllocator()); 54 | val.AddMember("error", ev, val.GetAllocator()); 55 | 56 | va_end(args); 57 | 58 | std::string data = val.ToString(); 59 | 60 | thevoid::http_response http_reply; 61 | http_reply.set_code(status); 62 | http_reply.headers().set_content_length(data.size()); 63 | http_reply.headers().set_content_type("text/json"); 64 | 65 | this->send_reply(std::move(http_reply), std::move(data)); 66 | } 67 | }; 68 | 69 | class http_server : public thevoid::server 70 | { 71 | public: 72 | virtual ~http_server() { 73 | } 74 | 75 | virtual bool initialize(const rapidjson::Value &config) { 76 | if (!rocksdb_init(config)) 77 | return false; 78 | 79 | on( 80 | options::exact_match("/ping"), 81 | options::methods("GET") 82 | ); 83 | 84 | on( 85 | options::exact_match("/compact"), 86 | options::methods("POST", "PUT") 87 | ); 88 | 89 | on( 90 | options::exact_match("/index"), 91 | options::methods("POST", "PUT") 92 | ); 93 | 94 | on( 95 | options::exact_match("/search"), 96 | options::methods("POST", "PUT") 97 | ); 98 | 99 | return true; 100 | } 101 | 102 | struct on_ping : public simple_request_stream_error { 103 | virtual void on_request(const thevoid::http_request &req, const boost::asio::const_buffer &buffer) { 104 | (void) buffer; 105 | (void) req; 106 | 107 | 
this->send_reply(thevoid::http_response::ok); 108 | } 109 | }; 110 | 111 | struct on_compact : public simple_request_stream_error { 112 | virtual void on_request(const thevoid::http_request &req, const boost::asio::const_buffer &buffer) { 113 | (void) req; 114 | (void) buffer; 115 | 116 | server()->db_docs().compact(); 117 | server()->db_indexes().compact(); 118 | this->send_reply(thevoid::http_response::ok); 119 | } 120 | }; 121 | 122 | struct on_search : public simple_request_stream_error { 123 | bool check_negation(const std::vector &tokens, const std::vector &content) { 124 | for (const auto &t: tokens) { 125 | for (const auto &word: content) { 126 | if (t.name == word) { 127 | return true; 128 | } 129 | } 130 | } 131 | 132 | return false; 133 | } 134 | 135 | bool check_exact(const std::vector &tokens, const std::vector &content) { 136 | auto check_token_positions = [] (const greylock::token &token, 137 | const std::vector &content, size_t content_offset) -> bool { 138 | for (size_t pos: token.positions) { 139 | size_t offset = content_offset + pos; 140 | if (offset >= content.size()) { 141 | return false; 142 | } 143 | 144 | if (token.name != content[offset]) { 145 | return false; 146 | } 147 | } 148 | 149 | return true; 150 | }; 151 | 152 | for (size_t content_offset = 0; content_offset < content.size(); ++content_offset) { 153 | bool match = true; 154 | 155 | for (const auto &token: tokens) { 156 | match = check_token_positions(token, content, content_offset); 157 | if (!match) 158 | break; 159 | } 160 | 161 | if (match) 162 | return true; 163 | } 164 | 165 | return false; 166 | } 167 | 168 | std::vector split_content(const std::string &content) { 169 | std::vector ret; 170 | 171 | ribosome::html_parser html; 172 | html.feed_text(content); 173 | 174 | ribosome::split spl; 175 | for (auto &t: html.tokens()) { 176 | ribosome::lstring lt = ribosome::lconvert::from_utf8(t); 177 | auto lower_request = ribosome::lconvert::to_lower(lt); 178 | 179 | auto all_words = 
spl.convert_split_words(lower_request, ".:,"); 180 | for (auto &word: all_words) { 181 | ret.emplace_back(ribosome::lconvert::to_string(word)); 182 | } 183 | } 184 | 185 | return ret; 186 | } 187 | 188 | // returns true if record has to be accepted, false - if record must be dropped 189 | bool check_result(const greylock::intersection_query &iq, greylock::single_doc_result &sd) { 190 | const greylock::document &doc = sd.doc; 191 | 192 | for (const auto &ent: iq.se) { 193 | for (const auto &attr: ent.idx.exact) { 194 | bool match; 195 | 196 | if (attr.name.find("title") != std::string::npos) { 197 | match = check_exact(attr.tokens, split_content(doc.ctx.title)); 198 | } else { 199 | match = check_exact(attr.tokens, split_content(doc.ctx.content)); 200 | } 201 | 202 | if (!match) 203 | return false; 204 | } 205 | } 206 | 207 | return true; 208 | } 209 | 210 | virtual void on_request(const thevoid::http_request &req, const boost::asio::const_buffer &buffer) { 211 | (void) req; 212 | 213 | ribosome::timer search_tm; 214 | 215 | // this is needed to put ending zero-byte, otherwise rapidjson parser will explode 216 | std::string data(const_cast(boost::asio::buffer_cast(buffer)), 217 | boost::asio::buffer_size(buffer)); 218 | 219 | rapidjson::Document doc; 220 | doc.Parse<0>(data.c_str()); 221 | 222 | if (doc.HasParseError()) { 223 | send_error(swarm::http_response::bad_request, -EINVAL, 224 | "search: could not parse document: %s, error offset: %d", 225 | doc.GetParseError(), doc.GetErrorOffset()); 226 | return; 227 | } 228 | if (!doc.IsObject()) { 229 | send_error(swarm::http_response::bad_request, -EINVAL, "search: document must be object"); 230 | return; 231 | } 232 | 233 | greylock::intersection_query iq; 234 | 235 | const auto &paging = greylock::get_object(doc, "paging"); 236 | if (paging.IsObject()) { 237 | iq.next_document_id = greylock::id_t(greylock::get_string(paging, "next_document_id")); 238 | iq.max_number = greylock::get_int64(paging, "max_number", 
LONG_MAX); 239 | } 240 | 241 | long sec_start = 0, sec_end = LONG_MAX; 242 | const auto &time = greylock::get_object(doc, "time"); 243 | if (time.IsObject()) { 244 | sec_start = greylock::get_int64(time, "start", sec_start); 245 | sec_end = greylock::get_int64(time, "end", sec_end); 246 | } 247 | iq.range_start.set_timestamp(sec_start, 0); 248 | iq.range_end.set_timestamp(sec_end, 0); 249 | 250 | 251 | std::vector se; 252 | const auto &request = greylock::get_object(doc, "request"); 253 | if (!request.IsObject()) { 254 | send_error(swarm::http_response::bad_request, -EINVAL, "search: document must contain 'request' object"); 255 | return; 256 | } 257 | 258 | for (auto it = request.MemberBegin(), jse_end = request.MemberEnd(); it != jse_end; ++it) { 259 | if (!it->value.IsObject()) { 260 | send_error(swarm::http_response::bad_request, -EINVAL, 261 | "search: mailbox query '%s' must contain object", 262 | it->name.GetString()); 263 | return; 264 | } 265 | 266 | greylock::mailbox_query q(server()->db_indexes().options(), it->value); 267 | if (q.parse_error) { 268 | send_error(swarm::http_response::bad_request, q.parse_error.code(), 269 | "search: could not parse mailbox query: %s", 270 | q.parse_error.message().c_str()); 271 | return; 272 | } 273 | 274 | q.mbox.assign(it->name.GetString(), it->name.GetStringLength()); 275 | 276 | iq.se.emplace_back(std::move(q)); 277 | } 278 | 279 | greylock::search_result result; 280 | greylock::intersector inter(server()->db_docs(), server()->db_indexes()); 281 | result = inter.intersect(iq, std::bind(&on_search::check_result, this, std::ref(iq), std::placeholders::_1)); 282 | 283 | send_search_result(result); 284 | 285 | ILOG_INFO("search: query: %s, next_document_id: %s -> %s, indexes: %ld/%ld, completed: %d, duration: %d ms", 286 | iq.to_string().c_str(), 287 | iq.next_document_id.to_string().c_str(), result.next_document_id.to_string().c_str(), 288 | result.docs.size(), iq.max_number, 289 | result.completed, 
search_tm.elapsed()); 290 | } 291 | 292 | void pack_string_array(rapidjson::Value &parent, rapidjson::Document::AllocatorType &allocator, 293 | const char *name, const std::vector &data) { 294 | rapidjson::Value arr(rapidjson::kArrayType); 295 | for (const auto &s: data) { 296 | rapidjson::Value v(s.c_str(), s.size(), allocator); 297 | arr.PushBack(v, allocator); 298 | } 299 | 300 | parent.AddMember(name, arr, allocator); 301 | } 302 | 303 | template 304 | void pack_simple_array(rapidjson::Value &parent, rapidjson::Document::AllocatorType &allocator, 305 | const char *name, const std::vector &data) { 306 | rapidjson::Value arr(rapidjson::kArrayType); 307 | for (const auto &s: data) { 308 | arr.PushBack(s, allocator); 309 | } 310 | 311 | parent.AddMember(name, arr, allocator); 312 | } 313 | 314 | void send_search_result(const greylock::search_result &result) { 315 | greylock::JsonValue ret; 316 | auto &allocator = ret.GetAllocator(); 317 | 318 | rapidjson::Value ids(rapidjson::kArrayType); 319 | for (auto it = result.docs.begin(), end = result.docs.end(); it != end; ++it) { 320 | rapidjson::Value key(rapidjson::kObjectType); 321 | 322 | const greylock::document &doc = it->doc; 323 | 324 | rapidjson::Value idv(doc.id.c_str(), doc.id.size(), allocator); 325 | key.AddMember("id", idv, allocator); 326 | 327 | std::string id_str = doc.indexed_id.to_string(); 328 | rapidjson::Value indv(id_str.c_str(), id_str.size(), allocator); 329 | key.AddMember("indexed_id", indv, allocator); 330 | 331 | rapidjson::Value av(doc.author.c_str(), doc.author.size(), allocator); 332 | key.AddMember("author", av, allocator); 333 | 334 | rapidjson::Value cv(rapidjson::kObjectType); 335 | 336 | rapidjson::Value csv(doc.ctx.content.c_str(), doc.ctx.content.size(), allocator); 337 | cv.AddMember("content", csv, allocator); 338 | 339 | rapidjson::Value tsv(doc.ctx.title.c_str(), doc.ctx.title.size(), allocator); 340 | cv.AddMember("title", tsv, allocator); 341 | 342 | pack_string_array(cv, 
allocator, "links", doc.ctx.links); 343 | pack_string_array(cv, allocator, "images", doc.ctx.images); 344 | key.AddMember("content", cv, allocator); 345 | 346 | key.AddMember("relevance", it->relevance, allocator); 347 | 348 | long tsec, tnsec; 349 | doc.indexed_id.get_timestamp(&tsec, &tnsec); 350 | rapidjson::Value ts(rapidjson::kObjectType); 351 | ts.AddMember("tsec", tsec, allocator); 352 | ts.AddMember("tnsec", tnsec, allocator); 353 | key.AddMember("timestamp", ts, allocator); 354 | 355 | ids.PushBack(key, allocator); 356 | } 357 | 358 | ret.AddMember("ids", ids, allocator); 359 | ret.AddMember("completed", result.completed, allocator); 360 | 361 | std::string next_id_str = result.next_document_id.to_string(); 362 | rapidjson::Value nidv(next_id_str.c_str(), next_id_str.size(), allocator); 363 | ret.AddMember("next_document_id", nidv, allocator); 364 | 365 | std::string data = ret.ToString(); 366 | 367 | thevoid::http_response reply; 368 | reply.set_code(swarm::http_response::ok); 369 | reply.headers().set_content_type("text/json; charset=utf-8"); 370 | reply.headers().set_content_length(data.size()); 371 | 372 | this->send_reply(std::move(reply), std::move(data)); 373 | } 374 | }; 375 | 376 | struct on_index : public simple_request_stream_error { 377 | greylock::error_info process_one_document(greylock::document &doc) { 378 | doc.generate_token_keys(server()->db_indexes().options()); 379 | 380 | rocksdb::WriteBatch docs_batch, indexes_batch; 381 | 382 | std::string doc_serialized = serialize(doc); 383 | rocksdb::Slice doc_value(doc_serialized); 384 | 385 | greylock::document_for_index did; 386 | did.indexed_id = doc.indexed_id; 387 | std::string sdid = serialize(did); 388 | 389 | size_t indexes = 0; 390 | for (const auto &attr: doc.idx.attributes) { 391 | for (const auto &t: attr.tokens) { 392 | indexes_batch.Merge(rocksdb::Slice(t.key), rocksdb::Slice(sdid)); 393 | 394 | greylock::disk_token dt(t.shards); 395 | std::string dts = serialize(dt); 396 | 397 | 
indexes_batch.Merge(rocksdb::Slice(t.shard_key), rocksdb::Slice(dts)); 398 | 399 | indexes++; 400 | } 401 | } 402 | 403 | // we must have a copy, since otherwise batch will cache stall pointer to rvalue 404 | std::string dkey = doc.indexed_id.to_string(); 405 | docs_batch.Put(server()->db_docs().cfhandle(greylock::options::documents_column), rocksdb::Slice(dkey), doc_value); 406 | 407 | std::string doc_indexed_id_serialized = serialize(doc.indexed_id); 408 | docs_batch.Put(server()->db_docs().cfhandle(greylock::options::document_ids_column), 409 | rocksdb::Slice(doc.id), rocksdb::Slice(doc_indexed_id_serialized)); 410 | 411 | 412 | auto err = server()->db_docs().write(&docs_batch); 413 | if (err) { 414 | return greylock::create_error(err.code(), "could not write docs batch, mbox: %s, id: %s, error: %s", 415 | doc.mbox.c_str(), doc.id.c_str(), err.message().c_str()); 416 | } 417 | 418 | err = server()->db_indexes().write(&indexes_batch); 419 | if (err) { 420 | return greylock::create_error(err.code(), "could not write indexes batch, mbox: %s, id: %s, error: %s", 421 | doc.mbox.c_str(), doc.id.c_str(), err.message().c_str()); 422 | } 423 | 424 | ILOG_INFO("index: successfully indexed document: mbox: %s, id: %s, " 425 | "indexed_id: %s, indexes: %ld, serialized_doc_size: %ld", 426 | doc.mbox.c_str(), doc.id.c_str(), 427 | doc.indexed_id.to_string().c_str(), indexes, doc_value.size()); 428 | return greylock::error_info(); 429 | } 430 | 431 | template 432 | std::vector get_numeric_vector(const rapidjson::Value &data, const char *name) { 433 | std::vector ret; 434 | const auto &arr = greylock::get_array(data, name); 435 | if (!arr.IsArray()) 436 | return ret; 437 | 438 | for (auto it = arr.Begin(), end = arr.End(); it != end; it++) { 439 | if (it->IsNumber()) 440 | ret.push_back((T)it->GetDouble()); 441 | } 442 | 443 | return ret; 444 | } 445 | 446 | std::vector get_string_vector(const rapidjson::Value &ctx, const char *name) { 447 | std::vector ret; 448 | 449 | const 
auto &a = greylock::get_array(ctx, name); 450 | if (!a.IsArray()) 451 | return ret; 452 | 453 | for (auto it = a.Begin(), end = a.End(); it != end; ++it) { 454 | if (it->IsString()) 455 | ret.push_back(std::string(it->GetString(), it->GetStringLength())); 456 | } 457 | 458 | return ret; 459 | } 460 | greylock::error_info parse_content(const rapidjson::Value &ctx, greylock::document &doc) { 461 | doc.ctx.content = greylock::get_string(ctx, "content", ""); 462 | doc.ctx.title = greylock::get_string(ctx, "title", ""); 463 | doc.ctx.links = get_string_vector(ctx, "links"); 464 | doc.ctx.images = get_string_vector(ctx, "images"); 465 | 466 | return greylock::error_info(); 467 | } 468 | 469 | greylock::error_info parse_docs(const std::string &mbox, const rapidjson::Value &docs) { 470 | greylock::error_info err = greylock::create_error(-ENOENT, 471 | "parse_docs: mbox: %s: could not parse document, there are no valid index entries", mbox.c_str()); 472 | 473 | for (auto it = docs.Begin(), id_end = docs.End(); it != id_end; ++it) { 474 | if (!it->IsObject()) { 475 | return greylock::create_error(-EINVAL, "docs entries must be objects"); 476 | } 477 | 478 | const char *id = greylock::get_string(*it, "id"); 479 | const char *author = greylock::get_string(*it, "author"); 480 | if (!id) { 481 | return greylock::create_error(-EINVAL, "id must be string"); 482 | } 483 | 484 | struct timespec ts; 485 | clock_gettime(CLOCK_REALTIME, &ts); 486 | 487 | long tsec, tnsec; 488 | const rapidjson::Value ×tamp = greylock::get_object(*it, "timestamp"); 489 | if (timestamp.IsObject()) { 490 | tsec = greylock::get_int64(timestamp, "tsec", ts.tv_sec); 491 | tnsec = greylock::get_int64(timestamp, "tnsec", ts.tv_nsec); 492 | } else { 493 | tsec = ts.tv_sec; 494 | tnsec = ts.tv_nsec; 495 | } 496 | 497 | 498 | greylock::document doc; 499 | doc.mbox = mbox; 500 | doc.assign_id(id, std::hash{}(id), tsec, tnsec); 501 | 502 | if (author) { 503 | doc.author.assign(author); 504 | } 505 | 506 | const 
rapidjson::Value &ctx = greylock::get_object(*it, "content"); 507 | if (ctx.IsObject()) { 508 | err = parse_content(ctx, doc); 509 | if (err) 510 | return err; 511 | } 512 | 513 | const rapidjson::Value &idxs = greylock::get_object(*it, "index"); 514 | if (!idxs.IsObject()) { 515 | return greylock::create_error(-EINVAL, "docs/index must be array"); 516 | } 517 | 518 | doc.idx = greylock::indexes::get_indexes(server()->db_indexes().options(), idxs); 519 | 520 | err = process_one_document(doc); 521 | if (err) 522 | return err; 523 | } 524 | 525 | return err; 526 | } 527 | 528 | virtual void on_request(const thevoid::http_request &req, const boost::asio::const_buffer &buffer) { 529 | (void) req; 530 | ribosome::timer index_tm; 531 | 532 | // this is needed to put ending zero-byte, otherwise rapidjson parser will explode 533 | std::string data(const_cast(boost::asio::buffer_cast(buffer)), 534 | boost::asio::buffer_size(buffer)); 535 | 536 | rapidjson::Document doc; 537 | doc.Parse<0>(data.c_str()); 538 | 539 | if (doc.HasParseError()) { 540 | send_error(swarm::http_response::bad_request, -EINVAL, 541 | "index: could not parse document: %s, error offset: %d", 542 | doc.GetParseError(), doc.GetErrorOffset()); 543 | return; 544 | } 545 | 546 | if (!doc.IsObject()) { 547 | send_error(swarm::http_response::bad_request, -EINVAL, "index: document must be object, its type: %d", 548 | doc.GetType()); 549 | return; 550 | } 551 | 552 | const char *mbox = greylock::get_string(doc, "mailbox"); 553 | if (!mbox) { 554 | send_error(swarm::http_response::bad_request, -ENOENT, "index: 'mailbox' must be a string"); 555 | this->send_reply(swarm::http_response::bad_request); 556 | return; 557 | } 558 | 559 | const rapidjson::Value &docs = greylock::get_array(doc, "docs"); 560 | if (!docs.IsArray()) { 561 | send_error(swarm::http_response::bad_request, -ENOENT, "index: mailbox: %s, 'docs' must be array", mbox); 562 | return; 563 | } 564 | 565 | greylock::error_info err = parse_docs(mbox, 
docs); 566 | if (err) { 567 | send_error(swarm::http_response::bad_request, err.code(), 568 | "index: mailbox: %s, keys: %d: insertion error: %s", 569 | mbox, docs.Size(), err.message()); 570 | return; 571 | } 572 | 573 | ILOG_INFO("index: mailbox: %s, keys: %d: insertion completed, index duration: %d ms", 574 | mbox, docs.Size(), index_tm.elapsed()); 575 | this->send_reply(thevoid::http_response::ok); 576 | } 577 | }; 578 | 579 | greylock::database &db_docs() { 580 | return m_db_docs; 581 | } 582 | greylock::database &db_indexes() { 583 | return m_db_indexes; 584 | } 585 | 586 | private: 587 | greylock::database m_db_docs, m_db_indexes; 588 | 589 | bool rocksdb_init(const rapidjson::Value &config) { 590 | const auto &rdbconf = greylock::get_object(config, "rocksdb.docs"); 591 | if (!rdbconf.IsObject()) { 592 | ILOG_ERROR("there is no 'rocksdb.docs' object in config"); 593 | return false; 594 | } 595 | 596 | const auto &riconf = greylock::get_object(config, "rocksdb.indexes"); 597 | if (!riconf.IsObject()) { 598 | ILOG_ERROR("there is no 'rocksdb.indexes' object in config"); 599 | return false; 600 | } 601 | 602 | if (!rocksdb_config_parse(rdbconf, &m_db_docs)) 603 | return false; 604 | 605 | if (!rocksdb_config_parse(riconf, &m_db_indexes)) 606 | return false; 607 | 608 | return true; 609 | } 610 | 611 | bool rocksdb_config_parse(const rapidjson::Value &config, greylock::database *db) { 612 | const char *path = greylock::get_string(config, "path"); 613 | if (!path) { 614 | ILOG_ERROR("there is no 'path' string in rocksdb config"); 615 | return false; 616 | } 617 | bool ro = greylock::get_bool(config, "read_only", false); 618 | bool bulk = greylock::get_bool(config, "bulk_upload", false); 619 | 620 | auto err = db->open(path, ro, bulk); 621 | if (err) { 622 | ILOG_ERROR("could not open database: %s [%d]", err.message().c_str(), err.code()); 623 | return false; 624 | } 625 | 626 | return true; 627 | } 628 | }; 629 | 630 | int main(int argc, char **argv) 631 | { 632 
	ioremap::ribosome::set_locale("en_US.UTF8");

	// stop on INT/TERM, reload on HUP, explicitly ignore USR1/USR2
	ioremap::thevoid::register_signal_handler(SIGINT, ioremap::thevoid::handle_stop_signal);
	ioremap::thevoid::register_signal_handler(SIGTERM, ioremap::thevoid::handle_stop_signal);
	ioremap::thevoid::register_signal_handler(SIGHUP, ioremap::thevoid::handle_reload_signal);
	ioremap::thevoid::register_signal_handler(SIGUSR1, ioremap::thevoid::handle_ignore_signal);
	ioremap::thevoid::register_signal_handler(SIGUSR2, ioremap::thevoid::handle_ignore_signal);

	ioremap::thevoid::run_signal_thread();

	// NOTE(review): create_server's template argument (the http server class
	// defined above, outside this chunk) appears to have been stripped by
	// extraction — confirm against the original source before building
	auto server = ioremap::thevoid::create_server();
	// run() blocks until the server is stopped; its return value is the exit code
	int err = server->run(argc, argv);

	ioremap::thevoid::stop_signal_thread();

	return err;
}