├── LICENSE
├── Makefile
├── README.markdown
├── add
├── db.ini
├── del
├── init
├── query.sh
├── run
└── update.pid
├── scripts
├── db.py
├── monitor.py
├── preprocessor.py
├── replay.py
├── search.py
├── test.py
├── times.py
└── util.py
├── src
├── _tags
├── dynArray.ml
├── dynArray.mli
├── hashset.ml
├── hashset.mli
├── index.ml
├── latex.ml
├── latex.mli
├── myMap.ml
├── myMap.mli
├── pid.ml
├── pid.mli
├── query.ml
├── query.mli
├── suffix.ml
├── suffix.mli
├── suffix_array.ml
├── suffix_array.mli
├── suffix_array_test.ml
├── suffix_test.ml
├── test.mltop
├── util.ml
└── util.mli
├── start
└── stop
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 | Preamble
9 |
10 | The GNU General Public License is a free, copyleft license for
11 | software and other kinds of works.
12 |
13 | The licenses for most software and other practical works are designed
14 | to take away your freedom to share and change the works. By contrast,
15 | the GNU General Public License is intended to guarantee your freedom to
16 | share and change all versions of a program--to make sure it remains free
17 | software for all its users. We, the Free Software Foundation, use the
18 | GNU General Public License for most of our software; it applies also to
19 | any other work released this way by its authors. You can apply it to
20 | your programs, too.
21 |
22 | When we speak of free software, we are referring to freedom, not
23 | price. Our General Public Licenses are designed to make sure that you
24 | have the freedom to distribute copies of free software (and charge for
25 | them if you wish), that you receive source code or can get it if you
26 | want it, that you can change the software or use pieces of it in new
27 | free programs, and that you know you can do these things.
28 |
29 | To protect your rights, we need to prevent others from denying you
30 | these rights or asking you to surrender the rights. Therefore, you have
31 | certain responsibilities if you distribute copies of the software, or if
32 | you modify it: responsibilities to respect the freedom of others.
33 |
34 | For example, if you distribute copies of such a program, whether
35 | gratis or for a fee, you must pass on to the recipients the same
36 | freedoms that you received. You must make sure that they, too, receive
37 | or can get the source code. And you must show them these terms so they
38 | know their rights.
39 |
40 | Developers that use the GNU GPL protect your rights with two steps:
41 | (1) assert copyright on the software, and (2) offer you this License
42 | giving you legal permission to copy, distribute and/or modify it.
43 |
44 | For the developers' and authors' protection, the GPL clearly explains
45 | that there is no warranty for this free software. For both users' and
46 | authors' sake, the GPL requires that modified versions be marked as
47 | changed, so that their problems will not be attributed erroneously to
48 | authors of previous versions.
49 |
50 | Some devices are designed to deny users access to install or run
51 | modified versions of the software inside them, although the manufacturer
52 | can do so. This is fundamentally incompatible with the aim of
53 | protecting users' freedom to change the software. The systematic
54 | pattern of such abuse occurs in the area of products for individuals to
55 | use, which is precisely where it is most unacceptable. Therefore, we
56 | have designed this version of the GPL to prohibit the practice for those
57 | products. If such problems arise substantially in other domains, we
58 | stand ready to extend this provision to those domains in future versions
59 | of the GPL, as needed to protect the freedom of users.
60 |
61 | Finally, every program is threatened constantly by software patents.
62 | States should not allow patents to restrict development and use of
63 | software on general-purpose computers, but in those that do, we wish to
64 | avoid the special danger that patents applied to a free program could
65 | make it effectively proprietary. To prevent this, the GPL assures that
66 | patents cannot be used to render the program non-free.
67 |
68 | The precise terms and conditions for copying, distribution and
69 | modification follow.
70 |
71 | TERMS AND CONDITIONS
72 |
73 | 0. Definitions.
74 |
75 | "This License" refers to version 3 of the GNU General Public License.
76 |
77 | "Copyright" also means copyright-like laws that apply to other kinds of
78 | works, such as semiconductor masks.
79 |
80 | "The Program" refers to any copyrightable work licensed under this
81 | License. Each licensee is addressed as "you". "Licensees" and
82 | "recipients" may be individuals or organizations.
83 |
84 | To "modify" a work means to copy from or adapt all or part of the work
85 | in a fashion requiring copyright permission, other than the making of an
86 | exact copy. The resulting work is called a "modified version" of the
87 | earlier work or a work "based on" the earlier work.
88 |
89 | A "covered work" means either the unmodified Program or a work based
90 | on the Program.
91 |
92 | To "propagate" a work means to do anything with it that, without
93 | permission, would make you directly or secondarily liable for
94 | infringement under applicable copyright law, except executing it on a
95 | computer or modifying a private copy. Propagation includes copying,
96 | distribution (with or without modification), making available to the
97 | public, and in some countries other activities as well.
98 |
99 | To "convey" a work means any kind of propagation that enables other
100 | parties to make or receive copies. Mere interaction with a user through
101 | a computer network, with no transfer of a copy, is not conveying.
102 |
103 | An interactive user interface displays "Appropriate Legal Notices"
104 | to the extent that it includes a convenient and prominently visible
105 | feature that (1) displays an appropriate copyright notice, and (2)
106 | tells the user that there is no warranty for the work (except to the
107 | extent that warranties are provided), that licensees may convey the
108 | work under this License, and how to view a copy of this License. If
109 | the interface presents a list of user commands or options, such as a
110 | menu, a prominent item in the list meets this criterion.
111 |
112 | 1. Source Code.
113 |
114 | The "source code" for a work means the preferred form of the work
115 | for making modifications to it. "Object code" means any non-source
116 | form of a work.
117 |
118 | A "Standard Interface" means an interface that either is an official
119 | standard defined by a recognized standards body, or, in the case of
120 | interfaces specified for a particular programming language, one that
121 | is widely used among developers working in that language.
122 |
123 | The "System Libraries" of an executable work include anything, other
124 | than the work as a whole, that (a) is included in the normal form of
125 | packaging a Major Component, but which is not part of that Major
126 | Component, and (b) serves only to enable use of the work with that
127 | Major Component, or to implement a Standard Interface for which an
128 | implementation is available to the public in source code form. A
129 | "Major Component", in this context, means a major essential component
130 | (kernel, window system, and so on) of the specific operating system
131 | (if any) on which the executable work runs, or a compiler used to
132 | produce the work, or an object code interpreter used to run it.
133 |
134 | The "Corresponding Source" for a work in object code form means all
135 | the source code needed to generate, install, and (for an executable
136 | work) run the object code and to modify the work, including scripts to
137 | control those activities. However, it does not include the work's
138 | System Libraries, or general-purpose tools or generally available free
139 | programs which are used unmodified in performing those activities but
140 | which are not part of the work. For example, Corresponding Source
141 | includes interface definition files associated with source files for
142 | the work, and the source code for shared libraries and dynamically
143 | linked subprograms that the work is specifically designed to require,
144 | such as by intimate data communication or control flow between those
145 | subprograms and other parts of the work.
146 |
147 | The Corresponding Source need not include anything that users
148 | can regenerate automatically from other parts of the Corresponding
149 | Source.
150 |
151 | The Corresponding Source for a work in source code form is that
152 | same work.
153 |
154 | 2. Basic Permissions.
155 |
156 | All rights granted under this License are granted for the term of
157 | copyright on the Program, and are irrevocable provided the stated
158 | conditions are met. This License explicitly affirms your unlimited
159 | permission to run the unmodified Program. The output from running a
160 | covered work is covered by this License only if the output, given its
161 | content, constitutes a covered work. This License acknowledges your
162 | rights of fair use or other equivalent, as provided by copyright law.
163 |
164 | You may make, run and propagate covered works that you do not
165 | convey, without conditions so long as your license otherwise remains
166 | in force. You may convey covered works to others for the sole purpose
167 | of having them make modifications exclusively for you, or provide you
168 | with facilities for running those works, provided that you comply with
169 | the terms of this License in conveying all material for which you do
170 | not control copyright. Those thus making or running the covered works
171 | for you must do so exclusively on your behalf, under your direction
172 | and control, on terms that prohibit them from making any copies of
173 | your copyrighted material outside their relationship with you.
174 |
175 | Conveying under any other circumstances is permitted solely under
176 | the conditions stated below. Sublicensing is not allowed; section 10
177 | makes it unnecessary.
178 |
179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
180 |
181 | No covered work shall be deemed part of an effective technological
182 | measure under any applicable law fulfilling obligations under article
183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
184 | similar laws prohibiting or restricting circumvention of such
185 | measures.
186 |
187 | When you convey a covered work, you waive any legal power to forbid
188 | circumvention of technological measures to the extent such circumvention
189 | is effected by exercising rights under this License with respect to
190 | the covered work, and you disclaim any intention to limit operation or
191 | modification of the work as a means of enforcing, against the work's
192 | users, your or third parties' legal rights to forbid circumvention of
193 | technological measures.
194 |
195 | 4. Conveying Verbatim Copies.
196 |
197 | You may convey verbatim copies of the Program's source code as you
198 | receive it, in any medium, provided that you conspicuously and
199 | appropriately publish on each copy an appropriate copyright notice;
200 | keep intact all notices stating that this License and any
201 | non-permissive terms added in accord with section 7 apply to the code;
202 | keep intact all notices of the absence of any warranty; and give all
203 | recipients a copy of this License along with the Program.
204 |
205 | You may charge any price or no price for each copy that you convey,
206 | and you may offer support or warranty protection for a fee.
207 |
208 | 5. Conveying Modified Source Versions.
209 |
210 | You may convey a work based on the Program, or the modifications to
211 | produce it from the Program, in the form of source code under the
212 | terms of section 4, provided that you also meet all of these conditions:
213 |
214 | a) The work must carry prominent notices stating that you modified
215 | it, and giving a relevant date.
216 |
217 | b) The work must carry prominent notices stating that it is
218 | released under this License and any conditions added under section
219 | 7. This requirement modifies the requirement in section 4 to
220 | "keep intact all notices".
221 |
222 | c) You must license the entire work, as a whole, under this
223 | License to anyone who comes into possession of a copy. This
224 | License will therefore apply, along with any applicable section 7
225 | additional terms, to the whole of the work, and all its parts,
226 | regardless of how they are packaged. This License gives no
227 | permission to license the work in any other way, but it does not
228 | invalidate such permission if you have separately received it.
229 |
230 | d) If the work has interactive user interfaces, each must display
231 | Appropriate Legal Notices; however, if the Program has interactive
232 | interfaces that do not display Appropriate Legal Notices, your
233 | work need not make them do so.
234 |
235 | A compilation of a covered work with other separate and independent
236 | works, which are not by their nature extensions of the covered work,
237 | and which are not combined with it such as to form a larger program,
238 | in or on a volume of a storage or distribution medium, is called an
239 | "aggregate" if the compilation and its resulting copyright are not
240 | used to limit the access or legal rights of the compilation's users
241 | beyond what the individual works permit. Inclusion of a covered work
242 | in an aggregate does not cause this License to apply to the other
243 | parts of the aggregate.
244 |
245 | 6. Conveying Non-Source Forms.
246 |
247 | You may convey a covered work in object code form under the terms
248 | of sections 4 and 5, provided that you also convey the
249 | machine-readable Corresponding Source under the terms of this License,
250 | in one of these ways:
251 |
252 | a) Convey the object code in, or embodied in, a physical product
253 | (including a physical distribution medium), accompanied by the
254 | Corresponding Source fixed on a durable physical medium
255 | customarily used for software interchange.
256 |
257 | b) Convey the object code in, or embodied in, a physical product
258 | (including a physical distribution medium), accompanied by a
259 | written offer, valid for at least three years and valid for as
260 | long as you offer spare parts or customer support for that product
261 | model, to give anyone who possesses the object code either (1) a
262 | copy of the Corresponding Source for all the software in the
263 | product that is covered by this License, on a durable physical
264 | medium customarily used for software interchange, for a price no
265 | more than your reasonable cost of physically performing this
266 | conveying of source, or (2) access to copy the
267 | Corresponding Source from a network server at no charge.
268 |
269 | c) Convey individual copies of the object code with a copy of the
270 | written offer to provide the Corresponding Source. This
271 | alternative is allowed only occasionally and noncommercially, and
272 | only if you received the object code with such an offer, in accord
273 | with subsection 6b.
274 |
275 | d) Convey the object code by offering access from a designated
276 | place (gratis or for a charge), and offer equivalent access to the
277 | Corresponding Source in the same way through the same place at no
278 | further charge. You need not require recipients to copy the
279 | Corresponding Source along with the object code. If the place to
280 | copy the object code is a network server, the Corresponding Source
281 | may be on a different server (operated by you or a third party)
282 | that supports equivalent copying facilities, provided you maintain
283 | clear directions next to the object code saying where to find the
284 | Corresponding Source. Regardless of what server hosts the
285 | Corresponding Source, you remain obligated to ensure that it is
286 | available for as long as needed to satisfy these requirements.
287 |
288 | e) Convey the object code using peer-to-peer transmission, provided
289 | you inform other peers where the object code and Corresponding
290 | Source of the work are being offered to the general public at no
291 | charge under subsection 6d.
292 |
293 | A separable portion of the object code, whose source code is excluded
294 | from the Corresponding Source as a System Library, need not be
295 | included in conveying the object code work.
296 |
297 | A "User Product" is either (1) a "consumer product", which means any
298 | tangible personal property which is normally used for personal, family,
299 | or household purposes, or (2) anything designed or sold for incorporation
300 | into a dwelling. In determining whether a product is a consumer product,
301 | doubtful cases shall be resolved in favor of coverage. For a particular
302 | product received by a particular user, "normally used" refers to a
303 | typical or common use of that class of product, regardless of the status
304 | of the particular user or of the way in which the particular user
305 | actually uses, or expects or is expected to use, the product. A product
306 | is a consumer product regardless of whether the product has substantial
307 | commercial, industrial or non-consumer uses, unless such uses represent
308 | the only significant mode of use of the product.
309 |
310 | "Installation Information" for a User Product means any methods,
311 | procedures, authorization keys, or other information required to install
312 | and execute modified versions of a covered work in that User Product from
313 | a modified version of its Corresponding Source. The information must
314 | suffice to ensure that the continued functioning of the modified object
315 | code is in no case prevented or interfered with solely because
316 | modification has been made.
317 |
318 | If you convey an object code work under this section in, or with, or
319 | specifically for use in, a User Product, and the conveying occurs as
320 | part of a transaction in which the right of possession and use of the
321 | User Product is transferred to the recipient in perpetuity or for a
322 | fixed term (regardless of how the transaction is characterized), the
323 | Corresponding Source conveyed under this section must be accompanied
324 | by the Installation Information. But this requirement does not apply
325 | if neither you nor any third party retains the ability to install
326 | modified object code on the User Product (for example, the work has
327 | been installed in ROM).
328 |
329 | The requirement to provide Installation Information does not include a
330 | requirement to continue to provide support service, warranty, or updates
331 | for a work that has been modified or installed by the recipient, or for
332 | the User Product in which it has been modified or installed. Access to a
333 | network may be denied when the modification itself materially and
334 | adversely affects the operation of the network or violates the rules and
335 | protocols for communication across the network.
336 |
337 | Corresponding Source conveyed, and Installation Information provided,
338 | in accord with this section must be in a format that is publicly
339 | documented (and with an implementation available to the public in
340 | source code form), and must require no special password or key for
341 | unpacking, reading or copying.
342 |
343 | 7. Additional Terms.
344 |
345 | "Additional permissions" are terms that supplement the terms of this
346 | License by making exceptions from one or more of its conditions.
347 | Additional permissions that are applicable to the entire Program shall
348 | be treated as though they were included in this License, to the extent
349 | that they are valid under applicable law. If additional permissions
350 | apply only to part of the Program, that part may be used separately
351 | under those permissions, but the entire Program remains governed by
352 | this License without regard to the additional permissions.
353 |
354 | When you convey a copy of a covered work, you may at your option
355 | remove any additional permissions from that copy, or from any part of
356 | it. (Additional permissions may be written to require their own
357 | removal in certain cases when you modify the work.) You may place
358 | additional permissions on material, added by you to a covered work,
359 | for which you have or can give appropriate copyright permission.
360 |
361 | Notwithstanding any other provision of this License, for material you
362 | add to a covered work, you may (if authorized by the copyright holders of
363 | that material) supplement the terms of this License with terms:
364 |
365 | a) Disclaiming warranty or limiting liability differently from the
366 | terms of sections 15 and 16 of this License; or
367 |
368 | b) Requiring preservation of specified reasonable legal notices or
369 | author attributions in that material or in the Appropriate Legal
370 | Notices displayed by works containing it; or
371 |
372 | c) Prohibiting misrepresentation of the origin of that material, or
373 | requiring that modified versions of such material be marked in
374 | reasonable ways as different from the original version; or
375 |
376 | d) Limiting the use for publicity purposes of names of licensors or
377 | authors of the material; or
378 |
379 | e) Declining to grant rights under trademark law for use of some
380 | trade names, trademarks, or service marks; or
381 |
382 | f) Requiring indemnification of licensors and authors of that
383 | material by anyone who conveys the material (or modified versions of
384 | it) with contractual assumptions of liability to the recipient, for
385 | any liability that these contractual assumptions directly impose on
386 | those licensors and authors.
387 |
388 | All other non-permissive additional terms are considered "further
389 | restrictions" within the meaning of section 10. If the Program as you
390 | received it, or any part of it, contains a notice stating that it is
391 | governed by this License along with a term that is a further
392 | restriction, you may remove that term. If a license document contains
393 | a further restriction but permits relicensing or conveying under this
394 | License, you may add to a covered work material governed by the terms
395 | of that license document, provided that the further restriction does
396 | not survive such relicensing or conveying.
397 |
398 | If you add terms to a covered work in accord with this section, you
399 | must place, in the relevant source files, a statement of the
400 | additional terms that apply to those files, or a notice indicating
401 | where to find the applicable terms.
402 |
403 | Additional terms, permissive or non-permissive, may be stated in the
404 | form of a separately written license, or stated as exceptions;
405 | the above requirements apply either way.
406 |
407 | 8. Termination.
408 |
409 | You may not propagate or modify a covered work except as expressly
410 | provided under this License. Any attempt otherwise to propagate or
411 | modify it is void, and will automatically terminate your rights under
412 | this License (including any patent licenses granted under the third
413 | paragraph of section 11).
414 |
415 | However, if you cease all violation of this License, then your
416 | license from a particular copyright holder is reinstated (a)
417 | provisionally, unless and until the copyright holder explicitly and
418 | finally terminates your license, and (b) permanently, if the copyright
419 | holder fails to notify you of the violation by some reasonable means
420 | prior to 60 days after the cessation.
421 |
422 | Moreover, your license from a particular copyright holder is
423 | reinstated permanently if the copyright holder notifies you of the
424 | violation by some reasonable means, this is the first time you have
425 | received notice of violation of this License (for any work) from that
426 | copyright holder, and you cure the violation prior to 30 days after
427 | your receipt of the notice.
428 |
429 | Termination of your rights under this section does not terminate the
430 | licenses of parties who have received copies or rights from you under
431 | this License. If your rights have been terminated and not permanently
432 | reinstated, you do not qualify to receive new licenses for the same
433 | material under section 10.
434 |
435 | 9. Acceptance Not Required for Having Copies.
436 |
437 | You are not required to accept this License in order to receive or
438 | run a copy of the Program. Ancillary propagation of a covered work
439 | occurring solely as a consequence of using peer-to-peer transmission
440 | to receive a copy likewise does not require acceptance. However,
441 | nothing other than this License grants you permission to propagate or
442 | modify any covered work. These actions infringe copyright if you do
443 | not accept this License. Therefore, by modifying or propagating a
444 | covered work, you indicate your acceptance of this License to do so.
445 |
446 | 10. Automatic Licensing of Downstream Recipients.
447 |
448 | Each time you convey a covered work, the recipient automatically
449 | receives a license from the original licensors, to run, modify and
450 | propagate that work, subject to this License. You are not responsible
451 | for enforcing compliance by third parties with this License.
452 |
453 | An "entity transaction" is a transaction transferring control of an
454 | organization, or substantially all assets of one, or subdividing an
455 | organization, or merging organizations. If propagation of a covered
456 | work results from an entity transaction, each party to that
457 | transaction who receives a copy of the work also receives whatever
458 | licenses to the work the party's predecessor in interest had or could
459 | give under the previous paragraph, plus a right to possession of the
460 | Corresponding Source of the work from the predecessor in interest, if
461 | the predecessor has it or can get it with reasonable efforts.
462 |
463 | You may not impose any further restrictions on the exercise of the
464 | rights granted or affirmed under this License. For example, you may
465 | not impose a license fee, royalty, or other charge for exercise of
466 | rights granted under this License, and you may not initiate litigation
467 | (including a cross-claim or counterclaim in a lawsuit) alleging that
468 | any patent claim is infringed by making, using, selling, offering for
469 | sale, or importing the Program or any portion of it.
470 |
471 | 11. Patents.
472 |
473 | A "contributor" is a copyright holder who authorizes use under this
474 | License of the Program or a work on which the Program is based. The
475 | work thus licensed is called the contributor's "contributor version".
476 |
477 | A contributor's "essential patent claims" are all patent claims
478 | owned or controlled by the contributor, whether already acquired or
479 | hereafter acquired, that would be infringed by some manner, permitted
480 | by this License, of making, using, or selling its contributor version,
481 | but do not include claims that would be infringed only as a
482 | consequence of further modification of the contributor version. For
483 | purposes of this definition, "control" includes the right to grant
484 | patent sublicenses in a manner consistent with the requirements of
485 | this License.
486 |
487 | Each contributor grants you a non-exclusive, worldwide, royalty-free
488 | patent license under the contributor's essential patent claims, to
489 | make, use, sell, offer for sale, import and otherwise run, modify and
490 | propagate the contents of its contributor version.
491 |
492 | In the following three paragraphs, a "patent license" is any express
493 | agreement or commitment, however denominated, not to enforce a patent
494 | (such as an express permission to practice a patent or covenant not to
495 | sue for patent infringement). To "grant" such a patent license to a
496 | party means to make such an agreement or commitment not to enforce a
497 | patent against the party.
498 |
499 | If you convey a covered work, knowingly relying on a patent license,
500 | and the Corresponding Source of the work is not available for anyone
501 | to copy, free of charge and under the terms of this License, through a
502 | publicly available network server or other readily accessible means,
503 | then you must either (1) cause the Corresponding Source to be so
504 | available, or (2) arrange to deprive yourself of the benefit of the
505 | patent license for this particular work, or (3) arrange, in a manner
506 | consistent with the requirements of this License, to extend the patent
507 | license to downstream recipients. "Knowingly relying" means you have
508 | actual knowledge that, but for the patent license, your conveying the
509 | covered work in a country, or your recipient's use of the covered work
510 | in a country, would infringe one or more identifiable patents in that
511 | country that you have reason to believe are valid.
512 |
513 | If, pursuant to or in connection with a single transaction or
514 | arrangement, you convey, or propagate by procuring conveyance of, a
515 | covered work, and grant a patent license to some of the parties
516 | receiving the covered work authorizing them to use, propagate, modify
517 | or convey a specific copy of the covered work, then the patent license
518 | you grant is automatically extended to all recipients of the covered
519 | work and works based on it.
520 |
521 | A patent license is "discriminatory" if it does not include within
522 | the scope of its coverage, prohibits the exercise of, or is
523 | conditioned on the non-exercise of one or more of the rights that are
524 | specifically granted under this License. You may not convey a covered
525 | work if you are a party to an arrangement with a third party that is
526 | in the business of distributing software, under which you make payment
527 | to the third party based on the extent of your activity of conveying
528 | the work, and under which the third party grants, to any of the
529 | parties who would receive the covered work from you, a discriminatory
530 | patent license (a) in connection with copies of the covered work
531 | conveyed by you (or copies made from those copies), or (b) primarily
532 | for and in connection with specific products or compilations that
533 | contain the covered work, unless you entered into that arrangement,
534 | or that patent license was granted, prior to 28 March 2007.
535 |
536 | Nothing in this License shall be construed as excluding or limiting
537 | any implied license or other defenses to infringement that may
538 | otherwise be available to you under applicable patent law.
539 |
540 | 12. No Surrender of Others' Freedom.
541 |
542 | If conditions are imposed on you (whether by court order, agreement or
543 | otherwise) that contradict the conditions of this License, they do not
544 | excuse you from the conditions of this License. If you cannot convey a
545 | covered work so as to satisfy simultaneously your obligations under this
546 | License and any other pertinent obligations, then as a consequence you may
547 | not convey it at all. For example, if you agree to terms that obligate you
548 | to collect a royalty for further conveying from those to whom you convey
549 | the Program, the only way you could satisfy both those terms and this
550 | License would be to refrain entirely from conveying the Program.
551 |
552 | 13. Use with the GNU Affero General Public License.
553 |
554 | Notwithstanding any other provision of this License, you have
555 | permission to link or combine any covered work with a work licensed
556 | under version 3 of the GNU Affero General Public License into a single
557 | combined work, and to convey the resulting work. The terms of this
558 | License will continue to apply to the part which is the covered work,
559 | but the special requirements of the GNU Affero General Public License,
560 | section 13, concerning interaction through a network will apply to the
561 | combination as such.
562 |
563 | 14. Revised Versions of this License.
564 |
565 | The Free Software Foundation may publish revised and/or new versions of
566 | the GNU General Public License from time to time. Such new versions will
567 | be similar in spirit to the present version, but may differ in detail to
568 | address new problems or concerns.
569 |
570 | Each version is given a distinguishing version number. If the
571 | Program specifies that a certain numbered version of the GNU General
572 | Public License "or any later version" applies to it, you have the
573 | option of following the terms and conditions either of that numbered
574 | version or of any later version published by the Free Software
575 | Foundation. If the Program does not specify a version number of the
576 | GNU General Public License, you may choose any version ever published
577 | by the Free Software Foundation.
578 |
579 | If the Program specifies that a proxy can decide which future
580 | versions of the GNU General Public License can be used, that proxy's
581 | public statement of acceptance of a version permanently authorizes you
582 | to choose that version for the Program.
583 |
584 | Later license versions may give you additional or different
585 | permissions. However, no additional obligations are imposed on any
586 | author or copyright holder as a result of your choosing to follow a
587 | later version.
588 |
589 | 15. Disclaimer of Warranty.
590 |
591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
599 |
600 | 16. Limitation of Liability.
601 |
602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
610 | SUCH DAMAGES.
611 |
612 | 17. Interpretation of Sections 15 and 16.
613 |
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 |
621 | END OF TERMS AND CONDITIONS
622 |
623 | How to Apply These Terms to Your New Programs
624 |
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 |
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 |
634 | <one line to give the program's name and a brief idea of what it does.>
635 | Copyright (C) <year> <name of author>
636 |
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 |
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 |
647 | You should have received a copy of the GNU General Public License
648 | along with this program. If not, see <http://www.gnu.org/licenses/>.
649 |
650 | Also add information on how to contact you by electronic and paper mail.
651 |
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 |
655 | <program> Copyright (C) <year> <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 |
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 |
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <http://www.gnu.org/licenses/>.
668 |
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | <http://www.gnu.org/licenses/why-not-lgpl.html>.
675 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# None of these targets produce a file named after the target itself
# (the binary is copied to ./index), so declare them phony to keep make
# from being confused by stray files named `all`, `opt`, `test` or `clean`.
.PHONY: all opt test clean

all: opt

opt:
	ocamlbuild -use-ocamlfind -I src src/index.native
	cp index.native index

test:
	ocamlbuild -use-ocamlfind -I src src/test.top

clean:
	ocamlbuild -clean
--------------------------------------------------------------------------------
/README.markdown:
--------------------------------------------------------------------------------
1 | Texsearch is a search index specialised for LaTeX equations, forming part of the backend for Springer's [latexsearch.com](http://latexsearch.com). Latexsearch currently indexes more than 2 million documents drawn from Springer journals and books.
2 |
3 | Every LaTeX equation in the corpus is parsed and evaluated on entry to produce an AST. The similarity between a pair of equations is calculated as the Levenshtein distance between their respective ASTs as a fraction of the total size of the ASTs. Given a LaTeX equation as a search term, texsearch will retrieve all equations in the corpus whose similarity to the search term falls under a specified margin.
4 |
5 | The index uses a suffix array to quickly calculate a superset of the search results by finding exact matches of fragments of the search term.
6 |
7 | Previous versions used a modified bk-tree which is capable of performing vicinity searches over any quasi-metric space using any query function satisfying:
8 |
9 | For all a. query a >= 0
10 | For all a, b. query b - query a <= dist a b
11 |
12 | This index is stored in-memory and is relatively compact - the index for latexsearch.com is under 800MB.
13 |
14 | # Architecture
15 |
16 | Couchdb is the root process. The preprocessor and index are run as _external services on couchdb. Raw data is stored in the 'documents' db on couchdb. The search index is stored in the file 'data/index'.
17 |
18 | Springer documents are uploaded to the server as xml files. The command 'db.py --add some_doc.xml' extracts latex formulae and metadata from some_doc.xml, runs the latex through the preprocessor and stores the results in couchdb. The command 'index -update' uses the couchdb change log to locate new or modified documents and update the index file. Restarting the index external service causes it to load the new index file.
19 |
20 | # Requirements
21 |
22 | Tested with:
23 |
24 | couchdb 0.6.0
25 |
26 | ocaml 3.12.0
27 | ancient 0.9.0
28 | json-wheel 1.0.6
29 | json-static 0.9.8
30 | ocamlnet 3.2
31 | pcre-ocaml 6.2.2
32 | xml-light 2.2
33 |
34 | python 2.6.6
35 | couchdb 0.6 (python lib)
36 | httplib2 0.5.0
37 | plastex 0.9.2
38 |
39 |
--------------------------------------------------------------------------------
/add:
--------------------------------------------------------------------------------
#!/bin/bash
# Add documents from the path given in $1 to couchdb, rebuild the search
# index, then restart the query service so it picks up the new index file.
# Quoting "$0"/"$1" keeps the script working for paths containing spaces.
ulimit -s unlimited
dir=$(dirname "$0")
python "$dir/scripts/db.py" --add="$1"
"$dir/index" -update
"$dir/stop"
"$dir/start"
8 |
--------------------------------------------------------------------------------
/db.ini:
--------------------------------------------------------------------------------
1 | [couchdb]
2 | database_dir = ./db
3 | view_index_dir = ./db
4 | max_document_size = 4294967296 ; 4 GB
5 | max_attachment_chunk_size = 4294967296 ; 4GB
6 | os_process_timeout = 60000 ; 60 seconds. for view and external servers.
7 | max_dbs_open = 100
8 |
9 | [httpd]
10 | port = 5984
11 | bind_address = 127.0.0.1
12 | authentication_handler = {couch_httpd, default_authentication_handler}
13 | WWW-Authenticate = Basic realm="administrator"
14 |
15 | [log]
16 | file = ./log/couch.log
17 | level = info
18 |
19 | [query_servers]
20 | javascript = /usr/bin/couchjs /usr/share/couchdb/server/main.js
21 |
22 | [external]
23 | index = ./query.sh
24 | preprocess = python ./scripts/preprocessor.py
25 |
26 | [daemons]
27 | view_manager={couch_view, start_link, []}
28 | external_manager={couch_external_manager, start_link, []}
29 | db_update_notifier={couch_db_update_notifier_sup, start_link, []}
30 | query_servers={couch_query_servers, start_link, []}
31 | httpd={couch_httpd, start_link, []}
32 | stats_aggregator={couch_stats_aggregator, start, []}
33 | stats_collector={couch_stats_collector, start, []}
34 |
35 | [httpd_global_handlers]
36 | / = {couch_httpd_misc_handlers, handle_welcome_req, <<"Welcome">>}
37 | favicon.ico = {couch_httpd_misc_handlers, handle_favicon_req, "/usr/share/couchdb/www"}
38 |
39 | _utils = {couch_httpd_misc_handlers, handle_utils_dir_req, "/usr/share/couchdb/www"}
40 | _all_dbs = {couch_httpd_misc_handlers, handle_all_dbs_req}
41 | _active_tasks = {couch_httpd_misc_handlers, handle_task_status_req}
42 | _config = {couch_httpd_misc_handlers, handle_config_req}
43 | _replicate = {couch_httpd_misc_handlers, handle_replicate_req}
44 | _uuids = {couch_httpd_misc_handlers, handle_uuids_req}
45 | _restart = {couch_httpd_misc_handlers, handle_restart_req}
46 | _stats = {couch_httpd_stats_handlers, handle_stats_req}
47 |
48 | [httpd_db_handlers]
49 | _design = {couch_httpd_db, handle_design_req}
50 | _temp_view = {couch_httpd_view, handle_temp_view_req}
51 |
52 | ; The external module takes an optional argument allowing you to narrow it to a
53 | ; single script. Otherwise the script name is inferred from the first path section
54 | ; after _external's own path.
55 | ; _mypath = {couch_httpd_external, handle_external_req, <<"mykey">>}
56 | _external = {couch_httpd_external, handle_external_req}
57 |
58 | [httpd_design_handlers]
59 | _view = {couch_httpd_view, handle_view_req}
60 | _show = {couch_httpd_show, handle_doc_show_req}
61 | _list = {couch_httpd_show, handle_view_list_req}
62 |
--------------------------------------------------------------------------------
/del:
--------------------------------------------------------------------------------
#!/bin/bash
# Delete the documents named in the file given in $1 from couchdb, rebuild
# the search index, then restart the query service to load the new index.
# Quoting "$0"/"$1" keeps the script working for paths containing spaces.
ulimit -s unlimited
dir=$(dirname "$0")
python "$dir/scripts/db.py" --del="$1"
"$dir/index" -update
"$dir/stop"
"$dir/start"
8 |
--------------------------------------------------------------------------------
/init:
--------------------------------------------------------------------------------
#!/bin/bash
# Initialise both halves of the system: the on-disk search index (index -init)
# and the couchdb 'documents' database (db.py --init, which prompts first).
# Quoting "$0" keeps the script working when installed under a path with spaces.
dir=$(dirname "$0")
"$dir/index" -init
python "$dir/scripts/db.py" --init
5 |
--------------------------------------------------------------------------------
/query.sh:
--------------------------------------------------------------------------------
# Couchdb _external handler (see db.ini [external] index): runs the search
# index in query mode, speaking couchdb's line-based JSON protocol on
# stdin/stdout. NOTE(review): ./index is relative, so couchdb must be started
# from the repo root for this to resolve — confirm against deployment layout.
ulimit -s unlimited
./index -query
3 |
--------------------------------------------------------------------------------
/run/update.pid:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jamii/texsearch/d0d4423f093dfafadd935f785b384d2c2fb7abf9/run/update.pid
--------------------------------------------------------------------------------
/scripts/db.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """ Handles parsing Springer documents and adding/deleting document entries to/from couchdb """
4 |
5 | import re
6 | import sys, httplib, urllib
7 | from xml.dom import minidom
8 | from preprocessor import JsonProcessor, parseLaTeX
9 | from util import encodeDoi, decodeDoi
10 | import couchdb.client
11 |
12 | # Find the couchdb server
13 | conf = open("./db.ini")
14 | port = re.compile(r"port *= *(\d+)").search(conf.read()).group(1)
15 | conf.close()
16 |
17 | couchdb_server = couchdb.client.Server('http://localhost:%s/' % port)
18 |
def confirm(prompt):
    """Ask the user a y/n question and exit the process unless they answer 'y'."""
    answer = raw_input(prompt + " (y/n):")
    if answer == 'y':
        return
    print("Ok, nothing was done")
    sys.exit(0)
24 |
25 | ### Initial configuration of the database ###
26 |
def initDB():
    """Drop and recreate the 'documents' database on the local couchdb server.

    Asks for confirmation first since this destroys all indexed content.
    """
    confirm("This will erase the texsearch database. Are you sure?")

    print "Deleting existing databases"
    try:
        del couchdb_server['documents']
    except couchdb.client.ResourceNotFound:
        # db doesnt exist yet
        pass

    print "Creating new databases"
    couchdb_server.create('documents')
39 |
40 | ### Parsing and preprocessing xml articles ###
41 |
42 | # Wrap the JsonProcessor in some error handling, since plasTeX often fails in weird ways
def preprocess(eqnID, latex):
    """Run one LaTeX equation through the plasTeX-based JSON processor.

    Returns (eqnID, json_tree) on success, or None when plasTeX fails —
    plasTeX often fails in weird ways, so every non-fatal error is trapped.
    """
    try:
        result = JsonProcessor().process(parseLaTeX("\\begin{document} " + latex + " \\end{document}")).dumps()
        return (eqnID, result)
    except KeyboardInterrupt, e:
        # Never swallow a user interrupt
        raise e
    except Exception, e:
        print "Note: Preprocessor failed on equation %s : %s" % (eqnID, e)
        return None
52 |
def parseEquation(eqn):
    """Extract the TEX source of one <Equation>/<InlineEquation> element.

    Returns (latex, eqnID) for the first EquationSource with Format="TEX",
    or None when the equation has no usable TEX source.
    """
    eqnID = eqn.attributes.get('ID').value
    try:
        for eqnSource in eqn.getElementsByTagName("EquationSource"):
            if eqnSource.attributes.get('Format').value == "TEX":
                latex = eqnSource.childNodes[0].wholeText
                return (latex, eqnID)
        return None
    except IndexError:
        # EquationSource element had no child text node
        print ("Note: no equation source for eqn %s" % eqnID)
    except AttributeError:
        # EquationSource element had no Format attribute
        print ("Note: missing format attribute for eqn %s" % eqnID)
    return None
66 |
def filterNone(xs):
    """Return the elements of xs that are not None, preserving order."""
    return list(filter(lambda x: x is not None, xs))
69 |
def parseEquations(item):
    """Collect every equation below `item` in the DOM.

    Returns (source, content), both keyed by equation ID: `source` maps to the
    raw latex, `content` maps to the preprocessed JSON tree. Equations the
    preprocessor fails on are dropped from `content` but kept in `source`.
    """
    equations = filterNone([parseEquation(eqn) for eqn in item.getElementsByTagName("Equation") + item.getElementsByTagName("InlineEquation")])
    # Eliminate duplicate equations (key is latex)
    equations = dict(equations).items()

    source = dict([(eqnID, latex) for (latex, eqnID) in equations])
    content = dict(filterNone([preprocess(eqnID, latex) for (latex, eqnID) in equations]))

    return (source, content)
79 |
def parseArticle(article):
    """Parse an <Article> element into a couchdb document dict.

    The document _id is the encoded article DOI; containerID is the journal ID.
    """
    doi = article.getElementsByTagName("ArticleDOI")[0].childNodes[0].wholeText
    print ("Parsing article %s" % doi)

    # Prefer the print date, then the cover date, then the online date
    publicationDate = article.getElementsByTagName("PrintDate") or article.getElementsByTagName("CoverDate") or article.getElementsByTagName("OnlineDate")
    if publicationDate:
        publicationYear = publicationDate[0].getElementsByTagName("Year")[0].childNodes[0].wholeText
    else:
        print "Note: no publication year"
        publicationYear = None

    journalID = article.getElementsByTagName("JournalID")[0].childNodes[0].wholeText
    (source, content) = parseEquations(article)
    return {'_id': encodeDoi(doi), 'source': source, 'content': content, 'format': 'Article', 'containerID': journalID, 'publicationYear': publicationYear}
94 |
def parseChapter(chapter):
    """Parse a <Chapter> element into a couchdb document dict (_id is the encoded DOI).

    containerID/publicationYear are filled in later by parseBook.
    """
    doi = chapter.getElementsByTagName("ChapterDOI")[0].childNodes[0].wholeText
    print ("Parsing chapter %s" % doi)
    (source, content) = parseEquations(chapter)
    return {'_id': encodeDoi(doi), 'source': source, 'content': content, 'format':'Chapter'}
100 |
def parseBook(book):
    """Parse a <Book> element into a list of chapter document dicts.

    Each chapter inherits the book DOI as containerID and the book's
    copyright year as publicationYear.
    """
    bookDOI = book.getElementsByTagName("BookDOI")[0].childNodes[0].wholeText

    publicationDate = book.getElementsByTagName("BookCopyright")
    if publicationDate:
        publicationYear = publicationDate[0].getElementsByTagName("CopyrightYear")[0].childNodes[0].wholeText
    else:
        print "Note: no publication year"
        publicationYear = None

    chapters = []
    for chapter in book.getElementsByTagName("Chapter"):
        chapter = parseChapter(chapter)
        chapter['containerID'] = bookDOI
        chapter['publicationYear'] = publicationYear
        chapters.append(chapter)
    return chapters
118 |
def parseFile(fileName):
    """Parse a Springer xml file into a list of couchdb document dicts
    (top-level articles plus the chapters of any books)."""
    xml = minidom.parse(fileName)

    articles = [parseArticle(article) for article in xml.getElementsByTagName("Article")]
    chapters = []
    for book in xml.getElementsByTagName("Book"):
        chapters.extend(parseBook(book))
    docs = articles + chapters

    return docs
129 |
130 | ### Adding and deleting articles from the database ###
131 |
def addFile(fileName, type):
    """Parse fileName and upsert its documents into the 'documents' db.

    `type` is 'xml' (full document) or 'xml.meta' (metadata only); a meta
    entry never overwrites an existing full entry.
    """
    db = couchdb_server['documents']

    print "Reading file %s" % fileName
    docs = parseFile(fileName)

    for doc in docs:
        doc['type'] = type

        oldDoc = db.get(doc['_id'],None)
        if not oldDoc:
            print "Adding new entry"
            db[doc['_id']] = doc
        elif (doc['type'] == 'xml.meta') and (oldDoc['type'] == 'xml'):
            print "Full entry already exists, not overwriting with meta"
        else:
            print "Overwriting existing entry"
            # Carry the revision over so couchdb accepts the update
            doc['_rev'] = oldDoc['_rev']
            db[doc['_id']] = doc

    print
153 |
def delFile(fileName, type):
    """Delete the articles named in fileName from the 'documents' db.

    Mirrors addFile: a 'xml.meta' deletion never removes an existing full
    'xml' entry.
    """
    db = couchdb_server['documents']

    print "Reading file %s" % fileName
    xml = minidom.parse(fileName)

    for article in xml.getElementsByTagName("Article"):
        doi = encodeDoi(article.getElementsByTagName("ArticleDOI")[0].childNodes[0].wholeText)

        oldDoc = db.get(doi,None)
        if not oldDoc:
            print "No entry to delete"
        elif (type == 'xml.meta') and (oldDoc['type'] == 'xml'):
            print "Full entry exists, not deleting meta"
        else:
            print "Deleting entry"
            del db[doi]
171 |
172 | # Reprocess all latex sources in the database, handy when changing the preprocessor
def reprocess():
    """Rebuild the 'content' field of every document from its stored latex
    'source' — handy after changing the preprocessor."""
    db = couchdb_server['documents']

    print "Reprocessing latex sources"
    for doi in db:
        print "Reprocessing %s" % decodeDoi(doi)
        doc = db[doi]
        # source maps eqnID -> latex; failed equations are dropped by filterNone
        doc['content'] = dict(filterNone([(preprocess(eqnID, latex)) for (eqnID, latex) in doc['source'].items()]))
        db[doi] = doc
182 |
183 | # Rename journalID field to containerID
def convert_journalID_containerID():
    """One-off migration: rename the 'journalID' field to 'containerID' on
    every document that still has it."""
    db = couchdb_server['documents']

    print "Converting"
    for doi in db:
        print "Converting %s" % decodeDoi(doi)
        doc = db[doi]
        if 'journalID' in doc:
            doc['containerID'] = doc['journalID']
            del doc['journalID']
            db[doi] = doc
195 |
def ml_year(doi):
    """Ask the ML web service for the publication year of `doi`.

    Returns the first four characters of the 'year' attribute — callers
    treat "" as "year not defined" (see check_dates).
    """
    response = urllib.urlopen("http://latexalpha.mpstechnologies.com/year.do?doi=" + doi).read()
    xml = minidom.parseString(response)
    return xml.childNodes[0].getAttribute('year')[0:4]
200 |
201 | # Check dates against ML
def check_dates():
    """Cross-check each document's publicationYear against the ML service,
    overwriting our value whenever the service disagrees."""
    db = couchdb_server['documents']

    print "Checking dates"
    for doi in db:
        try:
            doc = db[doi]
            actual = doc['publicationYear']
            expected = ml_year(decodeDoi(doi))
            if expected != "":
                if expected != actual:
                    print ("Doi: %s Expected: %s Actual: %s" % (doi, expected, actual))
                    doc['publicationYear'] = expected
                    db[doi] = doc
                else:
                    print ("Doi: %s ok" % doi)
            elif doc.get('format', 'article').lower() == 'article':
                # The service is only expected to know about articles
                print ("ML year not defined for article: %s" % doi)
        except KeyboardInterrupt, e:
            raise e
        except Exception, e:
            # Keep going: one bad document shouldn't abort the whole sweep
            print ("Failed on doi: %s" % doi)
            print e
225 |
226 | # Repair this server by copying content from targetServer
def repair(targetServer):
    """Repair this server by copying, from targetServer, every document we hold.

    targetServer is a couchdb server URL, e.g. 'http://host:5984/'.
    Bug fix: the progress message referenced the undefined name
    `target_server` (the parameter is `targetServer`), raising NameError.
    """
    db = couchdb_server['documents']
    targetdb = couchdb.client.Server(targetServer)['documents']

    print ("Copying from %s" % targetServer)

    for doi in db:
        targetDoc = targetdb.get(doi,None)
        if targetDoc:
            db[doi] = targetDoc
237 |
238 | ### Command line interaction ###
239 |
def walk(path):
    """Yield the full path of every file below `path`, recursively.

    Bug fix: the old body iterated os.walk(arg) — relying on the
    module-level `arg` set inside the __main__ loop — instead of its own
    `path` parameter, so the function only worked when called from there.
    """
    for root, _, files in os.walk(path):
        for name in files:
            yield os.path.join(root, name)
244 |
def usage():
    """Print the command line usage summary."""
    print("Usage: --init, --reprocess, --add=/docs/addme, --del=/docs/delme")
247 |
248 | import os, os.path, getopt
249 |
if __name__ == '__main__':
    # Command line entry point: dispatch on long options; --add/--del walk a
    # directory tree and process every .xml / .xml.meta file found, collecting
    # per-file errors so one bad file doesn't abort the run.
    try:
        opts, args = getopt.getopt(sys.argv[1:], "", ["init", "add=", "del=", "convert", "reprocess", "check_dates"])
        errors = []

        for opt, arg in opts:
            if opt == "--init":
                initDB()
            elif opt == "--add":
                for file in walk(arg):
                    try:
                        if file.lower().endswith(".xml"):
                            addFile(file,"xml")
                        elif file.lower().endswith(".xml.meta"):
                            addFile(file,"xml.meta")
                    except KeyboardInterrupt, e:
                        raise e
                    except Exception, exc:
                        print exc
                        errors.append((file,exc))
            elif opt == "--del":
                for file in walk(arg):
                    try:
                        if file.lower().endswith(".xml"):
                            delFile(file,"xml")
                        elif file.lower().endswith(".xml.meta"):
                            delFile(file,"xml.meta")
                    except KeyboardInterrupt, e:
                        raise e
                    except Exception, exc:
                        print exc
                        errors.append((file,exc))
            elif opt == "--reprocess":
                reprocess()
            elif opt == "--check_dates":
                check_dates()
            elif opt == "--convert":
                convert_journalID_containerID()
        # Summarise any per-file failures collected above
        if errors:
            print "Errors occurred whilst processing the following files:"
            for (fi,exc) in errors:
                print fi
                print exc
        else:
            print "Ok"

    except getopt.GetoptError:
        usage()
        sys.exit(2)
299 |
--------------------------------------------------------------------------------
/scripts/monitor.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python
2 |
3 | """ Runs regression tests against texsearch and reports failures by email. In production is run by cron every 15 minutes """
4 |
5 | import urllib
6 | from xml.dom import minidom
7 | import smtplib
8 | from email.MIMEMultipart import MIMEMultipart
9 | from email.MIMEText import MIMEText
10 | import time
11 | import os.path as path
12 | from popen2 import popen2
13 |
14 | ports = [5985, 5984]
15 | searchTerms = open("searchTerms").read().splitlines()
16 |
17 | user = 'latex_operations@springer.com'
18 |
19 | # Reporting levels
20 | errorGroup = ['jamiiecb@googlemail.com'] # ['latex_operations@springer.com']
21 | infoGroup = ['jamiiecb@googlemail.com']
22 |
def searchURL(port,searchTerm):
    """Build the couchdb _external search URL for `searchTerm` on `port`."""
    quoted = urllib.quote(searchTerm)
    return "http://localhost:%d/documents/_external/index?searchTerm=%s&precision=0.66&limit=10000" % (port, quoted)
25 |
def countResults(resultString):
    """Count the result elements in a search response.

    Returns the number of <result>/<Chapter>/<Article> elements when
    resultString parses as a document containing a <results> element,
    otherwise None.
    """
    try:
        dom = minidom.parseString(resultString)
        if dom.getElementsByTagName("results"):
            results = dom.getElementsByTagName("result") + dom.getElementsByTagName("Chapter") + dom.getElementsByTagName("Article")
            return len(results)
    except Exception, e:
        # Fall through: treat parse failures as "not a result string"
        pass

    # Not a correct result string
    return None
37 |
def readResults(port, i):
    """Read the stored baseline result count for search term `i` on `port`.

    The baseline lives in the file "<port>/<i>". Improvement: use a `with`
    block so the handle is closed even when int()/read() raises (the old
    version leaked the handle on error).
    """
    with open(("%s/%s" % (port, i)), 'r') as f:
        return int(f.read())
44 |
def writeResults(port, i, results):
    """Persist `results` as the baseline count for search term `i` on `port`.

    Bug fix: the old version ended with `file.close` — a bare attribute
    reference without the call parentheses — so the file was never
    explicitly closed or flushed. A `with` block guarantees both.
    """
    with open(("%s/%s" % (port, i)), 'w') as f:
        f.write(str(results))
49 |
def init():
    """Record the current result count of every (port, searchTerm) pair as
    the regression baseline that test() later compares against."""
    for port in ports:
        for i in range(0,len(searchTerms)):
            url = searchURL(port, searchTerms[i])
            resultString = urllib.urlopen(url).read()
            results = countResults(resultString)
            writeResults(port, i, results)
57 |
def test():
    """Re-run every search and compare against the stored baselines.

    Returns (info, errors): `info` collects counts that grew (expected when
    content is added; the baseline is updated), `errors` collects malformed
    responses, shrunk counts and connection failures.
    """
    info = []
    errors = []

    for port in ports:
        for i in range(0,len(searchTerms)):
            url = searchURL(port, searchTerms[i])

            try:
                resultString = urllib.urlopen(url).read()
                results = countResults(resultString)
                expectedResults = readResults(port, i)
                if results == None:
                    # Didnt get a correct result string
                    errors.append(("Url: %s\n%s" % (url, resultString)))
                elif results == expectedResults:
                    # Uninteresting
                    pass
                elif results > expectedResults:
                    # No of results may increase when adding content
                    writeResults(port, i, results)
                    info.append(("Url: %s\nExpected %d results, got %d results" % (url, expectedResults, results)))
                else:
                    # No of results should never decrease
                    writeResults(port, i, results)
                    errors.append(("Url: %s\nExpected %d results, got %d results" % (url, expectedResults, results)))
            except Exception, e:
                # Most likely connection refused or http 500
                errors.append(("Url: %s\n%s" % (url, str(e))))

    return (info, errors)
89 |
def mail(to, subject, text):
    """Send a plain-text report email via the Springer smtp relay.

    The subject and body are also echoed to stdout so cron captures them.
    """
    print subject
    print text

    msg = MIMEMultipart()
    msg['From'] = user
    msg['To'] = to
    msg['Subject'] = subject
    msg.attach(MIMEText(text))

    mailServer = smtplib.SMTP('smtp.springer-sbm.com')
    mailServer.sendmail(user, to, msg.as_string())
    mailServer.close()
103 |
def top():
    """Return one snapshot of system load: the output of `top -b -n 1`.

    Improvement: uses subprocess instead of the long-deprecated popen2
    module; the old version also never reaped the child's exit status.
    communicate() reads all output and waits for the child to terminate.
    """
    import subprocess
    proc = subprocess.Popen(["top", "-b", "-n", "1"],
                            stdout=subprocess.PIPE,
                            universal_newlines=True)
    out, _ = proc.communicate()
    return out
107 |
def reportErrors(errors):
    """Mail the error-level report (plus a `top` snapshot) to everyone in errorGroup."""
    subject = ("TeXsearch error report: %s" % time.asctime())
    body = "\n\n".join(errors + [top()])
    for recipient in errorGroup:
        mail(recipient, subject, body)
113 |
def reportInfo(info):
    """Mail the info-level report (plus a `top` snapshot) to everyone in infoGroup."""
    subject = ("TeXsearch info report: %s" % time.asctime())
    body = "\n\n".join(info + [top()])
    for recipient in infoGroup:
        mail(recipient, subject, body)
119 |
120 | import sys, getopt
121 |
if __name__ == '__main__':
    # --init records the current result counts as baselines;
    # --test re-runs the searches and mails any regressions.
    opts, args = getopt.getopt(sys.argv[1:], "", ["init","test"])
    for opt, arg in opts:
        if opt == "--init":
            init()
        if opt == "--test":
            info, errors = test()
            if errors:
                reportErrors(errors)
            if info:
                reportInfo(info)
133 |
--------------------------------------------------------------------------------
/scripts/preprocessor.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python
2 |
3 | """ Parses and preprocesses LaTeX formulae using PlasTeX """
4 |
5 | import string, re
6 | from plasTeX import TeXFragment, TeXDocument
7 | import plasTeX.Context
8 | from plasTeX.DOM import Node
9 | from plasTeX.TeX import TeX
10 | from plasTeX.Base.TeX.Primitives import MathShift
11 |
12 | ### LaTeX preprocessing ###
13 |
# Ignore useless nodes: these wrap or lay out content without carrying any
# mathematical structure of their own, so Processor.process skips straight
# through them to their children (see the `node.nodeName in ignoreSet` branch).
# There are probably more nodes that could be ignored but these are the most common
ignoreSet = frozenset([
    'displaymath'
    ,'bgroup'
    ,'egroup'
    ,'math'
    ,'text'
    ,'nulldelimiterspace'
    ,'kern'
    ,'vphantom'
    ,'hphantom'
    ,'hfill'
    ,'vfill'
    ,'hbox'
    ,'align'
    ,'aligned'
    ,'gathered'
    ,'active::&'
    ,'#document'
    ,'document'
    ,'rm'
    ,'par'
    ,'None'
    ,'mathord'
    ,'array'
])
41 |
class BadProcess(Exception):
    # Raised when a processor's internal stacks end up unbalanced,
    # i.e. the plasTeX tree was traversed inconsistently.
    pass
44 |
class Processor:
    """Recursive walker over a plasTeX DOM tree.

    Subclasses implement addText/pushMacro/popMacro/openBracket/closeBracket
    to build their own output representation (see JsonProcessor and
    PlainProcessor below).
    """

    def __init__(self):
        # True while the next TEXT_NODE belongs to a \text macro argument
        self.textNode = False

    def process(self,node):
        """Process `node` recursively, dispatching to the subclass callbacks.

        Returns self so calls can be chained (e.g. .process(dom).dumps()).
        """
        if node.nodeName.startswith('text'):
            self.textNode = True
        if node.nodeType == Node.TEXT_NODE:
            # Short circuit text nodes
            text = unicode(node)
            # Unfortunately plasTeX does not place \text node arguments under text nodes
            if self.textNode:
                self.addText(text)
                self.textNode = False
            else:
                # Outside \text, each non-space character is its own token
                for char in text:
                    if char != ' ':
                        self.addText(char)
        elif node.nodeName in ignoreSet:
            # Ignore node and move on to children
            for child in node.childNodes:
                self.process(child)
        else:
            # A real macro: bracket its children between push/pop callbacks
            self.pushMacro(unicode(node.nodeName))
            self.processChildren(node)
            self.popMacro(unicode(node.nodeName))

        return self

    def processChildren(self,node):
        """Process a node's attribute fragments, then its child nodes."""
        # See if we have any attributes to process
        if node.hasAttributes():
            for key, value in node.attributes.items():
                # If the key is 'self' these nodes are the same as the child nodes
                # If the key is '*modifier*' we dont care about it
                if key == 'self' or key == '*modifier*':
                    continue
                elif value.__class__ is TeXFragment:
                    self.openBracket()
                    for child in value.childNodes:
                        self.process(child)
                    self.closeBracket()
                elif value.__class__ is Node:
                    self.openBracket()
                    self.process(value)
                    self.closeBracket()
                else:
                    continue

        # Process child nodes
        if node.childNodes:
            self.openBracket()
            for child in node.childNodes:
                self.process(child)
            self.closeBracket()

        return self
102 |
103 | # Converts a plasTeX DOM tree into a json tree #
class JsonProcessor(Processor):
    """Processor that renders the DOM tree as a JSON-style structure:
    nested lists of characters/strings and {macroName: children} dicts."""

    def __init__(self):
        self.textNode = False
        # Stack of list frames; frame [-1] collects the current macro's children
        self.text = [[]]
        # Stack of open macro names, used to sanity-check pop order
        self.macros = []

    def dumps(self):
        """Return the finished tree; exactly one frame must remain."""
        if len(self.text) == 1:
            return self.text[0]
        raise BadProcess()

    def addText(self,text):
        self.text[-1].append(text)

    def pushMacro(self,macro):
        self.macros.append(macro)
        self.text.append([])

    def popMacro(self,macro):
        expected = self.macros.pop()
        if expected != macro:
            # Mismatched push/pop — the traversal went wrong somewhere
            raise BadProcess()
        children = self.text.pop()
        self.text[-1].append({expected : children})

    def openBracket(self):
        pass

    def closeBracket(self):
        pass
134 |
135 | # Converts a plasTeX DOM tree back into plain LaTeX
class PlainProcessor(Processor):
    """Processor that renders the DOM tree back into plain LaTeX text."""

    def __init__(self):
        self.textNode = False   # managed by Processor.process for \text arguments
        self.text = []          # flat list of emitted tokens
        self.macros = 0         # current macro nesting depth

    def dumps(self):
        """Return the accumulated LaTeX as a single space-separated string."""
        return " ".join(self.text)

    def addText(self,text):
        self.text.append(text)

    def pushMacro(self,macro):
        self.macros += 1
        if macro.startswith("active::"):
            # Emit the bare active character after the "active::" prefix.
            # Bug fix: the old code used macro.lstrip("active::"), which
            # treats its argument as a character SET, so a macro like
            # "active::epsilon" lost leading name characters too.
            self.text.append(macro[len("active::"):])
        else:
            self.text.append("\\" + macro)

    def popMacro(self,macro):
        self.macros -= 1

    def openBracket(self):
        self.text.append("{")

    def closeBracket(self):
        self.text.append("}")
163 |
# Override plasTeX's buggy handling of mathmode, since we dont need textmode
# NOTE(review): this monkey-patches every plasTeX Context so isMathMode is
# always True; safe here only because this service parses equations exclusively.
plasTeX.Context.Context.isMathMode = property(lambda obj: True)
166 |
def parseLaTeX(string):
    """Parse a LaTeX string and return the resulting plasTeX DOM tree."""
    # PlasTeX bug: MathShift.inEnv is module-level state that never gets
    # reinitialised between runs, so reset it by hand.
    MathShift.inEnv = []

    # Fresh TeX processor with logging silenced, fed the input text.
    processor = TeX()
    processor.disableLogging()
    processor.input(string)
    return processor.parse()
178 |
179 | ### Making the preprocessor available as a couchdb _external ###
180 |
181 | import sys
182 | import simplejson as json
183 |
def requests():
    """Yield one decoded json request per line read from stdin, until EOF."""
    # readline returns '' only at EOF, which is exactly the iter sentinel.
    for line in iter(sys.stdin.readline, ''):
        yield json.loads(line)
189 |
190 | import signal
191 |
class Timeout(Exception):
    """Raised when the alarm signal fires before parsing completes."""

    def __str__(self):
        return "Timed out"


def handleTimeout(signum, frame):
    """SIGALRM handler: convert the signal into a Timeout exception."""
    raise Timeout()
198 |
def main():
    # Service loop for the couchdb _external protocol: read one json request
    # per stdin line, parse the latex it carries, and emit one json response
    # line on stdout. Runs until stdin is exhausted.
    # Work around the lack of real threading by using an alarm signal for timeouts
    signal.signal(signal.SIGALRM, handleTimeout)

    for request in requests():
        try:
            try: # Nested try because older versions of python cant handle except/finally
                query = request['query']

                format = query['format']

                # Timeout defaults to 5 seconds when absent or unparseable.
                try:
                    timeout = int(float(query['timeout']))
                except ValueError, e:
                    timeout = 5
                except KeyError, e:
                    timeout = 5
                signal.alarm(timeout)

                # Wrap the fragment in a document environment so plasTeX
                # will accept it; the $$ pair puts it in display math.
                dom = parseLaTeX("\\begin{document} $$" + query['latex'] + "$$ \\end{document}")

                # Render the DOM in whichever format(s) the caller asked for.
                if format == 'json-plain':
                    jsonResponse = JsonProcessor().process(dom).dumps()
                    plainResponse = PlainProcessor().process(dom).dumps()
                    response = {'code':200, 'json':{'json':jsonResponse, 'plain':plainResponse}}
                elif format == 'json':
                    jsonResponse = JsonProcessor().process(dom).dumps()
                    response = {'code':200, 'json':jsonResponse}
                elif format == 'plain':
                    plainResponse = PlainProcessor().process(dom).dumps()
                    response = {'code':200, 'body':plainResponse, 'headers':{'Content-type':'text/plain'}}
                else:
                    response = {'code':400, 'body':('Error: bad format argument'), 'headers':{'Content-type':'text/plain'}} # Bad request

            except KeyError, e:
                response = {'code':400, 'body':('Error: ' + str(e)), 'headers':{'Content-type':'text/plain'}} # Bad request
            except Timeout, e:
                response = {'code':500, 'body':('Error: ' + str(e)), 'headers':{'Content-type':'text/plain'}} # Internal server error
            except Exception, e:
                response = {'code':500, 'body':('Error: ' + str(e)), 'headers':{'Content-type':'text/plain'}} # Internal server error
        finally:
            # Deactivate the timeout
            signal.alarm(0)

        sys.stdout.write("%s\n" % json.dumps(response))
        sys.stdout.flush()
245 |
def dumps(latex):
    """Convenience wrapper: parse a bare latex fragment and return its json tree."""
    document = "\\begin{document} $$" + latex + "$$ \\end{document}"
    return JsonProcessor().process(parseLaTeX(document)).dumps()
248 |
249 | if __name__ == "__main__":
250 | main()
251 |
--------------------------------------------------------------------------------
/scripts/replay.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """ Replay existing searches found in couchdb logs """
4 |
5 | import sys
6 | import re
7 | import search
8 | import urllib
9 |
10 | search_term_re = re.compile(r'searchTerm=([^&]*)&')
11 |
def replay_log_file(filename):
    """Replay every distinct search term found in a couchdb log file.

    Yields one (elapsed-seconds, search-term) pair per unique term.
    """
    search_terms = set()

    # Collect the distinct terms first so each is replayed exactly once.
    log_file = open(filename)
    try:
        for log in log_file:
            match = search_term_re.search(log)
            if match:
                search_terms.add(urllib.unquote(match.group(1)))
    finally:
        # BUG FIX: the original iterated open(filename) directly and never
        # closed the file handle.
        log_file.close()

    for search_term in search_terms:
        result = search.search(search_term, searchTimeout="55000.0", limit="10000")
        yield (result['time'], search_term)
24 |
25 | if __name__ == '__main__':
26 | for time, search_term in replay_log_file(sys.argv[1]):
27 | print time, search_term
28 |
--------------------------------------------------------------------------------
/scripts/search.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python
2 |
3 | """ Python interface to the index external service running on couchdb """
4 |
5 | import urllib
6 | import time
7 | from xml.dom import minidom
8 | from db import port, couchdb_server
9 | from util import encodeDoi
10 |
def parseResults(results):
    """Yield (doi, [(eqnID, distance, eqnSource), ...]) for each hit in the result DOM."""
    db = couchdb_server['documents']
    hits = results.getElementsByTagName("Article") + results.getElementsByTagName("Book")
    for hit in hits:
        doi = hit.attributes.get('doi').value
        # Pull the equation sources for this document out of couchdb.
        source = db[encodeDoi(doi)]['source']
        equations = []
        for eqn in hit.getElementsByTagName("equation"):
            eqnID = eqn.attributes.get('id').value
            distance = eqn.attributes.get('distance').value
            equations.append((eqnID, distance, source[eqnID]))
        yield (doi, equations)
18 |
def search(searchTerm, searchTimeout="20.0", limit="2500", precision="0.7"):
    """Query the index _external service and return a response dict.

    The dict always carries 'time' (seconds elapsed); on success it carries
    'results' (from parseResults), otherwise 'error' holds the raw reply.
    """
    response = {}

    url = "http://localhost:%s/documents/_external/index?searchTerm=%s&searchTimeout=%s&limit=%s&precision=%s" % (port, urllib.quote(searchTerm), searchTimeout, limit, precision)
    startTime = time.time()
    results = urllib.urlopen(url).read()
    endTime = time.time()

    response['time'] = endTime - startTime
    # BUG FIX: the original condition compared `results == ""` three times
    # over; the three identical comparisons were evidently meant to catch the
    # three error replies the index emits (test.py checks for the tags
    # LatexParseError, TimedOut and LimitExceeded).
    # NOTE(review): matching by tag-name substring - confirm the exact error XML.
    if (results == ""
            or "LatexParseError" in results
            or "TimedOut" in results
            or "LimitExceeded" in results):
        response['error'] = results
    else:
        response['results'] = list(parseResults(minidom.parseString(results)))

    return response
34 |
35 | import sys
36 | import simplejson as json
37 |
def requests():
    """Yield one decoded json request per line read from stdin, until EOF."""
    while True:
        line = sys.stdin.readline()
        if not line:  # readline returns '' only at EOF
            break
        yield json.loads(line)
43 |
44 | def main():
45 | for request in requests():
46 | try:
47 | query = request['query']
48 | response = {'code':200, 'json':search(**query)}
49 | except Exception, e:
50 | response = {'code':200, 'body':('Error: ' + str(e)), 'headers':{'Content-type':'text/plain'}} # Internal server error
51 |
52 | sys.stdout.write("%s\n" % json.dumps(response))
53 | sys.stdout.flush()
54 |
55 | if __name__ == "__main__":
56 | main()
57 |
--------------------------------------------------------------------------------
/scripts/test.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python
2 |
3 | """ Whitebox testing of the index external service running on couchdb """
4 |
5 | import os, sys, httplib, urllib, socket
6 | from xml.dom import minidom
7 | from util import decodeDoi
8 | import random
9 | from preprocessor import PlainProcessor, parseLaTeX
10 | import couchdb.client
11 | from db import couchdb_server, port
12 | import time
13 |
14 | rand = random.Random()
15 |
def pruneNode(node):
    # Randomly trim node.childNodes down to one contiguous sub-range, so the
    # surviving latex fragment stays syntactically well formed.
    # Mutates node in place and also returns it for convenience.
    if node.childNodes:
        if len(node.childNodes)>2:
            # Draw a random [start, end) window over the children.
            start = rand.randint(0, len(node.childNodes)-1)
            end = rand.randint(0, len(node.childNodes)-1)
            if start>end:
                start, end = end, start
            elif start == end and end < len(node.childNodes):
                # Widen an empty window so at least one child survives.
                end = end+1
            elif start == end and start > 0:
                start = start-1
            try:
                # Delete the tail first so `start` keeps its meaning.
                del node.childNodes[end:len(node.childNodes)]
                del node.childNodes[0:start]
            except AttributeError:
                pass # Some types of nodes dont support deletion

    return node
34 |
# Return a random (and syntactically correct) substring of a latex string
def substring(latex):
    """Parse latex, randomly prune the DOM, and render it back to plain latex."""
    dom = parseLaTeX("\\begin{document} $$ " + latex + " $$ \\end{document}")
    return PlainProcessor().process(pruneNode(dom)).dumps()
41 |
# Search for a substring of an existing equation and check that the parent article is included in the results
def runTest(doi,transform):
    # One whitebox test round: pick a random equation from document `doi`,
    # apply `transform` to its source (identity or `substring`), search the
    # index for it, and report pass/fail on stdout. Returns True on a pass.
    db = couchdb_server['documents']
    eqnID, source = rand.choice(db[doi]['source'].items())
    results = None
    searchTerm = None
    try:
        searchTerm = transform(source)
        url = "http://localhost:%s/documents/_external/index?searchTerm=\"%s\"&searchTimeout=20&limit=2500" % (port, urllib.quote(searchTerm))
        startTime = time.time()
        resultsFile = urllib.urlopen(url)
        endTime = time.time()
        results = minidom.parse(resultsFile)
        # The index reports its failure modes as distinctive XML tags.
        if results.getElementsByTagName("LatexParseError"):
            print "Latex parse error on doi: %s and eqnID: %s (%fs)" % (decodeDoi(doi), eqnID, endTime-startTime)
            return False
        if results.getElementsByTagName("TimedOut"):
            print "Timed out on doi: %s and eqnID: %s (%fs)" % (decodeDoi(doi), eqnID, endTime-startTime)
            return False
        if results.getElementsByTagName("LimitExceeded"):
            print "Limit exceeded on doi: %s and eqnID: %s (%fs)" % (decodeDoi(doi), eqnID, endTime-startTime)
            return False
        # Pass iff the originating document and equation appear in the hits.
        for result in results.getElementsByTagName("Article") + results.getElementsByTagName("Chapter"):
            if result.attributes.get('doi').value == decodeDoi(doi):
                for eqn in result.getElementsByTagName("equation"):
                    if eqn.attributes.get('id').value == eqnID:
                        print "Passed on doi: %s and eqnID: %s (%fs)" % (decodeDoi(doi), eqnID, endTime-startTime)
                        return True
        # NOTE(review): this message prints the encoded doi, unlike the other
        # messages which print decodeDoi(doi) - possibly unintentional.
        print "Failed on doi: %s and eqnID: %s (%fs)" % (doi, eqnID, endTime-startTime)
        print searchTerm
        return False
    except KeyboardInterrupt, e:
        raise e
    except Exception, e:
        print "Error on doi: %s and eqnID: %s (%fs)" % (decodeDoi(doi), eqnID, 0)
        print e
        try:
            print "Searchterm: %s" % searchTerm
        except UnicodeEncodeError:
            pass
    return False
83 |
def runTests(n,transform):
    """Run n randomised search tests, each against a randomly chosen document."""
    db = couchdb_server['documents']
    dois = list(db)
    for _ in xrange(n):
        doi = None
        source = None
        # Keep drawing random documents until one can actually be fetched.
        while not source:
            try:
                doi = rand.choice(dois)
                source = db[doi]['source']
            except socket.error:
                pass # Connection refused, probably because someone restarted the server
        runTest(doi,transform)
        sys.stdout.flush()
98 |
99 | import getopt
100 |
101 | if __name__ == '__main__':
102 | opts, args = getopt.getopt(sys.argv[1:], "", ["simple=","substring="])
103 | for opt, arg in opts:
104 | if opt == "--simple":
105 | runTests(int(arg),lambda x: x)
106 | if opt == "--substring":
107 | runTests(int(arg),substring)
108 | print "Ok"
109 |
110 |
--------------------------------------------------------------------------------
/scripts/times.py:
--------------------------------------------------------------------------------
1 | #!/bin/env python
2 |
3 | """ Benchmarking for the index external service running on couchdb """
4 |
5 | import os, sys, httplib, urllib, socket
6 | import random
7 | import couchdb.client
8 | from db import couchdb_server, port
9 | import time
10 |
11 | rand = random.Random()
12 |
13 | def runTime(doi):
14 | db = couchdb_server['documents']
15 | eqnID, searchTerm = rand.choice(db[doi]['source'].items())
16 | try:
17 | url = "http://localhost:%s/documents/_external/index?searchTerm=\"%s\"&searchTimeout=60&limit=10000" % (port, urllib.quote(searchTerm))
18 | startTime = time.time()
19 | resultsFile = urllib.urlopen(url)
20 | endTime = time.time()
21 | print endTime-startTime
22 | except KeyboardInterrupt, e:
23 | raise e
24 | except Exception, e:
25 | pass
26 |
def runTimes(n):
    """Benchmark n searches, each against a randomly chosen document."""
    db = couchdb_server['documents']
    dois = list(db)
    for _ in xrange(n):
        doi = None
        source = None
        # Keep drawing random documents until one can actually be fetched.
        while not source:
            try:
                doi = rand.choice(dois)
                source = db[doi]['source']
            except socket.error:
                pass # Connection refused, probably because someone restarted the server
        runTime(doi)
        sys.stdout.flush()
41 |
42 | import getopt
43 |
44 | if __name__ == '__main__':
45 | opts, args = getopt.getopt(sys.argv[1:], "", ["n="])
46 | for opt, arg in opts:
47 | if opt == "--n":
48 | runTimes(int(arg))
49 | print "Ok"
50 |
51 |
--------------------------------------------------------------------------------
/scripts/util.py:
--------------------------------------------------------------------------------
def encodeDoi(doi):
    """Encode a DOI as a couchdb document id: the first '/' becomes '_'."""
    return "_".join(doi.split("/", 1))
3 |
def decodeDoi(doi):
    """Inverse of encodeDoi: the first '_' becomes '/'.
    (Only a true inverse when the DOI prefix itself contains no '_'.)"""
    return "/".join(doi.split("_", 1))
6 |
--------------------------------------------------------------------------------
/src/_tags:
--------------------------------------------------------------------------------
1 | <*> : syntax(camlp4o), package(extlib), package(netclient), package(json-wheel), package(json-static), package(ancient), package(xml-light), package(str), package(unix)
--------------------------------------------------------------------------------
/src/dynArray.ml:
--------------------------------------------------------------------------------
1 | (* Modified version of ExtLib DynArray - contains no functional values so is safer for Marshal *)
2 |
3 | (*
4 | * DynArray - Resizeable Ocaml arrays
5 | * Copyright (C) 2003 Brian Hurt
6 | * Copyright (C) 2003 Nicolas Cannasse
7 | *
8 | * This library is free software; you can redistribute it and/or
9 | * modify it under the terms of the GNU Lesser General Public
10 | * License as published by the Free Software Foundation; either
11 | * version 2.1 of the License, or (at your option) any later version,
12 | * with the special exception on linking described in file LICENSE.
13 | *
14 | * This library is distributed in the hope that it will be useful,
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 | * Lesser General Public License for more details.
18 | *
19 | * You should have received a copy of the GNU Lesser General Public
20 | * License along with this library; if not, write to the Free Software
21 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 | *)
23 |
(* A resizing policy: given the current slot count and the old/new logical
   lengths, return how many slots the backing store should have. *)
type resizer_t = currslots:int -> oldlength:int -> newlength:int -> int

(* Untyped backing store, manipulated through Obj primitives so the array
   holds no closures (see the header note about Marshal safety). *)
type 'a intern

external ilen : 'a intern -> int = "%obj_size"
(* Duplicate a block; the zero-length block is an atom and may be shared. *)
let idup (x : 'a intern) = if ilen x = 0 then x else (Obj.magic (Obj.dup (Obj.repr x)) : 'a intern)
let imake tag len = (Obj.magic (Obj.new_block tag len) : 'a intern)
external iget : 'a intern -> int -> 'a = "%obj_field"
external iset : 'a intern -> int -> 'a -> unit = "%obj_set_field"

(* A dynamic array: [arr] may hold more slots than the [len] live elements. *)
type 'a t = {
	mutable arr : 'a intern;
	mutable len : int;
}

(* Carries (offending value, function name, parameter name). *)
exception Invalid_arg of int * string * string

let invalid_arg n f p = raise (Invalid_arg (n,f,p))

(* Number of live elements (not allocated slots). *)
let length d = d.len
44 |
(* Double the slot count until it covers [newlength]; otherwise halve while
   it stays over [newlength]. [oldlength] is ignored by this policy. *)
let exponential_resizer ~currslots ~oldlength ~newlength =
	let rec doubler x = if x >= newlength then x else doubler (x * 2) in
	let rec halfer x = if x / 2 < newlength then x else halfer (x / 2) in
	if newlength = 1 then
		1
	else if currslots = 0 then
		doubler 1
	else if currslots < newlength then
		doubler currslots
	else
		halfer currslots

(* Policy that allocates in fixed [step]-sized increments; rejects
   non-positive steps up front. *)
let step_resizer step =
	if step <= 0 then invalid_arg step "step_resizer" "step";
	(fun ~currslots ~oldlength ~newlength ->
		if currslots < newlength || newlength < (currslots - step)
		then
			(newlength + step - (newlength mod step))
		else
			currslots)

(* Like [exponential_resizer] when growing, but more reluctant to shrink.
   NOTE(review): the shrink branch fires when [oldlength < newlength] even
   though capacity already suffices - confirm against upstream ExtLib. *)
let conservative_exponential_resizer ~currslots ~oldlength ~newlength =
	let rec doubler x = if x >= newlength then x else doubler (x * 2) in
	let rec halfer x = if x / 2 < newlength then x else halfer (x / 2) in
	if currslots < newlength then begin
		if newlength = 1 then
			1
		else if currslots = 0 then
			doubler 1
		else
			doubler currslots
	end else if oldlength < newlength then
		halfer currslots
	else
		currslots

let default_resizer = conservative_exponential_resizer
82 |
(* Set the logical length to [newlen], reallocating and copying the backing
   store when the resizing policy asks for a different slot count. *)
let changelen (d : 'a t) newlen =
	let oldsize = ilen d.arr in
	let r = default_resizer
		~currslots:oldsize
		~oldlength:d.len
		~newlength:newlen
	in
	(* We require the size to be at least large enough to hold the number
	 * of elements we know we need!
	 *)
	let newsize = if r < newlen then newlen else r in
	if newsize <> oldsize then begin
		let newarr = imake 0 newsize in
		(* Copy only the elements that survive the length change. *)
		let cpylen = (if newlen < d.len then newlen else d.len) in
		for i = 0 to cpylen - 1 do
			iset newarr i (iget d.arr i);
		done;
		d.arr <- newarr;
	end;
	d.len <- newlen
103 |
(* Shrink the backing store so it holds exactly [d.len] slots. *)
let compact d =
	if d.len <> ilen d.arr then begin
		let newarr = imake 0 d.len in
		for i = 0 to d.len - 1 do
			iset newarr i (iget d.arr i)
		done;
		d.arr <- newarr;
	end
112 |
(* Empty array with no preallocated slots. *)
let create() =
	{
		len = 0;
		arr = imake 0 0;
	}

(* Empty array with room for [initsize] elements before any reallocation. *)
let make initsize =
	if initsize < 0 then invalid_arg initsize "make" "size";
	{
		len = 0;
		arr = imake 0 initsize;
	}

(* Array of [initlen] elements, filled with [f 0 .. f (initlen-1)]. *)
let init initlen f =
	if initlen < 0 then invalid_arg initlen "init" "len";
	let arr = imake 0 initlen in
	for i = 0 to initlen-1 do
		iset arr i (f i)
	done;
	{
		len = initlen;
		arr = arr;
	}
136 |
(* Deliberately a no-op: resizers are not stored on the array so values of
   type ['a t] contain no functional values and stay safe for Marshal
   (see the module header). Every array uses [default_resizer]. *)
let set_resizer d resizer =
	()

let get_resizer d =
	default_resizer
142 |
(* True when the array holds no elements. *)
let empty d =
	d.len = 0

(* Bounds-checked read; valid indexes are 0 .. len-1. *)
let get d idx =
	if idx < 0 || idx >= d.len then invalid_arg idx "get" "index";
	iget d.arr idx

(* Last element; fails on an empty array. *)
let last d =
	if d.len = 0 then invalid_arg 0 "last" "";
	iget d.arr (d.len - 1)

(* Bounds-checked overwrite of an existing element. *)
let set d idx v =
	if idx < 0 || idx >= d.len then invalid_arg idx "set" "index";
	iset d.arr idx v
157 |
(* Insert [v] at [idx], shifting later elements up one slot.
   [idx = d.len] appends. *)
let insert d idx v =
	if idx < 0 || idx > d.len then invalid_arg idx "insert" "index";
	(* Grow by one slot, reallocating only when the store is full. *)
	if d.len = ilen d.arr then changelen d (d.len + 1) else d.len <- d.len + 1;
	if idx < d.len - 1 then begin
		(* Shift in reverse so no element is overwritten before it is moved. *)
		for i = d.len - 2 downto idx do
			iset d.arr (i+1) (iget d.arr i)
		done;
	end;
	iset d.arr idx v

(* Append [v] as the new last element. *)
let add d v =
	if d.len = ilen d.arr then changelen d (d.len + 1) else d.len <- d.len + 1;
	iset d.arr (d.len - 1) v
171 |
(* Remove the element at [idx], shifting later elements down one slot.
   May shrink the backing store if the resizing policy asks for it. *)
let delete d idx =
	if idx < 0 || idx >= d.len then invalid_arg idx "delete" "index";
	let oldsize = ilen d.arr in
	(* we don't call changelen because we want to blit *)
	let r = default_resizer
		~currslots:oldsize
		~oldlength:d.len
		~newlength:(d.len - 1)
	in
	let newsize = (if r < d.len - 1 then d.len - 1 else r) in
	if oldsize <> newsize then begin
		(* Reallocate: copy both halves around the deleted slot. *)
		let newarr = imake 0 newsize in
		for i = 0 to idx - 1 do
			iset newarr i (iget d.arr i);
		done;
		for i = idx to d.len - 2 do
			iset newarr i (iget d.arr (i+1));
		done;
		d.arr <- newarr;
	end else begin
		(* In place: shift down, then clear the vacated slot for the GC. *)
		for i = idx to d.len - 2 do
			iset d.arr i (iget d.arr (i+1));
		done;
		iset d.arr (d.len - 1) (Obj.magic 0)
	end;
	d.len <- d.len - 1
198 |
199 |
(* Remove [len] elements starting at [idx]; later elements move down to
   fill the hole. May shrink the backing store. *)
let delete_range d idx len =
	if len < 0 then invalid_arg len "delete_range" "length";
	if idx < 0 || idx + len > d.len then invalid_arg idx "delete_range" "index";
	let oldsize = ilen d.arr in
	(* we don't call changelen because we want to blit *)
	let r = default_resizer
		~currslots:oldsize
		~oldlength:d.len
		~newlength:(d.len - len)
	in
	let newsize = (if r < d.len - len then d.len - len else r) in
	if oldsize <> newsize then begin
		(* Reallocate: copy both halves around the deleted range. *)
		let newarr = imake 0 newsize in
		for i = 0 to idx - 1 do
			iset newarr i (iget d.arr i);
		done;
		for i = idx to d.len - len - 1 do
			iset newarr i (iget d.arr (i+len));
		done;
		d.arr <- newarr;
	end else begin
		(* In place: shift down, then clear the vacated tail for the GC. *)
		for i = idx to d.len - len - 1 do
			iset d.arr i (iget d.arr (i+len));
		done;
		for i = d.len - len to d.len - 1 do
			iset d.arr i (Obj.magic 0)
		done;
	end;
	d.len <- d.len - len
229 |
(* Drop all elements and release the backing store. *)
let clear d =
	d.len <- 0;
	d.arr <- imake 0 0

(* Remove the last element. *)
let delete_last d =
	if d.len <= 0 then invalid_arg 0 "delete_last" "";
	(* erase for GC, in case changelen don't resize our array *)
	iset d.arr (d.len - 1) (Obj.magic 0);
	changelen d (d.len - 1)
239 |
(* Copy [len] elements from [src] at [srcidx] into [dst] at [dstidx],
   growing [dst] if needed. Handles src == dst with overlapping ranges.
   (Declared [rec] although it never recurses.) *)
let rec blit src srcidx dst dstidx len =
	if len < 0 then invalid_arg len "blit" "len";
	if srcidx < 0 || srcidx + len > src.len then invalid_arg srcidx "blit" "source index";
	if dstidx < 0 || dstidx > dst.len then invalid_arg dstidx "blit" "dest index";
	let newlen = dstidx + len in
	if newlen > ilen dst.arr then begin
		(* this case could be inlined so we don't blit on just-copied elements *)
		changelen dst newlen
	end else begin
		if newlen > dst.len then dst.len <- newlen;
	end;
	(* same array ! we need to copy in reverse order *)
	if src.arr == dst.arr && dstidx > srcidx then
		for i = len - 1 downto 0 do
			iset dst.arr (dstidx+i) (iget src.arr (srcidx+i));
		done
	else
		for i = 0 to len - 1 do
			iset dst.arr (dstidx+i) (iget src.arr (srcidx+i));
		done

(* Append every element of [src] onto the end of [dst]. *)
let append src dst =
	blit src 0 dst dst.len src.len
263 |
(* Elements in order as a list; built back-to-front so it is tail recursive. *)
let to_list d =
	let rec loop idx accum =
		if idx < 0 then accum else loop (idx - 1) (iget d.arr idx :: accum)
	in
	loop (d.len - 1) []

(* Elements in order as a fresh OCaml array. *)
let to_array d =
	if d.len = 0 then begin
		(* since the empty array is an atom, we don't care if float or not *)
		[||]
	end else begin
		(* Array.make with element 0 fixes the representation (float or not). *)
		let arr = Array.make d.len (iget d.arr 0) in
		for i = 1 to d.len - 1 do
			Array.unsafe_set arr i (iget d.arr i)
		done;
		arr;
	end
281 |
(* Build a dynamic array holding the elements of [lst] in order. *)
let of_list lst =
	let size = List.length lst in
	let arr = imake 0 size in
	let rec loop idx = function
		| h :: t -> iset arr idx h; loop (idx + 1) t
		| [] -> ()
	in
	loop 0 lst;
	{
		len = size;
		arr = arr;
	}

(* Build a dynamic array from an OCaml array. Unboxed float arrays must be
   copied element by element (boxing each float); other arrays can be
   duplicated field-for-field. *)
let of_array src =
	let size = Array.length src in
	let is_float = Obj.tag (Obj.repr src) = Obj.double_array_tag in
	let arr = (if is_float then begin
		let arr = imake 0 size in
		for i = 0 to size - 1 do
			iset arr i (Array.unsafe_get src i);
		done;
		arr
	end else
		(* copy the fields *)
		idup (Obj.magic src : 'a intern))
	in
	{
		len = size;
		arr = arr;
	}

(* Independent shallow copy: fresh backing store, shared elements. *)
let copy src =
	{
		len = src.len;
		arr = idup src.arr;
	}

(* Fresh array holding [len] elements of [src] starting at [start]. *)
let sub src start len =
	if len < 0 then invalid_arg len "sub" "len";
	if start < 0 || start + len > src.len then invalid_arg start "sub" "start";
	let arr = imake 0 len in
	for i = 0 to len - 1 do
		iset arr i (iget src.arr (i+start));
	done;
	{
		len = len;
		arr = arr;
	}
330 |
(* Apply [f] to each element, in index order. *)
let iter f d =
	for i = 0 to d.len - 1 do
		f (iget d.arr i)
	done

(* Apply [f] to each index/element pair, in index order. *)
let iteri f d =
	for i = 0 to d.len - 1 do
		f i (iget d.arr i)
	done

(* Keep only the elements satisfying [f], preserving their order.
   Writes survivors into a fresh store of the original size. *)
let filter f d =
	let l = d.len in
	let a = imake 0 l in
	let a2 = d.arr in
	let p = ref 0 in
	for i = 0 to l - 1 do
		let x = iget a2 i in
		if f x then begin
			iset a !p x;
			incr p;
		end;
	done;
	d.len <- !p;
	d.arr <- a
355 |
(* Index of the first element satisfying [f]; raises Not_found otherwise. *)
let index_of f d =
	let rec scan i =
		if i >= d.len then raise Not_found
		else if f (iget d.arr i) then i
		else scan (i + 1)
	in
	scan 0
367 |
(* Fresh array of [f] applied to each element, in index order. *)
let map f src =
	let arr = imake 0 src.len in
	for i = 0 to src.len - 1 do
		iset arr i (f (iget src.arr i))
	done;
	{
		len = src.len;
		arr = arr;
	}

(* Fresh array of [f] applied to each index/element pair, in index order. *)
let mapi f src =
	let arr = imake 0 src.len in
	for i = 0 to src.len - 1 do
		iset arr i (f i (iget src.arr i))
	done;
	{
		len = src.len;
		arr = arr;
	}
387 |
(* Fold [f] over the elements from index 0 upward, seeded with [x]. *)
let fold_left f x a =
	let acc = ref x in
	for i = 0 to a.len - 1 do
		acc := f !acc (iget a.arr i)
	done;
	!acc

(* Fold [f] over the elements from the last index downward, seeded with [x]. *)
let fold_right f a x =
	let acc = ref x in
	for i = a.len - 1 downto 0 do
		acc := f (iget a.arr i) !acc
	done;
	!acc
400 |
(* Enumeration over the live elements. The cursor [idxref] is shared with
   clones created at the current position. (Uses ExtLib's Enum.) *)
let enum d =
	let rec make start =
		let idxref = ref 0 in
		let next () =
			if !idxref >= d.len then
				raise Enum.No_more_elements
			else
				let retval = iget d.arr !idxref in
				incr idxref;
				retval
		and count () =
			if !idxref >= d.len then 0
			else d.len - !idxref
		and clone () =
			make !idxref
		in
		Enum.make ~next:next ~count:count ~clone:clone
	in
	make 0

(* Build an array from an enumeration; preallocates when the count is cheap. *)
let of_enum e =
	if Enum.fast_count e then begin
		let c = Enum.count e in
		let arr = imake 0 c in
		Enum.iteri (fun i x -> iset arr i x) e;
		{
			len = c;
			arr = arr;
		}
	end else
		let d = make 0 in
		Enum.iter (add d) e;
		d
434 |
(* Unchecked read: caller must guarantee 0 <= n < length a. *)
let unsafe_get a n =
	iget a.arr n

(* Unchecked write: caller must guarantee 0 <= n < length a. *)
let unsafe_set a n x =
	iset a.arr n x
440 |
--------------------------------------------------------------------------------
/src/dynArray.mli:
--------------------------------------------------------------------------------
1 | (*
2 | * DynArray - Resizeable Ocaml arrays
3 | * Copyright (C) 2003 Brian Hurt
4 | * Copyright (C) 2003 Nicolas Cannasse
5 | *
6 | * This library is free software; you can redistribute it and/or
7 | * modify it under the terms of the GNU Lesser General Public
8 | * License as published by the Free Software Foundation; either
9 | * version 2.1 of the License, or (at your option) any later version,
10 | * with the special exception on linking described in file LICENSE.
11 | *
12 | * This library is distributed in the hope that it will be useful,
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 | * Lesser General Public License for more details.
16 | *
17 | * You should have received a copy of the GNU Lesser General Public
18 | * License along with this library; if not, write to the Free Software
19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 | *)
21 |
22 | (** Dynamic arrays.
23 |
    A dynamic array is equivalent to an OCaml array that will resize itself
25 | when elements are added or removed, except that floats are boxed and
26 | that no initialization element is required.
27 | *)
28 |
29 | type 'a t
30 |
31 | exception Invalid_arg of int * string * string
32 | (** When an operation on an array fails, [Invalid_arg] is raised. The
33 | integer is the value that made the operation fail, the first string
34 | contains the function name that has been called and the second string
35 | contains the parameter name that made the operation fail.
36 | *)
37 |
38 | (** {6 Array creation} *)
39 |
40 | val create : unit -> 'a t
41 | (** [create()] returns a new empty dynamic array. *)
42 |
43 | val make : int -> 'a t
44 | (** [make count] returns an array with some memory already allocated so
45 | up to [count] elements can be stored into it without resizing. *)
46 |
47 | val init : int -> (int -> 'a) -> 'a t
48 | (** [init n f] returns an array of [n] elements filled with values
49 | returned by [f 0 , f 1, ... f (n-1)]. *)
50 |
51 | (** {6 Array manipulation functions} *)
52 |
53 | val empty : 'a t -> bool
54 | (** Return true if the number of elements in the array is 0. *)
55 |
56 | val length : 'a t -> int
57 | (** Return the number of elements in the array. *)
58 |
59 | val get : 'a t -> int -> 'a
60 | (** [get darr idx] gets the element in [darr] at index [idx]. If [darr] has
61 | [len] elements in it, then the valid indexes range from [0] to [len-1]. *)
62 |
63 | val last : 'a t -> 'a
64 | (** [last darr] returns the last element of [darr]. *)
65 |
66 | val set : 'a t -> int -> 'a -> unit
67 | (** [set darr idx v] sets the element of [darr] at index [idx] to value
68 | [v]. The previous value is overwritten. *)
69 |
70 | val insert : 'a t -> int -> 'a -> unit
71 | (** [insert darr idx v] inserts [v] into [darr] at index [idx]. All elements
72 | of [darr] with an index greater than or equal to [idx] have their
73 | index incremented (are moved up one place) to make room for the new
74 | element. *)
75 |
76 | val add : 'a t -> 'a -> unit
77 | (** [add darr v] appends [v] onto [darr]. [v] becomes the new
78 | last element of [darr]. *)
79 |
80 | val append : 'a t -> 'a t -> unit
81 | (** [append src dst] adds all elements of [src] to the end of [dst]. *)
82 |
83 | val delete : 'a t -> int -> unit
84 | (** [delete darr idx] deletes the element of [darr] at [idx]. All elements
85 | with an index greater than [idx] have their index decremented (are
86 | moved down one place) to fill in the hole. *)
87 |
88 | val delete_last : 'a t -> unit
89 | (** [delete_last darr] deletes the last element of [darr]. This is equivalent
90 | of doing [delete darr ((length darr) - 1)]. *)
91 |
92 | val delete_range : 'a t -> int -> int -> unit
93 | (** [delete_range darr p len] deletes [len] elements starting at index [p].
94 | All elements with an index greater than [p+len] are moved to fill
95 | in the hole. *)
96 |
97 | val clear : 'a t -> unit
98 | (** remove all elements from the array and resize it to 0. *)
99 |
100 | val blit : 'a t -> int -> 'a t -> int -> int -> unit
101 | (** [blit src srcidx dst dstidx len] copies [len] elements from [src]
102 | starting with index [srcidx] to [dst] starting at [dstidx]. *)
103 |
104 | val compact : 'a t -> unit
105 | (** [compact darr] ensures that the space allocated by the array is minimal.*)
106 |
107 | (** {6 Array copy and conversion} *)
108 |
109 | val to_list : 'a t -> 'a list
110 | (** [to_list darr] returns the elements of [darr] in order as a list. *)
111 |
112 | val to_array : 'a t -> 'a array
113 | (** [to_array darr] returns the elements of [darr] in order as an array. *)
114 |
115 | val enum : 'a t -> 'a Enum.t
116 | (** [enum darr] returns the enumeration of [darr] elements. *)
117 |
118 | val of_list : 'a list -> 'a t
119 | (** [of_list lst] returns a dynamic array with the elements of [lst] in
120 | it in order. *)
121 |
122 | val of_array : 'a array -> 'a t
123 | (** [of_array arr] returns an array with the elements of [arr] in it
124 | in order. *)
125 |
126 | val of_enum : 'a Enum.t -> 'a t
127 | (** [of_enum e] returns an array that holds, in order, the elements of [e]. *)
128 |
129 | val copy : 'a t -> 'a t
130 | (** [copy src] returns a fresh copy of [src], such that no modification of
131 | [src] affects the copy, or vice versa (all new memory is allocated for
132 | the copy). *)
133 |
134 | val sub : 'a t -> int -> int -> 'a t
(** [sub darr start len] returns an array holding the subset of [len]
    elements from [darr] starting with the element at index [start]. *)
137 |
138 | (** {6 Array functional support} *)
139 |
140 | val iter : ('a -> unit) -> 'a t -> unit
141 | (** [iter f darr] calls the function [f] on every element of [darr]. It
142 | is equivalent to [for i = 0 to length darr - 1 do f (get darr i) done;] *)
143 |
144 | val iteri : (int -> 'a -> unit) -> 'a t -> unit
(** [iteri f darr] calls the function [f] on every element of [darr]. It
    is equivalent to [for i = 0 to length darr - 1 do f i (get darr i) done;]
 *)
148 |
149 | val map : ('a -> 'b) -> 'a t -> 'b t
150 | (** [map f darr] applies the function [f] to every element of [darr]
151 | and creates a dynamic array from the results - similar to [List.map] or
152 | [Array.map]. *)
153 |
154 | val mapi : (int -> 'a -> 'b) -> 'a t -> 'b t
155 | (** [mapi f darr] applies the function [f] to every element of [darr]
156 | and creates a dynamic array from the results - similar to [List.mapi] or
157 | [Array.mapi]. *)
158 |
159 | val fold_left : ('a -> 'b -> 'a) -> 'a -> 'b t -> 'a
160 | (** [fold_left f x darr] computes
161 | [f ( ... ( f ( f (get darr 0) x) (get darr 1) ) ... ) (get darr n-1)],
162 | similar to [Array.fold_left] or [List.fold_left]. *)
163 |
164 | val fold_right : ('a -> 'b -> 'b) -> 'a t -> 'b -> 'b
165 | (** [fold_right f darr x] computes
166 | [ f (get darr 0) (f (get darr 1) ( ... ( f (get darr n-1) x ) ... ) ) ]
167 | similar to [Array.fold_right] or [List.fold_right]. *)
168 |
169 | val index_of : ('a -> bool) -> 'a t -> int
170 | (** [index_of f darr] returns the index of the first element [x] in [darr]
171 |     such that [f x] returns [true], or raises [Not_found] if there is none. *)
172 |
173 | val filter : ('a -> bool) -> 'a t -> unit
174 |
175 | (** {6 Array resizers} *)
176 |
177 | type resizer_t = currslots:int -> oldlength:int -> newlength:int -> int
178 | (** The type of a resizer function.
179 |
180 | Resizer functions are called whenever elements are added to
181 | or removed from the dynamic array to determine what the current number of
182 | storage spaces in the array should be. The three named arguments
183 | passed to a resizer are the current number of storage spaces in
184 | the array, the length of the array before the elements are
185 | added or removed, and the length the array will be after the
186 | elements are added or removed. If elements are being added, newlength
187 | will be larger than oldlength, if elements are being removed,
188 | newlength will be smaller than oldlength. If the resizer function
189 | returns exactly oldlength, the size of the array is only changed when
190 | adding an element while there is not enough space for it.
191 |
192 | By default, all dynamic arrays are created with the [default_resizer].
193 | When a dynamic array is created from another dynamic array (using [copy],
194 | [map] , etc. ) the resizer of the copy will be the same as the original
195 | dynamic array resizer. To change the resizer, use the [set_resizer]
196 | function.
197 | *)
198 |
199 | val set_resizer : 'a t -> resizer_t -> unit
200 | (** Change the resizer for this array. *)
201 |
202 | val get_resizer : 'a t -> resizer_t
203 | (** Get the current resizer function for a given array *)
204 |
205 | val default_resizer : resizer_t
206 | (** The default resizer function the library is using - in this version
207 | of DynArray, this is the [exponential_resizer] but should change in
208 | next versions. *)
209 |
210 | val exponential_resizer : resizer_t
211 | (** The exponential resizer- The default resizer except when the resizer
212 | is being copied from some other darray.
213 |
214 | [exponential_resizer] works by doubling or halving the number of
215 | slots until they "fit". If the number of slots is less than the
216 | new length, the number of slots is doubled until it is greater
217 | than the new length (or Sys.max_array_size is reached).
218 |
219 | If the number of slots is more than four times the new length,
220 | the number of slots is halved until it is less than four times the
221 | new length.
222 |
223 | Allowing darrays to fall below 25% utilization before shrinking them
224 | prevents "thrashing". Consider the case where the caller is constantly
225 | adding a few elements, and then removing a few elements, causing
226 | the length to constantly cross above and below a power of two.
227 |     Shrinking the array when it falls below 50% would cause the
228 | underlying array to be constantly allocated and deallocated.
229 | A few elements would be added, causing the array to be reallocated
230 | and have a usage of just above 50%. Then a few elements would be
231 |     removed, and the array would fall below 50% utilization and be
232 | reallocated yet again. The bulk of the array, untouched, would be
233 | copied and copied again. By setting the threshold at 25% instead,
234 | such "thrashing" only occurs with wild swings- adding and removing
235 | huge numbers of elements (more than half of the elements in the array).
236 |
237 | [exponential_resizer] is a good performing resizer for most
238 | applications. A list allocates 2 words for every element, while an
239 | array (with large numbers of elements) allocates only 1 word per
240 | element (ignoring unboxed floats). On insert, [exponential_resizer]
241 | keeps the amount of wasted "extra" array elements below 50%, meaning
242 | that less than 2 words per element are used. Even on removals
243 | where the amount of wasted space is allowed to rise to 75%, that
244 | only means that darray is using 4 words per element. This is
245 | generally not a significant overhead.
246 |
247 | Furthermore, [exponential_resizer] minimizes the number of copies
248 | needed- appending n elements into an empty darray with initial size
249 | 0 requires between n and 2n elements of the array be copied- O(n)
250 | work, or O(1) work per element (on average). A similar argument
251 | can be made that deletes from the end of the array are O(1) as
252 | well (obviously deletes from anywhere else are O(n) work- you
253 | have to move the n or so elements above the deleted element down).
254 |
255 | *)
256 |
257 | val step_resizer : int -> resizer_t
258 | (** The stepwise resizer- another example of a resizer function, this
259 | time of a parameterized resizer.
260 |
261 | The resizer returned by [step_resizer step] returns the smallest
262 | multiple of [step] larger than [newlength] if [currslots] is less
263 | then [newlength]-[step] or greater than [newlength].
264 |
265 | For example, to make an darray with a step of 10, a length
266 | of len, and a null of null, you would do:
267 | [make] ~resizer:([step_resizer] 10) len null
268 | *)
269 |
270 | val conservative_exponential_resizer : resizer_t
271 | (** [conservative_exponential_resizer] is an example resizer function
272 | which uses the oldlength parameter. It only shrinks the array
273 | on inserts- no deletes shrink the array, only inserts. It does
274 | this by comparing the oldlength and newlength parameters. Other
275 | than that, it acts like [exponential_resizer].
276 | *)
277 |
278 | (** {6 Unsafe operations} **)
279 |
280 | val unsafe_get : 'a t -> int -> 'a
281 | val unsafe_set : 'a t -> int -> 'a -> unit
282 |
--------------------------------------------------------------------------------
/src/hashset.ml:
--------------------------------------------------------------------------------
(* Simple sets using Hashtbl *)

type 'a t = ('a, unit) Hashtbl.t

let create = Hashtbl.create

let mem = Hashtbl.mem

(* Hashtbl.replace (rather than Hashtbl.add) keeps at most one binding
   per key, so adding an element twice is harmless. *)
let add ht key = Hashtbl.replace ht key ()

(* Elements in unspecified (hash) order. *)
let to_list ht = Hashtbl.fold (fun k _ rest -> k :: rest) ht []

let of_list list =
  let ht = create (List.length list) in
  (* Fixed: the original used List.map purely for its side effect,
     allocating and discarding a [unit list]. *)
  List.iter (add ht) list;
  ht

(* [union] and [inter] allocate a fresh set; their arguments are unchanged. *)
let union ht1 ht2 =
  let ht3 = create 0 in
  Hashtbl.iter (fun key _ -> add ht3 key) ht1;
  Hashtbl.iter (fun key _ -> add ht3 key) ht2;
  ht3

let inter ht1 ht2 =
  let ht3 = create 0 in
  Hashtbl.iter (fun key _ -> if mem ht2 key then add ht3 key) ht1;
  ht3

(* Remove, in place, every element on which [f] is false.
   Fixed: the original called Hashtbl.remove from inside Hashtbl.iter;
   mutating a table during iteration is unspecified behaviour, so we
   collect the doomed elements first and remove them afterwards. *)
let filter f ht =
  let doomed =
    Hashtbl.fold (fun elem _ acc -> if f elem then acc else elem :: acc) ht [] in
  List.iter (Hashtbl.remove ht) doomed
36 |
--------------------------------------------------------------------------------
/src/hashset.mli:
--------------------------------------------------------------------------------
(* Mutable hash-based sets (see hashset.ml). *)

type 'a t

(* [create n] makes an empty set with initial capacity [n]. *)
val create : int -> 'a t

val mem : 'a t -> 'a -> bool
(* Adding an element already present is a no-op. *)
val add : 'a t -> 'a -> unit

(* Elements are returned in unspecified order. *)
val to_list : 'a t -> 'a list
val of_list : 'a list -> 'a t

(* [union] and [inter] allocate a fresh set; arguments are unchanged. *)
val union : 'a t -> 'a t -> 'a t
val inter : 'a t -> 'a t -> 'a t

(* [filter f s] removes from [s], in place, every element on which [f] is false. *)
val filter: ('a -> bool) -> 'a t -> unit
15 |
--------------------------------------------------------------------------------
/src/index.ml:
--------------------------------------------------------------------------------
1 | (*
2 | This module controls toplevel interaction with the search index.
3 | Mostly I/O and error handling. See the last section for the commands supported.
4 | *)
5 |
(* Types and json parsing *)

(* These declarations use the json-static camlp4 syntax extension; the
   file relies on the generated [<name>_of_json] converters below
   (document_of_json, updates_of_json, ...). *)

type json doi = string

and eqnID = string
and containerID = string
and publicationYear = string
and format = string

(* A couchdb document holding a paper's equations. *)
and document =
  < ?containerID : containerID option
  ; ?format : format = "Article" (* Originally only articles were supported *)
  ; publicationYear : publicationYear option
  ; content : (string * Json_type.t) assoc (* eqnID*Latex.t *)
  ; source : (string * string) assoc > (* eqnID*string *)

(* Query arguments; numeric fields arrive as strings and are converted
   in handle_query. *)
and args =
  < searchTerm : string
  ; ?searchTimeout : string = "10.0"
  ; ?preprocessorTimeout : string = "5.0"
  ; ?limit : string = "1000"
  ; ?start : string = "0"
  ; ?count : string = string_of_int max_int
  ; ?doi : string option
  ; ?containerID : containerID option
  ; ?publishedAfter : publicationYear option
  ; ?publishedBefore : publicationYear option
  ; ?precision : string = "0.7" >

and get =
  < query : args >

and post =
  < body : string >

(* One row of couchdb's _all_docs_by_seq feed. *)
and update =
  < id : doi
  ; key : int
  ; value : < ?deleted : bool = false >
  ; ?doc : Json_type.t option >

and updates =
  < rows : update list >

(* Output of the external latex preprocessor. *)
and preprocessed =
  < json : Json_type.t
  ; plain : string >

(* A single equation within a document. *)
type equation =
  { doi : doi
  ; eqnID : eqnID }

(* json-static converts json into objects which cannot be stored using Marshal, so store metadata record instead *)
type metadata =
  { containerID : containerID option
  ; format : format
  ; publicationYear : publicationYear option
  ; no_eqns : int } (* number of equations in the document's content *)

(* Project a parsed couchdb document onto the marshal-friendly record. *)
let metadata_of_doc doc =
  { containerID = doc#containerID
  ; format = doc#format
  ; publicationYear = doc#publicationYear
  ; no_eqns = List.length doc#content }
70 |
(* Assorted imports and utility functions *)

module Http = Http_client.Convenience

(* ~plus:false: spaces become %20 rather than '+'. *)
let encode url = Netencoding.Url.encode ~plus:false url

(* couchdb does not allow '/' in keys *)
(* NOTE(review): Str.replace_first rewrites only the FIRST '/' (resp. '_');
   a DOI containing several slashes or underscores would not round-trip —
   confirm DOI suffixes never contain these characters. *)
let encode_doi doi = Str.replace_first (Str.regexp "/") "_" doi
let decode_doi doi = Str.replace_first (Str.regexp "_") "/" doi

let flush_line = Util.flush_line

(* Maps keyed by (encoded) document DOI. *)
module Doi_map = MyMap.Make
  (struct
     type t = doi
     let compare = compare
   end)

(* Maps keyed by equation id. *)
module Eqnid_map = MyMap.Make
  (struct
     type t = eqnID
     let compare = compare
   end)
93 |
(* Our main index structure *)

type index =
  { last_update : int (* Key of the last update received from couchdb *)
  ; metadata : metadata Doi_map.t (* per-document metadata, keyed by encoded DOI *)
  ; suffix_array : equation Suffix_array.t (* the searchable equation corpus *) }

(* Persisting *)

(* The whole index is persisted to ./data/index (see Util.save_data /
   Util.load_data for the serialisation format). *)
let load_index () = (Util.load_data "./data/index" : index)

let save_index index = Util.save_data "./data/index" (index : index)
106 |
(* Database interaction *)

(* Read the couchdb port from ./db.ini: the first line matching
   "port = <digits>" wins.
   Fixes over the original:
   - the int result of Str.search_forward was discarded with [;],
     triggering a non-unit-statement warning — now explicitly ignored;
   - the group parentheses were written with the illegal escapes
     \( \) inside the string literal — now properly escaped as \\( \\);
   - the channel was never closed, and hitting end-of-file without a
     port line escaped as a raw End_of_file — now closed on both paths,
     with a descriptive Failure when no port is found. *)
let couchdb_url =
  let conf = open_in "./db.ini" in
  let port_re = Str.regexp "port *= *\\([0-9]+\\)" in
  let rec read_port () =
    let line = try Some (input_line conf) with End_of_file -> None in
    match line with
    | None ->
      close_in conf;
      failwith "No port setting found in ./db.ini"
    | Some line ->
      (try
         ignore (Str.search_forward port_re line 0);
         Str.matched_group 1 line
       with Not_found -> read_port ()) in
  let port = read_port () in
  close_in conf;
  "http://localhost:" ^ port ^ "/"

let db_url = couchdb_url ^ "documents/"
121 |
(* Fetch a document by (encoded) DOI from couchdb and parse it. *)
let get_document doi =
  let url = db_url ^ doi in
  let json = Json_io.json_of_string (Http.http_get url) in
  document_of_json json

(* Run a latex string through the external preprocessor service.
   [timeout] is passed through as a string of seconds.
   Returns the hashed Latex.t plus its plain-text rendering. *)
let preprocess timeout latex_string =
  let url = db_url ^ "_external/preprocess?format=json-plain&timeout=" ^ (encode timeout) ^ "&latex=" ^ (encode latex_string) in
  let preprocessed = preprocessed_of_json (Json_io.json_of_string (Http.http_get url)) in
  (Latex.of_json preprocessed#json,preprocessed#plain)
131 |
(* Responses to couchdb *)

(* Build the <results> payload: a <query> echo followed by one element
   per matching document (the element name is the document's format,
   e.g. "Article"), each containing one <equation> per match with its
   edit distance. *)
let xml_of_results results query_string =
  let xml_of_eqn (eqnID,weight) =
    Xml.Element ("equation", [("distance",string_of_int weight);("id",eqnID)], []) in
  let xml_of_result (doi,metadata,eqns) =
    Xml.Element (metadata.format,
      [("doi", decode_doi doi);("count", string_of_int (List.length eqns))],
      (List.map xml_of_eqn eqns)) in
  let xml_of_query_string =
    Xml.Element ("query",[],[Xml.PCData query_string]) in
  Xml.Element ("results", [], xml_of_query_string :: (List.map xml_of_result results))

(* An error is reported as a single empty element named after the error. *)
let xml_error error = Xml.Element (error,[],[])

(* Wrap an XML body in the json envelope a couchdb _external expects. *)
let xml_response xml =
  Json_type.Object
    [ ("code",Json_type.Int 200)
    ; ("headers",Json_type.Object [("Content-type",Json_type.String "text/xml")])
    ; ("body",Json_type.String (Xml.to_string xml)) ]
152 |
(* Timeouts *)

exception Timeout

(* Arm (or, with 0.0, disarm) the wall-clock interval timer. *)
let set_timer tsecs = ignore (Unix.setitimer Unix.ITIMER_REAL { Unix.it_interval = 0.0; Unix.it_value = tsecs })

(* Run [f ()] allowing at most [tsecs] seconds of wall-clock time;
   raises [Timeout] if the alarm fires first. The timer is disarmed and
   the previous SIGALRM handler restored on every exit path (the
   original left its own handler installed permanently). *)
let with_timeout tsecs f =
  let old_handler =
    Sys.signal Sys.sigalrm (Sys.Signal_handle (fun _ -> raise Timeout)) in
  let restore () =
    set_timer 0.0;
    Sys.set_signal Sys.sigalrm old_handler in
  try
    set_timer tsecs;
    let result = f () in
    restore ();
    result
  with exc ->
    restore ();
    raise exc
169 |
(* Queries *)

(* Execute [query] against the suffix array and render the results as XML.
   precision   : minimum match precision (see Latex.cutoff)
   filter      : doi -> metadata -> bool, applied after collation
   limit       : maximum number of distinct DOIs before answering LimitExceeded
   start/count : pagination window over the sorted result list
   NOTE(review): Doi_map.find below raises Not_found if a DOI in the
   suffix array has no metadata entry; handle_query's catch-all turns
   that into a 500 — confirm the two structures can never diverge. *)
let run_query index query precision filter limit start count =
  let eqns = Suffix_array.find_query index.suffix_array precision query in
  (* Collate eqns by doi *)
  let doi_map =
    List.fold_left
      (fun doi_map (weight,equation) ->
        let (key, value) = (equation.doi, (equation.eqnID,weight)) in
        Doi_map.update key (fun values -> value::values) [value] doi_map)
      Doi_map.empty
      eqns in
  (* Remove the dummy node *)
  let doi_map = Doi_map.remove "" doi_map in
  if Doi_map.count doi_map > limit
  then
    xml_error "LimitExceeded"
  else
    let results = Doi_map.to_list doi_map in
    (* Insert metadata *)
    let results = List.map (fun (doi,eqns) -> (doi, Doi_map.find doi index.metadata, eqns)) results in
    (* Apply filter *)
    let results = List.filter (fun (doi,metadata,_) -> filter doi metadata) results in
    (* Sort each set of equations by weight *)
    let results = List.map (fun (doi,metadata,eqns) -> (doi,metadata,List.fast_sort (fun a b -> compare (snd a) (snd b)) eqns)) results in
    (* Sort doi's by lowest weighted equation *)
    let results = List.fast_sort (fun (_,_,eqnsA) (_,_,eqnsB) -> compare (snd (List.hd eqnsA)) (snd (List.hd eqnsB))) results in
    (* Return the chosen page *)
    let results = ExtList.List.take count (ExtList.List.drop start results) in
    xml_of_results results (Query.to_string query)
200 |
(* Handle one request line. Argument errors answer ArgParseError, bad
   query syntax QueryParseError, expiry of searchTimeout TimedOut, and
   anything unexpected a bare 500 envelope. *)
let handle_query index str =
  try
    let args =
      let json = Json_io.json_of_string str in
      (* accept args either as query string or as post body *)
      try
        (get_of_json json)#query
      with Json_type.Json_error _ ->
        let json = Json_io.json_of_string (post_of_json json)#body in
        args_of_json json in
    (* Numeric arguments arrive as strings; failures here raise Failure
       and are reported as ArgParseError below. *)
    let searchTimeout = float_of_string args#searchTimeout in
    let preprocessorTimeout = args#preprocessorTimeout in
    let limit = int_of_string args#limit in
    let start = int_of_string args#start in
    let count = int_of_string args#count in
    let query = Query.of_string (preprocess preprocessorTimeout) args#searchTerm in
    let precision = float_of_string args#precision in
    (* Optional comma-separated restriction lists *)
    let containerIDs =
      match args#containerID with
      | None -> []
      | Some csv -> ExtString.String.nsplit csv "," in
    let dois =
      match args#doi with
      | None -> []
      | Some csv -> ExtString.String.nsplit csv "," in
    (* A document passes when every supplied restriction matches.
       NOTE(review): publishedBefore/After compare [string option]
       values with polymorphic (>=)/(<=) — lexicographic on the year
       strings, fine for 4-digit years; confirm the input format. *)
    let filter doi metadata =
      ((args#containerID = None) || (List.exists (fun containerID -> metadata.containerID = Some containerID) containerIDs))
      && ((args#doi = None) || (List.mem (decode_doi doi) dois))
      && ((args#publishedBefore = None) || ((args#publishedBefore >= metadata.publicationYear) && (metadata.publicationYear <> None)))
      && ((args#publishedAfter = None) || ((args#publishedAfter <= metadata.publicationYear) && (metadata.publicationYear <> None))) in
    xml_response (with_timeout searchTimeout (fun () -> run_query index query precision filter limit start count))
  with
  | Json_type.Json_error _ | Failure _ -> xml_response (xml_error "ArgParseError")
  | Query.Parse_error -> xml_response (xml_error "QueryParseError")
  | Timeout -> xml_response (xml_error "TimedOut")
  | _ -> Json_type.Object [("code",Json_type.Int 500)] (* Internal server error *)
237 |
(* Serve queries forever: one json request per stdin line, one compact
   json response per stdout line (the couchdb _external protocol).
   The suffix array is first ancientified (see Suffix_array.ancientify)
   — presumably to take it out of the GC's working set; confirm. *)
let handle_queries () =
  let index = load_index () in
  Suffix_array.ancientify index.suffix_array;
  while true do
    let input = input_line stdin in
    let json = handle_query index input in
    flush_line (Json_io.string_of_json ~compact:true json)
  done
246 |
(* Initialising index *)

(* Interactively (re)create an empty index on disk, after confirmation. *)
let init_index () =
  flush_line ("couchdb is at " ^ couchdb_url);
  print_string "This will erase the existing index. Are you sure? (y/n):"; flush stdout;
  match read_line () with
  | "y" ->
    flush_line "Saving index";
    save_index {last_update = -1; suffix_array = Suffix_array.create (); metadata = Doi_map.empty};
    flush_line "Ok"
  | _ ->
    flush_line "Ok, nothing was done"
259 |
(* Updating the index *)

(* Number of update rows pulled from couchdb per request. *)
let batch_size = 1000

(* Fetch the next batch of update rows following [last_update].
   NOTE(review): the request sends startkey=last_update while the
   message says "from last_update+1 onwards" — this relies on
   _all_docs_by_seq treating startkey as exclusive; confirm against the
   couchdb version in use. Any HTTP/parse failure aborts via Exit. *)
let get_update_batch last_update =
  flush_line
    ("Fetching updates from " ^
    (string_of_int (last_update+1)) ^
    " onwards");
  let url =
    db_url ^ "_all_docs_by_seq?include_docs=true" ^
    "&startkey=" ^ (string_of_int last_update) ^
    "&limit=" ^ (string_of_int batch_size) in
  try
    let json = Json_io.json_of_string (Http.http_get url) in
    (updates_of_json json)#rows
  with _ ->
    flush_line "Error contacting database (documents)";
    raise Exit
279 |
exception FailedUpdate of int * doi

(* Apply one update row to the index:
   1. delete any indexed version of the document;
   2. unless the row is a deletion (or carries no doc), index the new
      equations and metadata.
   Any exception is wrapped in FailedUpdate (sequence key, doi).
   Note the suffix array is mutated in place even though the metadata
   map is functional — a partially applied update is not rolled back. *)
let run_update index update =
  try
    (* Start by deleting old version of the document if it already exists *)
    let index =
      if not (Doi_map.mem update#id index.metadata) then index else
      begin
        Util.flush_line ("Deleting " ^ update#id);
        Suffix_array.delete index.suffix_array (fun equation -> equation.doi = update#id);
        let metadata = Doi_map.remove update#id index.metadata in
        {index with metadata=metadata}
      end in
    (* Add the new version of the documents if the deleted flag is not set *)
    match (update#doc, update#value#deleted) with
    | (None, _) | (_,true) ->
      {index with last_update=update#key}
    | (Some json, false) ->
      begin
        let doc = document_of_json json in
        let equations =
          List.map
            (fun (eqnID,json) -> ({doi=update#id; eqnID=eqnID}, Latex.of_json json))
            doc#content in
        Suffix_array.add index.suffix_array equations;
        let metadata = Doi_map.add update#id (metadata_of_doc doc) index.metadata in
        {index with last_update=update#key; metadata=metadata}
      end
  with _ ->
    raise (FailedUpdate (update#key, update#id))
310 |
(* Pull and apply update batches until a short (final) batch arrives;
   the index is saved to disk after every batch. Returns the final index. *)
let rec run_update_batches index =
  let batch = get_update_batch index.last_update in
  let index = List.fold_left run_update index batch in
  save_index index;
  if List.length batch >= batch_size
  then run_update_batches index
  else index
316 |
(* Pull all pending updates into the index, then prepare the suffix
   array for querying and save. On FailedUpdate the batch loop stops;
   note the [index] kept in that branch is the originally loaded one —
   batches completed before the failure were already saved by
   run_update_batches, but the failing batch's partial work is lost. *)
let run_updates () =
  Pid.lock ();
  flush_line ("couchdb is at " ^ couchdb_url);
  flush_line "Loading index";
  let index = load_index () in
  let index =
    try
      run_update_batches index
    with FailedUpdate(key,id) ->
      flush_line ("Update " ^ (string_of_int key) ^ " failed (DOI: " ^ id ^ ")");
      index in
  flush_line ("Finished updating at update: " ^ (string_of_int index.last_update));
  flush_line "Preparing index";
  Suffix_array.prepare index.suffix_array;
  save_index index;
  flush_line "Ok"
333 |
(* Introspection *)

(* Print one line per indexed DOI plus a grand total of equations. *)
let list_all () =
  flush_line ("couchdb is at " ^ couchdb_url);
  flush_line "Loading index";
  let index = load_index () in
  Doi_map.iter
    (fun doi metadata ->
      match metadata.containerID with
      | None ->
        (* fixed: the original omitted the space before "no_equations=" *)
        flush_line ((decode_doi doi) ^ " no_equations=" ^ (string_of_int metadata.no_eqns))
      | Some containerID ->
        flush_line ((decode_doi doi) ^ " containerID=" ^ containerID ^ " no_equations=" ^ (string_of_int metadata.no_eqns)))
    index.metadata;
  let no_eqns = Doi_map.fold (fun _ metadata total -> metadata.no_eqns+total) index.metadata 0 in
  flush_line ("Total number of equations: " ^ (string_of_int no_eqns))
350 |
(* Print the index entry for a single DOI (given in decoded form). *)
let list_one doi =
  let doi = encode_doi doi in
  flush_line ("couchdb is at " ^ couchdb_url);
  flush_line "Loading index";
  let index = load_index () in
  flush_line ("Searching for " ^ doi);
  try
    (* fixed: the original applied encode_doi a second time here *)
    let metadata = Doi_map.find doi index.metadata in
    (match metadata.containerID with
    | None ->
      (* fixed: the original omitted the space before "no_equations=" *)
      flush_line ((decode_doi doi) ^ " no_equations=" ^ (string_of_int metadata.no_eqns))
    | Some containerID ->
      flush_line ((decode_doi doi) ^ " containerID=" ^ containerID ^ " no_equations=" ^ (string_of_int metadata.no_eqns)))
  with Not_found ->
    flush_line "DOI not indexed"
366 |
(* Main *)

open Arg

(* Command-line entry point: each flag runs one command immediately.
   Anonymous arguments are ignored. *)
let _ = parse
  [("-init", Unit init_index, ": Create an empty index")
  ;("-update", Unit run_updates, ": Update the index")
  ;("-query", Unit handle_queries, ": Handle index queries as a couchdb _external")
  ;("-list_all", Unit list_all, ": List all indexed keys")
  ;("-list", String list_one, ": List the entry for a given key")]
  ignore
  "Use 'index -help' for available options"
378 |
--------------------------------------------------------------------------------
/src/latex.ml:
--------------------------------------------------------------------------------
(*
The internal representation of preprocessed latex strings.
The string elements are hashed to save space and speed up comparisons.
The json input is produced by the python preprocessor.
*)

(* A structural element of a latex string, before hashing. *)
type element =
  | Command of string
  | Text of string

(* A preprocessed latex string: each element replaced by its Hashtbl.hash. *)
type t = int array

let empty () = Array.make 0 0

(* The argument array is used directly, not copied. *)
let of_array array = array
16 |
exception Parse_error

(* Parse one json node: an object {command: children} yields a Command
   followed by its flattened children; a bare string yields Text.
   Anything else is a Parse_error. *)
let rec element_of_json json =
  match json with
  | Json_type.Object [(command,json)] -> (Command command) :: element_list_of_json json
  | Json_type.String text -> [Text text]
  | _ -> raise Parse_error
and element_list_of_json json =
  match json with
  | Json_type.Array jsons -> List.concat (List.map element_of_json jsons)
  | _ -> raise Parse_error
28 |
(* Parsing elements from json *)
(* Flatten the json tree to elements and hash each one. *)
let of_json json =
  Array.of_list (List.map Hashtbl.hash (element_list_of_json json))

(* Defined to make json-static happy, not used *)
let to_json latex = Json_type.Null
35 |
let length = Array.length

(* Index of a suffix within a latex string. *)
type pos = int

(* Lexicographic comparison of the suffix of [latexL] starting at [pos1]
   against the suffix of [latexR] starting at [pos2]. A suffix that runs
   out first compares smaller. Returns -1, 0 or 1. *)
let compare_suffix (latexL, pos1) (latexR, pos2) =
  let lenL = length latexL and lenR = length latexR in
  let rec walk i j =
    if i >= lenL then (if j >= lenR then 0 else -1)
    else if j >= lenR then 1
    else
      let cmp = compare latexL.(i) latexR.(j) in
      if cmp = 0 then walk (i + 1) (j + 1)
      else if cmp < 0 then -1
      else 1 in
  walk pos1 pos2
53 |
(* Is the suffix of [latexL] from [pos1] a prefix of the suffix of
   [latexR] from [pos2]? An exhausted left side always matches. *)
let is_prefix (latexL, pos1) (latexR, pos2) =
  let lenL = Array.length latexL and lenR = Array.length latexR in
  let rec matches i j =
    i >= lenL
    || (j < lenR && latexL.(i) == latexR.(j) && matches (i + 1) (j + 1)) in
  matches pos1 pos2
62 |
(* Divide latex into k substrings of equal(ish) lengths *)
(* The first (n mod k) fragments carry one extra element.
   NOTE(review): assumes k >= 1 — k = 0 raises Division_by_zero, and
   k > n yields only n single-element fragments; confirm callers
   guarantee a sensible k. *)
let fragments latex k =
  let n = length latex in
  let size = n / k in
  let rec fragments' pos larger =
    if pos >= n then [] else
    let size = if larger > 0 then size+1 else size in
    (Array.sub latex pos size) :: (fragments' (pos+size) (larger-1)) in
  fragments' 0 (n mod k)
72 |
(* Smallest of three ints (annotated so comparison specialises to int). *)
let minimum (x : int) y z = min x (min y z)
76 |
(* Maximum edit distance allowed for a match at the given precision:
   the permitted fraction of errors, rounded up, clamped to [1, 5]. *)
let cutoff precision latex =
  let len = float_of_int (Array.length latex) in
  let allowed = ceil ((1.0 -. precision) *. len) in
  max 1 (min 5 (int_of_float allowed))
80 |
(*
Calculation of the Levensthein edit distance between two latex strings.
The calculation is left-biased: the left string is matched to any substring of the right string
*)
(* Dynamic programme over suffixes:
   cache.(l).(r) = cost of matching latexL[l..] against latexR[r..].
   Moves: skip a right element (cost 1), skip a left element (cost 1),
   or consume both — cost [abs (compare a b)], i.e. 0 on equal hashes
   and (for ints, where compare yields -1/0/1) 1 otherwise. Row 0 is
   special: right-hand elements before the first left match are skipped
   for free, giving the substring semantics described above. *)
let distance latexL latexR =
  let maxl, maxr = Array.length latexL, Array.length latexR in
  if maxl = 0 then 0 else
  if maxr = 0 then maxl else
  (* cache.(l).(r) is the distance between latexL[l to maxl] and latexR[r to maxr] *)
  let cache = Array.make_matrix (maxl + 1) (maxr + 1) 0 in
  (* Must match everything on the left *)
  for l = maxl - 1 downto 0 do
    cache.(l).(maxr) <- 1 + cache.(l+1).(maxr)
  done;
  (* General matching *)
  for l = maxl - 1 downto 1 do
    for r = maxr - 1 downto 0 do
      cache.(l).(r) <-
        minimum
          (1 + cache.(l).(r+1))
          (1 + cache.(l+1).(r))
          ((abs (compare latexL.(l) latexR.(r))) + cache.(l+1).(r+1))
    done done;
  (* Non-matches on the right dont count until left starts matching *)
  for r = maxr - 1 downto 0 do
    cache.(0).(r) <-
      minimum
        (cache.(0).(r+1))
        (1 + cache.(1).(r))
        ((abs (compare latexL.(0) latexR.(r))) + cache.(1).(r+1))
  done;
  cache.(0).(0)
113 |
(* Some distance when [latexL] matches [latexR] strictly within the
   precision cutoff, None otherwise. *)
let similar precision latexL latexR =
  let d = distance latexL latexR in
  if d >= cutoff precision latexL then None else Some d
117 |
--------------------------------------------------------------------------------
/src/latex.mli:
--------------------------------------------------------------------------------
(* Hashed representation of a preprocessed latex string (see latex.ml). *)
type t = int array

val empty : unit -> t
val length : t -> int

(* The array is adopted directly, not copied. *)
val of_array : int array -> t

val of_json : Json_type.t -> t
(* Always returns Null; exists only to satisfy json-static. *)
val to_json : t -> Json_type.t

(* An index into a latex string, identifying one of its suffixes. *)
type pos = int

val compare_suffix : (t * pos) -> (t * pos) -> int
val is_prefix : (t * pos) -> (t * pos) -> bool
(* [fragments latex k] splits [latex] into [k] near-equal pieces. *)
val fragments : t -> int -> t list

(* [cutoff precision latex] : allowed edit distance, clamped to [1, 5]. *)
val cutoff : float -> t -> int
val distance : t -> t -> int
(* Some distance when below the cutoff, else None. *)
val similar : float -> t -> t -> int option
20 |
--------------------------------------------------------------------------------
/src/myMap.ml:
--------------------------------------------------------------------------------
(* Map.S extended with a few convenience operations. *)
module type S =
  sig
    include Map.S

    (* [update key f default map] : rebind [key] to [f current], or to
       [default] when [key] is unbound. *)
    val update : key -> ('a -> 'a) -> 'a -> 'a t -> 'a t
    (* Number of bindings (O(n)). *)
    val count : 'a t -> int
    val to_list : 'a t -> (key * 'a) list
    (* [find] with a fallback instead of Not_found. *)
    val find_with : key -> 'a -> 'a t -> 'a
    (* Keep and transform only the bindings on which [f] returns Some. *)
    val filter_map : ('a -> 'b option) -> 'a t -> 'b t
  end

module Make (Ord : Map.OrderedType) : (S with type key = Ord.t) =
  struct
    include Map.Make (Ord)

    (* Performs two lookups (find then add). *)
    let update key f default map =
      add key (try f (find key map) with Not_found -> default) map

    let count map = fold (fun _ _ n -> n+1) map 0

    (* Descending key order: fold visits keys in ascending order and
       conses each binding onto the front. *)
    let to_list map = fold (fun k v rest -> (k,v) :: rest) map []

    let find_with key default map =
      try
        find key map
      with Not_found ->
        default

    let filter_map f map =
      fold
        (fun key value map ->
          match (f value) with
          | None -> map
          | Some value -> add key value map)
        map
        empty
  end
38 |
--------------------------------------------------------------------------------
/src/myMap.mli:
--------------------------------------------------------------------------------
(* Map.S extended with a few convenience operations (see myMap.ml). *)
module type S =
  sig
    include Map.S

    (* [update key f default map] : rebind [key] to [f current], or to
       [default] when [key] is unbound. *)
    val update : key -> ('a -> 'a) -> 'a -> 'a t -> 'a t
    (* Number of bindings (O(n)). *)
    val count : 'a t -> int
    (* Bindings in descending key order. *)
    val to_list : 'a t -> (key * 'a) list
    (* [find] with a fallback instead of Not_found. *)
    val find_with : key -> 'a -> 'a t -> 'a
    (* Keep and transform only the bindings on which [f] returns Some. *)
    val filter_map : ('a -> 'b option) -> 'a t -> 'b t
  end

module Make (Ord : Map.OrderedType) : (S with type key = Ord.t)
13 |
--------------------------------------------------------------------------------
/src/pid.ml:
--------------------------------------------------------------------------------
(* Prevents multiple update processes from running in parrallel *)

(* Check run/update.pid: raise [Exit] when it names a live process,
   otherwise overwrite it with our own pid.
   Fixes over the original:
   - "ps PID &> /dev/null" is bash syntax; Unix.system runs /bin/sh,
     where "&>" backgrounds ps, so the exit status was always 0 and a
     stale pid file blocked every update run. POSIX redirection
     ("> /dev/null 2>&1") is used instead.
   - the status match was non-exhaustive (exit codes other than 0/1,
     signals) and could die with Match_failure; any non-zero status is
     now treated as "process not running". *)
let lock () =
  try
    Util.flush_line "Checking pid file";
    let pid_file = open_in "run/update.pid" in
    let pid = try input_line pid_file with End_of_file -> "" in
    begin
      match (pid, Unix.system ("ps " ^ pid ^ " > /dev/null 2>&1")) with
      | ("", _) ->
        Util.flush_line "No existing pid"
      | (pid, Unix.WEXITED 0) ->
        Util.flush_line ("Process with pid " ^ pid ^ " already exists");
        raise Exit
      | (pid, _) ->
        Util.flush_line ("Process with pid " ^ pid ^ " does not exist")
    end;
    close_in pid_file;
    let pid_file = open_out "run/update.pid" in
    output_string pid_file (string_of_int (Unix.getpid ()));
    close_out pid_file
  with
  | Exit ->
    raise Exit
  | exc ->
    Util.flush_line "Error checking pid in run/update.pid";
    raise exc
28 |
--------------------------------------------------------------------------------
/src/pid.mli:
--------------------------------------------------------------------------------
(* Exits if the process in the pid file is alive *)
(* [lock ()] raises [Exit] when run/update.pid names a live process;
   otherwise it records the current pid there. *)
val lock : unit -> unit
3 |
--------------------------------------------------------------------------------
/src/query.ml:
--------------------------------------------------------------------------------
(* Compound boolean queries *)

(* The query type *)
type t =
  | Latex of Latex.t * string (* Store the string version so we can send the query back to the users *)
  | And of t * t
  | Or of t * t
8 |
(* True when [str] contains nothing but spaces (the empty string is blank). *)
let is_blank_string str =
  let n = String.length str in
  let rec only_spaces i = i >= n || (str.[i] = ' ' && only_spaces (i + 1)) in
  only_spaces 0
18 |
(* True when [str] both starts and ends with '"'.
   Fixes over the original: str.[0] was indexed unguarded, raising
   Invalid_argument on ""; a lone quote character was wrongly accepted
   (first and last index coincide); and chars were compared with
   physical equality (==) — structural (=) is the idiomatic form. *)
let is_quoted_string str =
  String.length str >= 2
  && str.[0] = '"'
  && str.[String.length str - 1] = '"'
21 |
open Str

(* Quick and dirty lexer, delimiters are: "latexstring" ) ( AND OR *)
let token_spec = regexp "\"[^\"]*\"\|(\|)\|AND\|OR"
(* Split [str] into a stream of Str.split_result tokens (Delim for the
   recognised delimiters, Text for anything else), dropping blank text
   runs. Non-blank free text is kept so the parser can reject it. *)
let lex str =
  let tokens = full_split token_spec str in
  let tokens =
    List.filter
      (function
        | Text text when is_blank_string text -> false
        | _ -> true)
      tokens in
  Stream.of_list tokens
35 |
(* A simple recursive descent parser *)
(* Written in camlp4 stream-parser syntax. Grammar (right associative):
     atom ::= '(' expr ')' | "quoted latex"
     expr ::= atom | atom AND expr | atom OR expr
   [preprocesser] turns the quoted text into (Latex.t, plain string).
   The final Stream.empty check raises if any tokens remain, so trailing
   junk is rejected (of_string maps every failure to Parse_error). *)
let parse_query preprocesser tokens =
  let rec parse_atom =
    parser
      | [< 'Delim "("; q=parse_expr; 'Delim ")" >] -> q
      | [< 'Delim delim when is_quoted_string delim >] ->
          (* Strip the surrounding quotes before preprocessing *)
          let text = String.sub delim 1 (String.length delim - 2) in
          let (latex, plain) = preprocesser text in
          Latex (latex, plain)

  and parse_expr =
    parser
      | [< q1=parse_atom; stream >] ->
          (parser
             | [< 'Delim "AND"; q2=parse_expr >] -> And (q1, q2)
             | [< 'Delim "OR"; q2=parse_expr >] -> Or (q1, q2)
             | [< >] -> q1)
          stream

  and parse_query =
    parser
      | [< q=parse_expr; stream >] ->
          Stream.empty stream; q in

  parse_query tokens
61 |
exception Parse_error

(* Parse a query string; every failure (lexing, parsing or latex
   preprocessing) is collapsed into Parse_error. *)
let of_string preprocesser str =
  try
    parse_query preprocesser (lex str)
  with _ ->
    raise Parse_error (* Dont care whether the error was parsing the query or preprocessing the latex *)
69 |
(* Render a query back to its textual form, fully parenthesised, using
   the plain strings captured at parse time. *)
let rec to_string = function
  | Latex (_, plain) -> "\"" ^ plain ^ "\""
  | And (lhs, rhs) -> "(" ^ to_string lhs ^ " AND " ^ to_string rhs ^ ")"
  | Or (lhs, rhs) -> "(" ^ to_string lhs ^ " OR " ^ to_string rhs ^ ")"
75 |
(* Extending the edit distance on latex strings to edit distance on compound queries *)
(* AND scores the worst branch, OR the best. *)
let rec distance query latexR =
  match query with
  | Latex (latexL, _) -> Latex.distance latexL latexR
  | And (lhs, rhs) -> max (distance lhs latexR) (distance rhs latexR)
  | Or (lhs, rhs) -> min (distance lhs latexR) (distance rhs latexR)
82 |
(* As distance, but within a precision cutoff: None means no match.
   And needs both branches to match; Or needs at least one. *)
let rec similar precision query latexR =
  match query with
  | Latex (latexL, _) ->
      Latex.similar precision latexL latexR
  | And (q1, q2) ->
      (match similar precision q1 latexR, similar precision q2 latexR with
      | Some dist1, Some dist2 -> Some (max dist1 dist2)
      | _ -> None)
  | Or (q1, q2) ->
      (match similar precision q1 latexR, similar precision q2 latexR with
      | Some dist1, Some dist2 -> Some (min dist1 dist2)
      | (Some _ as dist), None | None, (Some _ as dist) -> dist
      | None, None -> None)
101 |
--------------------------------------------------------------------------------
/src/query.mli:
--------------------------------------------------------------------------------
(* Boolean combinations of latex search terms. *)
type t =
  | Latex of Latex.t * string (* Store the string version so we can send the query back to the users *)
  | And of t * t
  | Or of t * t

(* Raised by of_string on any lexing, parsing or preprocessing failure. *)
exception Parse_error

(* of_string preprocesser str: parse str, using preprocesser to turn the
   text of each quoted string into its (latex, plain) representation. *)
val of_string : (string -> (Latex.t * string)) -> string -> t
(* Render a query back into its concrete syntax. *)
val to_string : t -> string

(* Edit distance between a query and a latex string: max over And
   branches, min over Or branches. *)
val distance : t -> Latex.t -> int
(* Distance within a precision cutoff; None when there is no match. *)
val similar : float -> t -> Latex.t -> int option
13 |
--------------------------------------------------------------------------------
/src/suffix.ml:
--------------------------------------------------------------------------------
(* Packed representations of suffixes of strings. Used by suffix_array *)

type id = int
type pos = int

type t = int

(* A suffix (id, pos) lives in a single int: the id occupies the high
   bits and the pos the low pack_size bits, with half a word (minus the
   tag bit) reserved for each component. *)
let pack_size = (Sys.word_size / 2) - 1
let max_size = 1 lsl pack_size

exception Invalid_suffix of id * pos

(* Pack a suffix, rejecting components outside [0, max_size). *)
let pack (id, pos) =
  let fits x = (x >= 0) && (x < max_size) in
  if fits id && fits pos
  then (id lsl pack_size) lor pos
  else raise (Invalid_suffix (id, pos))

(* Inverse of pack. *)
let unpack suffix =
  let id = suffix lsr pack_size in
  let pos = suffix land (max_size - 1) in
  (id, pos)
23 |
--------------------------------------------------------------------------------
/src/suffix.mli:
--------------------------------------------------------------------------------
(* The id of the string a suffix belongs to. *)
type id = int
(* The start position of the suffix within its string. *)
type pos = int

(* A suffix, packed into a single word. *)
type t

(* Exclusive upper bound on both id and pos. *)
val max_size : int

(* Raised by pack when id or pos is negative or >= max_size. *)
exception Invalid_suffix of id * pos

val pack : id * pos -> t
val unpack : t -> id * pos
12 |
--------------------------------------------------------------------------------
/src/suffix_array.ml:
--------------------------------------------------------------------------------
1 | (*
2 | Suffix arrays storing compressed latex formulae.
3 | Allows neighbourhood search by Latex.distance
4 | *)
5 |
6 | open Util
7 |
type id = Suffix.id
type pos = Suffix.pos

type 'a t =
  { latexs : Latex.t DynArray.t (* the indexed latex strings, by id *)
  ; opaques : 'a DynArray.t (* caller payload associated with each id *)
  ; deleted : bool DynArray.t (* lazily-deleted ids, filtered at query time *)
  ; mutable next_id : id (* next id handed out by insert *)
  ; mutable array : Suffix.t array (* sorted array of packed suffixes *)
  ; mutable unsorted : ('a * Latex.t) list (* added but not yet indexed by prepare *) }
18 |
(* An empty suffix array: nothing indexed, nothing pending. *)
let create () =
  { next_id = 0
  ; latexs = DynArray.create ()
  ; opaques = DynArray.create ()
  ; deleted = DynArray.create ()
  ; array = [||]
  ; unsorted = [] }
26 |
(* Move the (large, long-lived) suffix array out of the normal heap via
   the Ancient library, then run a full major collection to reclaim the
   original copy. NOTE(review): values moved by Ancient are outside the
   GC heap - presumably sa.array must not be mutated afterwards; confirm
   against the Ancient documentation and callers. *)
let ancientify sa =
  sa.array <- Ancient.follow (Ancient.mark sa.array);
  Gc.full_major ()
30 |
(* Queue (payload, latex) items for indexing; they only become
   searchable after the next call to prepare. *)
let add sa items =
  sa.unsorted <- items @ sa.unsorted
33 |
(* Order two packed suffixes by the latex suffixes they denote. *)
let compare_suffix sa (id1, pos1) (id2, pos2) =
  let latex1 = DynArray.get sa.latexs id1 in
  let latex2 = DynArray.get sa.latexs id2 in
  Latex.compare_suffix (latex1, pos1) (latex2, pos2)
37 |
(* All packed suffixes (id, 0) .. (id, n-1) of the latex string id. *)
let suffixes sa id =
  let n = Latex.length (DynArray.get sa.latexs id) in
  List.map (fun pos -> Suffix.pack (id, pos)) (Util.range 0 n)
42 |
(* Claim the next id for one item and append its entries to the three
   parallel arrays; returns the id assigned. *)
let insert sa (opaque, latex) =
  let id = sa.next_id in
  sa.next_id <- succ id;
  DynArray.add sa.latexs latex;
  DynArray.add sa.opaques opaque;
  DynArray.add sa.deleted false;
  id
50 |
(* a little convoluted to keep memory usage as low as possible *)
(* Index everything queued by add: move the pending items into the
   parallel arrays, append their suffixes to the packed array, and
   re-sort the whole array. *)
let prepare sa =
  let ids = List.map (insert sa) sa.unsorted in
  sa.unsorted <- [];
  let new_suffixes = Util.concat_map (suffixes sa) ids in
  let old_len = Array.length sa.array in
  let new_len = List.length new_suffixes in
  (* grow the array, copying the old suffixes to the front *)
  let array = Array.make (old_len + new_len) (Suffix.pack (0,0)) in
  Array.blit sa.array 0 array 0 old_len;
  sa.array <- array;
  (* write the new suffixes after the old ones *)
  let index = ref old_len in
  List.iter
    (fun suffix ->
      array.(!index) <- suffix;
      index := !index + 1)
    new_suffixes;
  (* sort everything by the latex content the suffixes point into *)
  let cmp suffix1 suffix2 =
    let (id1,pos1) = Suffix.unpack suffix1 in
    let (id2,pos2) = Suffix.unpack suffix2 in
    compare_suffix sa (id1,pos1) (id2,pos2) in
  Array.fast_sort cmp sa.array
72 |
(* Mark every item whose payload satisfies filter as deleted. Deletion
   is lazy: hits are dropped at query time by filter_deleted. *)
let delete sa filter =
  List.iter
    (fun id ->
      if filter (DynArray.get sa.opaques id)
      then DynArray.set sa.deleted id true)
    (Util.range 0 (DynArray.length sa.opaques))
82 |
(* Drop lazily-deleted ids from a result set, in place. *)
let filter_deleted sa ids =
  let live id = not (DynArray.get sa.deleted id) in
  Hashset.filter live ids
85 |
(* Is latexL a prefix of the corpus suffix (id, pos)? *)
let is_prefix sa latexL (id, pos) =
  Latex.is_prefix (latexL, 0) (DynArray.get sa.latexs id, pos)
89 |
(* Does latexL sort at or before the corpus suffix (id, pos)? *)
let leq sa latexL (id, pos) =
  let latexR = DynArray.get sa.latexs id in
  Latex.compare_suffix (latexL, 0) (latexR, pos) <= 0
93 |
(* Exact searching *)

(* binary search *)
(* Collect into ids every id whose latex string contains latex exactly:
   binary search locates the first suffix >= latex in the sorted array,
   then a linear scan keeps collecting while latex is a prefix. *)
let gather_exact ids sa latex =
  (* find beginning of region *)
  (* lo < latex *)
  (* hi >= latex *)
  let rec narrow lo hi =
    let mid = lo + ((hi-lo) / 2) in
    if lo = mid then hi else
    if leq sa latex (Suffix.unpack sa.array.(mid))
    then narrow lo mid
    else narrow mid hi in
  let n = Array.length sa.array in
  let rec traverse index =
    if index >= n then () else
    let (id, pos) = Suffix.unpack sa.array.(index) in
    if is_prefix sa latex (id, pos)
    then
      begin
        Hashset.add ids id;
        traverse (index+1)
      end
    else () in
  (* NOTE(review): when the array is empty, narrow (-1) (-1) returns -1
     and traverse indexes sa.array.(-1) - confirm prepare is always
     called on a non-empty corpus before searching. *)
  traverse (narrow (-1) (n-1))
119 |
(* An exact hit scores distance 0, paired with its payload. *)
let exact_match sa id =
  (0, DynArray.get sa.opaques id)
122 |
(* All (distance, payload) results whose latex contains latex exactly. *)
let find_exact sa latex =
  let hits = Hashset.create 0 in
  gather_exact hits sa latex;
  filter_deleted sa hits;
  List.map (exact_match sa) (Hashset.to_list hits)
128 |
(* Searching by Latex.distance *)

(*
The logic behind the approx search is as follows:
Suppose Latex.distance latex corpus_term < k
Then List.exists (fun fragment -> Latex.distance fragment corpus_term = 0) (Latex.fragments latex k)
*)
(* Union of exact matches over every fragment of latex at cutoff k: a
   superset of the ids within distance k, refined later by approx_match. *)
let gather_approx sa precision latex =
  let ids = Hashset.create 0 in
  let k = Latex.cutoff precision latex in
  List.iter (gather_exact ids sa) (Latex.fragments latex k);
  ids
141 |
(* Check a candidate id precisely; Some (distance, payload) on a hit. *)
let approx_match sa precision latexL id =
  match Latex.similar precision latexL (DynArray.get sa.latexs id) with
  | Some dist -> Some (dist, DynArray.get sa.opaques id)
  | None -> None
150 |
(* All (distance, payload) results within precision of latex. *)
let find_approx sa precision latex =
  let candidates = gather_approx sa precision latex in
  filter_deleted sa candidates;
  Util.filter_map (approx_match sa precision latex) (Hashset.to_list candidates)
155 |
(* Searching by Query.distance *)

(* Candidate ids for a compound query: intersect And branches, union Or
   branches, approx-gather at the leaves. *)
let rec gather_query sa precision query =
  match query with
  | Query.Latex (latex, _) -> gather_approx sa precision latex
  | Query.And (q1, q2) ->
      Hashset.inter (gather_query sa precision q1) (gather_query sa precision q2)
  | Query.Or (q1, q2) ->
      Hashset.union (gather_query sa precision q1) (gather_query sa precision q2)
163 |
(* Check a candidate id against the whole query; Some (distance, payload)
   on a hit. *)
let query_match sa precision query id =
  match Query.similar precision query (DynArray.get sa.latexs id) with
  | Some dist -> Some (dist, DynArray.get sa.opaques id)
  | None -> None
172 |
(* All (distance, payload) results matching a compound query. *)
let find_query sa precision query =
  let candidates = gather_query sa precision query in
  filter_deleted sa candidates;
  Util.filter_map (query_match sa precision query) (Hashset.to_list candidates)
177 |
--------------------------------------------------------------------------------
/src/suffix_array.mli:
--------------------------------------------------------------------------------
type id = int
type pos = int

(* A suffix array over latex strings, each carrying an opaque payload. *)
type 'a t =
  { latexs : Latex.t DynArray.t (* indexed latex strings, by id *)
  ; opaques : 'a DynArray.t (* caller payload for each id *)
  ; deleted : bool DynArray.t (* lazily-deleted ids *)
  ; mutable next_id : id
  ; mutable array : Suffix.t array (* sorted packed suffixes *)
  ; mutable unsorted : ('a * Latex.t) list (* awaiting prepare *) }

val create : unit -> 'a t
(* Move the suffix array out of the normal heap (Ancient) and compact. *)
val ancientify : 'a t -> unit

(* Queue items for indexing; prepare makes them searchable. *)
val add : 'a t -> ('a * Latex.t) list -> unit
val prepare : 'a t -> unit

(* Lazily delete every item whose payload satisfies the predicate. *)
val delete : 'a t -> ('a -> bool) -> unit

(* Each search returns (distance, payload) pairs for matching items. *)
val find_exact : 'a t -> Latex.t -> (int * 'a) list
val find_approx : 'a t -> float -> Latex.t -> (int * 'a) list
val find_query : 'a t -> float -> Query.t -> (int * 'a) list
23 |
--------------------------------------------------------------------------------
/src/suffix_array_test.ml:
--------------------------------------------------------------------------------
(* An array of [length] values drawn from the generator [gen].
   Array.init replaces the original's map over a dummy Array.make,
   avoiding the throwaway allocation. *)
let random_array length gen =
  Array.init length (fun _ -> gen ())
3 |
(* A list of [length] values drawn from the generator [gen]. *)
let random_list length gen =
  let arr = random_array length gen in
  Array.to_list arr
6 |
(* A single random latex token: uniform over [0, 50). *)
let random_latex_element () =
  Random.int 50
9 |
(* A random latex string of between 1 and max_length tokens. *)
let random_latex max_length =
  let len = 1 + Random.int max_length in
  Latex.of_array (random_array len random_latex_element)
13 |
(* A random string of between 1 and max_length characters. The original
   used String.create, which is unsafe/removed under -safe-string and
   returned uninitialised contents that could collide between calls;
   random contents keep opaques distinct for the (=)-based delete test. *)
let random_string max_length =
  let length = 1 + Random.int max_length in
  String.init length (fun _ -> Char.chr (Random.int 256))
17 |
(* A random query tree over random latex leaves; on average two thirds
   of the nodes are leaves, so trees stay small. *)
let rec random_query max_length =
  let choice = Random.int 6 in
  if choice = 0 then Query.And (random_query max_length, random_query max_length)
  else if choice = 1 then Query.Or (random_query max_length, random_query max_length)
  else Query.Latex (random_latex max_length, "")
23 |
(* Build and index a random corpus of n items, delete the first item to
   exercise the deletion path, then compact with ancientify. Returns the
   still-live items paired with the suffix array. The original bound the
   items with an irrefutable :: pattern, raising Match_failure on n = 0. *)
let random_corpus n =
  let latexs = random_list n (fun () -> random_latex 1000) in
  let opaques = random_list n (fun () -> random_string 1000) in
  let items = List.combine opaques latexs in
  let sa = Suffix_array.create () in
  Suffix_array.add sa items;
  Suffix_array.prepare sa;
  let remaining =
    match items with
    | [] -> [] (* empty corpus: nothing to delete *)
    | (opaque, _) :: rest ->
        Suffix_array.delete sa ((=) opaque);
        rest in
  Suffix_array.ancientify sa;
  (remaining, sa)
35 |
(* Compare a brute-force search (test) against the suffix array (find)
   on a random corpus of n items; print Pass!/Fail! and return the
   outcome with both result-set sizes. *)
let test_find test find n =
  let (items, sa) = random_corpus n in
  let expected = List.sort compare (test items) in
  let actual =
    List.sort compare (List.map (fun (_, opaque) -> opaque) (find sa)) in
  let ok = (expected = actual) in
  Util.flush_line (if ok then "Pass!" else "Fail!");
  (ok, List.length expected, List.length actual)
42 |
(* Two latex strings match exactly when their edit distance is zero. *)
let exact_match latexL latexR =
  Latex.distance latexL latexR = 0
45 |
(* Exact search must agree with brute-force exact matching. *)
let test_find_exact n =
  let latexL = random_latex 5 in
  let brute items =
    Util.filter_map
      (fun (id, latexR) ->
        if exact_match latexL latexR then Some id else None)
      items in
  test_find brute (fun sa -> Suffix_array.find_exact sa latexL) n
58 |
(* latexL matches latexR approximately when they are similar within precision. *)
let approx_match precision latexL latexR =
  Latex.similar precision latexL latexR <> None
61 |
(* Approx search must agree with brute-force similarity matching. *)
let test_find_approx n =
  let latexL = random_latex 5 in
  let precision = Random.float 1.0 in
  let brute items =
    Util.filter_map
      (fun (id, latexR) ->
        if approx_match precision latexL latexR then Some id else None)
      items in
  test_find brute (fun sa -> Suffix_array.find_approx sa precision latexL) n
75 |
(* query matches latexR when it is similar within precision. The
   original was declared `rec` but never recurses; the recursion lives
   in Query.similar. *)
let query_match precision query latexR =
  Query.similar precision query latexR <> None
78 |
(* Compound-query search must agree with brute-force query matching. *)
let test_find_query n =
  let query = random_query 5 in
  let precision = Random.float 1.0 in
  let brute items =
    Util.filter_map
      (fun (id, latexR) ->
        if query_match precision query latexR then Some id else None)
      items in
  test_find brute (fun sa -> Suffix_array.find_query sa precision query) n
92 |
(* At precision 1.0 approx search should degenerate to exact matching. *)
let test_find_max_precision n =
  let latexL = random_latex 5 in
  let brute items =
    Util.filter_map
      (fun (id, latexR) ->
        if exact_match latexL latexR then Some id else None)
      items in
  test_find brute (fun sa -> Suffix_array.find_approx sa 1.0 latexL) n
105 |
--------------------------------------------------------------------------------
/src/suffix_test.ml:
--------------------------------------------------------------------------------
(* Round-trip n+1 random (id, pos) pairs through Suffix.pack/unpack,
   reporting any pair that does not survive. *)
let test_pack n =
  for _i = 0 to n do
    let id = Random.int Suffix.max_size in
    let pos = Random.int Suffix.max_size in
    if Suffix.unpack (Suffix.pack (id, pos)) <> (id, pos)
    then Util.flush_line ("Fail!: " ^ (string_of_int id) ^ " " ^ (string_of_int pos))
  done
8 |
--------------------------------------------------------------------------------
/src/test.mltop:
--------------------------------------------------------------------------------
1 | DynArray
2 | Hashset
3 | MyMap
4 | Util
5 | Latex
6 | Query
7 | Suffix
8 | Suffix_array
9 | Suffix_test
10 | Suffix_array_test
--------------------------------------------------------------------------------
/src/util.ml:
--------------------------------------------------------------------------------
(* Print str followed by a newline and flush stdout immediately. *)
let flush_line str =
  print_endline str;
  flush stdout
2 |
(* The least element of a non-empty list. Raises Invalid_argument
   (instead of the original's anonymous Match_failure) on []. *)
let minimum = function
  | [] -> invalid_arg "Util.minimum: empty list"
  | l :: ls -> List.fold_left min l ls
4 |
(* The greatest element of a non-empty list. Raises Invalid_argument
   (instead of the original's anonymous Match_failure) on []. *)
let maximum = function
  | [] -> invalid_arg "Util.maximum: empty list"
  | l :: ls -> List.fold_left max l ls
6 |
(* filter_map f ls: apply f to every element, keeping the Some results
   in order. A single right fold replaces the original's three-pass
   map/filter/map pipeline, whose structural (<>) comparison against
   None raised on results containing functional values and whose final
   map was non-exhaustive. *)
let filter_map f ls =
  List.fold_right
    (fun l acc ->
      match f l with
      | Some b -> b :: acc
      | None -> acc)
    ls []
13 |
(* Map f over ls and concatenate the resulting lists, preserving order. *)
let concat_map f ls =
  List.concat (List.map f ls)
16 |
(* The half-open integer interval [start, finish). *)
let rec range start finish =
  if start >= finish
  then []
  else start :: range (start + 1) finish
19 |
(* Fairly hackish method of sucking out stream elements *)
(* npeek returns up to max_int elements without consuming them, i.e.
   the whole of any finite stream. *)
let list_of_stream stream = Stream.npeek max_int stream
22 |
(* Unmarshal a value from filename. On any error prints a message and
   raises Exit. Note: Marshal.from_channel must only be used on trusted
   files. *)
let load_data filename =
  try
    let chan = open_in_bin filename in
    let data = Marshal.from_channel chan in
    close_in chan;
    data
  with _ ->
    flush_line ("Error opening file " ^ filename);
    raise Exit
31 |
(* Marshal data to filename, writing a temporary file first and renaming
   over the target so readers never see a partial file. On any error
   prints a message and raises Exit. *)
let save_data filename data =
  let tmp = filename ^ "_tmp" in
  try
    let chan = open_out_bin tmp in
    Marshal.to_channel chan data [];
    close_out chan;
    Unix.rename tmp filename
  with _ ->
    flush_line ("Error saving to file " ^ filename);
    raise Exit
41 |
(* Tune the gc for lots of garbage *)
open Gc
(* Enlarge the minor heap and major increments and relax space_overhead,
   trading memory for fewer collections during bulk work. *)
let expect_garbage () =
  let m = 1024 * 1024 in
  Gc.set
    {(Gc.get ()) with
      minor_heap_size = 256 * m;
      major_heap_increment = 64 * m;
      space_overhead = 200
    }
52 |
(* Run f with backtrace recording on; Printexc.print reports any
   exception it raises, and we then dump the backtrace to stdout
   instead of propagating. *)
let backtrace f =
  Printexc.record_backtrace true;
  try
    let _ = Printexc.print f () in
    ()
  with _ -> Printexc.print_backtrace stdout
56 |
--------------------------------------------------------------------------------
/src/util.mli:
--------------------------------------------------------------------------------
(* Print a string plus newline and flush stdout. *)
val flush_line : string -> unit

(* Least / greatest element; both fail on the empty list. *)
val minimum : 'a list -> 'a
val maximum : 'a list -> 'a

(* Keep the Some results of applying the function to each element. *)
val filter_map : ('a -> 'b option) -> 'a list -> 'b list
(* Map then concatenate the resulting lists. *)
val concat_map : ('a -> 'b list) -> 'a list -> 'b list

(* range start finish: the half-open interval [start, finish). *)
val range : int -> int -> int list

(* All elements of a finite stream, without consuming it. *)
val list_of_stream : 'a Stream.t -> 'a list

(* Marshal values to / from disk; print a message and raise Exit on error. *)
val load_data : string -> 'a
val save_data : string -> 'a -> unit

(* Run the thunk, printing any exception and its backtrace to stdout. *)
val backtrace : (unit -> 'a) -> unit
17 |
--------------------------------------------------------------------------------
/start:
--------------------------------------------------------------------------------
#!/bin/bash
# Start the couchdb instance configured by db.ini, recording its pid
# under run/. "$0" and "$dir" are quoted so a checkout path containing
# spaces does not word-split.
dir=$(dirname "$0")
couchdb -b -c "$dir/db.ini" -p "$dir/run/couchdb.pid"
4 |
--------------------------------------------------------------------------------
/stop:
--------------------------------------------------------------------------------
#!/bin/bash
# Shut down the couchdb instance whose pid file was written by ./start.
# "$0" and "$dir" are quoted so a checkout path containing spaces does
# not word-split.
dir=$(dirname "$0")
couchdb -p "$dir/run/couchdb.pid" -d
4 |
--------------------------------------------------------------------------------