├── LICENSE ├── Makefile ├── README.markdown ├── add ├── db.ini ├── del ├── init ├── query.sh ├── run └── update.pid ├── scripts ├── db.py ├── monitor.py ├── preprocessor.py ├── replay.py ├── search.py ├── test.py ├── times.py └── util.py ├── src ├── _tags ├── dynArray.ml ├── dynArray.mli ├── hashset.ml ├── hashset.mli ├── index.ml ├── latex.ml ├── latex.mli ├── myMap.ml ├── myMap.mli ├── pid.ml ├── pid.mli ├── query.ml ├── query.mli ├── suffix.ml ├── suffix.mli ├── suffix_array.ml ├── suffix_array.mli ├── suffix_array_test.ml ├── suffix_test.ml ├── test.mltop ├── util.ml └── util.mli ├── start └── stop /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. 
Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. 
Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. 
Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. 
A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 
163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 614 | If the disclaimer of warranty and limitation of liability provided 615 | above cannot be given local legal effect according to their terms, 616 | reviewing courts shall apply local law that most closely approximates 617 | an absolute waiver of all civil liability in connection with the 618 | Program, unless a warranty or assumption of liability accompanies a 619 | copy of the Program in return for a fee. 620 | 621 | END OF TERMS AND CONDITIONS 622 | 623 | How to Apply These Terms to Your New Programs 624 | 625 | If you develop a new program, and you want it to be of the greatest 626 | possible use to the public, the best way to achieve this is to make it 627 | free software which everyone can redistribute and change under these terms. 628 | 629 | To do so, attach the following notices to the program. It is safest 630 | to attach them to the start of each source file to most effectively 631 | state the exclusion of warranty; and each file should have at least 632 | the "copyright" line and a pointer to where the full notice is found. 633 | 634 | 635 | Copyright (C) 636 | 637 | This program is free software: you can redistribute it and/or modify 638 | it under the terms of the GNU General Public License as published by 639 | the Free Software Foundation, either version 3 of the License, or 640 | (at your option) any later version. 641 | 642 | This program is distributed in the hope that it will be useful, 643 | but WITHOUT ANY WARRANTY; without even the implied warranty of 644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 645 | GNU General Public License for more details. 646 | 647 | You should have received a copy of the GNU General Public License 648 | along with this program. If not, see . 649 | 650 | Also add information on how to contact you by electronic and paper mail. 
651 | 652 | If the program does terminal interaction, make it output a short 653 | notice like this when it starts in an interactive mode: 654 | 655 | Copyright (C) 656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 657 | This is free software, and you are welcome to redistribute it 658 | under certain conditions; type `show c' for details. 659 | 660 | The hypothetical commands `show w' and `show c' should show the appropriate 661 | parts of the General Public License. Of course, your program's commands 662 | might be different; for a GUI interface, you would use an "about box". 663 | 664 | You should also get your employer (if you work as a programmer) or school, 665 | if any, to sign a "copyright disclaimer" for the program, if necessary. 666 | For more information on this, and how to apply and follow the GNU GPL, see 667 | . 668 | 669 | The GNU General Public License does not permit incorporating your program 670 | into proprietary programs. If your program is a subroutine library, you 671 | may consider it more useful to permit linking proprietary applications with 672 | the library. If this is what you want to do, use the GNU Lesser General 673 | Public License instead of this License. But first, please read 674 | . 675 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: opt 2 | 3 | opt: 4 | ocamlbuild -use-ocamlfind -I src src/index.native 5 | cp index.native index 6 | 7 | test: 8 | ocamlbuild -use-ocamlfind -I src src/test.top 9 | 10 | clean: 11 | ocamlbuild -clean -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | Texsearch is a search index specialised for LaTeX equations, forming part of the backend for Springer's [latexsearch.com](http://latexsearch.com). 
Latexsearch currently indexes more than 2 million documents drawn from Springer journals and books. 2 | 3 | Every LaTeX equation in the corpus is parsed and evaluated on entry to produce an AST. The similarity between a pair of equations is calculated as the Levenshtein distance between their respective ASTs as a fraction of the total size of the ASTs. Given a LateX equation as a search term, texsearch will retrieve all equations in the corpus whose similarity to the search term falls under a specified margin. 4 | 5 | The index uses a suffix array to quickly calculate a superset of the search results by finding exact matches of fragments of the search term. 6 | 7 | Previous versions use a modified bk-tree which is capable of performing vicinity searches over any quasi-metric space using any query function satisfying: 8 | 9 | For all a. query a >= 0 10 | For all a, b. query b - query a <= dist a b 11 | 12 | This index is stored in-memory and is relatively compact - the index for latexsearch.com is under 800MB. 13 | 14 | # Architecture 15 | 16 | Couchdb is the root process. The preprocessor and index are run as _external services on couchdb. Raw data is stored in the 'documents' db on couchdb. The search index is stored in the file 'data/index'. 17 | 18 | Springer documents are uploaded to the server as xml files. The command 'db.py --add some_doc.xml' extracts latex formulae and metadata from some_doc.xml, runs the latex through the preprocessor and stores the results in couchdb. The command 'index -update' uses the couchdb change log to locate new or modified documents and update the index file. Restarting the index external service causes it to load the new index file. 
19 | 20 | # Requirements 21 | 22 | Tested with: 23 | 24 | couchdb 0.6.0 25 | 26 | ocaml 3.12.0 27 | ancient 0.9.0 28 | json-wheel 1.0.6 29 | json-static 0.9.8 30 | ocamlnet 3.2 31 | pcre-ocaml 6.2.2 32 | xml-light 2.2 33 | 34 | python 2.6.6 35 | couchdb 0.6 (python lib) 36 | httplib2 0.5.0 37 | plastex 0.9.2 38 | 39 | -------------------------------------------------------------------------------- /add: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ulimit -s unlimited 3 | dir=$(dirname $0) 4 | python $dir/scripts/db.py --add=$1 5 | $dir/index -update 6 | $dir/stop 7 | $dir/start 8 | -------------------------------------------------------------------------------- /db.ini: -------------------------------------------------------------------------------- 1 | [couchdb] 2 | database_dir = ./db 3 | view_index_dir = ./db 4 | max_document_size = 4294967296 ; 4 GB 5 | max_attachment_chunk_size = 4294967296 ; 4GB 6 | os_process_timeout = 60000 ; 60 seconds. for view and external servers. 
7 | max_dbs_open = 100 8 | 9 | [httpd] 10 | port = 5984 11 | bind_address = 127.0.0.1 12 | authentication_handler = {couch_httpd, default_authentication_handler} 13 | WWW-Authenticate = Basic realm="administrator" 14 | 15 | [log] 16 | file = ./log/couch.log 17 | level = info 18 | 19 | [query_servers] 20 | javascript = /usr/bin/couchjs /usr/share/couchdb/server/main.js 21 | 22 | [external] 23 | index = ./query.sh 24 | preprocess = python ./scripts/preprocessor.py 25 | 26 | [daemons] 27 | view_manager={couch_view, start_link, []} 28 | external_manager={couch_external_manager, start_link, []} 29 | db_update_notifier={couch_db_update_notifier_sup, start_link, []} 30 | query_servers={couch_query_servers, start_link, []} 31 | httpd={couch_httpd, start_link, []} 32 | stats_aggregator={couch_stats_aggregator, start, []} 33 | stats_collector={couch_stats_collector, start, []} 34 | 35 | [httpd_global_handlers] 36 | / = {couch_httpd_misc_handlers, handle_welcome_req, <<"Welcome">>} 37 | favicon.ico = {couch_httpd_misc_handlers, handle_favicon_req, "/usr/share/couchdb/www"} 38 | 39 | _utils = {couch_httpd_misc_handlers, handle_utils_dir_req, "/usr/share/couchdb/www"} 40 | _all_dbs = {couch_httpd_misc_handlers, handle_all_dbs_req} 41 | _active_tasks = {couch_httpd_misc_handlers, handle_task_status_req} 42 | _config = {couch_httpd_misc_handlers, handle_config_req} 43 | _replicate = {couch_httpd_misc_handlers, handle_replicate_req} 44 | _uuids = {couch_httpd_misc_handlers, handle_uuids_req} 45 | _restart = {couch_httpd_misc_handlers, handle_restart_req} 46 | _stats = {couch_httpd_stats_handlers, handle_stats_req} 47 | 48 | [httpd_db_handlers] 49 | _design = {couch_httpd_db, handle_design_req} 50 | _temp_view = {couch_httpd_view, handle_temp_view_req} 51 | 52 | ; The external module takes an optional argument allowing you to narrow it to a 53 | ; single script. Otherwise the script name is inferred from the first path section 54 | ; after _external's own path. 
55 | ; _mypath = {couch_httpd_external, handle_external_req, <<"mykey">>} 56 | _external = {couch_httpd_external, handle_external_req} 57 | 58 | [httpd_design_handlers] 59 | _view = {couch_httpd_view, handle_view_req} 60 | _show = {couch_httpd_show, handle_doc_show_req} 61 | _list = {couch_httpd_show, handle_view_list_req} 62 | -------------------------------------------------------------------------------- /del: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ulimit -s unlimited 3 | dir=$(dirname $0) 4 | python $dir/scripts/db.py --del=$1 5 | $dir/index -update 6 | $dir/stop 7 | $dir/start 8 | -------------------------------------------------------------------------------- /init: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | dir=$(dirname $0) 3 | $dir/index -init 4 | python $dir/scripts/db.py --init 5 | -------------------------------------------------------------------------------- /query.sh: -------------------------------------------------------------------------------- 1 | ulimit -s unlimited 2 | ./index -query 3 | -------------------------------------------------------------------------------- /run/update.pid: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jamii/texsearch/d0d4423f093dfafadd935f785b384d2c2fb7abf9/run/update.pid -------------------------------------------------------------------------------- /scripts/db.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ Handles parsing Springer documents and adding/deleting document entries to/from couchdb """ 4 | 5 | import re 6 | import sys, httplib, urllib 7 | from xml.dom import minidom 8 | from preprocessor import JsonProcessor, parseLaTeX 9 | from util import encodeDoi, decodeDoi 10 | import couchdb.client 11 | 12 | # Find the couchdb server 13 | conf = 
open("./db.ini") 14 | port = re.compile(r"port *= *(\d+)").search(conf.read()).group(1) 15 | conf.close() 16 | 17 | couchdb_server = couchdb.client.Server('http://localhost:%s/' % port) 18 | 19 | def confirm(prompt): 20 | response = raw_input(prompt + " (y/n):") 21 | if response != 'y': 22 | print "Ok, nothing was done" 23 | sys.exit(0) 24 | 25 | ### Initial configuration of the database ### 26 | 27 | def initDB(): 28 | confirm("This will erase the texsearch database. Are you sure?") 29 | 30 | print "Deleting existing databases" 31 | try: 32 | del couchdb_server['documents'] 33 | except couchdb.client.ResourceNotFound: 34 | # db doesnt exist yet 35 | pass 36 | 37 | print "Creating new databases" 38 | couchdb_server.create('documents') 39 | 40 | ### Parsing and preprocessing xml articles ### 41 | 42 | # Wrap the JsonProcessor in some error handling, since plasTeX often fails in weird ways 43 | def preprocess(eqnID, latex): 44 | try: 45 | result = JsonProcessor().process(parseLaTeX("\\begin{document} " + latex + " \\end{document}")).dumps() 46 | return (eqnID, result) 47 | except KeyboardInterrupt, e: 48 | raise e 49 | except Exception, e: 50 | print "Note: Preprocessor failed on equation %s : %s" % (eqnID, e) 51 | return None 52 | 53 | def parseEquation(eqn): 54 | eqnID = eqn.attributes.get('ID').value 55 | try: 56 | for eqnSource in eqn.getElementsByTagName("EquationSource"): 57 | if eqnSource.attributes.get('Format').value == "TEX": 58 | latex = eqnSource.childNodes[0].wholeText 59 | return (latex, eqnID) 60 | return None 61 | except IndexError: 62 | print ("Note: no equation source for eqn %s" % eqnID) 63 | except AttributeError: 64 | print ("Note: missing format attribute for eqn %s" % eqnID) 65 | return None 66 | 67 | def filterNone(xs): 68 | return [x for x in xs if x is not None] 69 | 70 | def parseEquations(item): 71 | equations = filterNone([parseEquation(eqn) for eqn in item.getElementsByTagName("Equation") + item.getElementsByTagName("InlineEquation")]) 
72 | # Eliminate duplicate equations (key is latex) 73 | equations = dict(equations).items() 74 | 75 | source = dict([(eqnID, latex) for (latex, eqnID) in equations]) 76 | content = dict(filterNone([preprocess(eqnID, latex) for (latex, eqnID) in equations])) 77 | 78 | return (source, content) 79 | 80 | def parseArticle(article): 81 | doi = article.getElementsByTagName("ArticleDOI")[0].childNodes[0].wholeText 82 | print ("Parsing article %s" % doi) 83 | 84 | publicationDate = article.getElementsByTagName("PrintDate") or article.getElementsByTagName("CoverDate") or article.getElementsByTagName("OnlineDate") 85 | if publicationDate: 86 | publicationYear = publicationDate[0].getElementsByTagName("Year")[0].childNodes[0].wholeText 87 | else: 88 | print "Note: no publication year" 89 | publicationYear = None 90 | 91 | journalID = article.getElementsByTagName("JournalID")[0].childNodes[0].wholeText 92 | (source, content) = parseEquations(article) 93 | return {'_id': encodeDoi(doi), 'source': source, 'content': content, 'format': 'Article', 'containerID': journalID, 'publicationYear': publicationYear} 94 | 95 | def parseChapter(chapter): 96 | doi = chapter.getElementsByTagName("ChapterDOI")[0].childNodes[0].wholeText 97 | print ("Parsing chapter %s" % doi) 98 | (source, content) = parseEquations(chapter) 99 | return {'_id': encodeDoi(doi), 'source': source, 'content': content, 'format':'Chapter'} 100 | 101 | def parseBook(book): 102 | bookDOI = book.getElementsByTagName("BookDOI")[0].childNodes[0].wholeText 103 | 104 | publicationDate = book.getElementsByTagName("BookCopyright") 105 | if publicationDate: 106 | publicationYear = publicationDate[0].getElementsByTagName("CopyrightYear")[0].childNodes[0].wholeText 107 | else: 108 | print "Note: no publication year" 109 | publicationYear = None 110 | 111 | chapters = [] 112 | for chapter in book.getElementsByTagName("Chapter"): 113 | chapter = parseChapter(chapter) 114 | chapter['containerID'] = bookDOI 115 | 
chapter['publicationYear'] = publicationYear 116 | chapters.append(chapter) 117 | return chapters 118 | 119 | def parseFile(fileName): 120 | xml = minidom.parse(fileName) 121 | 122 | articles = [parseArticle(article) for article in xml.getElementsByTagName("Article")] 123 | chapters = [] 124 | for book in xml.getElementsByTagName("Book"): 125 | chapters.extend(parseBook(book)) 126 | docs = articles + chapters 127 | 128 | return docs 129 | 130 | ### Adding and deleting articles from the database ### 131 | 132 | def addFile(fileName, type): 133 | db = couchdb_server['documents'] 134 | 135 | print "Reading file %s" % fileName 136 | docs = parseFile(fileName) 137 | 138 | for doc in docs: 139 | doc['type'] = type 140 | 141 | oldDoc = db.get(doc['_id'],None) 142 | if not oldDoc: 143 | print "Adding new entry" 144 | db[doc['_id']] = doc 145 | elif (doc['type'] == 'xml.meta') and (oldDoc['type'] == 'xml'): 146 | print "Full entry already exists, not overwriting with meta" 147 | else: 148 | print "Overwriting existing entry" 149 | doc['_rev'] = oldDoc['_rev'] 150 | db[doc['_id']] = doc 151 | 152 | print 153 | 154 | def delFile(fileName, type): 155 | db = couchdb_server['documents'] 156 | 157 | print "Reading file %s" % fileName 158 | xml = minidom.parse(fileName) 159 | 160 | for article in xml.getElementsByTagName("Article"): 161 | doi = encodeDoi(article.getElementsByTagName("ArticleDOI")[0].childNodes[0].wholeText) 162 | 163 | oldDoc = db.get(doi,None) 164 | if not oldDoc: 165 | print "No entry to delete" 166 | elif (type == 'xml.meta') and (oldDoc['type'] == 'xml'): 167 | print "Full entry exists, not deleting meta" 168 | else: 169 | print "Deleting entry" 170 | del db[doi] 171 | 172 | # Reprocess all latex sources in the database, handy when changing the preprocessor 173 | def reprocess(): 174 | db = couchdb_server['documents'] 175 | 176 | print "Reprocessing latex sources" 177 | for doi in db: 178 | print "Reprocessing %s" % decodeDoi(doi) 179 | doc = db[doi] 180 | 
doc['content'] = dict(filterNone([(preprocess(eqnID, latex)) for (eqnID, latex) in doc['source'].items()])) 181 | db[doi] = doc 182 | 183 | # Rename journalID field to containerID 184 | def convert_journalID_containerID(): 185 | db = couchdb_server['documents'] 186 | 187 | print "Converting" 188 | for doi in db: 189 | print "Converting %s" % decodeDoi(doi) 190 | doc = db[doi] 191 | if 'journalID' in doc: 192 | doc['containerID'] = doc['journalID'] 193 | del doc['journalID'] 194 | db[doi] = doc 195 | 196 | def ml_year(doi): 197 | response = urllib.urlopen("http://latexalpha.mpstechnologies.com/year.do?doi=" + doi).read() 198 | xml = minidom.parseString(response) 199 | return xml.childNodes[0].getAttribute('year')[0:4] 200 | 201 | # Check dates against ML 202 | def check_dates(): 203 | db = couchdb_server['documents'] 204 | 205 | print "Checking dates" 206 | for doi in db: 207 | try: 208 | doc = db[doi] 209 | actual = doc['publicationYear'] 210 | expected = ml_year(decodeDoi(doi)) 211 | if expected != "": 212 | if expected != actual: 213 | print ("Doi: %s Expected: %s Actual: %s" % (doi, expected, actual)) 214 | doc['publicationYear'] = expected 215 | db[doi] = doc 216 | else: 217 | print ("Doi: %s ok" % doi) 218 | elif doc.get('format', 'article').lower() == 'article': 219 | print ("ML year not defined for article: %s" % doi) 220 | except KeyboardInterrupt, e: 221 | raise e 222 | except Exception, e: 223 | print ("Failed on doi: %s" % doi) 224 | print e 225 | 226 | # Repair this server by copying content from targetServer 227 | def repair(targetServer): 228 | db = couchdb_server['documents'] 229 | targetdb = couchdb.client.Server(targetServer)['documents'] 230 | 231 | print "Copying from %s" % target_server 232 | 233 | for doi in db: 234 | targetDoc = targetdb.get(doi,None) 235 | if targetDoc: 236 | db[doi] = targetDoc 237 | 238 | ### Command line interaction ### 239 | 240 | def walk(path): 241 | for root, _, files in os.walk(arg): 242 | for file in files: 243 | 
yield os.path.join(root,file) 244 | 245 | def usage(): 246 | print "Usage: --init, --reprocess, --add=/docs/addme, --del=/docs/delme" 247 | 248 | import os, os.path, getopt 249 | 250 | if __name__ == '__main__': 251 | try: 252 | opts, args = getopt.getopt(sys.argv[1:], "", ["init", "add=", "del=", "convert", "reprocess", "check_dates"]) 253 | errors = [] 254 | 255 | for opt, arg in opts: 256 | if opt == "--init": 257 | initDB() 258 | elif opt == "--add": 259 | for file in walk(arg): 260 | try: 261 | if file.lower().endswith(".xml"): 262 | addFile(file,"xml") 263 | elif file.lower().endswith(".xml.meta"): 264 | addFile(file,"xml.meta") 265 | except KeyboardInterrupt, e: 266 | raise e 267 | except Exception, exc: 268 | print exc 269 | errors.append((file,exc)) 270 | elif opt == "--del": 271 | for file in walk(arg): 272 | try: 273 | if file.lower().endswith(".xml"): 274 | delFile(file,"xml") 275 | elif file.lower().endswith(".xml.meta"): 276 | delFile(file,"xml.meta") 277 | except KeyboardInterrupt, e: 278 | raise e 279 | except Exception, exc: 280 | print exc 281 | errors.append((file,exc)) 282 | elif opt == "--reprocess": 283 | reprocess() 284 | elif opt == "--check_dates": 285 | check_dates() 286 | elif opt == "--convert": 287 | convert_journalID_containerID() 288 | if errors: 289 | print "Errors occurred whilst processing the following files:" 290 | for (fi,exc) in errors: 291 | print fi 292 | print exc 293 | else: 294 | print "Ok" 295 | 296 | except getopt.GetoptError: 297 | usage() 298 | sys.exit(2) 299 | -------------------------------------------------------------------------------- /scripts/monitor.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | """ Runs regression tests against texsearch and reports failures by email. 
In production is run by cron every 15 minutes """ 4 | 5 | import urllib 6 | from xml.dom import minidom 7 | import smtplib 8 | from email.MIMEMultipart import MIMEMultipart 9 | from email.MIMEText import MIMEText 10 | import time 11 | import os.path as path 12 | from popen2 import popen2 13 | 14 | ports = [5985, 5984] 15 | searchTerms = open("searchTerms").read().splitlines() 16 | 17 | user = 'latex_operations@springer.com' 18 | 19 | # Reporting levels 20 | errorGroup = ['jamiiecb@googlemail.com'] # ['latex_operations@springer.com'] 21 | infoGroup = ['jamiiecb@googlemail.com'] 22 | 23 | def searchURL(port,searchTerm): 24 | return ("http://localhost:%d/documents/_external/index?searchTerm=%s&precision=0.66&limit=10000" % (port,urllib.quote(searchTerm))) 25 | 26 | def countResults(resultString): 27 | try: 28 | dom = minidom.parseString(resultString) 29 | if dom.getElementsByTagName("results"): 30 | results = dom.getElementsByTagName("result") + dom.getElementsByTagName("Chapter") + dom.getElementsByTagName("Article") 31 | return len(results) 32 | except Exception, e: 33 | pass 34 | 35 | # Not a correct result string 36 | return None 37 | 38 | def readResults(port, i): 39 | file = open(("%s/%s" % (port, i)), 'r') 40 | results = int(file.read()) 41 | file.close() 42 | 43 | return results 44 | 45 | def writeResults(port, i, results): 46 | file = open(("%s/%s" % (port, i)), 'w') 47 | file.write(str(results)) 48 | file.close 49 | 50 | def init(): 51 | for port in ports: 52 | for i in range(0,len(searchTerms)): 53 | url = searchURL(port, searchTerms[i]) 54 | resultString = urllib.urlopen(url).read() 55 | results = countResults(resultString) 56 | writeResults(port, i, results) 57 | 58 | def test(): 59 | info = [] 60 | errors = [] 61 | 62 | for port in ports: 63 | for i in range(0,len(searchTerms)): 64 | url = searchURL(port, searchTerms[i]) 65 | 66 | try: 67 | resultString = urllib.urlopen(url).read() 68 | results = countResults(resultString) 69 | expectedResults = 
readResults(port, i) 70 | if results == None: 71 | # Didnt get a correct result string 72 | errors.append(("Url: %s\n%s" % (url, resultString))) 73 | elif results == expectedResults: 74 | # Uninteresting 75 | pass 76 | elif results > expectedResults: 77 | # No of results may increase when adding content 78 | writeResults(port, i, results) 79 | info.append(("Url: %s\nExpected %d results, got %d results" % (url, expectedResults, results))) 80 | else: 81 | # No of results should never decrease 82 | writeResults(port, i, results) 83 | errors.append(("Url: %s\nExpected %d results, got %d results" % (url, expectedResults, results))) 84 | except Exception, e: 85 | # Most likely connection refused or http 500 86 | errors.append(("Url: %s\n%s" % (url, str(e)))) 87 | 88 | return (info, errors) 89 | 90 | def mail(to, subject, text): 91 | print subject 92 | print text 93 | 94 | msg = MIMEMultipart() 95 | msg['From'] = user 96 | msg['To'] = to 97 | msg['Subject'] = subject 98 | msg.attach(MIMEText(text)) 99 | 100 | mailServer = smtplib.SMTP('smtp.springer-sbm.com') 101 | mailServer.sendmail(user, to, msg.as_string()) 102 | mailServer.close() 103 | 104 | def top(): 105 | pout, pin = popen2("top -b -n 1") 106 | return pout.read() 107 | 108 | def reportErrors(errors): 109 | subject = ("TeXsearch error report: %s" % time.asctime()) 110 | text = "\n\n".join(errors + [top()]) 111 | for e in errorGroup: 112 | mail(e, subject, text) 113 | 114 | def reportInfo(info): 115 | subject = ("TeXsearch info report: %s" % time.asctime()) 116 | text = "\n\n".join(info + [top()]) 117 | for i in infoGroup: 118 | mail(i, subject, text) 119 | 120 | import sys, getopt 121 | 122 | if __name__ == '__main__': 123 | opts, args = getopt.getopt(sys.argv[1:], "", ["init","test"]) 124 | for opt, arg in opts: 125 | if opt == "--init": 126 | init() 127 | if opt == "--test": 128 | info, errors = test() 129 | if errors: 130 | reportErrors(errors) 131 | if info: 132 | reportInfo(info) 133 | 
-------------------------------------------------------------------------------- /scripts/preprocessor.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | """ Parses and preprocesses LaTeX formulae using PlasTeX """ 4 | 5 | import string, re 6 | from plasTeX import TeXFragment, TeXDocument 7 | import plasTeX.Context 8 | from plasTeX.DOM import Node 9 | from plasTeX.TeX import TeX 10 | from plasTeX.Base.TeX.Primitives import MathShift 11 | 12 | ### LaTeX preprocessing ### 13 | 14 | # Ignore useless nodes 15 | # There are probably more nodes that could be ignored but these are the most common 16 | ignoreSet = frozenset([ 17 | 'displaymath' 18 | ,'bgroup' 19 | ,'egroup' 20 | ,'math' 21 | ,'text' 22 | ,'nulldelimiterspace' 23 | ,'kern' 24 | ,'vphantom' 25 | ,'hphantom' 26 | ,'hfill' 27 | ,'vfill' 28 | ,'hbox' 29 | ,'align' 30 | ,'aligned' 31 | ,'gathered' 32 | ,'active::&' 33 | ,'#document' 34 | ,'document' 35 | ,'rm' 36 | ,'par' 37 | ,'None' 38 | ,'mathord' 39 | ,'array' 40 | ]) 41 | 42 | class BadProcess(Exception): 43 | pass 44 | 45 | class Processor: 46 | def __init__(self): 47 | self.textNode = False 48 | 49 | def process(self,node): 50 | if node.nodeName.startswith('text'): 51 | self.textNode = True 52 | if node.nodeType == Node.TEXT_NODE: 53 | # Short circuit text nodes 54 | text = unicode(node) 55 | # Unfortunately plasTeX does not place \text node arguments under text nodes 56 | if self.textNode: 57 | self.addText(text) 58 | self.textNode = False 59 | else: 60 | for char in text: 61 | if char != ' ': 62 | self.addText(char) 63 | elif node.nodeName in ignoreSet: 64 | # Ignore node and move on to children 65 | for child in node.childNodes: 66 | self.process(child) 67 | else: 68 | self.pushMacro(unicode(node.nodeName)) 69 | self.processChildren(node) 70 | self.popMacro(unicode(node.nodeName)) 71 | 72 | return self 73 | 74 | def processChildren(self,node): 75 | # See if we have any attributes to 
process 76 | if node.hasAttributes(): 77 | for key, value in node.attributes.items(): 78 | # If the key is 'self' these nodes are the same as the child nodes 79 | # If the key is '*modifier*' we dont care about it 80 | if key == 'self' or key == '*modifier*': 81 | continue 82 | elif value.__class__ is TeXFragment: 83 | self.openBracket() 84 | for child in value.childNodes: 85 | self.process(child) 86 | self.closeBracket() 87 | elif value.__class__ is Node: 88 | self.openBracket() 89 | self.process(value) 90 | self.closeBracket() 91 | else: 92 | continue 93 | 94 | # Process child nodes 95 | if node.childNodes: 96 | self.openBracket() 97 | for child in node.childNodes: 98 | self.process(child) 99 | self.closeBracket() 100 | 101 | return self 102 | 103 | # Converts a plasTeX DOM tree into a json tree # 104 | class JsonProcessor(Processor): 105 | def __init__(self): 106 | self.textNode = False 107 | self.text = [[]] 108 | self.macros = [] 109 | 110 | def dumps(self): 111 | if len(self.text) != 1: 112 | raise BadProcess() 113 | return self.text[0] 114 | 115 | def addText(self,text): 116 | self.text[-1].append(text) 117 | 118 | def pushMacro(self,macro): 119 | self.text.append([]) 120 | self.macros.append(macro) 121 | 122 | def popMacro(self,macro): 123 | currentMacro = self.macros.pop() 124 | if currentMacro != macro: 125 | raise BadProcess() 126 | currentText = self.text.pop() 127 | self.text[-1].append({currentMacro : currentText}) 128 | 129 | def openBracket(self): 130 | pass 131 | 132 | def closeBracket(self): 133 | pass 134 | 135 | # Converts a plasTeX DOM tree back into plain LaTeX 136 | class PlainProcessor(Processor): 137 | def __init__(self): 138 | self.textNode = False 139 | self.text = [] 140 | self.macros = 0 141 | 142 | def dumps(self): 143 | return " ".join(self.text) 144 | 145 | def addText(self,text): 146 | self.text.append(text) 147 | 148 | def pushMacro(self,macro): 149 | self.macros += 1 150 | if macro.startswith("active::"): 151 | 
self.text.append(macro.lstrip("active::")) 152 | else: 153 | self.text.append("\\" + macro) 154 | 155 | def popMacro(self,macro): 156 | self.macros -= 1 157 | 158 | def openBracket(self): 159 | self.text.append("{") 160 | 161 | def closeBracket(self): 162 | self.text.append("}") 163 | 164 | # Override plasTeX's buggy handling of mathmode, since we dont need textmode 165 | plasTeX.Context.Context.isMathMode = property(lambda obj: True) 166 | 167 | def parseLaTeX(string): 168 | # PlasTeX bug - this variable doent get reinitialised 169 | MathShift.inEnv = [] 170 | 171 | # Instantiate a TeX processor and parse the input text 172 | tex = TeX() 173 | tex.disableLogging() 174 | 175 | # Parse the LaTeX 176 | tex.input(string) 177 | return tex.parse() 178 | 179 | ### Making the preprocessor available as a couchdb _external ### 180 | 181 | import sys 182 | import simplejson as json 183 | 184 | def requests(): 185 | line = sys.stdin.readline() 186 | while line: 187 | yield json.loads(line) 188 | line = sys.stdin.readline() 189 | 190 | import signal 191 | 192 | class Timeout(Exception): 193 | def __str__(self): 194 | return "Timed out" 195 | 196 | def handleTimeout(signum,frame): 197 | raise Timeout() 198 | 199 | def main(): 200 | # Work around the lack of real threading by using an alarm signal for timeouts 201 | signal.signal(signal.SIGALRM, handleTimeout) 202 | 203 | for request in requests(): 204 | try: 205 | try: # Nested try because older versions of python cant handle except/finally 206 | query = request['query'] 207 | 208 | format = query['format'] 209 | 210 | try: 211 | timeout = int(float(query['timeout'])) 212 | except ValueError, e: 213 | timeout = 5 214 | except KeyError, e: 215 | timeout = 5 216 | signal.alarm(timeout) 217 | 218 | dom = parseLaTeX("\\begin{document} $$" + query['latex'] + "$$ \\end{document}") 219 | 220 | if format == 'json-plain': 221 | jsonResponse = JsonProcessor().process(dom).dumps() 222 | plainResponse = 
PlainProcessor().process(dom).dumps() 223 | response = {'code':200, 'json':{'json':jsonResponse, 'plain':plainResponse}} 224 | elif format == 'json': 225 | jsonResponse = JsonProcessor().process(dom).dumps() 226 | response = {'code':200, 'json':jsonResponse} 227 | elif format == 'plain': 228 | plainResponse = PlainProcessor().process(dom).dumps() 229 | response = {'code':200, 'body':plainResponse, 'headers':{'Content-type':'text/plain'}} 230 | else: 231 | response = {'code':400, 'body':('Error: bad format argument'), 'headers':{'Content-type':'text/plain'}} # Bad request 232 | 233 | except KeyError, e: 234 | response = {'code':400, 'body':('Error: ' + str(e)), 'headers':{'Content-type':'text/plain'}} # Bad request 235 | except Timeout, e: 236 | response = {'code':500, 'body':('Error: ' + str(e)), 'headers':{'Content-type':'text/plain'}} # Internal server error 237 | except Exception, e: 238 | response = {'code':500, 'body':('Error: ' + str(e)), 'headers':{'Content-type':'text/plain'}} # Internal server error 239 | finally: 240 | # Deactivate the timeout 241 | signal.alarm(0) 242 | 243 | sys.stdout.write("%s\n" % json.dumps(response)) 244 | sys.stdout.flush() 245 | 246 | def dumps(latex): 247 | return JsonProcessor().process(parseLaTeX("\\begin{document} $$" + latex + "$$ \\end{document}")).dumps() 248 | 249 | if __name__ == "__main__": 250 | main() 251 | -------------------------------------------------------------------------------- /scripts/replay.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ Replay existing searches found in couchdb logs """ 4 | 5 | import sys 6 | import re 7 | import search 8 | import urllib 9 | 10 | search_term_re = re.compile(r'searchTerm=([^&]*)&') 11 | 12 | def replay_log_file(filename): 13 | search_terms = set() 14 | 15 | for log in open(filename): 16 | match = search_term_re.search(log) 17 | if match: 18 | search_term = urllib.unquote(match.group(1)) 19 | 
# --- scripts/search.py (core) ---

def parseResults(results):
    """Yield (doi, [(eqnID, distance, latexSource)]) for each hit in the DOM.

    `results` is a minidom document; Article/Book hits carry a 'doi'
    attribute and <equation> children with 'id'/'distance' attributes.  The
    LaTeX source of each equation is looked up in the couchdb 'documents' db.
    """
    db = couchdb_server['documents']
    for result in results.getElementsByTagName("Article") + results.getElementsByTagName("Book"):
        doi = result.attributes.get('doi').value
        source = db[encodeDoi(doi)]['source']
        eqns = [(eqn.attributes.get('id').value, eqn.attributes.get('distance').value) for eqn in result.getElementsByTagName("equation")]
        yield (doi, [(eqnID, distance, source[eqnID]) for (eqnID, distance) in eqns])

def search(searchTerm, searchTimeout="20.0", limit="2500", precision="0.7"):
    """Query the index external service.

    Returns a dict with 'time' (wall-clock seconds) plus either 'results'
    (parsed hits) or 'error' (the raw error body).
    """
    response = {}

    url = "http://localhost:%s/documents/_external/index?searchTerm=%s&searchTimeout=%s&limit=%s&precision=%s" % (port, urllib.quote(searchTerm), searchTimeout, limit, precision)
    startTime = time.time()
    results = urllib.urlopen(url).read()
    endTime = time.time()

    response['time'] = endTime - startTime
    # BUG FIX: the condition read `results == "" or results == "" or
    # results == ""` -- three identical comparisons.  Judging by the error
    # markers handled in test.py (LatexParseError / TimedOut / LimitExceeded)
    # this was once a comparison against three distinct error bodies that got
    # mangled; the three identical tests are equivalent to one, so behaviour
    # is preserved by a single comparison.
    # NOTE(review): confirm whether error responses are really empty-bodied,
    # or whether the original compared against error tags like "<TimedOut/>".
    if results == "":
        response['error'] = results
    else:
        response['results'] = list(parseResults(minidom.parseString(results)))

    return response
# --- scripts/search.py (couchdb _external driver) ---

def requests():
    """Iterate over the JSON request lines couchdb writes to stdin."""
    line = sys.stdin.readline()
    while line:
        yield json.loads(line)
        line = sys.stdin.readline()

def main():
    """Serve search requests from stdin, writing one JSON response per line."""
    for request in requests():
        try:
            query = request['query']
            response = {'code':200, 'json':search(**query)}
        except Exception as e:
            # BUG FIX: this branch returned 'code':200 even though it is an
            # internal server error (preprocessor.py's main uses 500 for the
            # identical case); report 500 so clients can tell failures apart.
            response = {'code':500, 'body':('Error: ' + str(e)), 'headers':{'Content-type':'text/plain'}} # Internal server error

        sys.stdout.write("%s\n" % json.dumps(response))
        sys.stdout.flush()

if __name__ == "__main__":
    main()

# --- scripts/test.py (helpers) ---

rand = random.Random()

def pruneNode(node):
    """Randomly prune node.childNodes, in place, to a contiguous sub-span.

    Picks random start/end indices and keeps childNodes[start:end]; when the
    indices collide they are nudged apart so at least one child survives.
    Nodes with two or fewer children, and node types whose childNodes do not
    support deletion, are left untouched.  Returns the node for chaining.
    """
    if node.childNodes:
        if len(node.childNodes)>2:
            start = rand.randint(0, len(node.childNodes)-1)
            end = rand.randint(0, len(node.childNodes)-1)
            if start>end:
                start, end = end, start
            # NOTE(review): end is drawn from [0, len-1], so `end <
            # len(node.childNodes)` always holds and the third branch below
            # is unreachable; kept for fidelity.
            elif start == end and end < len(node.childNodes):
                end = end+1
            elif start == end and start > 0:
                start = start-1
            try:
                del node.childNodes[end:len(node.childNodes)]
                del node.childNodes[0:start]
            except AttributeError:
                pass # Some types of nodes dont support deletion

    return node

# Return a random (and syntacically correct) substring of a latex string
def substring(latex):
    node = parseLaTeX("\\begin{document} $$ " + latex + " $$ \\end{document}")
    pruneNode(node)
    result = PlainProcessor().process(node).dumps()
    return result
and check that the parent article is included in the results 43 | def runTest(doi,transform): 44 | db = couchdb_server['documents'] 45 | eqnID, source = rand.choice(db[doi]['source'].items()) 46 | results = None 47 | searchTerm = None 48 | try: 49 | searchTerm = transform(source) 50 | url = "http://localhost:%s/documents/_external/index?searchTerm=\"%s\"&searchTimeout=20&limit=2500" % (port, urllib.quote(searchTerm)) 51 | startTime = time.time() 52 | resultsFile = urllib.urlopen(url) 53 | endTime = time.time() 54 | results = minidom.parse(resultsFile) 55 | if results.getElementsByTagName("LatexParseError"): 56 | print "Latex parse error on doi: %s and eqnID: %s (%fs)" % (decodeDoi(doi), eqnID, endTime-startTime) 57 | return False 58 | if results.getElementsByTagName("TimedOut"): 59 | print "Timed out on doi: %s and eqnID: %s (%fs)" % (decodeDoi(doi), eqnID, endTime-startTime) 60 | return False 61 | if results.getElementsByTagName("LimitExceeded"): 62 | print "Limit exceeded on doi: %s and eqnID: %s (%fs)" % (decodeDoi(doi), eqnID, endTime-startTime) 63 | return False 64 | for result in results.getElementsByTagName("Article") + results.getElementsByTagName("Chapter"): 65 | if result.attributes.get('doi').value == decodeDoi(doi): 66 | for eqn in result.getElementsByTagName("equation"): 67 | if eqn.attributes.get('id').value == eqnID: 68 | print "Passed on doi: %s and eqnID: %s (%fs)" % (decodeDoi(doi), eqnID, endTime-startTime) 69 | return True 70 | print "Failed on doi: %s and eqnID: %s (%fs)" % (doi, eqnID, endTime-startTime) 71 | print searchTerm 72 | return False 73 | except KeyboardInterrupt, e: 74 | raise e 75 | except Exception, e: 76 | print "Error on doi: %s and eqnID: %s (%fs)" % (decodeDoi(doi), eqnID, 0) 77 | print e 78 | try: 79 | print "Searchterm: %s" % searchTerm 80 | except UnicodeEncodeError: 81 | pass 82 | return False 83 | 84 | def runTests(n,transform): 85 | db = couchdb_server['documents'] 86 | dois = list(db) 87 | for i in xrange(0,n): 88 | doi 
# --- scripts/test.py (driver) ---

def runTests(n,transform):
    """Run n randomly-chosen round-trip tests, applying `transform` to each
    equation source before searching."""
    db = couchdb_server['documents']
    dois = list(db)
    for i in xrange(0,n):
        doi = None
        source = None
        # Retry until a document with a 'source' field is fetched successfully
        while not source:
            try:
                doi = rand.choice(dois)
                source = db[doi]['source']
            except socket.error:
                pass # Connection refused, probably because someone restarted the server
        runTest(doi,transform)
        sys.stdout.flush()

# --- scripts/times.py ---

def runTime(doi):
    """Time one search for a random equation of `doi` and print the latency."""
    db = couchdb_server['documents']
    eqnID, searchTerm = rand.choice(db[doi]['source'].items())
    try:
        url = "http://localhost:%s/documents/_external/index?searchTerm=\"%s\"&searchTimeout=60&limit=10000" % (port, urllib.quote(searchTerm))
        startTime = time.time()
        resultsFile = urllib.urlopen(url)
        endTime = time.time()
        print(endTime-startTime)
    except KeyboardInterrupt:
        raise
    except Exception:
        # Best-effort benchmark: individual failures are deliberately ignored
        pass

def runTimes(n):
    """Benchmark n searches over randomly-chosen documents."""
    db = couchdb_server['documents']
    dois = list(db)
    for i in xrange(0,n):
        doi = None
        source = None
        # Retry until a document with a 'source' field is fetched successfully
        while not source:
            try:
                doi = rand.choice(dois)
                source = db[doi]['source']
            except socket.error:
                pass # Connection refused, probably because someone restarted the server
        runTime(doi)
        sys.stdout.flush()

# --- scripts/util.py ---

def encodeDoi(doi):
    """Encode a DOI as a couchdb document id: only the FIRST '/' becomes '_'."""
    return doi.replace("/","_",1)

def decodeDoi(doi):
    """Inverse of encodeDoi: only the FIRST '_' becomes '/'."""
    return doi.replace("_","/",1)
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 *)

(* A resizer maps the current slot count and the old/new logical lengths to
   the slot count the backing block should have after the operation. *)
type resizer_t = currslots:int -> oldlength:int -> newlength:int -> int

(* Untyped backing block, manipulated with the %obj primitives below so no
   initialisation element is needed and floats stay boxed. *)
type 'a intern

external ilen : 'a intern -> int = "%obj_size"
(* Duplicate a block; the zero-length block is an atom and is returned as-is. *)
let idup (x : 'a intern) = if ilen x = 0 then x else (Obj.magic (Obj.dup (Obj.repr x)) : 'a intern)
let imake tag len = (Obj.magic (Obj.new_block tag len) : 'a intern)
external iget : 'a intern -> int -> 'a = "%obj_field"
external iset : 'a intern -> int -> 'a -> unit = "%obj_set_field"

(* A dynamic array: [len] live elements in the first slots of [arr]. *)
type 'a t = {
  mutable arr : 'a intern;
  mutable len : int;
}

(* Carries (offending value, function name, parameter name). *)
exception Invalid_arg of int * string * string

let invalid_arg n f p = raise (Invalid_arg (n,f,p))

let length d = d.len

(* Double the slot count until the new length fits; halve while the array
   would stay under 25% utilisation. *)
let exponential_resizer ~currslots ~oldlength ~newlength =
  let rec doubler x = if x >= newlength then x else doubler (x * 2) in
  let rec halfer x = if x / 2 < newlength then x else halfer (x / 2) in
  if newlength = 1 then
    1
  else if currslots = 0 then
    doubler 1
  else if currslots < newlength then
    doubler currslots
  else
    halfer currslots

(* Resize in fixed increments of [step]. *)
let step_resizer step =
  if step <= 0 then invalid_arg step "step_resizer" "step";
  (fun ~currslots ~oldlength ~newlength ->
    if currslots < newlength || newlength < (currslots - step)
    then
      (newlength + step - (newlength mod step))
    else
      currslots)

(* Like [exponential_resizer] but never shrinks on deletes, only on inserts
   (detected by comparing oldlength against newlength). *)
let conservative_exponential_resizer ~currslots ~oldlength ~newlength =
  let rec doubler x = if x >= newlength then x else doubler (x * 2) in
  let rec halfer x = if x / 2 < newlength then x else halfer (x / 2) in
  if currslots < newlength then begin
    if newlength = 1 then
      1
    else if currslots = 0 then
      doubler 1
    else
      doubler currslots
  end else if oldlength < newlength then
    halfer currslots
  else
    currslots

(* NOTE: the .mli comment says the default is [exponential_resizer]; the code
   actually uses the conservative variant. *)
let default_resizer = conservative_exponential_resizer

(* Set the logical length to [newlen], reallocating and copying the live
   prefix whenever the resizer asks for a different slot count. *)
let changelen (d : 'a t) newlen =
  let oldsize = ilen d.arr in
  let r = default_resizer
    ~currslots:oldsize
    ~oldlength:d.len
    ~newlength:newlen
  in
  (* We require the size to be at least large enough to hold the number
   * of elements we know we need!
   *)
  let newsize = if r < newlen then newlen else r in
  if newsize <> oldsize then begin
    let newarr = imake 0 newsize in
    let cpylen = (if newlen < d.len then newlen else d.len) in
    for i = 0 to cpylen - 1 do
      iset newarr i (iget d.arr i);
    done;
    d.arr <- newarr;
  end;
  d.len <- newlen

(* Shrink the backing block to exactly [d.len] slots. *)
let compact d =
  if d.len <> ilen d.arr then begin
    let newarr = imake 0 d.len in
    for i = 0 to d.len - 1 do
      iset newarr i (iget d.arr i)
    done;
    d.arr <- newarr;
  end

let create() =
  {
    len = 0;
    arr = imake 0 0;
  }

(* Preallocate [initsize] slots; the array stays logically empty. *)
let make initsize =
  if initsize < 0 then invalid_arg initsize "make" "size";
  {
    len = 0;
    arr = imake 0 initsize;
  }

(* Build an array of [f 0 .. f (initlen-1)]. *)
let init initlen f =
  if initlen < 0 then invalid_arg initlen "init" "len";
  let arr = imake 0 initlen in
  for i = 0 to initlen-1 do
    iset arr i (f i)
  done;
  {
    len = initlen;
    arr = arr;
  }

(* Resizers are fixed in this Marshal-safe variant (no closures are stored in
   the record), so the setter is a no-op and the getter reports the default. *)
let set_resizer d resizer =
  ()

let get_resizer d =
  default_resizer

let empty d =
  d.len = 0

let get d idx =
  if idx < 0 || idx >= d.len then invalid_arg idx "get" "index";
  iget d.arr idx

let last d =
  if d.len = 0 then invalid_arg 0 "last" "";
  iget d.arr (d.len - 1)

let set d idx v =
  if idx < 0 || idx >= d.len then invalid_arg idx "set" "index";
  iset d.arr idx v
(* Insert [v] at [idx], shifting the tail up one slot. *)
let insert d idx v =
  if idx < 0 || idx > d.len then invalid_arg idx "insert" "index";
  if d.len = ilen d.arr then changelen d (d.len + 1) else d.len <- d.len + 1;
  if idx < d.len - 1 then begin
    for i = d.len - 2 downto idx do
      iset d.arr (i+1) (iget d.arr i)
    done;
  end;
  iset d.arr idx v

(* Append [v] as the new last element. *)
let add d v =
  if d.len = ilen d.arr then changelen d (d.len + 1) else d.len <- d.len + 1;
  iset d.arr (d.len - 1) v

(* Delete the element at [idx], shifting the tail down one slot. *)
let delete d idx =
  if idx < 0 || idx >= d.len then invalid_arg idx "delete" "index";
  let oldsize = ilen d.arr in
  (* we don't call changelen because we want to blit *)
  let r = default_resizer
    ~currslots:oldsize
    ~oldlength:d.len
    ~newlength:(d.len - 1)
  in
  let newsize = (if r < d.len - 1 then d.len - 1 else r) in
  if oldsize <> newsize then begin
    let newarr = imake 0 newsize in
    for i = 0 to idx - 1 do
      iset newarr i (iget d.arr i);
    done;
    for i = idx to d.len - 2 do
      iset newarr i (iget d.arr (i+1));
    done;
    d.arr <- newarr;
  end else begin
    for i = idx to d.len - 2 do
      iset d.arr i (iget d.arr (i+1));
    done;
    (* clear the vacated slot so the GC can reclaim the element *)
    iset d.arr (d.len - 1) (Obj.magic 0)
  end;
  d.len <- d.len - 1

(* Delete [len] elements starting at [idx], closing the gap. *)
let delete_range d idx len =
  if len < 0 then invalid_arg len "delete_range" "length";
  if idx < 0 || idx + len > d.len then invalid_arg idx "delete_range" "index";
  let oldsize = ilen d.arr in
  (* we don't call changelen because we want to blit *)
  let r = default_resizer
    ~currslots:oldsize
    ~oldlength:d.len
    ~newlength:(d.len - len)
  in
  let newsize = (if r < d.len - len then d.len - len else r) in
  if oldsize <> newsize then begin
    let newarr = imake 0 newsize in
    for i = 0 to idx - 1 do
      iset newarr i (iget d.arr i);
    done;
    for i = idx to d.len - len - 1 do
      iset newarr i (iget d.arr (i+len));
    done;
    d.arr <- newarr;
  end else begin
    for i = idx to d.len - len - 1 do
      iset d.arr i (iget d.arr (i+len));
    done;
    (* clear the vacated slots for the GC *)
    for i = d.len - len to d.len - 1 do
      iset d.arr i (Obj.magic 0)
    done;
  end;
  d.len <- d.len - len

let clear d =
  d.len <- 0;
  d.arr <- imake 0 0

let delete_last d =
  if d.len <= 0 then invalid_arg 0 "delete_last" "";
  (* erase for GC, in case changelen don't resize our array *)
  iset d.arr (d.len - 1) (Obj.magic 0);
  changelen d (d.len - 1)

(* Copy [len] elements from [src] at [srcidx] to [dst] at [dstidx], growing
   [dst] when needed. *)
let rec blit src srcidx dst dstidx len =
  if len < 0 then invalid_arg len "blit" "len";
  if srcidx < 0 || srcidx + len > src.len then invalid_arg srcidx "blit" "source index";
  if dstidx < 0 || dstidx > dst.len then invalid_arg dstidx "blit" "dest index";
  let newlen = dstidx + len in
  if newlen > ilen dst.arr then begin
    (* this case could be inlined so we don't blit on just-copied elements *)
    changelen dst newlen
  end else begin
    if newlen > dst.len then dst.len <- newlen;
  end;
  (* same array ! we need to copy in reverse order *)
  if src.arr == dst.arr && dstidx > srcidx then
    for i = len - 1 downto 0 do
      iset dst.arr (dstidx+i) (iget src.arr (srcidx+i));
    done
  else
    for i = 0 to len - 1 do
      iset dst.arr (dstidx+i) (iget src.arr (srcidx+i));
    done

let append src dst =
  blit src 0 dst dst.len src.len

let to_list d =
  let rec loop idx accum =
    if idx < 0 then accum else loop (idx - 1) (iget d.arr idx :: accum)
  in
  loop (d.len - 1) []

let to_array d =
  if d.len = 0 then begin
    (* since the empty array is an atom, we don't care if float or not *)
    [||]
  end else begin
    let arr = Array.make d.len (iget d.arr 0) in
    for i = 1 to d.len - 1 do
      Array.unsafe_set arr i (iget d.arr i)
    done;
    arr;
  end

let of_list lst =
  let size = List.length lst in
  let arr = imake 0 size in
  let rec loop idx = function
    | h :: t -> iset arr idx h; loop (idx + 1) t
    | [] -> ()
  in
  loop 0 lst;
  {
    len = size;
    arr = arr;
  }

let of_array src =
  let size = Array.length src in
  (* float arrays are unboxed and must be copied field by field *)
  let is_float = Obj.tag (Obj.repr src) = Obj.double_array_tag in
  let arr = (if is_float then begin
    let arr = imake 0 size in
    for i = 0 to size - 1 do
      iset arr i (Array.unsafe_get src i);
    done;
    arr
  end else
    (* copy the fields *)
    idup (Obj.magic src : 'a intern))
  in
  {
    len = size;
    arr = arr;
  }

let copy src =
  {
    len = src.len;
    arr = idup src.arr;
  }

(* Fresh array holding src[start .. start+len-1]. *)
let sub src start len =
  if len < 0 then invalid_arg len "sub" "len";
  if start < 0 || start + len > src.len then invalid_arg start "sub" "start";
  let arr = imake 0 len in
  for i = 0 to len - 1 do
    iset arr i (iget src.arr (i+start));
  done;
  {
    len = len;
    arr = arr;
  }

let iter f d =
  for i = 0 to d.len - 1 do
    f (iget d.arr i)
  done

let iteri f d =
  for i = 0 to d.len - 1 do
    f i (iget d.arr i)
  done

(* Keep only the elements satisfying [f], writing them into a fresh block. *)
let filter f d =
  let l = d.len in
  let a = imake 0 l in
  let a2 = d.arr in
  let p = ref 0 in
  for i = 0 to l - 1 do
    let x = iget a2 i in
    if f x then begin
      iset a !p x;
      incr p;
    end;
  done;
  d.len <- !p;
  d.arr <- a

(* Index of the first element satisfying [f]; raises Not_found otherwise. *)
let index_of f d =
  let rec loop i =
    if i >= d.len then
      raise Not_found
    else
      if f (iget d.arr i) then
        i
      else
        loop (i+1)
  in
  loop 0

let map f src =
  let arr = imake 0 src.len in
  for i = 0 to src.len - 1 do
    iset arr i (f (iget src.arr i))
  done;
  {
    len = src.len;
    arr = arr;
  }

let mapi f src =
  let arr = imake 0 src.len in
  for i = 0 to src.len - 1 do
    iset arr i (f i (iget src.arr i))
  done;
  {
    len = src.len;
    arr = arr;
  }

let fold_left f x a =
  let rec loop idx x =
    if idx >= a.len then x else loop (idx + 1) (f x (iget a.arr idx))
  in
  loop 0 x

let fold_right f a x =
  let rec loop idx x =
    if idx < 0 then x
    else loop (idx - 1) (f (iget a.arr idx) x)
  in
  loop (a.len - 1) x

(* Enumeration over the elements.
   NOTE(review): [make start] ignores [start] -- idxref is initialised to
   [ref 0], so [clone] restarts from index 0 rather than the current
   position; upstream ExtLib uses [ref start].  Confirm before relying on
   clone semantics. *)
let enum d =
  let rec make start =
    let idxref = ref 0 in
    let next () =
      if !idxref >= d.len then
        raise Enum.No_more_elements
      else
        let retval = iget d.arr !idxref in
        incr idxref;
        retval
    and count () =
      if !idxref >= d.len then 0
      else d.len - !idxref
    and clone () =
      make !idxref
    in
    Enum.make ~next:next ~count:count ~clone:clone
  in
  make 0

(* Build from an enumeration; preallocates when the count is cheap to get. *)
let of_enum e =
  if Enum.fast_count e then begin
    let c = Enum.count e in
    let arr = imake 0 c in
    Enum.iteri (fun i x -> iset arr i x) e;
    {
      len = c;
      arr = arr;
    }
  end else
    let d = make 0 in
    Enum.iter (add d) e;
    d
arr = imake 0 c in 425 | Enum.iteri (fun i x -> iset arr i x) e; 426 | { 427 | len = c; 428 | arr = arr; 429 | } 430 | end else 431 | let d = make 0 in 432 | Enum.iter (add d) e; 433 | d 434 | 435 | let unsafe_get a n = 436 | iget a.arr n 437 | 438 | let unsafe_set a n x = 439 | iset a.arr n x 440 | -------------------------------------------------------------------------------- /src/dynArray.mli: -------------------------------------------------------------------------------- 1 | (* 2 | * DynArray - Resizeable Ocaml arrays 3 | * Copyright (C) 2003 Brian Hurt 4 | * Copyright (C) 2003 Nicolas Cannasse 5 | * 6 | * This library is free software; you can redistribute it and/or 7 | * modify it under the terms of the GNU Lesser General Public 8 | * License as published by the Free Software Foundation; either 9 | * version 2.1 of the License, or (at your option) any later version, 10 | * with the special exception on linking described in file LICENSE. 11 | * 12 | * This library is distributed in the hope that it will be useful, 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 | * Lesser General Public License for more details. 16 | * 17 | * You should have received a copy of the GNU Lesser General Public 18 | * License along with this library; if not, write to the Free Software 19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 20 | *) 21 | 22 | (** Dynamic arrays. 23 | 24 | A dynamic array is equivalent to a OCaml array that will resize itself 25 | when elements are added or removed, except that floats are boxed and 26 | that no initialization element is required. 27 | *) 28 | 29 | type 'a t 30 | 31 | exception Invalid_arg of int * string * string 32 | (** When an operation on an array fails, [Invalid_arg] is raised. 
The 33 | integer is the value that made the operation fail, the first string 34 | contains the function name that has been called and the second string 35 | contains the parameter name that made the operation fail. 36 | *) 37 | 38 | (** {6 Array creation} *) 39 | 40 | val create : unit -> 'a t 41 | (** [create()] returns a new empty dynamic array. *) 42 | 43 | val make : int -> 'a t 44 | (** [make count] returns an array with some memory already allocated so 45 | up to [count] elements can be stored into it without resizing. *) 46 | 47 | val init : int -> (int -> 'a) -> 'a t 48 | (** [init n f] returns an array of [n] elements filled with values 49 | returned by [f 0 , f 1, ... f (n-1)]. *) 50 | 51 | (** {6 Array manipulation functions} *) 52 | 53 | val empty : 'a t -> bool 54 | (** Return true if the number of elements in the array is 0. *) 55 | 56 | val length : 'a t -> int 57 | (** Return the number of elements in the array. *) 58 | 59 | val get : 'a t -> int -> 'a 60 | (** [get darr idx] gets the element in [darr] at index [idx]. If [darr] has 61 | [len] elements in it, then the valid indexes range from [0] to [len-1]. *) 62 | 63 | val last : 'a t -> 'a 64 | (** [last darr] returns the last element of [darr]. *) 65 | 66 | val set : 'a t -> int -> 'a -> unit 67 | (** [set darr idx v] sets the element of [darr] at index [idx] to value 68 | [v]. The previous value is overwritten. *) 69 | 70 | val insert : 'a t -> int -> 'a -> unit 71 | (** [insert darr idx v] inserts [v] into [darr] at index [idx]. All elements 72 | of [darr] with an index greater than or equal to [idx] have their 73 | index incremented (are moved up one place) to make room for the new 74 | element. *) 75 | 76 | val add : 'a t -> 'a -> unit 77 | (** [add darr v] appends [v] onto [darr]. [v] becomes the new 78 | last element of [darr]. *) 79 | 80 | val append : 'a t -> 'a t -> unit 81 | (** [append src dst] adds all elements of [src] to the end of [dst]. 
*) 82 | 83 | val delete : 'a t -> int -> unit 84 | (** [delete darr idx] deletes the element of [darr] at [idx]. All elements 85 | with an index greater than [idx] have their index decremented (are 86 | moved down one place) to fill in the hole. *) 87 | 88 | val delete_last : 'a t -> unit 89 | (** [delete_last darr] deletes the last element of [darr]. This is equivalent 90 | of doing [delete darr ((length darr) - 1)]. *) 91 | 92 | val delete_range : 'a t -> int -> int -> unit 93 | (** [delete_range darr p len] deletes [len] elements starting at index [p]. 94 | All elements with an index greater than [p+len] are moved to fill 95 | in the hole. *) 96 | 97 | val clear : 'a t -> unit 98 | (** remove all elements from the array and resize it to 0. *) 99 | 100 | val blit : 'a t -> int -> 'a t -> int -> int -> unit 101 | (** [blit src srcidx dst dstidx len] copies [len] elements from [src] 102 | starting with index [srcidx] to [dst] starting at [dstidx]. *) 103 | 104 | val compact : 'a t -> unit 105 | (** [compact darr] ensures that the space allocated by the array is minimal.*) 106 | 107 | (** {6 Array copy and conversion} *) 108 | 109 | val to_list : 'a t -> 'a list 110 | (** [to_list darr] returns the elements of [darr] in order as a list. *) 111 | 112 | val to_array : 'a t -> 'a array 113 | (** [to_array darr] returns the elements of [darr] in order as an array. *) 114 | 115 | val enum : 'a t -> 'a Enum.t 116 | (** [enum darr] returns the enumeration of [darr] elements. *) 117 | 118 | val of_list : 'a list -> 'a t 119 | (** [of_list lst] returns a dynamic array with the elements of [lst] in 120 | it in order. *) 121 | 122 | val of_array : 'a array -> 'a t 123 | (** [of_array arr] returns an array with the elements of [arr] in it 124 | in order. *) 125 | 126 | val of_enum : 'a Enum.t -> 'a t 127 | (** [of_enum e] returns an array that holds, in order, the elements of [e]. 
*) 128 | 129 | val copy : 'a t -> 'a t 130 | (** [copy src] returns a fresh copy of [src], such that no modification of 131 | [src] affects the copy, or vice versa (all new memory is allocated for 132 | the copy). *) 133 | 134 | val sub : 'a t -> int -> int -> 'a t 135 | (** [sub darr start len] returns an array holding the subset of [len] 136 | elements from [darr] starting with the element at index [idx]. *) 137 | 138 | (** {6 Array functional support} *) 139 | 140 | val iter : ('a -> unit) -> 'a t -> unit 141 | (** [iter f darr] calls the function [f] on every element of [darr]. It 142 | is equivalent to [for i = 0 to length darr - 1 do f (get darr i) done;] *) 143 | 144 | val iteri : (int -> 'a -> unit) -> 'a t -> unit 145 | (** [iter f darr] calls the function [f] on every element of [darr]. It 146 | is equivalent to [for i = 0 to length darr - 1 do f i (get darr i) done;] 147 | *) 148 | 149 | val map : ('a -> 'b) -> 'a t -> 'b t 150 | (** [map f darr] applies the function [f] to every element of [darr] 151 | and creates a dynamic array from the results - similar to [List.map] or 152 | [Array.map]. *) 153 | 154 | val mapi : (int -> 'a -> 'b) -> 'a t -> 'b t 155 | (** [mapi f darr] applies the function [f] to every element of [darr] 156 | and creates a dynamic array from the results - similar to [List.mapi] or 157 | [Array.mapi]. *) 158 | 159 | val fold_left : ('a -> 'b -> 'a) -> 'a -> 'b t -> 'a 160 | (** [fold_left f x darr] computes 161 | [f ( ... ( f ( f (get darr 0) x) (get darr 1) ) ... ) (get darr n-1)], 162 | similar to [Array.fold_left] or [List.fold_left]. *) 163 | 164 | val fold_right : ('a -> 'b -> 'b) -> 'a t -> 'b -> 'b 165 | (** [fold_right f darr x] computes 166 | [ f (get darr 0) (f (get darr 1) ( ... ( f (get darr n-1) x ) ... ) ) ] 167 | similar to [Array.fold_right] or [List.fold_right]. 
*) 168 | 169 | val index_of : ('a -> bool) -> 'a t -> int 170 | (** [index_of f darr] returns the index of the first element [x] in darr such 171 | as [f x] returns [true] or raise [Not_found] if not found. *) 172 | 173 | val filter : ('a -> bool) -> 'a t -> unit 174 | 175 | (** {6 Array resizers} *) 176 | 177 | type resizer_t = currslots:int -> oldlength:int -> newlength:int -> int 178 | (** The type of a resizer function. 179 | 180 | Resizer functions are called whenever elements are added to 181 | or removed from the dynamic array to determine what the current number of 182 | storage spaces in the array should be. The three named arguments 183 | passed to a resizer are the current number of storage spaces in 184 | the array, the length of the array before the elements are 185 | added or removed, and the length the array will be after the 186 | elements are added or removed. If elements are being added, newlength 187 | will be larger than oldlength, if elements are being removed, 188 | newlength will be smaller than oldlength. If the resizer function 189 | returns exactly oldlength, the size of the array is only changed when 190 | adding an element while there is not enough space for it. 191 | 192 | By default, all dynamic arrays are created with the [default_resizer]. 193 | When a dynamic array is created from another dynamic array (using [copy], 194 | [map] , etc. ) the resizer of the copy will be the same as the original 195 | dynamic array resizer. To change the resizer, use the [set_resizer] 196 | function. 197 | *) 198 | 199 | val set_resizer : 'a t -> resizer_t -> unit 200 | (** Change the resizer for this array. *) 201 | 202 | val get_resizer : 'a t -> resizer_t 203 | (** Get the current resizer function for a given array *) 204 | 205 | val default_resizer : resizer_t 206 | (** The default resizer function the library is using - in this version 207 | of DynArray, this is the [exponential_resizer] but should change in 208 | next versions. 
*) 209 | 210 | val exponential_resizer : resizer_t 211 | (** The exponential resizer- The default resizer except when the resizer 212 | is being copied from some other darray. 213 | 214 | [exponential_resizer] works by doubling or halving the number of 215 | slots until they "fit". If the number of slots is less than the 216 | new length, the number of slots is doubled until it is greater 217 | than the new length (or Sys.max_array_size is reached). 218 | 219 | If the number of slots is more than four times the new length, 220 | the number of slots is halved until it is less than four times the 221 | new length. 222 | 223 | Allowing darrays to fall below 25% utilization before shrinking them 224 | prevents "thrashing". Consider the case where the caller is constantly 225 | adding a few elements, and then removing a few elements, causing 226 | the length to constantly cross above and below a power of two. 227 | Shrinking the array when it falls below 50% would causing the 228 | underlying array to be constantly allocated and deallocated. 229 | A few elements would be added, causing the array to be reallocated 230 | and have a usage of just above 50%. Then a few elements would be 231 | remove, and the array would fall below 50% utilization and be 232 | reallocated yet again. The bulk of the array, untouched, would be 233 | copied and copied again. By setting the threshold at 25% instead, 234 | such "thrashing" only occurs with wild swings- adding and removing 235 | huge numbers of elements (more than half of the elements in the array). 236 | 237 | [exponential_resizer] is a good performing resizer for most 238 | applications. A list allocates 2 words for every element, while an 239 | array (with large numbers of elements) allocates only 1 word per 240 | element (ignoring unboxed floats). On insert, [exponential_resizer] 241 | keeps the amount of wasted "extra" array elements below 50%, meaning 242 | that less than 2 words per element are used. 
Even on removals 243 | where the amount of wasted space is allowed to rise to 75%, that 244 | only means that darray is using 4 words per element. This is 245 | generally not a significant overhead. 246 | 247 | Furthermore, [exponential_resizer] minimizes the number of copies 248 | needed- appending n elements into an empty darray with initial size 249 | 0 requires between n and 2n elements of the array be copied- O(n) 250 | work, or O(1) work per element (on average). A similar argument 251 | can be made that deletes from the end of the array are O(1) as 252 | well (obviously deletes from anywhere else are O(n) work- you 253 | have to move the n or so elements above the deleted element down). 254 | 255 | *) 256 | 257 | val step_resizer : int -> resizer_t 258 | (** The stepwise resizer- another example of a resizer function, this 259 | time of a parameterized resizer. 260 | 261 | The resizer returned by [step_resizer step] returns the smallest 262 | multiple of [step] larger than [newlength] if [currslots] is less 263 | then [newlength]-[step] or greater than [newlength]. 264 | 265 | For example, to make an darray with a step of 10, a length 266 | of len, and a null of null, you would do: 267 | [make] ~resizer:([step_resizer] 10) len null 268 | *) 269 | 270 | val conservative_exponential_resizer : resizer_t 271 | (** [conservative_exponential_resizer] is an example resizer function 272 | which uses the oldlength parameter. It only shrinks the array 273 | on inserts- no deletes shrink the array, only inserts. It does 274 | this by comparing the oldlength and newlength parameters. Other 275 | than that, it acts like [exponential_resizer]. 
276 | *) 277 | 278 | (** {6 Unsafe operations} **) 279 | 280 | val unsafe_get : 'a t -> int -> 'a 281 | val unsafe_set : 'a t -> int -> 'a -> unit 282 | -------------------------------------------------------------------------------- /src/hashset.ml: -------------------------------------------------------------------------------- 1 | (* Simple sets using Hashtbl. Only the keys matter; every value is (). *) 2 | 3 | type 'a t = ('a, unit) Hashtbl.t 4 | 5 | let create = Hashtbl.create 6 | 7 | let mem = Hashtbl.mem 8 | 9 | (* replace, not Hashtbl.add, so each element is stored at most once *) let add ht key = Hashtbl.replace ht key () 10 | 11 | let to_list ht = Hashtbl.fold (fun k _ rest -> k :: rest) ht [] 12 | 13 | let of_list list = 14 | let ht = create 0 in 15 | List.iter (add ht) list; (* iter, not map: add returns unit and the result list was being discarded *) 16 | ht 17 | 18 | let union ht1 ht2 = 19 | let ht3 = create 0 in 20 | Hashtbl.iter (fun key _ -> add ht3 key) ht1; 21 | Hashtbl.iter (fun key _ -> add ht3 key) ht2; 22 | ht3 23 | 24 | let inter ht1 ht2 = 25 | let ht3 = create 0 in 26 | Hashtbl.iter (fun key _ -> if mem ht2 key then add ht3 key else ()) ht1; 27 | ht3 28 | 29 | (* Keep only the elements satisfying f. Removals are collected first because mutating a Hashtbl during Hashtbl.iter is unspecified behaviour. *) 30 | let filter f ht = 31 | let removed = Hashtbl.fold (fun elem _ acc -> if f elem then acc else elem :: acc) ht [] in 32 | List.iter (Hashtbl.remove ht) removed 33 | -------------------------------------------------------------------------------- /src/hashset.mli: -------------------------------------------------------------------------------- 1 | type 'a t 2 | 3 | val create : int -> 'a t 4 | 5 | val mem : 'a t -> 'a -> bool 6 | val add : 'a t -> 'a -> unit 7 | 8 | val to_list : 'a t -> 'a list 9 | val of_list : 'a list -> 'a t 10 | 11 | val union : 'a t -> 'a t -> 'a t 12 | val inter : 'a t -> 'a t -> 'a t 13 | 14 | val filter: ('a -> bool) -> 'a t -> unit 15 | -------------------------------------------------------------------------------- /src/index.ml: -------------------------------------------------------------------------------- 1 | (* 2 | This module controls toplevel interaction with the search index. 3 | Mostly I/O and error handling. See the last section for the commands supported. 
4 | *) 5 | 6 | (* Types and json parsing *) 7 | 8 | type json doi = string 9 | 10 | and eqnID = string 11 | and containerID = string 12 | and publicationYear = string 13 | and format = string 14 | 15 | and document = 16 | < ?containerID : containerID option 17 | ; ?format : format = "Article" (* Originally only articles were supported *) 18 | ; publicationYear : publicationYear option 19 | ; content : (string * Json_type.t) assoc (* eqnID*Latex.t *) 20 | ; source : (string * string) assoc > (* eqnID*string *) 21 | 22 | and args = 23 | < searchTerm : string 24 | ; ?searchTimeout : string = "10.0" 25 | ; ?preprocessorTimeout : string = "5.0" 26 | ; ?limit : string = "1000" 27 | ; ?start : string = "0" 28 | ; ?count : string = string_of_int max_int 29 | ; ?doi : string option 30 | ; ?containerID : containerID option 31 | ; ?publishedAfter : publicationYear option 32 | ; ?publishedBefore : publicationYear option 33 | ; ?precision : string = "0.7" > 34 | 35 | and get = 36 | < query : args > 37 | 38 | and post = 39 | < body : string > 40 | 41 | and update = 42 | < id : doi 43 | ; key : int 44 | ; value : < ?deleted : bool = false > 45 | ; ?doc : Json_type.t option > 46 | 47 | and updates = 48 | < rows : update list > 49 | 50 | and preprocessed = 51 | < json : Json_type.t 52 | ; plain : string > 53 | 54 | type equation = 55 | { doi : doi 56 | ; eqnID : eqnID } 57 | 58 | (* json-static converts json into objects which cannot be stored using Marshal, so store metadata record instead *) 59 | type metadata = 60 | { containerID : containerID option 61 | ; format : format 62 | ; publicationYear : publicationYear option 63 | ; no_eqns : int } 64 | 65 | let metadata_of_doc doc = 66 | { containerID = doc#containerID 67 | ; format = doc#format 68 | ; publicationYear = doc#publicationYear 69 | ; no_eqns = List.length doc#content } 70 | 71 | (* Assorted imports and utililty functions *) 72 | 73 | module Http = Http_client.Convenience 74 | let encode url = Netencoding.Url.encode 
~plus:false url 75 | 76 | (* couchdb does not allow '/' in keys *) 77 | let encode_doi doi = Str.replace_first (Str.regexp "/") "_" doi 78 | let decode_doi doi = Str.replace_first (Str.regexp "_") "/" doi 79 | 80 | let flush_line = Util.flush_line 81 | 82 | module Doi_map = MyMap.Make 83 | (struct 84 | type t = doi 85 | let compare = compare 86 | end) 87 | 88 | module Eqnid_map = MyMap.Make 89 | (struct 90 | type t = eqnID 91 | let compare = compare 92 | end) 93 | 94 | (* Our main index structure *) 95 | 96 | type index = 97 | { last_update : int (* Key of the last update received from couchdb *) 98 | ; metadata : metadata Doi_map.t 99 | ; suffix_array : equation Suffix_array.t } 100 | 101 | (* Persisting *) 102 | 103 | let load_index () = (Util.load_data "./data/index" : index) 104 | 105 | let save_index index = Util.save_data "./data/index" (index : index) 106 | 107 | (* Database interaction *) 108 | 109 | let couchdb_url = 110 | (* Ocaml's file handling is terrible... *) 111 | let conf = open_in "./db.ini" in 112 | let rec read_port () = 113 | try 114 | let line = input_line conf in 115 | Str.search_forward (Str.regexp "port *= *\([0-9]+\)") line 0; 116 | Str.matched_group 1 line 117 | with Not_found -> read_port () in 118 | "http://localhost:" ^ read_port () ^ "/" 119 | 120 | let db_url = couchdb_url ^ "documents/" 121 | 122 | let get_document doi = 123 | let url = db_url ^ doi in 124 | let json = Json_io.json_of_string (Http.http_get url) in 125 | document_of_json json 126 | 127 | let preprocess timeout latex_string = 128 | let url = db_url ^ "_external/preprocess?format=json-plain&timeout=" ^ (encode timeout) ^ "&latex=" ^ (encode latex_string) in 129 | let preprocessed = preprocessed_of_json (Json_io.json_of_string (Http.http_get url)) in 130 | (Latex.of_json preprocessed#json,preprocessed#plain) 131 | 132 | (* Responses to couchdb *) 133 | 134 | let xml_of_results results query_string = 135 | let xml_of_eqn (eqnID,weight) = 136 | Xml.Element ("equation", 
[("distance",string_of_int weight);("id",eqnID)], []) in 137 | let xml_of_result (doi,metadata,eqns) = 138 | Xml.Element (metadata.format, 139 | [("doi", decode_doi doi);("count", string_of_int (List.length eqns))], 140 | (List.map xml_of_eqn eqns)) in 141 | let xml_of_query_string = 142 | Xml.Element ("query",[],[Xml.PCData query_string]) in 143 | Xml.Element ("results", [], xml_of_query_string :: (List.map xml_of_result results)) 144 | 145 | let xml_error error = Xml.Element (error,[],[]) 146 | 147 | let xml_response xml = 148 | Json_type.Object 149 | [ ("code",Json_type.Int 200) 150 | ; ("headers",Json_type.Object [("Content-type",Json_type.String "text/xml")]) 151 | ; ("body",Json_type.String (Xml.to_string xml)) ] 152 | 153 | (* Timeouts *) 154 | 155 | exception Timeout 156 | 157 | let set_timer tsecs = ignore (Unix.setitimer Unix.ITIMER_REAL { Unix.it_interval = 0.0; Unix.it_value = tsecs }) 158 | 159 | let with_timeout tsecs f = 160 | Sys.set_signal Sys.sigalrm (Sys.Signal_handle (fun _ -> raise Timeout)); 161 | try 162 | set_timer tsecs; 163 | let result = f () in 164 | set_timer 0.0; 165 | result 166 | with exc -> 167 | set_timer 0.0; 168 | raise exc 169 | 170 | (* Queries *) 171 | 172 | let run_query index query precision filter limit start count = 173 | let eqns = Suffix_array.find_query index.suffix_array precision query in 174 | (* Collate eqns by doi *) 175 | let doi_map = 176 | List.fold_left 177 | (fun doi_map (weight,equation) -> 178 | let (key, value) = (equation.doi, (equation.eqnID,weight)) in 179 | Doi_map.update key (fun values -> value::values) [value] doi_map) 180 | Doi_map.empty 181 | eqns in 182 | (* Remove the dummy node *) 183 | let doi_map = Doi_map.remove "" doi_map in 184 | if Doi_map.count doi_map > limit 185 | then 186 | xml_error "LimitExceeded" 187 | else 188 | let results = Doi_map.to_list doi_map in 189 | (* Insert metadata *) 190 | let results = List.map (fun (doi,eqns) -> (doi, Doi_map.find doi index.metadata, eqns)) results 
in 191 | (* Apply filter *) 192 | let results = List.filter (fun (doi,metadata,_) -> filter doi metadata) results in 193 | (* Sort each set of equations by weight *) 194 | let results = List.map (fun (doi,metadata,eqns) -> (doi,metadata,List.fast_sort (fun a b -> compare (snd a) (snd b)) eqns)) results in 195 | (* Sort doi's by lowest weighted equation *) 196 | let results = List.fast_sort (fun (_,_,eqnsA) (_,_,eqnsB) -> compare (snd (List.hd eqnsA)) (snd (List.hd eqnsB))) results in 197 | (* Return the chosen page *) 198 | let results = ExtList.List.take count (ExtList.List.drop start results) in 199 | xml_of_results results (Query.to_string query) 200 | 201 | let handle_query index str = 202 | try 203 | let args = 204 | let json = Json_io.json_of_string str in 205 | (* accept args either as query string or as post body *) 206 | try 207 | (get_of_json json)#query 208 | with Json_type.Json_error _ -> 209 | let json = Json_io.json_of_string (post_of_json json)#body in 210 | args_of_json json in 211 | let searchTimeout = float_of_string args#searchTimeout in 212 | let preprocessorTimeout = args#preprocessorTimeout in 213 | let limit = int_of_string args#limit in 214 | let start = int_of_string args#start in 215 | let count = int_of_string args#count in 216 | let query = Query.of_string (preprocess preprocessorTimeout) args#searchTerm in 217 | let precision = float_of_string args#precision in 218 | let containerIDs = 219 | match args#containerID with 220 | | None -> [] 221 | | Some csv -> ExtString.String.nsplit csv "," in 222 | let dois = 223 | match args#doi with 224 | | None -> [] 225 | | Some csv -> ExtString.String.nsplit csv "," in 226 | let filter doi metadata = 227 | ((args#containerID = None) || (List.exists (fun containerID -> metadata.containerID = Some containerID) containerIDs)) 228 | && ((args#doi = None) || (List.mem (decode_doi doi) dois)) 229 | && ((args#publishedBefore = None) || ((args#publishedBefore >= metadata.publicationYear) && 
(metadata.publicationYear <> None))) 230 | && ((args#publishedAfter = None) || ((args#publishedAfter <= metadata.publicationYear) && (metadata.publicationYear <> None))) in 231 | xml_response (with_timeout searchTimeout (fun () -> run_query index query precision filter limit start count)) 232 | with 233 | | Json_type.Json_error _ | Failure _ -> xml_response (xml_error "ArgParseError") 234 | | Query.Parse_error -> xml_response (xml_error "QueryParseError") 235 | | Timeout -> xml_response (xml_error "TimedOut") 236 | | _ -> Json_type.Object [("code",Json_type.Int 500)] (* Internal server error *) 237 | 238 | let handle_queries () = 239 | let index = load_index () in 240 | Suffix_array.ancientify index.suffix_array; 241 | while true do 242 | let input = input_line stdin in 243 | let json = handle_query index input in 244 | flush_line (Json_io.string_of_json ~compact:true json) 245 | done 246 | 247 | (* Initialising index *) 248 | 249 | let init_index () = 250 | flush_line ("couchdb is at " ^ couchdb_url); 251 | print_string "This will erase the existing index. Are you sure? 
(y/n):"; flush stdout; 252 | if read_line () = "y" 253 | then 254 | (flush_line "Saving index"; 255 | save_index {last_update = -1; suffix_array = Suffix_array.create (); metadata = Doi_map.empty}; 256 | flush_line "Ok") 257 | else 258 | flush_line "Ok, nothing was done" 259 | 260 | (* Updating the index *) 261 | 262 | let batch_size = 1000 263 | 264 | let get_update_batch last_update = 265 | flush_line 266 | ("Fetching updates from " ^ 267 | (string_of_int (last_update+1)) ^ 268 | " onwards"); 269 | let url = 270 | db_url ^ "_all_docs_by_seq?include_docs=true" ^ 271 | "&startkey=" ^ (string_of_int last_update) ^ 272 | "&limit=" ^ (string_of_int batch_size) in 273 | try 274 | let json = Json_io.json_of_string (Http.http_get url) in 275 | (updates_of_json json)#rows 276 | with _ -> 277 | flush_line "Error contacting database (documents)"; 278 | raise Exit 279 | 280 | exception FailedUpdate of int * doi 281 | 282 | let run_update index update = 283 | try 284 | (* Start by deleting old version of the document if it already exists *) 285 | let index = 286 | if not (Doi_map.mem update#id index.metadata) then index else 287 | begin 288 | Util.flush_line ("Deleting " ^ update#id); 289 | Suffix_array.delete index.suffix_array (fun equation -> equation.doi = update#id); 290 | let metadata = Doi_map.remove update#id index.metadata in 291 | {index with metadata=metadata} 292 | end in 293 | (* Add the new version of the documents if the deleted flag is not set *) 294 | match (update#doc, update#value#deleted) with 295 | | (None, _) | (_,true) -> 296 | {index with last_update=update#key} 297 | | (Some json, false) -> 298 | begin 299 | let doc = document_of_json json in 300 | let equations = 301 | List.map 302 | (fun (eqnID,json) -> ({doi=update#id; eqnID=eqnID}, Latex.of_json json)) 303 | doc#content in 304 | Suffix_array.add index.suffix_array equations; 305 | let metadata = Doi_map.add update#id (metadata_of_doc doc) index.metadata in 306 | {index with last_update=update#key; 
metadata=metadata} 307 | end 308 | with _ -> 309 | raise (FailedUpdate (update#key, update#id)) 310 | 311 | let rec run_update_batches index = 312 | let update_batch = get_update_batch index.last_update in 313 | let index = List.fold_left run_update index update_batch in 314 | save_index index; 315 | if List.length update_batch < batch_size then index else run_update_batches index 316 | 317 | let run_updates () = 318 | Pid.lock (); 319 | flush_line ("couchdb is at " ^ couchdb_url); 320 | flush_line "Loading index"; 321 | let index = load_index () in 322 | let index = 323 | try 324 | run_update_batches index 325 | with FailedUpdate(key,id) -> 326 | flush_line ("Update " ^ (string_of_int key) ^ " failed (DOI: " ^ id ^ ")"); 327 | index in 328 | flush_line ("Finished updating at update: " ^ (string_of_int index.last_update)); 329 | flush_line "Preparing index"; 330 | Suffix_array.prepare index.suffix_array; 331 | save_index index; 332 | flush_line "Ok" 333 | 334 | (* Introspection *) 335 | 336 | let list_all () = 337 | flush_line ("couchdb is at " ^ couchdb_url); 338 | flush_line "Loading index"; 339 | let index = load_index () in 340 | Doi_map.iter 341 | (fun doi metadata -> 342 | match metadata.containerID with 343 | | None -> 344 | flush_line ((decode_doi doi) ^ "no_equations=" ^ (string_of_int metadata.no_eqns)) 345 | | Some containerID -> 346 | flush_line ((decode_doi doi) ^ " containerID=" ^ containerID ^ " no_equations=" ^ (string_of_int metadata.no_eqns))) 347 | index.metadata; 348 | let no_eqns = Doi_map.fold (fun _ metadata total -> metadata.no_eqns+total) index.metadata 0 in 349 | flush_line ("Total number of equations: " ^ (string_of_int no_eqns)) 350 | 351 | let list_one doi = 352 | let doi = encode_doi doi in 353 | flush_line ("couchdb is at " ^ couchdb_url); 354 | flush_line "Loading index"; 355 | let index = load_index () in 356 | flush_line ("Searching for " ^ doi); 357 | try 358 | let metadata = Doi_map.find (encode_doi doi) index.metadata in 359 | 
(match metadata.containerID with 360 | | None -> 361 | flush_line ((decode_doi doi) ^ "no_equations=" ^ (string_of_int metadata.no_eqns)) 362 | | Some containerID -> 363 | flush_line ((decode_doi doi) ^ " containerID=" ^ containerID ^ " no_equations=" ^ (string_of_int metadata.no_eqns))) 364 | with Not_found -> 365 | flush_line "DOI not indexed" 366 | 367 | (* Main *) 368 | 369 | open Arg 370 | let _ = parse 371 | [("-init", Unit init_index, ": Create an empty index") 372 | ;("-update", Unit run_updates, ": Update the index") 373 | ;("-query", Unit handle_queries, ": Handle index queries as a couchdb _external") 374 | ;("-list_all", Unit list_all, ": List all indexed keys") 375 | ;("-list", String list_one, ": List the entry for a given key")] 376 | ignore 377 | "Use 'index -help' for available options" 378 | -------------------------------------------------------------------------------- /src/latex.ml: -------------------------------------------------------------------------------- 1 | (* 2 | The internal representation of preprocessed latex strings. 3 | The string elements are hashed to save space and speed up comparisons. 4 | The json input is produced by the python preprocessor. 
5 | *) 6 | 7 | type element = 8 | | Command of string 9 | | Text of string 10 | 11 | type t = int array 12 | 13 | let empty () = Array.make 0 0 14 | 15 | let of_array array = array 16 | 17 | exception Parse_error 18 | 19 | let rec element_of_json json = 20 | match json with 21 | | Json_type.Object [(command,json)] -> (Command command) :: element_list_of_json json 22 | | Json_type.String text -> [Text text] 23 | | _ -> raise Parse_error 24 | and element_list_of_json json = 25 | match json with 26 | | Json_type.Array jsons -> List.concat (List.map element_of_json jsons) 27 | | _ -> raise Parse_error 28 | 29 | (* Parsing elements from json *) 30 | let of_json json = 31 | Array.of_list (List.map Hashtbl.hash (element_list_of_json json)) 32 | 33 | (* Defined to make json-static happy, not used *) 34 | let to_json latex = Json_type.Null 35 | 36 | let length = Array.length 37 | 38 | type pos = int 39 | 40 | let compare_suffix (latexL, pos1) (latexR, pos2) = 41 | let n1, n2 = length latexL, length latexR in 42 | let rec compare_suffix' pos1 pos2 = 43 | match (pos1 >= n1, pos2 >= n2) with 44 | | (true, true) -> 0 45 | | (true, false) -> -1 46 | | (false, true) -> 1 47 | | (false, false) -> 48 | let cmp = compare latexL.(pos1) latexR.(pos2) in 49 | if cmp < 0 then -1 else 50 | if cmp > 0 then 1 else 51 | compare_suffix' (pos1+1) (pos2+1) in 52 | compare_suffix' pos1 pos2 53 | 54 | let is_prefix (latexL, pos1) (latexR, pos2) = 55 | let n1, n2 = length latexL, length latexR in 56 | let rec is_prefix' pos1 pos2 = 57 | if pos1 >= n1 then true else 58 | if pos2 >= n2 then false else 59 | if latexL.(pos1) != latexR.(pos2) then false else 60 | is_prefix' (pos1+1) (pos2+1) in 61 | is_prefix' pos1 pos2 62 | 63 | (* Divide latex into k substrings of equal(ish) lengths *) 64 | let fragments latex k = 65 | let n = length latex in 66 | let size = n / k in 67 | let rec fragments' pos larger = 68 | if pos >= n then [] else 69 | let size = if larger > 0 then size+1 else size in 70 | 
(Array.sub latex pos size) :: (fragments' (pos+size) (larger-1)) in 71 | fragments' 0 (n mod k) 72 | 73 | let rec minimum (x : int) y z = 74 | if y < x then minimum y x z else 75 | if z < y then minimum x z y else x 76 | 77 | let cutoff precision latex = 78 | let errors = (1.0 -. precision) *. (float_of_int (length latex)) in 79 | max 1 (min 5 (int_of_float (ceil errors))) 80 | 81 | (* 82 | Calculation of the Levensthein edit distance between two latex strings. 83 | The calculation is left-biased: the left string is matched to any substring of the right string 84 | *) 85 | let distance latexL latexR = 86 | let maxl, maxr = Array.length latexL, Array.length latexR in 87 | if maxl = 0 then 0 else 88 | if maxr = 0 then maxl else 89 | (* cache.(l).(r) is the distance between latexL[l to maxl] and latexR[r to maxr] *) 90 | let cache = Array.make_matrix (maxl + 1) (maxr + 1) 0 in 91 | (* Must match everything on the left *) 92 | for l = maxl - 1 downto 0 do 93 | cache.(l).(maxr) <- 1 + cache.(l+1).(maxr) 94 | done; 95 | (* General matching *) 96 | for l = maxl - 1 downto 1 do 97 | for r = maxr - 1 downto 0 do 98 | cache.(l).(r) <- 99 | minimum 100 | (1 + cache.(l).(r+1)) 101 | (1 + cache.(l+1).(r)) 102 | ((abs (compare latexL.(l) latexR.(r))) + cache.(l+1).(r+1)) 103 | done done; 104 | (* Non-matches on the right dont count until left starts matching *) 105 | for r = maxr - 1 downto 0 do 106 | cache.(0).(r) <- 107 | minimum 108 | (cache.(0).(r+1)) 109 | (1 + cache.(1).(r)) 110 | ((abs (compare latexL.(0) latexR.(r))) + cache.(1).(r+1)) 111 | done; 112 | cache.(0).(0) 113 | 114 | let similar precision latexL latexR = 115 | let dist = distance latexL latexR in 116 | if dist < cutoff precision latexL then Some dist else None 117 | -------------------------------------------------------------------------------- /src/latex.mli: -------------------------------------------------------------------------------- 1 | type t = int array 2 | 3 | val empty : unit -> t 4 | val length : 
t -> int 5 | 6 | val of_array : int array -> t 7 | 8 | val of_json : Json_type.t -> t 9 | val to_json : t -> Json_type.t 10 | 11 | type pos = int 12 | 13 | val compare_suffix : (t * pos) -> (t * pos) -> int 14 | val is_prefix : (t * pos) -> (t * pos) -> bool 15 | val fragments : t -> int -> t list 16 | 17 | val cutoff : float -> t -> int 18 | val distance : t -> t -> int 19 | val similar : float -> t -> t -> int option 20 | -------------------------------------------------------------------------------- /src/myMap.ml: -------------------------------------------------------------------------------- 1 | module type S = 2 | sig 3 | include Map.S 4 | 5 | val update : key -> ('a -> 'a) -> 'a -> 'a t -> 'a t 6 | val count : 'a t -> int 7 | val to_list : 'a t -> (key * 'a) list 8 | val find_with : key -> 'a -> 'a t -> 'a 9 | val filter_map : ('a -> 'b option) -> 'a t -> 'b t 10 | end 11 | 12 | module Make (Ord : Map.OrderedType) : (S with type key = Ord.t) = 13 | struct 14 | include Map.Make (Ord) 15 | 16 | let update key f default map = 17 | add key (try f (find key map) with Not_found -> default) map 18 | 19 | let count map = fold (fun _ _ n -> n+1) map 0 20 | 21 | let to_list map = fold (fun k v rest -> (k,v) :: rest) map [] 22 | 23 | let find_with key default map = 24 | try 25 | find key map 26 | with Not_found -> 27 | default 28 | 29 | let filter_map f map = 30 | fold 31 | (fun key value map -> 32 | match (f value) with 33 | | None -> map 34 | | Some value -> add key value map) 35 | map 36 | empty 37 | end 38 | -------------------------------------------------------------------------------- /src/myMap.mli: -------------------------------------------------------------------------------- 1 | module type S = 2 | sig 3 | include Map.S 4 | 5 | val update : key -> ('a -> 'a) -> 'a -> 'a t -> 'a t 6 | val count : 'a t -> int 7 | val to_list : 'a t -> (key * 'a) list 8 | val find_with : key -> 'a -> 'a t -> 'a 9 | val filter_map : ('a -> 'b option) -> 'a t -> 'b t 10 | end 
11 | 12 | module Make (Ord : Map.OrderedType) : (S with type key = Ord.t) 13 | -------------------------------------------------------------------------------- /src/pid.ml: -------------------------------------------------------------------------------- 1 | (* Prevents multiple update processes from running in parallel *) 2 | 3 | let lock () = 4 | try 5 | Util.flush_line "Checking pid file"; 6 | let pid_file = open_in "run/update.pid" in 7 | let pid = try input_line pid_file with End_of_file -> "" in 8 | begin 9 | match (pid, Unix.system ("ps " ^ pid ^ " &> /dev/null")) with 10 | | ("", _) -> 11 | Util.flush_line "No existing pid" 12 | | (pid, Unix.WEXITED 0) -> 13 | Util.flush_line ("Process with pid " ^ pid ^ " already exists"); 14 | raise Exit 15 | | (pid, Unix.WEXITED 1) -> 16 | Util.flush_line ("Process with pid " ^ pid ^ " does not exist") 17 | (* ps may exit with other codes or be killed by a signal; without this catch-all the match raised Match_failure, aborting lock () even though no updater was running *) 18 | | (pid, _) -> 19 | Util.flush_line ("Cannot determine status of process " ^ pid ^ ", assuming it is not running") 20 | end; 21 | close_in pid_file; 22 | let pid_file = open_out "run/update.pid" in 23 | output_string pid_file (string_of_int (Unix.getpid ())); 24 | close_out pid_file 25 | with 26 | | Exit -> 27 | raise Exit 28 | | exc -> 29 | Util.flush_line "Error checking pid in run/update.pid"; 30 | raise exc 31 | -------------------------------------------------------------------------------- /src/pid.mli: -------------------------------------------------------------------------------- 1 | (* Exits if the process in the pid file is alive *) 2 | val lock : unit -> unit 3 | -------------------------------------------------------------------------------- /src/query.ml: -------------------------------------------------------------------------------- 1 | (* Compound boolean queries *) 2 | 3 | (* The query type *) 4 | type t = 5 | | Latex of Latex.t * string (* Store the string version so we can send the query back to the users *) 6 | | And of t * t 7 | | Or of t * t 8 | 9 | let is_blank_string str = 10 | let blank = ref true in 11 | String.iter 12 | (fun char -> 13 | if char <> ' ' 14 | then blank := false 15 | else ()) 16 | str; 17 | 
!blank 18 | 19 | let is_quoted_string str = 20 | (String.get str 0 == '"') && (String.get str (String.length str - 1) == '"') 21 | 22 | open Str 23 | 24 | (* Quick and dirty lexer, delimiters are: "latexstring" ) ( AND OR *) 25 | let token_spec = regexp "\"[^\"]*\"\|(\|)\|AND\|OR" 26 | let lex str = 27 | let tokens = full_split token_spec str in 28 | let tokens = 29 | List.filter 30 | (function 31 | | Text text when is_blank_string text -> false 32 | | _ -> true) 33 | tokens in 34 | Stream.of_list tokens 35 | 36 | (* A simple recursive descent parser *) 37 | let parse_query preprocesser tokens = 38 | let rec parse_atom = 39 | parser 40 | | [< 'Delim "("; q=parse_expr; 'Delim ")" >] -> q 41 | | [< 'Delim delim when is_quoted_string delim >] -> 42 | let text = String.sub delim 1 (String.length delim - 2) in 43 | let (latex, plain) = preprocesser text in 44 | Latex (latex, plain) 45 | 46 | and parse_expr = 47 | parser 48 | | [< q1=parse_atom; stream >] -> 49 | (parser 50 | | [< 'Delim "AND"; q2=parse_expr >] -> And (q1, q2) 51 | | [< 'Delim "OR"; q2=parse_expr >] -> Or (q1, q2) 52 | | [< >] -> q1) 53 | stream 54 | 55 | and parse_query = 56 | parser 57 | | [< q=parse_expr; stream >] -> 58 | Stream.empty stream; q in 59 | 60 | parse_query tokens 61 | 62 | exception Parse_error 63 | 64 | let of_string preprocesser str = 65 | try 66 | parse_query preprocesser (lex str) 67 | with _ -> 68 | raise Parse_error (* Dont care whether the error was parsing the query or preprocessing the latex *) 69 | 70 | let rec to_string query = 71 | match query with 72 | | Latex (_,plain) -> "\"" ^ plain ^ "\"" 73 | | And (query1,query2) -> "(" ^ (to_string query1) ^ " AND " ^ (to_string query2) ^ ")" 74 | | Or (query1,query2) -> "(" ^ (to_string query1) ^ " OR " ^ (to_string query2) ^ ")" 75 | 76 | (* Extending the edit distance on latex strings to edit distance on compound queries *) 77 | let rec distance query latexR = 78 | match query with 79 | | Latex (latexL,_) -> Latex.distance latexL 
latexR 80 | | And (query1,query2) -> max (distance query1 latexR) (distance query2 latexR) 81 | | Or (query1,query2) -> min (distance query1 latexR) (distance query2 latexR) 82 | 83 | let rec similar precision query latexR = 84 | match query with 85 | | Latex (latexL,_) -> 86 | Latex.similar precision latexL latexR 87 | | And (query1, query2) -> 88 | begin 89 | match (similar precision query1 latexR, similar precision query2 latexR) with 90 | | (Some dist1, Some dist2) -> Some (max dist1 dist2) 91 | | _ -> None 92 | end 93 | | Or (query1, query2) -> 94 | begin 95 | match (similar precision query1 latexR, similar precision query2 latexR) with 96 | | (Some dist1, Some dist2) -> Some (min dist1 dist2) 97 | | (Some dist1, None) -> Some dist1 98 | | (None, Some dist2) -> Some dist2 99 | | (None, None) -> None 100 | end 101 | -------------------------------------------------------------------------------- /src/query.mli: -------------------------------------------------------------------------------- 1 | type t = 2 | | Latex of Latex.t * string (* Store the string version so we can send the query back to the users *) 3 | | And of t * t 4 | | Or of t * t 5 | 6 | exception Parse_error 7 | 8 | val of_string : (string -> (Latex.t * string)) -> string -> t 9 | val to_string : t -> string 10 | 11 | val distance : t -> Latex.t -> int 12 | val similar : float -> t -> Latex.t -> int option 13 | -------------------------------------------------------------------------------- /src/suffix.ml: -------------------------------------------------------------------------------- 1 | (* Packed representations of suffixes of strings. 
Used by suffix_array *) 2 | 3 | type id = int 4 | type pos = int 5 | 6 | type t = int 7 | 8 | let pack_size = (Sys.word_size / 2) - 1 9 | let max_size = 1 lsl pack_size 10 | 11 | exception Invalid_suffix of id * pos 12 | 13 | let pack (id, pos) = 14 | if (id < 0) || (id >= max_size) 15 | || (pos < 0) || (pos >= max_size) 16 | then raise (Invalid_suffix (id, pos)) 17 | else pos lor (id lsl pack_size) 18 | 19 | let unpack suffix = 20 | let id = suffix lsr pack_size in 21 | let pos = suffix land (max_size - 1) in 22 | (id, pos) 23 | -------------------------------------------------------------------------------- /src/suffix.mli: -------------------------------------------------------------------------------- 1 | type id = int 2 | type pos = int 3 | 4 | type t 5 | 6 | val max_size : int 7 | 8 | exception Invalid_suffix of id * pos 9 | 10 | val pack : id * pos -> t 11 | val unpack : t -> id * pos 12 | -------------------------------------------------------------------------------- /src/suffix_array.ml: -------------------------------------------------------------------------------- 1 | (* 2 | Suffix arrays storing compressed latex formulae. 
 3 | Allows neighbourhood search by Latex.distance
 4 | *)
 5 |
 6 | open Util
 7 |
 8 | type id = Suffix.id
 9 | type pos = Suffix.pos
10 |
11 | type 'a t =
12 | { latexs : Latex.t DynArray.t
13 | ; opaques : 'a DynArray.t
14 | ; deleted : bool DynArray.t
15 | ; mutable next_id : id
16 | ; mutable array : Suffix.t array
17 | ; mutable unsorted : ('a * Latex.t) list }
18 |
19 | let create () =
20 | { latexs = DynArray.create ()
21 | ; opaques = DynArray.create ()
22 | ; deleted = DynArray.create ()
23 | ; next_id = 0
24 | ; array = [||]
25 | ; unsorted = []}
26 |
27 | (* NOTE(review): Ancient.mark/follow appear to move the sorted array off
      the normal OCaml heap so the GC no longer scans it; the full_major
      then reclaims the original copy -- confirm against the ancient
      library's documentation. *)
    let ancientify sa =
28 | sa.array <- Ancient.follow (Ancient.mark sa.array);
29 | Gc.full_major ()
30 |
31 | (* New entries are only queued here; they become searchable after
      [prepare] has inserted and sorted them. *)
    let add sa latexs =
32 | sa.unsorted <- latexs @ sa.unsorted
33 |
34 | let compare_suffix sa (id1,pos1) (id2,pos2) =
35 | let latexL, latexR = DynArray.get sa.latexs id1, DynArray.get sa.latexs id2 in
36 | Latex.compare_suffix (latexL,pos1) (latexR,pos2)
37 |
38 | (* One packed suffix per position 0..n-1 of the formula with this id. *)
    let suffixes sa id =
39 | let latex = DynArray.get sa.latexs id in
40 | let n = Latex.length latex in
41 | List.map (fun pos -> Suffix.pack (id,pos)) (Util.range 0 n)
42 |
43 | (* Allocates the next id and stores the item; deletion flag starts false. *)
    let insert sa (opaque, latex) =
44 | let id = sa.next_id in
45 | sa.next_id <- id + 1;
46 | DynArray.add sa.opaques opaque;
47 | DynArray.add sa.latexs latex;
48 | DynArray.add sa.deleted false;
49 | id
50 |
51 | (* a little convoluted to keep memory usage as low as possible *)
52 | let prepare sa =
53 | let ids = List.map (insert sa) sa.unsorted in
54 | sa.unsorted <- [];
55 | let new_suffixes = Util.concat_map (suffixes sa) ids in
56 | let old_len = Array.length sa.array in
57 | let new_len = List.length new_suffixes in
58 | let array = Array.make (old_len + new_len) (Suffix.pack (0,0)) in
59 | Array.blit sa.array 0 array 0 old_len;
60 | sa.array <- array;
61 | let index = ref old_len in
62 | List.iter
63 | (fun suffix ->
64 | array.(!index) <- suffix;
65 | index := !index + 1)
66 | new_suffixes;
67 | let cmp suffix1 suffix2 =
68 | let (id1,pos1) = Suffix.unpack
suffix1 in
69 | let (id2,pos2) = Suffix.unpack suffix2 in
70 | compare_suffix sa (id1,pos1) (id2,pos2) in
71 | Array.fast_sort cmp sa.array
72 |
73 | (* Lazy deletion: matching ids are only flagged here and are filtered
      out of result sets at query time by [filter_deleted]. *)
    let delete sa filter =
74 | let deleted_ids =
75 | Util.filter_map
76 | (fun id ->
77 | if filter (DynArray.get sa.opaques id)
78 | then Some id
79 | else None)
80 | (Util.range 0 (DynArray.length sa.opaques)) in
81 | List.iter (fun id -> DynArray.set sa.deleted id true) deleted_ids
82 |
83 | let filter_deleted sa ids =
84 | Hashset.filter (fun id -> not (DynArray.get sa.deleted id)) ids
85 |
86 | (* Does latexL occur as a prefix of the stored suffix (id, pos)? *)
    let is_prefix sa latexL (id,pos) =
87 | let latexR = DynArray.get sa.latexs id in
88 | Latex.is_prefix (latexL,0) (latexR,pos)
89 |
90 | (* latexL <= the stored suffix (id, pos), in suffix order. *)
    let leq sa latexL (id,pos) =
91 | let latexR = DynArray.get sa.latexs id in
92 | (Latex.compare_suffix (latexL,0) (latexR,pos)) <= 0
93 |
94 | (* Exact searching *)
95 |
96 | (* binary search *)
97 | (* Adds to [ids] the id of every item with a suffix of which [latex] is a
      prefix.  [narrow] finds the first index whose suffix is >= latex, then
      [traverse] walks right while the prefix test still holds. *)
    let gather_exact ids sa latex =
98 | (* find beginning of region *)
99 | (* lo < latex *)
100 | (* hi >= latex *)
101 | let rec narrow lo hi =
102 | let mid = lo + ((hi-lo) / 2) in
103 | if lo = mid then hi else
104 | if leq sa latex (Suffix.unpack sa.array.(mid))
105 | then narrow lo mid
106 | else narrow mid hi in
107 | let n = Array.length sa.array in
108 | let rec traverse index =
109 | if index >= n then () else
110 | let (id, pos) = Suffix.unpack sa.array.(index) in
111 | if is_prefix sa latex (id, pos)
112 | then
113 | begin
114 | Hashset.add ids id;
115 | traverse (index+1)
116 | end
117 | else () in
118 | (* BUGFIX: on an empty array [narrow (-1) (-1)] returns -1 and
      [traverse (-1)] would index sa.array.(-1), raising Invalid_argument
      instead of finding nothing.  Guard the empty case explicitly. *)
     if n = 0 then () else traverse (narrow (-1) (n-1))
119 |
120 | (* Exact hits are reported with distance 0. *)
    let exact_match sa id =
121 | (0, DynArray.get sa.opaques id)
122 |
123 | let find_exact sa latex =
124 | let ids = Hashset.create 0 in
125 | gather_exact ids sa latex;
126 | filter_deleted sa ids;
127 | List.map (exact_match sa) (Hashset.to_list ids)
128 |
129 | (* Searching by Latex.distance *)
130 |
131 | (*
132 | The logic behind the approx search is as follows:
133 | Suppose Latex.distance latex corpus_term < k
134 | Then List.exists (fun
fragment -> Latex.distance fragment corpus_term = 0) (Latex.fragments latex k)
135 | *)
136 | (* Candidate generation: exact-search every fragment of the query at
      cutoff [k]; candidates are then re-scored by [approx_match]. *)
    let gather_approx sa precision latex =
137 | let k = Latex.cutoff precision latex in
138 | let ids = Hashset.create 0 in
139 | List.iter (gather_exact ids sa) (Latex.fragments latex k);
140 | ids
141 |
142 | (* Re-score one candidate id; None when it fails the precision test. *)
    let approx_match sa precision latexL id =
143 | let latexR = DynArray.get sa.latexs id in
144 | match Latex.similar precision latexL latexR with
145 | | Some dist ->
146 | let opaque = DynArray.get sa.opaques id in
147 | Some (dist, opaque)
148 | | None ->
149 | None
150 |
151 | let find_approx sa precision latex =
152 | let ids = gather_approx sa precision latex in
153 | filter_deleted sa ids;
154 | Util.filter_map (approx_match sa precision latex) (Hashset.to_list ids)
155 |
156 | (* Searching by Query.distance *)
157 |
158 | (* Candidate sets combine with set intersection for And and union for Or,
      mirroring the max/min combination used by Query.similar. *)
    let rec gather_query sa precision query =
159 | match query with
160 | | Query.Latex (latex, _) -> gather_approx sa precision latex
161 | | Query.And (query1, query2) -> Hashset.inter (gather_query sa precision query1) (gather_query sa precision query2)
162 | | Query.Or (query1, query2) -> Hashset.union (gather_query sa precision query1) (gather_query sa precision query2)
163 |
164 | let query_match sa precision query id =
165 | let latexR = DynArray.get sa.latexs id in
166 | match Query.similar precision query latexR with
167 | | Some dist ->
168 | let opaque = DynArray.get sa.opaques id in
169 | Some (dist, opaque)
170 | | None ->
171 | None
172 |
173 | let find_query sa precision query =
174 | let ids = gather_query sa precision query in
175 | filter_deleted sa ids;
176 | Util.filter_map (query_match sa precision query) (Hashset.to_list ids)
177 |
--------------------------------------------------------------------------------
/src/suffix_array.mli:
--------------------------------------------------------------------------------
 1 | type id = int
 2 | type pos = int
 3 |
 4 | type 'a t =
 5 | { latexs : Latex.t DynArray.t
 6 | ; opaques : 'a
DynArray.t
 7 | ; deleted : bool DynArray.t
 8 | ; mutable next_id : id
 9 | ; mutable array : Suffix.t array
10 | ; mutable unsorted : ('a * Latex.t) list }
11 |
12 | val create : unit -> 'a t
13 | val ancientify : 'a t -> unit
14 |
15 | val add : 'a t -> ('a * Latex.t) list -> unit
16 | val prepare : 'a t -> unit
17 |
18 | val delete : 'a t -> ('a -> bool) -> unit
19 |
20 | val find_exact : 'a t -> Latex.t -> (int * 'a) list
21 | val find_approx : 'a t -> float -> Latex.t -> (int * 'a) list
22 | val find_query : 'a t -> float -> Query.t -> (int * 'a) list
23 |
--------------------------------------------------------------------------------
/src/suffix_array_test.ml:
--------------------------------------------------------------------------------
 1 | let random_array length gen =
 2 | Array.map (fun _ -> gen ()) (Array.make length 0)
 3 |
 4 | let random_list length gen =
 5 | Array.to_list (random_array length gen)
 6 |
 7 | let random_latex_element () =
 8 | Random.int 50
 9 |
10 | let random_latex max_length =
11 | let length = (1 + Random.int max_length) in
12 | Latex.of_array (random_array length random_latex_element)
13 |
14 | (* Random opaque identifiers.  FIX: String.create was deprecated in OCaml
      4.02 and removed in 4.06, and it returned uninitialised memory, which
      also risked accidental collisions between opaques (random_corpus below
      deletes every item equal to the first opaque).  Fill with random bytes
      instead; requires OCaml >= 4.02 for String.init. *)
    let random_string max_length =
15 | let length = (1 + Random.int max_length) in
16 | String.init length (fun _ -> Char.chr (Random.int 256))
17 |
18 | let rec random_query max_length =
19 | match Random.int 6 with
20 | | 0 -> Query.And (random_query max_length, random_query max_length)
21 | | 1 -> Query.Or (random_query max_length, random_query max_length)
22 | | _ -> Query.Latex (random_latex max_length, "")
23 |
24 | (* Builds a prepared corpus of n random items, deletes the first one so
      tests also exercise lazy deletion, and ancientifies the array.
      The irrefutable-looking pattern below assumes n >= 1. *)
    let random_corpus n =
25 | let latexs = random_list n (fun () -> random_latex 1000) in
26 | let opaques = random_list n (fun () -> random_string 1000) in
27 | let items = List.combine opaques latexs in
28 | let sa = Suffix_array.create () in
29 | Suffix_array.add sa items;
30 | Suffix_array.prepare sa;
31 | let ((opaque,latex)::items) = items in
32 | Suffix_array.delete sa ((=) opaque);
33 | Suffix_array.ancientify sa;
34 | (items,
sa)
35 |
36 | (* Runs a brute-force reference [test] and the suffix-array [find] on the
      same random corpus of size n; prints Pass!/Fail! and returns
      (agreed, reference count, actual count). *)
    let test_find test find n =
37 | let (items, sa) = random_corpus n in
38 | let test_result = List.sort compare (test items) in
39 | let real_result = List.sort compare (List.map (fun (_,opaque) -> opaque) (find sa)) in
40 | if test_result <> real_result then Util.flush_line "Fail!" else Util.flush_line "Pass!";
41 | (test_result = real_result, List.length test_result, List.length real_result)
42 |
43 | let exact_match latexL latexR =
44 | Latex.distance latexL latexR = 0
45 |
46 | let test_find_exact n =
47 | let latexL = random_latex 5 in
48 | let test items =
49 | Util.filter_map
50 | (fun (id,latexR) ->
51 | if exact_match latexL latexR
52 | then Some id
53 | else None)
54 | items in
55 | let find sa =
56 | Suffix_array.find_exact sa latexL in
57 | test_find test find n
58 |
59 | let approx_match precision latexL latexR =
60 | Latex.similar precision latexL latexR <> None
61 |
62 | let test_find_approx n =
63 | let latexL = random_latex 5 in
64 | let precision = Random.float 1.0 in
65 | let test items =
66 | Util.filter_map
67 | (fun (id,latexR) ->
68 | if approx_match precision latexL latexR
69 | then Some id
70 | else None)
71 | items in
72 | let find sa =
73 | Suffix_array.find_approx sa precision latexL in
74 | test_find test find n
75 |
76 | let rec query_match precision query latexR =
77 | Query.similar precision query latexR <> None
78 |
79 | let test_find_query n =
80 | let query = random_query 5 in
81 | let precision = Random.float 1.0 in
82 | let test items =
83 | Util.filter_map
84 | (fun (id,latexR) ->
85 | if query_match precision query latexR
86 | then Some id
87 | else None)
88 | items in
89 | let find sa =
90 | Suffix_array.find_query sa precision query in
91 | test_find test find n
92 |
93 | (* At precision 1.0 the approx search should return exactly the set of
      exact matches -- the reference here is [exact_match]. *)
    let test_find_max_precision n =
94 | let latexL = random_latex 5 in
95 | let test items =
96 | Util.filter_map
97 | (fun (id,latexR) ->
98 | if exact_match latexL latexR
99 | then Some id
100 | else None)
101 | items in
102 | let find sa =
103
| Suffix_array.find_approx sa 1.0 latexL in
104 | test_find test find n
105 |
--------------------------------------------------------------------------------
/src/suffix_test.ml:
--------------------------------------------------------------------------------
 1 | (* Round-trip test: unpack (pack (id, pos)) must be the identity for any
      pair inside the representable range [0, Suffix.max_size). *)
    let test_pack n =
 2 | for i = 0 to n do
 3 | let (id, pos) = (Random.int Suffix.max_size, Random.int Suffix.max_size) in
 4 | if (id, pos) = Suffix.unpack (Suffix.pack (id,pos))
 5 | then () (* Util.flush_line "Pass!" *)
 6 | else Util.flush_line ("Fail!: " ^ (string_of_int id) ^ " " ^ (string_of_int pos))
 7 | done
 8 |
--------------------------------------------------------------------------------
/src/test.mltop:
--------------------------------------------------------------------------------
 1 | DynArray
 2 | Hashset
 3 | MyMap
 4 | Util
 5 | Latex
 6 | Query
 7 | Suffix
 8 | Suffix_array
 9 | Suffix_test
10 | Suffix_array_test
--------------------------------------------------------------------------------
/src/util.ml:
--------------------------------------------------------------------------------
 1 | let flush_line str = print_string str; print_string "\n"; flush stdout
 2 |
 3 | (* NOTE: minimum/maximum are deliberately partial -- they raise
      Match_failure on the empty list. *)
    let minimum (l::ls) = List.fold_left min l ls
 4 |
 5 | let maximum (l::ls) = List.fold_left max l ls
 6 |
 7 | (* Keep the [Some] results of [f], in order.  FIX: the old version walked
      the list three times (map, filter, map) and relied on a non-exhaustive
      match; a single fold_right pass is behaviour-identical. *)
    let filter_map f ls =
 8 |   List.fold_right
 9 |     (fun l acc ->
10 |       match f l with
11 |       | Some a -> a :: acc
12 |       | None -> acc)
     ls []
13 |
14 | let concat_map f ls =
15 | List.fold_right (@) (List.map f ls) []
16 |
17 | (* Half-open integer range [start, finish).  Not tail-recursive. *)
    let rec range start finish =
18 | if start < finish then start :: range (start+1) finish else []
19 |
20 | (* Fairly hackish method of sucking out stream elements *)
21 | let list_of_stream stream = Stream.npeek max_int stream
22 |
23 | (* Unmarshals the whole file; any failure (missing file, bad format, ...)
      is reported on stdout and converted into Exit. *)
    let load_data filename =
24 | try
25 | let data_file = open_in_bin filename in
26 | let data = Marshal.from_channel data_file in
27 | close_in data_file; data
28 | with _ ->
29 | flush_line ("Error opening file " ^ filename);
30 | raise Exit
31 |
32 | let save_data filename
data =
33 | try
34 | let data_file = open_out_bin (filename ^ "_tmp") in
35 | Marshal.to_channel data_file data [];
36 | close_out data_file;
37 | (* Write to a temp file first, then rename over the target, so an
      existing file is never left half-written. *)
     Unix.rename (filename ^ "_tmp") filename
38 | with _ ->
39 | flush_line ("Error saving to file " ^ filename);
40 | raise Exit
41 |
42 | (* Tune the gc for lots of garbage *)
43 | open Gc
44 | (* Enlarges the minor heap and major increment and raises space_overhead
      so collections run less often on allocation-heavy workloads. *)
    let expect_garbage () =
45 | let m = 1024 * 1024 in
46 | Gc.set
47 | {(Gc.get ()) with
48 | minor_heap_size = 256 * m;
49 | major_heap_increment = 64 * m;
50 | space_overhead = 200
51 | }
52 |
53 | (* Runs f with backtrace recording enabled; prints the backtrace on any
      exception instead of propagating it, and discards f's result. *)
    let backtrace f =
54 | Printexc.record_backtrace true;
55 | try Printexc.print f (); () with _ -> Printexc.print_backtrace stdout
56 |
--------------------------------------------------------------------------------
/src/util.mli:
--------------------------------------------------------------------------------
 1 | val flush_line : string -> unit
 2 |
 3 | val minimum : 'a list -> 'a
 4 | val maximum : 'a list -> 'a
 5 |
 6 | val filter_map : ('a -> 'b option) -> 'a list -> 'b list
 7 | val concat_map : ('a -> 'b list) -> 'a list -> 'b list
 8 |
 9 | val range : int -> int -> int list
10 |
11 | val list_of_stream : 'a Stream.t -> 'a list
12 |
13 | val load_data : string -> 'a
14 | val save_data : string -> 'a -> unit
15 |
16 | val backtrace : (unit -> 'a) -> unit
17 |
--------------------------------------------------------------------------------
/start:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | dir=$(dirname $0)
 3 | # -b: start couchdb in the background, -c: extra config file,
     # -p: where to write the pid file
     couchdb -b -c $dir/db.ini -p $dir/run/couchdb.pid
 4 |
--------------------------------------------------------------------------------
/stop:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | dir=$(dirname $0)
 3 | # -d: shut down the instance whose pid file is given with -p
     couchdb -p $dir/run/couchdb.pid -d
 4 |
--------------------------------------------------------------------------------