├── .gitignore ├── COPYRIGHT ├── README ├── config ├── HDTMRBuilder.xml ├── lubm-dictionary.xml └── lubm-triples.xml ├── iface └── org │ └── rdfhdt │ ├── hdt │ └── trans │ │ └── TransientElement.java │ └── mrbuilder │ ├── io │ ├── TripleComparator.java │ └── TripleWritable.java │ └── triples │ └── TriplesMapper.java ├── pom.xml └── src └── org └── rdfhdt ├── hdt ├── compact │ ├── bitmap │ │ └── TransientBitmap375.java │ └── sequence │ │ └── TransientSequenceLog64.java ├── dictionary │ └── impl │ │ ├── FourSectionDictionary2.java │ │ └── section │ │ ├── DictionarySectionFactory2.java │ │ └── TransientDictionarySection.java ├── hdt │ └── impl │ │ └── TransientHDT.java └── triples │ ├── ScapedTripleString.java │ └── impl │ └── TransientBitMapTriples.java ├── listener └── HDTBuilderListener.java └── mrbuilder ├── HDTBuilderConfiguration.java ├── HDTBuilderDriver.java ├── dictionary ├── DictionaryCombiner.java ├── DictionaryMapper.java ├── DictionaryReducer.java ├── DictionarySamplerMapper.java └── DictionarySamplerReducer.java ├── io ├── TripleSPOComparator.java └── TripleSPOWritable.java ├── triples └── TriplesSPOMapper.java └── util └── FileStatusComparator.java /.gitignore: -------------------------------------------------------------------------------- 1 | # Mac OS X 2 | .DS_Store 3 | 4 | # Editor backup files 5 | *~ 6 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 2.1, February 1999 3 | 4 | Copyright (C) 1991, 1999 Free Software Foundation, Inc. 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | [This is the first released version of the Lesser GPL. 
It also counts 10 | as the successor of the GNU Library Public License, version 2, hence 11 | the version number 2.1.] 12 | 13 | Preamble 14 | 15 | The licenses for most software are designed to take away your 16 | freedom to share and change it. By contrast, the GNU General Public 17 | Licenses are intended to guarantee your freedom to share and change 18 | free software--to make sure the software is free for all its users. 19 | 20 | This license, the Lesser General Public License, applies to some 21 | specially designated software packages--typically libraries--of the 22 | Free Software Foundation and other authors who decide to use it. You 23 | can use it too, but we suggest you first think carefully about whether 24 | this license or the ordinary General Public License is the better 25 | strategy to use in any particular case, based on the explanations below. 26 | 27 | When we speak of free software, we are referring to freedom of use, 28 | not price. Our General Public Licenses are designed to make sure that 29 | you have the freedom to distribute copies of free software (and charge 30 | for this service if you wish); that you receive source code or can get 31 | it if you want it; that you can change the software and use pieces of 32 | it in new free programs; and that you are informed that you can do 33 | these things. 34 | 35 | To protect your rights, we need to make restrictions that forbid 36 | distributors to deny you these rights or to ask you to surrender these 37 | rights. These restrictions translate to certain responsibilities for 38 | you if you distribute copies of the library or if you modify it. 39 | 40 | For example, if you distribute copies of the library, whether gratis 41 | or for a fee, you must give the recipients all the rights that we gave 42 | you. You must make sure that they, too, receive or can get the source 43 | code. 
If you link other code with the library, you must provide 44 | complete object files to the recipients, so that they can relink them 45 | with the library after making changes to the library and recompiling 46 | it. And you must show them these terms so they know their rights. 47 | 48 | We protect your rights with a two-step method: (1) we copyright the 49 | library, and (2) we offer you this license, which gives you legal 50 | permission to copy, distribute and/or modify the library. 51 | 52 | To protect each distributor, we want to make it very clear that 53 | there is no warranty for the free library. Also, if the library is 54 | modified by someone else and passed on, the recipients should know 55 | that what they have is not the original version, so that the original 56 | author's reputation will not be affected by problems that might be 57 | introduced by others. 58 | 59 | Finally, software patents pose a constant threat to the existence of 60 | any free program. We wish to make sure that a company cannot 61 | effectively restrict the users of a free program by obtaining a 62 | restrictive license from a patent holder. Therefore, we insist that 63 | any patent license obtained for a version of the library must be 64 | consistent with the full freedom of use specified in this license. 65 | 66 | Most GNU software, including some libraries, is covered by the 67 | ordinary GNU General Public License. This license, the GNU Lesser 68 | General Public License, applies to certain designated libraries, and 69 | is quite different from the ordinary General Public License. We use 70 | this license for certain libraries in order to permit linking those 71 | libraries into non-free programs. 72 | 73 | When a program is linked with a library, whether statically or using 74 | a shared library, the combination of the two is legally speaking a 75 | combined work, a derivative of the original library. 
The ordinary 76 | General Public License therefore permits such linking only if the 77 | entire combination fits its criteria of freedom. The Lesser General 78 | Public License permits more lax criteria for linking other code with 79 | the library. 80 | 81 | We call this license the "Lesser" General Public License because it 82 | does Less to protect the user's freedom than the ordinary General 83 | Public License. It also provides other free software developers Less 84 | of an advantage over competing non-free programs. These disadvantages 85 | are the reason we use the ordinary General Public License for many 86 | libraries. However, the Lesser license provides advantages in certain 87 | special circumstances. 88 | 89 | For example, on rare occasions, there may be a special need to 90 | encourage the widest possible use of a certain library, so that it becomes 91 | a de-facto standard. To achieve this, non-free programs must be 92 | allowed to use the library. A more frequent case is that a free 93 | library does the same job as widely used non-free libraries. In this 94 | case, there is little to gain by limiting the free library to free 95 | software only, so we use the Lesser General Public License. 96 | 97 | In other cases, permission to use a particular library in non-free 98 | programs enables a greater number of people to use a large body of 99 | free software. For example, permission to use the GNU C Library in 100 | non-free programs enables many more people to use the whole GNU 101 | operating system, as well as its variant, the GNU/Linux operating 102 | system. 103 | 104 | Although the Lesser General Public License is Less protective of the 105 | users' freedom, it does ensure that the user of a program that is 106 | linked with the Library has the freedom and the wherewithal to run 107 | that program using a modified version of the Library. 108 | 109 | The precise terms and conditions for copying, distribution and 110 | modification follow. 
Pay close attention to the difference between a 111 | "work based on the library" and a "work that uses the library". The 112 | former contains code derived from the library, whereas the latter must 113 | be combined with the library in order to run. 114 | 115 | GNU LESSER GENERAL PUBLIC LICENSE 116 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 117 | 118 | 0. This License Agreement applies to any software library or other 119 | program which contains a notice placed by the copyright holder or 120 | other authorized party saying it may be distributed under the terms of 121 | this Lesser General Public License (also called "this License"). 122 | Each licensee is addressed as "you". 123 | 124 | A "library" means a collection of software functions and/or data 125 | prepared so as to be conveniently linked with application programs 126 | (which use some of those functions and data) to form executables. 127 | 128 | The "Library", below, refers to any such software library or work 129 | which has been distributed under these terms. A "work based on the 130 | Library" means either the Library or any derivative work under 131 | copyright law: that is to say, a work containing the Library or a 132 | portion of it, either verbatim or with modifications and/or translated 133 | straightforwardly into another language. (Hereinafter, translation is 134 | included without limitation in the term "modification".) 135 | 136 | "Source code" for a work means the preferred form of the work for 137 | making modifications to it. For a library, complete source code means 138 | all the source code for all modules it contains, plus any associated 139 | interface definition files, plus the scripts used to control compilation 140 | and installation of the library. 141 | 142 | Activities other than copying, distribution and modification are not 143 | covered by this License; they are outside its scope. 
The act of 144 | running a program using the Library is not restricted, and output from 145 | such a program is covered only if its contents constitute a work based 146 | on the Library (independent of the use of the Library in a tool for 147 | writing it). Whether that is true depends on what the Library does 148 | and what the program that uses the Library does. 149 | 150 | 1. You may copy and distribute verbatim copies of the Library's 151 | complete source code as you receive it, in any medium, provided that 152 | you conspicuously and appropriately publish on each copy an 153 | appropriate copyright notice and disclaimer of warranty; keep intact 154 | all the notices that refer to this License and to the absence of any 155 | warranty; and distribute a copy of this License along with the 156 | Library. 157 | 158 | You may charge a fee for the physical act of transferring a copy, 159 | and you may at your option offer warranty protection in exchange for a 160 | fee. 161 | 162 | 2. You may modify your copy or copies of the Library or any portion 163 | of it, thus forming a work based on the Library, and copy and 164 | distribute such modifications or work under the terms of Section 1 165 | above, provided that you also meet all of these conditions: 166 | 167 | a) The modified work must itself be a software library. 168 | 169 | b) You must cause the files modified to carry prominent notices 170 | stating that you changed the files and the date of any change. 171 | 172 | c) You must cause the whole of the work to be licensed at no 173 | charge to all third parties under the terms of this License. 
174 | 175 | d) If a facility in the modified Library refers to a function or a 176 | table of data to be supplied by an application program that uses 177 | the facility, other than as an argument passed when the facility 178 | is invoked, then you must make a good faith effort to ensure that, 179 | in the event an application does not supply such function or 180 | table, the facility still operates, and performs whatever part of 181 | its purpose remains meaningful. 182 | 183 | (For example, a function in a library to compute square roots has 184 | a purpose that is entirely well-defined independent of the 185 | application. Therefore, Subsection 2d requires that any 186 | application-supplied function or table used by this function must 187 | be optional: if the application does not supply it, the square 188 | root function must still compute square roots.) 189 | 190 | These requirements apply to the modified work as a whole. If 191 | identifiable sections of that work are not derived from the Library, 192 | and can be reasonably considered independent and separate works in 193 | themselves, then this License, and its terms, do not apply to those 194 | sections when you distribute them as separate works. But when you 195 | distribute the same sections as part of a whole which is a work based 196 | on the Library, the distribution of the whole must be on the terms of 197 | this License, whose permissions for other licensees extend to the 198 | entire whole, and thus to each and every part regardless of who wrote 199 | it. 200 | 201 | Thus, it is not the intent of this section to claim rights or contest 202 | your rights to work written entirely by you; rather, the intent is to 203 | exercise the right to control the distribution of derivative or 204 | collective works based on the Library. 
205 | 206 | In addition, mere aggregation of another work not based on the Library 207 | with the Library (or with a work based on the Library) on a volume of 208 | a storage or distribution medium does not bring the other work under 209 | the scope of this License. 210 | 211 | 3. You may opt to apply the terms of the ordinary GNU General Public 212 | License instead of this License to a given copy of the Library. To do 213 | this, you must alter all the notices that refer to this License, so 214 | that they refer to the ordinary GNU General Public License, version 2, 215 | instead of to this License. (If a newer version than version 2 of the 216 | ordinary GNU General Public License has appeared, then you can specify 217 | that version instead if you wish.) Do not make any other change in 218 | these notices. 219 | 220 | Once this change is made in a given copy, it is irreversible for 221 | that copy, so the ordinary GNU General Public License applies to all 222 | subsequent copies and derivative works made from that copy. 223 | 224 | This option is useful when you wish to copy part of the code of 225 | the Library into a program that is not a library. 226 | 227 | 4. You may copy and distribute the Library (or a portion or 228 | derivative of it, under Section 2) in object code or executable form 229 | under the terms of Sections 1 and 2 above provided that you accompany 230 | it with the complete corresponding machine-readable source code, which 231 | must be distributed under the terms of Sections 1 and 2 above on a 232 | medium customarily used for software interchange. 233 | 234 | If distribution of object code is made by offering access to copy 235 | from a designated place, then offering equivalent access to copy the 236 | source code from the same place satisfies the requirement to 237 | distribute the source code, even though third parties are not 238 | compelled to copy the source along with the object code. 239 | 240 | 5. 
A program that contains no derivative of any portion of the 241 | Library, but is designed to work with the Library by being compiled or 242 | linked with it, is called a "work that uses the Library". Such a 243 | work, in isolation, is not a derivative work of the Library, and 244 | therefore falls outside the scope of this License. 245 | 246 | However, linking a "work that uses the Library" with the Library 247 | creates an executable that is a derivative of the Library (because it 248 | contains portions of the Library), rather than a "work that uses the 249 | library". The executable is therefore covered by this License. 250 | Section 6 states terms for distribution of such executables. 251 | 252 | When a "work that uses the Library" uses material from a header file 253 | that is part of the Library, the object code for the work may be a 254 | derivative work of the Library even though the source code is not. 255 | Whether this is true is especially significant if the work can be 256 | linked without the Library, or if the work is itself a library. The 257 | threshold for this to be true is not precisely defined by law. 258 | 259 | If such an object file uses only numerical parameters, data 260 | structure layouts and accessors, and small macros and small inline 261 | functions (ten lines or less in length), then the use of the object 262 | file is unrestricted, regardless of whether it is legally a derivative 263 | work. (Executables containing this object code plus portions of the 264 | Library will still fall under Section 6.) 265 | 266 | Otherwise, if the work is a derivative of the Library, you may 267 | distribute the object code for the work under the terms of Section 6. 268 | Any executables containing that work also fall under Section 6, 269 | whether or not they are linked directly with the Library itself. 270 | 271 | 6. 
As an exception to the Sections above, you may also combine or 272 | link a "work that uses the Library" with the Library to produce a 273 | work containing portions of the Library, and distribute that work 274 | under terms of your choice, provided that the terms permit 275 | modification of the work for the customer's own use and reverse 276 | engineering for debugging such modifications. 277 | 278 | You must give prominent notice with each copy of the work that the 279 | Library is used in it and that the Library and its use are covered by 280 | this License. You must supply a copy of this License. If the work 281 | during execution displays copyright notices, you must include the 282 | copyright notice for the Library among them, as well as a reference 283 | directing the user to the copy of this License. Also, you must do one 284 | of these things: 285 | 286 | a) Accompany the work with the complete corresponding 287 | machine-readable source code for the Library including whatever 288 | changes were used in the work (which must be distributed under 289 | Sections 1 and 2 above); and, if the work is an executable linked 290 | with the Library, with the complete machine-readable "work that 291 | uses the Library", as object code and/or source code, so that the 292 | user can modify the Library and then relink to produce a modified 293 | executable containing the modified Library. (It is understood 294 | that the user who changes the contents of definitions files in the 295 | Library will not necessarily be able to recompile the application 296 | to use the modified definitions.) 297 | 298 | b) Use a suitable shared library mechanism for linking with the 299 | Library. 
A suitable mechanism is one that (1) uses at run time a 300 | copy of the library already present on the user's computer system, 301 | rather than copying library functions into the executable, and (2) 302 | will operate properly with a modified version of the library, if 303 | the user installs one, as long as the modified version is 304 | interface-compatible with the version that the work was made with. 305 | 306 | c) Accompany the work with a written offer, valid for at 307 | least three years, to give the same user the materials 308 | specified in Subsection 6a, above, for a charge no more 309 | than the cost of performing this distribution. 310 | 311 | d) If distribution of the work is made by offering access to copy 312 | from a designated place, offer equivalent access to copy the above 313 | specified materials from the same place. 314 | 315 | e) Verify that the user has already received a copy of these 316 | materials or that you have already sent this user a copy. 317 | 318 | For an executable, the required form of the "work that uses the 319 | Library" must include any data and utility programs needed for 320 | reproducing the executable from it. However, as a special exception, 321 | the materials to be distributed need not include anything that is 322 | normally distributed (in either source or binary form) with the major 323 | components (compiler, kernel, and so on) of the operating system on 324 | which the executable runs, unless that component itself accompanies 325 | the executable. 326 | 327 | It may happen that this requirement contradicts the license 328 | restrictions of other proprietary libraries that do not normally 329 | accompany the operating system. Such a contradiction means you cannot 330 | use both them and the Library together in an executable that you 331 | distribute. 332 | 333 | 7. 
You may place library facilities that are a work based on the 334 | Library side-by-side in a single library together with other library 335 | facilities not covered by this License, and distribute such a combined 336 | library, provided that the separate distribution of the work based on 337 | the Library and of the other library facilities is otherwise 338 | permitted, and provided that you do these two things: 339 | 340 | a) Accompany the combined library with a copy of the same work 341 | based on the Library, uncombined with any other library 342 | facilities. This must be distributed under the terms of the 343 | Sections above. 344 | 345 | b) Give prominent notice with the combined library of the fact 346 | that part of it is a work based on the Library, and explaining 347 | where to find the accompanying uncombined form of the same work. 348 | 349 | 8. You may not copy, modify, sublicense, link with, or distribute 350 | the Library except as expressly provided under this License. Any 351 | attempt otherwise to copy, modify, sublicense, link with, or 352 | distribute the Library is void, and will automatically terminate your 353 | rights under this License. However, parties who have received copies, 354 | or rights, from you under this License will not have their licenses 355 | terminated so long as such parties remain in full compliance. 356 | 357 | 9. You are not required to accept this License, since you have not 358 | signed it. However, nothing else grants you permission to modify or 359 | distribute the Library or its derivative works. These actions are 360 | prohibited by law if you do not accept this License. Therefore, by 361 | modifying or distributing the Library (or any work based on the 362 | Library), you indicate your acceptance of this License to do so, and 363 | all its terms and conditions for copying, distributing or modifying 364 | the Library or works based on it. 365 | 366 | 10. 
Each time you redistribute the Library (or any work based on the 367 | Library), the recipient automatically receives a license from the 368 | original licensor to copy, distribute, link with or modify the Library 369 | subject to these terms and conditions. You may not impose any further 370 | restrictions on the recipients' exercise of the rights granted herein. 371 | You are not responsible for enforcing compliance by third parties with 372 | this License. 373 | 374 | 11. If, as a consequence of a court judgment or allegation of patent 375 | infringement or for any other reason (not limited to patent issues), 376 | conditions are imposed on you (whether by court order, agreement or 377 | otherwise) that contradict the conditions of this License, they do not 378 | excuse you from the conditions of this License. If you cannot 379 | distribute so as to satisfy simultaneously your obligations under this 380 | License and any other pertinent obligations, then as a consequence you 381 | may not distribute the Library at all. For example, if a patent 382 | license would not permit royalty-free redistribution of the Library by 383 | all those who receive copies directly or indirectly through you, then 384 | the only way you could satisfy both it and this License would be to 385 | refrain entirely from distribution of the Library. 386 | 387 | If any portion of this section is held invalid or unenforceable under any 388 | particular circumstance, the balance of the section is intended to apply, 389 | and the section as a whole is intended to apply in other circumstances. 390 | 391 | It is not the purpose of this section to induce you to infringe any 392 | patents or other property right claims or to contest validity of any 393 | such claims; this section has the sole purpose of protecting the 394 | integrity of the free software distribution system which is 395 | implemented by public license practices. 
Many people have made 396 | generous contributions to the wide range of software distributed 397 | through that system in reliance on consistent application of that 398 | system; it is up to the author/donor to decide if he or she is willing 399 | to distribute software through any other system and a licensee cannot 400 | impose that choice. 401 | 402 | This section is intended to make thoroughly clear what is believed to 403 | be a consequence of the rest of this License. 404 | 405 | 12. If the distribution and/or use of the Library is restricted in 406 | certain countries either by patents or by copyrighted interfaces, the 407 | original copyright holder who places the Library under this License may add 408 | an explicit geographical distribution limitation excluding those countries, 409 | so that distribution is permitted only in or among countries not thus 410 | excluded. In such case, this License incorporates the limitation as if 411 | written in the body of this License. 412 | 413 | 13. The Free Software Foundation may publish revised and/or new 414 | versions of the Lesser General Public License from time to time. 415 | Such new versions will be similar in spirit to the present version, 416 | but may differ in detail to address new problems or concerns. 417 | 418 | Each version is given a distinguishing version number. If the Library 419 | specifies a version number of this License which applies to it and 420 | "any later version", you have the option of following the terms and 421 | conditions either of that version or of any later version published by 422 | the Free Software Foundation. If the Library does not specify a 423 | license version number, you may choose any version ever published by 424 | the Free Software Foundation. 425 | 426 | 14. If you wish to incorporate parts of the Library into other free 427 | programs whose distribution conditions are incompatible with these, 428 | write to the author to ask for permission. 
For software which is 429 | copyrighted by the Free Software Foundation, write to the Free 430 | Software Foundation; we sometimes make exceptions for this. Our 431 | decision will be guided by the two goals of preserving the free status 432 | of all derivatives of our free software and of promoting the sharing 433 | and reuse of software generally. 434 | 435 | NO WARRANTY 436 | 437 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO 438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR 440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY 441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE 442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 443 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE 444 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME 445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 446 | 447 | 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN 448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY 449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU 450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR 451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING 453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A 454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF 455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 456 | DAMAGES. 457 | 458 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | ======================== 2 | HDT-MR Library. 
3 | ======================== 4 | 5 | Copyright (C) 2015, Jose M. Gimenez-Garcia, Javier D. Fernandez, Miguel A. Martinez-Prieto 6 | All rights reserved. 7 | 8 | This library is free software; you can redistribute it and/or 9 | modify it under the terms of the GNU Lesser General Public 10 | License as published by the Free Software Foundation; either 11 | version 2.1 of the License, or (at your option) any later version. 12 | 13 | This library is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 | Lesser General Public License for more details. 17 | 18 | You should have received a copy of the GNU Lesser General Public 19 | License along with this library; if not, write to the Free Software 20 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 21 | 22 | Visit our Web Page: dataweb.infor.uva.es/projects/hdt-mr 23 | 24 | Contacting the authors: 25 | Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 26 | Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 27 | Miguel A. Martinez-Prieto: migumar2@infor.uva.es 28 | 29 | 30 | Overview 31 | ================= 32 | 33 | HDT-MR improves the HDT-java library by introducing MapReduce as the computation model for large HDT serialization. HDT-MR performs in linear time with the dataset size and has proven able to serialize datasets up to 4.42 billion triples, preserving HDT compression and retrieval features. 34 | 35 | HDT-java is a Java library that implements the W3C Submission (http://www.w3.org/Submission/2011/03/) of the RDF HDT (Header-Dictionary-Triples) binary format for publishing and exchanging RDF data at large scale. Its compact representation allows storing RDF in less space, while providing direct access to the stored information. See rdfhdt.org for further information. 
36 | 37 | 38 | 39 | HDT-MR provides three components: 40 | - iface: Provides an API to use HDT-MR, including interfaces and abstract classes 41 | - src: Core library and command line tools for using HDT-MR. It allows creating HDT files from RDF. 42 | - config: Examples of configuration files 43 | Note that the current distribution is an alpha version. Therefore, while this build has been tested, it is still subject to bugs and optimizations. 44 | 45 | 46 | 47 | 48 | Compiling 49 | ================= 50 | Dependencies: 51 | * HDT-java (https://code.google.com/p/hdt-java/). 52 | *** src/org/rdfhdt/hdt includes the classes that have been modified/extended 53 | 54 | Command line tools 55 | ================= 56 | 57 | The tool provides the following main command line tool: 58 | 59 | Usage: hadoop HDTBuilderDriver [options] 60 | Options: 61 | -a, --awsbucket 62 | Amazon Web Services bucket 63 | -bu, --baseURI 64 | Base URI for the dataset 65 | -b, --basedir 66 | Root directory for the process 67 | -bd, --builddictionary 68 | Whether to build HDT dictionary or not 69 | -bh, --buildhdt 70 | Whether to build HDT or not 71 | -c, --conf 72 | Path to configuration file 73 | -dd, --deleteoutputdictionary 74 | Delete dictionary job output path before running job 75 | -dt, --deleteoutputtriples 76 | Delete triples job output path before running job 77 | -dsd, --deletesampledictionary 78 | Delete dictionary job sample path before running job 79 | -dst, --deletesampletriples 80 | Delete triples job sample path before running job 81 | -d, --dictionarydistribution 82 | Dictionary distribution among mappers and reducers 83 | -fd, --filedictionary 84 | Name of hdt dictionary file 85 | -fr, --fileobjects 86 | Name of hdt dictionary file for Reducers 87 | -fm, --filesubjects 88 | Name of hdt dictionary file for Mappers 89 | -hc, --hdtconf 90 | Conversion config file 91 | -x, --index 92 | Generate also external indices to solve all queries 93 | -i, --input 94 | Path to input files. 
Relative to basedir 95 | -it, --inputtriples 96 | Path to triples job input files. Relative to basedir 97 | -nd, --namedictionaryjob 98 | Name of dictionary job 99 | -fh, --namehdtfile 100 | Name of hdt file 101 | -nt, --nametriplesjob 102 | Name of triples job 103 | -o, --options 104 | HDT Conversion options (override those of config file) 105 | -od, --outputdictionary 106 | Path to dictionary job output files. Relative to basedir 107 | -ot, --outputtriples 108 | Path to triples job output files. Relative to basedir 109 | -q, --quiet 110 | Do not show progress of the conversion 111 | -t, --rdftype 112 | Type of RDF Input (ntriples, nquad, n3, turtle, rdfxml) 113 | -Rd, --reducersdictionary 114 | Number of reducers for dictionary job 115 | -Rds, --reducersdictionarysampling 116 | Number of reducers for dictionary input sampling job 117 | -Rt, --reducerstriples 118 | Number of reducers for triples job 119 | -Rts, --reducerstriplessampling 120 | Number of reducers for triples input sampling job 121 | -rd, --rundictionary 122 | Whether to run dictionary job or not 123 | -rds, --rundictionarysampling 124 | Whether to run dictionary input sampling job or not 125 | -rt, --runtriples 126 | Whether to run triples job or not 127 | -rts, --runtriplessampling 128 | Whether to run triples input sampling job or not 129 | -p, --sampleprobability 130 | Probability of using each element for sampling 131 | -sd, --samplesdictionary 132 | Path to dictionary job sample files. Relative to basedir 133 | -st, --samplestriples 134 | Path to triples job sample files. Relative to basedir 135 | 136 | 137 | Usage example 138 | ================= 139 | 140 | After installation, run: 141 | 142 | $ hadoop HDTBuilderDriver 143 | # This first try to read configuration parameters at the default config file (HDTMRBuilder.xml), using default values for those missing parameters. 
It reads RDF input data from the default 'input' folder and outputs the HDT conversion in 'output.hdt' 144 | 145 | $ hadoop HDTBuilderDriver -i mashup 146 | # Same as the previous example, but it reads RDF input data from the directory 'mashup' 147 | 148 | $ hadoop HDTBuilderDriver -c lubm-dictionary.xml -p 0.01 149 | # It uses 'lubm-dictionary.xml' as the configuration file. This file states that input data must be taken from the 'lubm' directory and it forces to compute only the HDT dictionary, which is written in 'dictionary/dictionary.hdt' 150 | # It uses 0.01 as the probability of using each element for sampling. 151 | 152 | 153 | $ hadoop HDTBuilderDriver -c lubm-triples.xml -Rt 1 -Rts 1 154 | # It uses 'lubm-triples.xml' as the configuration file. This file states that input data must be taken from the 'lubm' directory and it forces to compute the HDT triples and the final HDT representation by taking the already computed dictionary in 'dictionary/dictionary.hdt' 155 | # It forces to use one reducer in both jobs. 156 | 157 | License 158 | =============== 159 | 160 | All HDT-MR content is licensed under the Lesser General Public License. 161 | 162 | Acknowledgements 163 | ================ 164 | 165 | HDT-MR is a project partially funded by Ministerio de Economia y Competitividad, Spain: TIN2013-46238-C4-3-R, and Austrian Science Fund (FWF): M1720-G11. 166 | 167 | 168 | -------------------------------------------------------------------------------- /config/HDTMRBuilder.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | global.path.base 5 | . 
6 | Root directory 7 | 8 | 9 | 10 | global.path.input 11 | input 12 | input path 13 | 14 | 15 | 16 | job.dictionary.path.output 17 | d 18 | Dictionary output path / Triples input path 19 | 20 | 21 | 22 | job.dictionary.path.output.delete 23 | true 24 | Whether to delete dictionary output path 25 | 26 | 27 | 28 | job.dictionary.path.sample 29 | s 30 | Dictionary sample path 31 | 32 | 33 | 34 | job.dictionary.path.sample.delete 35 | true 36 | Whether to delete dictionary sample path 37 | 38 | 39 | 40 | job.triples.path.output 41 | t 42 | Triples output path 43 | 44 | 45 | 46 | job.triples.path.output.delete 47 | true 48 | Whether to delete triples output path 49 | 50 | 51 | 52 | job.dictionary.reducers 53 | 10 54 | Number of reducers used by jobs 55 | 56 | 57 | 58 | job.triples.reducers 59 | 10 60 | Number of reducers used by jobs 61 | 62 | 63 | 64 | job.dictionary.sample.probability 65 | 0.000001 66 | Sampler Probability 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /config/lubm-dictionary.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | job.dictionary.run 5 | true 6 | 7 | 8 | 9 | job.dictionary.sample.run 10 | true 11 | 12 | 13 | 14 | job.dictionary.sample.reducers 15 | 10 16 | 17 | 18 | 19 | hdt.dictionary.build 20 | true 21 | 22 | 23 | 24 | job.triples.run 25 | false 26 | 27 | 28 | 29 | job.triples.sample.run 30 | false 31 | 32 | 33 | 34 | hdt.build 35 | false 36 | 37 | 38 | 39 | global.path.base 40 | . 
41 | Root directory 42 | 43 | 44 | 45 | global.path.input 46 | lubm 47 | input path 48 | 49 | 50 | 51 | job.dictionary.path.output 52 | dictionary 53 | Dictionary output path / Triples input path 54 | 55 | 56 | 57 | job.dictionary.path.output.delete 58 | true 59 | Whether to delete dictionary output path 60 | 61 | 62 | 63 | job.dictionary.path.sample 64 | dictionary_sample 65 | Dictionary samples path 66 | 67 | 68 | 69 | job.dictionary.path.sample.delete 70 | true 71 | Whether to delete dictionary samples path 72 | 73 | 74 | 75 | job.dictionary.reducers 76 | 10 77 | Number of reducers used by jobs 78 | 79 | 80 | 81 | job.dictionary.sample.probability 82 | 0.000001 83 | Sampler Probability 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /config/lubm-triples.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | job.dictionary.run 5 | false 6 | 7 | 8 | 9 | job.dictionary.sample.run 10 | false 11 | 12 | 13 | 14 | hdt.dictionary.build 15 | false 16 | 17 | 18 | 19 | job.triples.run 20 | true 21 | 22 | 23 | 24 | job.triples.sample.run 25 | true 26 | 27 | 28 | 29 | hdt.build 30 | true 31 | 32 | 33 | 34 | global.path.base 35 | . 
36 | Root directory 37 | 38 | 39 | 40 | global.path.input 41 | lubm 42 | input path 43 | 44 | 45 | 46 | job.dictionary.path.output 47 | dictionary 48 | Dictionary output path / Triples input path 49 | 50 | 51 | 52 | job.triples.path.output.delete 53 | true 54 | Whether to delete triples output path 55 | 56 | 57 | 58 | job.triples.path.sample 59 | triples_sample 60 | Tripls samples path 61 | 62 | 63 | 64 | job.triples.path.sample.delete 65 | true 66 | Whether to delete tripls samples path 67 | 68 | 69 | 70 | job.triples.reducers 71 | 10 72 | Number of reducers used by jobs 73 | 74 | 75 | 76 | job.triples.sample.probability 77 | 0.000001 78 | Sampler Probability 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /iface/org/rdfhdt/hdt/trans/TransientElement.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package org.rdfhdt.hdt.trans; 5 | 6 | import java.io.IOException; 7 | 8 | import org.apache.hadoop.io.SequenceFile; 9 | import org.rdfhdt.hdt.listener.ProgressListener; 10 | 11 | /** 12 | * @author chemi 13 | * 14 | */ 15 | public interface TransientElement { 16 | 17 | public void initialize(long numentries); 18 | 19 | public void load(SequenceFile.Reader input, ProgressListener listener) throws IOException; 20 | 21 | public void close() throws IOException; 22 | 23 | } 24 | -------------------------------------------------------------------------------- /iface/org/rdfhdt/mrbuilder/io/TripleComparator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 
8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.io; 24 | 25 | import org.apache.hadoop.io.WritableComparable; 26 | import org.apache.hadoop.io.WritableComparator; 27 | 28 | /** 29 | * @author chemi 30 | * 31 | */ 32 | @SuppressWarnings("rawtypes") 33 | public abstract class TripleComparator extends WritableComparator { 34 | 35 | public TripleComparator(Class keyClass, boolean createInstances) { 36 | super(keyClass, createInstances); 37 | } 38 | 39 | public TripleComparator(Class keyClass) { 40 | super(keyClass); 41 | } 42 | 43 | @SuppressWarnings("unchecked") 44 | @Override 45 | public int compare(WritableComparable wc1, WritableComparable wc2) { 46 | TW key1 = (TW) wc1; 47 | TW key2 = (TW) wc2; 48 | return key1.compareTo(key2); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /iface/org/rdfhdt/mrbuilder/io/TripleWritable.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. 
Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. 
Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.io; 24 | 25 | import java.io.DataInput; 26 | import java.io.DataOutput; 27 | import java.io.IOException; 28 | 29 | import org.apache.hadoop.io.WritableComparable; 30 | 31 | /** 32 | * @author chemi 33 | * 34 | */ 35 | 36 | @SuppressWarnings("rawtypes") 37 | public abstract class TripleWritable implements WritableComparable> { 38 | 39 | protected S subject; 40 | protected P predicate; 41 | protected O object; 42 | 43 | /** 44 | * 45 | */ 46 | public TripleWritable(S subject, P predicate, O object) { 47 | this.setSubject(subject); 48 | this.setPredicate(predicate); 49 | this.setObject(object); 50 | } 51 | 52 | /** 53 | * @return the subject 54 | */ 55 | public S getSubject() { 56 | return this.subject; 57 | } 58 | 59 | /** 60 | * @param subject 61 | * the subject to set 62 | */ 63 | public void setSubject(S subject) { 64 | this.subject = subject; 65 | } 66 | 67 | /** 68 | * @return the predicate 69 | */ 70 | public P getPredicate() { 71 | return this.predicate; 72 | } 73 | 74 | /** 75 | * @param predicate 76 | * the predicate to set 77 | */ 78 | public void setPredicate(P predicate) { 79 | this.predicate = predicate; 80 | } 81 | 82 | /** 83 | * @return the object 84 | */ 85 | public O getObject() { 86 | return this.object; 87 | } 88 | 89 | /** 90 | * @param object 91 | * the object to set 92 | */ 93 | public void setObject(O object) { 94 | this.object = object; 95 | } 96 | 97 | /* 98 | * (non-Javadoc) 99 | * 100 | * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) 101 | */ 102 | @Override 103 | public void readFields(DataInput input) throws IOException { 104 | this.subject.readFields(input); 105 | this.predicate.readFields(input); 106 | this.object.readFields(input); 107 | } 108 | 109 | /* 110 | * (non-Javadoc) 111 | * 112 | * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) 113 | */ 114 | @Override 115 | public void write(DataOutput output) throws IOException 
{ 116 | this.subject.write(output); 117 | this.predicate.write(output); 118 | this.object.write(output); 119 | } 120 | 121 | /* 122 | * (non-Javadoc) 123 | * 124 | * @see java.lang.Comparable#compareTo(java.lang.Object) 125 | */ 126 | @Override 127 | public int compareTo(TripleWritable otherKey) { 128 | int comparison; 129 | if ((comparison = this.compareSubjectTo(otherKey)) == 0) 130 | if ((comparison = this.comparePredicateTo(otherKey)) == 0) 131 | comparison = this.compareObjectTo(otherKey); 132 | return comparison; 133 | } 134 | 135 | public int compareSubjectTo(TripleWritable otherKey) { 136 | return this.compareRole(this.getSubject(), otherKey.getSubject()); 137 | } 138 | 139 | public int comparePredicateTo(TripleWritable otherKey) { 140 | return this.compareRole(this.getPredicate(), otherKey.getPredicate()); 141 | } 142 | 143 | public int compareObjectTo(TripleWritable otherKey) { 144 | return this.compareRole(this.getObject(), otherKey.getObject()); 145 | } 146 | 147 | @SuppressWarnings("unchecked") 148 | protected int compareRole(WritableComparable wc1, WritableComparable wc2) { 149 | return (wc1.compareTo(wc2) < 0) ? -1 : ((wc1.compareTo(wc2) > 0) ? 1 : 0); 150 | } 151 | 152 | /* 153 | * (non-Javadoc) 154 | * 155 | * @see java.lang.Object#toString() 156 | */ 157 | @Override 158 | public String toString() { 159 | return this.subject + " " + this.predicate + " " + this.object; 160 | } 161 | 162 | } 163 | -------------------------------------------------------------------------------- /iface/org/rdfhdt/mrbuilder/triples/TriplesMapper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. 
Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. 
Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.triples; 24 | 25 | import java.io.BufferedInputStream; 26 | import java.io.File; 27 | import java.io.FileInputStream; 28 | import java.io.IOException; 29 | 30 | import org.apache.hadoop.filecache.DistributedCache; 31 | import org.apache.hadoop.fs.Path; 32 | import org.apache.hadoop.io.LongWritable; 33 | import org.apache.hadoop.io.Text; 34 | import org.apache.hadoop.io.WritableComparable; 35 | import org.apache.hadoop.mapreduce.Mapper; 36 | import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary; 37 | import org.rdfhdt.hdt.exceptions.ParserException; 38 | import org.rdfhdt.hdt.listener.ProgressListener; 39 | import org.rdfhdt.hdt.triples.TripleString; 40 | import org.rdfhdt.hdt.util.io.CountInputStream; 41 | import org.rdfhdt.mrbuilder.HDTBuilderConfiguration; 42 | import org.rdfhdt.mrbuilder.HDTBuilderDriver.Counters; 43 | import org.rdfhdt.mrbuilder.io.TripleWritable; 44 | 45 | @SuppressWarnings("rawtypes") 46 | public abstract class TriplesMapper extends Mapper implements ProgressListener { 47 | 48 | protected FourSectionDictionary dictionary; 49 | protected HDTBuilderConfiguration conf; 50 | 51 | /* 52 | * (non-Javadoc) 53 | * 54 | * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context) 55 | */ 56 | @Override 57 | protected void setup(Context context) throws IOException, InterruptedException { 58 | 59 | Path[] cache = DistributedCache.getLocalCacheFiles(context.getConfiguration()); 60 | 61 | this.conf = new HDTBuilderConfiguration(context.getConfiguration()); 62 | CountInputStream input = new CountInputStream(new BufferedInputStream(new FileInputStream(cache[0].toString()))); 63 | File file = new File(cache[0].toString()); 64 | this.dictionary = new FourSectionDictionary(this.conf.getSpec()); 65 | this.dictionary.mapFromFile(input, file, this); 66 | input.close(); 67 | 68 | // DEBUG 69 | // ((PFCDictionarySection) 
this.dictionary.getShared()).dumpAll(); 70 | } 71 | 72 | @Override 73 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 74 | TripleString tripleString = new TripleString(); 75 | 76 | try { 77 | tripleString.read(value.toString()); 78 | } catch (ParserException e) { 79 | // TODO Auto-generated catch block 80 | e.printStackTrace(); 81 | } 82 | 83 | context.write(this.key(tripleString), this.value(tripleString)); 84 | context.getCounter(Counters.Triples).increment(1); 85 | } 86 | 87 | @Override 88 | public void notifyProgress(float level, String message) { 89 | // if (!this.conf.getQuiet()) { 90 | System.out.print("\r" + message + "\t" + Float.toString(level) + " \r"); 91 | } 92 | 93 | protected abstract K key(TripleString tripleString) throws InterruptedException; 94 | 95 | protected abstract V value(TripleString tripleString); 96 | 97 | } 98 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | org.rdfhdt 4 | hdt-mr 5 | 2.0 6 | HDT MapReduce 7 | jar 8 | 9 | 10 | UTF-8 11 | 1.8 12 | 1.8 13 | 14 | 15 | 16 | 17 | org.rdfhdt 18 | hdt-api 19 | 2.0 20 | 21 | 22 | org.rdfhdt 23 | hdt-java-core 24 | 2.0 25 | 26 | 27 | org.apache.hadoop 28 | hadoop-common 29 | 2.7.0 30 | 31 | 32 | org.apache.hadoop 33 | hadoop-mapreduce-client-core 34 | 2.6.0 35 | 36 | 37 | com.hadoop.gplcompression 38 | hadoop-lzo 39 | 0.4.20-SNAPSHOT 40 | 41 | 42 | commons-lang 43 | commons-lang 44 | 2.1 45 | 46 | 47 | org.codehaus.plexus 48 | plexus-utils 49 | 1.1 50 | 51 | 52 | 53 | 54 | 55 | . 
56 | 57 | 58 | org.apache.maven.plugins 59 | maven-assembly-plugin 60 | 61 | 62 | iface/**/*.java 63 | src/**/*.java 64 | 65 | 66 | 67 | org.rdfhdt.mrbuilder.HDTBuilderDriver 68 | 69 | 70 | 71 | jar-with-dependencies 72 | 73 | 74 | 75 | 76 | make-assembly 77 | package 78 | 79 | single 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /src/org/rdfhdt/hdt/compact/bitmap/TransientBitmap375.java: -------------------------------------------------------------------------------- 1 | package org.rdfhdt.hdt.compact.bitmap; 2 | 3 | import java.io.IOException; 4 | import java.io.OutputStream; 5 | import java.util.UUID; 6 | 7 | import org.apache.commons.io.IOUtils; 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.rdfhdt.hdt.compact.integer.VByte; 12 | import org.rdfhdt.hdt.listener.ProgressListener; 13 | import org.rdfhdt.hdt.util.BitUtil; 14 | import org.rdfhdt.hdt.util.crc.CRC32; 15 | import org.rdfhdt.hdt.util.crc.CRC8; 16 | import org.rdfhdt.hdt.util.crc.CRCOutputStream; 17 | import org.rdfhdt.hdt.util.io.IOUtil; 18 | 19 | public class TransientBitmap375 extends Bitmap375 { 20 | 21 | protected OutputStream tempOutput; 22 | protected int bufferSize; 23 | protected int previousWordIndex; 24 | protected long nbits; 25 | private long totalbits = 0; 26 | private long totalwords = 0; 27 | 28 | protected FileSystem fileSystem; 29 | protected Path file; 30 | protected String fileName; 31 | 32 | public TransientBitmap375(int bufferSize) { 33 | super(); 34 | this.bufferSize = bufferSize; 35 | this.previousWordIndex = wordIndex(0); 36 | } 37 | 38 | public TransientBitmap375(int bufferSize, long nbits, FileSystem fs, Path path) throws IOException { 39 | super(Math.min(bufferSize, nbits)); 40 | 41 | this.bufferSize = bufferSize; 42 | this.nbits = nbits; 43 | this.previousWordIndex = wordIndex(0); 44 | 45 | 
this.fileName = UUID.randomUUID().toString(); 46 | 47 | if (fs == null) { 48 | fs = FileSystem.getLocal(new Configuration()); 49 | } 50 | if (path == null) { 51 | path = new Path("."); 52 | } 53 | 54 | this.fileSystem = fs; 55 | this.file = new Path(path, this.fileName); 56 | this.tempOutput = this.fileSystem.create(this.file); 57 | 58 | } 59 | 60 | @Override 61 | public long getNumBits() { 62 | return this.totalbits; 63 | } 64 | 65 | // @Override 66 | // public void append(boolean value) { 67 | // this.set(this.numbits++, value); 68 | // } 69 | 70 | @Override 71 | public void set(long bitIndex, boolean value) { 72 | if ((this.previousWordIndex >= this.bufferSize) && (this.previousWordIndex != wordIndex(bitIndex))) { 73 | try { 74 | // System.out.println("bitIndex = " + bitIndex); 75 | // System.out.println("numbits = " + this.numbits); 76 | this.flushData(); 77 | super.set(0, value); 78 | this.previousWordIndex = wordIndex(0); 79 | } catch (IOException e) { 80 | // TODO Auto-generated catch block 81 | e.printStackTrace(); 82 | } 83 | } else { 84 | super.set(bitIndex, value); 85 | this.previousWordIndex = wordIndex(bitIndex); 86 | } 87 | } 88 | 89 | private void flushData() throws IOException { 90 | 91 | // System.out.println("flushing bitmap " + this.fileName + " with " + this.numbits + " bits"); 92 | // System.out.println("Bits from last word = " + lastWordNumBits(this.numbits)); 93 | 94 | this.totalbits += this.numbits - 1; 95 | 96 | int numwords = (int) numWords(this.numbits - 1); 97 | 98 | this.totalwords += numwords; 99 | 100 | for (int i = 0; i < numwords; i++) { 101 | IOUtil.writeLong(this.tempOutput, this.words[i]); 102 | } 103 | this.words = new long[(int) numWords(this.nbits)]; 104 | this.numbits = 0; 105 | this.previousWordIndex = wordIndex(0); 106 | } 107 | 108 | public void close() throws IOException { 109 | 110 | this.totalbits += this.numbits; 111 | 112 | int numwords = (int) numWords(this.numbits); 113 | 114 | this.totalwords += numwords; 115 | 116 
| // System.out.println("Closing bitmap."); 117 | // System.out.println("Writing " + this.totalbits + " bits"); 118 | // System.out.println("There should be " + this.nbits + " bits"); 119 | // System.out.println("Writing " + this.totalwords + "words"); 120 | // System.out.println("Bits from last word = " + lastWordNumBits(this.numbits)); 121 | 122 | for (int i = 0; i < numwords - 1; i++) { 123 | IOUtil.writeLong(this.tempOutput, this.words[i]); 124 | } 125 | 126 | if (numwords > 0) { 127 | // Write only used bits from last entry (byte aligned, little endian) 128 | int lastWordUsed = lastWordNumBits(this.numbits); 129 | BitUtil.writeLowerBitsByteAligned(this.words[numwords - 1], lastWordUsed, this.tempOutput); 130 | } 131 | 132 | this.tempOutput.flush(); 133 | this.tempOutput.close(); 134 | 135 | this.words = new long[0]; 136 | } 137 | 138 | @Override 139 | public void save(OutputStream output, ProgressListener listener) throws IOException { 140 | CRCOutputStream out = new CRCOutputStream(output, new CRC8()); 141 | 142 | // Write Type and Numbits 143 | out.write(BitmapFactory.TYPE_BITMAP_PLAIN); 144 | VByte.encode(out, this.totalbits); 145 | 146 | // Write CRC 147 | out.writeCRC(); 148 | 149 | // Setup new CRC 150 | out.setCRC(new CRC32()); 151 | 152 | // FileInputStream input = new FileInputStream(this.fileName); 153 | // long bytesCopied = Files.copy(this.fileSystem.open(this.file), out); 154 | long bytesCopied = IOUtils.copyLarge(this.fileSystem.open(this.file), out); 155 | // input.close(); 156 | this.fileSystem.delete(this.file, true); 157 | System.out.println("bytes copied from " + this.fileName + " = " + bytesCopied); 158 | 159 | // System.out.println("CRC = " + out.getCRC().getValue()); 160 | out.writeCRC(); 161 | 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /src/org/rdfhdt/hdt/compact/sequence/TransientSequenceLog64.java: 
-------------------------------------------------------------------------------- 1 | package org.rdfhdt.hdt.compact.sequence; 2 | 3 | import java.io.IOException; 4 | import java.io.OutputStream; 5 | import java.util.UUID; 6 | 7 | import org.apache.commons.io.IOUtils; 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.rdfhdt.hdt.compact.integer.VByte; 12 | import org.rdfhdt.hdt.listener.ProgressListener; 13 | import org.rdfhdt.hdt.util.BitUtil; 14 | import org.rdfhdt.hdt.util.crc.CRC32; 15 | import org.rdfhdt.hdt.util.crc.CRC8; 16 | import org.rdfhdt.hdt.util.crc.CRCOutputStream; 17 | import org.rdfhdt.hdt.util.io.IOUtil; 18 | 19 | public class TransientSequenceLog64 extends SequenceLog64 { 20 | 21 | protected OutputStream tempOutput; 22 | protected long bufferSize, maxentries; 23 | protected long capacity; 24 | private long totalentries, totalwords; 25 | 26 | protected FileSystem fileSystem; 27 | protected Path file; 28 | protected String fileName; 29 | 30 | public TransientSequenceLog64(int bufferSize) throws IOException { 31 | this(bufferSize, W); 32 | } 33 | 34 | public TransientSequenceLog64(int bufferSize, int numbits) throws IOException { 35 | this(bufferSize, numbits, 0); 36 | } 37 | 38 | public TransientSequenceLog64(int bufferSize, int numbits, long capacity, boolean initialize) throws IOException { 39 | this(bufferSize, numbits, capacity); 40 | if (initialize) { 41 | this.numentries = capacity; 42 | } 43 | } 44 | 45 | public TransientSequenceLog64(int bufferSize, int numbits, long capacity) throws IOException { 46 | this(bufferSize, numbits, capacity, null, null); 47 | } 48 | 49 | public TransientSequenceLog64(int bufferSize, int numbits, long capacity, FileSystem fs, Path path) throws IOException { 50 | super(numbits, Math.min(bufferSize, capacity)); 51 | 52 | this.capacity = capacity; 53 | 54 | // parameter provided as bytes, transform to entries 55 | 
this.maxentries = (int) ((W / (double) numbits) * bufferSize); 56 | 57 | this.fileName = UUID.randomUUID().toString(); 58 | 59 | if (fs == null) { 60 | fs = FileSystem.getLocal(new Configuration()); 61 | } 62 | if (path == null) { 63 | path = new Path("."); 64 | } 65 | 66 | this.fileSystem = fs; 67 | this.file = new Path(path, this.fileName); 68 | this.tempOutput = this.fileSystem.create(this.file); 69 | } 70 | 71 | @Override 72 | public long getNumberOfElements() { 73 | return this.totalentries; 74 | } 75 | 76 | @Override 77 | public void append(long value) { 78 | super.append(value); 79 | 80 | if (this.numentries >= this.maxentries && (lastWordNumBits(this.numbits, this.numentries) == 64)) { 81 | try { 82 | this.flushData(); 83 | } catch (IOException e) { 84 | // TODO Auto-generated catch block 85 | e.printStackTrace(); 86 | } 87 | } 88 | } 89 | 90 | protected void flushData() throws IOException { 91 | // System.out.println("Flushing Sequence"); 92 | 93 | this.totalentries += this.numentries; 94 | 95 | int numwords = (int) numWordsFor(this.numbits, this.numentries); 96 | 97 | this.totalwords += numwords; 98 | 99 | // System.out.println("Remaining bits =" + lastWordNumBits(this.numbits, this.numentries)); 100 | 101 | for (int i = 0; i < numwords; i++) { 102 | IOUtil.writeLong(this.tempOutput, this.data[i]); 103 | } 104 | 105 | long size = numWordsFor(this.numbits, this.numentries); 106 | assert size >= 0 && size <= Integer.MAX_VALUE; 107 | 108 | this.data = new long[Math.max((int) size, 1)]; 109 | this.numentries = 0; 110 | } 111 | 112 | public void close() throws IOException { 113 | 114 | this.totalentries += this.numentries; 115 | 116 | int numwords = (int) numWordsFor(this.numbits, this.numentries); 117 | 118 | this.totalwords += numwords; 119 | 120 | // System.out.println("Closing sequence."); 121 | // System.out.println("Writing " + this.totalentries + " entries"); 122 | // System.out.println("There should be " + this.capacity + " entries"); 123 | // 
System.out.println("Writing " + this.totalwords + "words"); 124 | 125 | // System.out.println("Remaining bits =" + lastWordNumBits(this.numbits, this.numentries)); 126 | 127 | for (int i = 0; i < numwords - 1; i++) { 128 | IOUtil.writeLong(this.tempOutput, this.data[i]); 129 | } 130 | 131 | if (numwords > 0) { 132 | // Write only used bits from last entry (byte aligned, little endian) 133 | int lastWordUsedBits = lastWordNumBits(this.numbits, this.numentries); 134 | BitUtil.writeLowerBitsByteAligned(this.data[numwords - 1], lastWordUsedBits, this.tempOutput); 135 | } 136 | 137 | this.tempOutput.flush(); 138 | this.tempOutput.close(); 139 | 140 | this.data = new long[0]; 141 | } 142 | 143 | @Override 144 | public void save(OutputStream output, ProgressListener listener) throws IOException { 145 | CRCOutputStream out = new CRCOutputStream(output, new CRC8()); 146 | 147 | out.write(SequenceFactory.TYPE_SEQLOG); 148 | out.write(this.numbits); 149 | VByte.encode(out, this.totalentries); 150 | out.writeCRC(); 151 | out.setCRC(new CRC32()); 152 | 153 | // long bytesCopied = Files.copy(this.fileSystem.open(this.file), out); 154 | long bytesCopied = IOUtils.copy(this.fileSystem.open(this.file), out); 155 | System.out.println("bytes copied from " + this.fileName + " = " + bytesCopied); 156 | this.fileSystem.delete(this.file, true); 157 | 158 | // System.out.println("CRC = " + out.getCRC().getValue()); 159 | out.writeCRC(); 160 | } 161 | } 162 | -------------------------------------------------------------------------------- /src/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionary2.java: -------------------------------------------------------------------------------- 1 | package org.rdfhdt.hdt.dictionary.impl; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | 7 | import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; 8 | import org.rdfhdt.hdt.dictionary.impl.section.DictionarySectionCacheAll; 9 | import 
org.rdfhdt.hdt.dictionary.impl.section.DictionarySectionFactory2; 10 | import org.rdfhdt.hdt.exceptions.IllegalFormatException; 11 | import org.rdfhdt.hdt.listener.ProgressListener; 12 | import org.rdfhdt.hdt.options.ControlInfo; 13 | import org.rdfhdt.hdt.options.ControlInformation; 14 | import org.rdfhdt.hdt.options.HDTOptions; 15 | import org.rdfhdt.hdt.util.io.CountInputStream; 16 | import org.rdfhdt.hdt.util.listener.IntermediateListener; 17 | 18 | public class FourSectionDictionary2 extends FourSectionDictionary { 19 | 20 | public FourSectionDictionary2(HDTOptions spec, DictionarySectionPrivate s, DictionarySectionPrivate p, DictionarySectionPrivate o, DictionarySectionPrivate sh) { 21 | super(spec, s, p, o, sh); 22 | } 23 | 24 | public FourSectionDictionary2(HDTOptions spec) { 25 | super(spec); 26 | } 27 | 28 | public void load(InputStream input, ControlInfo ci, ProgressListener listener) throws IOException { 29 | if(ci.getType()!=ControlInfo.Type.DICTIONARY) { 30 | throw new IllegalFormatException("Trying to read a dictionary section, but was not dictionary."); 31 | } 32 | 33 | IntermediateListener iListener = new IntermediateListener(listener); 34 | 35 | shared = DictionarySectionFactory2.loadFrom(input, iListener); 36 | subjects = DictionarySectionFactory2.loadFrom(input, iListener); 37 | predicates = DictionarySectionFactory2.loadFrom(input, iListener); 38 | objects = DictionarySectionFactory2.loadFrom(input, iListener); 39 | } 40 | 41 | @Override 42 | public void mapFromFile(CountInputStream in, File f, ProgressListener listener) throws IOException { 43 | ControlInformation ci = new ControlInformation(); 44 | ci.load(in); 45 | if(ci.getType()!=ControlInfo.Type.DICTIONARY) { 46 | throw new IllegalFormatException("Trying to read a dictionary section, but was not dictionary."); 47 | } 48 | 49 | IntermediateListener iListener = new IntermediateListener(listener); 50 | shared = DictionarySectionFactory2.loadFrom(in, f, iListener); 51 | subjects = 
DictionarySectionFactory2.loadFrom(in, f, iListener); 52 | predicates = DictionarySectionFactory2.loadFrom(in, f, iListener); 53 | objects = DictionarySectionFactory2.loadFrom(in, f, iListener); 54 | 55 | // Use cache only for predicates. Preload only up to 100K predicates. 56 | predicates = new DictionarySectionCacheAll(predicates, predicates.getNumberOfElements()<100000); 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/org/rdfhdt/hdt/dictionary/impl/section/DictionarySectionFactory2.java: -------------------------------------------------------------------------------- 1 | package org.rdfhdt.hdt.dictionary.impl.section; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; 7 | import org.rdfhdt.hdt.listener.ProgressListener; 8 | import org.rdfhdt.hdt.options.HDTSpecification; 9 | 10 | public class DictionarySectionFactory2 extends DictionarySectionFactory { 11 | 12 | 13 | public static DictionarySectionPrivate loadFrom(InputStream input, ProgressListener listener) throws IOException { 14 | if(!input.markSupported()) { 15 | throw new IllegalArgumentException("Need support for mark()/reset(). Please wrap the InputStream with a BufferedInputStream"); 16 | } 17 | input.mark(64); 18 | int dictType = input.read(); 19 | input.reset(); 20 | input.mark(64); // To allow children to reset() and try another instance. 
21 | 22 | DictionarySectionPrivate section=null; 23 | 24 | switch(dictType) { 25 | case PFCDictionarySection.TYPE_INDEX: 26 | try{ 27 | // First try load using the standard PFC 28 | section = new PFCDictionarySection(new HDTSpecification()); 29 | section.load(input, listener); 30 | } catch (IllegalArgumentException e) { 31 | // The PFC Could not load the file because it is too big, use PFCBig 32 | section = new TransientDictionarySection(new HDTSpecification()); 33 | section.load(input, listener); 34 | } 35 | return section; 36 | } 37 | throw new IOException("DictionarySection implementation not available for id "+dictType); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/org/rdfhdt/hdt/dictionary/impl/section/TransientDictionarySection.java: -------------------------------------------------------------------------------- 1 | package org.rdfhdt.hdt.dictionary.impl.section; 2 | 3 | import java.io.ByteArrayOutputStream; 4 | import java.io.IOException; 5 | import java.io.OutputStream; 6 | 7 | import org.apache.hadoop.io.SequenceFile; 8 | import org.apache.hadoop.io.Text; 9 | import org.rdfhdt.hdt.compact.integer.VByte; 10 | import org.rdfhdt.hdt.compact.sequence.SequenceLog64; 11 | import org.rdfhdt.hdt.listener.ProgressListener; 12 | import org.rdfhdt.hdt.options.HDTOptions; 13 | import org.rdfhdt.hdt.trans.TransientElement; 14 | import org.rdfhdt.hdt.util.Mutable; 15 | import org.rdfhdt.hdt.util.crc.CRC32; 16 | import org.rdfhdt.hdt.util.crc.CRC8; 17 | import org.rdfhdt.hdt.util.crc.CRCOutputStream; 18 | import org.rdfhdt.hdt.util.io.IOUtil; 19 | import org.rdfhdt.hdt.util.string.ByteStringUtil; 20 | import org.rdfhdt.hdt.util.string.CompactString; 21 | import org.rdfhdt.hdt.util.string.ReplazableString; 22 | 23 | public class TransientDictionarySection extends PFCDictionarySectionBig implements TransientElement { 24 | 25 | ByteArrayOutputStream byteOut; 26 | CharSequence previousStr; 27 | int buffer; 28 | int 
blockPerBuffer; 29 | long storedBuffersSize; 30 | 31 | public TransientDictionarySection(HDTOptions spec) { 32 | super(spec); 33 | this.blocksize = (int) spec.getInt("pfc.blocksize"); 34 | if (this.blocksize == 0) { 35 | this.blocksize = DEFAULT_BLOCK_SIZE; 36 | } 37 | if (this.blockPerBuffer == 0) { 38 | this.blockPerBuffer = BLOCK_PER_BUFFER; 39 | } 40 | } 41 | 42 | @Override 43 | public void initialize(long numentries) { 44 | this.blocks = new SequenceLog64(63, numentries / this.blocksize); 45 | this.storedBuffersSize = 0; 46 | this.numstrings = 0; 47 | this.byteOut = new ByteArrayOutputStream(16 * 1024); 48 | this.blockPerBuffer = BLOCK_PER_BUFFER / 5; 49 | this.data = new byte[(int) Math.ceil((((double) numentries / this.blocksize) / this.blockPerBuffer))][]; 50 | this.posFirst = new long[this.data.length]; 51 | this.buffer = 0; 52 | this.previousStr = null; 53 | } 54 | 55 | @Override 56 | public void load(SequenceFile.Reader input, ProgressListener listener) throws IOException { 57 | CharSequence str = null; 58 | Text line = new Text(); 59 | 60 | this.posFirst[0] = 0; 61 | while (input.next(line)) { 62 | str = new CompactString(line.toString()); 63 | 64 | if (this.numstrings % this.blocksize == 0) { 65 | // Add new block pointer 66 | // System.out.println(this.storedBuffersSize); 67 | // System.out.println(this.byteOut.size()); 68 | // System.out.println(this.blocksize); 69 | this.blocks.append(this.storedBuffersSize + this.byteOut.size()); 70 | 71 | // if number of block per buffer reached, change buffer 72 | if (((this.blocks.getNumberOfElements() - 1) % this.blockPerBuffer == 0) && ((this.blocks.getNumberOfElements() - 1) / this.blockPerBuffer != 0)) { 73 | this.storedBuffersSize += this.byteOut.size(); 74 | this.storeBuffer(this.buffer); 75 | this.byteOut = new ByteArrayOutputStream(16 * 1024); 76 | if (this.buffer < this.data.length - 1) { 77 | this.posFirst[++this.buffer] = this.storedBuffersSize + this.byteOut.size(); 78 | } 79 | } 80 | 81 | // Copy 
/**
 * Finishes the load: writes the terminating block pointer (which must be
 * appended before trimming), releases over-allocated space in the block
 * sequence, and flushes the current in-memory buffer into the table.
 *
 * @throws IOException if flushing the pending buffer fails
 */
@Override
public void close() throws IOException {
    // Ending block pointer: points just past the last stored string.
    this.blocks.append(this.storedBuffersSize + this.byteOut.size());

    // Trim the block-pointer sequence to its final size.
    this.blocks.aggresiveTrimToSize();

    // Persist whatever remains in the in-memory buffer.
    this.storeBuffer(this.buffer);
}
141 | 142 | out.setCRC(new CRC32()); 143 | for (byte[] buffer : this.data) { 144 | IOUtil.writeBuffer(out, buffer, 0, buffer.length, listener); 145 | } 146 | out.writeCRC(); 147 | } 148 | 149 | /* 150 | * (non-Javadoc) 151 | * 152 | * @see hdt.dictionary.DictionarySection#extract(int) 153 | */ 154 | @Override 155 | public CharSequence extract(int id) { 156 | 157 | // System.out.println("id = " + id); 158 | 159 | if (id < 1 || id > this.numstrings) { 160 | return null; 161 | } 162 | 163 | // Locate block 164 | int blockid = (id - 1) / this.blocksize; 165 | int nstring = (id - 1) % this.blocksize; 166 | 167 | // System.out.println("blockid = " + blockid); 168 | // System.out.println("nstring = " + nstring); 169 | 170 | byte[] block = this.data[blockid / this.blockPerBuffer]; 171 | int pos = (int) (this.blocks.get(blockid) - this.posFirst[blockid / this.blockPerBuffer]); 172 | 173 | // System.out.println("pos = " + pos); 174 | 175 | // Copy first string 176 | int len = ByteStringUtil.strlen(block, pos); 177 | 178 | // System.out.println("len = " + len); 179 | 180 | Mutable delta = new Mutable(0L); 181 | ReplazableString tempString = new ReplazableString(); 182 | tempString.append(block, pos, len); 183 | 184 | // System.out.println("dentro del for"); 185 | 186 | // Copy strings untill we find our's. 187 | for (int i = 0; i < nstring; i++) { 188 | pos += len + 1; 189 | // System.out.println("pos = " + pos); 190 | pos += VByte.decode(block, pos, delta); 191 | // System.out.println("pos = " + pos); 192 | // System.out.println("delta = [" + delta + "]"); 193 | len = ByteStringUtil.strlen(block, pos); 194 | // System.out.println("len = " + len); 195 | tempString.replace(delta.getValue().intValue(), block, pos, len); 196 | // System.out.println("tempstring = [" + tempString + "]"); 197 | } 198 | return tempString; 199 | } 200 | 201 | /** 202 | * Locate the block of a string doing binary search. 
203 | */ 204 | @Override 205 | protected int locateBlock(CharSequence str) { 206 | int low = 0; 207 | int high = (int) this.blocks.getNumberOfElements() - 1; 208 | int max = high; 209 | 210 | while (low <= high) { 211 | int mid = (low + high) >>> 1; 212 | 213 | int cmp; 214 | if (mid == max) { 215 | cmp = -1; 216 | } else { 217 | cmp = ByteStringUtil.strcmp(str, this.data[mid / this.blockPerBuffer], (int) (this.blocks.get(mid) - this.posFirst[mid / this.blockPerBuffer])); 218 | 219 | // if (str.toString().contains("http://dbpedia.org/ontology/Agent") || str.toString().contains("The Health Inspector pays a visit") || str.toString().contains("Crockett_Middle_School") || str.toString().contains("Benthosuchus")) { 220 | // System.out.println("Block: "+ mid + ": "+ ByteStringUtil.asString(data[mid / blockPerBuffer], (int) (this.blocks.get(mid) - this.posFirst[mid / blockPerBuffer])) + " Result: " + cmp); 221 | // } 222 | } 223 | 224 | if (cmp < 0) { 225 | high = mid - 1; 226 | } else if (cmp > 0) { 227 | low = mid + 1; 228 | } else { 229 | return mid; // key found 230 | } 231 | } 232 | return -(low + 1); // key not found. 
233 | } 234 | 235 | @Override 236 | protected int locateInBlock(int blockid, CharSequence str) { 237 | 238 | ReplazableString tempString = new ReplazableString(); 239 | 240 | Mutable delta = new Mutable(0L); 241 | int idInBlock = 0; 242 | int cshared = 0; 243 | 244 | byte[] block = this.data[blockid / this.blockPerBuffer]; 245 | int pos = (int) (this.blocks.get(blockid) - this.posFirst[blockid / this.blockPerBuffer]); 246 | 247 | // Read the first string in the block 248 | int slen = ByteStringUtil.strlen(block, pos); 249 | tempString.append(block, pos, slen); 250 | pos += slen + 1; 251 | idInBlock++; 252 | 253 | while ((idInBlock < this.blocksize) && (pos < block.length)) { 254 | // Decode prefix 255 | pos += VByte.decode(block, pos, delta); 256 | 257 | // Copy suffix 258 | slen = ByteStringUtil.strlen(block, pos); 259 | tempString.replace(delta.getValue().intValue(), block, pos, slen); 260 | 261 | if (delta.getValue() >= cshared) { 262 | // Current delta value means that this string 263 | // has a larger long common prefix than the previous one 264 | // if (str.toString().contains("http://dbpedia.org/ontology/Agent") || str.toString().contains("The Health Inspector pays a visit") || str.toString().contains("Crockett_Middle_School") || str.toString().contains("Benthosuchus")) { 265 | // System.out.println("[" + tempString + "]. cshared [" + cshared + "]"); 266 | // } 267 | cshared += ByteStringUtil.longestCommonPrefix(tempString, str, cshared); 268 | 269 | if ((cshared == str.length()) && (tempString.length() == str.length())) { 270 | break; 271 | } 272 | } else { 273 | // We have less common characters than before, 274 | // this string is bigger that what we are looking for. 275 | // i.e. Not found. 
276 | idInBlock = 0; 277 | break; 278 | } 279 | pos += slen + 1; 280 | idInBlock++; 281 | 282 | } 283 | 284 | // Not found 285 | if (pos == block.length || idInBlock == this.blocksize) { 286 | idInBlock = 0; 287 | } 288 | 289 | return idInBlock; 290 | } 291 | 292 | } 293 | -------------------------------------------------------------------------------- /src/org/rdfhdt/hdt/hdt/impl/TransientHDT.java: -------------------------------------------------------------------------------- 1 | package org.rdfhdt.hdt.hdt.impl; 2 | 3 | import java.io.IOException; 4 | import java.io.OutputStream; 5 | 6 | import org.rdfhdt.hdt.dictionary.DictionaryPrivate; 7 | import org.rdfhdt.hdt.header.HeaderPrivate; 8 | import org.rdfhdt.hdt.listener.ProgressListener; 9 | import org.rdfhdt.hdt.options.HDTOptions; 10 | import org.rdfhdt.hdt.triples.TriplesPrivate; 11 | 12 | /** 13 | * @author José M. Giménez-García 14 | * 15 | * @Note: HDTImpl modified to make fields protected instead of private 16 | * 17 | */ 18 | public class TransientHDT extends HDTImpl { 19 | 20 | public TransientHDT(HDTOptions spec) { 21 | super(spec); 22 | } 23 | 24 | public void setHeader(HeaderPrivate header) { 25 | this.header = header; 26 | } 27 | 28 | public void setDictionary(DictionaryPrivate dictionary) { 29 | this.dictionary = dictionary; 30 | } 31 | 32 | @Override 33 | public void setTriples(TriplesPrivate triples) { 34 | this.triples = triples; 35 | } 36 | 37 | @Override 38 | public void saveToHDT(OutputStream output, ProgressListener listener) throws IOException { 39 | // TODO Auto-generated method stub 40 | super.saveToHDT(output, listener); 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/org/rdfhdt/hdt/triples/ScapedTripleString.java: -------------------------------------------------------------------------------- 1 | package org.rdfhdt.hdt.triples; 2 | 3 | import org.rdfhdt.hdt.exceptions.ParserException; 4 | 5 | /** 6 | * TripleString holds a 
/**
 * Parses one N-Triples-style line into subject, predicate and object,
 * keeping escape sequences as-is (hence "Scaped"). Components are
 * space-separated; subject and predicate may be wrapped in '&lt;' '&gt;'.
 * On malformed input the method returns silently, possibly leaving the
 * triple only partially set — it never actually throws, despite the
 * declared ParserException.
 *
 * @param line the raw line to parse
 */
@Override
public void read(String line) throws ParserException {
    int split, posa, posb;
    this.clear();

    // SET SUBJECT
    posa = 0;

    if (line.charAt(posa) == '<') { // subject between '<' and '>' symbols
        posa++; // Remove <
        posb = line.indexOf('>', posa);
        split = posb + 1;
    } else { // subject until the first space
        posb = split = line.indexOf(' ', posa);
    }
    if (posb == -1) {
        return; // Not found, error.
    }

    this.setSubject(line.substring(posa, posb));

    // SET PREDICATE
    posa = split + 1;

    if (line.charAt(posa) == '<') { // predicate between '<' and '>' symbols
        posa++; // Remove <
        posb = line.indexOf('>', posa);
        split = posb + 1;
    } else { // predicate until the first space
        posb = split = line.indexOf(' ', posa);
    }
    if (posb == -1) {
        return; // Not found, error.
    }

    this.setPredicate(line.substring(posa, posb));

    // SET OBJECT: everything up to the trailing " ." of the N-Triples line.
    posa = split + 1;
    posb = line.length();

    if (line.charAt(posb - 1) == '.') {
        posb--; // Remove trailing dot from NTRIPLES.
    }
    if (line.charAt(posb - 1) == ' ') {
        posb--; // Drop the single space before the dot, if present.
    }

    if (line.charAt(posa) == '<') {
        posa++;

        // Remove trailing > only if < appears, so "some"^^ is kept as-is.
        if (posb > posa && line.charAt(posb - 1) == '>') {
            posb--;
        }
    }

    this.setObject(line.substring(posa, posb));
}
/**
 * Appends one triple (component ids) to the bitmap-triples structure.
 * Triples MUST arrive sorted by subject, then predicate, then object, with
 * subjects increasing by exactly one; a triple identical to the previous
 * one (same S, P and O) is silently skipped (duplicate removal).
 *
 * @param triple the triple to add; all components must be non-zero
 * @throws IllegalFormatException if a component is 0 or ordering is violated
 */
public void add(TripleID triple) {
    TransientSequenceLog64 vectorY = (TransientSequenceLog64) this.seqY;
    TransientSequenceLog64 vectorZ = (TransientSequenceLog64) this.seqZ;
    TransientBitmap375 bitY = (TransientBitmap375) this.bitmapY;
    TransientBitmap375 bitZ = (TransientBitmap375) this.bitmapZ;

    // Reorder components into this structure's order before comparing.
    TripleOrderConvert.swapComponentOrder(triple, TripleComponentOrder.SPO, this.order);
    this.x = triple.getSubject();
    this.y = triple.getPredicate();
    this.z = triple.getObject();

    if (this.x == 0 || this.y == 0 || this.z == 0) {
        throw new IllegalFormatException("None of the components of a triple can be null");
    }

    if (this.numTriples == 0) {
        // First triple: open the first Y and Z lists.
        vectorY.append(this.y);
        vectorZ.append(this.z);
    } else if (this.x != this.lastX) {
        if (this.x != this.lastX + 1) {
            throw new IllegalFormatException("Upper level must be increasing and correlative.");
        }
        // X changed: mark the start of a new Y-list and a new Z-list.
        bitY.append(true);
        vectorY.append(this.y);

        bitZ.append(true);
        vectorZ.append(this.z);
    } else if (this.y != this.lastY) {
        if (this.y < this.lastY) {
            throw new IllegalFormatException("Middle level must be increasing for each parent.");
        }

        // Y changed within the same X: extend Y-list, start a new Z-list.
        bitY.append(false);
        vectorY.append(this.y);

        bitZ.append(true);
        vectorZ.append(this.z);
    } else if (this.z != this.lastZ) { // Added to drop duplicate triples
        if (this.z < this.lastZ) {
            throw new IllegalFormatException("Lower level must be increasing for each parent.");
        }

        // Z changed within the same X,Y: extend the current Z-list.
        bitZ.append(false);
        vectorZ.append(this.z);
    }
    // else: exact duplicate of the previous triple — intentionally skipped.

    this.lastX = this.x;
    this.lastY = this.y;
    this.lastZ = this.z;
}
vectorZ = (TransientSequenceLog64) this.seqZ; 155 | TransientBitmap375 bitY = (TransientBitmap375) this.bitmapY; 156 | TransientBitmap375 bitZ = (TransientBitmap375) this.bitmapZ; 157 | // Bitmap375 bitY = (Bitmap375) this.bitmapY; 158 | // Bitmap375 bitZ = (Bitmap375) this.bitmapZ; 159 | 160 | bitY.append(true); 161 | bitZ.append(true); 162 | 163 | bitY.close(); 164 | bitZ.close(); 165 | 166 | vectorY.close(); 167 | vectorZ.close(); 168 | 169 | // System.out.println("bitmapY size = " + this.bitmapY.getNumBits()); 170 | // System.out.println("seqY size = " + this.seqY.getNumberOfElements()); 171 | // System.out.println("bitmapZ size = " + this.bitmapZ.getNumBits()); 172 | // System.out.println("seqZ size = " + this.seqZ.getNumberOfElements()); 173 | 174 | if (this.trimNeeded) { 175 | vectorY.aggresiveTrimToSize(); 176 | vectorZ.trimToSize(); 177 | } 178 | 179 | this.adjY = new AdjacencyList(this.seqY, this.bitmapY); 180 | this.adjZ = new AdjacencyList(this.seqZ, this.bitmapZ); 181 | 182 | // DEBUG 183 | // this.adjY.dump(); 184 | // this.adjZ.dump(); 185 | } 186 | 187 | @Override 188 | public long getNumberOfElements() { 189 | return this.number; 190 | } 191 | 192 | @Override 193 | public long size() { 194 | return this.size; 195 | } 196 | 197 | } 198 | -------------------------------------------------------------------------------- /src/org/rdfhdt/listener/HDTBuilderListener.java: -------------------------------------------------------------------------------- 1 | package org.rdfhdt.listener; 2 | 3 | import org.rdfhdt.hdt.listener.ProgressListener; 4 | import org.rdfhdt.mrbuilder.HDTBuilderConfiguration; 5 | 6 | public class HDTBuilderListener implements ProgressListener { 7 | 8 | boolean quiet; 9 | 10 | public HDTBuilderListener(HDTBuilderConfiguration conf) { 11 | this.quiet = conf.getQuiet(); 12 | } 13 | 14 | public HDTBuilderListener(boolean quiet) { 15 | this.quiet = quiet; 16 | } 17 | 18 | @Override 19 | public void notifyProgress(float level, String 
message) { 20 | if (!this.quiet) { 21 | System.out.print("\r" + message + "\t" + Float.toString(level) + " \r"); 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/HDTBuilderConfiguration.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. 
Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder; 24 | 25 | import java.io.IOException; 26 | 27 | import org.apache.commons.io.FilenameUtils; 28 | import org.apache.commons.lang.StringUtils; 29 | import org.apache.hadoop.conf.Configuration; 30 | import org.apache.hadoop.fs.Path; 31 | import org.rdfhdt.hdt.options.HDTSpecification; 32 | 33 | import com.beust.jcommander.JCommander; 34 | import com.beust.jcommander.Parameter; 35 | 36 | public class HDTBuilderConfiguration { 37 | 38 | public final static int CHUNK_SIZE = 1 * 1024 * 1024; 39 | 40 | public final static String SHARED = "shared"; 41 | public final static String SUBJECTS = "subjects"; 42 | public final static String PREDICATES = "predicates"; 43 | public final static String OBJECTS = "objects"; 44 | public final static String SAMPLE = "samples"; 45 | 46 | public final static String SHARED_OUTPUT_PATH = SHARED + "/"; 47 | public final static String SUBJECTS_OUTPUT_PATH = SUBJECTS + "/"; 48 | public final static String PREDICATES_OUTPUT_PATH = PREDICATES + "/"; 49 | public final static String OBJECTS_OUTPUT_PATH = OBJECTS + "/"; 50 | public final static String SAMPLE_OUTPUT_PATH = SAMPLE + "/"; 51 | 52 | final static String DEFAULT_CONFIGURATION_PATH = "HDTMRBuilder.xml"; 53 | 54 | final static String AWS_BUCKET_NAME = "global.bucket"; 55 | final static String AWS_BUCKET_DEFAULT_VALUE = null; 56 | 57 | final static String BASE_PATH_NAME = "global.path.base"; 58 | final static String BASE_PATH_DEFAULT_VALUE = "."; 59 | final static String INPUT_PATH_NAME = "global.path.input"; 60 | final static String INPUT_PATH_DEFAULT_VALUE = "input"; 61 | 62 | final static String DICTIONARY_RUN_JOB_NAME = "job.dictionary.run"; 63 | final static Boolean DICTIONARY_RUN_JOB_DEFAULT_VALUE = true; 64 | final static String DICTIONARY_JOB_NAME_NAME = "job.dictionary.name"; 65 | final static String DICTIONARY_JOB_NAME_DEFAULT_VALUE = "DictionaryJob"; 66 | final static String 
DICTIONARY_OUTPUT_PATH_NAME = "job.dictionary.path.output"; 67 | final static String DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE = "dictionary"; 68 | final static String DICTIONARY_DELETE_OUTPUT_PATH_NAME = "job.dictionary.path.output.delete"; 69 | final static boolean DICTIONARY_DELETE_OUTPUT_PATH_DEFAULT_VALUE = false; 70 | final static String DICTIONARY_NUM_REDUCERS_NAME = "job.dictionary.reducers"; 71 | final static int DICTIONARY_NUM_REDUCERS_DEFAULT_VALUE = 1; 72 | 73 | final static String DICTIONARY_RUN_SAMPLE_NAME = "job.dictionary.sample.run"; 74 | final static boolean DICTIONARY_RUN_SAMPLE_DEFAULT_VALUE = true; 75 | final static String DICTIONARY_SAMPLE_PROBABILITY_NAME = "job.dictionary.sample.probability"; 76 | final static float DICTIONARY_SAMPLE_PROBABILITY_DEFAULT_VALUE = (float) 0.001; 77 | final static String DICTIONARY_SAMPLE_OUTPUT_PATH_NAME = "job.dictionary.path.sample"; 78 | final static String DICTIONARY_SAMPLE_OUTPUT_PATH_DEFAULT_VALUE = "dictionary_samples"; 79 | final static String DICTIONARY_DELETE_SAMPLE_PATH_NAME = "job.dictionary.path.sample.delete"; 80 | final static boolean DICTIONARY_DELETE_SAMPLE_PATH_DEFAULT_VALUE = false; 81 | final static String DICTIONARY_SAMPLE_NUM_REDUCERS_NAME = "job.dictionary.sample.reducers"; 82 | final static int DICTIONARY_SAMPLE_NUM_REDUCERS_DEFAULT_VALUE = 1; 83 | 84 | final static String HDTDICTIONARY_BUILD_NAME = "hdt.dictionary.build"; 85 | final static boolean HDTDICTIONARY_BUILD_DEFAULT_VALUE = true; 86 | final static String HDTDICTIONARY_FILE_NAME = "hdt.dictionary.file"; 87 | final static String HDTDICTIONARY_FILE_DEFAULT_VALUE = "dictionary.hdt"; 88 | final static String HDTDICTIONARY_DISTRIBUTION_NAME = "job.triples.dictionary.distribution"; 89 | final static int HDTDICTIONARY_DISTRIBUTION_DEFAULT_VALUE = 1; 90 | 91 | final static String TRIPLES_RUN_JOB_NAME = "job.triples.run"; 92 | final static boolean TRIPLES_RUN_JOB_DEFAULT_VALUE = true; 93 | final static String TRIPLES_JOB_NAME_NAME = 
"job.triples.name"; 94 | final static String TRIPLES_JOB_NAME_DEFAULT_VALUE = "TriplesJob"; 95 | // final static String TRIPLES_MAP_DICTIONARY_FILE_NAME = "job.triples.map.dictionary.file"; 96 | // final static String TRIPLES_MAP_DICTIONARY_FILE_DEFAULT_VALUE = "dictionary_map.hdt"; 97 | // final static String TRIPLES_REDUCE_DICTIONARY_FILE_NAME = "job.triples.reduce.dictionary.file"; 98 | // final static String TRIPLES_REDUCE_DICTIONARY_FILE_DEFAULT_VALUE = "dictionary_reduce.hdt"; 99 | final static String TRIPLES_OUTPUT_PATH_NAME = "job.triples.path.output"; 100 | final static String TRIPLES_OUTPUT_PATH_DEFAULT_VALUE = "triples"; 101 | final static String TRIPLES_DELETE_OUTPUT_PATH_NAME = "job.triples.path.output.delete"; 102 | final static boolean TRIPLES_DELETE_OUTPUT_PATH_DEFAULT_VALUE = false; 103 | final static String TRIPLES_NUM_REDUCERS_NAME = "job.triples.reducers"; 104 | final static int TRIPLES_NUM_REDUCERS_DEFAULT_VALUE = 1; 105 | 106 | final static String TRIPLES_RUN_SAMPLE_NAME = "job.triples.sample.run"; 107 | final static boolean TRIPLES_RUN_SAMPLE_DEFAULT_VALUE = true; 108 | final static String TRIPLES_SAMPLE_PROBABILITY_NAME = "job.triples.sample.probability"; 109 | final static float TRIPLES_SAMPLE_PROBABILITY_DEFAULT_VALUE = (float) 0.001; 110 | final static String TRIPLES_SAMPLE_OUTPUT_PATH_NAME = "job.triples.path.sample"; 111 | final static String TRIPLES_SAMPLE_OUTPUT_PATH_DEFAULT_VALUE = "triples_samples"; 112 | final static String TRIPLES_DELETE_SAMPLE_PATH_NAME = "job.triples.path.sample.delete"; 113 | final static boolean TRIPLES_DELETE_SAMPLE_PATH_DEFAULT_VALUE = false; 114 | final static String TRIPLES_SAMPLE_NUM_REDUCERS_NAME = "job.triples.sample.reducers"; 115 | final static int TRIPLES_SAMPLE_NUM_REDUCERS_DEFAULT_VALUE = 1; 116 | 117 | final static String HDT_BUILD_NAME = "hdt.build"; 118 | final static boolean HDT_BUILD_DEFAULT_VALUE = true; 119 | final static String HDT_OUTPUT_PATH_NAME = "hdt.path.output"; 120 | final static 
String HDT_OUTPUT_PATH_DEFAULT_VALUE = "hdt_output"; 121 | final static String HDT_FILE_NAME = "hdt.file"; 122 | final static String HDT_FILE_DEFAULT_VALUE = "output.hdt"; 123 | 124 | final static String CONFIG_FILE_NAME = "hdt-lib.configFile"; 125 | final static String CONFIG_FILE_DEFAULT_VALUE = null; 126 | final static String OPTIONS_NAME = "hdtl-lib.options"; 127 | final static String OPTIONS_DEFAULT_VALUE = null; 128 | final static String RDF_TYPE_NAME = "hdt-lib.rdfType"; 129 | final static String RDF_TYPE_DEFAULT_VALUE = "ntriples"; 130 | final static String QUIET_NAME = "hdt-lib.quiet"; 131 | final static boolean QUIET_DEFAULT_VALUE = false; 132 | final static String BASE_URI_NAME = "hdt-lib.baseUri"; 133 | final static String BASE_URI_DEFAULT_VALUE = "http://rdfhdt.org/HDTMR"; 134 | final static String GENERATE_INDEX_NAME = "hdt-lib.generateIndex"; 135 | final static boolean GENERATE_INDEX_DEFAULT_VALUE = false; 136 | 137 | JCommander jc; 138 | 139 | @Parameter(names = { "-h", "--help" }, help = true, hidden = true) 140 | boolean help = false; 141 | 142 | @Parameter(names = { "-a", "--awsbucket" }, description = "Amazon Web Services bucket") 143 | String pAwsBucket = null; 144 | 145 | @Parameter(names = { "-c", "--conf" }, description = "Path to configuration file") 146 | String pConfigFile = null; 147 | 148 | @Parameter(names = { "-b", "--basedir" }, description = "Root directory for the process") 149 | String pBasePath = null; 150 | 151 | @Parameter(names = { "-rd", "--rundictionary" }, description = "Whether to run dictionary job or not", arity = 1) 152 | Boolean pRunDictionary = null; 153 | 154 | @Parameter(names = { "-rds", "--rundictionarysampling" }, description = "Whether to run dictionary input sampling job or not", arity = 1) 155 | Boolean pRunDictionarySampling = null; 156 | 157 | @Parameter(names = { "-nd", "--namedictionaryjob" }, description = "Name of dictionary job") 158 | String pDictionaryName = null; 159 | 160 | @Parameter(names = { 
"-i", "--input" }, description = "Path to input files. Relative to basedir") 161 | String pInputPath = null; 162 | 163 | @Parameter(names = { "-sd", "--samplesdictionary" }, description = "Path to dictionary job sample files. Relative to basedir") 164 | String pDictionarySamplePath = null; 165 | 166 | @Parameter(names = { "-st", "--samplestriples" }, description = "Path to triples job sample files. Relative to basedir") 167 | String pTriplesSamplePath = null; 168 | 169 | @Parameter(names = { "-od", "--outputdictionary" }, description = "Path to dictionary job output files. Relative to basedir") 170 | String pDictionaryOutputPath = null; 171 | 172 | @Parameter(names = { "-dd", "--deleteoutputdictionary" }, description = "Delete dictionary job output path before running job") 173 | Boolean pDeleteDictionaryOutputPath = null; 174 | 175 | @Parameter(names = { "-dsd", "--deletesampledictionary" }, description = "Delete dictionary job sample path before running job") 176 | Boolean pDeleteDictionarySamplePath = null; 177 | 178 | @Parameter(names = { "-dst", "--deletesampletriples" }, description = "Delete triples job sample path before running job") 179 | Boolean pDeleteTriplesSamplePath = null; 180 | 181 | @Parameter(names = { "-Rd", "--reducersdictionary" }, description = "Number of reducers for dictionary job") 182 | Integer pNumReducersDictionary = null; 183 | 184 | @Parameter(names = { "-Rds", "--reducersdictionarysampling" }, description = "Number of reducers for dictionary input sampling job") 185 | Integer pNumReducersDictionarySampling = null; 186 | 187 | @Parameter(names = { "-bd", "--builddictionary" }, description = "Whether to build HDT dictionary or not", arity = 1) 188 | Boolean pBuildDictionary = null; 189 | 190 | @Parameter(names = { "-bh", "--buildhdt" }, description = "Whether to build HDT or not", arity = 1) 191 | Boolean pBuildHDT = null; 192 | 193 | @Parameter(names = { "-fd", "--filedictionary" }, description = "Name of hdt dictionary file") 194 | 
String pDictionaryFileName = null; 195 | 196 | @Parameter(names = { "-fm", "--filesubjects" }, description = "Name of hdt dictionary file for Mappers") 197 | String pMapDictionaryFileName = null; 198 | 199 | @Parameter(names = { "-fr", "--fileobjects" }, description = "Name of hdt dictionary file for Reducers") 200 | String pReduceDictionaryFileName = null; 201 | 202 | @Parameter(names = { "-d", "--dictionarydistribution" }, description = "Dictionary distribution among mappers and reducers") 203 | Integer pDictionaryDistribution = null; 204 | 205 | @Parameter(names = { "-rt", "--runtriples" }, description = "Whether to run triples job or not", arity = 1) 206 | Boolean pRunTriples = null; 207 | 208 | @Parameter(names = { "-rts", "--runtriplessampling" }, description = "Whether to run triples input sampling job or not", arity = 1) 209 | Boolean pRunTriplesSampling = null; 210 | 211 | @Parameter(names = { "-nt", "--nametriplesjob" }, description = "Name of triples job") 212 | String pTriplesName = null; 213 | 214 | @Parameter(names = { "-it", "--inputtriples" }, description = "Path to triples job input files. Relative to basedir") 215 | String pTriplesInputPath = null; 216 | 217 | @Parameter(names = { "-ot", "--outputtriples" }, description = "Path to triples job output files. 
Relative to basedir") 218 | String pTriplesOutputPath = null; 219 | 220 | @Parameter(names = { "-dt", "--deleteoutputtriples" }, description = "Delete triples job output path before running job") 221 | Boolean pDeleteTriplesOutputPath = null; 222 | 223 | @Parameter(names = { "-Rt", "--reducerstriples" }, description = "Number of reducers for triples job") 224 | Integer pNumReducersTriples = null; 225 | 226 | @Parameter(names = { "-Rts", "--reducerstriplessampling" }, description = "Number of reducers for triples input sampling job") 227 | Integer pNumReducersTriplesSampling = null; 228 | 229 | @Parameter(names = { "-fh", "--namehdtfile" }, description = "Name of hdt file") 230 | String pHdtFileName = null; 231 | 232 | @Parameter(names = { "-hc", "--hdtconf" }, description = "Conversion config file") 233 | String pHdtConfigFile = null; 234 | 235 | @Parameter(names = { "-o", "--options" }, description = "HDT Conversion options (override those of config file)") 236 | String pOptions = null; 237 | 238 | @Parameter(names = { "-t", "--rdftype" }, description = "Type of RDF Input (ntriples, nquad, n3, turtle, rdfxml)") 239 | String pRdfType = null; 240 | 241 | @Parameter(names = { "-bu", "--baseURI" }, description = "Base URI for the dataset") 242 | String pBaseURI = null; 243 | 244 | @Parameter(names = { "-q", "--quiet" }, description = "Do not show progress of the conversion") 245 | Boolean pQuiet = null; 246 | 247 | @Parameter(names = { "-x", "--index" }, description = "Generate also external indices to solve all queries") 248 | Boolean pGenerateIndex = null; 249 | 250 | @Parameter(names = { "-p", "--sampleprobability" }, description = "Probability of using each element for sampling") 251 | Float pSampleProbability = null; 252 | 253 | Path inputPath = null, dictionarySamplesPath = null, dictionaryOutputPath = null, sharedOutputPath = null, subjectsOutputPath = null, predicatesOutputPath = null, objectsOutputPath = null; 254 | Path dictionaryCountersFile = null, 
triplesSamplesPath = null, triplesCountersFile = null, hdtDictionarySPOFile = null, hdtMapDictionaryFile = null, hdtReduceDictionaryFile = null, hdtFile = null;
	Path triplesInputPath = null, triplesOutputPath = null;

	// Backing Hadoop configuration; every getter falls back to it when no CLI value was given.
	Configuration mrConfiguration = new Configuration();

	// hdt-java specification object, built lazily by getSpec().
	HDTSpecification spec;

	// This constructor is to be used by Tasks (Mappers and/or Reducers)
	public HDTBuilderConfiguration(Configuration config) throws IOException {
		this.mrConfiguration = config;
	}

	// This constructor is to be used by Drivers
	public HDTBuilderConfiguration(String[] args) {
		this.jc = new JCommander(this, args);
		if (this.help) {
			this.jc.usage();
			System.exit(1);
		}
		this.addConfigurationResource(this.getConfigFile());

		// FIXME: this should be done for every parameter passed on the
		// command line, not only the dictionary output path
		this.setProperty(DICTIONARY_OUTPUT_PATH_NAME, this.getDictionaryOutputPath().toString());
	}

	// Registers an additional XML resource with the Hadoop configuration.
	private void addConfigurationResource(String configurationPath) {
		this.mrConfiguration.addResource(new Path(configurationPath));
	}

	// Resolves the configuration file path, prefixing the AWS bucket when one is set.
	private String getConfigFile() {
		return this.addBucket(this.pConfigFile != null ?
this.pConfigFile : DEFAULT_CONFIGURATION_PATH); 286 | } 287 | 288 | public Configuration getConfigurationObject() { 289 | return this.mrConfiguration; 290 | } 291 | 292 | public void setProperty(String name, String value) { 293 | this.mrConfiguration.set(name, value); 294 | } 295 | 296 | public void setProperty(String name, int value) { 297 | this.mrConfiguration.setInt(name, value); 298 | } 299 | 300 | public String getAwsBucket() { 301 | return this.get(this.pAwsBucket, AWS_BUCKET_NAME, AWS_BUCKET_DEFAULT_VALUE); 302 | } 303 | 304 | public boolean runDictionary() { 305 | return this.get(this.pRunDictionary, DICTIONARY_RUN_JOB_NAME, DICTIONARY_RUN_JOB_DEFAULT_VALUE); 306 | } 307 | 308 | public boolean runDictionarySampling() { 309 | return this.get(this.pRunDictionarySampling, DICTIONARY_RUN_SAMPLE_NAME, DICTIONARY_RUN_SAMPLE_DEFAULT_VALUE); 310 | } 311 | 312 | public boolean runTriples() { 313 | return this.get(this.pRunTriples, TRIPLES_RUN_JOB_NAME, TRIPLES_RUN_JOB_DEFAULT_VALUE); 314 | } 315 | 316 | public boolean runTriplesSampling() { 317 | return this.get(this.pRunTriplesSampling, TRIPLES_RUN_SAMPLE_NAME, TRIPLES_RUN_SAMPLE_DEFAULT_VALUE); 318 | } 319 | 320 | public boolean buildDictionary() { 321 | return this.get(this.pBuildDictionary, HDTDICTIONARY_BUILD_NAME, HDTDICTIONARY_BUILD_DEFAULT_VALUE); 322 | } 323 | 324 | public boolean buildHDT() { 325 | return this.get(this.pBuildHDT, HDT_BUILD_NAME, HDT_BUILD_DEFAULT_VALUE); 326 | } 327 | 328 | public String getDictionaryJobName() { 329 | return this.get(this.pTriplesName, DICTIONARY_JOB_NAME_NAME, DICTIONARY_JOB_NAME_DEFAULT_VALUE); 330 | } 331 | 332 | public String getTriplesJobName() { 333 | return this.get(this.pTriplesName, DICTIONARY_JOB_NAME_NAME, DICTIONARY_JOB_NAME_DEFAULT_VALUE); 334 | } 335 | 336 | public Path getInputPath() { 337 | if (this.inputPath == null) { 338 | this.inputPath = new Path(this.getPath(this.get(this.pInputPath, INPUT_PATH_NAME, INPUT_PATH_DEFAULT_VALUE))); 339 | } 340 | return 
this.inputPath; 341 | } 342 | 343 | public Path getDictionaryOutputPath() { 344 | if (this.dictionaryOutputPath == null) { 345 | this.dictionaryOutputPath = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE))); 346 | } 347 | return this.dictionaryOutputPath; 348 | } 349 | 350 | public Path getSharedSectionPath() { 351 | if (this.sharedOutputPath == null) { 352 | this.sharedOutputPath = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + SHARED_OUTPUT_PATH); 353 | } 354 | return this.sharedOutputPath; 355 | } 356 | 357 | public Path getSubjectsSectionPath() { 358 | if (this.subjectsOutputPath == null) { 359 | this.subjectsOutputPath = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + SUBJECTS_OUTPUT_PATH); 360 | } 361 | return this.subjectsOutputPath; 362 | } 363 | 364 | public Path getPredicatesSectionPath() { 365 | if (this.predicatesOutputPath == null) { 366 | this.predicatesOutputPath = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + PREDICATES_OUTPUT_PATH); 367 | } 368 | return this.predicatesOutputPath; 369 | } 370 | 371 | public Path getObjectsSectionPath() { 372 | if (this.objectsOutputPath == null) { 373 | this.objectsOutputPath = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + OBJECTS_OUTPUT_PATH); 374 | } 375 | return this.objectsOutputPath; 376 | } 377 | 378 | public Path getDictionarySamplesPath() { 379 | if (this.dictionarySamplesPath == null) { 380 | this.dictionarySamplesPath = new Path(this.getPath(this.get(this.pDictionarySamplePath, DICTIONARY_SAMPLE_OUTPUT_PATH_NAME, DICTIONARY_SAMPLE_OUTPUT_PATH_DEFAULT_VALUE))); 381 | } 382 | return 
this.dictionarySamplesPath; 383 | } 384 | 385 | public Path getTriplesSamplesPath() { 386 | if (this.triplesSamplesPath == null) { 387 | this.triplesSamplesPath = new Path(this.getPath(this.get(this.pTriplesSamplePath, TRIPLES_SAMPLE_OUTPUT_PATH_NAME, TRIPLES_SAMPLE_OUTPUT_PATH_DEFAULT_VALUE))); 388 | } 389 | return this.triplesSamplesPath; 390 | } 391 | 392 | public float getSampleProbability() { 393 | return this.get(this.pSampleProbability, DICTIONARY_SAMPLE_PROBABILITY_NAME, DICTIONARY_SAMPLE_PROBABILITY_DEFAULT_VALUE); 394 | } 395 | 396 | public Path getDictionaryCountersFile() { 397 | if (this.dictionaryCountersFile == null) { 398 | this.dictionaryCountersFile = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + ".info"); 399 | } 400 | return this.dictionaryCountersFile; 401 | } 402 | 403 | public Path getDictionaryFile() { 404 | if (this.hdtDictionarySPOFile == null) { 405 | this.hdtDictionarySPOFile = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + this.get(this.pDictionaryFileName, HDTDICTIONARY_FILE_NAME, HDTDICTIONARY_FILE_DEFAULT_VALUE)); 406 | } 407 | return this.hdtDictionarySPOFile; 408 | } 409 | 410 | // public Path getDictionaryMapFile() { 411 | // if (this.hdtMapDictionaryFile == null) { 412 | // this.hdtMapDictionaryFile = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + this.get(this.pMapDictionaryFileName, TRIPLES_MAP_DICTIONARY_FILE_NAME, TRIPLES_MAP_DICTIONARY_FILE_DEFAULT_VALUE)); 413 | // } 414 | // return this.hdtMapDictionaryFile; 415 | // } 416 | // 417 | // public Path getDictionaryReduceFile() { 418 | // if (this.hdtReduceDictionaryFile == null) { 419 | // this.hdtReduceDictionaryFile = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, 
DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + this.get(this.pReduceDictionaryFileName, TRIPLES_REDUCE_DICTIONARY_FILE_NAME, TRIPLES_REDUCE_DICTIONARY_FILE_DEFAULT_VALUE)); 420 | // } 421 | // return this.hdtReduceDictionaryFile; 422 | // } 423 | 424 | public int getDictionaryDistribution() { 425 | return this.get(this.pDictionaryDistribution, HDTDICTIONARY_DISTRIBUTION_NAME, HDTDICTIONARY_DISTRIBUTION_DEFAULT_VALUE); 426 | } 427 | 428 | public Path getTriplesOutputPath() { 429 | if (this.triplesOutputPath == null) { 430 | this.triplesOutputPath = new Path(this.getPath(this.get(this.pTriplesOutputPath, TRIPLES_OUTPUT_PATH_NAME, TRIPLES_OUTPUT_PATH_DEFAULT_VALUE))); 431 | } 432 | return this.triplesOutputPath; 433 | } 434 | 435 | public Path getTriplesCountersFile() { 436 | if (this.triplesCountersFile == null) { 437 | this.triplesCountersFile = new Path(this.getPath(this.get(this.pTriplesOutputPath, TRIPLES_OUTPUT_PATH_NAME, TRIPLES_OUTPUT_PATH_DEFAULT_VALUE)) + ".info"); 438 | } 439 | return this.triplesCountersFile; 440 | } 441 | 442 | public Path getHDTFile() { 443 | if (this.hdtFile == null) { 444 | this.hdtFile = new Path(this.getPath(this.get(this.pHdtFileName, HDT_FILE_NAME, HDT_FILE_DEFAULT_VALUE))); 445 | } 446 | return this.hdtFile; 447 | } 448 | 449 | public boolean getDeleteDictionaryOutputPath() { 450 | return this.get(this.pDeleteDictionaryOutputPath, DICTIONARY_DELETE_OUTPUT_PATH_NAME, DICTIONARY_DELETE_OUTPUT_PATH_DEFAULT_VALUE); 451 | } 452 | 453 | public boolean getDeleteDictionarySamplesPath() { 454 | return this.get(this.pDeleteDictionarySamplePath, DICTIONARY_DELETE_SAMPLE_PATH_NAME, DICTIONARY_DELETE_SAMPLE_PATH_DEFAULT_VALUE); 455 | } 456 | 457 | public boolean getDeleteTriplesOutputPath() { 458 | return this.get(this.pDeleteTriplesOutputPath, TRIPLES_DELETE_OUTPUT_PATH_NAME, TRIPLES_DELETE_OUTPUT_PATH_DEFAULT_VALUE); 459 | } 460 | 461 | public boolean getDeleteTriplesSamplesPath() { 462 | return this.get(this.pDeleteTriplesSamplePath, 
TRIPLES_DELETE_SAMPLE_PATH_NAME, TRIPLES_DELETE_SAMPLE_PATH_DEFAULT_VALUE); 463 | } 464 | 465 | public int getDictionaryReducers() { 466 | return this.get(this.pNumReducersDictionary, DICTIONARY_NUM_REDUCERS_NAME, DICTIONARY_NUM_REDUCERS_DEFAULT_VALUE); 467 | } 468 | 469 | public int getDictionarySampleReducers() { 470 | return this.get(this.pNumReducersDictionarySampling, DICTIONARY_SAMPLE_NUM_REDUCERS_NAME, DICTIONARY_SAMPLE_NUM_REDUCERS_DEFAULT_VALUE); 471 | } 472 | 473 | public int getTriplesReducers() { 474 | return this.get(this.pNumReducersTriples, TRIPLES_NUM_REDUCERS_NAME, TRIPLES_NUM_REDUCERS_DEFAULT_VALUE); 475 | } 476 | 477 | public int getTriplesSampleReducers() { 478 | return this.get(this.pNumReducersTriplesSampling, TRIPLES_SAMPLE_NUM_REDUCERS_NAME, TRIPLES_SAMPLE_NUM_REDUCERS_DEFAULT_VALUE); 479 | } 480 | 481 | public String getHdtConfigFile() { 482 | return this.getPath(this.get(this.pHdtConfigFile, CONFIG_FILE_NAME, CONFIG_FILE_DEFAULT_VALUE)); 483 | } 484 | 485 | public String getOptions() { 486 | return this.get(this.pOptions, OPTIONS_NAME, OPTIONS_DEFAULT_VALUE); 487 | } 488 | 489 | public String getRdfType() { 490 | return this.get(this.pRdfType, RDF_TYPE_NAME, RDF_TYPE_DEFAULT_VALUE); 491 | } 492 | 493 | public boolean getQuiet() { 494 | return this.get(this.pQuiet, QUIET_NAME, QUIET_DEFAULT_VALUE); 495 | } 496 | 497 | public String getBaseURI() { 498 | return this.get(this.pBaseURI, BASE_URI_NAME, BASE_URI_DEFAULT_VALUE); 499 | } 500 | 501 | public HDTSpecification getSpec() throws IOException { 502 | if (this.spec == null) { 503 | if (this.getHdtConfigFile() != null) { 504 | this.spec = new HDTSpecification(this.getHdtConfigFile()); 505 | } else { 506 | this.spec = new HDTSpecification(); 507 | } 508 | if (this.getOptions() != null) { 509 | this.spec.setOptions(this.getOptions()); 510 | } 511 | } 512 | return this.spec; 513 | } 514 | 515 | private String get(String paramValue, String confName, String defaultValue) { 516 | return paramValue 
!= null ? paramValue : this.mrConfiguration.get(confName, defaultValue); 517 | } 518 | 519 | private boolean get(Boolean paramValue, String confName, boolean defaultValue) { 520 | return paramValue != null ? paramValue : this.mrConfiguration.getBoolean(confName, defaultValue); 521 | } 522 | 523 | private int get(Integer paramValue, String confName, int defaultValue) { 524 | return paramValue != null ? paramValue : this.mrConfiguration.getInt(confName, defaultValue); 525 | } 526 | 527 | private float get(Float paramValue, String confName, float defaultValue) { 528 | return paramValue != null ? paramValue : this.mrConfiguration.getFloat(confName, defaultValue); 529 | } 530 | 531 | private String getPath(String path) { 532 | // Add Base Path 533 | return FilenameUtils.concat(this.get(this.pBasePath, BASE_PATH_NAME, BASE_PATH_DEFAULT_VALUE), path); 534 | } 535 | 536 | private String addBucket(String path) { 537 | // If bucket is provided as parameter, and configuration path is 538 | // relative, create absolute configuration path 539 | if (this.getAwsBucket() != null && !path.startsWith("s3n://")) { 540 | path = "s3n://" + this.getAwsBucket() + "/" + StringUtils.removeStart(path, "/"); 541 | } 542 | return path; 543 | } 544 | 545 | // private void set(Integer paramValue, String confName, int defautlValue) { 546 | // mrConfiguration.setInt(confName, paramValue != null ? paramValue : 547 | // mrConfiguration.getInt(confName, defautlValue)); 548 | // } 549 | 550 | } 551 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/HDTBuilderDriver.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. 
Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. 
Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder; 24 | 25 | import java.io.BufferedInputStream; 26 | import java.io.BufferedOutputStream; 27 | import java.io.BufferedReader; 28 | import java.io.BufferedWriter; 29 | import java.io.IOException; 30 | import java.io.InputStreamReader; 31 | import java.io.OutputStreamWriter; 32 | import java.net.URI; 33 | import java.net.URISyntaxException; 34 | import java.util.Arrays; 35 | 36 | import org.apache.hadoop.filecache.DistributedCache; 37 | import org.apache.hadoop.fs.FileStatus; 38 | import org.apache.hadoop.fs.FileSystem; 39 | import org.apache.hadoop.fs.Path; 40 | import org.apache.hadoop.fs.PathFilter; 41 | import org.apache.hadoop.io.NullWritable; 42 | import org.apache.hadoop.io.SequenceFile; 43 | import org.apache.hadoop.io.Text; 44 | import org.apache.hadoop.mapreduce.Job; 45 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 46 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 47 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 48 | import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat; 49 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 50 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 51 | import org.apache.hadoop.mapreduce.lib.partition.InputSampler; 52 | import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner; 53 | import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary; 54 | import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary2; 55 | import org.rdfhdt.hdt.dictionary.impl.section.TransientDictionarySection; 56 | import org.rdfhdt.hdt.hdt.impl.TransientHDT; 57 | import org.rdfhdt.hdt.options.ControlInformation; 58 | import org.rdfhdt.hdt.trans.TransientElement; 59 | import org.rdfhdt.hdt.triples.impl.TransientBitMapTriples; 60 | import org.rdfhdt.listener.HDTBuilderListener; 61 | import org.rdfhdt.mrbuilder.dictionary.DictionaryCombiner; 62 | import 
org.rdfhdt.mrbuilder.dictionary.DictionaryMapper; 63 | import org.rdfhdt.mrbuilder.dictionary.DictionaryReducer; 64 | import org.rdfhdt.mrbuilder.dictionary.DictionarySamplerMapper; 65 | import org.rdfhdt.mrbuilder.dictionary.DictionarySamplerReducer; 66 | import org.rdfhdt.mrbuilder.io.TripleSPOComparator; 67 | import org.rdfhdt.mrbuilder.io.TripleSPOWritable; 68 | import org.rdfhdt.mrbuilder.triples.TriplesSPOMapper; 69 | import org.rdfhdt.mrbuilder.util.FileStatusComparator; 70 | 71 | import com.hadoop.mapreduce.LzoTextInputFormat; 72 | 73 | public class HDTBuilderDriver { 74 | 75 | public enum Counters { 76 | Triples, Subjects, Predicates, Objects, Shared, Sample 77 | } 78 | 79 | protected HDTBuilderConfiguration conf; 80 | protected HDTBuilderListener listener; 81 | protected FileSystem inputFS, dictionaryFS, triplesFS; 82 | protected Long numTriples = null, numShared = null, numSubjects = null, numPredicates = null, numObjects = null; 83 | protected FourSectionDictionary2 dictionary = null; 84 | 85 | public HDTBuilderDriver(String[] args) throws IOException { 86 | 87 | // load configuration 88 | this.conf = new HDTBuilderConfiguration(args); 89 | 90 | this.listener = new HDTBuilderListener(this.conf); 91 | 92 | // get the FileSystem instances for each path 93 | this.inputFS = this.conf.getInputPath().getFileSystem(this.conf.getConfigurationObject()); 94 | this.dictionaryFS = this.conf.getDictionaryOutputPath().getFileSystem(this.conf.getConfigurationObject()); 95 | this.triplesFS = this.conf.getTriplesOutputPath().getFileSystem(this.conf.getConfigurationObject()); 96 | 97 | } 98 | 99 | public static void main(String[] args) throws Exception { 100 | boolean ok = true; 101 | HDTBuilderDriver driver = new HDTBuilderDriver(args); 102 | 103 | if (ok && driver.conf.runDictionarySampling()) { 104 | if (driver.conf.getDictionaryReducers() == 1) { 105 | System.out.println("WARNING: Only one Reducer. 
Dictionary creation as a single job is more efficient."); 106 | } 107 | ok = driver.runDictionaryJobSampling(); 108 | } 109 | 110 | if (ok && driver.conf.runDictionary()) { 111 | if (driver.conf.getDictionaryReducers() > 1) { 112 | ok = driver.runDictionaryJob(); 113 | } else { 114 | ok = driver.runDictionaryJobWithOneJob(); 115 | } 116 | } 117 | 118 | if (ok && driver.conf.buildDictionary()) { 119 | ok = driver.buildDictionary(); 120 | } 121 | 122 | if (ok && driver.conf.runTriplesSampling()) { 123 | if (driver.conf.getTriplesReducers() == 1) { 124 | System.out.println("WARNING: Only one Reducer. Triples creation as a single job is more efficient."); 125 | } 126 | ok = driver.runTriplesJobSampling(); 127 | } 128 | 129 | if (ok && driver.conf.runTriples()) { 130 | if (driver.conf.getTriplesReducers() > 1) { 131 | ok = driver.runTriplesJob(); 132 | } else { 133 | ok = driver.runTriplesJobWithOneJob(); 134 | } 135 | } 136 | 137 | if (ok && driver.conf.buildHDT()) { 138 | ok = driver.buidHDT(); 139 | } 140 | 141 | System.exit(ok ? 0 : 1); 142 | } 143 | 144 | protected boolean runDictionaryJobSampling() throws IOException, ClassNotFoundException, InterruptedException { 145 | boolean jobOK; 146 | Job job = null; 147 | 148 | // if input path does not exists, fail 149 | if (!this.inputFS.exists(this.conf.getInputPath())) { 150 | System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath()); 151 | System.exit(-1); 152 | } 153 | 154 | // if samples path exists... 155 | if (this.dictionaryFS.exists(this.conf.getDictionarySamplesPath())) { 156 | if (this.conf.getDeleteDictionarySamplesPath()) { // ... and option provided, delete recursively 157 | this.dictionaryFS.delete(this.conf.getDictionarySamplesPath(), true); 158 | } else { // ... 
and option not provided, fail 159 | System.out.println("Dictionary samples path does exist: " + this.conf.getDictionarySamplesPath()); 160 | System.out.println("Select other path or use option -ds to overwrite"); 161 | System.exit(-1); 162 | } 163 | } 164 | 165 | // Job to create a SequenceInputFormat with Roles 166 | job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 1"); 167 | job.setJarByClass(HDTBuilderDriver.class); 168 | 169 | System.out.println("input = " + this.conf.getInputPath()); 170 | System.out.println("samples = " + this.conf.getDictionarySamplesPath()); 171 | 172 | FileInputFormat.addInputPath(job, this.conf.getInputPath()); 173 | FileOutputFormat.setOutputPath(job, this.conf.getDictionarySamplesPath()); 174 | 175 | job.setInputFormatClass(LzoTextInputFormat.class); 176 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); 177 | 178 | job.setMapperClass(DictionarySamplerMapper.class); 179 | job.setMapOutputKeyClass(Text.class); 180 | job.setMapOutputValueClass(Text.class); 181 | job.setCombinerClass(DictionarySamplerReducer.class); 182 | job.setReducerClass(DictionarySamplerReducer.class); 183 | job.setOutputKeyClass(Text.class); 184 | job.setOutputValueClass(Text.class); 185 | 186 | job.setNumReduceTasks(this.conf.getDictionarySampleReducers()); 187 | 188 | SequenceFileOutputFormat.setCompressOutput(job, true); 189 | SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class); 190 | SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); 191 | 192 | jobOK = job.waitForCompletion(true); 193 | 194 | return jobOK; 195 | } 196 | 197 | protected boolean runDictionaryJob() throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException { 198 | boolean jobOK; 199 | Job job = null; 200 | BufferedWriter bufferedWriter; 201 | 202 | // if output path exists... 
203 | if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) { 204 | if (this.conf.getDeleteDictionaryOutputPath()) { // ... and option provided, delete recursively 205 | this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true); 206 | } else { // ... and option not provided, fail 207 | System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath()); 208 | System.out.println("Select other path or use option -dd to overwrite"); 209 | System.exit(-1); 210 | } 211 | } 212 | 213 | // Sample the SequenceInputFormat to do TotalSort and create final output 214 | job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 2"); 215 | 216 | job.setJarByClass(HDTBuilderDriver.class); 217 | 218 | System.out.println("samples = " + this.conf.getDictionarySamplesPath()); 219 | System.out.println("output = " + this.conf.getDictionaryOutputPath()); 220 | 221 | FileInputFormat.addInputPath(job, this.conf.getDictionarySamplesPath()); 222 | FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath()); 223 | 224 | job.setInputFormatClass(SequenceFileInputFormat.class); 225 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); 226 | 227 | // Identity Mapper 228 | // job.setMapperClass(Mapper.class); 229 | job.setCombinerClass(DictionaryCombiner.class); 230 | job.setPartitionerClass(TotalOrderPartitioner.class); 231 | job.setReducerClass(DictionaryReducer.class); 232 | 233 | job.setNumReduceTasks(this.conf.getDictionaryReducers()); 234 | 235 | job.setMapOutputKeyClass(Text.class); 236 | job.setMapOutputValueClass(Text.class); 237 | 238 | job.setOutputKeyClass(Text.class); 239 | job.setOutputValueClass(NullWritable.class); 240 | 241 | System.out.println("Sampling started"); 242 | InputSampler.writePartitionFile(job, new InputSampler.IntervalSampler(this.conf.getSampleProbability())); 243 | String partitionFile = 
TotalOrderPartitioner.getPartitionFile(job.getConfiguration()); 244 | URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH); 245 | DistributedCache.addCacheFile(partitionUri, job.getConfiguration()); 246 | DistributedCache.createSymlink(job.getConfiguration()); 247 | System.out.println("Sampling finished"); 248 | 249 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class, Text.class, NullWritable.class); 250 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class, Text.class, NullWritable.class); 251 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class, Text.class, NullWritable.class); 252 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class, Text.class, NullWritable.class); 253 | 254 | SequenceFileOutputFormat.setCompressOutput(job, true); 255 | SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class); 256 | SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); 257 | 258 | jobOK = job.waitForCompletion(true); 259 | 260 | this.numShared = job.getCounters().findCounter(Counters.Shared).getValue(); 261 | this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue(); 262 | this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue(); 263 | this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue(); 264 | 265 | bufferedWriter = new BufferedWriter(new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile()))); 266 | 267 | bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n"); 268 | bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n"); 269 | bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + 
"\n"); 270 | bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n"); 271 | 272 | bufferedWriter.close(); 273 | 274 | return jobOK; 275 | } 276 | 277 | protected boolean runDictionaryJobWithOneJob() throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException { 278 | boolean jobOK; 279 | Job job = null; 280 | BufferedWriter bufferedWriter; 281 | 282 | // if input path does not exists, fail 283 | if (!this.inputFS.exists(this.conf.getInputPath())) { 284 | System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath()); 285 | System.exit(-1); 286 | } 287 | 288 | // if output path exists... 289 | if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) { 290 | if (this.conf.getDeleteDictionaryOutputPath()) { // ... and option provided, delete recursively 291 | this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true); 292 | } else { // ... and option not provided, fail 293 | System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath()); 294 | System.out.println("Select other path or use option -dd to overwrite"); 295 | System.exit(-1); 296 | } 297 | } 298 | 299 | // Launch job 300 | job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName()); 301 | job.setJarByClass(HDTBuilderDriver.class); 302 | 303 | FileInputFormat.addInputPath(job, this.conf.getInputPath()); 304 | FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath()); 305 | 306 | job.setInputFormatClass(LzoTextInputFormat.class); 307 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); 308 | 309 | job.setMapperClass(DictionaryMapper.class); 310 | job.setCombinerClass(DictionaryCombiner.class); 311 | job.setReducerClass(DictionaryReducer.class); 312 | 313 | job.setNumReduceTasks(this.conf.getDictionaryReducers()); 314 | 315 | job.setMapOutputKeyClass(Text.class); 316 | job.setMapOutputValueClass(Text.class); 317 | 318 | 
job.setOutputKeyClass(Text.class); 319 | job.setOutputValueClass(NullWritable.class); 320 | 321 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class, Text.class, NullWritable.class); 322 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class, Text.class, NullWritable.class); 323 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class, Text.class, NullWritable.class); 324 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class, Text.class, NullWritable.class); 325 | 326 | jobOK = job.waitForCompletion(true); 327 | 328 | this.numShared = job.getCounters().findCounter(Counters.Shared).getValue(); 329 | this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue(); 330 | this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue(); 331 | this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue(); 332 | 333 | bufferedWriter = new BufferedWriter(new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile()))); 334 | 335 | bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n"); 336 | bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n"); 337 | bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n"); 338 | bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n"); 339 | 340 | bufferedWriter.close(); 341 | 342 | return jobOK; 343 | } 344 | 345 | protected boolean buildDictionary() throws IOException { 346 | FourSectionDictionary dictionary4mappers, dictionary4reducers; 347 | 348 | // if job not ran, read Counters 349 | if (!this.conf.runDictionary()) { 350 | 351 | System.out.println("Dictionary job not ran. 
Reading data from file."); 352 | 353 | BufferedReader reader = new BufferedReader(new InputStreamReader(this.dictionaryFS.open(this.conf.getDictionaryCountersFile()))); 354 | String line = reader.readLine(); 355 | while (line != null) { 356 | String[] data = line.split("="); 357 | switch (data[0]) { 358 | case HDTBuilderConfiguration.SHARED: 359 | this.numShared = Long.parseLong(data[1]); 360 | break; 361 | case HDTBuilderConfiguration.SUBJECTS: 362 | this.numSubjects = Long.parseLong(data[1]); 363 | break; 364 | case HDTBuilderConfiguration.PREDICATES: 365 | this.numPredicates = Long.parseLong(data[1]); 366 | break; 367 | case HDTBuilderConfiguration.OBJECTS: 368 | this.numObjects = Long.parseLong(data[1]); 369 | } 370 | line = reader.readLine(); 371 | } 372 | reader.close(); 373 | } 374 | 375 | TransientDictionarySection shared = new TransientDictionarySection(this.conf.getSpec()); 376 | TransientDictionarySection subjects = new TransientDictionarySection(this.conf.getSpec()); 377 | TransientDictionarySection predicates = new TransientDictionarySection(this.conf.getSpec()); 378 | TransientDictionarySection objects = new TransientDictionarySection(this.conf.getSpec()); 379 | 380 | 381 | 382 | if (this.dictionaryFS.exists(this.conf.getSharedSectionPath())) { 383 | System.out.println("Shared section = " + this.conf.getSharedSectionPath()); 384 | this.loadFromDir(shared, this.numShared, this.dictionaryFS, this.conf.getSharedSectionPath()); 385 | } 386 | 387 | this.loadFromDir(subjects, this.numSubjects, this.dictionaryFS, this.conf.getSubjectsSectionPath()); 388 | this.loadFromDir(predicates, this.numPredicates, this.dictionaryFS, this.conf.getPredicatesSectionPath()); 389 | this.loadFromDir(objects, this.numObjects, this.dictionaryFS, this.conf.getObjectsSectionPath()); 390 | 391 | System.out.println("Saving dictionary..."); 392 | this.dictionary = new FourSectionDictionary2(this.conf.getSpec(), subjects, predicates, objects, shared); 393 | 
this.saveDictionary(this.dictionary, this.dictionaryFS, this.conf.getDictionaryFile()); 394 | 395 | return true; 396 | 397 | } 398 | 399 | protected boolean runTriplesJobSampling() throws ClassNotFoundException, IOException, InterruptedException { 400 | Job job = null; 401 | boolean jobOK; 402 | BufferedWriter bufferedWriter; 403 | 404 | // if input path does not exists, fail 405 | if (!this.inputFS.exists(this.conf.getInputPath())) { 406 | System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath()); 407 | System.exit(-1); 408 | } 409 | 410 | // if dictionary output path does not exists, fail 411 | if (!this.dictionaryFS.exists(this.conf.getInputPath())) { 412 | System.out.println("Dictionary output path does not exist: " + this.conf.getInputPath()); 413 | System.exit(-1); 414 | } 415 | 416 | // if samples path exists, fail 417 | if (this.dictionaryFS.exists(this.conf.getTriplesSamplesPath())) { 418 | if (this.conf.getDeleteTriplesSamplesPath()) { // ... and option 419 | // provided, delete 420 | // recursively 421 | this.dictionaryFS.delete(this.conf.getTriplesSamplesPath(), true); 422 | } else { // ... 
and option not provided, fail 423 | System.out.println("Triples samples path does exist: " + this.conf.getTriplesSamplesPath()); 424 | System.out.println("Select other path or use option -dst to overwrite"); 425 | System.exit(-1); 426 | } 427 | } 428 | 429 | this.conf.setProperty("mapred.child.java.opts", "-XX:ErrorFile=/home/hadoop/tmp/hs_err_pid%p.log -Xmx2500m"); 430 | 431 | // Job to create a SequenceInputFormat 432 | job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 1"); 433 | 434 | job.setJarByClass(HDTBuilderDriver.class); 435 | 436 | FileInputFormat.addInputPath(job, this.conf.getInputPath()); 437 | FileOutputFormat.setOutputPath(job, this.conf.getTriplesSamplesPath()); 438 | 439 | job.setInputFormatClass(LzoTextInputFormat.class); 440 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); 441 | 442 | job.setMapperClass(TriplesSPOMapper.class); 443 | job.setSortComparatorClass(TripleSPOComparator.class); 444 | job.setGroupingComparatorClass(TripleSPOComparator.class); 445 | job.setMapOutputKeyClass(TripleSPOWritable.class); 446 | job.setMapOutputValueClass(NullWritable.class); 447 | job.setOutputKeyClass(TripleSPOWritable.class); 448 | job.setOutputValueClass(NullWritable.class); 449 | 450 | job.setNumReduceTasks(this.conf.getTriplesReducers()); 451 | 452 | DistributedCache.addCacheFile(this.conf.getDictionaryFile().toUri(), job.getConfiguration()); 453 | 454 | SequenceFileOutputFormat.setCompressOutput(job, true); 455 | SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class); 456 | SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); 457 | 458 | jobOK = job.waitForCompletion(true); 459 | 460 | this.numTriples = job.getCounters().findCounter(Counters.Triples).getValue(); 461 | bufferedWriter = new BufferedWriter(new OutputStreamWriter(this.triplesFS.create(this.conf.getTriplesCountersFile()))); 462 | 
bufferedWriter.write(this.numTriples.toString() + "\n"); 463 | bufferedWriter.close(); 464 | 465 | return jobOK; 466 | } 467 | 468 | protected boolean runTriplesJob() throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException { 469 | Job job = null; 470 | boolean jobOK; 471 | 472 | // if triples output path exists... 473 | if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) { 474 | if (this.conf.getDeleteTriplesOutputPath()) { // ... and option provided, delete recursively 475 | this.triplesFS.delete(this.conf.getTriplesOutputPath(), true); 476 | } else { // ... and option not provided, fail 477 | System.out.println("Triples output path does exist: " + this.conf.getTriplesOutputPath()); 478 | System.out.println("Select other path or use option -dt to overwrite"); 479 | System.exit(-1); 480 | } 481 | } 482 | 483 | job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 2"); 484 | 485 | job.setJarByClass(HDTBuilderDriver.class); 486 | 487 | FileInputFormat.addInputPath(job, this.conf.getTriplesSamplesPath()); 488 | FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath()); 489 | 490 | job.setInputFormatClass(SequenceFileInputFormat.class); 491 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); 492 | 493 | job.setSortComparatorClass(TripleSPOComparator.class); 494 | job.setGroupingComparatorClass(TripleSPOComparator.class); 495 | 496 | job.setPartitionerClass(TotalOrderPartitioner.class); 497 | 498 | job.setOutputKeyClass(TripleSPOWritable.class); 499 | job.setOutputValueClass(NullWritable.class); 500 | 501 | job.setNumReduceTasks(this.conf.getTriplesReducers()); 502 | 503 | System.out.println("Sampling started"); 504 | InputSampler.writePartitionFile(job, new InputSampler.IntervalSampler(this.conf.getSampleProbability())); 505 | String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration()); 506 | URI partitionUri = new URI(partitionFile 
+ "#" + TotalOrderPartitioner.DEFAULT_PATH); 507 | DistributedCache.addCacheFile(partitionUri, job.getConfiguration()); 508 | DistributedCache.createSymlink(job.getConfiguration()); 509 | System.out.println("Sampling finished"); 510 | 511 | SequenceFileOutputFormat.setCompressOutput(job, true); 512 | SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class); 513 | SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); 514 | 515 | jobOK = job.waitForCompletion(true); 516 | 517 | return jobOK; 518 | } 519 | 520 | protected boolean runTriplesJobWithOneJob() throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException { 521 | Job job = null; 522 | boolean jobOK; 523 | BufferedWriter bufferedWriter; 524 | 525 | // if input path does not exists, fail 526 | if (!this.inputFS.exists(this.conf.getInputPath())) { 527 | System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath()); 528 | System.exit(-1); 529 | } 530 | 531 | // if dictionary output path does not exists, fail 532 | if (!this.dictionaryFS.exists(this.conf.getInputPath())) { 533 | System.out.println("Dictionary output path does not exist: " + this.conf.getInputPath()); 534 | System.exit(-1); 535 | } 536 | 537 | // if triples output path exists... 538 | if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) { 539 | if (this.conf.getDeleteTriplesOutputPath()) { // ... and option provided, delete recursively 540 | this.triplesFS.delete(this.conf.getTriplesOutputPath(), true); 541 | } else { // ... 
and option not provided, fail 542 | System.out.println("Triples output path does exist: " + this.conf.getTriplesOutputPath()); 543 | System.out.println("Select other path or use option -dt to overwrite"); 544 | System.exit(-1); 545 | } 546 | } 547 | 548 | // Launch job 549 | this.conf.setProperty("mapred.child.java.opts", "-XX:ErrorFile=/home/hadoop/tmp/hs_err_pid%p.log -Xmx2500m"); 550 | 551 | job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName()); 552 | job.setJarByClass(HDTBuilderDriver.class); 553 | 554 | FileInputFormat.addInputPath(job, this.conf.getInputPath()); 555 | FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath()); 556 | 557 | job.setInputFormatClass(LzoTextInputFormat.class); 558 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); 559 | 560 | job.setMapperClass(TriplesSPOMapper.class); 561 | job.setSortComparatorClass(TripleSPOComparator.class); 562 | job.setMapOutputKeyClass(TripleSPOWritable.class); 563 | job.setMapOutputValueClass(NullWritable.class); 564 | 565 | job.setNumReduceTasks(this.conf.getTriplesReducers()); 566 | 567 | job.setOutputKeyClass(TripleSPOWritable.class); 568 | job.setOutputValueClass(NullWritable.class); 569 | 570 | DistributedCache.addCacheFile(this.conf.getDictionaryFile().toUri(), job.getConfiguration()); 571 | // DistributedCache.addCacheFile(this.conf.getDictionaryMapFile().toUri(), job.getConfiguration()); 572 | // DistributedCache.addCacheFile(this.conf.getDictionaryReduceFile().toUri(), job.getConfiguration()); 573 | 574 | jobOK = job.waitForCompletion(true); 575 | 576 | this.numTriples = job.getCounters().findCounter(Counters.Triples).getValue(); 577 | bufferedWriter = new BufferedWriter(new OutputStreamWriter(this.triplesFS.create(this.conf.getTriplesCountersFile()))); 578 | bufferedWriter.write(this.numTriples.toString() + "\n"); 579 | bufferedWriter.close(); 580 | 581 | return jobOK; 582 | } 583 | 584 | protected boolean buidHDT() throws 
IOException { 585 | BufferedOutputStream output; 586 | TransientHDT hdt = new TransientHDT(this.conf.getSpec()); 587 | TransientBitMapTriples triples = new TransientBitMapTriples(this.conf.getSpec(), this.triplesFS, new Path("temp")); 588 | 589 | // if dictionary not built, load it 590 | if (this.dictionary == null) { 591 | System.out.println("Dictionary not built. Reading data from " + this.conf.getDictionaryFile()); 592 | this.dictionary = this.loadDictionary(this.dictionaryFS, this.conf.getDictionaryFile()); 593 | } 594 | 595 | // if maxvalues not loaded, read Counters 596 | if (!this.conf.runDictionary()) { 597 | 598 | System.out.println("Dictionary Samples job not ran. Reading data from file."); 599 | 600 | BufferedReader reader = new BufferedReader(new InputStreamReader(this.dictionaryFS.open(this.conf.getDictionaryCountersFile()))); 601 | String line = reader.readLine(); 602 | while (line != null) { 603 | String[] data = line.split("="); 604 | switch (data[0]) { 605 | case HDTBuilderConfiguration.SHARED: 606 | this.numShared = Long.parseLong(data[1]); 607 | break; 608 | case HDTBuilderConfiguration.SUBJECTS: 609 | this.numSubjects = Long.parseLong(data[1]); 610 | break; 611 | case HDTBuilderConfiguration.PREDICATES: 612 | this.numPredicates = Long.parseLong(data[1]); 613 | break; 614 | case HDTBuilderConfiguration.OBJECTS: 615 | this.numObjects = Long.parseLong(data[1]); 616 | } 617 | line = reader.readLine(); 618 | } 619 | reader.close(); 620 | } 621 | 622 | // if triples job not ran, read Counters 623 | if (!this.conf.runTriples()) { 624 | System.out.println("Triples job nor ran. 
Reading data from " + this.conf.getTriplesCountersFile()); 625 | BufferedReader reader = new BufferedReader(new InputStreamReader(this.dictionaryFS.open(this.conf.getTriplesCountersFile()))); 626 | this.numTriples = Long.parseLong(reader.readLine()); 627 | reader.close(); 628 | } 629 | 630 | this.loadFromDir(triples, this.numTriples, this.numPredicates, (this.numShared + this.numObjects), this.triplesFS, this.conf.getTriplesOutputPath()); 631 | 632 | hdt.setDictionary(this.dictionary); 633 | hdt.setTriples(triples); 634 | hdt.populateHeaderStructure(this.conf.getBaseURI()); 635 | 636 | output = new BufferedOutputStream(this.triplesFS.create(this.conf.getHDTFile())); 637 | hdt.saveToHDT(output, this.listener); 638 | output.close(); 639 | 640 | return true; 641 | } 642 | 643 | protected void loadFromDir(TransientElement part, long numentries, FileSystem fs, Path path) throws IOException { 644 | PathFilter filter = new PathFilter() { 645 | @Override 646 | public boolean accept(Path path) { 647 | return !path.getName().startsWith("_"); 648 | } 649 | }; 650 | FileStatus[] status = fs.listStatus(path, filter); 651 | 652 | if (status.length == 0) { 653 | System.out.println("Path [" + path + "] has no files. 
Initializing section."); 654 | part.initialize(0); 655 | } else { 656 | Arrays.sort(status, new FileStatusComparator()); 657 | 658 | System.out.println("Initializing section " + path); 659 | part.initialize(numentries); 660 | for (FileStatus file : status) { 661 | System.out.println("Reading file [" + file.getPath() + "]"); 662 | SequenceFile.Reader reader = new SequenceFile.Reader(fs, file.getPath(), this.conf.getConfigurationObject()); 663 | part.load(reader, this.listener); 664 | reader.close(); 665 | } 666 | System.out.println("Closing section " + path); 667 | part.close(); 668 | } 669 | } 670 | 671 | protected void loadFromDir(TransientBitMapTriples part, long numentries, long maxpredicate, long maxobject, FileSystem fs, Path path) throws IOException { 672 | PathFilter filter = new PathFilter() { 673 | @Override 674 | public boolean accept(Path path) { 675 | return !path.getName().startsWith("_"); 676 | } 677 | }; 678 | FileStatus[] status = fs.listStatus(path, filter); 679 | 680 | if (status.length == 0) { 681 | System.out.println("Path [" + path + "] has no files. 
Initializing section."); 682 | part.initialize(0, 0); 683 | } else { 684 | Arrays.sort(status, new FileStatusComparator()); 685 | 686 | System.out.println("Initializing section " + path); 687 | part.initialize(numentries, maxpredicate, maxobject); 688 | for (FileStatus file : status) { 689 | System.out.println("Reading file [" + file.getPath() + "]"); 690 | SequenceFile.Reader reader = new SequenceFile.Reader(fs, file.getPath(), this.conf.getConfigurationObject()); 691 | part.load(reader, this.listener); 692 | reader.close(); 693 | } 694 | System.out.println("Closing section " + path); 695 | part.close(); 696 | } 697 | } 698 | 699 | protected FourSectionDictionary2 loadDictionary(FileSystem fs, Path dictionaryPath) throws IOException { 700 | BufferedInputStream input = new BufferedInputStream(fs.open(dictionaryPath)); 701 | FourSectionDictionary2 dictionary = new FourSectionDictionary2(this.conf.getSpec()); 702 | ControlInformation ci = new ControlInformation(); 703 | ci.clear(); 704 | ci.load(input); 705 | dictionary.load(input, ci, this.listener); 706 | return dictionary; 707 | } 708 | 709 | protected void saveDictionary(FourSectionDictionary2 dictionary, FileSystem fs, Path dictionaryPath) throws IOException { 710 | BufferedOutputStream output = new BufferedOutputStream(fs.create(dictionaryPath)); 711 | dictionary.save(output, new ControlInformation(), this.listener); 712 | output.close(); 713 | } 714 | 715 | } 716 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/dictionary/DictionaryCombiner.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. 
Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. 
Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.dictionary; 24 | 25 | import java.io.IOException; 26 | 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.hadoop.mapreduce.Reducer; 29 | 30 | public class DictionaryCombiner extends Reducer { 31 | 32 | @Override 33 | protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 34 | boolean isSubject = false, isPredicate = false, isObject = false; 35 | String output = new String(); 36 | 37 | for (Text value : values) { 38 | if (value.toString().contains("S")) 39 | isSubject = true; 40 | if (value.toString().contains("P")) 41 | isPredicate = true; 42 | if (value.toString().contains("O")) 43 | isObject = true; 44 | } 45 | 46 | if (isSubject) 47 | output = output.concat("S"); 48 | if (isPredicate) 49 | output = output.concat("P"); 50 | if (isObject) 51 | output = output.concat("O"); 52 | 53 | context.write(key, new Text(output)); 54 | 55 | // if (key.toString().toString().contains("Forest Green is an unincorporated community in southeastern Chariton County")) 56 | // System.out.println("Combiner: " + key.toString()); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/dictionary/DictionaryMapper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.dictionary; 24 | 25 | import java.io.IOException; 26 | 27 | import org.apache.hadoop.io.LongWritable; 28 | import org.apache.hadoop.io.Text; 29 | import org.apache.hadoop.mapreduce.Mapper; 30 | import org.rdfhdt.hdt.exceptions.ParserException; 31 | import org.rdfhdt.hdt.triples.TripleString; 32 | 33 | public class DictionaryMapper extends Mapper { 34 | 35 | @Override 36 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 37 | 38 | TripleString triple = new TripleString(); 39 | try { 40 | triple.read(value.toString()); 41 | } catch (ParserException e) { 42 | // TODO Auto-generated catch block 43 | e.printStackTrace(); 44 | } 45 | 46 | context.write(new Text(triple.getSubject().toString()), new Text("S")); 47 | context.write(new Text(triple.getPredicate().toString()), new Text("P")); 48 | context.write(new Text(triple.getObject().toString()), new Text("O")); 49 | 50 | // if (triple.getObject().toString().toString().contains("Forest Green is an unincorporated community in southeastern Chariton County")) 51 | // System.out.println("Mapper: " + triple.getObject().toString()); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/dictionary/DictionaryReducer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. 
Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. 
Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.dictionary; 24 | 25 | import java.io.IOException; 26 | 27 | import org.apache.hadoop.io.NullWritable; 28 | import org.apache.hadoop.io.Text; 29 | import org.apache.hadoop.mapreduce.Reducer; 30 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 31 | import org.rdfhdt.mrbuilder.HDTBuilderConfiguration; 32 | import org.rdfhdt.mrbuilder.HDTBuilderDriver.Counters; 33 | 34 | public class DictionaryReducer extends Reducer { 35 | 36 | protected MultipleOutputs output; 37 | 38 | @Override 39 | protected void setup(Context context) throws IOException, InterruptedException { 40 | this.output = new MultipleOutputs(context); 41 | super.setup(context); 42 | } 43 | 44 | @Override 45 | protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 46 | boolean isSubject = false, isPredicate = false, isObject = false; 47 | 48 | //key = new Text(UnicodeEscape.escapeString(key.toString())); 49 | 50 | for (Text value : values) { 51 | if (value.toString().contains("S")) 52 | isSubject = true; 53 | if (value.toString().contains("P")) 54 | isPredicate = true; 55 | if (value.toString().contains("O")) 56 | isObject = true; 57 | } 58 | 59 | if (isSubject && isObject) { 60 | this.output.write(HDTBuilderConfiguration.SHARED, key, NullWritable.get(), HDTBuilderConfiguration.SHARED_OUTPUT_PATH); 61 | context.getCounter(Counters.Shared).increment(1); 62 | } else { 63 | if (isSubject) { 64 | this.output.write(HDTBuilderConfiguration.SUBJECTS, key, NullWritable.get(), HDTBuilderConfiguration.SUBJECTS_OUTPUT_PATH); 65 | context.getCounter(Counters.Subjects).increment(1); 66 | } 67 | if (isObject) { 68 | this.output.write(HDTBuilderConfiguration.OBJECTS, key, NullWritable.get(), HDTBuilderConfiguration.OBJECTS_OUTPUT_PATH); 69 | context.getCounter(Counters.Objects).increment(1); 70 | } 71 | } 72 | if (isPredicate) { 73 | 
this.output.write(HDTBuilderConfiguration.PREDICATES, key, NullWritable.get(), HDTBuilderConfiguration.PREDICATES_OUTPUT_PATH); 74 | context.getCounter(Counters.Predicates).increment(1); 75 | } 76 | 77 | // if (key.toString().contains("Forest Green is an unincorporated community in southeastern Chariton County")) 78 | // System.out.println("Reducer: " + key.toString()); 79 | } 80 | 81 | @Override 82 | protected void cleanup(Context context) throws IOException, InterruptedException { 83 | this.output.close(); 84 | super.cleanup(context); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/dictionary/DictionarySamplerMapper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. 
Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.dictionary; 24 | 25 | import java.io.IOException; 26 | 27 | import org.apache.hadoop.io.LongWritable; 28 | import org.apache.hadoop.io.Text; 29 | import org.apache.hadoop.mapreduce.Mapper; 30 | import org.rdfhdt.hdt.exceptions.ParserException; 31 | import org.rdfhdt.hdt.triples.TripleString; 32 | 33 | public class DictionarySamplerMapper extends Mapper { 34 | 35 | @Override 36 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 37 | 38 | TripleString triple = new TripleString(); 39 | try { 40 | triple.read(value.toString()); 41 | } catch (ParserException e) { 42 | // TODO Auto-generated catch block 43 | e.printStackTrace(); 44 | } 45 | 46 | context.write(new Text(triple.getSubject().toString()), new Text("S")); 47 | context.write(new Text(triple.getPredicate().toString()), new Text("P")); 48 | context.write(new Text(triple.getObject().toString()), new Text("O")); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/dictionary/DictionarySamplerReducer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.dictionary; 24 | 25 | import java.io.IOException; 26 | 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.hadoop.mapreduce.Reducer; 29 | 30 | public class DictionarySamplerReducer extends Reducer { 31 | 32 | @Override 33 | protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 34 | boolean isSubject = false, isPredicate = false, isObject = false; 35 | String outputValue = ""; 36 | 37 | for (Text value : values) { 38 | if (value.toString().contains("S")) 39 | isSubject = true; 40 | if (value.toString().contains("P")) 41 | isPredicate = true; 42 | if (value.toString().contains("O")) 43 | isObject = true; 44 | } 45 | 46 | if (isSubject) 47 | outputValue = outputValue.concat("S"); 48 | if (isPredicate) 49 | outputValue = outputValue.concat("P"); 50 | if (isObject) 51 | outputValue = outputValue.concat("O"); 52 | 53 | context.write(key, new Text(outputValue)); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/io/TripleSPOComparator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. 
Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.io; 24 | 25 | 26 | public class TripleSPOComparator extends TripleComparator { 27 | 28 | 29 | public TripleSPOComparator() { 30 | super(TripleSPOWritable.class, true); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/io/TripleSPOWritable.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 
package org.rdfhdt.mrbuilder.io;

import org.apache.hadoop.io.LongWritable;

/**
 * Writable triple of dictionary IDs ordered as subject-predicate-object.
 * <p>
 * The storage and serialization of the three components live in the parent
 * {@code TripleWritable}; this subclass only fixes the component order by the
 * order of the arguments passed to the parent constructor.
 */
public class TripleSPOWritable extends TripleWritable {

    /**
     * Creates an empty SPO triple; the three components are initialized to
     * fresh zero-valued {@link LongWritable}s, as required for Hadoop to
     * instantiate and then deserialize the writable reflectively.
     */
    public TripleSPOWritable() {
        super(new LongWritable(), new LongWritable(), new LongWritable());
    }
}
13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.triples; 24 | 25 | import org.apache.hadoop.io.LongWritable; 26 | import org.apache.hadoop.io.NullWritable; 27 | import org.rdfhdt.hdt.enums.TripleComponentRole; 28 | import org.rdfhdt.hdt.triples.TripleString; 29 | import org.rdfhdt.mrbuilder.io.TripleSPOWritable; 30 | 31 | public class TriplesSPOMapper extends TriplesMapper { 32 | 33 | /* 34 | * (non-Javadoc) 35 | * 36 | * @see org.rdfhdt.mrbuilder.triples.TriplesMapper#key(org.rdfhdt.hdt.triples.TripleString) 37 | */ 38 | @Override 39 | protected TripleSPOWritable key(TripleString tripleString) throws InterruptedException { 40 | long subject, predicate, object; 41 | 42 | if ((subject = this.dictionary.stringToId(tripleString.getSubject(), TripleComponentRole.SUBJECT)) == -1) { 43 | System.out.println("Subject nor found"); 44 | System.out.println("Subject [" + tripleString.getSubject() + "]"); 45 | System.out.println("Predicate [" + tripleString.getPredicate() + "]"); 46 | System.out.println("Object [" + tripleString.getObject() + "]"); 47 | throw new InterruptedException("Dictionary not loaded correctly"); 48 | } 49 | if ((predicate = this.dictionary.stringToId(tripleString.getPredicate(), TripleComponentRole.PREDICATE)) == -1) 50 | { 51 | System.out.println("Predicate nor found"); 52 | System.out.println("Subject [" + tripleString.getSubject() + "]"); 53 | System.out.println("Predicate [" + tripleString.getPredicate() + "]"); 54 | System.out.println("Object [" + tripleString.getObject() + "]"); 55 | throw new 
InterruptedException("Dictionary not loaded correctly"); 56 | } 57 | if ((object = this.dictionary.stringToId(tripleString.getObject(), TripleComponentRole.OBJECT)) == -1) 58 | { 59 | System.out.println("Object nor found"); 60 | System.out.println("Subject [" + tripleString.getSubject() + "]"); 61 | System.out.println("Predicate [" + tripleString.getPredicate() + "]"); 62 | System.out.println("Object [" + tripleString.getObject() + "]"); 63 | throw new InterruptedException("Dictionary not loaded correctly"); 64 | } 65 | 66 | TripleSPOWritable tripleIDs = new TripleSPOWritable(); 67 | tripleIDs.setSubject(new LongWritable(subject)); 68 | tripleIDs.setPredicate(new LongWritable(predicate)); 69 | tripleIDs.setObject(new LongWritable(object)); 70 | return tripleIDs; 71 | } 72 | 73 | /* 74 | * (non-Javadoc) 75 | * 76 | * @see org.rdfhdt.mrbuilder.triples.TriplesMapper#value(org.rdfhdt.hdt.triples.TripleString) 77 | */ 78 | @Override 79 | protected NullWritable value(TripleString tripleString) { 80 | return NullWritable.get(); 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/util/FileStatusComparator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 
package org.rdfhdt.mrbuilder.util;

import java.util.Comparator;

import org.apache.hadoop.fs.FileStatus;


/**
 * Orders {@link FileStatus} entries lexicographically by file name (the last
 * path component), e.g. to process part-files of a MapReduce job in a stable,
 * deterministic sequence.
 * <p>
 * Note: the interface is parameterized as {@code Comparator<FileStatus>};
 * against the raw {@code Comparator} the {@code @Override} on
 * {@code compare(FileStatus, FileStatus)} would not compile, since it does not
 * override the erased {@code compare(Object, Object)}.
 */
public class FileStatusComparator implements Comparator<FileStatus> {

    /*
     * (non-Javadoc)
     *
     * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
     */
    /**
     * Compares two file statuses by the name of their paths.
     *
     * @return a negative value, zero, or a positive value as the first file
     *         name is lexicographically less than, equal to, or greater than
     *         the second
     */
    @Override
    public int compare(FileStatus fs1, FileStatus fs2) {
        return fs1.getPath().getName().compareTo(fs2.getPath().getName());
    }

}