├── .gitignore ├── COPYRIGHT ├── README ├── config ├── HDTMRBuilder.xml ├── lubm-dictionary.xml └── lubm-triples.xml ├── iface └── org │ └── rdfhdt │ ├── hdt │ └── trans │ │ └── TransientElement.java │ └── mrbuilder │ ├── io │ ├── TripleComparator.java │ └── TripleWritable.java │ └── triples │ └── TriplesMapper.java ├── pom.xml └── src └── org └── rdfhdt ├── hdt ├── compact │ ├── bitmap │ │ └── TransientBitmap375.java │ └── sequence │ │ └── TransientSequenceLog64.java ├── dictionary │ └── impl │ │ ├── FourSectionDictionary2.java │ │ └── section │ │ ├── DictionarySectionFactory2.java │ │ └── TransientDictionarySection.java ├── hdt │ └── impl │ │ └── TransientHDT.java └── triples │ ├── ScapedTripleString.java │ └── impl │ └── TransientBitMapTriples.java ├── listener └── HDTBuilderListener.java └── mrbuilder ├── HDTBuilderConfiguration.java ├── HDTBuilderDriver.java ├── dictionary ├── DictionaryCombiner.java ├── DictionaryMapper.java ├── DictionaryReducer.java ├── DictionarySamplerMapper.java └── DictionarySamplerReducer.java ├── io ├── TripleSPOComparator.java └── TripleSPOWritable.java ├── triples └── TriplesSPOMapper.java └── util └── FileStatusComparator.java /.gitignore: -------------------------------------------------------------------------------- 1 | # Mac OS X 2 | .DS_Store 3 | 4 | # Editor backup files 5 | *~ 6 | -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 2.1, February 1999 3 | 4 | Copyright (C) 1991, 1999 Free Software Foundation, Inc. 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | [This is the first released version of the Lesser GPL. 
It also counts 10 | as the successor of the GNU Library Public License, version 2, hence 11 | the version number 2.1.] 12 | 13 | Preamble 14 | 15 | The licenses for most software are designed to take away your 16 | freedom to share and change it. By contrast, the GNU General Public 17 | Licenses are intended to guarantee your freedom to share and change 18 | free software--to make sure the software is free for all its users. 19 | 20 | This license, the Lesser General Public License, applies to some 21 | specially designated software packages--typically libraries--of the 22 | Free Software Foundation and other authors who decide to use it. You 23 | can use it too, but we suggest you first think carefully about whether 24 | this license or the ordinary General Public License is the better 25 | strategy to use in any particular case, based on the explanations below. 26 | 27 | When we speak of free software, we are referring to freedom of use, 28 | not price. Our General Public Licenses are designed to make sure that 29 | you have the freedom to distribute copies of free software (and charge 30 | for this service if you wish); that you receive source code or can get 31 | it if you want it; that you can change the software and use pieces of 32 | it in new free programs; and that you are informed that you can do 33 | these things. 34 | 35 | To protect your rights, we need to make restrictions that forbid 36 | distributors to deny you these rights or to ask you to surrender these 37 | rights. These restrictions translate to certain responsibilities for 38 | you if you distribute copies of the library or if you modify it. 39 | 40 | For example, if you distribute copies of the library, whether gratis 41 | or for a fee, you must give the recipients all the rights that we gave 42 | you. You must make sure that they, too, receive or can get the source 43 | code. 
If you link other code with the library, you must provide 44 | complete object files to the recipients, so that they can relink them 45 | with the library after making changes to the library and recompiling 46 | it. And you must show them these terms so they know their rights. 47 | 48 | We protect your rights with a two-step method: (1) we copyright the 49 | library, and (2) we offer you this license, which gives you legal 50 | permission to copy, distribute and/or modify the library. 51 | 52 | To protect each distributor, we want to make it very clear that 53 | there is no warranty for the free library. Also, if the library is 54 | modified by someone else and passed on, the recipients should know 55 | that what they have is not the original version, so that the original 56 | author's reputation will not be affected by problems that might be 57 | introduced by others. 58 | 59 | Finally, software patents pose a constant threat to the existence of 60 | any free program. We wish to make sure that a company cannot 61 | effectively restrict the users of a free program by obtaining a 62 | restrictive license from a patent holder. Therefore, we insist that 63 | any patent license obtained for a version of the library must be 64 | consistent with the full freedom of use specified in this license. 65 | 66 | Most GNU software, including some libraries, is covered by the 67 | ordinary GNU General Public License. This license, the GNU Lesser 68 | General Public License, applies to certain designated libraries, and 69 | is quite different from the ordinary General Public License. We use 70 | this license for certain libraries in order to permit linking those 71 | libraries into non-free programs. 72 | 73 | When a program is linked with a library, whether statically or using 74 | a shared library, the combination of the two is legally speaking a 75 | combined work, a derivative of the original library. 
The ordinary 76 | General Public License therefore permits such linking only if the 77 | entire combination fits its criteria of freedom. The Lesser General 78 | Public License permits more lax criteria for linking other code with 79 | the library. 80 | 81 | We call this license the "Lesser" General Public License because it 82 | does Less to protect the user's freedom than the ordinary General 83 | Public License. It also provides other free software developers Less 84 | of an advantage over competing non-free programs. These disadvantages 85 | are the reason we use the ordinary General Public License for many 86 | libraries. However, the Lesser license provides advantages in certain 87 | special circumstances. 88 | 89 | For example, on rare occasions, there may be a special need to 90 | encourage the widest possible use of a certain library, so that it becomes 91 | a de-facto standard. To achieve this, non-free programs must be 92 | allowed to use the library. A more frequent case is that a free 93 | library does the same job as widely used non-free libraries. In this 94 | case, there is little to gain by limiting the free library to free 95 | software only, so we use the Lesser General Public License. 96 | 97 | In other cases, permission to use a particular library in non-free 98 | programs enables a greater number of people to use a large body of 99 | free software. For example, permission to use the GNU C Library in 100 | non-free programs enables many more people to use the whole GNU 101 | operating system, as well as its variant, the GNU/Linux operating 102 | system. 103 | 104 | Although the Lesser General Public License is Less protective of the 105 | users' freedom, it does ensure that the user of a program that is 106 | linked with the Library has the freedom and the wherewithal to run 107 | that program using a modified version of the Library. 108 | 109 | The precise terms and conditions for copying, distribution and 110 | modification follow. 
Pay close attention to the difference between a 111 | "work based on the library" and a "work that uses the library". The 112 | former contains code derived from the library, whereas the latter must 113 | be combined with the library in order to run. 114 | 115 | GNU LESSER GENERAL PUBLIC LICENSE 116 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 117 | 118 | 0. This License Agreement applies to any software library or other 119 | program which contains a notice placed by the copyright holder or 120 | other authorized party saying it may be distributed under the terms of 121 | this Lesser General Public License (also called "this License"). 122 | Each licensee is addressed as "you". 123 | 124 | A "library" means a collection of software functions and/or data 125 | prepared so as to be conveniently linked with application programs 126 | (which use some of those functions and data) to form executables. 127 | 128 | The "Library", below, refers to any such software library or work 129 | which has been distributed under these terms. A "work based on the 130 | Library" means either the Library or any derivative work under 131 | copyright law: that is to say, a work containing the Library or a 132 | portion of it, either verbatim or with modifications and/or translated 133 | straightforwardly into another language. (Hereinafter, translation is 134 | included without limitation in the term "modification".) 135 | 136 | "Source code" for a work means the preferred form of the work for 137 | making modifications to it. For a library, complete source code means 138 | all the source code for all modules it contains, plus any associated 139 | interface definition files, plus the scripts used to control compilation 140 | and installation of the library. 141 | 142 | Activities other than copying, distribution and modification are not 143 | covered by this License; they are outside its scope. 
The act of 144 | running a program using the Library is not restricted, and output from 145 | such a program is covered only if its contents constitute a work based 146 | on the Library (independent of the use of the Library in a tool for 147 | writing it). Whether that is true depends on what the Library does 148 | and what the program that uses the Library does. 149 | 150 | 1. You may copy and distribute verbatim copies of the Library's 151 | complete source code as you receive it, in any medium, provided that 152 | you conspicuously and appropriately publish on each copy an 153 | appropriate copyright notice and disclaimer of warranty; keep intact 154 | all the notices that refer to this License and to the absence of any 155 | warranty; and distribute a copy of this License along with the 156 | Library. 157 | 158 | You may charge a fee for the physical act of transferring a copy, 159 | and you may at your option offer warranty protection in exchange for a 160 | fee. 161 | 162 | 2. You may modify your copy or copies of the Library or any portion 163 | of it, thus forming a work based on the Library, and copy and 164 | distribute such modifications or work under the terms of Section 1 165 | above, provided that you also meet all of these conditions: 166 | 167 | a) The modified work must itself be a software library. 168 | 169 | b) You must cause the files modified to carry prominent notices 170 | stating that you changed the files and the date of any change. 171 | 172 | c) You must cause the whole of the work to be licensed at no 173 | charge to all third parties under the terms of this License. 
174 | 175 | d) If a facility in the modified Library refers to a function or a 176 | table of data to be supplied by an application program that uses 177 | the facility, other than as an argument passed when the facility 178 | is invoked, then you must make a good faith effort to ensure that, 179 | in the event an application does not supply such function or 180 | table, the facility still operates, and performs whatever part of 181 | its purpose remains meaningful. 182 | 183 | (For example, a function in a library to compute square roots has 184 | a purpose that is entirely well-defined independent of the 185 | application. Therefore, Subsection 2d requires that any 186 | application-supplied function or table used by this function must 187 | be optional: if the application does not supply it, the square 188 | root function must still compute square roots.) 189 | 190 | These requirements apply to the modified work as a whole. If 191 | identifiable sections of that work are not derived from the Library, 192 | and can be reasonably considered independent and separate works in 193 | themselves, then this License, and its terms, do not apply to those 194 | sections when you distribute them as separate works. But when you 195 | distribute the same sections as part of a whole which is a work based 196 | on the Library, the distribution of the whole must be on the terms of 197 | this License, whose permissions for other licensees extend to the 198 | entire whole, and thus to each and every part regardless of who wrote 199 | it. 200 | 201 | Thus, it is not the intent of this section to claim rights or contest 202 | your rights to work written entirely by you; rather, the intent is to 203 | exercise the right to control the distribution of derivative or 204 | collective works based on the Library. 
205 | 206 | In addition, mere aggregation of another work not based on the Library 207 | with the Library (or with a work based on the Library) on a volume of 208 | a storage or distribution medium does not bring the other work under 209 | the scope of this License. 210 | 211 | 3. You may opt to apply the terms of the ordinary GNU General Public 212 | License instead of this License to a given copy of the Library. To do 213 | this, you must alter all the notices that refer to this License, so 214 | that they refer to the ordinary GNU General Public License, version 2, 215 | instead of to this License. (If a newer version than version 2 of the 216 | ordinary GNU General Public License has appeared, then you can specify 217 | that version instead if you wish.) Do not make any other change in 218 | these notices. 219 | 220 | Once this change is made in a given copy, it is irreversible for 221 | that copy, so the ordinary GNU General Public License applies to all 222 | subsequent copies and derivative works made from that copy. 223 | 224 | This option is useful when you wish to copy part of the code of 225 | the Library into a program that is not a library. 226 | 227 | 4. You may copy and distribute the Library (or a portion or 228 | derivative of it, under Section 2) in object code or executable form 229 | under the terms of Sections 1 and 2 above provided that you accompany 230 | it with the complete corresponding machine-readable source code, which 231 | must be distributed under the terms of Sections 1 and 2 above on a 232 | medium customarily used for software interchange. 233 | 234 | If distribution of object code is made by offering access to copy 235 | from a designated place, then offering equivalent access to copy the 236 | source code from the same place satisfies the requirement to 237 | distribute the source code, even though third parties are not 238 | compelled to copy the source along with the object code. 239 | 240 | 5. 
A program that contains no derivative of any portion of the 241 | Library, but is designed to work with the Library by being compiled or 242 | linked with it, is called a "work that uses the Library". Such a 243 | work, in isolation, is not a derivative work of the Library, and 244 | therefore falls outside the scope of this License. 245 | 246 | However, linking a "work that uses the Library" with the Library 247 | creates an executable that is a derivative of the Library (because it 248 | contains portions of the Library), rather than a "work that uses the 249 | library". The executable is therefore covered by this License. 250 | Section 6 states terms for distribution of such executables. 251 | 252 | When a "work that uses the Library" uses material from a header file 253 | that is part of the Library, the object code for the work may be a 254 | derivative work of the Library even though the source code is not. 255 | Whether this is true is especially significant if the work can be 256 | linked without the Library, or if the work is itself a library. The 257 | threshold for this to be true is not precisely defined by law. 258 | 259 | If such an object file uses only numerical parameters, data 260 | structure layouts and accessors, and small macros and small inline 261 | functions (ten lines or less in length), then the use of the object 262 | file is unrestricted, regardless of whether it is legally a derivative 263 | work. (Executables containing this object code plus portions of the 264 | Library will still fall under Section 6.) 265 | 266 | Otherwise, if the work is a derivative of the Library, you may 267 | distribute the object code for the work under the terms of Section 6. 268 | Any executables containing that work also fall under Section 6, 269 | whether or not they are linked directly with the Library itself. 270 | 271 | 6. 
As an exception to the Sections above, you may also combine or 272 | link a "work that uses the Library" with the Library to produce a 273 | work containing portions of the Library, and distribute that work 274 | under terms of your choice, provided that the terms permit 275 | modification of the work for the customer's own use and reverse 276 | engineering for debugging such modifications. 277 | 278 | You must give prominent notice with each copy of the work that the 279 | Library is used in it and that the Library and its use are covered by 280 | this License. You must supply a copy of this License. If the work 281 | during execution displays copyright notices, you must include the 282 | copyright notice for the Library among them, as well as a reference 283 | directing the user to the copy of this License. Also, you must do one 284 | of these things: 285 | 286 | a) Accompany the work with the complete corresponding 287 | machine-readable source code for the Library including whatever 288 | changes were used in the work (which must be distributed under 289 | Sections 1 and 2 above); and, if the work is an executable linked 290 | with the Library, with the complete machine-readable "work that 291 | uses the Library", as object code and/or source code, so that the 292 | user can modify the Library and then relink to produce a modified 293 | executable containing the modified Library. (It is understood 294 | that the user who changes the contents of definitions files in the 295 | Library will not necessarily be able to recompile the application 296 | to use the modified definitions.) 297 | 298 | b) Use a suitable shared library mechanism for linking with the 299 | Library. 
A suitable mechanism is one that (1) uses at run time a 300 | copy of the library already present on the user's computer system, 301 | rather than copying library functions into the executable, and (2) 302 | will operate properly with a modified version of the library, if 303 | the user installs one, as long as the modified version is 304 | interface-compatible with the version that the work was made with. 305 | 306 | c) Accompany the work with a written offer, valid for at 307 | least three years, to give the same user the materials 308 | specified in Subsection 6a, above, for a charge no more 309 | than the cost of performing this distribution. 310 | 311 | d) If distribution of the work is made by offering access to copy 312 | from a designated place, offer equivalent access to copy the above 313 | specified materials from the same place. 314 | 315 | e) Verify that the user has already received a copy of these 316 | materials or that you have already sent this user a copy. 317 | 318 | For an executable, the required form of the "work that uses the 319 | Library" must include any data and utility programs needed for 320 | reproducing the executable from it. However, as a special exception, 321 | the materials to be distributed need not include anything that is 322 | normally distributed (in either source or binary form) with the major 323 | components (compiler, kernel, and so on) of the operating system on 324 | which the executable runs, unless that component itself accompanies 325 | the executable. 326 | 327 | It may happen that this requirement contradicts the license 328 | restrictions of other proprietary libraries that do not normally 329 | accompany the operating system. Such a contradiction means you cannot 330 | use both them and the Library together in an executable that you 331 | distribute. 332 | 333 | 7. 
You may place library facilities that are a work based on the 334 | Library side-by-side in a single library together with other library 335 | facilities not covered by this License, and distribute such a combined 336 | library, provided that the separate distribution of the work based on 337 | the Library and of the other library facilities is otherwise 338 | permitted, and provided that you do these two things: 339 | 340 | a) Accompany the combined library with a copy of the same work 341 | based on the Library, uncombined with any other library 342 | facilities. This must be distributed under the terms of the 343 | Sections above. 344 | 345 | b) Give prominent notice with the combined library of the fact 346 | that part of it is a work based on the Library, and explaining 347 | where to find the accompanying uncombined form of the same work. 348 | 349 | 8. You may not copy, modify, sublicense, link with, or distribute 350 | the Library except as expressly provided under this License. Any 351 | attempt otherwise to copy, modify, sublicense, link with, or 352 | distribute the Library is void, and will automatically terminate your 353 | rights under this License. However, parties who have received copies, 354 | or rights, from you under this License will not have their licenses 355 | terminated so long as such parties remain in full compliance. 356 | 357 | 9. You are not required to accept this License, since you have not 358 | signed it. However, nothing else grants you permission to modify or 359 | distribute the Library or its derivative works. These actions are 360 | prohibited by law if you do not accept this License. Therefore, by 361 | modifying or distributing the Library (or any work based on the 362 | Library), you indicate your acceptance of this License to do so, and 363 | all its terms and conditions for copying, distributing or modifying 364 | the Library or works based on it. 365 | 366 | 10. 
Each time you redistribute the Library (or any work based on the 367 | Library), the recipient automatically receives a license from the 368 | original licensor to copy, distribute, link with or modify the Library 369 | subject to these terms and conditions. You may not impose any further 370 | restrictions on the recipients' exercise of the rights granted herein. 371 | You are not responsible for enforcing compliance by third parties with 372 | this License. 373 | 374 | 11. If, as a consequence of a court judgment or allegation of patent 375 | infringement or for any other reason (not limited to patent issues), 376 | conditions are imposed on you (whether by court order, agreement or 377 | otherwise) that contradict the conditions of this License, they do not 378 | excuse you from the conditions of this License. If you cannot 379 | distribute so as to satisfy simultaneously your obligations under this 380 | License and any other pertinent obligations, then as a consequence you 381 | may not distribute the Library at all. For example, if a patent 382 | license would not permit royalty-free redistribution of the Library by 383 | all those who receive copies directly or indirectly through you, then 384 | the only way you could satisfy both it and this License would be to 385 | refrain entirely from distribution of the Library. 386 | 387 | If any portion of this section is held invalid or unenforceable under any 388 | particular circumstance, the balance of the section is intended to apply, 389 | and the section as a whole is intended to apply in other circumstances. 390 | 391 | It is not the purpose of this section to induce you to infringe any 392 | patents or other property right claims or to contest validity of any 393 | such claims; this section has the sole purpose of protecting the 394 | integrity of the free software distribution system which is 395 | implemented by public license practices. 
Many people have made 396 | generous contributions to the wide range of software distributed 397 | through that system in reliance on consistent application of that 398 | system; it is up to the author/donor to decide if he or she is willing 399 | to distribute software through any other system and a licensee cannot 400 | impose that choice. 401 | 402 | This section is intended to make thoroughly clear what is believed to 403 | be a consequence of the rest of this License. 404 | 405 | 12. If the distribution and/or use of the Library is restricted in 406 | certain countries either by patents or by copyrighted interfaces, the 407 | original copyright holder who places the Library under this License may add 408 | an explicit geographical distribution limitation excluding those countries, 409 | so that distribution is permitted only in or among countries not thus 410 | excluded. In such case, this License incorporates the limitation as if 411 | written in the body of this License. 412 | 413 | 13. The Free Software Foundation may publish revised and/or new 414 | versions of the Lesser General Public License from time to time. 415 | Such new versions will be similar in spirit to the present version, 416 | but may differ in detail to address new problems or concerns. 417 | 418 | Each version is given a distinguishing version number. If the Library 419 | specifies a version number of this License which applies to it and 420 | "any later version", you have the option of following the terms and 421 | conditions either of that version or of any later version published by 422 | the Free Software Foundation. If the Library does not specify a 423 | license version number, you may choose any version ever published by 424 | the Free Software Foundation. 425 | 426 | 14. If you wish to incorporate parts of the Library into other free 427 | programs whose distribution conditions are incompatible with these, 428 | write to the author to ask for permission. 
For software which is 429 | copyrighted by the Free Software Foundation, write to the Free 430 | Software Foundation; we sometimes make exceptions for this. Our 431 | decision will be guided by the two goals of preserving the free status 432 | of all derivatives of our free software and of promoting the sharing 433 | and reuse of software generally. 434 | 435 | NO WARRANTY 436 | 437 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO 438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR 440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY 441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE 442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 443 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE 444 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME 445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 446 | 447 | 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN 448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY 449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU 450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR 451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE 452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING 453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A 454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF 455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 456 | DAMAGES. 457 | 458 | END OF TERMS AND CONDITIONS -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | ======================== 2 | HDT-MR Library. 
3 | ======================== 4 | 5 | Copyright (C) 2015, Jose M. Gimenez-Garcia, Javier D. Fernandez, Miguel A. Martinez-Prieto 6 | All rights reserved. 7 | 8 | This library is free software; you can redistribute it and/or 9 | modify it under the terms of the GNU Lesser General Public 10 | License as published by the Free Software Foundation; either 11 | version 2.1 of the License, or (at your option) any later version. 12 | 13 | This library is distributed in the hope that it will be useful, 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 | Lesser General Public License for more details. 17 | 18 | You should have received a copy of the GNU Lesser General Public 19 | License along with this library; if not, write to the Free Software 20 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 21 | 22 | Visit our Web Page: dataweb.infor.uva.es/projects/hdt-mr 23 | 24 | Contacting the authors: 25 | Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 26 | Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 27 | Miguel A. Martinez-Prieto: migumar2@infor.uva.es 28 | 29 | 30 | Overview 31 | ================= 32 | 33 | HDT-MR improves the HDT-java library by introducing MapReduce as the computation model for large HDT serialization. HDT-MR performs in linear time with the dataset size and has proven able to serialize datasets up to 4.42 billion triples, preserving HDT compression and retrieval features. 34 | 35 | HDT-java is a Java library that implements the W3C Submission (http://www.w3.org/Submission/2011/03/) of the RDF HDT (Header-Dictionary-Triples) binary format for publishing and exchanging RDF data at large scale. Its compact representation allows storing RDF in less space, while providing direct access to the stored information. See rdfhdt.org for further information. 
36 | 37 | 38 | 39 | HDT-MR provides three components: 40 | - iface: Provides an API to use HDT-MR, including interfaces and abstract classes 41 | - src: Core library and command line tools for using HDT-MR. It allows creating HDT files from RDF. 42 | - config: Examples of configuration files 43 | Note that the current distribution is an alpha version. Therefore, while this build has been tested, it is still subject to bugs and optimizations. 44 | 45 | 46 | 47 | 48 | Compiling 49 | ================= 50 | Dependencies: 51 | * HDT-java (https://code.google.com/p/hdt-java/). 52 | *** src/org/rdfhdt/hdt includes the classes that have been modified/extended 53 | 54 | Command line tools 55 | ================= 56 | 57 | The tool provides the following main command line tool: 58 | 59 | Usage: hadoop HDTBuilderDriver [options] 60 | Options: 61 | -a, --awsbucket 62 | Amazon Web Services bucket 63 | -bu, --baseURI 64 | Base URI for the dataset 65 | -b, --basedir 66 | Root directory for the process 67 | -bd, --builddictionary 68 | Whether to build HDT dictionary or not 69 | -bh, --buildhdt 70 | Whether to build HDT or not 71 | -c, --conf 72 | Path to configuration file 73 | -dd, --deleteoutputdictionary 74 | Delete dictionary job output path before running job 75 | -dt, --deleteoutputtriples 76 | Delete triples job output path before running job 77 | -dsd, --deletesampledictionary 78 | Delete dictionary job sample path before running job 79 | -dst, --deletesampletriples 80 | Delete triples job sample path before running job 81 | -d, --dictionarydistribution 82 | Dictionary distribution among mappers and reducers 83 | -fd, --filedictionary 84 | Name of hdt dictionary file 85 | -fr, --fileobjects 86 | Name of hdt dictionary file for Reducers 87 | -fm, --filesubjects 88 | Name of hdt dictionary file for Mappers 89 | -hc, --hdtconf 90 | Conversion config file 91 | -x, --index 92 | Generate also external indices to solve all queries 93 | -i, --input 94 | Path to input files. 
Relative to basedir 95 | -it, --inputtriples 96 | Path to triples job input files. Relative to basedir 97 | -nd, --namedictionaryjob 98 | Name of dictionary job 99 | -fh, --namehdtfile 100 | Name of hdt file 101 | -nt, --nametriplesjob 102 | Name of triples job 103 | -o, --options 104 | HDT Conversion options (override those of config file) 105 | -od, --outputdictionary 106 | Path to dictionary job output files. Relative to basedir 107 | -ot, --outputtriples 108 | Path to triples job output files. Relative to basedir 109 | -q, --quiet 110 | Do not show progress of the conversion 111 | -t, --rdftype 112 | Type of RDF Input (ntriples, nquad, n3, turtle, rdfxml) 113 | -Rd, --reducersdictionary 114 | Number of reducers for dictionary job 115 | -Rds, --reducersdictionarysampling 116 | Number of reducers for dictionary input sampling job 117 | -Rt, --reducerstriples 118 | Number of reducers for triples job 119 | -Rts, --reducerstriplessampling 120 | Number of reducers for triples input sampling job 121 | -rd, --rundictionary 122 | Whether to run dictionary job or not 123 | -rds, --rundictionarysampling 124 | Whether to run dictionary input sampling job or not 125 | -rt, --runtriples 126 | Whether to run triples job or not 127 | -rts, --runtriplessampling 128 | Whether to run triples input sampling job or not 129 | -p, --sampleprobability 130 | Probability of using each element for sampling 131 | -sd, --samplesdictionary 132 | Path to dictionary job sample files. Relative to basedir 133 | -st, --samplestriples 134 | Path to triples job sample files. Relative to basedir 135 | 136 | 137 | Usage example 138 | ================= 139 | 140 | After installation, run: 141 | 142 | $ hadoop HDTBuilderDriver 143 | # This first try to read configuration parameters at the default config file (HDTMRBuilder.xml), using default values for those missing parameters. 
It reads RDF input data from the default 'input' folder and outputs the HDT conversion in 'output.hdt' 144 | 145 | $ hadoop HDTBuilderDriver -i mashup 146 | # Same as the previous example, but it reads RDF input data from the directory 'mashup' 147 | 148 | $ hadoop HDTBuilderDriver -c lubm-dictionary.xml -p 0.01 149 | # It uses 'lubm-dictionary.xml' as the configuration file. This file states that input data must be taken from the 'lubm' directory and it forces to compute only the HDT dictionary, which is written in 'dictionary/dictionary.hdt' 150 | # It uses 0.01 as the probability of using each element for sampling. 151 | 152 | 153 | $ hadoop HDTBuilderDriver -c lubm-triples.xml -Rt 1 -Rts 1 154 | # It uses 'lubm-triples.xml' as the configuration file. This file states that input data must be taken from the 'lubm' directory and it forces to compute the HDT triples and the final HDT representation by taking the already computed dictionary in 'dictionary/dictionary.hdt' 155 | # It forces to use one reducer in both jobs. 156 | 157 | License 158 | =============== 159 | 160 | All HDT-MR content is licensed under the Lesser General Public License. 161 | 162 | Acknowledgements 163 | ================ 164 | 165 | HDT-MR is a project partially funded by Ministerio de Economia y Competitividad, Spain: TIN2013-46238-C4-3-R, and Austrian Science Fund (FWF): M1720-G11. 166 | 167 | 168 | -------------------------------------------------------------------------------- /config/HDTMRBuilder.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | global.path.base 5 | . 
6 | Root directory 7 | 8 | 9 | 10 | global.path.input 11 | input 12 | input path 13 | 14 | 15 | 16 | job.dictionary.path.output 17 | d 18 | Dictionary output path / Triples input path 19 | 20 | 21 | 22 | job.dictionary.path.output.delete 23 | true 24 | Whether to delete dictionary output path 25 | 26 | 27 | 28 | job.dictionary.path.sample 29 | s 30 | Dictionary sample path 31 | 32 | 33 | 34 | job.dictionary.path.sample.delete 35 | true 36 | Whether to delete dictionary sample path 37 | 38 | 39 | 40 | job.triples.path.output 41 | t 42 | Triples output path 43 | 44 | 45 | 46 | job.triples.path.output.delete 47 | true 48 | Whether to delete triples output path 49 | 50 | 51 | 52 | job.dictionary.reducers 53 | 10 54 | Number of reducers used by jobs 55 | 56 | 57 | 58 | job.triples.reducers 59 | 10 60 | Number of reducers used by jobs 61 | 62 | 63 | 64 | job.dictionary.sample.probability 65 | 0.000001 66 | Sampler Probability 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /config/lubm-dictionary.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | job.dictionary.run 5 | true 6 | 7 | 8 | 9 | job.dictionary.sample.run 10 | true 11 | 12 | 13 | 14 | job.dictionary.sample.reducers 15 | 10 16 | 17 | 18 | 19 | hdt.dictionary.build 20 | true 21 | 22 | 23 | 24 | job.triples.run 25 | false 26 | 27 | 28 | 29 | job.triples.sample.run 30 | false 31 | 32 | 33 | 34 | hdt.build 35 | false 36 | 37 | 38 | 39 | global.path.base 40 | . 
41 | Root directory 42 | 43 | 44 | 45 | global.path.input 46 | lubm 47 | input path 48 | 49 | 50 | 51 | job.dictionary.path.output 52 | dictionary 53 | Dictionary output path / Triples input path 54 | 55 | 56 | 57 | job.dictionary.path.output.delete 58 | true 59 | Whether to delete dictionary output path 60 | 61 | 62 | 63 | job.dictionary.path.sample 64 | dictionary_sample 65 | Dictionary samples path 66 | 67 | 68 | 69 | job.dictionary.path.sample.delete 70 | true 71 | Whether to delete dictionary samples path 72 | 73 | 74 | 75 | job.dictionary.reducers 76 | 10 77 | Number of reducers used by jobs 78 | 79 | 80 | 81 | job.dictionary.sample.probability 82 | 0.000001 83 | Sampler Probability 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /config/lubm-triples.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | job.dictionary.run 5 | false 6 | 7 | 8 | 9 | job.dictionary.sample.run 10 | false 11 | 12 | 13 | 14 | hdt.dictionary.build 15 | false 16 | 17 | 18 | 19 | job.triples.run 20 | true 21 | 22 | 23 | 24 | job.triples.sample.run 25 | true 26 | 27 | 28 | 29 | hdt.build 30 | true 31 | 32 | 33 | 34 | global.path.base 35 | . 
36 | Root directory 37 | 38 | 39 | 40 | global.path.input 41 | lubm 42 | input path 43 | 44 | 45 | 46 | job.dictionary.path.output 47 | dictionary 48 | Dictionary output path / Triples input path 49 | 50 | 51 | 52 | job.triples.path.output.delete 53 | true 54 | Whether to delete triples output path 55 | 56 | 57 | 58 | job.triples.path.sample 59 | triples_sample 60 | Tripls samples path 61 | 62 | 63 | 64 | job.triples.path.sample.delete 65 | true 66 | Whether to delete tripls samples path 67 | 68 | 69 | 70 | job.triples.reducers 71 | 10 72 | Number of reducers used by jobs 73 | 74 | 75 | 76 | job.triples.sample.probability 77 | 0.000001 78 | Sampler Probability 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /iface/org/rdfhdt/hdt/trans/TransientElement.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package org.rdfhdt.hdt.trans; 5 | 6 | import java.io.IOException; 7 | 8 | import org.apache.hadoop.io.SequenceFile; 9 | import org.rdfhdt.hdt.listener.ProgressListener; 10 | 11 | /** 12 | * @author chemi 13 | * 14 | */ 15 | public interface TransientElement { 16 | 17 | public void initialize(long numentries); 18 | 19 | public void load(SequenceFile.Reader input, ProgressListener listener) throws IOException; 20 | 21 | public void close() throws IOException; 22 | 23 | } 24 | -------------------------------------------------------------------------------- /iface/org/rdfhdt/mrbuilder/io/TripleComparator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 
8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.io; 24 | 25 | import org.apache.hadoop.io.WritableComparable; 26 | import org.apache.hadoop.io.WritableComparator; 27 | 28 | /** 29 | * @author chemi 30 | * 31 | */ 32 | @SuppressWarnings("rawtypes") 33 | public abstract class TripleComparator extends WritableComparator { 34 | 35 | public TripleComparator(Class keyClass, boolean createInstances) { 36 | super(keyClass, createInstances); 37 | } 38 | 39 | public TripleComparator(Class keyClass) { 40 | super(keyClass); 41 | } 42 | 43 | @SuppressWarnings("unchecked") 44 | @Override 45 | public int compare(WritableComparable wc1, WritableComparable wc2) { 46 | TW key1 = (TW) wc1; 47 | TW key2 = (TW) wc2; 48 | return key1.compareTo(key2); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /iface/org/rdfhdt/mrbuilder/io/TripleWritable.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. 
Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. 
Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.io; 24 | 25 | import java.io.DataInput; 26 | import java.io.DataOutput; 27 | import java.io.IOException; 28 | 29 | import org.apache.hadoop.io.WritableComparable; 30 | 31 | /** 32 | * @author chemi 33 | * 34 | */ 35 | 36 | @SuppressWarnings("rawtypes") 37 | public abstract class TripleWritable implements WritableComparable> { 38 | 39 | protected S subject; 40 | protected P predicate; 41 | protected O object; 42 | 43 | /** 44 | * 45 | */ 46 | public TripleWritable(S subject, P predicate, O object) { 47 | this.setSubject(subject); 48 | this.setPredicate(predicate); 49 | this.setObject(object); 50 | } 51 | 52 | /** 53 | * @return the subject 54 | */ 55 | public S getSubject() { 56 | return this.subject; 57 | } 58 | 59 | /** 60 | * @param subject 61 | * the subject to set 62 | */ 63 | public void setSubject(S subject) { 64 | this.subject = subject; 65 | } 66 | 67 | /** 68 | * @return the predicate 69 | */ 70 | public P getPredicate() { 71 | return this.predicate; 72 | } 73 | 74 | /** 75 | * @param predicate 76 | * the predicate to set 77 | */ 78 | public void setPredicate(P predicate) { 79 | this.predicate = predicate; 80 | } 81 | 82 | /** 83 | * @return the object 84 | */ 85 | public O getObject() { 86 | return this.object; 87 | } 88 | 89 | /** 90 | * @param object 91 | * the object to set 92 | */ 93 | public void setObject(O object) { 94 | this.object = object; 95 | } 96 | 97 | /* 98 | * (non-Javadoc) 99 | * 100 | * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput) 101 | */ 102 | @Override 103 | public void readFields(DataInput input) throws IOException { 104 | this.subject.readFields(input); 105 | this.predicate.readFields(input); 106 | this.object.readFields(input); 107 | } 108 | 109 | /* 110 | * (non-Javadoc) 111 | * 112 | * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput) 113 | */ 114 | @Override 115 | public void write(DataOutput output) throws IOException 
{ 116 | this.subject.write(output); 117 | this.predicate.write(output); 118 | this.object.write(output); 119 | } 120 | 121 | /* 122 | * (non-Javadoc) 123 | * 124 | * @see java.lang.Comparable#compareTo(java.lang.Object) 125 | */ 126 | @Override 127 | public int compareTo(TripleWritable otherKey) { 128 | int comparison; 129 | if ((comparison = this.compareSubjectTo(otherKey)) == 0) 130 | if ((comparison = this.comparePredicateTo(otherKey)) == 0) 131 | comparison = this.compareObjectTo(otherKey); 132 | return comparison; 133 | } 134 | 135 | public int compareSubjectTo(TripleWritable otherKey) { 136 | return this.compareRole(this.getSubject(), otherKey.getSubject()); 137 | } 138 | 139 | public int comparePredicateTo(TripleWritable otherKey) { 140 | return this.compareRole(this.getPredicate(), otherKey.getPredicate()); 141 | } 142 | 143 | public int compareObjectTo(TripleWritable otherKey) { 144 | return this.compareRole(this.getObject(), otherKey.getObject()); 145 | } 146 | 147 | @SuppressWarnings("unchecked") 148 | protected int compareRole(WritableComparable wc1, WritableComparable wc2) { 149 | return (wc1.compareTo(wc2) < 0) ? -1 : ((wc1.compareTo(wc2) > 0) ? 1 : 0); 150 | } 151 | 152 | /* 153 | * (non-Javadoc) 154 | * 155 | * @see java.lang.Object#toString() 156 | */ 157 | @Override 158 | public String toString() { 159 | return this.subject + " " + this.predicate + " " + this.object; 160 | } 161 | 162 | } 163 | -------------------------------------------------------------------------------- /iface/org/rdfhdt/mrbuilder/triples/TriplesMapper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. 
Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. 
Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.triples; 24 | 25 | import java.io.BufferedInputStream; 26 | import java.io.File; 27 | import java.io.FileInputStream; 28 | import java.io.IOException; 29 | 30 | import org.apache.hadoop.filecache.DistributedCache; 31 | import org.apache.hadoop.fs.Path; 32 | import org.apache.hadoop.io.LongWritable; 33 | import org.apache.hadoop.io.Text; 34 | import org.apache.hadoop.io.WritableComparable; 35 | import org.apache.hadoop.mapreduce.Mapper; 36 | import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary; 37 | import org.rdfhdt.hdt.exceptions.ParserException; 38 | import org.rdfhdt.hdt.listener.ProgressListener; 39 | import org.rdfhdt.hdt.triples.TripleString; 40 | import org.rdfhdt.hdt.util.io.CountInputStream; 41 | import org.rdfhdt.mrbuilder.HDTBuilderConfiguration; 42 | import org.rdfhdt.mrbuilder.HDTBuilderDriver.Counters; 43 | import org.rdfhdt.mrbuilder.io.TripleWritable; 44 | 45 | @SuppressWarnings("rawtypes") 46 | public abstract class TriplesMapper extends Mapper implements ProgressListener { 47 | 48 | protected FourSectionDictionary dictionary; 49 | protected HDTBuilderConfiguration conf; 50 | 51 | /* 52 | * (non-Javadoc) 53 | * 54 | * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context) 55 | */ 56 | @Override 57 | protected void setup(Context context) throws IOException, InterruptedException { 58 | 59 | Path[] cache = DistributedCache.getLocalCacheFiles(context.getConfiguration()); 60 | 61 | this.conf = new HDTBuilderConfiguration(context.getConfiguration()); 62 | CountInputStream input = new CountInputStream(new BufferedInputStream(new FileInputStream(cache[0].toString()))); 63 | File file = new File(cache[0].toString()); 64 | this.dictionary = new FourSectionDictionary(this.conf.getSpec()); 65 | this.dictionary.mapFromFile(input, file, this); 66 | input.close(); 67 | 68 | // DEBUG 69 | // ((PFCDictionarySection) 
this.dictionary.getShared()).dumpAll(); 70 | } 71 | 72 | @Override 73 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 74 | TripleString tripleString = new TripleString(); 75 | 76 | try { 77 | tripleString.read(value.toString()); 78 | } catch (ParserException e) { 79 | // TODO Auto-generated catch block 80 | e.printStackTrace(); 81 | } 82 | 83 | context.write(this.key(tripleString), this.value(tripleString)); 84 | context.getCounter(Counters.Triples).increment(1); 85 | } 86 | 87 | @Override 88 | public void notifyProgress(float level, String message) { 89 | // if (!this.conf.getQuiet()) { 90 | System.out.print("\r" + message + "\t" + Float.toString(level) + " \r"); 91 | } 92 | 93 | protected abstract K key(TripleString tripleString) throws InterruptedException; 94 | 95 | protected abstract V value(TripleString tripleString); 96 | 97 | } 98 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | org.rdfhdt 4 | hdt-mr 5 | 2.0 6 | HDT MapReduce 7 | jar 8 | 9 | 10 | UTF-8 11 | 1.8 12 | 1.8 13 | 14 | 15 | 16 | 17 | org.rdfhdt 18 | hdt-api 19 | 2.0 20 | 21 | 22 | org.rdfhdt 23 | hdt-java-core 24 | 2.0 25 | 26 | 27 | org.apache.hadoop 28 | hadoop-common 29 | 2.7.0 30 | 31 | 32 | org.apache.hadoop 33 | hadoop-mapreduce-client-core 34 | 2.6.0 35 | 36 | 37 | com.hadoop.gplcompression 38 | hadoop-lzo 39 | 0.4.20-SNAPSHOT 40 | 41 | 42 | commons-lang 43 | commons-lang 44 | 2.1 45 | 46 | 47 | org.codehaus.plexus 48 | plexus-utils 49 | 1.1 50 | 51 | 52 | 53 | 54 | 55 | . 
56 | 57 | 58 | org.apache.maven.plugins 59 | maven-assembly-plugin 60 | 61 | 62 | iface/**/*.java 63 | src/**/*.java 64 | 65 | 66 | 67 | org.rdfhdt.mrbuilder.HDTBuilderDriver 68 | 69 | 70 | 71 | jar-with-dependencies 72 | 73 | 74 | 75 | 76 | make-assembly 77 | package 78 | 79 | single 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /src/org/rdfhdt/hdt/compact/bitmap/TransientBitmap375.java: -------------------------------------------------------------------------------- 1 | package org.rdfhdt.hdt.compact.bitmap; 2 | 3 | import java.io.IOException; 4 | import java.io.OutputStream; 5 | import java.util.UUID; 6 | 7 | import org.apache.commons.io.IOUtils; 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.rdfhdt.hdt.compact.integer.VByte; 12 | import org.rdfhdt.hdt.listener.ProgressListener; 13 | import org.rdfhdt.hdt.util.BitUtil; 14 | import org.rdfhdt.hdt.util.crc.CRC32; 15 | import org.rdfhdt.hdt.util.crc.CRC8; 16 | import org.rdfhdt.hdt.util.crc.CRCOutputStream; 17 | import org.rdfhdt.hdt.util.io.IOUtil; 18 | 19 | public class TransientBitmap375 extends Bitmap375 { 20 | 21 | protected OutputStream tempOutput; 22 | protected int bufferSize; 23 | protected int previousWordIndex; 24 | protected long nbits; 25 | private long totalbits = 0; 26 | private long totalwords = 0; 27 | 28 | protected FileSystem fileSystem; 29 | protected Path file; 30 | protected String fileName; 31 | 32 | public TransientBitmap375(int bufferSize) { 33 | super(); 34 | this.bufferSize = bufferSize; 35 | this.previousWordIndex = wordIndex(0); 36 | } 37 | 38 | public TransientBitmap375(int bufferSize, long nbits, FileSystem fs, Path path) throws IOException { 39 | super(Math.min(bufferSize, nbits)); 40 | 41 | this.bufferSize = bufferSize; 42 | this.nbits = nbits; 43 | this.previousWordIndex = wordIndex(0); 44 | 45 | 
this.fileName = UUID.randomUUID().toString(); 46 | 47 | if (fs == null) { 48 | fs = FileSystem.getLocal(new Configuration()); 49 | } 50 | if (path == null) { 51 | path = new Path("."); 52 | } 53 | 54 | this.fileSystem = fs; 55 | this.file = new Path(path, this.fileName); 56 | this.tempOutput = this.fileSystem.create(this.file); 57 | 58 | } 59 | 60 | @Override 61 | public long getNumBits() { 62 | return this.totalbits; 63 | } 64 | 65 | // @Override 66 | // public void append(boolean value) { 67 | // this.set(this.numbits++, value); 68 | // } 69 | 70 | @Override 71 | public void set(long bitIndex, boolean value) { 72 | if ((this.previousWordIndex >= this.bufferSize) && (this.previousWordIndex != wordIndex(bitIndex))) { 73 | try { 74 | // System.out.println("bitIndex = " + bitIndex); 75 | // System.out.println("numbits = " + this.numbits); 76 | this.flushData(); 77 | super.set(0, value); 78 | this.previousWordIndex = wordIndex(0); 79 | } catch (IOException e) { 80 | // TODO Auto-generated catch block 81 | e.printStackTrace(); 82 | } 83 | } else { 84 | super.set(bitIndex, value); 85 | this.previousWordIndex = wordIndex(bitIndex); 86 | } 87 | } 88 | 89 | private void flushData() throws IOException { 90 | 91 | // System.out.println("flushing bitmap " + this.fileName + " with " + this.numbits + " bits"); 92 | // System.out.println("Bits from last word = " + lastWordNumBits(this.numbits)); 93 | 94 | this.totalbits += this.numbits - 1; 95 | 96 | int numwords = (int) numWords(this.numbits - 1); 97 | 98 | this.totalwords += numwords; 99 | 100 | for (int i = 0; i < numwords; i++) { 101 | IOUtil.writeLong(this.tempOutput, this.words[i]); 102 | } 103 | this.words = new long[(int) numWords(this.nbits)]; 104 | this.numbits = 0; 105 | this.previousWordIndex = wordIndex(0); 106 | } 107 | 108 | public void close() throws IOException { 109 | 110 | this.totalbits += this.numbits; 111 | 112 | int numwords = (int) numWords(this.numbits); 113 | 114 | this.totalwords += numwords; 115 | 116 
| // System.out.println("Closing bitmap."); 117 | // System.out.println("Writing " + this.totalbits + " bits"); 118 | // System.out.println("There should be " + this.nbits + " bits"); 119 | // System.out.println("Writing " + this.totalwords + "words"); 120 | // System.out.println("Bits from last word = " + lastWordNumBits(this.numbits)); 121 | 122 | for (int i = 0; i < numwords - 1; i++) { 123 | IOUtil.writeLong(this.tempOutput, this.words[i]); 124 | } 125 | 126 | if (numwords > 0) { 127 | // Write only used bits from last entry (byte aligned, little endian) 128 | int lastWordUsed = lastWordNumBits(this.numbits); 129 | BitUtil.writeLowerBitsByteAligned(this.words[numwords - 1], lastWordUsed, this.tempOutput); 130 | } 131 | 132 | this.tempOutput.flush(); 133 | this.tempOutput.close(); 134 | 135 | this.words = new long[0]; 136 | } 137 | 138 | @Override 139 | public void save(OutputStream output, ProgressListener listener) throws IOException { 140 | CRCOutputStream out = new CRCOutputStream(output, new CRC8()); 141 | 142 | // Write Type and Numbits 143 | out.write(BitmapFactory.TYPE_BITMAP_PLAIN); 144 | VByte.encode(out, this.totalbits); 145 | 146 | // Write CRC 147 | out.writeCRC(); 148 | 149 | // Setup new CRC 150 | out.setCRC(new CRC32()); 151 | 152 | // FileInputStream input = new FileInputStream(this.fileName); 153 | // long bytesCopied = Files.copy(this.fileSystem.open(this.file), out); 154 | long bytesCopied = IOUtils.copyLarge(this.fileSystem.open(this.file), out); 155 | // input.close(); 156 | this.fileSystem.delete(this.file, true); 157 | System.out.println("bytes copied from " + this.fileName + " = " + bytesCopied); 158 | 159 | // System.out.println("CRC = " + out.getCRC().getValue()); 160 | out.writeCRC(); 161 | 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /src/org/rdfhdt/hdt/compact/sequence/TransientSequenceLog64.java: 
-------------------------------------------------------------------------------- 1 | package org.rdfhdt.hdt.compact.sequence; 2 | 3 | import java.io.IOException; 4 | import java.io.OutputStream; 5 | import java.util.UUID; 6 | 7 | import org.apache.commons.io.IOUtils; 8 | import org.apache.hadoop.conf.Configuration; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.rdfhdt.hdt.compact.integer.VByte; 12 | import org.rdfhdt.hdt.listener.ProgressListener; 13 | import org.rdfhdt.hdt.util.BitUtil; 14 | import org.rdfhdt.hdt.util.crc.CRC32; 15 | import org.rdfhdt.hdt.util.crc.CRC8; 16 | import org.rdfhdt.hdt.util.crc.CRCOutputStream; 17 | import org.rdfhdt.hdt.util.io.IOUtil; 18 | 19 | public class TransientSequenceLog64 extends SequenceLog64 { 20 | 21 | protected OutputStream tempOutput; 22 | protected long bufferSize, maxentries; 23 | protected long capacity; 24 | private long totalentries, totalwords; 25 | 26 | protected FileSystem fileSystem; 27 | protected Path file; 28 | protected String fileName; 29 | 30 | public TransientSequenceLog64(int bufferSize) throws IOException { 31 | this(bufferSize, W); 32 | } 33 | 34 | public TransientSequenceLog64(int bufferSize, int numbits) throws IOException { 35 | this(bufferSize, numbits, 0); 36 | } 37 | 38 | public TransientSequenceLog64(int bufferSize, int numbits, long capacity, boolean initialize) throws IOException { 39 | this(bufferSize, numbits, capacity); 40 | if (initialize) { 41 | this.numentries = capacity; 42 | } 43 | } 44 | 45 | public TransientSequenceLog64(int bufferSize, int numbits, long capacity) throws IOException { 46 | this(bufferSize, numbits, capacity, null, null); 47 | } 48 | 49 | public TransientSequenceLog64(int bufferSize, int numbits, long capacity, FileSystem fs, Path path) throws IOException { 50 | super(numbits, Math.min(bufferSize, capacity)); 51 | 52 | this.capacity = capacity; 53 | 54 | // parameter provided as bytes, transform to entries 55 | 
this.maxentries = (int) ((W / (double) numbits) * bufferSize); 56 | 57 | this.fileName = UUID.randomUUID().toString(); 58 | 59 | if (fs == null) { 60 | fs = FileSystem.getLocal(new Configuration()); 61 | } 62 | if (path == null) { 63 | path = new Path("."); 64 | } 65 | 66 | this.fileSystem = fs; 67 | this.file = new Path(path, this.fileName); 68 | this.tempOutput = this.fileSystem.create(this.file); 69 | } 70 | 71 | @Override 72 | public long getNumberOfElements() { 73 | return this.totalentries; 74 | } 75 | 76 | @Override 77 | public void append(long value) { 78 | super.append(value); 79 | 80 | if (this.numentries >= this.maxentries && (lastWordNumBits(this.numbits, this.numentries) == 64)) { 81 | try { 82 | this.flushData(); 83 | } catch (IOException e) { 84 | // TODO Auto-generated catch block 85 | e.printStackTrace(); 86 | } 87 | } 88 | } 89 | 90 | protected void flushData() throws IOException { 91 | // System.out.println("Flushing Sequence"); 92 | 93 | this.totalentries += this.numentries; 94 | 95 | int numwords = (int) numWordsFor(this.numbits, this.numentries); 96 | 97 | this.totalwords += numwords; 98 | 99 | // System.out.println("Remaining bits =" + lastWordNumBits(this.numbits, this.numentries)); 100 | 101 | for (int i = 0; i < numwords; i++) { 102 | IOUtil.writeLong(this.tempOutput, this.data[i]); 103 | } 104 | 105 | long size = numWordsFor(this.numbits, this.numentries); 106 | assert size >= 0 && size <= Integer.MAX_VALUE; 107 | 108 | this.data = new long[Math.max((int) size, 1)]; 109 | this.numentries = 0; 110 | } 111 | 112 | public void close() throws IOException { 113 | 114 | this.totalentries += this.numentries; 115 | 116 | int numwords = (int) numWordsFor(this.numbits, this.numentries); 117 | 118 | this.totalwords += numwords; 119 | 120 | // System.out.println("Closing sequence."); 121 | // System.out.println("Writing " + this.totalentries + " entries"); 122 | // System.out.println("There should be " + this.capacity + " entries"); 123 | // 
System.out.println("Writing " + this.totalwords + "words"); 124 | 125 | // System.out.println("Remaining bits =" + lastWordNumBits(this.numbits, this.numentries)); 126 | 127 | for (int i = 0; i < numwords - 1; i++) { 128 | IOUtil.writeLong(this.tempOutput, this.data[i]); 129 | } 130 | 131 | if (numwords > 0) { 132 | // Write only used bits from last entry (byte aligned, little endian) 133 | int lastWordUsedBits = lastWordNumBits(this.numbits, this.numentries); 134 | BitUtil.writeLowerBitsByteAligned(this.data[numwords - 1], lastWordUsedBits, this.tempOutput); 135 | } 136 | 137 | this.tempOutput.flush(); 138 | this.tempOutput.close(); 139 | 140 | this.data = new long[0]; 141 | } 142 | 143 | @Override 144 | public void save(OutputStream output, ProgressListener listener) throws IOException { 145 | CRCOutputStream out = new CRCOutputStream(output, new CRC8()); 146 | 147 | out.write(SequenceFactory.TYPE_SEQLOG); 148 | out.write(this.numbits); 149 | VByte.encode(out, this.totalentries); 150 | out.writeCRC(); 151 | out.setCRC(new CRC32()); 152 | 153 | // long bytesCopied = Files.copy(this.fileSystem.open(this.file), out); 154 | long bytesCopied = IOUtils.copy(this.fileSystem.open(this.file), out); 155 | System.out.println("bytes copied from " + this.fileName + " = " + bytesCopied); 156 | this.fileSystem.delete(this.file, true); 157 | 158 | // System.out.println("CRC = " + out.getCRC().getValue()); 159 | out.writeCRC(); 160 | } 161 | } 162 | -------------------------------------------------------------------------------- /src/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionary2.java: -------------------------------------------------------------------------------- 1 | package org.rdfhdt.hdt.dictionary.impl; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | 7 | import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; 8 | import org.rdfhdt.hdt.dictionary.impl.section.DictionarySectionCacheAll; 9 | import 
org.rdfhdt.hdt.dictionary.impl.section.DictionarySectionFactory2; 10 | import org.rdfhdt.hdt.exceptions.IllegalFormatException; 11 | import org.rdfhdt.hdt.listener.ProgressListener; 12 | import org.rdfhdt.hdt.options.ControlInfo; 13 | import org.rdfhdt.hdt.options.ControlInformation; 14 | import org.rdfhdt.hdt.options.HDTOptions; 15 | import org.rdfhdt.hdt.util.io.CountInputStream; 16 | import org.rdfhdt.hdt.util.listener.IntermediateListener; 17 | 18 | public class FourSectionDictionary2 extends FourSectionDictionary { 19 | 20 | public FourSectionDictionary2(HDTOptions spec, DictionarySectionPrivate s, DictionarySectionPrivate p, DictionarySectionPrivate o, DictionarySectionPrivate sh) { 21 | super(spec, s, p, o, sh); 22 | } 23 | 24 | public FourSectionDictionary2(HDTOptions spec) { 25 | super(spec); 26 | } 27 | 28 | public void load(InputStream input, ControlInfo ci, ProgressListener listener) throws IOException { 29 | if(ci.getType()!=ControlInfo.Type.DICTIONARY) { 30 | throw new IllegalFormatException("Trying to read a dictionary section, but was not dictionary."); 31 | } 32 | 33 | IntermediateListener iListener = new IntermediateListener(listener); 34 | 35 | shared = DictionarySectionFactory2.loadFrom(input, iListener); 36 | subjects = DictionarySectionFactory2.loadFrom(input, iListener); 37 | predicates = DictionarySectionFactory2.loadFrom(input, iListener); 38 | objects = DictionarySectionFactory2.loadFrom(input, iListener); 39 | } 40 | 41 | @Override 42 | public void mapFromFile(CountInputStream in, File f, ProgressListener listener) throws IOException { 43 | ControlInformation ci = new ControlInformation(); 44 | ci.load(in); 45 | if(ci.getType()!=ControlInfo.Type.DICTIONARY) { 46 | throw new IllegalFormatException("Trying to read a dictionary section, but was not dictionary."); 47 | } 48 | 49 | IntermediateListener iListener = new IntermediateListener(listener); 50 | shared = DictionarySectionFactory2.loadFrom(in, f, iListener); 51 | subjects = 
DictionarySectionFactory2.loadFrom(in, f, iListener); 52 | predicates = DictionarySectionFactory2.loadFrom(in, f, iListener); 53 | objects = DictionarySectionFactory2.loadFrom(in, f, iListener); 54 | 55 | // Use cache only for predicates. Preload only up to 100K predicates. 56 | predicates = new DictionarySectionCacheAll(predicates, predicates.getNumberOfElements()<100000); 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/org/rdfhdt/hdt/dictionary/impl/section/DictionarySectionFactory2.java: -------------------------------------------------------------------------------- 1 | package org.rdfhdt.hdt.dictionary.impl.section; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | 6 | import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate; 7 | import org.rdfhdt.hdt.listener.ProgressListener; 8 | import org.rdfhdt.hdt.options.HDTSpecification; 9 | 10 | public class DictionarySectionFactory2 extends DictionarySectionFactory { 11 | 12 | 13 | public static DictionarySectionPrivate loadFrom(InputStream input, ProgressListener listener) throws IOException { 14 | if(!input.markSupported()) { 15 | throw new IllegalArgumentException("Need support for mark()/reset(). Please wrap the InputStream with a BufferedInputStream"); 16 | } 17 | input.mark(64); 18 | int dictType = input.read(); 19 | input.reset(); 20 | input.mark(64); // To allow children to reset() and try another instance. 
21 | 22 | DictionarySectionPrivate section=null; 23 | 24 | switch(dictType) { 25 | case PFCDictionarySection.TYPE_INDEX: 26 | try{ 27 | // First try load using the standard PFC 28 | section = new PFCDictionarySection(new HDTSpecification()); 29 | section.load(input, listener); 30 | } catch (IllegalArgumentException e) { 31 | // The PFC Could not load the file because it is too big, use PFCBig 32 | section = new TransientDictionarySection(new HDTSpecification()); 33 | section.load(input, listener); 34 | } 35 | return section; 36 | } 37 | throw new IOException("DictionarySection implementation not available for id "+dictType); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/org/rdfhdt/hdt/dictionary/impl/section/TransientDictionarySection.java: -------------------------------------------------------------------------------- 1 | package org.rdfhdt.hdt.dictionary.impl.section; 2 | 3 | import java.io.ByteArrayOutputStream; 4 | import java.io.IOException; 5 | import java.io.OutputStream; 6 | 7 | import org.apache.hadoop.io.SequenceFile; 8 | import org.apache.hadoop.io.Text; 9 | import org.rdfhdt.hdt.compact.integer.VByte; 10 | import org.rdfhdt.hdt.compact.sequence.SequenceLog64; 11 | import org.rdfhdt.hdt.listener.ProgressListener; 12 | import org.rdfhdt.hdt.options.HDTOptions; 13 | import org.rdfhdt.hdt.trans.TransientElement; 14 | import org.rdfhdt.hdt.util.Mutable; 15 | import org.rdfhdt.hdt.util.crc.CRC32; 16 | import org.rdfhdt.hdt.util.crc.CRC8; 17 | import org.rdfhdt.hdt.util.crc.CRCOutputStream; 18 | import org.rdfhdt.hdt.util.io.IOUtil; 19 | import org.rdfhdt.hdt.util.string.ByteStringUtil; 20 | import org.rdfhdt.hdt.util.string.CompactString; 21 | import org.rdfhdt.hdt.util.string.ReplazableString; 22 | 23 | public class TransientDictionarySection extends PFCDictionarySectionBig implements TransientElement { 24 | 25 | ByteArrayOutputStream byteOut; 26 | CharSequence previousStr; 27 | int buffer; 28 | int 
blockPerBuffer; 29 | long storedBuffersSize; 30 | 31 | public TransientDictionarySection(HDTOptions spec) { 32 | super(spec); 33 | this.blocksize = (int) spec.getInt("pfc.blocksize"); 34 | if (this.blocksize == 0) { 35 | this.blocksize = DEFAULT_BLOCK_SIZE; 36 | } 37 | if (this.blockPerBuffer == 0) { 38 | this.blockPerBuffer = BLOCK_PER_BUFFER; 39 | } 40 | } 41 | 42 | @Override 43 | public void initialize(long numentries) { 44 | this.blocks = new SequenceLog64(63, numentries / this.blocksize); 45 | this.storedBuffersSize = 0; 46 | this.numstrings = 0; 47 | this.byteOut = new ByteArrayOutputStream(16 * 1024); 48 | this.blockPerBuffer = BLOCK_PER_BUFFER / 5; 49 | this.data = new byte[(int) Math.ceil((((double) numentries / this.blocksize) / this.blockPerBuffer))][]; 50 | this.posFirst = new long[this.data.length]; 51 | this.buffer = 0; 52 | this.previousStr = null; 53 | } 54 | 55 | @Override 56 | public void load(SequenceFile.Reader input, ProgressListener listener) throws IOException { 57 | CharSequence str = null; 58 | Text line = new Text(); 59 | 60 | this.posFirst[0] = 0; 61 | while (input.next(line)) { 62 | str = new CompactString(line.toString()); 63 | 64 | if (this.numstrings % this.blocksize == 0) { 65 | // Add new block pointer 66 | // System.out.println(this.storedBuffersSize); 67 | // System.out.println(this.byteOut.size()); 68 | // System.out.println(this.blocksize); 69 | this.blocks.append(this.storedBuffersSize + this.byteOut.size()); 70 | 71 | // if number of block per buffer reached, change buffer 72 | if (((this.blocks.getNumberOfElements() - 1) % this.blockPerBuffer == 0) && ((this.blocks.getNumberOfElements() - 1) / this.blockPerBuffer != 0)) { 73 | this.storedBuffersSize += this.byteOut.size(); 74 | this.storeBuffer(this.buffer); 75 | this.byteOut = new ByteArrayOutputStream(16 * 1024); 76 | if (this.buffer < this.data.length - 1) { 77 | this.posFirst[++this.buffer] = this.storedBuffersSize + this.byteOut.size(); 78 | } 79 | } 80 | 81 | // Copy 
/**
 * Finishes the load: writes the terminating block pointer (which must be
 * appended before trimming), releases over-allocated space in the block
 * sequence, and flushes the current in-memory buffer into the table.
 *
 * @throws IOException if flushing the pending buffer fails
 */
@Override
public void close() throws IOException {
    // Ending block pointer: points just past the last stored string.
    this.blocks.append(this.storedBuffersSize + this.byteOut.size());

    // Trim the block-pointer sequence to its final size.
    this.blocks.aggresiveTrimToSize();

    // Persist whatever remains in the in-memory buffer.
    this.storeBuffer(this.buffer);
}
141 | 142 | out.setCRC(new CRC32()); 143 | for (byte[] buffer : this.data) { 144 | IOUtil.writeBuffer(out, buffer, 0, buffer.length, listener); 145 | } 146 | out.writeCRC(); 147 | } 148 | 149 | /* 150 | * (non-Javadoc) 151 | * 152 | * @see hdt.dictionary.DictionarySection#extract(int) 153 | */ 154 | @Override 155 | public CharSequence extract(int id) { 156 | 157 | // System.out.println("id = " + id); 158 | 159 | if (id < 1 || id > this.numstrings) { 160 | return null; 161 | } 162 | 163 | // Locate block 164 | int blockid = (id - 1) / this.blocksize; 165 | int nstring = (id - 1) % this.blocksize; 166 | 167 | // System.out.println("blockid = " + blockid); 168 | // System.out.println("nstring = " + nstring); 169 | 170 | byte[] block = this.data[blockid / this.blockPerBuffer]; 171 | int pos = (int) (this.blocks.get(blockid) - this.posFirst[blockid / this.blockPerBuffer]); 172 | 173 | // System.out.println("pos = " + pos); 174 | 175 | // Copy first string 176 | int len = ByteStringUtil.strlen(block, pos); 177 | 178 | // System.out.println("len = " + len); 179 | 180 | Mutable delta = new Mutable(0L); 181 | ReplazableString tempString = new ReplazableString(); 182 | tempString.append(block, pos, len); 183 | 184 | // System.out.println("dentro del for"); 185 | 186 | // Copy strings untill we find our's. 187 | for (int i = 0; i < nstring; i++) { 188 | pos += len + 1; 189 | // System.out.println("pos = " + pos); 190 | pos += VByte.decode(block, pos, delta); 191 | // System.out.println("pos = " + pos); 192 | // System.out.println("delta = [" + delta + "]"); 193 | len = ByteStringUtil.strlen(block, pos); 194 | // System.out.println("len = " + len); 195 | tempString.replace(delta.getValue().intValue(), block, pos, len); 196 | // System.out.println("tempstring = [" + tempString + "]"); 197 | } 198 | return tempString; 199 | } 200 | 201 | /** 202 | * Locate the block of a string doing binary search. 
203 | */ 204 | @Override 205 | protected int locateBlock(CharSequence str) { 206 | int low = 0; 207 | int high = (int) this.blocks.getNumberOfElements() - 1; 208 | int max = high; 209 | 210 | while (low <= high) { 211 | int mid = (low + high) >>> 1; 212 | 213 | int cmp; 214 | if (mid == max) { 215 | cmp = -1; 216 | } else { 217 | cmp = ByteStringUtil.strcmp(str, this.data[mid / this.blockPerBuffer], (int) (this.blocks.get(mid) - this.posFirst[mid / this.blockPerBuffer])); 218 | 219 | // if (str.toString().contains("http://dbpedia.org/ontology/Agent") || str.toString().contains("The Health Inspector pays a visit") || str.toString().contains("Crockett_Middle_School") || str.toString().contains("Benthosuchus")) { 220 | // System.out.println("Block: "+ mid + ": "+ ByteStringUtil.asString(data[mid / blockPerBuffer], (int) (this.blocks.get(mid) - this.posFirst[mid / blockPerBuffer])) + " Result: " + cmp); 221 | // } 222 | } 223 | 224 | if (cmp < 0) { 225 | high = mid - 1; 226 | } else if (cmp > 0) { 227 | low = mid + 1; 228 | } else { 229 | return mid; // key found 230 | } 231 | } 232 | return -(low + 1); // key not found. 
233 | } 234 | 235 | @Override 236 | protected int locateInBlock(int blockid, CharSequence str) { 237 | 238 | ReplazableString tempString = new ReplazableString(); 239 | 240 | Mutable delta = new Mutable(0L); 241 | int idInBlock = 0; 242 | int cshared = 0; 243 | 244 | byte[] block = this.data[blockid / this.blockPerBuffer]; 245 | int pos = (int) (this.blocks.get(blockid) - this.posFirst[blockid / this.blockPerBuffer]); 246 | 247 | // Read the first string in the block 248 | int slen = ByteStringUtil.strlen(block, pos); 249 | tempString.append(block, pos, slen); 250 | pos += slen + 1; 251 | idInBlock++; 252 | 253 | while ((idInBlock < this.blocksize) && (pos < block.length)) { 254 | // Decode prefix 255 | pos += VByte.decode(block, pos, delta); 256 | 257 | // Copy suffix 258 | slen = ByteStringUtil.strlen(block, pos); 259 | tempString.replace(delta.getValue().intValue(), block, pos, slen); 260 | 261 | if (delta.getValue() >= cshared) { 262 | // Current delta value means that this string 263 | // has a larger long common prefix than the previous one 264 | // if (str.toString().contains("http://dbpedia.org/ontology/Agent") || str.toString().contains("The Health Inspector pays a visit") || str.toString().contains("Crockett_Middle_School") || str.toString().contains("Benthosuchus")) { 265 | // System.out.println("[" + tempString + "]. cshared [" + cshared + "]"); 266 | // } 267 | cshared += ByteStringUtil.longestCommonPrefix(tempString, str, cshared); 268 | 269 | if ((cshared == str.length()) && (tempString.length() == str.length())) { 270 | break; 271 | } 272 | } else { 273 | // We have less common characters than before, 274 | // this string is bigger that what we are looking for. 275 | // i.e. Not found. 
276 | idInBlock = 0; 277 | break; 278 | } 279 | pos += slen + 1; 280 | idInBlock++; 281 | 282 | } 283 | 284 | // Not found 285 | if (pos == block.length || idInBlock == this.blocksize) { 286 | idInBlock = 0; 287 | } 288 | 289 | return idInBlock; 290 | } 291 | 292 | } 293 | -------------------------------------------------------------------------------- /src/org/rdfhdt/hdt/hdt/impl/TransientHDT.java: -------------------------------------------------------------------------------- 1 | package org.rdfhdt.hdt.hdt.impl; 2 | 3 | import java.io.IOException; 4 | import java.io.OutputStream; 5 | 6 | import org.rdfhdt.hdt.dictionary.DictionaryPrivate; 7 | import org.rdfhdt.hdt.header.HeaderPrivate; 8 | import org.rdfhdt.hdt.listener.ProgressListener; 9 | import org.rdfhdt.hdt.options.HDTOptions; 10 | import org.rdfhdt.hdt.triples.TriplesPrivate; 11 | 12 | /** 13 | * @author José M. Giménez-García 14 | * 15 | * @Note: HDTImpl modified to make fields protected instead of private 16 | * 17 | */ 18 | public class TransientHDT extends HDTImpl { 19 | 20 | public TransientHDT(HDTOptions spec) { 21 | super(spec); 22 | } 23 | 24 | public void setHeader(HeaderPrivate header) { 25 | this.header = header; 26 | } 27 | 28 | public void setDictionary(DictionaryPrivate dictionary) { 29 | this.dictionary = dictionary; 30 | } 31 | 32 | @Override 33 | public void setTriples(TriplesPrivate triples) { 34 | this.triples = triples; 35 | } 36 | 37 | @Override 38 | public void saveToHDT(OutputStream output, ProgressListener listener) throws IOException { 39 | // TODO Auto-generated method stub 40 | super.saveToHDT(output, listener); 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/org/rdfhdt/hdt/triples/ScapedTripleString.java: -------------------------------------------------------------------------------- 1 | package org.rdfhdt.hdt.triples; 2 | 3 | import org.rdfhdt.hdt.exceptions.ParserException; 4 | 5 | /** 6 | * TripleString holds a 
/**
 * Parses one N-Triples-style line into subject, predicate and object,
 * keeping escape sequences as-is (hence "Scaped"). Components are
 * space-separated; subject and predicate may be wrapped in '&lt;' '&gt;'.
 * On malformed input the method returns silently, possibly leaving the
 * triple only partially set — it never actually throws, despite the
 * declared ParserException.
 *
 * @param line the raw line to parse
 */
@Override
public void read(String line) throws ParserException {
    int split, posa, posb;
    this.clear();

    // SET SUBJECT
    posa = 0;

    if (line.charAt(posa) == '<') { // subject between '<' and '>' symbols
        posa++; // Remove <
        posb = line.indexOf('>', posa);
        split = posb + 1;
    } else { // subject until the first space
        posb = split = line.indexOf(' ', posa);
    }
    if (posb == -1) {
        return; // Not found, error.
    }

    this.setSubject(line.substring(posa, posb));

    // SET PREDICATE
    posa = split + 1;

    if (line.charAt(posa) == '<') { // predicate between '<' and '>' symbols
        posa++; // Remove <
        posb = line.indexOf('>', posa);
        split = posb + 1;
    } else { // predicate until the first space
        posb = split = line.indexOf(' ', posa);
    }
    if (posb == -1) {
        return; // Not found, error.
    }

    this.setPredicate(line.substring(posa, posb));

    // SET OBJECT: everything up to the trailing " ." of the N-Triples line.
    posa = split + 1;
    posb = line.length();

    if (line.charAt(posb - 1) == '.') {
        posb--; // Remove trailing dot from NTRIPLES.
    }
    if (line.charAt(posb - 1) == ' ') {
        posb--; // Drop the single space before the dot, if present.
    }

    if (line.charAt(posa) == '<') {
        posa++;

        // Remove trailing > only if < appears, so "some"^^ is kept as-is.
        if (posb > posa && line.charAt(posb - 1) == '>') {
            posb--;
        }
    }

    this.setObject(line.substring(posa, posb));
}
/**
 * Appends one triple (component ids) to the bitmap-triples structure.
 * Triples MUST arrive sorted by subject, then predicate, then object, with
 * subjects increasing by exactly one; a triple identical to the previous
 * one (same S, P and O) is silently skipped (duplicate removal).
 *
 * @param triple the triple to add; all components must be non-zero
 * @throws IllegalFormatException if a component is 0 or ordering is violated
 */
public void add(TripleID triple) {
    TransientSequenceLog64 vectorY = (TransientSequenceLog64) this.seqY;
    TransientSequenceLog64 vectorZ = (TransientSequenceLog64) this.seqZ;
    TransientBitmap375 bitY = (TransientBitmap375) this.bitmapY;
    TransientBitmap375 bitZ = (TransientBitmap375) this.bitmapZ;

    // Reorder components into this structure's order before comparing.
    TripleOrderConvert.swapComponentOrder(triple, TripleComponentOrder.SPO, this.order);
    this.x = triple.getSubject();
    this.y = triple.getPredicate();
    this.z = triple.getObject();

    if (this.x == 0 || this.y == 0 || this.z == 0) {
        throw new IllegalFormatException("None of the components of a triple can be null");
    }

    if (this.numTriples == 0) {
        // First triple: open the first Y and Z lists.
        vectorY.append(this.y);
        vectorZ.append(this.z);
    } else if (this.x != this.lastX) {
        if (this.x != this.lastX + 1) {
            throw new IllegalFormatException("Upper level must be increasing and correlative.");
        }
        // X changed: mark the start of a new Y-list and a new Z-list.
        bitY.append(true);
        vectorY.append(this.y);

        bitZ.append(true);
        vectorZ.append(this.z);
    } else if (this.y != this.lastY) {
        if (this.y < this.lastY) {
            throw new IllegalFormatException("Middle level must be increasing for each parent.");
        }

        // Y changed within the same X: extend Y-list, start a new Z-list.
        bitY.append(false);
        vectorY.append(this.y);

        bitZ.append(true);
        vectorZ.append(this.z);
    } else if (this.z != this.lastZ) { // Added to drop duplicate triples
        if (this.z < this.lastZ) {
            throw new IllegalFormatException("Lower level must be increasing for each parent.");
        }

        // Z changed within the same X,Y: extend the current Z-list.
        bitZ.append(false);
        vectorZ.append(this.z);
    }
    // else: exact duplicate of the previous triple — intentionally skipped.

    this.lastX = this.x;
    this.lastY = this.y;
    this.lastZ = this.z;
}
vectorZ = (TransientSequenceLog64) this.seqZ; 155 | TransientBitmap375 bitY = (TransientBitmap375) this.bitmapY; 156 | TransientBitmap375 bitZ = (TransientBitmap375) this.bitmapZ; 157 | // Bitmap375 bitY = (Bitmap375) this.bitmapY; 158 | // Bitmap375 bitZ = (Bitmap375) this.bitmapZ; 159 | 160 | bitY.append(true); 161 | bitZ.append(true); 162 | 163 | bitY.close(); 164 | bitZ.close(); 165 | 166 | vectorY.close(); 167 | vectorZ.close(); 168 | 169 | // System.out.println("bitmapY size = " + this.bitmapY.getNumBits()); 170 | // System.out.println("seqY size = " + this.seqY.getNumberOfElements()); 171 | // System.out.println("bitmapZ size = " + this.bitmapZ.getNumBits()); 172 | // System.out.println("seqZ size = " + this.seqZ.getNumberOfElements()); 173 | 174 | if (this.trimNeeded) { 175 | vectorY.aggresiveTrimToSize(); 176 | vectorZ.trimToSize(); 177 | } 178 | 179 | this.adjY = new AdjacencyList(this.seqY, this.bitmapY); 180 | this.adjZ = new AdjacencyList(this.seqZ, this.bitmapZ); 181 | 182 | // DEBUG 183 | // this.adjY.dump(); 184 | // this.adjZ.dump(); 185 | } 186 | 187 | @Override 188 | public long getNumberOfElements() { 189 | return this.number; 190 | } 191 | 192 | @Override 193 | public long size() { 194 | return this.size; 195 | } 196 | 197 | } 198 | -------------------------------------------------------------------------------- /src/org/rdfhdt/listener/HDTBuilderListener.java: -------------------------------------------------------------------------------- 1 | package org.rdfhdt.listener; 2 | 3 | import org.rdfhdt.hdt.listener.ProgressListener; 4 | import org.rdfhdt.mrbuilder.HDTBuilderConfiguration; 5 | 6 | public class HDTBuilderListener implements ProgressListener { 7 | 8 | boolean quiet; 9 | 10 | public HDTBuilderListener(HDTBuilderConfiguration conf) { 11 | this.quiet = conf.getQuiet(); 12 | } 13 | 14 | public HDTBuilderListener(boolean quiet) { 15 | this.quiet = quiet; 16 | } 17 | 18 | @Override 19 | public void notifyProgress(float level, String 
message) { 20 | if (!this.quiet) { 21 | System.out.print("\r" + message + "\t" + Float.toString(level) + " \r"); 22 | } 23 | } 24 | } -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/HDTBuilderConfiguration.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. 
Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder; 24 | 25 | import java.io.IOException; 26 | 27 | import org.apache.commons.io.FilenameUtils; 28 | import org.apache.commons.lang.StringUtils; 29 | import org.apache.hadoop.conf.Configuration; 30 | import org.apache.hadoop.fs.Path; 31 | import org.rdfhdt.hdt.options.HDTSpecification; 32 | 33 | import com.beust.jcommander.JCommander; 34 | import com.beust.jcommander.Parameter; 35 | 36 | public class HDTBuilderConfiguration { 37 | 38 | public final static int CHUNK_SIZE = 1 * 1024 * 1024; 39 | 40 | public final static String SHARED = "shared"; 41 | public final static String SUBJECTS = "subjects"; 42 | public final static String PREDICATES = "predicates"; 43 | public final static String OBJECTS = "objects"; 44 | public final static String SAMPLE = "samples"; 45 | 46 | public final static String SHARED_OUTPUT_PATH = SHARED + "/"; 47 | public final static String SUBJECTS_OUTPUT_PATH = SUBJECTS + "/"; 48 | public final static String PREDICATES_OUTPUT_PATH = PREDICATES + "/"; 49 | public final static String OBJECTS_OUTPUT_PATH = OBJECTS + "/"; 50 | public final static String SAMPLE_OUTPUT_PATH = SAMPLE + "/"; 51 | 52 | final static String DEFAULT_CONFIGURATION_PATH = "HDTMRBuilder.xml"; 53 | 54 | final static String AWS_BUCKET_NAME = "global.bucket"; 55 | final static String AWS_BUCKET_DEFAULT_VALUE = null; 56 | 57 | final static String BASE_PATH_NAME = "global.path.base"; 58 | final static String BASE_PATH_DEFAULT_VALUE = "."; 59 | final static String INPUT_PATH_NAME = "global.path.input"; 60 | final static String INPUT_PATH_DEFAULT_VALUE = "input"; 61 | 62 | final static String DICTIONARY_RUN_JOB_NAME = "job.dictionary.run"; 63 | final static Boolean DICTIONARY_RUN_JOB_DEFAULT_VALUE = true; 64 | final static String DICTIONARY_JOB_NAME_NAME = "job.dictionary.name"; 65 | final static String DICTIONARY_JOB_NAME_DEFAULT_VALUE = "DictionaryJob"; 66 | final static String 
DICTIONARY_OUTPUT_PATH_NAME = "job.dictionary.path.output"; 67 | final static String DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE = "dictionary"; 68 | final static String DICTIONARY_DELETE_OUTPUT_PATH_NAME = "job.dictionary.path.output.delete"; 69 | final static boolean DICTIONARY_DELETE_OUTPUT_PATH_DEFAULT_VALUE = false; 70 | final static String DICTIONARY_NUM_REDUCERS_NAME = "job.dictionary.reducers"; 71 | final static int DICTIONARY_NUM_REDUCERS_DEFAULT_VALUE = 1; 72 | 73 | final static String DICTIONARY_RUN_SAMPLE_NAME = "job.dictionary.sample.run"; 74 | final static boolean DICTIONARY_RUN_SAMPLE_DEFAULT_VALUE = true; 75 | final static String DICTIONARY_SAMPLE_PROBABILITY_NAME = "job.dictionary.sample.probability"; 76 | final static float DICTIONARY_SAMPLE_PROBABILITY_DEFAULT_VALUE = (float) 0.001; 77 | final static String DICTIONARY_SAMPLE_OUTPUT_PATH_NAME = "job.dictionary.path.sample"; 78 | final static String DICTIONARY_SAMPLE_OUTPUT_PATH_DEFAULT_VALUE = "dictionary_samples"; 79 | final static String DICTIONARY_DELETE_SAMPLE_PATH_NAME = "job.dictionary.path.sample.delete"; 80 | final static boolean DICTIONARY_DELETE_SAMPLE_PATH_DEFAULT_VALUE = false; 81 | final static String DICTIONARY_SAMPLE_NUM_REDUCERS_NAME = "job.dictionary.sample.reducers"; 82 | final static int DICTIONARY_SAMPLE_NUM_REDUCERS_DEFAULT_VALUE = 1; 83 | 84 | final static String HDTDICTIONARY_BUILD_NAME = "hdt.dictionary.build"; 85 | final static boolean HDTDICTIONARY_BUILD_DEFAULT_VALUE = true; 86 | final static String HDTDICTIONARY_FILE_NAME = "hdt.dictionary.file"; 87 | final static String HDTDICTIONARY_FILE_DEFAULT_VALUE = "dictionary.hdt"; 88 | final static String HDTDICTIONARY_DISTRIBUTION_NAME = "job.triples.dictionary.distribution"; 89 | final static int HDTDICTIONARY_DISTRIBUTION_DEFAULT_VALUE = 1; 90 | 91 | final static String TRIPLES_RUN_JOB_NAME = "job.triples.run"; 92 | final static boolean TRIPLES_RUN_JOB_DEFAULT_VALUE = true; 93 | final static String TRIPLES_JOB_NAME_NAME = 
"job.triples.name"; 94 | final static String TRIPLES_JOB_NAME_DEFAULT_VALUE = "TriplesJob"; 95 | // final static String TRIPLES_MAP_DICTIONARY_FILE_NAME = "job.triples.map.dictionary.file"; 96 | // final static String TRIPLES_MAP_DICTIONARY_FILE_DEFAULT_VALUE = "dictionary_map.hdt"; 97 | // final static String TRIPLES_REDUCE_DICTIONARY_FILE_NAME = "job.triples.reduce.dictionary.file"; 98 | // final static String TRIPLES_REDUCE_DICTIONARY_FILE_DEFAULT_VALUE = "dictionary_reduce.hdt"; 99 | final static String TRIPLES_OUTPUT_PATH_NAME = "job.triples.path.output"; 100 | final static String TRIPLES_OUTPUT_PATH_DEFAULT_VALUE = "triples"; 101 | final static String TRIPLES_DELETE_OUTPUT_PATH_NAME = "job.triples.path.output.delete"; 102 | final static boolean TRIPLES_DELETE_OUTPUT_PATH_DEFAULT_VALUE = false; 103 | final static String TRIPLES_NUM_REDUCERS_NAME = "job.triples.reducers"; 104 | final static int TRIPLES_NUM_REDUCERS_DEFAULT_VALUE = 1; 105 | 106 | final static String TRIPLES_RUN_SAMPLE_NAME = "job.triples.sample.run"; 107 | final static boolean TRIPLES_RUN_SAMPLE_DEFAULT_VALUE = true; 108 | final static String TRIPLES_SAMPLE_PROBABILITY_NAME = "job.triples.sample.probability"; 109 | final static float TRIPLES_SAMPLE_PROBABILITY_DEFAULT_VALUE = (float) 0.001; 110 | final static String TRIPLES_SAMPLE_OUTPUT_PATH_NAME = "job.triples.path.sample"; 111 | final static String TRIPLES_SAMPLE_OUTPUT_PATH_DEFAULT_VALUE = "triples_samples"; 112 | final static String TRIPLES_DELETE_SAMPLE_PATH_NAME = "job.triples.path.sample.delete"; 113 | final static boolean TRIPLES_DELETE_SAMPLE_PATH_DEFAULT_VALUE = false; 114 | final static String TRIPLES_SAMPLE_NUM_REDUCERS_NAME = "job.triples.sample.reducers"; 115 | final static int TRIPLES_SAMPLE_NUM_REDUCERS_DEFAULT_VALUE = 1; 116 | 117 | final static String HDT_BUILD_NAME = "hdt.build"; 118 | final static boolean HDT_BUILD_DEFAULT_VALUE = true; 119 | final static String HDT_OUTPUT_PATH_NAME = "hdt.path.output"; 120 | final static 
String HDT_OUTPUT_PATH_DEFAULT_VALUE = "hdt_output"; 121 | final static String HDT_FILE_NAME = "hdt.file"; 122 | final static String HDT_FILE_DEFAULT_VALUE = "output.hdt"; 123 | 124 | final static String CONFIG_FILE_NAME = "hdt-lib.configFile"; 125 | final static String CONFIG_FILE_DEFAULT_VALUE = null; 126 | final static String OPTIONS_NAME = "hdtl-lib.options"; 127 | final static String OPTIONS_DEFAULT_VALUE = null; 128 | final static String RDF_TYPE_NAME = "hdt-lib.rdfType"; 129 | final static String RDF_TYPE_DEFAULT_VALUE = "ntriples"; 130 | final static String QUIET_NAME = "hdt-lib.quiet"; 131 | final static boolean QUIET_DEFAULT_VALUE = false; 132 | final static String BASE_URI_NAME = "hdt-lib.baseUri"; 133 | final static String BASE_URI_DEFAULT_VALUE = "http://rdfhdt.org/HDTMR"; 134 | final static String GENERATE_INDEX_NAME = "hdt-lib.generateIndex"; 135 | final static boolean GENERATE_INDEX_DEFAULT_VALUE = false; 136 | 137 | JCommander jc; 138 | 139 | @Parameter(names = { "-h", "--help" }, help = true, hidden = true) 140 | boolean help = false; 141 | 142 | @Parameter(names = { "-a", "--awsbucket" }, description = "Amazon Web Services bucket") 143 | String pAwsBucket = null; 144 | 145 | @Parameter(names = { "-c", "--conf" }, description = "Path to configuration file") 146 | String pConfigFile = null; 147 | 148 | @Parameter(names = { "-b", "--basedir" }, description = "Root directory for the process") 149 | String pBasePath = null; 150 | 151 | @Parameter(names = { "-rd", "--rundictionary" }, description = "Whether to run dictionary job or not", arity = 1) 152 | Boolean pRunDictionary = null; 153 | 154 | @Parameter(names = { "-rds", "--rundictionarysampling" }, description = "Whether to run dictionary input sampling job or not", arity = 1) 155 | Boolean pRunDictionarySampling = null; 156 | 157 | @Parameter(names = { "-nd", "--namedictionaryjob" }, description = "Name of dictionary job") 158 | String pDictionaryName = null; 159 | 160 | @Parameter(names = { 
"-i", "--input" }, description = "Path to input files. Relative to basedir") 161 | String pInputPath = null; 162 | 163 | @Parameter(names = { "-sd", "--samplesdictionary" }, description = "Path to dictionary job sample files. Relative to basedir") 164 | String pDictionarySamplePath = null; 165 | 166 | @Parameter(names = { "-st", "--samplestriples" }, description = "Path to triples job sample files. Relative to basedir") 167 | String pTriplesSamplePath = null; 168 | 169 | @Parameter(names = { "-od", "--outputdictionary" }, description = "Path to dictionary job output files. Relative to basedir") 170 | String pDictionaryOutputPath = null; 171 | 172 | @Parameter(names = { "-dd", "--deleteoutputdictionary" }, description = "Delete dictionary job output path before running job") 173 | Boolean pDeleteDictionaryOutputPath = null; 174 | 175 | @Parameter(names = { "-dsd", "--deletesampledictionary" }, description = "Delete dictionary job sample path before running job") 176 | Boolean pDeleteDictionarySamplePath = null; 177 | 178 | @Parameter(names = { "-dst", "--deletesampletriples" }, description = "Delete triples job sample path before running job") 179 | Boolean pDeleteTriplesSamplePath = null; 180 | 181 | @Parameter(names = { "-Rd", "--reducersdictionary" }, description = "Number of reducers for dictionary job") 182 | Integer pNumReducersDictionary = null; 183 | 184 | @Parameter(names = { "-Rds", "--reducersdictionarysampling" }, description = "Number of reducers for dictionary input sampling job") 185 | Integer pNumReducersDictionarySampling = null; 186 | 187 | @Parameter(names = { "-bd", "--builddictionary" }, description = "Whether to build HDT dictionary or not", arity = 1) 188 | Boolean pBuildDictionary = null; 189 | 190 | @Parameter(names = { "-bh", "--buildhdt" }, description = "Whether to build HDT or not", arity = 1) 191 | Boolean pBuildHDT = null; 192 | 193 | @Parameter(names = { "-fd", "--filedictionary" }, description = "Name of hdt dictionary file") 194 | 
String pDictionaryFileName = null; 195 | 196 | @Parameter(names = { "-fm", "--filesubjects" }, description = "Name of hdt dictionary file for Mappers") 197 | String pMapDictionaryFileName = null; 198 | 199 | @Parameter(names = { "-fr", "--fileobjects" }, description = "Name of hdt dictionary file for Reducers") 200 | String pReduceDictionaryFileName = null; 201 | 202 | @Parameter(names = { "-d", "--dictionarydistribution" }, description = "Dictionary distribution among mappers and reducers") 203 | Integer pDictionaryDistribution = null; 204 | 205 | @Parameter(names = { "-rt", "--runtriples" }, description = "Whether to run triples job or not", arity = 1) 206 | Boolean pRunTriples = null; 207 | 208 | @Parameter(names = { "-rts", "--runtriplessampling" }, description = "Whether to run triples input sampling job or not", arity = 1) 209 | Boolean pRunTriplesSampling = null; 210 | 211 | @Parameter(names = { "-nt", "--nametriplesjob" }, description = "Name of triples job") 212 | String pTriplesName = null; 213 | 214 | @Parameter(names = { "-it", "--inputtriples" }, description = "Path to triples job input files. Relative to basedir") 215 | String pTriplesInputPath = null; 216 | 217 | @Parameter(names = { "-ot", "--outputtriples" }, description = "Path to triples job output files. 
Relative to basedir") 218 | String pTriplesOutputPath = null; 219 | 220 | @Parameter(names = { "-dt", "--deleteoutputtriples" }, description = "Delete triples job output path before running job") 221 | Boolean pDeleteTriplesOutputPath = null; 222 | 223 | @Parameter(names = { "-Rt", "--reducerstriples" }, description = "Number of reducers for triples job") 224 | Integer pNumReducersTriples = null; 225 | 226 | @Parameter(names = { "-Rts", "--reducerstriplessampling" }, description = "Number of reducers for triples input sampling job") 227 | Integer pNumReducersTriplesSampling = null; 228 | 229 | @Parameter(names = { "-fh", "--namehdtfile" }, description = "Name of hdt file") 230 | String pHdtFileName = null; 231 | 232 | @Parameter(names = { "-hc", "--hdtconf" }, description = "Conversion config file") 233 | String pHdtConfigFile = null; 234 | 235 | @Parameter(names = { "-o", "--options" }, description = "HDT Conversion options (override those of config file)") 236 | String pOptions = null; 237 | 238 | @Parameter(names = { "-t", "--rdftype" }, description = "Type of RDF Input (ntriples, nquad, n3, turtle, rdfxml)") 239 | String pRdfType = null; 240 | 241 | @Parameter(names = { "-bu", "--baseURI" }, description = "Base URI for the dataset") 242 | String pBaseURI = null; 243 | 244 | @Parameter(names = { "-q", "--quiet" }, description = "Do not show progress of the conversion") 245 | Boolean pQuiet = null; 246 | 247 | @Parameter(names = { "-x", "--index" }, description = "Generate also external indices to solve all queries") 248 | Boolean pGenerateIndex = null; 249 | 250 | @Parameter(names = { "-p", "--sampleprobability" }, description = "Probability of using each element for sampling") 251 | Float pSampleProbability = null; 252 | 253 | Path inputPath = null, dictionarySamplesPath = null, dictionaryOutputPath = null, sharedOutputPath = null, subjectsOutputPath = null, predicatesOutputPath = null, objectsOutputPath = null; 254 | Path dictionaryCountersFile = null, 
triplesSamplesPath = null, triplesCountersFile = null, hdtDictionarySPOFile = null, hdtMapDictionaryFile = null, hdtReduceDictionaryFile = null, hdtFile = null;
	Path triplesInputPath = null, triplesOutputPath = null;

	// Backing Hadoop configuration; every getter falls back to it when no CLI value was given.
	Configuration mrConfiguration = new Configuration();

	// hdt-java specification object, built lazily by getSpec().
	HDTSpecification spec;

	// This constructor is to be used by Tasks (Mappers and/or Reducers)
	public HDTBuilderConfiguration(Configuration config) throws IOException {
		this.mrConfiguration = config;
	}

	// This constructor is to be used by Drivers
	public HDTBuilderConfiguration(String[] args) {
		this.jc = new JCommander(this, args);
		if (this.help) {
			this.jc.usage();
			System.exit(1);
		}
		this.addConfigurationResource(this.getConfigFile());

		// FIXME: this should be done for every parameter passed on the
		// command line, not only the dictionary output path
		this.setProperty(DICTIONARY_OUTPUT_PATH_NAME, this.getDictionaryOutputPath().toString());
	}

	// Registers an additional XML resource with the Hadoop configuration.
	private void addConfigurationResource(String configurationPath) {
		this.mrConfiguration.addResource(new Path(configurationPath));
	}

	// Resolves the configuration file path, prefixing the AWS bucket when one is set.
	private String getConfigFile() {
		return this.addBucket(this.pConfigFile != null ?
this.pConfigFile : DEFAULT_CONFIGURATION_PATH); 286 | } 287 | 288 | public Configuration getConfigurationObject() { 289 | return this.mrConfiguration; 290 | } 291 | 292 | public void setProperty(String name, String value) { 293 | this.mrConfiguration.set(name, value); 294 | } 295 | 296 | public void setProperty(String name, int value) { 297 | this.mrConfiguration.setInt(name, value); 298 | } 299 | 300 | public String getAwsBucket() { 301 | return this.get(this.pAwsBucket, AWS_BUCKET_NAME, AWS_BUCKET_DEFAULT_VALUE); 302 | } 303 | 304 | public boolean runDictionary() { 305 | return this.get(this.pRunDictionary, DICTIONARY_RUN_JOB_NAME, DICTIONARY_RUN_JOB_DEFAULT_VALUE); 306 | } 307 | 308 | public boolean runDictionarySampling() { 309 | return this.get(this.pRunDictionarySampling, DICTIONARY_RUN_SAMPLE_NAME, DICTIONARY_RUN_SAMPLE_DEFAULT_VALUE); 310 | } 311 | 312 | public boolean runTriples() { 313 | return this.get(this.pRunTriples, TRIPLES_RUN_JOB_NAME, TRIPLES_RUN_JOB_DEFAULT_VALUE); 314 | } 315 | 316 | public boolean runTriplesSampling() { 317 | return this.get(this.pRunTriplesSampling, TRIPLES_RUN_SAMPLE_NAME, TRIPLES_RUN_SAMPLE_DEFAULT_VALUE); 318 | } 319 | 320 | public boolean buildDictionary() { 321 | return this.get(this.pBuildDictionary, HDTDICTIONARY_BUILD_NAME, HDTDICTIONARY_BUILD_DEFAULT_VALUE); 322 | } 323 | 324 | public boolean buildHDT() { 325 | return this.get(this.pBuildHDT, HDT_BUILD_NAME, HDT_BUILD_DEFAULT_VALUE); 326 | } 327 | 328 | public String getDictionaryJobName() { 329 | return this.get(this.pTriplesName, DICTIONARY_JOB_NAME_NAME, DICTIONARY_JOB_NAME_DEFAULT_VALUE); 330 | } 331 | 332 | public String getTriplesJobName() { 333 | return this.get(this.pTriplesName, DICTIONARY_JOB_NAME_NAME, DICTIONARY_JOB_NAME_DEFAULT_VALUE); 334 | } 335 | 336 | public Path getInputPath() { 337 | if (this.inputPath == null) { 338 | this.inputPath = new Path(this.getPath(this.get(this.pInputPath, INPUT_PATH_NAME, INPUT_PATH_DEFAULT_VALUE))); 339 | } 340 | return 
this.inputPath; 341 | } 342 | 343 | public Path getDictionaryOutputPath() { 344 | if (this.dictionaryOutputPath == null) { 345 | this.dictionaryOutputPath = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE))); 346 | } 347 | return this.dictionaryOutputPath; 348 | } 349 | 350 | public Path getSharedSectionPath() { 351 | if (this.sharedOutputPath == null) { 352 | this.sharedOutputPath = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + SHARED_OUTPUT_PATH); 353 | } 354 | return this.sharedOutputPath; 355 | } 356 | 357 | public Path getSubjectsSectionPath() { 358 | if (this.subjectsOutputPath == null) { 359 | this.subjectsOutputPath = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + SUBJECTS_OUTPUT_PATH); 360 | } 361 | return this.subjectsOutputPath; 362 | } 363 | 364 | public Path getPredicatesSectionPath() { 365 | if (this.predicatesOutputPath == null) { 366 | this.predicatesOutputPath = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + PREDICATES_OUTPUT_PATH); 367 | } 368 | return this.predicatesOutputPath; 369 | } 370 | 371 | public Path getObjectsSectionPath() { 372 | if (this.objectsOutputPath == null) { 373 | this.objectsOutputPath = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + OBJECTS_OUTPUT_PATH); 374 | } 375 | return this.objectsOutputPath; 376 | } 377 | 378 | public Path getDictionarySamplesPath() { 379 | if (this.dictionarySamplesPath == null) { 380 | this.dictionarySamplesPath = new Path(this.getPath(this.get(this.pDictionarySamplePath, DICTIONARY_SAMPLE_OUTPUT_PATH_NAME, DICTIONARY_SAMPLE_OUTPUT_PATH_DEFAULT_VALUE))); 381 | } 382 | return 
this.dictionarySamplesPath; 383 | } 384 | 385 | public Path getTriplesSamplesPath() { 386 | if (this.triplesSamplesPath == null) { 387 | this.triplesSamplesPath = new Path(this.getPath(this.get(this.pTriplesSamplePath, TRIPLES_SAMPLE_OUTPUT_PATH_NAME, TRIPLES_SAMPLE_OUTPUT_PATH_DEFAULT_VALUE))); 388 | } 389 | return this.triplesSamplesPath; 390 | } 391 | 392 | public float getSampleProbability() { 393 | return this.get(this.pSampleProbability, DICTIONARY_SAMPLE_PROBABILITY_NAME, DICTIONARY_SAMPLE_PROBABILITY_DEFAULT_VALUE); 394 | } 395 | 396 | public Path getDictionaryCountersFile() { 397 | if (this.dictionaryCountersFile == null) { 398 | this.dictionaryCountersFile = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + ".info"); 399 | } 400 | return this.dictionaryCountersFile; 401 | } 402 | 403 | public Path getDictionaryFile() { 404 | if (this.hdtDictionarySPOFile == null) { 405 | this.hdtDictionarySPOFile = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + this.get(this.pDictionaryFileName, HDTDICTIONARY_FILE_NAME, HDTDICTIONARY_FILE_DEFAULT_VALUE)); 406 | } 407 | return this.hdtDictionarySPOFile; 408 | } 409 | 410 | // public Path getDictionaryMapFile() { 411 | // if (this.hdtMapDictionaryFile == null) { 412 | // this.hdtMapDictionaryFile = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + this.get(this.pMapDictionaryFileName, TRIPLES_MAP_DICTIONARY_FILE_NAME, TRIPLES_MAP_DICTIONARY_FILE_DEFAULT_VALUE)); 413 | // } 414 | // return this.hdtMapDictionaryFile; 415 | // } 416 | // 417 | // public Path getDictionaryReduceFile() { 418 | // if (this.hdtReduceDictionaryFile == null) { 419 | // this.hdtReduceDictionaryFile = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, 
DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + this.get(this.pReduceDictionaryFileName, TRIPLES_REDUCE_DICTIONARY_FILE_NAME, TRIPLES_REDUCE_DICTIONARY_FILE_DEFAULT_VALUE)); 420 | // } 421 | // return this.hdtReduceDictionaryFile; 422 | // } 423 | 424 | public int getDictionaryDistribution() { 425 | return this.get(this.pDictionaryDistribution, HDTDICTIONARY_DISTRIBUTION_NAME, HDTDICTIONARY_DISTRIBUTION_DEFAULT_VALUE); 426 | } 427 | 428 | public Path getTriplesOutputPath() { 429 | if (this.triplesOutputPath == null) { 430 | this.triplesOutputPath = new Path(this.getPath(this.get(this.pTriplesOutputPath, TRIPLES_OUTPUT_PATH_NAME, TRIPLES_OUTPUT_PATH_DEFAULT_VALUE))); 431 | } 432 | return this.triplesOutputPath; 433 | } 434 | 435 | public Path getTriplesCountersFile() { 436 | if (this.triplesCountersFile == null) { 437 | this.triplesCountersFile = new Path(this.getPath(this.get(this.pTriplesOutputPath, TRIPLES_OUTPUT_PATH_NAME, TRIPLES_OUTPUT_PATH_DEFAULT_VALUE)) + ".info"); 438 | } 439 | return this.triplesCountersFile; 440 | } 441 | 442 | public Path getHDTFile() { 443 | if (this.hdtFile == null) { 444 | this.hdtFile = new Path(this.getPath(this.get(this.pHdtFileName, HDT_FILE_NAME, HDT_FILE_DEFAULT_VALUE))); 445 | } 446 | return this.hdtFile; 447 | } 448 | 449 | public boolean getDeleteDictionaryOutputPath() { 450 | return this.get(this.pDeleteDictionaryOutputPath, DICTIONARY_DELETE_OUTPUT_PATH_NAME, DICTIONARY_DELETE_OUTPUT_PATH_DEFAULT_VALUE); 451 | } 452 | 453 | public boolean getDeleteDictionarySamplesPath() { 454 | return this.get(this.pDeleteDictionarySamplePath, DICTIONARY_DELETE_SAMPLE_PATH_NAME, DICTIONARY_DELETE_SAMPLE_PATH_DEFAULT_VALUE); 455 | } 456 | 457 | public boolean getDeleteTriplesOutputPath() { 458 | return this.get(this.pDeleteTriplesOutputPath, TRIPLES_DELETE_OUTPUT_PATH_NAME, TRIPLES_DELETE_OUTPUT_PATH_DEFAULT_VALUE); 459 | } 460 | 461 | public boolean getDeleteTriplesSamplesPath() { 462 | return this.get(this.pDeleteTriplesSamplePath, 
TRIPLES_DELETE_SAMPLE_PATH_NAME, TRIPLES_DELETE_SAMPLE_PATH_DEFAULT_VALUE); 463 | } 464 | 465 | public int getDictionaryReducers() { 466 | return this.get(this.pNumReducersDictionary, DICTIONARY_NUM_REDUCERS_NAME, DICTIONARY_NUM_REDUCERS_DEFAULT_VALUE); 467 | } 468 | 469 | public int getDictionarySampleReducers() { 470 | return this.get(this.pNumReducersDictionarySampling, DICTIONARY_SAMPLE_NUM_REDUCERS_NAME, DICTIONARY_SAMPLE_NUM_REDUCERS_DEFAULT_VALUE); 471 | } 472 | 473 | public int getTriplesReducers() { 474 | return this.get(this.pNumReducersTriples, TRIPLES_NUM_REDUCERS_NAME, TRIPLES_NUM_REDUCERS_DEFAULT_VALUE); 475 | } 476 | 477 | public int getTriplesSampleReducers() { 478 | return this.get(this.pNumReducersTriplesSampling, TRIPLES_SAMPLE_NUM_REDUCERS_NAME, TRIPLES_SAMPLE_NUM_REDUCERS_DEFAULT_VALUE); 479 | } 480 | 481 | public String getHdtConfigFile() { 482 | return this.getPath(this.get(this.pHdtConfigFile, CONFIG_FILE_NAME, CONFIG_FILE_DEFAULT_VALUE)); 483 | } 484 | 485 | public String getOptions() { 486 | return this.get(this.pOptions, OPTIONS_NAME, OPTIONS_DEFAULT_VALUE); 487 | } 488 | 489 | public String getRdfType() { 490 | return this.get(this.pRdfType, RDF_TYPE_NAME, RDF_TYPE_DEFAULT_VALUE); 491 | } 492 | 493 | public boolean getQuiet() { 494 | return this.get(this.pQuiet, QUIET_NAME, QUIET_DEFAULT_VALUE); 495 | } 496 | 497 | public String getBaseURI() { 498 | return this.get(this.pBaseURI, BASE_URI_NAME, BASE_URI_DEFAULT_VALUE); 499 | } 500 | 501 | public HDTSpecification getSpec() throws IOException { 502 | if (this.spec == null) { 503 | if (this.getHdtConfigFile() != null) { 504 | this.spec = new HDTSpecification(this.getHdtConfigFile()); 505 | } else { 506 | this.spec = new HDTSpecification(); 507 | } 508 | if (this.getOptions() != null) { 509 | this.spec.setOptions(this.getOptions()); 510 | } 511 | } 512 | return this.spec; 513 | } 514 | 515 | private String get(String paramValue, String confName, String defaultValue) { 516 | return paramValue 
!= null ? paramValue : this.mrConfiguration.get(confName, defaultValue); 517 | } 518 | 519 | private boolean get(Boolean paramValue, String confName, boolean defaultValue) { 520 | return paramValue != null ? paramValue : this.mrConfiguration.getBoolean(confName, defaultValue); 521 | } 522 | 523 | private int get(Integer paramValue, String confName, int defaultValue) { 524 | return paramValue != null ? paramValue : this.mrConfiguration.getInt(confName, defaultValue); 525 | } 526 | 527 | private float get(Float paramValue, String confName, float defaultValue) { 528 | return paramValue != null ? paramValue : this.mrConfiguration.getFloat(confName, defaultValue); 529 | } 530 | 531 | private String getPath(String path) { 532 | // Add Base Path 533 | return FilenameUtils.concat(this.get(this.pBasePath, BASE_PATH_NAME, BASE_PATH_DEFAULT_VALUE), path); 534 | } 535 | 536 | private String addBucket(String path) { 537 | // If bucket is provided as parameter, and configuration path is 538 | // relative, create absolute configuration path 539 | if (this.getAwsBucket() != null && !path.startsWith("s3n://")) { 540 | path = "s3n://" + this.getAwsBucket() + "/" + StringUtils.removeStart(path, "/"); 541 | } 542 | return path; 543 | } 544 | 545 | // private void set(Integer paramValue, String confName, int defautlValue) { 546 | // mrConfiguration.setInt(confName, paramValue != null ? paramValue : 547 | // mrConfiguration.getInt(confName, defautlValue)); 548 | // } 549 | 550 | } 551 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/HDTBuilderDriver.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. 
Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. 
Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder; 24 | 25 | import java.io.BufferedInputStream; 26 | import java.io.BufferedOutputStream; 27 | import java.io.BufferedReader; 28 | import java.io.BufferedWriter; 29 | import java.io.IOException; 30 | import java.io.InputStreamReader; 31 | import java.io.OutputStreamWriter; 32 | import java.net.URI; 33 | import java.net.URISyntaxException; 34 | import java.util.Arrays; 35 | 36 | import org.apache.hadoop.filecache.DistributedCache; 37 | import org.apache.hadoop.fs.FileStatus; 38 | import org.apache.hadoop.fs.FileSystem; 39 | import org.apache.hadoop.fs.Path; 40 | import org.apache.hadoop.fs.PathFilter; 41 | import org.apache.hadoop.io.NullWritable; 42 | import org.apache.hadoop.io.SequenceFile; 43 | import org.apache.hadoop.io.Text; 44 | import org.apache.hadoop.mapreduce.Job; 45 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 46 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; 47 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 48 | import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat; 49 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 50 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; 51 | import org.apache.hadoop.mapreduce.lib.partition.InputSampler; 52 | import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner; 53 | import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary; 54 | import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary2; 55 | import org.rdfhdt.hdt.dictionary.impl.section.TransientDictionarySection; 56 | import org.rdfhdt.hdt.hdt.impl.TransientHDT; 57 | import org.rdfhdt.hdt.options.ControlInformation; 58 | import org.rdfhdt.hdt.trans.TransientElement; 59 | import org.rdfhdt.hdt.triples.impl.TransientBitMapTriples; 60 | import org.rdfhdt.listener.HDTBuilderListener; 61 | import org.rdfhdt.mrbuilder.dictionary.DictionaryCombiner; 62 | import 
org.rdfhdt.mrbuilder.dictionary.DictionaryMapper; 63 | import org.rdfhdt.mrbuilder.dictionary.DictionaryReducer; 64 | import org.rdfhdt.mrbuilder.dictionary.DictionarySamplerMapper; 65 | import org.rdfhdt.mrbuilder.dictionary.DictionarySamplerReducer; 66 | import org.rdfhdt.mrbuilder.io.TripleSPOComparator; 67 | import org.rdfhdt.mrbuilder.io.TripleSPOWritable; 68 | import org.rdfhdt.mrbuilder.triples.TriplesSPOMapper; 69 | import org.rdfhdt.mrbuilder.util.FileStatusComparator; 70 | 71 | import com.hadoop.mapreduce.LzoTextInputFormat; 72 | 73 | public class HDTBuilderDriver { 74 | 75 | public enum Counters { 76 | Triples, Subjects, Predicates, Objects, Shared, Sample 77 | } 78 | 79 | protected HDTBuilderConfiguration conf; 80 | protected HDTBuilderListener listener; 81 | protected FileSystem inputFS, dictionaryFS, triplesFS; 82 | protected Long numTriples = null, numShared = null, numSubjects = null, numPredicates = null, numObjects = null; 83 | protected FourSectionDictionary2 dictionary = null; 84 | 85 | public HDTBuilderDriver(String[] args) throws IOException { 86 | 87 | // load configuration 88 | this.conf = new HDTBuilderConfiguration(args); 89 | 90 | this.listener = new HDTBuilderListener(this.conf); 91 | 92 | // get the FileSystem instances for each path 93 | this.inputFS = this.conf.getInputPath().getFileSystem(this.conf.getConfigurationObject()); 94 | this.dictionaryFS = this.conf.getDictionaryOutputPath().getFileSystem(this.conf.getConfigurationObject()); 95 | this.triplesFS = this.conf.getTriplesOutputPath().getFileSystem(this.conf.getConfigurationObject()); 96 | 97 | } 98 | 99 | public static void main(String[] args) throws Exception { 100 | boolean ok = true; 101 | HDTBuilderDriver driver = new HDTBuilderDriver(args); 102 | 103 | if (ok && driver.conf.runDictionarySampling()) { 104 | if (driver.conf.getDictionaryReducers() == 1) { 105 | System.out.println("WARNING: Only one Reducer. 
Dictionary creation as a single job is more efficient."); 106 | } 107 | ok = driver.runDictionaryJobSampling(); 108 | } 109 | 110 | if (ok && driver.conf.runDictionary()) { 111 | if (driver.conf.getDictionaryReducers() > 1) { 112 | ok = driver.runDictionaryJob(); 113 | } else { 114 | ok = driver.runDictionaryJobWithOneJob(); 115 | } 116 | } 117 | 118 | if (ok && driver.conf.buildDictionary()) { 119 | ok = driver.buildDictionary(); 120 | } 121 | 122 | if (ok && driver.conf.runTriplesSampling()) { 123 | if (driver.conf.getTriplesReducers() == 1) { 124 | System.out.println("WARNING: Only one Reducer. Triples creation as a single job is more efficient."); 125 | } 126 | ok = driver.runTriplesJobSampling(); 127 | } 128 | 129 | if (ok && driver.conf.runTriples()) { 130 | if (driver.conf.getTriplesReducers() > 1) { 131 | ok = driver.runTriplesJob(); 132 | } else { 133 | ok = driver.runTriplesJobWithOneJob(); 134 | } 135 | } 136 | 137 | if (ok && driver.conf.buildHDT()) { 138 | ok = driver.buidHDT(); 139 | } 140 | 141 | System.exit(ok ? 0 : 1); 142 | } 143 | 144 | protected boolean runDictionaryJobSampling() throws IOException, ClassNotFoundException, InterruptedException { 145 | boolean jobOK; 146 | Job job = null; 147 | 148 | // if input path does not exists, fail 149 | if (!this.inputFS.exists(this.conf.getInputPath())) { 150 | System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath()); 151 | System.exit(-1); 152 | } 153 | 154 | // if samples path exists... 155 | if (this.dictionaryFS.exists(this.conf.getDictionarySamplesPath())) { 156 | if (this.conf.getDeleteDictionarySamplesPath()) { // ... and option provided, delete recursively 157 | this.dictionaryFS.delete(this.conf.getDictionarySamplesPath(), true); 158 | } else { // ... 
and option not provided, fail 159 | System.out.println("Dictionary samples path does exist: " + this.conf.getDictionarySamplesPath()); 160 | System.out.println("Select other path or use option -ds to overwrite"); 161 | System.exit(-1); 162 | } 163 | } 164 | 165 | // Job to create a SequenceInputFormat with Roles 166 | job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 1"); 167 | job.setJarByClass(HDTBuilderDriver.class); 168 | 169 | System.out.println("input = " + this.conf.getInputPath()); 170 | System.out.println("samples = " + this.conf.getDictionarySamplesPath()); 171 | 172 | FileInputFormat.addInputPath(job, this.conf.getInputPath()); 173 | FileOutputFormat.setOutputPath(job, this.conf.getDictionarySamplesPath()); 174 | 175 | job.setInputFormatClass(LzoTextInputFormat.class); 176 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); 177 | 178 | job.setMapperClass(DictionarySamplerMapper.class); 179 | job.setMapOutputKeyClass(Text.class); 180 | job.setMapOutputValueClass(Text.class); 181 | job.setCombinerClass(DictionarySamplerReducer.class); 182 | job.setReducerClass(DictionarySamplerReducer.class); 183 | job.setOutputKeyClass(Text.class); 184 | job.setOutputValueClass(Text.class); 185 | 186 | job.setNumReduceTasks(this.conf.getDictionarySampleReducers()); 187 | 188 | SequenceFileOutputFormat.setCompressOutput(job, true); 189 | SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class); 190 | SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); 191 | 192 | jobOK = job.waitForCompletion(true); 193 | 194 | return jobOK; 195 | } 196 | 197 | protected boolean runDictionaryJob() throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException { 198 | boolean jobOK; 199 | Job job = null; 200 | BufferedWriter bufferedWriter; 201 | 202 | // if output path exists... 
203 | if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) { 204 | if (this.conf.getDeleteDictionaryOutputPath()) { // ... and option provided, delete recursively 205 | this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true); 206 | } else { // ... and option not provided, fail 207 | System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath()); 208 | System.out.println("Select other path or use option -dd to overwrite"); 209 | System.exit(-1); 210 | } 211 | } 212 | 213 | // Sample the SequenceInputFormat to do TotalSort and create final output 214 | job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 2"); 215 | 216 | job.setJarByClass(HDTBuilderDriver.class); 217 | 218 | System.out.println("samples = " + this.conf.getDictionarySamplesPath()); 219 | System.out.println("output = " + this.conf.getDictionaryOutputPath()); 220 | 221 | FileInputFormat.addInputPath(job, this.conf.getDictionarySamplesPath()); 222 | FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath()); 223 | 224 | job.setInputFormatClass(SequenceFileInputFormat.class); 225 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); 226 | 227 | // Identity Mapper 228 | // job.setMapperClass(Mapper.class); 229 | job.setCombinerClass(DictionaryCombiner.class); 230 | job.setPartitionerClass(TotalOrderPartitioner.class); 231 | job.setReducerClass(DictionaryReducer.class); 232 | 233 | job.setNumReduceTasks(this.conf.getDictionaryReducers()); 234 | 235 | job.setMapOutputKeyClass(Text.class); 236 | job.setMapOutputValueClass(Text.class); 237 | 238 | job.setOutputKeyClass(Text.class); 239 | job.setOutputValueClass(NullWritable.class); 240 | 241 | System.out.println("Sampling started"); 242 | InputSampler.writePartitionFile(job, new InputSampler.IntervalSampler(this.conf.getSampleProbability())); 243 | String partitionFile = 
TotalOrderPartitioner.getPartitionFile(job.getConfiguration()); 244 | URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH); 245 | DistributedCache.addCacheFile(partitionUri, job.getConfiguration()); 246 | DistributedCache.createSymlink(job.getConfiguration()); 247 | System.out.println("Sampling finished"); 248 | 249 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class, Text.class, NullWritable.class); 250 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class, Text.class, NullWritable.class); 251 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class, Text.class, NullWritable.class); 252 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class, Text.class, NullWritable.class); 253 | 254 | SequenceFileOutputFormat.setCompressOutput(job, true); 255 | SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class); 256 | SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); 257 | 258 | jobOK = job.waitForCompletion(true); 259 | 260 | this.numShared = job.getCounters().findCounter(Counters.Shared).getValue(); 261 | this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue(); 262 | this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue(); 263 | this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue(); 264 | 265 | bufferedWriter = new BufferedWriter(new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile()))); 266 | 267 | bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n"); 268 | bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n"); 269 | bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + 
"\n"); 270 | bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n"); 271 | 272 | bufferedWriter.close(); 273 | 274 | return jobOK; 275 | } 276 | 277 | protected boolean runDictionaryJobWithOneJob() throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException { 278 | boolean jobOK; 279 | Job job = null; 280 | BufferedWriter bufferedWriter; 281 | 282 | // if input path does not exists, fail 283 | if (!this.inputFS.exists(this.conf.getInputPath())) { 284 | System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath()); 285 | System.exit(-1); 286 | } 287 | 288 | // if output path exists... 289 | if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) { 290 | if (this.conf.getDeleteDictionaryOutputPath()) { // ... and option provided, delete recursively 291 | this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true); 292 | } else { // ... and option not provided, fail 293 | System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath()); 294 | System.out.println("Select other path or use option -dd to overwrite"); 295 | System.exit(-1); 296 | } 297 | } 298 | 299 | // Launch job 300 | job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName()); 301 | job.setJarByClass(HDTBuilderDriver.class); 302 | 303 | FileInputFormat.addInputPath(job, this.conf.getInputPath()); 304 | FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath()); 305 | 306 | job.setInputFormatClass(LzoTextInputFormat.class); 307 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); 308 | 309 | job.setMapperClass(DictionaryMapper.class); 310 | job.setCombinerClass(DictionaryCombiner.class); 311 | job.setReducerClass(DictionaryReducer.class); 312 | 313 | job.setNumReduceTasks(this.conf.getDictionaryReducers()); 314 | 315 | job.setMapOutputKeyClass(Text.class); 316 | job.setMapOutputValueClass(Text.class); 317 | 318 | 
job.setOutputKeyClass(Text.class); 319 | job.setOutputValueClass(NullWritable.class); 320 | 321 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class, Text.class, NullWritable.class); 322 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class, Text.class, NullWritable.class); 323 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class, Text.class, NullWritable.class); 324 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class, Text.class, NullWritable.class); 325 | 326 | jobOK = job.waitForCompletion(true); 327 | 328 | this.numShared = job.getCounters().findCounter(Counters.Shared).getValue(); 329 | this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue(); 330 | this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue(); 331 | this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue(); 332 | 333 | bufferedWriter = new BufferedWriter(new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile()))); 334 | 335 | bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n"); 336 | bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n"); 337 | bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n"); 338 | bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n"); 339 | 340 | bufferedWriter.close(); 341 | 342 | return jobOK; 343 | } 344 | 345 | protected boolean buildDictionary() throws IOException { 346 | FourSectionDictionary dictionary4mappers, dictionary4reducers; 347 | 348 | // if job not ran, read Counters 349 | if (!this.conf.runDictionary()) { 350 | 351 | System.out.println("Dictionary job not ran. 
Reading data from file."); 352 | 353 | BufferedReader reader = new BufferedReader(new InputStreamReader(this.dictionaryFS.open(this.conf.getDictionaryCountersFile()))); 354 | String line = reader.readLine(); 355 | while (line != null) { 356 | String[] data = line.split("="); 357 | switch (data[0]) { 358 | case HDTBuilderConfiguration.SHARED: 359 | this.numShared = Long.parseLong(data[1]); 360 | break; 361 | case HDTBuilderConfiguration.SUBJECTS: 362 | this.numSubjects = Long.parseLong(data[1]); 363 | break; 364 | case HDTBuilderConfiguration.PREDICATES: 365 | this.numPredicates = Long.parseLong(data[1]); 366 | break; 367 | case HDTBuilderConfiguration.OBJECTS: 368 | this.numObjects = Long.parseLong(data[1]); 369 | } 370 | line = reader.readLine(); 371 | } 372 | reader.close(); 373 | } 374 | 375 | TransientDictionarySection shared = new TransientDictionarySection(this.conf.getSpec()); 376 | TransientDictionarySection subjects = new TransientDictionarySection(this.conf.getSpec()); 377 | TransientDictionarySection predicates = new TransientDictionarySection(this.conf.getSpec()); 378 | TransientDictionarySection objects = new TransientDictionarySection(this.conf.getSpec()); 379 | 380 | 381 | 382 | if (this.dictionaryFS.exists(this.conf.getSharedSectionPath())) { 383 | System.out.println("Shared section = " + this.conf.getSharedSectionPath()); 384 | this.loadFromDir(shared, this.numShared, this.dictionaryFS, this.conf.getSharedSectionPath()); 385 | } 386 | 387 | this.loadFromDir(subjects, this.numSubjects, this.dictionaryFS, this.conf.getSubjectsSectionPath()); 388 | this.loadFromDir(predicates, this.numPredicates, this.dictionaryFS, this.conf.getPredicatesSectionPath()); 389 | this.loadFromDir(objects, this.numObjects, this.dictionaryFS, this.conf.getObjectsSectionPath()); 390 | 391 | System.out.println("Saving dictionary..."); 392 | this.dictionary = new FourSectionDictionary2(this.conf.getSpec(), subjects, predicates, objects, shared); 393 | 
this.saveDictionary(this.dictionary, this.dictionaryFS, this.conf.getDictionaryFile()); 394 | 395 | return true; 396 | 397 | } 398 | 399 | protected boolean runTriplesJobSampling() throws ClassNotFoundException, IOException, InterruptedException { 400 | Job job = null; 401 | boolean jobOK; 402 | BufferedWriter bufferedWriter; 403 | 404 | // if input path does not exists, fail 405 | if (!this.inputFS.exists(this.conf.getInputPath())) { 406 | System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath()); 407 | System.exit(-1); 408 | } 409 | 410 | // if dictionary output path does not exists, fail 411 | if (!this.dictionaryFS.exists(this.conf.getInputPath())) { 412 | System.out.println("Dictionary output path does not exist: " + this.conf.getInputPath()); 413 | System.exit(-1); 414 | } 415 | 416 | // if samples path exists, fail 417 | if (this.dictionaryFS.exists(this.conf.getTriplesSamplesPath())) { 418 | if (this.conf.getDeleteTriplesSamplesPath()) { // ... and option 419 | // provided, delete 420 | // recursively 421 | this.dictionaryFS.delete(this.conf.getTriplesSamplesPath(), true); 422 | } else { // ... 
and option not provided, fail 423 | System.out.println("Triples samples path does exist: " + this.conf.getTriplesSamplesPath()); 424 | System.out.println("Select other path or use option -dst to overwrite"); 425 | System.exit(-1); 426 | } 427 | } 428 | 429 | this.conf.setProperty("mapred.child.java.opts", "-XX:ErrorFile=/home/hadoop/tmp/hs_err_pid%p.log -Xmx2500m"); 430 | 431 | // Job to create a SequenceInputFormat 432 | job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 1"); 433 | 434 | job.setJarByClass(HDTBuilderDriver.class); 435 | 436 | FileInputFormat.addInputPath(job, this.conf.getInputPath()); 437 | FileOutputFormat.setOutputPath(job, this.conf.getTriplesSamplesPath()); 438 | 439 | job.setInputFormatClass(LzoTextInputFormat.class); 440 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); 441 | 442 | job.setMapperClass(TriplesSPOMapper.class); 443 | job.setSortComparatorClass(TripleSPOComparator.class); 444 | job.setGroupingComparatorClass(TripleSPOComparator.class); 445 | job.setMapOutputKeyClass(TripleSPOWritable.class); 446 | job.setMapOutputValueClass(NullWritable.class); 447 | job.setOutputKeyClass(TripleSPOWritable.class); 448 | job.setOutputValueClass(NullWritable.class); 449 | 450 | job.setNumReduceTasks(this.conf.getTriplesReducers()); 451 | 452 | DistributedCache.addCacheFile(this.conf.getDictionaryFile().toUri(), job.getConfiguration()); 453 | 454 | SequenceFileOutputFormat.setCompressOutput(job, true); 455 | SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class); 456 | SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); 457 | 458 | jobOK = job.waitForCompletion(true); 459 | 460 | this.numTriples = job.getCounters().findCounter(Counters.Triples).getValue(); 461 | bufferedWriter = new BufferedWriter(new OutputStreamWriter(this.triplesFS.create(this.conf.getTriplesCountersFile()))); 462 | 
bufferedWriter.write(this.numTriples.toString() + "\n"); 463 | bufferedWriter.close(); 464 | 465 | return jobOK; 466 | } 467 | 468 | protected boolean runTriplesJob() throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException { 469 | Job job = null; 470 | boolean jobOK; 471 | 472 | // if triples output path exists... 473 | if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) { 474 | if (this.conf.getDeleteTriplesOutputPath()) { // ... and option provided, delete recursively 475 | this.triplesFS.delete(this.conf.getTriplesOutputPath(), true); 476 | } else { // ... and option not provided, fail 477 | System.out.println("Triples output path does exist: " + this.conf.getTriplesOutputPath()); 478 | System.out.println("Select other path or use option -dt to overwrite"); 479 | System.exit(-1); 480 | } 481 | } 482 | 483 | job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 2"); 484 | 485 | job.setJarByClass(HDTBuilderDriver.class); 486 | 487 | FileInputFormat.addInputPath(job, this.conf.getTriplesSamplesPath()); 488 | FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath()); 489 | 490 | job.setInputFormatClass(SequenceFileInputFormat.class); 491 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); 492 | 493 | job.setSortComparatorClass(TripleSPOComparator.class); 494 | job.setGroupingComparatorClass(TripleSPOComparator.class); 495 | 496 | job.setPartitionerClass(TotalOrderPartitioner.class); 497 | 498 | job.setOutputKeyClass(TripleSPOWritable.class); 499 | job.setOutputValueClass(NullWritable.class); 500 | 501 | job.setNumReduceTasks(this.conf.getTriplesReducers()); 502 | 503 | System.out.println("Sampling started"); 504 | InputSampler.writePartitionFile(job, new InputSampler.IntervalSampler(this.conf.getSampleProbability())); 505 | String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration()); 506 | URI partitionUri = new URI(partitionFile 
+ "#" + TotalOrderPartitioner.DEFAULT_PATH); 507 | DistributedCache.addCacheFile(partitionUri, job.getConfiguration()); 508 | DistributedCache.createSymlink(job.getConfiguration()); 509 | System.out.println("Sampling finished"); 510 | 511 | SequenceFileOutputFormat.setCompressOutput(job, true); 512 | SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class); 513 | SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); 514 | 515 | jobOK = job.waitForCompletion(true); 516 | 517 | return jobOK; 518 | } 519 | 520 | protected boolean runTriplesJobWithOneJob() throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException { 521 | Job job = null; 522 | boolean jobOK; 523 | BufferedWriter bufferedWriter; 524 | 525 | // if input path does not exists, fail 526 | if (!this.inputFS.exists(this.conf.getInputPath())) { 527 | System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath()); 528 | System.exit(-1); 529 | } 530 | 531 | // if dictionary output path does not exists, fail 532 | if (!this.dictionaryFS.exists(this.conf.getInputPath())) { 533 | System.out.println("Dictionary output path does not exist: " + this.conf.getInputPath()); 534 | System.exit(-1); 535 | } 536 | 537 | // if triples output path exists... 538 | if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) { 539 | if (this.conf.getDeleteTriplesOutputPath()) { // ... and option provided, delete recursively 540 | this.triplesFS.delete(this.conf.getTriplesOutputPath(), true); 541 | } else { // ... 
and option not provided, fail 542 | System.out.println("Triples output path does exist: " + this.conf.getTriplesOutputPath()); 543 | System.out.println("Select other path or use option -dt to overwrite"); 544 | System.exit(-1); 545 | } 546 | } 547 | 548 | // Launch job 549 | this.conf.setProperty("mapred.child.java.opts", "-XX:ErrorFile=/home/hadoop/tmp/hs_err_pid%p.log -Xmx2500m"); 550 | 551 | job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName()); 552 | job.setJarByClass(HDTBuilderDriver.class); 553 | 554 | FileInputFormat.addInputPath(job, this.conf.getInputPath()); 555 | FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath()); 556 | 557 | job.setInputFormatClass(LzoTextInputFormat.class); 558 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class); 559 | 560 | job.setMapperClass(TriplesSPOMapper.class); 561 | job.setSortComparatorClass(TripleSPOComparator.class); 562 | job.setMapOutputKeyClass(TripleSPOWritable.class); 563 | job.setMapOutputValueClass(NullWritable.class); 564 | 565 | job.setNumReduceTasks(this.conf.getTriplesReducers()); 566 | 567 | job.setOutputKeyClass(TripleSPOWritable.class); 568 | job.setOutputValueClass(NullWritable.class); 569 | 570 | DistributedCache.addCacheFile(this.conf.getDictionaryFile().toUri(), job.getConfiguration()); 571 | // DistributedCache.addCacheFile(this.conf.getDictionaryMapFile().toUri(), job.getConfiguration()); 572 | // DistributedCache.addCacheFile(this.conf.getDictionaryReduceFile().toUri(), job.getConfiguration()); 573 | 574 | jobOK = job.waitForCompletion(true); 575 | 576 | this.numTriples = job.getCounters().findCounter(Counters.Triples).getValue(); 577 | bufferedWriter = new BufferedWriter(new OutputStreamWriter(this.triplesFS.create(this.conf.getTriplesCountersFile()))); 578 | bufferedWriter.write(this.numTriples.toString() + "\n"); 579 | bufferedWriter.close(); 580 | 581 | return jobOK; 582 | } 583 | 584 | protected boolean buidHDT() throws 
IOException { 585 | BufferedOutputStream output; 586 | TransientHDT hdt = new TransientHDT(this.conf.getSpec()); 587 | TransientBitMapTriples triples = new TransientBitMapTriples(this.conf.getSpec(), this.triplesFS, new Path("temp")); 588 | 589 | // if dictionary not built, load it 590 | if (this.dictionary == null) { 591 | System.out.println("Dictionary not built. Reading data from " + this.conf.getDictionaryFile()); 592 | this.dictionary = this.loadDictionary(this.dictionaryFS, this.conf.getDictionaryFile()); 593 | } 594 | 595 | // if maxvalues not loaded, read Counters 596 | if (!this.conf.runDictionary()) { 597 | 598 | System.out.println("Dictionary Samples job not ran. Reading data from file."); 599 | 600 | BufferedReader reader = new BufferedReader(new InputStreamReader(this.dictionaryFS.open(this.conf.getDictionaryCountersFile()))); 601 | String line = reader.readLine(); 602 | while (line != null) { 603 | String[] data = line.split("="); 604 | switch (data[0]) { 605 | case HDTBuilderConfiguration.SHARED: 606 | this.numShared = Long.parseLong(data[1]); 607 | break; 608 | case HDTBuilderConfiguration.SUBJECTS: 609 | this.numSubjects = Long.parseLong(data[1]); 610 | break; 611 | case HDTBuilderConfiguration.PREDICATES: 612 | this.numPredicates = Long.parseLong(data[1]); 613 | break; 614 | case HDTBuilderConfiguration.OBJECTS: 615 | this.numObjects = Long.parseLong(data[1]); 616 | } 617 | line = reader.readLine(); 618 | } 619 | reader.close(); 620 | } 621 | 622 | // if triples job not ran, read Counters 623 | if (!this.conf.runTriples()) { 624 | System.out.println("Triples job nor ran. 
Reading data from " + this.conf.getTriplesCountersFile()); 625 | BufferedReader reader = new BufferedReader(new InputStreamReader(this.dictionaryFS.open(this.conf.getTriplesCountersFile()))); 626 | this.numTriples = Long.parseLong(reader.readLine()); 627 | reader.close(); 628 | } 629 | 630 | this.loadFromDir(triples, this.numTriples, this.numPredicates, (this.numShared + this.numObjects), this.triplesFS, this.conf.getTriplesOutputPath()); 631 | 632 | hdt.setDictionary(this.dictionary); 633 | hdt.setTriples(triples); 634 | hdt.populateHeaderStructure(this.conf.getBaseURI()); 635 | 636 | output = new BufferedOutputStream(this.triplesFS.create(this.conf.getHDTFile())); 637 | hdt.saveToHDT(output, this.listener); 638 | output.close(); 639 | 640 | return true; 641 | } 642 | 643 | protected void loadFromDir(TransientElement part, long numentries, FileSystem fs, Path path) throws IOException { 644 | PathFilter filter = new PathFilter() { 645 | @Override 646 | public boolean accept(Path path) { 647 | return !path.getName().startsWith("_"); 648 | } 649 | }; 650 | FileStatus[] status = fs.listStatus(path, filter); 651 | 652 | if (status.length == 0) { 653 | System.out.println("Path [" + path + "] has no files. 
Initializing section."); 654 | part.initialize(0); 655 | } else { 656 | Arrays.sort(status, new FileStatusComparator()); 657 | 658 | System.out.println("Initializing section " + path); 659 | part.initialize(numentries); 660 | for (FileStatus file : status) { 661 | System.out.println("Reading file [" + file.getPath() + "]"); 662 | SequenceFile.Reader reader = new SequenceFile.Reader(fs, file.getPath(), this.conf.getConfigurationObject()); 663 | part.load(reader, this.listener); 664 | reader.close(); 665 | } 666 | System.out.println("Closing section " + path); 667 | part.close(); 668 | } 669 | } 670 | 671 | protected void loadFromDir(TransientBitMapTriples part, long numentries, long maxpredicate, long maxobject, FileSystem fs, Path path) throws IOException { 672 | PathFilter filter = new PathFilter() { 673 | @Override 674 | public boolean accept(Path path) { 675 | return !path.getName().startsWith("_"); 676 | } 677 | }; 678 | FileStatus[] status = fs.listStatus(path, filter); 679 | 680 | if (status.length == 0) { 681 | System.out.println("Path [" + path + "] has no files. 
Initializing section."); 682 | part.initialize(0, 0); 683 | } else { 684 | Arrays.sort(status, new FileStatusComparator()); 685 | 686 | System.out.println("Initializing section " + path); 687 | part.initialize(numentries, maxpredicate, maxobject); 688 | for (FileStatus file : status) { 689 | System.out.println("Reading file [" + file.getPath() + "]"); 690 | SequenceFile.Reader reader = new SequenceFile.Reader(fs, file.getPath(), this.conf.getConfigurationObject()); 691 | part.load(reader, this.listener); 692 | reader.close(); 693 | } 694 | System.out.println("Closing section " + path); 695 | part.close(); 696 | } 697 | } 698 | 699 | protected FourSectionDictionary2 loadDictionary(FileSystem fs, Path dictionaryPath) throws IOException { 700 | BufferedInputStream input = new BufferedInputStream(fs.open(dictionaryPath)); 701 | FourSectionDictionary2 dictionary = new FourSectionDictionary2(this.conf.getSpec()); 702 | ControlInformation ci = new ControlInformation(); 703 | ci.clear(); 704 | ci.load(input); 705 | dictionary.load(input, ci, this.listener); 706 | return dictionary; 707 | } 708 | 709 | protected void saveDictionary(FourSectionDictionary2 dictionary, FileSystem fs, Path dictionaryPath) throws IOException { 710 | BufferedOutputStream output = new BufferedOutputStream(fs.create(dictionaryPath)); 711 | dictionary.save(output, new ControlInformation(), this.listener); 712 | output.close(); 713 | } 714 | 715 | } 716 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/dictionary/DictionaryCombiner.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. 
Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. 
Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.dictionary; 24 | 25 | import java.io.IOException; 26 | 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.hadoop.mapreduce.Reducer; 29 | 30 | public class DictionaryCombiner extends Reducer { 31 | 32 | @Override 33 | protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 34 | boolean isSubject = false, isPredicate = false, isObject = false; 35 | String output = new String(); 36 | 37 | for (Text value : values) { 38 | if (value.toString().contains("S")) 39 | isSubject = true; 40 | if (value.toString().contains("P")) 41 | isPredicate = true; 42 | if (value.toString().contains("O")) 43 | isObject = true; 44 | } 45 | 46 | if (isSubject) 47 | output = output.concat("S"); 48 | if (isPredicate) 49 | output = output.concat("P"); 50 | if (isObject) 51 | output = output.concat("O"); 52 | 53 | context.write(key, new Text(output)); 54 | 55 | // if (key.toString().toString().contains("Forest Green is an unincorporated community in southeastern Chariton County")) 56 | // System.out.println("Combiner: " + key.toString()); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/dictionary/DictionaryMapper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.dictionary; 24 | 25 | import java.io.IOException; 26 | 27 | import org.apache.hadoop.io.LongWritable; 28 | import org.apache.hadoop.io.Text; 29 | import org.apache.hadoop.mapreduce.Mapper; 30 | import org.rdfhdt.hdt.exceptions.ParserException; 31 | import org.rdfhdt.hdt.triples.TripleString; 32 | 33 | public class DictionaryMapper extends Mapper { 34 | 35 | @Override 36 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 37 | 38 | TripleString triple = new TripleString(); 39 | try { 40 | triple.read(value.toString()); 41 | } catch (ParserException e) { 42 | // TODO Auto-generated catch block 43 | e.printStackTrace(); 44 | } 45 | 46 | context.write(new Text(triple.getSubject().toString()), new Text("S")); 47 | context.write(new Text(triple.getPredicate().toString()), new Text("P")); 48 | context.write(new Text(triple.getObject().toString()), new Text("O")); 49 | 50 | // if (triple.getObject().toString().toString().contains("Forest Green is an unincorporated community in southeastern Chariton County")) 51 | // System.out.println("Mapper: " + triple.getObject().toString()); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/dictionary/DictionaryReducer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. 
Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. 
Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.dictionary; 24 | 25 | import java.io.IOException; 26 | 27 | import org.apache.hadoop.io.NullWritable; 28 | import org.apache.hadoop.io.Text; 29 | import org.apache.hadoop.mapreduce.Reducer; 30 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs; 31 | import org.rdfhdt.mrbuilder.HDTBuilderConfiguration; 32 | import org.rdfhdt.mrbuilder.HDTBuilderDriver.Counters; 33 | 34 | public class DictionaryReducer extends Reducer { 35 | 36 | protected MultipleOutputs output; 37 | 38 | @Override 39 | protected void setup(Context context) throws IOException, InterruptedException { 40 | this.output = new MultipleOutputs(context); 41 | super.setup(context); 42 | } 43 | 44 | @Override 45 | protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 46 | boolean isSubject = false, isPredicate = false, isObject = false; 47 | 48 | //key = new Text(UnicodeEscape.escapeString(key.toString())); 49 | 50 | for (Text value : values) { 51 | if (value.toString().contains("S")) 52 | isSubject = true; 53 | if (value.toString().contains("P")) 54 | isPredicate = true; 55 | if (value.toString().contains("O")) 56 | isObject = true; 57 | } 58 | 59 | if (isSubject && isObject) { 60 | this.output.write(HDTBuilderConfiguration.SHARED, key, NullWritable.get(), HDTBuilderConfiguration.SHARED_OUTPUT_PATH); 61 | context.getCounter(Counters.Shared).increment(1); 62 | } else { 63 | if (isSubject) { 64 | this.output.write(HDTBuilderConfiguration.SUBJECTS, key, NullWritable.get(), HDTBuilderConfiguration.SUBJECTS_OUTPUT_PATH); 65 | context.getCounter(Counters.Subjects).increment(1); 66 | } 67 | if (isObject) { 68 | this.output.write(HDTBuilderConfiguration.OBJECTS, key, NullWritable.get(), HDTBuilderConfiguration.OBJECTS_OUTPUT_PATH); 69 | context.getCounter(Counters.Objects).increment(1); 70 | } 71 | } 72 | if (isPredicate) { 73 | 
this.output.write(HDTBuilderConfiguration.PREDICATES, key, NullWritable.get(), HDTBuilderConfiguration.PREDICATES_OUTPUT_PATH); 74 | context.getCounter(Counters.Predicates).increment(1); 75 | } 76 | 77 | // if (key.toString().contains("Forest Green is an unincorporated community in southeastern Chariton County")) 78 | // System.out.println("Reducer: " + key.toString()); 79 | } 80 | 81 | @Override 82 | protected void cleanup(Context context) throws IOException, InterruptedException { 83 | this.output.close(); 84 | super.cleanup(context); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/dictionary/DictionarySamplerMapper.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. 
Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.dictionary; 24 | 25 | import java.io.IOException; 26 | 27 | import org.apache.hadoop.io.LongWritable; 28 | import org.apache.hadoop.io.Text; 29 | import org.apache.hadoop.mapreduce.Mapper; 30 | import org.rdfhdt.hdt.exceptions.ParserException; 31 | import org.rdfhdt.hdt.triples.TripleString; 32 | 33 | public class DictionarySamplerMapper extends Mapper { 34 | 35 | @Override 36 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 37 | 38 | TripleString triple = new TripleString(); 39 | try { 40 | triple.read(value.toString()); 41 | } catch (ParserException e) { 42 | // TODO Auto-generated catch block 43 | e.printStackTrace(); 44 | } 45 | 46 | context.write(new Text(triple.getSubject().toString()), new Text("S")); 47 | context.write(new Text(triple.getPredicate().toString()), new Text("P")); 48 | context.write(new Text(triple.getObject().toString()), new Text("O")); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/dictionary/DictionarySamplerReducer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.dictionary; 24 | 25 | import java.io.IOException; 26 | 27 | import org.apache.hadoop.io.Text; 28 | import org.apache.hadoop.mapreduce.Reducer; 29 | 30 | public class DictionarySamplerReducer extends Reducer { 31 | 32 | @Override 33 | protected void reduce(Text key, Iterable values, Context context) throws IOException, InterruptedException { 34 | boolean isSubject = false, isPredicate = false, isObject = false; 35 | String outputValue = ""; 36 | 37 | for (Text value : values) { 38 | if (value.toString().contains("S")) 39 | isSubject = true; 40 | if (value.toString().contains("P")) 41 | isPredicate = true; 42 | if (value.toString().contains("O")) 43 | isObject = true; 44 | } 45 | 46 | if (isSubject) 47 | outputValue = outputValue.concat("S"); 48 | if (isPredicate) 49 | outputValue = outputValue.concat("P"); 50 | if (isObject) 51 | outputValue = outputValue.concat("O"); 52 | 53 | context.write(key, new Text(outputValue)); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/io/TripleSPOComparator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. 
Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.io; 24 | 25 | 26 | public class TripleSPOComparator extends TripleComparator { 27 | 28 | 29 | public TripleSPOComparator() { 30 | super(TripleSPOWritable.class, true); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/io/TripleSPOWritable.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 
package org.rdfhdt.mrbuilder.io;

import org.apache.hadoop.io.LongWritable;

/**
 * Writable triple of dictionary IDs ordered as subject-predicate-object.
 * <p>
 * The storage and serialization of the three components live in the parent
 * {@code TripleWritable}; this subclass only fixes the component order by the
 * order of the arguments passed to the parent constructor.
 */
public class TripleSPOWritable extends TripleWritable {

    /**
     * Creates an empty SPO triple; the three components are initialized to
     * fresh zero-valued {@link LongWritable}s, as required for Hadoop to
     * instantiate and then deserialize the writable reflectively.
     */
    public TripleSPOWritable() {
        super(new LongWritable(), new LongWritable(), new LongWritable());
    }
}
13 | * 14 | * You should have received a copy of the GNU Lesser General Public 15 | * License along with this library; if not, write to the Free Software 16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 | * 18 | * Contacting the authors: 19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at 21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es 22 | */ 23 | package org.rdfhdt.mrbuilder.triples; 24 | 25 | import org.apache.hadoop.io.LongWritable; 26 | import org.apache.hadoop.io.NullWritable; 27 | import org.rdfhdt.hdt.enums.TripleComponentRole; 28 | import org.rdfhdt.hdt.triples.TripleString; 29 | import org.rdfhdt.mrbuilder.io.TripleSPOWritable; 30 | 31 | public class TriplesSPOMapper extends TriplesMapper { 32 | 33 | /* 34 | * (non-Javadoc) 35 | * 36 | * @see org.rdfhdt.mrbuilder.triples.TriplesMapper#key(org.rdfhdt.hdt.triples.TripleString) 37 | */ 38 | @Override 39 | protected TripleSPOWritable key(TripleString tripleString) throws InterruptedException { 40 | long subject, predicate, object; 41 | 42 | if ((subject = this.dictionary.stringToId(tripleString.getSubject(), TripleComponentRole.SUBJECT)) == -1) { 43 | System.out.println("Subject nor found"); 44 | System.out.println("Subject [" + tripleString.getSubject() + "]"); 45 | System.out.println("Predicate [" + tripleString.getPredicate() + "]"); 46 | System.out.println("Object [" + tripleString.getObject() + "]"); 47 | throw new InterruptedException("Dictionary not loaded correctly"); 48 | } 49 | if ((predicate = this.dictionary.stringToId(tripleString.getPredicate(), TripleComponentRole.PREDICATE)) == -1) 50 | { 51 | System.out.println("Predicate nor found"); 52 | System.out.println("Subject [" + tripleString.getSubject() + "]"); 53 | System.out.println("Predicate [" + tripleString.getPredicate() + "]"); 54 | System.out.println("Object [" + tripleString.getObject() + "]"); 55 | throw new 
InterruptedException("Dictionary not loaded correctly"); 56 | } 57 | if ((object = this.dictionary.stringToId(tripleString.getObject(), TripleComponentRole.OBJECT)) == -1) 58 | { 59 | System.out.println("Object nor found"); 60 | System.out.println("Subject [" + tripleString.getSubject() + "]"); 61 | System.out.println("Predicate [" + tripleString.getPredicate() + "]"); 62 | System.out.println("Object [" + tripleString.getObject() + "]"); 63 | throw new InterruptedException("Dictionary not loaded correctly"); 64 | } 65 | 66 | TripleSPOWritable tripleIDs = new TripleSPOWritable(); 67 | tripleIDs.setSubject(new LongWritable(subject)); 68 | tripleIDs.setPredicate(new LongWritable(predicate)); 69 | tripleIDs.setObject(new LongWritable(object)); 70 | return tripleIDs; 71 | } 72 | 73 | /* 74 | * (non-Javadoc) 75 | * 76 | * @see org.rdfhdt.mrbuilder.triples.TriplesMapper#value(org.rdfhdt.hdt.triples.TripleString) 77 | */ 78 | @Override 79 | protected NullWritable value(TripleString tripleString) { 80 | return NullWritable.get(); 81 | } 82 | 83 | } 84 | -------------------------------------------------------------------------------- /src/org/rdfhdt/mrbuilder/util/FileStatusComparator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es 3 | * 4 | * This library is free software; you can redistribute it and/or 5 | * modify it under the terms of the GNU Lesser General Public 6 | * License as published by the Free Software Foundation; either 7 | * version 2.1 of the License, or (at your option) any later version. 8 | * 9 | * This library is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 | * Lesser General Public License for more details. 
package org.rdfhdt.mrbuilder.util;

import java.util.Comparator;

import org.apache.hadoop.fs.FileStatus;


/**
 * Orders {@link FileStatus} entries lexicographically by file name (the last
 * path component), e.g. to process part-files of a MapReduce job in a stable,
 * deterministic sequence.
 * <p>
 * Note: the interface is parameterized as {@code Comparator<FileStatus>};
 * against the raw {@code Comparator} the {@code @Override} on
 * {@code compare(FileStatus, FileStatus)} would not compile, since it does not
 * override the erased {@code compare(Object, Object)}.
 */
public class FileStatusComparator implements Comparator<FileStatus> {

    /*
     * (non-Javadoc)
     *
     * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
     */
    /**
     * Compares two file statuses by the name of their paths.
     *
     * @return a negative value, zero, or a positive value as the first file
     *         name is lexicographically less than, equal to, or greater than
     *         the second
     */
    @Override
    public int compare(FileStatus fs1, FileStatus fs2) {
        return fs1.getPath().getName().compareTo(fs2.getPath().getName());
    }

}