├── .gitignore
├── COPYRIGHT
├── README
├── config
│   ├── HDTMRBuilder.xml
│   ├── lubm-dictionary.xml
│   └── lubm-triples.xml
├── iface
│   └── org
│       └── rdfhdt
│           ├── hdt
│           │   └── trans
│           │       └── TransientElement.java
│           └── mrbuilder
│               ├── io
│               │   ├── TripleComparator.java
│               │   └── TripleWritable.java
│               └── triples
│                   └── TriplesMapper.java
├── pom.xml
└── src
    └── org
        └── rdfhdt
            ├── hdt
            │   ├── compact
            │   │   ├── bitmap
            │   │   │   └── TransientBitmap375.java
            │   │   └── sequence
            │   │       └── TransientSequenceLog64.java
            │   ├── dictionary
            │   │   └── impl
            │   │       ├── FourSectionDictionary2.java
            │   │       └── section
            │   │           ├── DictionarySectionFactory2.java
            │   │           └── TransientDictionarySection.java
            │   ├── hdt
            │   │   └── impl
            │   │       └── TransientHDT.java
            │   └── triples
            │       ├── ScapedTripleString.java
            │       └── impl
            │           └── TransientBitMapTriples.java
            ├── listener
            │   └── HDTBuilderListener.java
            └── mrbuilder
                ├── HDTBuilderConfiguration.java
                ├── HDTBuilderDriver.java
                ├── dictionary
                │   ├── DictionaryCombiner.java
                │   ├── DictionaryMapper.java
                │   ├── DictionaryReducer.java
                │   ├── DictionarySamplerMapper.java
                │   └── DictionarySamplerReducer.java
                ├── io
                │   ├── TripleSPOComparator.java
                │   └── TripleSPOWritable.java
                ├── triples
                │   └── TriplesSPOMapper.java
                └── util
                    └── FileStatusComparator.java
/.gitignore:
--------------------------------------------------------------------------------
1 | # Mac OS X
2 | .DS_Store
3 |
4 | # Editor backup files
5 | *~
6 |
--------------------------------------------------------------------------------
/COPYRIGHT:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 2.1, February 1999
3 |
4 | Copyright (C) 1991, 1999 Free Software Foundation, Inc.
5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
6 | Everyone is permitted to copy and distribute verbatim copies
7 | of this license document, but changing it is not allowed.
8 |
9 | [This is the first released version of the Lesser GPL. It also counts
10 | as the successor of the GNU Library Public License, version 2, hence
11 | the version number 2.1.]
12 |
13 | Preamble
14 |
15 | The licenses for most software are designed to take away your
16 | freedom to share and change it. By contrast, the GNU General Public
17 | Licenses are intended to guarantee your freedom to share and change
18 | free software--to make sure the software is free for all its users.
19 |
20 | This license, the Lesser General Public License, applies to some
21 | specially designated software packages--typically libraries--of the
22 | Free Software Foundation and other authors who decide to use it. You
23 | can use it too, but we suggest you first think carefully about whether
24 | this license or the ordinary General Public License is the better
25 | strategy to use in any particular case, based on the explanations below.
26 |
27 | When we speak of free software, we are referring to freedom of use,
28 | not price. Our General Public Licenses are designed to make sure that
29 | you have the freedom to distribute copies of free software (and charge
30 | for this service if you wish); that you receive source code or can get
31 | it if you want it; that you can change the software and use pieces of
32 | it in new free programs; and that you are informed that you can do
33 | these things.
34 |
35 | To protect your rights, we need to make restrictions that forbid
36 | distributors to deny you these rights or to ask you to surrender these
37 | rights. These restrictions translate to certain responsibilities for
38 | you if you distribute copies of the library or if you modify it.
39 |
40 | For example, if you distribute copies of the library, whether gratis
41 | or for a fee, you must give the recipients all the rights that we gave
42 | you. You must make sure that they, too, receive or can get the source
43 | code. If you link other code with the library, you must provide
44 | complete object files to the recipients, so that they can relink them
45 | with the library after making changes to the library and recompiling
46 | it. And you must show them these terms so they know their rights.
47 |
48 | We protect your rights with a two-step method: (1) we copyright the
49 | library, and (2) we offer you this license, which gives you legal
50 | permission to copy, distribute and/or modify the library.
51 |
52 | To protect each distributor, we want to make it very clear that
53 | there is no warranty for the free library. Also, if the library is
54 | modified by someone else and passed on, the recipients should know
55 | that what they have is not the original version, so that the original
56 | author's reputation will not be affected by problems that might be
57 | introduced by others.
58 |
59 | Finally, software patents pose a constant threat to the existence of
60 | any free program. We wish to make sure that a company cannot
61 | effectively restrict the users of a free program by obtaining a
62 | restrictive license from a patent holder. Therefore, we insist that
63 | any patent license obtained for a version of the library must be
64 | consistent with the full freedom of use specified in this license.
65 |
66 | Most GNU software, including some libraries, is covered by the
67 | ordinary GNU General Public License. This license, the GNU Lesser
68 | General Public License, applies to certain designated libraries, and
69 | is quite different from the ordinary General Public License. We use
70 | this license for certain libraries in order to permit linking those
71 | libraries into non-free programs.
72 |
73 | When a program is linked with a library, whether statically or using
74 | a shared library, the combination of the two is legally speaking a
75 | combined work, a derivative of the original library. The ordinary
76 | General Public License therefore permits such linking only if the
77 | entire combination fits its criteria of freedom. The Lesser General
78 | Public License permits more lax criteria for linking other code with
79 | the library.
80 |
81 | We call this license the "Lesser" General Public License because it
82 | does Less to protect the user's freedom than the ordinary General
83 | Public License. It also provides other free software developers Less
84 | of an advantage over competing non-free programs. These disadvantages
85 | are the reason we use the ordinary General Public License for many
86 | libraries. However, the Lesser license provides advantages in certain
87 | special circumstances.
88 |
89 | For example, on rare occasions, there may be a special need to
90 | encourage the widest possible use of a certain library, so that it becomes
91 | a de-facto standard. To achieve this, non-free programs must be
92 | allowed to use the library. A more frequent case is that a free
93 | library does the same job as widely used non-free libraries. In this
94 | case, there is little to gain by limiting the free library to free
95 | software only, so we use the Lesser General Public License.
96 |
97 | In other cases, permission to use a particular library in non-free
98 | programs enables a greater number of people to use a large body of
99 | free software. For example, permission to use the GNU C Library in
100 | non-free programs enables many more people to use the whole GNU
101 | operating system, as well as its variant, the GNU/Linux operating
102 | system.
103 |
104 | Although the Lesser General Public License is Less protective of the
105 | users' freedom, it does ensure that the user of a program that is
106 | linked with the Library has the freedom and the wherewithal to run
107 | that program using a modified version of the Library.
108 |
109 | The precise terms and conditions for copying, distribution and
110 | modification follow. Pay close attention to the difference between a
111 | "work based on the library" and a "work that uses the library". The
112 | former contains code derived from the library, whereas the latter must
113 | be combined with the library in order to run.
114 |
115 | GNU LESSER GENERAL PUBLIC LICENSE
116 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
117 |
118 | 0. This License Agreement applies to any software library or other
119 | program which contains a notice placed by the copyright holder or
120 | other authorized party saying it may be distributed under the terms of
121 | this Lesser General Public License (also called "this License").
122 | Each licensee is addressed as "you".
123 |
124 | A "library" means a collection of software functions and/or data
125 | prepared so as to be conveniently linked with application programs
126 | (which use some of those functions and data) to form executables.
127 |
128 | The "Library", below, refers to any such software library or work
129 | which has been distributed under these terms. A "work based on the
130 | Library" means either the Library or any derivative work under
131 | copyright law: that is to say, a work containing the Library or a
132 | portion of it, either verbatim or with modifications and/or translated
133 | straightforwardly into another language. (Hereinafter, translation is
134 | included without limitation in the term "modification".)
135 |
136 | "Source code" for a work means the preferred form of the work for
137 | making modifications to it. For a library, complete source code means
138 | all the source code for all modules it contains, plus any associated
139 | interface definition files, plus the scripts used to control compilation
140 | and installation of the library.
141 |
142 | Activities other than copying, distribution and modification are not
143 | covered by this License; they are outside its scope. The act of
144 | running a program using the Library is not restricted, and output from
145 | such a program is covered only if its contents constitute a work based
146 | on the Library (independent of the use of the Library in a tool for
147 | writing it). Whether that is true depends on what the Library does
148 | and what the program that uses the Library does.
149 |
150 | 1. You may copy and distribute verbatim copies of the Library's
151 | complete source code as you receive it, in any medium, provided that
152 | you conspicuously and appropriately publish on each copy an
153 | appropriate copyright notice and disclaimer of warranty; keep intact
154 | all the notices that refer to this License and to the absence of any
155 | warranty; and distribute a copy of this License along with the
156 | Library.
157 |
158 | You may charge a fee for the physical act of transferring a copy,
159 | and you may at your option offer warranty protection in exchange for a
160 | fee.
161 |
162 | 2. You may modify your copy or copies of the Library or any portion
163 | of it, thus forming a work based on the Library, and copy and
164 | distribute such modifications or work under the terms of Section 1
165 | above, provided that you also meet all of these conditions:
166 |
167 | a) The modified work must itself be a software library.
168 |
169 | b) You must cause the files modified to carry prominent notices
170 | stating that you changed the files and the date of any change.
171 |
172 | c) You must cause the whole of the work to be licensed at no
173 | charge to all third parties under the terms of this License.
174 |
175 | d) If a facility in the modified Library refers to a function or a
176 | table of data to be supplied by an application program that uses
177 | the facility, other than as an argument passed when the facility
178 | is invoked, then you must make a good faith effort to ensure that,
179 | in the event an application does not supply such function or
180 | table, the facility still operates, and performs whatever part of
181 | its purpose remains meaningful.
182 |
183 | (For example, a function in a library to compute square roots has
184 | a purpose that is entirely well-defined independent of the
185 | application. Therefore, Subsection 2d requires that any
186 | application-supplied function or table used by this function must
187 | be optional: if the application does not supply it, the square
188 | root function must still compute square roots.)
189 |
190 | These requirements apply to the modified work as a whole. If
191 | identifiable sections of that work are not derived from the Library,
192 | and can be reasonably considered independent and separate works in
193 | themselves, then this License, and its terms, do not apply to those
194 | sections when you distribute them as separate works. But when you
195 | distribute the same sections as part of a whole which is a work based
196 | on the Library, the distribution of the whole must be on the terms of
197 | this License, whose permissions for other licensees extend to the
198 | entire whole, and thus to each and every part regardless of who wrote
199 | it.
200 |
201 | Thus, it is not the intent of this section to claim rights or contest
202 | your rights to work written entirely by you; rather, the intent is to
203 | exercise the right to control the distribution of derivative or
204 | collective works based on the Library.
205 |
206 | In addition, mere aggregation of another work not based on the Library
207 | with the Library (or with a work based on the Library) on a volume of
208 | a storage or distribution medium does not bring the other work under
209 | the scope of this License.
210 |
211 | 3. You may opt to apply the terms of the ordinary GNU General Public
212 | License instead of this License to a given copy of the Library. To do
213 | this, you must alter all the notices that refer to this License, so
214 | that they refer to the ordinary GNU General Public License, version 2,
215 | instead of to this License. (If a newer version than version 2 of the
216 | ordinary GNU General Public License has appeared, then you can specify
217 | that version instead if you wish.) Do not make any other change in
218 | these notices.
219 |
220 | Once this change is made in a given copy, it is irreversible for
221 | that copy, so the ordinary GNU General Public License applies to all
222 | subsequent copies and derivative works made from that copy.
223 |
224 | This option is useful when you wish to copy part of the code of
225 | the Library into a program that is not a library.
226 |
227 | 4. You may copy and distribute the Library (or a portion or
228 | derivative of it, under Section 2) in object code or executable form
229 | under the terms of Sections 1 and 2 above provided that you accompany
230 | it with the complete corresponding machine-readable source code, which
231 | must be distributed under the terms of Sections 1 and 2 above on a
232 | medium customarily used for software interchange.
233 |
234 | If distribution of object code is made by offering access to copy
235 | from a designated place, then offering equivalent access to copy the
236 | source code from the same place satisfies the requirement to
237 | distribute the source code, even though third parties are not
238 | compelled to copy the source along with the object code.
239 |
240 | 5. A program that contains no derivative of any portion of the
241 | Library, but is designed to work with the Library by being compiled or
242 | linked with it, is called a "work that uses the Library". Such a
243 | work, in isolation, is not a derivative work of the Library, and
244 | therefore falls outside the scope of this License.
245 |
246 | However, linking a "work that uses the Library" with the Library
247 | creates an executable that is a derivative of the Library (because it
248 | contains portions of the Library), rather than a "work that uses the
249 | library". The executable is therefore covered by this License.
250 | Section 6 states terms for distribution of such executables.
251 |
252 | When a "work that uses the Library" uses material from a header file
253 | that is part of the Library, the object code for the work may be a
254 | derivative work of the Library even though the source code is not.
255 | Whether this is true is especially significant if the work can be
256 | linked without the Library, or if the work is itself a library. The
257 | threshold for this to be true is not precisely defined by law.
258 |
259 | If such an object file uses only numerical parameters, data
260 | structure layouts and accessors, and small macros and small inline
261 | functions (ten lines or less in length), then the use of the object
262 | file is unrestricted, regardless of whether it is legally a derivative
263 | work. (Executables containing this object code plus portions of the
264 | Library will still fall under Section 6.)
265 |
266 | Otherwise, if the work is a derivative of the Library, you may
267 | distribute the object code for the work under the terms of Section 6.
268 | Any executables containing that work also fall under Section 6,
269 | whether or not they are linked directly with the Library itself.
270 |
271 | 6. As an exception to the Sections above, you may also combine or
272 | link a "work that uses the Library" with the Library to produce a
273 | work containing portions of the Library, and distribute that work
274 | under terms of your choice, provided that the terms permit
275 | modification of the work for the customer's own use and reverse
276 | engineering for debugging such modifications.
277 |
278 | You must give prominent notice with each copy of the work that the
279 | Library is used in it and that the Library and its use are covered by
280 | this License. You must supply a copy of this License. If the work
281 | during execution displays copyright notices, you must include the
282 | copyright notice for the Library among them, as well as a reference
283 | directing the user to the copy of this License. Also, you must do one
284 | of these things:
285 |
286 | a) Accompany the work with the complete corresponding
287 | machine-readable source code for the Library including whatever
288 | changes were used in the work (which must be distributed under
289 | Sections 1 and 2 above); and, if the work is an executable linked
290 | with the Library, with the complete machine-readable "work that
291 | uses the Library", as object code and/or source code, so that the
292 | user can modify the Library and then relink to produce a modified
293 | executable containing the modified Library. (It is understood
294 | that the user who changes the contents of definitions files in the
295 | Library will not necessarily be able to recompile the application
296 | to use the modified definitions.)
297 |
298 | b) Use a suitable shared library mechanism for linking with the
299 | Library. A suitable mechanism is one that (1) uses at run time a
300 | copy of the library already present on the user's computer system,
301 | rather than copying library functions into the executable, and (2)
302 | will operate properly with a modified version of the library, if
303 | the user installs one, as long as the modified version is
304 | interface-compatible with the version that the work was made with.
305 |
306 | c) Accompany the work with a written offer, valid for at
307 | least three years, to give the same user the materials
308 | specified in Subsection 6a, above, for a charge no more
309 | than the cost of performing this distribution.
310 |
311 | d) If distribution of the work is made by offering access to copy
312 | from a designated place, offer equivalent access to copy the above
313 | specified materials from the same place.
314 |
315 | e) Verify that the user has already received a copy of these
316 | materials or that you have already sent this user a copy.
317 |
318 | For an executable, the required form of the "work that uses the
319 | Library" must include any data and utility programs needed for
320 | reproducing the executable from it. However, as a special exception,
321 | the materials to be distributed need not include anything that is
322 | normally distributed (in either source or binary form) with the major
323 | components (compiler, kernel, and so on) of the operating system on
324 | which the executable runs, unless that component itself accompanies
325 | the executable.
326 |
327 | It may happen that this requirement contradicts the license
328 | restrictions of other proprietary libraries that do not normally
329 | accompany the operating system. Such a contradiction means you cannot
330 | use both them and the Library together in an executable that you
331 | distribute.
332 |
333 | 7. You may place library facilities that are a work based on the
334 | Library side-by-side in a single library together with other library
335 | facilities not covered by this License, and distribute such a combined
336 | library, provided that the separate distribution of the work based on
337 | the Library and of the other library facilities is otherwise
338 | permitted, and provided that you do these two things:
339 |
340 | a) Accompany the combined library with a copy of the same work
341 | based on the Library, uncombined with any other library
342 | facilities. This must be distributed under the terms of the
343 | Sections above.
344 |
345 | b) Give prominent notice with the combined library of the fact
346 | that part of it is a work based on the Library, and explaining
347 | where to find the accompanying uncombined form of the same work.
348 |
349 | 8. You may not copy, modify, sublicense, link with, or distribute
350 | the Library except as expressly provided under this License. Any
351 | attempt otherwise to copy, modify, sublicense, link with, or
352 | distribute the Library is void, and will automatically terminate your
353 | rights under this License. However, parties who have received copies,
354 | or rights, from you under this License will not have their licenses
355 | terminated so long as such parties remain in full compliance.
356 |
357 | 9. You are not required to accept this License, since you have not
358 | signed it. However, nothing else grants you permission to modify or
359 | distribute the Library or its derivative works. These actions are
360 | prohibited by law if you do not accept this License. Therefore, by
361 | modifying or distributing the Library (or any work based on the
362 | Library), you indicate your acceptance of this License to do so, and
363 | all its terms and conditions for copying, distributing or modifying
364 | the Library or works based on it.
365 |
366 | 10. Each time you redistribute the Library (or any work based on the
367 | Library), the recipient automatically receives a license from the
368 | original licensor to copy, distribute, link with or modify the Library
369 | subject to these terms and conditions. You may not impose any further
370 | restrictions on the recipients' exercise of the rights granted herein.
371 | You are not responsible for enforcing compliance by third parties with
372 | this License.
373 |
374 | 11. If, as a consequence of a court judgment or allegation of patent
375 | infringement or for any other reason (not limited to patent issues),
376 | conditions are imposed on you (whether by court order, agreement or
377 | otherwise) that contradict the conditions of this License, they do not
378 | excuse you from the conditions of this License. If you cannot
379 | distribute so as to satisfy simultaneously your obligations under this
380 | License and any other pertinent obligations, then as a consequence you
381 | may not distribute the Library at all. For example, if a patent
382 | license would not permit royalty-free redistribution of the Library by
383 | all those who receive copies directly or indirectly through you, then
384 | the only way you could satisfy both it and this License would be to
385 | refrain entirely from distribution of the Library.
386 |
387 | If any portion of this section is held invalid or unenforceable under any
388 | particular circumstance, the balance of the section is intended to apply,
389 | and the section as a whole is intended to apply in other circumstances.
390 |
391 | It is not the purpose of this section to induce you to infringe any
392 | patents or other property right claims or to contest validity of any
393 | such claims; this section has the sole purpose of protecting the
394 | integrity of the free software distribution system which is
395 | implemented by public license practices. Many people have made
396 | generous contributions to the wide range of software distributed
397 | through that system in reliance on consistent application of that
398 | system; it is up to the author/donor to decide if he or she is willing
399 | to distribute software through any other system and a licensee cannot
400 | impose that choice.
401 |
402 | This section is intended to make thoroughly clear what is believed to
403 | be a consequence of the rest of this License.
404 |
405 | 12. If the distribution and/or use of the Library is restricted in
406 | certain countries either by patents or by copyrighted interfaces, the
407 | original copyright holder who places the Library under this License may add
408 | an explicit geographical distribution limitation excluding those countries,
409 | so that distribution is permitted only in or among countries not thus
410 | excluded. In such case, this License incorporates the limitation as if
411 | written in the body of this License.
412 |
413 | 13. The Free Software Foundation may publish revised and/or new
414 | versions of the Lesser General Public License from time to time.
415 | Such new versions will be similar in spirit to the present version,
416 | but may differ in detail to address new problems or concerns.
417 |
418 | Each version is given a distinguishing version number. If the Library
419 | specifies a version number of this License which applies to it and
420 | "any later version", you have the option of following the terms and
421 | conditions either of that version or of any later version published by
422 | the Free Software Foundation. If the Library does not specify a
423 | license version number, you may choose any version ever published by
424 | the Free Software Foundation.
425 |
426 | 14. If you wish to incorporate parts of the Library into other free
427 | programs whose distribution conditions are incompatible with these,
428 | write to the author to ask for permission. For software which is
429 | copyrighted by the Free Software Foundation, write to the Free
430 | Software Foundation; we sometimes make exceptions for this. Our
431 | decision will be guided by the two goals of preserving the free status
432 | of all derivatives of our free software and of promoting the sharing
433 | and reuse of software generally.
434 |
435 | NO WARRANTY
436 |
437 | 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
438 | WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
439 | EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
440 | OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
441 | KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
442 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
443 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
444 | LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
445 | THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
446 |
447 | 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
448 | WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
449 | AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
450 | FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
451 | CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
452 | LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
453 | RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
454 | FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
455 | SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
456 | DAMAGES.
457 |
458 | END OF TERMS AND CONDITIONS
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | ========================
2 | HDT-MR Library.
3 | ========================
4 |
5 | Copyright (C) 2015, Jose M. Gimenez-Garcia, Javier D. Fernandez, Miguel A. Martinez-Prieto
6 | All rights reserved.
7 |
8 | This library is free software; you can redistribute it and/or
9 | modify it under the terms of the GNU Lesser General Public
10 | License as published by the Free Software Foundation; either
11 | version 2.1 of the License, or (at your option) any later version.
12 |
13 | This library is distributed in the hope that it will be useful,
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 | Lesser General Public License for more details.
17 |
18 | You should have received a copy of the GNU Lesser General Public
19 | License along with this library; if not, write to the Free Software
20 | Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
21 |
22 | Visit our Web Page: dataweb.infor.uva.es/projects/hdt-mr
23 |
24 | Contacting the authors:
25 | Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
26 | Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at
27 | Miguel A. Martinez-Prieto: migumar2@infor.uva.es
28 |
29 |
30 | Overview
31 | =================
32 |
33 | HDT-MR improves the HDT-java library by introducing MapReduce as the computation model for large-scale HDT serialization. HDT-MR runs in time linear in the dataset size and has been shown to serialize datasets of up to 4.42 billion triples, preserving HDT compression and retrieval features.
34 |
35 | HDT-java is a Java library that implements the W3C Submission (http://www.w3.org/Submission/2011/03/) of the RDF HDT (Header-Dictionary-Triples) binary format for publishing and exchanging RDF data at large scale. Its compact representation allows storing RDF in less space, while providing direct access to the stored information. See rdfhdt.org for further information.
36 |
37 |
38 |
39 | HDT-MR provides three components:
40 | - iface: Provides an API to use HDT-MR, including interfaces and abstract classes
41 | - src: Core library and command-line tools for using HDT-MR. It allows creating HDT files from RDF.
42 | - config: Examples of configuration files
43 | Note that the current distribution is an alpha version. While this build has been tested, it may still contain bugs and pending optimizations.
44 |
45 |
46 |
47 |
48 | Compiling
49 | =================
50 | Dependencies:
51 | * HDT-java (https://code.google.com/p/hdt-java/).
52 | *** src/org/rdfhdt/hdt includes those classes that have been modified/extended
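
A typical build (a sketch, assuming the HDT-java artifacts above are installed in your local Maven repository; the jar name follows the pom.xml coordinates):

$ mvn package
# Produces target/hdt-mr-2.0-jar-with-dependencies.jar, whose manifest
# declares org.rdfhdt.mrbuilder.HDTBuilderDriver as its main class
$ hadoop jar target/hdt-mr-2.0-jar-with-dependencies.jar
# Runs the driver through Hadoop using the bundled dependencies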
53 |
54 | Command line tools
55 | =================
56 |
57 | The library provides the following main command-line tool:
58 |
59 | Usage: hadoop HDTBuilderDriver [options]
60 | Options:
61 | -a, --awsbucket
62 | Amazon Web Services bucket
63 | -bu, --baseURI
64 | Base URI for the dataset
65 | -b, --basedir
66 | Root directory for the process
67 | -bd, --builddictionary
68 | Whether to build HDT dictionary or not
69 | -bh, --buildhdt
70 | Whether to build HDT or not
71 | -c, --conf
72 | Path to configuration file
73 | -dd, --deleteoutputdictionary
74 | Delete dictionary job output path before running job
75 | -dt, --deleteoutputtriples
76 | Delete triples job output path before running job
77 | -dsd, --deletesampledictionary
78 | Delete dictionary job sample path before running job
79 | -dst, --deletesampletriples
80 | Delete triples job sample path before running job
81 | -d, --dictionarydistribution
82 | Dictionary distribution among mappers and reducers
83 | -fd, --filedictionary
84 | Name of hdt dictionary file
85 | -fr, --fileobjects
86 | Name of hdt dictionary file for Reducers
87 | -fm, --filesubjects
88 | Name of hdt dictionary file for Mappers
89 | -hc, --hdtconf
90 | Conversion config file
91 | -x, --index
92 | Also generate external indices to solve all queries
93 | -i, --input
94 | Path to input files. Relative to basedir
95 | -it, --inputtriples
96 | Path to triples job input files. Relative to basedir
97 | -nd, --namedictionaryjob
98 | Name of dictionary job
99 | -fh, --namehdtfile
100 | Name of hdt file
101 | -nt, --nametriplesjob
102 | Name of triples job
103 | -o, --options
104 | HDT Conversion options (override those of config file)
105 | -od, --outputdictionary
106 | Path to dictionary job output files. Relative to basedir
107 | -ot, --outputtriples
108 | Path to triples job output files. Relative to basedir
109 | -q, --quiet
110 | Do not show progress of the conversion
111 | -t, --rdftype
112 | Type of RDF Input (ntriples, nquad, n3, turtle, rdfxml)
113 | -Rd, --reducersdictionary
114 | Number of reducers for dictionary job
115 | -Rds, --reducersdictionarysampling
116 | Number of reducers for dictionary input sampling job
117 | -Rt, --reducerstriples
118 | Number of reducers for triples job
119 | -Rts, --reducerstriplessampling
120 | Number of reducers for triples input sampling job
121 | -rd, --rundictionary
122 | Whether to run dictionary job or not
123 | -rds, --rundictionarysampling
124 | Whether to run dictionary input sampling job or not
125 | -rt, --runtriples
126 | Whether to run triples job or not
127 | -rts, --runtriplessampling
128 | Whether to run triples input sampling job or not
129 | -p, --sampleprobability
130 | Probability of using each element for sampling
131 | -sd, --samplesdictionary
132 | Path to dictionary job sample files. Relative to basedir
133 | -st, --samplestriples
134 | Path to triples job sample files. Relative to basedir
135 |
136 |
137 | Usage example
138 | =================
139 |
140 | After installation, run:
141 |
142 | $ hadoop HDTBuilderDriver
143 | # First tries to read configuration parameters from the default config file (HDTMRBuilder.xml), using default values for any missing parameters. It reads RDF input data from the default 'input' folder and writes the HDT conversion to 'output.hdt'
144 |
145 | $ hadoop HDTBuilderDriver -i mashup
146 | # Same as the previous example, but it reads RDF input data from the directory 'mashup'
147 |
148 | $ hadoop HDTBuilderDriver -c lubm-dictionary.xml -p 0.01
149 | # It uses 'lubm-dictionary.xml' as the configuration file. This file states that input data must be taken from the 'lubm' directory and forces computing only the HDT dictionary, which is written to 'dictionary/dictionary.hdt'
150 | # It uses 0.01 as the probability of using each element for sampling.
151 |
152 |
153 | $ hadoop HDTBuilderDriver -c lubm-triples.xml -Rt 1 -Rts 1
154 | # It uses 'lubm-triples.xml' as the configuration file. This file states that input data must be taken from the 'lubm' directory and forces computing the HDT triples and the final HDT representation, taking the previously computed dictionary from 'dictionary/dictionary.hdt'
155 | # It forces the use of a single reducer in both jobs.
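
The configuration files in 'config' appear to follow the standard Hadoop
configuration format; a single property would be declared as in this sketch
(the property name, value and description are taken from config/HDTMRBuilder.xml):

<property>
  <name>job.dictionary.reducers</name>
  <value>10</value>
  <description>Number of reducers used by jobs</description>
</property>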
156 |
157 | License
158 | ===============
159 |
160 | All HDT-MR content is licensed under the GNU Lesser General Public License.
161 |
162 | Acknowledgements
163 | ================
164 |
165 | HDT-MR is a project partially funded by Ministerio de Economia y Competitividad, Spain: TIN2013-46238-C4-3-R, and Austrian Science Fund (FWF): M1720-G11.
166 |
167 |
168 |
--------------------------------------------------------------------------------
/config/HDTMRBuilder.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 | <property>
4 | <name>global.path.base</name>
5 | <value>.</value>
6 | <description>Root directory</description>
7 | </property>
8 |
9 | <property>
10 | <name>global.path.input</name>
11 | <value>input</value>
12 | <description>input path</description>
13 | </property>
14 |
15 | <property>
16 | <name>job.dictionary.path.output</name>
17 | <value>d</value>
18 | <description>Dictionary output path / Triples input path</description>
19 | </property>
20 |
21 | <property>
22 | <name>job.dictionary.path.output.delete</name>
23 | <value>true</value>
24 | <description>Whether to delete dictionary output path</description>
25 | </property>
26 |
27 | <property>
28 | <name>job.dictionary.path.sample</name>
29 | <value>s</value>
30 | <description>Dictionary sample path</description>
31 | </property>
32 |
33 | <property>
34 | <name>job.dictionary.path.sample.delete</name>
35 | <value>true</value>
36 | <description>Whether to delete dictionary sample path</description>
37 | </property>
38 |
39 | <property>
40 | <name>job.triples.path.output</name>
41 | <value>t</value>
42 | <description>Triples output path</description>
43 | </property>
44 |
45 | <property>
46 | <name>job.triples.path.output.delete</name>
47 | <value>true</value>
48 | <description>Whether to delete triples output path</description>
49 | </property>
50 |
51 | <property>
52 | <name>job.dictionary.reducers</name>
53 | <value>10</value>
54 | <description>Number of reducers used by jobs</description>
55 | </property>
56 |
57 | <property>
58 | <name>job.triples.reducers</name>
59 | <value>10</value>
60 | <description>Number of reducers used by jobs</description>
61 | </property>
62 |
63 | <property>
64 | <name>job.dictionary.sample.probability</name>
65 | <value>0.000001</value>
66 | <description>Sampler Probability</description>
67 | </property>
68 |
69 | </configuration>
--------------------------------------------------------------------------------
/config/lubm-dictionary.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 | <property>
4 | <name>job.dictionary.run</name>
5 | <value>true</value>
6 | </property>
7 |
8 | <property>
9 | <name>job.dictionary.sample.run</name>
10 | <value>true</value>
11 | </property>
12 |
13 | <property>
14 | <name>job.dictionary.sample.reducers</name>
15 | <value>10</value>
16 | </property>
17 |
18 | <property>
19 | <name>hdt.dictionary.build</name>
20 | <value>true</value>
21 | </property>
22 |
23 | <property>
24 | <name>job.triples.run</name>
25 | <value>false</value>
26 | </property>
27 |
28 | <property>
29 | <name>job.triples.sample.run</name>
30 | <value>false</value>
31 | </property>
32 |
33 | <property>
34 | <name>hdt.build</name>
35 | <value>false</value>
36 | </property>
37 |
38 | <property>
39 | <name>global.path.base</name>
40 | <value>.</value>
41 | <description>Root directory</description>
42 | </property>
43 |
44 | <property>
45 | <name>global.path.input</name>
46 | <value>lubm</value>
47 | <description>input path</description>
48 | </property>
49 |
50 | <property>
51 | <name>job.dictionary.path.output</name>
52 | <value>dictionary</value>
53 | <description>Dictionary output path / Triples input path</description>
54 | </property>
55 |
56 | <property>
57 | <name>job.dictionary.path.output.delete</name>
58 | <value>true</value>
59 | <description>Whether to delete dictionary output path</description>
60 | </property>
61 |
62 | <property>
63 | <name>job.dictionary.path.sample</name>
64 | <value>dictionary_sample</value>
65 | <description>Dictionary samples path</description>
66 | </property>
67 |
68 | <property>
69 | <name>job.dictionary.path.sample.delete</name>
70 | <value>true</value>
71 | <description>Whether to delete dictionary samples path</description>
72 | </property>
73 |
74 | <property>
75 | <name>job.dictionary.reducers</name>
76 | <value>10</value>
77 | <description>Number of reducers used by jobs</description>
78 | </property>
79 |
80 | <property>
81 | <name>job.dictionary.sample.probability</name>
82 | <value>0.000001</value>
83 | <description>Sampler Probability</description>
84 | </property>
85 |
86 | </configuration>
--------------------------------------------------------------------------------
/config/lubm-triples.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 | <property>
4 | <name>job.dictionary.run</name>
5 | <value>false</value>
6 | </property>
7 |
8 | <property>
9 | <name>job.dictionary.sample.run</name>
10 | <value>false</value>
11 | </property>
12 |
13 | <property>
14 | <name>hdt.dictionary.build</name>
15 | <value>false</value>
16 | </property>
17 |
18 | <property>
19 | <name>job.triples.run</name>
20 | <value>true</value>
21 | </property>
22 |
23 | <property>
24 | <name>job.triples.sample.run</name>
25 | <value>true</value>
26 | </property>
27 |
28 | <property>
29 | <name>hdt.build</name>
30 | <value>true</value>
31 | </property>
32 |
33 | <property>
34 | <name>global.path.base</name>
35 | <value>.</value>
36 | <description>Root directory</description>
37 | </property>
38 |
39 | <property>
40 | <name>global.path.input</name>
41 | <value>lubm</value>
42 | <description>input path</description>
43 | </property>
44 |
45 | <property>
46 | <name>job.dictionary.path.output</name>
47 | <value>dictionary</value>
48 | <description>Dictionary output path / Triples input path</description>
49 | </property>
50 |
51 | <property>
52 | <name>job.triples.path.output.delete</name>
53 | <value>true</value>
54 | <description>Whether to delete triples output path</description>
55 | </property>
56 |
57 | <property>
58 | <name>job.triples.path.sample</name>
59 | <value>triples_sample</value>
60 | <description>Triples samples path</description>
61 | </property>
62 |
63 | <property>
64 | <name>job.triples.path.sample.delete</name>
65 | <value>true</value>
66 | <description>Whether to delete triples samples path</description>
67 | </property>
68 |
69 | <property>
70 | <name>job.triples.reducers</name>
71 | <value>10</value>
72 | <description>Number of reducers used by jobs</description>
73 | </property>
74 |
75 | <property>
76 | <name>job.triples.sample.probability</name>
77 | <value>0.000001</value>
78 | <description>Sampler Probability</description>
79 | </property>
80 |
81 | </configuration>
--------------------------------------------------------------------------------
/iface/org/rdfhdt/hdt/trans/TransientElement.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | package org.rdfhdt.hdt.trans;
5 |
6 | import java.io.IOException;
7 |
8 | import org.apache.hadoop.io.SequenceFile;
9 | import org.rdfhdt.hdt.listener.ProgressListener;
10 |
11 | /**
12 | * @author chemi
13 | *
14 | */
15 | public interface TransientElement {
16 |
17 | public void initialize(long numentries);
18 |
19 | public void load(SequenceFile.Reader input, ProgressListener listener) throws IOException;
20 |
21 | public void close() throws IOException;
22 |
23 | }
24 |
--------------------------------------------------------------------------------
/iface/org/rdfhdt/mrbuilder/io/TripleComparator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
3 | *
4 | * This library is free software; you can redistribute it and/or
5 | * modify it under the terms of the GNU Lesser General Public
6 | * License as published by the Free Software Foundation; either
7 | * version 2.1 of the License, or (at your option) any later version.
8 | *
9 | * This library is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * Lesser General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU Lesser General Public
15 | * License along with this library; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 | *
18 | * Contacting the authors:
19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at
21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es
22 | */
23 | package org.rdfhdt.mrbuilder.io;
24 |
25 | import org.apache.hadoop.io.WritableComparable;
26 | import org.apache.hadoop.io.WritableComparator;
27 |
28 | /**
29 | * @author chemi
30 | *
31 | */
32 | @SuppressWarnings("rawtypes")
33 | public abstract class TripleComparator<TW extends TripleWritable> extends WritableComparator {
34 |
35 | public TripleComparator(Class<? extends TripleWritable> keyClass, boolean createInstances) {
36 | super(keyClass, createInstances);
37 | }
38 |
39 | public TripleComparator(Class<? extends TripleWritable> keyClass) {
40 | super(keyClass);
41 | }
42 |
43 | @SuppressWarnings("unchecked")
44 | @Override
45 | public int compare(WritableComparable wc1, WritableComparable wc2) {
46 | TW key1 = (TW) wc1;
47 | TW key2 = (TW) wc2;
48 | return key1.compareTo(key2);
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/iface/org/rdfhdt/mrbuilder/io/TripleWritable.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
3 | *
4 | * This library is free software; you can redistribute it and/or
5 | * modify it under the terms of the GNU Lesser General Public
6 | * License as published by the Free Software Foundation; either
7 | * version 2.1 of the License, or (at your option) any later version.
8 | *
9 | * This library is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * Lesser General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU Lesser General Public
15 | * License along with this library; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 | *
18 | * Contacting the authors:
19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at
21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es
22 | */
23 | package org.rdfhdt.mrbuilder.io;
24 |
25 | import java.io.DataInput;
26 | import java.io.DataOutput;
27 | import java.io.IOException;
28 |
29 | import org.apache.hadoop.io.WritableComparable;
30 |
31 | /**
32 | * @author chemi
33 | *
34 | */
35 |
36 | @SuppressWarnings("rawtypes")
37 | public abstract class TripleWritable<S extends WritableComparable, P extends WritableComparable, O extends WritableComparable> implements WritableComparable<TripleWritable<S, P, O>> {
38 |
39 | protected S subject;
40 | protected P predicate;
41 | protected O object;
42 |
43 | /**
44 | *
45 | */
46 | public TripleWritable(S subject, P predicate, O object) {
47 | this.setSubject(subject);
48 | this.setPredicate(predicate);
49 | this.setObject(object);
50 | }
51 |
52 | /**
53 | * @return the subject
54 | */
55 | public S getSubject() {
56 | return this.subject;
57 | }
58 |
59 | /**
60 | * @param subject
61 | * the subject to set
62 | */
63 | public void setSubject(S subject) {
64 | this.subject = subject;
65 | }
66 |
67 | /**
68 | * @return the predicate
69 | */
70 | public P getPredicate() {
71 | return this.predicate;
72 | }
73 |
74 | /**
75 | * @param predicate
76 | * the predicate to set
77 | */
78 | public void setPredicate(P predicate) {
79 | this.predicate = predicate;
80 | }
81 |
82 | /**
83 | * @return the object
84 | */
85 | public O getObject() {
86 | return this.object;
87 | }
88 |
89 | /**
90 | * @param object
91 | * the object to set
92 | */
93 | public void setObject(O object) {
94 | this.object = object;
95 | }
96 |
97 | /*
98 | * (non-Javadoc)
99 | *
100 | * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
101 | */
102 | @Override
103 | public void readFields(DataInput input) throws IOException {
104 | this.subject.readFields(input);
105 | this.predicate.readFields(input);
106 | this.object.readFields(input);
107 | }
108 |
109 | /*
110 | * (non-Javadoc)
111 | *
112 | * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
113 | */
114 | @Override
115 | public void write(DataOutput output) throws IOException {
116 | this.subject.write(output);
117 | this.predicate.write(output);
118 | this.object.write(output);
119 | }
120 |
121 | /*
122 | * (non-Javadoc)
123 | *
124 | * @see java.lang.Comparable#compareTo(java.lang.Object)
125 | */
126 | @Override
127 | public int compareTo(TripleWritable<S, P, O> otherKey) {
128 | int comparison;
129 | if ((comparison = this.compareSubjectTo(otherKey)) == 0)
130 | if ((comparison = this.comparePredicateTo(otherKey)) == 0)
131 | comparison = this.compareObjectTo(otherKey);
132 | return comparison;
133 | }
134 |
135 | public int compareSubjectTo(TripleWritable<S, P, O> otherKey) {
136 | return this.compareRole(this.getSubject(), otherKey.getSubject());
137 | }
138 |
139 | public int comparePredicateTo(TripleWritable<S, P, O> otherKey) {
140 | return this.compareRole(this.getPredicate(), otherKey.getPredicate());
141 | }
142 |
143 | public int compareObjectTo(TripleWritable<S, P, O> otherKey) {
144 | return this.compareRole(this.getObject(), otherKey.getObject());
145 | }
146 |
147 | @SuppressWarnings("unchecked")
148 | protected int compareRole(WritableComparable wc1, WritableComparable wc2) {
149 | return (wc1.compareTo(wc2) < 0) ? -1 : ((wc1.compareTo(wc2) > 0) ? 1 : 0);
150 | }
151 |
152 | /*
153 | * (non-Javadoc)
154 | *
155 | * @see java.lang.Object#toString()
156 | */
157 | @Override
158 | public String toString() {
159 | return this.subject + " " + this.predicate + " " + this.object;
160 | }
161 |
162 | }
163 |
--------------------------------------------------------------------------------
/iface/org/rdfhdt/mrbuilder/triples/TriplesMapper.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
3 | *
4 | * This library is free software; you can redistribute it and/or
5 | * modify it under the terms of the GNU Lesser General Public
6 | * License as published by the Free Software Foundation; either
7 | * version 2.1 of the License, or (at your option) any later version.
8 | *
9 | * This library is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * Lesser General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU Lesser General Public
15 | * License along with this library; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 | *
18 | * Contacting the authors:
19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at
21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es
22 | */
23 | package org.rdfhdt.mrbuilder.triples;
24 |
25 | import java.io.BufferedInputStream;
26 | import java.io.File;
27 | import java.io.FileInputStream;
28 | import java.io.IOException;
29 |
30 | import org.apache.hadoop.filecache.DistributedCache;
31 | import org.apache.hadoop.fs.Path;
32 | import org.apache.hadoop.io.LongWritable;
33 | import org.apache.hadoop.io.Text;
34 | import org.apache.hadoop.io.WritableComparable;
35 | import org.apache.hadoop.mapreduce.Mapper;
36 | import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary;
37 | import org.rdfhdt.hdt.exceptions.ParserException;
38 | import org.rdfhdt.hdt.listener.ProgressListener;
39 | import org.rdfhdt.hdt.triples.TripleString;
40 | import org.rdfhdt.hdt.util.io.CountInputStream;
41 | import org.rdfhdt.mrbuilder.HDTBuilderConfiguration;
42 | import org.rdfhdt.mrbuilder.HDTBuilderDriver.Counters;
43 | import org.rdfhdt.mrbuilder.io.TripleWritable;
44 |
45 | @SuppressWarnings("rawtypes")
46 | public abstract class TriplesMapper<K extends WritableComparable, V extends WritableComparable> extends Mapper<LongWritable, Text, K, V> implements ProgressListener {
47 |
48 | protected FourSectionDictionary dictionary;
49 | protected HDTBuilderConfiguration conf;
50 |
51 | /*
52 | * (non-Javadoc)
53 | *
54 | * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
55 | */
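// Each map task loads the complete HDT dictionary, shipped to every node
// through Hadoop's DistributedCache, so that subclasses can translate the
// parsed triple strings into dictionary IDs inside key() and value().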
56 | @Override
57 | protected void setup(Context context) throws IOException, InterruptedException {
58 |
59 | Path[] cache = DistributedCache.getLocalCacheFiles(context.getConfiguration());
60 |
61 | this.conf = new HDTBuilderConfiguration(context.getConfiguration());
62 | CountInputStream input = new CountInputStream(new BufferedInputStream(new FileInputStream(cache[0].toString())));
63 | File file = new File(cache[0].toString());
64 | this.dictionary = new FourSectionDictionary(this.conf.getSpec());
65 | this.dictionary.mapFromFile(input, file, this);
66 | input.close();
67 |
68 | // DEBUG
69 | // ((PFCDictionarySection) this.dictionary.getShared()).dumpAll();
70 | }
71 |
72 | @Override
73 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
74 | TripleString tripleString = new TripleString();
75 |
76 | try {
77 | tripleString.read(value.toString());
78 | } catch (ParserException e) {
79 | e.printStackTrace();
80 | return; // skip lines that cannot be parsed as a triple
81 | }
82 |
83 | context.write(this.key(tripleString), this.value(tripleString));
84 | context.getCounter(Counters.Triples).increment(1);
85 | }
86 |
87 | @Override
88 | public void notifyProgress(float level, String message) {
89 | // if (!this.conf.getQuiet()) {
90 | System.out.print("\r" + message + "\t" + Float.toString(level) + " \r");
91 | }
92 |
93 | protected abstract K key(TripleString tripleString) throws InterruptedException;
94 |
95 | protected abstract V value(TripleString tripleString);
96 |
97 | }
98 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3 | <modelVersion>4.0.0</modelVersion>
4 | <groupId>org.rdfhdt</groupId>
5 | <artifactId>hdt-mr</artifactId>
6 | <version>2.0</version>
7 | <name>HDT MapReduce</name>
8 | <packaging>jar</packaging>
9 |
10 | <properties>
11 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
12 | <maven.compiler.source>1.8</maven.compiler.source>
13 | <maven.compiler.target>1.8</maven.compiler.target>
14 | </properties>
15 |
16 | <dependencies>
17 | <dependency>
18 | <groupId>org.rdfhdt</groupId>
19 | <artifactId>hdt-api</artifactId>
20 | <version>2.0</version>
21 | </dependency>
22 | <dependency>
23 | <groupId>org.rdfhdt</groupId>
24 | <artifactId>hdt-java-core</artifactId>
25 | <version>2.0</version>
26 | </dependency>
27 | <dependency>
28 | <groupId>org.apache.hadoop</groupId>
29 | <artifactId>hadoop-common</artifactId>
30 | <version>2.7.0</version>
31 | </dependency>
32 | <dependency>
33 | <groupId>org.apache.hadoop</groupId>
34 | <artifactId>hadoop-mapreduce-client-core</artifactId>
35 | <version>2.6.0</version>
36 | </dependency>
37 | <dependency>
38 | <groupId>com.hadoop.gplcompression</groupId>
39 | <artifactId>hadoop-lzo</artifactId>
40 | <version>0.4.20-SNAPSHOT</version>
41 | </dependency>
42 | <dependency>
43 | <groupId>commons-lang</groupId>
44 | <artifactId>commons-lang</artifactId>
45 | <version>2.1</version>
46 | </dependency>
47 | <dependency>
48 | <groupId>org.codehaus.plexus</groupId>
49 | <artifactId>plexus-utils</artifactId>
50 | <version>1.1</version>
51 | </dependency>
52 | </dependencies>
53 |
54 | <build>
55 | <sourceDirectory>.</sourceDirectory>
56 | <plugins>
57 | <plugin>
58 | <groupId>org.apache.maven.plugins</groupId>
59 | <artifactId>maven-assembly-plugin</artifactId>
60 | <configuration>
61 | <includes>
62 | <include>iface/**/*.java</include>
63 | <include>src/**/*.java</include>
64 | </includes>
65 | <archive>
66 | <manifest>
67 | <mainClass>org.rdfhdt.mrbuilder.HDTBuilderDriver</mainClass>
68 | </manifest>
69 | </archive>
70 | <descriptorRefs>
71 | <descriptorRef>jar-with-dependencies</descriptorRef>
72 | </descriptorRefs>
73 | </configuration>
74 | <executions>
75 | <execution>
76 | <id>make-assembly</id>
77 | <phase>package</phase>
78 | <goals>
79 | <goal>single</goal>
80 | </goals>
81 | </execution>
82 | </executions>
83 | </plugin>
84 | </plugins>
85 | </build>
86 | </project>
--------------------------------------------------------------------------------
/src/org/rdfhdt/hdt/compact/bitmap/TransientBitmap375.java:
--------------------------------------------------------------------------------
1 | package org.rdfhdt.hdt.compact.bitmap;
2 |
3 | import java.io.IOException;
4 | import java.io.OutputStream;
5 | import java.util.UUID;
6 |
7 | import org.apache.commons.io.IOUtils;
8 | import org.apache.hadoop.conf.Configuration;
9 | import org.apache.hadoop.fs.FileSystem;
10 | import org.apache.hadoop.fs.Path;
11 | import org.rdfhdt.hdt.compact.integer.VByte;
12 | import org.rdfhdt.hdt.listener.ProgressListener;
13 | import org.rdfhdt.hdt.util.BitUtil;
14 | import org.rdfhdt.hdt.util.crc.CRC32;
15 | import org.rdfhdt.hdt.util.crc.CRC8;
16 | import org.rdfhdt.hdt.util.crc.CRCOutputStream;
17 | import org.rdfhdt.hdt.util.io.IOUtil;
18 |
19 | public class TransientBitmap375 extends Bitmap375 {
20 |
21 | protected OutputStream tempOutput;
22 | protected int bufferSize;
23 | protected int previousWordIndex;
24 | protected long nbits;
25 | private long totalbits = 0;
26 | private long totalwords = 0;
27 |
28 | protected FileSystem fileSystem;
29 | protected Path file;
30 | protected String fileName;
31 |
32 | public TransientBitmap375(int bufferSize) {
33 | super();
34 | this.bufferSize = bufferSize;
35 | this.previousWordIndex = wordIndex(0);
36 | }
37 |
38 | public TransientBitmap375(int bufferSize, long nbits, FileSystem fs, Path path) throws IOException {
39 | super(Math.min(bufferSize, nbits));
40 |
41 | this.bufferSize = bufferSize;
42 | this.nbits = nbits;
43 | this.previousWordIndex = wordIndex(0);
44 |
45 | this.fileName = UUID.randomUUID().toString();
46 |
47 | if (fs == null) {
48 | fs = FileSystem.getLocal(new Configuration());
49 | }
50 | if (path == null) {
51 | path = new Path(".");
52 | }
53 |
54 | this.fileSystem = fs;
55 | this.file = new Path(path, this.fileName);
56 | this.tempOutput = this.fileSystem.create(this.file);
57 |
58 | }
59 |
60 | @Override
61 | public long getNumBits() {
62 | return this.totalbits;
63 | }
64 |
65 | // @Override
66 | // public void append(boolean value) {
67 | // this.set(this.numbits++, value);
68 | // }
69 |
70 | @Override
71 | public void set(long bitIndex, boolean value) {
72 | if ((this.previousWordIndex >= this.bufferSize) && (this.previousWordIndex != wordIndex(bitIndex))) {
73 | try {
74 | // System.out.println("bitIndex = " + bitIndex);
75 | // System.out.println("numbits = " + this.numbits);
76 | this.flushData();
77 | super.set(0, value);
78 | this.previousWordIndex = wordIndex(0);
79 | } catch (IOException e) {
80 | // TODO Auto-generated catch block
81 | e.printStackTrace();
82 | }
83 | } else {
84 | super.set(bitIndex, value);
85 | this.previousWordIndex = wordIndex(bitIndex);
86 | }
87 | }
88 |
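// Flushes the complete 64-bit words buffered so far to the temporary file and
// resets the in-memory buffer; save() later streams the temporary file into
// the final output behind the CRC-protected header.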
89 | private void flushData() throws IOException {
90 |
91 | // System.out.println("flushing bitmap " + this.fileName + " with " + this.numbits + " bits");
92 | // System.out.println("Bits from last word = " + lastWordNumBits(this.numbits));
93 |
94 | this.totalbits += this.numbits - 1;
95 |
96 | int numwords = (int) numWords(this.numbits - 1);
97 |
98 | this.totalwords += numwords;
99 |
100 | for (int i = 0; i < numwords; i++) {
101 | IOUtil.writeLong(this.tempOutput, this.words[i]);
102 | }
103 | this.words = new long[(int) numWords(this.nbits)];
104 | this.numbits = 0;
105 | this.previousWordIndex = wordIndex(0);
106 | }
107 |
108 | public void close() throws IOException {
109 |
110 | this.totalbits += this.numbits;
111 |
112 | int numwords = (int) numWords(this.numbits);
113 |
114 | this.totalwords += numwords;
115 |
116 | // System.out.println("Closing bitmap.");
117 | // System.out.println("Writing " + this.totalbits + " bits");
118 | // System.out.println("There should be " + this.nbits + " bits");
119 | // System.out.println("Writing " + this.totalwords + "words");
120 | // System.out.println("Bits from last word = " + lastWordNumBits(this.numbits));
121 |
122 | for (int i = 0; i < numwords - 1; i++) {
123 | IOUtil.writeLong(this.tempOutput, this.words[i]);
124 | }
125 |
126 | if (numwords > 0) {
127 | // Write only used bits from last entry (byte aligned, little endian)
128 | int lastWordUsed = lastWordNumBits(this.numbits);
129 | BitUtil.writeLowerBitsByteAligned(this.words[numwords - 1], lastWordUsed, this.tempOutput);
130 | }
131 |
132 | this.tempOutput.flush();
133 | this.tempOutput.close();
134 |
135 | this.words = new long[0];
136 | }
137 |
138 | @Override
139 | public void save(OutputStream output, ProgressListener listener) throws IOException {
140 | CRCOutputStream out = new CRCOutputStream(output, new CRC8());
141 |
142 | // Write Type and Numbits
143 | out.write(BitmapFactory.TYPE_BITMAP_PLAIN);
144 | VByte.encode(out, this.totalbits);
145 |
146 | // Write CRC
147 | out.writeCRC();
148 |
149 | // Setup new CRC
150 | out.setCRC(new CRC32());
151 |
152 | // FileInputStream input = new FileInputStream(this.fileName);
153 | // long bytesCopied = Files.copy(this.fileSystem.open(this.file), out);
154 | long bytesCopied = IOUtils.copyLarge(this.fileSystem.open(this.file), out);
155 | // input.close();
156 | this.fileSystem.delete(this.file, true);
157 | System.out.println("bytes copied from " + this.fileName + " = " + bytesCopied);
158 |
159 | // System.out.println("CRC = " + out.getCRC().getValue());
160 | out.writeCRC();
161 |
162 | }
163 | }
164 |
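A minimal usage sketch for the class above. Everything concrete here is hypothetical: buffer size, bit count, the /tmp spill directory and the output file name are illustrative, a null ProgressListener is assumed to be acceptable (save() above never touches it), and bits are driven through the inherited append(), exactly as TransientBitMapTriples does further below.

import java.io.FileOutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.rdfhdt.hdt.compact.bitmap.TransientBitmap375;

public class TransientBitmapSketch {
    public static void main(String[] args) throws Exception {
        long nbits = 10000000L; // total number of bits to emit (illustrative)
        FileSystem fs = FileSystem.getLocal(new Configuration());
        TransientBitmap375 bitmap = new TransientBitmap375(64 * 1024, nbits, fs, new Path("/tmp"));
        for (long i = 0; i < nbits; i++) {
            bitmap.append(i % 3 == 0); // inherited append(): the buffer spills to the temp file when full
        }
        bitmap.close(); // writes the partially filled last word, byte aligned
        FileOutputStream out = new FileOutputStream("bitmap.bin");
        bitmap.save(out, null); // header + CRCs, then streams the temp file back and deletes it
        out.close();
    }
}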
--------------------------------------------------------------------------------
/src/org/rdfhdt/hdt/compact/sequence/TransientSequenceLog64.java:
--------------------------------------------------------------------------------
1 | package org.rdfhdt.hdt.compact.sequence;
2 |
3 | import java.io.IOException;
4 | import java.io.OutputStream;
5 | import java.util.UUID;
6 |
7 | import org.apache.commons.io.IOUtils;
8 | import org.apache.hadoop.conf.Configuration;
9 | import org.apache.hadoop.fs.FileSystem;
10 | import org.apache.hadoop.fs.Path;
11 | import org.rdfhdt.hdt.compact.integer.VByte;
12 | import org.rdfhdt.hdt.listener.ProgressListener;
13 | import org.rdfhdt.hdt.util.BitUtil;
14 | import org.rdfhdt.hdt.util.crc.CRC32;
15 | import org.rdfhdt.hdt.util.crc.CRC8;
16 | import org.rdfhdt.hdt.util.crc.CRCOutputStream;
17 | import org.rdfhdt.hdt.util.io.IOUtil;
18 |
19 | public class TransientSequenceLog64 extends SequenceLog64 {
20 |
21 | protected OutputStream tempOutput;
22 | protected long bufferSize, maxentries;
23 | protected long capacity;
24 | private long totalentries, totalwords;
25 |
26 | protected FileSystem fileSystem;
27 | protected Path file;
28 | protected String fileName;
29 |
30 | public TransientSequenceLog64(int bufferSize) throws IOException {
31 | this(bufferSize, W);
32 | }
33 |
34 | public TransientSequenceLog64(int bufferSize, int numbits) throws IOException {
35 | this(bufferSize, numbits, 0);
36 | }
37 |
38 | public TransientSequenceLog64(int bufferSize, int numbits, long capacity, boolean initialize) throws IOException {
39 | this(bufferSize, numbits, capacity);
40 | if (initialize) {
41 | this.numentries = capacity;
42 | }
43 | }
44 |
45 | public TransientSequenceLog64(int bufferSize, int numbits, long capacity) throws IOException {
46 | this(bufferSize, numbits, capacity, null, null);
47 | }
48 |
49 | public TransientSequenceLog64(int bufferSize, int numbits, long capacity, FileSystem fs, Path path) throws IOException {
50 | super(numbits, Math.min(bufferSize, capacity));
51 |
52 | this.capacity = capacity;
53 |
54 | // parameter provided as bytes, transform to entries
55 | this.maxentries = (int) ((W / (double) numbits) * bufferSize);
56 |
57 | this.fileName = UUID.randomUUID().toString();
58 |
59 | if (fs == null) {
60 | fs = FileSystem.getLocal(new Configuration());
61 | }
62 | if (path == null) {
63 | path = new Path(".");
64 | }
65 |
66 | this.fileSystem = fs;
67 | this.file = new Path(path, this.fileName);
68 | this.tempOutput = this.fileSystem.create(this.file);
69 | }
70 |
71 | @Override
72 | public long getNumberOfElements() {
73 | return this.totalentries;
74 | }
75 |
76 | @Override
77 | public void append(long value) {
78 | super.append(value);
79 |
80 | if (this.numentries >= this.maxentries && (lastWordNumBits(this.numbits, this.numentries) == 64)) {
81 | try {
82 | this.flushData();
83 | } catch (IOException e) {
84 | 				// append() cannot declare IOException, so surface flush failures as unchecked
85 | 				throw new RuntimeException("Could not flush sequence buffer to " + this.fileName, e);
86 | }
87 | }
88 | }
89 |
90 | protected void flushData() throws IOException {
91 | // System.out.println("Flushing Sequence");
92 |
93 | this.totalentries += this.numentries;
94 |
95 | int numwords = (int) numWordsFor(this.numbits, this.numentries);
96 |
97 | this.totalwords += numwords;
98 |
99 | // System.out.println("Remaining bits =" + lastWordNumBits(this.numbits, this.numentries));
100 |
101 | for (int i = 0; i < numwords; i++) {
102 | IOUtil.writeLong(this.tempOutput, this.data[i]);
103 | }
104 |
105 | long size = numWordsFor(this.numbits, this.numentries);
106 | assert size >= 0 && size <= Integer.MAX_VALUE;
107 |
108 | this.data = new long[Math.max((int) size, 1)];
109 | this.numentries = 0;
110 | }
111 |
112 | public void close() throws IOException {
113 |
114 | this.totalentries += this.numentries;
115 |
116 | int numwords = (int) numWordsFor(this.numbits, this.numentries);
117 |
118 | this.totalwords += numwords;
119 |
120 | // System.out.println("Closing sequence.");
121 | // System.out.println("Writing " + this.totalentries + " entries");
122 | // System.out.println("There should be " + this.capacity + " entries");
123 | 		// System.out.println("Writing " + this.totalwords + " words");
124 |
125 | // System.out.println("Remaining bits =" + lastWordNumBits(this.numbits, this.numentries));
126 |
127 | for (int i = 0; i < numwords - 1; i++) {
128 | IOUtil.writeLong(this.tempOutput, this.data[i]);
129 | }
130 |
131 | if (numwords > 0) {
132 | // Write only used bits from last entry (byte aligned, little endian)
133 | int lastWordUsedBits = lastWordNumBits(this.numbits, this.numentries);
134 | BitUtil.writeLowerBitsByteAligned(this.data[numwords - 1], lastWordUsedBits, this.tempOutput);
135 | }
136 |
137 | this.tempOutput.flush();
138 | this.tempOutput.close();
139 |
140 | this.data = new long[0];
141 | }
142 |
143 | @Override
144 | public void save(OutputStream output, ProgressListener listener) throws IOException {
145 | CRCOutputStream out = new CRCOutputStream(output, new CRC8());
146 |
147 | out.write(SequenceFactory.TYPE_SEQLOG);
148 | out.write(this.numbits);
149 | VByte.encode(out, this.totalentries);
150 | out.writeCRC();
151 | out.setCRC(new CRC32());
152 |
153 | // long bytesCopied = Files.copy(this.fileSystem.open(this.file), out);
154 | long bytesCopied = IOUtils.copy(this.fileSystem.open(this.file), out);
155 | System.out.println("bytes copied from " + this.fileName + " = " + bytesCopied);
156 | this.fileSystem.delete(this.file, true);
157 |
158 | // System.out.println("CRC = " + out.getCRC().getValue());
159 | out.writeCRC();
160 | }
161 | }
162 |
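A matching sketch for the sequence counterpart, again with hypothetical sizes, paths and file names, and a null listener assumed to be tolerated. Entries spill to the temporary file whenever the in-memory chunk ends on a word boundary, as append() above checks.

import java.io.FileOutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.rdfhdt.hdt.compact.sequence.TransientSequenceLog64;
import org.rdfhdt.hdt.util.BitUtil;

public class TransientSequenceSketch {
    public static void main(String[] args) throws Exception {
        long capacity = 5000000L; // expected number of entries (illustrative)
        long maxValue = 1000000L; // largest value to be stored
        FileSystem fs = FileSystem.getLocal(new Configuration());
        TransientSequenceLog64 seq = new TransientSequenceLog64(
                64 * 1024, BitUtil.log2(maxValue), capacity, fs, new Path("/tmp"));
        for (long i = 0; i < capacity; i++) {
            seq.append(i % maxValue); // spills to the temp file chunk by chunk
        }
        seq.close(); // flushes the tail, byte aligned
        FileOutputStream out = new FileOutputStream("sequence.bin");
        seq.save(out, null); // type + numbits + numentries header, CRCs, then the temp file
        out.close();
    }
}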
--------------------------------------------------------------------------------
/src/org/rdfhdt/hdt/dictionary/impl/FourSectionDictionary2.java:
--------------------------------------------------------------------------------
1 | package org.rdfhdt.hdt.dictionary.impl;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.io.InputStream;
6 |
7 | import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate;
8 | import org.rdfhdt.hdt.dictionary.impl.section.DictionarySectionCacheAll;
9 | import org.rdfhdt.hdt.dictionary.impl.section.DictionarySectionFactory2;
10 | import org.rdfhdt.hdt.exceptions.IllegalFormatException;
11 | import org.rdfhdt.hdt.listener.ProgressListener;
12 | import org.rdfhdt.hdt.options.ControlInfo;
13 | import org.rdfhdt.hdt.options.ControlInformation;
14 | import org.rdfhdt.hdt.options.HDTOptions;
15 | import org.rdfhdt.hdt.util.io.CountInputStream;
16 | import org.rdfhdt.hdt.util.listener.IntermediateListener;
17 |
18 | public class FourSectionDictionary2 extends FourSectionDictionary {
19 |
20 | public FourSectionDictionary2(HDTOptions spec, DictionarySectionPrivate s, DictionarySectionPrivate p, DictionarySectionPrivate o, DictionarySectionPrivate sh) {
21 | super(spec, s, p, o, sh);
22 | }
23 |
24 | public FourSectionDictionary2(HDTOptions spec) {
25 | super(spec);
26 | }
27 |
28 | public void load(InputStream input, ControlInfo ci, ProgressListener listener) throws IOException {
29 | if(ci.getType()!=ControlInfo.Type.DICTIONARY) {
30 | 			throw new IllegalFormatException("Trying to read a dictionary section, but the data read is not a dictionary.");
31 | }
32 |
33 | IntermediateListener iListener = new IntermediateListener(listener);
34 |
35 | shared = DictionarySectionFactory2.loadFrom(input, iListener);
36 | subjects = DictionarySectionFactory2.loadFrom(input, iListener);
37 | predicates = DictionarySectionFactory2.loadFrom(input, iListener);
38 | objects = DictionarySectionFactory2.loadFrom(input, iListener);
39 | }
40 |
41 | @Override
42 | public void mapFromFile(CountInputStream in, File f, ProgressListener listener) throws IOException {
43 | ControlInformation ci = new ControlInformation();
44 | ci.load(in);
45 | if(ci.getType()!=ControlInfo.Type.DICTIONARY) {
46 | 			throw new IllegalFormatException("Trying to read a dictionary section, but the data read is not a dictionary.");
47 | }
48 |
49 | IntermediateListener iListener = new IntermediateListener(listener);
50 | shared = DictionarySectionFactory2.loadFrom(in, f, iListener);
51 | subjects = DictionarySectionFactory2.loadFrom(in, f, iListener);
52 | predicates = DictionarySectionFactory2.loadFrom(in, f, iListener);
53 | objects = DictionarySectionFactory2.loadFrom(in, f, iListener);
54 |
55 | // Use cache only for predicates. Preload only up to 100K predicates.
56 | predicates = new DictionarySectionCacheAll(predicates, predicates.getNumberOfElements()<100000);
57 | }
58 |
59 | }
60 |
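A sketch of how this subclass could read a dictionary back from a stream. The file name is hypothetical, the stream is assumed to be positioned at the dictionary's ControlInformation block, and a null listener is assumed to be tolerated by IntermediateListener.

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;

import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary2;
import org.rdfhdt.hdt.options.ControlInformation;
import org.rdfhdt.hdt.options.HDTSpecification;

public class DictionaryLoadSketch {
    public static void main(String[] args) throws Exception {
        // BufferedInputStream is required: the section factory below needs mark()/reset()
        InputStream in = new BufferedInputStream(new FileInputStream("dictionary.hdt"));
        ControlInformation ci = new ControlInformation();
        ci.load(in); // the dictionary's ControlInformation block
        FourSectionDictionary2 dictionary = new FourSectionDictionary2(new HDTSpecification());
        dictionary.load(in, ci, null); // shared, subjects, predicates, objects, in that order
        in.close();
    }
}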
--------------------------------------------------------------------------------
/src/org/rdfhdt/hdt/dictionary/impl/section/DictionarySectionFactory2.java:
--------------------------------------------------------------------------------
1 | package org.rdfhdt.hdt.dictionary.impl.section;
2 |
3 | import java.io.IOException;
4 | import java.io.InputStream;
5 |
6 | import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate;
7 | import org.rdfhdt.hdt.listener.ProgressListener;
8 | import org.rdfhdt.hdt.options.HDTSpecification;
9 |
10 | public class DictionarySectionFactory2 extends DictionarySectionFactory {
11 |
12 |
13 | public static DictionarySectionPrivate loadFrom(InputStream input, ProgressListener listener) throws IOException {
14 | if(!input.markSupported()) {
15 | throw new IllegalArgumentException("Need support for mark()/reset(). Please wrap the InputStream with a BufferedInputStream");
16 | }
17 | input.mark(64);
18 | int dictType = input.read();
19 | input.reset();
20 | input.mark(64); // To allow children to reset() and try another instance.
21 |
22 | DictionarySectionPrivate section=null;
23 |
24 | switch(dictType) {
25 | case PFCDictionarySection.TYPE_INDEX:
26 | try{
27 | // First try load using the standard PFC
28 | section = new PFCDictionarySection(new HDTSpecification());
29 | section.load(input, listener);
30 | } catch (IllegalArgumentException e) {
31 | 				// The standard PFC section cannot hold a file this big; fall back to the transient big-section implementation
32 | section = new TransientDictionarySection(new HDTSpecification());
33 | section.load(input, listener);
34 | }
35 | return section;
36 | }
37 | throw new IOException("DictionarySection implementation not available for id "+dictType);
38 | }
39 | }
40 |
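The factory in isolation; the file name is hypothetical, and the stream must be wrapped in a BufferedInputStream because loadFrom() peeks at the type byte with mark()/reset().

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;

import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate;
import org.rdfhdt.hdt.dictionary.impl.section.DictionarySectionFactory2;

public class SectionLoadSketch {
    public static void main(String[] args) throws Exception {
        InputStream in = new BufferedInputStream(new FileInputStream("section.bin"));
        DictionarySectionPrivate section = DictionarySectionFactory2.loadFrom(in, null);
        System.out.println(section.getNumberOfElements() + " strings loaded");
        in.close();
    }
}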
--------------------------------------------------------------------------------
/src/org/rdfhdt/hdt/dictionary/impl/section/TransientDictionarySection.java:
--------------------------------------------------------------------------------
1 | package org.rdfhdt.hdt.dictionary.impl.section;
2 |
3 | import java.io.ByteArrayOutputStream;
4 | import java.io.IOException;
5 | import java.io.OutputStream;
6 |
7 | import org.apache.hadoop.io.SequenceFile;
8 | import org.apache.hadoop.io.Text;
9 | import org.rdfhdt.hdt.compact.integer.VByte;
10 | import org.rdfhdt.hdt.compact.sequence.SequenceLog64;
11 | import org.rdfhdt.hdt.listener.ProgressListener;
12 | import org.rdfhdt.hdt.options.HDTOptions;
13 | import org.rdfhdt.hdt.trans.TransientElement;
14 | import org.rdfhdt.hdt.util.Mutable;
15 | import org.rdfhdt.hdt.util.crc.CRC32;
16 | import org.rdfhdt.hdt.util.crc.CRC8;
17 | import org.rdfhdt.hdt.util.crc.CRCOutputStream;
18 | import org.rdfhdt.hdt.util.io.IOUtil;
19 | import org.rdfhdt.hdt.util.string.ByteStringUtil;
20 | import org.rdfhdt.hdt.util.string.CompactString;
21 | import org.rdfhdt.hdt.util.string.ReplazableString;
22 |
23 | public class TransientDictionarySection extends PFCDictionarySectionBig implements TransientElement {
24 |
25 | ByteArrayOutputStream byteOut;
26 | CharSequence previousStr;
27 | int buffer;
28 | int blockPerBuffer;
29 | long storedBuffersSize;
30 |
31 | public TransientDictionarySection(HDTOptions spec) {
32 | super(spec);
33 | this.blocksize = (int) spec.getInt("pfc.blocksize");
34 | if (this.blocksize == 0) {
35 | this.blocksize = DEFAULT_BLOCK_SIZE;
36 | }
37 | if (this.blockPerBuffer == 0) {
38 | this.blockPerBuffer = BLOCK_PER_BUFFER;
39 | }
40 | }
41 |
42 | @Override
43 | public void initialize(long numentries) {
44 | this.blocks = new SequenceLog64(63, numentries / this.blocksize);
45 | this.storedBuffersSize = 0;
46 | this.numstrings = 0;
47 | this.byteOut = new ByteArrayOutputStream(16 * 1024);
48 | this.blockPerBuffer = BLOCK_PER_BUFFER / 5;
49 | this.data = new byte[(int) Math.ceil((((double) numentries / this.blocksize) / this.blockPerBuffer))][];
50 | this.posFirst = new long[this.data.length];
51 | this.buffer = 0;
52 | this.previousStr = null;
53 | }
54 |
55 | @Override
56 | public void load(SequenceFile.Reader input, ProgressListener listener) throws IOException {
57 | CharSequence str = null;
58 | Text line = new Text();
59 |
60 | this.posFirst[0] = 0;
61 | while (input.next(line)) {
62 | str = new CompactString(line.toString());
63 |
64 | if (this.numstrings % this.blocksize == 0) {
65 | // Add new block pointer
66 | // System.out.println(this.storedBuffersSize);
67 | // System.out.println(this.byteOut.size());
68 | // System.out.println(this.blocksize);
69 | this.blocks.append(this.storedBuffersSize + this.byteOut.size());
70 |
71 | // if number of block per buffer reached, change buffer
72 | if (((this.blocks.getNumberOfElements() - 1) % this.blockPerBuffer == 0) && ((this.blocks.getNumberOfElements() - 1) / this.blockPerBuffer != 0)) {
73 | this.storedBuffersSize += this.byteOut.size();
74 | this.storeBuffer(this.buffer);
75 | this.byteOut = new ByteArrayOutputStream(16 * 1024);
76 | if (this.buffer < this.data.length - 1) {
77 | this.posFirst[++this.buffer] = this.storedBuffersSize + this.byteOut.size();
78 | }
79 | }
80 |
81 | // Copy full string
82 | ByteStringUtil.append(this.byteOut, str, 0);
83 | } else {
84 | // Find common part.
85 | int delta = ByteStringUtil.longestCommonPrefix(this.previousStr, str);
86 | // Write Delta in VByte
87 | VByte.encode(this.byteOut, delta);
88 | // Write remaining
89 | ByteStringUtil.append(this.byteOut, str, delta);
90 | }
91 |
92 | // System.out.println(str);
93 |
94 | this.byteOut.write(0); // End of string
95 | this.numstrings++;
96 | this.previousStr = str;
97 | }
98 | }
99 |
100 | protected void storeBuffer(int buffer) throws IOException {
101 | // System.out.println("Buffer = " + buffer);
102 | this.byteOut.flush();
103 | this.data[buffer] = this.byteOut.toByteArray();
104 | this.byteOut.close();
105 | }
106 |
107 | @Override
108 | public void close() throws IOException {
109 | // Ending block pointer.
110 | this.blocks.append(this.storedBuffersSize + this.byteOut.size());
111 |
112 | // Trim text/blocks
113 | this.blocks.aggresiveTrimToSize();
114 |
115 | // System.out.println("Data length = " + this.data.length);
116 | this.storeBuffer(this.buffer);
117 | }
118 |
119 | /*
120 | * (non-Javadoc)
121 | *
122 | * @see org.rdfhdt.hdt.dictionary.impl.section.PFCDictionarySectionBig#save(java.io.OutputStream, org.rdfhdt.hdt.listener.ProgressListener)
123 | */
124 | @Override
125 | public void save(OutputStream output, ProgressListener listener) throws IOException {
126 | 		long dataLength = 0;
127 | CRCOutputStream out = new CRCOutputStream(output, new CRC8());
128 |
129 | for (byte[] buffer : this.data) {
130 | 			dataLength += buffer.length;
131 | }
132 |
133 | out.write(TYPE_INDEX);
134 | VByte.encode(out, this.numstrings);
135 | 		VByte.encode(out, dataLength);
136 | VByte.encode(out, this.blocksize);
137 |
138 | out.writeCRC();
139 |
140 | this.blocks.save(output, listener); // Write blocks directly to output, they have their own CRC check.
141 |
142 | out.setCRC(new CRC32());
143 | for (byte[] buffer : this.data) {
144 | IOUtil.writeBuffer(out, buffer, 0, buffer.length, listener);
145 | }
146 | out.writeCRC();
147 | }
148 |
149 | /*
150 | * (non-Javadoc)
151 | *
152 | * @see hdt.dictionary.DictionarySection#extract(int)
153 | */
154 | @Override
155 | public CharSequence extract(int id) {
156 |
157 | // System.out.println("id = " + id);
158 |
159 | if (id < 1 || id > this.numstrings) {
160 | return null;
161 | }
162 |
163 | // Locate block
164 | int blockid = (id - 1) / this.blocksize;
165 | int nstring = (id - 1) % this.blocksize;
166 |
167 | // System.out.println("blockid = " + blockid);
168 | // System.out.println("nstring = " + nstring);
169 |
170 | byte[] block = this.data[blockid / this.blockPerBuffer];
171 | int pos = (int) (this.blocks.get(blockid) - this.posFirst[blockid / this.blockPerBuffer]);
172 |
173 | // System.out.println("pos = " + pos);
174 |
175 | // Copy first string
176 | int len = ByteStringUtil.strlen(block, pos);
177 |
178 | // System.out.println("len = " + len);
179 |
180 | Mutable delta = new Mutable(0L);
181 | ReplazableString tempString = new ReplazableString();
182 | tempString.append(block, pos, len);
183 |
184 | 		// System.out.println("inside the loop");
185 |
186 | 		// Copy strings until we find ours.
187 | for (int i = 0; i < nstring; i++) {
188 | pos += len + 1;
189 | // System.out.println("pos = " + pos);
190 | pos += VByte.decode(block, pos, delta);
191 | // System.out.println("pos = " + pos);
192 | // System.out.println("delta = [" + delta + "]");
193 | len = ByteStringUtil.strlen(block, pos);
194 | // System.out.println("len = " + len);
195 | tempString.replace(delta.getValue().intValue(), block, pos, len);
196 | // System.out.println("tempstring = [" + tempString + "]");
197 | }
198 | return tempString;
199 | }
200 |
201 | /**
202 | 	 * Locate the block of a string using binary search.
203 | */
204 | @Override
205 | protected int locateBlock(CharSequence str) {
206 | int low = 0;
207 | int high = (int) this.blocks.getNumberOfElements() - 1;
208 | int max = high;
209 |
210 | while (low <= high) {
211 | int mid = (low + high) >>> 1;
212 |
213 | int cmp;
214 | if (mid == max) {
215 | cmp = -1;
216 | } else {
217 | cmp = ByteStringUtil.strcmp(str, this.data[mid / this.blockPerBuffer], (int) (this.blocks.get(mid) - this.posFirst[mid / this.blockPerBuffer]));
218 |
219 | // if (str.toString().contains("http://dbpedia.org/ontology/Agent") || str.toString().contains("The Health Inspector pays a visit") || str.toString().contains("Crockett_Middle_School") || str.toString().contains("Benthosuchus")) {
220 | // System.out.println("Block: "+ mid + ": "+ ByteStringUtil.asString(data[mid / blockPerBuffer], (int) (this.blocks.get(mid) - this.posFirst[mid / blockPerBuffer])) + " Result: " + cmp);
221 | // }
222 | }
223 |
224 | if (cmp < 0) {
225 | high = mid - 1;
226 | } else if (cmp > 0) {
227 | low = mid + 1;
228 | } else {
229 | return mid; // key found
230 | }
231 | }
232 | return -(low + 1); // key not found.
233 | }
234 |
235 | @Override
236 | protected int locateInBlock(int blockid, CharSequence str) {
237 |
238 | ReplazableString tempString = new ReplazableString();
239 |
240 | Mutable delta = new Mutable(0L);
241 | int idInBlock = 0;
242 | int cshared = 0;
243 |
244 | byte[] block = this.data[blockid / this.blockPerBuffer];
245 | int pos = (int) (this.blocks.get(blockid) - this.posFirst[blockid / this.blockPerBuffer]);
246 |
247 | // Read the first string in the block
248 | int slen = ByteStringUtil.strlen(block, pos);
249 | tempString.append(block, pos, slen);
250 | pos += slen + 1;
251 | idInBlock++;
252 |
253 | while ((idInBlock < this.blocksize) && (pos < block.length)) {
254 | // Decode prefix
255 | pos += VByte.decode(block, pos, delta);
256 |
257 | // Copy suffix
258 | slen = ByteStringUtil.strlen(block, pos);
259 | tempString.replace(delta.getValue().intValue(), block, pos, slen);
260 |
261 | if (delta.getValue() >= cshared) {
262 | // Current delta value means that this string
263 | 				// has a longer common prefix than the previous one
264 | // if (str.toString().contains("http://dbpedia.org/ontology/Agent") || str.toString().contains("The Health Inspector pays a visit") || str.toString().contains("Crockett_Middle_School") || str.toString().contains("Benthosuchus")) {
265 | // System.out.println("[" + tempString + "]. cshared [" + cshared + "]");
266 | // }
267 | cshared += ByteStringUtil.longestCommonPrefix(tempString, str, cshared);
268 |
269 | if ((cshared == str.length()) && (tempString.length() == str.length())) {
270 | break;
271 | }
272 | } else {
273 | 				// We have fewer common characters than before,
274 | 				// so this string is greater than the one we are looking for.
275 | 				// i.e. Not found.
276 | idInBlock = 0;
277 | break;
278 | }
279 | pos += slen + 1;
280 | idInBlock++;
281 |
282 | }
283 |
284 | // Not found
285 | if (pos == block.length || idInBlock == this.blocksize) {
286 | idInBlock = 0;
287 | }
288 |
289 | return idInBlock;
290 | }
291 |
292 | }
293 |
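A sketch of the intended feeding path: a SequenceFile of lexicographically sorted strings read as Text keys (front coding requires sorted input). Path, count and reader construction are hypothetical; the real driver would take the count from the dictionary job's counters.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.rdfhdt.hdt.dictionary.impl.section.TransientDictionarySection;
import org.rdfhdt.hdt.options.HDTSpecification;

public class TransientSectionSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        long numStrings = 1000000L; // known in advance, e.g. from job counters
        TransientDictionarySection section = new TransientDictionarySection(new HDTSpecification());
        section.initialize(numStrings); // sizes blocks, data[][] and posFirst[]
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("subjects/part-r-00000"), conf);
        section.load(reader, null); // consumes sorted Text keys, front-coding as it goes
        reader.close();
        section.close(); // final block pointer, trim, last buffer stored
    }
}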
--------------------------------------------------------------------------------
/src/org/rdfhdt/hdt/hdt/impl/TransientHDT.java:
--------------------------------------------------------------------------------
1 | package org.rdfhdt.hdt.hdt.impl;
2 |
3 | import java.io.IOException;
4 | import java.io.OutputStream;
5 |
6 | import org.rdfhdt.hdt.dictionary.DictionaryPrivate;
7 | import org.rdfhdt.hdt.header.HeaderPrivate;
8 | import org.rdfhdt.hdt.listener.ProgressListener;
9 | import org.rdfhdt.hdt.options.HDTOptions;
10 | import org.rdfhdt.hdt.triples.TriplesPrivate;
11 |
12 | /**
13 | * @author José M. Giménez-García
14 | *
15 |  * Note: HDTImpl modified to make its fields protected instead of private
16 | *
17 | */
18 | public class TransientHDT extends HDTImpl {
19 |
20 | public TransientHDT(HDTOptions spec) {
21 | super(spec);
22 | }
23 |
24 | public void setHeader(HeaderPrivate header) {
25 | this.header = header;
26 | }
27 |
28 | public void setDictionary(DictionaryPrivate dictionary) {
29 | this.dictionary = dictionary;
30 | }
31 |
32 | @Override
33 | public void setTriples(TriplesPrivate triples) {
34 | this.triples = triples;
35 | }
36 |
37 | @Override
38 | public void saveToHDT(OutputStream output, ProgressListener listener) throws IOException {
39 | 		// No extra behavior: delegates directly to HDTImpl.saveToHDT
40 | super.saveToHDT(output, listener);
41 | }
42 |
43 | }
44 |
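A sketch of the assembly role this class plays: the driver plugs in components built elsewhere and serializes the whole. All arguments are placeholders for objects produced by the MapReduce jobs, and a null listener is assumed to be tolerated by HDTImpl.saveToHDT.

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.rdfhdt.hdt.dictionary.DictionaryPrivate;
import org.rdfhdt.hdt.hdt.impl.TransientHDT;
import org.rdfhdt.hdt.header.HeaderPrivate;
import org.rdfhdt.hdt.options.HDTSpecification;
import org.rdfhdt.hdt.triples.TriplesPrivate;

public class AssembleSketch {
    public static void write(HeaderPrivate header, DictionaryPrivate dictionary,
            TriplesPrivate triples, String fileName) throws IOException {
        TransientHDT hdt = new TransientHDT(new HDTSpecification());
        hdt.setHeader(header);
        hdt.setDictionary(dictionary);
        hdt.setTriples(triples);
        OutputStream out = new BufferedOutputStream(new FileOutputStream(fileName));
        hdt.saveToHDT(out, null);
        out.close();
    }
}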
--------------------------------------------------------------------------------
/src/org/rdfhdt/hdt/triples/ScapedTripleString.java:
--------------------------------------------------------------------------------
1 | package org.rdfhdt.hdt.triples;
2 |
3 | import org.rdfhdt.hdt.exceptions.ParserException;
4 |
5 | /**
6 |  * TripleString variant that parses a triple from a raw N-Triples-style line, keeping each component as a String
7 | */
8 | public final class ScapedTripleString extends TripleString {
9 |
10 | public ScapedTripleString() {
11 | super();
12 | }
13 |
14 | public ScapedTripleString(CharSequence subject, CharSequence predicate, CharSequence object) {
15 | super(subject, predicate, object);
16 | }
17 |
18 | public ScapedTripleString(TripleString other) {
19 | super(other);
20 | }
21 |
22 | /**
23 | * Read from a line, where each component is separated by space.
24 | *
25 | * @param line
26 | */
27 | @Override
28 | public void read(String line) throws ParserException {
29 | int split, posa, posb;
30 | this.clear();
31 |
32 | // SET SUBJECT
33 | posa = 0;
34 |
35 | if (line.charAt(posa) == '<') { // subject between '<' and '>' symbols
36 | posa++; // Remove <
37 | posb = line.indexOf('>', posa);
38 | split = posb + 1;
39 | } else { // subject until the first space
40 | posb = split = line.indexOf(' ', posa);
41 | }
42 | if (posb == -1) {
43 | return; // Not found, error.
44 | }
45 |
46 | this.setSubject(line.substring(posa, posb));
47 |
48 | // SET PREDICATE
49 | posa = split + 1;
50 |
51 | if (line.charAt(posa) == '<') { // predicate between '<' and '>' symbols
52 | posa++; // Remove <
53 | posb = line.indexOf('>', posa);
54 | split = posb + 1;
55 | } else { // predicate until the first space
56 | posb = split = line.indexOf(' ', posa);
57 | }
58 | if (posb == -1) {
59 | return; // Not found, error.
60 | }
61 |
62 | this.setPredicate(line.substring(posa, posb));
63 |
64 | // SET OBJECT
65 | posa = split + 1;
66 | posb = line.length();
67 |
68 | if (line.charAt(posb - 1) == '.') {
69 | 			posb--; // Remove trailing dot from N-Triples.
70 | }
71 | if (line.charAt(posb - 1) == ' ') {
72 | posb--;
73 | }
74 |
75 | if (line.charAt(posa) == '<') {
76 | posa++;
77 |
78 | // Remove trailing > only if < appears, so "some"^^ is kept as-is.
79 | if (posb > posa && line.charAt(posb - 1) == '>') {
80 | posb--;
81 | }
82 | }
83 |
84 | this.setObject(line.substring(posa, posb));
85 | }
86 | }
87 |
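One line through the parser above, with illustrative URIs. Note how the object keeps its datatype suffix: the trailing '>' is stripped only when the component opens with '<', which is exactly the case the comment in read() guards.

import org.rdfhdt.hdt.triples.ScapedTripleString;

public class ParseSketch {
    public static void main(String[] args) throws Exception {
        ScapedTripleString ts = new ScapedTripleString();
        ts.read("<http://example.org/s> <http://example.org/p> \"42\"^^<http://www.w3.org/2001/XMLSchema#int> .");
        System.out.println(ts.getSubject());   // http://example.org/s
        System.out.println(ts.getPredicate()); // http://example.org/p
        System.out.println(ts.getObject());    // "42"^^<http://www.w3.org/2001/XMLSchema#int>
    }
}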
--------------------------------------------------------------------------------
/src/org/rdfhdt/hdt/triples/impl/TransientBitMapTriples.java:
--------------------------------------------------------------------------------
1 | package org.rdfhdt.hdt.triples.impl;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.hadoop.fs.FileSystem;
6 | import org.apache.hadoop.fs.Path;
7 | import org.apache.hadoop.io.SequenceFile;
8 | import org.rdfhdt.hdt.compact.bitmap.AdjacencyList;
9 | import org.rdfhdt.hdt.compact.bitmap.TransientBitmap375;
10 | import org.rdfhdt.hdt.compact.sequence.TransientSequenceLog64;
11 | import org.rdfhdt.hdt.enums.TripleComponentOrder;
12 | import org.rdfhdt.hdt.exceptions.IllegalFormatException;
13 | import org.rdfhdt.hdt.listener.ProgressListener;
14 | import org.rdfhdt.hdt.options.HDTOptions;
15 | import org.rdfhdt.hdt.triples.TripleID;
16 | import org.rdfhdt.hdt.util.BitUtil;
17 | import org.rdfhdt.hdt.util.listener.ListenerUtil;
18 | import org.rdfhdt.mrbuilder.HDTBuilderConfiguration;
19 | import org.rdfhdt.mrbuilder.io.TripleSPOWritable;
20 |
21 | public class TransientBitMapTriples extends BitmapTriples {
22 |
23 | long number;
24 | long size;
25 | long lastX = 0, lastY = 0, lastZ = 0;
26 | long x, y, z;
27 | long numTriples = 0;
28 | boolean trimNeeded = false;
29 |
30 | FileSystem fileSystem;
31 | Path path;
32 |
33 | public TransientBitMapTriples() {
34 | super();
35 | }
36 |
37 | public TransientBitMapTriples(HDTOptions spec) {
38 | super(spec);
39 | }
40 |
41 | public TransientBitMapTriples(FileSystem fs, Path path) {
42 | this();
43 | this.setFileSystem(fs);
44 | this.setPath(path);
45 | }
46 |
47 | public TransientBitMapTriples(HDTOptions spec, FileSystem fs, Path path) {
48 | this(spec);
49 | this.setFileSystem(fs);
50 | this.setPath(path);
51 | }
52 |
53 | public void setFileSystem(FileSystem fs) {
54 | this.fileSystem = fs;
55 | }
56 |
57 | public void setPath(Path path) {
58 | this.path = path;
59 | }
60 |
61 | public void initialize(long numentries) throws IOException {
62 | this.initialize(numentries, numentries, numentries);
63 | this.trimNeeded = true;
64 | }
65 |
66 | public void initialize(long numentries, long maxvalue) throws IOException {
67 | this.initialize(numentries, maxvalue, maxvalue);
68 | this.trimNeeded = true;
69 | }
70 |
71 | public void initialize(long numentries, long maxpredicate, long maxobject) throws IOException {
72 |
73 | // System.out.println("Numentries: " + numentries);
74 |
75 | this.number = numentries;
76 | this.seqY = new TransientSequenceLog64(HDTBuilderConfiguration.CHUNK_SIZE, BitUtil.log2(maxpredicate), this.number, this.fileSystem, this.path);
77 | this.seqZ = new TransientSequenceLog64(HDTBuilderConfiguration.CHUNK_SIZE, BitUtil.log2(maxobject), this.number, this.fileSystem, this.path);
78 | this.bitmapY = new TransientBitmap375(HDTBuilderConfiguration.CHUNK_SIZE, this.number, this.fileSystem, this.path);
79 | this.bitmapZ = new TransientBitmap375(HDTBuilderConfiguration.CHUNK_SIZE, this.number, this.fileSystem, this.path);
80 | // this.bitmapY = new Bitmap375(this.number);
81 | // this.bitmapZ = new Bitmap375(this.number);
82 | }
83 |
84 | public void load(SequenceFile.Reader input, ProgressListener listener) throws IOException {
85 | TripleSPOWritable tripleWritable = new TripleSPOWritable();
86 |
87 | while (input.next(tripleWritable)) {
88 | TripleID triple = new TripleID((int) tripleWritable.getSubject().get(), (int) tripleWritable.getPredicate().get(), (int) tripleWritable.getObject().get());
89 | this.add(triple);
90 | ListenerUtil.notifyCond(listener, "Converting to BitmapTriples", this.numTriples, this.numTriples, this.number);
91 | this.numTriples++;
92 | }
93 | }
94 |
95 | public void add(TripleID triple) {
96 | TransientSequenceLog64 vectorY = (TransientSequenceLog64) this.seqY;
97 | TransientSequenceLog64 vectorZ = (TransientSequenceLog64) this.seqZ;
98 | TransientBitmap375 bitY = (TransientBitmap375) this.bitmapY;
99 | TransientBitmap375 bitZ = (TransientBitmap375) this.bitmapZ;
100 | // Bitmap375 bitY = (Bitmap375) this.bitmapY;
101 | // Bitmap375 bitZ = (Bitmap375) this.bitmapZ;
102 |
103 | TripleOrderConvert.swapComponentOrder(triple, TripleComponentOrder.SPO, this.order);
104 | this.x = triple.getSubject();
105 | this.y = triple.getPredicate();
106 | this.z = triple.getObject();
107 |
108 | if (this.x == 0 || this.y == 0 || this.z == 0) {
109 | throw new IllegalFormatException("None of the components of a triple can be null");
110 | }
111 |
112 | if (this.numTriples == 0) {
113 | // First triple
114 | vectorY.append(this.y);
115 | vectorZ.append(this.z);
116 | } else if (this.x != this.lastX) {
117 | if (this.x != this.lastX + 1) {
118 | 				throw new IllegalFormatException("Upper level must be increasing and consecutive.");
119 | }
120 | // X changed
121 | bitY.append(true);
122 | vectorY.append(this.y);
123 |
124 | bitZ.append(true);
125 | vectorZ.append(this.z);
126 | } else if (this.y != this.lastY) {
127 | if (this.y < this.lastY) {
128 | throw new IllegalFormatException("Middle level must be increasing for each parent.");
129 | }
130 |
131 | // Y changed
132 | bitY.append(false);
133 | vectorY.append(this.y);
134 |
135 | bitZ.append(true);
136 | vectorZ.append(this.z);
137 | 		} else if (this.z != this.lastZ) { // Added to skip duplicate triples
138 | if (this.z < this.lastZ) {
139 | throw new IllegalFormatException("Lower level must be increasing for each parent.");
140 | }
141 |
142 | // Z changed
143 | bitZ.append(false);
144 | vectorZ.append(this.z);
145 | }
146 |
147 | this.lastX = this.x;
148 | this.lastY = this.y;
149 | this.lastZ = this.z;
150 | }
151 |
152 | public void close() throws IOException {
153 | TransientSequenceLog64 vectorY = (TransientSequenceLog64) this.seqY;
154 | TransientSequenceLog64 vectorZ = (TransientSequenceLog64) this.seqZ;
155 | TransientBitmap375 bitY = (TransientBitmap375) this.bitmapY;
156 | TransientBitmap375 bitZ = (TransientBitmap375) this.bitmapZ;
157 | // Bitmap375 bitY = (Bitmap375) this.bitmapY;
158 | // Bitmap375 bitZ = (Bitmap375) this.bitmapZ;
159 |
160 | bitY.append(true);
161 | bitZ.append(true);
162 |
163 | bitY.close();
164 | bitZ.close();
165 |
166 | vectorY.close();
167 | vectorZ.close();
168 |
169 | // System.out.println("bitmapY size = " + this.bitmapY.getNumBits());
170 | // System.out.println("seqY size = " + this.seqY.getNumberOfElements());
171 | // System.out.println("bitmapZ size = " + this.bitmapZ.getNumBits());
172 | // System.out.println("seqZ size = " + this.seqZ.getNumberOfElements());
173 |
174 | if (this.trimNeeded) {
175 | vectorY.aggresiveTrimToSize();
176 | vectorZ.trimToSize();
177 | }
178 |
179 | this.adjY = new AdjacencyList(this.seqY, this.bitmapY);
180 | this.adjZ = new AdjacencyList(this.seqZ, this.bitmapZ);
181 |
182 | // DEBUG
183 | // this.adjY.dump();
184 | // this.adjZ.dump();
185 | }
186 |
187 | @Override
188 | public long getNumberOfElements() {
189 | return this.number;
190 | }
191 |
192 | @Override
193 | public long size() {
194 | return this.size;
195 | }
196 |
197 | }
198 |
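A sketch of the conversion step: load() expects a SequenceFile whose keys are TripleSPOWritable records already sorted in SPO order with consecutive subject IDs starting at 1 (add() enforces both). Paths and counts are hypothetical; the real driver would take them from job counters, and a null listener is assumed to be tolerated by ListenerUtil.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.rdfhdt.hdt.triples.impl.TransientBitMapTriples;

public class TriplesLoadSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.getLocal(conf);
        TransientBitMapTriples triples = new TransientBitMapTriples(fs, new Path("/tmp"));
        triples.initialize(1000000L, 50000L, 400000L); // numentries, maxpredicate, maxobject
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path("triples/part-r-00000"), conf);
        triples.load(reader, null); // builds seqY/seqZ and bitmapY/bitmapZ incrementally
        reader.close();
        triples.close(); // final bitmap markers, flush, adjacency lists
    }
}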
--------------------------------------------------------------------------------
/src/org/rdfhdt/listener/HDTBuilderListener.java:
--------------------------------------------------------------------------------
1 | package org.rdfhdt.listener;
2 |
3 | import org.rdfhdt.hdt.listener.ProgressListener;
4 | import org.rdfhdt.mrbuilder.HDTBuilderConfiguration;
5 |
6 | public class HDTBuilderListener implements ProgressListener {
7 |
8 | boolean quiet;
9 |
10 | public HDTBuilderListener(HDTBuilderConfiguration conf) {
11 | this.quiet = conf.getQuiet();
12 | }
13 |
14 | public HDTBuilderListener(boolean quiet) {
15 | this.quiet = quiet;
16 | }
17 |
18 | @Override
19 | public void notifyProgress(float level, String message) {
20 | if (!this.quiet) {
21 | System.out.print("\r" + message + "\t" + Float.toString(level) + " \r");
22 | }
23 | }
24 | }
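For illustration, the listener in isolation (values are arbitrary):

import org.rdfhdt.listener.HDTBuilderListener;

public class ListenerSketch {
    public static void main(String[] args) {
        HDTBuilderListener listener = new HDTBuilderListener(false); // verbose
        listener.notifyProgress(42.0f, "Converting to BitmapTriples"); // rewrites the same console line
    }
}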
--------------------------------------------------------------------------------
/src/org/rdfhdt/mrbuilder/HDTBuilderConfiguration.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
3 | *
4 | * This library is free software; you can redistribute it and/or
5 | * modify it under the terms of the GNU Lesser General Public
6 | * License as published by the Free Software Foundation; either
7 | * version 2.1 of the License, or (at your option) any later version.
8 | *
9 | * This library is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * Lesser General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU Lesser General Public
15 | * License along with this library; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 | *
18 | * Contacting the authors:
19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at
21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es
22 | */
23 | package org.rdfhdt.mrbuilder;
24 |
25 | import java.io.IOException;
26 |
27 | import org.apache.commons.io.FilenameUtils;
28 | import org.apache.commons.lang.StringUtils;
29 | import org.apache.hadoop.conf.Configuration;
30 | import org.apache.hadoop.fs.Path;
31 | import org.rdfhdt.hdt.options.HDTSpecification;
32 |
33 | import com.beust.jcommander.JCommander;
34 | import com.beust.jcommander.Parameter;
35 |
36 | public class HDTBuilderConfiguration {
37 |
38 | public final static int CHUNK_SIZE = 1 * 1024 * 1024;
39 |
40 | public final static String SHARED = "shared";
41 | public final static String SUBJECTS = "subjects";
42 | public final static String PREDICATES = "predicates";
43 | public final static String OBJECTS = "objects";
44 | public final static String SAMPLE = "samples";
45 |
46 | public final static String SHARED_OUTPUT_PATH = SHARED + "/";
47 | public final static String SUBJECTS_OUTPUT_PATH = SUBJECTS + "/";
48 | public final static String PREDICATES_OUTPUT_PATH = PREDICATES + "/";
49 | public final static String OBJECTS_OUTPUT_PATH = OBJECTS + "/";
50 | public final static String SAMPLE_OUTPUT_PATH = SAMPLE + "/";
51 |
52 | final static String DEFAULT_CONFIGURATION_PATH = "HDTMRBuilder.xml";
53 |
54 | final static String AWS_BUCKET_NAME = "global.bucket";
55 | final static String AWS_BUCKET_DEFAULT_VALUE = null;
56 |
57 | final static String BASE_PATH_NAME = "global.path.base";
58 | final static String BASE_PATH_DEFAULT_VALUE = ".";
59 | final static String INPUT_PATH_NAME = "global.path.input";
60 | final static String INPUT_PATH_DEFAULT_VALUE = "input";
61 |
62 | final static String DICTIONARY_RUN_JOB_NAME = "job.dictionary.run";
63 | final static Boolean DICTIONARY_RUN_JOB_DEFAULT_VALUE = true;
64 | final static String DICTIONARY_JOB_NAME_NAME = "job.dictionary.name";
65 | final static String DICTIONARY_JOB_NAME_DEFAULT_VALUE = "DictionaryJob";
66 | final static String DICTIONARY_OUTPUT_PATH_NAME = "job.dictionary.path.output";
67 | final static String DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE = "dictionary";
68 | final static String DICTIONARY_DELETE_OUTPUT_PATH_NAME = "job.dictionary.path.output.delete";
69 | final static boolean DICTIONARY_DELETE_OUTPUT_PATH_DEFAULT_VALUE = false;
70 | final static String DICTIONARY_NUM_REDUCERS_NAME = "job.dictionary.reducers";
71 | final static int DICTIONARY_NUM_REDUCERS_DEFAULT_VALUE = 1;
72 |
73 | final static String DICTIONARY_RUN_SAMPLE_NAME = "job.dictionary.sample.run";
74 | final static boolean DICTIONARY_RUN_SAMPLE_DEFAULT_VALUE = true;
75 | final static String DICTIONARY_SAMPLE_PROBABILITY_NAME = "job.dictionary.sample.probability";
76 | final static float DICTIONARY_SAMPLE_PROBABILITY_DEFAULT_VALUE = (float) 0.001;
77 | final static String DICTIONARY_SAMPLE_OUTPUT_PATH_NAME = "job.dictionary.path.sample";
78 | final static String DICTIONARY_SAMPLE_OUTPUT_PATH_DEFAULT_VALUE = "dictionary_samples";
79 | final static String DICTIONARY_DELETE_SAMPLE_PATH_NAME = "job.dictionary.path.sample.delete";
80 | final static boolean DICTIONARY_DELETE_SAMPLE_PATH_DEFAULT_VALUE = false;
81 | final static String DICTIONARY_SAMPLE_NUM_REDUCERS_NAME = "job.dictionary.sample.reducers";
82 | final static int DICTIONARY_SAMPLE_NUM_REDUCERS_DEFAULT_VALUE = 1;
83 |
84 | final static String HDTDICTIONARY_BUILD_NAME = "hdt.dictionary.build";
85 | final static boolean HDTDICTIONARY_BUILD_DEFAULT_VALUE = true;
86 | final static String HDTDICTIONARY_FILE_NAME = "hdt.dictionary.file";
87 | final static String HDTDICTIONARY_FILE_DEFAULT_VALUE = "dictionary.hdt";
88 | final static String HDTDICTIONARY_DISTRIBUTION_NAME = "job.triples.dictionary.distribution";
89 | final static int HDTDICTIONARY_DISTRIBUTION_DEFAULT_VALUE = 1;
90 |
91 | final static String TRIPLES_RUN_JOB_NAME = "job.triples.run";
92 | final static boolean TRIPLES_RUN_JOB_DEFAULT_VALUE = true;
93 | final static String TRIPLES_JOB_NAME_NAME = "job.triples.name";
94 | final static String TRIPLES_JOB_NAME_DEFAULT_VALUE = "TriplesJob";
95 | // final static String TRIPLES_MAP_DICTIONARY_FILE_NAME = "job.triples.map.dictionary.file";
96 | // final static String TRIPLES_MAP_DICTIONARY_FILE_DEFAULT_VALUE = "dictionary_map.hdt";
97 | // final static String TRIPLES_REDUCE_DICTIONARY_FILE_NAME = "job.triples.reduce.dictionary.file";
98 | // final static String TRIPLES_REDUCE_DICTIONARY_FILE_DEFAULT_VALUE = "dictionary_reduce.hdt";
99 | final static String TRIPLES_OUTPUT_PATH_NAME = "job.triples.path.output";
100 | final static String TRIPLES_OUTPUT_PATH_DEFAULT_VALUE = "triples";
101 | final static String TRIPLES_DELETE_OUTPUT_PATH_NAME = "job.triples.path.output.delete";
102 | final static boolean TRIPLES_DELETE_OUTPUT_PATH_DEFAULT_VALUE = false;
103 | final static String TRIPLES_NUM_REDUCERS_NAME = "job.triples.reducers";
104 | final static int TRIPLES_NUM_REDUCERS_DEFAULT_VALUE = 1;
105 |
106 | final static String TRIPLES_RUN_SAMPLE_NAME = "job.triples.sample.run";
107 | final static boolean TRIPLES_RUN_SAMPLE_DEFAULT_VALUE = true;
108 | final static String TRIPLES_SAMPLE_PROBABILITY_NAME = "job.triples.sample.probability";
109 | final static float TRIPLES_SAMPLE_PROBABILITY_DEFAULT_VALUE = (float) 0.001;
110 | final static String TRIPLES_SAMPLE_OUTPUT_PATH_NAME = "job.triples.path.sample";
111 | final static String TRIPLES_SAMPLE_OUTPUT_PATH_DEFAULT_VALUE = "triples_samples";
112 | final static String TRIPLES_DELETE_SAMPLE_PATH_NAME = "job.triples.path.sample.delete";
113 | final static boolean TRIPLES_DELETE_SAMPLE_PATH_DEFAULT_VALUE = false;
114 | final static String TRIPLES_SAMPLE_NUM_REDUCERS_NAME = "job.triples.sample.reducers";
115 | final static int TRIPLES_SAMPLE_NUM_REDUCERS_DEFAULT_VALUE = 1;
116 |
117 | final static String HDT_BUILD_NAME = "hdt.build";
118 | final static boolean HDT_BUILD_DEFAULT_VALUE = true;
119 | final static String HDT_OUTPUT_PATH_NAME = "hdt.path.output";
120 | final static String HDT_OUTPUT_PATH_DEFAULT_VALUE = "hdt_output";
121 | final static String HDT_FILE_NAME = "hdt.file";
122 | final static String HDT_FILE_DEFAULT_VALUE = "output.hdt";
123 |
124 | final static String CONFIG_FILE_NAME = "hdt-lib.configFile";
125 | final static String CONFIG_FILE_DEFAULT_VALUE = null;
126 | final static String OPTIONS_NAME = "hdtl-lib.options";
127 | final static String OPTIONS_DEFAULT_VALUE = null;
128 | final static String RDF_TYPE_NAME = "hdt-lib.rdfType";
129 | final static String RDF_TYPE_DEFAULT_VALUE = "ntriples";
130 | final static String QUIET_NAME = "hdt-lib.quiet";
131 | final static boolean QUIET_DEFAULT_VALUE = false;
132 | final static String BASE_URI_NAME = "hdt-lib.baseUri";
133 | final static String BASE_URI_DEFAULT_VALUE = "http://rdfhdt.org/HDTMR";
134 | final static String GENERATE_INDEX_NAME = "hdt-lib.generateIndex";
135 | final static boolean GENERATE_INDEX_DEFAULT_VALUE = false;
136 |
137 | JCommander jc;
138 |
139 | @Parameter(names = { "-h", "--help" }, help = true, hidden = true)
140 | boolean help = false;
141 |
142 | @Parameter(names = { "-a", "--awsbucket" }, description = "Amazon Web Services bucket")
143 | String pAwsBucket = null;
144 |
145 | @Parameter(names = { "-c", "--conf" }, description = "Path to configuration file")
146 | String pConfigFile = null;
147 |
148 | @Parameter(names = { "-b", "--basedir" }, description = "Root directory for the process")
149 | String pBasePath = null;
150 |
151 | @Parameter(names = { "-rd", "--rundictionary" }, description = "Whether to run dictionary job or not", arity = 1)
152 | Boolean pRunDictionary = null;
153 |
154 | @Parameter(names = { "-rds", "--rundictionarysampling" }, description = "Whether to run dictionary input sampling job or not", arity = 1)
155 | Boolean pRunDictionarySampling = null;
156 |
157 | @Parameter(names = { "-nd", "--namedictionaryjob" }, description = "Name of dictionary job")
158 | String pDictionaryName = null;
159 |
160 | @Parameter(names = { "-i", "--input" }, description = "Path to input files. Relative to basedir")
161 | String pInputPath = null;
162 |
163 | @Parameter(names = { "-sd", "--samplesdictionary" }, description = "Path to dictionary job sample files. Relative to basedir")
164 | String pDictionarySamplePath = null;
165 |
166 | @Parameter(names = { "-st", "--samplestriples" }, description = "Path to triples job sample files. Relative to basedir")
167 | String pTriplesSamplePath = null;
168 |
169 | @Parameter(names = { "-od", "--outputdictionary" }, description = "Path to dictionary job output files. Relative to basedir")
170 | String pDictionaryOutputPath = null;
171 |
172 | @Parameter(names = { "-dd", "--deleteoutputdictionary" }, description = "Delete dictionary job output path before running job")
173 | Boolean pDeleteDictionaryOutputPath = null;
174 |
175 | @Parameter(names = { "-dsd", "--deletesampledictionary" }, description = "Delete dictionary job sample path before running job")
176 | Boolean pDeleteDictionarySamplePath = null;
177 |
178 | @Parameter(names = { "-dst", "--deletesampletriples" }, description = "Delete triples job sample path before running job")
179 | Boolean pDeleteTriplesSamplePath = null;
180 |
181 | @Parameter(names = { "-Rd", "--reducersdictionary" }, description = "Number of reducers for dictionary job")
182 | Integer pNumReducersDictionary = null;
183 |
184 | @Parameter(names = { "-Rds", "--reducersdictionarysampling" }, description = "Number of reducers for dictionary input sampling job")
185 | Integer pNumReducersDictionarySampling = null;
186 |
187 | @Parameter(names = { "-bd", "--builddictionary" }, description = "Whether to build HDT dictionary or not", arity = 1)
188 | Boolean pBuildDictionary = null;
189 |
190 | @Parameter(names = { "-bh", "--buildhdt" }, description = "Whether to build HDT or not", arity = 1)
191 | Boolean pBuildHDT = null;
192 |
193 | @Parameter(names = { "-fd", "--filedictionary" }, description = "Name of hdt dictionary file")
194 | String pDictionaryFileName = null;
195 |
196 | @Parameter(names = { "-fm", "--filesubjects" }, description = "Name of hdt dictionary file for Mappers")
197 | String pMapDictionaryFileName = null;
198 |
199 | @Parameter(names = { "-fr", "--fileobjects" }, description = "Name of hdt dictionary file for Reducers")
200 | String pReduceDictionaryFileName = null;
201 |
202 | @Parameter(names = { "-d", "--dictionarydistribution" }, description = "Dictionary distribution among mappers and reducers")
203 | Integer pDictionaryDistribution = null;
204 |
205 | @Parameter(names = { "-rt", "--runtriples" }, description = "Whether to run triples job or not", arity = 1)
206 | Boolean pRunTriples = null;
207 |
208 | @Parameter(names = { "-rts", "--runtriplessampling" }, description = "Whether to run triples input sampling job or not", arity = 1)
209 | Boolean pRunTriplesSampling = null;
210 |
211 | @Parameter(names = { "-nt", "--nametriplesjob" }, description = "Name of triples job")
212 | String pTriplesName = null;
213 |
214 | @Parameter(names = { "-it", "--inputtriples" }, description = "Path to triples job input files. Relative to basedir")
215 | String pTriplesInputPath = null;
216 |
217 | @Parameter(names = { "-ot", "--outputtriples" }, description = "Path to triples job output files. Relative to basedir")
218 | String pTriplesOutputPath = null;
219 |
220 | @Parameter(names = { "-dt", "--deleteoutputtriples" }, description = "Delete triples job output path before running job")
221 | Boolean pDeleteTriplesOutputPath = null;
222 |
223 | @Parameter(names = { "-Rt", "--reducerstriples" }, description = "Number of reducers for triples job")
224 | Integer pNumReducersTriples = null;
225 |
226 | @Parameter(names = { "-Rts", "--reducerstriplessampling" }, description = "Number of reducers for triples input sampling job")
227 | Integer pNumReducersTriplesSampling = null;
228 |
229 | @Parameter(names = { "-fh", "--namehdtfile" }, description = "Name of hdt file")
230 | String pHdtFileName = null;
231 |
232 | @Parameter(names = { "-hc", "--hdtconf" }, description = "Conversion config file")
233 | String pHdtConfigFile = null;
234 |
235 | @Parameter(names = { "-o", "--options" }, description = "HDT Conversion options (override those of config file)")
236 | String pOptions = null;
237 |
238 | @Parameter(names = { "-t", "--rdftype" }, description = "Type of RDF Input (ntriples, nquad, n3, turtle, rdfxml)")
239 | String pRdfType = null;
240 |
241 | @Parameter(names = { "-bu", "--baseURI" }, description = "Base URI for the dataset")
242 | String pBaseURI = null;
243 |
244 | @Parameter(names = { "-q", "--quiet" }, description = "Do not show progress of the conversion")
245 | Boolean pQuiet = null;
246 |
247 | @Parameter(names = { "-x", "--index" }, description = "Generate also external indices to solve all queries")
248 | Boolean pGenerateIndex = null;
249 |
250 | @Parameter(names = { "-p", "--sampleprobability" }, description = "Probability of using each element for sampling")
251 | Float pSampleProbability = null;
252 |
253 | Path inputPath = null, dictionarySamplesPath = null, dictionaryOutputPath = null, sharedOutputPath = null, subjectsOutputPath = null, predicatesOutputPath = null, objectsOutputPath = null;
254 | Path dictionaryCountersFile = null, triplesSamplesPath = null, triplesCountersFile = null, hdtDictionarySPOFile = null, hdtMapDictionaryFile = null, hdtReduceDictionaryFile = null, hdtFile = null;
255 | Path triplesInputPath = null, triplesOutputPath = null;
256 |
257 | Configuration mrConfiguration = new Configuration();
258 |
259 | HDTSpecification spec;
260 |
261 | // This constructor is to be used by Tasks (Mappers and/or Reducers)
262 | public HDTBuilderConfiguration(Configuration config) throws IOException {
263 | this.mrConfiguration = config;
264 | }
265 |
266 | // This constructor is to be used by Drivers
267 | public HDTBuilderConfiguration(String[] args) {
268 | this.jc = new JCommander(this, args);
269 | if (this.help) {
270 | this.jc.usage();
271 | System.exit(1);
272 | }
273 | this.addConfigurationResource(this.getConfigFile());
274 |
275 | 		// FIXME: This should be done for all parameters passed on the
276 | 		// command line
277 | this.setProperty(DICTIONARY_OUTPUT_PATH_NAME, this.getDictionaryOutputPath().toString());
278 | }
279 |
280 | private void addConfigurationResource(String configurationPath) {
281 | this.mrConfiguration.addResource(new Path(configurationPath));
282 | }
283 |
284 | private String getConfigFile() {
285 | return this.addBucket(this.pConfigFile != null ? this.pConfigFile : DEFAULT_CONFIGURATION_PATH);
286 | }
287 |
288 | public Configuration getConfigurationObject() {
289 | return this.mrConfiguration;
290 | }
291 |
292 | public void setProperty(String name, String value) {
293 | this.mrConfiguration.set(name, value);
294 | }
295 |
296 | public void setProperty(String name, int value) {
297 | this.mrConfiguration.setInt(name, value);
298 | }
299 |
300 | public String getAwsBucket() {
301 | return this.get(this.pAwsBucket, AWS_BUCKET_NAME, AWS_BUCKET_DEFAULT_VALUE);
302 | }
303 |
304 | public boolean runDictionary() {
305 | return this.get(this.pRunDictionary, DICTIONARY_RUN_JOB_NAME, DICTIONARY_RUN_JOB_DEFAULT_VALUE);
306 | }
307 |
308 | public boolean runDictionarySampling() {
309 | return this.get(this.pRunDictionarySampling, DICTIONARY_RUN_SAMPLE_NAME, DICTIONARY_RUN_SAMPLE_DEFAULT_VALUE);
310 | }
311 |
312 | public boolean runTriples() {
313 | return this.get(this.pRunTriples, TRIPLES_RUN_JOB_NAME, TRIPLES_RUN_JOB_DEFAULT_VALUE);
314 | }
315 |
316 | public boolean runTriplesSampling() {
317 | return this.get(this.pRunTriplesSampling, TRIPLES_RUN_SAMPLE_NAME, TRIPLES_RUN_SAMPLE_DEFAULT_VALUE);
318 | }
319 |
320 | public boolean buildDictionary() {
321 | return this.get(this.pBuildDictionary, HDTDICTIONARY_BUILD_NAME, HDTDICTIONARY_BUILD_DEFAULT_VALUE);
322 | }
323 |
324 | public boolean buildHDT() {
325 | return this.get(this.pBuildHDT, HDT_BUILD_NAME, HDT_BUILD_DEFAULT_VALUE);
326 | }
327 |
328 | public String getDictionaryJobName() {
329 | 		return this.get(this.pDictionaryName, DICTIONARY_JOB_NAME_NAME, DICTIONARY_JOB_NAME_DEFAULT_VALUE);
330 | }
331 |
332 | public String getTriplesJobName() {
333 | 		return this.get(this.pTriplesName, TRIPLES_JOB_NAME_NAME, TRIPLES_JOB_NAME_DEFAULT_VALUE);
334 | }
335 |
336 | public Path getInputPath() {
337 | if (this.inputPath == null) {
338 | this.inputPath = new Path(this.getPath(this.get(this.pInputPath, INPUT_PATH_NAME, INPUT_PATH_DEFAULT_VALUE)));
339 | }
340 | return this.inputPath;
341 | }
342 |
343 | public Path getDictionaryOutputPath() {
344 | if (this.dictionaryOutputPath == null) {
345 | this.dictionaryOutputPath = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)));
346 | }
347 | return this.dictionaryOutputPath;
348 | }
349 |
350 | public Path getSharedSectionPath() {
351 | if (this.sharedOutputPath == null) {
352 | this.sharedOutputPath = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + SHARED_OUTPUT_PATH);
353 | }
354 | return this.sharedOutputPath;
355 | }
356 |
357 | public Path getSubjectsSectionPath() {
358 | if (this.subjectsOutputPath == null) {
359 | this.subjectsOutputPath = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + SUBJECTS_OUTPUT_PATH);
360 | }
361 | return this.subjectsOutputPath;
362 | }
363 |
364 | public Path getPredicatesSectionPath() {
365 | if (this.predicatesOutputPath == null) {
366 | this.predicatesOutputPath = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + PREDICATES_OUTPUT_PATH);
367 | }
368 | return this.predicatesOutputPath;
369 | }
370 |
371 | public Path getObjectsSectionPath() {
372 | if (this.objectsOutputPath == null) {
373 | this.objectsOutputPath = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + OBJECTS_OUTPUT_PATH);
374 | }
375 | return this.objectsOutputPath;
376 | }
377 |
378 | public Path getDictionarySamplesPath() {
379 | if (this.dictionarySamplesPath == null) {
380 | this.dictionarySamplesPath = new Path(this.getPath(this.get(this.pDictionarySamplePath, DICTIONARY_SAMPLE_OUTPUT_PATH_NAME, DICTIONARY_SAMPLE_OUTPUT_PATH_DEFAULT_VALUE)));
381 | }
382 | return this.dictionarySamplesPath;
383 | }
384 |
385 | public Path getTriplesSamplesPath() {
386 | if (this.triplesSamplesPath == null) {
387 | this.triplesSamplesPath = new Path(this.getPath(this.get(this.pTriplesSamplePath, TRIPLES_SAMPLE_OUTPUT_PATH_NAME, TRIPLES_SAMPLE_OUTPUT_PATH_DEFAULT_VALUE)));
388 | }
389 | return this.triplesSamplesPath;
390 | }
391 |
392 | public float getSampleProbability() {
393 | return this.get(this.pSampleProbability, DICTIONARY_SAMPLE_PROBABILITY_NAME, DICTIONARY_SAMPLE_PROBABILITY_DEFAULT_VALUE);
394 | }
395 |
396 | public Path getDictionaryCountersFile() {
397 | if (this.dictionaryCountersFile == null) {
398 | this.dictionaryCountersFile = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + ".info");
399 | }
400 | return this.dictionaryCountersFile;
401 | }
402 |
403 | public Path getDictionaryFile() {
404 | if (this.hdtDictionarySPOFile == null) {
405 | this.hdtDictionarySPOFile = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + this.get(this.pDictionaryFileName, HDTDICTIONARY_FILE_NAME, HDTDICTIONARY_FILE_DEFAULT_VALUE));
406 | }
407 | return this.hdtDictionarySPOFile;
408 | }
409 |
410 | // public Path getDictionaryMapFile() {
411 | // if (this.hdtMapDictionaryFile == null) {
412 | // this.hdtMapDictionaryFile = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + this.get(this.pMapDictionaryFileName, TRIPLES_MAP_DICTIONARY_FILE_NAME, TRIPLES_MAP_DICTIONARY_FILE_DEFAULT_VALUE));
413 | // }
414 | // return this.hdtMapDictionaryFile;
415 | // }
416 | //
417 | // public Path getDictionaryReduceFile() {
418 | // if (this.hdtReduceDictionaryFile == null) {
419 | // this.hdtReduceDictionaryFile = new Path(this.getPath(this.get(this.pDictionaryOutputPath, DICTIONARY_OUTPUT_PATH_NAME, DICTIONARY_OUTPUT_PATH_DEFAULT_VALUE)) + "/" + this.get(this.pReduceDictionaryFileName, TRIPLES_REDUCE_DICTIONARY_FILE_NAME, TRIPLES_REDUCE_DICTIONARY_FILE_DEFAULT_VALUE));
420 | // }
421 | // return this.hdtReduceDictionaryFile;
422 | // }
423 |
424 | public int getDictionaryDistribution() {
425 | return this.get(this.pDictionaryDistribution, HDTDICTIONARY_DISTRIBUTION_NAME, HDTDICTIONARY_DISTRIBUTION_DEFAULT_VALUE);
426 | }
427 |
428 | public Path getTriplesOutputPath() {
429 | if (this.triplesOutputPath == null) {
430 | this.triplesOutputPath = new Path(this.getPath(this.get(this.pTriplesOutputPath, TRIPLES_OUTPUT_PATH_NAME, TRIPLES_OUTPUT_PATH_DEFAULT_VALUE)));
431 | }
432 | return this.triplesOutputPath;
433 | }
434 |
435 | public Path getTriplesCountersFile() {
436 | if (this.triplesCountersFile == null) {
437 | this.triplesCountersFile = new Path(this.getPath(this.get(this.pTriplesOutputPath, TRIPLES_OUTPUT_PATH_NAME, TRIPLES_OUTPUT_PATH_DEFAULT_VALUE)) + ".info");
438 | }
439 | return this.triplesCountersFile;
440 | }
441 |
442 | public Path getHDTFile() {
443 | if (this.hdtFile == null) {
444 | this.hdtFile = new Path(this.getPath(this.get(this.pHdtFileName, HDT_FILE_NAME, HDT_FILE_DEFAULT_VALUE)));
445 | }
446 | return this.hdtFile;
447 | }
448 |
449 | public boolean getDeleteDictionaryOutputPath() {
450 | return this.get(this.pDeleteDictionaryOutputPath, DICTIONARY_DELETE_OUTPUT_PATH_NAME, DICTIONARY_DELETE_OUTPUT_PATH_DEFAULT_VALUE);
451 | }
452 |
453 | public boolean getDeleteDictionarySamplesPath() {
454 | return this.get(this.pDeleteDictionarySamplePath, DICTIONARY_DELETE_SAMPLE_PATH_NAME, DICTIONARY_DELETE_SAMPLE_PATH_DEFAULT_VALUE);
455 | }
456 |
457 | public boolean getDeleteTriplesOutputPath() {
458 | return this.get(this.pDeleteTriplesOutputPath, TRIPLES_DELETE_OUTPUT_PATH_NAME, TRIPLES_DELETE_OUTPUT_PATH_DEFAULT_VALUE);
459 | }
460 |
461 | public boolean getDeleteTriplesSamplesPath() {
462 | return this.get(this.pDeleteTriplesSamplePath, TRIPLES_DELETE_SAMPLE_PATH_NAME, TRIPLES_DELETE_SAMPLE_PATH_DEFAULT_VALUE);
463 | }
464 |
465 | public int getDictionaryReducers() {
466 | return this.get(this.pNumReducersDictionary, DICTIONARY_NUM_REDUCERS_NAME, DICTIONARY_NUM_REDUCERS_DEFAULT_VALUE);
467 | }
468 |
469 | public int getDictionarySampleReducers() {
470 | return this.get(this.pNumReducersDictionarySampling, DICTIONARY_SAMPLE_NUM_REDUCERS_NAME, DICTIONARY_SAMPLE_NUM_REDUCERS_DEFAULT_VALUE);
471 | }
472 |
473 | public int getTriplesReducers() {
474 | return this.get(this.pNumReducersTriples, TRIPLES_NUM_REDUCERS_NAME, TRIPLES_NUM_REDUCERS_DEFAULT_VALUE);
475 | }
476 |
477 | public int getTriplesSampleReducers() {
478 | return this.get(this.pNumReducersTriplesSampling, TRIPLES_SAMPLE_NUM_REDUCERS_NAME, TRIPLES_SAMPLE_NUM_REDUCERS_DEFAULT_VALUE);
479 | }
480 |
481 | public String getHdtConfigFile() {
482 | return this.getPath(this.get(this.pHdtConfigFile, CONFIG_FILE_NAME, CONFIG_FILE_DEFAULT_VALUE));
483 | }
484 |
485 | public String getOptions() {
486 | return this.get(this.pOptions, OPTIONS_NAME, OPTIONS_DEFAULT_VALUE);
487 | }
488 |
489 | public String getRdfType() {
490 | return this.get(this.pRdfType, RDF_TYPE_NAME, RDF_TYPE_DEFAULT_VALUE);
491 | }
492 |
493 | public boolean getQuiet() {
494 | return this.get(this.pQuiet, QUIET_NAME, QUIET_DEFAULT_VALUE);
495 | }
496 |
497 | public String getBaseURI() {
498 | return this.get(this.pBaseURI, BASE_URI_NAME, BASE_URI_DEFAULT_VALUE);
499 | }
500 |
501 | public HDTSpecification getSpec() throws IOException {
502 | if (this.spec == null) {
503 | if (this.getHdtConfigFile() != null) {
504 | this.spec = new HDTSpecification(this.getHdtConfigFile());
505 | } else {
506 | this.spec = new HDTSpecification();
507 | }
508 | if (this.getOptions() != null) {
509 | this.spec.setOptions(this.getOptions());
510 | }
511 | }
512 | return this.spec;
513 | }
514 |
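	// The get(...) overloads below resolve every setting with the same
	// precedence: an explicit command-line parameter wins, then the value in
	// the Hadoop configuration, then the built-in default.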
515 | private String get(String paramValue, String confName, String defaultValue) {
516 | return paramValue != null ? paramValue : this.mrConfiguration.get(confName, defaultValue);
517 | }
518 |
519 | private boolean get(Boolean paramValue, String confName, boolean defaultValue) {
520 | return paramValue != null ? paramValue : this.mrConfiguration.getBoolean(confName, defaultValue);
521 | }
522 |
523 | private int get(Integer paramValue, String confName, int defaultValue) {
524 | return paramValue != null ? paramValue : this.mrConfiguration.getInt(confName, defaultValue);
525 | }
526 |
527 | private float get(Float paramValue, String confName, float defaultValue) {
528 | return paramValue != null ? paramValue : this.mrConfiguration.getFloat(confName, defaultValue);
529 | }
530 |
531 | private String getPath(String path) {
532 | // Add Base Path
533 | return FilenameUtils.concat(this.get(this.pBasePath, BASE_PATH_NAME, BASE_PATH_DEFAULT_VALUE), path);
534 | }
535 |
536 | private String addBucket(String path) {
537 | // If bucket is provided as parameter, and configuration path is
538 | // relative, create absolute configuration path
539 | if (this.getAwsBucket() != null && !path.startsWith("s3n://")) {
540 | path = "s3n://" + this.getAwsBucket() + "/" + StringUtils.removeStart(path, "/");
541 | }
542 | return path;
543 | }
544 |
545 | 	// private void set(Integer paramValue, String confName, int defaultValue) {
546 | 	// mrConfiguration.setInt(confName, paramValue != null ? paramValue :
547 | 	// mrConfiguration.getInt(confName, defaultValue));
548 | // }
549 |
550 | }
551 |
--------------------------------------------------------------------------------
/src/org/rdfhdt/mrbuilder/HDTBuilderDriver.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
3 | *
4 | * This library is free software; you can redistribute it and/or
5 | * modify it under the terms of the GNU Lesser General Public
6 | * License as published by the Free Software Foundation; either
7 | * version 2.1 of the License, or (at your option) any later version.
8 | *
9 | * This library is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * Lesser General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU Lesser General Public
15 | * License along with this library; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 | *
18 | * Contacting the authors:
19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at
21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es
22 | */
23 | package org.rdfhdt.mrbuilder;
24 |
25 | import java.io.BufferedInputStream;
26 | import java.io.BufferedOutputStream;
27 | import java.io.BufferedReader;
28 | import java.io.BufferedWriter;
29 | import java.io.IOException;
30 | import java.io.InputStreamReader;
31 | import java.io.OutputStreamWriter;
32 | import java.net.URI;
33 | import java.net.URISyntaxException;
34 | import java.util.Arrays;
35 |
36 | import org.apache.hadoop.filecache.DistributedCache;
37 | import org.apache.hadoop.fs.FileStatus;
38 | import org.apache.hadoop.fs.FileSystem;
39 | import org.apache.hadoop.fs.Path;
40 | import org.apache.hadoop.fs.PathFilter;
41 | import org.apache.hadoop.io.NullWritable;
42 | import org.apache.hadoop.io.SequenceFile;
43 | import org.apache.hadoop.io.Text;
44 | import org.apache.hadoop.mapreduce.Job;
45 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
46 | import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
47 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
48 | import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
49 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
50 | import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
51 | import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
52 | import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
53 | import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary;
54 | import org.rdfhdt.hdt.dictionary.impl.FourSectionDictionary2;
55 | import org.rdfhdt.hdt.dictionary.impl.section.TransientDictionarySection;
56 | import org.rdfhdt.hdt.hdt.impl.TransientHDT;
57 | import org.rdfhdt.hdt.options.ControlInformation;
58 | import org.rdfhdt.hdt.trans.TransientElement;
59 | import org.rdfhdt.hdt.triples.impl.TransientBitMapTriples;
60 | import org.rdfhdt.listener.HDTBuilderListener;
61 | import org.rdfhdt.mrbuilder.dictionary.DictionaryCombiner;
62 | import org.rdfhdt.mrbuilder.dictionary.DictionaryMapper;
63 | import org.rdfhdt.mrbuilder.dictionary.DictionaryReducer;
64 | import org.rdfhdt.mrbuilder.dictionary.DictionarySamplerMapper;
65 | import org.rdfhdt.mrbuilder.dictionary.DictionarySamplerReducer;
66 | import org.rdfhdt.mrbuilder.io.TripleSPOComparator;
67 | import org.rdfhdt.mrbuilder.io.TripleSPOWritable;
68 | import org.rdfhdt.mrbuilder.triples.TriplesSPOMapper;
69 | import org.rdfhdt.mrbuilder.util.FileStatusComparator;
70 |
71 | import com.hadoop.mapreduce.LzoTextInputFormat;
72 |
73 | public class HDTBuilderDriver {
74 |
75 | public enum Counters {
76 | Triples, Subjects, Predicates, Objects, Shared, Sample
77 | }
78 |
79 | protected HDTBuilderConfiguration conf;
80 | protected HDTBuilderListener listener;
81 | protected FileSystem inputFS, dictionaryFS, triplesFS;
82 | protected Long numTriples = null, numShared = null, numSubjects = null, numPredicates = null, numObjects = null;
83 | protected FourSectionDictionary2 dictionary = null;
84 |
85 | public HDTBuilderDriver(String[] args) throws IOException {
86 |
87 | // load configuration
88 | this.conf = new HDTBuilderConfiguration(args);
89 |
90 | this.listener = new HDTBuilderListener(this.conf);
91 |
92 | // get the FileSystem instances for each path
93 | this.inputFS = this.conf.getInputPath().getFileSystem(this.conf.getConfigurationObject());
94 | this.dictionaryFS = this.conf.getDictionaryOutputPath().getFileSystem(this.conf.getConfigurationObject());
95 | this.triplesFS = this.conf.getTriplesOutputPath().getFileSystem(this.conf.getConfigurationObject());
96 |
97 | }
98 |
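	// Pipeline overview: every stage is optional and selected by the
	// configuration flags checked below: (1) sample dictionary terms,
	// (2) build the dictionary (two jobs with total-order partitioning, or a
	// single job when only one reducer is used), (3) materialize the
	// dictionary sections, (4) sample triples, (5) sort the ID-encoded
	// triples, and (6) assemble and save the final HDT file.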
99 | public static void main(String[] args) throws Exception {
100 | boolean ok = true;
101 | HDTBuilderDriver driver = new HDTBuilderDriver(args);
102 |
103 | if (ok && driver.conf.runDictionarySampling()) {
104 | if (driver.conf.getDictionaryReducers() == 1) {
105 | System.out.println("WARNING: Only one Reducer. Dictionary creation as a single job is more efficient.");
106 | }
107 | ok = driver.runDictionaryJobSampling();
108 | }
109 |
110 | if (ok && driver.conf.runDictionary()) {
111 | if (driver.conf.getDictionaryReducers() > 1) {
112 | ok = driver.runDictionaryJob();
113 | } else {
114 | ok = driver.runDictionaryJobWithOneJob();
115 | }
116 | }
117 |
118 | if (ok && driver.conf.buildDictionary()) {
119 | ok = driver.buildDictionary();
120 | }
121 |
122 | if (ok && driver.conf.runTriplesSampling()) {
123 | if (driver.conf.getTriplesReducers() == 1) {
124 | System.out.println("WARNING: Only one Reducer. Triples creation as a single job is more efficient.");
125 | }
126 | ok = driver.runTriplesJobSampling();
127 | }
128 |
129 | if (ok && driver.conf.runTriples()) {
130 | if (driver.conf.getTriplesReducers() > 1) {
131 | ok = driver.runTriplesJob();
132 | } else {
133 | ok = driver.runTriplesJobWithOneJob();
134 | }
135 | }
136 |
137 | if (ok && driver.conf.buildHDT()) {
138 | 			ok = driver.buildHDT();
139 | }
140 |
141 | System.exit(ok ? 0 : 1);
142 | }
143 |
144 | protected boolean runDictionaryJobSampling() throws IOException, ClassNotFoundException, InterruptedException {
145 | boolean jobOK;
146 | Job job = null;
147 |
148 | 		// if input path does not exist, fail
149 | if (!this.inputFS.exists(this.conf.getInputPath())) {
150 | System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath());
151 | System.exit(-1);
152 | }
153 |
154 | // if samples path exists...
155 | if (this.dictionaryFS.exists(this.conf.getDictionarySamplesPath())) {
156 | if (this.conf.getDeleteDictionarySamplesPath()) { // ... and option provided, delete recursively
157 | this.dictionaryFS.delete(this.conf.getDictionarySamplesPath(), true);
158 | } else { // ... and option not provided, fail
159 | 				System.out.println("Dictionary samples path already exists: " + this.conf.getDictionarySamplesPath());
160 | 				System.out.println("Select another path or use option -ds to overwrite");
161 | System.exit(-1);
162 | }
163 | }
164 |
165 | 		// Job to create a SequenceFile of terms tagged with their roles
166 | job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 1");
167 | job.setJarByClass(HDTBuilderDriver.class);
168 |
169 | System.out.println("input = " + this.conf.getInputPath());
170 | System.out.println("samples = " + this.conf.getDictionarySamplesPath());
171 |
172 | FileInputFormat.addInputPath(job, this.conf.getInputPath());
173 | FileOutputFormat.setOutputPath(job, this.conf.getDictionarySamplesPath());
174 |
175 | job.setInputFormatClass(LzoTextInputFormat.class);
176 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
177 |
178 | job.setMapperClass(DictionarySamplerMapper.class);
179 | job.setMapOutputKeyClass(Text.class);
180 | job.setMapOutputValueClass(Text.class);
181 | job.setCombinerClass(DictionarySamplerReducer.class);
182 | job.setReducerClass(DictionarySamplerReducer.class);
183 | job.setOutputKeyClass(Text.class);
184 | job.setOutputValueClass(Text.class);
185 |
186 | job.setNumReduceTasks(this.conf.getDictionarySampleReducers());
187 |
188 | SequenceFileOutputFormat.setCompressOutput(job, true);
189 | SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
190 | SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
191 |
192 | jobOK = job.waitForCompletion(true);
193 |
194 | return jobOK;
195 | }
196 |
197 | protected boolean runDictionaryJob() throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException {
198 | boolean jobOK;
199 | Job job = null;
200 | BufferedWriter bufferedWriter;
201 |
202 | // if output path exists...
203 | if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
204 | if (this.conf.getDeleteDictionaryOutputPath()) { // ... and option provided, delete recursively
205 | this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true);
206 | } else { // ... and option not provided, fail
207 | 				System.out.println("Dictionary output path already exists: " + this.conf.getDictionaryOutputPath());
208 | 				System.out.println("Select another path or use option -dd to overwrite");
209 | System.exit(-1);
210 | }
211 | }
212 |
213 | 		// Sample the SequenceFile input to run a total-order sort and create the final output
214 | job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 2");
215 |
216 | job.setJarByClass(HDTBuilderDriver.class);
217 |
218 | System.out.println("samples = " + this.conf.getDictionarySamplesPath());
219 | System.out.println("output = " + this.conf.getDictionaryOutputPath());
220 |
221 | FileInputFormat.addInputPath(job, this.conf.getDictionarySamplesPath());
222 | FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath());
223 |
224 | job.setInputFormatClass(SequenceFileInputFormat.class);
225 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
226 |
227 | // Identity Mapper
228 | // job.setMapperClass(Mapper.class);
229 | job.setCombinerClass(DictionaryCombiner.class);
230 | job.setPartitionerClass(TotalOrderPartitioner.class);
231 | job.setReducerClass(DictionaryReducer.class);
232 |
233 | job.setNumReduceTasks(this.conf.getDictionaryReducers());
234 |
235 | job.setMapOutputKeyClass(Text.class);
236 | job.setMapOutputValueClass(Text.class);
237 |
238 | job.setOutputKeyClass(Text.class);
239 | job.setOutputValueClass(NullWritable.class);
240 |
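		// Sample the input to write a partition file and publish it through the
		// DistributedCache, so that TotalOrderPartitioner routes keys to
		// reducers in globally sorted order.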
241 | System.out.println("Sampling started");
242 | InputSampler.writePartitionFile(job, new InputSampler.IntervalSampler(this.conf.getSampleProbability()));
243 | String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
244 | URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
245 | DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
246 | DistributedCache.createSymlink(job.getConfiguration());
247 | System.out.println("Sampling finished");
248 |
249 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class, Text.class, NullWritable.class);
250 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class, Text.class, NullWritable.class);
251 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class, Text.class, NullWritable.class);
252 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class, Text.class, NullWritable.class);
253 |
254 | SequenceFileOutputFormat.setCompressOutput(job, true);
255 | SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
256 | SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
257 |
258 | jobOK = job.waitForCompletion(true);
259 |
260 | this.numShared = job.getCounters().findCounter(Counters.Shared).getValue();
261 | this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue();
262 | this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue();
263 | this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue();
264 |
265 | bufferedWriter = new BufferedWriter(new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile())));
266 |
267 | bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n");
268 | bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n");
269 | bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n");
270 | bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n");
271 |
272 | bufferedWriter.close();
273 |
274 | return jobOK;
275 | }
276 |
277 | protected boolean runDictionaryJobWithOneJob() throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException {
278 | boolean jobOK;
279 | Job job = null;
280 | BufferedWriter bufferedWriter;
281 |
282 | 		// if input path does not exist, fail
283 | if (!this.inputFS.exists(this.conf.getInputPath())) {
284 | System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath());
285 | System.exit(-1);
286 | }
287 |
288 | // if output path exists...
289 | if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
290 | if (this.conf.getDeleteDictionaryOutputPath()) { // ... and option provided, delete recursively
291 | this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true);
292 | } else { // ... and option not provided, fail
293 | 				System.out.println("Dictionary output path already exists: " + this.conf.getDictionaryOutputPath());
294 | 				System.out.println("Select another path or use option -dd to overwrite");
295 | System.exit(-1);
296 | }
297 | }
298 |
299 | // Launch job
300 | 		job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName());
301 | job.setJarByClass(HDTBuilderDriver.class);
302 |
303 | FileInputFormat.addInputPath(job, this.conf.getInputPath());
304 | FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath());
305 |
306 | job.setInputFormatClass(LzoTextInputFormat.class);
307 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
308 |
309 | job.setMapperClass(DictionaryMapper.class);
310 | job.setCombinerClass(DictionaryCombiner.class);
311 | job.setReducerClass(DictionaryReducer.class);
312 |
313 | job.setNumReduceTasks(this.conf.getDictionaryReducers());
314 |
315 | job.setMapOutputKeyClass(Text.class);
316 | job.setMapOutputValueClass(Text.class);
317 |
318 | job.setOutputKeyClass(Text.class);
319 | job.setOutputValueClass(NullWritable.class);
320 |
321 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class, Text.class, NullWritable.class);
322 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class, Text.class, NullWritable.class);
323 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class, Text.class, NullWritable.class);
324 | MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class, Text.class, NullWritable.class);
325 |
326 | jobOK = job.waitForCompletion(true);
327 |
328 | this.numShared = job.getCounters().findCounter(Counters.Shared).getValue();
329 | this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue();
330 | this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue();
331 | this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue();
332 |
333 | bufferedWriter = new BufferedWriter(new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile())));
334 |
335 | bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n");
336 | bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n");
337 | bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n");
338 | bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n");
339 |
340 | bufferedWriter.close();
341 |
342 | return jobOK;
343 | }
344 |
345 | protected boolean buildDictionary() throws IOException {
346 | FourSectionDictionary dictionary4mappers, dictionary4reducers;
347 |
348 | 		// if the dictionary job was not run, read the counters from file
349 | if (!this.conf.runDictionary()) {
350 |
351 | 			System.out.println("Dictionary job not run. Reading data from file.");
352 |
353 | BufferedReader reader = new BufferedReader(new InputStreamReader(this.dictionaryFS.open(this.conf.getDictionaryCountersFile())));
354 | String line = reader.readLine();
355 | while (line != null) {
356 | String[] data = line.split("=");
357 | switch (data[0]) {
358 | case HDTBuilderConfiguration.SHARED:
359 | this.numShared = Long.parseLong(data[1]);
360 | break;
361 | case HDTBuilderConfiguration.SUBJECTS:
362 | this.numSubjects = Long.parseLong(data[1]);
363 | break;
364 | case HDTBuilderConfiguration.PREDICATES:
365 | this.numPredicates = Long.parseLong(data[1]);
366 | break;
367 | case HDTBuilderConfiguration.OBJECTS:
368 | this.numObjects = Long.parseLong(data[1]);
369 | }
370 | line = reader.readLine();
371 | }
372 | reader.close();
373 | }
374 |
375 | TransientDictionarySection shared = new TransientDictionarySection(this.conf.getSpec());
376 | TransientDictionarySection subjects = new TransientDictionarySection(this.conf.getSpec());
377 | TransientDictionarySection predicates = new TransientDictionarySection(this.conf.getSpec());
378 | TransientDictionarySection objects = new TransientDictionarySection(this.conf.getSpec());
379 |
380 |
381 |
382 | if (this.dictionaryFS.exists(this.conf.getSharedSectionPath())) {
383 | System.out.println("Shared section = " + this.conf.getSharedSectionPath());
384 | this.loadFromDir(shared, this.numShared, this.dictionaryFS, this.conf.getSharedSectionPath());
385 | }
386 |
387 | this.loadFromDir(subjects, this.numSubjects, this.dictionaryFS, this.conf.getSubjectsSectionPath());
388 | this.loadFromDir(predicates, this.numPredicates, this.dictionaryFS, this.conf.getPredicatesSectionPath());
389 | this.loadFromDir(objects, this.numObjects, this.dictionaryFS, this.conf.getObjectsSectionPath());
390 |
391 | System.out.println("Saving dictionary...");
392 | this.dictionary = new FourSectionDictionary2(this.conf.getSpec(), subjects, predicates, objects, shared);
393 | this.saveDictionary(this.dictionary, this.dictionaryFS, this.conf.getDictionaryFile());
394 |
395 | return true;
396 |
397 | }
398 |
399 | protected boolean runTriplesJobSampling() throws ClassNotFoundException, IOException, InterruptedException {
400 | Job job = null;
401 | boolean jobOK;
402 | BufferedWriter bufferedWriter;
403 |
404 | 		// if input path does not exist, fail
405 | 		if (!this.inputFS.exists(this.conf.getInputPath())) {
406 | 			System.out.println("Input path does not exist: " + this.conf.getInputPath());
407 | System.exit(-1);
408 | }
409 |
410 | 		// if dictionary output path does not exist, fail
411 | 		if (!this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
412 | 			System.out.println("Dictionary output path does not exist: " + this.conf.getDictionaryOutputPath());
413 | System.exit(-1);
414 | }
415 |
416 | 		// if samples path exists...
417 | if (this.dictionaryFS.exists(this.conf.getTriplesSamplesPath())) {
418 | 			if (this.conf.getDeleteTriplesSamplesPath()) { // ... and option provided, delete recursively
419 | 				this.dictionaryFS.delete(this.conf.getTriplesSamplesPath(), true);
422 | } else { // ... and option not provided, fail
423 | 				System.out.println("Triples samples path already exists: " + this.conf.getTriplesSamplesPath());
424 | 				System.out.println("Select another path or use option -dst to overwrite");
425 | System.exit(-1);
426 | }
427 | }
428 |
429 | this.conf.setProperty("mapred.child.java.opts", "-XX:ErrorFile=/home/hadoop/tmp/hs_err_pid%p.log -Xmx2500m");
430 |
431 | 		// Job to create a SequenceFile of ID-encoded triples
432 | job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 1");
433 |
434 | job.setJarByClass(HDTBuilderDriver.class);
435 |
436 | FileInputFormat.addInputPath(job, this.conf.getInputPath());
437 | FileOutputFormat.setOutputPath(job, this.conf.getTriplesSamplesPath());
438 |
439 | job.setInputFormatClass(LzoTextInputFormat.class);
440 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
441 |
442 | job.setMapperClass(TriplesSPOMapper.class);
443 | job.setSortComparatorClass(TripleSPOComparator.class);
444 | job.setGroupingComparatorClass(TripleSPOComparator.class);
445 | job.setMapOutputKeyClass(TripleSPOWritable.class);
446 | job.setMapOutputValueClass(NullWritable.class);
447 | job.setOutputKeyClass(TripleSPOWritable.class);
448 | job.setOutputValueClass(NullWritable.class);
449 |
450 | job.setNumReduceTasks(this.conf.getTriplesReducers());
451 |
452 | DistributedCache.addCacheFile(this.conf.getDictionaryFile().toUri(), job.getConfiguration());
453 |
454 | SequenceFileOutputFormat.setCompressOutput(job, true);
455 | SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
456 | SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
457 |
458 | jobOK = job.waitForCompletion(true);
459 |
460 | this.numTriples = job.getCounters().findCounter(Counters.Triples).getValue();
461 | bufferedWriter = new BufferedWriter(new OutputStreamWriter(this.triplesFS.create(this.conf.getTriplesCountersFile())));
462 | bufferedWriter.write(this.numTriples.toString() + "\n");
463 | bufferedWriter.close();
464 |
465 | return jobOK;
466 | }
467 |
468 | protected boolean runTriplesJob() throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
469 | Job job = null;
470 | boolean jobOK;
471 |
472 | // if triples output path exists...
473 | if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) {
474 | if (this.conf.getDeleteTriplesOutputPath()) { // ... and option provided, delete recursively
475 | this.triplesFS.delete(this.conf.getTriplesOutputPath(), true);
476 | } else { // ... and option not provided, fail
477 | 				System.out.println("Triples output path already exists: " + this.conf.getTriplesOutputPath());
478 | 				System.out.println("Select another path or use option -dt to overwrite");
479 | System.exit(-1);
480 | }
481 | }
482 |
483 | job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 2");
484 |
485 | job.setJarByClass(HDTBuilderDriver.class);
486 |
487 | FileInputFormat.addInputPath(job, this.conf.getTriplesSamplesPath());
488 | FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath());
489 |
490 | job.setInputFormatClass(SequenceFileInputFormat.class);
491 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
492 |
493 | job.setSortComparatorClass(TripleSPOComparator.class);
494 | job.setGroupingComparatorClass(TripleSPOComparator.class);
495 |
496 | job.setPartitionerClass(TotalOrderPartitioner.class);
497 |
498 | job.setOutputKeyClass(TripleSPOWritable.class);
499 | job.setOutputValueClass(NullWritable.class);
500 |
501 | job.setNumReduceTasks(this.conf.getTriplesReducers());
502 |
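		// Same total-order sampling as in the dictionary job, this time over
		// the ID-encoded triples produced in phase 1.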
503 | System.out.println("Sampling started");
504 | InputSampler.writePartitionFile(job, new InputSampler.IntervalSampler(this.conf.getSampleProbability()));
505 | String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
506 | URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
507 | DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
508 | DistributedCache.createSymlink(job.getConfiguration());
509 | System.out.println("Sampling finished");
510 |
511 | SequenceFileOutputFormat.setCompressOutput(job, true);
512 | SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
513 | SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
514 |
515 | jobOK = job.waitForCompletion(true);
516 |
517 | return jobOK;
518 | }
519 |
520 | protected boolean runTriplesJobWithOneJob() throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
521 | Job job = null;
522 | boolean jobOK;
523 | BufferedWriter bufferedWriter;
524 |
525 | 		// if input path does not exist, fail
526 | 		if (!this.inputFS.exists(this.conf.getInputPath())) {
527 | 			System.out.println("Input path does not exist: " + this.conf.getInputPath());
528 | System.exit(-1);
529 | }
530 |
531 | 		// if dictionary output path does not exist, fail
532 | 		if (!this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
533 | 			System.out.println("Dictionary output path does not exist: " + this.conf.getDictionaryOutputPath());
534 | System.exit(-1);
535 | }
536 |
537 | // if triples output path exists...
538 | if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) {
539 | if (this.conf.getDeleteTriplesOutputPath()) { // ... and option provided, delete recursively
540 | this.triplesFS.delete(this.conf.getTriplesOutputPath(), true);
541 | } else { // ... and option not provided, fail
542 | 				System.out.println("Triples output path already exists: " + this.conf.getTriplesOutputPath());
543 | 				System.out.println("Select another path or use option -dt to overwrite");
544 | System.exit(-1);
545 | }
546 | }
547 |
548 | // Launch job
549 | this.conf.setProperty("mapred.child.java.opts", "-XX:ErrorFile=/home/hadoop/tmp/hs_err_pid%p.log -Xmx2500m");
550 |
551 | 		job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName());
552 | job.setJarByClass(HDTBuilderDriver.class);
553 |
554 | FileInputFormat.addInputPath(job, this.conf.getInputPath());
555 | FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath());
556 |
557 | job.setInputFormatClass(LzoTextInputFormat.class);
558 | LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
559 |
560 | job.setMapperClass(TriplesSPOMapper.class);
561 | job.setSortComparatorClass(TripleSPOComparator.class);
562 | job.setMapOutputKeyClass(TripleSPOWritable.class);
563 | job.setMapOutputValueClass(NullWritable.class);
564 |
565 | job.setNumReduceTasks(this.conf.getTriplesReducers());
566 |
567 | job.setOutputKeyClass(TripleSPOWritable.class);
568 | job.setOutputValueClass(NullWritable.class);
569 |
570 | DistributedCache.addCacheFile(this.conf.getDictionaryFile().toUri(), job.getConfiguration());
571 | // DistributedCache.addCacheFile(this.conf.getDictionaryMapFile().toUri(), job.getConfiguration());
572 | // DistributedCache.addCacheFile(this.conf.getDictionaryReduceFile().toUri(), job.getConfiguration());
573 |
574 | jobOK = job.waitForCompletion(true);
575 |
576 | this.numTriples = job.getCounters().findCounter(Counters.Triples).getValue();
577 | bufferedWriter = new BufferedWriter(new OutputStreamWriter(this.triplesFS.create(this.conf.getTriplesCountersFile())));
578 | bufferedWriter.write(this.numTriples.toString() + "\n");
579 | bufferedWriter.close();
580 |
581 | return jobOK;
582 | }
583 |
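	// Assembles the final HDT file: loads the dictionary and the section
	// counters if the corresponding jobs were not run in this invocation,
	// streams the sorted ID-triples into a bitmap triples structure, fills in
	// the header, and serializes the result.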
584 | 	protected boolean buildHDT() throws IOException {
585 | BufferedOutputStream output;
586 | TransientHDT hdt = new TransientHDT(this.conf.getSpec());
587 | TransientBitMapTriples triples = new TransientBitMapTriples(this.conf.getSpec(), this.triplesFS, new Path("temp"));
588 |
589 | // if dictionary not built, load it
590 | if (this.dictionary == null) {
591 | System.out.println("Dictionary not built. Reading data from " + this.conf.getDictionaryFile());
592 | this.dictionary = this.loadDictionary(this.dictionaryFS, this.conf.getDictionaryFile());
593 | }
594 |
595 | 		// if the counters were not loaded, read them from file
596 | if (!this.conf.runDictionary()) {
597 |
598 | 			System.out.println("Dictionary job not run. Reading data from file.");
599 |
600 | BufferedReader reader = new BufferedReader(new InputStreamReader(this.dictionaryFS.open(this.conf.getDictionaryCountersFile())));
601 | String line = reader.readLine();
602 | while (line != null) {
603 | String[] data = line.split("=");
604 | switch (data[0]) {
605 | case HDTBuilderConfiguration.SHARED:
606 | this.numShared = Long.parseLong(data[1]);
607 | break;
608 | case HDTBuilderConfiguration.SUBJECTS:
609 | this.numSubjects = Long.parseLong(data[1]);
610 | break;
611 | case HDTBuilderConfiguration.PREDICATES:
612 | this.numPredicates = Long.parseLong(data[1]);
613 | break;
614 | case HDTBuilderConfiguration.OBJECTS:
615 | this.numObjects = Long.parseLong(data[1]);
616 | }
617 | line = reader.readLine();
618 | }
619 | reader.close();
620 | }
621 |
622 | 		// if the triples job was not run, read the counters from file
623 | 		if (!this.conf.runTriples()) {
624 | 			System.out.println("Triples job not run. Reading data from " + this.conf.getTriplesCountersFile());
625 | 			BufferedReader reader = new BufferedReader(new InputStreamReader(this.triplesFS.open(this.conf.getTriplesCountersFile())));
626 | this.numTriples = Long.parseLong(reader.readLine());
627 | reader.close();
628 | }
629 |
630 | this.loadFromDir(triples, this.numTriples, this.numPredicates, (this.numShared + this.numObjects), this.triplesFS, this.conf.getTriplesOutputPath());
631 |
632 | hdt.setDictionary(this.dictionary);
633 | hdt.setTriples(triples);
634 | hdt.populateHeaderStructure(this.conf.getBaseURI());
635 |
636 | output = new BufferedOutputStream(this.triplesFS.create(this.conf.getHDTFile()));
637 | hdt.saveToHDT(output, this.listener);
638 | output.close();
639 |
640 | return true;
641 | }
642 |
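	// Loads a dictionary section from all part files in a directory. The "_"
	// filter skips Hadoop bookkeeping files (_SUCCESS, _logs); part files are
	// sorted by name so entries are appended in reducer order.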
643 | protected void loadFromDir(TransientElement part, long numentries, FileSystem fs, Path path) throws IOException {
644 | PathFilter filter = new PathFilter() {
645 | @Override
646 | public boolean accept(Path path) {
647 | return !path.getName().startsWith("_");
648 | }
649 | };
650 | FileStatus[] status = fs.listStatus(path, filter);
651 |
652 | if (status.length == 0) {
653 | System.out.println("Path [" + path + "] has no files. Initializing section.");
654 | part.initialize(0);
655 | } else {
656 | Arrays.sort(status, new FileStatusComparator());
657 |
658 | System.out.println("Initializing section " + path);
659 | part.initialize(numentries);
660 | for (FileStatus file : status) {
661 | System.out.println("Reading file [" + file.getPath() + "]");
662 | SequenceFile.Reader reader = new SequenceFile.Reader(fs, file.getPath(), this.conf.getConfigurationObject());
663 | part.load(reader, this.listener);
664 | reader.close();
665 | }
666 | System.out.println("Closing section " + path);
667 | part.close();
668 | }
669 | }
670 |
671 | protected void loadFromDir(TransientBitMapTriples part, long numentries, long maxpredicate, long maxobject, FileSystem fs, Path path) throws IOException {
672 | PathFilter filter = new PathFilter() {
673 | @Override
674 | public boolean accept(Path path) {
675 | return !path.getName().startsWith("_");
676 | }
677 | };
678 | FileStatus[] status = fs.listStatus(path, filter);
679 |
680 | if (status.length == 0) {
681 | System.out.println("Path [" + path + "] has no files. Initializing section.");
682 | part.initialize(0, 0);
683 | } else {
684 | Arrays.sort(status, new FileStatusComparator());
685 |
686 | System.out.println("Initializing section " + path);
687 | part.initialize(numentries, maxpredicate, maxobject);
688 | for (FileStatus file : status) {
689 | System.out.println("Reading file [" + file.getPath() + "]");
690 | SequenceFile.Reader reader = new SequenceFile.Reader(fs, file.getPath(), this.conf.getConfigurationObject());
691 | part.load(reader, this.listener);
692 | reader.close();
693 | }
694 | System.out.println("Closing section " + path);
695 | part.close();
696 | }
697 | }
698 |
699 | protected FourSectionDictionary2 loadDictionary(FileSystem fs, Path dictionaryPath) throws IOException {
700 | BufferedInputStream input = new BufferedInputStream(fs.open(dictionaryPath));
701 | FourSectionDictionary2 dictionary = new FourSectionDictionary2(this.conf.getSpec());
702 | ControlInformation ci = new ControlInformation();
703 | ci.clear();
704 | ci.load(input);
705 | dictionary.load(input, ci, this.listener);
706 | return dictionary;
707 | }
708 |
709 | protected void saveDictionary(FourSectionDictionary2 dictionary, FileSystem fs, Path dictionaryPath) throws IOException {
710 | BufferedOutputStream output = new BufferedOutputStream(fs.create(dictionaryPath));
711 | dictionary.save(output, new ControlInformation(), this.listener);
712 | output.close();
713 | }
714 |
715 | }
716 |
--------------------------------------------------------------------------------
/src/org/rdfhdt/mrbuilder/dictionary/DictionaryCombiner.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
3 | *
4 | * This library is free software; you can redistribute it and/or
5 | * modify it under the terms of the GNU Lesser General Public
6 | * License as published by the Free Software Foundation; either
7 | * version 2.1 of the License, or (at your option) any later version.
8 | *
9 | * This library is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * Lesser General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU Lesser General Public
15 | * License along with this library; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 | *
18 | * Contacting the authors:
19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at
21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es
22 | */
23 | package org.rdfhdt.mrbuilder.dictionary;
24 |
25 | import java.io.IOException;
26 |
27 | import org.apache.hadoop.io.Text;
28 | import org.apache.hadoop.mapreduce.Reducer;
29 |
30 | public class DictionaryCombiner extends Reducer<Text, Text, Text, Text> {
31 |
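	// Combiner: merges the role tags ("S", "P", "O") emitted for a term on the
	// map side, so that a single combined tag string is shuffled per term and
	// split instead of one tag per occurrence.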
32 | @Override
33 | 	protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
34 | boolean isSubject = false, isPredicate = false, isObject = false;
35 | 		String output = "";
36 |
37 | for (Text value : values) {
38 | if (value.toString().contains("S"))
39 | isSubject = true;
40 | if (value.toString().contains("P"))
41 | isPredicate = true;
42 | if (value.toString().contains("O"))
43 | isObject = true;
44 | }
45 |
46 | if (isSubject)
47 | output = output.concat("S");
48 | if (isPredicate)
49 | output = output.concat("P");
50 | if (isObject)
51 | output = output.concat("O");
52 |
53 | context.write(key, new Text(output));
54 |
55 | // if (key.toString().toString().contains("Forest Green is an unincorporated community in southeastern Chariton County"))
56 | // System.out.println("Combiner: " + key.toString());
57 | }
58 | }
59 |
--------------------------------------------------------------------------------
/src/org/rdfhdt/mrbuilder/dictionary/DictionaryMapper.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
3 | *
4 | * This library is free software; you can redistribute it and/or
5 | * modify it under the terms of the GNU Lesser General Public
6 | * License as published by the Free Software Foundation; either
7 | * version 2.1 of the License, or (at your option) any later version.
8 | *
9 | * This library is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * Lesser General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU Lesser General Public
15 | * License along with this library; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 | *
18 | * Contacting the authors:
19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at
21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es
22 | */
23 | package org.rdfhdt.mrbuilder.dictionary;
24 |
25 | import java.io.IOException;
26 |
27 | import org.apache.hadoop.io.LongWritable;
28 | import org.apache.hadoop.io.Text;
29 | import org.apache.hadoop.mapreduce.Mapper;
30 | import org.rdfhdt.hdt.exceptions.ParserException;
31 | import org.rdfhdt.hdt.triples.TripleString;
32 |
33 | public class DictionaryMapper extends Mapper<LongWritable, Text, Text, Text> {
34 |
35 | @Override
36 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
37 |
38 | TripleString triple = new TripleString();
39 | try {
40 | triple.read(value.toString());
41 | 		} catch (ParserException e) {
42 | 			e.printStackTrace();
43 | 			return; // skip lines that cannot be parsed as a triple
44 | 		}
45 |
46 | context.write(new Text(triple.getSubject().toString()), new Text("S"));
47 | context.write(new Text(triple.getPredicate().toString()), new Text("P"));
48 | context.write(new Text(triple.getObject().toString()), new Text("O"));
49 |
50 | // if (triple.getObject().toString().toString().contains("Forest Green is an unincorporated community in southeastern Chariton County"))
51 | // System.out.println("Mapper: " + triple.getObject().toString());
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/src/org/rdfhdt/mrbuilder/dictionary/DictionaryReducer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
3 | *
4 | * This library is free software; you can redistribute it and/or
5 | * modify it under the terms of the GNU Lesser General Public
6 | * License as published by the Free Software Foundation; either
7 | * version 2.1 of the License, or (at your option) any later version.
8 | *
9 | * This library is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * Lesser General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU Lesser General Public
15 | * License along with this library; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 | *
18 | * Contacting the authors:
19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at
21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es
22 | */
23 | package org.rdfhdt.mrbuilder.dictionary;
24 |
25 | import java.io.IOException;
26 |
27 | import org.apache.hadoop.io.NullWritable;
28 | import org.apache.hadoop.io.Text;
29 | import org.apache.hadoop.mapreduce.Reducer;
30 | import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
31 | import org.rdfhdt.mrbuilder.HDTBuilderConfiguration;
32 | import org.rdfhdt.mrbuilder.HDTBuilderDriver.Counters;
33 |
34 | public class DictionaryReducer extends Reducer<Text, Text, Text, NullWritable> {
35 |
36 | 	protected MultipleOutputs<Text, NullWritable> output;
37 |
38 | @Override
39 | protected void setup(Context context) throws IOException, InterruptedException {
40 | 		this.output = new MultipleOutputs<Text, NullWritable>(context);
41 | super.setup(context);
42 | }
43 |
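	// Routes each term to its HDT dictionary section: a term used as both
	// subject and object goes to the shared section, otherwise to its own
	// section; counters record the size of each section for later loading.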
44 | @Override
45 | 	protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
46 | boolean isSubject = false, isPredicate = false, isObject = false;
47 |
48 | //key = new Text(UnicodeEscape.escapeString(key.toString()));
49 |
50 | for (Text value : values) {
51 | if (value.toString().contains("S"))
52 | isSubject = true;
53 | if (value.toString().contains("P"))
54 | isPredicate = true;
55 | if (value.toString().contains("O"))
56 | isObject = true;
57 | }
58 |
59 | if (isSubject && isObject) {
60 | this.output.write(HDTBuilderConfiguration.SHARED, key, NullWritable.get(), HDTBuilderConfiguration.SHARED_OUTPUT_PATH);
61 | context.getCounter(Counters.Shared).increment(1);
62 | } else {
63 | if (isSubject) {
64 | this.output.write(HDTBuilderConfiguration.SUBJECTS, key, NullWritable.get(), HDTBuilderConfiguration.SUBJECTS_OUTPUT_PATH);
65 | context.getCounter(Counters.Subjects).increment(1);
66 | }
67 | if (isObject) {
68 | this.output.write(HDTBuilderConfiguration.OBJECTS, key, NullWritable.get(), HDTBuilderConfiguration.OBJECTS_OUTPUT_PATH);
69 | context.getCounter(Counters.Objects).increment(1);
70 | }
71 | }
72 | if (isPredicate) {
73 | this.output.write(HDTBuilderConfiguration.PREDICATES, key, NullWritable.get(), HDTBuilderConfiguration.PREDICATES_OUTPUT_PATH);
74 | context.getCounter(Counters.Predicates).increment(1);
75 | }
76 |
77 | // if (key.toString().contains("Forest Green is an unincorporated community in southeastern Chariton County"))
78 | // System.out.println("Reducer: " + key.toString());
79 | }
80 |
81 | @Override
82 | protected void cleanup(Context context) throws IOException, InterruptedException {
83 | this.output.close();
84 | super.cleanup(context);
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/src/org/rdfhdt/mrbuilder/dictionary/DictionarySamplerMapper.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
3 | *
4 | * This library is free software; you can redistribute it and/or
5 | * modify it under the terms of the GNU Lesser General Public
6 | * License as published by the Free Software Foundation; either
7 | * version 2.1 of the License, or (at your option) any later version.
8 | *
9 | * This library is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * Lesser General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU Lesser General Public
15 | * License along with this library; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 | *
18 | * Contacting the authors:
19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at
21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es
22 | */
23 | package org.rdfhdt.mrbuilder.dictionary;
24 |
25 | import java.io.IOException;
26 |
27 | import org.apache.hadoop.io.LongWritable;
28 | import org.apache.hadoop.io.Text;
29 | import org.apache.hadoop.mapreduce.Mapper;
30 | import org.rdfhdt.hdt.exceptions.ParserException;
31 | import org.rdfhdt.hdt.triples.TripleString;
32 |
33 | public class DictionarySamplerMapper extends Mapper<LongWritable, Text, Text, Text> {
34 |
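	// Emits every term of each parsed triple tagged with its role ("S", "P"
	// or "O"); phase 1 of dictionary construction pre-aggregates these pairs
	// before the total-order sort of phase 2.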
35 | @Override
36 | protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
37 |
38 | TripleString triple = new TripleString();
39 | try {
40 | triple.read(value.toString());
41 | 		} catch (ParserException e) {
42 | 			e.printStackTrace();
43 | 			return; // skip lines that cannot be parsed as a triple
44 | 		}
45 |
46 | context.write(new Text(triple.getSubject().toString()), new Text("S"));
47 | context.write(new Text(triple.getPredicate().toString()), new Text("P"));
48 | context.write(new Text(triple.getObject().toString()), new Text("O"));
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/src/org/rdfhdt/mrbuilder/dictionary/DictionarySamplerReducer.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
3 | *
4 | * This library is free software; you can redistribute it and/or
5 | * modify it under the terms of the GNU Lesser General Public
6 | * License as published by the Free Software Foundation; either
7 | * version 2.1 of the License, or (at your option) any later version.
8 | *
9 | * This library is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * Lesser General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU Lesser General Public
15 | * License along with this library; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 | *
18 | * Contacting the authors:
19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at
21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es
22 | */
23 | package org.rdfhdt.mrbuilder.dictionary;
24 |
25 | import java.io.IOException;
26 |
27 | import org.apache.hadoop.io.Text;
28 | import org.apache.hadoop.mapreduce.Reducer;
29 |
30 | public class DictionarySamplerReducer extends Reducer<Text, Text, Text, Text> {
31 |
32 | @Override
33 | 	protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
34 | boolean isSubject = false, isPredicate = false, isObject = false;
35 | String outputValue = "";
36 |
37 | for (Text value : values) {
38 | if (value.toString().contains("S"))
39 | isSubject = true;
40 | if (value.toString().contains("P"))
41 | isPredicate = true;
42 | if (value.toString().contains("O"))
43 | isObject = true;
44 | }
45 |
46 | if (isSubject)
47 | outputValue = outputValue.concat("S");
48 | if (isPredicate)
49 | outputValue = outputValue.concat("P");
50 | if (isObject)
51 | outputValue = outputValue.concat("O");
52 |
53 | context.write(key, new Text(outputValue));
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/org/rdfhdt/mrbuilder/io/TripleSPOComparator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
3 | *
4 | * This library is free software; you can redistribute it and/or
5 | * modify it under the terms of the GNU Lesser General Public
6 | * License as published by the Free Software Foundation; either
7 | * version 2.1 of the License, or (at your option) any later version.
8 | *
9 | * This library is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * Lesser General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU Lesser General Public
15 | * License along with this library; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 | *
18 | * Contacting the authors:
19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at
21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es
22 | */
23 | package org.rdfhdt.mrbuilder.io;
24 |
25 |
26 | public class TripleSPOComparator extends TripleComparator {
27 |
28 |
29 | public TripleSPOComparator() {
30 | super(TripleSPOWritable.class, true);
31 | }
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/src/org/rdfhdt/mrbuilder/io/TripleSPOWritable.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
3 | *
4 | * This library is free software; you can redistribute it and/or
5 | * modify it under the terms of the GNU Lesser General Public
6 | * License as published by the Free Software Foundation; either
7 | * version 2.1 of the License, or (at your option) any later version.
8 | *
9 | * This library is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * Lesser General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU Lesser General Public
15 | * License along with this library; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 | *
18 | * Contacting the authors:
19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at
21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es
22 | */
23 | package org.rdfhdt.mrbuilder.io;
24 |
25 | import org.apache.hadoop.io.LongWritable;
26 |
27 | public class TripleSPOWritable extends TripleWritable {
28 |
29 | 	/**
30 | 	 * Creates a triple writable with subject-predicate-object component order.
31 | 	 */
32 | public TripleSPOWritable() {
33 | super(new LongWritable(), new LongWritable(), new LongWritable());
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/org/rdfhdt/mrbuilder/triples/TriplesSPOMapper.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
3 | *
4 | * This library is free software; you can redistribute it and/or
5 | * modify it under the terms of the GNU Lesser General Public
6 | * License as published by the Free Software Foundation; either
7 | * version 2.1 of the License, or (at your option) any later version.
8 | *
9 | * This library is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * Lesser General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU Lesser General Public
15 | * License along with this library; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 | *
18 | * Contacting the authors:
19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at
21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es
22 | */
23 | package org.rdfhdt.mrbuilder.triples;
24 |
25 | import org.apache.hadoop.io.LongWritable;
26 | import org.apache.hadoop.io.NullWritable;
27 | import org.rdfhdt.hdt.enums.TripleComponentRole;
28 | import org.rdfhdt.hdt.triples.TripleString;
29 | import org.rdfhdt.mrbuilder.io.TripleSPOWritable;
30 |
31 | public class TriplesSPOMapper extends TriplesMapper {
32 |
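	// Translates each term of a parsed triple into its numeric dictionary ID;
	// stringToId returns -1 for a term missing from the loaded dictionary,
	// which is treated as a fatal error.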
33 | /*
34 | * (non-Javadoc)
35 | *
36 | * @see org.rdfhdt.mrbuilder.triples.TriplesMapper#key(org.rdfhdt.hdt.triples.TripleString)
37 | */
38 | @Override
39 | protected TripleSPOWritable key(TripleString tripleString) throws InterruptedException {
40 | long subject, predicate, object;
41 |
42 | if ((subject = this.dictionary.stringToId(tripleString.getSubject(), TripleComponentRole.SUBJECT)) == -1) {
43 | 			System.out.println("Subject not found");
44 | System.out.println("Subject [" + tripleString.getSubject() + "]");
45 | System.out.println("Predicate [" + tripleString.getPredicate() + "]");
46 | System.out.println("Object [" + tripleString.getObject() + "]");
47 | throw new InterruptedException("Dictionary not loaded correctly");
48 | }
49 | if ((predicate = this.dictionary.stringToId(tripleString.getPredicate(), TripleComponentRole.PREDICATE)) == -1)
50 | {
51 | 			System.out.println("Predicate not found");
52 | System.out.println("Subject [" + tripleString.getSubject() + "]");
53 | System.out.println("Predicate [" + tripleString.getPredicate() + "]");
54 | System.out.println("Object [" + tripleString.getObject() + "]");
55 | throw new InterruptedException("Dictionary not loaded correctly");
56 | }
57 | if ((object = this.dictionary.stringToId(tripleString.getObject(), TripleComponentRole.OBJECT)) == -1)
58 | {
59 | 			System.out.println("Object not found");
60 | System.out.println("Subject [" + tripleString.getSubject() + "]");
61 | System.out.println("Predicate [" + tripleString.getPredicate() + "]");
62 | System.out.println("Object [" + tripleString.getObject() + "]");
63 | throw new InterruptedException("Dictionary not loaded correctly");
64 | }
65 |
66 | TripleSPOWritable tripleIDs = new TripleSPOWritable();
67 | tripleIDs.setSubject(new LongWritable(subject));
68 | tripleIDs.setPredicate(new LongWritable(predicate));
69 | tripleIDs.setObject(new LongWritable(object));
70 | return tripleIDs;
71 | }
72 |
73 | /*
74 | * (non-Javadoc)
75 | *
76 | * @see org.rdfhdt.mrbuilder.triples.TriplesMapper#value(org.rdfhdt.hdt.triples.TripleString)
77 | */
78 | @Override
79 | protected NullWritable value(TripleString tripleString) {
80 | return NullWritable.get();
81 | }
82 |
83 | }
84 |
--------------------------------------------------------------------------------
/src/org/rdfhdt/mrbuilder/util/FileStatusComparator.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Author: Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
3 | *
4 | * This library is free software; you can redistribute it and/or
5 | * modify it under the terms of the GNU Lesser General Public
6 | * License as published by the Free Software Foundation; either
7 | * version 2.1 of the License, or (at your option) any later version.
8 | *
9 | * This library is distributed in the hope that it will be useful,
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 | * Lesser General Public License for more details.
13 | *
14 | * You should have received a copy of the GNU Lesser General Public
15 | * License along with this library; if not, write to the Free Software
16 | * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17 | *
18 | * Contacting the authors:
19 | * Jose M. Gimenez-Garcia: josemiguel.gimenez@alumnos.uva.es
20 | * Javier D. Fernandez: jfergar@infor.uva.es, javier.fernandez@wu.ac.at
21 | * Miguel A. Martinez-Prieto: migumar2@infor.uva.es
22 | */
23 | package org.rdfhdt.mrbuilder.util;
24 |
25 | import java.util.Comparator;
26 |
27 | import org.apache.hadoop.fs.FileStatus;
28 |
29 |
30 | public class FileStatusComparator implements Comparator<FileStatus> {
31 |
32 | /*
33 | * (non-Javadoc)
34 | *
35 | * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
36 | */
37 | @Override
38 | public int compare(FileStatus fs1, FileStatus fs2) {
39 | return fs1.getPath().getName().compareTo(fs2.getPath().getName());
40 | }
41 |
42 | }
43 |
--------------------------------------------------------------------------------