├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── pom.xml
└── src
└── main
├── java
└── io
│ └── github
│ └── adrianulbona
│ └── osm
│ └── parquet
│ ├── App.java
│ ├── MultiEntitySink.java
│ ├── ParquetSink.java
│ ├── ParquetWriterFactory.java
│ └── convertor
│ ├── NodeWriteSupport.java
│ ├── OsmEntityWriteSupport.java
│ ├── RelationWriteSupport.java
│ └── WayWriteSupport.java
└── resources
└── log4j.properties
/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 |
3 | .DS_Store
4 |
5 | target/
6 | pom.xml.tag
7 | pom.xml.releaseBackup
8 | pom.xml.versionsBackup
9 | pom.xml.next
10 | release.properties
11 | dependency-reduced-pom.xml
12 | buildNumber.properties
13 | .mvn/timing.properties
14 |
15 | ### JetBrains template
16 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio
17 |
18 | *.iml
19 |
20 | ## Directory-based project format:
21 | .idea/
22 | # if you remove the above rule, at least ignore the following:
23 |
24 | # User-specific stuff:
25 | # .idea/workspace.xml
26 | # .idea/tasks.xml
27 | # .idea/dictionaries
28 |
29 | # Sensitive or high-churn files:
30 | # .idea/dataSources.ids
31 | # .idea/dataSources.xml
32 | # .idea/sqlDataSources.xml
33 | # .idea/dynamic.xml
34 | # .idea/uiDesigner.xml
35 |
36 | # Gradle:
37 | # .idea/gradle.xml
38 | # .idea/libraries
39 |
40 | # Mongo Explorer plugin:
41 | # .idea/mongoSettings.xml
42 |
43 | ## File-based project format:
44 | *.ipr
45 | *.iws
46 |
47 | ## Plugin-specific files:
48 |
49 | # IntelliJ
50 | /out/
51 |
52 | # mpeltonen/sbt-idea plugin
53 | .idea_modules/
54 |
55 | # JIRA plugin
56 | atlassian-ide-plugin.xml
57 |
58 | # Crashlytics plugin (for Android Studio and IntelliJ)
59 | com_crashlytics_export_strings.xml
60 | crashlytics.properties
61 | crashlytics-build.properties
62 |
63 | ### Gradle template
64 | .gradle
65 | build/
66 |
67 | # Ignore Gradle GUI config
68 | gradle-app.setting
69 |
70 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored)
71 | !gradle-wrapper.jar
72 |
73 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: java
2 |
3 | jdk:
4 | - openjdk11
5 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## OpenStreetMap Parquetizer
2 |
3 | [![Build Status](https://travis-ci.org/adrianulbona/osm-parquetizer.svg?branch=master)](https://travis-ci.org/adrianulbona/osm-parquetizer)
4 |
5 | The project intends to provide a way to get the [OpenStreetMap](https://www.openstreetmap.org) data available in a Big Data friendly format as [Parquet](https://parquet.apache.org/).
6 |
7 | Currently any [PBF](http://wiki.openstreetmap.org/wiki/PBF_Format) file is converted into three parquet files, one for each type of entity from the original PBF (Nodes, Ways and Relations).
8 |
9 | In order to get started:
10 |
11 | ```shell
12 | git clone https://github.com/adrianulbona/osm-parquetizer.git
13 | cd osm-parquetizer
14 | mvn clean package
15 | java -jar target/osm-parquetizer-1.0.1-SNAPSHOT.jar path_to_your.pbf
16 | ```
17 |
18 | For example, by running:
19 |
20 | ```shell
21 | java -jar target/osm-parquetizer-1.0.1-SNAPSHOT.jar romania-latest.osm.pbf
22 | ```
23 |
24 | In a few seconds (on a decent laptop) you should get the following files:
25 | ```shell
26 | -rw-r--r-- 1 adrianbona adrianbona 145M Apr 3 19:57 romania-latest.osm.pbf
27 | -rw-r--r-- 1 adrianbona adrianbona 372M Apr 3 19:58 romania-latest.osm.pbf.node.parquet
28 | -rw-r--r-- 1 adrianbona adrianbona 1.1M Apr 3 19:58 romania-latest.osm.pbf.relation.parquet
29 | -rw-r--r-- 1 adrianbona adrianbona 123M Apr 3 19:58 romania-latest.osm.pbf.way.parquet
30 | ```
31 |
32 | The parquet files have the following schemas:
33 |
34 | ```protobuf
35 | node
36 | |-- id: long
37 | |-- version: integer
38 | |-- timestamp: long
39 | |-- changeset: long
40 | |-- uid: integer
41 | |-- user_sid: string
42 | |-- tags: array
43 | | |-- element: struct
44 | | | |-- key: string
45 | | | |-- value: string
46 | |-- latitude: double
47 | |-- longitude: double
48 |
49 | way
50 | |-- id: long
51 | |-- version: integer
52 | |-- timestamp: long
53 | |-- changeset: long
54 | |-- uid: integer
55 | |-- user_sid: string
56 | |-- tags: array
57 | | |-- element: struct
58 | | | |-- key: string
59 | | | |-- value: string
60 | |-- nodes: array
61 | | |-- element: struct
62 | | | |-- index: integer
63 | | | |-- nodeId: long
64 |
65 | relation
66 | |-- id: long
67 | |-- version: integer
68 | |-- timestamp: long
69 | |-- changeset: long
70 | |-- uid: integer
71 | |-- user_sid: string
72 | |-- tags: array
73 | | |-- element: struct
74 | | | |-- key: string
75 | | | |-- value: string
76 | |-- members: array
77 | | |-- element: struct
78 | | | |-- id: long
79 | | | |-- role: string
80 | | | |-- type: string
81 | ```
82 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 | io.github.adrianulbona
5 | osm-parquetizer
6 | jar
7 | 1.0.1-SNAPSHOT
8 | OSM Parquetizer
9 | http://maven.apache.org
10 |
11 | 0.48.2
12 | 1.11.0
13 | 3.3.0
14 | UTF-8
15 |
16 |
17 |
18 | org.openstreetmap.osmosis
19 | osmosis-pbf2
20 | ${osmosis.version}
21 |
22 |
23 | org.apache.parquet
24 | parquet-hadoop
25 | ${parquet.version}
26 |
27 |
28 | org.apache.hadoop
29 | hadoop-client
30 |
31 |
32 |
33 |
34 | org.apache.hadoop
35 | hadoop-client
36 | ${hadoop.version}
37 |
38 |
39 | log4j
40 | log4j
41 |
42 |
43 | org.slf4j
44 | slf4j-log4j12
45 |
46 |
47 |
48 |
49 | org.slf4j
50 | slf4j-log4j12
51 | 1.7.19
52 |
53 |
54 | args4j
55 | args4j
56 | 2.33
57 |
58 |
59 |
60 |
61 |
62 |
63 | maven-compiler-plugin
64 | 3.3
65 |
66 | 1.8
67 | 1.8
68 |
69 |
70 |
71 | org.codehaus.mojo
72 | exec-maven-plugin
73 | 1.4.0
74 |
75 | io.github.adrianulbona.osm.parquet.App
76 |
77 |
78 |
79 | org.apache.maven.plugins
80 | maven-shade-plugin
81 | 2.4.2
82 |
83 |
84 |
86 | io.github.adrianulbona.osm.parquet.App
87 |
88 |
89 |
90 |
91 |
92 | package
93 |
94 | shade
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
--------------------------------------------------------------------------------
/src/main/java/io/github/adrianulbona/osm/parquet/App.java:
--------------------------------------------------------------------------------
1 | package io.github.adrianulbona.osm.parquet;
2 |
3 | import org.kohsuke.args4j.Argument;
4 | import org.kohsuke.args4j.CmdLineException;
5 | import org.kohsuke.args4j.CmdLineParser;
6 | import org.kohsuke.args4j.Option;
7 | import org.openstreetmap.osmosis.core.domain.v0_6.Entity;
8 | import org.openstreetmap.osmosis.core.domain.v0_6.EntityType;
9 | import org.openstreetmap.osmosis.pbf2.v0_6.PbfReader;
10 | import org.slf4j.Logger;
11 | import org.slf4j.LoggerFactory;
12 |
13 | import java.io.IOException;
14 | import java.nio.file.Path;
15 | import java.util.ArrayList;
16 | import java.util.List;
17 | import java.util.concurrent.atomic.AtomicLong;
18 |
19 | import static java.util.Collections.unmodifiableList;
20 | import static org.openstreetmap.osmosis.core.domain.v0_6.EntityType.Node;
21 | import static org.openstreetmap.osmosis.core.domain.v0_6.EntityType.Relation;
22 |
23 |
24 | /**
25 | * Created by adrian.bona on 27/03/16.
26 | */
27 | public class App {
28 |
29 | public static void main(String[] args) throws IOException {
30 | final MultiEntitySinkConfig config = new MultiEntitySinkConfig();
31 | final CmdLineParser cmdLineParser = new CmdLineParser(config);
32 | try {
33 | cmdLineParser.parseArgument(args);
34 | final PbfReader reader = new PbfReader(config.getSource().toFile(), config.threads);
35 | final MultiEntitySink sink = new MultiEntitySink(config);
36 | sink.addObserver(new MultiEntitySinkObserver());
37 | reader.setSink(sink);
38 | reader.run();
39 | } catch (CmdLineException e) {
40 | System.out.println(e.getMessage());
41 | System.out.print("Usage: java -jar osm-parquetizer.jar");
42 | System.out.println();
43 | cmdLineParser.printSingleLineUsage(System.out);
44 | }
45 | }
46 |
47 | private static class MultiEntitySinkConfig implements MultiEntitySink.Config {
48 |
49 | @Argument(index = 0, metaVar = "pbf-path", usage = "the OSM PBF file to be parquetized", required = true)
50 | private Path source;
51 |
52 | @Argument(index = 1, metaVar = "output-path", usage = "the directory where to store the Parquet files",
53 | required = false)
54 | private Path destinationFolder;
55 |
56 | @Option(name = "--pbf-threads", usage = "if present number of threads for PbfReader")
57 | private int threads = 1;
58 |
59 | @Option(name = "--exclude-metadata", usage = "if present the metadata will not be parquetized")
60 | private boolean excludeMetadata = false;
61 |
62 | @Option(name = "--no-nodes", usage = "if present the nodes will be not parquetized")
63 | private boolean noNodes = false;
64 |
65 | @Option(name = "--no-ways", usage = "if present the ways will be not parquetized")
66 | private boolean noWays = false;
67 |
68 | @Option(name = "--no-relations", usage = "if present the relations will not be parquetized")
69 | private boolean noRelations = false;
70 |
71 | @Override
72 | public boolean getExcludeMetadata() {
73 | return this.excludeMetadata;
74 | }
75 |
76 | @Override
77 | public Path getSource() {
78 | return this.source;
79 | }
80 |
81 | @Override
82 | public Path getDestinationFolder() {
83 | return this.destinationFolder != null ? this.destinationFolder : this.source.toAbsolutePath().getParent();
84 | }
85 |
86 | @Override
87 | public List entitiesToBeParquetized() {
88 | final List entityTypes = new ArrayList<>();
89 | if (!noNodes) {
90 | entityTypes.add(Node);
91 | }
92 | if (!noWays) {
93 | entityTypes.add(EntityType.Way);
94 | }
95 | if (!noRelations) {
96 | entityTypes.add(Relation);
97 | }
98 | return unmodifiableList(entityTypes);
99 | }
100 | }
101 |
102 |
103 | private static class MultiEntitySinkObserver implements MultiEntitySink.Observer {
104 |
105 | private static final Logger LOGGER = LoggerFactory.getLogger(MultiEntitySinkObserver.class);
106 |
107 | private AtomicLong totalEntitiesCount;
108 |
109 | @Override
110 | public void started() {
111 | totalEntitiesCount = new AtomicLong();
112 | }
113 |
114 | @Override
115 | public void processed(Entity entity) {
116 | final long count = totalEntitiesCount.incrementAndGet();
117 | if (count % 1000000 == 0) {
118 | LOGGER.info("Entities processed: " + count);
119 |
120 | }
121 | }
122 |
123 | @Override
124 | public void ended() {
125 | LOGGER.info("Total entities processed: " + totalEntitiesCount.get());
126 | }
127 | }
128 | }
129 |
--------------------------------------------------------------------------------
/src/main/java/io/github/adrianulbona/osm/parquet/MultiEntitySink.java:
--------------------------------------------------------------------------------
1 | package io.github.adrianulbona.osm.parquet;
2 |
3 | import org.openstreetmap.osmosis.core.container.v0_6.EntityContainer;
4 | import org.openstreetmap.osmosis.core.domain.v0_6.Entity;
5 | import org.openstreetmap.osmosis.core.domain.v0_6.EntityType;
6 | import org.openstreetmap.osmosis.core.lifecycle.Closeable;
7 | import org.openstreetmap.osmosis.core.lifecycle.Completable;
8 | import org.openstreetmap.osmosis.core.task.v0_6.Sink;
9 |
10 | import java.nio.file.Path;
11 | import java.util.ArrayList;
12 | import java.util.List;
13 | import java.util.Map;
14 |
15 | import static java.util.stream.Collectors.toList;
16 |
17 |
18 | /**
19 | * Created by adrian.bona on 27/03/16.
20 | */
21 | public class MultiEntitySink implements Sink {
22 |
23 | private final List> converters;
24 |
25 | private final List observers;
26 |
27 | public MultiEntitySink(Config config) {
28 | final List entityTypes = config.entitiesToBeParquetized();
29 | this.converters = entityTypes.stream().map(type -> new ParquetSink<>(config.getSource(),
30 | config.getDestinationFolder(), config.getExcludeMetadata(), type)).collect(toList());
31 | this.observers = new ArrayList<>();
32 | }
33 |
34 | @Override
35 | public void process(EntityContainer entityContainer) {
36 | this.converters.forEach(converter -> converter.process(entityContainer));
37 | this.observers.forEach(o -> o.processed(entityContainer.getEntity()));
38 | }
39 |
40 | @Override
41 | public void initialize(Map metaData) {
42 | this.converters.forEach(converter -> converter.initialize(metaData));
43 | this.observers.forEach(Observer::started);
44 | }
45 |
46 | @Override
47 | public void complete() {
48 | this.converters.forEach(Completable::complete);
49 | this.observers.forEach(Observer::ended);
50 | }
51 |
52 | @Override
53 | public void close() {
54 | this.converters.forEach(Closeable::close);
55 | }
56 |
57 | public void addObserver(Observer observer) {
58 | this.observers.add(observer);
59 | }
60 |
61 | public void removeObserver(Observer observer) {
62 | this.observers.remove(observer);
63 | }
64 |
65 | public interface Observer {
66 |
67 | void started();
68 |
69 | void processed(Entity entity);
70 |
71 | void ended();
72 | }
73 |
74 |
75 | public interface Config {
76 |
77 | Path getSource();
78 |
79 | Path getDestinationFolder();
80 |
81 | boolean getExcludeMetadata();
82 |
83 | List entitiesToBeParquetized();
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/src/main/java/io/github/adrianulbona/osm/parquet/ParquetSink.java:
--------------------------------------------------------------------------------
1 | package io.github.adrianulbona.osm.parquet;
2 |
3 | import org.apache.parquet.hadoop.ParquetWriter;
4 | import org.openstreetmap.osmosis.core.container.v0_6.EntityContainer;
5 | import org.openstreetmap.osmosis.core.domain.v0_6.Entity;
6 | import org.openstreetmap.osmosis.core.domain.v0_6.EntityType;
7 | import org.openstreetmap.osmosis.core.task.v0_6.Sink;
8 |
9 | import java.io.IOException;
10 | import java.nio.file.Path;
11 | import java.util.ArrayList;
12 | import java.util.List;
13 | import java.util.Map;
14 | import java.util.function.Predicate;
15 |
16 | import static java.lang.String.format;
17 |
18 |
19 | public class ParquetSink implements Sink {
20 |
21 | private final Path source;
22 | private final Path destinationFolder;
23 | private final boolean excludeMetadata;
24 | private final EntityType entityType;
25 | private final List> filters;
26 |
27 | private ParquetWriter writer;
28 |
29 | public ParquetSink(Path source, Path destinationFolder, boolean excludeMetadata, EntityType entityType) {
30 | this.source = source;
31 | this.destinationFolder = destinationFolder;
32 | this.excludeMetadata = excludeMetadata;
33 | this.entityType = entityType;
34 | this.filters = new ArrayList<>();
35 | }
36 |
37 | @Override
38 | public void initialize(Map metaData) {
39 | final String pbfName = source.getFileName().toString();
40 | final String entityName = entityType.name().toLowerCase();
41 | final Path destination = destinationFolder.resolve(format("%s.%s.parquet", pbfName, entityName));
42 | try {
43 | this.writer = ParquetWriterFactory.buildFor(destination.toAbsolutePath().toString(), excludeMetadata,
44 | entityType);
45 | } catch (IOException e) {
46 | throw new RuntimeException("Unable to build writers", e);
47 | }
48 | }
49 |
50 | @Override
51 | public void process(EntityContainer entityContainer) {
52 | try {
53 | if (this.entityType == entityContainer.getEntity().getType()) {
54 | final T entity = (T) entityContainer.getEntity();
55 | if (filters.stream().noneMatch(filter -> filter.test(entity))) {
56 | writer.write(entity);
57 | }
58 | }
59 | } catch (IOException e) {
60 | throw new RuntimeException("Unable to write entity", e);
61 | }
62 | }
63 |
64 | @Override
65 | public void complete() {
66 | try {
67 | this.writer.close();
68 | } catch (IOException e) {
69 | throw new RuntimeException("Unable to close writers", e);
70 | }
71 | }
72 |
73 | @Override
74 | public void close() {
75 |
76 | }
77 |
78 | public void addFilter(Predicate predicate) {
79 | this.filters.add(predicate);
80 | }
81 |
82 | public void removeFilter(Predicate predicate) {
83 | this.filters.remove(predicate);
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/src/main/java/io/github/adrianulbona/osm/parquet/ParquetWriterFactory.java:
--------------------------------------------------------------------------------
1 | package io.github.adrianulbona.osm.parquet;
2 |
3 | import io.github.adrianulbona.osm.parquet.convertor.NodeWriteSupport;
4 | import io.github.adrianulbona.osm.parquet.convertor.RelationWriteSupport;
5 | import io.github.adrianulbona.osm.parquet.convertor.WayWriteSupport;
6 | import org.apache.hadoop.conf.Configuration;
7 | import org.apache.hadoop.fs.Path;
8 | import org.apache.parquet.hadoop.ParquetWriter;
9 | import org.apache.parquet.hadoop.api.WriteSupport;
10 | import org.apache.parquet.hadoop.metadata.CompressionCodecName;
11 | import org.openstreetmap.osmosis.core.domain.v0_6.*;
12 |
13 | import java.io.IOException;
14 |
15 | import static org.apache.parquet.hadoop.ParquetFileWriter.Mode.OVERWRITE;
16 | import static org.apache.parquet.hadoop.metadata.CompressionCodecName.SNAPPY;
17 |
18 |
19 | /**
20 | * Created by adrian.bona on 26/03/16.
21 | */
22 | public class ParquetWriterFactory {
23 |
24 | private static final CompressionCodecName COMPRESSION = SNAPPY;
25 |
26 | public static ParquetWriter buildFor(String destination, boolean excludeMetadata,
27 | EntityType entityType) throws IOException {
28 | switch (entityType) {
29 | case Node:
30 | return (ParquetWriter) NodesWriterBuilder.standard(destination, excludeMetadata);
31 | case Way:
32 | return (ParquetWriter) WaysWriterBuilder.standard(destination, excludeMetadata);
33 | case Relation:
34 | return (ParquetWriter) RelationsWriterBuilder.standard(destination, excludeMetadata);
35 | default:
36 | throw new RuntimeException("Invalid entity type");
37 | }
38 | }
39 |
40 | public static class WaysWriterBuilder extends ParquetWriter.Builder {
41 |
42 | private final boolean excludeMetadata;
43 |
44 | protected WaysWriterBuilder(Path file, boolean excludeMetadata) {
45 | super(file);
46 | this.excludeMetadata = excludeMetadata;
47 | }
48 |
49 | @Override
50 | protected WaysWriterBuilder self() {
51 | return this;
52 | }
53 |
54 | @Override
55 | protected WriteSupport getWriteSupport(Configuration conf) {
56 | return new WayWriteSupport(excludeMetadata);
57 | }
58 |
59 | public static ParquetWriter standard(String destination, boolean excludeMetadata) throws IOException {
60 | return new WaysWriterBuilder(new Path(destination), excludeMetadata).self()
61 | .withCompressionCodec(COMPRESSION).withWriteMode(OVERWRITE).build();
62 | }
63 | }
64 |
65 |
66 | public static class NodesWriterBuilder extends ParquetWriter.Builder {
67 |
68 | private final boolean excludeMetadata;
69 |
70 | protected NodesWriterBuilder(Path file, boolean excludeMetadata) {
71 | super(file);
72 | this.excludeMetadata = excludeMetadata;
73 | }
74 |
75 | @Override
76 | protected NodesWriterBuilder self() {
77 | return this;
78 | }
79 |
80 | @Override
81 | protected WriteSupport getWriteSupport(Configuration conf) {
82 | return new NodeWriteSupport(excludeMetadata);
83 | }
84 |
85 | public static ParquetWriter standard(String destination, boolean excludeMetadata) throws IOException {
86 | return new NodesWriterBuilder(new Path(destination), excludeMetadata).self()
87 | .withCompressionCodec(COMPRESSION).withWriteMode(OVERWRITE).build();
88 | }
89 | }
90 |
91 |
92 | public static class RelationsWriterBuilder extends ParquetWriter.Builder {
93 |
94 | private final boolean excludeMetadata;
95 |
96 | protected RelationsWriterBuilder(Path file, boolean excludeMetadata) {
97 | super(file);
98 | this.excludeMetadata = excludeMetadata;
99 | }
100 |
101 | @Override
102 | protected RelationsWriterBuilder self() {
103 | return this;
104 | }
105 |
106 | @Override
107 | protected WriteSupport getWriteSupport(Configuration conf) {
108 | return new RelationWriteSupport(excludeMetadata);
109 | }
110 |
111 | public static ParquetWriter standard(String destination, boolean excludeMetadata) throws IOException {
112 | return new RelationsWriterBuilder(new Path(destination), excludeMetadata).self()
113 | .withCompressionCodec(COMPRESSION).withWriteMode(OVERWRITE).build();
114 | }
115 | }
116 | }
117 |
--------------------------------------------------------------------------------
/src/main/java/io/github/adrianulbona/osm/parquet/convertor/NodeWriteSupport.java:
--------------------------------------------------------------------------------
1 | package io.github.adrianulbona.osm.parquet.convertor;
2 |
3 | import org.apache.parquet.schema.MessageType;
4 | import org.apache.parquet.schema.PrimitiveType;
5 | import org.apache.parquet.schema.Type;
6 | import org.openstreetmap.osmosis.core.domain.v0_6.Node;
7 |
8 | import java.util.ArrayList;
9 | import java.util.List;
10 |
11 | import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*;
12 | import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
13 |
14 |
15 | /**
16 | * Created by adrian.bona on 26/03/16.
17 | */
18 | public class NodeWriteSupport extends OsmEntityWriteSupport {
19 |
20 | private final PrimitiveType latType;
21 | private final PrimitiveType longType;
22 |
23 | public NodeWriteSupport(boolean excludeMetadata) {
24 | super(excludeMetadata);
25 | latType = new PrimitiveType(REQUIRED, DOUBLE, "latitude");
26 | longType = new PrimitiveType(REQUIRED, DOUBLE, "longitude");
27 | }
28 |
29 | @Override
30 | protected MessageType getSchema() {
31 | final List attributes = new ArrayList<>(getCommonAttributes());
32 | attributes.add(latType);
33 | attributes.add(longType);
34 | return new MessageType("node", attributes);
35 | }
36 |
37 | @Override
38 | protected void writeSpecificFields(Node record, int nextAvailableIndex) {
39 | recordConsumer.startField(latType.getName(), nextAvailableIndex);
40 | recordConsumer.addDouble(record.getLatitude());
41 | recordConsumer.endField(latType.getName(), nextAvailableIndex++);
42 |
43 | recordConsumer.startField(longType.getName(), nextAvailableIndex);
44 | recordConsumer.addDouble(record.getLongitude());
45 | recordConsumer.endField(longType.getName(), nextAvailableIndex);
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/src/main/java/io/github/adrianulbona/osm/parquet/convertor/OsmEntityWriteSupport.java:
--------------------------------------------------------------------------------
1 | package io.github.adrianulbona.osm.parquet.convertor;
2 |
3 | import org.apache.hadoop.conf.Configuration;
4 | import org.apache.parquet.hadoop.api.WriteSupport;
5 | import org.apache.parquet.io.api.Binary;
6 | import org.apache.parquet.io.api.RecordConsumer;
7 | import org.apache.parquet.schema.GroupType;
8 | import org.apache.parquet.schema.MessageType;
9 | import org.apache.parquet.schema.PrimitiveType;
10 | import org.apache.parquet.schema.Type;
11 | import org.openstreetmap.osmosis.core.domain.v0_6.Entity;
12 | import org.openstreetmap.osmosis.core.domain.v0_6.Tag;
13 |
14 | import java.util.Collections;
15 | import java.util.LinkedList;
16 | import java.util.List;
17 |
18 | import static java.util.Arrays.asList;
19 | import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*;
20 | import static org.apache.parquet.schema.Type.Repetition.*;
21 |
22 |
23 | /**
24 | * Created by adrian.bona on 26/03/16.
25 | */
26 | public abstract class OsmEntityWriteSupport extends WriteSupport {
27 |
28 | private final PrimitiveType idType;
29 | private final PrimitiveType versionType;
30 | private final GroupType tags;
31 | private final PrimitiveType tagKeyType;
32 | private final PrimitiveType tagValueType;
33 | private final PrimitiveType timestampType;
34 | private final PrimitiveType changesetType;
35 | private final PrimitiveType uidType;
36 | private final PrimitiveType userSidType;
37 |
38 | private final boolean excludeMetadata;
39 |
40 | protected RecordConsumer recordConsumer;
41 |
42 | public OsmEntityWriteSupport(boolean excludeMetadata) {
43 | idType = new PrimitiveType(REQUIRED, INT64, "id");
44 | tagKeyType = new PrimitiveType(REQUIRED, BINARY, "key");
45 | tagValueType = new PrimitiveType(OPTIONAL, BINARY, "value");
46 | tags = new GroupType(REPEATED, "tags", tagKeyType, tagValueType);
47 | versionType = new PrimitiveType(OPTIONAL, INT32, "version");
48 | timestampType = new PrimitiveType(OPTIONAL, INT64, "timestamp");
49 | changesetType = new PrimitiveType(OPTIONAL, INT64, "changeset");
50 | uidType = new PrimitiveType(OPTIONAL, INT32, "uid");
51 | userSidType = new PrimitiveType(OPTIONAL, BINARY, "user_sid");
52 | this.excludeMetadata = excludeMetadata;
53 | }
54 |
55 | protected List getCommonAttributes() {
56 | final List commonAttributes = new LinkedList<>();
57 | commonAttributes.add(idType);
58 | if (!excludeMetadata) {
59 | commonAttributes.addAll(asList(versionType, timestampType, changesetType, uidType, userSidType));
60 | }
61 | commonAttributes.add(tags);
62 | return commonAttributes;
63 | }
64 |
65 | @Override
66 | public WriteContext init(Configuration config) {
67 | return new WriteContext(getSchema(), Collections.emptyMap());
68 | }
69 |
70 | protected abstract MessageType getSchema();
71 |
72 | @Override
73 | public void prepareForWrite(RecordConsumer recordConsumer) {
74 | this.recordConsumer = recordConsumer;
75 | }
76 |
77 | protected abstract void writeSpecificFields(E record, int nextAvailableIndex);
78 |
79 | public void write(E record) {
80 | int index = 0;
81 | recordConsumer.startMessage();
82 | recordConsumer.startField(idType.getName(), index);
83 | recordConsumer.addLong(record.getId());
84 | recordConsumer.endField(idType.getName(), index++);
85 |
86 | if (!excludeMetadata) {
87 | recordConsumer.startField(versionType.getName(), index);
88 | recordConsumer.addInteger(record.getVersion());
89 | recordConsumer.endField(versionType.getName(), index++);
90 |
91 | recordConsumer.startField(timestampType.getName(), index);
92 | recordConsumer.addLong(record.getTimestamp().getTime());
93 | recordConsumer.endField(timestampType.getName(), index++);
94 |
95 | recordConsumer.startField(changesetType.getName(), index);
96 | recordConsumer.addLong(record.getChangesetId());
97 | recordConsumer.endField(changesetType.getName(), index++);
98 |
99 | recordConsumer.startField(uidType.getName(), index);
100 | recordConsumer.addInteger(record.getUser().getId());
101 | recordConsumer.endField(uidType.getName(), index++);
102 |
103 | recordConsumer.startField(userSidType.getName(), index);
104 | recordConsumer.addBinary(Binary.fromString(record.getUser().getName()));
105 | recordConsumer.endField(userSidType.getName(), index++);
106 | }
107 |
108 | if (!record.getTags().isEmpty()) {
109 | recordConsumer.startField(tags.getName(), index);
110 | for (Tag tag : record.getTags()) {
111 | recordConsumer.startGroup();
112 |
113 | recordConsumer.startField(tagKeyType.getName(), 0);
114 | recordConsumer.addBinary(Binary.fromString(tag.getKey()));
115 | recordConsumer.endField(tagKeyType.getName(), 0);
116 |
117 | recordConsumer.startField(tagValueType.getName(), 1);
118 | recordConsumer.addBinary(Binary.fromString(tag.getValue()));
119 | recordConsumer.endField(tagValueType.getName(), 1);
120 |
121 | recordConsumer.endGroup();
122 | }
123 | recordConsumer.endField(tags.getName(), index);
124 | }
125 | index++;
126 |
127 | writeSpecificFields(record, index);
128 | recordConsumer.endMessage();
129 | }
130 | }
131 |
--------------------------------------------------------------------------------
/src/main/java/io/github/adrianulbona/osm/parquet/convertor/RelationWriteSupport.java:
--------------------------------------------------------------------------------
1 | package io.github.adrianulbona.osm.parquet.convertor;
2 |
3 | import org.apache.parquet.io.api.Binary;
4 | import org.apache.parquet.schema.GroupType;
5 | import org.apache.parquet.schema.MessageType;
6 | import org.apache.parquet.schema.PrimitiveType;
7 | import org.apache.parquet.schema.Type;
8 | import org.openstreetmap.osmosis.core.domain.v0_6.Relation;
9 |
10 | import java.util.ArrayList;
11 | import java.util.List;
12 |
13 | import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
14 | import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
15 | import static org.apache.parquet.schema.Type.Repetition.REPEATED;
16 | import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
17 |
18 |
19 | /**
20 | * Created by adrian.bona on 26/03/16.
21 | */
22 | public class RelationWriteSupport extends OsmEntityWriteSupport {
23 |
24 | private final GroupType membersType;
25 | private final PrimitiveType memberIdType;
26 | private final PrimitiveType memberRoleType;
27 | private final PrimitiveType memberTypeType;
28 |
29 | public RelationWriteSupport(boolean excludeMetadata) {
30 | super(excludeMetadata);
31 | memberIdType = new PrimitiveType(REQUIRED, INT64, "id");
32 | memberRoleType = new PrimitiveType(REQUIRED, BINARY, "role");
33 | memberTypeType = new PrimitiveType(REQUIRED, BINARY, "type");
34 | membersType = new GroupType(REPEATED, "members", memberIdType, memberRoleType, memberTypeType);
35 | }
36 |
37 | @Override
38 | protected MessageType getSchema() {
39 | final List attributes = new ArrayList<>(getCommonAttributes());
40 | attributes.add(membersType);
41 | return new MessageType("relation", attributes);
42 | }
43 |
44 | @Override
45 | protected void writeSpecificFields(Relation record, int nextAvailableIndex) {
46 | if (!record.getMembers().isEmpty()) {
47 | recordConsumer.startField(membersType.getName(), nextAvailableIndex);
48 | record.getMembers().forEach(member -> {
49 | recordConsumer.startGroup();
50 |
51 | recordConsumer.startField(memberIdType.getName(), 0);
52 | recordConsumer.addLong(member.getMemberId());
53 | recordConsumer.endField(memberIdType.getName(), 0);
54 |
55 | recordConsumer.startField(memberRoleType.getName(), 1);
56 | recordConsumer.addBinary(Binary.fromString(member.getMemberRole()));
57 | recordConsumer.endField(memberRoleType.getName(), 1);
58 |
59 | recordConsumer.startField(memberTypeType.getName(), 2);
60 | recordConsumer.addBinary(Binary.fromString(member.getMemberType().name()));
61 | recordConsumer.endField(memberTypeType.getName(), 2);
62 |
63 | recordConsumer.endGroup();
64 | });
65 | recordConsumer.endField(membersType.getName(), nextAvailableIndex);
66 | }
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/src/main/java/io/github/adrianulbona/osm/parquet/convertor/WayWriteSupport.java:
--------------------------------------------------------------------------------
1 | package io.github.adrianulbona.osm.parquet.convertor;
2 |
3 | import org.apache.parquet.schema.GroupType;
4 | import org.apache.parquet.schema.MessageType;
5 | import org.apache.parquet.schema.PrimitiveType;
6 | import org.apache.parquet.schema.Type;
7 | import org.openstreetmap.osmosis.core.domain.v0_6.Way;
8 | import org.openstreetmap.osmosis.core.domain.v0_6.WayNode;
9 |
10 | import java.util.ArrayList;
11 | import java.util.HashMap;
12 | import java.util.List;
13 | import java.util.Map;
14 |
15 | import static java.util.stream.IntStream.range;
16 | import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
17 | import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
18 | import static org.apache.parquet.schema.Type.Repetition.REPEATED;
19 | import static org.apache.parquet.schema.Type.Repetition.REQUIRED;
20 |
21 |
22 | /**
23 | * Created by adrian.bona on 26/03/16.
24 | */
25 | public class WayWriteSupport extends OsmEntityWriteSupport {
26 |
27 | private final PrimitiveType nodeIndexType;
28 | private final PrimitiveType nodeIdType;
29 | private final GroupType nodes;
30 |
31 | public WayWriteSupport(boolean excludeMetadata) {
32 | super(excludeMetadata);
33 | nodeIndexType = new PrimitiveType(REQUIRED, INT32, "index");
34 | nodeIdType = new PrimitiveType(REQUIRED, INT64, "nodeId");
35 | nodes = new GroupType(REPEATED, "nodes", nodeIndexType, nodeIdType);
36 | }
37 |
38 | @Override
39 | protected MessageType getSchema() {
40 | final List attributes = new ArrayList<>(getCommonAttributes());
41 | attributes.add(nodes);
42 | return new MessageType("way", attributes);
43 | }
44 |
45 | @Override
46 | protected void writeSpecificFields(Way record, int nextAvailableIndex) {
47 | final List wayNodes = record.getWayNodes();
48 | final Map indexedNodes = new HashMap<>();
49 | range(0, wayNodes.size()).forEach(index -> indexedNodes.put(index, wayNodes.get(index).getNodeId()));
50 |
51 | if (!indexedNodes.isEmpty()) {
52 | recordConsumer.startField(nodes.getName(), nextAvailableIndex);
53 | indexedNodes.forEach((index, nodeId) -> {
54 | recordConsumer.startGroup();
55 |
56 | recordConsumer.startField(nodeIndexType.getName(), 0);
57 | recordConsumer.addInteger(index);
58 | recordConsumer.endField(nodeIndexType.getName(), 0);
59 |
60 | recordConsumer.startField(nodeIdType.getName(), 1);
61 | recordConsumer.addLong(nodeId);
62 | recordConsumer.endField(nodeIdType.getName(), 1);
63 |
64 | recordConsumer.endGroup();
65 | });
66 | recordConsumer.endField(nodes.getName(), nextAvailableIndex);
67 | }
68 | }
69 | }
70 |
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=INFO, stdout
3 |
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.out
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
--------------------------------------------------------------------------------