├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── pom.xml └── src └── main ├── java └── io │ └── github │ └── adrianulbona │ └── osm │ └── parquet │ ├── App.java │ ├── MultiEntitySink.java │ ├── ParquetSink.java │ ├── ParquetWriterFactory.java │ └── convertor │ ├── NodeWriteSupport.java │ ├── OsmEntityWriteSupport.java │ ├── RelationWriteSupport.java │ └── WayWriteSupport.java └── resources └── log4j.properties /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | 3 | .DS_Store 4 | 5 | target/ 6 | pom.xml.tag 7 | pom.xml.releaseBackup 8 | pom.xml.versionsBackup 9 | pom.xml.next 10 | release.properties 11 | dependency-reduced-pom.xml 12 | buildNumber.properties 13 | .mvn/timing.properties 14 | 15 | ### JetBrains template 16 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio 17 | 18 | *.iml 19 | 20 | ## Directory-based project format: 21 | .idea/ 22 | # if you remove the above rule, at least ignore the following: 23 | 24 | # User-specific stuff: 25 | # .idea/workspace.xml 26 | # .idea/tasks.xml 27 | # .idea/dictionaries 28 | 29 | # Sensitive or high-churn files: 30 | # .idea/dataSources.ids 31 | # .idea/dataSources.xml 32 | # .idea/sqlDataSources.xml 33 | # .idea/dynamic.xml 34 | # .idea/uiDesigner.xml 35 | 36 | # Gradle: 37 | # .idea/gradle.xml 38 | # .idea/libraries 39 | 40 | # Mongo Explorer plugin: 41 | # .idea/mongoSettings.xml 42 | 43 | ## File-based project format: 44 | *.ipr 45 | *.iws 46 | 47 | ## Plugin-specific files: 48 | 49 | # IntelliJ 50 | /out/ 51 | 52 | # mpeltonen/sbt-idea plugin 53 | .idea_modules/ 54 | 55 | # JIRA plugin 56 | atlassian-ide-plugin.xml 57 | 58 | # Crashlytics plugin (for Android Studio and IntelliJ) 59 | com_crashlytics_export_strings.xml 60 | crashlytics.properties 61 | crashlytics-build.properties 62 | 63 | ### Gradle template 64 | .gradle 65 | build/ 66 | 67 | # Ignore Gradle GUI config 68 | gradle-app.setting 69 | 70 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 71 | !gradle-wrapper.jar 72 | 73 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | 3 | jdk: 4 | - openjdk11 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## OpenStreetMap Parquetizer 2 | 3 | [![Build Status](https://travis-ci.org/adrianulbona/osm-parquetizer.svg)](https://travis-ci.org/adrianulbona/osm-parquetizer) 4 | 5 | The project provides a simple way to convert [OpenStreetMap](https://www.openstreetmap.org) data into [Parquet](https://parquet.apache.org/), a Big Data friendly columnar format. 6 | 7 | Currently, any [PBF](http://wiki.openstreetmap.org/wiki/PBF_Format) file is converted into three Parquet files, one for each entity type in the original PBF (Nodes, Ways and Relations). 8 | 9 | To get started: 10 | 11 | ```shell 12 | git clone https://github.com/adrianulbona/osm-parquetizer.git 13 | cd osm-parquetizer 14 | mvn clean package 15 | java -jar target/osm-parquetizer-1.0.1-SNAPSHOT.jar path_to_your.pbf 16 | ``` 17 | 18 | For example, by running: 19 | 20 | ```shell 21 | java -jar target/osm-parquetizer-1.0.1-SNAPSHOT.jar romania-latest.osm.pbf 22 | ``` 23 | 24 | In a few seconds (on a decent laptop) you should get the following files: 25 | ```shell 26 | -rw-r--r-- 1 adrianbona adrianbona 145M Apr 3 19:57 romania-latest.osm.pbf 27 | -rw-r--r-- 1 adrianbona adrianbona 372M Apr 3 19:58 romania-latest.osm.pbf.node.parquet 28 | -rw-r--r-- 1 adrianbona adrianbona 1.1M Apr 3 19:58 romania-latest.osm.pbf.relation.parquet 29 | -rw-r--r-- 1 adrianbona adrianbona 123M Apr 3 19:58 romania-latest.osm.pbf.way.parquet 30 | ``` 31 | 32 | The Parquet files have the following schemas: 33 | 34 | ```text 35 | node 36 | |-- id: long 37 | |-- version: integer 38 | |-- timestamp: long 39 | |-- changeset: long 40 | |-- uid: integer 41 | |-- user_sid: string 42 | |-- tags: array 43 | | |-- element: struct 44 | | | |-- key: string 45 | | | |-- value: string 46 | |-- latitude: double 47 | |-- longitude: double 48 | 49 | way 50 | |-- id: long 51 | |-- version: integer 52 | |-- timestamp: long 53 | |-- changeset: long 54 | |-- uid: integer 55 | |-- user_sid: string 56 | |-- tags: array 57 | | |-- element: struct 58 | | | |-- key: string 59 | | | |-- value: string 60 | |-- nodes: array 61 | | |-- element: struct 62 | | | |-- index: integer 63 | | | |-- nodeId: long 64 | 65 | relation 66 | |-- id: long 67 | |-- version: integer 68 | |-- timestamp: long 69 | |-- changeset: long 70 | |-- uid: integer 71 | |-- user_sid: string 72 | |-- tags: array 73 | | |-- element: struct 74 | | | |-- key: string 75 | | | |-- value: string 76 | |-- members: array 77 | | |-- element: struct 78 | | | |-- id: long 79 | | | |-- role: string 80 | | | |-- type: string 81 | ``` 82 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | io.github.adrianulbona 5 | osm-parquetizer 6 | jar 7 | 1.0.1-SNAPSHOT 8 | OSM Parquetizer 9 | http://maven.apache.org 10 | 11 | 0.48.2 12 | 1.11.0 13 | 3.3.0 14 | UTF-8 15 | 16 | 17 | 18 | org.openstreetmap.osmosis 19 | osmosis-pbf2 20 | ${osmosis.version} 21 | 22 | 23 | org.apache.parquet 24 | parquet-hadoop 25 | ${parquet.version} 26 | 27 | 28 | org.apache.hadoop 29 | hadoop-client 30 | 31 | 32 | 33 | 34 | org.apache.hadoop 35 | hadoop-client 36 | ${hadoop.version} 37 | 38 | 39 | log4j 40 | log4j 41 | 42 | 43 | org.slf4j 44 | slf4j-log4j12 45 | 46 | 47 | 48 | 49 | org.slf4j 50 | slf4j-log4j12 51 | 1.7.19 52 | 53 | 54 | args4j
55 | args4j 56 | 2.33 57 | 58 | 59 | 60 | 61 | 62 | 63 | maven-compiler-plugin 64 | 3.3 65 | 66 | 1.8 67 | 1.8 68 | 69 | 70 | 71 | org.codehaus.mojo 72 | exec-maven-plugin 73 | 1.4.0 74 | 75 | io.github.adrianulbona.osm.parquet.App 76 | 77 | 78 | 79 | org.apache.maven.plugins 80 | maven-shade-plugin 81 | 2.4.2 82 | 83 | 84 | 86 | io.github.adrianulbona.osm.parquet.App 87 | 88 | 89 | 90 | 91 | 92 | package 93 | 94 | shade 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | -------------------------------------------------------------------------------- /src/main/java/io/github/adrianulbona/osm/parquet/App.java: -------------------------------------------------------------------------------- 1 | package io.github.adrianulbona.osm.parquet; 2 | 3 | import org.kohsuke.args4j.Argument; 4 | import org.kohsuke.args4j.CmdLineException; 5 | import org.kohsuke.args4j.CmdLineParser; 6 | import org.kohsuke.args4j.Option; 7 | import org.openstreetmap.osmosis.core.domain.v0_6.Entity; 8 | import org.openstreetmap.osmosis.core.domain.v0_6.EntityType; 9 | import org.openstreetmap.osmosis.pbf2.v0_6.PbfReader; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | 13 | import java.io.IOException; 14 | import java.nio.file.Path; 15 | import java.util.ArrayList; 16 | import java.util.List; 17 | import java.util.concurrent.atomic.AtomicLong; 18 | 19 | import static java.util.Collections.unmodifiableList; 20 | import static org.openstreetmap.osmosis.core.domain.v0_6.EntityType.Node; 21 | import static org.openstreetmap.osmosis.core.domain.v0_6.EntityType.Relation; 22 | 23 | 24 | /** 25 | * Created by adrian.bona on 27/03/16. 26 | */ 27 | public class App { 28 | 29 | public static void main(String[] args) throws IOException { 30 | final MultiEntitySinkConfig config = new MultiEntitySinkConfig(); 31 | final CmdLineParser cmdLineParser = new CmdLineParser(config); 32 | try { 33 | cmdLineParser.parseArgument(args); 34 | final PbfReader reader = new PbfReader(config.getSource().toFile(), config.threads); 35 | final MultiEntitySink sink = new MultiEntitySink(config); 36 | sink.addObserver(new MultiEntitySinkObserver()); 37 | reader.setSink(sink); 38 | reader.run(); 39 | } catch (CmdLineException e) { 40 | System.out.println(e.getMessage()); 41 | System.out.print("Usage: java -jar osm-parquetizer.jar"); 42 | System.out.println(); 43 | cmdLineParser.printSingleLineUsage(System.out); 44 | } 45 | } 46 | 47 | private static class MultiEntitySinkConfig implements MultiEntitySink.Config { 48 | 49 | @Argument(index = 0, metaVar = "pbf-path", usage = "the OSM PBF file to be parquetized", required = true) 50 | private Path source; 51 | 52 | @Argument(index = 1, metaVar = "output-path", usage = "the directory where to store the Parquet files", 53 | required = false) 54 | private Path destinationFolder; 55 | 56 | @Option(name = "--pbf-threads", usage = "if present number of threads for PbfReader") 57 | private int threads = 1; 58 | 59 | @Option(name = "--exclude-metadata", usage = "if present the metadata will not be parquetized") 60 | private boolean excludeMetadata = false; 61 | 62 | @Option(name = "--no-nodes", usage = "if present the nodes will be not parquetized") 63 | private boolean noNodes = false; 64 | 65 | @Option(name = "--no-ways", usage = "if present the ways will be not parquetized") 66 | private boolean noWays = false; 67 | 68 | @Option(name = "--no-relations", usage = "if present the relations will not be parquetized") 69 | private boolean noRelations = false; 70 | 71 | @Override 72 | public boolean 
getExcludeMetadata() { 73 | return this.excludeMetadata; 74 | } 75 | 76 | @Override 77 | public Path getSource() { 78 | return this.source; 79 | } 80 | 81 | @Override 82 | public Path getDestinationFolder() { 83 | return this.destinationFolder != null ? this.destinationFolder : this.source.toAbsolutePath().getParent(); 84 | } 85 | 86 | @Override 87 | public List entitiesToBeParquetized() { 88 | final List entityTypes = new ArrayList<>(); 89 | if (!noNodes) { 90 | entityTypes.add(Node); 91 | } 92 | if (!noWays) { 93 | entityTypes.add(EntityType.Way); 94 | } 95 | if (!noRelations) { 96 | entityTypes.add(Relation); 97 | } 98 | return unmodifiableList(entityTypes); 99 | } 100 | } 101 | 102 | 103 | private static class MultiEntitySinkObserver implements MultiEntitySink.Observer { 104 | 105 | private static final Logger LOGGER = LoggerFactory.getLogger(MultiEntitySinkObserver.class); 106 | 107 | private AtomicLong totalEntitiesCount; 108 | 109 | @Override 110 | public void started() { 111 | totalEntitiesCount = new AtomicLong(); 112 | } 113 | 114 | @Override 115 | public void processed(Entity entity) { 116 | final long count = totalEntitiesCount.incrementAndGet(); 117 | if (count % 1000000 == 0) { 118 | LOGGER.info("Entities processed: " + count); 119 | 120 | } 121 | } 122 | 123 | @Override 124 | public void ended() { 125 | LOGGER.info("Total entities processed: " + totalEntitiesCount.get()); 126 | } 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /src/main/java/io/github/adrianulbona/osm/parquet/MultiEntitySink.java: -------------------------------------------------------------------------------- 1 | package io.github.adrianulbona.osm.parquet; 2 | 3 | import org.openstreetmap.osmosis.core.container.v0_6.EntityContainer; 4 | import org.openstreetmap.osmosis.core.domain.v0_6.Entity; 5 | import org.openstreetmap.osmosis.core.domain.v0_6.EntityType; 6 | import org.openstreetmap.osmosis.core.lifecycle.Closeable; 7 | import org.openstreetmap.osmosis.core.lifecycle.Completable; 8 | import org.openstreetmap.osmosis.core.task.v0_6.Sink; 9 | 10 | import java.nio.file.Path; 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | import java.util.Map; 14 | 15 | import static java.util.stream.Collectors.toList; 16 | 17 | 18 | /** 19 | * Created by adrian.bona on 27/03/16. 
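* A fan-out sink: forwards every entity container to one ParquetSink per entity type selected in the Config, and notifies the registered Observers when processing starts, after each entity and when the stream ends.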
20 | */ 21 | public class MultiEntitySink implements Sink { 22 | 23 | private final List> converters; 24 | 25 | private final List observers; 26 | 27 | public MultiEntitySink(Config config) { 28 | final List entityTypes = config.entitiesToBeParquetized(); 29 | this.converters = entityTypes.stream().map(type -> new ParquetSink<>(config.getSource(), 30 | config.getDestinationFolder(), config.getExcludeMetadata(), type)).collect(toList()); 31 | this.observers = new ArrayList<>(); 32 | } 33 | 34 | @Override 35 | public void process(EntityContainer entityContainer) { 36 | this.converters.forEach(converter -> converter.process(entityContainer)); 37 | this.observers.forEach(o -> o.processed(entityContainer.getEntity())); 38 | } 39 | 40 | @Override 41 | public void initialize(Map metaData) { 42 | this.converters.forEach(converter -> converter.initialize(metaData)); 43 | this.observers.forEach(Observer::started); 44 | } 45 | 46 | @Override 47 | public void complete() { 48 | this.converters.forEach(Completable::complete); 49 | this.observers.forEach(Observer::ended); 50 | } 51 | 52 | @Override 53 | public void close() { 54 | this.converters.forEach(Closeable::close); 55 | } 56 | 57 | public void addObserver(Observer observer) { 58 | this.observers.add(observer); 59 | } 60 | 61 | public void removeObserver(Observer observer) { 62 | this.observers.remove(observer); 63 | } 64 | 65 | public interface Observer { 66 | 67 | void started(); 68 | 69 | void processed(Entity entity); 70 | 71 | void ended(); 72 | } 73 | 74 | 75 | public interface Config { 76 | 77 | Path getSource(); 78 | 79 | Path getDestinationFolder(); 80 | 81 | boolean getExcludeMetadata(); 82 | 83 | List entitiesToBeParquetized(); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/main/java/io/github/adrianulbona/osm/parquet/ParquetSink.java: -------------------------------------------------------------------------------- 1 | package io.github.adrianulbona.osm.parquet; 2 | 3 | import org.apache.parquet.hadoop.ParquetWriter; 4 | import org.openstreetmap.osmosis.core.container.v0_6.EntityContainer; 5 | import org.openstreetmap.osmosis.core.domain.v0_6.Entity; 6 | import org.openstreetmap.osmosis.core.domain.v0_6.EntityType; 7 | import org.openstreetmap.osmosis.core.task.v0_6.Sink; 8 | 9 | import java.io.IOException; 10 | import java.nio.file.Path; 11 | import java.util.ArrayList; 12 | import java.util.List; 13 | import java.util.Map; 14 | import java.util.function.Predicate; 15 | 16 | import static java.lang.String.format; 17 | 18 | 19 | public class ParquetSink implements Sink { 20 | 21 | private final Path source; 22 | private final Path destinationFolder; 23 | private final boolean excludeMetadata; 24 | private final EntityType entityType; 25 | private final List> filters; 26 | 27 | private ParquetWriter writer; 28 | 29 | public ParquetSink(Path source, Path destinationFolder, boolean excludeMetadata, EntityType entityType) { 30 | this.source = source; 31 | this.destinationFolder = destinationFolder; 32 | this.excludeMetadata = excludeMetadata; 33 | this.entityType = entityType; 34 | this.filters = new ArrayList<>(); 35 | } 36 | 37 | @Override 38 | public void initialize(Map metaData) { 39 | final String pbfName = source.getFileName().toString(); 40 | final String entityName = entityType.name().toLowerCase(); 41 | final Path destination = destinationFolder.resolve(format("%s.%s.parquet", pbfName, entityName)); 42 | try { 43 | this.writer = 
ParquetWriterFactory.buildFor(destination.toAbsolutePath().toString(), excludeMetadata, 44 | entityType); 45 | } catch (IOException e) { 46 | throw new RuntimeException("Unable to build writers", e); 47 | } 48 | } 49 | 50 | @Override 51 | public void process(EntityContainer entityContainer) { 52 | try { 53 | if (this.entityType == entityContainer.getEntity().getType()) { 54 | final T entity = (T) entityContainer.getEntity(); 55 | if (filters.stream().noneMatch(filter -> filter.test(entity))) { 56 | writer.write(entity); 57 | } 58 | } 59 | } catch (IOException e) { 60 | throw new RuntimeException("Unable to write entity", e); 61 | } 62 | } 63 | 64 | @Override 65 | public void complete() { 66 | try { 67 | this.writer.close(); 68 | } catch (IOException e) { 69 | throw new RuntimeException("Unable to close writers", e); 70 | } 71 | } 72 | 73 | @Override 74 | public void close() { 75 | 76 | } 77 | 78 | public void addFilter(Predicate predicate) { 79 | this.filters.add(predicate); 80 | } 81 | 82 | public void removeFilter(Predicate predicate) { 83 | this.filters.remove(predicate); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/main/java/io/github/adrianulbona/osm/parquet/ParquetWriterFactory.java: -------------------------------------------------------------------------------- 1 | package io.github.adrianulbona.osm.parquet; 2 | 3 | import io.github.adrianulbona.osm.parquet.convertor.NodeWriteSupport; 4 | import io.github.adrianulbona.osm.parquet.convertor.RelationWriteSupport; 5 | import io.github.adrianulbona.osm.parquet.convertor.WayWriteSupport; 6 | import org.apache.hadoop.conf.Configuration; 7 | import org.apache.hadoop.fs.Path; 8 | import org.apache.parquet.hadoop.ParquetWriter; 9 | import org.apache.parquet.hadoop.api.WriteSupport; 10 | import org.apache.parquet.hadoop.metadata.CompressionCodecName; 11 | import org.openstreetmap.osmosis.core.domain.v0_6.*; 12 | 13 | import java.io.IOException; 14 | 15 | import static org.apache.parquet.hadoop.ParquetFileWriter.Mode.OVERWRITE; 16 | import static org.apache.parquet.hadoop.metadata.CompressionCodecName.SNAPPY; 17 | 18 | 19 | /** 20 | * Created by adrian.bona on 26/03/16. 
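* Builds an entity-specific ParquetWriter (node, way or relation) wired to the matching write support, using SNAPPY compression and overwriting any existing output file.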
21 | */ 22 | public class ParquetWriterFactory { 23 | 24 | private static final CompressionCodecName COMPRESSION = SNAPPY; 25 | 26 | public static ParquetWriter buildFor(String destination, boolean excludeMetadata, 27 | EntityType entityType) throws IOException { 28 | switch (entityType) { 29 | case Node: 30 | return (ParquetWriter) NodesWriterBuilder.standard(destination, excludeMetadata); 31 | case Way: 32 | return (ParquetWriter) WaysWriterBuilder.standard(destination, excludeMetadata); 33 | case Relation: 34 | return (ParquetWriter) RelationsWriterBuilder.standard(destination, excludeMetadata); 35 | default: 36 | throw new RuntimeException("Invalid entity type"); 37 | } 38 | } 39 | 40 | public static class WaysWriterBuilder extends ParquetWriter.Builder { 41 | 42 | private final boolean excludeMetadata; 43 | 44 | protected WaysWriterBuilder(Path file, boolean excludeMetadata) { 45 | super(file); 46 | this.excludeMetadata = excludeMetadata; 47 | } 48 | 49 | @Override 50 | protected WaysWriterBuilder self() { 51 | return this; 52 | } 53 | 54 | @Override 55 | protected WriteSupport getWriteSupport(Configuration conf) { 56 | return new WayWriteSupport(excludeMetadata); 57 | } 58 | 59 | public static ParquetWriter standard(String destination, boolean excludeMetadata) throws IOException { 60 | return new WaysWriterBuilder(new Path(destination), excludeMetadata).self() 61 | .withCompressionCodec(COMPRESSION).withWriteMode(OVERWRITE).build(); 62 | } 63 | } 64 | 65 | 66 | public static class NodesWriterBuilder extends ParquetWriter.Builder { 67 | 68 | private final boolean excludeMetadata; 69 | 70 | protected NodesWriterBuilder(Path file, boolean excludeMetadata) { 71 | super(file); 72 | this.excludeMetadata = excludeMetadata; 73 | } 74 | 75 | @Override 76 | protected NodesWriterBuilder self() { 77 | return this; 78 | } 79 | 80 | @Override 81 | protected WriteSupport getWriteSupport(Configuration conf) { 82 | return new NodeWriteSupport(excludeMetadata); 83 | } 84 | 85 | public static ParquetWriter standard(String destination, boolean excludeMetadata) throws IOException { 86 | return new NodesWriterBuilder(new Path(destination), excludeMetadata).self() 87 | .withCompressionCodec(COMPRESSION).withWriteMode(OVERWRITE).build(); 88 | } 89 | } 90 | 91 | 92 | public static class RelationsWriterBuilder extends ParquetWriter.Builder { 93 | 94 | private final boolean excludeMetadata; 95 | 96 | protected RelationsWriterBuilder(Path file, boolean excludeMetadata) { 97 | super(file); 98 | this.excludeMetadata = excludeMetadata; 99 | } 100 | 101 | @Override 102 | protected RelationsWriterBuilder self() { 103 | return this; 104 | } 105 | 106 | @Override 107 | protected WriteSupport getWriteSupport(Configuration conf) { 108 | return new RelationWriteSupport(excludeMetadata); 109 | } 110 | 111 | public static ParquetWriter standard(String destination, boolean excludeMetadata) throws IOException { 112 | return new RelationsWriterBuilder(new Path(destination), excludeMetadata).self() 113 | .withCompressionCodec(COMPRESSION).withWriteMode(OVERWRITE).build(); 114 | } 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/main/java/io/github/adrianulbona/osm/parquet/convertor/NodeWriteSupport.java: -------------------------------------------------------------------------------- 1 | package io.github.adrianulbona.osm.parquet.convertor; 2 | 3 | import org.apache.parquet.schema.MessageType; 4 | import org.apache.parquet.schema.PrimitiveType; 5 | import 
org.apache.parquet.schema.Type; 6 | import org.openstreetmap.osmosis.core.domain.v0_6.Node; 7 | 8 | import java.util.ArrayList; 9 | import java.util.List; 10 | 11 | import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; 12 | import static org.apache.parquet.schema.Type.Repetition.REQUIRED; 13 | 14 | 15 | /** 16 | * Created by adrian.bona on 26/03/16. 17 | */ 18 | public class NodeWriteSupport extends OsmEntityWriteSupport { 19 | 20 | private final PrimitiveType latType; 21 | private final PrimitiveType longType; 22 | 23 | public NodeWriteSupport(boolean excludeMetadata) { 24 | super(excludeMetadata); 25 | latType = new PrimitiveType(REQUIRED, DOUBLE, "latitude"); 26 | longType = new PrimitiveType(REQUIRED, DOUBLE, "longitude"); 27 | } 28 | 29 | @Override 30 | protected MessageType getSchema() { 31 | final List attributes = new ArrayList<>(getCommonAttributes()); 32 | attributes.add(latType); 33 | attributes.add(longType); 34 | return new MessageType("node", attributes); 35 | } 36 | 37 | @Override 38 | protected void writeSpecificFields(Node record, int nextAvailableIndex) { 39 | recordConsumer.startField(latType.getName(), nextAvailableIndex); 40 | recordConsumer.addDouble(record.getLatitude()); 41 | recordConsumer.endField(latType.getName(), nextAvailableIndex++); 42 | 43 | recordConsumer.startField(longType.getName(), nextAvailableIndex); 44 | recordConsumer.addDouble(record.getLongitude()); 45 | recordConsumer.endField(longType.getName(), nextAvailableIndex); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/io/github/adrianulbona/osm/parquet/convertor/OsmEntityWriteSupport.java: -------------------------------------------------------------------------------- 1 | package io.github.adrianulbona.osm.parquet.convertor; 2 | 3 | import org.apache.hadoop.conf.Configuration; 4 | import org.apache.parquet.hadoop.api.WriteSupport; 5 | import org.apache.parquet.io.api.Binary; 6 | import org.apache.parquet.io.api.RecordConsumer; 7 | import org.apache.parquet.schema.GroupType; 8 | import org.apache.parquet.schema.MessageType; 9 | import org.apache.parquet.schema.PrimitiveType; 10 | import org.apache.parquet.schema.Type; 11 | import org.openstreetmap.osmosis.core.domain.v0_6.Entity; 12 | import org.openstreetmap.osmosis.core.domain.v0_6.Tag; 13 | 14 | import java.util.Collections; 15 | import java.util.LinkedList; 16 | import java.util.List; 17 | 18 | import static java.util.Arrays.asList; 19 | import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; 20 | import static org.apache.parquet.schema.Type.Repetition.*; 21 | 22 | 23 | /** 24 | * Created by adrian.bona on 26/03/16. 
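* Base write support shared by all OSM entity types: defines the common columns (id, the optional metadata fields and the repeated tags group) and delegates the entity-specific columns to subclasses through getSchema() and writeSpecificFields().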
25 | */ 26 | public abstract class OsmEntityWriteSupport extends WriteSupport { 27 | 28 | private final PrimitiveType idType; 29 | private final PrimitiveType versionType; 30 | private final GroupType tags; 31 | private final PrimitiveType tagKeyType; 32 | private final PrimitiveType tagValueType; 33 | private final PrimitiveType timestampType; 34 | private final PrimitiveType changesetType; 35 | private final PrimitiveType uidType; 36 | private final PrimitiveType userSidType; 37 | 38 | private final boolean excludeMetadata; 39 | 40 | protected RecordConsumer recordConsumer; 41 | 42 | public OsmEntityWriteSupport(boolean excludeMetadata) { 43 | idType = new PrimitiveType(REQUIRED, INT64, "id"); 44 | tagKeyType = new PrimitiveType(REQUIRED, BINARY, "key"); 45 | tagValueType = new PrimitiveType(OPTIONAL, BINARY, "value"); 46 | tags = new GroupType(REPEATED, "tags", tagKeyType, tagValueType); 47 | versionType = new PrimitiveType(OPTIONAL, INT32, "version"); 48 | timestampType = new PrimitiveType(OPTIONAL, INT64, "timestamp"); 49 | changesetType = new PrimitiveType(OPTIONAL, INT64, "changeset"); 50 | uidType = new PrimitiveType(OPTIONAL, INT32, "uid"); 51 | userSidType = new PrimitiveType(OPTIONAL, BINARY, "user_sid"); 52 | this.excludeMetadata = excludeMetadata; 53 | } 54 | 55 | protected List getCommonAttributes() { 56 | final List commonAttributes = new LinkedList<>(); 57 | commonAttributes.add(idType); 58 | if (!excludeMetadata) { 59 | commonAttributes.addAll(asList(versionType, timestampType, changesetType, uidType, userSidType)); 60 | } 61 | commonAttributes.add(tags); 62 | return commonAttributes; 63 | } 64 | 65 | @Override 66 | public WriteContext init(Configuration config) { 67 | return new WriteContext(getSchema(), Collections.emptyMap()); 68 | } 69 | 70 | protected abstract MessageType getSchema(); 71 | 72 | @Override 73 | public void prepareForWrite(RecordConsumer recordConsumer) { 74 | this.recordConsumer = recordConsumer; 75 | } 76 | 77 | protected abstract void writeSpecificFields(E record, int nextAvailableIndex); 78 | 79 | public void write(E record) { 80 | int index = 0; 81 | recordConsumer.startMessage(); 82 | recordConsumer.startField(idType.getName(), index); 83 | recordConsumer.addLong(record.getId()); 84 | recordConsumer.endField(idType.getName(), index++); 85 | 86 | if (!excludeMetadata) { 87 | recordConsumer.startField(versionType.getName(), index); 88 | recordConsumer.addInteger(record.getVersion()); 89 | recordConsumer.endField(versionType.getName(), index++); 90 | 91 | recordConsumer.startField(timestampType.getName(), index); 92 | recordConsumer.addLong(record.getTimestamp().getTime()); 93 | recordConsumer.endField(timestampType.getName(), index++); 94 | 95 | recordConsumer.startField(changesetType.getName(), index); 96 | recordConsumer.addLong(record.getChangesetId()); 97 | recordConsumer.endField(changesetType.getName(), index++); 98 | 99 | recordConsumer.startField(uidType.getName(), index); 100 | recordConsumer.addInteger(record.getUser().getId()); 101 | recordConsumer.endField(uidType.getName(), index++); 102 | 103 | recordConsumer.startField(userSidType.getName(), index); 104 | recordConsumer.addBinary(Binary.fromString(record.getUser().getName())); 105 | recordConsumer.endField(userSidType.getName(), index++); 106 | } 107 | 108 | if (!record.getTags().isEmpty()) { 109 | recordConsumer.startField(tags.getName(), index); 110 | for (Tag tag : record.getTags()) { 111 | recordConsumer.startGroup(); 112 | 113 | recordConsumer.startField(tagKeyType.getName(), 0); 
114 | recordConsumer.addBinary(Binary.fromString(tag.getKey())); 115 | recordConsumer.endField(tagKeyType.getName(), 0); 116 | 117 | recordConsumer.startField(tagValueType.getName(), 1); 118 | recordConsumer.addBinary(Binary.fromString(tag.getValue())); 119 | recordConsumer.endField(tagValueType.getName(), 1); 120 | 121 | recordConsumer.endGroup(); 122 | } 123 | recordConsumer.endField(tags.getName(), index); 124 | } 125 | index++; 126 | 127 | writeSpecificFields(record, index); 128 | recordConsumer.endMessage(); 129 | } 130 | } 131 | -------------------------------------------------------------------------------- /src/main/java/io/github/adrianulbona/osm/parquet/convertor/RelationWriteSupport.java: -------------------------------------------------------------------------------- 1 | package io.github.adrianulbona.osm.parquet.convertor; 2 | 3 | import org.apache.parquet.io.api.Binary; 4 | import org.apache.parquet.schema.GroupType; 5 | import org.apache.parquet.schema.MessageType; 6 | import org.apache.parquet.schema.PrimitiveType; 7 | import org.apache.parquet.schema.Type; 8 | import org.openstreetmap.osmosis.core.domain.v0_6.Relation; 9 | 10 | import java.util.ArrayList; 11 | import java.util.List; 12 | 13 | import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; 14 | import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; 15 | import static org.apache.parquet.schema.Type.Repetition.REPEATED; 16 | import static org.apache.parquet.schema.Type.Repetition.REQUIRED; 17 | 18 | 19 | /** 20 | * Created by adrian.bona on 26/03/16. 21 | */ 22 | public class RelationWriteSupport extends OsmEntityWriteSupport { 23 | 24 | private final GroupType membersType; 25 | private final PrimitiveType memberIdType; 26 | private final PrimitiveType memberRoleType; 27 | private final PrimitiveType memberTypeType; 28 | 29 | public RelationWriteSupport(boolean excludeMetadata) { 30 | super(excludeMetadata); 31 | memberIdType = new PrimitiveType(REQUIRED, INT64, "id"); 32 | memberRoleType = new PrimitiveType(REQUIRED, BINARY, "role"); 33 | memberTypeType = new PrimitiveType(REQUIRED, BINARY, "type"); 34 | membersType = new GroupType(REPEATED, "members", memberIdType, memberRoleType, memberTypeType); 35 | } 36 | 37 | @Override 38 | protected MessageType getSchema() { 39 | final List attributes = new ArrayList<>(getCommonAttributes()); 40 | attributes.add(membersType); 41 | return new MessageType("relation", attributes); 42 | } 43 | 44 | @Override 45 | protected void writeSpecificFields(Relation record, int nextAvailableIndex) { 46 | if (!record.getMembers().isEmpty()) { 47 | recordConsumer.startField(membersType.getName(), nextAvailableIndex); 48 | record.getMembers().forEach(member -> { 49 | recordConsumer.startGroup(); 50 | 51 | recordConsumer.startField(memberIdType.getName(), 0); 52 | recordConsumer.addLong(member.getMemberId()); 53 | recordConsumer.endField(memberIdType.getName(), 0); 54 | 55 | recordConsumer.startField(memberRoleType.getName(), 1); 56 | recordConsumer.addBinary(Binary.fromString(member.getMemberRole())); 57 | recordConsumer.endField(memberRoleType.getName(), 1); 58 | 59 | recordConsumer.startField(memberTypeType.getName(), 2); 60 | recordConsumer.addBinary(Binary.fromString(member.getMemberType().name())); 61 | recordConsumer.endField(memberTypeType.getName(), 2); 62 | 63 | recordConsumer.endGroup(); 64 | }); 65 | recordConsumer.endField(membersType.getName(), nextAvailableIndex); 66 | } 67 | } 68 | } 69 | 
-------------------------------------------------------------------------------- /src/main/java/io/github/adrianulbona/osm/parquet/convertor/WayWriteSupport.java: -------------------------------------------------------------------------------- 1 | package io.github.adrianulbona.osm.parquet.convertor; 2 | 3 | import org.apache.parquet.schema.GroupType; 4 | import org.apache.parquet.schema.MessageType; 5 | import org.apache.parquet.schema.PrimitiveType; 6 | import org.apache.parquet.schema.Type; 7 | import org.openstreetmap.osmosis.core.domain.v0_6.Way; 8 | import org.openstreetmap.osmosis.core.domain.v0_6.WayNode; 9 | 10 | import java.util.ArrayList; 11 | import java.util.HashMap; 12 | import java.util.List; 13 | import java.util.Map; 14 | 15 | import static java.util.stream.IntStream.range; 16 | import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; 17 | import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64; 18 | import static org.apache.parquet.schema.Type.Repetition.REPEATED; 19 | import static org.apache.parquet.schema.Type.Repetition.REQUIRED; 20 | 21 | 22 | /** 23 | * Created by adrian.bona on 26/03/16. 24 | */ 25 | public class WayWriteSupport extends OsmEntityWriteSupport { 26 | 27 | private final PrimitiveType nodeIndexType; 28 | private final PrimitiveType nodeIdType; 29 | private final GroupType nodes; 30 | 31 | public WayWriteSupport(boolean excludeMetadata) { 32 | super(excludeMetadata); 33 | nodeIndexType = new PrimitiveType(REQUIRED, INT32, "index"); 34 | nodeIdType = new PrimitiveType(REQUIRED, INT64, "nodeId"); 35 | nodes = new GroupType(REPEATED, "nodes", nodeIndexType, nodeIdType); 36 | } 37 | 38 | @Override 39 | protected MessageType getSchema() { 40 | final List attributes = new ArrayList<>(getCommonAttributes()); 41 | attributes.add(nodes); 42 | return new MessageType("way", attributes); 43 | } 44 | 45 | @Override 46 | protected void writeSpecificFields(Way record, int nextAvailableIndex) { 47 | final List wayNodes = record.getWayNodes(); 48 | final Map indexedNodes = new HashMap<>(); 49 | range(0, wayNodes.size()).forEach(index -> indexedNodes.put(index, wayNodes.get(index).getNodeId())); 50 | 51 | if (!indexedNodes.isEmpty()) { 52 | recordConsumer.startField(nodes.getName(), nextAvailableIndex); 53 | indexedNodes.forEach((index, nodeId) -> { 54 | recordConsumer.startGroup(); 55 | 56 | recordConsumer.startField(nodeIndexType.getName(), 0); 57 | recordConsumer.addInteger(index); 58 | recordConsumer.endField(nodeIndexType.getName(), 0); 59 | 60 | recordConsumer.startField(nodeIdType.getName(), 1); 61 | recordConsumer.addLong(nodeId); 62 | recordConsumer.endField(nodeIdType.getName(), 1); 63 | 64 | recordConsumer.endGroup(); 65 | }); 66 | recordConsumer.endField(nodes.getName(), nextAvailableIndex); 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=INFO, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.out 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n --------------------------------------------------------------------------------
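The converter is normally driven from the command line through `App`, but the same classes can be wired directly in code. The following is only an illustrative sketch, not part of the repository: it assumes `ParquetSink` keeps the entity type as its type parameter (as its use from `MultiEntitySink` suggests) and uses a placeholder input file name. Note that predicates registered via `addFilter` act as exclusions: an entity matching any of them is skipped.

```java
import io.github.adrianulbona.osm.parquet.ParquetSink;
import org.openstreetmap.osmosis.core.domain.v0_6.EntityType;
import org.openstreetmap.osmosis.core.domain.v0_6.Way;
import org.openstreetmap.osmosis.pbf2.v0_6.PbfReader;

import java.nio.file.Path;
import java.nio.file.Paths;

public class WaysOnlyExample {

    public static void main(String[] args) {
        // placeholder input; point this at a real PBF extract
        final Path pbf = Paths.get("some-extract.osm.pbf");

        // write only ways, next to the source file, without the metadata columns
        final ParquetSink<Way> ways = new ParquetSink<>(pbf, pbf.toAbsolutePath().getParent(), true, EntityType.Way);

        // filters are exclusions: a way matching any registered predicate is skipped
        ways.addFilter(way -> way.getTags().isEmpty());

        // the reader drives initialize/process/complete on the sink, exactly as App does
        final PbfReader reader = new PbfReader(pbf.toFile(), 1);
        reader.setSink(ways);
        reader.run();
    }
}
```

Reading the output is out of scope for this project, and Apache Spark is not one of its dependencies; the sketch below simply assumes `spark-sql` is on the classpath and that a `*.node.parquet` file produced by the converter exists in the working directory. The column names follow the node schema listed in the README.

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.explode;

public class ReadNodesExample {

    public static void main(String[] args) {
        final SparkSession spark = SparkSession.builder()
                .appName("osm-parquet-reader")
                .master("local[*]")
                .getOrCreate();

        // the node file produced by the converter; schema as listed in the README
        final Dataset<Row> nodes = spark.read().parquet("romania-latest.osm.pbf.node.parquet");

        // example query: count nodes tagged amenity=drinking_water
        final long fountains = nodes
                .select(col("id"), explode(col("tags")).as("tag"))
                .where(col("tag.key").equalTo("amenity")
                        .and(col("tag.value").equalTo("drinking_water")))
                .count();

        System.out.println("drinking_water nodes: " + fountains);
        spark.stop();
    }
}
```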