├── .gitignore ├── media ├── agol.png ├── map.png ├── sd-1.png ├── sd-2.png ├── sd-3.png ├── sd-4.png ├── sd-5.png ├── sd-6.png ├── import-hex.png ├── services.png ├── share-as.png ├── trips-1M.png ├── import-trips.png ├── web-app-hex.png ├── import-density.png ├── web-app-density.png ├── web-app-trips.png ├── widget-config.png ├── widget-execute.png └── folder-connection.png ├── src ├── main │ └── python │ │ ├── HexCells.lyr │ │ ├── HDFSToolbox.DensityTool.pyt.xml │ │ ├── HDFSToolbox.HexTool.pyt.xml │ │ ├── HDFSToolbox.TripTool.pyt.xml │ │ ├── HDFSToolbox.ExportToHDFSTool.pyt.xml │ │ ├── hexcell.py │ │ ├── mercator.py │ │ ├── GeoCount3.py │ │ ├── GeoCount1.py │ │ ├── GeoCount2.py │ │ ├── hexgrid.py │ │ └── HDFSToolbox.pyt └── test │ └── python │ └── hexgridtest.py ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.iml 3 | *.xml 4 | -------------------------------------------------------------------------------- /media/agol.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/agol.png -------------------------------------------------------------------------------- /media/map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/map.png -------------------------------------------------------------------------------- /media/sd-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/sd-1.png -------------------------------------------------------------------------------- /media/sd-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/sd-2.png -------------------------------------------------------------------------------- /media/sd-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/sd-3.png -------------------------------------------------------------------------------- /media/sd-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/sd-4.png -------------------------------------------------------------------------------- /media/sd-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/sd-5.png -------------------------------------------------------------------------------- /media/sd-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/sd-6.png -------------------------------------------------------------------------------- /media/import-hex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/import-hex.png -------------------------------------------------------------------------------- /media/services.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/services.png -------------------------------------------------------------------------------- 
/media/share-as.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/share-as.png -------------------------------------------------------------------------------- /media/trips-1M.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/trips-1M.png -------------------------------------------------------------------------------- /media/import-trips.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/import-trips.png -------------------------------------------------------------------------------- /media/web-app-hex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/web-app-hex.png -------------------------------------------------------------------------------- /media/import-density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/import-density.png -------------------------------------------------------------------------------- /media/web-app-density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/web-app-density.png -------------------------------------------------------------------------------- /media/web-app-trips.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/web-app-trips.png -------------------------------------------------------------------------------- /media/widget-config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/widget-config.png -------------------------------------------------------------------------------- /media/widget-execute.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/widget-execute.png -------------------------------------------------------------------------------- /media/folder-connection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/folder-connection.png -------------------------------------------------------------------------------- /src/main/python/HexCells.lyr: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/src/main/python/HexCells.lyr -------------------------------------------------------------------------------- /src/main/python/HDFSToolbox.DensityTool.pyt.xml: -------------------------------------------------------------------------------- 1 | 2 | 20160207074229001.0TRUE 3 | -------------------------------------------------------------------------------- /src/main/python/HDFSToolbox.HexTool.pyt.xml: -------------------------------------------------------------------------------- 1 | 2 | 20160207074230001.0TRUE 3 | -------------------------------------------------------------------------------- /src/main/python/HDFSToolbox.TripTool.pyt.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 20160207074228001.0TRUE 3 | -------------------------------------------------------------------------------- /src/main/python/HDFSToolbox.ExportToHDFSTool.pyt.xml: -------------------------------------------------------------------------------- 1 | 2 | 20160207074231001.0TRUE 3 | -------------------------------------------------------------------------------- /src/main/python/hexcell.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | class HexCell: 5 | def __init__(self, size): 6 | self.xy = [] 7 | for i in range(7): 8 | angle = math.pi * ((i % 6) + 0.5) / 3.0 9 | x = size * math.cos(angle) 10 | y = size * math.sin(angle) 11 | self.xy.append((x, y)) 12 | 13 | def to_shape(self, cx, cy): 14 | return [[cx + x, cy + y] for (x, y) in self.xy] 15 | -------------------------------------------------------------------------------- /src/main/python/mercator.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | 4 | class Mercator: 5 | def __init__(self): 6 | pass 7 | 8 | @staticmethod 9 | def to_wgs84(x, y): 10 | rad = 6378137.0 11 | lat = (1.5707963267948966 - (2.0 * math.atan(math.exp((-1.0 * y) / rad)))) * (180 / math.pi) 12 | lon = ((x / rad) * 57.295779513082323) - ( 13 | (math.floor((((x / rad) * 57.295779513082323) + 180.0) / 360.0)) * 360.0) 14 | return lon, lat 15 | 16 | @staticmethod 17 | def to_web_mercator(lon, lat): 18 | rad = 6378137.0 19 | e = lon * 0.017453292519943295 20 | x = rad * e 21 | n = lat * 0.017453292519943295 22 | sin_n = math.sin(n) 23 | y = 3189068.5 * math.log((1.0 + sin_n) / (1.0 - sin_n)) 24 | return x, y 25 | -------------------------------------------------------------------------------- /src/test/python/hexgridtest.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import unittest 4 | 5 | from hexgrid import HexGrid 6 | 7 | 8 | class HexGridTest(unittest.TestCase): 9 | def testHexGrid(self): 10 | hg = HexGrid(0.001) 11 | head, tail = os.path.split(os.path.abspath(__file__)) 12 | head, tail = os.path.split(head) 13 | filename = os.path.join(head, 'resources', 'hex.csv') 14 | with open(filename, 'rb') as csvfile: 15 | reader = csv.reader(csvfile) 16 | for row in reader: 17 | px = float(row[0]) 18 | py = float(row[1]) 19 | er = long(row[2]) 20 | ec = long(row[3]) 21 | rr, rc = hg.xy2rc(px, py) 22 | self.assertEquals(er, rr) 23 | self.assertEquals(ec, rc) 24 | 25 | 26 | if __name__ == '__main__': 27 | unittest.main() 28 | -------------------------------------------------------------------------------- /src/main/python/GeoCount3.py: -------------------------------------------------------------------------------- 1 | # 2 | # Spark job to bin data in WebMercator Spatial Reference 3 | # The bin is a hexagon with a width of 100 meters 4 | # 5 | from pyspark import SparkContext 6 | 7 | from hexgrid import HexGrid 8 | from mercator import Mercator 9 | 10 | 11 | def line_to_row_col(line, hg): 12 | splits = line.split(',') 13 | try: 14 | lon = float(splits[10]) 15 | lat = float(splits[11]) 16 | x, y = Mercator.to_web_mercator(lon, lat) 17 | rc = hg.xy2rc(x, y) 18 | return rc, 1 19 | except: 20 | return (0, 0), -1 21 | 22 | 23 | if __name__ == "__main__": 24 | hg = HexGrid(100) 25 | sc = SparkContext() 26 | sc.textFile("hdfs:///trips"). \ 27 | map(lambda line: line_to_row_col(line, hg)). 
\ 28 | filter(lambda (rowcol, count): count > 0). \ 29 | reduceByKey(lambda a, b: a + b). \ 30 | filter(lambda (rowcol, count): count > 10). \ 31 | map(lambda ((row, col), count): "{0},{1},{2}".format(row, col, count)). \ 32 | saveAsTextFile("hdfs:///tmp/hex") 33 | -------------------------------------------------------------------------------- /src/main/python/GeoCount1.py: -------------------------------------------------------------------------------- 1 | # 2 | # Spark job to bin data in WGS84 Spatial Reference 3 | # The bin size is 0.001 degrees 4 | # 5 | import math 6 | 7 | from pyspark import SparkContext 8 | 9 | 10 | def line_to_row_col(line): 11 | splits = line.split(',') 12 | try: 13 | p_lon = float(splits[10]) 14 | p_lat = float(splits[11]) 15 | c = int(math.floor(p_lon / 0.001)) 16 | r = int(math.floor(p_lat / 0.001)) 17 | return (r, c), 1 18 | except: 19 | return (0, 0), -1 20 | 21 | 22 | def row_col_to_xy(row, col, count): 23 | y = row * 0.001 + 0.0005 24 | x = col * 0.001 + 0.0005 25 | return "{0},{1},{2}".format(x, y, count) 26 | 27 | 28 | if __name__ == "__main__": 29 | sc = SparkContext() 30 | sc.textFile("hdfs:///trips"). \ 31 | map(lambda line: line_to_row_col(line)). \ 32 | filter(lambda (rowcol, count): count > 0). \ 33 | reduceByKey(lambda a, b: a + b). \ 34 | filter(lambda (rowcol, count): count > 2). \ 35 | map(lambda ((row, col), count): row_col_to_xy(row, col, count)). \ 36 | saveAsTextFile("hdfs:///tmp/rowcol") 37 | -------------------------------------------------------------------------------- /src/main/python/GeoCount2.py: -------------------------------------------------------------------------------- 1 | # 2 | # Spark job to bin data in WebMercator Spatial Reference 3 | # The bin size is 100 meters 4 | # 5 | import math 6 | 7 | from pyspark import SparkContext 8 | 9 | from mercator import Mercator 10 | 11 | 12 | def line_to_row_col(line): 13 | splits = line.split(',') 14 | try: 15 | lon = float(splits[10]) 16 | lat = float(splits[11]) 17 | x, y = Mercator.to_web_mercator(lon, lat) 18 | c = int(math.floor(x / 100)) 19 | r = int(math.floor(y / 100)) 20 | return (r, c), 1 21 | except: 22 | return (0, 0), -1 23 | 24 | 25 | def row_col_to_xy(row, col, count): 26 | y = row * 100 + 50 27 | x = col * 100 + 50 28 | return "{0},{1},{2}".format(x, y, count) 29 | 30 | 31 | if __name__ == "__main__": 32 | sc = SparkContext() 33 | sc.textFile("hdfs:///trips"). \ 34 | map(lambda line: line_to_row_col(line)). \ 35 | filter(lambda (rowcol, count): count > 0). \ 36 | reduceByKey(lambda a, b: a + b). \ 37 | filter(lambda (rowcol, count): count > 2). \ 38 | map(lambda ((row, col), count): row_col_to_xy(row, col, count)). 
\
    saveAsTextFile("hdfs:///tmp/rowcol")
--------------------------------------------------------------------------------
/src/main/python/hexgrid.py:
--------------------------------------------------------------------------------
import math


class HexGrid:
    """Pointy-top hexagonal grid; 'size' is the hexagon circumradius."""

    def __init__(self, size=100):
        self.two_pi = 2.0 * math.pi
        self.rad_to_deg = 180.0 / math.pi
        self.size = size
        self.h = self.size * math.cos(30.0 * math.pi / 180.0)  # half the cell width
        self.v = self.size * 0.5  # half the edge length
        self.skip_x = 2.0 * self.h  # horizontal distance between cell centers
        self.skip_y = 3.0 * self.v  # vertical distance between cell rows

    def rc2xy(self, r, c):
        # Odd rows are offset by half a cell width.
        ofs = self.h if r % 2L != 0 else 0
        x = c * self.skip_x + ofs
        y = r * self.skip_y
        return x, y

    def inside(self, px, py, cx, cy):
        # True if the point (px, py) falls within the hexagon centered at (cx, cy).
        qx = math.fabs(px - cx)
        qy = math.fabs(py - cy)
        return False if qx > self.h or qy > self.size else qx / self.h + qy / self.v <= 2.0

    def azimuth_to_degrees(self, px, py, cx, cy):
        az = math.atan2(px - cx, py - cy)  # reversed on purpose
        return (az + self.two_pi) * self.rad_to_deg if az < 0.0 else az * self.rad_to_deg

    def proceed_to_neighbor(self, px, py, cx, cy, old_r, old_c):
        # Step one cell toward the point, based on the sextant it falls in.
        deg = self.azimuth_to_degrees(px, py, cx, cy)
        if deg > 300.0:
            c = old_c if old_r % 2L != 0L else old_c - 1L
            r = old_r + 1L
        elif deg > 240.0:
            r = old_r
            c = old_c - 1L
        elif deg > 180.0:
            c = old_c - 1L if old_r % 2L != 0L else old_c
            r = old_r - 1L
        elif deg > 120.0:
            c = old_c + 1L if old_r % 2L != 0L else old_c
            r = old_r - 1L
        elif deg > 60.0:
            r = old_r
            c = old_c + 1L
        else:
            c = old_c + 1L if old_r % 2L != 0L else old_c
            r = old_r + 1L
        return r, c

    def xy2rc(self, px, py):
        # Start from a first-guess cell and walk from neighbor to neighbor
        # until the point falls inside the candidate hexagon.
        r = long(math.floor(py / self.skip_y))
        c = long(math.floor(px / self.skip_x))
        cx, cy = self.rc2xy(r, c)
        while not self.inside(px, py, cx, cy):
            r, c = self.proceed_to_neighbor(px, py, cx, cy, r, c)
            cx, cy = self.rc2xy(r, c)
        return r, c
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.
      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted.
      If You institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty.
      Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "{}"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2016 Mansour Raad

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# [(Web)Mapping Elephants with Sparks](http://thunderheadxpler.blogspot.com/2016/02/webmapping-elephants-with-sparks.html)

This project is composed of tools written in Python to process and display a set of point data in CSV format from [HDFS](https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html).
The processing is based on [Spark](http://spark.apache.org/), and the display on a desktop or on a web client is based on [ArcPy](http://pro.arcgis.com/en/pro-app/arcpy/get-started/what-is-arcpy-.htm).

![](media/map.png)

## Getting Started

* Create an [ArcGIS Online](https://www.arcgis.com/home/) account to publish your results as WebMaps and WebApps.

* You need access to an HDFS instance. For local testing, you can download either the [Hortonworks](http://hortonworks.com/products/hortonworks-sandbox/) or the [Cloudera](http://www.cloudera.com/downloads/quickstart_vms/5-5.html) sandbox.
Of course, you can "cloud" it. Cloudera released a [Docker](https://www.docker.com/)-based [image](https://blog.cloudera.com/blog/2015/12/docker-is-the-new-quickstart-option-for-apache-hadoop-and-cloudera/), and [Hortonworks teamed up with Azure](http://hortonworks.com/blog/hortonworks-sandbox-with-hdp-2-3-is-now-available-on-microsoft-azure-gallery/) to enable you to take HDP for a spin.

## Test Data

We will use the famous [NYC Taxi trips](http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml) as our test set.
I love that data set, as it contains two spatial and temporal elements (pickup and drop-off locations, dates and times) and additional trip attributes such as the number of passengers and the trip time.
A 1-million-trip subset can be downloaded from [here](https://dl.dropboxusercontent.com/u/2193160/trips-1M.csv.7z) and placed in your HDFS instance as follows:

```bash
sudo -u hdfs hadoop fs -mkdir /trips
sudo -u hdfs hadoop fs -chown root:hdfs /trips
sudo -u hdfs hadoop fs -chmod a+rw /trips

yum -y install wget p7zip
wget https://dl.dropboxusercontent.com/u/2193160/trips-1M.csv.7z
7za e -so trips-1M.csv.7z | hadoop fs -put - /trips/trips-1M.csv
```
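For orientation, here is a sketch of how a single trip record is picked apart; the sample row is fabricated, but the field indexes (5 = pickup datetime, 7 = passengers, 10 = pickup longitude, 11 = pickup latitude) are the ones the toolbox and the Spark jobs below rely on:

```python
# A fabricated trip record; only the fields at the indexes below matter here.
line = "id0,id1,VTS,0,0,2013-01-01 00:00:00,2013-01-01 00:10:00,1,600,1.2,-73.98,40.75"
tokens = line.split(",")
pickup_datetime = tokens[5]  # "2013-01-01 00:00:00"
passengers = int(tokens[7])  # 1
lon = float(tokens[10])      # -73.98
lat = float(tokens[11])      # 40.75
```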
## Mapping Prerequisites

Access to HDFS from a remote node is performed through the [Requests](http://docs.python-requests.org/en/master/) Python module using the [WebHDFS REST API](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html).

To use the [Requests](http://docs.python-requests.org/en/master/) module from [ArcGIS Desktop](http://www.esri.com/software/arcgis/arcgis-for-desktop), make sure that your system environment variable `PATH` contains the `ArcGIS` folder and its `Scripts` sub-folder. For example:

```
C:\Python27\ArcGIS10.3;C:\Python27\ArcGIS10.3\Scripts
```

Start a new `CMD` window as Administrator and execute:

```
pip2.7 install requests
```

To use the [Requests](http://docs.python-requests.org/en/master/) module from [ArcGIS Server](http://www.esri.com/software/arcgis/arcgisserver), make sure that your system environment variable `PATH` contains the `ArcGIS` folder and its `Scripts` sub-folder. For example:

```
C:\Python27\ArcGISx6410.3;C:\Python27\ArcGISx6410.3\Scripts
```

* Stop `ArcGIS Server` from the `Services` interface.

* Start a new `CMD` window as Administrator and execute:

```
pip2.7 install requests
```

* Modify your `C:\Windows\System32\drivers\etc\hosts` file to contain the Hadoop hostname. For example:

```
192.168.1.25 sandbox.hortonworks.com mraad.cloudapp.net cloudera.quickstart sandbox
```

* Start `ArcGIS Server` from the `Services` interface.

The `hosts` modification is **very** important. When you want to [read the content of a file](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html#Open_and_Read_a_File) in HDFS, you first connect to the NameNode and request the location of the desired file.
The NameNode responds with the file's URL on a **DataNode**, to which you then connect to read the streaming data from the open connection.
That response URL typically contains the **name** of the DataNode, not its IP. The above `hosts` modification enables host name resolution when using the sandbox or a cloud instance.
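As an illustration of that two-step exchange, here is a minimal sketch with [Requests](http://docs.python-requests.org/en/master/), assuming the `sandbox` host, the `root` user, and the trip file loaded above (the toolbox below wraps the same calls in its `WebHDFS` class and lets Requests follow the redirect automatically):

```python
import requests

# Step 1: ask the NameNode to open the file; hold the redirect so we can
# see the DataNode URL it hands back.
url = "http://sandbox:50070/webhdfs/v1/trips/trips-1M.csv"
res = requests.get(url, params={"op": "OPEN", "user.name": "root"},
                   allow_redirects=False)
location = res.headers["Location"]  # contains the DataNode *host name*

# Step 2: read the stream from the DataNode itself; this is the request
# that fails when the DataNode name cannot be resolved via the hosts file.
data = requests.get(location, stream=True)
for line in data.iter_lines():
    pass  # process each CSV record
data.close()
```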
## Mapping The Data

To view the trips on a map, we will extend [Desktop](http://www.esri.com/software/arcgis/arcgis-for-desktop) with an ArcPy-based toolbox that creates an ephemeral in-memory feature class.

Create a folder connection to this project's [git clone](https://git-scm.com/docs/git-clone) location to access the toolbox:

![](media/folder-connection.png)

The `Import Trips` tool reads the content of the trips CSV file from HDFS and parses each row for lat/lon values at specific field indexes to create an in-memory feature class from the result.

![](media/import-trips.png)

The tool performs a local filter on _all_ the retrieved rows, so that only the trips originating in an area around Manhattan are displayed on the map, resulting in about 981,000 point features as pickup locations.

![](media/trips-1M.png)

## Share As GeoProcessing

The following are the steps to publish the `Import Tool` and the other included tools as GeoProcessing services on an ArcGIS Server instance, such that they can be consumed from a web application using the ArcGIS [REST endpoint](http://server.arcgis.com/en/server/latest/publish-services/linux/introduction-to-geoprocessing-rest-services.htm).

* Edit the toolbox and set all the `in_memory` variables to `False`. This saves the resulting feature class in a scratch geodatabase workspace rather than in the in-memory workspace. ArcGIS Server provides a runtime reference to a scratch geodatabase by reading the value of `arcpy.env.scratchGDB`. BTW, this results in a noticeable performance degradation: on my machines, the process went from 45 sec to 1 min 24 sec. A solution (that I have yet to implement) is to save the features to an in-memory workspace and then perform a [copy features](http://help.arcgis.com/EN/ARCGISDESKTOP/10.0/HELP/index.html#//001700000035000000) to the scratch geodatabase (see the sketch after this list).
* Run the tool, and from the `Results` window, select `Geoprocessing Service` from the `Share As` menu item.

![](media/share-as.png)

* **DO NOT** publish a service. Save a service definition file instead. In addition, make sure to **explicitly** set the `Description` on all the items, even if a description already exists; that is, explicitly type something into the description field (add a dot at the end of the text if one already exists).
* Make sure to save the service definition to the `C:\temp` folder, or to another folder with a very short path.
* Check the option to view the results with a map service.
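A minimal sketch of that deferred improvement, assuming a hypothetical `HexCells` output name; the features are built in the `in_memory` workspace first and then bulk-copied once into the scratch geodatabase:

```python
import os

import arcpy

name = "HexCells"  # hypothetical feature class name
tmp_fc = "in_memory/" + name
# ... create tmp_fc and insert all the features into it, as the tools do ...

# One bulk copy into the scratch workspace that ArcGIS Server publishes.
out_fc = os.path.join(arcpy.env.scratchGDB, name)
if arcpy.Exists(out_fc):
    arcpy.management.Delete(out_fc)
arcpy.management.CopyFeatures(tmp_fc, out_fc)
```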
![](media/sd-1.png)
![](media/sd-2.png)
![](media/sd-3.png)
![](media/sd-4.png)
![](media/sd-5.png)
![](media/sd-6.png)

* Publish the service definition using the ArcGIS Server Web Manager.

![](media/services.png)

## Web Visualizing

The easiest way to view the result on the web is to use the ArcGIS Online [WebApp Builder](http://www.esri.com/software/web-appbuilder).

![](media/agol.png)

Locate the published GeoProcessing URL and use it to add a Geoprocessing widget to the application header.

![](media/widget-config.png)
![](media/widget-execute.png)
![](media/web-app-trips.png)

## Data Aggregation

We have been displaying about 1 million points, and that has been relatively tolerable.
But what if we have billions of points or millions of files to scan in HDFS? Then this becomes an "issue".
Though some folks advocate that they _need_ to see millions of dots on a map to highlight the absence of information _as_ information, the following aggregation should assist in that matter too.

The simplest aggregation that we will undertake is binning.
Imagine a virtual fishnet that is cast over the point space. All the points that fall into the same fishnet cell are aggregated together.
What is returned is the set of populated fishnet cells and their associated aggregates.
This tremendously reduces the size of the data to be visualized, in what I term "Turning BigData into WebGIS Data": basically, something digestible by a web client using ArcGIS[1](#f1).

The binning will be performed using [Spark](http://spark.apache.org/).

## Running the Spark Jobs

```bash
hadoop fs -rm -r -skipTrash /tmp/rowcol
spark-submit\
    --master yarn\
    --executor-memory 512m\
    --num-executors 1\
    GeoCount1.py
```

The above job aggregates the point data based on square fishnet cells.
The cell size is hardcoded to 0.001 degrees (exercise for the reader: make the cell size a program argument; hint: use `sys.argv`; one possible sketch follows below), and the output is emitted to HDFS in the `/tmp/rowcol` folder.
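One possible take on that exercise (a sketch, not code from this repository): read the cell size from `sys.argv` with a default, and pass the value on the `spark-submit` command line after the script name:

```python
import sys

# Cell size as an optional program argument, defaulting to 0.001 degrees:
#   spark-submit ... GeoCount1.py 0.002
cell_size = float(sys.argv[1]) if len(sys.argv) > 1 else 0.001
```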
```bash
hadoop fs -rm -r -skipTrash /tmp/rowcol
spark-submit\
    --master yarn\
    --executor-memory 512m\
    --num-executors 1\
    --py-files mercator.py\
    GeoCount2.py
```

The above job aggregates the point data based on square fishnet cells.
Unlike the previous job, the point coordinates are parsed and converted from WGS84 to WebMercator.
The cell size is 100 meters, and the output is emitted to HDFS in the `/tmp/rowcol` folder.

```bash
hadoop fs -rm -r -skipTrash /tmp/hex
spark-submit\
    --master yarn\
    --executor-memory 512m\
    --num-executors 1\
    --py-files mercator.py,hexgrid.py\
    GeoCount3.py
```

The above job aggregates the point data based on hexagonal fishnet cells.
The point coordinates are converted to WebMercator and aggregated into 100-meter hexagonal cells, and the output is emitted to HDFS in the `/tmp/hex` folder.

## Web Visualizing Aggregated Results

To view the content of the files in `hdfs:///tmp/rowcol`, use the `ImportPointDensityTool`:

![](media/import-density.png)

Execute the tool and use `Share As` to publish the result as a `Geoprocessing Service`.
Locate the Geoprocessing tool URL, and use it to add a new Geoprocessing widget to your application.

![](media/web-app-density.png)

To view the content of the files in `hdfs:///tmp/hex`, use the `ImportHexTool`:

![](media/import-hex.png)

Execute the tool and use `Share As` to publish the result as a `Geoprocessing Service`.
Locate the Geoprocessing tool URL, and use it to add a new Geoprocessing widget to your application.

![](media/web-app-hex.png)

### Cloudera Docker Notes

Create a "machine" labeled `quickstart` using [VirtualBox](https://www.virtualbox.org/wiki/VirtualBox) with 4 cores, 8GB of memory, and 20GB of disk space.

```bash
docker-machine create\
    --driver virtualbox\
    --virtualbox-cpu-count 4\
    --virtualbox-memory 8192\
    --virtualbox-disk-size 20480\
    --virtualbox-no-vtx-check\
    quickstart
```

Upgrade the machine to the latest Docker instance.

```bash
docker-machine upgrade quickstart
```

Run the CDH image; this will start Zookeeper, HDFS, WebHDFS, YARN, Spark, and other daemons.
Note the `-v` option to share the VirtualBox-created `/Users` shared folder and mount it as `/Users` in the container.
In addition, all kinds of ports are opened up (4040 for Spark, 8888 for HUE, 50070 for the NameNode, 50075 for the DataNode) so they can be accessed from the "outside".
The machine IP can be retrieved using `docker-machine ip quickstart`.

```bash
eval $(docker-machine env quickstart)
docker run\
    --rm=true\
    --privileged=true\
    --hostname=quickstart.cloudera\
    -v /Users:/Users\
    -p 4040:4040\
    -p 7180:7180\
    -p 8088:8088\
    -p 8042:8042\
    -p 8888:8888\
    -p 50060:50060\
    -p 50070:50070\
    -p 50075:50075\
    -p 21050:21050\
    -t -i cloudera/quickstart:latest\
    /usr/bin/docker-quickstart
```

\[1\] I've been experimenting with WebGL and the new [4.0 JavaScript API for ArcGIS](https://developers.arcgis.com/javascript/beta/), and we can now render a tremendous amount of data in a web browser (as long as it can be quickly transferred from the server); I will have to post about that. [↩](#a1)
--------------------------------------------------------------------------------
/src/main/python/HDFSToolbox.pyt:
--------------------------------------------------------------------------------
import math
import os
import re
import sys

import arcpy
import requests


class HexGrid:
    """Pointy-top hexagonal grid; 'size' is the hexagon circumradius."""

    def __init__(self, size=100):
        self.two_pi = 2.0 * math.pi
        self.rad_to_deg = 180.0 / math.pi
        self.size = size
        self.h = self.size * math.cos(30.0 * math.pi / 180.0)  # half the cell width
        self.v = self.size * 0.5  # half the edge length
        self.skip_x = 2.0 * self.h  # horizontal distance between cell centers
        self.skip_y = 3.0 * self.v  # vertical distance between cell rows

    def rc2xy(self, r, c):
        # Odd rows are offset by half a cell width.
        ofs = self.h if r % 2L != 0 else 0
        x = c * self.skip_x + ofs
        y = r * self.skip_y
        return x, y

    def inside(self, px, py, cx, cy):
        # True if the point (px, py) falls within the hexagon centered at (cx, cy).
        qx = math.fabs(px - cx)
        qy = math.fabs(py - cy)
        return False if qx > self.h or qy > self.size else qx / self.h + qy / self.v <= 2.0

    def azimuth_to_degrees(self, px, py, cx, cy):
        az = math.atan2(px - cx, py - cy)  # reversed on purpose
        return (az + self.two_pi) * self.rad_to_deg if az < 0.0 else az * self.rad_to_deg

    def proceed_to_neighbor(self, px, py, cx, cy, old_r, old_c):
        # Step one cell toward the point, based on the sextant it falls in.
        deg = self.azimuth_to_degrees(px, py, cx, cy)
        if deg > 300.0:
            c = old_c if old_r % 2L != 0L else old_c - 1L
            r = old_r + 1L
        elif deg > 240.0:
            r = old_r
            c = old_c - 1L
        elif deg > 180.0:
            c = old_c - 1L if old_r % 2L != 0L else old_c
            r = old_r - 1L
        elif deg > 120.0:
            c = old_c + 1L if old_r % 2L != 0L else old_c
            r = old_r - 1L
        elif deg > 60.0:
            r = old_r
            c = old_c + 1L
        else:
            c = old_c + 1L if old_r % 2L != 0L else old_c
            r = old_r + 1L
        return r, c

    def xy2rc(self, px, py):
        # Start from a first-guess cell and walk from neighbor to neighbor
        # until the point falls inside the candidate hexagon.
        r = long(math.floor(py / self.skip_y))
        c = long(math.floor(px / self.skip_x))
        cx, cy = self.rc2xy(r, c)
        while not self.inside(px, py, cx, cy):
            r, c = self.proceed_to_neighbor(px, py, cx, cy, r, c)
            cx, cy = self.rc2xy(r, c)
        return r, c


class HexCell:
    def __init__(self, size):
        # Pre-compute the 7 vertices (closed ring) of a hexagon centered at the origin.
        self.xy = []
        for i in range(7):
            angle = math.pi * ((i % 6) + 0.5) / 3.0
            x = size * math.cos(angle)
            y = size * math.sin(angle)
            self.xy.append((x, y))

    def to_shape(self, cx, cy):
        # Translate the template vertices to the given cell center.
        return [[cx + x, cy + y] for (x, y) in self.xy]


class WebHDFSRes(object):
    def __init__(self, res):
        self.res = res

    def __enter__(self):
        return self.res

    def __exit__(self, exception_type, exception_value, traceback):
        self.res.close()


class WebHDFS(object):
    def __init__(self, host, user, port=50070):
        self.host = host
        self.port = port
        self.user = user

    def create_file(self, fc, sep, hdfs_path):
        # Export a feature class to a single HDFS file: one feature per line,
        # attribute values first and the shape in WKT format last.
        description = arcpy.Describe(fc)
        field_names = [field.name for field in description.fields]
        shape_name = description.shapeFieldName
        field_names.remove(shape_name)
        field_names.append(shape_name + "@WKT")

        sep = "\t" if sep == "tab" else sep[0]

        result = arcpy.management.GetCount(fc)
        max_range = int(result.getOutput(0))
        arcpy.SetProgressor("step", "Exporting...", 0, max_range, 1)

        def gen_data():
            # Generator that yields the rows in 5000-record chunks so the
            # HTTP PUT below can stream them to HDFS.
            with arcpy.da.SearchCursor(fc, field_names) as cursor:
                arr = []
                cnt = 0
                inc = 0
                for row in cursor:
                    cnt += 1
                    inc += 1
                    arr.append(sep.join([str(r) for r in row]))
                    if inc == 5000:
                        inc = 0
                        arr.append("")
                        yield "\n".join(arr)
                        arr = []
                        arcpy.SetProgressorPosition(cnt)
                if len(arr) > 0:
                    arr.append("")
                    yield "\n".join(arr)

        # Two-step WebHDFS CREATE: the NameNode redirects (307) to a DataNode
        # location, and the actual PUT streams gen_data() to that location.
        params = {"op": "CREATE", "user.name": self.user, "buffersize": 1024 * 1024}
        url = "http://{}:{}/webhdfs/v1{}".format(self.host, self.port, hdfs_path)
        with WebHDFSRes(requests.put(url, params=params, allow_redirects=False)) as resLoc:
            if resLoc.status_code == 307:
                location = resLoc.headers["Location"]
                with WebHDFSRes(requests.put(location, data=gen_data())) as resPut:
                    if resPut.status_code != 201:
                        arcpy.AddError("Cannot put feature class. Error code = {}".format(resPut.status_code))
            else:
                arcpy.AddError("Cannot get HDFS location. Error code = {}".format(resLoc.status_code))

        arcpy.ResetProgressor()
        return

    def open(self, hdfs_path, offset=-1, length=-1, buffer_size=-1):
        # arcpy.AddMessage(hdfs_path)
        params = {"op": "OPEN", "user.name": self.user}
        if offset > 0:
            params["offset"] = offset
        if length > 0:
            params["length"] = length
        if buffer_size > 0:
            params["buffersize"] = buffer_size
        url = "http://{}:{}/webhdfs/v1{}".format(self.host, self.port, hdfs_path)
        return WebHDFSRes(requests.get(url, params=params, stream=True))

    def list_status(self, hdfs_path, suffix_re="*"):
        files = []
        prog = re.compile(suffix_re)
        params = {"op": "LISTSTATUS", "user.name": self.user}
        url = "http://{}:{}/webhdfs/v1{}".format(self.host, self.port, hdfs_path)
        with WebHDFSRes(requests.get(url, params=params)) as res:
            doc = res.json()
            for i in doc["FileStatuses"]["FileStatus"]:
                path_suffix = i["pathSuffix"]
                if prog.match(path_suffix):
                    files.append("{}/{}".format(hdfs_path, path_suffix))
        return files


class Toolbox(object):
    def __init__(self):
        self.label = "WebHDFSToolbox"
        self.alias = "Web HDFS Toolbox"
        self.tools = [TripTool, DensityTool, HexTool, ExportToHDFSTool]


class HexTool(object):
    def __init__(self):
        self.label = "ImportHexCells"
        self.description = "WebHDFS tool to import hex row,col,population"
        self.canRunInBackground = True

    def getParameterInfo(self):
        param_fc = arcpy.Parameter(
            name="out_fc",
            displayName="out_fc",
            direction="Output",
            datatype="Feature Layer",
            parameterType="Derived")
        head, tail = os.path.split(os.path.abspath(__file__))
        param_fc.symbology = os.path.join(head, "HexCells.lyr")

        param_name = arcpy.Parameter(
            name="in_name",
            displayName="Name",
            direction="Input",
            datatype="GPString",
            parameterType="Required")
        param_name.value = "HexCells"

        param_host = arcpy.Parameter(
            name="in_host",
            displayName="HDFS Host",
            direction="Input",
            datatype="GPString",
            parameterType="Required")
        param_host.value = "sandbox"

        param_user = arcpy.Parameter(
            name="in_user",
            displayName="User name",
            direction="Input",
            datatype="GPString",
            parameterType="Required")
        param_user.value = "root"

        param_path = arcpy.Parameter(
            name="in_path",
            displayName="HDFS Path",
            direction="Input",
            datatype="GPString",
            parameterType="Required")
        param_path.value = "/tmp/hex"

        param_file =
arcpy.Parameter( 225 | name="in_file", 226 | displayName="HDFS File(s)", 227 | direction="Input", 228 | datatype="GPString", 229 | parameterType="Required") 230 | param_file.value = "part.*" 231 | 232 | param_spref = arcpy.Parameter(name="in_spref", 233 | displayName="Spatial Reference", 234 | direction="Input", 235 | datatype="GPSpatialReference", 236 | parameterType="Required") 237 | 238 | param_size = arcpy.Parameter(name="in_size", 239 | displayName="Hex Size", 240 | direction="Input", 241 | datatype="GPLong", 242 | parameterType="Required") 243 | param_size.value = 100 244 | 245 | return [param_fc, param_name, param_host, param_user, param_path, param_file, param_spref, param_size] 246 | 247 | def isLicensed(self): 248 | return True 249 | 250 | def updateParameters(self, parameters): 251 | return 252 | 253 | def updateMessages(self, parameters): 254 | return 255 | 256 | def execute(self, parameters, messages): 257 | name = parameters[1].value 258 | host = parameters[2].value 259 | user = parameters[3].value 260 | path = parameters[4].value 261 | fext = parameters[5].value 262 | sref = parameters[6].value 263 | size = parameters[7].value 264 | 265 | in_memory = False 266 | if in_memory: 267 | ws = "in_memory" 268 | fc = ws + "/" + name 269 | else: 270 | fc = os.path.join(arcpy.env.scratchGDB, name) 271 | ws = os.path.dirname(fc) 272 | 273 | if arcpy.Exists(fc): 274 | arcpy.management.Delete(fc) 275 | 276 | arcpy.management.CreateFeatureclass(ws, name, "POLYGON", spatial_reference=sref) 277 | arcpy.management.AddField(fc, "POPULATION", "LONG") 278 | 279 | with arcpy.da.InsertCursor(fc, ["SHAPE@", "POPULATION"]) as cursor: 280 | webhdfs = WebHDFS(host, user) 281 | for path in webhdfs.list_status(path, fext): 282 | with webhdfs.open(hdfs_path=path, buffer_size=1024 * 1024) as res: 283 | hex_grid = HexGrid(size=size) 284 | hex_cell = HexCell(size=size) 285 | for line in res.iter_lines(chunk_size=1024 * 1024): 286 | row_txt, col_txt, pop_txt = line.split(",") 287 | row = float(row_txt) 288 | col = float(col_txt) 289 | pop = long(pop_txt) 290 | x, y = hex_grid.rc2xy(row, col) 291 | cursor.insertRow((hex_cell.to_shape(x, y), pop)) 292 | 293 | parameters[0].value = fc 294 | return 295 | 296 | 297 | class DensityTool(object): 298 | def __init__(self): 299 | self.label = "ImportPointDensity" 300 | self.description = "WebHDFS tool to import x,y,population" 301 | self.canRunInBackground = True 302 | 303 | def getParameterInfo(self): 304 | param_fc = arcpy.Parameter( 305 | name="out_fc", 306 | displayName="out_fc", 307 | direction="Output", 308 | datatype="Feature Layer", 309 | parameterType="Derived") 310 | 311 | param_name = arcpy.Parameter( 312 | name="in_name", 313 | displayName="Name", 314 | direction="Input", 315 | datatype="GPString", 316 | parameterType="Required") 317 | param_name.value = "DensityPoints" 318 | 319 | param_host = arcpy.Parameter( 320 | name="in_host", 321 | displayName="HDFS Host", 322 | direction="Input", 323 | datatype="GPString", 324 | parameterType="Required") 325 | param_host.value = "sandbox" 326 | 327 | param_user = arcpy.Parameter( 328 | name="in_user", 329 | displayName="User name", 330 | direction="Input", 331 | datatype="GPString", 332 | parameterType="Required") 333 | param_user.value = "root" 334 | 335 | param_path = arcpy.Parameter( 336 | name="in_path", 337 | displayName="HDFS Path", 338 | direction="Input", 339 | datatype="GPString", 340 | parameterType="Required") 341 | param_path.value = "/tmp/rowcol" 342 | 343 | param_file = arcpy.Parameter( 344 | 
name="in_file", 345 | displayName="HDFS File(s)", 346 | direction="Input", 347 | datatype="GPString", 348 | parameterType="Required") 349 | param_file.value = "part.*" 350 | 351 | param_spref = arcpy.Parameter(name="in_spref", 352 | displayName="Spatial Reference", 353 | direction="Input", 354 | datatype="GPSpatialReference", 355 | parameterType="Required") 356 | 357 | return [param_fc, param_name, param_host, param_user, param_path, param_file, param_spref] 358 | 359 | def isLicensed(self): 360 | return True 361 | 362 | def updateParameters(self, parameters): 363 | return 364 | 365 | def updateMessages(self, parameters): 366 | return 367 | 368 | def execute(self, parameters, messages): 369 | name = parameters[1].value 370 | host = parameters[2].value 371 | user = parameters[3].value 372 | path = parameters[4].value 373 | fext = parameters[5].value 374 | sref = parameters[6].value 375 | 376 | in_memory = False 377 | if in_memory: 378 | ws = "in_memory" 379 | fc = ws + "/" + name 380 | else: 381 | fc = os.path.join(arcpy.env.scratchGDB, name) 382 | ws = os.path.dirname(fc) 383 | 384 | if arcpy.Exists(fc): 385 | arcpy.management.Delete(fc) 386 | 387 | arcpy.management.CreateFeatureclass(ws, name, "POINT", spatial_reference=sref) 388 | arcpy.management.AddField(fc, "POPULATION", "LONG") 389 | 390 | with arcpy.da.InsertCursor(fc, ["SHAPE@XY", "POPULATION"]) as cursor: 391 | webhdfs = WebHDFS(host, user) 392 | for path in webhdfs.list_status(path, fext): 393 | with webhdfs.open(hdfs_path=path, buffer_size=1024 * 1024) as res: 394 | for line in res.iter_lines(chunk_size=1024 * 1024): 395 | lon_txt, lat_txt, pop_txt = line.split(",") 396 | lon = float(lon_txt) 397 | lat = float(lat_txt) 398 | pop = long(pop_txt) 399 | cursor.insertRow(((lon, lat), pop)) 400 | 401 | parameters[0].value = fc 402 | return 403 | 404 | 405 | class TripTool(object): 406 | def __init__(self): 407 | self.label = "ImportTrips" 408 | self.description = "WebHDFS tool to import trips" 409 | self.canRunInBackground = True 410 | 411 | def getParameterInfo(self): 412 | param_fc = arcpy.Parameter( 413 | name="out_fc", 414 | displayName="out_fc", 415 | direction="Output", 416 | datatype="Feature Layer", 417 | parameterType="Derived") 418 | 419 | param_name = arcpy.Parameter( 420 | name="in_name", 421 | displayName="Name", 422 | direction="Input", 423 | datatype="GPString", 424 | parameterType="Required") 425 | param_name.value = "PickupPoints" 426 | 427 | param_host = arcpy.Parameter( 428 | name="in_host", 429 | displayName="HDFS Host", 430 | direction="Input", 431 | datatype="GPString", 432 | parameterType="Required") 433 | param_host.value = "sandbox" 434 | 435 | param_user = arcpy.Parameter( 436 | name="in_user", 437 | displayName="User name", 438 | direction="Input", 439 | datatype="GPString", 440 | parameterType="Required") 441 | param_user.value = "root" 442 | 443 | param_path = arcpy.Parameter( 444 | name="in_path", 445 | displayName="HDFS Path", 446 | direction="Input", 447 | datatype="GPString", 448 | parameterType="Required") 449 | param_path.value = "/trips" 450 | 451 | param_file = arcpy.Parameter( 452 | name="in_file", 453 | displayName="HDFS File(s)", 454 | direction="Input", 455 | datatype="GPString", 456 | parameterType="Required") 457 | param_file.value = "trips.*" 458 | 459 | return [param_fc, param_name, param_host, param_user, param_path, param_file] 460 | 461 | def isLicensed(self): 462 | return True 463 | 464 | def updateParameters(self, parameters): 465 | return 466 | 467 | def updateMessages(self, 
parameters): 468 | return 469 | 470 | def execute(self, parameters, messages): 471 | reload(sys) 472 | sys.setdefaultencoding("utf8") 473 | 474 | name = parameters[1].value 475 | host = parameters[2].value 476 | user = parameters[3].value 477 | path = parameters[4].value 478 | fext = parameters[5].value 479 | 480 | in_memory = False 481 | if in_memory: 482 | ws = "in_memory" 483 | fc = ws + "/" + name 484 | else: 485 | fc = os.path.join(arcpy.env.scratchGDB, name) 486 | ws = os.path.dirname(fc) 487 | 488 | if arcpy.Exists(fc): 489 | arcpy.management.Delete(fc) 490 | 491 | sp_ref = arcpy.SpatialReference(4326) 492 | arcpy.management.CreateFeatureclass(ws, name, "POINT", spatial_reference=sp_ref) 493 | arcpy.management.AddField(fc, "DATETIME", "TEXT", field_length=32) 494 | arcpy.management.AddField(fc, "PASSENGERS", "LONG") 495 | 496 | with arcpy.da.InsertCursor(fc, ["SHAPE@XY", "DATETIME", "PASSENGERS"]) as cursor: 497 | webhdfs = WebHDFS(host, user) 498 | for path in webhdfs.list_status(path, fext): 499 | arcpy.AddMessage(path) 500 | with webhdfs.open(hdfs_path=path, buffer_size=1024 * 1024) as res: 501 | line_no = 0 502 | for line in res.iter_lines(chunk_size=1024 * 1024): 503 | line_no += 1 504 | if line_no > 1: 505 | tokens = line.split(",") 506 | if len(tokens) > 11: 507 | datetime = tokens[5] 508 | passengers = int(tokens[7]) 509 | lon = float(tokens[10]) 510 | lat = float(tokens[11]) 511 | if -74.255 < lon < -73.608 and 40.618 < lat < 40.937: 512 | cursor.insertRow(((lon, lat), datetime, passengers)) 513 | 514 | parameters[0].value = fc 515 | return 516 | 517 | 518 | class ExportToHDFSTool(object): 519 | def __init__(self): 520 | self.label = "ExportToHDFS" 521 | self.description = """ 522 | Export a feature class to HDFS in text format, where each feature is a row terminated by a line feed 523 | and each feature attribute is terminated by a tab. The shape of the feature is stored in WKT format. 
524 | """ 525 | self.canRunInBackground = True 526 | 527 | def getParameterInfo(self): 528 | param_fc = arcpy.Parameter(name="in_fc", 529 | displayName="Input Feature Class", 530 | direction="Input", 531 | datatype="Table View", 532 | parameterType="Required") 533 | 534 | param_sep = arcpy.Parameter(name="in_sep", 535 | displayName="Output Field Separator", 536 | direction="Input", 537 | datatype="String", 538 | parameterType="Required") 539 | param_sep.value = "tab" 540 | 541 | param_host = arcpy.Parameter(name="in_host", 542 | displayName="HDFS Host", 543 | direction="Input", 544 | datatype="String", 545 | parameterType="Required") 546 | param_host.value = "sandbox" 547 | 548 | param_user = arcpy.Parameter(name="in_user", 549 | displayName="HDFS User", 550 | direction="Input", 551 | datatype="String", 552 | parameterType="Required") 553 | param_user.value = "root" 554 | 555 | param_path = arcpy.Parameter(name="in_path", 556 | displayName="Output HDFS Path", 557 | direction="Input", 558 | datatype="String", 559 | parameterType="Required") 560 | param_path.value = "/user/root" 561 | 562 | return [param_fc, param_sep, param_host, param_user, param_path] 563 | 564 | def isLicensed(self): 565 | return True 566 | 567 | def updateParameters(self, parameters): 568 | return 569 | 570 | def updateMessages(self, parameters): 571 | return 572 | 573 | def execute(self, parameters, messages): 574 | reload(sys) 575 | sys.setdefaultencoding("utf8") 576 | 577 | fc = parameters[0].valueAsText 578 | sep = parameters[1].valueAsText 579 | hdfs_host = parameters[2].valueAsText 580 | hdfs_user = parameters[3].valueAsText 581 | hdfs_path = parameters[4].valueAsText 582 | 583 | webhdfs = WebHDFS(hdfs_host, hdfs_user) 584 | webhdfs.create_file(fc, sep, hdfs_path) 585 | return 586 | --------------------------------------------------------------------------------
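As a reference for the `ExportToHDFSTool` output described above, a single exported feature would look roughly like the following line (fabricated values; attribute fields first, separated by tabs, with the shape last in WKT format):

```
1	2013-01-01 00:00:00	1	POINT (-73.98 40.75)
```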