├── .gitignore
├── media
│   ├── agol.png
│   ├── map.png
│   ├── sd-1.png
│   ├── sd-2.png
│   ├── sd-3.png
│   ├── sd-4.png
│   ├── sd-5.png
│   ├── sd-6.png
│   ├── import-hex.png
│   ├── services.png
│   ├── share-as.png
│   ├── trips-1M.png
│   ├── import-trips.png
│   ├── web-app-hex.png
│   ├── import-density.png
│   ├── web-app-density.png
│   ├── web-app-trips.png
│   ├── widget-config.png
│   ├── widget-execute.png
│   └── folder-connection.png
├── src
│   ├── main
│   │   └── python
│   │       ├── HexCells.lyr
│   │       ├── HDFSToolbox.DensityTool.pyt.xml
│   │       ├── HDFSToolbox.HexTool.pyt.xml
│   │       ├── HDFSToolbox.TripTool.pyt.xml
│   │       ├── HDFSToolbox.ExportToHDFSTool.pyt.xml
│   │       ├── hexcell.py
│   │       ├── mercator.py
│   │       ├── GeoCount3.py
│   │       ├── GeoCount1.py
│   │       ├── GeoCount2.py
│   │       ├── hexgrid.py
│   │       └── HDFSToolbox.pyt
│   └── test
│       └── python
│           └── hexgridtest.py
├── LICENSE
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | *.iml
3 | *.xml
4 |
--------------------------------------------------------------------------------
/media/agol.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/agol.png
--------------------------------------------------------------------------------
/media/map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/map.png
--------------------------------------------------------------------------------
/media/sd-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/sd-1.png
--------------------------------------------------------------------------------
/media/sd-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/sd-2.png
--------------------------------------------------------------------------------
/media/sd-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/sd-3.png
--------------------------------------------------------------------------------
/media/sd-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/sd-4.png
--------------------------------------------------------------------------------
/media/sd-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/sd-5.png
--------------------------------------------------------------------------------
/media/sd-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/sd-6.png
--------------------------------------------------------------------------------
/media/import-hex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/import-hex.png
--------------------------------------------------------------------------------
/media/services.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/services.png
--------------------------------------------------------------------------------
/media/share-as.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/share-as.png
--------------------------------------------------------------------------------
/media/trips-1M.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/trips-1M.png
--------------------------------------------------------------------------------
/media/import-trips.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/import-trips.png
--------------------------------------------------------------------------------
/media/web-app-hex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/web-app-hex.png
--------------------------------------------------------------------------------
/media/import-density.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/import-density.png
--------------------------------------------------------------------------------
/media/web-app-density.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/web-app-density.png
--------------------------------------------------------------------------------
/media/web-app-trips.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/web-app-trips.png
--------------------------------------------------------------------------------
/media/widget-config.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/widget-config.png
--------------------------------------------------------------------------------
/media/widget-execute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/widget-execute.png
--------------------------------------------------------------------------------
/media/folder-connection.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/media/folder-connection.png
--------------------------------------------------------------------------------
/src/main/python/HexCells.lyr:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mraad/hdfs-geohex/HEAD/src/main/python/HexCells.lyr
--------------------------------------------------------------------------------
/src/main/python/HDFSToolbox.DensityTool.pyt.xml:
--------------------------------------------------------------------------------
1 |
2 | 20160207074229001.0TRUE
3 |
--------------------------------------------------------------------------------
/src/main/python/HDFSToolbox.HexTool.pyt.xml:
--------------------------------------------------------------------------------
1 |
2 | 20160207074230001.0TRUE
3 |
--------------------------------------------------------------------------------
/src/main/python/HDFSToolbox.TripTool.pyt.xml:
--------------------------------------------------------------------------------
1 |
2 | 20160207074228001.0TRUE
3 |
--------------------------------------------------------------------------------
/src/main/python/HDFSToolbox.ExportToHDFSTool.pyt.xml:
--------------------------------------------------------------------------------
1 |
2 | 20160207074231001.0TRUE
3 |
--------------------------------------------------------------------------------
/src/main/python/hexcell.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 |
4 | class HexCell:
5 | def __init__(self, size):
6 | self.xy = []
7 | for i in range(7):
8 | angle = math.pi * ((i % 6) + 0.5) / 3.0
9 | x = size * math.cos(angle)
10 | y = size * math.sin(angle)
11 | self.xy.append((x, y))
12 |
13 | def to_shape(self, cx, cy):
14 | return [[cx + x, cy + y] for (x, y) in self.xy]
15 |
--------------------------------------------------------------------------------
/src/main/python/mercator.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 |
4 | class Mercator:
5 | def __init__(self):
6 | pass
7 |
8 | @staticmethod
9 | def to_wgs84(x, y):
10 | rad = 6378137.0
11 | lat = (1.5707963267948966 - (2.0 * math.atan(math.exp((-1.0 * y) / rad)))) * (180 / math.pi)
12 | lon = ((x / rad) * 57.295779513082323) - (
13 | (math.floor((((x / rad) * 57.295779513082323) + 180.0) / 360.0)) * 360.0)
14 | return lon, lat
15 |
16 | @staticmethod
17 | def to_web_mercator(lon, lat):
18 | rad = 6378137.0
19 | e = lon * 0.017453292519943295
20 | x = rad * e
21 | n = lat * 0.017453292519943295
22 | sin_n = math.sin(n)
23 | y = 3189068.5 * math.log((1.0 + sin_n) / (1.0 - sin_n))
24 | return x, y
25 |
--------------------------------------------------------------------------------
/src/test/python/hexgridtest.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import os
3 | import unittest
4 |
5 | from hexgrid import HexGrid
6 |
7 |
8 | class HexGridTest(unittest.TestCase):
9 | def testHexGrid(self):
10 | hg = HexGrid(0.001)
11 | head, tail = os.path.split(os.path.abspath(__file__))
12 | head, tail = os.path.split(head)
13 | filename = os.path.join(head, 'resources', 'hex.csv')
14 | with open(filename, 'rb') as csvfile:
15 | reader = csv.reader(csvfile)
16 | for row in reader:
17 | px = float(row[0])
18 | py = float(row[1])
19 | er = long(row[2])
20 | ec = long(row[3])
21 | rr, rc = hg.xy2rc(px, py)
22 | self.assertEqual(er, rr)
23 | self.assertEqual(ec, rc)
24 |
25 |
26 | if __name__ == '__main__':
27 | unittest.main()
28 |
--------------------------------------------------------------------------------
/src/main/python/GeoCount3.py:
--------------------------------------------------------------------------------
1 | #
2 | # Spark job to bin data in WebMercator Spatial Reference
3 | # The bin is a hexagon with a width of 100 meters
4 | #
5 | from pyspark import SparkContext
6 |
7 | from hexgrid import HexGrid
8 | from mercator import Mercator
9 |
10 |
11 | def line_to_row_col(line, hg):
12 | splits = line.split(',')
13 | try:
14 | lon = float(splits[10])
15 | lat = float(splits[11])
16 | x, y = Mercator.to_web_mercator(lon, lat)
17 | rc = hg.xy2rc(x, y)
18 | return rc, 1
19 | except:
20 | return (0, 0), -1
21 |
22 |
23 | if __name__ == "__main__":
24 | hg = HexGrid(100)
25 | sc = SparkContext()
26 | sc.textFile("hdfs:///trips"). \
27 | map(lambda line: line_to_row_col(line, hg)). \
28 | filter(lambda (rowcol, count): count > 0). \
29 | reduceByKey(lambda a, b: a + b). \
30 | filter(lambda (rowcol, count): count > 10). \
31 | map(lambda ((row, col), count): "{0},{1},{2}".format(row, col, count)). \
32 | saveAsTextFile("hdfs:///tmp/hex")
33 |
--------------------------------------------------------------------------------
/src/main/python/GeoCount1.py:
--------------------------------------------------------------------------------
1 | #
2 | # Spark job to bin data in WGS84 Spatial Reference
3 | # The bin size is 0.001 degrees
4 | #
5 | import math
6 |
7 | from pyspark import SparkContext
8 |
9 |
10 | def line_to_row_col(line):
11 | splits = line.split(',')
12 | try:
13 | p_lon = float(splits[10])
14 | p_lat = float(splits[11])
15 | c = int(math.floor(p_lon / 0.001))
16 | r = int(math.floor(p_lat / 0.001))
17 | return (r, c), 1
18 | except:
19 | return (0, 0), -1
20 |
21 |
22 | def row_col_to_xy(row, col, count):
23 | y = row * 0.001 + 0.0005
24 | x = col * 0.001 + 0.0005
25 | return "{0},{1},{2}".format(x, y, count)
26 |
27 |
28 | if __name__ == "__main__":
29 | sc = SparkContext()
30 | sc.textFile("hdfs:///trips"). \
31 | map(lambda line: line_to_row_col(line)). \
32 | filter(lambda (rowcol, count): count > 0). \
33 | reduceByKey(lambda a, b: a + b). \
34 | filter(lambda (rowcol, count): count > 2). \
35 | map(lambda ((row, col), count): row_col_to_xy(row, col, count)). \
36 | saveAsTextFile("hdfs:///tmp/rowcol")
37 |
--------------------------------------------------------------------------------
/src/main/python/GeoCount2.py:
--------------------------------------------------------------------------------
1 | #
2 | # Spark job to bin data in WebMercator Spatial Reference
3 | # The bin size is 100 meters
4 | #
5 | import math
6 |
7 | from pyspark import SparkContext
8 |
9 | from mercator import Mercator
10 |
11 |
12 | def line_to_row_col(line):
13 | splits = line.split(',')
14 | try:
15 | lon = float(splits[10])
16 | lat = float(splits[11])
17 | x, y = Mercator.to_web_mercator(lon, lat)
18 | c = int(math.floor(x / 100))
19 | r = int(math.floor(y / 100))
20 | return (r, c), 1
21 | except:
22 | return (0, 0), -1
23 |
24 |
25 | def row_col_to_xy(row, col, count):
26 | y = row * 100 + 50
27 | x = col * 100 + 50
28 | return "{0},{1},{2}".format(x, y, count)
29 |
30 |
31 | if __name__ == "__main__":
32 | sc = SparkContext()
33 | sc.textFile("hdfs:///trips"). \
34 | map(lambda line: line_to_row_col(line)). \
35 | filter(lambda (rowcol, count): count > 0). \
36 | reduceByKey(lambda a, b: a + b). \
37 | filter(lambda (rowcol, count): count > 2). \
38 | map(lambda ((row, col), count): row_col_to_xy(row, col, count)). \
39 | saveAsTextFile("hdfs:///tmp/rowcol")
40 |
--------------------------------------------------------------------------------
/src/main/python/hexgrid.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 |
4 | class HexGrid:
5 | def __init__(self, size=100):
6 | self.two_pi = 2.0 * math.pi
7 | self.rad_to_deg = 180.0 / math.pi
8 | self.size = size
9 | self.h = self.size * math.cos(30.0 * math.pi / 180.0)
10 | self.v = self.size * 0.5
11 | self.skip_x = 2.0 * self.h
12 | self.skip_y = 3.0 * self.v
13 |
14 | def rc2xy(self, r, c):
15 | ofs = self.h if r % 2L != 0 else 0
16 | x = c * self.skip_x + ofs
17 | y = r * self.skip_y
18 | return x, y
19 |
20 | def inside(self, px, py, cx, cy):
21 | qx = math.fabs(px - cx)
22 | qy = math.fabs(py - cy)
23 | return False if qx > self.h or qy > self.size else qx / self.h + qy / self.v <= 2.0
24 |
25 | def azimuth_to_degrees(self, px, py, cx, cy):
26 | az = math.atan2(px - cx, py - cy) # reversed on purpose
27 | return az + self.two_pi * self.rad_to_deg if az < 0.0 else az * self.rad_to_deg
28 |
29 | def proceed_to_neighbor(self, px, py, cx, cy, old_r, old_c):
30 | deg = self.azimuth_to_degrees(px, py, cx, cy)
31 | if deg > 300.0:
32 | c = old_c if old_r % 2L != 0L else old_c - 1L
33 | r = old_r + 1L
34 | elif deg > 240.0:
35 | r = old_r
36 | c = old_c - 1L
37 | elif deg > 180.0:
38 | c = old_c - 1L if old_r % 2L != 0L else old_c
39 | r = old_r - 1L
40 | elif deg > 120.0:
41 | c = old_c + 1L if old_r % 2L != 0L else old_c
42 | r = old_r - 1L
43 | elif deg > 60.0:
44 | r = old_r
45 | c = old_c + 1L
46 | else:
47 | c = old_c + 1L if old_r % 2L != 0L else old_c
48 | r = old_r + 1L
49 | return r, c
50 |
51 | def xy2rc(self, px, py):
52 | r = long(math.floor(py / self.skip_y))
53 | c = long(math.floor(px / self.skip_x))
54 | cx, cy = self.rc2xy(r, c)
55 | while not self.inside(px, py, cx, cy):
56 | r, c = self.proceed_to_neighbor(px, py, cx, cy, r, c)
57 | cx, cy = self.rc2xy(r, c)
58 | return r, c
59 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2016 Mansour Raad
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # [(Web)Mapping Elephants with Sparks](http://thunderheadxpler.blogspot.com/2016/02/webmapping-elephants-with-sparks.html)
2 |
3 | This project is composed of tools written in Python to process and display a set of point data in CSV format from [HDFS](https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html).
4 | The processing is based on [Spark](http://spark.apache.org/), and the display, on the desktop or in a web client, is based on [ArcPy](http://pro.arcgis.com/en/pro-app/arcpy/get-started/what-is-arcpy-.htm).
5 |
6 | 
7 |
8 | ## Getting Started
9 |
10 | * Create an [ArcGIS Online](https://www.arcgis.com/home/) account to publish your results as WebMaps and WebApps.
11 |
12 | * You need access to an HDFS instance. For local testing, you can download either the [Hortonworks](http://hortonworks.com/products/hortonworks-sandbox/) or the [Cloudera](http://www.cloudera.com/downloads/quickstart_vms/5-5.html) sandbox.
13 | Of course, you can "cloud" it: Cloudera released a [Docker](https://www.docker.com/)-based [image](https://blog.cloudera.com/blog/2015/12/docker-is-the-new-quickstart-option-for-apache-hadoop-and-cloudera/), and [Hortonworks teamed up with Azure](http://hortonworks.com/blog/hortonworks-sandbox-with-hdp-2-3-is-now-available-on-microsoft-azure-gallery/) to enable you to take HDP for a spin.
14 |
15 | ## Test Data
16 |
17 | We will use the famous [NYC Taxi trips](http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml) as our test set.
18 | I love that data set, as it contains two spatial and two temporal elements (pickup and drop-off locations, dates, and times) plus additional trip attributes such as the number of passengers and the trip time.
19 | A one-million-trip subset can be downloaded from [here](https://dl.dropboxusercontent.com/u/2193160/trips-1M.csv.7z) and placed in your HDFS instance as follows:
20 |
21 | ```bash
22 | sudo -u hdfs hadoop fs -mkdir /trips
23 | sudo -u hdfs hadoop fs -chown root:hdfs /trips
24 | sudo -u hdfs hadoop fs -chmod a+rw /trips
25 |
26 | yum -y install wget p7zip
27 | wget https://dl.dropboxusercontent.com/u/2193160/trips-1M.csv.7z
28 | 7za e -so trips-1M.csv.7z | hadoop fs -put - /trips/trips-1M.csv
29 | ```
30 |
31 | ## Mapping Prerequisites
32 |
33 | Access to HDFS from a remote node is performed through the [Requests](http://docs.python-requests.org/en/master/) Python module using the [WebHDFS REST API](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html).
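
For example, reading a file boils down to a single streaming `GET` against the NameNode. A minimal sketch, assuming the sandbox host, user, and trips path used elsewhere in this project:

```python
import requests

# NameNode endpoint; host, port, user, and path are assumptions matching
# the sandbox defaults used by the toolbox in this project.
url = "http://sandbox:50070/webhdfs/v1/trips/trips-1M.csv"
params = {"op": "OPEN", "user.name": "root"}

# Requests follows the NameNode's 307 redirect to a DataNode automatically
# and streams the file content back.
res = requests.get(url, params=params, stream=True)
for line in res.iter_lines():
    print(line)  # peek at the first record as a smoke test
    break
res.close()
```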
34 |
35 | To use the [Requests](http://docs.python-requests.org/en/master/) module from [ArcGIS Desktop](http://www.esri.com/software/arcgis/arcgis-for-desktop), make sure that your system environment variable `PATH` contains the `ArcGIS` folder and its `Scripts` sub-folder. For example:
36 |
37 | ```
38 | C:\Python27\ArcGIS10.3;C:\Python27\ArcGIS10.3\Scripts
39 | ```
40 |
41 | Start a new `CMD` window as Administrator and execute:
42 |
43 | ```
44 | pip2.7 install requests
45 | ```
46 |
47 | To use the [Requests](http://docs.python-requests.org/en/master/) module from [ArcGIS Server](http://www.esri.com/software/arcgis/arcgisserver), make sure that your system environment variable `PATH` contains the `ArcGIS` folder and its `Scripts` sub-folder. For example:
48 |
49 | ```
50 | C:\Python27\ArcGISx6410.3;C:\Python27\ArcGISx6410.3\Scripts
51 | ```
52 |
53 | * Stop `ArcGIS Server` from the `Services` interface.
54 |
55 | * Start a new `CMD` window as Administrator and execute:
56 |
57 | ```
58 | pip2.7 install requests
59 | ```
60 |
61 | * Modify your `C:\Windows\System32\drivers\etc\hosts` file to contain the Hadoop hostname. For example:
62 |
63 | ```
64 | 192.168.1.25 sandbox.hortonworks.com mraad.cloudapp.net cloudera.quickstart sandbox
65 | ```
66 |
67 | * Start `ArcGIS Server` from the `Services` interface.
68 |
69 | The `hosts` modification is **very** important. When you want to [read the content of a file](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html#Open_and_Read_a_File) in HDFS, you first connect to the NameNode and request the location of the desired file.
70 | The NameNode responds with the file's URL on a **DataNode**, to which you then connect and from which you read the streaming data.
71 | That URL typically contains the **name** of the DataNode, not its IP address. The above `hosts` modification enables host-name resolution when using the sandbox or a cloud instance.
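
To see the redirection for yourself, you can ask the NameNode for a file's location without following the redirect; the `Location` header names the DataNode by host name. A minimal sketch, under the same host and user assumptions as above:

```python
import requests

url = "http://sandbox:50070/webhdfs/v1/trips/trips-1M.csv"
params = {"op": "OPEN", "user.name": "root"}

# Ask the NameNode where the file lives, but do not follow the redirect.
res = requests.get(url, params=params, allow_redirects=False)
print(res.status_code)          # expect 307 (temporary redirect)
print(res.headers["Location"])  # a DataNode *host name*; it must resolve
res.close()
```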
72 |
73 | ## Mapping The Data
74 |
75 | To view the trips on a map, we will extend [Desktop](http://www.esri.com/software/arcgis/arcgis-for-desktop) with an ArcPy based toolbox to create an ephemeral in-memory feature class.
76 |
77 | Create a folder connection to this project [git clone](https://git-scm.com/docs/git-clone) to access the toolbox:
78 |
79 | 
80 |
81 | The `Import Trips` tool reads the content of the trips CSV file from HDFS and parses each row for lat/lon values at specific field indexes to create an in-memory feature class of the result.
82 |
83 | 
84 |
85 | The tool performs a local filter on _all_ the retrieved rows, so that only the trips originating in an area around Manhattan are displayed on the map, resulting in about 981,000 point features as pickup locations.
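
The filter itself is a simple bounding-box test on the parsed coordinates. The sketch below mirrors the check in `HDFSToolbox.pyt` (in this CSV layout, field indexes 10 and 11 hold the pickup longitude and latitude):

```python
def keep_trip(line):
    # Keep only trips picked up in a box around Manhattan,
    # as the ImportTrips tool does.
    tokens = line.split(",")
    if len(tokens) <= 11:
        return False
    try:
        lon = float(tokens[10])  # pickup longitude
        lat = float(tokens[11])  # pickup latitude
    except ValueError:           # header row or malformed record
        return False
    return -74.255 < lon < -73.608 and 40.618 < lat < 40.937
```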
86 |
87 | 
88 |
89 | ## Share As GeoProcessing
90 |
91 | The following are the steps to publish the `Import Tool` and the other included tools as geoprocessing services on an ArcGIS Server instance, such that they can be consumed from a web application using the ArcGIS [REST endpoint](http://server.arcgis.com/en/server/latest/publish-services/linux/introduction-to-geoprocessing-rest-services.htm).
92 |
93 | * Edit the toolbox and set all the `in_memory` variables to `False`. This will save the resulting feature class in a scratch geodatabase workspace rather than in the in-memory workspace. ArcGIS Server provides a runtime reference to a scratch geodatabase by reading the value of `arcpy.env.scratchGDB`. BTW, this results in a perceptible performance degradation. On my machines, the process went from 45 sec to 1 min 24 sec. A solution (that I have yet to implement) is to save the features to an in-memory workspace and then perform a [copy features](http://help.arcgis.com/EN/ARCGISDESKTOP/10.0/HELP/index.html#//001700000035000000) to the scratch geodatabase; a sketch of that idea follows the screenshots below.
94 | * Run the tool, and from the `Results` window, select `Geoprocessing Service` from the `Share As` menu item.
95 |
96 | 
97 |
98 | * **DO NOT** publish a service. Save a service definition file instead. In addition, make sure to **explicitly** set the `Description` for all the items, even if a description already exists. What I mean by this is that you have to explicitly type something into the description field (type a dot at the end of the text if one exists).
99 | * Make sure to save the service definition to the `C:\temp` folder or a folder with a very short path.
100 | * Check the option to view the results with a map service.
101 |
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 
108 |
109 | * Publish the service definition using the ArcGIS Server Web Manager.
110 |
111 | 
112 |
113 | ## Web Visualizing
114 |
115 | The easiest way to view the result on the web is to use the ArcGIS Online [WebApp Builder](http://www.esri.com/software/web-appbuilder).
116 |
117 | 
118 |
119 | Locate the published geoprocessing service URL and use it to add a Geoprocessing widget to the application header.
120 |
121 | 
122 | 
123 | 
124 |
125 | ## Data Aggregation
126 |
127 | We have been displaying about 1 million points, and that has been relatively tolerable.
128 | But if we have billions of points or millions of files to scan in HDFS, this becomes an "issue".
129 | Though some folks advocate that they _need_ to see millions of dots on a map to highlight the absence of information _as_ information, the following aggregation should assist in that matter too.
130 |
131 | The simplest aggregation that we will undertake is binning.
132 | Imagine a virtual fishnet that is cast over the point space. All the points that fall into the same fishnet cell are aggregated together.
133 | What is returned is the set of populated fishnet cells and their associated aggregates.
134 | This tremendously reduces the size of the data to be visualized, in what I term "Turning BigData into WebGIS Data": basically, something digestible by a web client using ArcGIS[1](#f1).
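
In code, the fishnet key is just a floor division of each coordinate by the cell size, exactly as `GeoCount1.py` does it. A small worked sketch:

```python
import math

def bin_key(lon, lat, size=0.001):
    # Map a point to the (row, col) of the fishnet cell containing it,
    # matching the hardcoded 0.001-degree cell in GeoCount1.py.
    return int(math.floor(lat / size)), int(math.floor(lon / size))

# A pickup near Times Square (hypothetical sample coordinates):
print(bin_key(-73.9857, 40.7588))  # (40758, -73986)
```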
135 |
136 | The binning processing will be performed using [Spark](http://spark.apache.org/).
137 |
138 | ## Running the Spark Jobs
139 |
140 | ```bash
141 | hadoop fs -rm -r -skipTrash /tmp/rowcol
142 | spark-submit \
143 | --master yarn \
144 | --executor-memory 512m \
145 | --num-executors 1 \
146 | GeoCount1.py
147 | ```
148 |
149 | The above job aggregates the point data based on square fishnet cells.
150 | The cell size is hardcoded to 0.001 degrees (an exercise for the reader: make the cell size a program argument; hint: use `sys.argv`, as in the sketch below), and the output is emitted to HDFS in the `/tmp/rowcol` folder.
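
One way to do the exercise, as a hedged sketch: read the cell size from the first program argument and fall back to the hardcoded default.

```python
import sys

# Hypothetical: accept the cell size on the command line,
# e.g. `spark-submit GeoCount1.py 0.005`, defaulting to 0.001 degrees.
cell_size = float(sys.argv[1]) if len(sys.argv) > 1 else 0.001
```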
151 |
152 | ```bash
153 | hadoop fs -rm -r -skipTrash /tmp/rowcol
154 | spark-submit \
155 | --master yarn \
156 | --executor-memory 512m \
157 | --num-executors 1 \
158 | --py-files mercator.py \
159 | GeoCount2.py
160 | ```
161 |
162 | The above job also aggregates the point data based on square fishnet cells.
163 | Unlike the previous job, the point coordinates are parsed and converted from WGS84 to WebMercator.
164 | The cell size is 100 meters, and the output is emitted to HDFS in the `/tmp/rowcol` folder.
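
The conversion is the `Mercator.to_web_mercator` static method shipped with this project; for example (the sample coordinates are hypothetical):

```python
import math

from mercator import Mercator

# Project a WGS84 pickup location to WebMercator meters before binning.
x, y = Mercator.to_web_mercator(-73.9857, 40.7588)

# The same 100-meter square binning GeoCount2.py applies afterwards.
col = int(math.floor(x / 100))
row = int(math.floor(y / 100))
```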
165 |
166 | ```bash
167 | hadoop fs -rm -r -skipTrash /tmp/hex
168 | spark-submit \
169 | --master yarn \
170 | --executor-memory 512m \
171 | --num-executors 1 \
172 | --py-files mercator.py,hexgrid.py \
173 | GeoCount3.py
174 | ```
175 |
176 | The above job aggregates the point data based on hexagonal fishnet cells.
177 | The point coordinates are converted to WebMercator and aggregated into 100-meter hexagonal cells, and the output is emitted to HDFS in the `/tmp/hex` folder.
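
`hexgrid.py` assigns each point to a hexagon by first guessing a cell from the row/column spacing and then walking to the correct neighbor; `rc2xy` recovers the cell center for rendering. A quick round trip (the sample coordinates are hypothetical):

```python
from hexgrid import HexGrid
from mercator import Mercator

hg = HexGrid(size=100)   # 100-meter hexagons, as in GeoCount3.py
x, y = Mercator.to_web_mercator(-73.9857, 40.7588)
r, c = hg.xy2rc(x, y)    # which hexagon contains the point
cx, cy = hg.rc2xy(r, c)  # center of that hexagon, for drawing
```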
178 |
179 | ## Web Visualizing Aggregated Results
180 |
181 | To view the content of the files in `hdfs:///tmp/rowcol`, use the `ImportPointDensityTool`:
182 |
183 | 
184 |
185 | Execute the tool and share the result as a `Geoprocessing Service`.
186 | Locate the Geoprocessing tool URL, and use it to add a new Geoprocessing widget to your application.
187 |
188 | 
189 |
190 | To view the content of the files in `hdfs:///tmp/hex`, use the `ImportHexTool`:
191 |
192 | 
193 |
194 | Execute the tool and share the result as a `Geoprocessing Service`.
195 | Locate the Geoprocessing tool URL, and use it to add a new Geoprocessing widget to your application.
196 |
197 | 
198 |
199 | ### Cloudera Docker Notes
200 |
201 | Create a "machine" labeled `quickstart` using [VirtualBox](https://www.virtualbox.org/wiki/VirtualBox) with 4 cores, 8GB of memory and 20GB of disk space.
202 |
203 | ```bash
204 | docker-machine create \
205 | --driver virtualbox \
206 | --virtualbox-cpu-count 4 \
207 | --virtualbox-memory 8192 \
208 | --virtualbox-disk-size 20480 \
209 | --virtualbox-no-vtx-check \
210 | quickstart
211 | ```
212 |
213 | Upgrade the machine to the latest Docker engine.
214 |
215 | ```bash
216 | docker-machine upgrade quickstart
217 | ```
218 |
219 | Run the CDH image. This will start ZooKeeper, HDFS, WebHDFS, YARN, Spark, and other daemons.
220 | Note the `-v` option, which takes the VirtualBox-shared `/Users` folder and mounts it as `/Users` in the container.
221 | In addition, all kinds of ports are opened up (4040 for Spark, 8888 for HUE, 50070 for the NameNode, 50075 for the DataNode) so they can be accessed from the "outside".
222 | The machine IP can be retrieved using `docker-machine ip quickstart`.
223 |
224 | ```bash
225 | eval $(docker-machine env quickstart)
226 | docker run \
227 | --rm=true \
228 | --privileged=true \
229 | --hostname=quickstart.cloudera \
230 | -v /Users:/Users \
231 | -p 4040:4040 \
232 | -p 7180:7180 \
233 | -p 8088:8088 \
234 | -p 8042:8042 \
235 | -p 8888:8888 \
236 | -p 50060:50060 \
237 | -p 50070:50070 \
238 | -p 50075:50075 \
239 | -p 21050:21050 \
240 | -t -i cloudera/quickstart:latest \
241 | /usr/bin/docker-quickstart
242 | ```
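
Once the container is up, a quick way to confirm WebHDFS is reachable from the host is to list the HDFS root over the forwarded port. A hedged sketch, assuming the (hypothetical) VirtualBox machine IP reported by `docker-machine ip quickstart`:

```python
import requests

machine_ip = "192.168.99.100"  # replace with `docker-machine ip quickstart`
url = "http://{}:50070/webhdfs/v1/".format(machine_ip)

# LISTSTATUS enumerates the files and directories under a path.
res = requests.get(url, params={"op": "LISTSTATUS", "user.name": "root"})
for status in res.json()["FileStatuses"]["FileStatus"]:
    print(status["pathSuffix"])
res.close()
```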
243 |
244 |
245 | ### References
246 |
252 | \[1\] I've been experimenting with WebGL and the new [4.0 JavaScript API for ArcGIS](https://developers.arcgis.com/javascript/beta/), and we can now render a tremendous amount of data in a web browser (as long as it can be transferred quickly from the server); I will have to post about that. [↩](#a1)
253 |
--------------------------------------------------------------------------------
/src/main/python/HDFSToolbox.pyt:
--------------------------------------------------------------------------------
1 | import math
2 | import os
3 | import re
4 | import sys
5 |
6 | import arcpy
7 | import requests
8 |
9 |
10 | class HexGrid:
11 | def __init__(self, size=100):
12 | self.two_pi = 2.0 * math.pi
13 | self.rad_to_deg = 180.0 / math.pi
14 | self.size = size
15 | self.h = self.size * math.cos(30.0 * math.pi / 180.0)
16 | self.v = self.size * 0.5
17 | self.skip_x = 2.0 * self.h
18 | self.skip_y = 3.0 * self.v
19 |
20 | def rc2xy(self, r, c):
21 | ofs = self.h if r % 2L != 0 else 0
22 | x = c * self.skip_x + ofs
23 | y = r * self.skip_y
24 | return x, y
25 |
26 | def inside(self, px, py, cx, cy):
27 | qx = math.fabs(px - cx)
28 | qy = math.fabs(py - cy)
29 | return False if qx > self.h or qy > self.size else qx / self.h + qy / self.v <= 2.0
30 |
31 | def azimuth_to_degrees(self, px, py, cx, cy):
32 | az = math.atan2(px - cx, py - cy) # reversed on purpose
33 | return az + self.two_pi * self.rad_to_deg if az < 0.0 else az * self.rad_to_deg
34 |
35 | def proceed_to_neighbor(self, px, py, cx, cy, old_r, old_c):
36 | deg = self.azimuth_to_degrees(px, py, cx, cy)
37 | if deg > 300.0:
38 | c = old_c if old_r % 2L != 0L else old_c - 1L
39 | r = old_r + 1L
40 | elif deg > 240.0:
41 | r = old_r
42 | c = old_c - 1L
43 | elif deg > 180.0:
44 | c = old_c - 1L if old_r % 2L != 0L else old_c
45 | r = old_r - 1L
46 | elif deg > 120.0:
47 | c = old_c + 1L if old_r % 2L != 0L else old_c
48 | r = old_r - 1L
49 | elif deg > 60.0:
50 | r = old_r
51 | c = old_c + 1L
52 | else:
53 | c = old_c + 1L if old_r % 2L != 0L else old_c
54 | r = old_r + 1L
55 | return r, c
56 |
57 | def xy2rc(self, px, py):
58 | r = long(math.floor(py / self.skip_y))
59 | c = long(math.floor(px / self.skip_x))
60 | cx, cy = self.rc2xy(r, c)
61 | while not self.inside(px, py, cx, cy):
62 | r, c = self.proceed_to_neighbor(px, py, cx, cy, r, c)
63 | cx, cy = self.rc2xy(r, c)
64 | return r, c
65 |
66 |
67 | class HexCell:
68 | def __init__(self, size):
69 | self.xy = []
70 | for i in range(7):
71 | angle = math.pi * ((i % 6) + 0.5) / 3.0
72 | x = size * math.cos(angle)
73 | y = size * math.sin(angle)
74 | self.xy.append((x, y))
75 |
76 | def to_shape(self, cx, cy):
77 | return [[cx + x, cy + y] for (x, y) in self.xy]
78 |
79 |
80 | class WebHDFSRes(object):
81 | def __init__(self, res):
82 | self.res = res
83 |
84 | def __enter__(self):
85 | return self.res
86 |
87 | def __exit__(self, exception_type, exception_value, traceback):
88 | self.res.close()
89 |
90 |
91 | class WebHDFS(object):
92 | def __init__(self, host, user, port=50070):
93 | self.host = host
94 | self.port = port
95 | self.user = user
96 |
97 | def create_file(self, fc, sep, hdfs_path):
98 | description = arcpy.Describe(fc)
99 | field_names = [field.name for field in description.fields]
100 | shape_name = description.shapeFieldName
101 | field_names.remove(shape_name)
102 | field_names.append(shape_name + "@WKT")
103 |
104 | sep = "\t" if sep == "tab" else sep[0]
105 |
106 | result = arcpy.management.GetCount(fc)
107 | max_range = int(result.getOutput(0))
108 | arcpy.SetProgressor("step", "Exporting...", 0, max_range, 1)
109 |
110 | def gen_data():
111 | with arcpy.da.SearchCursor(fc, field_names) as cursor:
112 | arr = []
113 | cnt = 0
114 | inc = 0
115 | for row in cursor:
116 | cnt += 1
117 | inc += 1
118 | arr.append(sep.join([str(r) for r in row]))
119 | if inc == 5000:
120 | inc = 0
121 | arr.append("")
122 | yield "\n".join(arr)
123 | arr = []
124 | arcpy.SetProgressorPosition(cnt)
125 | if len(arr) > 0:
126 | arr.append("")
127 | yield "\n".join(arr)
128 |
129 | params = {"op": "CREATE", "user.name": self.user, "buffersize": 1024 * 1024}
130 | url = "http://{}:{}/webhdfs/v1{}".format(self.host, self.port, hdfs_path)
131 | with WebHDFSRes(requests.put(url, params=params, allow_redirects=False)) as resLoc:
132 | if resLoc.status_code == 307:
133 | location = resLoc.headers["Location"]
134 | with WebHDFSRes(requests.put(location, data=gen_data())) as resPut:
135 | if resPut.status_code != 201:
136 | arcpy.AddError("Cannot put feature class. Error code = {}".format(resLoc.status_code))
137 | else:
138 | arcpy.AddError("Cannot get HDFS location. Error code = {}".format(resLoc.status_code))
139 |
140 | arcpy.ResetProgressor()
141 | return
142 |
143 | def open(self, hdfs_path, offset=-1, length=-1, buffer_size=-1):
144 | # arcpy.AddMessage(hdfs_path)
145 | params = {"op": "OPEN", "user.name": self.user}
146 | if offset > 0:
147 | params["offset"] = offset
148 | if length > 0:
149 | params["length"] = length
150 | if buffer_size > 0:
151 | params["buffersize"] = buffer_size
152 | url = "http://{}:{}/webhdfs/v1{}".format(self.host, self.port, hdfs_path)
153 | return WebHDFSRes(requests.get(url, params=params, stream=True))
154 |
155 | def list_status(self, hdfs_path, suffix_re="*"):
156 | files = []
157 | prog = re.compile(suffix_re)
158 | params = {"op": "LISTSTATUS", "user.name": self.user}
159 | url = "http://{}:{}/webhdfs/v1{}".format(self.host, self.port, hdfs_path)
160 | with WebHDFSRes(requests.get(url, params=params)) as res:
161 | doc = res.json()
162 | for i in doc["FileStatuses"]["FileStatus"]:
163 | path_suffix = i["pathSuffix"]
164 | if prog.match(path_suffix):
165 | files.append("{}/{}".format(hdfs_path, path_suffix))
166 | return files
167 |
168 |
169 | class Toolbox(object):
170 | def __init__(self):
171 | self.label = "WebHDFSToolbox"
172 | self.alias = "Web HDFS Toolbox"
173 | self.tools = [TripTool, DensityTool, HexTool, ExportToHDFSTool]
174 |
175 |
176 | class HexTool(object):
177 | def __init__(self):
178 | self.label = "ImportHexCells"
179 | self.description = "WebHDFS tool to import hex row,col,population"
180 | self.canRunInBackground = True
181 |
182 | def getParameterInfo(self):
183 | param_fc = arcpy.Parameter(
184 | name="out_fc",
185 | displayName="out_fc",
186 | direction="Output",
187 | datatype="Feature Layer",
188 | parameterType="Derived")
189 | head, tail = os.path.split(os.path.abspath(__file__))
190 | param_fc.symbology = os.path.join(head, "HexCells.lyr")
191 |
192 | param_name = arcpy.Parameter(
193 | name="in_name",
194 | displayName="Name",
195 | direction="Input",
196 | datatype="GPString",
197 | parameterType="Required")
198 | param_name.value = "HexCells"
199 |
200 | param_host = arcpy.Parameter(
201 | name="in_host",
202 | displayName="HDFS Host",
203 | direction="Input",
204 | datatype="GPString",
205 | parameterType="Required")
206 | param_host.value = "sandbox"
207 |
208 | param_user = arcpy.Parameter(
209 | name="in_user",
210 | displayName="User name",
211 | direction="Input",
212 | datatype="GPString",
213 | parameterType="Required")
214 | param_user.value = "root"
215 |
216 | param_path = arcpy.Parameter(
217 | name="in_path",
218 | displayName="HDFS Path",
219 | direction="Input",
220 | datatype="GPString",
221 | parameterType="Required")
222 | param_path.value = "/tmp/hex"
223 |
224 | param_file = arcpy.Parameter(
225 | name="in_file",
226 | displayName="HDFS File(s)",
227 | direction="Input",
228 | datatype="GPString",
229 | parameterType="Required")
230 | param_file.value = "part.*"
231 |
232 | param_spref = arcpy.Parameter(name="in_spref",
233 | displayName="Spatial Reference",
234 | direction="Input",
235 | datatype="GPSpatialReference",
236 | parameterType="Required")
237 |
238 | param_size = arcpy.Parameter(name="in_size",
239 | displayName="Hex Size",
240 | direction="Input",
241 | datatype="GPLong",
242 | parameterType="Required")
243 | param_size.value = 100
244 |
245 | return [param_fc, param_name, param_host, param_user, param_path, param_file, param_spref, param_size]
246 |
247 | def isLicensed(self):
248 | return True
249 |
250 | def updateParameters(self, parameters):
251 | return
252 |
253 | def updateMessages(self, parameters):
254 | return
255 |
256 | def execute(self, parameters, messages):
257 | name = parameters[1].value
258 | host = parameters[2].value
259 | user = parameters[3].value
260 | path = parameters[4].value
261 | fext = parameters[5].value
262 | sref = parameters[6].value
263 | size = parameters[7].value
264 |
265 | in_memory = False
266 | if in_memory:
267 | ws = "in_memory"
268 | fc = ws + "/" + name
269 | else:
270 | fc = os.path.join(arcpy.env.scratchGDB, name)
271 | ws = os.path.dirname(fc)
272 |
273 | if arcpy.Exists(fc):
274 | arcpy.management.Delete(fc)
275 |
276 | arcpy.management.CreateFeatureclass(ws, name, "POLYGON", spatial_reference=sref)
277 | arcpy.management.AddField(fc, "POPULATION", "LONG")
278 |
279 | with arcpy.da.InsertCursor(fc, ["SHAPE@", "POPULATION"]) as cursor:
280 | webhdfs = WebHDFS(host, user)
281 | for path in webhdfs.list_status(path, fext):
282 | with webhdfs.open(hdfs_path=path, buffer_size=1024 * 1024) as res:
283 | hex_grid = HexGrid(size=size)
284 | hex_cell = HexCell(size=size)
285 | for line in res.iter_lines(chunk_size=1024 * 1024):
286 | row_txt, col_txt, pop_txt = line.split(",")
287 | row = float(row_txt)
288 | col = float(col_txt)
289 | pop = long(pop_txt)
290 | x, y = hex_grid.rc2xy(row, col)
291 | cursor.insertRow((hex_cell.to_shape(x, y), pop))
292 |
293 | parameters[0].value = fc
294 | return
295 |
296 |
297 | class DensityTool(object):
298 | def __init__(self):
299 | self.label = "ImportPointDensity"
300 | self.description = "WebHDFS tool to import x,y,population"
301 | self.canRunInBackground = True
302 |
303 | def getParameterInfo(self):
304 | param_fc = arcpy.Parameter(
305 | name="out_fc",
306 | displayName="out_fc",
307 | direction="Output",
308 | datatype="Feature Layer",
309 | parameterType="Derived")
310 |
311 | param_name = arcpy.Parameter(
312 | name="in_name",
313 | displayName="Name",
314 | direction="Input",
315 | datatype="GPString",
316 | parameterType="Required")
317 | param_name.value = "DensityPoints"
318 |
319 | param_host = arcpy.Parameter(
320 | name="in_host",
321 | displayName="HDFS Host",
322 | direction="Input",
323 | datatype="GPString",
324 | parameterType="Required")
325 | param_host.value = "sandbox"
326 |
327 | param_user = arcpy.Parameter(
328 | name="in_user",
329 | displayName="User name",
330 | direction="Input",
331 | datatype="GPString",
332 | parameterType="Required")
333 | param_user.value = "root"
334 |
335 | param_path = arcpy.Parameter(
336 | name="in_path",
337 | displayName="HDFS Path",
338 | direction="Input",
339 | datatype="GPString",
340 | parameterType="Required")
341 | param_path.value = "/tmp/rowcol"
342 |
343 | param_file = arcpy.Parameter(
344 | name="in_file",
345 | displayName="HDFS File(s)",
346 | direction="Input",
347 | datatype="GPString",
348 | parameterType="Required")
349 | param_file.value = "part.*"
350 |
351 | param_spref = arcpy.Parameter(name="in_spref",
352 | displayName="Spatial Reference",
353 | direction="Input",
354 | datatype="GPSpatialReference",
355 | parameterType="Required")
356 |
357 | return [param_fc, param_name, param_host, param_user, param_path, param_file, param_spref]
358 |
359 | def isLicensed(self):
360 | return True
361 |
362 | def updateParameters(self, parameters):
363 | return
364 |
365 | def updateMessages(self, parameters):
366 | return
367 |
368 | def execute(self, parameters, messages):
369 | name = parameters[1].value
370 | host = parameters[2].value
371 | user = parameters[3].value
372 | path = parameters[4].value
373 | fext = parameters[5].value
374 | sref = parameters[6].value
375 |
376 | in_memory = False
377 | if in_memory:
378 | ws = "in_memory"
379 | fc = ws + "/" + name
380 | else:
381 | fc = os.path.join(arcpy.env.scratchGDB, name)
382 | ws = os.path.dirname(fc)
383 |
384 | if arcpy.Exists(fc):
385 | arcpy.management.Delete(fc)
386 |
387 | arcpy.management.CreateFeatureclass(ws, name, "POINT", spatial_reference=sref)
388 | arcpy.management.AddField(fc, "POPULATION", "LONG")
389 |
390 | with arcpy.da.InsertCursor(fc, ["SHAPE@XY", "POPULATION"]) as cursor:
391 | webhdfs = WebHDFS(host, user)
392 | for path in webhdfs.list_status(path, fext):
393 | with webhdfs.open(hdfs_path=path, buffer_size=1024 * 1024) as res:
394 | for line in res.iter_lines(chunk_size=1024 * 1024):
395 | lon_txt, lat_txt, pop_txt = line.split(",")
396 | lon = float(lon_txt)
397 | lat = float(lat_txt)
398 | pop = long(pop_txt)
399 | cursor.insertRow(((lon, lat), pop))
400 |
401 | parameters[0].value = fc
402 | return
403 |
404 |
405 | class TripTool(object):
406 | def __init__(self):
407 | self.label = "ImportTrips"
408 | self.description = "WebHDFS tool to import trips"
409 | self.canRunInBackground = True
410 |
411 | def getParameterInfo(self):
412 | param_fc = arcpy.Parameter(
413 | name="out_fc",
414 | displayName="out_fc",
415 | direction="Output",
416 | datatype="Feature Layer",
417 | parameterType="Derived")
418 |
419 | param_name = arcpy.Parameter(
420 | name="in_name",
421 | displayName="Name",
422 | direction="Input",
423 | datatype="GPString",
424 | parameterType="Required")
425 | param_name.value = "PickupPoints"
426 |
427 | param_host = arcpy.Parameter(
428 | name="in_host",
429 | displayName="HDFS Host",
430 | direction="Input",
431 | datatype="GPString",
432 | parameterType="Required")
433 | param_host.value = "sandbox"
434 |
435 | param_user = arcpy.Parameter(
436 | name="in_user",
437 | displayName="User name",
438 | direction="Input",
439 | datatype="GPString",
440 | parameterType="Required")
441 | param_user.value = "root"
442 |
443 | param_path = arcpy.Parameter(
444 | name="in_path",
445 | displayName="HDFS Path",
446 | direction="Input",
447 | datatype="GPString",
448 | parameterType="Required")
449 | param_path.value = "/trips"
450 |
451 | param_file = arcpy.Parameter(
452 | name="in_file",
453 | displayName="HDFS File(s)",
454 | direction="Input",
455 | datatype="GPString",
456 | parameterType="Required")
457 | param_file.value = "trips.*"
458 |
459 | return [param_fc, param_name, param_host, param_user, param_path, param_file]
460 |
461 | def isLicensed(self):
462 | return True
463 |
464 | def updateParameters(self, parameters):
465 | return
466 |
467 | def updateMessages(self, parameters):
468 | return
469 |
470 | def execute(self, parameters, messages):
471 | reload(sys)
472 | sys.setdefaultencoding("utf8")
473 |
474 | name = parameters[1].value
475 | host = parameters[2].value
476 | user = parameters[3].value
477 | path = parameters[4].value
478 | fext = parameters[5].value
479 |
480 | in_memory = False
481 | if in_memory:
482 | ws = "in_memory"
483 | fc = ws + "/" + name
484 | else:
485 | fc = os.path.join(arcpy.env.scratchGDB, name)
486 | ws = os.path.dirname(fc)
487 |
488 | if arcpy.Exists(fc):
489 | arcpy.management.Delete(fc)
490 |
491 | sp_ref = arcpy.SpatialReference(4326)
492 | arcpy.management.CreateFeatureclass(ws, name, "POINT", spatial_reference=sp_ref)
493 | arcpy.management.AddField(fc, "DATETIME", "TEXT", field_length=32)
494 | arcpy.management.AddField(fc, "PASSENGERS", "LONG")
495 |
496 | with arcpy.da.InsertCursor(fc, ["SHAPE@XY", "DATETIME", "PASSENGERS"]) as cursor:
497 | webhdfs = WebHDFS(host, user)
498 | for path in webhdfs.list_status(path, fext):
499 | arcpy.AddMessage(path)
500 | with webhdfs.open(hdfs_path=path, buffer_size=1024 * 1024) as res:
501 | line_no = 0
502 | for line in res.iter_lines(chunk_size=1024 * 1024):
503 | line_no += 1
504 | if line_no > 1:
505 | tokens = line.split(",")
506 | if len(tokens) > 11:
507 | datetime = tokens[5]
508 | passengers = int(tokens[7])
509 | lon = float(tokens[10])
510 | lat = float(tokens[11])
511 | if -74.255 < lon < -73.608 and 40.618 < lat < 40.937:
512 | cursor.insertRow(((lon, lat), datetime, passengers))
513 |
514 | parameters[0].value = fc
515 | return
516 |
517 |
518 | class ExportToHDFSTool(object):
519 | def __init__(self):
520 | self.label = "ExportToHDFS"
521 | self.description = """
522 | Export a feature class to HDFS in text format, where each feature is a row terminated by a line feed
523 | and feature attributes are separated by the chosen field separator (tab by default). The shape of the feature is stored in WKT format.
524 | """
525 | self.canRunInBackground = True
526 |
527 | def getParameterInfo(self):
528 | param_fc = arcpy.Parameter(name="in_fc",
529 | displayName="Input Feature Class",
530 | direction="Input",
531 | datatype="Table View",
532 | parameterType="Required")
533 |
534 | param_sep = arcpy.Parameter(name="in_sep",
535 | displayName="Output Field Separator",
536 | direction="Input",
537 | datatype="String",
538 | parameterType="Required")
539 | param_sep.value = "tab"
540 |
541 | param_host = arcpy.Parameter(name="in_host",
542 | displayName="HDFS Host",
543 | direction="Input",
544 | datatype="String",
545 | parameterType="Required")
546 | param_host.value = "sandbox"
547 |
548 | param_user = arcpy.Parameter(name="in_user",
549 | displayName="HDFS User",
550 | direction="Input",
551 | datatype="String",
552 | parameterType="Required")
553 | param_user.value = "root"
554 |
555 | param_path = arcpy.Parameter(name="in_path",
556 | displayName="Output HDFS Path",
557 | direction="Input",
558 | datatype="String",
559 | parameterType="Required")
560 | param_path.value = "/user/root"
561 |
562 | return [param_fc, param_sep, param_host, param_user, param_path]
563 |
564 | def isLicensed(self):
565 | return True
566 |
567 | def updateParameters(self, parameters):
568 | return
569 |
570 | def updateMessages(self, parameters):
571 | return
572 |
573 | def execute(self, parameters, messages):
574 | reload(sys)
575 | sys.setdefaultencoding("utf8")
576 |
577 | fc = parameters[0].valueAsText
578 | sep = parameters[1].valueAsText
579 | hdfs_host = parameters[2].valueAsText
580 | hdfs_user = parameters[3].valueAsText
581 | hdfs_path = parameters[4].valueAsText
582 |
583 | webhdfs = WebHDFS(hdfs_host, hdfs_user)
584 | webhdfs.create_file(fc, sep, hdfs_path)
585 | return
586 |
--------------------------------------------------------------------------------