├── .gitignore ├── .pre-commit-config.yaml ├── .pylintrc ├── .pypirc ├── .vscode └── settings.json ├── LICENSE ├── Pipfile ├── Pipfile.lock ├── PyPI_README.md ├── README.md ├── pyproject.toml ├── setup.cfg ├── setup.py ├── sparkplus ├── __init__.py ├── core │ ├── __init__.py │ ├── address_dataframe.py │ ├── base.py │ ├── coord_dataframe.py │ ├── job.py │ ├── numaddr_dataframe.py │ ├── py_log.py │ ├── shp_to_parquet.py │ ├── tablename.py │ ├── test.ipynb │ ├── udfs.py │ └── utils.py ├── dependencies │ ├── __init__.py │ ├── logging.py │ ├── spark.py │ └── tablename.py ├── jobs │ ├── __init__.py │ ├── conversion.py │ ├── etl_job.py │ ├── load_database.py │ ├── table_to_df.py │ └── with_geopandas.py ├── package │ ├── __init__.py │ ├── gis.py │ └── pipeline.py └── testjob │ ├── demo_app.py │ └── test_df.py └── static └── sparkplus_arch_finale.png /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/python 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python 4 | 5 | ### Python ### 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | .pybuilder/ 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | # For a library or package, you might want to ignore these files since the code is 92 | # intended to run in multiple environments; otherwise, check them in: 93 | # .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 103 | __pypackages__/ 104 | 105 | # Celery stuff 106 | celerybeat-schedule 107 | celerybeat.pid 108 | 109 | # SageMath parsed files 110 | *.sage.py 111 | 112 | # Environments 113 | .env 114 | .venv 115 | env/ 116 | venv/ 117 | ENV/ 118 | env.bak/ 119 | venv.bak/ 120 | 121 | # Spyder project settings 122 | .spyderproject 123 | .spyproject 124 | 125 | # Rope project settings 126 | .ropeproject 127 | 128 | # mkdocs documentation 129 | /site 130 | 131 | # mypy 132 | .mypy_cache/ 133 | .dmypy.json 134 | dmypy.json 135 | 136 | # Pyre type checker 137 | .pyre/ 138 | 139 | # pytype static type analyzer 140 | .pytype/ 141 | 142 | # Cython debug symbols 143 | cython_debug/ 144 | 145 | # End of https://www.toptal.com/developers/gitignore/api/python 146 | 147 | # Custom 148 | *.csv 149 | 150 | resource/* 151 | scripts/* 152 | logs 153 | 154 | # test file 155 | package/test.py 156 | /shp 157 | 158 | # Mac dev dependency 159 | .DS_Store 160 | 161 | sparkplus/core/test.py 162 | 163 | # vscode 164 | .vscode/ 165 | 166 | # Byte-compiled / optimized / DLL files 167 | __pycache__/ 168 | *.py[cod] 169 | *$py.class 170 | 171 | # C extensions 172 | *.so 173 | 174 | # Distribution / packaging 175 | .Python 176 | build/ 177 | develop-eggs/ 178 | dist/ 179 | downloads/ 180 | eggs/ 181 | .eggs/ 182 | lib/ 183 | lib64/ 184 | parts/ 185 | sdist/ 186 | var/ 187 | wheels/ 188 | pip-wheel-metadata/ 189 | share/python-wheels/ 190 | *.egg-info/ 191 | .installed.cfg 192 | *.egg 193 | MANIFEST 194 | 195 | # PyInstaller 196 | # Usually these files are written by a python script from a template 197 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 198 | *.manifest 199 | *.spec 200 | 201 | # Installer logs 202 | pip-log.txt 203 | pip-delete-this-directory.txt 204 | 205 | # Unit test / coverage reports 206 | htmlcov/ 207 | .tox/ 208 | .nox/ 209 | .coverage 210 | .coverage.* 211 | .cache 212 | nosetests.xml 213 | coverage.xml 214 | *.cover 215 | .hypothesis/ 216 | .pytest_cache/ 217 | 218 | # Translations 219 | *.mo 220 | *.pot 221 | 222 | # Django stuff: 223 | *.log 224 | local_settings.py 225 | db.sqlite3 226 | 227 | # Flask stuff: 228 | instance/ 229 | .webassets-cache 230 | 231 | # Scrapy stuff: 232 | .scrapy 233 | 234 | # Sphinx documentation 235 | docs/_build/ 236 | 237 | # PyBuilder 238 | target/ 239 | 240 | # Jupyter Notebook 241 | .ipynb_checkpoints 242 | 243 | # IPython 244 | profile_default/ 245 | ipython_config.py 246 | 247 | # pyenv 248 | .python-version 249 | 250 | # celery beat schedule file 251 | celerybeat-schedule 252 | 253 | # SageMath parsed files 254 | *.sage.py 255 | 256 | # Environments 257 | .env 258 | .venv 259 | env/ 260 | venv/ 261 | ENV/ 262 | env.bak/ 263 | venv.bak/ 264 | 265 | # Spyder project settings 266 | .spyderproject 267 | .spyproject 268 | 269 | # Rope project settings 270 | .ropeproject 271 | 272 | # mkdocs documentation 273 | /site 274 | 275 | # mypy 276 | .mypy_cache/ 277 | .dmypy.json 278 | dmypy.json 279 | 280 | # Pyre type checker 281 | .pyre/ 282 | 283 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.0.1 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: requirements-txt-fixer 7 | - id: detect-aws-credentials 8 | - repo: https://github.com/psf/black 9 | rev: 21.9b0 10 | hooks: 11 | - id: black 12 | 
language_version: python3.9 13 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-allow-list= 7 | 8 | # A comma-separated list of package or module names from where C extensions may 9 | # be loaded. Extensions are loading into the active Python interpreter and may 10 | # run arbitrary code. (This is an alternative name to extension-pkg-allow-list 11 | # for backward compatibility.) 12 | extension-pkg-whitelist= 13 | 14 | # Return non-zero exit code if any of these messages/categories are detected, 15 | # even if score is above --fail-under value. Syntax same as enable. Messages 16 | # specified are enabled, while categories only check already-enabled messages. 17 | fail-on= 18 | 19 | # Specify a score threshold to be exceeded before program exits with error. 20 | fail-under=10.0 21 | 22 | # Files or directories to be skipped. They should be base names, not paths. 23 | ignore=CVS 24 | 25 | # Add files or directories matching the regex patterns to the ignore-list. The 26 | # regex matches against paths. 27 | ignore-paths= 28 | 29 | # Files or directories matching the regex patterns are skipped. The regex 30 | # matches against base names, not paths. 31 | ignore-patterns= 32 | 33 | # Python code to execute, usually for sys.path manipulation such as 34 | # pygtk.require(). 35 | #init-hook= 36 | 37 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 38 | # number of processors available to use. 39 | jobs=1 40 | 41 | # Control the amount of potential inferred values when inferring a single 42 | # object. This can help the performance when dealing with large functions or 43 | # complex, nested conditions. 44 | limit-inference-results=100 45 | 46 | # List of plugins (as comma separated values of python module names) to load, 47 | # usually to register additional checkers. 48 | load-plugins= 49 | 50 | # Pickle collected data for later comparisons. 51 | persistent=yes 52 | 53 | # Min Python version to use for version dependend checks. Will default to the 54 | # version used to run pylint. 55 | py-version=3.9 56 | 57 | # When enabled, pylint would attempt to guess common misconfiguration and emit 58 | # user-friendly hints instead of false-positive error messages. 59 | suggestion-mode=yes 60 | 61 | # Allow loading of arbitrary C extensions. Extensions are imported into the 62 | # active Python interpreter and may run arbitrary code. 63 | unsafe-load-any-extension=no 64 | 65 | 66 | [MESSAGES CONTROL] 67 | 68 | # Only show warnings with the listed confidence levels. Leave empty to show 69 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 70 | confidence= 71 | 72 | # Disable the message, report, category or checker with the given id(s). You 73 | # can either give multiple identifiers separated by comma (,) or put this 74 | # option multiple times (only on the command line, not in the configuration 75 | # file where it should appear only once). You can also use "--disable=all" to 76 | # disable everything first and then reenable specific checks. For example, if 77 | # you want to run only the similarities checker, you can use "--disable=all 78 | # --enable=similarities". 
If you want to run only the classes checker, but have 79 | # no Warning level messages displayed, use "--disable=all --enable=classes 80 | # --disable=W". 81 | disable=raw-checker-failed, 82 | bad-inline-option, 83 | locally-disabled, 84 | file-ignored, 85 | suppressed-message, 86 | useless-suppression, 87 | deprecated-pragma, 88 | use-symbolic-message-instead, 89 | C0114, # missing-module-docstring 90 | C0115, # missing-class-docstring 91 | C0116 # missing-function-docstring 92 | 93 | # Enable the message, report, category or checker with the given id(s). You can 94 | # either give multiple identifier separated by comma (,) or put this option 95 | # multiple time (only on the command line, not in the configuration file where 96 | # it should appear only once). See also the "--disable" option for examples. 97 | enable=c-extension-no-member 98 | 99 | 100 | [REPORTS] 101 | 102 | # Python expression which should return a score less than or equal to 10. You 103 | # have access to the variables 'error', 'warning', 'refactor', and 'convention' 104 | # which contain the number of messages in each category, as well as 'statement' 105 | # which is the total number of statements analyzed. This score is used by the 106 | # global evaluation report (RP0004). 107 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 108 | 109 | # Template used to display messages. This is a python new-style format string 110 | # used to format the message information. See doc for all details. 111 | #msg-template= 112 | 113 | # Set the output format. Available formats are text, parseable, colorized, json 114 | # and msvs (visual studio). You can also give a reporter class, e.g. 115 | # mypackage.mymodule.MyReporterClass. 116 | output-format=text 117 | 118 | # Tells whether to display a full report or only the messages. 119 | reports=no 120 | 121 | # Activate the evaluation score. 122 | score=yes 123 | 124 | 125 | [REFACTORING] 126 | 127 | # Maximum number of nested blocks for function / method body 128 | max-nested-blocks=5 129 | 130 | # Complete name of functions that never returns. When checking for 131 | # inconsistent-return-statements if a never returning function is called then 132 | # it will be considered as an explicit return statement and no message will be 133 | # printed. 134 | never-returning-functions=sys.exit,argparse.parse_error 135 | 136 | 137 | [LOGGING] 138 | 139 | # The type of string formatting that logging methods do. `old` means using % 140 | # formatting, `new` is for `{}` formatting. 141 | logging-format-style=old 142 | 143 | # Logging modules to check that the string format arguments are in logging 144 | # function parameter format. 145 | logging-modules=logging 146 | 147 | 148 | [SPELLING] 149 | 150 | # Limits count of emitted suggestions for spelling mistakes. 151 | max-spelling-suggestions=4 152 | 153 | # Spelling dictionary name. Available dictionaries: none. To make it work, 154 | # install the 'python-enchant' package. 155 | spelling-dict= 156 | 157 | # List of comma separated words that should be considered directives if they 158 | # appear and the beginning of a comment and should not be checked. 159 | spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: 160 | 161 | # List of comma separated words that should not be checked. 162 | spelling-ignore-words= 163 | 164 | # A path to a file that contains the private dictionary; one word per line. 
165 | spelling-private-dict-file= 166 | 167 | # Tells whether to store unknown words to the private dictionary (see the 168 | # --spelling-private-dict-file option) instead of raising a message. 169 | spelling-store-unknown-words=no 170 | 171 | 172 | [MISCELLANEOUS] 173 | 174 | # List of note tags to take in consideration, separated by a comma. 175 | notes=FIXME, 176 | XXX, 177 | TODO 178 | 179 | # Regular expression of note tags to take in consideration. 180 | #notes-rgx= 181 | 182 | 183 | [TYPECHECK] 184 | 185 | # List of decorators that produce context managers, such as 186 | # contextlib.contextmanager. Add to this list to register other decorators that 187 | # produce valid context managers. 188 | contextmanager-decorators=contextlib.contextmanager 189 | 190 | # List of members which are set dynamically and missed by pylint inference 191 | # system, and so shouldn't trigger E1101 when accessed. Python regular 192 | # expressions are accepted. 193 | generated-members= 194 | 195 | # Tells whether missing members accessed in mixin class should be ignored. A 196 | # mixin class is detected if its name ends with "mixin" (case insensitive). 197 | ignore-mixin-members=yes 198 | 199 | # Tells whether to warn about missing members when the owner of the attribute 200 | # is inferred to be None. 201 | ignore-none=yes 202 | 203 | # This flag controls whether pylint should warn about no-member and similar 204 | # checks whenever an opaque object is returned when inferring. The inference 205 | # can return multiple potential results while evaluating a Python object, but 206 | # some branches might not be evaluated, which results in partial inference. In 207 | # that case, it might be useful to still emit no-member and other checks for 208 | # the rest of the inferred objects. 209 | ignore-on-opaque-inference=yes 210 | 211 | # List of class names for which member attributes should not be checked (useful 212 | # for classes with dynamically set attributes). This supports the use of 213 | # qualified names. 214 | ignored-classes=optparse.Values,thread._local,_thread._local 215 | 216 | # List of module names for which member attributes should not be checked 217 | # (useful for modules/projects where namespaces are manipulated during runtime 218 | # and thus existing member attributes cannot be deduced by static analysis). It 219 | # supports qualified module names, as well as Unix pattern matching. 220 | ignored-modules= 221 | 222 | # Show a hint with possible names when a member name was not found. The aspect 223 | # of finding the hint is based on edit distance. 224 | missing-member-hint=yes 225 | 226 | # The minimum edit distance a name should have in order to be considered a 227 | # similar match for a missing member name. 228 | missing-member-hint-distance=1 229 | 230 | # The total number of similar names that should be taken in consideration when 231 | # showing a hint for a missing member. 232 | missing-member-max-choices=1 233 | 234 | # List of decorators that change the signature of a decorated function. 235 | signature-mutators= 236 | 237 | 238 | [VARIABLES] 239 | 240 | # List of additional names supposed to be defined in builtins. Remember that 241 | # you should avoid defining new builtins when possible. 242 | additional-builtins= 243 | 244 | # Tells whether unused global variables should be treated as a violation. 
245 | allow-global-unused-variables=yes 246 | 247 | # List of names allowed to shadow builtins 248 | allowed-redefined-builtins= 249 | 250 | # List of strings which can identify a callback function by name. A callback 251 | # name must start or end with one of those strings. 252 | callbacks=cb_, 253 | _cb 254 | 255 | # A regular expression matching the name of dummy variables (i.e. expected to 256 | # not be used). 257 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 258 | 259 | # Argument names that match this expression will be ignored. Default to name 260 | # with leading underscore. 261 | ignored-argument-names=_.*|^ignored_|^unused_ 262 | 263 | # Tells whether we should check for unused import in __init__ files. 264 | init-import=no 265 | 266 | # List of qualified module names which can have objects that can redefine 267 | # builtins. 268 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 269 | 270 | 271 | [FORMAT] 272 | 273 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 274 | expected-line-ending-format= 275 | 276 | # Regexp for a line that is allowed to be longer than the limit. 277 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$ 278 | 279 | # Number of spaces of indent required inside a hanging or continued line. 280 | indent-after-paren=4 281 | 282 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 283 | # tab). 284 | indent-string=' ' 285 | 286 | # Maximum number of characters on a single line. 287 | max-line-length=100 288 | 289 | # Maximum number of lines in a module. 290 | max-module-lines=1000 291 | 292 | # Allow the body of a class to be on the same line as the declaration if body 293 | # contains single statement. 294 | single-line-class-stmt=no 295 | 296 | # Allow the body of an if to be on the same line as the test if there is no 297 | # else. 298 | single-line-if-stmt=no 299 | 300 | 301 | [SIMILARITIES] 302 | 303 | # Comments are removed from the similarity computation 304 | ignore-comments=yes 305 | 306 | # Docstrings are removed from the similarity computation 307 | ignore-docstrings=yes 308 | 309 | # Imports are removed from the similarity computation 310 | ignore-imports=no 311 | 312 | # Signatures are removed from the similarity computation 313 | ignore-signatures=no 314 | 315 | # Minimum lines number of a similarity. 316 | min-similarity-lines=4 317 | 318 | 319 | [BASIC] 320 | 321 | # Naming style matching correct argument names. 322 | argument-naming-style=snake_case 323 | 324 | # Regular expression matching correct argument names. Overrides argument- 325 | # naming-style. 326 | #argument-rgx= 327 | 328 | # Naming style matching correct attribute names. 329 | attr-naming-style=snake_case 330 | 331 | # Regular expression matching correct attribute names. Overrides attr-naming- 332 | # style. 333 | #attr-rgx= 334 | 335 | # Bad variable names which should always be refused, separated by a comma. 336 | bad-names=foo, 337 | bar, 338 | baz, 339 | toto, 340 | tutu, 341 | tata 342 | 343 | # Bad variable names regexes, separated by a comma. If names match any regex, 344 | # they will always be refused 345 | bad-names-rgxs= 346 | 347 | # Naming style matching correct class attribute names. 348 | class-attribute-naming-style=any 349 | 350 | # Regular expression matching correct class attribute names. Overrides class- 351 | # attribute-naming-style. 352 | #class-attribute-rgx= 353 | 354 | # Naming style matching correct class constant names.
355 | class-const-naming-style=UPPER_CASE 356 | 357 | # Regular expression matching correct class constant names. Overrides class- 358 | # const-naming-style. 359 | #class-const-rgx= 360 | 361 | # Naming style matching correct class names. 362 | class-naming-style=PascalCase 363 | 364 | # Regular expression matching correct class names. Overrides class-naming- 365 | # style. 366 | #class-rgx= 367 | 368 | # Naming style matching correct constant names. 369 | const-naming-style=UPPER_CASE 370 | 371 | # Regular expression matching correct constant names. Overrides const-naming- 372 | # style. 373 | #const-rgx= 374 | 375 | # Minimum line length for functions/classes that require docstrings, shorter 376 | # ones are exempt. 377 | docstring-min-length=0 378 | 379 | # Naming style matching correct function names. 380 | function-naming-style=snake_case 381 | 382 | # Regular expression matching correct function names. Overrides function- 383 | # naming-style. 384 | #function-rgx= 385 | 386 | # Good variable names which should always be accepted, separated by a comma. 387 | good-names=i, 388 | j, 389 | k, 390 | ex, 391 | Run, 392 | _ 393 | 394 | # Good variable names regexes, separated by a comma. If names match any regex, 395 | # they will always be accepted 396 | good-names-rgxs= 397 | 398 | # Include a hint for the correct naming format with invalid-name. 399 | include-naming-hint=no 400 | 401 | # Naming style matching correct inline iteration names. 402 | inlinevar-naming-style=any 403 | 404 | # Regular expression matching correct inline iteration names. Overrides 405 | # inlinevar-naming-style. 406 | #inlinevar-rgx= 407 | 408 | # Naming style matching correct method names. 409 | method-naming-style=snake_case 410 | 411 | # Regular expression matching correct method names. Overrides method-naming- 412 | # style. 413 | #method-rgx= 414 | 415 | # Naming style matching correct module names. 416 | module-naming-style=snake_case 417 | 418 | # Regular expression matching correct module names. Overrides module-naming- 419 | # style. 420 | #module-rgx= 421 | 422 | # Colon-delimited sets of names that determine each other's naming style when 423 | # the name regexes allow several styles. 424 | name-group= 425 | 426 | # Regular expression which should only match function or class names that do 427 | # not require a docstring. 428 | no-docstring-rgx=^_ 429 | 430 | # List of decorators that produce properties, such as abc.abstractproperty. Add 431 | # to this list to register other decorators that produce valid properties. 432 | # These decorators are taken in consideration only for invalid-name. 433 | property-classes=abc.abstractproperty 434 | 435 | # Naming style matching correct variable names. 436 | variable-naming-style=snake_case 437 | 438 | # Regular expression matching correct variable names. Overrides variable- 439 | # naming-style. 440 | #variable-rgx= 441 | 442 | 443 | [STRING] 444 | 445 | # This flag controls whether inconsistent-quotes generates a warning when the 446 | # character used as a quote delimiter is used inconsistently within a module. 447 | check-quote-consistency=no 448 | 449 | # This flag controls whether the implicit-str-concat should generate a warning 450 | # on implicit string concatenation in sequences defined over several lines. 451 | check-str-concat-over-line-jumps=no 452 | 453 | 454 | [IMPORTS] 455 | 456 | # List of modules that can be imported at any level, not just the top level 457 | # one. 
458 | allow-any-import-level= 459 | 460 | # Allow wildcard imports from modules that define __all__. 461 | allow-wildcard-with-all=no 462 | 463 | # Analyse import fallback blocks. This can be used to support both Python 2 and 464 | # 3 compatible code, which means that the block might have code that exists 465 | # only in one or another interpreter, leading to false positives when analysed. 466 | analyse-fallback-blocks=no 467 | 468 | # Deprecated modules which should not be used, separated by a comma. 469 | deprecated-modules= 470 | 471 | # Output a graph (.gv or any supported image format) of external dependencies 472 | # to the given file (report RP0402 must not be disabled). 473 | ext-import-graph= 474 | 475 | # Output a graph (.gv or any supported image format) of all (i.e. internal and 476 | # external) dependencies to the given file (report RP0402 must not be 477 | # disabled). 478 | import-graph= 479 | 480 | # Output a graph (.gv or any supported image format) of internal dependencies 481 | # to the given file (report RP0402 must not be disabled). 482 | int-import-graph= 483 | 484 | # Force import order to recognize a module as part of the standard 485 | # compatibility libraries. 486 | known-standard-library= 487 | 488 | # Force import order to recognize a module as part of a third party library. 489 | known-third-party=enchant 490 | 491 | # Couples of modules and preferred modules, separated by a comma. 492 | preferred-modules= 493 | 494 | 495 | [CLASSES] 496 | 497 | # Warn about protected attribute access inside special methods 498 | check-protected-access-in-special-methods=no 499 | 500 | # List of method names used to declare (i.e. assign) instance attributes. 501 | defining-attr-methods=__init__, 502 | __new__, 503 | setUp, 504 | __post_init__ 505 | 506 | # List of member names, which should be excluded from the protected access 507 | # warning. 508 | exclude-protected=_asdict, 509 | _fields, 510 | _replace, 511 | _source, 512 | _make 513 | 514 | # List of valid names for the first argument in a class method. 515 | valid-classmethod-first-arg=cls 516 | 517 | # List of valid names for the first argument in a metaclass class method. 518 | valid-metaclass-classmethod-first-arg=cls 519 | 520 | 521 | [DESIGN] 522 | 523 | # List of qualified class names to ignore when counting class parents (see 524 | # R0901) 525 | ignored-parents= 526 | 527 | # Maximum number of arguments for function / method. 528 | max-args=5 529 | 530 | # Maximum number of attributes for a class (see R0902). 531 | max-attributes=7 532 | 533 | # Maximum number of boolean expressions in an if statement (see R0916). 534 | max-bool-expr=5 535 | 536 | # Maximum number of branch for function / method body. 537 | max-branches=12 538 | 539 | # Maximum number of locals for function / method body. 540 | max-locals=15 541 | 542 | # Maximum number of parents for a class (see R0901). 543 | max-parents=7 544 | 545 | # Maximum number of public methods for a class (see R0904). 546 | max-public-methods=20 547 | 548 | # Maximum number of return / yield for function / method body. 549 | max-returns=6 550 | 551 | # Maximum number of statements in function / method body. 552 | max-statements=50 553 | 554 | # Minimum number of public methods for a class (see R0903). 555 | min-public-methods=2 556 | 557 | 558 | [EXCEPTIONS] 559 | 560 | # Exceptions that will emit a warning when being caught. Defaults to 561 | # "BaseException, Exception". 
562 | overgeneral-exceptions=BaseException, 563 | Exception 564 | -------------------------------------------------------------------------------- /.pypirc: -------------------------------------------------------------------------------- 1 | [distutils] 2 | index-servers = 3 | pypi 4 | internal 5 | 6 | [pypi] 7 | username: 8 | password: 9 | 10 | [internal] 11 | repository: http://(external IP):8080 12 | username: 13 | password: 14 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/usr/bin/python", 3 | "python.linting.pylintPath": "/usr/local/bin/pylint", 4 | "python.linting.pylintEnabled": true, 5 | "python.linting.enabled": true, 6 | "python.linting.pylintArgs": [ 7 | "--init-hook", 8 | "import sys; sys.path.append('/Users/lee/Desktop/spark-plugin/sparkplus/core')" 9 | ], 10 | "files.watcherExclude": { 11 | "**/target": true 12 | } 13 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 SWM-12 / Team 12 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | pyspark = "*" 8 | mysql-connector-python = "*" 9 | pandas = "*" 10 | geopandas = "*" 11 | matplotlib = "*" 12 | geospark = "*" 13 | h3 = "*" 14 | geopy = "*" 15 | folium = "*" 16 | python-dotenv = "*" 17 | 18 | [dev-packages] 19 | 20 | [requires] 21 | python_version = "3.9" 22 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "24884446b2b9af187c2bcdf8c8aef963b732e9b9c992eea2020bbd6472cf6009" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.9" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "attrs": { 20 | "hashes": [ 21 | "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6", 22 | "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c" 23 | ], 24 | "markers": "python_version >= '3.5'", 25 | "version": "==22.1.0" 26 | }, 27 | "branca": { 28 | "hashes": [ 29 | "sha256:55949855214504c7583b71b9a03a84dce2e96a84027613bb53b42d04844ce24e", 30 | "sha256:ae706fc7a88dd0296a58bb11c0cb3c6be358491a3b0abee08fe16b4db17814c0" 31 | ], 32 | "markers": "python_version >= '3.7'", 33 | "version": "==0.6.0" 34 | }, 35 | "certifi": { 36 | "hashes": [ 37 | "sha256:0d9c601124e5a6ba9712dbc60d9c53c21e34f5f641fe83002317394311bdce14", 38 | "sha256:90c1a32f1d68f940488354e36370f6cca89f0f106db09518524c88d6ed83f382" 39 | ], 40 | "markers": "python_version >= '3.6'", 41 | "version": "==2022.9.24" 42 | }, 43 | "charset-normalizer": { 44 | "hashes": [ 45 | "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", 46 | "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" 47 | ], 48 | "markers": "python_version >= '3.6'", 49 | "version": "==2.1.1" 50 | }, 51 | "click": { 52 | "hashes": [ 53 | "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e", 54 | "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48" 55 | ], 56 | "markers": "python_version >= '3.7'", 57 | "version": "==8.1.3" 58 | }, 59 | "click-plugins": { 60 | "hashes": [ 61 | "sha256:46ab999744a9d831159c3411bb0c79346d94a444df9a3a3742e9ed63645f264b", 62 | "sha256:5d262006d3222f5057fd81e1623d4443e41dcda5dc815c06b442aa3c02889fc8" 63 | ], 64 | "version": "==1.1.1" 65 | }, 66 | "cligj": { 67 | "hashes": [ 68 | "sha256:a4bc13d623356b373c2c27c53dbd9c68cae5d526270bfa71f6c6fa69669c6b27", 69 | "sha256:c1ca117dbce1fe20a5809dc96f01e1c2840f6dcc939b3ddbb1111bf330ba82df" 70 | ], 71 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' and python_version < '4'", 72 | "version": "==0.7.2" 73 | }, 74 | "cycler": { 75 | "hashes": [ 76 | "sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3", 77 | "sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f" 78 | ], 79 | "markers": "python_version >= '3.6'", 80 | "version": "==0.11.0" 81 | }, 82 | "findspark": { 83 | "hashes": [ 84 | "sha256:aa10a96cb616cab329181d72e8ef13d2dc453b4babd02b5482471a0882c1195e", 85 | 
"sha256:e5d5415ff8ced6b173b801e12fc90c1eefca1fb6bf9c19c4fc1f235d4222e753" 86 | ], 87 | "version": "==2.0.1" 88 | }, 89 | "fiona": { 90 | "hashes": [ 91 | "sha256:18649326a7724611b16b648e14fd094089d517413b95ac91d0cdb0adc5fcb8de", 92 | "sha256:3f26c8b6ea9bc92cbd52a4dd83ffd44472450bf92f4e3d4ef2341adc2f35a54d", 93 | "sha256:59a3800bc09ebee3516d64d02a8a6818d07ab1573c6096f3ef3468bf9f8f95f8", 94 | "sha256:6ba2294bc6adcbc36229862667aac6b98e6c306e1958caf53b8bfcf9a3b8c77a", 95 | "sha256:75924f69c51db6e258c91308780546278028c509db12aa33a47692a0266c9667", 96 | "sha256:89cfcc3bdb4aba7bba1eb552b3866b851334693ab694529803122b21f5927960", 97 | "sha256:904793b17aee70ca9c3d582dbf01623eccfdeacd00c5e1a8e421be41f2e43d67", 98 | "sha256:a82a99ce9b3e7825740157c45c9fb2259d4e92f0a886aaac25f0db40ffe1eea3", 99 | "sha256:b5cad3424b7473eb0e19f17ee45abec92133a694a4b452a278f02e3b8d0f810f", 100 | "sha256:b88e2e6548a41c1dfa3f96c8275ff472a3edca729e14a641c0fa5b2e146a8ab5", 101 | "sha256:c28d9ffa5d230a1d9eaf571529fa9eb7573d39613354c090ad077ad153a37ee1", 102 | "sha256:c4aafdd565b3a30bdd78cafae35d4945f6741eef31401c1bb1e166b6262d7539", 103 | "sha256:ce9a22c9883cc5d11c05ba3fb9db5082044a07c6b299753ea5bb8e178b8ba53b", 104 | "sha256:d0df3e105ad7f0cca5f16b441c232fd693ef6c4adf2c1b6271aaaa1cdc06164d", 105 | "sha256:d47777890aa1d715025abc7a6d6b2a6bb8d2a37cc94c44ce95940b80eda21444", 106 | "sha256:df34c980cd7396adfbc89bbb363bdd6e358c76f91969fc98c9dfc076dd11638d", 107 | "sha256:e33860aaf70bbd2726cff12fd3857bd832b6dc2ad3ce4b27e7563bd68abdc26f", 108 | "sha256:e3ed1c0c1c60f710a612aaeb294de54214d228c4ef40e0c1dc159e46f86a9446", 109 | "sha256:ed75dd29c89e0e455e3a322f28cd92f192bcb8fced16e2bfb6422a7f95ffe5e9" 110 | ], 111 | "version": "==1.8.22" 112 | }, 113 | "folium": { 114 | "hashes": [ 115 | "sha256:3d2c48dd6ffe5327975bbfd718468c4e81db9f2844c26e574f878adf4c08b644" 116 | ], 117 | "index": "pypi", 118 | "version": "==0.12.1" 119 | }, 120 | "geographiclib": { 121 | "hashes": [ 122 | "sha256:8f441c527b0b8a26cd96c965565ff0513d1e4d9952b704bf449409e5015c77b7", 123 | "sha256:ac400d672b8954b0306bca890b088bb8ba2a757dc8133cca0b878f34b33b2740" 124 | ], 125 | "version": "==1.52" 126 | }, 127 | "geopandas": { 128 | "hashes": [ 129 | "sha256:1722853464441b603d9be3d35baf8bde43831424a891e82a8545eb8997b65d6c", 130 | "sha256:efbf47e70732e25c3727222019c92b39b2e0a66ebe4fe379fbe1aa43a2a871db" 131 | ], 132 | "index": "pypi", 133 | "version": "==0.10.2" 134 | }, 135 | "geopy": { 136 | "hashes": [ 137 | "sha256:58b7edf526b8c32e33126570b5f4fcdfaa29d4416506064777ae8d84cd103fdd", 138 | "sha256:8f1f949082b964385de61fcc3a667a6a9a6e242beb1ae8972449f164b2ba0e89" 139 | ], 140 | "index": "pypi", 141 | "version": "==2.2.0" 142 | }, 143 | "geospark": { 144 | "hashes": [ 145 | "sha256:1056034994b93773849b1c28a05df463d885665eb56d8655f4a2f7f02e5dea72", 146 | "sha256:ab6157297f6d395001305b1e21f1a4cca75e169c704b3de998bbc260095900a3" 147 | ], 148 | "index": "pypi", 149 | "version": "==1.3.1" 150 | }, 151 | "h3": { 152 | "hashes": [ 153 | "sha256:105625a45d86b6cd1cd67acd7ab158adf3d193262534470b69a1db49a6664541", 154 | "sha256:1af9c039f7daeff4621c1349000eec0ed37c4548552a3a173cddb6d648547344", 155 | "sha256:20d48a3c9acdcf7c02c70519c3a5a22406b505cc34ff9f9a302e11a2a13d9c73", 156 | "sha256:339f4f210373dd43739019d6a8def64b119de62f3083e31b2d0413954c429c88", 157 | "sha256:3909aef50b19835b0790e077d9f06b27609380bb7bf09382e2c4e813385f7677", 158 | "sha256:48274cff38d53da155500679194d69ce19aaa52c00d0f30f24a327c1b22cb752", 159 | 
"sha256:54e0f74357467347aee517d6137777094b64b9aac648d92a7507e14ac28ddca6", 160 | "sha256:59abe06c99afa1b27bb7fcfb2a8c01d285ad36005dce8c82fca3dab0b4d8777d", 161 | "sha256:61ddf3052f226de22b546af5d47816d81fea83eb0e62d22f53ed3b23eb0b8551", 162 | "sha256:75214450b89e5204d77700e01cf2d41a02d7cbfe9b9ca925727c52e18f91072e", 163 | "sha256:7a0817ed9f6b8f4b7eeb719744260ab41ccd7131475b169e45a79cf99d045b1f", 164 | "sha256:7d6334b74e80a0e9395132bf9a7d38799b40df6181467616e950032a112773e9", 165 | "sha256:83fb0a7e1a1241c9c69137569e761b257b9828abc24b8bf78710ddce8db9e28e", 166 | "sha256:86b6bc6ac38f93be0899f45fb55585fb5c4964a2e5c8cc4a349cac41fecb10f3", 167 | "sha256:87c5ed2e2878cb936dd466ed5b4fc7cd462cc8a713d066789918d295d9d26a63", 168 | "sha256:8acd1e448bad2f5cd03ab0107f34f34967dd271726978ed5acbe42806628d90a", 169 | "sha256:950e833148ff441ee240d8ae71d4a32208cf062d9e0cc389fd056fc7cd280a0c", 170 | "sha256:bd1982ab1f2a85517aae7166582b82c73e5350c31652f4ab20b337bcf978b43d", 171 | "sha256:ddbe4dd122be51508a43baee157a2724d52ec480b45da932f7ca058cfec4aaf9", 172 | "sha256:de5d3bcc8f0253531ddf72e7305c6425b4af0c22921962ab7392a3c4c1dc5530", 173 | "sha256:e0467583c23164d232de51b82087685ab3c961911f673c892d10f87fd6642990", 174 | "sha256:e73ed07510907c8cf4e3a6f14625af221b0a3dea5c680ff011abec622cf2be9a", 175 | "sha256:f6f832c71b3b9be8949b299d20e8230129321d2296c28b970607d354cbce6efa" 176 | ], 177 | "index": "pypi", 178 | "version": "==3.7.3" 179 | }, 180 | "idna": { 181 | "hashes": [ 182 | "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", 183 | "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2" 184 | ], 185 | "markers": "python_version >= '3.5'", 186 | "version": "==3.4" 187 | }, 188 | "jinja2": { 189 | "hashes": [ 190 | "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852", 191 | "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61" 192 | ], 193 | "markers": "python_version >= '3.7'", 194 | "version": "==3.1.2" 195 | }, 196 | "kiwisolver": { 197 | "hashes": [ 198 | "sha256:02f79693ec433cb4b5f51694e8477ae83b3205768a6fb48ffba60549080e295b", 199 | "sha256:03baab2d6b4a54ddbb43bba1a3a2d1627e82d205c5cf8f4c924dc49284b87166", 200 | "sha256:1041feb4cda8708ce73bb4dcb9ce1ccf49d553bf87c3954bdfa46f0c3f77252c", 201 | "sha256:10ee06759482c78bdb864f4109886dff7b8a56529bc1609d4f1112b93fe6423c", 202 | "sha256:1d1573129aa0fd901076e2bfb4275a35f5b7aa60fbfb984499d661ec950320b0", 203 | "sha256:283dffbf061a4ec60391d51e6155e372a1f7a4f5b15d59c8505339454f8989e4", 204 | "sha256:28bc5b299f48150b5f822ce68624e445040595a4ac3d59251703779836eceff9", 205 | "sha256:2a66fdfb34e05b705620dd567f5a03f239a088d5a3f321e7b6ac3239d22aa286", 206 | "sha256:2e307eb9bd99801f82789b44bb45e9f541961831c7311521b13a6c85afc09767", 207 | "sha256:2e407cb4bd5a13984a6c2c0fe1845e4e41e96f183e5e5cd4d77a857d9693494c", 208 | "sha256:2f5e60fabb7343a836360c4f0919b8cd0d6dbf08ad2ca6b9cf90bf0c76a3c4f6", 209 | "sha256:36dafec3d6d6088d34e2de6b85f9d8e2324eb734162fba59d2ba9ed7a2043d5b", 210 | "sha256:3fe20f63c9ecee44560d0e7f116b3a747a5d7203376abeea292ab3152334d004", 211 | "sha256:41dae968a94b1ef1897cb322b39360a0812661dba7c682aa45098eb8e193dbdf", 212 | "sha256:4bd472dbe5e136f96a4b18f295d159d7f26fd399136f5b17b08c4e5f498cd494", 213 | "sha256:4ea39b0ccc4f5d803e3337dd46bcce60b702be4d86fd0b3d7531ef10fd99a1ac", 214 | "sha256:5853eb494c71e267912275e5586fe281444eb5e722de4e131cddf9d442615626", 215 | "sha256:5bce61af018b0cb2055e0e72e7d65290d822d3feee430b7b8203d8a855e78766", 216 | 
"sha256:6295ecd49304dcf3bfbfa45d9a081c96509e95f4b9d0eb7ee4ec0530c4a96514", 217 | "sha256:62ac9cc684da4cf1778d07a89bf5f81b35834cb96ca523d3a7fb32509380cbf6", 218 | "sha256:70e7c2e7b750585569564e2e5ca9845acfaa5da56ac46df68414f29fea97be9f", 219 | "sha256:7577c1987baa3adc4b3c62c33bd1118c3ef5c8ddef36f0f2c950ae0b199e100d", 220 | "sha256:75facbe9606748f43428fc91a43edb46c7ff68889b91fa31f53b58894503a191", 221 | "sha256:787518a6789009c159453da4d6b683f468ef7a65bbde796bcea803ccf191058d", 222 | "sha256:78d6601aed50c74e0ef02f4204da1816147a6d3fbdc8b3872d263338a9052c51", 223 | "sha256:7c43e1e1206cd421cd92e6b3280d4385d41d7166b3ed577ac20444b6995a445f", 224 | "sha256:81e38381b782cc7e1e46c4e14cd997ee6040768101aefc8fa3c24a4cc58e98f8", 225 | "sha256:841293b17ad704d70c578f1f0013c890e219952169ce8a24ebc063eecf775454", 226 | "sha256:872b8ca05c40d309ed13eb2e582cab0c5a05e81e987ab9c521bf05ad1d5cf5cb", 227 | "sha256:877272cf6b4b7e94c9614f9b10140e198d2186363728ed0f701c6eee1baec1da", 228 | "sha256:8c808594c88a025d4e322d5bb549282c93c8e1ba71b790f539567932722d7bd8", 229 | "sha256:8ed58b8acf29798b036d347791141767ccf65eee7f26bde03a71c944449e53de", 230 | "sha256:91672bacaa030f92fc2f43b620d7b337fd9a5af28b0d6ed3f77afc43c4a64b5a", 231 | "sha256:968f44fdbf6dd757d12920d63b566eeb4d5b395fd2d00d29d7ef00a00582aac9", 232 | "sha256:9f85003f5dfa867e86d53fac6f7e6f30c045673fa27b603c397753bebadc3008", 233 | "sha256:a553dadda40fef6bfa1456dc4be49b113aa92c2a9a9e8711e955618cd69622e3", 234 | "sha256:a68b62a02953b9841730db7797422f983935aeefceb1679f0fc85cbfbd311c32", 235 | "sha256:abbe9fa13da955feb8202e215c4018f4bb57469b1b78c7a4c5c7b93001699938", 236 | "sha256:ad881edc7ccb9d65b0224f4e4d05a1e85cf62d73aab798943df6d48ab0cd79a1", 237 | "sha256:b1792d939ec70abe76f5054d3f36ed5656021dcad1322d1cc996d4e54165cef9", 238 | "sha256:b428ef021242344340460fa4c9185d0b1f66fbdbfecc6c63eff4b7c29fad429d", 239 | "sha256:b533558eae785e33e8c148a8d9921692a9fe5aa516efbdff8606e7d87b9d5824", 240 | "sha256:ba59c92039ec0a66103b1d5fe588fa546373587a7d68f5c96f743c3396afc04b", 241 | "sha256:bc8d3bd6c72b2dd9decf16ce70e20abcb3274ba01b4e1c96031e0c4067d1e7cd", 242 | "sha256:bc9db8a3efb3e403e4ecc6cd9489ea2bac94244f80c78e27c31dcc00d2790ac2", 243 | "sha256:bf7d9fce9bcc4752ca4a1b80aabd38f6d19009ea5cbda0e0856983cf6d0023f5", 244 | "sha256:c2dbb44c3f7e6c4d3487b31037b1bdbf424d97687c1747ce4ff2895795c9bf69", 245 | "sha256:c79ebe8f3676a4c6630fd3f777f3cfecf9289666c84e775a67d1d358578dc2e3", 246 | "sha256:c97528e64cb9ebeff9701e7938653a9951922f2a38bd847787d4a8e498cc83ae", 247 | "sha256:d0611a0a2a518464c05ddd5a3a1a0e856ccc10e67079bb17f265ad19ab3c7597", 248 | "sha256:d06adcfa62a4431d404c31216f0f8ac97397d799cd53800e9d3efc2fbb3cf14e", 249 | "sha256:d41997519fcba4a1e46eb4a2fe31bc12f0ff957b2b81bac28db24744f333e955", 250 | "sha256:d5b61785a9ce44e5a4b880272baa7cf6c8f48a5180c3e81c59553ba0cb0821ca", 251 | "sha256:da152d8cdcab0e56e4f45eb08b9aea6455845ec83172092f09b0e077ece2cf7a", 252 | "sha256:da7e547706e69e45d95e116e6939488d62174e033b763ab1496b4c29b76fabea", 253 | "sha256:db5283d90da4174865d520e7366801a93777201e91e79bacbac6e6927cbceede", 254 | "sha256:db608a6757adabb32f1cfe6066e39b3706d8c3aa69bbc353a5b61edad36a5cb4", 255 | "sha256:e0ea21f66820452a3f5d1655f8704a60d66ba1191359b96541eaf457710a5fc6", 256 | "sha256:e7da3fec7408813a7cebc9e4ec55afed2d0fd65c4754bc376bf03498d4e92686", 257 | "sha256:e92a513161077b53447160b9bd8f522edfbed4bd9759e4c18ab05d7ef7e49408", 258 | "sha256:ecb1fa0db7bf4cff9dac752abb19505a233c7f16684c5826d1f11ebd9472b871", 259 | 
"sha256:efda5fc8cc1c61e4f639b8067d118e742b812c930f708e6667a5ce0d13499e29", 260 | "sha256:f0a1dbdb5ecbef0d34eb77e56fcb3e95bbd7e50835d9782a45df81cc46949750", 261 | "sha256:f0a71d85ecdd570ded8ac3d1c0f480842f49a40beb423bb8014539a9f32a5897", 262 | "sha256:f4f270de01dd3e129a72efad823da90cc4d6aafb64c410c9033aba70db9f1ff0", 263 | "sha256:f6cb459eea32a4e2cf18ba5fcece2dbdf496384413bc1bae15583f19e567f3b2", 264 | "sha256:f8ad8285b01b0d4695102546b342b493b3ccc6781fc28c8c6a1bb63e95d22f09", 265 | "sha256:f9f39e2f049db33a908319cf46624a569b36983c7c78318e9726a4cb8923b26c" 266 | ], 267 | "markers": "python_version >= '3.7'", 268 | "version": "==1.4.4" 269 | }, 270 | "markupsafe": { 271 | "hashes": [ 272 | "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003", 273 | "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88", 274 | "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5", 275 | "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7", 276 | "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a", 277 | "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603", 278 | "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1", 279 | "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135", 280 | "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247", 281 | "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6", 282 | "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601", 283 | "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77", 284 | "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02", 285 | "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e", 286 | "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63", 287 | "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f", 288 | "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980", 289 | "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b", 290 | "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812", 291 | "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff", 292 | "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96", 293 | "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1", 294 | "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925", 295 | "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a", 296 | "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6", 297 | "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e", 298 | "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f", 299 | "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4", 300 | "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f", 301 | "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3", 302 | "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c", 303 | "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a", 304 | "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417", 305 | "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a", 306 | "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a", 307 | 
"sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37", 308 | "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452", 309 | "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933", 310 | "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a", 311 | "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7" 312 | ], 313 | "markers": "python_version >= '3.7'", 314 | "version": "==2.1.1" 315 | }, 316 | "matplotlib": { 317 | "hashes": [ 318 | "sha256:01c9de93a2ca0d128c9064f23709362e7fefb34910c7c9e0b8ab0de8258d5eda", 319 | "sha256:41b6e307458988891fcdea2d8ecf84a8c92d53f84190aa32da65f9505546e684", 320 | "sha256:48e1e0859b54d5f2e29bb78ca179fd59b971c6ceb29977fb52735bfd280eb0f5", 321 | "sha256:54a026055d5f8614f184e588f6e29064019a0aa8448450214c0b60926d62d919", 322 | "sha256:556965514b259204637c360d213de28d43a1f4aed1eca15596ce83f768c5a56f", 323 | "sha256:5c988bb43414c7c2b0a31bd5187b4d27fd625c080371b463a6d422047df78913", 324 | "sha256:6a724e3a48a54b8b6e7c4ae38cd3d07084508fa47c410c8757e9db9791421838", 325 | "sha256:6be8df61b1626e1a142c57e065405e869e9429b4a6dab4a324757d0dc4d42235", 326 | "sha256:844a7b0233e4ff7fba57e90b8799edaa40b9e31e300b8d5efc350937fa8b1bea", 327 | "sha256:85f0c9cf724715e75243a7b3087cf4a3de056b55e05d4d76cc58d610d62894f3", 328 | "sha256:a78a3b51f29448c7f4d4575e561f6b0dbb8d01c13c2046ab6c5220eb25c06506", 329 | "sha256:b884715a59fec9ad3b6048ecf3860f3b2ce965e676ef52593d6fa29abcf7d330", 330 | "sha256:b8b53f336a4688cfce615887505d7e41fd79b3594bf21dd300531a4f5b4f746a", 331 | "sha256:c70b6311dda3e27672f1bf48851a0de816d1ca6aaf3d49365fbdd8e959b33d2b", 332 | "sha256:ebfb01a65c3f5d53a8c2a8133fec2b5221281c053d944ae81ff5822a68266617", 333 | "sha256:eeb1859efe7754b1460e1d4991bbd4a60a56f366bc422ef3a9c5ae05f0bc70b5", 334 | "sha256:f15edcb0629a0801738925fe27070480f446fcaa15de65946ff946ad99a59a40", 335 | "sha256:f1c5efc278d996af8a251b2ce0b07bbeccb821f25c8c9846bdcb00ffc7f158aa", 336 | "sha256:f72657f1596199dc1e4e7a10f52a4784ead8a711f4e5b59bea95bdb97cf0e4fd", 337 | "sha256:fc4f526dfdb31c9bd6b8ca06bf9fab663ca12f3ec9cdf4496fb44bc680140318", 338 | "sha256:fcd6f1954943c0c192bfbebbac263f839d7055409f1173f80d8b11a224d236da" 339 | ], 340 | "index": "pypi", 341 | "version": "==3.4.3" 342 | }, 343 | "munch": { 344 | "hashes": [ 345 | "sha256:2d735f6f24d4dba3417fa448cae40c6e896ec1fdab6cdb5e6510999758a4dbd2", 346 | "sha256:6f44af89a2ce4ed04ff8de41f70b226b984db10a91dcc7b9ac2efc1c77022fdd" 347 | ], 348 | "version": "==2.5.0" 349 | }, 350 | "mysql-connector-python": { 351 | "hashes": [ 352 | "sha256:049374e54441903022f1c277a7467e4e7cf72a8d89ca26e86d4fa26b7157346c", 353 | "sha256:08e8bdb0b0cd247213764d115433972d0f5d103a00eb9cd0330294bdbb58cbca", 354 | "sha256:1ebbec05a4279bb2068e270c92f50101cfddb1c551d8c588f34097cde89d8344", 355 | "sha256:2af9bf324649d056e8f1e0f212a046c8794a6b5ac4d7fa2be600db443d0b57ba", 356 | "sha256:47e391ecae349e75ecffb513aec47ec3dbcfc8e2222ef9bd0b0494029eaa2a1b", 357 | "sha256:542d692b8284f8185a8f75f70c9d6c13eef80d2d530444b4f7f130868253e9f9", 358 | "sha256:64078ca692aa7e403e1660d4f8cd50816fee52e28827a9dd10d1cc4fc7ca5339", 359 | "sha256:667c712c0464527faee977d5db48f308e6b2d64396de0b5ba3fd459eda0653d0", 360 | "sha256:6ec8ae4b51487f8b2d542b02e7026dddec92f29239daef2dbfcfbaa9fd5503f2", 361 | "sha256:75fc7a089f1626ffbd22986090ca7cc3359c77ab9c4bde4bab1e30e15d4cbfd9", 362 | "sha256:7892dca8fc03a6e6131bc7359650064085ca803ae1406a104f55470e1c700668", 363 | 
"sha256:7a63dfded577f0a1800c863c4e9bcff7b583bcd369fc1eb4c2ec44b1f907e295", 364 | "sha256:7fa3c4b571e5bab629dbce6013b36ff42efdfe47da6ff14cee25acd1a77649bb", 365 | "sha256:86dc8e57082ee8fd631edeed5299396bd7d842fe455f5347e1ad08ace38b22ea", 366 | "sha256:8a3a8605c5380870a898b4a52c5b0d138e7cb998b192f10552373782d003886d", 367 | "sha256:94abfd76c6ad36f1bcf96f49d76dd55b9e09767eea972669baba9fc385fd9a46", 368 | "sha256:977ba6abdca01840afe27e461ec3a79550b50499782e5ff2933e513a52777870", 369 | "sha256:a261552ef3a2e865a76b751ba7ac3d1d1c4cbc8f167f39436343ef56c8d46d5a", 370 | "sha256:a7b93c14ef59d035e4277a9d637309e8057256efb073cb3db78337ff62c6099b", 371 | "sha256:a836d47f54ee50065ac98917513f2da50957c9cb809daaa144c9f2ab50afbc6f", 372 | "sha256:aa3d5e3656b3b418430b8c5e821f0a9329530a22fe717815c76dba524714d3ff", 373 | "sha256:ad393ddc1974da2b4e952156c3b1a8316f1cb14555b1ea83db6c3619232f8d89", 374 | "sha256:ae17753a4034a79d6ecc9163f8b5c3ea8a9c1ac2c7dac8c0a24b97102b253d26", 375 | "sha256:b947650179a4778d7e13b354a3c7c3b5e13ec00d86727375a0cbba0b43ade82c", 376 | "sha256:bca758bf9e4d936cc745ae4f51472217c0ebcfd54d4aaa85974f0620ac4633cc", 377 | "sha256:cf0c8e41edcd8a02f9ccbe925160ef12486111fcb2641d4551e3b2578afbe2c4", 378 | "sha256:d15136f44fe36c135295719b2635686dbbe1b8043297b3420129368000cf2820", 379 | "sha256:d3469c512a5a48809feeffc34df4c53667ee7b8795ff6e56c90861e1f5386763", 380 | "sha256:d7cccd804cafd2d15c731d06a38a88adf93ece684dd5f68b2bc77c04ed9f4131", 381 | "sha256:dbfe5cd52386a46fd32b59ff7b03974e39ead0bdbb3d23639b8c2dede00ebcdb", 382 | "sha256:de6f3daa99242fcf559d87466ea95f37b6b9cd7257be516440abe6e925548ef9", 383 | "sha256:e12481264dc938178d8225dd06590a6d16dbb1f8af51a7748cffe521afb52546", 384 | "sha256:fcfb722e748ec9219d5caee7c73855e93e67c7c57cd790e49d37c1c8571ba040" 385 | ], 386 | "index": "pypi", 387 | "version": "==8.0.27" 388 | }, 389 | "numpy": { 390 | "hashes": [ 391 | "sha256:01dd17cbb340bf0fc23981e52e1d18a9d4050792e8fb8363cecbf066a84b827d", 392 | "sha256:06005a2ef6014e9956c09ba07654f9837d9e26696a0470e42beedadb78c11b07", 393 | "sha256:09b7847f7e83ca37c6e627682f145856de331049013853f344f37b0c9690e3df", 394 | "sha256:0aaee12d8883552fadfc41e96b4c82ee7d794949e2a7c3b3a7201e968c7ecab9", 395 | "sha256:0cbe9848fad08baf71de1a39e12d1b6310f1d5b2d0ea4de051058e6e1076852d", 396 | "sha256:1b1766d6f397c18153d40015ddfc79ddb715cabadc04d2d228d4e5a8bc4ded1a", 397 | "sha256:33161613d2269025873025b33e879825ec7b1d831317e68f4f2f0f84ed14c719", 398 | "sha256:5039f55555e1eab31124a5768898c9e22c25a65c1e0037f4d7c495a45778c9f2", 399 | "sha256:522e26bbf6377e4d76403826ed689c295b0b238f46c28a7251ab94716da0b280", 400 | "sha256:56e454c7833e94ec9769fa0f86e6ff8e42ee38ce0ce1fa4cbb747ea7e06d56aa", 401 | "sha256:58f545efd1108e647604a1b5aa809591ccd2540f468a880bedb97247e72db387", 402 | "sha256:5e05b1c973a9f858c74367553e236f287e749465f773328c8ef31abe18f691e1", 403 | "sha256:7903ba8ab592b82014713c491f6c5d3a1cde5b4a3bf116404e08f5b52f6daf43", 404 | "sha256:8969bfd28e85c81f3f94eb4a66bc2cf1dbdc5c18efc320af34bffc54d6b1e38f", 405 | "sha256:92c8c1e89a1f5028a4c6d9e3ccbe311b6ba53694811269b992c0b224269e2398", 406 | "sha256:9c88793f78fca17da0145455f0d7826bcb9f37da4764af27ac945488116efe63", 407 | "sha256:a7ac231a08bb37f852849bbb387a20a57574a97cfc7b6cabb488a4fc8be176de", 408 | "sha256:abdde9f795cf292fb9651ed48185503a2ff29be87770c3b8e2a14b0cd7aa16f8", 409 | "sha256:af1da88f6bc3d2338ebbf0e22fe487821ea4d8e89053e25fa59d1d79786e7481", 410 | "sha256:b2a9ab7c279c91974f756c84c365a669a887efa287365a8e2c418f8b3ba73fb0", 411 | 
"sha256:bf837dc63ba5c06dc8797c398db1e223a466c7ece27a1f7b5232ba3466aafe3d", 412 | "sha256:ca51fcfcc5f9354c45f400059e88bc09215fb71a48d3768fb80e357f3b457e1e", 413 | "sha256:ce571367b6dfe60af04e04a1834ca2dc5f46004ac1cc756fb95319f64c095a96", 414 | "sha256:d208a0f8729f3fb790ed18a003f3a57895b989b40ea4dce4717e9cf4af62c6bb", 415 | "sha256:dbee87b469018961d1ad79b1a5d50c0ae850000b639bcb1b694e9981083243b6", 416 | "sha256:e9f4c4e51567b616be64e05d517c79a8a22f3606499941d97bb76f2ca59f982d", 417 | "sha256:f063b69b090c9d918f9df0a12116029e274daf0181df392839661c4c7ec9018a", 418 | "sha256:f9a909a8bae284d46bbfdefbdd4a262ba19d3bc9921b1e76126b1d21c3c34135" 419 | ], 420 | "markers": "python_version >= '3.8'", 421 | "version": "==1.23.5" 422 | }, 423 | "pandas": { 424 | "hashes": [ 425 | "sha256:003ba92db58b71a5f8add604a17a059f3068ef4e8c0c365b088468d0d64935fd", 426 | "sha256:10e10a2527db79af6e830c3d5842a4d60383b162885270f8cffc15abca4ba4a9", 427 | "sha256:22808afb8f96e2269dcc5b846decacb2f526dd0b47baebc63d913bf847317c8f", 428 | "sha256:2d1dc09c0013d8faa7474574d61b575f9af6257ab95c93dcf33a14fd8d2c1bab", 429 | "sha256:35c77609acd2e4d517da41bae0c11c70d31c87aae8dd1aabd2670906c6d2c143", 430 | "sha256:372d72a3d8a5f2dbaf566a5fa5fa7f230842ac80f29a931fb4b071502cf86b9a", 431 | "sha256:42493f8ae67918bf129869abea8204df899902287a7f5eaf596c8e54e0ac7ff4", 432 | "sha256:4acc28364863127bca1029fb72228e6f473bb50c32e77155e80b410e2068eeac", 433 | "sha256:5298a733e5bfbb761181fd4672c36d0c627320eb999c59c65156c6a90c7e1b4f", 434 | "sha256:5ba0aac1397e1d7b654fccf263a4798a9e84ef749866060d19e577e927d66e1b", 435 | "sha256:9707bdc1ea9639c886b4d3be6e2a45812c1ac0c2080f94c31b71c9fa35556f9b", 436 | "sha256:a2aa18d3f0b7d538e21932f637fbfe8518d085238b429e4790a35e1e44a96ffc", 437 | "sha256:a388960f979665b447f0847626e40f99af8cf191bce9dc571d716433130cb3a7", 438 | "sha256:a51528192755f7429c5bcc9e80832c517340317c861318fea9cea081b57c9afd", 439 | "sha256:b528e126c13816a4374e56b7b18bfe91f7a7f6576d1aadba5dee6a87a7f479ae", 440 | "sha256:c1aa4de4919358c5ef119f6377bc5964b3a7023c23e845d9db7d9016fa0c5b1c", 441 | "sha256:c2646458e1dce44df9f71a01dc65f7e8fa4307f29e5c0f2f92c97f47a5bf22f5", 442 | "sha256:c2f44425594ae85e119459bb5abb0748d76ef01d9c08583a667e3339e134218e", 443 | "sha256:d47750cf07dee6b55d8423471be70d627314277976ff2edd1381f02d52dbadf9", 444 | "sha256:d99d2350adb7b6c3f7f8f0e5dfb7d34ff8dd4bc0a53e62c445b7e43e163fce63", 445 | "sha256:dd324f8ee05925ee85de0ea3f0d66e1362e8c80799eb4eb04927d32335a3e44a", 446 | "sha256:eaca36a80acaacb8183930e2e5ad7f71539a66805d6204ea88736570b2876a7b", 447 | "sha256:f567e972dce3bbc3a8076e0b675273b4a9e8576ac629149cf8286ee13c259ae5", 448 | "sha256:fe48e4925455c964db914b958f6e7032d285848b7538a5e1b19aeb26ffaea3ec" 449 | ], 450 | "index": "pypi", 451 | "version": "==1.3.4" 452 | }, 453 | "pillow": { 454 | "hashes": [ 455 | "sha256:03150abd92771742d4a8cd6f2fa6246d847dcd2e332a18d0c15cc75bf6703040", 456 | "sha256:073adb2ae23431d3b9bcbcff3fe698b62ed47211d0716b067385538a1b0f28b8", 457 | "sha256:0b07fffc13f474264c336298d1b4ce01d9c5a011415b79d4ee5527bb69ae6f65", 458 | "sha256:0b7257127d646ff8676ec8a15520013a698d1fdc48bc2a79ba4e53df792526f2", 459 | "sha256:12ce4932caf2ddf3e41d17fc9c02d67126935a44b86df6a206cf0d7161548627", 460 | "sha256:15c42fb9dea42465dfd902fb0ecf584b8848ceb28b41ee2b58f866411be33f07", 461 | "sha256:18498994b29e1cf86d505edcb7edbe814d133d2232d256db8c7a8ceb34d18cef", 462 | "sha256:1c7c8ae3864846fc95f4611c78129301e203aaa2af813b703c55d10cc1628535", 463 | "sha256:22b012ea2d065fd163ca096f4e37e47cd8b59cf4b0fd47bfca6abb93df70b34c", 464 | 
"sha256:276a5ca930c913f714e372b2591a22c4bd3b81a418c0f6635ba832daec1cbcfc", 465 | "sha256:2e0918e03aa0c72ea56edbb00d4d664294815aa11291a11504a377ea018330d3", 466 | "sha256:3033fbe1feb1b59394615a1cafaee85e49d01b51d54de0cbf6aa8e64182518a1", 467 | "sha256:3168434d303babf495d4ba58fc22d6604f6e2afb97adc6a423e917dab828939c", 468 | "sha256:32a44128c4bdca7f31de5be641187367fe2a450ad83b833ef78910397db491aa", 469 | "sha256:3dd6caf940756101205dffc5367babf288a30043d35f80936f9bfb37f8355b32", 470 | "sha256:40e1ce476a7804b0fb74bcfa80b0a2206ea6a882938eaba917f7a0f004b42502", 471 | "sha256:41e0051336807468be450d52b8edd12ac60bebaa97fe10c8b660f116e50b30e4", 472 | "sha256:4390e9ce199fc1951fcfa65795f239a8a4944117b5935a9317fb320e7767b40f", 473 | "sha256:502526a2cbfa431d9fc2a079bdd9061a2397b842bb6bc4239bb176da00993812", 474 | "sha256:51e0e543a33ed92db9f5ef69a0356e0b1a7a6b6a71b80df99f1d181ae5875636", 475 | "sha256:57751894f6618fd4308ed8e0c36c333e2f5469744c34729a27532b3db106ee20", 476 | "sha256:5d77adcd56a42d00cc1be30843d3426aa4e660cab4a61021dc84467123f7a00c", 477 | "sha256:655a83b0058ba47c7c52e4e2df5ecf484c1b0b0349805896dd350cbc416bdd91", 478 | "sha256:68943d632f1f9e3dce98908e873b3a090f6cba1cbb1b892a9e8d97c938871fbe", 479 | "sha256:6c738585d7a9961d8c2821a1eb3dcb978d14e238be3d70f0a706f7fa9316946b", 480 | "sha256:73bd195e43f3fadecfc50c682f5055ec32ee2c933243cafbfdec69ab1aa87cad", 481 | "sha256:772a91fc0e03eaf922c63badeca75e91baa80fe2f5f87bdaed4280662aad25c9", 482 | "sha256:77ec3e7be99629898c9a6d24a09de089fa5356ee408cdffffe62d67bb75fdd72", 483 | "sha256:7db8b751ad307d7cf238f02101e8e36a128a6cb199326e867d1398067381bff4", 484 | "sha256:801ec82e4188e935c7f5e22e006d01611d6b41661bba9fe45b60e7ac1a8f84de", 485 | "sha256:82409ffe29d70fd733ff3c1025a602abb3e67405d41b9403b00b01debc4c9a29", 486 | "sha256:828989c45c245518065a110434246c44a56a8b2b2f6347d1409c787e6e4651ee", 487 | "sha256:829f97c8e258593b9daa80638aee3789b7df9da5cf1336035016d76f03b8860c", 488 | "sha256:871b72c3643e516db4ecf20efe735deb27fe30ca17800e661d769faab45a18d7", 489 | "sha256:89dca0ce00a2b49024df6325925555d406b14aa3efc2f752dbb5940c52c56b11", 490 | "sha256:90fb88843d3902fe7c9586d439d1e8c05258f41da473952aa8b328d8b907498c", 491 | "sha256:97aabc5c50312afa5e0a2b07c17d4ac5e865b250986f8afe2b02d772567a380c", 492 | "sha256:9aaa107275d8527e9d6e7670b64aabaaa36e5b6bd71a1015ddd21da0d4e06448", 493 | "sha256:9f47eabcd2ded7698106b05c2c338672d16a6f2a485e74481f524e2a23c2794b", 494 | "sha256:a0a06a052c5f37b4ed81c613a455a81f9a3a69429b4fd7bb913c3fa98abefc20", 495 | "sha256:ab388aaa3f6ce52ac1cb8e122c4bd46657c15905904b3120a6248b5b8b0bc228", 496 | "sha256:ad58d27a5b0262c0c19b47d54c5802db9b34d38bbf886665b626aff83c74bacd", 497 | "sha256:ae5331c23ce118c53b172fa64a4c037eb83c9165aba3a7ba9ddd3ec9fa64a699", 498 | "sha256:af0372acb5d3598f36ec0914deed2a63f6bcdb7b606da04dc19a88d31bf0c05b", 499 | "sha256:afa4107d1b306cdf8953edde0534562607fe8811b6c4d9a486298ad31de733b2", 500 | "sha256:b03ae6f1a1878233ac620c98f3459f79fd77c7e3c2b20d460284e1fb370557d4", 501 | "sha256:b0915e734b33a474d76c28e07292f196cdf2a590a0d25bcc06e64e545f2d146c", 502 | "sha256:b4012d06c846dc2b80651b120e2cdd787b013deb39c09f407727ba90015c684f", 503 | "sha256:b472b5ea442148d1c3e2209f20f1e0bb0eb556538690fa70b5e1f79fa0ba8dc2", 504 | "sha256:b59430236b8e58840a0dfb4099a0e8717ffb779c952426a69ae435ca1f57210c", 505 | "sha256:b90f7616ea170e92820775ed47e136208e04c967271c9ef615b6fbd08d9af0e3", 506 | "sha256:b9a65733d103311331875c1dca05cb4606997fd33d6acfed695b1232ba1df193", 507 | 
"sha256:bac18ab8d2d1e6b4ce25e3424f709aceef668347db8637c2296bcf41acb7cf48", 508 | "sha256:bca31dd6014cb8b0b2db1e46081b0ca7d936f856da3b39744aef499db5d84d02", 509 | "sha256:be55f8457cd1eac957af0c3f5ece7bc3f033f89b114ef30f710882717670b2a8", 510 | "sha256:c7025dce65566eb6e89f56c9509d4f628fddcedb131d9465cacd3d8bac337e7e", 511 | "sha256:c935a22a557a560108d780f9a0fc426dd7459940dc54faa49d83249c8d3e760f", 512 | "sha256:dbb8e7f2abee51cef77673be97760abff1674ed32847ce04b4af90f610144c7b", 513 | "sha256:e6ea6b856a74d560d9326c0f5895ef8050126acfdc7ca08ad703eb0081e82b74", 514 | "sha256:ebf2029c1f464c59b8bdbe5143c79fa2045a581ac53679733d3a91d400ff9efb", 515 | "sha256:f1ff2ee69f10f13a9596480335f406dd1f70c3650349e2be67ca3139280cade0" 516 | ], 517 | "index": "pypi", 518 | "version": "==9.3.0" 519 | }, 520 | "protobuf": { 521 | "hashes": [ 522 | "sha256:2c9c2ed7466ad565f18668aa4731c535511c5d9a40c6da39524bccf43e441719", 523 | "sha256:48e2cd6b88c6ed3d5877a3ea40df79d08374088e89bedc32557348848dff250b", 524 | "sha256:5b0834e61fb38f34ba8840d7dcb2e5a2f03de0c714e0293b3963b79db26de8ce", 525 | "sha256:61f21493d96d2a77f9ca84fefa105872550ab5ef71d21c458eb80edcf4885a99", 526 | "sha256:6e0be9f09bf9b6cf497b27425487706fa48c6d1632ddd94dab1a5fe11a422392", 527 | "sha256:6e312e280fbe3c74ea9e080d9e6080b636798b5e3939242298b591064470b06b", 528 | "sha256:7eb8f2cc41a34e9c956c256e3ac766cf4e1a4c9c925dc757a41a01be3e852965", 529 | "sha256:84ea107016244dfc1eecae7684f7ce13c788b9a644cd3fca5b77871366556444", 530 | "sha256:9227c14010acd9ae7702d6467b4625b6fe853175a6b150e539b21d2b2f2b409c", 531 | "sha256:a419cc95fca8694804709b8c4f2326266d29659b126a93befe210f5bbc772536", 532 | "sha256:a7d0ea43949d45b836234f4ebb5ba0b22e7432d065394b532cdca8f98415e3cf", 533 | "sha256:b5ab0b8918c136345ff045d4b3d5f719b505b7c8af45092d7f45e304f55e50a1", 534 | "sha256:e575c57dc8b5b2b2caa436c16d44ef6981f2235eb7179bfc847557886376d740", 535 | "sha256:f9eae277dd240ae19bb06ff4e2346e771252b0e619421965504bd1b1bba7c5fa" 536 | ], 537 | "markers": "python_version >= '3.7'", 538 | "version": "==4.21.9" 539 | }, 540 | "py4j": { 541 | "hashes": [ 542 | "sha256:624f97c363b8dd84822bc666b12fa7f7d97824632b2ff3d852cc491359ce7615", 543 | "sha256:bf0485388e415ff26710d2dc719cb0ede16cf1164b1ee757e0ebb2e98c471521" 544 | ], 545 | "version": "==0.10.9.2" 546 | }, 547 | "pyparsing": { 548 | "hashes": [ 549 | "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb", 550 | "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc" 551 | ], 552 | "markers": "python_full_version >= '3.6.8'", 553 | "version": "==3.0.9" 554 | }, 555 | "pyproj": { 556 | "hashes": [ 557 | "sha256:0fff9c3a991508f16027be27d153f6c5583d03799443639d13c681e60f49e2d7", 558 | "sha256:12f62c20656ac9b6076ebb213e9a635d52f4f01fef95310121d337e62e910cb6", 559 | "sha256:14ad113b5753c6057f9b2f3c85a6497cef7fa237c4328f2943c0223e98c1dde6", 560 | "sha256:1f9c100fd0fd80edbc7e4daa303600a8cbef6f0de43d005617acb38276b88dc0", 561 | "sha256:221d8939685e0c43ee594c9f04b6a73a10e8e1cc0e85f28be0b4eb2f1bc8777d", 562 | "sha256:25a36e297f3e0524694d40259e3e895edc1a47492a0e30608268ffc1328e3f5d", 563 | "sha256:2cb8592259ea54e7557523b079d3f2304081680bdb48bfbf0fd879ee6156129c", 564 | "sha256:3b85acf09e5a9e35cd9ee72989793adb7089b4e611be02a43d3d0bda50ad116b", 565 | "sha256:45554f47d1a12a84b0620e4abc08a2a1b5d9f273a4759eaef75e74788ec7162a", 566 | "sha256:4688b4cd62cbd86b5e855f9e27d90fbb53f2b4c2ea1cd394a46919e1a4151b89", 567 | "sha256:47ad53452ae1dc8b0bf1df920a210bb5616989085aa646592f8681f1d741a754", 568 | 
"sha256:48787962232109bad8b72e27949037a9b03591228a6955f25dbe451233e8648a", 569 | "sha256:4a23d84c5ffc383c7d9f0bde3a06fc1f6697b1b96725597f8f01e7b4bef0a2b5", 570 | "sha256:4e161114bc92701647a83c4bbce79489984f12d980cabb365516e953d1450885", 571 | "sha256:4fd425ee8b6781c249c7adb7daa2e6c41ce573afabe4f380f5eecd913b56a3be", 572 | "sha256:52e54796e2d9554a5eb8f11df4748af1fbbc47f76aa234d6faf09216a84554c5", 573 | "sha256:5816807ca0bdc7256558770c6206a6783a3f02bcf844f94ee245f197bb5f7285", 574 | "sha256:65a0bcdbad95b3c00b419e5d75b1f7e450ec17349b5ea16bf7438ac1d50a12a2", 575 | "sha256:77d5f519f3cdb94b026ecca626f78db4f041afe201cf082079c8c0092a30b087", 576 | "sha256:82200b4569d68b421c079d2973475b58d5959306fe758b43366e79fe96facfe5", 577 | "sha256:954b068136518b3174d0a99448056e97af62b63392a95c420894f7de2229dae6", 578 | "sha256:9a496d9057b2128db9d733e66b206f2d5954bbae6b800d412f562d780561478c", 579 | "sha256:a454a7c4423faa2a14e939d08ef293ee347fa529c9df79022b0585a6e1d8310c", 580 | "sha256:a708445927ace9857f52c3ba67d2915da7b41a8fdcd9b8f99a4c9ed60a75eb33", 581 | "sha256:aa5171f700f174777a9e9ed8f4655583243967c0f9cf2c90e3f54e54ff740134", 582 | "sha256:ccb4b70ad25218027f77e0c8934d10f9b7cdf91d5e64080147743d58fddbc3c0", 583 | "sha256:d94afed99f31673d3d19fe750283621e193e2a53ca9e0443bf9d092c3905833b", 584 | "sha256:e7e609903572a56cca758bbaee5c1663c3e829ddce5eec4f368e68277e37022b", 585 | "sha256:f343725566267a296b09ee7e591894f1fdc90f84f8ad5ec476aeb53bd4479c07", 586 | "sha256:f80adda8c54b84271a93829477a01aa57bc178c834362e9f74e1de1b5033c74c" 587 | ], 588 | "markers": "python_version >= '3.8'", 589 | "version": "==3.4.0" 590 | }, 591 | "pyspark": { 592 | "hashes": [ 593 | "sha256:bfea06179edbfb4bc76a0f470bd3c38e12f00e1023e3ad0373558d07cff102ab" 594 | ], 595 | "index": "pypi", 596 | "version": "==3.2.0" 597 | }, 598 | "python-dateutil": { 599 | "hashes": [ 600 | "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", 601 | "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" 602 | ], 603 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 604 | "version": "==2.8.2" 605 | }, 606 | "python-dotenv": { 607 | "hashes": [ 608 | "sha256:14f8185cc8d494662683e6914addcb7e95374771e707601dfc70166946b4c4b8", 609 | "sha256:bbd3da593fc49c249397cbfbcc449cf36cb02e75afc8157fcc6a81df6fb7750a" 610 | ], 611 | "index": "pypi", 612 | "version": "==0.19.1" 613 | }, 614 | "pytz": { 615 | "hashes": [ 616 | "sha256:222439474e9c98fced559f1709d89e6c9cbf8d79c794ff3eb9f8800064291427", 617 | "sha256:e89512406b793ca39f5971bc999cc538ce125c0e51c27941bef4568b460095e2" 618 | ], 619 | "version": "==2022.6" 620 | }, 621 | "requests": { 622 | "hashes": [ 623 | "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983", 624 | "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349" 625 | ], 626 | "markers": "python_version >= '3.7' and python_version < '4'", 627 | "version": "==2.28.1" 628 | }, 629 | "setuptools": { 630 | "hashes": [ 631 | "sha256:6211d2f5eddad8757bd0484923ca7c0a6302ebc4ab32ea5e94357176e0ca0840", 632 | "sha256:d1eebf881c6114e51df1664bc2c9133d022f78d12d5f4f665b9191f084e2862d" 633 | ], 634 | "markers": "python_version >= '3.7'", 635 | "version": "==65.6.0" 636 | }, 637 | "shapely": { 638 | "hashes": [ 639 | "sha256:02dd5d7dc6e46515d88874134dc8fcdc65826bca93c3eecee59d1910c42c1b17", 640 | "sha256:0b4ee3132ee90f07d63db3aea316c4c065ed7a26231458dda0874414a09d6ba3", 641 | "sha256:0d885cb0cf670c1c834df3f371de8726efdf711f18e2a75da5cfa82843a7ab65", 642 | 
"sha256:147066da0be41b147a61f8eb805dea3b13709dbc873a431ccd7306e24d712bc0", 643 | "sha256:21776184516a16bf82a0c3d6d6a312b3cd15a4cabafc61ee01cf2714a82e8396", 644 | "sha256:2e0a8c2e55f1be1312b51c92b06462ea89e6bb703fab4b114e7a846d941cfc40", 645 | "sha256:2fd15397638df291c427a53d641d3e6fd60458128029c8c4f487190473a69a91", 646 | "sha256:3480657460e939f45a7d359ef0e172a081f249312557fe9aa78c4fd3a362d993", 647 | "sha256:370b574c78dc5af3a198a6da5d9b3d7c04654bd2ef7e80e80a3a0992dfb2d9cd", 648 | "sha256:38f0fbbcb8ca20c16451c966c1f527cc43968e121c8a048af19ed3e339a921cd", 649 | "sha256:4728666fff8cccc65a07448cae72c75a8773fea061c3f4f139c44adc429b18c3", 650 | "sha256:48dcfffb9e225c0481120f4bdf622131c8c95f342b00b158cdbe220edbbe20b6", 651 | "sha256:532a55ee2a6c52d23d6f7d1567c8f0473635f3b270262c44e1b0c88096827e22", 652 | "sha256:5d7f85c2d35d39ff53c9216bc76b7641c52326f7e09aaad1789a3611a0f812f2", 653 | "sha256:65b21243d8f6bcd421210daf1fabb9de84de2c04353c5b026173b88d17c1a581", 654 | "sha256:66bdac74fbd1d3458fa787191a90fa0ae610f09e2a5ec398c36f968cc0ed743f", 655 | "sha256:6d388c0c1bd878ed1af4583695690aa52234b02ed35f93a1c8486ff52a555838", 656 | "sha256:6fe855e7d45685926b6ba00aaeb5eba5862611f7465775dacd527e081a8ced6d", 657 | "sha256:753ed0e21ab108bd4282405b9b659f2e985e8502b1a72b978eaa51d3496dee19", 658 | "sha256:783bad5f48e2708a0e2f695a34ed382e4162c795cb2f0368b39528ac1d6db7ed", 659 | "sha256:78fb9d929b8ee15cfd424b6c10879ce1907f24e05fb83310fc47d2cd27088e40", 660 | "sha256:84010db15eb364a52b74ea8804ef92a6a930dfc1981d17a369444b6ddec66efd", 661 | "sha256:8d086591f744be483b34628b391d741e46f2645fe37594319e0a673cc2c26bcf", 662 | "sha256:8e59817b0fe63d34baedaabba8c393c0090f061917d18fc0bcc2f621937a8f73", 663 | "sha256:99a2f0da0109e81e0c101a2b4cd8412f73f5f299e7b5b2deaf64cd2a100ac118", 664 | "sha256:99ab0ddc05e44acabdbe657c599fdb9b2d82e86c5493bdae216c0c4018a82dee", 665 | "sha256:a23ef3882d6aa203dd3623a3d55d698f59bfbd9f8a3bfed52c2da05a7f0f8640", 666 | "sha256:a354199219c8d836f280b88f2c5102c81bb044ccea45bd361dc38a79f3873714", 667 | "sha256:a74631e511153366c6dbe3229fa93f877e3c87ea8369cd00f1d38c76b0ed9ace", 668 | "sha256:ab38f7b5196ace05725e407cb8cab9ff66edb8e6f7bb36a398e8f73f52a7aaa2", 669 | "sha256:adcf8a11b98af9375e32bff91de184f33a68dc48b9cb9becad4f132fa25cfa3c", 670 | "sha256:b65f5d530ba91e49ffc7c589255e878d2506a8b96ffce69d3b7c4500a9a9eaf8", 671 | "sha256:be9423d5a3577ac2e92c7e758bd8a2b205f5e51a012177a590bc46fc51eb4834", 672 | "sha256:c2822111ddc5bcfb116e6c663e403579d0fe3f147d2a97426011a191c43a7458", 673 | "sha256:c6a9a4a31cd6e86d0fbe8473ceed83d4fe760b19d949fb557ef668defafea0f6", 674 | "sha256:d048f93e42ba578b82758c15d8ae037d08e69d91d9872bca5a1895b118f4e2b0", 675 | "sha256:e9c30b311de2513555ab02464ebb76115d242842b29c412f5a9aa0cac57be9f6", 676 | "sha256:ec14ceca36f67cb48b34d02d7f65a9acae15cd72b48e303531893ba4a960f3ea", 677 | "sha256:ef3be705c3eac282a28058e6c6e5503419b250f482320df2172abcbea642c831" 678 | ], 679 | "markers": "python_version >= '3.6'", 680 | "version": "==1.8.5.post1" 681 | }, 682 | "six": { 683 | "hashes": [ 684 | "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", 685 | "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" 686 | ], 687 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 688 | "version": "==1.16.0" 689 | }, 690 | "urllib3": { 691 | "hashes": [ 692 | "sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e", 693 | "sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997" 694 | ], 695 | 
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' and python_version < '4'", 696 | "version": "==1.26.12" 697 | } 698 | }, 699 | "develop": {} 700 | } 701 | -------------------------------------------------------------------------------- /PyPI_README.md: -------------------------------------------------------------------------------- 1 | # SparkPlus 2 | Spark+는 H3, 위/경도 좌표 등의 공간 정보를 국내 주소체계(신주소/구주소)와 함께 처리할 수 있도록 지원하는 Package입니다. 3 | 4 | ## Setup 5 | 6 | [GitHub](https://github.com/SWM-SparkPlus/sparkplus/) 7 | [개발자 가이드 참고](https://github.com/SWM-SparkPlus/sparkplus/wiki) 8 | 9 | - Spark+는 PyPI에 배포되어 있으며, 다음 커맨드로 설치할 수 있습니다. 10 | ``` 11 | $ pip install sparkplus 12 | ``` 13 | 14 | - 설치 후에 import하여 사용할 수 있습니다. 15 | ``` 16 | from sparkplus.core import CoordDataFrame, RoadnameDataFrame, NumAddrDataFrame 17 | ``` 18 | 19 | ## Class 20 | 21 | ### CoordDataFrame 22 | 위치 좌표를 포함하는 데이터프레임을 주소체계 데이터베이스와 연동하여 pnu코드, h3, 우편번호, 법정동코드, 도로명주소(시도/시군구/읍면동/법정리/도로명/지하여부/건물 본번/건물 부번), 도로명주소(전체), 지번주소(시도/시군구/읍면동/법정리/지번 본번/지번 분번) 등의 컬럼을 추가합니다. 23 | ``` 24 | coord_df = CoordDataFrame(source_df, geo_df, table_df, x_colname, y_colname) 25 | ``` 26 | | 위도| 경도| PNU| manage_number|roadname_code|zipcode| sido|sigungu|eupmyeondong|bupjungli| roadname|is_basement|building_primary_number|building_secondary_number|jibun_primary_number|jibun_secondary_number|bupjungdong_code| 27 | |-----------|-----------|-------------------|--------------------|-------------|-------|----------|-------|------------|---------|---------------|-----------|-----------------------|-------------------------|--------------------|----------------------|----------------| 28 | |35.86341579|128.6024286|2711010600101990000|27110106001000300...| 271103007017| 41940| 대구광역시| 중구| 삼덕동2가| | 공평로| 0| 46| 0| 3| 4| 2711010600| 29 | |35.86516734|128.6105401|2711010700103790000|27110107001003100...| 271104223055| 41945| 대구광역시| 중구| 삼덕동3가| | 달구벌대로443길| 0| 62| 16| 31| 2| 2711010700| 30 | |35.86927185|128.5937782|2711011700101200003|27110115001008500...| 271102007001| 41909| 대구광역시| 중구| 남일동| | 중앙대로| 1| 424| 0| 143| 1| 2711011700| 31 | 32 | ### RoadnameDataFrame 33 | 비정형 도로명주소를 포함하는 데이터프레임을 주소체계 데이터베이스와 연동하여 분석 및 시각화할 수 있는 형태로 전처리한 시도, 시군구, 읍면동, 도로명, 건물 본번, 법정동코드 등의 컬럼을 추가합니다. 34 | ``` 35 | roadname_df = RoadnameDataFrame(source_df) 36 | ``` 37 | |target |sido |sigungu |roadname |building_primary_number|bupjungdong_code| 38 | |----------------------------------------|------|-----------|---------|-----------------------|----------------| 39 | |경기도 안산시 단원구 해봉로 137 |경기도 |안산시 단원구 |해봉로 |137 |4128112400 | 40 | |경기도 수원시 장안구 경수대로 1079 |경기도 |수원시 장안구 |경수대로 |1079 |4128111800 | 41 | |경기도 안산시 상록구 양달말길 93-7 |경기도 |안산시 상록구 |양달말길 |93 |4128101100 | 42 | 43 | 44 | ## LICENSE 45 | [MIT](https://github.com/SWM-SparkPlus/db-updater/blob/master/LICENSE) 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SparkPlus 2 | Spark+는 H3, 위/경도 좌표 등의 공간 정보를 국내 주소체계(신주소/구주소)와 함께 처리할 수 있도록 지원하는 Package입니다. 3 | 4 | ## Spark+ 아키텍처 5 | 6 | [RDW Reference Architecture](https://github.com/SWM-SparkPlus/rdw-reference-architecture) 7 | 8 | ![](https://github.com/SWM-SparkPlus/sparkplus/blob/master/static/sparkplus_arch_finale.png) 9 | 10 | 11 | ## Setup 12 | 13 | [개발자 가이드 참고](https://github.com/SWM-SparkPlus/sparkplus/wiki) 14 | 15 | - Spark+는 PyPI에 배포되어 있으며, 다음 커맨드로 설치할 수 있습니다. 
16 | ```s 17 | $ pip install sparkplus 18 | ``` 19 | 20 | - 설치 후에 import하여 사용할 수 있습니다. 21 | ```py 22 | from sparkplus.core import CoordDataFrame, AddressDataFrame 23 | ``` 24 | 25 | ## Class 26 | 27 | ### CoordDataFrame 28 | 위치 좌표를 포함하는 데이터프레임을 주소체계 데이터베이스와 연동하여 pnu코드, h3, 우편번호, 법정동코드, 도로명주소(시도/시군구/읍면동/법정리/도로명/지하여부/건물 본번/건물 부번), 도로명주소(전체), 지번주소(시도/시군구/읍면동/법정리/지번 본번/지번 분번) 등의 컬럼을 추가합니다. 29 | ```py 30 | res_df = CoordDataFrame(source_df, geo_df, table_df, x_colname, y_colname) 31 | 32 | # example 33 | +-----------+-----------+-------------------+--------------------+-------------+-------+----------+-------+------------+---------+---------------+-----------+-----------------------+-------------------------+--------------------+----------------------+----------------+ 34 | | 위도| 경도| PNU| manage_number|roadname_code|zipcode| sido|sigungu|eupmyeondong|bupjungli| roadname|is_basement|building_primary_number|building_secondary_number|jibun_primary_number|jibun_secondary_number|bupjungdong_code| 35 | +-----------+-----------+-------------------+--------------------+-------------+-------+----------+-------+------------+---------+---------------+-----------+-----------------------+-------------------------+--------------------+----------------------+----------------+ 36 | |35.86341579|128.6024286|2711010600101990000|27110106001000300...| 271103007017| 41940| 대구광역시| 중구| 삼덕동2가| | 공평로| 0| 46| 0| 3| 4| 2711010600| 37 | |35.86516734|128.6105401|2711010700103790000|27110107001003100...| 271104223055| 41945| 대구광역시| 중구| 삼덕동3가| | 달구벌대로443길| 0| 62| 16| 31| 2| 2711010700| 38 | |35.86927185|128.5937782|2711011700101200003|27110115001008500...| 271102007001| 41909| 대구광역시| 중구| 남일동| | 중앙대로| 1| 424| 0| 143| 1| 2711011700| 39 | +-----------+-----------+-------------------+--------------------+-------------+-------+----------+-------+------------+---------+---------------+-----------+-----------------------+-------------------------+--------------------+----------------------+----------------+ 40 | ``` 41 | 42 | ### AddressDataFrame 43 | 비정형 도로명주소 또는 지번주소를 포함하는 데이터프레임을 주소체계 데이터베이스와 연동하여 분석 및 시각화할 수 있는 형태의 시도, 시군구, 읍면동, 법정동코드, 시군구코드 등의 컬럼을 추가합니다. 
44 | ```py 45 | res_df = AddressDataFrame(source_df).to_bupjungdong("target_colname", table_df) 46 | 47 | # example 48 | +--------------------------+----------+------------+-----------------+----------------+------------+ 49 | | 받는분주소| sido_name|sigungu_name|eupmyeondong_name|bupjungdong_code|sigungu_code| 50 | +--------------------------+----------+------------+-----------------+----------------+------------+ 51 | | 서울특별시 강남구 가로수길 75| 서울특별시| 강남구| 신사동| 1168010700| 11680| 52 | | 서울특별시 강남구 강남대로 346| 서울특별시| 강남구| 역삼동| 1168010100| 11680| 53 | |서울특별시 강남구 논현로 120길 20| 서울특별시| 강남구| 논현동| 1168010800| 11680| 54 | +--------------------------+----------+------------+-----------------+----------------+------------+ 55 | ``` 56 | 57 | ## LICENSE 58 | [MIT](https://github.com/SWM-SparkPlus/db-updater/blob/master/LICENSE) 59 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from pathlib import Path 3 | 4 | this_directory= Path(__file__).parent 5 | long_description = (this_directory / "PyPI_README.md").read_text() 6 | 7 | setup( 8 | name="sparkplus", 9 | version="1.3.0", 10 | description="GIS package for Apache Spark", 11 | long_description=long_description, 12 | long_description_content_type="text/markdown", 13 | author="sparkplus", 14 | author_email="meadea27@gmail.com", 15 | url="https://github.com/SWM-SparkPlus/sparkplus", 16 | license="MIT", 17 | # py_modules=['conversion', 'load_database'], 18 | python_requires=">=3", 19 | install_requires=[ 20 | "numpy", 21 | "pandas", 22 | "geopandas", 23 | "geospark", 24 | "h3", 25 | "geopy", 26 | "pyarrow", 27 | "rtree", 28 | "shapely", 29 | "python-dotenv", 30 | ], 31 | include_package_data=True, 32 | zip_safe=False, 33 | packages=find_packages(), 34 | keywords=["spark", "gis"], 35 | classifiers=[ 36 | "Programming Language :: Python :: 3", 37 | "License :: OSI Approved :: MIT License", 38 | "Operating System :: OS Independent", 39 | ], 40 | ) 41 | -------------------------------------------------------------------------------- /sparkplus/__init__.py: -------------------------------------------------------------------------------- 1 | # from .dependencies import spark 2 | from .core import CoordDataFrame, AddressDataFrame, load_tables, load_gdf 3 | 4 | __all__ = ["spark", "CoordDataFrame", "AddressDataFrame", "load_tables", "load_gdf"] 5 | -------------------------------------------------------------------------------- /sparkplus/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .coord_dataframe import CoordDataFrame 2 | from .address_dataframe import AddressDataFrame 3 | from .numaddr_dataframe import NumAddrDataFrame 4 | from .utils import load_tables, load_gdf 5 | from .tablename import ( 6 | EPrefix, 7 | ESido, 8 | get_tablename_by_prefix_and_sido, 9 | get_all_tablenames_by_prefix, 10 | ) 11 | 12 | __all__ = [ 13 | 
"CoordDataFrame", 14 | "AddressDataFrame", 15 | "NumAddrDataFrame", 16 | "load_tables", 17 | "load_gdf", 18 | "EPrefix", 19 | "ESido", 20 | "get_tablename_by_prefix_and_sido", 21 | "get_all_tablenames_by_prefix", 22 | ] 23 | -------------------------------------------------------------------------------- /sparkplus/core/address_dataframe.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | sys.path.append( 5 | os.path.dirname(os.path.dirname(os.path.abspath(os.path.dirname(__file__)))) 6 | ) 7 | 8 | from pyspark.sql import DataFrame 9 | from pyspark.sql.functions import split, col, lit 10 | from sparkplus.core.udfs import * 11 | from pyspark.sql.functions import when 12 | 13 | 14 | class AddressDataFrame(object): 15 | """ 16 | 도로명 주소를 활용하여 데이터를 분석하기 위한 클래스입니다 17 | """ 18 | 19 | def __init__(self, dataFrame: DataFrame): 20 | self._df = dataFrame 21 | self._tmp_df = dataFrame 22 | self.col_list = dataFrame.columns 23 | 24 | def to_bupjungdong(self, target: str, db_df: DataFrame): 25 | """ 26 | 도로명을 지번으로 변경하는 전 과정을 포함하는 함수입니다 27 | """ 28 | self.add_split(target) 29 | self.add_sido() 30 | self.add_sigungu() 31 | self.add_eupmyeon() 32 | self.add_dong() 33 | self.add_roadname() 34 | self.add_building_primary_number() 35 | self.add_jibun_primary_number() 36 | self.join_with_db(db_df) 37 | return self._df 38 | 39 | def add_split(self, target: str): 40 | """ 41 | DB에서 조회를 위해 원본의 string을 공백 기준으로 나누는 함수입니다. 42 | 43 | Parameters 44 | ---------- 45 | target : str 46 | split하고 조작할 원본 데이터의 컬럼명 47 | 48 | Examples 49 | -------- 50 | >>> road_df = RoadnameDataframe(your_df) 51 | >>> road_df._df.show() 52 | +------------------------------+s 53 | |target | 54 | +------------------------------+ 55 | |경기도 화성시 장안면 매바위로366번길 8 | 56 | |경기도 화성시 장안면 버들로 | 57 | |경기도 화성시 장안면 석포리 | 58 | +------------------------------+ 59 | 60 | >>> splited_df = road_df.add_split('target') 61 | >>> splited_df.show() 62 | +------------------------------+-----------------------------------+ 63 | |target |split | 64 | +------------------------------+-----------------------------------+ 65 | |경기도 화성시 장안면 매바위로366번길 8|[경기도, 화성시, 장안면, 매바위로366번길, 8]| 66 | |경기도 화성시 장안면 버들로 |[경기도, 화성시, 장안면, 버들로] | 67 | |경기도 화성시 장안면 석포리 |[경기도, 화성시, 장안면, 석포리] | 68 | +-----------------------------+------------------------------------+ 69 | """ 70 | self._df = self._df.withColumn("split", split(self._df[target], " ")) 71 | return self._df 72 | 73 | def cleanse_split_column(self): 74 | """ 75 | 주소가 비정형 데이터일 경우 사용되는 함수입니다. 76 | add_split_column 함수로 쪼개진 split 컬럼의 데이터를 전처리합니다. 77 | 78 | UDF 79 | --- 80 | where_is_sido : IntegerType 81 | split 컬럼에서 특별시와 광역시, 도를 찾고, 위치한 인덱스를 반환합니다. 82 | 83 | Exmaple 84 | ------- 85 | >>> df.show() 86 | +---------------------------------------------+ 87 | |split | 88 | +---------------------------------------------+ 89 | |[[185-74], 경기도, 화성시, 장안면,매바위로366번길, 8]| 90 | |[경기도, 화성시, 장안면, 버들로] | 91 | |[경기도, 화성시, 장안면, 석포리] | 92 | +--------------------------------------------+ 93 | 94 | >>> df.withColumn('idx', where_is_sido(split)).show() 95 | +---------------------------------------------+----+ 96 | |split |sido| 97 | +---------------------------------------------+----+ 98 | |[[185-74], 경기도, 화성시, 장안면,매바위로366번길, 8]| 1| 99 | |[경기도, 화성시, 장안면, 버들로] | 0| 100 | |[경기도, 화성시, 장안면, 석포리] | 2| 101 | +--------------------------------------------+----+ 102 | 103 | cleanse_split: ArrayType(StringType) 104 | split 컬럼과 인덱스 컬럼을 활용하여 알맞은 주소체계 값으로 반환합니다. 
105 | 106 | Example 107 | ------- 108 | >>> df.show() 109 | +------------------------------------------------+---+ 110 | |split |idx| 111 | +------------------------------------------------+---+ 112 | |[[185-74], 경기도, 화성시, 장안면,매바위로366번길, 8] | 1| 113 | |[경기도, 화성시, 장안면, 버들로] | 0| 114 | |[Gyeonggi-do, [185-74], 경기도, 화성시, 장안면, 석포리]| 2| 115 | +------------------------------------------------+---+ 116 | 117 | >>> df.withColumn('split', cleanse_split(df.split)) 118 | +----------------------------------------+ 119 | |split | 120 | +----------------------------------------+ 121 | |[경기도, 화성시, 장안면,매바위로366번길, 8] | 122 | |[경기도, 화성시, 장안면, 버들로] | 123 | |[경기도, 화성시, 장안면, 석포리] | 124 | +---------------------------------------+ 125 | """ 126 | 127 | self._df = self._df.withColumn("idx", where_is_sido(self._df.split)).withColumn( 128 | "split", cleanse_split(self._df.idx, self._df.split) 129 | ) 130 | self._df = self._df.drop("idx") 131 | self._df = self._df.withColumn("split", process_roadname(self._df.split)) 132 | return self._df 133 | 134 | def add_sido(self): 135 | """ 136 | 특별시, 광역시, 도를 기존 데이터프레임에 추가하는 함수입니다. 137 | 138 | UDF 139 | --- 140 | extract_sido : StringType 141 | split 컬럼에서 특별시와 광역시, 도를 찾고 값을 반환합니다. 142 | 값이 없는 경우, "None" : str 을 반환합니다. 143 | 144 | Exmaple 145 | ------- 146 | >>> df.show() 147 | +----------------------------------------+ 148 | |split | 149 | +----------------------------------------+ 150 | |[경기도, 안산시, 단원구, 해봉로, 137] | 151 | |[경기도, 수원시, 장안구, 경수대로, 1079] | 152 | |[경기도, 안산시, 상록구, 양달말길, 93-7] | 153 | +----------------------------------------+ 154 | 155 | >>> df.withColumn('idx', extract_sido()).show() 156 | +----------------------------------------------+-----+ 157 | |split |sido | 158 | +----------------------------------------------+-----+ 159 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 | 160 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 | 161 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 | 162 | +----------------------------------------------+------+ 163 | """ 164 | 165 | self._df = self._df.withColumn("sido", extract_sido(self._df.split)) 166 | return self._df 167 | 168 | def add_sigungu(self): 169 | """ 170 | 시, 군, 구 컬럼을 기존 데이터프레임에 추가하는 함수입니다. 171 | UDF 172 | --- 173 | extract_sigungu : StringType 174 | split 컬럼에서 시, 군, 구를 찾고 값을 반환합니다. 175 | 176 | 시와 구가 같이 있을경우에는 시와 구를 같이 반환합니다. 177 | ex) 경기도 성남시 분당구 -> 성남시 분당구 178 | 179 | 값이 없는 경우, "None" : str 을 반환합니다. 180 | 181 | Exmaple 182 | ------- 183 | >>> df.show() 184 | +----------------------------------------------+-----+ 185 | |split |sido | 186 | +----------------------------------------------+-----+ 187 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 | 188 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 | 189 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 | 190 | +----------------------------------------------+------+ 191 | 192 | >>> df.withColumn('idx', extract_sigungu()).show() 193 | +----------------------------------------------+------+-----------+ 194 | |split |sido |sigungu | 195 | +----------------------------------------------+------+-----------+ 196 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 | 197 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 | 198 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 | 199 | +----------------------------------------------+------+-----------+ 200 | """ 201 | 202 | self._df = self._df.withColumn("sigungu", extract_sigungu(self._df.split)) 203 | return self._df 204 | 205 | def add_eupmyeon(self): 206 | """ 207 | 읍, 면 컬럼을 기존에 데이터프레임에 추가하는 함수입니다. 208 | 209 | UDF 210 | --- 211 | extract_eupmyeon : StringType 212 | split 컬럼에서 읍이나 면을 찾고 값을 반환합니다. 
213 | 214 | 값이 없는 경우, "None" : str 을 반환합니다. 215 | 216 | Exmaple 217 | ------- 218 | >>> df.show() 219 | +----------------------------------------------+------+-----------+ 220 | |split |sido |sigungu | 221 | +----------------------------------------------+------+-----------+ 222 | |[경기도, 화성시, 장안면, 매바위로366번길, 8] |경기도 |화성시 | 223 | |[강원도, 원주시, 호저면, 사제로, 9] |강원도 |원주시 | 224 | |[경상남도, 사천시, 곤양면, 경충로, 23-1] |경상남도|사천시 | 225 | +----------------------------------------------+------+-----------+ 226 | 227 | >>> df.withColumn('idx', extract_eupmyeon()).show() 228 | +----------------------------------------------+------+-----------+--------+ 229 | |split |sido |sigungu |eupmyeon| 230 | +----------------------------------------------+------+-----------+--------+ 231 | |[경기도, 화성시, 장안면, 매바위로366번길, 8] |경기도 |화성시 |장안면 | 232 | |[강원도, 원주시, 호저면, 사제로, 9] |강원도 |원주시 |호저면 | 233 | |[경상남도, 사천시, 곤양면, 경충로, 23-1] |경상남도|사천시 |곤양면 | 234 | +----------------------------------------------+------+-----------+-------+ 235 | """ 236 | self._df = self._df.withColumn("eupmyeondong", extract_eupmyeondong(self._df.split)) 237 | return self._df 238 | 239 | def add_dong(self): 240 | """ 241 | 데이터프레임에 동이 포함되어있는지 확인하고 동 컬럼을 추가하는 함수입니다. 242 | 243 | UDF 244 | --- 245 | extract_dong : StringType 246 | split 컬럼에서 읍이나 면을 찾고 값을 반환합니다. 247 | 248 | 값이 없는 경우, "None" : str 을 반환합니다. 249 | 250 | Exmaple 251 | ------- 252 | >>> df.show() 253 | +-------------------------+--------+-----------+ 254 | |split |sido |sigungu | 255 | +-------------------------+--------+-----------+ 256 | |[경기도, 성남시, 분당구, 금곡동]|경기도 |성남시 | 257 | |[충청남도, 공주시, 검상동] |강원도 |공주시 | 258 | |[대전광역시, 동구, 가오동] |대전광역시|동구 | 259 | +-------------------------+--------+-----------+ 260 | 261 | >>> df.withColumn('idx', extract_dong()).show() 262 | +-------------------------+--------+-----------+----+ 263 | |split |sido |sigungu |dong| 264 | +-------------------------+--------+-----------+----+ 265 | |[경기도, 성남시, 분당구, 금곡동]|경기도 |성남시 |금곡동| 266 | |[충청남도, 공주시, 검상동] |강원도 |공주시 |검상동| 267 | |[대전광역시, 동구, 가오동] |대전광역시|동구 |가오동| 268 | +-------------------------+--------+-----------+-----+ 269 | """ 270 | 271 | self._df = self._df.withColumn("dong", extract_dong(self._df.split)) 272 | return self._df 273 | 274 | def add_roadname(self): 275 | """ 276 | 데이터프레임에 도로명주소 컬럼을 추가하는 함수입니다. 277 | UDF 278 | --- 279 | extract_building_primary_number : StringType 280 | split 컬럼에서 도로명를 찾고 값을 반환합니다. 281 | 282 | 값이 없는 경우, "None" : str 을 반환합니다. 
283 | 284 | Exmaple 285 | ------- 286 | >>> df.show() 287 | +----------------------------------------------+------+-----------+ 288 | |split |sido |sigungu | 289 | +----------------------------------------------+------+-----------+ 290 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 | 291 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 | 292 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 | 293 | +----------------------------------------------+------+-----------+ 294 | 295 | >>> df.withColumn('idx', add_sigungu()).show() 296 | +----------------------------------------------+------+-----------+---------+ 297 | |split |sido |sigungu |roadname | 298 | +----------------------------------------------+------+-----------+---------+ 299 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 |해봉로 | 300 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 |경수대로 | 301 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 |양달말길 | 302 | +----------------------------------------------+------+-----------+---------+ 303 | """ 304 | self._df = self._df.withColumn("roadname", extract_roadname(self._df.split)) 305 | return self._df 306 | 307 | def add_building_primary_number(self): 308 | """ 309 | 데이터프레임에 도로명주소의 건물본번을 추가하는 함수입니다. 310 | 311 | UDF 312 | --- 313 | extract_building_primary_number : StringType 314 | 315 | Parameters 316 | ---------- 317 | split : columnType 318 | roadname : columnType 319 | 320 | roadname 뒤에 건물 본번과 부번이 들어오면 건물 본번을 반환합니다.. 321 | 322 | 값이 없는 경우, "None" : str 을 반환합니다. 323 | 324 | Exmaple 325 | ------- 326 | >>> df.show() 327 | +----------------------------------------------+------+-----------+---------+ 328 | |split |sido |sigungu |roadname | 329 | +----------------------------------------------+------+-----------+---------+ 330 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 |해봉로 | 331 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 |경수대로 | 332 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 |양달말길 | 333 | +----------------------------------------------+------+-----------+---------+ 334 | 335 | >>> df.withColumn('idx', extract_building_primary_number()).show() 336 | +----------------------------------------------+------+-----------+---------+-----------------------+ 337 | |split |sido |sigungu |roadname |building_primary_number| 338 | +----------------------------------------------+------+-----------+---------+-----------------------+ 339 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 |해봉로 |137 | 340 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 |경수대로 |1079 | 341 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 |양달말길 |93 | 342 | +----------------------------------------------+------+-----------+---------+-----------------------+ 343 | """ 344 | self._df = self._df.withColumn( 345 | "building_primary_number", 346 | extract_building_primary_number(self._df.split, self._df.roadname), 347 | ) 348 | return self._df 349 | 350 | def add_jibun_primary_number(self): 351 | self._df = self._df.withColumn( 352 | "jibun_primary_number", 353 | extract_jibun_primary_number(self._df.split, self._df.roadname), 354 | ) 355 | return self._df 356 | 357 | def join_with_db(self, db_df): 358 | 359 | """ 360 | 데이터베이스 데이터프레임과 조인하는 함수입니다. 
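        도로명이 추출된 행은 (시군구, 도로명, 건물 본번)을, 도로명이 없는 행은 (시군구, 읍면동, 지번 본번)을 키로 주소 데이터베이스와 각각 조인한 뒤 두 결과를 union하고, 법정동코드에서 시군구코드를 추출합니다.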
361 | 362 | Parameters 363 | ---------- 364 | db_df : DataFrame 365 | 366 | 367 | Exmaple 368 | ------- 369 | >>> df.show() 370 | +----------------------------------------------+------+-----------+---------+-----------------------+ 371 | |split |sido |sigungu |roadname |building_primary_number| 372 | +----------------------------------------------+------+-----------+---------+-----------------------+ 373 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 |해봉로 |137 | 374 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 |경수대로 |1079 | 375 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 |양달말길 |93 | 376 | +----------------------------------------------+------+-----------+---------+-----------------------+ 377 | 378 | >>> df.withColumn('idx', extract_building_primary_number()).show() 379 | +----------------------------------------------+------+-----------+---------+-----------------------+----------------+ 380 | |split |sido |sigungu |roadname |building_primary_number|bupjungdong_code| 381 | +----------------------------------------------+------+-----------+---------+-----------------------+----------------+ 382 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 |해봉로 |137 |4128112400 | 383 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 |경수대로 |1079 |4128111800 | 384 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 |양달말길 |93 |4128101100 | 385 | +----------------------------------------------+------+-----------+---------+-----------------------+---------------+ 386 | """ 387 | db_df_roadname = db_df.select( 388 | col("sido").alias("sido_name"), 389 | col("sigungu").alias("sigungu_name"), 390 | col("eupmyeondong").alias('eupmyeondong_name'), 391 | col("roadname").alias("db_roadname"), 392 | col("building_primary_number").alias("db_building_primary_number"), 393 | col("bupjungdong_code").alias('db_bupjungdong_code'), 394 | col("jibun_primary_number").alias("db_jibun_primary_number") 395 | ).drop_duplicates(["sigungu_name", "db_roadname", "db_building_primary_number"]) 396 | 397 | db_df_jibun = db_df.select( 398 | col("sido").alias("sido_name"), 399 | col("sigungu").alias("sigungu_name"), 400 | col("eupmyeondong").alias('eupmyeondong_name'), 401 | col("roadname").alias("db_roadname"), 402 | col("building_primary_number").alias("db_building_primary_number"), 403 | col("bupjungdong_code").alias('db_bupjungdong_code'), 404 | col("jibun_primary_number").alias("db_jibun_primary_number") 405 | ).drop_duplicates(["sigungu_name", "eupmyeondong_name", "db_jibun_primary_number"]) 406 | 407 | jibun_origin = self._df.where(self._df.roadname == "None") 408 | roadname_origin = self._df.where(self._df.roadname != "None") 409 | 410 | join_df_roadname = roadname_origin.join( 411 | db_df_roadname, 412 | (self._df.sigungu == db_df_roadname.sigungu_name) 413 | & (self._df.roadname == db_df_roadname.db_roadname) 414 | & (self._df.building_primary_number == db_df_roadname.db_building_primary_number), 415 | "inner", 416 | ) \ 417 | .withColumnRenamed("db_bupjungdong_code", "bupjungdong_code") \ 418 | .select(*self.col_list, "sido_name", "sigungu_name", "eupmyeondong_name", "bupjungdong_code") 419 | 420 | 421 | join_df_jibun = jibun_origin.join( 422 | db_df_jibun, 423 | (self._df.sigungu == db_df_jibun.sigungu_name) 424 | & (self._df.eupmyeondong == db_df_jibun.eupmyeondong_name) 425 | & (self._df.jibun_primary_number == db_df_jibun.db_jibun_primary_number), 426 | "inner", 427 | ) \ 428 | .withColumnRenamed("db_bupjungdong_code", "bupjungdong_code") \ 429 | .select(*self.col_list, "sido_name", "sigungu_name", "eupmyeondong_name", "bupjungdong_code") 430 
| 431 | self._df = join_df_roadname.union(join_df_jibun) 432 | 433 | self._df = self._df.withColumn("sigungu_code", extract_sigungu_code(self._df.bupjungdong_code)) 434 | 435 | return self._df 436 | -------------------------------------------------------------------------------- /sparkplus/core/base.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | from pyspark.sql.session import SparkSession 3 | 4 | 5 | class SPDataFrame(object): 6 | """ 7 | 요약 8 | ------- 9 | `SPDataFrame` 은 Spark DataFrame를 확장하며, 한국 주소체계를 더 쉽게 다룰 수 있도록 다양한 기능을 제공합니다. 10 | """ 11 | 12 | @classmethod 13 | def get_db_df_by_tablenames( 14 | cls, sparkSession: SparkSession, tablenames: Union[str, List[str]], **kwargs 15 | ): 16 | """ 17 | Summary 18 | ------- 19 | 테이블명을 기반으로 Spark DataFrame을 반환합니다. 20 | 21 | Parameter 22 | ---- 23 | sparkSession: Active Spark Session 24 | tablenames: DataFrame으로 만들 테이블명 25 | **kwargs: `driver`, `url`, `user`, `password` 26 | 27 | Raises: 28 | ValueError 29 | 30 | Returns: 31 | `DataFrame`s from database 32 | 33 | 34 | Usage 35 | ----- 36 | >>> import SPDataFrame 37 | >>> ss = SparkSession.builder.getOrCreate() 38 | >>> tablenames = ['integrated_address_seoul', 'integrated_address_incheon', 'integrated_address_gyeonggi'] 39 | >>> table_dfs = SPDataFrame(ss, tablenames, 40 | driver='com.mysql.cj.jdbc.Driver', 41 | url='jdbc:mysql://localhost:3306/sparkplus', 42 | user='root', 43 | password='password' 44 | ) 45 | >>> table_dfs.select('roadname_code', 'sido', 'sigungu', 'eupmyeondong').show() 46 | +-------------+----------+-------------+------------+ 47 | |roadname_code| sido| sigungu|eupmyeondong| 48 | +-------------+----------+-------------+------------+ 49 | | 261103125011|부산광역시| 중구| 영주동| 50 | | 261104006006|부산광역시| 중구| 영주동| 51 | | 261104006006|부산광역시| 중구| 영주동| 52 | | 261104006006|부산광역시| 중구| 영주동| 53 | | 261103125011|부산광역시| 중구| 영주동| 54 | | 111104100289|서울특별시| 종로구| 청운동| 55 | | 111104100289|서울특별시| 종로구| 청운동| 56 | | 111103100014|서울특별시| 종로구| 청운동| 57 | | 111104100289|서울특별시| 종로구| 청운동| 58 | | 111104100289|서울특별시| 종로구| 청운동| 59 | | 411114322017| 경기도|수원시 장안구| 파장동| 60 | | 411114322017| 경기도|수원시 장안구| 파장동| 61 | | 411114322017| 경기도|수원시 장안구| 파장동| 62 | | 411114322017| 경기도|수원시 장안구| 파장동| 63 | | 411114322017| 경기도|수원시 장안구| 파장동| 64 | +-------------+----------+-------------+------------+ 65 | """ 66 | sess_conf = sparkSession.sparkContext.getConf().getAll() 67 | 68 | # If SparkConf doesn't contain MySQL connector, raise `ValueError` 69 | jdbc_driver_flag = False 70 | 71 | # If you use `spark.jars.packages`, value should like `mysql:mysql-connector-java:YOUR_MYSQL_VERSION` 72 | available_configs = [ 73 | "spark.jars", 74 | "spark.driver.extraClassPath", 75 | "spark.jars.packages", 76 | ] 77 | 78 | for (conf_key, conf_val) in sess_conf: 79 | if conf_key in available_configs and conf_val.__contains__("mysql"): 80 | jdbc_driver_flag = True 81 | break 82 | 83 | if not jdbc_driver_flag: 84 | raise ValueError( 85 | "[SPARKPLUS_MYSQL_CONNECTOR_ERR] " 86 | "Your spark session seems like it doesn't contains mysql-connector-java path to connect mysql database. 
" 87 | "Please specify it to use SparkPlus package properly.\n\n" 88 | "$ spark-submit --jars \n\n" 89 | "In programming way, if you have mysql-connector jar file locally, set spark configuration like\n\n" 90 | ">>> ss = SparkSession.builder.config('spark.jars', MYSQL_JAR_PATH)\n\n" 91 | "or if you don't,\n\n" 92 | ">>> ss = SparkSession.builder.config('spark.jars.packages', 'mysql:mysql-connector-java:YOUR_MYSQL_VERSION')\n\n" 93 | "Check https://spark.apache.org/docs/latest/configuration.html for detail." 94 | ) 95 | 96 | ss_read = sparkSession.read.format("jdbc") 97 | 98 | # set DB options such as driver, url, user, password 99 | for opt_key, opt_val in kwargs.items(): 100 | ss_read.option(opt_key, opt_val) 101 | 102 | if isinstance(tablenames, str): 103 | return ss_read.option("dbtable", tablenames).load() 104 | else: 105 | dfs = ss_read.option("dbtable", tablenames.pop()).load() 106 | 107 | while tablenames: 108 | dfs = dfs.union(ss_read.option("dbtable", tablenames.pop()).load()) 109 | 110 | return dfs 111 | -------------------------------------------------------------------------------- /sparkplus/core/coord_dataframe.py: -------------------------------------------------------------------------------- 1 | from geopandas.geodataframe import GeoDataFrame 2 | from pyspark.sql.functions import lit, udf, pandas_udf 3 | from pyspark.sql import DataFrame 4 | from pyspark.sql.types import * 5 | 6 | import geopandas as gpd 7 | import h3 8 | 9 | 10 | def create_sjoin_pnu(gdf, join_column_name): 11 | def sjoin_settlement(x, y): 12 | gdf_temp = gpd.GeoDataFrame( 13 | data=[[x] for x in range(len(x))], geometry=gpd.points_from_xy(x, y) 14 | ).set_crs(epsg=4326, inplace=True) 15 | settlement = gpd.sjoin(gdf_temp, gdf, how="left", predicate="within") 16 | settlement = settlement.drop_duplicates(subset="geometry") 17 | 18 | return ( 19 | settlement.agg({"PNU": lambda x: str(x)}) 20 | .reset_index() 21 | .loc[:, join_column_name] 22 | .astype("str") 23 | ) 24 | 25 | return pandas_udf(sjoin_settlement, returnType=StringType()) 26 | 27 | 28 | def _coord_to_pnu(origin_df, gdf, x_colname, y_colname): 29 | sjoin_udf = create_sjoin_pnu(gdf, "PNU") 30 | res_df = origin_df.withColumn( 31 | "PNU", sjoin_udf(origin_df[x_colname], origin_df[y_colname]) 32 | ) 33 | return res_df 34 | 35 | 36 | def _join_with_table(table_df, pnu_df): 37 | # temp_df = self.coord_to_pnu() 38 | table_df = table_df.dropDuplicates(["bupjungdong_code"]) 39 | res_df = pnu_df.join( 40 | table_df, [pnu_df.PNU[0:10] == table_df.bupjungdong_code], how="left_outer" 41 | ) 42 | # res_df = res_df.dropDuplicates(['PNU']) 43 | 44 | return res_df 45 | 46 | 47 | @udf(StringType()) 48 | def get_fullname(a, b, c, d): 49 | if a == None and b == None and c == None and d == None: 50 | return None 51 | 52 | if a == None: 53 | a = "" 54 | if b == None: 55 | b = "" 56 | if c == None: 57 | c = "" 58 | if d == None: 59 | d = "" 60 | 61 | res = str(a) + " " + str(b) + " " + str(c) + " " + str(d) + " " 62 | 63 | return res 64 | 65 | class CoordDataFrame(DataFrame): 66 | """ 67 | Summary 68 | ------- 69 | 위경도 좌표가 포함된 Spark DataFrame에 법정읍면동, h3, 우편번호 정보를 추가합니다. 
70 | 71 | Args: 72 | origin_sdf (Spark DataFrame): 위경도 좌표가 포함된 원본 Spark DataFrame 73 | gdf (GeoDataFrame): shp Parquet으로부터 생성한 GeoDataFrame 74 | tdf (Spark DataFrame): 데이터베이스로부터 생성한 Spark DataFrame 75 | x_colname (String): 원본 Spark DataFrame의 경도 컬럼 이름 76 | y_colname (String): 원본 Spark DataFrame의 위도 컬럼 이름 77 | 78 | Usage 79 | ------- 80 | >>> from sparkplus.core.sparkplus import CoordDataFrame 81 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, x_colname, y_colname) 82 | """ 83 | 84 | def __init__(self, origin_sdf, gdf, tdf, x_colname, y_colname): 85 | self._origin_sdf = origin_sdf 86 | self._gdf = gdf 87 | self._tdf = tdf 88 | self._x_colname = x_colname 89 | self._y_colname = y_colname 90 | 91 | self.pnu_df = _coord_to_pnu(origin_sdf, gdf, x_colname, y_colname).cache() 92 | self.joined_df = _join_with_table(tdf, self.pnu_df).cache() 93 | 94 | def add_h3(self, h3_level): 95 | """ 96 | Summary 97 | ------- 98 | 위경도 좌표가 포함된 원본 Spark DataFrame에 h3 정보를 추가합니다. 99 | 100 | Args: 101 | h3_level (Int): 추가하고자 하는 h3 level 102 | 103 | Usage 104 | ------- 105 | >>> from sparkplus.core.sparkplus import CoordDataFrame 106 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 107 | >>> res_df = df.coord_to_h3(10) 108 | 109 | Examples 110 | ------- 111 | >>> origin_sdf.show() 112 | +----------+--------+-----------+-----------+ 113 | | 가로등번호| 관할구청| 위도| 경도| 114 | +----------+--------+-----------+-----------+ 115 | | 1001001| 중구|35.87343028|128.6103158| 116 | | 1001002| 중구|35.87334197|128.6099071| 117 | | 1001003| 중구|35.87327842|128.6096135| 118 | +----------+--------+-----------+-----------+ 119 | 120 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, '경도', '위도') 121 | >>> res_df = df.coord_to_h3(10) 122 | >>> res_df.show() 123 | +----------+--------+-----------+-----------+---------------+ 124 | | 가로등번호| 관할구청| 위도| 경도| h3| 125 | +----------+--------+-----------+-----------+---------------+ 126 | | 1001001| 중구|35.87343028|128.6103158|8a30c190311ffff| 127 | | 1001002| 중구|35.87334197|128.6099071|8a30c190311ffff| 128 | | 1001003| 중구|35.87327842|128.6096135|8a30c19031affff| 129 | +----------+--------+-----------+-----------+---------------+ 130 | """ 131 | udf_to_h3 = udf( 132 | lambda x, y: h3.geo_to_h3(float(x), float(y), h3_level), 133 | returnType=StringType(), 134 | ) 135 | 136 | res_h3 = self._origin_sdf.withColumn( 137 | "h3", 138 | udf_to_h3( 139 | self._origin_sdf[self._y_colname], self._origin_sdf[self._x_colname] 140 | ), 141 | ) 142 | return CoordDataFrame(res_h3) 143 | 144 | def add_pnu(self): 145 | """ 146 | Summary 147 | ------- 148 | 위경도 좌표가 포함된 원본 Spark DataFrame에 pnu 정보를 추가합니다. 
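        PNU의 앞 10자리는 법정동코드에 해당하며, 이후 주소 테이블과의 조인 키로 사용됩니다.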
149 | 150 | Usage 151 | ------- 152 | >>> from sparkplus.core.sparkplus import CoordDataFrame 153 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 154 | >>> res_df = df.coord_to_pnu() 155 | 156 | Example 157 | ------- 158 | >>> orgin_sdf.show() 159 | +----------+--------+-----------+-----------+ 160 | | 가로등번호| 관할구청| 위도| 경도| 161 | +----------+--------+-----------+-----------+ 162 | | 1001001| 중구|35.87343028|128.6103158| 163 | | 1001002| 중구|35.87334197|128.6099071| 164 | | 1001003| 중구|35.87327842|128.6096135| 165 | +----------+--------+-----------+-----------+ 166 | 167 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, '경도', '위도') 168 | >>> res_df = df.coord_to_pnu() 169 | >>> res_df.show() 170 | +----------+--------+-----------+-----------+-------------------+ 171 | | 가로등번호| 관할구청| 위도| 경도| PNU| 172 | +----------+--------+-----------+-----------+-------------------+ 173 | | 1001001| 중구|35.87343028|128.6103158|2711010300103670054| 174 | | 1001002| 중구|35.87334197|128.6099071|2711010300103670054| 175 | | 1001003| 중구|35.87327842|128.6096135|2711010300103670054| 176 | +----------+--------+-----------+-----------+-------------------+ 177 | """ 178 | return self.pnu_df 179 | 180 | def add_zipcode(self): 181 | """ 182 | Summary 183 | ------- 184 | 위경도 좌표가 포함된 원본 Spark DataFrame에 우편번호 정보를 추가합니다. 185 | 186 | Usage 187 | ------- 188 | >>> from sparkplus.core.sparkplus import CoordDataFrame 189 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 190 | >>> res_df = df.coord_to_zipcode() 191 | 192 | Example 193 | ------- 194 | >>> origin_sdf.show() 195 | +----------+--------+-----------+-----------+ 196 | | 가로등번호| 관할구청| 위도| 경도| 197 | +----------+--------+-----------+-----------+ 198 | | 1001001| 중구|35.87343028|128.6103158| 199 | | 1001002| 중구|35.87334197|128.6099071| 200 | | 1001003| 중구|35.87327842|128.6096135| 201 | +----------+--------+-----------+-----------+ 202 | 203 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, '경도', '위도') 204 | >>> res_df = df.coord_to_zipcode() 205 | >>> res_df.show() 206 | +----------+--------+-----------+-----------+-------+ 207 | | 가로등번호| 관할구청| 위도| 경도|zipcode| 208 | +----------+--------+-----------+-----------+-------+ 209 | | 8155012| 달성군|35.64103224|128.4106523| 43013| 210 | | 8071024| 달성군|35.66091032|128.4159519| 43006| 211 | | 8213007| 달성군| 35.6320721|128.4175234| 43013| 212 | +----------+--------+-----------+-----------+-------+ 213 | 214 | """ 215 | joined_df = self.joined_df.select("PNU", "zipcode") 216 | res_df = self.pnu_df.join(joined_df, "PNU", "leftouter").drop("PNU") 217 | res_df = res_df.dropDuplicates([self._x_colname, self._y_colname]) 218 | return CoordDataFrame(res_df) 219 | 220 | def add_bupjungdong(self): 221 | """ 222 | Summary 223 | ------- 224 | 위경도 좌표가 포함된 원본 Spark DataFrame에 법정읍면동 코드 정보를 추가합니다. 
225 | 226 | Usage 227 | ------- 228 | >>> from sparkplus.core.sparkplus import CoordDataFrame 229 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 230 | >>> res_df = df.coord_to_emd() 231 | 232 | Example 233 | ------- 234 | >>> origin_sdf.show() 235 | +----------+--------+-----------+-----------+ 236 | | 가로등번호| 관할구청| 위도| 경도| 237 | +----------+--------+-----------+-----------+ 238 | | 1001001| 중구|35.87343028|128.6103158| 239 | | 1001002| 중구|35.87334197|128.6099071| 240 | | 1001003| 중구|35.87327842|128.6096135| 241 | +----------+--------+-----------+-----------+ 242 | 243 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, '경도', '위도') 244 | >>> res_df = df.coord_to_emd() 245 | >>> res_df.show() 246 | +----------+--------+-----------+-----------+----------------+ 247 | | 가로등번호| 관할구청| 위도| 경도|bupjungdong_code| 248 | +----------+--------+-----------+-----------+----------------+ 249 | | 1001001| 중구|35.87343028|128.6103158| 2711010300| 250 | | 1001002| 중구|35.87334197|128.6099071| 2711010300| 251 | | 1001003| 중구|35.87327842|128.6096135| 2711010300| 252 | +----------+--------+-----------+-----------+----------------+ 253 | """ 254 | joined_df = self.joined_df.select("PNU", "bupjungdong_code") 255 | res_df = self.pnu_df.join(joined_df, "PNU", "leftouter").drop("PNU") 256 | res_df = res_df.dropDuplicates([self._x_colname, self._y_colname]) 257 | return CoordDataFrame(res_df) 258 | 259 | def add_roadname(self): 260 | """ 261 | Summary 262 | ------- 263 | 위경도 좌표가 포함된 원본 Spark DataFrame에 도로명 주소 정보를 추가합니다. 264 | 265 | Usage 266 | ------- 267 | >>> from sparkplus.core.sparkplus import CoordDataFrame 268 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 269 | >>> res_df = df.coord_to_roadname() 270 | 271 | Example 272 | ------- 273 | >>> origin_sdf.show() 274 | +----------+--------+-----------+-----------+ 275 | | 가로등번호| 관할구청| 위도| 경도| 276 | +----------+--------+-----------+-----------+ 277 | | 1001001| 중구|35.87343028|128.6103158| 278 | | 1001002| 중구|35.87334197|128.6099071| 279 | | 1001003| 중구|35.87327842|128.6096135| 280 | +----------+--------+-----------+-----------+ 281 | 282 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 283 | >>> res_df = df.coord_to_roadname() 284 | >>> res_df.show() 285 | +----------+--------+-----------+-----------+----------+-------+-------------+------------+---------+-----------+-----------------------+-------------------------+ 286 | | 가로등번호| 관할구청| 위도| 경도| sido|sigungu| roadname|eupmyeondong|bupjungli|is_basement|building_primary_number|building_secondary_number| 287 | +----------+--------+-----------+-----------+----------+-------+-------------+------------+---------+-----------+-----------------------+-------------------------+ 288 | | 1001001| 중구|35.87343028|128.6103158| 대구광역시| 중구| 동덕로38길| 동인동3가| | 0| 100| 0| 289 | | 1001002| 중구|35.87334197|128.6099071| 대구광역시| 중구| 동덕로38길| 동인동3가| | 0| 100| 0| 290 | | 1001003| 중구|35.87327842|128.6096135| 대구광역시| 중구| 동덕로38길| 동인동3가| | 0| 100| 0| 291 | +----------+--------+-----------+-----------+----------+-------+-------------+------------+---------+-----------+-----------------------+-------------------------+ 292 | 293 | """ 294 | joined_df = self.joined_df.select( 295 | "PNU", 296 | "sido", 297 | "sigungu", 298 | "roadname", 299 | "eupmyeondong", 300 | "bupjungli", 301 | "is_basement", 302 | "building_primary_number", 303 | "building_secondary_number", 304 | ) 305 | res_df = self.pnu_df.join(joined_df, "PNU", "leftouter").drop("PNU") 306 | res_df = res_df.dropDuplicates([self._x_colname, self._y_colname]) 
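        # 좌표 컬럼 기준으로 중복 행을 제거한 결과를 반환합니다.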
307 | return CoordDataFrame(res_df) 308 | 309 | def add_roadname_addr(self): 310 | """ 311 | Summary 312 | ------- 313 | 위경도 좌표가 포함된 원본 Spark DataFrame에 도로명 주소 정보를 추가합니다. 314 | 315 | Usage 316 | ------- 317 | >>> from sparkplus.core.sparkplus import CoordDataFrame 318 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 319 | >>> res_df = df.coord_to_roadname() 320 | 321 | Example 322 | ------- 323 | >>> origin_sdf.show() 324 | +----------+--------+-----------+-----------+ 325 | | 가로등번호| 관할구청| 위도| 경도| 326 | +----------+--------+-----------+-----------+ 327 | | 1001001| 중구|35.87343028|128.6103158| 328 | | 1001002| 중구|35.87334197|128.6099071| 329 | | 1001003| 중구|35.87327842|128.6096135| 330 | +----------+--------+-----------+-----------+ 331 | 332 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 333 | >>> res_df = df.coord_to_roadname() 334 | >>> res_df.show() 335 | +----------+--------+-----------+-----------+----------+-------+-------------+------------+---------+-----------+-----------------------+-------------------------+ 336 | | 가로등번호| 관할구청| 위도| 경도| sido|sigungu| roadname|eupmyeondong|bupjungli|is_basement|building_primary_number|building_secondary_number| 337 | +----------+--------+-----------+-----------+----------+-------+-------------+------------+---------+-----------+-----------------------+-------------------------+ 338 | | 1001001| 중구|35.87343028|128.6103158| 대구광역시| 중구| 동덕로38길| 동인동3가| | 0| 100| 0| 339 | | 1001002| 중구|35.87334197|128.6099071| 대구광역시| 중구| 동덕로38길| 동인동3가| | 0| 100| 0| 340 | | 1001003| 중구|35.87327842|128.6096135| 대구광역시| 중구| 동덕로38길| 동인동3가| | 0| 100| 0| 341 | +----------+--------+-----------+-----------+----------+-------+-------------+------------+---------+-----------+-----------------------+-------------------------+ 342 | 343 | """ 344 | joined_df = self.joined_df.select( 345 | "PNU", 346 | "sido", 347 | "sigungu", 348 | "roadname", 349 | "eupmyeondong", 350 | "bupjungli", 351 | "is_basement", 352 | "building_primary_number", 353 | "building_secondary_number", 354 | ) 355 | joined_df = joined_df.withColumn( 356 | "roadname_address", 357 | get_fullname( 358 | joined_df["sido"], 359 | joined_df["sigungu"], 360 | joined_df["roadname"], 361 | joined_df["building_primary_number"], 362 | ), 363 | ) 364 | res_df = self.pnu_df.join(joined_df, "PNU", "leftouter").drop("PNU") 365 | res_df = res_df.dropDuplicates([self._x_colname, self._y_colname]) 366 | return CoordDataFrame(res_df) 367 | 368 | def add_jibun(self): 369 | """ 370 | Summary 371 | ------- 372 | 위경도 좌표가 포함된 원본 Spark DataFrame에 지번 주소 정보를 추가합니다. 
373 | 374 | Usage 375 | ------- 376 | >>> from sparkplus.core.sparkplus import CoordDataFrame 377 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 378 | >>> res_df = df.coord_to_jibun() 379 | 380 | Example 381 | ------- 382 | >>> origin_sdf.show() 383 | +----------+--------+-----------+-----------+ 384 | | 가로등번호| 관할구청| 위도| 경도| 385 | +----------+--------+-----------+-----------+ 386 | | 1001001| 중구|35.87343028|128.6103158| 387 | | 1001002| 중구|35.87334197|128.6099071| 388 | | 1001003| 중구|35.87327842|128.6096135| 389 | +----------+--------+-----------+-----------+ 390 | 391 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 392 | >>> res_df = df.coord_to_jibun() 393 | >>> res_df.show() 394 | +----------+--------+-----------+-----------+----------+-------+------------+---------+--------------------+----------------------+ 395 | | 가로등번호| 관할구청| 위도| 경도| sido|sigungu|eupmyeondong|bupjungli|jibun_primary_number|jibun_secondary_number| 396 | +----------+--------+-----------+-----------+----------+-------+------------+---------+--------------------+----------------------+ 397 | | 1001001| 중구|35.87343028|128.6103158| 대구광역시| 중구| 동인동3가| | 192| 79| 398 | | 1001002| 중구|35.87334197|128.6099071| 대구광역시| 중구| 동인동3가| | 192| 79| 399 | | 1001003| 중구|35.87327842|128.6096135| 대구광역시| 중구| 동인동3가| | 192| 79| 400 | +----------+--------+-----------+-----------+----------+-------+------------+---------+--------------------+----------------------+ 401 | """ 402 | joined_df = self.joined_df.select( 403 | "PNU", 404 | "sido", 405 | "sigungu", 406 | "eupmyeondong", 407 | "bupjungli", 408 | "jibun_primary_number", 409 | "jibun_secondary_number", 410 | ) 411 | res_df = self.pnu_df.join(joined_df, "PNU", "leftouter").drop("PNU") 412 | res_df = res_df.dropDuplicates([self._x_colname, self._y_colname]) 413 | return CoordDataFrame(res_df) 414 | 415 | def join_with_db(self): 416 | """ 417 | Summary 418 | ------- 419 | 위경도 좌표가 포함된 원본 Spark DataFrame에 데이터베이스에서 가져온 Spark DataFrame 정보를 추가합니다. 
420 | 421 | Usage 422 | ------- 423 | >>> from sparkplus.core.sparkplus import CoordDataFrame 424 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 425 | >>> res_df = df.join_with_table() 426 | 427 | Example 428 | ------- 429 | >>> origin_sdf.show() 430 | +----------+--------+-----------+-----------+ 431 | | 가로등번호| 관할구청| 위도| 경도| 432 | +----------+--------+-----------+-----------+ 433 | | 1001001| 중구|35.87343028|128.6103158| 434 | | 1001002| 중구|35.87334197|128.6099071| 435 | | 1001003| 중구|35.87327842|128.6096135| 436 | +----------+--------+-----------+-----------+ 437 | 438 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, '경도', '위도') 439 | >>> res_df = df.join_with_table() 440 | >>> res_df.show() 441 | +----------+--------+-----------+-----------+-------------------+--------------------+-------------+-------+----------+-------+------------+---------+---------------+-----------+-----------------------+-------------------------+--------------------+----------------------+----------------+ 442 | | 가로등번호| 관할구청| 위도| 경도| PNU| manage_number|roadname_code|zipcode| sido|sigungu|eupmyeondong|bupjungli| roadname|is_basement|building_primary_number|building_secondary_number|jibun_primary_number|jibun_secondary_number|bupjungdong_code| 443 | +----------+--------+-----------+-----------+-------------------+--------------------+-------------+-------+----------+-------+------------+---------+---------------+-----------+-----------------------+-------------------------+--------------------+----------------------+----------------+ 444 | | 1065002| 중구|35.86341579|128.6024286|2711010600101990000|27110106001000300...| 271103007017| 41940| 대구광역시| 중구| 삼덕동2가| | 공평로| 0| 46| 0| 3| 4| 2711010600| 445 | | 1063002| 중구|35.86516734|128.6105401|2711010700103790000|27110107001003100...| 271104223055| 41945| 대구광역시| 중구| 삼덕동3가| | 달구벌대로443길| 0| 62| 16| 31| 2| 2711010700| 446 | | 1024017| 중구|35.86927185|128.5937782|2711011700101200003|27110115001008500...| 271102007001| 41909| 대구광역시| 중구| 남일동| | 중앙대로| 1| 424| 0| 143| 1| 2711011700| 447 | +----------+--------+-----------+-----------+-------------------+--------------------+-------------+-------+----------+-------+------------+---------+---------------+-----------+-----------------------+-------------------------+--------------------+----------------------+----------------+ 448 | 449 | """ 450 | return self.joined_df 451 | -------------------------------------------------------------------------------- /sparkplus/core/job.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import * 2 | from pyspark.sql import SparkSession 3 | import sparkplus 4 | import os 5 | from dotenv import load_dotenv 6 | import geopandas as gpd 7 | from py_log import logger 8 | 9 | load_dotenv() 10 | 11 | 12 | def load_shp_from_s3(bucket, key): 13 | return gpd.read_parquet(f"s3://{bucket}/{key}") 14 | 15 | 16 | def db_table_to_df(spark, table): 17 | df = ( 18 | spark.read.format("jdbc") 19 | .option("driver", os.getenv("DB_DRIVER")) 20 | .option("url", os.getenv("DB_URL")) 21 | .option("dbtable", table) 22 | .option("user", os.getenv("DB_USER")) 23 | .option("password", os.getenv("DB_PASSWORD")) 24 | .load() 25 | ) 26 | return df 27 | 28 | 29 | def load_table(spark): 30 | table_list = [ 31 | "additional_info_busan", 32 | "additional_info_chungbuk", 33 | "additional_info_chungnam", 34 | "additional_info_daegu", 35 | "additional_info_daejeon", 36 | "additional_info_gangwon", 37 | "additional_info_gwangju", 38 | "additional_info_gyeongbuk", 
39 | "additional_info_gyeonggi", 40 | "additional_info_gyeongnam", 41 | "additional_info_incheon", 42 | "additional_info_jeju", 43 | "additional_info_jeonbuk", 44 | "additional_info_jeonnam", 45 | "additional_info_sejong", 46 | "additional_info_seoul", 47 | "additional_info_ulsan", 48 | "jibun_address_busan", 49 | "jibun_address_chungbuk", 50 | "jibun_address_chungnam", 51 | "jibun_address_daegu", 52 | "jibun_address_daejeon", 53 | "jibun_address_gangwon", 54 | "jibun_address_gwangju", 55 | "jibun_address_gyeongbuk", 56 | "jibun_address_gyeonggi", 57 | "jibun_address_gyeongnam", 58 | "jibun_address_incheon", 59 | "jibun_address_jeju", 60 | "jibun_address_jeonbuk", 61 | "jibun_address_jeonnam", 62 | "jibun_address_sejong", 63 | "jibun_address_seoul", 64 | "jibun_address_ulsan", 65 | "roadname_address_busan", 66 | "roadname_address_chungbuk", 67 | "roadname_address_chungnam", 68 | "roadname_address_daegu", 69 | "roadname_address_daejeon", 70 | "roadname_address_gangwon", 71 | "roadname_address_gwangju", 72 | "roadname_address_gyeongbuk", 73 | "roadname_address_gyeonggi", 74 | "roadname_address_gyeongnam", 75 | "roadname_address_incheon", 76 | "roadname_address_jeju", 77 | "roadname_address_jeonbuk", 78 | "roadname_address_jeonnam", 79 | "roadname_address_sejong", 80 | "roadname_address_seoul", 81 | "roadname_address_ulsan", 82 | "roadname_code", 83 | "integrated_address_daegu", 84 | ] 85 | 86 | for table in table_list: 87 | name = table 88 | globals()[name] = db_table_to_df(spark, table) 89 | return globals() 90 | 91 | 92 | spark = SparkSession.builder.appName("Spark App").getOrCreate() 93 | 94 | # Load csv file 95 | logger.debug("Loading csv...") 96 | origin = spark.read.csv("s3://sparkplus-core/resource/data/daegu_streetlight.csv") 97 | logger.debug("Loading complete.") 98 | 99 | # Clear data 100 | daegu = origin.drop("_c1") 101 | daegu = daegu.where("_c0 > 10000") 102 | custom = sparkplus.CustomDataFrame(daegu, "_c3", "_c2") 103 | 104 | # Load parquet file 105 | logger.debug("Loading parquet...") 106 | shp_df = gpd.read_parquet("s3://sparkplus-core/resource/LSMD/Daegu.parquet") 107 | logger.debug("Loading complete...") 108 | 109 | # Load table from Database 110 | logger.debug("Loading db...") 111 | db_dict = load_table(spark) 112 | logger.debug("Loading complete...") 113 | 114 | result = custom.join_with_table(shp_df, db_dict["integrated_address_daegu"]) 115 | result.show() 116 | -------------------------------------------------------------------------------- /sparkplus/core/numaddr_dataframe.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))) 4 | 5 | from pyspark.sql import DataFrame 6 | from pyspark.sql.functions import split, col 7 | from sparkplus.core.udfs import * 8 | 9 | class NumAddrDataFrame(object): 10 | """ 11 | 도로명 주소를 활용하여 데이터를 분석하기 위한 클래스입니다 12 | """ 13 | def __init__(self, dataFrame: DataFrame): 14 | self._df = dataFrame 15 | self._tmp_df = dataFrame 16 | self.col_list = dataFrame.columns 17 | 18 | def to_bupjungdong(self, target: str, db_df:DataFrame): 19 | """ 20 | 도로명을 지번으로 변경하는 전 과정을 포함하는 함수입니다 21 | """ 22 | self.add_split(target) 23 | self.add_sido() 24 | self.add_sigungu() 25 | self.add_eupmyeondong() 26 | self.add_jibun_primary() 27 | self.add_jibun_secondary() 28 | self.join_with_db(db_df) 29 | # self.join_with_db(db_df) 30 | return self._df 31 | 32 | def add_split(self, target: str): 33 | """ 34 | DB에서 조회를 
위해 원본의 string을 공백 기준으로 나누는 함수입니다. 35 | 36 | Parameters 37 | ---------- 38 | target : str 39 | split하고 조작할 원본 데이터의 컬럼명 40 | 41 | Examples 42 | -------- 43 | >>> road_df = RoadnameDataframe(your_df) 44 | >>> road_df._df.show() 45 | +------------------------------+s 46 | |target | 47 | +------------------------------+ 48 | |경기도 화성시 장안면 매바위로366번길 8 | 49 | |경기도 화성시 장안면 버들로 | 50 | |경기도 화성시 장안면 석포리 | 51 | +------------------------------+ 52 | 53 | >>> splited_df = road_df.add_split('target') 54 | >>> splited_df.show() 55 | +------------------------------+-----------------------------------+ 56 | |target |split | 57 | +------------------------------+-----------------------------------+ 58 | |경기도 화성시 장안면 매바위로366번길 8|[경기도, 화성시, 장안면, 매바위로366번길, 8]| 59 | |경기도 화성시 장안면 버들로 |[경기도, 화성시, 장안면, 버들로] | 60 | |경기도 화성시 장안면 석포리 |[경기도, 화성시, 장안면, 석포리] | 61 | +-----------------------------+------------------------------------+ 62 | """ 63 | self._df = self._df.withColumn('split', split(self._df[target], ' ')) 64 | return self._df 65 | 66 | def cleanse_split_column(self): 67 | """ 68 | 주소가 비정형 데이터일 경우 사용되는 함수입니다. 69 | add_split_column 함수로 쪼개진 split 컬럼의 데이터를 전처리합니다. 70 | 71 | UDF 72 | --- 73 | where_is_sido : IntegerType 74 | split 컬럼에서 특별시와 광역시, 도를 찾고, 위치한 인덱스를 반환합니다. 75 | 76 | Exmaple 77 | ------- 78 | >>> df.show() 79 | +---------------------------------------------+ 80 | |split | 81 | +---------------------------------------------+ 82 | |[[185-74], 경기도, 화성시, 장안면,매바위로366번길, 8]| 83 | |[경기도, 화성시, 장안면, 버들로] | 84 | |[경기도, 화성시, 장안면, 석포리] | 85 | +--------------------------------------------+ 86 | 87 | >>> df.withColumn('idx', where_is_sido(split)).show() 88 | +---------------------------------------------+----+ 89 | |split |sido| 90 | +---------------------------------------------+----+ 91 | |[[185-74], 경기도, 화성시, 장안면,매바위로366번길, 8]| 1| 92 | |[경기도, 화성시, 장안면, 버들로] | 0| 93 | |[경기도, 화성시, 장안면, 석포리] | 2| 94 | +--------------------------------------------+----+ 95 | 96 | cleanse_split: ArrayType(StringType) 97 | split 컬럼과 인덱스 컬럼을 활용하여 알맞은 주소체계 값으로 반환합니다. 98 | 99 | Example 100 | ------- 101 | >>> df.show() 102 | +------------------------------------------------+---+ 103 | |split |idx| 104 | +------------------------------------------------+---+ 105 | |[[185-74], 경기도, 화성시, 장안면,매바위로366번길, 8] | 1| 106 | |[경기도, 화성시, 장안면, 버들로] | 0| 107 | |[Gyeonggi-do, [185-74], 경기도, 화성시, 장안면, 석포리]| 2| 108 | +------------------------------------------------+---+ 109 | 110 | >>> df.withColumn('split', cleanse_split(df.split)) 111 | +----------------------------------------+ 112 | |split | 113 | +----------------------------------------+ 114 | |[경기도, 화성시, 장안면,매바위로366번길, 8] | 115 | |[경기도, 화성시, 장안면, 버들로] | 116 | |[경기도, 화성시, 장안면, 석포리] | 117 | +---------------------------------------+ 118 | """ 119 | 120 | self._df = self._df \ 121 | .withColumn('idx', where_is_sido(self._df.split)) \ 122 | .withColumn('split', cleanse_split(self._df.idx, self._df.split)) 123 | self._df = self._df.drop('idx') 124 | self._df = self._df.withColumn('split', process_numaddr(self._df.split)) 125 | return self._df 126 | 127 | def add_sido(self): 128 | """ 129 | 특별시, 광역시, 도를 기존 데이터프레임에 추가하는 함수입니다. 130 | 131 | UDF 132 | --- 133 | extract_sido : StringType 134 | split 컬럼에서 특별시와 광역시, 도를 찾고 값을 반환합니다. 135 | 값이 없는 경우, "None" : str 을 반환합니다. 
136 | 137 | Exmaple 138 | ------- 139 | >>> df.show() 140 | +----------------------------------------+ 141 | |split | 142 | +----------------------------------------+ 143 | |[경기도, 안산시, 단원구, 해봉로, 137] | 144 | |[경기도, 수원시, 장안구, 경수대로, 1079] | 145 | |[경기도, 안산시, 상록구, 양달말길, 93-7] | 146 | +----------------------------------------+ 147 | 148 | >>> df.withColumn('idx', extract_sido()).show() 149 | +----------------------------------------------+-----+ 150 | |split |sido | 151 | +----------------------------------------------+-----+ 152 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 | 153 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 | 154 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 | 155 | +----------------------------------------------+------+ 156 | """ 157 | 158 | self._df = self._df.withColumn("sido", extract_sido(self._df.split)) 159 | self._df.show() 160 | return self._df 161 | 162 | def add_sigungu(self): 163 | """ 164 | 시, 군, 구 컬럼을 기존 데이터프레임에 추가하는 함수입니다. 165 | UDF 166 | --- 167 | extract_sigungu : StringType 168 | split 컬럼에서 시, 군, 구를 찾고 값을 반환합니다. 169 | 170 | 시와 구가 같이 있을경우에는 시와 구를 같이 반환합니다. 171 | ex) 경기도 성남시 분당구 -> 성남시 분당구 172 | 173 | 값이 없는 경우, "None" : str 을 반환합니다. 174 | 175 | Exmaple 176 | ------- 177 | >>> df.show() 178 | +----------------------------------------------+-----+ 179 | |split |sido | 180 | +----------------------------------------------+-----+ 181 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 | 182 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 | 183 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 | 184 | +----------------------------------------------+------+ 185 | 186 | >>> df.withColumn('idx', extract_sigungu()).show() 187 | +----------------------------------------------+------+-----------+ 188 | |split |sido |sigungu | 189 | +----------------------------------------------+------+-----------+ 190 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 | 191 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 | 192 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 | 193 | +----------------------------------------------+------+-----------+ 194 | """ 195 | 196 | self._df = self._df.withColumn("sigungu", extract_sigungu(self._df.split)) 197 | self._df.show() 198 | return self._df 199 | 200 | def add_eupmyeondong(self): 201 | """ 202 | 읍, 면 컬럼을 기존에 데이터프레임에 추가하는 함수입니다. 203 | 204 | UDF 205 | --- 206 | extract_eupmyeon : StringType 207 | split 컬럼에서 읍이나 면을 찾고 값을 반환합니다. 208 | 209 | 값이 없는 경우, "None" : str 을 반환합니다. 
210 | 211 | Exmaple 212 | ------- 213 | >>> df.show() 214 | +----------------------------------------------+------+-----------+ 215 | |split |sido |sigungu | 216 | +----------------------------------------------+------+-----------+ 217 | |[경기도, 화성시, 장안면, 매바위로366번길, 8] |경기도 |화성시 | 218 | |[강원도, 원주시, 호저면, 사제로, 9] |강원도 |원주시 | 219 | |[경상남도, 사천시, 곤양면, 경충로, 23-1] |경상남도|사천시 | 220 | +----------------------------------------------+------+-----------+ 221 | 222 | >>> df.withColumn('idx', extract_eupmyeon()).show() 223 | +----------------------------------------------+------+-----------+--------+ 224 | |split |sido |sigungu |eupmyeon| 225 | +----------------------------------------------+------+-----------+--------+ 226 | |[경기도, 화성시, 장안면, 매바위로366번길, 8] |경기도 |화성시 |장안면 | 227 | |[강원도, 원주시, 호저면, 사제로, 9] |강원도 |원주시 |호저면 | 228 | |[경상남도, 사천시, 곤양면, 경충로, 23-1] |경상남도|사천시 |곤양면 | 229 | +----------------------------------------------+------+-----------+-------+ 230 | """ 231 | self._df = self._df.withColumn("eupmyeondong", extract_eupmyeondong(self._df.split)) 232 | self._df.show() 233 | return self._df 234 | 235 | def add_jibun_primary(self): 236 | self._df = self._df.withColumn("jibun_primary_number", extract_jibun_primary(self._df.split)) 237 | self._df.show() 238 | return self._df 239 | 240 | def add_jibun_secondary(self): 241 | self._df = self._df.withColumn("jibun_secondary_number", extract_jibun_secondary(self._df.split)) 242 | self._df.show() 243 | return self._df 244 | 245 | def join_with_db(self, db_df): 246 | """ 247 | 데이터베이스 데이터프레임과 조인하는 함수입니다. 248 | 249 | Parameters 250 | ---------- 251 | db_df : DataFrame 252 | 253 | 254 | Exmaple 255 | ------- 256 | >>> df.show() 257 | +----------------------------------------------+------+-----------+---------+-----------------------+ 258 | |split |sido |sigungu |roadname |building_primary_number| 259 | +----------------------------------------------+------+-----------+---------+-----------------------+ 260 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 |해봉로 |137 | 261 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 |경수대로 |1079 | 262 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 |양달말길 |93 | 263 | +----------------------------------------------+------+-----------+---------+-----------------------+ 264 | 265 | >>> df.withColumn('idx', extract_building_primary_number()).show() 266 | +----------------------------------------------+------+-----------+---------+-----------------------+----------------+ 267 | |split |sido |sigungu |roadname |building_primary_number|bupjungdong_code| 268 | +----------------------------------------------+------+-----------+---------+-----------------------+----------------+ 269 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 |해봉로 |137 |4128112400 | 270 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 |경수대로 |1079 |4128111800 | 271 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 |양달말길 |93 |4128101100 | 272 | +----------------------------------------------+------+-----------+---------+-----------------------+---------------+ 273 | """ 274 | tmp_db_df = db_df.select( \ 275 | col("sido").alias("db_sido"), \ 276 | col("sigungu").alias("db_sigungu"), \ 277 | col("eupmyeondong").alias("db_eupmyeondong"), \ 278 | col("roadname").alias("db_roadname"), \ 279 | col("jibun_primary_number").alias("db_jibun_primary_number"), \ 280 | col("jibun_secondary_number").alias("db_jibun_secondary_number"), \ 281 | col("bupjungdong_code").alias("db_bupjungdong_code") \ 282 | ) \ 283 | #.drop_duplicates(['db_roadname', 'db_building_primary_number']) 284 | tmp_df = 
self._df.join(tmp_db_df, (self._df.sigungu == tmp_db_df.db_sigungu) & (self._df.eupmyeondong == tmp_db_df.db_eupmyeondong) & (self._df.jibun_primary_number == tmp_db_df.db_jibun_primary_number) & (self._df.jibun_secondary_number == tmp_db_df.db_jibun_secondary_number), 'inner')
285 | tmp_df = tmp_df.withColumnRenamed("db_bupjungdong_code", "bupjungdong_code")
286 | self._df = tmp_df.select(self._df['*'], "bupjungdong_code")
287 | del self._tmp_df
288 | del tmp_df
289 | 
290 | return self._df
291 | 
292 | 
293 | 
--------------------------------------------------------------------------------
/sparkplus/core/py_log.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import logging.handlers
3 | import datetime
4 | 
5 | logger = logging.getLogger(__name__)
6 | formatter = logging.Formatter(
7 | "[%(asctime)s][%(levelname)s|%(filename)s:%(lineno)s] >> %(message)s"
8 | )
9 | 
10 | streamHandler = logging.StreamHandler()
11 | now = str(datetime.datetime.now()).split(".")[0]
12 | fileHandler = logging.FileHandler("../logs/" + now)
13 | logger.setLevel(level=logging.DEBUG)
14 | 
15 | streamHandler.setFormatter(formatter)
16 | fileHandler.setFormatter(formatter)
17 | 
18 | logger.addHandler(streamHandler)
19 | logger.addHandler(fileHandler)
20 | 
21 | 
22 | if __name__ == "__main__":  # smoke-test output only when run directly, not on import
23 |     logger.debug("DEBUG log")
24 |     logger.info("INFO log")
25 |     logger.warning("WARN log")
26 |     logger.error("ERROR log")
27 |     logger.critical("CRITICAL log")
28 | 
--------------------------------------------------------------------------------
/sparkplus/core/shp_to_parquet.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import geopandas as gpd
3 | 
4 | input_file = sys.argv[1]
5 | file_name = str(input_file)[:-4]
6 | region_code = int(file_name[16:18])  # two-digit sido code sliced from the input file name
7 | gdf = gpd.read_file(input_file)
8 | region_dict = {
9 | 41: "Gyeonggi",
10 | 48: "Gyeongnam",
11 | 47: "Gyeongbuk",
12 | 29: "Gwangju",
13 | 27: "Daegu",
14 | 30: "Daejeon",
15 | 26: "Busan",
16 | 11: "Seoul",
17 | 36: "Sejong",
18 | 28: "Incheon",
19 | 46: "Jeonnam",
20 | 45: "Jeonbuk",
21 | 50: "Jeju",
22 | 44: "Chungnam",
23 | 43: "Chungbuk",
24 | 31: "Ulsan",
25 | 42: "Gangwon",
26 | }
27 | file_name = region_dict[region_code] + ".parquet"
28 | gdf = gdf.set_crs(5174)
29 | gdf = gdf.to_crs(4326)
30 | gdf.to_parquet(file_name)
31 | 
--------------------------------------------------------------------------------
/sparkplus/core/tablename.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from typing import List
3 | 
4 | 
5 | class EPrefix(Enum):
6 | """
7 | Prefixes of Spark+ database.
8 | Get details from https://github.com/SWM-SparkPlus/db-updater#%EB%8F%84%EB%A1%9C%EB%AA%85%EC%A3%BC%EC%86%8C-%ED%85%8C%EC%9D%B4%EB%B8%94
9 | """
10 | 
11 | ADDINFO = "additional_info"
12 | ROADNAME = "roadname_address"
13 | JIBUN = "jibun_address"
14 | INTEGRATED = "integrated_address"
15 | 
16 | 
17 | class ESido(Enum):
18 | """
19 | Enum for Korean metropolitan cities and provinces (si/do).
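
    Example
    -------
    Values come straight from the members below; the helper shown here is
    `get_tablename_by_prefix_and_sido`, defined later in this module.

    >>> ESido.DAEGU.value
    'daegu'
    >>> get_tablename_by_prefix_and_sido(EPrefix.INTEGRATED, ESido.DAEGU)
    'integrated_address_daegu'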
20 | """ 21 | 22 | SEOUL = "seoul" 23 | INCHEON = "incheon" 24 | DAEJEON = "daejeon" 25 | SEJONG = "sejong" 26 | GWANGJU = "gwangju" 27 | DAEGU = "daegu" 28 | ULSAN = "ulsan" 29 | BUSAN = "busan" 30 | JEJU = "jeju" 31 | GYEONGGI = "gyeonggi" 32 | GANGWON = "gangwon" 33 | CHUNGBUK = "chungbuk" 34 | CHUNGNAM = "chungnam" 35 | JEONBUK = "jeonbuk" 36 | JEONNAM = "jeonnam" 37 | GYEONGBUK = "gyeongbuk" 38 | GYEONGNAM = "gyeongnam" 39 | 40 | 41 | def get_tablename_by_prefix_and_sido(prefix: EPrefix, sido: ESido) -> str: 42 | """ 43 | Get tablename of Spark+ database. 44 | 45 | Example 46 | -------- 47 | 48 | >>> target_table = get_table_name(EPrefix.ADDINFO, ESIDO.SEOUL) # additional_info_seoul 49 | >>> target_table = get_table_name(EPrefix.INTEGRATED, ESIDO.BUSAN) # integrated_address_busan 50 | >>> error_table = get_table_name(EPrefix.ADDINFO, "anywhere") # Get AttributeError 51 | """ 52 | 53 | return f"{prefix.value}_{sido.value}" 54 | 55 | 56 | def get_all_tablenames_by_prefix(prefix: EPrefix) -> List[str]: 57 | """ 58 | Get all tablenames by given `EPrefix`. If you want to load all database tables to Spark `DataFrame`, see example below. 59 | It takes a lot of intensive works, 60 | 61 | Example 62 | ------- 63 | 64 | >>> from tablename import get_all_tablenames_by_prefix, EPrefix 65 | >>> get_all_tablenames_by_prefix(EPrefix.INTEGRATED) 66 | ['integrated_address_seoul', 'integrated_address_incheon', 'integrated_address_daejeon', 'integrated_address_sejong', 'integrated_address_gwangju', 'integrated_address_daegu', 'integrated_address_ulsan', 'integrated_address_busan', 'integrated_address_jeju', 'integrated_address_gyeonggi', 'integrated_address_gangwon', 'integrated_address_chungbuk', 'integrated_address_chungnam', 'integrated_address_jeonbuk', 'integrated_address_jeonnam', 'integrated_address_gyeongbuk', 'integrated_address_gyeongnam'] 67 | >>> # Load all data from database 68 | >>> from pyspark.sql import SparkSession 69 | >>> from pyspark.sql.functions import rand 70 | >>> from base import SPDataFrame 71 | >>> ss = SparkSession.builder.config('spark.driver.memory', '14g').getOrCreate() 72 | >>> all_tablenames = get_all_tablenames_by_prefix(EPrefix.INTEGRATED) 73 | >>> SPDataFrame.get_db_df_by_tablenames(ss, all_tablenames, ...).select('sido', 'sigungu', 'eupmyeondong').orderBy(rand()).show() 74 | +----------+-------------+------------+ 75 | | sido| sigungu|eupmyeondong| 76 | +----------+-------------+------------+ 77 | |광주광역시| 동구| 금남로4가| 78 | |인천광역시| 옹진군| 백령면| 79 | | 전라북도|전주시 덕진구| 인후동1가| 80 | | 경기도|용인시 처인구| 포곡읍| 81 | | 전라남도| 해남군| 화산면| 82 | | 강원도| 철원군| 서면| 83 | | 경기도|성남시 수정구| 산성동| 84 | | 경상남도| 산청군| 금서면| 85 | | 전라남도| 보성군| 웅치면| 86 | | 전라남도| 완도군| 약산면| 87 | | 경기도| 이천시| 장호원읍| 88 | | 경기도| 포천시| 가산면| 89 | | 경기도| 부천시| 소사동| 90 | | 경상남도| 창녕군| 영산면| 91 | | 강원도| 원주시| 학성동| 92 | |부산광역시| 강서구| 대저1동| 93 | | 전라남도| 곡성군| 옥과면| 94 | | 경상북도| 울진군| 북면| 95 | | 충청남도| 아산시| 탕정면| 96 | |서울특별시| 중랑구| 면목동| 97 | +----------+-------------+------------+ 98 | """ 99 | return [f"{prefix.value}_{sido.value}" for sido in ESido] 100 | -------------------------------------------------------------------------------- /sparkplus/core/test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "WARNING: An illegal reflective access operation has occurred\n", 13 | "WARNING: Illegal reflective access by 
org.apache.spark.unsafe.Platform (file:/Users/taypark/DEV/apache-spark/spark-3.1.2-bin-hadoop3.2/jars/spark-unsafe_2.12-3.1.2.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 14 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 15 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 16 | "WARNING: All illegal access operations will be denied in a future release\n" 17 | ] 18 | }, 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "21/10/12 05:16:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", 24 | "[('spark.app.name', 't'), ('spark.app.startTime', '1633983376848'), ('spark.executor.id', 'driver'), ('spark.driver.host', 'localhost'), ('spark.sql.warehouse.dir', 'file:/Users/taypark/Repositories/spark-plugin/sparkplus/core/spark-warehouse'), ('spark.driver.extraClassPath', '/Users/taypark/Repositories/spark-plugin/resource/mysql-connector-java-8.0.26/mysql-connector-java-8.0.26.jar'), ('spark.driver.port', '54511'), ('spark.rdd.compress', 'True'), ('spark.serializer.objectStreamReset', '100'), ('spark.driver.maxResultSize', '0'), ('spark.master', 'local[*]'), ('spark.submit.pyFiles', ''), ('spark.submit.deployMode', 'client'), ('spark.driver.memory', '14g'), ('spark.ui.showConsoleProgress', 'true'), ('spark.sql.execution.arrow.pyspark.enabled', 'true'), ('spark.app.id', 'local-1633983377979')]\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "from base import SPDataFrame\n", 30 | "from pyspark.sql import SparkSession\n", 31 | "import os\n", 32 | "import geopandas as gpd\n", 33 | "\n", 34 | "ss_builder = SparkSession.builder.appName('t')\n", 35 | "\n", 36 | "ss_builder.config('spark.driver.extraClassPath',\n", 37 | " '/Users/taypark/Repositories/spark-plugin/resource'\n", 38 | " '/mysql-connector-java-8.0.26/mysql-connector-java-8.0.26.jar')\\\n", 39 | " .config('spark.driver.memory', '14g')\\\n", 40 | " .config('spark.sql.execution.arrow.pyspark.enabled', 'true')\\\n", 41 | " .config('spark.driver.maxResultSize', 0)\n", 42 | "\n", 43 | "ss = ss_builder.getOrCreate()\n", 44 | "\n", 45 | "# print(ss.sparkContext.getConf().getAll())\n", 46 | "\n", 47 | "gyeonggi_table_df = SPDataFrame.get_db_df_by_tablenames(ss, ['integrated_address_gyeonggi'],\n", 48 | " driver='com.mysql.cj.jdbc.Driver',\n", 49 | " url='jdbc:mysql://localhost:3306/sparkplus',\n", 50 | " user='root',\n", 51 | " password='sparkplus')\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "root\n", 64 | " |-- manage_number: string (nullable = true)\n", 65 | " |-- roadname_code: string (nullable = true)\n", 66 | " |-- zipcode: string (nullable = true)\n", 67 | " |-- sido: string (nullable = true)\n", 68 | " |-- sigungu: string (nullable = true)\n", 69 | " |-- eupmyeondong: string (nullable = true)\n", 70 | " |-- bupjungli: string (nullable = true)\n", 71 | " |-- roadname: string (nullable = true)\n", 72 | " |-- is_basement: string (nullable = true)\n", 73 | " |-- building_primary_number: integer (nullable = true)\n", 74 | " |-- building_secondary_number: integer (nullable = true)\n", 75 | " |-- bupjungdong_code: string (nullable = true)\n", 76 | "\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "gyeonggi_table_df.printSchema()" 82 | ] 83 | }, 84 | { 85 | "cell_type": 
"code", 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# SPDataFrame(gyeonggi_table_df).address_to_h3(addr_col_name='is_basement').show()\n", 91 | "import geopandas as gpd\n", 92 | "from pyspark.sql.functions import lit\n", 93 | "\n", 94 | "# gyeonggi_table_df.limit(1)\\\n", 95 | "# .withColumn('point', lit(gpd.points_from_xy(37.3211047, 126.9889655))).show()\n", 96 | "\n", 97 | "PREFIX = (37.32, 126.98)\n", 98 | "\n", 99 | "from faker import Faker\n", 100 | "\n", 101 | "fake = Faker()\n", 102 | "\n", 103 | "xy_box = []\n", 104 | "for i in range(50):\n", 105 | " x, y = PREFIX\n", 106 | " px, py = str(x), str(y)\n", 107 | " px, py = px + str(fake.random_int(0, 99999)), py + str(fake.random_int(0, 99999))\n", 108 | " xy_box.append((px, py))\n", 109 | "\n", 110 | " \n", 111 | " " 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 5, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | " PNU JIBUN BCHK SGG_OID COL_ADM_SE \\\n", 124 | "0 4111710300101670003 167-3전 1 270697 41110 \n", 125 | "1 4111113800101980001 198-1전 1 270698 41110 \n", 126 | "2 4111710300201190028 산119-28임 1 270699 41110 \n", 127 | "3 4111710300109070001 907-1잡 1 698166 41110 \n", 128 | "4 4111710300101770017 177-17임 1 270701 41110 \n", 129 | "... ... ... ... ... ... \n", 130 | "5078153 4183040033101950006 195-6 전 1 1892811 41830 \n", 131 | "5078154 4183039521200440024 산44-24 임 1 1893383 41830 \n", 132 | "5078155 4183036023102440000 244전 1 1885066 41830 \n", 133 | "5078156 4183036023102440004 244-4 전 1 1885067 41830 \n", 134 | "5078157 4183036023102440003 244-3 전 1 1885068 41830 \n", 135 | "\n", 136 | " geometry \n", 137 | "0 POLYGON ((127.05529 37.28866, 127.05533 37.288... \n", 138 | "1 POLYGON ((127.01543 37.32614, 127.01547 37.326... \n", 139 | "2 POLYGON ((127.05120 37.28951, 127.05120 37.289... \n", 140 | "3 POLYGON ((127.03676 37.29320, 127.03723 37.294... \n", 141 | "4 POLYGON ((127.05132 37.28945, 127.05170 37.289... \n", 142 | "... ... \n", 143 | "5078153 POLYGON ((127.59218 37.54486, 127.59216 37.544... \n", 144 | "5078154 POLYGON ((127.64311 37.46407, 127.64309 37.464... \n", 145 | "5078155 POLYGON ((127.65476 37.52167, 127.65476 37.521... \n", 146 | "5078156 POLYGON ((127.65500 37.52165, 127.65498 37.521... \n", 147 | "5078157 POLYGON ((127.65516 37.52165, 127.65516 37.521... 
\n", 148 | "\n", 149 | "[5078158 rows x 6 columns]\n", 150 | "21/10/12 08:15:01 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 928702 ms exceeds timeout 120000 ms\n", 151 | "21/10/12 08:15:01 WARN SparkContext: Killing executors is not supported by current scheduler.\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "coord_df = ss.createDataFrame(xy_box)\n", 157 | "coord_df = coord_df.withColumnRenamed('_1', 'lat').withColumnRenamed('_2', 'lng')\n", 158 | "\n", 159 | "import os \n", 160 | "PARQUET_PATH = os.getcwd() + '/../../resource/Gyeonggi.parquet'\n", 161 | "\n", 162 | "gyeonggi_gdf = gpd.read_parquet(PARQUET_PATH)\n", 163 | "\n", 164 | "print(gyeonggi_gdf)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 2, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "ename": "NameError", 174 | "evalue": "name 'coord_df' is not defined", 175 | "output_type": "error", 176 | "traceback": [ 177 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 178 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 179 | "\u001b[0;32m/var/folders/z5/59xjw6ps4m95mnplymd3q3hc0000gn/T/ipykernel_49531/2756843327.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# 개발용\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mpandas_coord_to_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoord_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoPandas\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# print(pandas_coord_to_df)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 180 | "\u001b[0;31mNameError\u001b[0m: name 'coord_df' is not defined" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "# 개발용\n", 186 | "\n", 187 | "pandas_coord_to_df = coord_df.toPandas()\n", 188 | "\n", 189 | "# print(pandas_coord_to_df)\n", 190 | "print(pandas_coord_to_df)\n", 191 | "\n", 192 | "# point_sdf_to_geodataframe = gpd.GeoDataFrame(coord_df, geometry=gpd.points_from_xy(pandas_coord_to_df.lat, pandas_coord_to_df.lng))\n", 193 | "\n", 194 | "# print(point_sdf_to_geodataframe)\n", 195 | "\n", 196 | "temp_list = []\n", 197 | "\n", 198 | "# for i in point_sdf_to_geodataframe.index:\n", 199 | "# for j in gyeonggi_gdf.index:\n", 200 | "# if gyeonggi_gdf.geometry[j].contains(point_sdf_to_geodataframe[i]):\n", 201 | "# temp_list.append(gyeonggi_gdf.EMD_CD)\n", 202 | "\n", 203 | "print(temp_list)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [] 212 | } 213 | ], 214 | "metadata": { 215 | "interpreter": { 216 | "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" 217 | }, 218 | "kernelspec": { 219 | "display_name": "Python 3.9.6 64-bit", 220 | "name": "python3" 221 | }, 222 | "language_info": { 223 | "codemirror_mode": { 224 | "name": "ipython", 225 | "version": 3 226 | }, 227 | "file_extension": ".py", 228 | "mimetype": "text/x-python", 229 | "name": "python", 230 | "nbconvert_exporter": "python", 231 | "pygments_lexer": "ipython3", 232 | "version": "3.9.6" 233 | }, 234 | "orig_nbformat": 4 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 2 238 | } 239 | 
-------------------------------------------------------------------------------- /sparkplus/core/udfs.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.functions import udf 2 | from pyspark.sql.types import IntegerType, StringType, ArrayType 3 | 4 | sido_short_list = [ 5 | "서울", 6 | "부산", 7 | "대구", 8 | "인천", 9 | "광주", 10 | "대전", 11 | "울산", 12 | "세종", 13 | "경기", 14 | "강원", 15 | "충북", 16 | "충남", 17 | "전북", 18 | "전남", 19 | "경북", 20 | "경남", 21 | "제주", 22 | ] 23 | 24 | sido_long_list = [ 25 | "서울특별시", 26 | "부산광역시", 27 | "대구광역시", 28 | "인천광역시", 29 | "광주광역시", 30 | "대전광역시", 31 | "울산광역시", 32 | "세종특별자치시", 33 | "경기도", 34 | "강원도", 35 | "충청북도", 36 | "충청남도", 37 | "전라북도", 38 | "전라남도", 39 | "경상북도", 40 | "경상남도", 41 | "제주특별자치도", 42 | ] 43 | sido_dictionary = dict(zip(sido_short_list, sido_long_list)) 44 | sido_reverse_dictionary = dict(zip(sido_long_list, sido_short_list)) 45 | 46 | 47 | @udf(IntegerType()) 48 | def where_is_sido(split): 49 | for i in range(len(split)): 50 | if sido_dictionary.get(split[i]) or sido_reverse_dictionary.get(split[i]): 51 | return i 52 | return -1 53 | 54 | 55 | @udf(ArrayType(StringType())) 56 | def cleanse_split(idx, split): 57 | if idx != -1: 58 | return split[idx:] 59 | return split 60 | 61 | 62 | @udf(ArrayType(StringType())) 63 | def process_roadname(split): 64 | for i in range(len(split)): 65 | data = split[i] 66 | if data[-1].isdigit() and ("로" in data or "길" in data): 67 | result_li = list() 68 | for j in reversed(range(len(data))): 69 | if not data[j].isdigit(): 70 | result_li.append(data[: j + 1]).append(data[j + 1 :]) 71 | return split[:i] + result_li + split[i + 1 :] 72 | return split 73 | 74 | @udf(ArrayType(StringType())) 75 | def process_numaddr(split): 76 | if split is None: 77 | return "None" 78 | 79 | data = split[2] 80 | return data 81 | 82 | 83 | 84 | @udf(StringType()) 85 | def extract_sido(split): 86 | 87 | if split is None: 88 | return "None" 89 | 90 | for data in split: 91 | if data =='': 92 | continue 93 | if sido_dictionary.get(data): 94 | return sido_dictionary[data] 95 | elif sido_reverse_dictionary.get(data): 96 | return data 97 | return "None" 98 | 99 | 100 | @udf(StringType()) 101 | def extract_sigungu(split): 102 | 103 | if split is None: 104 | return "None" 105 | 106 | result = str() 107 | flag = False 108 | for data in split: 109 | if data =='': 110 | continue 111 | if not sido_reverse_dictionary.get(data): 112 | sigungu = data[-1] 113 | if (sigungu == "시") or (sigungu == "군") or (sigungu == "구"): 114 | if not flag: 115 | result += data 116 | flag = True 117 | else: 118 | result += " " + data 119 | if flag: 120 | return result 121 | return "None" 122 | 123 | """ 124 | @udf(StringType()) 125 | def extract_eupmyeon(split): 126 | if split is None: 127 | return "None" 128 | 129 | for data in split: 130 | if data == "": 131 | continue 132 | if data[-1] == "읍" or data[-1] == "면": 133 | return data 134 | return "None" 135 | """ 136 | 137 | @udf(StringType()) 138 | def extract_eupmyeondong(split): 139 | if split is None: 140 | return "None" 141 | 142 | for data in split: 143 | if data == "": 144 | continue 145 | if data[-1] == "읍" or data[-1] == "면" or data[-1] == "동" or data[-1] == "가" and not data[0].isdigit(): 146 | return data 147 | 148 | return "None" 149 | 150 | 151 | @udf(StringType()) 152 | def extract_dong(split): 153 | if split is None: 154 | return "None" 155 | for data in split: 156 | if data == "": 157 | continue 158 | if data[-1] == "동" and not data[0].isdigit(): 159 | return data 
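    # Fall-through: no token in the split address ends with "동" (tokens that start
    # with a digit are skipped), so signal "not found" with the string "None".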
160 | return "None" 161 | 162 | 163 | @udf(StringType()) 164 | def extract_roadname(split): 165 | if split is None: 166 | return "None" 167 | for data in split: 168 | if data == "": 169 | continue 170 | if data[-1] == "로" or data[-1] == "길": 171 | return data 172 | return "None" 173 | 174 | 175 | @udf(StringType()) 176 | def extract_building_primary_number(split, roadname): 177 | if split is None: 178 | return "None" 179 | for i in range(len(split)): 180 | if split[i - 1] == roadname: 181 | data = split[i] 182 | if data.isdigit(): 183 | return data 184 | elif "-" in data: 185 | for j in range(len(data)): 186 | if data[j] == "-": 187 | return data[:j] 188 | return "None" 189 | 190 | @udf(StringType()) 191 | def extract_jibun_primary_number(split, roadname): 192 | if split is None: 193 | return "None" 194 | if roadname not in split: 195 | data = split[-1] 196 | if data.isdigit(): 197 | return data 198 | elif "-" in data: 199 | for j in range(len(data)): 200 | if data[j] == "-": 201 | return data[:j] 202 | return "None" 203 | 204 | @udf(StringType()) 205 | def extract_jibun_secondary(split): 206 | if split is None: 207 | return "None" 208 | data = split[3] 209 | for i in range(len(data)): 210 | if data[i] == "-": 211 | return data[i+1:] 212 | 213 | @udf(StringType()) 214 | def extract_sigungu_code(bupjungdong_code): 215 | if bupjungdong_code is None or bupjungdong_code == "None": 216 | return "None" 217 | data = bupjungdong_code[:5] 218 | return data -------------------------------------------------------------------------------- /sparkplus/core/utils.py: -------------------------------------------------------------------------------- 1 | import geopandas as gpd 2 | from typing import List, Union 3 | from pyspark.sql.session import SparkSession 4 | 5 | def load_tables( 6 | sparkSession: SparkSession, tablenames: Union[str, List[str]], **kwargs 7 | ): 8 | """ 9 | Summary 10 | ------- 11 | 테이블명을 기반으로 Spark DataFrame을 반환합니다. 
12 | 13 | Parameter 14 | ---- 15 | sparkSession: Active Spark Session 16 | tablenames: DataFrame으로 만들 테이블명 17 | **kwargs: `driver`, `url`, `user`, `password` 18 | 19 | Raises: 20 | ValueError 21 | 22 | Returns: 23 | `DataFrame`s from database 24 | 25 | 26 | Usage 27 | ----- 28 | >>> import SPDataFrame 29 | >>> ss = SparkSession.builder.getOrCreate() 30 | >>> tablenames = ['integrated_address_seoul', 'integrated_address_incheon', 'integrated_address_gyeonggi'] 31 | >>> table_dfs = SPDataFrame(ss, tablenames, 32 | driver='com.mysql.cj.jdbc.Driver', 33 | url='jdbc:mysql://localhost:3306/sparkplus', 34 | user='root', 35 | password='password' 36 | ) 37 | >>> table_dfs.select('roadname_code', 'sido', 'sigungu', 'eupmyeondong').show() 38 | +-------------+----------+-------------+------------+ 39 | |roadname_code| sido| sigungu|eupmyeondong| 40 | +-------------+----------+-------------+------------+ 41 | | 261103125011|부산광역시| 중구| 영주동| 42 | | 261104006006|부산광역시| 중구| 영주동| 43 | | 261104006006|부산광역시| 중구| 영주동| 44 | | 261104006006|부산광역시| 중구| 영주동| 45 | | 261103125011|부산광역시| 중구| 영주동| 46 | | 111104100289|서울특별시| 종로구| 청운동| 47 | | 111104100289|서울특별시| 종로구| 청운동| 48 | | 111103100014|서울특별시| 종로구| 청운동| 49 | | 111104100289|서울특별시| 종로구| 청운동| 50 | | 111104100289|서울특별시| 종로구| 청운동| 51 | | 411114322017| 경기도|수원시 장안구| 파장동| 52 | | 411114322017| 경기도|수원시 장안구| 파장동| 53 | | 411114322017| 경기도|수원시 장안구| 파장동| 54 | | 411114322017| 경기도|수원시 장안구| 파장동| 55 | | 411114322017| 경기도|수원시 장안구| 파장동| 56 | +-------------+----------+-------------+------------+ 57 | """ 58 | sess_conf = sparkSession.sparkContext.getConf().getAll() 59 | 60 | # If SparkConf doesn't contain MySQL connector, raise `ValueError` 61 | jdbc_driver_flag = False 62 | 63 | # If you use `spark.jars.packages`, value should like `mysql:mysql-connector-java:YOUR_MYSQL_VERSION` 64 | available_configs = [ 65 | "spark.jars", 66 | "spark.driver.extraClassPath", 67 | "spark.jars.packages", 68 | ] 69 | 70 | for (conf_key, conf_val) in sess_conf: 71 | if conf_key in available_configs and conf_val.__contains__("mysql"): 72 | jdbc_driver_flag = True 73 | break 74 | 75 | if not jdbc_driver_flag: 76 | raise ValueError( 77 | "[SPARKPLUS_MYSQL_CONNECTOR_ERR] " 78 | "Your spark session seems like it doesn't contains mysql-connector-java path to connect mysql database. " 79 | "Please specify it to use SparkPlus package properly.\n\n" 80 | "$ spark-submit --jars \n\n" 81 | "In programming way, if you have mysql-connector jar file locally, set spark configuration like\n\n" 82 | ">>> ss = SparkSession.builder.config('spark.jars', MYSQL_JAR_PATH)\n\n" 83 | "or if you don't,\n\n" 84 | ">>> ss = SparkSession.builder.config('spark.jars.packages', 'mysql:mysql-connector-java:YOUR_MYSQL_VERSION')\n\n" 85 | "Check https://spark.apache.org/docs/latest/configuration.html for detail." 
86 | ) 87 | 88 | ss_read = sparkSession.read.format("jdbc") 89 | 90 | # set DB options such as driver, url, user, password 91 | for opt_key, opt_val in kwargs.items(): 92 | ss_read.option(opt_key, opt_val) 93 | 94 | if isinstance(tablenames, str): 95 | return ss_read.option("dbtable", tablenames).load() 96 | else: 97 | dfs = ss_read.option("dbtable", tablenames.pop()).load() 98 | 99 | while tablenames: 100 | dfs = dfs.union(ss_read.option("dbtable", tablenames.pop()).load()) 101 | 102 | return dfs 103 | 104 | def load_gdf(shp_path , epsg): 105 | gdf = gpd.read_file(shp_path, encoding="euc-kr") 106 | gdf.crs = f'epsg:{epsg}' 107 | gdf = gdf.to_crs(epsg=4326) 108 | 109 | return gdf -------------------------------------------------------------------------------- /sparkplus/dependencies/__init__.py: -------------------------------------------------------------------------------- 1 | from .spark import * 2 | 3 | # from .logging import * 4 | from .tablename import ESido, EPrefix, get_tablename_by_prefix_and_sido 5 | 6 | __all__ = ["start_spark", "ESido", "EPrefix", "get_tablename_by_prefix_and_sido"] 7 | -------------------------------------------------------------------------------- /sparkplus/dependencies/logging.py: -------------------------------------------------------------------------------- 1 | class Log4j(object): 2 | """ 3 | :param spark: SparkSession object. 4 | """ 5 | 6 | def __init__(self, spark): 7 | # get spark app details with which to prefix all messages 8 | conf = spark.sparkContext.getConf() 9 | app_id = conf.get("spark.app.id") 10 | app_name = conf.get("spark.app.name") 11 | 12 | log4j = spark._jvm.org.apache.log4j 13 | message_prefix = "<" + app_name + " " + app_id + ">" 14 | self.logger = log4j.LogManager.getLogger(message_prefix) 15 | 16 | def error(self, message): 17 | """Log an error. 18 | :param: Error message to write to log 19 | :return: None 20 | """ 21 | self.logger.error(message) 22 | return None 23 | 24 | def warn(self, message): 25 | """Log an warning. 26 | :param: Error message to write to log 27 | :return: None 28 | """ 29 | self.logger.warn(message) 30 | return None 31 | 32 | def info(self, message): 33 | """Log information. 34 | :param: Information message to write to log 35 | :return: None 36 | """ 37 | self.logger.info(message) 38 | return None 39 | -------------------------------------------------------------------------------- /sparkplus/dependencies/spark.py: -------------------------------------------------------------------------------- 1 | import __main__ 2 | 3 | from os import environ, listdir, path 4 | import json 5 | from pyspark import SparkFiles 6 | from pyspark.sql import SparkSession 7 | 8 | 9 | def start_spark( 10 | app_name="my_spark_app", 11 | master="local[*]", 12 | jar_packages=[], 13 | files=[], 14 | spark_config={}, 15 | ): 16 | """ 17 | :param app_name: Name of Spark app. 18 | :param master: Cluster connection details (defaults to local[*]). 19 | :param jar_packages: List of Spark JAR package names. 20 | :param files: List of files to send to Spark cluster (master and 21 | workers). 22 | :param spark_config: Dictionary of config key-value pairs. 23 | :return: A tuple of references to the Spark session, logger and 24 | config dict (only if available). 
25 | """ 26 | 27 | # detect execution environment 28 | flag_repl = not (hasattr(__main__, "__file__")) 29 | flag_debug = "DEBUG" in environ.keys() 30 | 31 | if not (flag_repl or flag_debug): 32 | # get Spark session factory 33 | print("without flag") 34 | spark_builder = SparkSession.builder.appName(app_name) 35 | spark_builder.config( 36 | "spark.jars", 37 | "/Users/hwan/dev/mysql-connector-java-8.0.26/mysql-connector-java-8.0.26.jar", 38 | ) 39 | else: 40 | # get Spark session factory 41 | spark_builder = SparkSession.builder.master(master).appName(app_name) 42 | 43 | # create Spark JAR packages string 44 | spark_jars_packages = ",".join(list(jar_packages)) 45 | spark_builder.config("spark.jars.packages", spark_jars_packages) 46 | 47 | spark_files = ",".join(list(files)) 48 | spark_builder.config("spark.files", spark_files) 49 | 50 | # add other config params 51 | for key, val in spark_config.items(): 52 | spark_builder.config(key, val) 53 | 54 | # create session and retrieve Spark logger object 55 | spark_sess = spark_builder.getOrCreate() 56 | # spark_logger = Log4j(spark_sess) 57 | 58 | # get config file if sent to cluster with --files 59 | spark_files_dir = SparkFiles.getRootDirectory() 60 | config_files = [ 61 | filename 62 | for filename in listdir(spark_files_dir) 63 | if filename.endswith("config.json") 64 | ] 65 | 66 | if config_files: 67 | path_to_config_file = path.join(spark_files_dir, config_files[0]) 68 | with open(path_to_config_file, "r") as config_file: 69 | config_dict = json.load(config_file) 70 | # spark_logger.warn("loaded config from " + config_files[0]) 71 | else: 72 | # spark_logger.warn("no config file found") 73 | config_dict = None 74 | 75 | return spark_sess, config_dict 76 | -------------------------------------------------------------------------------- /sparkplus/dependencies/tablename.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class EPrefix(Enum): 5 | ADDINFO = "additional_info" 6 | ROADNAME = "roadname_address" 7 | JIBUN = "jibun_address" 8 | INTEGRATED = "integrated_address" 9 | 10 | 11 | class ESido(Enum): 12 | SEOUL = "seoul" 13 | INCHEON = "incheon" 14 | DAEJEON = "daejeon" 15 | SEJONG = "sejong" 16 | GWANGJU = "gwangju" 17 | DAEGU = "daegu" 18 | ULSAN = "ulsan" 19 | BUSAN = "busan" 20 | JEJU = "jeju" 21 | GYEONGGI = "gyeonggi" 22 | GANGWON = "gangwon" 23 | CHUNGBUK = "chungbuk" 24 | CHUNGNAM = "chungnam" 25 | JEONBUK = "jeonbuk" 26 | JEONNAM = "jeonnam" 27 | GYEONGBUK = "gyeongbuk" 28 | GYEONGNAM = "gyeongnam" 29 | 30 | 31 | def get_tablename_by_prefix_and_sido(prefix: EPrefix, sido: ESido) -> str: 32 | """ 33 | Get tablename of Spark+ database. 
34 | 35 | ## Examples 36 | 37 | >>> target_table = get_table_name(EPrefix.ADDINFO, ESIDO.SEOUL) # additional_info_seoul 38 | >>> target_table = get_table_name(EPrefix.INTEGRATED, ESIDO.BUSAN) # integrated_address_busan 39 | >>> error_table = get_table_name(EPrefix.ADDINFO, "anywhere") # Get AttributeError 40 | """ 41 | 42 | return f"{prefix.value}_{sido.value}" 43 | -------------------------------------------------------------------------------- /sparkplus/jobs/__init__.py: -------------------------------------------------------------------------------- 1 | from .conversion import * 2 | from .load_database import * 3 | 4 | __all__ = [ 5 | "join_with_emd", 6 | "join_with_h3", 7 | "join_with_table", 8 | "shp_init", 9 | "load_tables", 10 | ] 11 | -------------------------------------------------------------------------------- /sparkplus/jobs/conversion.py: -------------------------------------------------------------------------------- 1 | from typing import overload 2 | from geopandas.array import points_from_xy 3 | from geopandas.tools.sjoin import sjoin 4 | from shapely.geometry import Point, Polygon 5 | from pyspark.sql import Row 6 | from pyspark.sql.functions import concat, lit, udf 7 | from pyspark.sql.types import * 8 | from pyspark.sql.functions import col, pandas_udf 9 | 10 | import geopandas as gpd 11 | import pandas as pd 12 | import h3 13 | 14 | import os 15 | 16 | 17 | def shp_init(): 18 | shp = gpd.read_file( 19 | os.path.dirname(os.path.abspath(__file__)) 20 | + "/../resource/EMD_202101/TL_SCCO_EMD.shp" 21 | ) 22 | shp = shp.to_crs(4326) 23 | return shp 24 | 25 | 26 | def coord_to_dong(spark, gdf, lng, lat): 27 | addr = gdf[gdf.geometry.contains(Point(lng, lat)) == True] 28 | addr_drop_geom = addr.drop(columns="geometry") 29 | sdf = spark.createDataFrame(addr_drop_geom) 30 | sdf = sdf.select( 31 | concat(sdf.EMD_CD, lit("00")).alias("EMD_CD"), "EMD_ENG_NM", "EMD_KOR_NM" 32 | ) 33 | return sdf 34 | 35 | 36 | def coord_to_point(spark, df, lng_colname, lat_colname): 37 | df["temp"] = [Point(lon, lat) for lon, lat in df[[lng_colname, lat_colname]].values] 38 | df["point"] = pd.Series( 39 | map(lambda geom: str(geom.to_wkt()), df["temp"]), index=df.index, dtype="str" 40 | ) 41 | tmp = df.drop("temp", axis=1) 42 | res_df = pd.DataFrame(tmp) 43 | res_sdf = spark.createDataFrame(tmp).cache() 44 | del tmp 45 | return res_sdf, res_df 46 | 47 | 48 | def coord_file_to_emd(spark, gdf, filepath, lng_colname, lat_colname): 49 | _gdf = ( 50 | spark.read.option("header", True) 51 | .format("csv") 52 | .load(filepath, encoding="euc-kr") 53 | ) 54 | # _gdf = spark.createDataFrame(_file) 55 | _gdf.show() 56 | pdf = _gdf.select("*").toPandas() 57 | g_df = gpd.GeoDataFrame( 58 | pdf, geometry=gpd.points_from_xy(pdf[lng_colname], pdf[lat_colname]) 59 | ) 60 | li = list() 61 | for i in g_df.index: 62 | for j in gdf.index: 63 | if gdf.geometry[j].contains(g_df.geometry[i]): 64 | li.append(gdf.EMD_CD[j]) 65 | g_df.insert(len(g_df.columns), "EMD_CD", li) 66 | g_df = spark.createDataFrame(g_df) 67 | return g_df 68 | 69 | 70 | def coord_to_emd(spark, gdf, sdf, lng_colname, lat_colname): 71 | 72 | pdf = sdf.select("*").toPandas() 73 | # pdf = sdf 74 | g_df = gpd.GeoDataFrame( 75 | pdf, geometry=gpd.points_from_xy(pdf[lng_colname], pdf[lat_colname]) 76 | ) 77 | li = list() 78 | for i in g_df.index: 79 | for j in gdf.index: 80 | if gdf.geometry[j].contains(g_df.geometry[i]): 81 | print(g_df.geometry[i], gdf.EMD_CD[j]) 82 | li.append(gdf.EMD_CD[j]) 83 | g_df.insert(len(g_df.columns), "EMD_CD", li) 84 | g_df = 
spark.createDataFrame(g_df) 85 | return g_df 86 | 87 | 88 | @overload 89 | def coord_to_emd(spark, gdf, lng, lat, lng_colname="lng", lat_colname="lat"): 90 | mySchema = StructType( 91 | [ 92 | StructField(lng_colname, DoubleType(), True), 93 | StructField(lat_colname, DoubleType(), True), 94 | ] 95 | ) 96 | myRow = Row(lng, lat) 97 | myDf = spark.createDataFrame([myRow], mySchema) 98 | result = coord_df_to_emd(spark, gdf, myDf, lng_colname, lat_colname) 99 | return result 100 | 101 | 102 | def to_polygon(l): 103 | return Polygon(h3.h3_to_geo_boundary(l, geo_json=True)) 104 | 105 | 106 | def coord_to_h3(lng, lat, h3_level): 107 | my_h3 = h3.geo_to_h3(lat, lng, h3_level) 108 | h3_df = gpd.GeoDataFrame({"h3": [my_h3, my_h3]}) 109 | h3_df["geometry"] = h3_df["h3"].apply(to_polygon) 110 | h3_df.crs = {"init": "epsg:4326"} 111 | return h3_df 112 | 113 | 114 | def coord_to_jibun(spark, gdf, table_df, lng, lat): 115 | emd_df = coord_to_emd(spark, gdf, lng, lat).toPandas() 116 | emd_cd = emd_df.iloc[0]["EMD_CD"] + "00" 117 | jibun_df = table_df[table_df["bupjungdong_code"] == emd_cd].toPandas() 118 | print(jibun_df) 119 | return jibun_df 120 | 121 | 122 | def coord_to_roadname( 123 | spark, gdf, table_jibun, table_roadname, table_roadname_code, lng, lat 124 | ): 125 | jibun_df = coord_to_jibun(spark, gdf, table_jibun, lng, lat) 126 | manage_number = jibun_df.iloc[0]["manage_number"] 127 | roadname_code_df = table_roadname[ 128 | table_roadname["manage_number"] == manage_number 129 | ].toPandas() 130 | roadname_code = roadname_code_df.iloc[0]["roadname_code"] 131 | result = table_roadname_code[table_roadname_code["roadname_code"] == roadname_code] 132 | return result 133 | 134 | 135 | def create_sjoin_emd(gdf_poly, join_column_name): 136 | def sjoin_settlement(x, y): 137 | gdf_temp = gpd.GeoDataFrame( 138 | data=[[x] for x in range(len(x))], geometry=gpd.points_from_xy(x, y) 139 | ).set_crs(epsg=4326, inplace=True) 140 | settlement = gpd.sjoin(gdf_temp, gdf_poly, how="left", op="within") 141 | settlement = settlement.drop_duplicates(subset="geometry") 142 | # print(settlement.agg({'EMD_CD': lambda x: str(x) + '00'}).reset_index().loc[:, join_column_name].astype('str')) 143 | return ( 144 | settlement.agg({"EMD_CD": lambda x: str(x) + "00"}) 145 | .reset_index() 146 | .loc[:, join_column_name] 147 | .astype("str") 148 | ) 149 | 150 | return pandas_udf(sjoin_settlement, returnType=StringType()) 151 | 152 | 153 | def join_with_emd(gdf_poly, sdf, x_colname, y_colname): 154 | sjoin_udf = create_sjoin_emd(gdf_poly, "EMD_CD") 155 | res_df = sdf.withColumn("EMD_CD", sjoin_udf(sdf[x_colname], sdf[y_colname])) 156 | return res_df 157 | 158 | 159 | def join_with_h3(sdf, x_colname, y_colname, h3_level): 160 | udf_to_h3 = udf( 161 | lambda x, y: h3.geo_to_h3(float(x), float(y), h3_level), returnType=StringType() 162 | ) 163 | res_h3 = sdf.withColumn("h3", udf_to_h3(sdf[y_colname], sdf[x_colname])) 164 | return res_h3 165 | 166 | 167 | def join_with_table(gdf_poly, sdf, table_df, x_colname, y_colname): 168 | temp_df = join_with_emd(gdf_poly, sdf, x_colname, y_colname) 169 | table_df = table_df.dropDuplicates(["bupjungdong_code"]) 170 | res_df = temp_df.join( 171 | table_df, [temp_df.EMD_CD == table_df.bupjungdong_code], how="left_outer" 172 | ) 173 | 174 | return res_df 175 | # .select(temp_df.EMD_CD, table_df.sido).show() 176 | 177 | # res_df.show() 178 | -------------------------------------------------------------------------------- /sparkplus/jobs/etl_job.py: 
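
# A minimal usage sketch for the conversion helpers defined above in
# sparkplus/jobs/conversion.py (shp_init, join_with_emd, join_with_h3), kept
# separate from etl_job.py below. The import path, sample coordinates and the
# H3 level are illustrative assumptions; shp_init() expects the bundled EMD
# shapefile under sparkplus/resource/EMD_202101/.
from pyspark.sql import SparkSession
from sparkplus.jobs.conversion import shp_init, join_with_emd, join_with_h3

spark = SparkSession.builder.appName("conversion_demo").getOrCreate()

# two sample points in Daegu, given as (longitude, latitude)
sdf = spark.createDataFrame(
    [(128.6103158, 35.87343028), (128.6099071, 35.87334197)], ["lon", "lat"]
)

emd_gdf = shp_init()                                  # EMD polygons as a GeoDataFrame (EPSG:4326)
with_emd = join_with_emd(emd_gdf, sdf, "lon", "lat")  # adds an EMD_CD column via spatial join
with_h3 = join_with_h3(sdf, "lon", "lat", 9)          # adds a level-9 H3 cell id column
with_emd.show()
with_h3.show()
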
-------------------------------------------------------------------------------- 1 | from pyspark.sql import Row 2 | from pyspark.sql.functions import col, concat_ws, lit 3 | 4 | from dependencies.spark import start_spark 5 | 6 | 7 | def main(): 8 | """Main ETL script definition. 9 | :return: None 10 | """ 11 | # start Spark application and get Spark session, logger and config 12 | spark, log, config = start_spark( 13 | app_name="my_etl_job", files=["configs/etl_config.json"] 14 | ) 15 | 16 | # log that main ETL job is starting 17 | log.warn("etl_job is up-and-running") 18 | 19 | # execute ETL pipeline 20 | data = extract_data(spark) 21 | data_transformed = transform_data(data, config["steps_per_floor"]) 22 | load_data(data_transformed) 23 | 24 | # log the success and terminate Spark application 25 | log.warn("test_etl_job is finished") 26 | spark.stop() 27 | return None 28 | 29 | 30 | def extract_data(spark): 31 | """Load data from Parquet file format. 32 | :param spark: Spark session object. 33 | :return: Spark DataFrame. 34 | """ 35 | df = spark.read.parquet("tests/test_data/employees") 36 | 37 | return df 38 | 39 | 40 | def transform_data(df, steps_per_floor_): 41 | """Transform original dataset. 42 | :param df: Input DataFrame. 43 | :param steps_per_floor_: The number of steps per-floor at 43 Tanner 44 | Street. 45 | :return: Transformed DataFrame. 46 | """ 47 | df_transformed = df.select( 48 | col("id"), 49 | concat_ws(" ", col("first_name"), col("second_name")).alias("name"), 50 | (col("floor") * lit(steps_per_floor_)).alias("steps_to_desk"), 51 | ) 52 | 53 | return df_transformed 54 | 55 | 56 | def load_data(df): 57 | """Collect data locally and write to CSV. 58 | :param df: DataFrame to print. 59 | :return: None 60 | """ 61 | (df.coalesce(1).write.csv("loaded_data", mode="overwrite", header=True)) 62 | return None 63 | 64 | 65 | def create_test_data(spark, config): 66 | """Create test data. 67 | This function creates both both pre- and post- transformation data 68 | saved as Parquet files in tests/test_data. This will be used for 69 | unit tests as well as to load as part of the example ETL job. 
70 | :return: None 71 | """ 72 | # create example data from scratch 73 | local_records = [ 74 | Row(id=1, first_name="Dan", second_name="Germain", floor=1), 75 | Row(id=2, first_name="Dan", second_name="Sommerville", floor=1), 76 | Row(id=3, first_name="Alex", second_name="Ioannides", floor=2), 77 | Row(id=4, first_name="Ken", second_name="Lai", floor=2), 78 | Row(id=5, first_name="Stu", second_name="White", floor=3), 79 | Row(id=6, first_name="Mark", second_name="Sweeting", floor=3), 80 | Row(id=7, first_name="Phil", second_name="Bird", floor=4), 81 | Row(id=8, first_name="Kim", second_name="Suter", floor=4), 82 | ] 83 | 84 | df = spark.createDataFrame(local_records) 85 | 86 | # write to Parquet file format 87 | (df.coalesce(1).write.parquet("tests/test_data/employees", mode="overwrite")) 88 | 89 | # create transformed version of data 90 | df_tf = transform_data(df, config["steps_per_floor"]) 91 | 92 | # write transformed version of data to Parquet 93 | ( 94 | df_tf.coalesce(1).write.parquet( 95 | "tests/test_data/employees_report", mode="overwrite" 96 | ) 97 | ) 98 | 99 | return None 100 | 101 | 102 | # entry point for PySpark ETL application 103 | if __name__ == "__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /sparkplus/jobs/load_database.py: -------------------------------------------------------------------------------- 1 | """ 2 | # 부가정보 테이블 3 | additional_info_tables = [ 4 | 'additional_info_busan', 5 | 'additional_info_chungbuk', 6 | 'additional_info_chungnam', 7 | 'additional_info_daegu', 8 | 'additional_info_daejeon', 9 | 'additional_info_gangwon', 10 | 'additional_info_gwangju', 11 | 'additional_info_gyeongbuk', 12 | 'additional_info_gyeonggi', 13 | 'additional_info_gyeongnam', 14 | 'additional_info_incheon', 15 | 'additional_info_jeju', 16 | 'additional_info_jeonbuk', 17 | 'additional_info_jeonnam', 18 | 'additional_info_sejong', 19 | 'additional_info_seoul', 20 | 'additional_info_ulsan' 21 | ] 22 | 23 | # 지번주소 테이블 24 | jibun_address_tables = [ 25 | 'jibun_address_busan', 26 | 'jibun_address_chungbuk', 27 | 'jibun_address_chungnam', 28 | 'jibun_address_daegu', 29 | 'jibun_address_daejeon', 30 | 'jibun_address_gangwon', 31 | 'jibun_address_gwangju', 32 | 'jibun_address_gyeongbuk', 33 | 'jibun_address_gyeonggi', 34 | 'jibun_address_gyeongnam', 35 | 'jibun_address_incheon', 36 | 'jibun_address_jeju', 37 | 'jibun_address_jeonbuk', 38 | 'jibun_address_jeonnam', 39 | 'jibun_address_sejong', 40 | 'jibun_address_seoul', 41 | 'jibun_address_ulsan', 42 | ] 43 | 44 | # 도로명주소 테이블 45 | roadname_tables = [ 46 | 'roadname_address_busan', 47 | 'roadname_address_chungbuk', 48 | 'roadname_address_chungnam', 49 | 'roadname_address_daegu', 50 | 'roadname_address_daejeon', 51 | 'roadname_address_gangwon', 52 | 'roadname_address_gwangju', 53 | 'roadname_address_gyeongbuk', 54 | 'roadname_address_gyeonggi', 55 | 'roadname_address_gyeongnam', 56 | 'roadname_address_incheon', 57 | 'roadname_address_jeju', 58 | 'roadname_address_jeonbuk', 59 | 'roadname_address_jeonnam', 60 | 'roadname_address_sejong', 61 | 'roadname_address_seoul', 62 | 'roadname_address_ulsan', 63 | 'roadname_code' 64 | ] 65 | 66 | # 도로명코드 테이블 67 | roadname_code_table = ['roadname_code'] 68 | 69 | # 통합 테이블 70 | integrated_table = [ 71 | 'integrated_address_busan', 72 | 'integrated_address_chungbuk', 73 | 'integrated_address_chungnam', 74 | 'integrated_address_daegu', 75 | 'integrated_address_daejeon', 76 | 'integrated_address_gangwon', 77 | 'integrated_address_gwangju', 
78 | 'integrated_address_gyeongbuk', 79 | 'integrated_address_gyeonggi', 80 | 'integrated_address_gyeongnam', 81 | 'integrated_address_incheon', 82 | 'integrated_address_jeju', 83 | 'integrated_address_jeonbuk', 84 | 'integrated_address_jeonnam', 85 | 'integrated_address_sejong', 86 | 'integrated_address_seoul', 87 | 'integrated_address_ulsan' 88 | ] 89 | 90 | """ 91 | 92 | 93 | def load_tables(spark, url, user, password, opt, driver="com.mysql.cj.jdbc.Driver"): 94 | 95 | table = "integrated_address_" + opt 96 | result = ( 97 | spark.read.format("jdbc") 98 | .option("driver", driver) 99 | .option("url", url) 100 | .option("dbtable", table) 101 | .option("user", user) 102 | .option("password", password) 103 | .load() 104 | ) 105 | 106 | return result 107 | -------------------------------------------------------------------------------- /sparkplus/jobs/table_to_df.py: -------------------------------------------------------------------------------- 1 | def create_df(spark, table): 2 | 3 | sdf = ( 4 | spark.read.format("jdbc") 5 | .option("url", "jdbc:mysql://localhost:3306/sparkplus") 6 | .option("driver", "com.mysql.cj.jdbc.Driver") 7 | .option("dbtable", table) 8 | .option("user", "root") 9 | .option("password", "9315") 10 | .load() 11 | ) 12 | 13 | return sdf 14 | -------------------------------------------------------------------------------- /sparkplus/jobs/with_geopandas.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.functions import * 2 | from pyspark.sql.types import ( 3 | StringType, 4 | IntegerType, 5 | FloatType, 6 | DoubleType, 7 | DecimalType, 8 | ) 9 | from pyspark.sql.functions import lit, pandas_udf, PandasUDFType 10 | 11 | import pandas as pd 12 | import geopandas as gpd 13 | 14 | import sys 15 | import os 16 | 17 | sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__)))) 18 | 19 | from dependencies.spark import start_spark 20 | 21 | 22 | def geopandas_df_to_spark_for_points(spark, gdf): 23 | gdf["lon"] = gdf["geometry"].x 24 | gdf["lat"] = gdf["geometry"].y 25 | sdf = spark.createDataFrame(pd.DataFrame(gdf), axis=1) 26 | return sdf 27 | 28 | 29 | korea_shp_file = "shp/TL_SCCO_LI.shp" 30 | 31 | gdf = gpd.read_file(korea_shp_file, encoding="euc-kr") 32 | 33 | 34 | gdf = gdf.to_crs(4326) 35 | -------------------------------------------------------------------------------- /sparkplus/package/__init__.py: -------------------------------------------------------------------------------- 1 | from .gis import * 2 | 3 | __all__ = ["gdf_to_spark_wkt"] 4 | -------------------------------------------------------------------------------- /sparkplus/package/gis.py: -------------------------------------------------------------------------------- 1 | from shapely.geometry import Point, Polygon, LineString 2 | from pyspark.sql import SparkSession 3 | import geopandas as gpd 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pyspark 8 | from pyspark.sql.functions import * 9 | from pyspark.sql.types import ( 10 | IntegerType, 11 | StringType, 12 | FloatType, 13 | DecimalType, 14 | DoubleType, 15 | ) 16 | import os 17 | from pyspark.sql.functions import lit, pandas_udf, PandasUDFType 18 | from dotenv import load_dotenv 19 | 20 | load_dotenv() 21 | 22 | 23 | def load_shp(spark, file_location): 24 | korea = gpd.read_file(file_location, encoding="euc-kr") 25 | gdf = korea.to_crs(4326) 26 | return gdf 27 | 28 | 29 | # def coord_to_dong(spark, gdf, lng, lat): 30 | # addr = gdf[gdf.geometry.contains(Point(lng, 
lat)) == True] 31 | # addr_drop_geom = addr.drop(columns="geometry") 32 | # df = spark.createDataFrame(addr_drop_geom) 33 | # df = df.select( 34 | # concat(df.EMD_CD, lit("00")).alias("EMD_CD"), "EMD_ENG_NM", "EMD_KOR_NM" 35 | # ) 36 | # return df 37 | 38 | 39 | def coord_to_dong(spark, gdf, spark_df, lng_colname, lat_colname): 40 | 41 | p_df = spark_to_pandas(spark_df) 42 | # geometry = gpd.points_from_xy(p_df['longitude'], p_df['latitude']) 43 | print("p_df: ", p_df) 44 | g_df = gpd.GeoDataFrame( 45 | p_df, geometry=gpd.points_from_xy(p_df[lng_colname], p_df[lat_colname]) 46 | ) 47 | # g_df = gpd.GeoDataFrame(p_df, geometry=geometry) 48 | print("g_df: ", g_df) 49 | li = list() 50 | for i in g_df.index: 51 | for j in gdf.index: 52 | if gdf.geometry[j].contains(g_df.geometry[i]): 53 | li.append(gdf.EMD_CD[j]) 54 | # if j == 1: print(gdf.geometry[j], p_df.geometry[i]) 55 | 56 | g_df.insert(len(g_df.columns), "EMD_CD", li) 57 | # g_df = g_df.drop(columns="geometry") 58 | g_df = spark.createDataFrame(g_df) 59 | 60 | return g_df 61 | 62 | 63 | def spark_to_pandas(spark_df): 64 | return spark_df.select("*").toPandas() 65 | 66 | 67 | def pandas_to_geopandas(pandas_df): 68 | return gpd.GeoDataFrame(pandas_df) 69 | 70 | 71 | def db_table_to_df(spark, table): 72 | df = ( 73 | spark.read.format("jdbc") 74 | .option("driver", os.getenv("DB_DRIVER")) 75 | .option("url", os.getenv("DB_URL")) 76 | .option("dbtable", table) 77 | .option("user", os.getenv("DB_USER")) 78 | .option("password", os.getenv("DB_PASSWORD")) 79 | .load() 80 | ) 81 | return df 82 | 83 | 84 | def gdf_to_spark_wkt(spark, gdf): 85 | gdf["wkt"] = pd.Series( 86 | map(lambda geom: str(geom.to_wkt()), gdf["geometry"]), 87 | index=gdf.index, 88 | dtype="str", 89 | ) 90 | tmp = gdf.drop("geometry", axis=1) 91 | df = pd.DataFrame(tmp) 92 | sdf = spark.createDataFrame(tmp).cache() 93 | del tmp 94 | 95 | return sdf, df 96 | 97 | 98 | def spark_to_gdf_wkt(spark, gdf, col_name): 99 | gdf["wkt_to_geom"] = gpd.GeoSeries.from_wkt(gdf[col_name]) 100 | return gdf 101 | 102 | 103 | def load_table(spark): 104 | table_list = [ 105 | "additional_info_busan", 106 | "additional_info_chungbuk", 107 | "additional_info_chungnam", 108 | "additional_info_daegu", 109 | "additional_info_daejeon", 110 | "additional_info_gangwon", 111 | "additional_info_gwangju", 112 | "additional_info_gyeongbuk", 113 | "additional_info_gyeonggi", 114 | "additional_info_gyeongnam", 115 | "additional_info_incheon", 116 | "additional_info_jeju", 117 | "additional_info_jeonbuk", 118 | "additional_info_jeonnam", 119 | "additional_info_sejong", 120 | "additional_info_seoul", 121 | "additional_info_ulsan", 122 | "jibun_address_busan", 123 | "jibun_address_chungbuk", 124 | "jibun_address_chungnam", 125 | "jibun_address_daegu", 126 | "jibun_address_daejeon", 127 | "jibun_address_gangwon", 128 | "jibun_address_gwangju", 129 | "jibun_address_gyeongbuk", 130 | "jibun_address_gyeonggi", 131 | "jibun_address_gyeongnam", 132 | "jibun_address_incheon", 133 | "jibun_address_jeju", 134 | "jibun_address_jeonbuk", 135 | "jibun_address_jeonnam", 136 | "jibun_address_sejong", 137 | "jibun_address_seoul", 138 | "jibun_address_ulsan", 139 | "roadname_address_busan", 140 | "roadname_address_chungbuk", 141 | "roadname_address_chungnam", 142 | "roadname_address_daegu", 143 | "roadname_address_daejeon", 144 | "roadname_address_gangwon", 145 | "roadname_address_gwangju", 146 | "roadname_address_gyeongbuk", 147 | "roadname_address_gyeonggi", 148 | "roadname_address_gyeongnam", 149 | 
"roadname_address_incheon", 150 | "roadname_address_jeju", 151 | "roadname_address_jeonbuk", 152 | "roadname_address_jeonnam", 153 | "roadname_address_sejong", 154 | "roadname_address_seoul", 155 | "roadname_address_ulsan", 156 | "roadname_code", 157 | ] 158 | 159 | for table in table_list: 160 | name = table + "_df" 161 | globals()[name] = db_table_to_df(spark, table) 162 | return globals() 163 | -------------------------------------------------------------------------------- /sparkplus/package/pipeline.py: -------------------------------------------------------------------------------- 1 | from shapely.geometry import Point, Polygon 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql import Row 4 | from pyspark.sql import * 5 | from pyspark.sql.types import StructField, StructType, StringType, LongType, DoubleType 6 | import geopandas as gpd 7 | import pandas as pd 8 | import mysql.connector 9 | import sys 10 | from . import gis 11 | import pyspark 12 | from dotenv import load_dotenv 13 | 14 | sys.stdout = open(sys.stdout.fileno(), mode="w", encoding="utf8", buffering=1) 15 | 16 | spark = SparkSession.builder.appName("Spark App").getOrCreate() 17 | dict = gis.load_table(spark) # table dictionary 불러오기 18 | jibun_dict = {} 19 | for key, val in list(dict.items()): 20 | if "jibun_address" in key: 21 | result = ( 22 | dict[key] 23 | .select(["bupjungdong_code", "sido", "sigungu", "bupjungeupmyeondong"]) 24 | .dropDuplicates(["bupjungdong_code"]) 25 | .orderBy("bupjungdong_code") 26 | ) 27 | jibun_dict[key] = result 28 | 29 | """ shp to polyfill 30 | gdf = gis.load_shp(spark, "../resource/EMD_202101/TL_SCCO_EMD.shp") #법정동 shp 파일 불러오기 31 | gdf = gdf.h3.polyfill(10) 32 | pd_h3 = pd.DataFrame(gdf) 33 | del gdf 34 | pd_h3 = pd_h3.drop('geometry', axis=1) 35 | sdf = spark.createDataFrame(pd_h3) 36 | """ 37 | 38 | """ sdf to json 39 | sdf.coalesce(1).write.json('v1') #v1이라는 폴더가 생성됨 40 | sdf.write.json('v2') 41 | """ 42 | 43 | """ 44 | sdf_df = gis.gdf_to_spark_wkt(spark, gdf) #spark에서 읽을 수 있도록 wkt로 변환 45 | result_df = gis.gdf_to_spark_wkt(spark, gdf_h3) 46 | """ 47 | 48 | """ read parquet 49 | df = spark.read.option("mergeSchema", "true").parquet("../resource/h3/part-00000-3c1357f3-ca16-420a-8b7f-7e532d32c650-c000.snappy.parquet") 50 | df.printSchema() 51 | df.show() 52 | """ 53 | -------------------------------------------------------------------------------- /sparkplus/testjob/demo_app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from shapely.geometry import Polygon 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.functions import encode 7 | import pandas as pd 8 | import geopandas as gpd 9 | import h3 10 | 11 | sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__)))) 12 | 13 | from jobs.conversion import ( 14 | coord_to_emd, 15 | join_with_h3, 16 | join_with_emd, 17 | join_with_table, 18 | shp_init, 19 | ) 20 | from jobs.load_database import load_tables 21 | from package import gis 22 | 23 | driver = "com.mysql.cj.jdbc.Driver" 24 | url = "jdbc:mysql://localhost:3306/sparkplus" 25 | user = "sparkplus" 26 | password = "sparkplus" 27 | 28 | filepath = "/home/hadoop/spark-plugin/resource/data/daegu_streetlight.csv" 29 | localfilepath = "../resource/data/daegu_streetlight.csv" 30 | shp = "/home/hadoop/spark-plugin/resource/EMD_202101/TL_SCCO_EMD.shp" 31 | localshp = "../resource/EMD_202101/TL_SCCO_EMD.shp" 32 | 33 | if __name__ == "__main__": 34 | 35 | session = ( 36 | 
SparkSession.builder.appName("demo_app") 37 | .config( 38 | "spark.driver.extraClassPath", 39 | "/usr/lib/spark/jars/mysql-connector-java-8.0.26.jar", 40 | ) 41 | .getOrCreate() 42 | ) 43 | # session.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true") 44 | # session.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 20000) 45 | 46 | sc = session.sparkContext 47 | sc.setLogLevel("ERROR") 48 | 49 | gdf = gis.load_shp(session, localshp) 50 | gdf = gdf.to_crs(4326) 51 | 52 | # gdf = shp_init() 53 | 54 | dataFrameReader = session.read 55 | 56 | my_sdf = ( 57 | dataFrameReader.option("header", True) 58 | .format("csv") 59 | .load(localfilepath, encoding="euc-kr") 60 | ) 61 | 62 | emd_df = join_with_emd(gdf, my_sdf, "경도", "위도") 63 | print("emd_df ------------------------") 64 | emd_df.show() 65 | 66 | """ 67 | tdf = pd.read_csv(localfilepath, encoding='euc-kr') 68 | 69 | tdf2 = tdf.iloc[:][10054:10059] 70 | tdf2 = session.createDataFrame(tdf2) 71 | print("tdf2") 72 | tdf2.show() 73 | tdf = tdf.iloc[:][10054:10059] 74 | tdf = session.createDataFrame(tdf) 75 | """ 76 | """ 77 | a = coord_to_emd(session, gdf, tdf, "경도", "위도") 78 | print("a") 79 | a.show() 80 | """ 81 | """ 82 | print("tdf") 83 | tdf.show() 84 | tdf = join_with_emd(gdf, tdf, '경도', '위도') 85 | tdf.show() 86 | 87 | 88 | tdf2 = join_with_emd(gdf, tdf2, '경도', '위도') 89 | """ 90 | """ 91 | h3_df = join_with_h3(my_sdf, "경도", "위도", 10) 92 | h3_df.show() 93 | """ 94 | table_df = load_tables(session, url, user, password, "daegu") 95 | print("table_df ------------------------") 96 | table_df.show() 97 | 98 | res_df = join_with_table(gdf, emd_df, table_df, "경도", "위도") 99 | # res_df.show() 100 | print("res_df ------------------------") 101 | res_df.show() 102 | print(res_df.count()) 103 | """ 104 | res2_df = join_with_table(gdf, tdf2, table_df, '경도', '위도') 105 | res2_df.show() 106 | """ 107 | """ 108 | Result vector from pandas_udf was not the required lengt 109 | def to_polygon(l): 110 | return Polygon(h3.h3_to_geo_boundary(l, geo_json=True)) 111 | 112 | temp = [35.8734, 128.6103] 113 | 114 | gdf_h3 = h3_df.toPandas() 115 | gdf_h3 = gpd.GeoDataFrame(gdf_h3) 116 | gdf_h3['geometry'] = gdf_h3['h3'].apply(to_polygon) 117 | gdf_h3.crs = {'init': 'epsg:4326'} 118 | 119 | m =folium.Map(temp, zoom_start=14) 120 | folium.GeoJson(gdf_h3).add_to(m) 121 | 122 | m.save('daegu1.html') 123 | """ 124 | -------------------------------------------------------------------------------- /sparkplus/testjob/test_df.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import geopandas as gpd 5 | from dotenv import load_dotenv 6 | 7 | sys.path.append( 8 | os.path.dirname(os.path.dirname(os.path.abspath(os.path.dirname(__file__)))) 9 | ) 10 | 11 | from sparkplus.core import CoordDataframe 12 | from sparkplus.core import RoadnameDataframe 13 | from sparkplus.jobs.load_database import load_tables 14 | from pyspark.sql import SparkSession 15 | from sparkplus.dependencies.spark import start_spark 16 | from sparkplus.core.py_log import logger 17 | 18 | load_dotenv() 19 | 20 | driver = "com.mysql.cj.jdbc.Driver" 21 | url = ( 22 | "jdbc:mysql://ec2-3-35-104-222.ap-northeast-2.compute.amazonaws.com:3306/sparkplus" 23 | ) 24 | user = "sparkplus" 25 | password = "sparkplus" 26 | 27 | shp_path = "../resource/shp/LSMD_CONT_LDREG_27_202109.shp" 28 | data_path = "../resource/data/daegu_streetlight.csv" 29 | 30 | """ 31 | session = ( 32 | SparkSession.builder.appName("demo_app") 33 | .config( 34 | 
"spark.driver.extraClassPath", 35 | "/usr/lib/spark/jars/mysql-connector-java-8.0.26.jar", 36 | ) 37 | .getOrCreate() 38 | ) 39 | """ 40 | 41 | # Spark Session을 연다 42 | session, _ = start_spark() 43 | dataFrameReader = session.read 44 | 45 | logger.debug("read_shp") 46 | # shp파일을 GDF로 불러오고 crs를 세팅한다. 47 | gdf = gpd.read_file(shp_path, encoding="euc-kr") 48 | gdf.crs = "epsg:5174" 49 | gdf = gdf.to_crs(epsg=4326) 50 | logger.debug("complete read shp") 51 | 52 | # 데이터 df를 불러온다. 53 | logger.debug("read dataframe") 54 | my_sdf = ( 55 | dataFrameReader.option("header", True) 56 | .format("csv") 57 | .load(data_path, encoding="euc-kr") 58 | ) 59 | my_sdf.show() 60 | print("my_sdf: ", my_sdf.count()) 61 | logger.debug("complete dataframe") 62 | 63 | # 데이터베이스에서 테이블을 불러온다. 64 | logger.debug("load_tables") 65 | table_df = load_tables(session, url, user, password, "daegu") 66 | table_df.show() 67 | logger.debug("complete load_tables") 68 | # 커스텀데이터프레임을 만든다. 69 | logger.debug("create custom df") 70 | df = CoordDataFrame(my_sdf, gdf, table_df, "경도", "위도") 71 | logger.debug("complete custom df") 72 | # 기존 데이터 df와 PNU 매칭한다. 73 | logger.debug("coord_to_pnu") 74 | pnu_df = df.coord_to_pnu() 75 | 76 | print("pnu_df: ", pnu_df.count()) 77 | pnu_df.show() 78 | 79 | logger.debug("complete coord_to_pnu") 80 | 81 | """ 82 | logger.debug('join with pnu') 83 | res_df = df.coord_to_pnu(gdf, '경도', '위도') 84 | res_df.show() 85 | logger.debug('complete join with pnu') 86 | """ 87 | 88 | 89 | # 기존 데이터 df와 테이블을 조인한다. (PNU => bupjungdong 매칭) 90 | logger.debug("join_with_table") 91 | res_df = df.join_with_table() 92 | print("joined_df: ", res_df.count()) 93 | res_df.show() 94 | logger.debug("complete join_with_tables") 95 | 96 | logger.debug("h3_df") 97 | h3_df = df.coord_to_h3(10) 98 | print("h3_df: ", h3_df.count()) 99 | h3_df.show() 100 | logger.debug("complete h3_df") 101 | 102 | logger.debug("select zipcode columns") 103 | zipcode_df = df.coord_to_zipcode() 104 | print("zipcode_df: ", zipcode_df.count()) 105 | 106 | zipcode_df.show() 107 | logger.debug("complete select zip columns") 108 | 109 | 110 | logger.debug("select emd columns") 111 | emd_df = df.coord_to_emd() 112 | print("emd_df: ", emd_df.count()) 113 | 114 | emd_df.show() 115 | logger.debug("complete select emd columns") 116 | 117 | 118 | logger.debug("select doromyoung columns") 119 | doro_df = df.coord_to_roadname() 120 | print("doro_df: ", doro_df.count()) 121 | 122 | doro_df.show() 123 | logger.debug("complete select doromyoung columns") 124 | 125 | 126 | logger.debug("coord_to_roadname") 127 | full_doro_df = df.coord_to_roadname_addr() 128 | 129 | full_doro_df = RoadnameDataframe(full_doro_df) 130 | doro_to_roadname_df = full_doro_df.add_split("roadname_address") 131 | 132 | 133 | print("doro_to_roadname", doro_to_roadname_df._df.count()) 134 | doro_to_roadname_df._df.show() 135 | logger.debug("complete coord_to_roadname") 136 | 137 | 138 | logger.debug("select jibun columns") 139 | jibun_df = df.coord_to_jibun() 140 | print("jibun_df: ", jibun_df.count()) 141 | 142 | jibun_df.show() 143 | logger.debug("complete select jibun columns") 144 | -------------------------------------------------------------------------------- /static/sparkplus_arch_finale.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWM-SparkPlus/sparkplus/883d16498b25488cc424908700a8389837e83c47/static/sparkplus_arch_finale.png --------------------------------------------------------------------------------