├── .gitignore ├── .pre-commit-config.yaml ├── .pylintrc ├── .pypirc ├── .vscode └── settings.json ├── LICENSE ├── Pipfile ├── Pipfile.lock ├── PyPI_README.md ├── README.md ├── pyproject.toml ├── setup.cfg ├── setup.py ├── sparkplus ├── __init__.py ├── core │ ├── __init__.py │ ├── address_dataframe.py │ ├── base.py │ ├── coord_dataframe.py │ ├── job.py │ ├── numaddr_dataframe.py │ ├── py_log.py │ ├── shp_to_parquet.py │ ├── tablename.py │ ├── test.ipynb │ ├── udfs.py │ └── utils.py ├── dependencies │ ├── __init__.py │ ├── logging.py │ ├── spark.py │ └── tablename.py ├── jobs │ ├── __init__.py │ ├── conversion.py │ ├── etl_job.py │ ├── load_database.py │ ├── table_to_df.py │ └── with_geopandas.py ├── package │ ├── __init__.py │ ├── gis.py │ └── pipeline.py └── testjob │ ├── demo_app.py │ └── test_df.py └── static └── sparkplus_arch_finale.png /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/python 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python 4 | 5 | ### Python ### 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | cover/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | .pybuilder/ 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | # For a library or package, you might want to ignore these files since the code is 92 | # intended to run in multiple environments; otherwise, check them in: 93 | # .python-version 94 | 95 | # pipenv 96 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 97 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 98 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 99 | # install all needed dependencies. 100 | #Pipfile.lock 101 | 102 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 103 | __pypackages__/ 104 | 105 | # Celery stuff 106 | celerybeat-schedule 107 | celerybeat.pid 108 | 109 | # SageMath parsed files 110 | *.sage.py 111 | 112 | # Environments 113 | .env 114 | .venv 115 | env/ 116 | venv/ 117 | ENV/ 118 | env.bak/ 119 | venv.bak/ 120 | 121 | # Spyder project settings 122 | .spyderproject 123 | .spyproject 124 | 125 | # Rope project settings 126 | .ropeproject 127 | 128 | # mkdocs documentation 129 | /site 130 | 131 | # mypy 132 | .mypy_cache/ 133 | .dmypy.json 134 | dmypy.json 135 | 136 | # Pyre type checker 137 | .pyre/ 138 | 139 | # pytype static type analyzer 140 | .pytype/ 141 | 142 | # Cython debug symbols 143 | cython_debug/ 144 | 145 | # End of https://www.toptal.com/developers/gitignore/api/python 146 | 147 | # Custom 148 | *.csv 149 | 150 | resource/* 151 | scripts/* 152 | logs 153 | 154 | # test file 155 | package/test.py 156 | /shp 157 | 158 | # Mac dev dependency 159 | .DS_Store 160 | 161 | sparkplus/core/test.py 162 | 163 | # vscode 164 | .vscode/ 165 | 166 | # Byte-compiled / optimized / DLL files 167 | __pycache__/ 168 | *.py[cod] 169 | *$py.class 170 | 171 | # C extensions 172 | *.so 173 | 174 | # Distribution / packaging 175 | .Python 176 | build/ 177 | develop-eggs/ 178 | dist/ 179 | downloads/ 180 | eggs/ 181 | .eggs/ 182 | lib/ 183 | lib64/ 184 | parts/ 185 | sdist/ 186 | var/ 187 | wheels/ 188 | pip-wheel-metadata/ 189 | share/python-wheels/ 190 | *.egg-info/ 191 | .installed.cfg 192 | *.egg 193 | MANIFEST 194 | 195 | # PyInstaller 196 | # Usually these files are written by a python script from a template 197 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 198 | *.manifest 199 | *.spec 200 | 201 | # Installer logs 202 | pip-log.txt 203 | pip-delete-this-directory.txt 204 | 205 | # Unit test / coverage reports 206 | htmlcov/ 207 | .tox/ 208 | .nox/ 209 | .coverage 210 | .coverage.* 211 | .cache 212 | nosetests.xml 213 | coverage.xml 214 | *.cover 215 | .hypothesis/ 216 | .pytest_cache/ 217 | 218 | # Translations 219 | *.mo 220 | *.pot 221 | 222 | # Django stuff: 223 | *.log 224 | local_settings.py 225 | db.sqlite3 226 | 227 | # Flask stuff: 228 | instance/ 229 | .webassets-cache 230 | 231 | # Scrapy stuff: 232 | .scrapy 233 | 234 | # Sphinx documentation 235 | docs/_build/ 236 | 237 | # PyBuilder 238 | target/ 239 | 240 | # Jupyter Notebook 241 | .ipynb_checkpoints 242 | 243 | # IPython 244 | profile_default/ 245 | ipython_config.py 246 | 247 | # pyenv 248 | .python-version 249 | 250 | # celery beat schedule file 251 | celerybeat-schedule 252 | 253 | # SageMath parsed files 254 | *.sage.py 255 | 256 | # Environments 257 | .env 258 | .venv 259 | env/ 260 | venv/ 261 | ENV/ 262 | env.bak/ 263 | venv.bak/ 264 | 265 | # Spyder project settings 266 | .spyderproject 267 | .spyproject 268 | 269 | # Rope project settings 270 | .ropeproject 271 | 272 | # mkdocs documentation 273 | /site 274 | 275 | # mypy 276 | .mypy_cache/ 277 | .dmypy.json 278 | dmypy.json 279 | 280 | # Pyre type checker 281 | .pyre/ 282 | 283 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.0.1 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: requirements-txt-fixer 7 | - id: detect-aws-credentials 8 | - repo: https://github.com/psf/black 9 | rev: 21.9b0 10 | hooks: 11 | - id: black 12 | 
language_version: python3.9 13 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-allow-list= 7 | 8 | # A comma-separated list of package or module names from where C extensions may 9 | # be loaded. Extensions are loading into the active Python interpreter and may 10 | # run arbitrary code. (This is an alternative name to extension-pkg-allow-list 11 | # for backward compatibility.) 12 | extension-pkg-whitelist= 13 | 14 | # Return non-zero exit code if any of these messages/categories are detected, 15 | # even if score is above --fail-under value. Syntax same as enable. Messages 16 | # specified are enabled, while categories only check already-enabled messages. 17 | fail-on= 18 | 19 | # Specify a score threshold to be exceeded before program exits with error. 20 | fail-under=10.0 21 | 22 | # Files or directories to be skipped. They should be base names, not paths. 23 | ignore=CVS 24 | 25 | # Add files or directories matching the regex patterns to the ignore-list. The 26 | # regex matches against paths. 27 | ignore-paths= 28 | 29 | # Files or directories matching the regex patterns are skipped. The regex 30 | # matches against base names, not paths. 31 | ignore-patterns= 32 | 33 | # Python code to execute, usually for sys.path manipulation such as 34 | # pygtk.require(). 35 | #init-hook= 36 | 37 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 38 | # number of processors available to use. 39 | jobs=1 40 | 41 | # Control the amount of potential inferred values when inferring a single 42 | # object. This can help the performance when dealing with large functions or 43 | # complex, nested conditions. 44 | limit-inference-results=100 45 | 46 | # List of plugins (as comma separated values of python module names) to load, 47 | # usually to register additional checkers. 48 | load-plugins= 49 | 50 | # Pickle collected data for later comparisons. 51 | persistent=yes 52 | 53 | # Min Python version to use for version dependend checks. Will default to the 54 | # version used to run pylint. 55 | py-version=3.9 56 | 57 | # When enabled, pylint would attempt to guess common misconfiguration and emit 58 | # user-friendly hints instead of false-positive error messages. 59 | suggestion-mode=yes 60 | 61 | # Allow loading of arbitrary C extensions. Extensions are imported into the 62 | # active Python interpreter and may run arbitrary code. 63 | unsafe-load-any-extension=no 64 | 65 | 66 | [MESSAGES CONTROL] 67 | 68 | # Only show warnings with the listed confidence levels. Leave empty to show 69 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 70 | confidence= 71 | 72 | # Disable the message, report, category or checker with the given id(s). You 73 | # can either give multiple identifiers separated by comma (,) or put this 74 | # option multiple times (only on the command line, not in the configuration 75 | # file where it should appear only once). You can also use "--disable=all" to 76 | # disable everything first and then reenable specific checks. For example, if 77 | # you want to run only the similarities checker, you can use "--disable=all 78 | # --enable=similarities". 
If you want to run only the classes checker, but have 79 | # no Warning level messages displayed, use "--disable=all --enable=classes 80 | # --disable=W". 81 | disable=raw-checker-failed, 82 | bad-inline-option, 83 | locally-disabled, 84 | file-ignored, 85 | suppressed-message, 86 | useless-suppression, 87 | deprecated-pragma, 88 | use-symbolic-message-instead, 89 | C0114, # missing-module-docstring 90 | C0115, # missing-class-docstring 91 | C0116 # missing-function-docstring 92 | 93 | # Enable the message, report, category or checker with the given id(s). You can 94 | # either give multiple identifier separated by comma (,) or put this option 95 | # multiple time (only on the command line, not in the configuration file where 96 | # it should appear only once). See also the "--disable" option for examples. 97 | enable=c-extension-no-member 98 | 99 | 100 | [REPORTS] 101 | 102 | # Python expression which should return a score less than or equal to 10. You 103 | # have access to the variables 'error', 'warning', 'refactor', and 'convention' 104 | # which contain the number of messages in each category, as well as 'statement' 105 | # which is the total number of statements analyzed. This score is used by the 106 | # global evaluation report (RP0004). 107 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 108 | 109 | # Template used to display messages. This is a python new-style format string 110 | # used to format the message information. See doc for all details. 111 | #msg-template= 112 | 113 | # Set the output format. Available formats are text, parseable, colorized, json 114 | # and msvs (visual studio). You can also give a reporter class, e.g. 115 | # mypackage.mymodule.MyReporterClass. 116 | output-format=text 117 | 118 | # Tells whether to display a full report or only the messages. 119 | reports=no 120 | 121 | # Activate the evaluation score. 122 | score=yes 123 | 124 | 125 | [REFACTORING] 126 | 127 | # Maximum number of nested blocks for function / method body 128 | max-nested-blocks=5 129 | 130 | # Complete name of functions that never returns. When checking for 131 | # inconsistent-return-statements if a never returning function is called then 132 | # it will be considered as an explicit return statement and no message will be 133 | # printed. 134 | never-returning-functions=sys.exit,argparse.parse_error 135 | 136 | 137 | [LOGGING] 138 | 139 | # The type of string formatting that logging methods do. `old` means using % 140 | # formatting, `new` is for `{}` formatting. 141 | logging-format-style=old 142 | 143 | # Logging modules to check that the string format arguments are in logging 144 | # function parameter format. 145 | logging-modules=logging 146 | 147 | 148 | [SPELLING] 149 | 150 | # Limits count of emitted suggestions for spelling mistakes. 151 | max-spelling-suggestions=4 152 | 153 | # Spelling dictionary name. Available dictionaries: none. To make it work, 154 | # install the 'python-enchant' package. 155 | spelling-dict= 156 | 157 | # List of comma separated words that should be considered directives if they 158 | # appear and the beginning of a comment and should not be checked. 159 | spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy: 160 | 161 | # List of comma separated words that should not be checked. 162 | spelling-ignore-words= 163 | 164 | # A path to a file that contains the private dictionary; one word per line. 
165 | spelling-private-dict-file= 166 | 167 | # Tells whether to store unknown words to the private dictionary (see the 168 | # --spelling-private-dict-file option) instead of raising a message. 169 | spelling-store-unknown-words=no 170 | 171 | 172 | [MISCELLANEOUS] 173 | 174 | # List of note tags to take in consideration, separated by a comma. 175 | notes=FIXME, 176 | XXX, 177 | TODO 178 | 179 | # Regular expression of note tags to take in consideration. 180 | #notes-rgx= 181 | 182 | 183 | [TYPECHECK] 184 | 185 | # List of decorators that produce context managers, such as 186 | # contextlib.contextmanager. Add to this list to register other decorators that 187 | # produce valid context managers. 188 | contextmanager-decorators=contextlib.contextmanager 189 | 190 | # List of members which are set dynamically and missed by pylint inference 191 | # system, and so shouldn't trigger E1101 when accessed. Python regular 192 | # expressions are accepted. 193 | generated-members= 194 | 195 | # Tells whether missing members accessed in mixin class should be ignored. A 196 | # mixin class is detected if its name ends with "mixin" (case insensitive). 197 | ignore-mixin-members=yes 198 | 199 | # Tells whether to warn about missing members when the owner of the attribute 200 | # is inferred to be None. 201 | ignore-none=yes 202 | 203 | # This flag controls whether pylint should warn about no-member and similar 204 | # checks whenever an opaque object is returned when inferring. The inference 205 | # can return multiple potential results while evaluating a Python object, but 206 | # some branches might not be evaluated, which results in partial inference. In 207 | # that case, it might be useful to still emit no-member and other checks for 208 | # the rest of the inferred objects. 209 | ignore-on-opaque-inference=yes 210 | 211 | # List of class names for which member attributes should not be checked (useful 212 | # for classes with dynamically set attributes). This supports the use of 213 | # qualified names. 214 | ignored-classes=optparse.Values,thread._local,_thread._local 215 | 216 | # List of module names for which member attributes should not be checked 217 | # (useful for modules/projects where namespaces are manipulated during runtime 218 | # and thus existing member attributes cannot be deduced by static analysis). It 219 | # supports qualified module names, as well as Unix pattern matching. 220 | ignored-modules= 221 | 222 | # Show a hint with possible names when a member name was not found. The aspect 223 | # of finding the hint is based on edit distance. 224 | missing-member-hint=yes 225 | 226 | # The minimum edit distance a name should have in order to be considered a 227 | # similar match for a missing member name. 228 | missing-member-hint-distance=1 229 | 230 | # The total number of similar names that should be taken in consideration when 231 | # showing a hint for a missing member. 232 | missing-member-max-choices=1 233 | 234 | # List of decorators that change the signature of a decorated function. 235 | signature-mutators= 236 | 237 | 238 | [VARIABLES] 239 | 240 | # List of additional names supposed to be defined in builtins. Remember that 241 | # you should avoid defining new builtins when possible. 242 | additional-builtins= 243 | 244 | # Tells whether unused global variables should be treated as a violation. 
245 | allow-global-unused-variables=yes 246 | 247 | # List of names allowed to shadow builtins 248 | allowed-redefined-builtins= 249 | 250 | # List of strings which can identify a callback function by name. A callback 251 | # name must start or end with one of those strings. 252 | callbacks=cb_, 253 | _cb 254 | 255 | # A regular expression matching the name of dummy variables (i.e. expected to 256 | # not be used). 257 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 258 | 259 | # Argument names that match this expression will be ignored. Default to name 260 | # with leading underscore. 261 | ignored-argument-names=_.*|^ignored_|^unused_ 262 | 263 | # Tells whether we should check for unused import in __init__ files. 264 | init-import=no 265 | 266 | # List of qualified module names which can have objects that can redefine 267 | # builtins. 268 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 269 | 270 | 271 | [FORMAT] 272 | 273 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 274 | expected-line-ending-format= 275 | 276 | # Regexp for a line that is allowed to be longer than the limit. 277 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$ 278 | 279 | # Number of spaces of indent required inside a hanging or continued line. 280 | indent-after-paren=4 281 | 282 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 283 | # tab). 284 | indent-string=' ' 285 | 286 | # Maximum number of characters on a single line. 287 | max-line-length=100 288 | 289 | # Maximum number of lines in a module. 290 | max-module-lines=1000 291 | 292 | # Allow the body of a class to be on the same line as the declaration if body 293 | # contains single statement. 294 | single-line-class-stmt=no 295 | 296 | # Allow the body of an if to be on the same line as the test if there is no 297 | # else. 298 | single-line-if-stmt=no 299 | 300 | 301 | [SIMILARITIES] 302 | 303 | # Comments are removed from the similarity computation 304 | ignore-comments=yes 305 | 306 | # Docstrings are removed from the similarity computation 307 | ignore-docstrings=yes 308 | 309 | # Imports are removed from the similarity computation 310 | ignore-imports=no 311 | 312 | # Signatures are removed from the similarity computation 313 | ignore-signatures=no 314 | 315 | # Minimum lines number of a similarity. 316 | min-similarity-lines=4 317 | 318 | 319 | [BASIC] 320 | 321 | # Naming style matching correct argument names. 322 | argument-naming-style=snake_case 323 | 324 | # Regular expression matching correct argument names. Overrides argument- 325 | # naming-style. 326 | #argument-rgx= 327 | 328 | # Naming style matching correct attribute names. 329 | attr-naming-style=snake_case 330 | 331 | # Regular expression matching correct attribute names. Overrides attr-naming- 332 | # style. 333 | #attr-rgx= 334 | 335 | # Bad variable names which should always be refused, separated by a comma. 336 | bad-names=foo, 337 | bar, 338 | baz, 339 | toto, 340 | tutu, 341 | tata 342 | 343 | # Bad variable names regexes, separated by a comma. If names match any regex, 344 | # they will always be refused 345 | bad-names-rgxs= 346 | 347 | # Naming style matching correct class attribute names. 348 | class-attribute-naming-style=any 349 | 350 | # Regular expression matching correct class attribute names. Overrides class- 351 | # attribute-naming-style. 352 | #class-attribute-rgx= 353 | 354 | # Naming style matching correct class constant names.
355 | class-const-naming-style=UPPER_CASE 356 | 357 | # Regular expression matching correct class constant names. Overrides class- 358 | # const-naming-style. 359 | #class-const-rgx= 360 | 361 | # Naming style matching correct class names. 362 | class-naming-style=PascalCase 363 | 364 | # Regular expression matching correct class names. Overrides class-naming- 365 | # style. 366 | #class-rgx= 367 | 368 | # Naming style matching correct constant names. 369 | const-naming-style=UPPER_CASE 370 | 371 | # Regular expression matching correct constant names. Overrides const-naming- 372 | # style. 373 | #const-rgx= 374 | 375 | # Minimum line length for functions/classes that require docstrings, shorter 376 | # ones are exempt. 377 | docstring-min-length=0 378 | 379 | # Naming style matching correct function names. 380 | function-naming-style=snake_case 381 | 382 | # Regular expression matching correct function names. Overrides function- 383 | # naming-style. 384 | #function-rgx= 385 | 386 | # Good variable names which should always be accepted, separated by a comma. 387 | good-names=i, 388 | j, 389 | k, 390 | ex, 391 | Run, 392 | _ 393 | 394 | # Good variable names regexes, separated by a comma. If names match any regex, 395 | # they will always be accepted 396 | good-names-rgxs= 397 | 398 | # Include a hint for the correct naming format with invalid-name. 399 | include-naming-hint=no 400 | 401 | # Naming style matching correct inline iteration names. 402 | inlinevar-naming-style=any 403 | 404 | # Regular expression matching correct inline iteration names. Overrides 405 | # inlinevar-naming-style. 406 | #inlinevar-rgx= 407 | 408 | # Naming style matching correct method names. 409 | method-naming-style=snake_case 410 | 411 | # Regular expression matching correct method names. Overrides method-naming- 412 | # style. 413 | #method-rgx= 414 | 415 | # Naming style matching correct module names. 416 | module-naming-style=snake_case 417 | 418 | # Regular expression matching correct module names. Overrides module-naming- 419 | # style. 420 | #module-rgx= 421 | 422 | # Colon-delimited sets of names that determine each other's naming style when 423 | # the name regexes allow several styles. 424 | name-group= 425 | 426 | # Regular expression which should only match function or class names that do 427 | # not require a docstring. 428 | no-docstring-rgx=^_ 429 | 430 | # List of decorators that produce properties, such as abc.abstractproperty. Add 431 | # to this list to register other decorators that produce valid properties. 432 | # These decorators are taken in consideration only for invalid-name. 433 | property-classes=abc.abstractproperty 434 | 435 | # Naming style matching correct variable names. 436 | variable-naming-style=snake_case 437 | 438 | # Regular expression matching correct variable names. Overrides variable- 439 | # naming-style. 440 | #variable-rgx= 441 | 442 | 443 | [STRING] 444 | 445 | # This flag controls whether inconsistent-quotes generates a warning when the 446 | # character used as a quote delimiter is used inconsistently within a module. 447 | check-quote-consistency=no 448 | 449 | # This flag controls whether the implicit-str-concat should generate a warning 450 | # on implicit string concatenation in sequences defined over several lines. 451 | check-str-concat-over-line-jumps=no 452 | 453 | 454 | [IMPORTS] 455 | 456 | # List of modules that can be imported at any level, not just the top level 457 | # one. 
458 | allow-any-import-level= 459 | 460 | # Allow wildcard imports from modules that define __all__. 461 | allow-wildcard-with-all=no 462 | 463 | # Analyse import fallback blocks. This can be used to support both Python 2 and 464 | # 3 compatible code, which means that the block might have code that exists 465 | # only in one or another interpreter, leading to false positives when analysed. 466 | analyse-fallback-blocks=no 467 | 468 | # Deprecated modules which should not be used, separated by a comma. 469 | deprecated-modules= 470 | 471 | # Output a graph (.gv or any supported image format) of external dependencies 472 | # to the given file (report RP0402 must not be disabled). 473 | ext-import-graph= 474 | 475 | # Output a graph (.gv or any supported image format) of all (i.e. internal and 476 | # external) dependencies to the given file (report RP0402 must not be 477 | # disabled). 478 | import-graph= 479 | 480 | # Output a graph (.gv or any supported image format) of internal dependencies 481 | # to the given file (report RP0402 must not be disabled). 482 | int-import-graph= 483 | 484 | # Force import order to recognize a module as part of the standard 485 | # compatibility libraries. 486 | known-standard-library= 487 | 488 | # Force import order to recognize a module as part of a third party library. 489 | known-third-party=enchant 490 | 491 | # Couples of modules and preferred modules, separated by a comma. 492 | preferred-modules= 493 | 494 | 495 | [CLASSES] 496 | 497 | # Warn about protected attribute access inside special methods 498 | check-protected-access-in-special-methods=no 499 | 500 | # List of method names used to declare (i.e. assign) instance attributes. 501 | defining-attr-methods=__init__, 502 | __new__, 503 | setUp, 504 | __post_init__ 505 | 506 | # List of member names, which should be excluded from the protected access 507 | # warning. 508 | exclude-protected=_asdict, 509 | _fields, 510 | _replace, 511 | _source, 512 | _make 513 | 514 | # List of valid names for the first argument in a class method. 515 | valid-classmethod-first-arg=cls 516 | 517 | # List of valid names for the first argument in a metaclass class method. 518 | valid-metaclass-classmethod-first-arg=cls 519 | 520 | 521 | [DESIGN] 522 | 523 | # List of qualified class names to ignore when counting class parents (see 524 | # R0901) 525 | ignored-parents= 526 | 527 | # Maximum number of arguments for function / method. 528 | max-args=5 529 | 530 | # Maximum number of attributes for a class (see R0902). 531 | max-attributes=7 532 | 533 | # Maximum number of boolean expressions in an if statement (see R0916). 534 | max-bool-expr=5 535 | 536 | # Maximum number of branch for function / method body. 537 | max-branches=12 538 | 539 | # Maximum number of locals for function / method body. 540 | max-locals=15 541 | 542 | # Maximum number of parents for a class (see R0901). 543 | max-parents=7 544 | 545 | # Maximum number of public methods for a class (see R0904). 546 | max-public-methods=20 547 | 548 | # Maximum number of return / yield for function / method body. 549 | max-returns=6 550 | 551 | # Maximum number of statements in function / method body. 552 | max-statements=50 553 | 554 | # Minimum number of public methods for a class (see R0903). 555 | min-public-methods=2 556 | 557 | 558 | [EXCEPTIONS] 559 | 560 | # Exceptions that will emit a warning when being caught. Defaults to 561 | # "BaseException, Exception". 
562 | overgeneral-exceptions=BaseException, 563 | Exception 564 | -------------------------------------------------------------------------------- /.pypirc: -------------------------------------------------------------------------------- 1 | [distutils] 2 | index-servers = 3 | pypi 4 | internal 5 | 6 | [pypi] 7 | username: 8 | password: 9 | 10 | [internal] 11 | repository: http://(external IP):8080 12 | username: 13 | password: 14 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/usr/bin/python", 3 | "python.linting.pylintPath": "/usr/local/bin/pylint", 4 | "python.linting.pylintEnabled": true, 5 | "python.linting.enabled": true, 6 | "python.linting.pylintArgs": [ 7 | "--init-hook", 8 | "import sys; sys.path.append('/Users/lee/Desktop/spark-plugin/sparkplus/core')" 9 | ], 10 | "files.watcherExclude": { 11 | "**/target": true 12 | } 13 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 SWM-12 / Team 12 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | pyspark = "*" 8 | mysql-connector-python = "*" 9 | pandas = "*" 10 | geopandas = "*" 11 | matplotlib = "*" 12 | geospark = "*" 13 | h3 = "*" 14 | geopy = "*" 15 | folium = "*" 16 | python-dotenv = "*" 17 | 18 | [dev-packages] 19 | 20 | [requires] 21 | python_version = "3.9" 22 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "24884446b2b9af187c2bcdf8c8aef963b732e9b9c992eea2020bbd6472cf6009" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.9" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "attrs": { 20 | "hashes": [ 21 | "sha256:29adc2665447e5191d0e7c568fde78b21f9672d344281d0c6e1ab085429b22b6", 22 | "sha256:86efa402f67bf2df34f51a335487cf46b1ec130d02b8d39fd248abfd30da551c" 23 | ], 24 | "markers": "python_version >= '3.5'", 25 | "version": "==22.1.0" 26 | }, 27 | "branca": { 28 | "hashes": [ 29 | "sha256:55949855214504c7583b71b9a03a84dce2e96a84027613bb53b42d04844ce24e", 30 | "sha256:ae706fc7a88dd0296a58bb11c0cb3c6be358491a3b0abee08fe16b4db17814c0" 31 | ], 32 | "markers": "python_version >= '3.7'", 33 | "version": "==0.6.0" 34 | }, 35 | "certifi": { 36 | "hashes": [ 37 | "sha256:0d9c601124e5a6ba9712dbc60d9c53c21e34f5f641fe83002317394311bdce14", 38 | "sha256:90c1a32f1d68f940488354e36370f6cca89f0f106db09518524c88d6ed83f382" 39 | ], 40 | "markers": "python_version >= '3.6'", 41 | "version": "==2022.9.24" 42 | }, 43 | "charset-normalizer": { 44 | "hashes": [ 45 | "sha256:5a3d016c7c547f69d6f81fb0db9449ce888b418b5b9952cc5e6e66843e9dd845", 46 | "sha256:83e9a75d1911279afd89352c68b45348559d1fc0506b054b346651b5e7fee29f" 47 | ], 48 | "markers": "python_version >= '3.6'", 49 | "version": "==2.1.1" 50 | }, 51 | "click": { 52 | "hashes": [ 53 | "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e", 54 | "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48" 55 | ], 56 | "markers": "python_version >= '3.7'", 57 | "version": "==8.1.3" 58 | }, 59 | "click-plugins": { 60 | "hashes": [ 61 | "sha256:46ab999744a9d831159c3411bb0c79346d94a444df9a3a3742e9ed63645f264b", 62 | "sha256:5d262006d3222f5057fd81e1623d4443e41dcda5dc815c06b442aa3c02889fc8" 63 | ], 64 | "version": "==1.1.1" 65 | }, 66 | "cligj": { 67 | "hashes": [ 68 | "sha256:a4bc13d623356b373c2c27c53dbd9c68cae5d526270bfa71f6c6fa69669c6b27", 69 | "sha256:c1ca117dbce1fe20a5809dc96f01e1c2840f6dcc939b3ddbb1111bf330ba82df" 70 | ], 71 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' and python_version < '4'", 72 | "version": "==0.7.2" 73 | }, 74 | "cycler": { 75 | "hashes": [ 76 | "sha256:3a27e95f763a428a739d2add979fa7494c912a32c17c4c38c4d5f082cad165a3", 77 | "sha256:9c87405839a19696e837b3b818fed3f5f69f16f1eec1a1ad77e043dcea9c772f" 78 | ], 79 | "markers": "python_version >= '3.6'", 80 | "version": "==0.11.0" 81 | }, 82 | "findspark": { 83 | "hashes": [ 84 | "sha256:aa10a96cb616cab329181d72e8ef13d2dc453b4babd02b5482471a0882c1195e", 85 | 
"sha256:e5d5415ff8ced6b173b801e12fc90c1eefca1fb6bf9c19c4fc1f235d4222e753" 86 | ], 87 | "version": "==2.0.1" 88 | }, 89 | "fiona": { 90 | "hashes": [ 91 | "sha256:18649326a7724611b16b648e14fd094089d517413b95ac91d0cdb0adc5fcb8de", 92 | "sha256:3f26c8b6ea9bc92cbd52a4dd83ffd44472450bf92f4e3d4ef2341adc2f35a54d", 93 | "sha256:59a3800bc09ebee3516d64d02a8a6818d07ab1573c6096f3ef3468bf9f8f95f8", 94 | "sha256:6ba2294bc6adcbc36229862667aac6b98e6c306e1958caf53b8bfcf9a3b8c77a", 95 | "sha256:75924f69c51db6e258c91308780546278028c509db12aa33a47692a0266c9667", 96 | "sha256:89cfcc3bdb4aba7bba1eb552b3866b851334693ab694529803122b21f5927960", 97 | "sha256:904793b17aee70ca9c3d582dbf01623eccfdeacd00c5e1a8e421be41f2e43d67", 98 | "sha256:a82a99ce9b3e7825740157c45c9fb2259d4e92f0a886aaac25f0db40ffe1eea3", 99 | "sha256:b5cad3424b7473eb0e19f17ee45abec92133a694a4b452a278f02e3b8d0f810f", 100 | "sha256:b88e2e6548a41c1dfa3f96c8275ff472a3edca729e14a641c0fa5b2e146a8ab5", 101 | "sha256:c28d9ffa5d230a1d9eaf571529fa9eb7573d39613354c090ad077ad153a37ee1", 102 | "sha256:c4aafdd565b3a30bdd78cafae35d4945f6741eef31401c1bb1e166b6262d7539", 103 | "sha256:ce9a22c9883cc5d11c05ba3fb9db5082044a07c6b299753ea5bb8e178b8ba53b", 104 | "sha256:d0df3e105ad7f0cca5f16b441c232fd693ef6c4adf2c1b6271aaaa1cdc06164d", 105 | "sha256:d47777890aa1d715025abc7a6d6b2a6bb8d2a37cc94c44ce95940b80eda21444", 106 | "sha256:df34c980cd7396adfbc89bbb363bdd6e358c76f91969fc98c9dfc076dd11638d", 107 | "sha256:e33860aaf70bbd2726cff12fd3857bd832b6dc2ad3ce4b27e7563bd68abdc26f", 108 | "sha256:e3ed1c0c1c60f710a612aaeb294de54214d228c4ef40e0c1dc159e46f86a9446", 109 | "sha256:ed75dd29c89e0e455e3a322f28cd92f192bcb8fced16e2bfb6422a7f95ffe5e9" 110 | ], 111 | "version": "==1.8.22" 112 | }, 113 | "folium": { 114 | "hashes": [ 115 | "sha256:3d2c48dd6ffe5327975bbfd718468c4e81db9f2844c26e574f878adf4c08b644" 116 | ], 117 | "index": "pypi", 118 | "version": "==0.12.1" 119 | }, 120 | "geographiclib": { 121 | "hashes": [ 122 | "sha256:8f441c527b0b8a26cd96c965565ff0513d1e4d9952b704bf449409e5015c77b7", 123 | "sha256:ac400d672b8954b0306bca890b088bb8ba2a757dc8133cca0b878f34b33b2740" 124 | ], 125 | "version": "==1.52" 126 | }, 127 | "geopandas": { 128 | "hashes": [ 129 | "sha256:1722853464441b603d9be3d35baf8bde43831424a891e82a8545eb8997b65d6c", 130 | "sha256:efbf47e70732e25c3727222019c92b39b2e0a66ebe4fe379fbe1aa43a2a871db" 131 | ], 132 | "index": "pypi", 133 | "version": "==0.10.2" 134 | }, 135 | "geopy": { 136 | "hashes": [ 137 | "sha256:58b7edf526b8c32e33126570b5f4fcdfaa29d4416506064777ae8d84cd103fdd", 138 | "sha256:8f1f949082b964385de61fcc3a667a6a9a6e242beb1ae8972449f164b2ba0e89" 139 | ], 140 | "index": "pypi", 141 | "version": "==2.2.0" 142 | }, 143 | "geospark": { 144 | "hashes": [ 145 | "sha256:1056034994b93773849b1c28a05df463d885665eb56d8655f4a2f7f02e5dea72", 146 | "sha256:ab6157297f6d395001305b1e21f1a4cca75e169c704b3de998bbc260095900a3" 147 | ], 148 | "index": "pypi", 149 | "version": "==1.3.1" 150 | }, 151 | "h3": { 152 | "hashes": [ 153 | "sha256:105625a45d86b6cd1cd67acd7ab158adf3d193262534470b69a1db49a6664541", 154 | "sha256:1af9c039f7daeff4621c1349000eec0ed37c4548552a3a173cddb6d648547344", 155 | "sha256:20d48a3c9acdcf7c02c70519c3a5a22406b505cc34ff9f9a302e11a2a13d9c73", 156 | "sha256:339f4f210373dd43739019d6a8def64b119de62f3083e31b2d0413954c429c88", 157 | "sha256:3909aef50b19835b0790e077d9f06b27609380bb7bf09382e2c4e813385f7677", 158 | "sha256:48274cff38d53da155500679194d69ce19aaa52c00d0f30f24a327c1b22cb752", 159 | 
"sha256:54e0f74357467347aee517d6137777094b64b9aac648d92a7507e14ac28ddca6", 160 | "sha256:59abe06c99afa1b27bb7fcfb2a8c01d285ad36005dce8c82fca3dab0b4d8777d", 161 | "sha256:61ddf3052f226de22b546af5d47816d81fea83eb0e62d22f53ed3b23eb0b8551", 162 | "sha256:75214450b89e5204d77700e01cf2d41a02d7cbfe9b9ca925727c52e18f91072e", 163 | "sha256:7a0817ed9f6b8f4b7eeb719744260ab41ccd7131475b169e45a79cf99d045b1f", 164 | "sha256:7d6334b74e80a0e9395132bf9a7d38799b40df6181467616e950032a112773e9", 165 | "sha256:83fb0a7e1a1241c9c69137569e761b257b9828abc24b8bf78710ddce8db9e28e", 166 | "sha256:86b6bc6ac38f93be0899f45fb55585fb5c4964a2e5c8cc4a349cac41fecb10f3", 167 | "sha256:87c5ed2e2878cb936dd466ed5b4fc7cd462cc8a713d066789918d295d9d26a63", 168 | "sha256:8acd1e448bad2f5cd03ab0107f34f34967dd271726978ed5acbe42806628d90a", 169 | "sha256:950e833148ff441ee240d8ae71d4a32208cf062d9e0cc389fd056fc7cd280a0c", 170 | "sha256:bd1982ab1f2a85517aae7166582b82c73e5350c31652f4ab20b337bcf978b43d", 171 | "sha256:ddbe4dd122be51508a43baee157a2724d52ec480b45da932f7ca058cfec4aaf9", 172 | "sha256:de5d3bcc8f0253531ddf72e7305c6425b4af0c22921962ab7392a3c4c1dc5530", 173 | "sha256:e0467583c23164d232de51b82087685ab3c961911f673c892d10f87fd6642990", 174 | "sha256:e73ed07510907c8cf4e3a6f14625af221b0a3dea5c680ff011abec622cf2be9a", 175 | "sha256:f6f832c71b3b9be8949b299d20e8230129321d2296c28b970607d354cbce6efa" 176 | ], 177 | "index": "pypi", 178 | "version": "==3.7.3" 179 | }, 180 | "idna": { 181 | "hashes": [ 182 | "sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4", 183 | "sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2" 184 | ], 185 | "markers": "python_version >= '3.5'", 186 | "version": "==3.4" 187 | }, 188 | "jinja2": { 189 | "hashes": [ 190 | "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852", 191 | "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61" 192 | ], 193 | "markers": "python_version >= '3.7'", 194 | "version": "==3.1.2" 195 | }, 196 | "kiwisolver": { 197 | "hashes": [ 198 | "sha256:02f79693ec433cb4b5f51694e8477ae83b3205768a6fb48ffba60549080e295b", 199 | "sha256:03baab2d6b4a54ddbb43bba1a3a2d1627e82d205c5cf8f4c924dc49284b87166", 200 | "sha256:1041feb4cda8708ce73bb4dcb9ce1ccf49d553bf87c3954bdfa46f0c3f77252c", 201 | "sha256:10ee06759482c78bdb864f4109886dff7b8a56529bc1609d4f1112b93fe6423c", 202 | "sha256:1d1573129aa0fd901076e2bfb4275a35f5b7aa60fbfb984499d661ec950320b0", 203 | "sha256:283dffbf061a4ec60391d51e6155e372a1f7a4f5b15d59c8505339454f8989e4", 204 | "sha256:28bc5b299f48150b5f822ce68624e445040595a4ac3d59251703779836eceff9", 205 | "sha256:2a66fdfb34e05b705620dd567f5a03f239a088d5a3f321e7b6ac3239d22aa286", 206 | "sha256:2e307eb9bd99801f82789b44bb45e9f541961831c7311521b13a6c85afc09767", 207 | "sha256:2e407cb4bd5a13984a6c2c0fe1845e4e41e96f183e5e5cd4d77a857d9693494c", 208 | "sha256:2f5e60fabb7343a836360c4f0919b8cd0d6dbf08ad2ca6b9cf90bf0c76a3c4f6", 209 | "sha256:36dafec3d6d6088d34e2de6b85f9d8e2324eb734162fba59d2ba9ed7a2043d5b", 210 | "sha256:3fe20f63c9ecee44560d0e7f116b3a747a5d7203376abeea292ab3152334d004", 211 | "sha256:41dae968a94b1ef1897cb322b39360a0812661dba7c682aa45098eb8e193dbdf", 212 | "sha256:4bd472dbe5e136f96a4b18f295d159d7f26fd399136f5b17b08c4e5f498cd494", 213 | "sha256:4ea39b0ccc4f5d803e3337dd46bcce60b702be4d86fd0b3d7531ef10fd99a1ac", 214 | "sha256:5853eb494c71e267912275e5586fe281444eb5e722de4e131cddf9d442615626", 215 | "sha256:5bce61af018b0cb2055e0e72e7d65290d822d3feee430b7b8203d8a855e78766", 216 | 
"sha256:6295ecd49304dcf3bfbfa45d9a081c96509e95f4b9d0eb7ee4ec0530c4a96514", 217 | "sha256:62ac9cc684da4cf1778d07a89bf5f81b35834cb96ca523d3a7fb32509380cbf6", 218 | "sha256:70e7c2e7b750585569564e2e5ca9845acfaa5da56ac46df68414f29fea97be9f", 219 | "sha256:7577c1987baa3adc4b3c62c33bd1118c3ef5c8ddef36f0f2c950ae0b199e100d", 220 | "sha256:75facbe9606748f43428fc91a43edb46c7ff68889b91fa31f53b58894503a191", 221 | "sha256:787518a6789009c159453da4d6b683f468ef7a65bbde796bcea803ccf191058d", 222 | "sha256:78d6601aed50c74e0ef02f4204da1816147a6d3fbdc8b3872d263338a9052c51", 223 | "sha256:7c43e1e1206cd421cd92e6b3280d4385d41d7166b3ed577ac20444b6995a445f", 224 | "sha256:81e38381b782cc7e1e46c4e14cd997ee6040768101aefc8fa3c24a4cc58e98f8", 225 | "sha256:841293b17ad704d70c578f1f0013c890e219952169ce8a24ebc063eecf775454", 226 | "sha256:872b8ca05c40d309ed13eb2e582cab0c5a05e81e987ab9c521bf05ad1d5cf5cb", 227 | "sha256:877272cf6b4b7e94c9614f9b10140e198d2186363728ed0f701c6eee1baec1da", 228 | "sha256:8c808594c88a025d4e322d5bb549282c93c8e1ba71b790f539567932722d7bd8", 229 | "sha256:8ed58b8acf29798b036d347791141767ccf65eee7f26bde03a71c944449e53de", 230 | "sha256:91672bacaa030f92fc2f43b620d7b337fd9a5af28b0d6ed3f77afc43c4a64b5a", 231 | "sha256:968f44fdbf6dd757d12920d63b566eeb4d5b395fd2d00d29d7ef00a00582aac9", 232 | "sha256:9f85003f5dfa867e86d53fac6f7e6f30c045673fa27b603c397753bebadc3008", 233 | "sha256:a553dadda40fef6bfa1456dc4be49b113aa92c2a9a9e8711e955618cd69622e3", 234 | "sha256:a68b62a02953b9841730db7797422f983935aeefceb1679f0fc85cbfbd311c32", 235 | "sha256:abbe9fa13da955feb8202e215c4018f4bb57469b1b78c7a4c5c7b93001699938", 236 | "sha256:ad881edc7ccb9d65b0224f4e4d05a1e85cf62d73aab798943df6d48ab0cd79a1", 237 | "sha256:b1792d939ec70abe76f5054d3f36ed5656021dcad1322d1cc996d4e54165cef9", 238 | "sha256:b428ef021242344340460fa4c9185d0b1f66fbdbfecc6c63eff4b7c29fad429d", 239 | "sha256:b533558eae785e33e8c148a8d9921692a9fe5aa516efbdff8606e7d87b9d5824", 240 | "sha256:ba59c92039ec0a66103b1d5fe588fa546373587a7d68f5c96f743c3396afc04b", 241 | "sha256:bc8d3bd6c72b2dd9decf16ce70e20abcb3274ba01b4e1c96031e0c4067d1e7cd", 242 | "sha256:bc9db8a3efb3e403e4ecc6cd9489ea2bac94244f80c78e27c31dcc00d2790ac2", 243 | "sha256:bf7d9fce9bcc4752ca4a1b80aabd38f6d19009ea5cbda0e0856983cf6d0023f5", 244 | "sha256:c2dbb44c3f7e6c4d3487b31037b1bdbf424d97687c1747ce4ff2895795c9bf69", 245 | "sha256:c79ebe8f3676a4c6630fd3f777f3cfecf9289666c84e775a67d1d358578dc2e3", 246 | "sha256:c97528e64cb9ebeff9701e7938653a9951922f2a38bd847787d4a8e498cc83ae", 247 | "sha256:d0611a0a2a518464c05ddd5a3a1a0e856ccc10e67079bb17f265ad19ab3c7597", 248 | "sha256:d06adcfa62a4431d404c31216f0f8ac97397d799cd53800e9d3efc2fbb3cf14e", 249 | "sha256:d41997519fcba4a1e46eb4a2fe31bc12f0ff957b2b81bac28db24744f333e955", 250 | "sha256:d5b61785a9ce44e5a4b880272baa7cf6c8f48a5180c3e81c59553ba0cb0821ca", 251 | "sha256:da152d8cdcab0e56e4f45eb08b9aea6455845ec83172092f09b0e077ece2cf7a", 252 | "sha256:da7e547706e69e45d95e116e6939488d62174e033b763ab1496b4c29b76fabea", 253 | "sha256:db5283d90da4174865d520e7366801a93777201e91e79bacbac6e6927cbceede", 254 | "sha256:db608a6757adabb32f1cfe6066e39b3706d8c3aa69bbc353a5b61edad36a5cb4", 255 | "sha256:e0ea21f66820452a3f5d1655f8704a60d66ba1191359b96541eaf457710a5fc6", 256 | "sha256:e7da3fec7408813a7cebc9e4ec55afed2d0fd65c4754bc376bf03498d4e92686", 257 | "sha256:e92a513161077b53447160b9bd8f522edfbed4bd9759e4c18ab05d7ef7e49408", 258 | "sha256:ecb1fa0db7bf4cff9dac752abb19505a233c7f16684c5826d1f11ebd9472b871", 259 | 
"sha256:efda5fc8cc1c61e4f639b8067d118e742b812c930f708e6667a5ce0d13499e29", 260 | "sha256:f0a1dbdb5ecbef0d34eb77e56fcb3e95bbd7e50835d9782a45df81cc46949750", 261 | "sha256:f0a71d85ecdd570ded8ac3d1c0f480842f49a40beb423bb8014539a9f32a5897", 262 | "sha256:f4f270de01dd3e129a72efad823da90cc4d6aafb64c410c9033aba70db9f1ff0", 263 | "sha256:f6cb459eea32a4e2cf18ba5fcece2dbdf496384413bc1bae15583f19e567f3b2", 264 | "sha256:f8ad8285b01b0d4695102546b342b493b3ccc6781fc28c8c6a1bb63e95d22f09", 265 | "sha256:f9f39e2f049db33a908319cf46624a569b36983c7c78318e9726a4cb8923b26c" 266 | ], 267 | "markers": "python_version >= '3.7'", 268 | "version": "==1.4.4" 269 | }, 270 | "markupsafe": { 271 | "hashes": [ 272 | "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003", 273 | "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88", 274 | "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5", 275 | "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7", 276 | "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a", 277 | "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603", 278 | "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1", 279 | "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135", 280 | "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247", 281 | "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6", 282 | "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601", 283 | "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77", 284 | "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02", 285 | "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e", 286 | "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63", 287 | "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f", 288 | "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980", 289 | "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b", 290 | "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812", 291 | "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff", 292 | "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96", 293 | "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1", 294 | "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925", 295 | "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a", 296 | "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6", 297 | "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e", 298 | "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f", 299 | "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4", 300 | "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f", 301 | "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3", 302 | "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c", 303 | "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a", 304 | "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417", 305 | "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a", 306 | "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a", 307 | 
"sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37", 308 | "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452", 309 | "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933", 310 | "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a", 311 | "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7" 312 | ], 313 | "markers": "python_version >= '3.7'", 314 | "version": "==2.1.1" 315 | }, 316 | "matplotlib": { 317 | "hashes": [ 318 | "sha256:01c9de93a2ca0d128c9064f23709362e7fefb34910c7c9e0b8ab0de8258d5eda", 319 | "sha256:41b6e307458988891fcdea2d8ecf84a8c92d53f84190aa32da65f9505546e684", 320 | "sha256:48e1e0859b54d5f2e29bb78ca179fd59b971c6ceb29977fb52735bfd280eb0f5", 321 | "sha256:54a026055d5f8614f184e588f6e29064019a0aa8448450214c0b60926d62d919", 322 | "sha256:556965514b259204637c360d213de28d43a1f4aed1eca15596ce83f768c5a56f", 323 | "sha256:5c988bb43414c7c2b0a31bd5187b4d27fd625c080371b463a6d422047df78913", 324 | "sha256:6a724e3a48a54b8b6e7c4ae38cd3d07084508fa47c410c8757e9db9791421838", 325 | "sha256:6be8df61b1626e1a142c57e065405e869e9429b4a6dab4a324757d0dc4d42235", 326 | "sha256:844a7b0233e4ff7fba57e90b8799edaa40b9e31e300b8d5efc350937fa8b1bea", 327 | "sha256:85f0c9cf724715e75243a7b3087cf4a3de056b55e05d4d76cc58d610d62894f3", 328 | "sha256:a78a3b51f29448c7f4d4575e561f6b0dbb8d01c13c2046ab6c5220eb25c06506", 329 | "sha256:b884715a59fec9ad3b6048ecf3860f3b2ce965e676ef52593d6fa29abcf7d330", 330 | "sha256:b8b53f336a4688cfce615887505d7e41fd79b3594bf21dd300531a4f5b4f746a", 331 | "sha256:c70b6311dda3e27672f1bf48851a0de816d1ca6aaf3d49365fbdd8e959b33d2b", 332 | "sha256:ebfb01a65c3f5d53a8c2a8133fec2b5221281c053d944ae81ff5822a68266617", 333 | "sha256:eeb1859efe7754b1460e1d4991bbd4a60a56f366bc422ef3a9c5ae05f0bc70b5", 334 | "sha256:f15edcb0629a0801738925fe27070480f446fcaa15de65946ff946ad99a59a40", 335 | "sha256:f1c5efc278d996af8a251b2ce0b07bbeccb821f25c8c9846bdcb00ffc7f158aa", 336 | "sha256:f72657f1596199dc1e4e7a10f52a4784ead8a711f4e5b59bea95bdb97cf0e4fd", 337 | "sha256:fc4f526dfdb31c9bd6b8ca06bf9fab663ca12f3ec9cdf4496fb44bc680140318", 338 | "sha256:fcd6f1954943c0c192bfbebbac263f839d7055409f1173f80d8b11a224d236da" 339 | ], 340 | "index": "pypi", 341 | "version": "==3.4.3" 342 | }, 343 | "munch": { 344 | "hashes": [ 345 | "sha256:2d735f6f24d4dba3417fa448cae40c6e896ec1fdab6cdb5e6510999758a4dbd2", 346 | "sha256:6f44af89a2ce4ed04ff8de41f70b226b984db10a91dcc7b9ac2efc1c77022fdd" 347 | ], 348 | "version": "==2.5.0" 349 | }, 350 | "mysql-connector-python": { 351 | "hashes": [ 352 | "sha256:049374e54441903022f1c277a7467e4e7cf72a8d89ca26e86d4fa26b7157346c", 353 | "sha256:08e8bdb0b0cd247213764d115433972d0f5d103a00eb9cd0330294bdbb58cbca", 354 | "sha256:1ebbec05a4279bb2068e270c92f50101cfddb1c551d8c588f34097cde89d8344", 355 | "sha256:2af9bf324649d056e8f1e0f212a046c8794a6b5ac4d7fa2be600db443d0b57ba", 356 | "sha256:47e391ecae349e75ecffb513aec47ec3dbcfc8e2222ef9bd0b0494029eaa2a1b", 357 | "sha256:542d692b8284f8185a8f75f70c9d6c13eef80d2d530444b4f7f130868253e9f9", 358 | "sha256:64078ca692aa7e403e1660d4f8cd50816fee52e28827a9dd10d1cc4fc7ca5339", 359 | "sha256:667c712c0464527faee977d5db48f308e6b2d64396de0b5ba3fd459eda0653d0", 360 | "sha256:6ec8ae4b51487f8b2d542b02e7026dddec92f29239daef2dbfcfbaa9fd5503f2", 361 | "sha256:75fc7a089f1626ffbd22986090ca7cc3359c77ab9c4bde4bab1e30e15d4cbfd9", 362 | "sha256:7892dca8fc03a6e6131bc7359650064085ca803ae1406a104f55470e1c700668", 363 | 
"sha256:7a63dfded577f0a1800c863c4e9bcff7b583bcd369fc1eb4c2ec44b1f907e295", 364 | "sha256:7fa3c4b571e5bab629dbce6013b36ff42efdfe47da6ff14cee25acd1a77649bb", 365 | "sha256:86dc8e57082ee8fd631edeed5299396bd7d842fe455f5347e1ad08ace38b22ea", 366 | "sha256:8a3a8605c5380870a898b4a52c5b0d138e7cb998b192f10552373782d003886d", 367 | "sha256:94abfd76c6ad36f1bcf96f49d76dd55b9e09767eea972669baba9fc385fd9a46", 368 | "sha256:977ba6abdca01840afe27e461ec3a79550b50499782e5ff2933e513a52777870", 369 | "sha256:a261552ef3a2e865a76b751ba7ac3d1d1c4cbc8f167f39436343ef56c8d46d5a", 370 | "sha256:a7b93c14ef59d035e4277a9d637309e8057256efb073cb3db78337ff62c6099b", 371 | "sha256:a836d47f54ee50065ac98917513f2da50957c9cb809daaa144c9f2ab50afbc6f", 372 | "sha256:aa3d5e3656b3b418430b8c5e821f0a9329530a22fe717815c76dba524714d3ff", 373 | "sha256:ad393ddc1974da2b4e952156c3b1a8316f1cb14555b1ea83db6c3619232f8d89", 374 | "sha256:ae17753a4034a79d6ecc9163f8b5c3ea8a9c1ac2c7dac8c0a24b97102b253d26", 375 | "sha256:b947650179a4778d7e13b354a3c7c3b5e13ec00d86727375a0cbba0b43ade82c", 376 | "sha256:bca758bf9e4d936cc745ae4f51472217c0ebcfd54d4aaa85974f0620ac4633cc", 377 | "sha256:cf0c8e41edcd8a02f9ccbe925160ef12486111fcb2641d4551e3b2578afbe2c4", 378 | "sha256:d15136f44fe36c135295719b2635686dbbe1b8043297b3420129368000cf2820", 379 | "sha256:d3469c512a5a48809feeffc34df4c53667ee7b8795ff6e56c90861e1f5386763", 380 | "sha256:d7cccd804cafd2d15c731d06a38a88adf93ece684dd5f68b2bc77c04ed9f4131", 381 | "sha256:dbfe5cd52386a46fd32b59ff7b03974e39ead0bdbb3d23639b8c2dede00ebcdb", 382 | "sha256:de6f3daa99242fcf559d87466ea95f37b6b9cd7257be516440abe6e925548ef9", 383 | "sha256:e12481264dc938178d8225dd06590a6d16dbb1f8af51a7748cffe521afb52546", 384 | "sha256:fcfb722e748ec9219d5caee7c73855e93e67c7c57cd790e49d37c1c8571ba040" 385 | ], 386 | "index": "pypi", 387 | "version": "==8.0.27" 388 | }, 389 | "numpy": { 390 | "hashes": [ 391 | "sha256:01dd17cbb340bf0fc23981e52e1d18a9d4050792e8fb8363cecbf066a84b827d", 392 | "sha256:06005a2ef6014e9956c09ba07654f9837d9e26696a0470e42beedadb78c11b07", 393 | "sha256:09b7847f7e83ca37c6e627682f145856de331049013853f344f37b0c9690e3df", 394 | "sha256:0aaee12d8883552fadfc41e96b4c82ee7d794949e2a7c3b3a7201e968c7ecab9", 395 | "sha256:0cbe9848fad08baf71de1a39e12d1b6310f1d5b2d0ea4de051058e6e1076852d", 396 | "sha256:1b1766d6f397c18153d40015ddfc79ddb715cabadc04d2d228d4e5a8bc4ded1a", 397 | "sha256:33161613d2269025873025b33e879825ec7b1d831317e68f4f2f0f84ed14c719", 398 | "sha256:5039f55555e1eab31124a5768898c9e22c25a65c1e0037f4d7c495a45778c9f2", 399 | "sha256:522e26bbf6377e4d76403826ed689c295b0b238f46c28a7251ab94716da0b280", 400 | "sha256:56e454c7833e94ec9769fa0f86e6ff8e42ee38ce0ce1fa4cbb747ea7e06d56aa", 401 | "sha256:58f545efd1108e647604a1b5aa809591ccd2540f468a880bedb97247e72db387", 402 | "sha256:5e05b1c973a9f858c74367553e236f287e749465f773328c8ef31abe18f691e1", 403 | "sha256:7903ba8ab592b82014713c491f6c5d3a1cde5b4a3bf116404e08f5b52f6daf43", 404 | "sha256:8969bfd28e85c81f3f94eb4a66bc2cf1dbdc5c18efc320af34bffc54d6b1e38f", 405 | "sha256:92c8c1e89a1f5028a4c6d9e3ccbe311b6ba53694811269b992c0b224269e2398", 406 | "sha256:9c88793f78fca17da0145455f0d7826bcb9f37da4764af27ac945488116efe63", 407 | "sha256:a7ac231a08bb37f852849bbb387a20a57574a97cfc7b6cabb488a4fc8be176de", 408 | "sha256:abdde9f795cf292fb9651ed48185503a2ff29be87770c3b8e2a14b0cd7aa16f8", 409 | "sha256:af1da88f6bc3d2338ebbf0e22fe487821ea4d8e89053e25fa59d1d79786e7481", 410 | "sha256:b2a9ab7c279c91974f756c84c365a669a887efa287365a8e2c418f8b3ba73fb0", 411 | 
"sha256:bf837dc63ba5c06dc8797c398db1e223a466c7ece27a1f7b5232ba3466aafe3d", 412 | "sha256:ca51fcfcc5f9354c45f400059e88bc09215fb71a48d3768fb80e357f3b457e1e", 413 | "sha256:ce571367b6dfe60af04e04a1834ca2dc5f46004ac1cc756fb95319f64c095a96", 414 | "sha256:d208a0f8729f3fb790ed18a003f3a57895b989b40ea4dce4717e9cf4af62c6bb", 415 | "sha256:dbee87b469018961d1ad79b1a5d50c0ae850000b639bcb1b694e9981083243b6", 416 | "sha256:e9f4c4e51567b616be64e05d517c79a8a22f3606499941d97bb76f2ca59f982d", 417 | "sha256:f063b69b090c9d918f9df0a12116029e274daf0181df392839661c4c7ec9018a", 418 | "sha256:f9a909a8bae284d46bbfdefbdd4a262ba19d3bc9921b1e76126b1d21c3c34135" 419 | ], 420 | "markers": "python_version >= '3.8'", 421 | "version": "==1.23.5" 422 | }, 423 | "pandas": { 424 | "hashes": [ 425 | "sha256:003ba92db58b71a5f8add604a17a059f3068ef4e8c0c365b088468d0d64935fd", 426 | "sha256:10e10a2527db79af6e830c3d5842a4d60383b162885270f8cffc15abca4ba4a9", 427 | "sha256:22808afb8f96e2269dcc5b846decacb2f526dd0b47baebc63d913bf847317c8f", 428 | "sha256:2d1dc09c0013d8faa7474574d61b575f9af6257ab95c93dcf33a14fd8d2c1bab", 429 | "sha256:35c77609acd2e4d517da41bae0c11c70d31c87aae8dd1aabd2670906c6d2c143", 430 | "sha256:372d72a3d8a5f2dbaf566a5fa5fa7f230842ac80f29a931fb4b071502cf86b9a", 431 | "sha256:42493f8ae67918bf129869abea8204df899902287a7f5eaf596c8e54e0ac7ff4", 432 | "sha256:4acc28364863127bca1029fb72228e6f473bb50c32e77155e80b410e2068eeac", 433 | "sha256:5298a733e5bfbb761181fd4672c36d0c627320eb999c59c65156c6a90c7e1b4f", 434 | "sha256:5ba0aac1397e1d7b654fccf263a4798a9e84ef749866060d19e577e927d66e1b", 435 | "sha256:9707bdc1ea9639c886b4d3be6e2a45812c1ac0c2080f94c31b71c9fa35556f9b", 436 | "sha256:a2aa18d3f0b7d538e21932f637fbfe8518d085238b429e4790a35e1e44a96ffc", 437 | "sha256:a388960f979665b447f0847626e40f99af8cf191bce9dc571d716433130cb3a7", 438 | "sha256:a51528192755f7429c5bcc9e80832c517340317c861318fea9cea081b57c9afd", 439 | "sha256:b528e126c13816a4374e56b7b18bfe91f7a7f6576d1aadba5dee6a87a7f479ae", 440 | "sha256:c1aa4de4919358c5ef119f6377bc5964b3a7023c23e845d9db7d9016fa0c5b1c", 441 | "sha256:c2646458e1dce44df9f71a01dc65f7e8fa4307f29e5c0f2f92c97f47a5bf22f5", 442 | "sha256:c2f44425594ae85e119459bb5abb0748d76ef01d9c08583a667e3339e134218e", 443 | "sha256:d47750cf07dee6b55d8423471be70d627314277976ff2edd1381f02d52dbadf9", 444 | "sha256:d99d2350adb7b6c3f7f8f0e5dfb7d34ff8dd4bc0a53e62c445b7e43e163fce63", 445 | "sha256:dd324f8ee05925ee85de0ea3f0d66e1362e8c80799eb4eb04927d32335a3e44a", 446 | "sha256:eaca36a80acaacb8183930e2e5ad7f71539a66805d6204ea88736570b2876a7b", 447 | "sha256:f567e972dce3bbc3a8076e0b675273b4a9e8576ac629149cf8286ee13c259ae5", 448 | "sha256:fe48e4925455c964db914b958f6e7032d285848b7538a5e1b19aeb26ffaea3ec" 449 | ], 450 | "index": "pypi", 451 | "version": "==1.3.4" 452 | }, 453 | "pillow": { 454 | "hashes": [ 455 | "sha256:03150abd92771742d4a8cd6f2fa6246d847dcd2e332a18d0c15cc75bf6703040", 456 | "sha256:073adb2ae23431d3b9bcbcff3fe698b62ed47211d0716b067385538a1b0f28b8", 457 | "sha256:0b07fffc13f474264c336298d1b4ce01d9c5a011415b79d4ee5527bb69ae6f65", 458 | "sha256:0b7257127d646ff8676ec8a15520013a698d1fdc48bc2a79ba4e53df792526f2", 459 | "sha256:12ce4932caf2ddf3e41d17fc9c02d67126935a44b86df6a206cf0d7161548627", 460 | "sha256:15c42fb9dea42465dfd902fb0ecf584b8848ceb28b41ee2b58f866411be33f07", 461 | "sha256:18498994b29e1cf86d505edcb7edbe814d133d2232d256db8c7a8ceb34d18cef", 462 | "sha256:1c7c8ae3864846fc95f4611c78129301e203aaa2af813b703c55d10cc1628535", 463 | "sha256:22b012ea2d065fd163ca096f4e37e47cd8b59cf4b0fd47bfca6abb93df70b34c", 464 | 
"sha256:276a5ca930c913f714e372b2591a22c4bd3b81a418c0f6635ba832daec1cbcfc", 465 | "sha256:2e0918e03aa0c72ea56edbb00d4d664294815aa11291a11504a377ea018330d3", 466 | "sha256:3033fbe1feb1b59394615a1cafaee85e49d01b51d54de0cbf6aa8e64182518a1", 467 | "sha256:3168434d303babf495d4ba58fc22d6604f6e2afb97adc6a423e917dab828939c", 468 | "sha256:32a44128c4bdca7f31de5be641187367fe2a450ad83b833ef78910397db491aa", 469 | "sha256:3dd6caf940756101205dffc5367babf288a30043d35f80936f9bfb37f8355b32", 470 | "sha256:40e1ce476a7804b0fb74bcfa80b0a2206ea6a882938eaba917f7a0f004b42502", 471 | "sha256:41e0051336807468be450d52b8edd12ac60bebaa97fe10c8b660f116e50b30e4", 472 | "sha256:4390e9ce199fc1951fcfa65795f239a8a4944117b5935a9317fb320e7767b40f", 473 | "sha256:502526a2cbfa431d9fc2a079bdd9061a2397b842bb6bc4239bb176da00993812", 474 | "sha256:51e0e543a33ed92db9f5ef69a0356e0b1a7a6b6a71b80df99f1d181ae5875636", 475 | "sha256:57751894f6618fd4308ed8e0c36c333e2f5469744c34729a27532b3db106ee20", 476 | "sha256:5d77adcd56a42d00cc1be30843d3426aa4e660cab4a61021dc84467123f7a00c", 477 | "sha256:655a83b0058ba47c7c52e4e2df5ecf484c1b0b0349805896dd350cbc416bdd91", 478 | "sha256:68943d632f1f9e3dce98908e873b3a090f6cba1cbb1b892a9e8d97c938871fbe", 479 | "sha256:6c738585d7a9961d8c2821a1eb3dcb978d14e238be3d70f0a706f7fa9316946b", 480 | "sha256:73bd195e43f3fadecfc50c682f5055ec32ee2c933243cafbfdec69ab1aa87cad", 481 | "sha256:772a91fc0e03eaf922c63badeca75e91baa80fe2f5f87bdaed4280662aad25c9", 482 | "sha256:77ec3e7be99629898c9a6d24a09de089fa5356ee408cdffffe62d67bb75fdd72", 483 | "sha256:7db8b751ad307d7cf238f02101e8e36a128a6cb199326e867d1398067381bff4", 484 | "sha256:801ec82e4188e935c7f5e22e006d01611d6b41661bba9fe45b60e7ac1a8f84de", 485 | "sha256:82409ffe29d70fd733ff3c1025a602abb3e67405d41b9403b00b01debc4c9a29", 486 | "sha256:828989c45c245518065a110434246c44a56a8b2b2f6347d1409c787e6e4651ee", 487 | "sha256:829f97c8e258593b9daa80638aee3789b7df9da5cf1336035016d76f03b8860c", 488 | "sha256:871b72c3643e516db4ecf20efe735deb27fe30ca17800e661d769faab45a18d7", 489 | "sha256:89dca0ce00a2b49024df6325925555d406b14aa3efc2f752dbb5940c52c56b11", 490 | "sha256:90fb88843d3902fe7c9586d439d1e8c05258f41da473952aa8b328d8b907498c", 491 | "sha256:97aabc5c50312afa5e0a2b07c17d4ac5e865b250986f8afe2b02d772567a380c", 492 | "sha256:9aaa107275d8527e9d6e7670b64aabaaa36e5b6bd71a1015ddd21da0d4e06448", 493 | "sha256:9f47eabcd2ded7698106b05c2c338672d16a6f2a485e74481f524e2a23c2794b", 494 | "sha256:a0a06a052c5f37b4ed81c613a455a81f9a3a69429b4fd7bb913c3fa98abefc20", 495 | "sha256:ab388aaa3f6ce52ac1cb8e122c4bd46657c15905904b3120a6248b5b8b0bc228", 496 | "sha256:ad58d27a5b0262c0c19b47d54c5802db9b34d38bbf886665b626aff83c74bacd", 497 | "sha256:ae5331c23ce118c53b172fa64a4c037eb83c9165aba3a7ba9ddd3ec9fa64a699", 498 | "sha256:af0372acb5d3598f36ec0914deed2a63f6bcdb7b606da04dc19a88d31bf0c05b", 499 | "sha256:afa4107d1b306cdf8953edde0534562607fe8811b6c4d9a486298ad31de733b2", 500 | "sha256:b03ae6f1a1878233ac620c98f3459f79fd77c7e3c2b20d460284e1fb370557d4", 501 | "sha256:b0915e734b33a474d76c28e07292f196cdf2a590a0d25bcc06e64e545f2d146c", 502 | "sha256:b4012d06c846dc2b80651b120e2cdd787b013deb39c09f407727ba90015c684f", 503 | "sha256:b472b5ea442148d1c3e2209f20f1e0bb0eb556538690fa70b5e1f79fa0ba8dc2", 504 | "sha256:b59430236b8e58840a0dfb4099a0e8717ffb779c952426a69ae435ca1f57210c", 505 | "sha256:b90f7616ea170e92820775ed47e136208e04c967271c9ef615b6fbd08d9af0e3", 506 | "sha256:b9a65733d103311331875c1dca05cb4606997fd33d6acfed695b1232ba1df193", 507 | 
"sha256:bac18ab8d2d1e6b4ce25e3424f709aceef668347db8637c2296bcf41acb7cf48", 508 | "sha256:bca31dd6014cb8b0b2db1e46081b0ca7d936f856da3b39744aef499db5d84d02", 509 | "sha256:be55f8457cd1eac957af0c3f5ece7bc3f033f89b114ef30f710882717670b2a8", 510 | "sha256:c7025dce65566eb6e89f56c9509d4f628fddcedb131d9465cacd3d8bac337e7e", 511 | "sha256:c935a22a557a560108d780f9a0fc426dd7459940dc54faa49d83249c8d3e760f", 512 | "sha256:dbb8e7f2abee51cef77673be97760abff1674ed32847ce04b4af90f610144c7b", 513 | "sha256:e6ea6b856a74d560d9326c0f5895ef8050126acfdc7ca08ad703eb0081e82b74", 514 | "sha256:ebf2029c1f464c59b8bdbe5143c79fa2045a581ac53679733d3a91d400ff9efb", 515 | "sha256:f1ff2ee69f10f13a9596480335f406dd1f70c3650349e2be67ca3139280cade0" 516 | ], 517 | "index": "pypi", 518 | "version": "==9.3.0" 519 | }, 520 | "protobuf": { 521 | "hashes": [ 522 | "sha256:2c9c2ed7466ad565f18668aa4731c535511c5d9a40c6da39524bccf43e441719", 523 | "sha256:48e2cd6b88c6ed3d5877a3ea40df79d08374088e89bedc32557348848dff250b", 524 | "sha256:5b0834e61fb38f34ba8840d7dcb2e5a2f03de0c714e0293b3963b79db26de8ce", 525 | "sha256:61f21493d96d2a77f9ca84fefa105872550ab5ef71d21c458eb80edcf4885a99", 526 | "sha256:6e0be9f09bf9b6cf497b27425487706fa48c6d1632ddd94dab1a5fe11a422392", 527 | "sha256:6e312e280fbe3c74ea9e080d9e6080b636798b5e3939242298b591064470b06b", 528 | "sha256:7eb8f2cc41a34e9c956c256e3ac766cf4e1a4c9c925dc757a41a01be3e852965", 529 | "sha256:84ea107016244dfc1eecae7684f7ce13c788b9a644cd3fca5b77871366556444", 530 | "sha256:9227c14010acd9ae7702d6467b4625b6fe853175a6b150e539b21d2b2f2b409c", 531 | "sha256:a419cc95fca8694804709b8c4f2326266d29659b126a93befe210f5bbc772536", 532 | "sha256:a7d0ea43949d45b836234f4ebb5ba0b22e7432d065394b532cdca8f98415e3cf", 533 | "sha256:b5ab0b8918c136345ff045d4b3d5f719b505b7c8af45092d7f45e304f55e50a1", 534 | "sha256:e575c57dc8b5b2b2caa436c16d44ef6981f2235eb7179bfc847557886376d740", 535 | "sha256:f9eae277dd240ae19bb06ff4e2346e771252b0e619421965504bd1b1bba7c5fa" 536 | ], 537 | "markers": "python_version >= '3.7'", 538 | "version": "==4.21.9" 539 | }, 540 | "py4j": { 541 | "hashes": [ 542 | "sha256:624f97c363b8dd84822bc666b12fa7f7d97824632b2ff3d852cc491359ce7615", 543 | "sha256:bf0485388e415ff26710d2dc719cb0ede16cf1164b1ee757e0ebb2e98c471521" 544 | ], 545 | "version": "==0.10.9.2" 546 | }, 547 | "pyparsing": { 548 | "hashes": [ 549 | "sha256:2b020ecf7d21b687f219b71ecad3631f644a47f01403fa1d1036b0c6416d70fb", 550 | "sha256:5026bae9a10eeaefb61dab2f09052b9f4307d44aee4eda64b309723d8d206bbc" 551 | ], 552 | "markers": "python_full_version >= '3.6.8'", 553 | "version": "==3.0.9" 554 | }, 555 | "pyproj": { 556 | "hashes": [ 557 | "sha256:0fff9c3a991508f16027be27d153f6c5583d03799443639d13c681e60f49e2d7", 558 | "sha256:12f62c20656ac9b6076ebb213e9a635d52f4f01fef95310121d337e62e910cb6", 559 | "sha256:14ad113b5753c6057f9b2f3c85a6497cef7fa237c4328f2943c0223e98c1dde6", 560 | "sha256:1f9c100fd0fd80edbc7e4daa303600a8cbef6f0de43d005617acb38276b88dc0", 561 | "sha256:221d8939685e0c43ee594c9f04b6a73a10e8e1cc0e85f28be0b4eb2f1bc8777d", 562 | "sha256:25a36e297f3e0524694d40259e3e895edc1a47492a0e30608268ffc1328e3f5d", 563 | "sha256:2cb8592259ea54e7557523b079d3f2304081680bdb48bfbf0fd879ee6156129c", 564 | "sha256:3b85acf09e5a9e35cd9ee72989793adb7089b4e611be02a43d3d0bda50ad116b", 565 | "sha256:45554f47d1a12a84b0620e4abc08a2a1b5d9f273a4759eaef75e74788ec7162a", 566 | "sha256:4688b4cd62cbd86b5e855f9e27d90fbb53f2b4c2ea1cd394a46919e1a4151b89", 567 | "sha256:47ad53452ae1dc8b0bf1df920a210bb5616989085aa646592f8681f1d741a754", 568 | 
"sha256:48787962232109bad8b72e27949037a9b03591228a6955f25dbe451233e8648a", 569 | "sha256:4a23d84c5ffc383c7d9f0bde3a06fc1f6697b1b96725597f8f01e7b4bef0a2b5", 570 | "sha256:4e161114bc92701647a83c4bbce79489984f12d980cabb365516e953d1450885", 571 | "sha256:4fd425ee8b6781c249c7adb7daa2e6c41ce573afabe4f380f5eecd913b56a3be", 572 | "sha256:52e54796e2d9554a5eb8f11df4748af1fbbc47f76aa234d6faf09216a84554c5", 573 | "sha256:5816807ca0bdc7256558770c6206a6783a3f02bcf844f94ee245f197bb5f7285", 574 | "sha256:65a0bcdbad95b3c00b419e5d75b1f7e450ec17349b5ea16bf7438ac1d50a12a2", 575 | "sha256:77d5f519f3cdb94b026ecca626f78db4f041afe201cf082079c8c0092a30b087", 576 | "sha256:82200b4569d68b421c079d2973475b58d5959306fe758b43366e79fe96facfe5", 577 | "sha256:954b068136518b3174d0a99448056e97af62b63392a95c420894f7de2229dae6", 578 | "sha256:9a496d9057b2128db9d733e66b206f2d5954bbae6b800d412f562d780561478c", 579 | "sha256:a454a7c4423faa2a14e939d08ef293ee347fa529c9df79022b0585a6e1d8310c", 580 | "sha256:a708445927ace9857f52c3ba67d2915da7b41a8fdcd9b8f99a4c9ed60a75eb33", 581 | "sha256:aa5171f700f174777a9e9ed8f4655583243967c0f9cf2c90e3f54e54ff740134", 582 | "sha256:ccb4b70ad25218027f77e0c8934d10f9b7cdf91d5e64080147743d58fddbc3c0", 583 | "sha256:d94afed99f31673d3d19fe750283621e193e2a53ca9e0443bf9d092c3905833b", 584 | "sha256:e7e609903572a56cca758bbaee5c1663c3e829ddce5eec4f368e68277e37022b", 585 | "sha256:f343725566267a296b09ee7e591894f1fdc90f84f8ad5ec476aeb53bd4479c07", 586 | "sha256:f80adda8c54b84271a93829477a01aa57bc178c834362e9f74e1de1b5033c74c" 587 | ], 588 | "markers": "python_version >= '3.8'", 589 | "version": "==3.4.0" 590 | }, 591 | "pyspark": { 592 | "hashes": [ 593 | "sha256:bfea06179edbfb4bc76a0f470bd3c38e12f00e1023e3ad0373558d07cff102ab" 594 | ], 595 | "index": "pypi", 596 | "version": "==3.2.0" 597 | }, 598 | "python-dateutil": { 599 | "hashes": [ 600 | "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", 601 | "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" 602 | ], 603 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 604 | "version": "==2.8.2" 605 | }, 606 | "python-dotenv": { 607 | "hashes": [ 608 | "sha256:14f8185cc8d494662683e6914addcb7e95374771e707601dfc70166946b4c4b8", 609 | "sha256:bbd3da593fc49c249397cbfbcc449cf36cb02e75afc8157fcc6a81df6fb7750a" 610 | ], 611 | "index": "pypi", 612 | "version": "==0.19.1" 613 | }, 614 | "pytz": { 615 | "hashes": [ 616 | "sha256:222439474e9c98fced559f1709d89e6c9cbf8d79c794ff3eb9f8800064291427", 617 | "sha256:e89512406b793ca39f5971bc999cc538ce125c0e51c27941bef4568b460095e2" 618 | ], 619 | "version": "==2022.6" 620 | }, 621 | "requests": { 622 | "hashes": [ 623 | "sha256:7c5599b102feddaa661c826c56ab4fee28bfd17f5abca1ebbe3e7f19d7c97983", 624 | "sha256:8fefa2a1a1365bf5520aac41836fbee479da67864514bdb821f31ce07ce65349" 625 | ], 626 | "markers": "python_version >= '3.7' and python_version < '4'", 627 | "version": "==2.28.1" 628 | }, 629 | "setuptools": { 630 | "hashes": [ 631 | "sha256:6211d2f5eddad8757bd0484923ca7c0a6302ebc4ab32ea5e94357176e0ca0840", 632 | "sha256:d1eebf881c6114e51df1664bc2c9133d022f78d12d5f4f665b9191f084e2862d" 633 | ], 634 | "markers": "python_version >= '3.7'", 635 | "version": "==65.6.0" 636 | }, 637 | "shapely": { 638 | "hashes": [ 639 | "sha256:02dd5d7dc6e46515d88874134dc8fcdc65826bca93c3eecee59d1910c42c1b17", 640 | "sha256:0b4ee3132ee90f07d63db3aea316c4c065ed7a26231458dda0874414a09d6ba3", 641 | "sha256:0d885cb0cf670c1c834df3f371de8726efdf711f18e2a75da5cfa82843a7ab65", 642 | 
"sha256:147066da0be41b147a61f8eb805dea3b13709dbc873a431ccd7306e24d712bc0", 643 | "sha256:21776184516a16bf82a0c3d6d6a312b3cd15a4cabafc61ee01cf2714a82e8396", 644 | "sha256:2e0a8c2e55f1be1312b51c92b06462ea89e6bb703fab4b114e7a846d941cfc40", 645 | "sha256:2fd15397638df291c427a53d641d3e6fd60458128029c8c4f487190473a69a91", 646 | "sha256:3480657460e939f45a7d359ef0e172a081f249312557fe9aa78c4fd3a362d993", 647 | "sha256:370b574c78dc5af3a198a6da5d9b3d7c04654bd2ef7e80e80a3a0992dfb2d9cd", 648 | "sha256:38f0fbbcb8ca20c16451c966c1f527cc43968e121c8a048af19ed3e339a921cd", 649 | "sha256:4728666fff8cccc65a07448cae72c75a8773fea061c3f4f139c44adc429b18c3", 650 | "sha256:48dcfffb9e225c0481120f4bdf622131c8c95f342b00b158cdbe220edbbe20b6", 651 | "sha256:532a55ee2a6c52d23d6f7d1567c8f0473635f3b270262c44e1b0c88096827e22", 652 | "sha256:5d7f85c2d35d39ff53c9216bc76b7641c52326f7e09aaad1789a3611a0f812f2", 653 | "sha256:65b21243d8f6bcd421210daf1fabb9de84de2c04353c5b026173b88d17c1a581", 654 | "sha256:66bdac74fbd1d3458fa787191a90fa0ae610f09e2a5ec398c36f968cc0ed743f", 655 | "sha256:6d388c0c1bd878ed1af4583695690aa52234b02ed35f93a1c8486ff52a555838", 656 | "sha256:6fe855e7d45685926b6ba00aaeb5eba5862611f7465775dacd527e081a8ced6d", 657 | "sha256:753ed0e21ab108bd4282405b9b659f2e985e8502b1a72b978eaa51d3496dee19", 658 | "sha256:783bad5f48e2708a0e2f695a34ed382e4162c795cb2f0368b39528ac1d6db7ed", 659 | "sha256:78fb9d929b8ee15cfd424b6c10879ce1907f24e05fb83310fc47d2cd27088e40", 660 | "sha256:84010db15eb364a52b74ea8804ef92a6a930dfc1981d17a369444b6ddec66efd", 661 | "sha256:8d086591f744be483b34628b391d741e46f2645fe37594319e0a673cc2c26bcf", 662 | "sha256:8e59817b0fe63d34baedaabba8c393c0090f061917d18fc0bcc2f621937a8f73", 663 | "sha256:99a2f0da0109e81e0c101a2b4cd8412f73f5f299e7b5b2deaf64cd2a100ac118", 664 | "sha256:99ab0ddc05e44acabdbe657c599fdb9b2d82e86c5493bdae216c0c4018a82dee", 665 | "sha256:a23ef3882d6aa203dd3623a3d55d698f59bfbd9f8a3bfed52c2da05a7f0f8640", 666 | "sha256:a354199219c8d836f280b88f2c5102c81bb044ccea45bd361dc38a79f3873714", 667 | "sha256:a74631e511153366c6dbe3229fa93f877e3c87ea8369cd00f1d38c76b0ed9ace", 668 | "sha256:ab38f7b5196ace05725e407cb8cab9ff66edb8e6f7bb36a398e8f73f52a7aaa2", 669 | "sha256:adcf8a11b98af9375e32bff91de184f33a68dc48b9cb9becad4f132fa25cfa3c", 670 | "sha256:b65f5d530ba91e49ffc7c589255e878d2506a8b96ffce69d3b7c4500a9a9eaf8", 671 | "sha256:be9423d5a3577ac2e92c7e758bd8a2b205f5e51a012177a590bc46fc51eb4834", 672 | "sha256:c2822111ddc5bcfb116e6c663e403579d0fe3f147d2a97426011a191c43a7458", 673 | "sha256:c6a9a4a31cd6e86d0fbe8473ceed83d4fe760b19d949fb557ef668defafea0f6", 674 | "sha256:d048f93e42ba578b82758c15d8ae037d08e69d91d9872bca5a1895b118f4e2b0", 675 | "sha256:e9c30b311de2513555ab02464ebb76115d242842b29c412f5a9aa0cac57be9f6", 676 | "sha256:ec14ceca36f67cb48b34d02d7f65a9acae15cd72b48e303531893ba4a960f3ea", 677 | "sha256:ef3be705c3eac282a28058e6c6e5503419b250f482320df2172abcbea642c831" 678 | ], 679 | "markers": "python_version >= '3.6'", 680 | "version": "==1.8.5.post1" 681 | }, 682 | "six": { 683 | "hashes": [ 684 | "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", 685 | "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" 686 | ], 687 | "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", 688 | "version": "==1.16.0" 689 | }, 690 | "urllib3": { 691 | "hashes": [ 692 | "sha256:3fa96cf423e6987997fc326ae8df396db2a8b7c667747d47ddd8ecba91f4a74e", 693 | "sha256:b930dd878d5a8afb066a637fbb35144fe7901e3b209d1cd4f524bd0e9deee997" 694 | ], 695 | 
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' and python_version < '4'", 696 | "version": "==1.26.12" 697 | } 698 | }, 699 | "develop": {} 700 | } 701 | -------------------------------------------------------------------------------- /PyPI_README.md: -------------------------------------------------------------------------------- 1 | # SparkPlus 2 | Spark+는 H3, 위/경도 좌표 등의 공간 정보를 국내 주소체계(신주소/구주소)와 함께 처리할 수 있도록 지원하는 Package입니다. 3 | 4 | ## Setup 5 | 6 | [GitHub](https://github.com/SWM-SparkPlus/sparkplus/) 7 | [개발자 가이드 참고](https://github.com/SWM-SparkPlus/sparkplus/wiki) 8 | 9 | - Spark+는 PyPI에 배포되어 있으며, 다음 커맨드로 설치할 수 있습니다. 10 | ``` 11 | $ pip install sparkplus 12 | ``` 13 | 14 | - 설치 후에 import하여 사용할 수 있습니다. 15 | ``` 16 | from sparkplus.core import CoordDataFrame, RoadnameDataFrame, NumAddrDataFrame 17 | ``` 18 | 19 | ## Class 20 | 21 | ### CoordDataFrame 22 | 위치 좌표를 포함하는 데이터프레임을 주소체계 데이터베이스와 연동하여 pnu코드, h3, 우편번호, 법정동코드, 도로명주소(시도/시군구/읍면동/법정리/도로명/지하여부/건물 본번/건물 부번), 도로명주소(전체), 지번주소(시도/시군구/읍면동/법정리/지번 본번/지번 분번) 등의 컬럼을 추가합니다. 23 | ``` 24 | coord_df = CoordDataFrame(source_df, geo_df, table_df, x_colname, y_colname) 25 | ``` 26 | | 위도| 경도| PNU| manage_number|roadname_code|zipcode| sido|sigungu|eupmyeondong|bupjungli| roadname|is_basement|building_primary_number|building_secondary_number|jibun_primary_number|jibun_secondary_number|bupjungdong_code| 27 | |-----------|-----------|-------------------|--------------------|-------------|-------|----------|-------|------------|---------|---------------|-----------|-----------------------|-------------------------|--------------------|----------------------|----------------| 28 | |35.86341579|128.6024286|2711010600101990000|27110106001000300...| 271103007017| 41940| 대구광역시| 중구| 삼덕동2가| | 공평로| 0| 46| 0| 3| 4| 2711010600| 29 | |35.86516734|128.6105401|2711010700103790000|27110107001003100...| 271104223055| 41945| 대구광역시| 중구| 삼덕동3가| | 달구벌대로443길| 0| 62| 16| 31| 2| 2711010700| 30 | |35.86927185|128.5937782|2711011700101200003|27110115001008500...| 271102007001| 41909| 대구광역시| 중구| 남일동| | 중앙대로| 1| 424| 0| 143| 1| 2711011700| 31 | 32 | ### RoadnameDataFrame 33 | 비정형 도로명주소를 포함하는 데이터프레임을 주소체계 데이터베이스와 연동하여 분석 및 시각화할 수 있는 형태로 전처리한 시도, 시군구, 읍면동, 도로명, 건물 본번, 법정동코드 등의 컬럼을 추가합니다. 34 | ``` 35 | roadname_df = RoadnameDataFrame(source_df) 36 | ``` 37 | |target |sido |sigungu |roadname |building_primary_number|bupjungdong_code| 38 | |----------------------------------------|------|-----------|---------|-----------------------|----------------| 39 | |경기도 안산시 단원구 해봉로 137 |경기도 |안산시 단원구 |해봉로 |137 |4128112400 | 40 | |경기도 수원시 장안구 경수대로 1079 |경기도 |수원시 장안구 |경수대로 |1079 |4128111800 | 41 | |경기도 안산시 상록구 양달말길 93-7 |경기도 |안산시 상록구 |양달말길 |93 |4128101100 | 42 | 43 | 44 | ## LICENSE 45 | [MIT](https://github.com/SWM-SparkPlus/db-updater/blob/master/LICENSE) 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SparkPlus 2 | Spark+는 H3, 위/경도 좌표 등의 공간 정보를 국내 주소체계(신주소/구주소)와 함께 처리할 수 있도록 지원하는 Package입니다. 3 | 4 | ## Spark+ 아키텍처 5 | 6 | [RDW Reference Architecture](https://github.com/SWM-SparkPlus/rdw-reference-architecture) 7 | 8 | ![](https://github.com/SWM-SparkPlus/sparkplus/blob/master/static/sparkplus_arch_finale.png) 9 | 10 | 11 | ## Setup 12 | 13 | [개발자 가이드 참고](https://github.com/SWM-SparkPlus/sparkplus/wiki) 14 | 15 | - Spark+는 PyPI에 배포되어 있으며, 다음 커맨드로 설치할 수 있습니다. 
16 | ```s 17 | $ pip install sparkplus 18 | ``` 19 | 20 | - 설치 후에 import하여 사용할 수 있습니다. 21 | ```py 22 | from sparkplus.core import CoordDataFrame, AddressDataFrame 23 | ``` 24 | 25 | ## Class 26 | 27 | ### CoordDataFrame 28 | 위치 좌표를 포함하는 데이터프레임을 주소체계 데이터베이스와 연동하여 pnu코드, h3, 우편번호, 법정동코드, 도로명주소(시도/시군구/읍면동/법정리/도로명/지하여부/건물 본번/건물 부번), 도로명주소(전체), 지번주소(시도/시군구/읍면동/법정리/지번 본번/지번 분번) 등의 컬럼을 추가합니다. 29 | ```py 30 | res_df = CoordDataFrame(source_df, geo_df, table_df, x_colname, y_colname) 31 | 32 | # example 33 | +-----------+-----------+-------------------+--------------------+-------------+-------+----------+-------+------------+---------+---------------+-----------+-----------------------+-------------------------+--------------------+----------------------+----------------+ 34 | | 위도| 경도| PNU| manage_number|roadname_code|zipcode| sido|sigungu|eupmyeondong|bupjungli| roadname|is_basement|building_primary_number|building_secondary_number|jibun_primary_number|jibun_secondary_number|bupjungdong_code| 35 | +-----------+-----------+-------------------+--------------------+-------------+-------+----------+-------+------------+---------+---------------+-----------+-----------------------+-------------------------+--------------------+----------------------+----------------+ 36 | |35.86341579|128.6024286|2711010600101990000|27110106001000300...| 271103007017| 41940| 대구광역시| 중구| 삼덕동2가| | 공평로| 0| 46| 0| 3| 4| 2711010600| 37 | |35.86516734|128.6105401|2711010700103790000|27110107001003100...| 271104223055| 41945| 대구광역시| 중구| 삼덕동3가| | 달구벌대로443길| 0| 62| 16| 31| 2| 2711010700| 38 | |35.86927185|128.5937782|2711011700101200003|27110115001008500...| 271102007001| 41909| 대구광역시| 중구| 남일동| | 중앙대로| 1| 424| 0| 143| 1| 2711011700| 39 | +-----------+-----------+-------------------+--------------------+-------------+-------+----------+-------+------------+---------+---------------+-----------+-----------------------+-------------------------+--------------------+----------------------+----------------+ 40 | ``` 41 | 42 | ### AddressDataFrame 43 | 비정형 도로명주소 또는 지번주소를 포함하는 데이터프레임을 주소체계 데이터베이스와 연동하여 분석 및 시각화할 수 있는 형태의 시도, 시군구, 읍면동, 법정동코드, 시군구코드 등의 컬럼을 추가합니다. 
44 | ```py 45 | res_df = AddressDataFrame(source_df).to_bupjungdong("target_colname", table_df) 46 | 47 | # example 48 | +--------------------------+----------+------------+-----------------+----------------+------------+ 49 | | 받는분주소| sido_name|sigungu_name|eupmyeondong_name|bupjungdong_code|sigungu_code| 50 | +--------------------------+----------+------------+-----------------+----------------+------------+ 51 | | 서울특별시 강남구 가로수길 75| 서울특별시| 강남구| 신사동| 1168010700| 11680| 52 | | 서울특별시 강남구 강남대로 346| 서울특별시| 강남구| 역삼동| 1168010100| 11680| 53 | |서울특별시 강남구 논현로 120길 20| 서울특별시| 강남구| 논현동| 1168010800| 11680| 54 | +--------------------------+----------+------------+-----------------+----------------+------------+ 55 | ``` 56 | 57 | ## LICENSE 58 | [MIT](https://github.com/SWM-SparkPlus/db-updater/blob/master/LICENSE) 59 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | from pathlib import Path 3 | 4 | this_directory= Path(__file__).parent 5 | long_description = (this_directory / "PyPI_README.md").read_text() 6 | 7 | setup( 8 | name="sparkplus", 9 | version="1.3.0", 10 | description="GIS package for Apache Spark", 11 | long_description=long_description, 12 | long_description_content_type="text/markdown", 13 | author="sparkplus", 14 | author_email="meadea27@gmail.com", 15 | url="https://github.com/SWM-SparkPlus/sparkplus", 16 | license="MIT", 17 | # py_modules=['conversion', 'load_database'], 18 | python_requires=">=3", 19 | install_requires=[ 20 | "numpy", 21 | "pandas", 22 | "geopandas", 23 | "geospark", 24 | "h3", 25 | "geopy", 26 | "pyarrow", 27 | "rtree", 28 | "shapely", 29 | "python-dotenv", 30 | ], 31 | include_package_data=True, 32 | zip_safe=False, 33 | packages=find_packages(), 34 | keywords=["spark", "gis"], 35 | classifiers=[ 36 | "Programming Language :: Python :: 3", 37 | "License :: OSI Approved :: MIT License", 38 | "Operating System :: OS Independent", 39 | ], 40 | ) 41 | -------------------------------------------------------------------------------- /sparkplus/__init__.py: -------------------------------------------------------------------------------- 1 | # from .dependencies import spark 2 | from .core import CoordDataFrame, AddressDataFrame, load_tables, load_gdf 3 | 4 | __all__ = ["spark", "CoordDataFrame", "AddressDataFrame", "load_tables", "load_gdf"] 5 | -------------------------------------------------------------------------------- /sparkplus/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .coord_dataframe import CoordDataFrame 2 | from .address_dataframe import AddressDataFrame 3 | from .numaddr_dataframe import NumAddrDataFrame 4 | from .utils import load_tables, load_gdf 5 | from .tablename import ( 6 | EPrefix, 7 | ESido, 8 | get_tablename_by_prefix_and_sido, 9 | get_all_tablenames_by_prefix, 10 | ) 11 | 12 | __all__ = [ 13 | 
"CoordDataFrame", 14 | "AddressDataFrame", 15 | "NumAddrDataFrame", 16 | "load_tables", 17 | "load_gdf", 18 | "EPrefix", 19 | "ESido", 20 | "get_tablename_by_prefix_and_sido", 21 | "get_all_tablenames_by_prefix", 22 | ] 23 | -------------------------------------------------------------------------------- /sparkplus/core/address_dataframe.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | sys.path.append( 5 | os.path.dirname(os.path.dirname(os.path.abspath(os.path.dirname(__file__)))) 6 | ) 7 | 8 | from pyspark.sql import DataFrame 9 | from pyspark.sql.functions import split, col, lit 10 | from sparkplus.core.udfs import * 11 | from pyspark.sql.functions import when 12 | 13 | 14 | class AddressDataFrame(object): 15 | """ 16 | 도로명 주소를 활용하여 데이터를 분석하기 위한 클래스입니다 17 | """ 18 | 19 | def __init__(self, dataFrame: DataFrame): 20 | self._df = dataFrame 21 | self._tmp_df = dataFrame 22 | self.col_list = dataFrame.columns 23 | 24 | def to_bupjungdong(self, target: str, db_df: DataFrame): 25 | """ 26 | 도로명을 지번으로 변경하는 전 과정을 포함하는 함수입니다 27 | """ 28 | self.add_split(target) 29 | self.add_sido() 30 | self.add_sigungu() 31 | self.add_eupmyeon() 32 | self.add_dong() 33 | self.add_roadname() 34 | self.add_building_primary_number() 35 | self.add_jibun_primary_number() 36 | self.join_with_db(db_df) 37 | return self._df 38 | 39 | def add_split(self, target: str): 40 | """ 41 | DB에서 조회를 위해 원본의 string을 공백 기준으로 나누는 함수입니다. 42 | 43 | Parameters 44 | ---------- 45 | target : str 46 | split하고 조작할 원본 데이터의 컬럼명 47 | 48 | Examples 49 | -------- 50 | >>> road_df = RoadnameDataframe(your_df) 51 | >>> road_df._df.show() 52 | +------------------------------+s 53 | |target | 54 | +------------------------------+ 55 | |경기도 화성시 장안면 매바위로366번길 8 | 56 | |경기도 화성시 장안면 버들로 | 57 | |경기도 화성시 장안면 석포리 | 58 | +------------------------------+ 59 | 60 | >>> splited_df = road_df.add_split('target') 61 | >>> splited_df.show() 62 | +------------------------------+-----------------------------------+ 63 | |target |split | 64 | +------------------------------+-----------------------------------+ 65 | |경기도 화성시 장안면 매바위로366번길 8|[경기도, 화성시, 장안면, 매바위로366번길, 8]| 66 | |경기도 화성시 장안면 버들로 |[경기도, 화성시, 장안면, 버들로] | 67 | |경기도 화성시 장안면 석포리 |[경기도, 화성시, 장안면, 석포리] | 68 | +-----------------------------+------------------------------------+ 69 | """ 70 | self._df = self._df.withColumn("split", split(self._df[target], " ")) 71 | return self._df 72 | 73 | def cleanse_split_column(self): 74 | """ 75 | 주소가 비정형 데이터일 경우 사용되는 함수입니다. 76 | add_split_column 함수로 쪼개진 split 컬럼의 데이터를 전처리합니다. 77 | 78 | UDF 79 | --- 80 | where_is_sido : IntegerType 81 | split 컬럼에서 특별시와 광역시, 도를 찾고, 위치한 인덱스를 반환합니다. 82 | 83 | Exmaple 84 | ------- 85 | >>> df.show() 86 | +---------------------------------------------+ 87 | |split | 88 | +---------------------------------------------+ 89 | |[[185-74], 경기도, 화성시, 장안면,매바위로366번길, 8]| 90 | |[경기도, 화성시, 장안면, 버들로] | 91 | |[경기도, 화성시, 장안면, 석포리] | 92 | +--------------------------------------------+ 93 | 94 | >>> df.withColumn('idx', where_is_sido(split)).show() 95 | +---------------------------------------------+----+ 96 | |split |sido| 97 | +---------------------------------------------+----+ 98 | |[[185-74], 경기도, 화성시, 장안면,매바위로366번길, 8]| 1| 99 | |[경기도, 화성시, 장안면, 버들로] | 0| 100 | |[경기도, 화성시, 장안면, 석포리] | 2| 101 | +--------------------------------------------+----+ 102 | 103 | cleanse_split: ArrayType(StringType) 104 | split 컬럼과 인덱스 컬럼을 활용하여 알맞은 주소체계 값으로 반환합니다. 
105 | 106 | Example 107 | ------- 108 | >>> df.show() 109 | +------------------------------------------------+---+ 110 | |split |idx| 111 | +------------------------------------------------+---+ 112 | |[[185-74], 경기도, 화성시, 장안면,매바위로366번길, 8] | 1| 113 | |[경기도, 화성시, 장안면, 버들로] | 0| 114 | |[Gyeonggi-do, [185-74], 경기도, 화성시, 장안면, 석포리]| 2| 115 | +------------------------------------------------+---+ 116 | 117 | >>> df.withColumn('split', cleanse_split(df.split)) 118 | +----------------------------------------+ 119 | |split | 120 | +----------------------------------------+ 121 | |[경기도, 화성시, 장안면,매바위로366번길, 8] | 122 | |[경기도, 화성시, 장안면, 버들로] | 123 | |[경기도, 화성시, 장안면, 석포리] | 124 | +---------------------------------------+ 125 | """ 126 | 127 | self._df = self._df.withColumn("idx", where_is_sido(self._df.split)).withColumn( 128 | "split", cleanse_split(self._df.idx, self._df.split) 129 | ) 130 | self._df = self._df.drop("idx") 131 | self._df = self._df.withColumn("split", process_roadname(self._df.split)) 132 | return self._df 133 | 134 | def add_sido(self): 135 | """ 136 | 특별시, 광역시, 도를 기존 데이터프레임에 추가하는 함수입니다. 137 | 138 | UDF 139 | --- 140 | extract_sido : StringType 141 | split 컬럼에서 특별시와 광역시, 도를 찾고 값을 반환합니다. 142 | 값이 없는 경우, "None" : str 을 반환합니다. 143 | 144 | Exmaple 145 | ------- 146 | >>> df.show() 147 | +----------------------------------------+ 148 | |split | 149 | +----------------------------------------+ 150 | |[경기도, 안산시, 단원구, 해봉로, 137] | 151 | |[경기도, 수원시, 장안구, 경수대로, 1079] | 152 | |[경기도, 안산시, 상록구, 양달말길, 93-7] | 153 | +----------------------------------------+ 154 | 155 | >>> df.withColumn('idx', extract_sido()).show() 156 | +----------------------------------------------+-----+ 157 | |split |sido | 158 | +----------------------------------------------+-----+ 159 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 | 160 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 | 161 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 | 162 | +----------------------------------------------+------+ 163 | """ 164 | 165 | self._df = self._df.withColumn("sido", extract_sido(self._df.split)) 166 | return self._df 167 | 168 | def add_sigungu(self): 169 | """ 170 | 시, 군, 구 컬럼을 기존 데이터프레임에 추가하는 함수입니다. 171 | UDF 172 | --- 173 | extract_sigungu : StringType 174 | split 컬럼에서 시, 군, 구를 찾고 값을 반환합니다. 175 | 176 | 시와 구가 같이 있을경우에는 시와 구를 같이 반환합니다. 177 | ex) 경기도 성남시 분당구 -> 성남시 분당구 178 | 179 | 값이 없는 경우, "None" : str 을 반환합니다. 180 | 181 | Exmaple 182 | ------- 183 | >>> df.show() 184 | +----------------------------------------------+-----+ 185 | |split |sido | 186 | +----------------------------------------------+-----+ 187 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 | 188 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 | 189 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 | 190 | +----------------------------------------------+------+ 191 | 192 | >>> df.withColumn('idx', extract_sigungu()).show() 193 | +----------------------------------------------+------+-----------+ 194 | |split |sido |sigungu | 195 | +----------------------------------------------+------+-----------+ 196 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 | 197 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 | 198 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 | 199 | +----------------------------------------------+------+-----------+ 200 | """ 201 | 202 | self._df = self._df.withColumn("sigungu", extract_sigungu(self._df.split)) 203 | return self._df 204 | 205 | def add_eupmyeon(self): 206 | """ 207 | 읍, 면 컬럼을 기존에 데이터프레임에 추가하는 함수입니다. 208 | 209 | UDF 210 | --- 211 | extract_eupmyeon : StringType 212 | split 컬럼에서 읍이나 면을 찾고 값을 반환합니다. 
213 | 214 | 값이 없는 경우, "None" : str 을 반환합니다. 215 | 216 | Exmaple 217 | ------- 218 | >>> df.show() 219 | +----------------------------------------------+------+-----------+ 220 | |split |sido |sigungu | 221 | +----------------------------------------------+------+-----------+ 222 | |[경기도, 화성시, 장안면, 매바위로366번길, 8] |경기도 |화성시 | 223 | |[강원도, 원주시, 호저면, 사제로, 9] |강원도 |원주시 | 224 | |[경상남도, 사천시, 곤양면, 경충로, 23-1] |경상남도|사천시 | 225 | +----------------------------------------------+------+-----------+ 226 | 227 | >>> df.withColumn('idx', extract_eupmyeon()).show() 228 | +----------------------------------------------+------+-----------+--------+ 229 | |split |sido |sigungu |eupmyeon| 230 | +----------------------------------------------+------+-----------+--------+ 231 | |[경기도, 화성시, 장안면, 매바위로366번길, 8] |경기도 |화성시 |장안면 | 232 | |[강원도, 원주시, 호저면, 사제로, 9] |강원도 |원주시 |호저면 | 233 | |[경상남도, 사천시, 곤양면, 경충로, 23-1] |경상남도|사천시 |곤양면 | 234 | +----------------------------------------------+------+-----------+-------+ 235 | """ 236 | self._df = self._df.withColumn("eupmyeondong", extract_eupmyeondong(self._df.split)) 237 | return self._df 238 | 239 | def add_dong(self): 240 | """ 241 | 데이터프레임에 동이 포함되어있는지 확인하고 동 컬럼을 추가하는 함수입니다. 242 | 243 | UDF 244 | --- 245 | extract_dong : StringType 246 | split 컬럼에서 읍이나 면을 찾고 값을 반환합니다. 247 | 248 | 값이 없는 경우, "None" : str 을 반환합니다. 249 | 250 | Exmaple 251 | ------- 252 | >>> df.show() 253 | +-------------------------+--------+-----------+ 254 | |split |sido |sigungu | 255 | +-------------------------+--------+-----------+ 256 | |[경기도, 성남시, 분당구, 금곡동]|경기도 |성남시 | 257 | |[충청남도, 공주시, 검상동] |강원도 |공주시 | 258 | |[대전광역시, 동구, 가오동] |대전광역시|동구 | 259 | +-------------------------+--------+-----------+ 260 | 261 | >>> df.withColumn('idx', extract_dong()).show() 262 | +-------------------------+--------+-----------+----+ 263 | |split |sido |sigungu |dong| 264 | +-------------------------+--------+-----------+----+ 265 | |[경기도, 성남시, 분당구, 금곡동]|경기도 |성남시 |금곡동| 266 | |[충청남도, 공주시, 검상동] |강원도 |공주시 |검상동| 267 | |[대전광역시, 동구, 가오동] |대전광역시|동구 |가오동| 268 | +-------------------------+--------+-----------+-----+ 269 | """ 270 | 271 | self._df = self._df.withColumn("dong", extract_dong(self._df.split)) 272 | return self._df 273 | 274 | def add_roadname(self): 275 | """ 276 | 데이터프레임에 도로명주소 컬럼을 추가하는 함수입니다. 277 | UDF 278 | --- 279 | extract_building_primary_number : StringType 280 | split 컬럼에서 도로명를 찾고 값을 반환합니다. 281 | 282 | 값이 없는 경우, "None" : str 을 반환합니다. 
283 | 284 | Exmaple 285 | ------- 286 | >>> df.show() 287 | +----------------------------------------------+------+-----------+ 288 | |split |sido |sigungu | 289 | +----------------------------------------------+------+-----------+ 290 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 | 291 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 | 292 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 | 293 | +----------------------------------------------+------+-----------+ 294 | 295 | >>> df.withColumn('idx', add_sigungu()).show() 296 | +----------------------------------------------+------+-----------+---------+ 297 | |split |sido |sigungu |roadname | 298 | +----------------------------------------------+------+-----------+---------+ 299 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 |해봉로 | 300 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 |경수대로 | 301 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 |양달말길 | 302 | +----------------------------------------------+------+-----------+---------+ 303 | """ 304 | self._df = self._df.withColumn("roadname", extract_roadname(self._df.split)) 305 | return self._df 306 | 307 | def add_building_primary_number(self): 308 | """ 309 | 데이터프레임에 도로명주소의 건물본번을 추가하는 함수입니다. 310 | 311 | UDF 312 | --- 313 | extract_building_primary_number : StringType 314 | 315 | Parameters 316 | ---------- 317 | split : columnType 318 | roadname : columnType 319 | 320 | roadname 뒤에 건물 본번과 부번이 들어오면 건물 본번을 반환합니다.. 321 | 322 | 값이 없는 경우, "None" : str 을 반환합니다. 323 | 324 | Exmaple 325 | ------- 326 | >>> df.show() 327 | +----------------------------------------------+------+-----------+---------+ 328 | |split |sido |sigungu |roadname | 329 | +----------------------------------------------+------+-----------+---------+ 330 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 |해봉로 | 331 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 |경수대로 | 332 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 |양달말길 | 333 | +----------------------------------------------+------+-----------+---------+ 334 | 335 | >>> df.withColumn('idx', extract_building_primary_number()).show() 336 | +----------------------------------------------+------+-----------+---------+-----------------------+ 337 | |split |sido |sigungu |roadname |building_primary_number| 338 | +----------------------------------------------+------+-----------+---------+-----------------------+ 339 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 |해봉로 |137 | 340 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 |경수대로 |1079 | 341 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 |양달말길 |93 | 342 | +----------------------------------------------+------+-----------+---------+-----------------------+ 343 | """ 344 | self._df = self._df.withColumn( 345 | "building_primary_number", 346 | extract_building_primary_number(self._df.split, self._df.roadname), 347 | ) 348 | return self._df 349 | 350 | def add_jibun_primary_number(self): 351 | self._df = self._df.withColumn( 352 | "jibun_primary_number", 353 | extract_jibun_primary_number(self._df.split, self._df.roadname), 354 | ) 355 | return self._df 356 | 357 | def join_with_db(self, db_df): 358 | 359 | """ 360 | 데이터베이스 데이터프레임과 조인하는 함수입니다. 
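        도로명이 추출된 행은 (시군구, 도로명, 건물 본번)을, 도로명이 없는 행은 (시군구, 읍면동, 지번 본번)을 키로 주소 데이터베이스와 각각 조인한 뒤 두 결과를 union하고, 법정동코드에서 시군구코드를 추출합니다.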
361 | 362 | Parameters 363 | ---------- 364 | db_df : DataFrame 365 | 366 | 367 | Exmaple 368 | ------- 369 | >>> df.show() 370 | +----------------------------------------------+------+-----------+---------+-----------------------+ 371 | |split |sido |sigungu |roadname |building_primary_number| 372 | +----------------------------------------------+------+-----------+---------+-----------------------+ 373 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 |해봉로 |137 | 374 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 |경수대로 |1079 | 375 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 |양달말길 |93 | 376 | +----------------------------------------------+------+-----------+---------+-----------------------+ 377 | 378 | >>> df.withColumn('idx', extract_building_primary_number()).show() 379 | +----------------------------------------------+------+-----------+---------+-----------------------+----------------+ 380 | |split |sido |sigungu |roadname |building_primary_number|bupjungdong_code| 381 | +----------------------------------------------+------+-----------+---------+-----------------------+----------------+ 382 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 |해봉로 |137 |4128112400 | 383 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 |경수대로 |1079 |4128111800 | 384 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 |양달말길 |93 |4128101100 | 385 | +----------------------------------------------+------+-----------+---------+-----------------------+---------------+ 386 | """ 387 | db_df_roadname = db_df.select( 388 | col("sido").alias("sido_name"), 389 | col("sigungu").alias("sigungu_name"), 390 | col("eupmyeondong").alias('eupmyeondong_name'), 391 | col("roadname").alias("db_roadname"), 392 | col("building_primary_number").alias("db_building_primary_number"), 393 | col("bupjungdong_code").alias('db_bupjungdong_code'), 394 | col("jibun_primary_number").alias("db_jibun_primary_number") 395 | ).drop_duplicates(["sigungu_name", "db_roadname", "db_building_primary_number"]) 396 | 397 | db_df_jibun = db_df.select( 398 | col("sido").alias("sido_name"), 399 | col("sigungu").alias("sigungu_name"), 400 | col("eupmyeondong").alias('eupmyeondong_name'), 401 | col("roadname").alias("db_roadname"), 402 | col("building_primary_number").alias("db_building_primary_number"), 403 | col("bupjungdong_code").alias('db_bupjungdong_code'), 404 | col("jibun_primary_number").alias("db_jibun_primary_number") 405 | ).drop_duplicates(["sigungu_name", "eupmyeondong_name", "db_jibun_primary_number"]) 406 | 407 | jibun_origin = self._df.where(self._df.roadname == "None") 408 | roadname_origin = self._df.where(self._df.roadname != "None") 409 | 410 | join_df_roadname = roadname_origin.join( 411 | db_df_roadname, 412 | (self._df.sigungu == db_df_roadname.sigungu_name) 413 | & (self._df.roadname == db_df_roadname.db_roadname) 414 | & (self._df.building_primary_number == db_df_roadname.db_building_primary_number), 415 | "inner", 416 | ) \ 417 | .withColumnRenamed("db_bupjungdong_code", "bupjungdong_code") \ 418 | .select(*self.col_list, "sido_name", "sigungu_name", "eupmyeondong_name", "bupjungdong_code") 419 | 420 | 421 | join_df_jibun = jibun_origin.join( 422 | db_df_jibun, 423 | (self._df.sigungu == db_df_jibun.sigungu_name) 424 | & (self._df.eupmyeondong == db_df_jibun.eupmyeondong_name) 425 | & (self._df.jibun_primary_number == db_df_jibun.db_jibun_primary_number), 426 | "inner", 427 | ) \ 428 | .withColumnRenamed("db_bupjungdong_code", "bupjungdong_code") \ 429 | .select(*self.col_list, "sido_name", "sigungu_name", "eupmyeondong_name", "bupjungdong_code") 430 
| 431 | self._df = join_df_roadname.union(join_df_jibun) 432 | 433 | self._df = self._df.withColumn("sigungu_code", extract_sigungu_code(self._df.bupjungdong_code)) 434 | 435 | return self._df 436 | -------------------------------------------------------------------------------- /sparkplus/core/base.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | from pyspark.sql.session import SparkSession 3 | 4 | 5 | class SPDataFrame(object): 6 | """ 7 | 요약 8 | ------- 9 | `SPDataFrame` 은 Spark DataFrame를 확장하며, 한국 주소체계를 더 쉽게 다룰 수 있도록 다양한 기능을 제공합니다. 10 | """ 11 | 12 | @classmethod 13 | def get_db_df_by_tablenames( 14 | cls, sparkSession: SparkSession, tablenames: Union[str, List[str]], **kwargs 15 | ): 16 | """ 17 | Summary 18 | ------- 19 | 테이블명을 기반으로 Spark DataFrame을 반환합니다. 20 | 21 | Parameter 22 | ---- 23 | sparkSession: Active Spark Session 24 | tablenames: DataFrame으로 만들 테이블명 25 | **kwargs: `driver`, `url`, `user`, `password` 26 | 27 | Raises: 28 | ValueError 29 | 30 | Returns: 31 | `DataFrame`s from database 32 | 33 | 34 | Usage 35 | ----- 36 | >>> import SPDataFrame 37 | >>> ss = SparkSession.builder.getOrCreate() 38 | >>> tablenames = ['integrated_address_seoul', 'integrated_address_incheon', 'integrated_address_gyeonggi'] 39 | >>> table_dfs = SPDataFrame(ss, tablenames, 40 | driver='com.mysql.cj.jdbc.Driver', 41 | url='jdbc:mysql://localhost:3306/sparkplus', 42 | user='root', 43 | password='password' 44 | ) 45 | >>> table_dfs.select('roadname_code', 'sido', 'sigungu', 'eupmyeondong').show() 46 | +-------------+----------+-------------+------------+ 47 | |roadname_code| sido| sigungu|eupmyeondong| 48 | +-------------+----------+-------------+------------+ 49 | | 261103125011|부산광역시| 중구| 영주동| 50 | | 261104006006|부산광역시| 중구| 영주동| 51 | | 261104006006|부산광역시| 중구| 영주동| 52 | | 261104006006|부산광역시| 중구| 영주동| 53 | | 261103125011|부산광역시| 중구| 영주동| 54 | | 111104100289|서울특별시| 종로구| 청운동| 55 | | 111104100289|서울특별시| 종로구| 청운동| 56 | | 111103100014|서울특별시| 종로구| 청운동| 57 | | 111104100289|서울특별시| 종로구| 청운동| 58 | | 111104100289|서울특별시| 종로구| 청운동| 59 | | 411114322017| 경기도|수원시 장안구| 파장동| 60 | | 411114322017| 경기도|수원시 장안구| 파장동| 61 | | 411114322017| 경기도|수원시 장안구| 파장동| 62 | | 411114322017| 경기도|수원시 장안구| 파장동| 63 | | 411114322017| 경기도|수원시 장안구| 파장동| 64 | +-------------+----------+-------------+------------+ 65 | """ 66 | sess_conf = sparkSession.sparkContext.getConf().getAll() 67 | 68 | # If SparkConf doesn't contain MySQL connector, raise `ValueError` 69 | jdbc_driver_flag = False 70 | 71 | # If you use `spark.jars.packages`, value should like `mysql:mysql-connector-java:YOUR_MYSQL_VERSION` 72 | available_configs = [ 73 | "spark.jars", 74 | "spark.driver.extraClassPath", 75 | "spark.jars.packages", 76 | ] 77 | 78 | for (conf_key, conf_val) in sess_conf: 79 | if conf_key in available_configs and conf_val.__contains__("mysql"): 80 | jdbc_driver_flag = True 81 | break 82 | 83 | if not jdbc_driver_flag: 84 | raise ValueError( 85 | "[SPARKPLUS_MYSQL_CONNECTOR_ERR] " 86 | "Your spark session seems like it doesn't contains mysql-connector-java path to connect mysql database. 
" 87 | "Please specify it to use SparkPlus package properly.\n\n" 88 | "$ spark-submit --jars \n\n" 89 | "In programming way, if you have mysql-connector jar file locally, set spark configuration like\n\n" 90 | ">>> ss = SparkSession.builder.config('spark.jars', MYSQL_JAR_PATH)\n\n" 91 | "or if you don't,\n\n" 92 | ">>> ss = SparkSession.builder.config('spark.jars.packages', 'mysql:mysql-connector-java:YOUR_MYSQL_VERSION')\n\n" 93 | "Check https://spark.apache.org/docs/latest/configuration.html for detail." 94 | ) 95 | 96 | ss_read = sparkSession.read.format("jdbc") 97 | 98 | # set DB options such as driver, url, user, password 99 | for opt_key, opt_val in kwargs.items(): 100 | ss_read.option(opt_key, opt_val) 101 | 102 | if isinstance(tablenames, str): 103 | return ss_read.option("dbtable", tablenames).load() 104 | else: 105 | dfs = ss_read.option("dbtable", tablenames.pop()).load() 106 | 107 | while tablenames: 108 | dfs = dfs.union(ss_read.option("dbtable", tablenames.pop()).load()) 109 | 110 | return dfs 111 | -------------------------------------------------------------------------------- /sparkplus/core/coord_dataframe.py: -------------------------------------------------------------------------------- 1 | from geopandas.geodataframe import GeoDataFrame 2 | from pyspark.sql.functions import lit, udf, pandas_udf 3 | from pyspark.sql import DataFrame 4 | from pyspark.sql.types import * 5 | 6 | import geopandas as gpd 7 | import h3 8 | 9 | 10 | def create_sjoin_pnu(gdf, join_column_name): 11 | def sjoin_settlement(x, y): 12 | gdf_temp = gpd.GeoDataFrame( 13 | data=[[x] for x in range(len(x))], geometry=gpd.points_from_xy(x, y) 14 | ).set_crs(epsg=4326, inplace=True) 15 | settlement = gpd.sjoin(gdf_temp, gdf, how="left", predicate="within") 16 | settlement = settlement.drop_duplicates(subset="geometry") 17 | 18 | return ( 19 | settlement.agg({"PNU": lambda x: str(x)}) 20 | .reset_index() 21 | .loc[:, join_column_name] 22 | .astype("str") 23 | ) 24 | 25 | return pandas_udf(sjoin_settlement, returnType=StringType()) 26 | 27 | 28 | def _coord_to_pnu(origin_df, gdf, x_colname, y_colname): 29 | sjoin_udf = create_sjoin_pnu(gdf, "PNU") 30 | res_df = origin_df.withColumn( 31 | "PNU", sjoin_udf(origin_df[x_colname], origin_df[y_colname]) 32 | ) 33 | return res_df 34 | 35 | 36 | def _join_with_table(table_df, pnu_df): 37 | # temp_df = self.coord_to_pnu() 38 | table_df = table_df.dropDuplicates(["bupjungdong_code"]) 39 | res_df = pnu_df.join( 40 | table_df, [pnu_df.PNU[0:10] == table_df.bupjungdong_code], how="left_outer" 41 | ) 42 | # res_df = res_df.dropDuplicates(['PNU']) 43 | 44 | return res_df 45 | 46 | 47 | @udf(StringType()) 48 | def get_fullname(a, b, c, d): 49 | if a == None and b == None and c == None and d == None: 50 | return None 51 | 52 | if a == None: 53 | a = "" 54 | if b == None: 55 | b = "" 56 | if c == None: 57 | c = "" 58 | if d == None: 59 | d = "" 60 | 61 | res = str(a) + " " + str(b) + " " + str(c) + " " + str(d) + " " 62 | 63 | return res 64 | 65 | class CoordDataFrame(DataFrame): 66 | """ 67 | Summary 68 | ------- 69 | 위경도 좌표가 포함된 Spark DataFrame에 법정읍면동, h3, 우편번호 정보를 추가합니다. 
70 | 71 | Args: 72 | origin_sdf (Spark DataFrame): 위경도 좌표가 포함된 원본 Spark DataFrame 73 | gdf (GeoDataFrame): shp Parquet으로부터 생성한 GeoDataFrame 74 | tdf (Spark DataFrame): 데이터베이스로부터 생성한 Spark DataFrame 75 | x_colname (String): 원본 Spark DataFrame의 경도 컬럼 이름 76 | y_colname (String): 원본 Spark DataFrame의 위도 컬럼 이름 77 | 78 | Usage 79 | ------- 80 | >>> from sparkplus.core.sparkplus import CoordDataFrame 81 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, x_colname, y_colname) 82 | """ 83 | 84 | def __init__(self, origin_sdf, gdf, tdf, x_colname, y_colname): 85 | self._origin_sdf = origin_sdf 86 | self._gdf = gdf 87 | self._tdf = tdf 88 | self._x_colname = x_colname 89 | self._y_colname = y_colname 90 | 91 | self.pnu_df = _coord_to_pnu(origin_sdf, gdf, x_colname, y_colname).cache() 92 | self.joined_df = _join_with_table(tdf, self.pnu_df).cache() 93 | 94 | def add_h3(self, h3_level): 95 | """ 96 | Summary 97 | ------- 98 | 위경도 좌표가 포함된 원본 Spark DataFrame에 h3 정보를 추가합니다. 99 | 100 | Args: 101 | h3_level (Int): 추가하고자 하는 h3 level 102 | 103 | Usage 104 | ------- 105 | >>> from sparkplus.core.sparkplus import CoordDataFrame 106 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 107 | >>> res_df = df.coord_to_h3(10) 108 | 109 | Examples 110 | ------- 111 | >>> origin_sdf.show() 112 | +----------+--------+-----------+-----------+ 113 | | 가로등번호| 관할구청| 위도| 경도| 114 | +----------+--------+-----------+-----------+ 115 | | 1001001| 중구|35.87343028|128.6103158| 116 | | 1001002| 중구|35.87334197|128.6099071| 117 | | 1001003| 중구|35.87327842|128.6096135| 118 | +----------+--------+-----------+-----------+ 119 | 120 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, '경도', '위도') 121 | >>> res_df = df.coord_to_h3(10) 122 | >>> res_df.show() 123 | +----------+--------+-----------+-----------+---------------+ 124 | | 가로등번호| 관할구청| 위도| 경도| h3| 125 | +----------+--------+-----------+-----------+---------------+ 126 | | 1001001| 중구|35.87343028|128.6103158|8a30c190311ffff| 127 | | 1001002| 중구|35.87334197|128.6099071|8a30c190311ffff| 128 | | 1001003| 중구|35.87327842|128.6096135|8a30c19031affff| 129 | +----------+--------+-----------+-----------+---------------+ 130 | """ 131 | udf_to_h3 = udf( 132 | lambda x, y: h3.geo_to_h3(float(x), float(y), h3_level), 133 | returnType=StringType(), 134 | ) 135 | 136 | res_h3 = self._origin_sdf.withColumn( 137 | "h3", 138 | udf_to_h3( 139 | self._origin_sdf[self._y_colname], self._origin_sdf[self._x_colname] 140 | ), 141 | ) 142 | return CoordDataFrame(res_h3) 143 | 144 | def add_pnu(self): 145 | """ 146 | Summary 147 | ------- 148 | 위경도 좌표가 포함된 원본 Spark DataFrame에 pnu 정보를 추가합니다. 
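        PNU의 앞 10자리는 법정동코드에 해당하며, 이후 주소 테이블과의 조인 키로 사용됩니다.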
149 | 150 | Usage 151 | ------- 152 | >>> from sparkplus.core.sparkplus import CoordDataFrame 153 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 154 | >>> res_df = df.coord_to_pnu() 155 | 156 | Example 157 | ------- 158 | >>> orgin_sdf.show() 159 | +----------+--------+-----------+-----------+ 160 | | 가로등번호| 관할구청| 위도| 경도| 161 | +----------+--------+-----------+-----------+ 162 | | 1001001| 중구|35.87343028|128.6103158| 163 | | 1001002| 중구|35.87334197|128.6099071| 164 | | 1001003| 중구|35.87327842|128.6096135| 165 | +----------+--------+-----------+-----------+ 166 | 167 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, '경도', '위도') 168 | >>> res_df = df.coord_to_pnu() 169 | >>> res_df.show() 170 | +----------+--------+-----------+-----------+-------------------+ 171 | | 가로등번호| 관할구청| 위도| 경도| PNU| 172 | +----------+--------+-----------+-----------+-------------------+ 173 | | 1001001| 중구|35.87343028|128.6103158|2711010300103670054| 174 | | 1001002| 중구|35.87334197|128.6099071|2711010300103670054| 175 | | 1001003| 중구|35.87327842|128.6096135|2711010300103670054| 176 | +----------+--------+-----------+-----------+-------------------+ 177 | """ 178 | return self.pnu_df 179 | 180 | def add_zipcode(self): 181 | """ 182 | Summary 183 | ------- 184 | 위경도 좌표가 포함된 원본 Spark DataFrame에 우편번호 정보를 추가합니다. 185 | 186 | Usage 187 | ------- 188 | >>> from sparkplus.core.sparkplus import CoordDataFrame 189 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 190 | >>> res_df = df.coord_to_zipcode() 191 | 192 | Example 193 | ------- 194 | >>> origin_sdf.show() 195 | +----------+--------+-----------+-----------+ 196 | | 가로등번호| 관할구청| 위도| 경도| 197 | +----------+--------+-----------+-----------+ 198 | | 1001001| 중구|35.87343028|128.6103158| 199 | | 1001002| 중구|35.87334197|128.6099071| 200 | | 1001003| 중구|35.87327842|128.6096135| 201 | +----------+--------+-----------+-----------+ 202 | 203 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, '경도', '위도') 204 | >>> res_df = df.coord_to_zipcode() 205 | >>> res_df.show() 206 | +----------+--------+-----------+-----------+-------+ 207 | | 가로등번호| 관할구청| 위도| 경도|zipcode| 208 | +----------+--------+-----------+-----------+-------+ 209 | | 8155012| 달성군|35.64103224|128.4106523| 43013| 210 | | 8071024| 달성군|35.66091032|128.4159519| 43006| 211 | | 8213007| 달성군| 35.6320721|128.4175234| 43013| 212 | +----------+--------+-----------+-----------+-------+ 213 | 214 | """ 215 | joined_df = self.joined_df.select("PNU", "zipcode") 216 | res_df = self.pnu_df.join(joined_df, "PNU", "leftouter").drop("PNU") 217 | res_df = res_df.dropDuplicates([self._x_colname, self._y_colname]) 218 | return CoordDataFrame(res_df) 219 | 220 | def add_bupjungdong(self): 221 | """ 222 | Summary 223 | ------- 224 | 위경도 좌표가 포함된 원본 Spark DataFrame에 법정읍면동 코드 정보를 추가합니다. 
225 | 226 | Usage 227 | ------- 228 | >>> from sparkplus.core.sparkplus import CoordDataFrame 229 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 230 | >>> res_df = df.coord_to_emd() 231 | 232 | Example 233 | ------- 234 | >>> origin_sdf.show() 235 | +----------+--------+-----------+-----------+ 236 | | 가로등번호| 관할구청| 위도| 경도| 237 | +----------+--------+-----------+-----------+ 238 | | 1001001| 중구|35.87343028|128.6103158| 239 | | 1001002| 중구|35.87334197|128.6099071| 240 | | 1001003| 중구|35.87327842|128.6096135| 241 | +----------+--------+-----------+-----------+ 242 | 243 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, '경도', '위도') 244 | >>> res_df = df.coord_to_emd() 245 | >>> res_df.show() 246 | +----------+--------+-----------+-----------+----------------+ 247 | | 가로등번호| 관할구청| 위도| 경도|bupjungdong_code| 248 | +----------+--------+-----------+-----------+----------------+ 249 | | 1001001| 중구|35.87343028|128.6103158| 2711010300| 250 | | 1001002| 중구|35.87334197|128.6099071| 2711010300| 251 | | 1001003| 중구|35.87327842|128.6096135| 2711010300| 252 | +----------+--------+-----------+-----------+----------------+ 253 | """ 254 | joined_df = self.joined_df.select("PNU", "bupjungdong_code") 255 | res_df = self.pnu_df.join(joined_df, "PNU", "leftouter").drop("PNU") 256 | res_df = res_df.dropDuplicates([self._x_colname, self._y_colname]) 257 | return CoordDataFrame(res_df) 258 | 259 | def add_roadname(self): 260 | """ 261 | Summary 262 | ------- 263 | 위경도 좌표가 포함된 원본 Spark DataFrame에 도로명 주소 정보를 추가합니다. 264 | 265 | Usage 266 | ------- 267 | >>> from sparkplus.core.sparkplus import CoordDataFrame 268 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 269 | >>> res_df = df.coord_to_roadname() 270 | 271 | Example 272 | ------- 273 | >>> origin_sdf.show() 274 | +----------+--------+-----------+-----------+ 275 | | 가로등번호| 관할구청| 위도| 경도| 276 | +----------+--------+-----------+-----------+ 277 | | 1001001| 중구|35.87343028|128.6103158| 278 | | 1001002| 중구|35.87334197|128.6099071| 279 | | 1001003| 중구|35.87327842|128.6096135| 280 | +----------+--------+-----------+-----------+ 281 | 282 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 283 | >>> res_df = df.coord_to_roadname() 284 | >>> res_df.show() 285 | +----------+--------+-----------+-----------+----------+-------+-------------+------------+---------+-----------+-----------------------+-------------------------+ 286 | | 가로등번호| 관할구청| 위도| 경도| sido|sigungu| roadname|eupmyeondong|bupjungli|is_basement|building_primary_number|building_secondary_number| 287 | +----------+--------+-----------+-----------+----------+-------+-------------+------------+---------+-----------+-----------------------+-------------------------+ 288 | | 1001001| 중구|35.87343028|128.6103158| 대구광역시| 중구| 동덕로38길| 동인동3가| | 0| 100| 0| 289 | | 1001002| 중구|35.87334197|128.6099071| 대구광역시| 중구| 동덕로38길| 동인동3가| | 0| 100| 0| 290 | | 1001003| 중구|35.87327842|128.6096135| 대구광역시| 중구| 동덕로38길| 동인동3가| | 0| 100| 0| 291 | +----------+--------+-----------+-----------+----------+-------+-------------+------------+---------+-----------+-----------------------+-------------------------+ 292 | 293 | """ 294 | joined_df = self.joined_df.select( 295 | "PNU", 296 | "sido", 297 | "sigungu", 298 | "roadname", 299 | "eupmyeondong", 300 | "bupjungli", 301 | "is_basement", 302 | "building_primary_number", 303 | "building_secondary_number", 304 | ) 305 | res_df = self.pnu_df.join(joined_df, "PNU", "leftouter").drop("PNU") 306 | res_df = res_df.dropDuplicates([self._x_colname, self._y_colname]) 
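        # 좌표 컬럼 기준으로 중복 행을 제거한 결과를 반환합니다.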
307 | return CoordDataFrame(res_df) 308 | 309 | def add_roadname_addr(self): 310 | """ 311 | Summary 312 | ------- 313 | 위경도 좌표가 포함된 원본 Spark DataFrame에 도로명 주소 정보를 추가합니다. 314 | 315 | Usage 316 | ------- 317 | >>> from sparkplus.core.sparkplus import CoordDataFrame 318 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 319 | >>> res_df = df.coord_to_roadname() 320 | 321 | Example 322 | ------- 323 | >>> origin_sdf.show() 324 | +----------+--------+-----------+-----------+ 325 | | 가로등번호| 관할구청| 위도| 경도| 326 | +----------+--------+-----------+-----------+ 327 | | 1001001| 중구|35.87343028|128.6103158| 328 | | 1001002| 중구|35.87334197|128.6099071| 329 | | 1001003| 중구|35.87327842|128.6096135| 330 | +----------+--------+-----------+-----------+ 331 | 332 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 333 | >>> res_df = df.coord_to_roadname() 334 | >>> res_df.show() 335 | +----------+--------+-----------+-----------+----------+-------+-------------+------------+---------+-----------+-----------------------+-------------------------+ 336 | | 가로등번호| 관할구청| 위도| 경도| sido|sigungu| roadname|eupmyeondong|bupjungli|is_basement|building_primary_number|building_secondary_number| 337 | +----------+--------+-----------+-----------+----------+-------+-------------+------------+---------+-----------+-----------------------+-------------------------+ 338 | | 1001001| 중구|35.87343028|128.6103158| 대구광역시| 중구| 동덕로38길| 동인동3가| | 0| 100| 0| 339 | | 1001002| 중구|35.87334197|128.6099071| 대구광역시| 중구| 동덕로38길| 동인동3가| | 0| 100| 0| 340 | | 1001003| 중구|35.87327842|128.6096135| 대구광역시| 중구| 동덕로38길| 동인동3가| | 0| 100| 0| 341 | +----------+--------+-----------+-----------+----------+-------+-------------+------------+---------+-----------+-----------------------+-------------------------+ 342 | 343 | """ 344 | joined_df = self.joined_df.select( 345 | "PNU", 346 | "sido", 347 | "sigungu", 348 | "roadname", 349 | "eupmyeondong", 350 | "bupjungli", 351 | "is_basement", 352 | "building_primary_number", 353 | "building_secondary_number", 354 | ) 355 | joined_df = joined_df.withColumn( 356 | "roadname_address", 357 | get_fullname( 358 | joined_df["sido"], 359 | joined_df["sigungu"], 360 | joined_df["roadname"], 361 | joined_df["building_primary_number"], 362 | ), 363 | ) 364 | res_df = self.pnu_df.join(joined_df, "PNU", "leftouter").drop("PNU") 365 | res_df = res_df.dropDuplicates([self._x_colname, self._y_colname]) 366 | return CoordDataFrame(res_df) 367 | 368 | def add_jibun(self): 369 | """ 370 | Summary 371 | ------- 372 | 위경도 좌표가 포함된 원본 Spark DataFrame에 지번 주소 정보를 추가합니다. 
373 | 374 | Usage 375 | ------- 376 | >>> from sparkplus.core.sparkplus import CoordDataFrame 377 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 378 | >>> res_df = df.coord_to_jibun() 379 | 380 | Example 381 | ------- 382 | >>> origin_sdf.show() 383 | +----------+--------+-----------+-----------+ 384 | | 가로등번호| 관할구청| 위도| 경도| 385 | +----------+--------+-----------+-----------+ 386 | | 1001001| 중구|35.87343028|128.6103158| 387 | | 1001002| 중구|35.87334197|128.6099071| 388 | | 1001003| 중구|35.87327842|128.6096135| 389 | +----------+--------+-----------+-----------+ 390 | 391 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 392 | >>> res_df = df.coord_to_jibun() 393 | >>> res_df.show() 394 | +----------+--------+-----------+-----------+----------+-------+------------+---------+--------------------+----------------------+ 395 | | 가로등번호| 관할구청| 위도| 경도| sido|sigungu|eupmyeondong|bupjungli|jibun_primary_number|jibun_secondary_number| 396 | +----------+--------+-----------+-----------+----------+-------+------------+---------+--------------------+----------------------+ 397 | | 1001001| 중구|35.87343028|128.6103158| 대구광역시| 중구| 동인동3가| | 192| 79| 398 | | 1001002| 중구|35.87334197|128.6099071| 대구광역시| 중구| 동인동3가| | 192| 79| 399 | | 1001003| 중구|35.87327842|128.6096135| 대구광역시| 중구| 동인동3가| | 192| 79| 400 | +----------+--------+-----------+-----------+----------+-------+------------+---------+--------------------+----------------------+ 401 | """ 402 | joined_df = self.joined_df.select( 403 | "PNU", 404 | "sido", 405 | "sigungu", 406 | "eupmyeondong", 407 | "bupjungli", 408 | "jibun_primary_number", 409 | "jibun_secondary_number", 410 | ) 411 | res_df = self.pnu_df.join(joined_df, "PNU", "leftouter").drop("PNU") 412 | res_df = res_df.dropDuplicates([self._x_colname, self._y_colname]) 413 | return CoordDataFrame(res_df) 414 | 415 | def join_with_db(self): 416 | """ 417 | Summary 418 | ------- 419 | 위경도 좌표가 포함된 원본 Spark DataFrame에 데이터베이스에서 가져온 Spark DataFrame 정보를 추가합니다. 
420 | 421 | Usage 422 | ------- 423 | >>> from sparkplus.core.sparkplus import CoordDataFrame 424 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, 'lon', 'lat') 425 | >>> res_df = df.join_with_table() 426 | 427 | Example 428 | ------- 429 | >>> origin_sdf.show() 430 | +----------+--------+-----------+-----------+ 431 | | 가로등번호| 관할구청| 위도| 경도| 432 | +----------+--------+-----------+-----------+ 433 | | 1001001| 중구|35.87343028|128.6103158| 434 | | 1001002| 중구|35.87334197|128.6099071| 435 | | 1001003| 중구|35.87327842|128.6096135| 436 | +----------+--------+-----------+-----------+ 437 | 438 | >>> df = CoordDataFrame(origin_sdf, gdf, tdf, '경도', '위도') 439 | >>> res_df = df.join_with_table() 440 | >>> res_df.show() 441 | +----------+--------+-----------+-----------+-------------------+--------------------+-------------+-------+----------+-------+------------+---------+---------------+-----------+-----------------------+-------------------------+--------------------+----------------------+----------------+ 442 | | 가로등번호| 관할구청| 위도| 경도| PNU| manage_number|roadname_code|zipcode| sido|sigungu|eupmyeondong|bupjungli| roadname|is_basement|building_primary_number|building_secondary_number|jibun_primary_number|jibun_secondary_number|bupjungdong_code| 443 | +----------+--------+-----------+-----------+-------------------+--------------------+-------------+-------+----------+-------+------------+---------+---------------+-----------+-----------------------+-------------------------+--------------------+----------------------+----------------+ 444 | | 1065002| 중구|35.86341579|128.6024286|2711010600101990000|27110106001000300...| 271103007017| 41940| 대구광역시| 중구| 삼덕동2가| | 공평로| 0| 46| 0| 3| 4| 2711010600| 445 | | 1063002| 중구|35.86516734|128.6105401|2711010700103790000|27110107001003100...| 271104223055| 41945| 대구광역시| 중구| 삼덕동3가| | 달구벌대로443길| 0| 62| 16| 31| 2| 2711010700| 446 | | 1024017| 중구|35.86927185|128.5937782|2711011700101200003|27110115001008500...| 271102007001| 41909| 대구광역시| 중구| 남일동| | 중앙대로| 1| 424| 0| 143| 1| 2711011700| 447 | +----------+--------+-----------+-----------+-------------------+--------------------+-------------+-------+----------+-------+------------+---------+---------------+-----------+-----------------------+-------------------------+--------------------+----------------------+----------------+ 448 | 449 | """ 450 | return self.joined_df 451 | -------------------------------------------------------------------------------- /sparkplus/core/job.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import * 2 | from pyspark.sql import SparkSession 3 | import sparkplus 4 | import os 5 | from dotenv import load_dotenv 6 | import geopandas as gpd 7 | from py_log import logger 8 | 9 | load_dotenv() 10 | 11 | 12 | def load_shp_from_s3(bucket, key): 13 | return gpd.read_parquet(f"s3://{bucket}/{key}") 14 | 15 | 16 | def db_table_to_df(spark, table): 17 | df = ( 18 | spark.read.format("jdbc") 19 | .option("driver", os.getenv("DB_DRIVER")) 20 | .option("url", os.getenv("DB_URL")) 21 | .option("dbtable", table) 22 | .option("user", os.getenv("DB_USER")) 23 | .option("password", os.getenv("DB_PASSWORD")) 24 | .load() 25 | ) 26 | return df 27 | 28 | 29 | def load_table(spark): 30 | table_list = [ 31 | "additional_info_busan", 32 | "additional_info_chungbuk", 33 | "additional_info_chungnam", 34 | "additional_info_daegu", 35 | "additional_info_daejeon", 36 | "additional_info_gangwon", 37 | "additional_info_gwangju", 38 | "additional_info_gyeongbuk", 
39 | "additional_info_gyeonggi", 40 | "additional_info_gyeongnam", 41 | "additional_info_incheon", 42 | "additional_info_jeju", 43 | "additional_info_jeonbuk", 44 | "additional_info_jeonnam", 45 | "additional_info_sejong", 46 | "additional_info_seoul", 47 | "additional_info_ulsan", 48 | "jibun_address_busan", 49 | "jibun_address_chungbuk", 50 | "jibun_address_chungnam", 51 | "jibun_address_daegu", 52 | "jibun_address_daejeon", 53 | "jibun_address_gangwon", 54 | "jibun_address_gwangju", 55 | "jibun_address_gyeongbuk", 56 | "jibun_address_gyeonggi", 57 | "jibun_address_gyeongnam", 58 | "jibun_address_incheon", 59 | "jibun_address_jeju", 60 | "jibun_address_jeonbuk", 61 | "jibun_address_jeonnam", 62 | "jibun_address_sejong", 63 | "jibun_address_seoul", 64 | "jibun_address_ulsan", 65 | "roadname_address_busan", 66 | "roadname_address_chungbuk", 67 | "roadname_address_chungnam", 68 | "roadname_address_daegu", 69 | "roadname_address_daejeon", 70 | "roadname_address_gangwon", 71 | "roadname_address_gwangju", 72 | "roadname_address_gyeongbuk", 73 | "roadname_address_gyeonggi", 74 | "roadname_address_gyeongnam", 75 | "roadname_address_incheon", 76 | "roadname_address_jeju", 77 | "roadname_address_jeonbuk", 78 | "roadname_address_jeonnam", 79 | "roadname_address_sejong", 80 | "roadname_address_seoul", 81 | "roadname_address_ulsan", 82 | "roadname_code", 83 | "integrated_address_daegu", 84 | ] 85 | 86 | for table in table_list: 87 | name = table 88 | globals()[name] = db_table_to_df(spark, table) 89 | return globals() 90 | 91 | 92 | spark = SparkSession.builder.appName("Spark App").getOrCreate() 93 | 94 | # Load csv file 95 | logger.debug("Loading csv...") 96 | origin = spark.read.csv("s3://sparkplus-core/resource/data/daegu_streetlight.csv") 97 | logger.debug("Loading complete.") 98 | 99 | # Clear data 100 | daegu = origin.drop("_c1") 101 | daegu = daegu.where("_c0 > 10000") 102 | custom = sparkplus.CustomDataFrame(daegu, "_c3", "_c2") 103 | 104 | # Load parquet file 105 | logger.debug("Loading parquet...") 106 | shp_df = gpd.read_parquet("s3://sparkplus-core/resource/LSMD/Daegu.parquet") 107 | logger.debug("Loading complete...") 108 | 109 | # Load table from Database 110 | logger.debug("Loading db...") 111 | db_dict = load_table(spark) 112 | logger.debug("Loading complete...") 113 | 114 | result = custom.join_with_table(shp_df, db_dict["integrated_address_daegu"]) 115 | result.show() 116 | -------------------------------------------------------------------------------- /sparkplus/core/numaddr_dataframe.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(os.path.dirname(__file__))))) 4 | 5 | from pyspark.sql import DataFrame 6 | from pyspark.sql.functions import split, col 7 | from sparkplus.core.udfs import * 8 | 9 | class NumAddrDataFrame(object): 10 | """ 11 | 도로명 주소를 활용하여 데이터를 분석하기 위한 클래스입니다 12 | """ 13 | def __init__(self, dataFrame: DataFrame): 14 | self._df = dataFrame 15 | self._tmp_df = dataFrame 16 | self.col_list = dataFrame.columns 17 | 18 | def to_bupjungdong(self, target: str, db_df:DataFrame): 19 | """ 20 | 도로명을 지번으로 변경하는 전 과정을 포함하는 함수입니다 21 | """ 22 | self.add_split(target) 23 | self.add_sido() 24 | self.add_sigungu() 25 | self.add_eupmyeondong() 26 | self.add_jibun_primary() 27 | self.add_jibun_secondary() 28 | self.join_with_db(db_df) 29 | # self.join_with_db(db_df) 30 | return self._df 31 | 32 | def add_split(self, target: str): 33 | """ 34 | DB에서 조회를 
위해 원본의 string을 공백 기준으로 나누는 함수입니다. 35 | 36 | Parameters 37 | ---------- 38 | target : str 39 | split하고 조작할 원본 데이터의 컬럼명 40 | 41 | Examples 42 | -------- 43 | >>> road_df = RoadnameDataframe(your_df) 44 | >>> road_df._df.show() 45 | +------------------------------+s 46 | |target | 47 | +------------------------------+ 48 | |경기도 화성시 장안면 매바위로366번길 8 | 49 | |경기도 화성시 장안면 버들로 | 50 | |경기도 화성시 장안면 석포리 | 51 | +------------------------------+ 52 | 53 | >>> splited_df = road_df.add_split('target') 54 | >>> splited_df.show() 55 | +------------------------------+-----------------------------------+ 56 | |target |split | 57 | +------------------------------+-----------------------------------+ 58 | |경기도 화성시 장안면 매바위로366번길 8|[경기도, 화성시, 장안면, 매바위로366번길, 8]| 59 | |경기도 화성시 장안면 버들로 |[경기도, 화성시, 장안면, 버들로] | 60 | |경기도 화성시 장안면 석포리 |[경기도, 화성시, 장안면, 석포리] | 61 | +-----------------------------+------------------------------------+ 62 | """ 63 | self._df = self._df.withColumn('split', split(self._df[target], ' ')) 64 | return self._df 65 | 66 | def cleanse_split_column(self): 67 | """ 68 | 주소가 비정형 데이터일 경우 사용되는 함수입니다. 69 | add_split_column 함수로 쪼개진 split 컬럼의 데이터를 전처리합니다. 70 | 71 | UDF 72 | --- 73 | where_is_sido : IntegerType 74 | split 컬럼에서 특별시와 광역시, 도를 찾고, 위치한 인덱스를 반환합니다. 75 | 76 | Exmaple 77 | ------- 78 | >>> df.show() 79 | +---------------------------------------------+ 80 | |split | 81 | +---------------------------------------------+ 82 | |[[185-74], 경기도, 화성시, 장안면,매바위로366번길, 8]| 83 | |[경기도, 화성시, 장안면, 버들로] | 84 | |[경기도, 화성시, 장안면, 석포리] | 85 | +--------------------------------------------+ 86 | 87 | >>> df.withColumn('idx', where_is_sido(split)).show() 88 | +---------------------------------------------+----+ 89 | |split |sido| 90 | +---------------------------------------------+----+ 91 | |[[185-74], 경기도, 화성시, 장안면,매바위로366번길, 8]| 1| 92 | |[경기도, 화성시, 장안면, 버들로] | 0| 93 | |[경기도, 화성시, 장안면, 석포리] | 2| 94 | +--------------------------------------------+----+ 95 | 96 | cleanse_split: ArrayType(StringType) 97 | split 컬럼과 인덱스 컬럼을 활용하여 알맞은 주소체계 값으로 반환합니다. 98 | 99 | Example 100 | ------- 101 | >>> df.show() 102 | +------------------------------------------------+---+ 103 | |split |idx| 104 | +------------------------------------------------+---+ 105 | |[[185-74], 경기도, 화성시, 장안면,매바위로366번길, 8] | 1| 106 | |[경기도, 화성시, 장안면, 버들로] | 0| 107 | |[Gyeonggi-do, [185-74], 경기도, 화성시, 장안면, 석포리]| 2| 108 | +------------------------------------------------+---+ 109 | 110 | >>> df.withColumn('split', cleanse_split(df.split)) 111 | +----------------------------------------+ 112 | |split | 113 | +----------------------------------------+ 114 | |[경기도, 화성시, 장안면,매바위로366번길, 8] | 115 | |[경기도, 화성시, 장안면, 버들로] | 116 | |[경기도, 화성시, 장안면, 석포리] | 117 | +---------------------------------------+ 118 | """ 119 | 120 | self._df = self._df \ 121 | .withColumn('idx', where_is_sido(self._df.split)) \ 122 | .withColumn('split', cleanse_split(self._df.idx, self._df.split)) 123 | self._df = self._df.drop('idx') 124 | self._df = self._df.withColumn('split', process_numaddr(self._df.split)) 125 | return self._df 126 | 127 | def add_sido(self): 128 | """ 129 | 특별시, 광역시, 도를 기존 데이터프레임에 추가하는 함수입니다. 130 | 131 | UDF 132 | --- 133 | extract_sido : StringType 134 | split 컬럼에서 특별시와 광역시, 도를 찾고 값을 반환합니다. 135 | 값이 없는 경우, "None" : str 을 반환합니다. 
136 | 137 | Exmaple 138 | ------- 139 | >>> df.show() 140 | +----------------------------------------+ 141 | |split | 142 | +----------------------------------------+ 143 | |[경기도, 안산시, 단원구, 해봉로, 137] | 144 | |[경기도, 수원시, 장안구, 경수대로, 1079] | 145 | |[경기도, 안산시, 상록구, 양달말길, 93-7] | 146 | +----------------------------------------+ 147 | 148 | >>> df.withColumn('idx', extract_sido()).show() 149 | +----------------------------------------------+-----+ 150 | |split |sido | 151 | +----------------------------------------------+-----+ 152 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 | 153 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 | 154 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 | 155 | +----------------------------------------------+------+ 156 | """ 157 | 158 | self._df = self._df.withColumn("sido", extract_sido(self._df.split)) 159 | self._df.show() 160 | return self._df 161 | 162 | def add_sigungu(self): 163 | """ 164 | 시, 군, 구 컬럼을 기존 데이터프레임에 추가하는 함수입니다. 165 | UDF 166 | --- 167 | extract_sigungu : StringType 168 | split 컬럼에서 시, 군, 구를 찾고 값을 반환합니다. 169 | 170 | 시와 구가 같이 있을경우에는 시와 구를 같이 반환합니다. 171 | ex) 경기도 성남시 분당구 -> 성남시 분당구 172 | 173 | 값이 없는 경우, "None" : str 을 반환합니다. 174 | 175 | Exmaple 176 | ------- 177 | >>> df.show() 178 | +----------------------------------------------+-----+ 179 | |split |sido | 180 | +----------------------------------------------+-----+ 181 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 | 182 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 | 183 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 | 184 | +----------------------------------------------+------+ 185 | 186 | >>> df.withColumn('idx', extract_sigungu()).show() 187 | +----------------------------------------------+------+-----------+ 188 | |split |sido |sigungu | 189 | +----------------------------------------------+------+-----------+ 190 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 | 191 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 | 192 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 | 193 | +----------------------------------------------+------+-----------+ 194 | """ 195 | 196 | self._df = self._df.withColumn("sigungu", extract_sigungu(self._df.split)) 197 | self._df.show() 198 | return self._df 199 | 200 | def add_eupmyeondong(self): 201 | """ 202 | 읍, 면 컬럼을 기존에 데이터프레임에 추가하는 함수입니다. 203 | 204 | UDF 205 | --- 206 | extract_eupmyeon : StringType 207 | split 컬럼에서 읍이나 면을 찾고 값을 반환합니다. 208 | 209 | 값이 없는 경우, "None" : str 을 반환합니다. 
210 | 211 | Exmaple 212 | ------- 213 | >>> df.show() 214 | +----------------------------------------------+------+-----------+ 215 | |split |sido |sigungu | 216 | +----------------------------------------------+------+-----------+ 217 | |[경기도, 화성시, 장안면, 매바위로366번길, 8] |경기도 |화성시 | 218 | |[강원도, 원주시, 호저면, 사제로, 9] |강원도 |원주시 | 219 | |[경상남도, 사천시, 곤양면, 경충로, 23-1] |경상남도|사천시 | 220 | +----------------------------------------------+------+-----------+ 221 | 222 | >>> df.withColumn('idx', extract_eupmyeon()).show() 223 | +----------------------------------------------+------+-----------+--------+ 224 | |split |sido |sigungu |eupmyeon| 225 | +----------------------------------------------+------+-----------+--------+ 226 | |[경기도, 화성시, 장안면, 매바위로366번길, 8] |경기도 |화성시 |장안면 | 227 | |[강원도, 원주시, 호저면, 사제로, 9] |강원도 |원주시 |호저면 | 228 | |[경상남도, 사천시, 곤양면, 경충로, 23-1] |경상남도|사천시 |곤양면 | 229 | +----------------------------------------------+------+-----------+-------+ 230 | """ 231 | self._df = self._df.withColumn("eupmyeondong", extract_eupmyeondong(self._df.split)) 232 | self._df.show() 233 | return self._df 234 | 235 | def add_jibun_primary(self): 236 | self._df = self._df.withColumn("jibun_primary_number", extract_jibun_primary(self._df.split)) 237 | self._df.show() 238 | return self._df 239 | 240 | def add_jibun_secondary(self): 241 | self._df = self._df.withColumn("jibun_secondary_number", extract_jibun_secondary(self._df.split)) 242 | self._df.show() 243 | return self._df 244 | 245 | def join_with_db(self, db_df): 246 | """ 247 | 데이터베이스 데이터프레임과 조인하는 함수입니다. 248 | 249 | Parameters 250 | ---------- 251 | db_df : DataFrame 252 | 253 | 254 | Exmaple 255 | ------- 256 | >>> df.show() 257 | +----------------------------------------------+------+-----------+---------+-----------------------+ 258 | |split |sido |sigungu |roadname |building_primary_number| 259 | +----------------------------------------------+------+-----------+---------+-----------------------+ 260 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 |해봉로 |137 | 261 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 |경수대로 |1079 | 262 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 |양달말길 |93 | 263 | +----------------------------------------------+------+-----------+---------+-----------------------+ 264 | 265 | >>> df.withColumn('idx', extract_building_primary_number()).show() 266 | +----------------------------------------------+------+-----------+---------+-----------------------+----------------+ 267 | |split |sido |sigungu |roadname |building_primary_number|bupjungdong_code| 268 | +----------------------------------------------+------+-----------+---------+-----------------------+----------------+ 269 | |[경기도, 안산시, 단원구, 해봉로, 137] |경기도 |안산시 단원구 |해봉로 |137 |4128112400 | 270 | |[경기도, 수원시, 장안구, 경수대로, 1079] |경기도 |수원시 장안구 |경수대로 |1079 |4128111800 | 271 | |[경기도, 안산시, 상록구, 양달말길, 93-7] |경기도 |안산시 상록구 |양달말길 |93 |4128101100 | 272 | +----------------------------------------------+------+-----------+---------+-----------------------+---------------+ 273 | """ 274 | tmp_db_df = db_df.select( \ 275 | col("sido").alias("db_sido"), \ 276 | col("sigungu").alias("db_sigungu"), \ 277 | col("eupmyeondong").alias("db_eupmyeondong"), \ 278 | col("roadname").alias("db_roadname"), \ 279 | col("jibun_primary_number").alias("db_jibun_primary_number"), \ 280 | col("jibun_secondary_number").alias("db_jibun_secondary_number"), \ 281 | col("bupjungdong_code").alias("db_bupjungdong_code") \ 282 | ) \ 283 | #.drop_duplicates(['db_roadname', 'db_building_primary_number']) 284 | tmp_df = 
self._df.join(tmp_db_df, (self._df.sigungu == tmp_db_df.db_sigungu) & (self._df.eupmyeondong == tmp_db_df.db_eupmyeondong) & (self._df.jibun_primary_number == tmp_db_df.db_jibun_primary_number) & (self._df.jibun_secondary_number == tmp_db_df.db_jibun_secondary_number), 'inner')
285 | tmp_df = tmp_df.withColumnRenamed("db_bupjungdong_code", "bupjungdong_code")
286 | self._df = tmp_df.select(self._df['*'], "bupjungdong_code")
287 | del self._tmp_df
288 | del tmp_df
289 | 
290 | return self._df
291 | 
292 | 
293 | 
--------------------------------------------------------------------------------
/sparkplus/core/py_log.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import logging.handlers
3 | import datetime
4 | 
5 | logger = logging.getLogger(__name__)
6 | formatter = logging.Formatter(
7 | "[%(asctime)s][%(levelname)s|%(filename)s:%(lineno)s] >> %(message)s"
8 | )
9 | 
10 | streamHandler = logging.StreamHandler()
11 | now = str(datetime.datetime.now()).split(".")[0]
12 | fileHandler = logging.FileHandler("../logs/" + now)
13 | logger.setLevel(level=logging.DEBUG)
14 | 
15 | streamHandler.setFormatter(formatter)
16 | fileHandler.setFormatter(formatter)
17 | 
18 | logger.addHandler(streamHandler)
19 | logger.addHandler(fileHandler)
20 | 
21 | 
22 | if __name__ == "__main__":  # smoke-test output only when run directly, not on import
23 |     logger.debug("DEBUG log")
24 |     logger.info("INFO log")
25 |     logger.warning("WARN log")
26 |     logger.error("ERROR log")
27 |     logger.critical("CRITICAL log")
28 | 
--------------------------------------------------------------------------------
/sparkplus/core/shp_to_parquet.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import geopandas as gpd
3 | 
4 | input_file = sys.argv[1]
5 | file_name = str(input_file)[:-4]
6 | region_code = int(file_name[16:18])  # two-digit sido code sliced from the input file name
7 | gdf = gpd.read_file(input_file)
8 | region_dict = {
9 | 41: "Gyeonggi",
10 | 48: "Gyeongnam",
11 | 47: "Gyeongbuk",
12 | 29: "Gwangju",
13 | 27: "Daegu",
14 | 30: "Daejeon",
15 | 26: "Busan",
16 | 11: "Seoul",
17 | 36: "Sejong",
18 | 28: "Incheon",
19 | 46: "Jeonnam",
20 | 45: "Jeonbuk",
21 | 50: "Jeju",
22 | 44: "Chungnam",
23 | 43: "Chungbuk",
24 | 31: "Ulsan",
25 | 42: "Gangwon",
26 | }
27 | file_name = region_dict[region_code] + ".parquet"
28 | gdf = gdf.set_crs(5174)
29 | gdf = gdf.to_crs(4326)
30 | gdf.to_parquet(file_name)
31 | 
--------------------------------------------------------------------------------
/sparkplus/core/tablename.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from typing import List
3 | 
4 | 
5 | class EPrefix(Enum):
6 | """
7 | Prefixes of Spark+ database.
8 | Get details from https://github.com/SWM-SparkPlus/db-updater#%EB%8F%84%EB%A1%9C%EB%AA%85%EC%A3%BC%EC%86%8C-%ED%85%8C%EC%9D%B4%EB%B8%94
9 | """
10 | 
11 | ADDINFO = "additional_info"
12 | ROADNAME = "roadname_address"
13 | JIBUN = "jibun_address"
14 | INTEGRATED = "integrated_address"
15 | 
16 | 
17 | class ESido(Enum):
18 | """
19 | Enum for Korean metropolitan cities and provinces (si/do).
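
    Example
    -------
    Values come straight from the members below; the helper shown here is
    `get_tablename_by_prefix_and_sido`, defined later in this module.

    >>> ESido.DAEGU.value
    'daegu'
    >>> get_tablename_by_prefix_and_sido(EPrefix.INTEGRATED, ESido.DAEGU)
    'integrated_address_daegu'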
20 | """ 21 | 22 | SEOUL = "seoul" 23 | INCHEON = "incheon" 24 | DAEJEON = "daejeon" 25 | SEJONG = "sejong" 26 | GWANGJU = "gwangju" 27 | DAEGU = "daegu" 28 | ULSAN = "ulsan" 29 | BUSAN = "busan" 30 | JEJU = "jeju" 31 | GYEONGGI = "gyeonggi" 32 | GANGWON = "gangwon" 33 | CHUNGBUK = "chungbuk" 34 | CHUNGNAM = "chungnam" 35 | JEONBUK = "jeonbuk" 36 | JEONNAM = "jeonnam" 37 | GYEONGBUK = "gyeongbuk" 38 | GYEONGNAM = "gyeongnam" 39 | 40 | 41 | def get_tablename_by_prefix_and_sido(prefix: EPrefix, sido: ESido) -> str: 42 | """ 43 | Get tablename of Spark+ database. 44 | 45 | Example 46 | -------- 47 | 48 | >>> target_table = get_table_name(EPrefix.ADDINFO, ESIDO.SEOUL) # additional_info_seoul 49 | >>> target_table = get_table_name(EPrefix.INTEGRATED, ESIDO.BUSAN) # integrated_address_busan 50 | >>> error_table = get_table_name(EPrefix.ADDINFO, "anywhere") # Get AttributeError 51 | """ 52 | 53 | return f"{prefix.value}_{sido.value}" 54 | 55 | 56 | def get_all_tablenames_by_prefix(prefix: EPrefix) -> List[str]: 57 | """ 58 | Get all tablenames by given `EPrefix`. If you want to load all database tables to Spark `DataFrame`, see example below. 59 | It takes a lot of intensive works, 60 | 61 | Example 62 | ------- 63 | 64 | >>> from tablename import get_all_tablenames_by_prefix, EPrefix 65 | >>> get_all_tablenames_by_prefix(EPrefix.INTEGRATED) 66 | ['integrated_address_seoul', 'integrated_address_incheon', 'integrated_address_daejeon', 'integrated_address_sejong', 'integrated_address_gwangju', 'integrated_address_daegu', 'integrated_address_ulsan', 'integrated_address_busan', 'integrated_address_jeju', 'integrated_address_gyeonggi', 'integrated_address_gangwon', 'integrated_address_chungbuk', 'integrated_address_chungnam', 'integrated_address_jeonbuk', 'integrated_address_jeonnam', 'integrated_address_gyeongbuk', 'integrated_address_gyeongnam'] 67 | >>> # Load all data from database 68 | >>> from pyspark.sql import SparkSession 69 | >>> from pyspark.sql.functions import rand 70 | >>> from base import SPDataFrame 71 | >>> ss = SparkSession.builder.config('spark.driver.memory', '14g').getOrCreate() 72 | >>> all_tablenames = get_all_tablenames_by_prefix(EPrefix.INTEGRATED) 73 | >>> SPDataFrame.get_db_df_by_tablenames(ss, all_tablenames, ...).select('sido', 'sigungu', 'eupmyeondong').orderBy(rand()).show() 74 | +----------+-------------+------------+ 75 | | sido| sigungu|eupmyeondong| 76 | +----------+-------------+------------+ 77 | |광주광역시| 동구| 금남로4가| 78 | |인천광역시| 옹진군| 백령면| 79 | | 전라북도|전주시 덕진구| 인후동1가| 80 | | 경기도|용인시 처인구| 포곡읍| 81 | | 전라남도| 해남군| 화산면| 82 | | 강원도| 철원군| 서면| 83 | | 경기도|성남시 수정구| 산성동| 84 | | 경상남도| 산청군| 금서면| 85 | | 전라남도| 보성군| 웅치면| 86 | | 전라남도| 완도군| 약산면| 87 | | 경기도| 이천시| 장호원읍| 88 | | 경기도| 포천시| 가산면| 89 | | 경기도| 부천시| 소사동| 90 | | 경상남도| 창녕군| 영산면| 91 | | 강원도| 원주시| 학성동| 92 | |부산광역시| 강서구| 대저1동| 93 | | 전라남도| 곡성군| 옥과면| 94 | | 경상북도| 울진군| 북면| 95 | | 충청남도| 아산시| 탕정면| 96 | |서울특별시| 중랑구| 면목동| 97 | +----------+-------------+------------+ 98 | """ 99 | return [f"{prefix.value}_{sido.value}" for sido in ESido] 100 | -------------------------------------------------------------------------------- /sparkplus/core/test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "WARNING: An illegal reflective access operation has occurred\n", 13 | "WARNING: Illegal reflective access by 
org.apache.spark.unsafe.Platform (file:/Users/taypark/DEV/apache-spark/spark-3.1.2-bin-hadoop3.2/jars/spark-unsafe_2.12-3.1.2.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 14 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 15 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 16 | "WARNING: All illegal access operations will be denied in a future release\n" 17 | ] 18 | }, 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "21/10/12 05:16:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", 24 | "[('spark.app.name', 't'), ('spark.app.startTime', '1633983376848'), ('spark.executor.id', 'driver'), ('spark.driver.host', 'localhost'), ('spark.sql.warehouse.dir', 'file:/Users/taypark/Repositories/spark-plugin/sparkplus/core/spark-warehouse'), ('spark.driver.extraClassPath', '/Users/taypark/Repositories/spark-plugin/resource/mysql-connector-java-8.0.26/mysql-connector-java-8.0.26.jar'), ('spark.driver.port', '54511'), ('spark.rdd.compress', 'True'), ('spark.serializer.objectStreamReset', '100'), ('spark.driver.maxResultSize', '0'), ('spark.master', 'local[*]'), ('spark.submit.pyFiles', ''), ('spark.submit.deployMode', 'client'), ('spark.driver.memory', '14g'), ('spark.ui.showConsoleProgress', 'true'), ('spark.sql.execution.arrow.pyspark.enabled', 'true'), ('spark.app.id', 'local-1633983377979')]\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "from base import SPDataFrame\n", 30 | "from pyspark.sql import SparkSession\n", 31 | "import os\n", 32 | "import geopandas as gpd\n", 33 | "\n", 34 | "ss_builder = SparkSession.builder.appName('t')\n", 35 | "\n", 36 | "ss_builder.config('spark.driver.extraClassPath',\n", 37 | " '/Users/taypark/Repositories/spark-plugin/resource'\n", 38 | " '/mysql-connector-java-8.0.26/mysql-connector-java-8.0.26.jar')\\\n", 39 | " .config('spark.driver.memory', '14g')\\\n", 40 | " .config('spark.sql.execution.arrow.pyspark.enabled', 'true')\\\n", 41 | " .config('spark.driver.maxResultSize', 0)\n", 42 | "\n", 43 | "ss = ss_builder.getOrCreate()\n", 44 | "\n", 45 | "# print(ss.sparkContext.getConf().getAll())\n", 46 | "\n", 47 | "gyeonggi_table_df = SPDataFrame.get_db_df_by_tablenames(ss, ['integrated_address_gyeonggi'],\n", 48 | " driver='com.mysql.cj.jdbc.Driver',\n", 49 | " url='jdbc:mysql://localhost:3306/sparkplus',\n", 50 | " user='root',\n", 51 | " password='sparkplus')\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 4, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "root\n", 64 | " |-- manage_number: string (nullable = true)\n", 65 | " |-- roadname_code: string (nullable = true)\n", 66 | " |-- zipcode: string (nullable = true)\n", 67 | " |-- sido: string (nullable = true)\n", 68 | " |-- sigungu: string (nullable = true)\n", 69 | " |-- eupmyeondong: string (nullable = true)\n", 70 | " |-- bupjungli: string (nullable = true)\n", 71 | " |-- roadname: string (nullable = true)\n", 72 | " |-- is_basement: string (nullable = true)\n", 73 | " |-- building_primary_number: integer (nullable = true)\n", 74 | " |-- building_secondary_number: integer (nullable = true)\n", 75 | " |-- bupjungdong_code: string (nullable = true)\n", 76 | "\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "gyeonggi_table_df.printSchema()" 82 | ] 83 | }, 84 | { 85 | "cell_type": 
"code", 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# SPDataFrame(gyeonggi_table_df).address_to_h3(addr_col_name='is_basement').show()\n", 91 | "import geopandas as gpd\n", 92 | "from pyspark.sql.functions import lit\n", 93 | "\n", 94 | "# gyeonggi_table_df.limit(1)\\\n", 95 | "# .withColumn('point', lit(gpd.points_from_xy(37.3211047, 126.9889655))).show()\n", 96 | "\n", 97 | "PREFIX = (37.32, 126.98)\n", 98 | "\n", 99 | "from faker import Faker\n", 100 | "\n", 101 | "fake = Faker()\n", 102 | "\n", 103 | "xy_box = []\n", 104 | "for i in range(50):\n", 105 | " x, y = PREFIX\n", 106 | " px, py = str(x), str(y)\n", 107 | " px, py = px + str(fake.random_int(0, 99999)), py + str(fake.random_int(0, 99999))\n", 108 | " xy_box.append((px, py))\n", 109 | "\n", 110 | " \n", 111 | " " 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 5, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | " PNU JIBUN BCHK SGG_OID COL_ADM_SE \\\n", 124 | "0 4111710300101670003 167-3전 1 270697 41110 \n", 125 | "1 4111113800101980001 198-1전 1 270698 41110 \n", 126 | "2 4111710300201190028 산119-28임 1 270699 41110 \n", 127 | "3 4111710300109070001 907-1잡 1 698166 41110 \n", 128 | "4 4111710300101770017 177-17임 1 270701 41110 \n", 129 | "... ... ... ... ... ... \n", 130 | "5078153 4183040033101950006 195-6 전 1 1892811 41830 \n", 131 | "5078154 4183039521200440024 산44-24 임 1 1893383 41830 \n", 132 | "5078155 4183036023102440000 244전 1 1885066 41830 \n", 133 | "5078156 4183036023102440004 244-4 전 1 1885067 41830 \n", 134 | "5078157 4183036023102440003 244-3 전 1 1885068 41830 \n", 135 | "\n", 136 | " geometry \n", 137 | "0 POLYGON ((127.05529 37.28866, 127.05533 37.288... \n", 138 | "1 POLYGON ((127.01543 37.32614, 127.01547 37.326... \n", 139 | "2 POLYGON ((127.05120 37.28951, 127.05120 37.289... \n", 140 | "3 POLYGON ((127.03676 37.29320, 127.03723 37.294... \n", 141 | "4 POLYGON ((127.05132 37.28945, 127.05170 37.289... \n", 142 | "... ... \n", 143 | "5078153 POLYGON ((127.59218 37.54486, 127.59216 37.544... \n", 144 | "5078154 POLYGON ((127.64311 37.46407, 127.64309 37.464... \n", 145 | "5078155 POLYGON ((127.65476 37.52167, 127.65476 37.521... \n", 146 | "5078156 POLYGON ((127.65500 37.52165, 127.65498 37.521... \n", 147 | "5078157 POLYGON ((127.65516 37.52165, 127.65516 37.521... 
\n", 148 | "\n", 149 | "[5078158 rows x 6 columns]\n", 150 | "21/10/12 08:15:01 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 928702 ms exceeds timeout 120000 ms\n", 151 | "21/10/12 08:15:01 WARN SparkContext: Killing executors is not supported by current scheduler.\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "coord_df = ss.createDataFrame(xy_box)\n", 157 | "coord_df = coord_df.withColumnRenamed('_1', 'lat').withColumnRenamed('_2', 'lng')\n", 158 | "\n", 159 | "import os \n", 160 | "PARQUET_PATH = os.getcwd() + '/../../resource/Gyeonggi.parquet'\n", 161 | "\n", 162 | "gyeonggi_gdf = gpd.read_parquet(PARQUET_PATH)\n", 163 | "\n", 164 | "print(gyeonggi_gdf)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 2, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "ename": "NameError", 174 | "evalue": "name 'coord_df' is not defined", 175 | "output_type": "error", 176 | "traceback": [ 177 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 178 | "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", 179 | "\u001b[0;32m/var/folders/z5/59xjw6ps4m95mnplymd3q3hc0000gn/T/ipykernel_49531/2756843327.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# 개발용\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mpandas_coord_to_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcoord_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoPandas\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;31m# print(pandas_coord_to_df)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 180 | "\u001b[0;31mNameError\u001b[0m: name 'coord_df' is not defined" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "# 개발용\n", 186 | "\n", 187 | "pandas_coord_to_df = coord_df.toPandas()\n", 188 | "\n", 189 | "# print(pandas_coord_to_df)\n", 190 | "print(pandas_coord_to_df)\n", 191 | "\n", 192 | "# point_sdf_to_geodataframe = gpd.GeoDataFrame(coord_df, geometry=gpd.points_from_xy(pandas_coord_to_df.lat, pandas_coord_to_df.lng))\n", 193 | "\n", 194 | "# print(point_sdf_to_geodataframe)\n", 195 | "\n", 196 | "temp_list = []\n", 197 | "\n", 198 | "# for i in point_sdf_to_geodataframe.index:\n", 199 | "# for j in gyeonggi_gdf.index:\n", 200 | "# if gyeonggi_gdf.geometry[j].contains(point_sdf_to_geodataframe[i]):\n", 201 | "# temp_list.append(gyeonggi_gdf.EMD_CD)\n", 202 | "\n", 203 | "print(temp_list)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [] 212 | } 213 | ], 214 | "metadata": { 215 | "interpreter": { 216 | "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" 217 | }, 218 | "kernelspec": { 219 | "display_name": "Python 3.9.6 64-bit", 220 | "name": "python3" 221 | }, 222 | "language_info": { 223 | "codemirror_mode": { 224 | "name": "ipython", 225 | "version": 3 226 | }, 227 | "file_extension": ".py", 228 | "mimetype": "text/x-python", 229 | "name": "python", 230 | "nbconvert_exporter": "python", 231 | "pygments_lexer": "ipython3", 232 | "version": "3.9.6" 233 | }, 234 | "orig_nbformat": 4 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 2 238 | } 239 | 
-------------------------------------------------------------------------------- /sparkplus/core/udfs.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.functions import udf 2 | from pyspark.sql.types import IntegerType, StringType, ArrayType 3 | 4 | sido_short_list = [ 5 | "서울", 6 | "부산", 7 | "대구", 8 | "인천", 9 | "광주", 10 | "대전", 11 | "울산", 12 | "세종", 13 | "경기", 14 | "강원", 15 | "충북", 16 | "충남", 17 | "전북", 18 | "전남", 19 | "경북", 20 | "경남", 21 | "제주", 22 | ] 23 | 24 | sido_long_list = [ 25 | "서울특별시", 26 | "부산광역시", 27 | "대구광역시", 28 | "인천광역시", 29 | "광주광역시", 30 | "대전광역시", 31 | "울산광역시", 32 | "세종특별자치시", 33 | "경기도", 34 | "강원도", 35 | "충청북도", 36 | "충청남도", 37 | "전라북도", 38 | "전라남도", 39 | "경상북도", 40 | "경상남도", 41 | "제주특별자치도", 42 | ] 43 | sido_dictionary = dict(zip(sido_short_list, sido_long_list)) 44 | sido_reverse_dictionary = dict(zip(sido_long_list, sido_short_list)) 45 | 46 | 47 | @udf(IntegerType()) 48 | def where_is_sido(split): 49 | for i in range(len(split)): 50 | if sido_dictionary.get(split[i]) or sido_reverse_dictionary.get(split[i]): 51 | return i 52 | return -1 53 | 54 | 55 | @udf(ArrayType(StringType())) 56 | def cleanse_split(idx, split): 57 | if idx != -1: 58 | return split[idx:] 59 | return split 60 | 61 | 62 | @udf(ArrayType(StringType())) 63 | def process_roadname(split): 64 | for i in range(len(split)): 65 | data = split[i] 66 | if data[-1].isdigit() and ("로" in data or "길" in data): 67 | result_li = list() 68 | for j in reversed(range(len(data))): 69 | if not data[j].isdigit(): 70 | result_li.append(data[: j + 1]).append(data[j + 1 :]) 71 | return split[:i] + result_li + split[i + 1 :] 72 | return split 73 | 74 | @udf(ArrayType(StringType())) 75 | def process_numaddr(split): 76 | if split is None: 77 | return "None" 78 | 79 | data = split[2] 80 | return data 81 | 82 | 83 | 84 | @udf(StringType()) 85 | def extract_sido(split): 86 | 87 | if split is None: 88 | return "None" 89 | 90 | for data in split: 91 | if data =='': 92 | continue 93 | if sido_dictionary.get(data): 94 | return sido_dictionary[data] 95 | elif sido_reverse_dictionary.get(data): 96 | return data 97 | return "None" 98 | 99 | 100 | @udf(StringType()) 101 | def extract_sigungu(split): 102 | 103 | if split is None: 104 | return "None" 105 | 106 | result = str() 107 | flag = False 108 | for data in split: 109 | if data =='': 110 | continue 111 | if not sido_reverse_dictionary.get(data): 112 | sigungu = data[-1] 113 | if (sigungu == "시") or (sigungu == "군") or (sigungu == "구"): 114 | if not flag: 115 | result += data 116 | flag = True 117 | else: 118 | result += " " + data 119 | if flag: 120 | return result 121 | return "None" 122 | 123 | """ 124 | @udf(StringType()) 125 | def extract_eupmyeon(split): 126 | if split is None: 127 | return "None" 128 | 129 | for data in split: 130 | if data == "": 131 | continue 132 | if data[-1] == "읍" or data[-1] == "면": 133 | return data 134 | return "None" 135 | """ 136 | 137 | @udf(StringType()) 138 | def extract_eupmyeondong(split): 139 | if split is None: 140 | return "None" 141 | 142 | for data in split: 143 | if data == "": 144 | continue 145 | if data[-1] == "읍" or data[-1] == "면" or data[-1] == "동" or data[-1] == "가" and not data[0].isdigit(): 146 | return data 147 | 148 | return "None" 149 | 150 | 151 | @udf(StringType()) 152 | def extract_dong(split): 153 | if split is None: 154 | return "None" 155 | for data in split: 156 | if data == "": 157 | continue 158 | if data[-1] == "동" and not data[0].isdigit(): 159 | return data 
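    # Fall-through: no token in the split address ends with "동" (tokens that start
    # with a digit are skipped), so signal "not found" with the string "None".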
160 | return "None" 161 | 162 | 163 | @udf(StringType()) 164 | def extract_roadname(split): 165 | if split is None: 166 | return "None" 167 | for data in split: 168 | if data == "": 169 | continue 170 | if data[-1] == "로" or data[-1] == "길": 171 | return data 172 | return "None" 173 | 174 | 175 | @udf(StringType()) 176 | def extract_building_primary_number(split, roadname): 177 | if split is None: 178 | return "None" 179 | for i in range(len(split)): 180 | if split[i - 1] == roadname: 181 | data = split[i] 182 | if data.isdigit(): 183 | return data 184 | elif "-" in data: 185 | for j in range(len(data)): 186 | if data[j] == "-": 187 | return data[:j] 188 | return "None" 189 | 190 | @udf(StringType()) 191 | def extract_jibun_primary_number(split, roadname): 192 | if split is None: 193 | return "None" 194 | if roadname not in split: 195 | data = split[-1] 196 | if data.isdigit(): 197 | return data 198 | elif "-" in data: 199 | for j in range(len(data)): 200 | if data[j] == "-": 201 | return data[:j] 202 | return "None" 203 | 204 | @udf(StringType()) 205 | def extract_jibun_secondary(split): 206 | if split is None: 207 | return "None" 208 | data = split[3] 209 | for i in range(len(data)): 210 | if data[i] == "-": 211 | return data[i+1:] 212 | 213 | @udf(StringType()) 214 | def extract_sigungu_code(bupjungdong_code): 215 | if bupjungdong_code is None or bupjungdong_code == "None": 216 | return "None" 217 | data = bupjungdong_code[:5] 218 | return data -------------------------------------------------------------------------------- /sparkplus/core/utils.py: -------------------------------------------------------------------------------- 1 | import geopandas as gpd 2 | from typing import List, Union 3 | from pyspark.sql.session import SparkSession 4 | 5 | def load_tables( 6 | sparkSession: SparkSession, tablenames: Union[str, List[str]], **kwargs 7 | ): 8 | """ 9 | Summary 10 | ------- 11 | 테이블명을 기반으로 Spark DataFrame을 반환합니다. 
12 | 13 | Parameter 14 | ---- 15 | sparkSession: Active Spark Session 16 | tablenames: DataFrame으로 만들 테이블명 17 | **kwargs: `driver`, `url`, `user`, `password` 18 | 19 | Raises: 20 | ValueError 21 | 22 | Returns: 23 | `DataFrame`s from database 24 | 25 | 26 | Usage 27 | ----- 28 | >>> import SPDataFrame 29 | >>> ss = SparkSession.builder.getOrCreate() 30 | >>> tablenames = ['integrated_address_seoul', 'integrated_address_incheon', 'integrated_address_gyeonggi'] 31 | >>> table_dfs = SPDataFrame(ss, tablenames, 32 | driver='com.mysql.cj.jdbc.Driver', 33 | url='jdbc:mysql://localhost:3306/sparkplus', 34 | user='root', 35 | password='password' 36 | ) 37 | >>> table_dfs.select('roadname_code', 'sido', 'sigungu', 'eupmyeondong').show() 38 | +-------------+----------+-------------+------------+ 39 | |roadname_code| sido| sigungu|eupmyeondong| 40 | +-------------+----------+-------------+------------+ 41 | | 261103125011|부산광역시| 중구| 영주동| 42 | | 261104006006|부산광역시| 중구| 영주동| 43 | | 261104006006|부산광역시| 중구| 영주동| 44 | | 261104006006|부산광역시| 중구| 영주동| 45 | | 261103125011|부산광역시| 중구| 영주동| 46 | | 111104100289|서울특별시| 종로구| 청운동| 47 | | 111104100289|서울특별시| 종로구| 청운동| 48 | | 111103100014|서울특별시| 종로구| 청운동| 49 | | 111104100289|서울특별시| 종로구| 청운동| 50 | | 111104100289|서울특별시| 종로구| 청운동| 51 | | 411114322017| 경기도|수원시 장안구| 파장동| 52 | | 411114322017| 경기도|수원시 장안구| 파장동| 53 | | 411114322017| 경기도|수원시 장안구| 파장동| 54 | | 411114322017| 경기도|수원시 장안구| 파장동| 55 | | 411114322017| 경기도|수원시 장안구| 파장동| 56 | +-------------+----------+-------------+------------+ 57 | """ 58 | sess_conf = sparkSession.sparkContext.getConf().getAll() 59 | 60 | # If SparkConf doesn't contain MySQL connector, raise `ValueError` 61 | jdbc_driver_flag = False 62 | 63 | # If you use `spark.jars.packages`, value should like `mysql:mysql-connector-java:YOUR_MYSQL_VERSION` 64 | available_configs = [ 65 | "spark.jars", 66 | "spark.driver.extraClassPath", 67 | "spark.jars.packages", 68 | ] 69 | 70 | for (conf_key, conf_val) in sess_conf: 71 | if conf_key in available_configs and conf_val.__contains__("mysql"): 72 | jdbc_driver_flag = True 73 | break 74 | 75 | if not jdbc_driver_flag: 76 | raise ValueError( 77 | "[SPARKPLUS_MYSQL_CONNECTOR_ERR] " 78 | "Your spark session seems like it doesn't contains mysql-connector-java path to connect mysql database. " 79 | "Please specify it to use SparkPlus package properly.\n\n" 80 | "$ spark-submit --jars \n\n" 81 | "In programming way, if you have mysql-connector jar file locally, set spark configuration like\n\n" 82 | ">>> ss = SparkSession.builder.config('spark.jars', MYSQL_JAR_PATH)\n\n" 83 | "or if you don't,\n\n" 84 | ">>> ss = SparkSession.builder.config('spark.jars.packages', 'mysql:mysql-connector-java:YOUR_MYSQL_VERSION')\n\n" 85 | "Check https://spark.apache.org/docs/latest/configuration.html for detail." 
86 | ) 87 | 88 | ss_read = sparkSession.read.format("jdbc") 89 | 90 | # set DB options such as driver, url, user, password 91 | for opt_key, opt_val in kwargs.items(): 92 | ss_read.option(opt_key, opt_val) 93 | 94 | if isinstance(tablenames, str): 95 | return ss_read.option("dbtable", tablenames).load() 96 | else: 97 | dfs = ss_read.option("dbtable", tablenames.pop()).load() 98 | 99 | while tablenames: 100 | dfs = dfs.union(ss_read.option("dbtable", tablenames.pop()).load()) 101 | 102 | return dfs 103 | 104 | def load_gdf(shp_path , epsg): 105 | gdf = gpd.read_file(shp_path, encoding="euc-kr") 106 | gdf.crs = f'epsg:{epsg}' 107 | gdf = gdf.to_crs(epsg=4326) 108 | 109 | return gdf -------------------------------------------------------------------------------- /sparkplus/dependencies/__init__.py: -------------------------------------------------------------------------------- 1 | from .spark import * 2 | 3 | # from .logging import * 4 | from .tablename import ESido, EPrefix, get_tablename_by_prefix_and_sido 5 | 6 | __all__ = ["start_spark", "ESido", "EPrefix", "get_tablename_by_prefix_and_sido"] 7 | -------------------------------------------------------------------------------- /sparkplus/dependencies/logging.py: -------------------------------------------------------------------------------- 1 | class Log4j(object): 2 | """ 3 | :param spark: SparkSession object. 4 | """ 5 | 6 | def __init__(self, spark): 7 | # get spark app details with which to prefix all messages 8 | conf = spark.sparkContext.getConf() 9 | app_id = conf.get("spark.app.id") 10 | app_name = conf.get("spark.app.name") 11 | 12 | log4j = spark._jvm.org.apache.log4j 13 | message_prefix = "<" + app_name + " " + app_id + ">" 14 | self.logger = log4j.LogManager.getLogger(message_prefix) 15 | 16 | def error(self, message): 17 | """Log an error. 18 | :param: Error message to write to log 19 | :return: None 20 | """ 21 | self.logger.error(message) 22 | return None 23 | 24 | def warn(self, message): 25 | """Log an warning. 26 | :param: Error message to write to log 27 | :return: None 28 | """ 29 | self.logger.warn(message) 30 | return None 31 | 32 | def info(self, message): 33 | """Log information. 34 | :param: Information message to write to log 35 | :return: None 36 | """ 37 | self.logger.info(message) 38 | return None 39 | -------------------------------------------------------------------------------- /sparkplus/dependencies/spark.py: -------------------------------------------------------------------------------- 1 | import __main__ 2 | 3 | from os import environ, listdir, path 4 | import json 5 | from pyspark import SparkFiles 6 | from pyspark.sql import SparkSession 7 | 8 | 9 | def start_spark( 10 | app_name="my_spark_app", 11 | master="local[*]", 12 | jar_packages=[], 13 | files=[], 14 | spark_config={}, 15 | ): 16 | """ 17 | :param app_name: Name of Spark app. 18 | :param master: Cluster connection details (defaults to local[*]). 19 | :param jar_packages: List of Spark JAR package names. 20 | :param files: List of files to send to Spark cluster (master and 21 | workers). 22 | :param spark_config: Dictionary of config key-value pairs. 23 | :return: A tuple of references to the Spark session, logger and 24 | config dict (only if available). 
25 | """ 26 | 27 | # detect execution environment 28 | flag_repl = not (hasattr(__main__, "__file__")) 29 | flag_debug = "DEBUG" in environ.keys() 30 | 31 | if not (flag_repl or flag_debug): 32 | # get Spark session factory 33 | print("without flag") 34 | spark_builder = SparkSession.builder.appName(app_name) 35 | spark_builder.config( 36 | "spark.jars", 37 | "/Users/hwan/dev/mysql-connector-java-8.0.26/mysql-connector-java-8.0.26.jar", 38 | ) 39 | else: 40 | # get Spark session factory 41 | spark_builder = SparkSession.builder.master(master).appName(app_name) 42 | 43 | # create Spark JAR packages string 44 | spark_jars_packages = ",".join(list(jar_packages)) 45 | spark_builder.config("spark.jars.packages", spark_jars_packages) 46 | 47 | spark_files = ",".join(list(files)) 48 | spark_builder.config("spark.files", spark_files) 49 | 50 | # add other config params 51 | for key, val in spark_config.items(): 52 | spark_builder.config(key, val) 53 | 54 | # create session and retrieve Spark logger object 55 | spark_sess = spark_builder.getOrCreate() 56 | # spark_logger = Log4j(spark_sess) 57 | 58 | # get config file if sent to cluster with --files 59 | spark_files_dir = SparkFiles.getRootDirectory() 60 | config_files = [ 61 | filename 62 | for filename in listdir(spark_files_dir) 63 | if filename.endswith("config.json") 64 | ] 65 | 66 | if config_files: 67 | path_to_config_file = path.join(spark_files_dir, config_files[0]) 68 | with open(path_to_config_file, "r") as config_file: 69 | config_dict = json.load(config_file) 70 | # spark_logger.warn("loaded config from " + config_files[0]) 71 | else: 72 | # spark_logger.warn("no config file found") 73 | config_dict = None 74 | 75 | return spark_sess, config_dict 76 | -------------------------------------------------------------------------------- /sparkplus/dependencies/tablename.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class EPrefix(Enum): 5 | ADDINFO = "additional_info" 6 | ROADNAME = "roadname_address" 7 | JIBUN = "jibun_address" 8 | INTEGRATED = "integrated_address" 9 | 10 | 11 | class ESido(Enum): 12 | SEOUL = "seoul" 13 | INCHEON = "incheon" 14 | DAEJEON = "daejeon" 15 | SEJONG = "sejong" 16 | GWANGJU = "gwangju" 17 | DAEGU = "daegu" 18 | ULSAN = "ulsan" 19 | BUSAN = "busan" 20 | JEJU = "jeju" 21 | GYEONGGI = "gyeonggi" 22 | GANGWON = "gangwon" 23 | CHUNGBUK = "chungbuk" 24 | CHUNGNAM = "chungnam" 25 | JEONBUK = "jeonbuk" 26 | JEONNAM = "jeonnam" 27 | GYEONGBUK = "gyeongbuk" 28 | GYEONGNAM = "gyeongnam" 29 | 30 | 31 | def get_tablename_by_prefix_and_sido(prefix: EPrefix, sido: ESido) -> str: 32 | """ 33 | Get tablename of Spark+ database. 
34 | 35 | ## Examples 36 | 37 | >>> target_table = get_table_name(EPrefix.ADDINFO, ESIDO.SEOUL) # additional_info_seoul 38 | >>> target_table = get_table_name(EPrefix.INTEGRATED, ESIDO.BUSAN) # integrated_address_busan 39 | >>> error_table = get_table_name(EPrefix.ADDINFO, "anywhere") # Get AttributeError 40 | """ 41 | 42 | return f"{prefix.value}_{sido.value}" 43 | -------------------------------------------------------------------------------- /sparkplus/jobs/__init__.py: -------------------------------------------------------------------------------- 1 | from .conversion import * 2 | from .load_database import * 3 | 4 | __all__ = [ 5 | "join_with_emd", 6 | "join_with_h3", 7 | "join_with_table", 8 | "shp_init", 9 | "load_tables", 10 | ] 11 | -------------------------------------------------------------------------------- /sparkplus/jobs/conversion.py: -------------------------------------------------------------------------------- 1 | from typing import overload 2 | from geopandas.array import points_from_xy 3 | from geopandas.tools.sjoin import sjoin 4 | from shapely.geometry import Point, Polygon 5 | from pyspark.sql import Row 6 | from pyspark.sql.functions import concat, lit, udf 7 | from pyspark.sql.types import * 8 | from pyspark.sql.functions import col, pandas_udf 9 | 10 | import geopandas as gpd 11 | import pandas as pd 12 | import h3 13 | 14 | import os 15 | 16 | 17 | def shp_init(): 18 | shp = gpd.read_file( 19 | os.path.dirname(os.path.abspath(__file__)) 20 | + "/../resource/EMD_202101/TL_SCCO_EMD.shp" 21 | ) 22 | shp = shp.to_crs(4326) 23 | return shp 24 | 25 | 26 | def coord_to_dong(spark, gdf, lng, lat): 27 | addr = gdf[gdf.geometry.contains(Point(lng, lat)) == True] 28 | addr_drop_geom = addr.drop(columns="geometry") 29 | sdf = spark.createDataFrame(addr_drop_geom) 30 | sdf = sdf.select( 31 | concat(sdf.EMD_CD, lit("00")).alias("EMD_CD"), "EMD_ENG_NM", "EMD_KOR_NM" 32 | ) 33 | return sdf 34 | 35 | 36 | def coord_to_point(spark, df, lng_colname, lat_colname): 37 | df["temp"] = [Point(lon, lat) for lon, lat in df[[lng_colname, lat_colname]].values] 38 | df["point"] = pd.Series( 39 | map(lambda geom: str(geom.to_wkt()), df["temp"]), index=df.index, dtype="str" 40 | ) 41 | tmp = df.drop("temp", axis=1) 42 | res_df = pd.DataFrame(tmp) 43 | res_sdf = spark.createDataFrame(tmp).cache() 44 | del tmp 45 | return res_sdf, res_df 46 | 47 | 48 | def coord_file_to_emd(spark, gdf, filepath, lng_colname, lat_colname): 49 | _gdf = ( 50 | spark.read.option("header", True) 51 | .format("csv") 52 | .load(filepath, encoding="euc-kr") 53 | ) 54 | # _gdf = spark.createDataFrame(_file) 55 | _gdf.show() 56 | pdf = _gdf.select("*").toPandas() 57 | g_df = gpd.GeoDataFrame( 58 | pdf, geometry=gpd.points_from_xy(pdf[lng_colname], pdf[lat_colname]) 59 | ) 60 | li = list() 61 | for i in g_df.index: 62 | for j in gdf.index: 63 | if gdf.geometry[j].contains(g_df.geometry[i]): 64 | li.append(gdf.EMD_CD[j]) 65 | g_df.insert(len(g_df.columns), "EMD_CD", li) 66 | g_df = spark.createDataFrame(g_df) 67 | return g_df 68 | 69 | 70 | def coord_to_emd(spark, gdf, sdf, lng_colname, lat_colname): 71 | 72 | pdf = sdf.select("*").toPandas() 73 | # pdf = sdf 74 | g_df = gpd.GeoDataFrame( 75 | pdf, geometry=gpd.points_from_xy(pdf[lng_colname], pdf[lat_colname]) 76 | ) 77 | li = list() 78 | for i in g_df.index: 79 | for j in gdf.index: 80 | if gdf.geometry[j].contains(g_df.geometry[i]): 81 | print(g_df.geometry[i], gdf.EMD_CD[j]) 82 | li.append(gdf.EMD_CD[j]) 83 | g_df.insert(len(g_df.columns), "EMD_CD", li) 84 | g_df = 
spark.createDataFrame(g_df) 85 | return g_df 86 | 87 | 88 | @overload 89 | def coord_to_emd(spark, gdf, lng, lat, lng_colname="lng", lat_colname="lat"): 90 | mySchema = StructType( 91 | [ 92 | StructField(lng_colname, DoubleType(), True), 93 | StructField(lat_colname, DoubleType(), True), 94 | ] 95 | ) 96 | myRow = Row(lng, lat) 97 | myDf = spark.createDataFrame([myRow], mySchema) 98 | result = coord_df_to_emd(spark, gdf, myDf, lng_colname, lat_colname) 99 | return result 100 | 101 | 102 | def to_polygon(l): 103 | return Polygon(h3.h3_to_geo_boundary(l, geo_json=True)) 104 | 105 | 106 | def coord_to_h3(lng, lat, h3_level): 107 | my_h3 = h3.geo_to_h3(lat, lng, h3_level) 108 | h3_df = gpd.GeoDataFrame({"h3": [my_h3, my_h3]}) 109 | h3_df["geometry"] = h3_df["h3"].apply(to_polygon) 110 | h3_df.crs = {"init": "epsg:4326"} 111 | return h3_df 112 | 113 | 114 | def coord_to_jibun(spark, gdf, table_df, lng, lat): 115 | emd_df = coord_to_emd(spark, gdf, lng, lat).toPandas() 116 | emd_cd = emd_df.iloc[0]["EMD_CD"] + "00" 117 | jibun_df = table_df[table_df["bupjungdong_code"] == emd_cd].toPandas() 118 | print(jibun_df) 119 | return jibun_df 120 | 121 | 122 | def coord_to_roadname( 123 | spark, gdf, table_jibun, table_roadname, table_roadname_code, lng, lat 124 | ): 125 | jibun_df = coord_to_jibun(spark, gdf, table_jibun, lng, lat) 126 | manage_number = jibun_df.iloc[0]["manage_number"] 127 | roadname_code_df = table_roadname[ 128 | table_roadname["manage_number"] == manage_number 129 | ].toPandas() 130 | roadname_code = roadname_code_df.iloc[0]["roadname_code"] 131 | result = table_roadname_code[table_roadname_code["roadname_code"] == roadname_code] 132 | return result 133 | 134 | 135 | def create_sjoin_emd(gdf_poly, join_column_name): 136 | def sjoin_settlement(x, y): 137 | gdf_temp = gpd.GeoDataFrame( 138 | data=[[x] for x in range(len(x))], geometry=gpd.points_from_xy(x, y) 139 | ).set_crs(epsg=4326, inplace=True) 140 | settlement = gpd.sjoin(gdf_temp, gdf_poly, how="left", op="within") 141 | settlement = settlement.drop_duplicates(subset="geometry") 142 | # print(settlement.agg({'EMD_CD': lambda x: str(x) + '00'}).reset_index().loc[:, join_column_name].astype('str')) 143 | return ( 144 | settlement.agg({"EMD_CD": lambda x: str(x) + "00"}) 145 | .reset_index() 146 | .loc[:, join_column_name] 147 | .astype("str") 148 | ) 149 | 150 | return pandas_udf(sjoin_settlement, returnType=StringType()) 151 | 152 | 153 | def join_with_emd(gdf_poly, sdf, x_colname, y_colname): 154 | sjoin_udf = create_sjoin_emd(gdf_poly, "EMD_CD") 155 | res_df = sdf.withColumn("EMD_CD", sjoin_udf(sdf[x_colname], sdf[y_colname])) 156 | return res_df 157 | 158 | 159 | def join_with_h3(sdf, x_colname, y_colname, h3_level): 160 | udf_to_h3 = udf( 161 | lambda x, y: h3.geo_to_h3(float(x), float(y), h3_level), returnType=StringType() 162 | ) 163 | res_h3 = sdf.withColumn("h3", udf_to_h3(sdf[y_colname], sdf[x_colname])) 164 | return res_h3 165 | 166 | 167 | def join_with_table(gdf_poly, sdf, table_df, x_colname, y_colname): 168 | temp_df = join_with_emd(gdf_poly, sdf, x_colname, y_colname) 169 | table_df = table_df.dropDuplicates(["bupjungdong_code"]) 170 | res_df = temp_df.join( 171 | table_df, [temp_df.EMD_CD == table_df.bupjungdong_code], how="left_outer" 172 | ) 173 | 174 | return res_df 175 | # .select(temp_df.EMD_CD, table_df.sido).show() 176 | 177 | # res_df.show() 178 | -------------------------------------------------------------------------------- /sparkplus/jobs/etl_job.py: 
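
# A minimal usage sketch for the conversion helpers defined above in
# sparkplus/jobs/conversion.py (shp_init, join_with_emd, join_with_h3), kept
# separate from etl_job.py below. The import path, sample coordinates and the
# H3 level are illustrative assumptions; shp_init() expects the bundled EMD
# shapefile under sparkplus/resource/EMD_202101/.
from pyspark.sql import SparkSession
from sparkplus.jobs.conversion import shp_init, join_with_emd, join_with_h3

spark = SparkSession.builder.appName("conversion_demo").getOrCreate()

# two sample points in Daegu, given as (longitude, latitude)
sdf = spark.createDataFrame(
    [(128.6103158, 35.87343028), (128.6099071, 35.87334197)], ["lon", "lat"]
)

emd_gdf = shp_init()                                  # EMD polygons as a GeoDataFrame (EPSG:4326)
with_emd = join_with_emd(emd_gdf, sdf, "lon", "lat")  # adds an EMD_CD column via spatial join
with_h3 = join_with_h3(sdf, "lon", "lat", 9)          # adds a level-9 H3 cell id column
with_emd.show()
with_h3.show()
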
-------------------------------------------------------------------------------- 1 | from pyspark.sql import Row 2 | from pyspark.sql.functions import col, concat_ws, lit 3 | 4 | from dependencies.spark import start_spark 5 | 6 | 7 | def main(): 8 | """Main ETL script definition. 9 | :return: None 10 | """ 11 | # start Spark application and get Spark session, logger and config 12 | spark, log, config = start_spark( 13 | app_name="my_etl_job", files=["configs/etl_config.json"] 14 | ) 15 | 16 | # log that main ETL job is starting 17 | log.warn("etl_job is up-and-running") 18 | 19 | # execute ETL pipeline 20 | data = extract_data(spark) 21 | data_transformed = transform_data(data, config["steps_per_floor"]) 22 | load_data(data_transformed) 23 | 24 | # log the success and terminate Spark application 25 | log.warn("test_etl_job is finished") 26 | spark.stop() 27 | return None 28 | 29 | 30 | def extract_data(spark): 31 | """Load data from Parquet file format. 32 | :param spark: Spark session object. 33 | :return: Spark DataFrame. 34 | """ 35 | df = spark.read.parquet("tests/test_data/employees") 36 | 37 | return df 38 | 39 | 40 | def transform_data(df, steps_per_floor_): 41 | """Transform original dataset. 42 | :param df: Input DataFrame. 43 | :param steps_per_floor_: The number of steps per-floor at 43 Tanner 44 | Street. 45 | :return: Transformed DataFrame. 46 | """ 47 | df_transformed = df.select( 48 | col("id"), 49 | concat_ws(" ", col("first_name"), col("second_name")).alias("name"), 50 | (col("floor") * lit(steps_per_floor_)).alias("steps_to_desk"), 51 | ) 52 | 53 | return df_transformed 54 | 55 | 56 | def load_data(df): 57 | """Collect data locally and write to CSV. 58 | :param df: DataFrame to print. 59 | :return: None 60 | """ 61 | (df.coalesce(1).write.csv("loaded_data", mode="overwrite", header=True)) 62 | return None 63 | 64 | 65 | def create_test_data(spark, config): 66 | """Create test data. 67 | This function creates both both pre- and post- transformation data 68 | saved as Parquet files in tests/test_data. This will be used for 69 | unit tests as well as to load as part of the example ETL job. 
70 | :return: None 71 | """ 72 | # create example data from scratch 73 | local_records = [ 74 | Row(id=1, first_name="Dan", second_name="Germain", floor=1), 75 | Row(id=2, first_name="Dan", second_name="Sommerville", floor=1), 76 | Row(id=3, first_name="Alex", second_name="Ioannides", floor=2), 77 | Row(id=4, first_name="Ken", second_name="Lai", floor=2), 78 | Row(id=5, first_name="Stu", second_name="White", floor=3), 79 | Row(id=6, first_name="Mark", second_name="Sweeting", floor=3), 80 | Row(id=7, first_name="Phil", second_name="Bird", floor=4), 81 | Row(id=8, first_name="Kim", second_name="Suter", floor=4), 82 | ] 83 | 84 | df = spark.createDataFrame(local_records) 85 | 86 | # write to Parquet file format 87 | (df.coalesce(1).write.parquet("tests/test_data/employees", mode="overwrite")) 88 | 89 | # create transformed version of data 90 | df_tf = transform_data(df, config["steps_per_floor"]) 91 | 92 | # write transformed version of data to Parquet 93 | ( 94 | df_tf.coalesce(1).write.parquet( 95 | "tests/test_data/employees_report", mode="overwrite" 96 | ) 97 | ) 98 | 99 | return None 100 | 101 | 102 | # entry point for PySpark ETL application 103 | if __name__ == "__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /sparkplus/jobs/load_database.py: -------------------------------------------------------------------------------- 1 | """ 2 | # 부가정보 테이블 3 | additional_info_tables = [ 4 | 'additional_info_busan', 5 | 'additional_info_chungbuk', 6 | 'additional_info_chungnam', 7 | 'additional_info_daegu', 8 | 'additional_info_daejeon', 9 | 'additional_info_gangwon', 10 | 'additional_info_gwangju', 11 | 'additional_info_gyeongbuk', 12 | 'additional_info_gyeonggi', 13 | 'additional_info_gyeongnam', 14 | 'additional_info_incheon', 15 | 'additional_info_jeju', 16 | 'additional_info_jeonbuk', 17 | 'additional_info_jeonnam', 18 | 'additional_info_sejong', 19 | 'additional_info_seoul', 20 | 'additional_info_ulsan' 21 | ] 22 | 23 | # 지번주소 테이블 24 | jibun_address_tables = [ 25 | 'jibun_address_busan', 26 | 'jibun_address_chungbuk', 27 | 'jibun_address_chungnam', 28 | 'jibun_address_daegu', 29 | 'jibun_address_daejeon', 30 | 'jibun_address_gangwon', 31 | 'jibun_address_gwangju', 32 | 'jibun_address_gyeongbuk', 33 | 'jibun_address_gyeonggi', 34 | 'jibun_address_gyeongnam', 35 | 'jibun_address_incheon', 36 | 'jibun_address_jeju', 37 | 'jibun_address_jeonbuk', 38 | 'jibun_address_jeonnam', 39 | 'jibun_address_sejong', 40 | 'jibun_address_seoul', 41 | 'jibun_address_ulsan', 42 | ] 43 | 44 | # 도로명주소 테이블 45 | roadname_tables = [ 46 | 'roadname_address_busan', 47 | 'roadname_address_chungbuk', 48 | 'roadname_address_chungnam', 49 | 'roadname_address_daegu', 50 | 'roadname_address_daejeon', 51 | 'roadname_address_gangwon', 52 | 'roadname_address_gwangju', 53 | 'roadname_address_gyeongbuk', 54 | 'roadname_address_gyeonggi', 55 | 'roadname_address_gyeongnam', 56 | 'roadname_address_incheon', 57 | 'roadname_address_jeju', 58 | 'roadname_address_jeonbuk', 59 | 'roadname_address_jeonnam', 60 | 'roadname_address_sejong', 61 | 'roadname_address_seoul', 62 | 'roadname_address_ulsan', 63 | 'roadname_code' 64 | ] 65 | 66 | # 도로명코드 테이블 67 | roadname_code_table = ['roadname_code'] 68 | 69 | # 통합 테이블 70 | integrated_table = [ 71 | 'integrated_address_busan', 72 | 'integrated_address_chungbuk', 73 | 'integrated_address_chungnam', 74 | 'integrated_address_daegu', 75 | 'integrated_address_daejeon', 76 | 'integrated_address_gangwon', 77 | 'integrated_address_gwangju', 
78 | 'integrated_address_gyeongbuk', 79 | 'integrated_address_gyeonggi', 80 | 'integrated_address_gyeongnam', 81 | 'integrated_address_incheon', 82 | 'integrated_address_jeju', 83 | 'integrated_address_jeonbuk', 84 | 'integrated_address_jeonnam', 85 | 'integrated_address_sejong', 86 | 'integrated_address_seoul', 87 | 'integrated_address_ulsan' 88 | ] 89 | 90 | """ 91 | 92 | 93 | def load_tables(spark, url, user, password, opt, driver="com.mysql.cj.jdbc.Driver"): 94 | 95 | table = "integrated_address_" + opt 96 | result = ( 97 | spark.read.format("jdbc") 98 | .option("driver", driver) 99 | .option("url", url) 100 | .option("dbtable", table) 101 | .option("user", user) 102 | .option("password", password) 103 | .load() 104 | ) 105 | 106 | return result 107 | -------------------------------------------------------------------------------- /sparkplus/jobs/table_to_df.py: -------------------------------------------------------------------------------- 1 | def create_df(spark, table): 2 | 3 | sdf = ( 4 | spark.read.format("jdbc") 5 | .option("url", "jdbc:mysql://localhost:3306/sparkplus") 6 | .option("driver", "com.mysql.cj.jdbc.Driver") 7 | .option("dbtable", table) 8 | .option("user", "root") 9 | .option("password", "9315") 10 | .load() 11 | ) 12 | 13 | return sdf 14 | -------------------------------------------------------------------------------- /sparkplus/jobs/with_geopandas.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.functions import * 2 | from pyspark.sql.types import ( 3 | StringType, 4 | IntegerType, 5 | FloatType, 6 | DoubleType, 7 | DecimalType, 8 | ) 9 | from pyspark.sql.functions import lit, pandas_udf, PandasUDFType 10 | 11 | import pandas as pd 12 | import geopandas as gpd 13 | 14 | import sys 15 | import os 16 | 17 | sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__)))) 18 | 19 | from dependencies.spark import start_spark 20 | 21 | 22 | def geopandas_df_to_spark_for_points(spark, gdf): 23 | gdf["lon"] = gdf["geometry"].x 24 | gdf["lat"] = gdf["geometry"].y 25 | sdf = spark.createDataFrame(pd.DataFrame(gdf), axis=1) 26 | return sdf 27 | 28 | 29 | korea_shp_file = "shp/TL_SCCO_LI.shp" 30 | 31 | gdf = gpd.read_file(korea_shp_file, encoding="euc-kr") 32 | 33 | 34 | gdf = gdf.to_crs(4326) 35 | -------------------------------------------------------------------------------- /sparkplus/package/__init__.py: -------------------------------------------------------------------------------- 1 | from .gis import * 2 | 3 | __all__ = ["gdf_to_spark_wkt"] 4 | -------------------------------------------------------------------------------- /sparkplus/package/gis.py: -------------------------------------------------------------------------------- 1 | from shapely.geometry import Point, Polygon, LineString 2 | from pyspark.sql import SparkSession 3 | import geopandas as gpd 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import pyspark 8 | from pyspark.sql.functions import * 9 | from pyspark.sql.types import ( 10 | IntegerType, 11 | StringType, 12 | FloatType, 13 | DecimalType, 14 | DoubleType, 15 | ) 16 | import os 17 | from pyspark.sql.functions import lit, pandas_udf, PandasUDFType 18 | from dotenv import load_dotenv 19 | 20 | load_dotenv() 21 | 22 | 23 | def load_shp(spark, file_location): 24 | korea = gpd.read_file(file_location, encoding="euc-kr") 25 | gdf = korea.to_crs(4326) 26 | return gdf 27 | 28 | 29 | # def coord_to_dong(spark, gdf, lng, lat): 30 | # addr = gdf[gdf.geometry.contains(Point(lng, 
lat)) == True] 31 | # addr_drop_geom = addr.drop(columns="geometry") 32 | # df = spark.createDataFrame(addr_drop_geom) 33 | # df = df.select( 34 | # concat(df.EMD_CD, lit("00")).alias("EMD_CD"), "EMD_ENG_NM", "EMD_KOR_NM" 35 | # ) 36 | # return df 37 | 38 | 39 | def coord_to_dong(spark, gdf, spark_df, lng_colname, lat_colname): 40 | 41 | p_df = spark_to_pandas(spark_df) 42 | # geometry = gpd.points_from_xy(p_df['longitude'], p_df['latitude']) 43 | print("p_df: ", p_df) 44 | g_df = gpd.GeoDataFrame( 45 | p_df, geometry=gpd.points_from_xy(p_df[lng_colname], p_df[lat_colname]) 46 | ) 47 | # g_df = gpd.GeoDataFrame(p_df, geometry=geometry) 48 | print("g_df: ", g_df) 49 | li = list() 50 | for i in g_df.index: 51 | for j in gdf.index: 52 | if gdf.geometry[j].contains(g_df.geometry[i]): 53 | li.append(gdf.EMD_CD[j]) 54 | # if j == 1: print(gdf.geometry[j], p_df.geometry[i]) 55 | 56 | g_df.insert(len(g_df.columns), "EMD_CD", li) 57 | # g_df = g_df.drop(columns="geometry") 58 | g_df = spark.createDataFrame(g_df) 59 | 60 | return g_df 61 | 62 | 63 | def spark_to_pandas(spark_df): 64 | return spark_df.select("*").toPandas() 65 | 66 | 67 | def pandas_to_geopandas(pandas_df): 68 | return gpd.GeoDataFrame(pandas_df) 69 | 70 | 71 | def db_table_to_df(spark, table): 72 | df = ( 73 | spark.read.format("jdbc") 74 | .option("driver", os.getenv("DB_DRIVER")) 75 | .option("url", os.getenv("DB_URL")) 76 | .option("dbtable", table) 77 | .option("user", os.getenv("DB_USER")) 78 | .option("password", os.getenv("DB_PASSWORD")) 79 | .load() 80 | ) 81 | return df 82 | 83 | 84 | def gdf_to_spark_wkt(spark, gdf): 85 | gdf["wkt"] = pd.Series( 86 | map(lambda geom: str(geom.to_wkt()), gdf["geometry"]), 87 | index=gdf.index, 88 | dtype="str", 89 | ) 90 | tmp = gdf.drop("geometry", axis=1) 91 | df = pd.DataFrame(tmp) 92 | sdf = spark.createDataFrame(tmp).cache() 93 | del tmp 94 | 95 | return sdf, df 96 | 97 | 98 | def spark_to_gdf_wkt(spark, gdf, col_name): 99 | gdf["wkt_to_geom"] = gpd.GeoSeries.from_wkt(gdf[col_name]) 100 | return gdf 101 | 102 | 103 | def load_table(spark): 104 | table_list = [ 105 | "additional_info_busan", 106 | "additional_info_chungbuk", 107 | "additional_info_chungnam", 108 | "additional_info_daegu", 109 | "additional_info_daejeon", 110 | "additional_info_gangwon", 111 | "additional_info_gwangju", 112 | "additional_info_gyeongbuk", 113 | "additional_info_gyeonggi", 114 | "additional_info_gyeongnam", 115 | "additional_info_incheon", 116 | "additional_info_jeju", 117 | "additional_info_jeonbuk", 118 | "additional_info_jeonnam", 119 | "additional_info_sejong", 120 | "additional_info_seoul", 121 | "additional_info_ulsan", 122 | "jibun_address_busan", 123 | "jibun_address_chungbuk", 124 | "jibun_address_chungnam", 125 | "jibun_address_daegu", 126 | "jibun_address_daejeon", 127 | "jibun_address_gangwon", 128 | "jibun_address_gwangju", 129 | "jibun_address_gyeongbuk", 130 | "jibun_address_gyeonggi", 131 | "jibun_address_gyeongnam", 132 | "jibun_address_incheon", 133 | "jibun_address_jeju", 134 | "jibun_address_jeonbuk", 135 | "jibun_address_jeonnam", 136 | "jibun_address_sejong", 137 | "jibun_address_seoul", 138 | "jibun_address_ulsan", 139 | "roadname_address_busan", 140 | "roadname_address_chungbuk", 141 | "roadname_address_chungnam", 142 | "roadname_address_daegu", 143 | "roadname_address_daejeon", 144 | "roadname_address_gangwon", 145 | "roadname_address_gwangju", 146 | "roadname_address_gyeongbuk", 147 | "roadname_address_gyeonggi", 148 | "roadname_address_gyeongnam", 149 | 
"roadname_address_incheon", 150 | "roadname_address_jeju", 151 | "roadname_address_jeonbuk", 152 | "roadname_address_jeonnam", 153 | "roadname_address_sejong", 154 | "roadname_address_seoul", 155 | "roadname_address_ulsan", 156 | "roadname_code", 157 | ] 158 | 159 | for table in table_list: 160 | name = table + "_df" 161 | globals()[name] = db_table_to_df(spark, table) 162 | return globals() 163 | -------------------------------------------------------------------------------- /sparkplus/package/pipeline.py: -------------------------------------------------------------------------------- 1 | from shapely.geometry import Point, Polygon 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql import Row 4 | from pyspark.sql import * 5 | from pyspark.sql.types import StructField, StructType, StringType, LongType, DoubleType 6 | import geopandas as gpd 7 | import pandas as pd 8 | import mysql.connector 9 | import sys 10 | from . import gis 11 | import pyspark 12 | from dotenv import load_dotenv 13 | 14 | sys.stdout = open(sys.stdout.fileno(), mode="w", encoding="utf8", buffering=1) 15 | 16 | spark = SparkSession.builder.appName("Spark App").getOrCreate() 17 | dict = gis.load_table(spark) # table dictionary 불러오기 18 | jibun_dict = {} 19 | for key, val in list(dict.items()): 20 | if "jibun_address" in key: 21 | result = ( 22 | dict[key] 23 | .select(["bupjungdong_code", "sido", "sigungu", "bupjungeupmyeondong"]) 24 | .dropDuplicates(["bupjungdong_code"]) 25 | .orderBy("bupjungdong_code") 26 | ) 27 | jibun_dict[key] = result 28 | 29 | """ shp to polyfill 30 | gdf = gis.load_shp(spark, "../resource/EMD_202101/TL_SCCO_EMD.shp") #법정동 shp 파일 불러오기 31 | gdf = gdf.h3.polyfill(10) 32 | pd_h3 = pd.DataFrame(gdf) 33 | del gdf 34 | pd_h3 = pd_h3.drop('geometry', axis=1) 35 | sdf = spark.createDataFrame(pd_h3) 36 | """ 37 | 38 | """ sdf to json 39 | sdf.coalesce(1).write.json('v1') #v1이라는 폴더가 생성됨 40 | sdf.write.json('v2') 41 | """ 42 | 43 | """ 44 | sdf_df = gis.gdf_to_spark_wkt(spark, gdf) #spark에서 읽을 수 있도록 wkt로 변환 45 | result_df = gis.gdf_to_spark_wkt(spark, gdf_h3) 46 | """ 47 | 48 | """ read parquet 49 | df = spark.read.option("mergeSchema", "true").parquet("../resource/h3/part-00000-3c1357f3-ca16-420a-8b7f-7e532d32c650-c000.snappy.parquet") 50 | df.printSchema() 51 | df.show() 52 | """ 53 | -------------------------------------------------------------------------------- /sparkplus/testjob/demo_app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from shapely.geometry import Polygon 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.functions import encode 7 | import pandas as pd 8 | import geopandas as gpd 9 | import h3 10 | 11 | sys.path.append(os.path.dirname(os.path.abspath(os.path.dirname(__file__)))) 12 | 13 | from jobs.conversion import ( 14 | coord_to_emd, 15 | join_with_h3, 16 | join_with_emd, 17 | join_with_table, 18 | shp_init, 19 | ) 20 | from jobs.load_database import load_tables 21 | from package import gis 22 | 23 | driver = "com.mysql.cj.jdbc.Driver" 24 | url = "jdbc:mysql://localhost:3306/sparkplus" 25 | user = "sparkplus" 26 | password = "sparkplus" 27 | 28 | filepath = "/home/hadoop/spark-plugin/resource/data/daegu_streetlight.csv" 29 | localfilepath = "../resource/data/daegu_streetlight.csv" 30 | shp = "/home/hadoop/spark-plugin/resource/EMD_202101/TL_SCCO_EMD.shp" 31 | localshp = "../resource/EMD_202101/TL_SCCO_EMD.shp" 32 | 33 | if __name__ == "__main__": 34 | 35 | session = ( 36 | 
SparkSession.builder.appName("demo_app") 37 | .config( 38 | "spark.driver.extraClassPath", 39 | "/usr/lib/spark/jars/mysql-connector-java-8.0.26.jar", 40 | ) 41 | .getOrCreate() 42 | ) 43 | # session.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true") 44 | # session.conf.set("spark.sql.execution.arrow.maxRecordsPerBatch", 20000) 45 | 46 | sc = session.sparkContext 47 | sc.setLogLevel("ERROR") 48 | 49 | gdf = gis.load_shp(session, localshp) 50 | gdf = gdf.to_crs(4326) 51 | 52 | # gdf = shp_init() 53 | 54 | dataFrameReader = session.read 55 | 56 | my_sdf = ( 57 | dataFrameReader.option("header", True) 58 | .format("csv") 59 | .load(localfilepath, encoding="euc-kr") 60 | ) 61 | 62 | emd_df = join_with_emd(gdf, my_sdf, "경도", "위도") 63 | print("emd_df ------------------------") 64 | emd_df.show() 65 | 66 | """ 67 | tdf = pd.read_csv(localfilepath, encoding='euc-kr') 68 | 69 | tdf2 = tdf.iloc[:][10054:10059] 70 | tdf2 = session.createDataFrame(tdf2) 71 | print("tdf2") 72 | tdf2.show() 73 | tdf = tdf.iloc[:][10054:10059] 74 | tdf = session.createDataFrame(tdf) 75 | """ 76 | """ 77 | a = coord_to_emd(session, gdf, tdf, "경도", "위도") 78 | print("a") 79 | a.show() 80 | """ 81 | """ 82 | print("tdf") 83 | tdf.show() 84 | tdf = join_with_emd(gdf, tdf, '경도', '위도') 85 | tdf.show() 86 | 87 | 88 | tdf2 = join_with_emd(gdf, tdf2, '경도', '위도') 89 | """ 90 | """ 91 | h3_df = join_with_h3(my_sdf, "경도", "위도", 10) 92 | h3_df.show() 93 | """ 94 | table_df = load_tables(session, url, user, password, "daegu") 95 | print("table_df ------------------------") 96 | table_df.show() 97 | 98 | res_df = join_with_table(gdf, emd_df, table_df, "경도", "위도") 99 | # res_df.show() 100 | print("res_df ------------------------") 101 | res_df.show() 102 | print(res_df.count()) 103 | """ 104 | res2_df = join_with_table(gdf, tdf2, table_df, '경도', '위도') 105 | res2_df.show() 106 | """ 107 | """ 108 | Result vector from pandas_udf was not the required lengt 109 | def to_polygon(l): 110 | return Polygon(h3.h3_to_geo_boundary(l, geo_json=True)) 111 | 112 | temp = [35.8734, 128.6103] 113 | 114 | gdf_h3 = h3_df.toPandas() 115 | gdf_h3 = gpd.GeoDataFrame(gdf_h3) 116 | gdf_h3['geometry'] = gdf_h3['h3'].apply(to_polygon) 117 | gdf_h3.crs = {'init': 'epsg:4326'} 118 | 119 | m =folium.Map(temp, zoom_start=14) 120 | folium.GeoJson(gdf_h3).add_to(m) 121 | 122 | m.save('daegu1.html') 123 | """ 124 | -------------------------------------------------------------------------------- /sparkplus/testjob/test_df.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | import geopandas as gpd 5 | from dotenv import load_dotenv 6 | 7 | sys.path.append( 8 | os.path.dirname(os.path.dirname(os.path.abspath(os.path.dirname(__file__)))) 9 | ) 10 | 11 | from sparkplus.core import CoordDataframe 12 | from sparkplus.core import RoadnameDataframe 13 | from sparkplus.jobs.load_database import load_tables 14 | from pyspark.sql import SparkSession 15 | from sparkplus.dependencies.spark import start_spark 16 | from sparkplus.core.py_log import logger 17 | 18 | load_dotenv() 19 | 20 | driver = "com.mysql.cj.jdbc.Driver" 21 | url = ( 22 | "jdbc:mysql://ec2-3-35-104-222.ap-northeast-2.compute.amazonaws.com:3306/sparkplus" 23 | ) 24 | user = "sparkplus" 25 | password = "sparkplus" 26 | 27 | shp_path = "../resource/shp/LSMD_CONT_LDREG_27_202109.shp" 28 | data_path = "../resource/data/daegu_streetlight.csv" 29 | 30 | """ 31 | session = ( 32 | SparkSession.builder.appName("demo_app") 33 | .config( 34 | 
"spark.driver.extraClassPath", 35 | "/usr/lib/spark/jars/mysql-connector-java-8.0.26.jar", 36 | ) 37 | .getOrCreate() 38 | ) 39 | """ 40 | 41 | # Spark Session을 연다 42 | session, _ = start_spark() 43 | dataFrameReader = session.read 44 | 45 | logger.debug("read_shp") 46 | # shp파일을 GDF로 불러오고 crs를 세팅한다. 47 | gdf = gpd.read_file(shp_path, encoding="euc-kr") 48 | gdf.crs = "epsg:5174" 49 | gdf = gdf.to_crs(epsg=4326) 50 | logger.debug("complete read shp") 51 | 52 | # 데이터 df를 불러온다. 53 | logger.debug("read dataframe") 54 | my_sdf = ( 55 | dataFrameReader.option("header", True) 56 | .format("csv") 57 | .load(data_path, encoding="euc-kr") 58 | ) 59 | my_sdf.show() 60 | print("my_sdf: ", my_sdf.count()) 61 | logger.debug("complete dataframe") 62 | 63 | # 데이터베이스에서 테이블을 불러온다. 64 | logger.debug("load_tables") 65 | table_df = load_tables(session, url, user, password, "daegu") 66 | table_df.show() 67 | logger.debug("complete load_tables") 68 | # 커스텀데이터프레임을 만든다. 69 | logger.debug("create custom df") 70 | df = CoordDataFrame(my_sdf, gdf, table_df, "경도", "위도") 71 | logger.debug("complete custom df") 72 | # 기존 데이터 df와 PNU 매칭한다. 73 | logger.debug("coord_to_pnu") 74 | pnu_df = df.coord_to_pnu() 75 | 76 | print("pnu_df: ", pnu_df.count()) 77 | pnu_df.show() 78 | 79 | logger.debug("complete coord_to_pnu") 80 | 81 | """ 82 | logger.debug('join with pnu') 83 | res_df = df.coord_to_pnu(gdf, '경도', '위도') 84 | res_df.show() 85 | logger.debug('complete join with pnu') 86 | """ 87 | 88 | 89 | # 기존 데이터 df와 테이블을 조인한다. (PNU => bupjungdong 매칭) 90 | logger.debug("join_with_table") 91 | res_df = df.join_with_table() 92 | print("joined_df: ", res_df.count()) 93 | res_df.show() 94 | logger.debug("complete join_with_tables") 95 | 96 | logger.debug("h3_df") 97 | h3_df = df.coord_to_h3(10) 98 | print("h3_df: ", h3_df.count()) 99 | h3_df.show() 100 | logger.debug("complete h3_df") 101 | 102 | logger.debug("select zipcode columns") 103 | zipcode_df = df.coord_to_zipcode() 104 | print("zipcode_df: ", zipcode_df.count()) 105 | 106 | zipcode_df.show() 107 | logger.debug("complete select zip columns") 108 | 109 | 110 | logger.debug("select emd columns") 111 | emd_df = df.coord_to_emd() 112 | print("emd_df: ", emd_df.count()) 113 | 114 | emd_df.show() 115 | logger.debug("complete select emd columns") 116 | 117 | 118 | logger.debug("select doromyoung columns") 119 | doro_df = df.coord_to_roadname() 120 | print("doro_df: ", doro_df.count()) 121 | 122 | doro_df.show() 123 | logger.debug("complete select doromyoung columns") 124 | 125 | 126 | logger.debug("coord_to_roadname") 127 | full_doro_df = df.coord_to_roadname_addr() 128 | 129 | full_doro_df = RoadnameDataframe(full_doro_df) 130 | doro_to_roadname_df = full_doro_df.add_split("roadname_address") 131 | 132 | 133 | print("doro_to_roadname", doro_to_roadname_df._df.count()) 134 | doro_to_roadname_df._df.show() 135 | logger.debug("complete coord_to_roadname") 136 | 137 | 138 | logger.debug("select jibun columns") 139 | jibun_df = df.coord_to_jibun() 140 | print("jibun_df: ", jibun_df.count()) 141 | 142 | jibun_df.show() 143 | logger.debug("complete select jibun columns") 144 | -------------------------------------------------------------------------------- /static/sparkplus_arch_finale.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SWM-SparkPlus/sparkplus/883d16498b25488cc424908700a8389837e83c47/static/sparkplus_arch_finale.png --------------------------------------------------------------------------------