├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── build.yml │ ├── deploy-docs.yml │ └── upload-to-pypi.yml ├── .gitignore ├── .project ├── .pydevproject ├── .readthedocs.yml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── docs └── source │ ├── Makefile │ ├── conf.py │ ├── geograpy.rst │ ├── index.rst │ ├── make.bat │ ├── setup.rst │ └── tests.rst ├── examples └── example1.py ├── geograpy ├── __init__.py ├── data │ ├── ISO3166ErrorDictionary.csv │ ├── aliases.csv │ └── queries.yaml ├── extraction.py ├── geograpy_nltk.py ├── labels.py ├── locator.py ├── nominatim.py ├── places.py ├── utils.py ├── version.py └── wikidata.py ├── pyproject.toml ├── scripts ├── blackisort ├── doc ├── download ├── install ├── release └── test └── tests ├── __init__.py ├── basetest.py ├── testCachingCitiesByRegion.py ├── testCachingLocationLabels.py ├── testLocatorDatabase.py ├── testQueries.py ├── test_LocationContext.py ├── test_extractor.py ├── test_location.py ├── test_locator.py ├── test_nominatim.py ├── test_places.py └── test_wikidata.py /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Use function. 16 | 2. ... 17 | 18 | **Expected behavior** 19 | A clear and concise description of what you expected to happen. 20 | 21 | **Screenshots** 22 | If applicable, add screenshots to help explain your problem. 23 | 24 | **Environment (please complete the following information):** 25 | - OS: [e.g. Ubuntu 20.04] 26 | - Python Version [e.g. 3.6] 27 | 28 | **Additional context** 29 | Add any other context about the problem here. 30 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Build 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | os: [ubuntu-latest, macos-latest, windows-latest] 19 | python-version: [3.9, "3.10", "3.11", "3.12"] 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install 28 | run: | 29 | scripts/install 30 | - name: Run tests 31 | run: | 32 | scripts/test 33 | -------------------------------------------------------------------------------- /.github/workflows/deploy-docs.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Deploy docs to Netlify 5 | on: 6 | push: 7 | branches: [ master ] 8 | jobs: 9 | docs-deploy: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | python-version: [3.9] 14 | steps: 15 | - uses: actions/checkout@v4 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install dependencies 21 | run: | 22 | pip install . 23 | - name: Install sphinx 24 | run: | 25 | pip install sphinx sphinx-rtd-theme 26 | - name: Build docs 27 | run: | 28 | scripts/doc 29 | - name: Deploy docs to Netlify 30 | uses: netlify/actions/cli@master 31 | env: 32 | NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} 33 | NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }} 34 | with: 35 | args: deploy --dir=docs/source/_build/html --prod 36 | -------------------------------------------------------------------------------- /.github/workflows/upload-to-pypi.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Changelogs 2 | CHANGELOG* 3 | 4 | # Release files 5 | token 6 | settings.ini 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/source/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 102 | __pypackages__/ 103 | 104 | # Celery stuff 105 | celerybeat-schedule 106 | celerybeat.pid 107 | 108 | # SageMath parsed files 109 | *.sage.py 110 | 111 | # Environments 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | 138 | # Mac OS 139 | .DS_Store 140 | # docs are autogenerated with sphinx-api docs 141 | # eclipse 142 | .settings 143 | # databases 144 | geograpy/*.db 145 | geograpy/*.db.gz 146 | 147 | CHANGELOG.bak 148 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | geograpy3 4 | 5 | 6 | 7 | 8 | 9 | org.python.pydev.PyDevBuilder 10 | 11 | 12 | 13 | 14 | 15 | org.python.pydev.pythonNature 16 | 17 | 18 | -------------------------------------------------------------------------------- /.pydevproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | /${PROJECT_DIR_NAME} 5 | 6 | python interpreter 7 | Default 8 | 9 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | 9 | # Build documentation in the docs/ directory with Sphinx 10 | sphinx: 11 | configuration: docs/source/conf.py 12 | 13 | # Build documentation with MkDocs 14 | #mkdocs: 15 | # configuration: mkdocs.yml 16 | 17 | # Optionally build your docs in additional formats such as PDF 18 | formats: all 19 | 20 | # Optionally set the version of Python and requirements required to build your docs 21 | python: 22 | version: 3.7 23 | install: 24 | - requirements: requirements.txt -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Release notes 2 | 3 | 4 | 5 | ## 0.2.3 6 | 7 | - Move NLTK download code to Extractor class. 8 | - Upgrade package build method 9 | - Upgrade PyPi distribution 10 | - Fix NLTK and DB download issue (PR #66) 11 | 12 | 13 | ## 0.1.9 14 | 15 | Fix version number 16 | 17 | 18 | ## 0.1.8 19 | 20 | ### New Features 21 | 22 | - Add ISO country code ([#10](https://github.com/somnathrakshit/geograpy3/issues/10)) 23 | - returned country information should include the two [letter ISO 24 | code](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) of the 25 | country 26 | 27 | - if country is given disambiguate country ([#7](https://github.com/somnathrakshit/geograpy3/issues/7)) 28 | - see e.g. https://stackoverflow.com/questions/62152428/extracting- 29 | country-information-from-description-using- 30 | geograpy?noredirect=1#comment112899776_62152428 Zaragoza, Spain 31 | should e.g. only return the country Spain since it's in the 32 | context of Zaragoza 33 | 34 | ### Bugs Squashed 35 | 36 | - [BUG]AttributeError: 'NoneType' object has no attribute 'name' on "Pristina, Kosovo" ([#9](https://github.com/somnathrakshit/geograpy3/issues/9)) 37 | - **Describe the bug** ``` 38 | geograpy.get_geoPlace_context(text="Pristina, Kosovo") ``` leads 39 | to python error. **To Reproduce** Steps to reproduce the 40 | behavior: ```python def testIssue(self): ''' 41 | test Issue ''' locality="Pristina, Kosovo" 42 | gp=geograpy.get_geoPlace_context(text=locality) if 43 | self.debug: print(" %s" % gp.countries) 44 | print(" %s" % gp.regions) print(" %s" % gp.cities) 45 | ``` File 46 | "/Users/wf/Documents/pyworkspace/geograpy3/geograpy/places.py", 47 | line 189, in set_cities country_name = country.name 48 | AttributeError: 'NoneType' object has no attribute 'name' 49 | **Expected behavior** Python should not choke on this although 50 | the political result may be disputed. 51 | 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # geograpy3 2 | [![Join the discussion at https://github.com/somnathrakshit/geograpy3/discussions](https://shields.io/badge/GitHub-%20Discussions-blue?logo=github)](https://github.com/somnathrakshit/geograpy3/discussions) 3 | [![Documentation Status](https://readthedocs.org/projects/geograpy3/badge/?version=latest)](https://geograpy3.readthedocs.io/en/latest/?badge=latest) 4 | [![pypi](https://img.shields.io/pypi/pyversions/geograpy3)](https://pypi.org/project/geograpy3/) 5 | [![Github Actions Build](https://github.com/somnathrakshit/geograpy3/workflows/Build/badge.svg?branch=master)](https://github.com/somnathrakshit/geograpy3/actions?query=workflow%3ABuild+branch%3Amaster) 6 | [![PyPI Status](https://img.shields.io/pypi/v/geograpy3.svg)](https://pypi.python.org/pypi/geograpy3/) 7 | [![Downloads](https://pepy.tech/badge/geograpy3)](https://pepy.tech/project/geograpy3) 8 | [![GitHub issues](https://img.shields.io/github/issues/somnathrakshit/geograpy3.svg)](https://github.com/somnathrakshit/geograpy3/issues) 9 | [![GitHub closed issues](https://img.shields.io/github/issues-closed/somnathrakshit/geograpy3.svg)](https://github.com/somnathrakshit/geograpy3/issues/?q=is%3Aissue+is%3Aclosed) 10 | [![License](https://img.shields.io/github/license/somnathrakshit/geograpy3.svg)](https://www.apache.org/licenses/LICENSE-2.0) 11 | 12 | geograpy3 is a fork of [geograpy2](https://github.com/Corollarium/geograpy2), which is itself a fork of [geograpy](https://github.com/ushahidi/geograpy) and inherits most of it, but solves several problems (such as support for utf8, places names 13 | with multiple words, confusion over homonyms etc). Also, geograpy3 is compatible with Python 3, unlike geograpy2. 14 | 15 | since geograpy3 0.0.2 cities,countries and regions are matched against a database derived from the corresponding wikidata entries 16 | 17 | What it is 18 | ========== 19 | 20 | geograpy extracts place names from a URL or text, and adds context to those names -- for example distinguishing between a country, region or city. 21 | 22 | The extraction is a two step process. The first process is a Natural Language Processing task which analyzes a text for potential mentions of geographic locations. In the next step the words which represent such locations are looked up using the Locator. 23 | 24 | If you already know that your content has geographic information you might want to use the Locator interface directly. 25 | 26 | ## Examples/Tutorial 27 | * [see Examples/Tutorial Wiki](http://wiki.bitplan.com/index.php/Geograpy#Examples) 28 | 29 | ## Install & Setup 30 | 31 | Grab the package using `pip` (this will take a few minutes) 32 | ```bash 33 | pip install geograpy3 34 | ``` 35 | 36 | geograpy3 uses [NLTK](http://www.nltk.org/) for entity recognition, so you'll also need 37 | to download the models we're using. Fortunately there's a command that'll take 38 | care of this for you. 39 | ```bash 40 | geograpy-nltk 41 | ``` 42 | 43 | ## Getting the source code 44 | ```bash 45 | git clone https://github.com/somnathrakshit/geograpy3 46 | cd geograpy3 47 | scripts/install 48 | ``` 49 | 50 | ## Basic Usage 51 | 52 | Import the module, give some text or a URL, and presto. 53 | ```python 54 | import geograpy 55 | url = 'https://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay' 56 | places = geograpy.get_geoPlace_context(url=url) 57 | ``` 58 | 59 | Now you have access to information about all the places mentioned in the linked 60 | article. 61 | 62 | * `places.countries` _contains a list of country names_ 63 | * `places.regions` _contains a list of region names_ 64 | * `places.cities` _contains a list of city names_ 65 | * `places.other` _lists everything that wasn't clearly a country, region or city_ 66 | 67 | Note that the `other` list might be useful for shorter texts, to pull out 68 | information like street names, points of interest, etc, but at the moment is 69 | a bit messy when scanning longer texts that contain possessive forms of proper 70 | nouns (like "Russian" instead of "Russia"). 71 | 72 | ## But Wait, There's More 73 | 74 | In addition to listing the names of discovered places, you'll also get some 75 | information about the relationships between places. 76 | 77 | * `places.country_regions` _regions broken down by country_ 78 | * `places.country_cities` _cities broken down by country_ 79 | * `places.address_strings` _city, region, country strings useful for geocoding_ 80 | 81 | ## Last But Not Least 82 | 83 | While a text might mention many places, it's probably focused on one or two, so 84 | geograpy3 also breaks down countries, regions and cities by number of mentions. 85 | 86 | * `places.country_mentions` 87 | * `places.region_mentions` 88 | * `places.city_mentions` 89 | 90 | Each of these returns a list of tuples. The first item in the tuple is the place 91 | name and the second item is the number of mentions. For example: 92 | 93 | [('Russian Federation', 14), (u'Ukraine', 11), (u'Lithuania', 1)] 94 | 95 | ## If You're Really Serious 96 | 97 | You can of course use each of Geograpy's modules on their own. For example: 98 | ```python 99 | from geograpy import extraction 100 | 101 | e = extraction.Extractor(url='https://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay') 102 | e.find_geoEntities() 103 | 104 | # You can now access all of the places found by the Extractor 105 | print(e.places) 106 | ``` 107 | 108 | Place context is handled in the `places` module. For example: 109 | 110 | ```python 111 | from geograpy import places 112 | 113 | pc = places.PlaceContext(['Cleveland', 'Ohio', 'United States']) 114 | 115 | pc.set_countries() 116 | print pc.countries #['United States'] 117 | 118 | pc.set_regions() 119 | print(pc.regions #['Ohio']) 120 | 121 | pc.set_cities() 122 | print(pc.cities #['Cleveland']) 123 | 124 | print(pc.address_strings #['Cleveland, Ohio, United States']) 125 | ``` 126 | 127 | And of course all of the other information shown above (`country_regions` etc) 128 | is available after the corresponding `set_` method is called. 129 | 130 | ## Stackoverflow 131 | * [Questions tagged with 'geograpy'](https://stackoverflow.com/questions/tagged/geograpy) 132 | 133 | ## Credits 134 | 135 | geograpy3 uses the following excellent libraries: 136 | 137 | * [NLTK](http://www.nltk.org/) for entity recognition 138 | * [newspaper](https://github.com/codelucas/newspaper) for text extraction from HTML 139 | * [jellyfish](https://github.com/sunlightlabs/jellyfish) for fuzzy text match 140 | * [pylodstorage](https://pypi.org/project/pylodstorage/) for storage and retrieval of tabular data from SQL and SPARQL sources 141 | 142 | geograpy3 uses the following data sources: 143 | * [ISO3166ErrorDictionary](https://github.com/bodacea/countryname/blob/master/countryname/databases/ISO3166ErrorDictionary.csv) for common country mispellings _via [Sara-Jayne Terp](https://github.com/bodacea)_ 144 | * [Wikidata](https://www.wikidata.org) for country/region/city information with disambiguation via population 145 | 146 | Hat tip to [Chris Albon](https://github.com/chrisalbon) for the name. 147 | -------------------------------------------------------------------------------- /docs/source/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | #https://stackoverflow.com/a/44980548/1497139 14 | import os 15 | import sys 16 | import sphinx_rtd_theme 17 | basepath=os.path.abspath('../..') 18 | print('adding basepath %s' % (basepath)) 19 | sys.path.insert(0, basepath) 20 | print('sys.path is now: %s' % (sys.path)) 21 | 22 | 23 | # -- Project information ----------------------------------------------------- 24 | 25 | project = 'geograpy3' 26 | copyright = '2018-2020, Somnath Rakshit, Wolfgang Fahl' 27 | author = 'Somnath Rakshit, Wolfgang Fahl' 28 | 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = [ 36 | 'sphinx_rtd_theme', 37 | 'sphinx.ext.napoleon', 38 | 'sphinx.ext.autodoc', 39 | 'sphinx.ext.viewcode', 40 | 'sphinx.ext.todo', 41 | ] 42 | 43 | # Add any paths that contain templates here, relative to this directory. 44 | templates_path = ['_templates'] 45 | 46 | # The language for content autogenerated by Sphinx. Refer to documentation 47 | # for a list of supported languages. 48 | # 49 | # This is also used if you do content translation via gettext catalogs. 50 | # Usually you set "language" from the command line for these cases. 51 | language = 'en' 52 | 53 | # List of patterns, relative to source directory, that match files and 54 | # directories to ignore when looking for source files. 55 | # This pattern also affects html_static_path and html_extra_path. 56 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 57 | 58 | 59 | # -- Options for HTML output ------------------------------------------------- 60 | 61 | # The theme to use for HTML and HTML Help pages. See the documentation for 62 | # a list of builtin themes. 63 | # 64 | html_theme = 'sphinx_rtd_theme' 65 | master_doc = 'index' 66 | 67 | # Add any paths that contain custom static files (such as style sheets) here, 68 | # relative to this directory. They are copied after the builtin static files, 69 | # so a file named "default.css" will overwrite the builtin "default.css". 70 | html_static_path = ['_static'] 71 | 72 | 73 | # -- Extension configuration ------------------------------------------------- 74 | 75 | # -- Options for todo extension ---------------------------------------------- 76 | 77 | # If true, `todo` and `todoList` produce output, else they produce nothing. 78 | todo_include_todos = True 79 | -------------------------------------------------------------------------------- /docs/source/geograpy.rst: -------------------------------------------------------------------------------- 1 | geograpy package 2 | ================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | geograpy.extraction module 8 | -------------------------- 9 | 10 | .. automodule:: geograpy.extraction 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | geograpy.labels module 16 | ---------------------- 17 | 18 | .. automodule:: geograpy.labels 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | geograpy.locator module 24 | ----------------------- 25 | 26 | .. automodule:: geograpy.locator 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | geograpy.places module 32 | ---------------------- 33 | 34 | .. automodule:: geograpy.places 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | geograpy.prefixtree module 40 | -------------------------- 41 | 42 | .. automodule:: geograpy.prefixtree 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | geograpy.utils module 48 | --------------------- 49 | 50 | .. automodule:: geograpy.utils 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | geograpy.wikidata module 56 | ------------------------ 57 | 58 | .. automodule:: geograpy.wikidata 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | Module contents 64 | --------------- 65 | 66 | .. automodule:: geograpy 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. geograpy3 documentation master file, created by 2 | sphinx-quickstart on Wed Sep 23 16:51:23 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to geograpy3's documentation! 7 | ===================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 4 11 | :caption: Contents: 12 | 13 | geograpy 14 | setup 15 | tests 16 | 17 | 18 | Indices and tables 19 | ================== 20 | 21 | * :ref:`genindex` 22 | * :ref:`modindex` 23 | * :ref:`search` 24 | -------------------------------------------------------------------------------- /docs/source/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/setup.rst: -------------------------------------------------------------------------------- 1 | setup module 2 | ============ 3 | 4 | .. automodule:: setup 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/tests.rst: -------------------------------------------------------------------------------- 1 | tests package 2 | ============= 3 | 4 | Submodules 5 | ---------- 6 | 7 | tests.test\_extractor module 8 | ---------------------------- 9 | 10 | .. automodule:: tests.test_extractor 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | tests.test\_locator module 16 | -------------------------- 17 | 18 | .. automodule:: tests.test_locator 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | tests.test\_places module 24 | ------------------------- 25 | 26 | .. automodule:: tests.test_places 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | tests.test\_prefixtree module 32 | ----------------------------- 33 | 34 | .. automodule:: tests.test_prefixtree 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | tests.test\_wikidata module 40 | --------------------------- 41 | 42 | .. automodule:: tests.test_wikidata 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | Module contents 48 | --------------- 49 | 50 | .. automodule:: tests 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | -------------------------------------------------------------------------------- /examples/example1.py: -------------------------------------------------------------------------------- 1 | import geograpy 2 | url='https://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay' 3 | places = geograpy.get_geoPlace_context(url = url) 4 | print(places) 5 | -------------------------------------------------------------------------------- /geograpy/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | main geograpy 3 module 3 | """ 4 | __version__ = "0.3.0" 5 | from geograpy.extraction import Extractor 6 | from geograpy.labels import Labels 7 | from geograpy.locator import Locator 8 | from geograpy.places import PlaceContext 9 | 10 | 11 | def get_geoPlace_context(url=None, text=None, debug=False): 12 | """ 13 | Get a place context for a given text with information 14 | about country, region, city and other 15 | based on NLTK Named Entities having the Geographic(GPE) label. 16 | 17 | Args: 18 | url(String): the url to read text from (if any) 19 | text(String): the text to analyze 20 | debug(boolean): if True show debug information 21 | 22 | Returns: 23 | places: 24 | PlaceContext: the place context 25 | """ 26 | places = get_place_context(url, text, labels=Labels.geo, debug=debug) 27 | return places 28 | 29 | 30 | def get_place_context(url=None, text=None, labels=Labels.default, debug=False): 31 | """ 32 | Get a place context for a given text with information 33 | about country, region, city and other 34 | based on NLTK Named Entities in the label set Geographic(GPE), 35 | Person(PERSON) and Organization(ORGANIZATION). 36 | 37 | Args: 38 | url(String): the url to read text from (if any) 39 | text(String): the text to analyze 40 | debug(boolean): if True show debug information 41 | 42 | Returns: 43 | pc: 44 | PlaceContext: the place context 45 | """ 46 | e = Extractor(url=url, text=text, debug=debug) 47 | e.find_entities(labels=labels) 48 | places = e.places 49 | pc = PlaceContext(places) 50 | pc.setAll() 51 | return pc 52 | 53 | 54 | def locateCity(location, correctMisspelling=False, debug=False): 55 | """ 56 | locate the given location string 57 | Args: 58 | location(string): the description of the location 59 | Returns: 60 | Locator: the location 61 | """ 62 | e = Extractor(text=location, debug=debug) 63 | e.split() 64 | loc = Locator.getInstance(correctMisspelling=correctMisspelling, debug=debug) 65 | city = loc.locateCity(e.places) 66 | return city 67 | -------------------------------------------------------------------------------- /geograpy/data/ISO3166ErrorDictionary.csv: -------------------------------------------------------------------------------- 1 | data.un.org entry,Issue,ISO3166 name or code,,, 2 | Ã…land Islands,spelling,Åland Islands,,, 3 | "Afghanistan, Islamic State of",spelling,Afghanistan,,, 4 | "Bahamas, The",spelling,Bahamas,,, 5 | Bolivia,spelling,"Bolivia, Plurinational State of",,, 6 | Bosnia Herzegovina,spelling,Bosnia and Herzegovina,,, 7 | British Indian Ocean Ter,spelling,British Indian Ocean Territory,,, 8 | Brunei,spelling,Brunei Darussalam,,, 9 | C\xc3\xb4te d\xe2\x80\x99Ivoire,spelling,Côte d'Ivoire,,, 10 | C\xc3\xb4te d'Ivoire,spelling,Côte d'Ivoire,,, 11 | C\xf4te d'Ivoire,spelling,Côte d'Ivoire,,, 12 | Cote d'Ivoire,spelling,Côte d'Ivoire,,, 13 | Ivory Coast,spelling,Côte d'Ivoire,,, 14 | Central African Republic,spelling,Central African Republic,,, 15 | "China, People's Republic of",spelling,China,,, 16 | "Christmas Island, Aust",spelling,Christmas Island,#true?,, 17 | "Cocos, Keeling Islands",spelling,Cocos (Keeling) Islands,,, 18 | "Congo, Republic",spelling,Congo,,, 19 | Congo Democratic Republic,spelling,"Congo, The Democratic Republic of the",,, 20 | "Congo, Democratic Republic",spelling,"Congo, The Democratic Republic of the",,, 21 | "Congo, Democratic Republic of the",spelling,"Congo, The Democratic Republic of the",,, 22 | "Congo, Democratic Republic of",spelling,"Congo, The Democratic Republic of the",,, 23 | Democratic Republic of Congo,spelling,"Congo, The Democratic Republic of the",,, 24 | Democratic Republic of the Congo,spelling,"Congo, The Democratic Republic of the",,, 25 | Cook Islands,spelling,Cook Islands,,, 26 | Czech Republic,spelling,Czech Republic,,, 27 | Czechoslovakia (former),spelling,"Czechoslovakia, Czechoslovak Socialist Republic",,, 28 | Czechoslovakia,spelling,"Czechoslovakia, Czechoslovak Socialist Republic",,, 29 | "Czechoslovakia, former",spelling,"Czechoslovakia, Czechoslovak Socialist Republic",,, 30 | Former Czechoslovakia,spelling,"Czechoslovakia, Czechoslovak Socialist Republic",,, 31 | Dominican Republic,spelling,Dominican Republic,,, 32 | "Egypt, Arab Republic",spelling,Egypt,,, 33 | Ethiopia PDR,spelling,Ethiopia,,, 34 | "Ethiopia, from 1993",spelling,Ethiopia,,, 35 | "Ethiopia, up to 1993",spelling,Ethiopia,true?,, 36 | Former Ethiopia,spelling,Ethiopia,true?,, 37 | "Falkland Island, Malvinas",spelling,Falkland Islands (Malvinas),,, 38 | Falkland Islands,spelling,Falkland Islands (Malvinas),,, 39 | "Falkland Islands, Malvinas",spelling,Falkland Islands (Malvinas),,, 40 | Faeroe Islands,spelling,Faroe Islands,,, 41 | French Southern Terr,spelling,French Southern Territories,,, 42 | Gambia The,spelling,Gambia,,, 43 | "Gambia, The",spelling,Gambia,,, 44 | German Democratic Republic (former),spelling,German Democratic Republic,,, 45 | "German Democratic Republic, former",spelling,German Democratic Republic,,, 46 | "Germany, The former German Democratic Republic",spelling,German Democratic Republic,,, 47 | Fmr Federated Republic of Germany,spelling,"Germany, Federal Republic of",,, 48 | "Germany, Federated Republic of before 3.10.1990",spelling,"Germany, Federal Republic of",,, 49 | Holy See,spelling,Holy See (Vatican City State),,, 50 | "Holy See, Vatican",spelling,Holy See (Vatican City State),,, 51 | "China, Hong Kong SAR",spelling,Hong Kong,,, 52 | "China, Hong Kong Special Administrative Region",spelling,Hong Kong,,, 53 | China: Hong Kong SAR,spelling,Hong Kong,,, 54 | Hong Kong SAR,spelling,Hong Kong,,, 55 | "Hong Kong SAR, China",spelling,Hong Kong,,, 56 | "Hong Kong, China",spelling,Hong Kong,,, 57 | Iran,spelling,"Iran, Islamic Republic of",,, 58 | Iran(Islamic Republic of),spelling,"Iran, Islamic Republic of",,, 59 | "Iran, Islamic Republic of",spelling,"Iran, Islamic Republic of",,, 60 | "Iran, Islamic Republic",spelling,"Iran, Islamic Republic of",,, 61 | Islamic Republic of Iran,spelling,"Iran, Islamic Republic of",,, 62 | Democratic People's Republic of Korea,spelling,"Korea, Democratic People's Republic of",,, 63 | Korea DPR,spelling,"Korea, Democratic People's Republic of",,, 64 | "Korea, Democratic Republic",spelling,"Korea, Democratic People's Republic of",,, 65 | "Korea, DemocraticPpl's.Republic",spelling,"Korea, Democratic People's Republic of",,, 66 | "Korea,DemocraticPpl's.Republic",spelling,"Korea, Democratic People's Republic of",,, 67 | Korea Rep,spelling,"Korea, Republic of",,, 68 | "Korea, Republic of",spelling,"Korea, Republic of",,, 69 | "Korea, Republic",spelling,"Korea, Republic of",,, 70 | Republic of Korea,spelling,"Korea, Republic of",,, 71 | Lao P.D.R.,spelling,Lao People's Democratic Republic,,, 72 | Lao PDR,spelling,Lao People's Democratic Republic,,, 73 | Lao People's Democratic Republic,spelling,Lao People's Democratic Republic,,, 74 | Libya,spelling,Libyan Arab Jamahiriya,,, 75 | Libyan Arab Jamah.,spelling,Libyan Arab Jamahiriya,,, 76 | "China, Macao SAR",spelling,Macao,,, 77 | "China, Macao Special Administrative Region",spelling,Macao,,, 78 | China: Macao SAR,spelling,Macao,,, 79 | "Macao SAR, China",spelling,Macao,,, 80 | "Macao, China",spelling,Macao,,, 81 | Macau SAR,spelling,Macao,,, 82 | "Macau, China",spelling,Macao,,, 83 | "Macau, SAR",spelling,Macao,,, 84 | Macedonia,spelling,"Macedonia, Republic of",,, 85 | "Macedonia, FYR",spelling,"Macedonia, Republic of",,, 86 | "Macedonia, The former Yugoslav Republic of",spelling,"Macedonia, Republic of",,, 87 | T.F.Y.R. Macedonia,spelling,"Macedonia, Republic of",,, 88 | T.F.Yug.Republic Macedonia,spelling,"Macedonia, Republic of",,, 89 | TFYR Macedonia,spelling,"Macedonia, Republic of",,, 90 | TFYR of Macedonia,spelling,"Macedonia, Republic of",,, 91 | The f. Yugosl. Rep of Macedonia,spelling,"Macedonia, Republic of",,, 92 | The Former Yugoslav Republic of Macedonia,spelling,"Macedonia, Republic of",,, 93 | "Micronesia, Federated States of",spelling,"Micronesia, Federated States of",,, 94 | "Micronesia, Federated Sts.",spelling,"Micronesia, Federated States of",,, 95 | "Micronesia, FederatedStates of",spelling,"Micronesia, Federated States of",,, 96 | Moldova,spelling,"Moldova, Republic of",,, 97 | Republic of Moldova,spelling,"Moldova, Republic of",,, 98 | Neth. Antilles,spelling,Netherlands Antilles,,, 99 | Netherlands Antilles and Aruba,spelling,Netherlands Antilles,,,True? 100 | Pacific Island,spelling,Pacific Islands (trust territory),,, 101 | "Pacific Islands, Trust Territory",spelling,Pacific Islands (trust territory),,, 102 | Occ. Palestinian Terr.,spelling,"Palestinian Territory, Occupied",,, 103 | Palestinian Territories,spelling,"Palestinian Territory, Occupied",,, 104 | Palestine,spelling,"Palestinian Territory, Occupied",#true?,, 105 | Palestinian Authority,spelling,"Palestinian Territory, Occupied",#true?,, 106 | Pitcairn Islands,spelling,Pitcairn,,, 107 | Réunion,spelling,Reunion,fix,, 108 | Russia,spelling,Russian Federation,true?,, 109 | Saint Helena and Depend.,spelling,"Saint Helena, Ascension and Tristan da Cunha",,, 110 | Saint Helena,spelling,"Saint Helena, Ascension and Tristan da Cunha",#true?,, 111 | Saint Kitts-Nevis,spelling,Saint Kitts and Nevis,,, 112 | Saint Lucia ),spelling,Saint Lucia,,, 113 | "Saint Martin, French part",spelling,Saint Martin (French part),,, 114 | Saint Pierre-Miquelon,spelling,Saint Pierre and Miquelon,,, 115 | Saint Vincent and Grenadines,spelling,Saint Vincent and the Grenadines,,, 116 | Saint Vincent-Grenadines,spelling,Saint Vincent and the Grenadines,,, 117 | Solomon Islands,spelling,Solomon Islands,,, 118 | Svalbard and Jan Mayen Islands,spelling,Svalbard and Jan Mayen,,, 119 | Syria,spelling,Syrian Arab Republic,,, 120 | Tanzania,spelling,"Tanzania, United Republic of",,, 121 | United Republic of Tanzania,spelling,"Tanzania, United Republic of",,, 122 | United RepublicTanzania,spelling,"Tanzania, United Republic of",,, 123 | Timor Leste,spelling,Timor-Leste,,, 124 | Turks and Caicos Islands,spelling,Turks and Caicos Islands,,, 125 | United Kingdom of Great Britain & Northern Ireland,spelling,United Kingdom,,, 126 | Usa,spelling,United States,,, 127 | "United States of America, pacific Islands",spelling,US Miscellaneous Pacific Islands,,, 128 | US Miscellaneous Pacific Islands,spelling,US Miscellaneous Pacific Islands,,, 129 | Former USSR,spelling,"USSR, Union of Soviet Socialist Republics",,, 130 | Union of Soviet Socialist Republics (former),spelling,"USSR, Union of Soviet Socialist Republics",,, 131 | "Union of Soviet Socialist Republics, former",spelling,"USSR, Union of Soviet Socialist Republics",,, 132 | Ussr,spelling,"USSR, Union of Soviet Socialist Republics",,, 133 | Venezuela,spelling,"Venezuela, Bolivarian republic of",,, 134 | "Venezuela, Bolivarian Republic of",spelling,"Venezuela, Bolivarian republic of",,, 135 | "Venezuela, RB",spelling,"Venezuela, Bolivarian republic of",,, 136 | "Viet Nam, Democratic Republic of",spelling,Viet Nam,,, 137 | Vietnam,spelling,Viet Nam,,, 138 | United States Virgin Island,spelling,"Virgin Islands, U.S.",,, 139 | United States Virgin Islands,spelling,"Virgin Islands, U.S.",,, 140 | US Virgin Islands,spelling,"Virgin Islands, U.S.",,, 141 | "Virgin Islands, US",spelling,"Virgin Islands, U.S.",,, 142 | Wake Is,spelling,Wake Island,,, 143 | Wallis & Futuna Islands,spelling,Wallis and Futuna,,, 144 | Wallis and Futuna Islands,spelling,Wallis and Futuna,,, 145 | "Yemen, Republic",spelling,Yemen,,, 146 | "Yemen, Republic of",spelling,Yemen,,, 147 | Democratic Yemen (former),spelling,"Yemen, Democratic, People's Democratic Republic of",,, 148 | "Democratic Yemen, former",spelling,"Yemen, Democratic, People's Democratic Republic of",,, 149 | "Yemen, The former Democratic",spelling,"Yemen, Democratic, People's Democratic Republic of",,, 150 | Yemen: Former Democratic Yemen,spelling,"Yemen, Democratic, People's Democratic Republic of",,, 151 | Yemen Arab Republic (former),spelling,"Yemen, Yemen Arab Republic",,, 152 | "Yemen Arab Republic, former",spelling,"Yemen, Yemen Arab Republic",,, 153 | Yemen: Former Yemen Arab Republic,spelling,"Yemen, Yemen Arab Republic",,, 154 | Former Yugoslavia,spelling,"Yugoslavia, Socialist Federal Republic of",,, 155 | Yugoslav SFR,spelling,"Yugoslavia, Socialist Federal Republic of",,, 156 | Yugoslavia (former Socialist Federal Republic),spelling,"Yugoslavia, Socialist Federal Republic of",,, 157 | Yugoslavia,spelling,"Yugoslavia, Socialist Federal Republic of",,, 158 | "Yugoslavia, former Socialist Federal Republic",spelling,"Yugoslavia, Socialist Federal Republic of",,, 159 | "Yugoslavia, The former Socialist Federated Republic of",spelling,"Yugoslavia, Socialist Federal Republic of",,, 160 | East Timor,withdrawn,TMP,,, 161 | "Czechoslovakia, Czechoslovak Socialist Republic",withdrawn,CSK,,, 162 | "USSR, Union of Soviet Socialist Republics",withdrawn,SUN,,, 163 | "Yemen, Yemen Arab Republic",withdrawn,YEM,,, 164 | "Yemen, Democratic, People's Democratic Republic of",withdrawn,YMD,,, 165 | "Yugoslavia, Socialist Federal Republic of",withdrawn,YUG,,, 166 | "Germany, Federal Republic of",withdrawn,DEU,,, 167 | German Democratic Republic,withdrawn,DDR,,, 168 | US Miscellaneous Pacific Islands,withdrawn,PUS,,, 169 | Wake Island,withdrawn,WAK,,, 170 | Serbia and Montenegro,withdrawn,SCG,,, 171 | Netherlands Antilles,withdrawn,ANT,,, 172 | Pacific Islands (trust territory),withdrawn,PCI,,, -------------------------------------------------------------------------------- /geograpy/data/aliases.csv: -------------------------------------------------------------------------------- 1 | name,alias 2 | UK,GB 3 | USA,United States of America 4 | United States,United States of America 5 | -------------------------------------------------------------------------------- /geograpy/data/queries.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Pre-configured Queries for Geograpy3 location lookup database 3 | # 4 | # WF 2021-08-19 5 | 'LabelLookup example #1': 6 | sql: | 7 | SELECT * 8 | FROM CityLookup 9 | WHERE label IN ('Berlin',',St. Petersburg','Singapore','Athens') 10 | ORDER BY pop DESC 11 | 'LabelLookup example #2': 12 | sql: | 13 | SELECT * from RegionLookup WHERE label IN ('CA') 14 | 'LabelLookup example #3': 15 | sql: | 16 | SELECT * from CountryLookup WHERE label IN ('CA') 17 | 'Countries': 18 | title: Countries sorted by ISO code 19 | description: Countries with population and coordinates sorted by ISO code 20 | sparql: | 21 | # get a list of countries 22 | # for geograpy3 library 23 | # see https://github.com/somnathrakshit/geograpy3/issues/15 24 | PREFIX rdfs: 25 | PREFIX wd: 26 | PREFIX wdt: 27 | PREFIX p: 28 | PREFIX ps: 29 | PREFIX pq: 30 | # get City details with Country 31 | SELECT DISTINCT ?wikidataid ?name ?iso ?pop ?coord 32 | WHERE { 33 | BIND (?countryQ AS ?wikidataid) 34 | 35 | # instance of Country 36 | # inverse path see https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization#Inverse_property_paths 37 | wd:Q6256 ^wdt:P279*/^wdt:P31 ?countryQ . 38 | 39 | # VALUES ?country { wd:Q55}. 40 | # label for the country 41 | ?countryQ rdfs:label ?name filter (lang(?name) = "en"). 42 | # get the continent (s) 43 | #OPTIONAL { 44 | # ?country wdt:P30 ?continent. 45 | # ?continent rdfs:label ?continentLabel filter (lang(?continentLabel) = "en"). 46 | #} 47 | # get the coordinates 48 | OPTIONAL { 49 | ?countryQ wdt:P625 ?coord. 50 | } 51 | # https://www.wikidata.org/wiki/Property:P297 ISO 3166-1 alpha-2 code 52 | ?countryQ wdt:P297 ?iso. 53 | # population of country 54 | OPTIONAL 55 | { 56 | SELECT ?countryQ (max(?countryPopulationValue) as ?pop) 57 | WHERE { 58 | ?countryQ wdt:P1082 ?countryPopulationValue 59 | } group by ?countryQ 60 | } 61 | # https://www.wikidata.org/wiki/Property:P2132 62 | # nominal GDP per capita 63 | # OPTIONAL { ?country wdt:P2132 ?countryGDP_perCapitaValue. } 64 | } 65 | ORDER BY ?iso 66 | 67 | -------------------------------------------------------------------------------- /geograpy/extraction.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import nltk 4 | from newspaper import Article 5 | 6 | from geograpy.labels import Labels 7 | 8 | 9 | class Extractor(object): 10 | """ 11 | Extract geo context for text or from url 12 | """ 13 | 14 | def __init__(self, text=None, url=None, debug=False): 15 | """ 16 | Constructor 17 | Args: 18 | 19 | text(string): the text to analyze 20 | url(string): the url to read the text to analyze from 21 | debug(boolean): if True show debug information 22 | """ 23 | if not text and not url: 24 | raise Exception("text or url is required") 25 | self.debug = debug 26 | self.text = text 27 | self.url = url 28 | self.places = [] 29 | nltk_packages = [ 30 | "maxent_ne_chunker", 31 | "words", 32 | "treebank", 33 | "maxent_treebank_pos_tagger", 34 | "punkt", 35 | "averaged_perceptron_tagger", 36 | ] 37 | for nltk_package in nltk_packages: 38 | try: 39 | import nltk 40 | 41 | nltk.data.find(nltk_package) 42 | except LookupError: 43 | nltk.downloader.download(nltk_package, quiet=True) 44 | import nltk 45 | 46 | def set_text(self): 47 | """ 48 | Setter for text 49 | """ 50 | if not self.text and self.url: 51 | a = Article(self.url) 52 | a.download() 53 | a.parse() 54 | self.text = a.text 55 | 56 | def split(self, delimiter=r","): 57 | """ 58 | simpler regular expression splitter with not entity check 59 | 60 | hat tip: https://stackoverflow.com/a/1059601/1497139 61 | """ 62 | self.set_text() 63 | self.places = re.split(delimiter, self.text) 64 | 65 | def find_geoEntities(self): 66 | """ 67 | Find geographic entities 68 | 69 | Returns: 70 | list: 71 | List of places 72 | """ 73 | self.find_entities(Labels.geo) 74 | return self.places 75 | 76 | def find_entities(self, labels=Labels.default): 77 | """ 78 | Find entities with the given labels set self.places and returns it 79 | Args: 80 | labels: 81 | Labels: The labels to filter 82 | Returns: 83 | list: 84 | List of places 85 | """ 86 | self.set_text() 87 | 88 | text = nltk.word_tokenize(self.text) 89 | nes = nltk.ne_chunk(nltk.pos_tag(text)) 90 | 91 | for ne in nes: 92 | if type(ne) is nltk.tree.Tree: 93 | nelabel = ne.label() 94 | if nelabel in labels: 95 | leaves = ne.leaves() 96 | if self.debug: 97 | print(leaves) 98 | self.places.append(" ".join([i[0] for i in leaves])) 99 | return self.places 100 | -------------------------------------------------------------------------------- /geograpy/geograpy_nltk.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # converted to python script 2024-03-29 3 | import nltk 4 | def main(): 5 | nltk.downloader.download('maxent_ne_chunker') 6 | nltk.downloader.download('words') 7 | nltk.downloader.download('treebank') 8 | nltk.downloader.download('maxent_treebank_pos_tagger') 9 | nltk.downloader.download('punkt') 10 | # since 2020-09 11 | nltk.downloader.download('averaged_perceptron_tagger') 12 | 13 | if __name__ == "__main__": 14 | main() 15 | -------------------------------------------------------------------------------- /geograpy/labels.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 2020-09-10 3 | 4 | @author: wf 5 | """ 6 | 7 | 8 | class Labels(object): 9 | """ 10 | NLTK labels 11 | """ 12 | 13 | default = ["GPE", "GSP", "PERSON", "ORGANIZATION"] 14 | geo = ["GPE", "GSP"] 15 | -------------------------------------------------------------------------------- /geograpy/nominatim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 2021-12-27 3 | 4 | @author: wf 5 | """ 6 | import logging 7 | import os 8 | from pathlib import Path 9 | 10 | from geopy.geocoders import Nominatim as GeoNominatim 11 | from OSMPythonTools.cachingStrategy import JSON, CachingStrategy 12 | from OSMPythonTools.nominatim import Nominatim 13 | 14 | 15 | class NominatimWrapper(object): 16 | """ 17 | Nominatim Wrapper to hide technical details of Nominatim interface 18 | """ 19 | 20 | def __init__(self, cacheDir: str = None, user_agent: str = "ConferenceCorpus"): 21 | """ 22 | Constructor 23 | 24 | create a nominatim instance for the given cacheDir - if cacheDir is None use ~/.nominatim as cachedir 25 | 26 | Args: 27 | cacheDir(str): the path to the cache directory to be use by Noninatims JSON caching Strategy 28 | user_agent(str): the user_agent to use for the geolocator 29 | 30 | """ 31 | if cacheDir is None: 32 | home = str(Path.home()) 33 | cacheDir = f"{home}/.nominatim" 34 | self.cacheDir = cacheDir 35 | if not os.path.exists(self.cacheDir): 36 | os.makedirs(cacheDir) 37 | logging.getLogger("OSMPythonTools").setLevel(logging.ERROR) 38 | CachingStrategy.use(JSON, cacheDir=cacheDir) 39 | self.nominatim = Nominatim() 40 | self.geolocator = GeoNominatim(user_agent=user_agent) 41 | 42 | def lookupWikiDataId(self, locationText: str): 43 | """ 44 | lookup the Wikidata Identifier for the given locationText (if any) 45 | 46 | Args: 47 | locationText(str): the location text to search for 48 | 49 | Return: 50 | the wikidata Q identifier most fitting the given location text 51 | 52 | """ 53 | wikidataId = None 54 | nresult = self.nominatim.query(locationText, params={"extratags": "1"}) 55 | nlod = nresult._json 56 | if len(nlod) > 0: 57 | nrecord = nlod[0] 58 | if "extratags" in nrecord: 59 | extratags = nrecord["extratags"] 60 | if "wikidata" in extratags: 61 | wikidataId = extratags["wikidata"] 62 | return wikidataId 63 | -------------------------------------------------------------------------------- /geograpy/places.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | from geograpy.locator import City, Locator, Region 4 | 5 | from .utils import fuzzy_match, remove_non_ascii 6 | 7 | """ 8 | Takes a list of place names and works place designation (country, region, etc) 9 | and relationships between places (city is inside region is inside country, etc) 10 | """ 11 | 12 | 13 | class PlaceContext(Locator): 14 | """ 15 | Adds context information to a place name 16 | """ 17 | 18 | def __init__( 19 | self, place_names: list, setAll: bool = True, correctMisspelling: bool = False 20 | ): 21 | """ 22 | Constructor 23 | 24 | Args: 25 | place_names: 26 | list: The place names to check 27 | setAll: 28 | boolean: True if all context information should immediately be set 29 | db_file: 30 | string: Path to the database file to be used - if None the default "locs.db" will be used 31 | """ 32 | super().__init__() 33 | self.correctMisspelling = correctMisspelling 34 | self.places = self.normalizePlaces(place_names) 35 | if setAll: 36 | self.setAll() 37 | 38 | def __str__(self): 39 | """ 40 | Return a string representation of me 41 | """ 42 | text = "countries=%s\nregions=%s\ncities=%s\nother=%s" % ( 43 | self.countries, 44 | self.regions, 45 | self.cities, 46 | self.other, 47 | ) 48 | return text 49 | 50 | def getRegions(self, countryName: str) -> list: 51 | """ 52 | get a list of regions for the given countryName 53 | 54 | countryName(str): the countryName to check 55 | """ 56 | regions = [] 57 | queryString = """SELECT r.* FROM 58 | COUNTRIES c 59 | JOIN regions r ON r.countryId=c.wikidataid 60 | WHERE c.name=(?)""" 61 | params = (countryName,) 62 | regionRecords = self.sqlDB.query(queryString, params) 63 | for regionRecord in regionRecords: 64 | region = Region.fromRecord(regionRecord) 65 | regions.append(region) 66 | return regions 67 | 68 | def get_region_names(self, countryName: str) -> list: 69 | """ 70 | get region names for the given country 71 | 72 | Args: 73 | countryName(str): the name of the country 74 | """ 75 | if self.correctMisspelling: 76 | countryName = self.correct_country_misspelling(countryName) 77 | regionOfCountryQuery = """SELECT name 78 | FROM regions 79 | WHERE countryId IN ( 80 | SELECT wikidataid 81 | FROM countries 82 | WHERE name LIKE (?) 83 | OR wikidataid IN ( 84 | SELECT wikidataid 85 | FROM country_labels 86 | WHERE label LIKE (?) 87 | ) 88 | )""" 89 | regionRecords = self.sqlDB.query( 90 | regionOfCountryQuery, 91 | params=( 92 | countryName, 93 | countryName, 94 | ), 95 | ) 96 | return [r.get("name") for r in regionRecords] 97 | 98 | def setAll(self): 99 | """ 100 | Set all context information 101 | """ 102 | self.set_countries() 103 | self.set_regions() 104 | self.set_cities() 105 | self.set_other() 106 | 107 | def set_countries(self): 108 | """ 109 | get the country information from my places 110 | """ 111 | countries = [] 112 | for place in self.places: 113 | country = self.getCountry(place) 114 | if country is not None: 115 | countries.append(country.name) 116 | 117 | self.country_mentions = Counter(countries).most_common() 118 | self.countries = list(set(countries)) 119 | pass 120 | 121 | def set_regions(self): 122 | """ 123 | get the region information from my places (limited to the already identified countries) 124 | """ 125 | regions = [] 126 | self.country_regions = {} 127 | region_names = {} 128 | 129 | if not self.countries: 130 | self.set_countries() 131 | 132 | def region_match(place_name: str, region_name: str) -> bool: 133 | """ 134 | Tests the similarity of the given strings after removing non ascii characters. 135 | Args: 136 | place_name(str): Place name 137 | region_name(str): valid region name to test against 138 | 139 | Returns: 140 | True if the similarity of both values is greater equals 80%. Otherwise False 141 | """ 142 | return fuzzy_match( 143 | remove_non_ascii(place_name), remove_non_ascii(region_name) 144 | ) 145 | 146 | def is_region(place_name: str, region_names: list): 147 | """ 148 | Filters out the regions that are not similar to the given place_name 149 | Args: 150 | place_name(str): place name to check against the regions 151 | region_names(list): List of valid region names 152 | 153 | Returns: 154 | List of regions that are similar to the given place_name 155 | """ 156 | return any([region_match(place_name, rn) for rn in region_names]) 157 | 158 | for country in self.countries: 159 | region_names = self.get_region_names(country) 160 | matched_regions = [ 161 | p for p in set(self.places) if is_region(p, region_names) 162 | ] 163 | 164 | regions += matched_regions 165 | self.country_regions[country] = list(set(matched_regions)) 166 | 167 | self.region_mentions = Counter(regions).most_common() 168 | self.regions = list(set(regions)) 169 | 170 | def set_cities(self): 171 | """ 172 | set the cities information 173 | """ 174 | self.cities = [] 175 | self.country_cities = {} 176 | self.address_strings = [] 177 | 178 | if not self.countries: 179 | self.set_countries() 180 | 181 | if not self.regions: 182 | self.set_regions() 183 | 184 | if not self.db_has_data(): 185 | self.populate_db() 186 | # ToDo: Duplicate with Locator.city_for_name e.g. extend method to support multiple names 187 | placesWithoutDuplicates = set(self.places) 188 | params = ",".join("?" * len(placesWithoutDuplicates)) 189 | query = "SELECT * FROM CityLookup WHERE name IN (" + params + ")" 190 | cityLookupRecords = self.sqlDB.query(query, list(placesWithoutDuplicates)) 191 | cityLookupRecords.sort( 192 | key=lambda cityRecord: float(cityRecord.get("pop")) 193 | if cityRecord.get("pop") is not None 194 | else 0.0, 195 | reverse=True, 196 | ) 197 | for cityLookupRecord in cityLookupRecords: 198 | city = City.fromCityLookup(cityLookupRecord) 199 | 200 | if city.name not in self.cities: 201 | self.cities.append(city.name) 202 | 203 | countryName = city.country.name 204 | if countryName not in self.countries: 205 | self.countries.append(countryName) 206 | self.country_mentions.append((countryName, 1)) 207 | 208 | if countryName not in self.country_cities: 209 | self.country_cities[countryName] = [] 210 | 211 | if city.name not in self.country_cities[countryName]: 212 | self.country_cities[countryName].append(city.name) 213 | regionName = city.region.name 214 | if ( 215 | countryName in self.country_regions 216 | and regionName in self.country_regions[countryName] 217 | ): 218 | address = f"{city.name}, {regionName}, {countryName}" 219 | self.address_strings.append(address) 220 | 221 | all_cities = [p for p in self.places if p in self.cities] 222 | self.city_mentions = Counter(all_cities).most_common() 223 | 224 | def set_other(self): 225 | if not self.cities: 226 | self.set_cities() 227 | 228 | def unused(place_name): 229 | places = [self.countries, self.cities, self.regions] 230 | return all( 231 | self.correct_country_misspelling(place_name) not in l for l in places 232 | ) 233 | 234 | self.other = [p for p in self.places if unused(p)] 235 | -------------------------------------------------------------------------------- /geograpy/utils.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import os 3 | import shutil 4 | import time 5 | import urllib.request 6 | 7 | import jellyfish 8 | 9 | 10 | class Download: 11 | """ 12 | Utility functions for downloading data 13 | """ 14 | 15 | @staticmethod 16 | def getURLContent(url: str): 17 | with urllib.request.urlopen(url) as urlResponse: 18 | content = urlResponse.read().decode() 19 | return content 20 | 21 | @staticmethod 22 | def getFileContent(path: str): 23 | with open(path, "r") as file: 24 | content = file.read() 25 | return content 26 | 27 | @staticmethod 28 | def needsDownload(filePath: str, force: bool = False) -> bool: 29 | """ 30 | check if a download of the given filePath is necessary that is the file 31 | does not exist has a size of zero or the download should be forced 32 | 33 | Args: 34 | filePath(str): the path of the file to be checked 35 | force(bool): True if the result should be forced to True 36 | 37 | Return: 38 | bool: True if a download for this file needed 39 | """ 40 | if not os.path.isfile(filePath): 41 | result = True 42 | else: 43 | stats = os.stat(filePath) 44 | size = stats.st_size 45 | result = force or size == 0 46 | return result 47 | 48 | @staticmethod 49 | def downloadBackupFile( 50 | url: str, fileName: str, targetDirectory: str, force: bool = False 51 | ): 52 | """ 53 | Downloads from the given url the zip-file and extracts the file corresponding to the given fileName. 54 | 55 | Args: 56 | url: url linking to a downloadable gzip file 57 | fileName: Name of the file that should be extracted from gzip file 58 | targetDirectory(str): download the file this directory 59 | force (bool): True if the download should be forced 60 | 61 | Returns: 62 | Name of the extracted file with path to the backup directory 63 | """ 64 | extractTo = f"{targetDirectory}/{fileName}" 65 | # we might want to check whether a new version is available 66 | if Download.needsDownload(extractTo, force=force): 67 | if not os.path.isdir(targetDirectory): 68 | os.makedirs(targetDirectory) 69 | zipped = f"{extractTo}.gz" 70 | print(f"Downloading {zipped} from {url} ... this might take a few seconds") 71 | urllib.request.urlretrieve(url, zipped) 72 | print(f"Unzipping {extractTo} from {zipped}") 73 | with gzip.open(zipped, "rb") as gzipped: 74 | with open(extractTo, "wb") as unzipped: 75 | shutil.copyfileobj(gzipped, unzipped) 76 | print("Extracting completed") 77 | if not os.path.isfile(extractTo): 78 | raise (f"could not extract {fileName} from {zipped}") 79 | return extractTo 80 | 81 | 82 | class Profiler: 83 | """ 84 | simple profiler 85 | """ 86 | 87 | def __init__(self, msg, profile=True): 88 | """ 89 | construct me with the given msg and profile active flag 90 | 91 | Args: 92 | msg(str): the message to show if profiling is active 93 | profile(bool): True if messages should be shown 94 | """ 95 | self.msg = msg 96 | self.profile = profile 97 | self.starttime = time.time() 98 | if profile: 99 | print(f"Starting {msg} ...") 100 | 101 | def time(self, extraMsg=""): 102 | """ 103 | time the action and print if profile is active 104 | """ 105 | elapsed = time.time() - self.starttime 106 | if self.profile: 107 | print(f"{self.msg}{extraMsg} took {elapsed:5.1f} s") 108 | return elapsed 109 | 110 | 111 | def remove_non_ascii(s): 112 | """ 113 | Remove non ascii chars from the given string 114 | Args: 115 | s: 116 | string: The string to remove chars from 117 | Returns: 118 | string: The result string with non-ascii chars removed 119 | 120 | Hat tip: http://stackoverflow.com/a/1342373/2367526 121 | """ 122 | return "".join(i for i in s if ord(i) < 128) 123 | 124 | 125 | def fuzzy_match(s1, s2, max_dist=0.8): 126 | """ 127 | Fuzzy match the given two strings with the given maximum distance 128 | jellyfish jaro_winkler_similarity based on https://en.wikipedia.org/wiki/Jaro-Winkler_distance 129 | Args: 130 | s1: 131 | string: First string 132 | s2: 133 | string: Second string 134 | max_dist: 135 | float: The distance - default: 0.8 136 | Returns: 137 | True if the match is greater equals max_dist. Otherwise false 138 | """ 139 | return jellyfish.jaro_winkler_similarity(s1, s2) >= max_dist 140 | -------------------------------------------------------------------------------- /geograpy/version.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 2024-03-29 3 | 4 | @author: wf 5 | """ 6 | from dataclasses import dataclass 7 | 8 | import geograpy 9 | 10 | 11 | @dataclass 12 | class Version: 13 | """ 14 | Version handling for the geograpy3 project. 15 | """ 16 | 17 | name = "geograpy3" 18 | version = geograpy.__version__ 19 | date = "2023-09-10" 20 | updated = "2024-03-29" 21 | description = "Extract countries, regions, and cities from a URL or text" 22 | 23 | authors = "Somnath Rakshit, Wolfgang Fahl, Tim Holzheim" # Combining all authors into a single string 24 | 25 | doc_url = "https://geograpy3.readthedocs.io" 26 | chat_url = "https://github.com/somnathrakshit/geograpy3/discussions" 27 | cm_url = "https://github.com/somnathrakshit/geograpy3" 28 | 29 | license = """Copyright 2023-2024 contributors. All rights reserved. 30 | 31 | Licensed under the Apache License 2.0 32 | http://www.apache.org/licenses/LICENSE-2.0 33 | 34 | Distributed on an "AS IS" basis without warranties 35 | or conditions of any kind, either express or implied.""" 36 | 37 | longDescription = f"""{name} version {version} 38 | {description} 39 | 40 | Created by {authors} on {date} last updated {updated}. 41 | For more information, visit {doc_url}.""" 42 | -------------------------------------------------------------------------------- /geograpy/wikidata.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 2020-09-23 3 | 4 | @author: wf 5 | """ 6 | import re 7 | 8 | from lodstorage.sparql import SPARQL 9 | 10 | from geograpy.utils import Profiler 11 | 12 | 13 | class Wikidata(object): 14 | """ 15 | Wikidata access 16 | """ 17 | 18 | def __init__( 19 | self, endpoint="https://query.wikidata.org/sparql", profile: bool = True 20 | ): 21 | """ 22 | Constructor 23 | """ 24 | self.endpoint = endpoint 25 | self.profile = profile 26 | 27 | def query(self, msg, queryString: str, limit=None) -> list: 28 | """ 29 | get the query result 30 | 31 | Args: 32 | msg(str): the profile message to display 33 | queryString(str): the query to execute 34 | 35 | Return: 36 | list: the list of dicts with the result 37 | """ 38 | profile = Profiler(msg, profile=self.profile) 39 | wd = SPARQL(self.endpoint) 40 | limitedQuery = queryString 41 | if limit is not None: 42 | limitedQuery = f"{queryString} LIMIT {limit}" 43 | results = wd.query(limitedQuery) 44 | lod = wd.asListOfDicts(results) 45 | for record in lod: 46 | for key in list(record.keys()): 47 | value = record[key] 48 | if isinstance(value, str): 49 | if value.startswith("http://www.wikidata.org/"): 50 | record[key] = self.getWikidataId(value) 51 | if key.lower().endswith("coord"): 52 | lat, lon = Wikidata.getCoordinateComponents(value) 53 | record["lat"] = lat 54 | record["lon"] = lon 55 | record.pop(key) 56 | 57 | profile.time(f"({len(lod)})") 58 | return lod 59 | 60 | def store2DB(self, lod, tableName: str, primaryKey: str = None, sqlDB=None): 61 | """ 62 | store the given list of dicts to the database 63 | 64 | Args: 65 | lod(list): the list of dicts 66 | tableName(str): the table name to use 67 | primaryKey(str): primary key (if any) 68 | sqlDB(SQLDB): target SQL database 69 | """ 70 | msg = f"Storing {tableName}" 71 | profile = Profiler(msg, profile=self.profile) 72 | entityInfo = sqlDB.createTable( 73 | lod, 74 | entityName=tableName, 75 | primaryKey=primaryKey, 76 | withDrop=True, 77 | sampleRecordCount=-1, 78 | ) 79 | sqlDB.store(lod, entityInfo, fixNone=True) 80 | profile.time() 81 | 82 | def getCountries(self, limit=None): 83 | """ 84 | get a list of countries 85 | 86 | `try query `_ 87 | 88 | """ 89 | queryString = """# get a list of countries 90 | # for geograpy3 library 91 | # see https://github.com/somnathrakshit/geograpy3/issues/15 92 | PREFIX rdfs: 93 | PREFIX wd: 94 | PREFIX wdt: 95 | PREFIX p: 96 | PREFIX ps: 97 | PREFIX pq: 98 | # get City details with Country 99 | SELECT DISTINCT ?wikidataid ?name ?iso ?pop ?coord 100 | WHERE { 101 | BIND (?countryQ AS ?wikidataid) 102 | 103 | # instance of Country 104 | # inverse path see https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization#Inverse_property_paths 105 | wd:Q6256 ^wdt:P279*/^wdt:P31 ?countryQ . 106 | 107 | # VALUES ?country { wd:Q55}. 108 | # label for the country 109 | ?countryQ rdfs:label ?name filter (lang(?name) = "en"). 110 | # get the continent (s) 111 | #OPTIONAL { 112 | # ?country wdt:P30 ?continent. 113 | # ?continent rdfs:label ?continentLabel filter (lang(?continentLabel) = "en"). 114 | #} 115 | # get the coordinates 116 | OPTIONAL { 117 | ?countryQ wdt:P625 ?coord. 118 | } 119 | # https://www.wikidata.org/wiki/Property:P297 ISO 3166-1 alpha-2 code 120 | ?countryQ wdt:P297 ?iso. 121 | # population of country 122 | OPTIONAL 123 | { 124 | SELECT ?countryQ (max(?countryPopulationValue) as ?pop) 125 | WHERE { 126 | ?countryQ wdt:P1082 ?countryPopulationValue 127 | } group by ?countryQ 128 | } 129 | # https://www.wikidata.org/wiki/Property:P2132 130 | # nominal GDP per capita 131 | # OPTIONAL { ?country wdt:P2132 ?countryGDP_perCapitaValue. } 132 | } 133 | ORDER BY ?iso""" 134 | msg = "Getting countries from wikidata ETA 10s" 135 | countryList = self.query(msg, queryString, limit=limit) 136 | return countryList 137 | 138 | def getRegions(self, limit=None): 139 | """ 140 | get Regions from Wikidata 141 | 142 | `try query `_ 143 | """ 144 | queryString = """# get a list of regions 145 | # for geograpy3 library 146 | # see https://github.com/somnathrakshit/geograpy3/issues/15 147 | PREFIX rdfs: 148 | PREFIX wd: 149 | PREFIX wdt: 150 | PREFIX wikibase: 151 | SELECT DISTINCT ?countryId (?regionQ as ?wikidataid) ?name ?iso ?pop ?coord 152 | WHERE 153 | { 154 | # administrative unit of first order 155 | ?regionQ wdt:P31/wdt:P279* wd:Q10864048. 156 | OPTIONAL { 157 | ?regionQ rdfs:label ?name filter (lang(?name) = "en"). 158 | } 159 | # isocode state/province (mandatory - filters historic regions while at it ...) 160 | # filter historic regions 161 | # FILTER NOT EXISTS {?region wdt:P576 ?end} 162 | { 163 | SELECT ?regionQ (max(?regionAlpha2) as ?iso) (max(?regionPopulationValue) as ?pop) (max(?locationValue) as ?coord) 164 | WHERE { 165 | ?regionQ wdt:P300 ?regionAlpha2. 166 | # get the population 167 | # https://www.wikidata.org/wiki/Property:P1082 168 | OPTIONAL { 169 | ?regionQ wdt:P1082 ?regionPopulationValue 170 | } 171 | # get the location 172 | # https://www.wikidata.org/wiki/Property:P625 173 | OPTIONAL { 174 | ?regionQ wdt:P625 ?locationValue. 175 | } 176 | } GROUP BY ?regionQ 177 | } 178 | # # https://www.wikidata.org/wiki/Property:P297 179 | OPTIONAL { 180 | ?regionQ wdt:P17 ?countryId. 181 | } 182 | } ORDER BY ?iso""" 183 | msg = "Getting regions from wikidata ETA 15s" 184 | regionList = self.query(msg, queryString, limit=limit) 185 | return regionList 186 | 187 | def getCities(self, limit=1000000): 188 | """ 189 | get all human settlements as list of dict with duplicates for label, region, country ... 190 | """ 191 | queryString = """PREFIX rdfs: 192 | PREFIX wdt: 193 | PREFIX wd: 194 | PREFIX skos: 195 | 196 | SELECT DISTINCT (?cityQ as ?wikidataid) ?city ?altLabel ?geoNameId ?gndId ?cityPopulation ?cityCoord ?regionId ?countryId 197 | WHERE { 198 | # instance of human settlement https://www.wikidata.org/wiki/Q486972 199 | wd:Q486972 ^wdt:P279*/^wdt:P31 ?cityQ . 200 | # Values 201 | # VALUES ?cityQ { wd:Q656 } 202 | 203 | # label of the City 204 | ?cityQ rdfs:label ?city filter (lang(?city) = "en"). 205 | 206 | OPTIONAL { 207 | ?cityQ skos:altLabel ?altLabel . 208 | FILTER (lang(?altLabel) = "en") 209 | } 210 | 211 | # geoName Identifier 212 | OPTIONAL { 213 | ?cityQ wdt:P1566 ?geoNameId. 214 | } 215 | 216 | # GND-ID 217 | OPTIONAL { 218 | ?cityQ wdt:P227 ?gndId. 219 | } 220 | 221 | # population of city 222 | OPTIONAL { 223 | SELECT ?cityQ (max(?cityPopulationValue) as ?cityPopulation) 224 | WHERE { 225 | ?cityQ wdt:P1082 ?cityPopulationValue 226 | } group by ?cityQ 227 | } 228 | 229 | OPTIONAL{ 230 | ?cityQ wdt:P625 ?cityCoord . 231 | } 232 | 233 | # region this city belongs to 234 | OPTIONAL { 235 | ?cityQ wdt:P131 ?regionId . 236 | } 237 | 238 | # country this city belongs to 239 | OPTIONAL { 240 | ?cityQ wdt:P17 ?countryId . 241 | } 242 | 243 | } 244 | """ 245 | msg = "Getting cities (human settlements) from wikidata ETA 50 s" 246 | citiesList = self.query(msg, queryString, limit=limit) 247 | return citiesList 248 | 249 | def getCitiesForRegion(self, regionId, msg): 250 | """ 251 | get the cities for the given Region 252 | """ 253 | regionPath = ( 254 | "?region ^wdt:P131/^wdt:P131/^wdt:P131 ?cityQ." 255 | if regionId in ["Q980", "Q21"] 256 | else "?cityQ wdt:P131* ?region." 257 | ) 258 | queryString = """# get cities by region for geograpy3 259 | PREFIX rdfs: 260 | PREFIX wdt: 261 | PREFIX wd: 262 | 263 | SELECT distinct (?cityQ as ?wikidataid) ?name ?geoNameId ?gndId ?regionId ?countryId ?pop ?coord WHERE { 264 | VALUES ?hsType { 265 | wd:Q1549591 wd:Q3957 wd:Q5119 wd:Q15284 wd:Q62049 wd:Q515 wd:Q1637706 wd:Q1093829 wd:Q486972 wd:Q532 266 | } 267 | 268 | VALUES ?region { 269 | wd:%s 270 | } 271 | 272 | # region the city should be in 273 | %s 274 | 275 | # type of human settlement to try 276 | ?hsType ^wdt:P279*/^wdt:P31 ?cityQ. 277 | 278 | # label of the City 279 | ?cityQ rdfs:label ?name filter (lang(?name) = "en"). 280 | 281 | # geoName Identifier 282 | OPTIONAL { 283 | ?cityQ wdt:P1566 ?geoNameId. 284 | } 285 | 286 | # GND-ID 287 | OPTIONAL { 288 | ?cityQ wdt:P227 ?gndId. 289 | } 290 | 291 | OPTIONAL{ 292 | ?cityQ wdt:P625 ?coord . 293 | } 294 | 295 | # region this city belongs to 296 | OPTIONAL { 297 | ?cityQ wdt:P131 ?regionId . 298 | } 299 | 300 | OPTIONAL { 301 | ?cityQ wdt:P1082 ?pop 302 | } 303 | 304 | # country this city belongs to 305 | OPTIONAL { 306 | ?cityQ wdt:P17 ?countryId . 307 | } 308 | }""" % ( 309 | regionId, 310 | regionPath, 311 | ) 312 | regionCities = self.query(msg, queryString) 313 | return regionCities 314 | 315 | def getCityStates(self, limit=None): 316 | """ 317 | get city states from Wikidata 318 | 319 | `try query `_ 320 | """ 321 | queryString = """# get a list of city states 322 | # for geograpy3 library 323 | PREFIX rdfs: 324 | PREFIX wd: 325 | PREFIX wdt: 326 | PREFIX wikibase: 327 | SELECT DISTINCT ?countryId (?cityStateQ as ?wikidataid) ?name ?iso ?pop ?coord 328 | WHERE 329 | { 330 | # all citiy states 331 | ?cityStateQ wdt:P31 wd:Q133442 . 332 | ?cityStateQ rdfs:label ?name filter (lang(?name) = "en"). 333 | { 334 | SELECT ?cityStateQ (max(?isoCode) as ?iso) (max(?populationValue) as ?pop) (max(?locationValue) as ?coord) 335 | WHERE { 336 | ?cityStateQ wdt:P300|wdt:P297 ?isoCode. 337 | # get the population 338 | # https://www.wikidata.org/wiki/Property:P1082 339 | OPTIONAL { 340 | ?cityStateQ wdt:P1082 ?populationValue 341 | } 342 | # get the location 343 | # https://www.wikidata.org/wiki/Property:P625 344 | OPTIONAL { 345 | ?cityStateQ wdt:P625 ?locationValue. 346 | } 347 | } GROUP BY ?cityStateQ 348 | } 349 | OPTIONAL { 350 | ?cityStateQ wdt:P17 ?countryId. 351 | } 352 | } ORDER BY ?iso""" 353 | msg = "Getting regions from wikidata ETA 15s" 354 | cityStateList = self.query(msg, queryString, limit=limit) 355 | return cityStateList 356 | 357 | @staticmethod 358 | def getCoordinateComponents(coordinate: str) -> (float, float): 359 | """ 360 | Converts the wikidata coordinate representation into its subcomponents longitude and latitude 361 | Example: 'Point(-118.25 35.05694444)' results in ('-118.25' '35.05694444') 362 | 363 | Args: 364 | coordinate: coordinate value in the format as returned by wikidata queries 365 | 366 | Returns: 367 | Returns the longitude and latitude of the given coordinate as separate values 368 | """ 369 | # https://stackoverflow.com/a/18237992/1497139 370 | floatRegex = r"[-+]?\d+([.,]\d*)?" 371 | regexp = rf"Point\((?P{floatRegex})\s+(?P{floatRegex})\)" 372 | cMatch = None 373 | if coordinate: 374 | try: 375 | cMatch = re.search(regexp, coordinate) 376 | except Exception as ex: 377 | # ignore 378 | pass 379 | if cMatch: 380 | latStr = cMatch.group("lat") 381 | lonStr = cMatch.group("lon") 382 | lat, lon = float(latStr.replace(",", ".")), float(lonStr.replace(",", ".")) 383 | if lon > 180: 384 | lon = lon - 360 385 | return lat, lon 386 | else: 387 | # coordinate does not have the expected format 388 | return None, None 389 | 390 | @staticmethod 391 | def getWikidataId(wikidataURL: str): 392 | """ 393 | Extracts the wikidata id from the given wikidata URL 394 | 395 | Args: 396 | wikidataURL: wikidata URL the id should be extracted from 397 | 398 | Returns: 399 | The wikidata id if present in the given wikidata URL otherwise None 400 | """ 401 | 402 | # regex pattern taken from https://www.wikidata.org/wiki/Q43649390 and extended to also support property ids 403 | wikidataidMatch = re.search(r"[PQ][1-9]\d*", wikidataURL) 404 | if wikidataidMatch and wikidataidMatch.group(0): 405 | wikidataid = wikidataidMatch.group(0) 406 | return wikidataid 407 | else: 408 | return None 409 | 410 | @staticmethod 411 | def getValuesClause(varName: str, values, wikidataEntities: bool = True): 412 | """ 413 | generates the SPARQL value clause for the given variable name containing the given values 414 | Args: 415 | varName: variable name for the ValuesClause 416 | values: values for the clause 417 | wikidataEntities(bool): if true the wikidata prefix is added to the values otherwise it is expected taht the given values are proper IRIs 418 | 419 | Returns: 420 | str 421 | """ 422 | clauseValues = "" 423 | if isinstance(values, list): 424 | for value in values: 425 | if wikidataEntities: 426 | clauseValues += f"wd:{value} " 427 | else: 428 | clauseValues += f"{value} " 429 | else: 430 | if wikidataEntities: 431 | clauseValues = f"wd:{values} " 432 | else: 433 | clauseValues = f"{values} " 434 | clause = "VALUES ?%s { %s }" % (varName, clauseValues) 435 | return clause 436 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "geograpy3" 7 | description = "Extract countries, regions and cities from a URL or text" 8 | keywords = [ "geography", "locations", "extraction", "text analysis"] 9 | home-page = "https://github.com/somnathrakshit/geograpy3" 10 | readme = "README.md" 11 | license = {text = "Apache-2.0"} 12 | authors = [ 13 | {name = "Somnath Rakshit", email = "somnath52@gmail.com"} 14 | ] 15 | maintainers = [ 16 | { name = "Somnath Rakshit", email = "somnath52@gmail.com"}, 17 | { name = "Wolfgang Fahl", email = "wf@bitplan.com" }, 18 | { name = "Tim Holzheim", email = "tim.holzheim@rwth-aachen.de" } 19 | ] 20 | classifiers=[ 21 | "Programming Language :: Python", 22 | "Programming Language :: Python :: 3.9", 23 | "Programming Language :: Python :: 3.10", 24 | "Programming Language :: Python :: 3.11", 25 | "Programming Language :: Python :: 3.12" 26 | ] 27 | dependencies = [ 28 | # https://pypi.org/project/newspaper3k/ 29 | "newspaper3k>=0.2.8", 30 | # https://pypi.org/project/nltk/ 31 | "nltk>=3.8.1", 32 | # https://pypi.org/project/jellyfish/ 33 | "jellyfish>=1.0.3", 34 | # https://pypi.org/project/numpy/ 35 | "numpy>=1.26.4", 36 | # https://pypi.org/project/pyLodStorage/ 37 | "pylodstorage>=0.10.3", 38 | # https://pypi.org/project/sphinx-rtd-theme/ 39 | "sphinx-rtd-theme>=2.0.0", 40 | # https://github.com/scikit-learn/scikit-learn 41 | "scikit-learn>=1.4.1", 42 | # https://pypi.org/project/pandas/ 43 | "pandas>=2.1.5", 44 | # https://pypi.org/project/geopy/ 45 | "geopy>=2.4.1", 46 | # https://pypi.org/project/OSMPythonTools/ 47 | "OSMPythonTools>=0.3.5" 48 | ] 49 | 50 | requires-python = ">=3.7" 51 | dynamic = ["version"] 52 | 53 | [tool.hatch.version] 54 | path = "geograpy/__init__.py" 55 | 56 | [project.urls] 57 | Homepage = "https://github.com/somnathrakshit/geograpy3" 58 | Documentation = "https://geograpy3.readthedocs.io" 59 | Source = "https://github.com/somnathrakshit/geograpy3" 60 | Issues = "https://github.com/somnathrakshit/geograpy3/issues" 61 | 62 | [tool.hatch.build.targets.wheel] 63 | only-include = ["geograpy"] 64 | 65 | [tool.hatch.build.targets.wheel.sources] 66 | "geograpy" = "geograpy" 67 | 68 | [project.optional-dependencies] 69 | docs = [ 70 | "sphinx", 71 | "sphinx-rtd-theme", 72 | ] 73 | test = [ 74 | "pytest", 75 | "coverage", 76 | ] 77 | 78 | [project.scripts] 79 | geograpy = "geograpy.locate:main" 80 | geograpy-nltk = "geograpy.geograpy_nltk:main" 81 | 82 | [project.data-files."geograpy/data"] 83 | include = ["*.csv"] 84 | -------------------------------------------------------------------------------- /scripts/blackisort: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # WF 2024-03-29 3 | package=geograpy 4 | isort tests/*.py 5 | black tests/*.py 6 | isort $package/*.py 7 | black $package/*.py 8 | -------------------------------------------------------------------------------- /scripts/doc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # WF 2020-01-31 3 | 4 | # 5 | # check whether the given command is installed 6 | # 7 | checkinstalled() { 8 | local l_cmd="$1" 9 | which $l_cmd > /dev/null 10 | if [ $? -ne 0 ] 11 | then 12 | echo "$l_cmd need to be installed" 1>&2 13 | exit 1 14 | fi 15 | } 16 | 17 | fixconf() { 18 | local l_year="$1" 19 | local l_author="$2" 20 | conf=conf.py 21 | # fix sys path 22 | # https://stackoverflow.com/questions/10324393/sphinx-build-fail-autodoc-cant-import-find-module 23 | grep "# sys.path" $conf 24 | if [ $? -eq 0 ] 25 | then 26 | tmpconf=/tmp/conf$$.py 27 | cat $conf | awk -v author="$l_author" -v year="$l_year" ' 28 | BEGIN { 29 | quote="\x27" 30 | squote="\047" 31 | } 32 | /# import os/ { next } 33 | /# import sys/ { next } 34 | /copyright/ { 35 | printf "copyright = %s%s, %s%s\n",squote,year,author,squote 36 | next 37 | } 38 | /author/ { 39 | printf "author = %s%s%s\n",squote,author,squote 40 | next 41 | } 42 | /html_theme = / { 43 | # html_theme = 'alabaster' 44 | printf "html_theme = %ssphinx_rtd_theme%s\n",squote,squote 45 | printf "master_doc = %sindex%s\n",squote,squote 46 | next 47 | } 48 | # add sphinx_rtd extension 49 | /extensions = / { 50 | print $0 51 | printf "\t%ssphinx_rtd_theme%s,\n",squote,squote 52 | printf "\t%ssphinx.ext.napoleon%s,\n",squote,squote 53 | next 54 | } 55 | /# sys.path/ { 56 | print("#https://stackoverflow.com/a/44980548/1497139") 57 | print("import os") 58 | print("import sys") 59 | print("import sphinx_rtd_theme") 60 | printf("basepath=os.path.abspath(%s../..%s)\n",squote,squote) 61 | printf("print(%sadding basepath %%s%s %% (basepath))\n",squote,squote) 62 | print("sys.path.insert(0, basepath)") 63 | printf("print(%ssys.path is now: %%s%s %% (sys.path))\n",squote,squote) 64 | next 65 | } 66 | { print}' > $tmpconf 67 | #diff $tmpconf $conf 68 | mv $tmpconf $conf 69 | echo "$src/conf.py has been fixed" 70 | fi 71 | } 72 | 73 | src=docs/source 74 | checkinstalled sphinx-apidoc 75 | sphinx-apidoc --full -f -o $src . 76 | cd $src 77 | 78 | fixconf 2018-2020 "Somnath Rakshit, Wolfgang Fahl" 79 | make clean html 80 | # if [ "$GHACTIONS" != "ACTIVE" ] 81 | # then 82 | # open _build/html/index.html 83 | # fi 84 | -------------------------------------------------------------------------------- /scripts/download: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [! -d $HOME/.geograpy3]; then 4 | mkdir $HOME/.geograpy3 5 | fi 6 | cd $HOME/.geograpy3 7 | 8 | curl -o locations.db.gz --remote-name -L https://github.com/somnathrakshit/geograpy3/wiki/data/locations.db.gz 9 | gzip -d locations.db.gz 10 | curl -o regions.tgz --remote-name -L https://github.com/somnathrakshit/geograpy3/wiki/data/regions.tgz 11 | tar xvfz regions.tgz 12 | rm regions.tgz 13 | -------------------------------------------------------------------------------- /scripts/install: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # WF 2020-03-25 3 | # update 2024-03-29 4 | pip install . 5 | -------------------------------------------------------------------------------- /scripts/release: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # WF 2020-03-26 3 | # create a release see https://packaging.python.org/tutorials/packaging-projects/ 4 | # 5 | # get the absolute filename 6 | # 7 | get_abs_filename() { 8 | # $1 : relative filename 9 | echo "$(cd "$(dirname "$1")" && pwd)/$(basename "$1")" 10 | } 11 | 12 | pwd=$(pwd) 13 | scriptPath=$(get_abs_filename $(dirname $0)) 14 | cd $scriptPath/.. 15 | rm -rf dist 16 | $scriptPath/doc 17 | python3 setup.py sdist bdist_wheel 18 | python3 -m twine upload -u __token__ --repository-url https://upload.pypi.org/legacy/ dist/* 19 | -------------------------------------------------------------------------------- /scripts/test: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # WF 2020-06-03 3 | python="python3" 4 | while [ "$1" != "" ] 5 | do 6 | option="$1" 7 | case $option in 8 | -d|--debug) 9 | # show environment for debugging 10 | env 11 | ;; 12 | -p|--python) 13 | shift 14 | python="$1" 15 | ;; 16 | esac 17 | shift 18 | done 19 | $python -m unittest discover 20 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/somnathrakshit/geograpy3/bd167b5a91584d4449911b5cfffa4ba7e23cbc3c/tests/__init__.py -------------------------------------------------------------------------------- /tests/basetest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 2021-08-13 3 | 4 | @author: wf 5 | """ 6 | import getpass 7 | import json 8 | import os 9 | from unittest import TestCase 10 | 11 | from geograpy.locator import Locator 12 | from geograpy.utils import Profiler 13 | 14 | 15 | class Geograpy3Test(TestCase): 16 | """ 17 | base test for geograpy 3 tests 18 | """ 19 | 20 | def setUp(self, debug=False): 21 | """ 22 | setUp test environment 23 | """ 24 | TestCase.setUp(self) 25 | self.debug = debug 26 | msg = f"test {self._testMethodName}, debug={self.debug}" 27 | self.profile = Profiler(msg) 28 | Locator.resetInstance() 29 | locator = Locator.getInstance() 30 | locator.downloadDB() 31 | # actively test Wikidata tests? 32 | self.testWikidata = False 33 | 34 | def tearDown(self): 35 | TestCase.tearDown(self) 36 | self.profile.time() 37 | 38 | def inCI(self): 39 | """ 40 | are we running in a Continuous Integration Environment? 41 | """ 42 | publicCI = getpass.getuser() in ["travis", "runner"] 43 | jenkins = "JENKINS_HOME" in os.environ 44 | return publicCI or jenkins 45 | 46 | def handleWikidataException(self, ex): 47 | """ 48 | handle a Wikidata exception 49 | Args: 50 | ex(Exception): the exception to handle - e.g. timeout 51 | """ 52 | msg = str(ex) 53 | print(f"Wikidata test failed {msg}") 54 | # only raise exception for real problems 55 | if "HTTP Error 500" in msg: 56 | print("test can not work if server has problems") 57 | return 58 | if isinstance(ex, json.decoder.JSONDecodeError): 59 | print("potential SPARQLWrapper issue") 60 | return 61 | raise ex 62 | -------------------------------------------------------------------------------- /tests/testCachingCitiesByRegion.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 2021-08-16 3 | 4 | @author: wf 5 | """ 6 | import getpass 7 | import json 8 | import os 9 | import re 10 | import unittest 11 | 12 | from geograpy.locator import ( 13 | City, 14 | CityManager, 15 | CountryManager, 16 | LocationContext, 17 | RegionManager, 18 | ) 19 | from geograpy.utils import Profiler 20 | from geograpy.wikidata import Wikidata 21 | from tests.basetest import Geograpy3Test 22 | 23 | 24 | class TestCachingCitiesByRegion(Geograpy3Test): 25 | """ 26 | The wikidata city query times out even on the wikidata copy in the RWTH i5 infrastructure 27 | Therefore we need to split the queries to a reasonable size so that each individual query does not time out. 28 | 29 | A query per region is done some 3000 times. 30 | The query used here works for most regions except a few where the query needs to be modified to not go the full depth of 31 | the Property 32 | located in the administrative territorial entity (P131) 33 | but limit it 34 | 35 | """ 36 | 37 | def cacheRegionCities2Json(self, limit, showDone=False): 38 | # TODO - refactor to Locator/LocationContext - make available via command line 39 | wd = Wikidata() 40 | config = LocationContext.getDefaultConfig() 41 | countryManager = CountryManager(config=config) 42 | countryManager.fromCache() 43 | regionManager = RegionManager(config=config) 44 | regionManager.fromCache() 45 | regionList = regionManager.getList() 46 | total = len(regionList) 47 | cachePath = f"{config.getCachePath()}/regions" 48 | if not os.path.exists(cachePath): 49 | os.makedirs(cachePath) 50 | for index, region in enumerate(regionList): 51 | if index >= limit: 52 | break 53 | regionId = region.wikidataid 54 | msg = f"{index+1:4d}/{total:4d}:getting cities for {region.name} {region.iso} {region.wikidataid}" 55 | jsonFileName = f"{cachePath}/{region.iso}.json" 56 | if os.path.isfile(jsonFileName): 57 | if showDone: 58 | print(msg) 59 | else: 60 | try: 61 | regionCities = wd.getCitiesForRegion(regionId, msg) 62 | jsonStr = json.dumps(regionCities) 63 | with open(jsonFileName, "w") as jsonFile: 64 | jsonFile.write(jsonStr) 65 | except Exception as ex: 66 | self.handleWikidataException(ex) 67 | 68 | def testGetCitiesByRegion(self): 69 | """ 70 | test counting human settlement types 71 | """ 72 | if self.inCI(): 73 | limit = 50 74 | elif getpass.getuser() == "wf": 75 | limit = 5000 76 | else: 77 | limit = 0 78 | self.cacheRegionCities2Json(limit=limit) 79 | 80 | def testReadCachedCitiesByRegion(self): 81 | """ 82 | test reading the cached json Files 83 | """ 84 | # This is to populate the cities database 85 | return 86 | config = LocationContext.getDefaultConfig() 87 | regionManager = RegionManager(config=config) 88 | regionManager.fromCache() 89 | regionByIso, _dup = regionManager.getLookup("iso") 90 | self.assertEqual(56, len(_dup)) 91 | jsonFiles = CityManager.getJsonFiles(config) 92 | msg = f"reading {len(jsonFiles)} cached city by region JSON cache files" 93 | self.assertTrue(len(jsonFiles) > 2000) 94 | profiler = Profiler(msg) 95 | cityManager = CityManager(config=config) 96 | cityManager.getList().clear() 97 | for jsonFileName in jsonFiles: 98 | isoMatch = re.search(r"/([^\/]*)\.json", jsonFileName) 99 | if not isoMatch: 100 | print(f"{jsonFileName} - does not match a known region's ISO code") 101 | else: 102 | rIso = isoMatch.group(1) 103 | region = regionByIso[rIso] 104 | with open(jsonFileName) as jsonFile: 105 | cities4Region = json.load(jsonFile) 106 | for city4Region in cities4Region: 107 | city = City() 108 | city.fromDict(city4Region) 109 | # fix regionId 110 | if hasattr(city, "regionId"): 111 | city.partOfRegionId = city.regionId 112 | city.regionId = region.wikidataid 113 | cityManager.add(city) 114 | pass 115 | cityManager.store() 116 | profiler.time() 117 | 118 | def testCityFromCityStates(self): 119 | """ 120 | tests if city states are queried correctly if given the region 121 | For city states the city is region and city (in some cases also country). 122 | This test ensures that by querying for the cities of a region the city states include themself in the result 123 | (the result for cities in city-states often includes the municipalities) 124 | """ 125 | wd = Wikidata() 126 | cityStateRecords = wd.getCityStates() 127 | for cityStateRecord in cityStateRecords: 128 | regionId = cityStateRecord.get("wikidataid") 129 | regionCities = wd.getCitiesForRegion( 130 | regionId, msg=f"Query for cities in {cityStateRecord.get('name')}" 131 | ) 132 | foundCities = [c.get("wikidataid") for c in regionCities] 133 | self.assertTrue(regionId in foundCities) 134 | 135 | 136 | if __name__ == "__main__": 137 | # import sys;sys.argv = ['', 'Test.testName'] 138 | unittest.main() 139 | -------------------------------------------------------------------------------- /tests/testCachingLocationLabels.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 2021-08-17 3 | 4 | @author: th 5 | """ 6 | import math 7 | import unittest 8 | 9 | from lodstorage.sql import SQLDB 10 | 11 | from geograpy.locator import CityManager, CountryManager, LocationContext, RegionManager 12 | from geograpy.wikidata import Wikidata 13 | from tests.basetest import Geograpy3Test 14 | 15 | 16 | class TestCachingLocationLabels(Geograpy3Test): 17 | """ 18 | adds location label tables 19 | 20 | """ 21 | 22 | def setUp(self): 23 | pass 24 | 25 | def tearDown(self): 26 | pass 27 | 28 | def testCacheLocationLabels(self): 29 | """ 30 | Generates the location label tabels in the SQL db fro countries, regions and cities by querying wikidata for 31 | the rdfs:label and skos:altLa of each location. 32 | A view containing all location labels is also created. 33 | """ 34 | testLocationLabelExtraction = False 35 | if testLocationLabelExtraction: 36 | wd = Wikidata() 37 | config = LocationContext.getDefaultConfig() 38 | countryManager = CountryManager(config=config) 39 | regionManager = RegionManager(config=config) 40 | cityManager = CityManager(config=config) 41 | sqlDb = SQLDB(dbname=config.cacheFile, debug=self.debug) 42 | for manager in countryManager, regionManager, cityManager: 43 | manager.fromCache() 44 | wikidataIdQuery = ( 45 | f"SELECT DISTINCT wikidataid FROM {manager.entityPluralName}" 46 | ) 47 | wikidataIdQueryRes = sqlDb.query(wikidataIdQuery) 48 | wikidataIds = [l["wikidataid"] for l in wikidataIdQueryRes] 49 | 50 | chunkSize = 1000 51 | iterations = math.ceil(len(wikidataIds) / chunkSize) 52 | progress = 0 53 | res = [] 54 | for i in range(iterations): 55 | workOnIds = wikidataIds[i * chunkSize : (i + 1) * chunkSize] 56 | progress += len(workOnIds) 57 | index = 0 58 | values = "" 59 | for location in workOnIds: 60 | spacer = " \n\t\t\t" if index % 10 == 0 else " " 61 | values += f"{spacer}wd:{wd.getWikidataId(location)}" 62 | index += 1 63 | query = self.getLablesQuery(values) 64 | res.extend( 65 | wd.query( 66 | f"Query {i}/{iterations} - Querying {manager.entityName} Labels", 67 | queryString=query, 68 | ) 69 | ) 70 | wd.store2DB(res, tableName=f"{manager.entityName}_labels", sqlDB=sqlDb) 71 | self.createViews(sqlDB=sqlDb) 72 | 73 | def getLablesQuery(self, wikidataIds: str): 74 | """ 75 | get the query for the alternatives labels for the given values 76 | 77 | wikidataIds(str): a list of wikidataids 78 | """ 79 | query = ( 80 | """# get alternative labels for the given wikidata 81 | PREFIX rdfs: 82 | PREFIX skos: 83 | PREFIX wd: 84 | SELECT DISTINCT ?wikidataid ?label ?lang 85 | WHERE{ 86 | VALUES ?wikidataid { %s } 87 | ?wikidataid rdfs:label|skos:altLabel ?label 88 | BIND(lang(?label) AS ?lang) 89 | FILTER(lang(?label)="en") 90 | }""" 91 | % wikidataIds 92 | ) 93 | return query 94 | 95 | def createViews(self, sqlDB): 96 | viewDDLs = [ 97 | "DROP VIEW IF EXISTS location_labels", 98 | """ 99 | CREATE VIEW location_labels AS 100 | SELECT *, "Country" AS "hierarchy" 101 | FROM country_labels 102 | UNION 103 | SELECT *, "Region" AS "hierarchy" 104 | FROM region_labels 105 | UNION 106 | SELECT *, "City" AS "hierarchy" 107 | FROM city_labels 108 | """, 109 | "DROP INDEX if EXISTS cityLabelByWikidataid", 110 | "CREATE INDEX cityLabelByWikidataid ON city_labels (wikidataid)", 111 | "DROP INDEX if EXISTS regionLabelByWikidataid", 112 | "CREATE INDEX regionLabelByWikidataid ON region_labels (wikidataid)", 113 | "DROP INDEX if EXISTS countryLabelByWikidataid", 114 | "CREATE INDEX countryLabelByWikidataid ON country_labels (wikidataid)", 115 | ] 116 | for viewDDL in viewDDLs: 117 | sqlDB.execute(viewDDL) 118 | 119 | 120 | if __name__ == "__main__": 121 | # import sys;sys.argv = ['', 'Test.testName'] 122 | unittest.main() 123 | -------------------------------------------------------------------------------- /tests/testLocatorDatabase.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 16.08.2021 3 | 4 | @author: wf 5 | """ 6 | import os 7 | import tempfile 8 | import unittest 9 | 10 | from lodstorage.storageconfig import StorageConfig 11 | 12 | from geograpy.locator import LocationContext, Locator 13 | from tests.basetest import Geograpy3Test 14 | 15 | 16 | class TestLocatorDatabase(Geograpy3Test): 17 | """ 18 | test the locator database handling 19 | """ 20 | 21 | def testLocatorWithWikiData(self): 22 | """ 23 | test Locator 24 | """ 25 | Locator.resetInstance() 26 | loc = Locator.getInstance() 27 | # forceUpdate=True 28 | forceUpdate = False 29 | loc.populate_db(force=forceUpdate) 30 | tableList = loc.sqlDB.getTableList() 31 | expectedCities = 800000 32 | self.assertTrue(loc.db_recordCount(tableList, "countries") >= 200) 33 | self.assertTrue(loc.db_recordCount(tableList, "regions") >= 3000) 34 | self.assertTrue(loc.db_recordCount(tableList, "cities") >= expectedCities) 35 | 36 | def testHasData(self): 37 | """ 38 | check has data and populate functionality 39 | """ 40 | testDownload = False 41 | if self.inCI() or testDownload: 42 | with tempfile.TemporaryDirectory() as cacheRootDir: 43 | config = StorageConfig( 44 | cacheRootDir=cacheRootDir, cacheDirName="geograpy3" 45 | ) 46 | config.cacheFile = ( 47 | f"{config.getCachePath()}/{LocationContext.db_filename}" 48 | ) 49 | loc = Locator(storageConfig=config) 50 | if os.path.isfile(loc.db_file): 51 | os.remove(loc.db_file) 52 | # reinit sqlDB 53 | loc = Locator(storageConfig=config) 54 | self.assertFalse(loc.db_has_data()) 55 | loc.populate_db() 56 | self.assertTrue(loc.db_has_data()) 57 | 58 | 59 | if __name__ == "__main__": 60 | # import sys;sys.argv = ['', 'Test.testName'] 61 | unittest.main() 62 | -------------------------------------------------------------------------------- /tests/testQueries.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 2021-08-19 3 | 4 | @author: wf 5 | """ 6 | import os 7 | import re 8 | import unittest 9 | 10 | from lodstorage.query import Query, QueryManager 11 | 12 | from geograpy.locator import LocationContext, Locator 13 | from tests.basetest import Geograpy3Test 14 | 15 | 16 | class TestQueries(Geograpy3Test): 17 | """ 18 | test queries for documentation, bug reports and the like 19 | """ 20 | 21 | def getQueryManager(self): 22 | """ 23 | get the query manager 24 | """ 25 | cachedir = LocationContext.getDefaultConfig().getCachePath() 26 | scriptDir = os.path.dirname(__file__) 27 | for path in cachedir, f"{scriptDir}/../geograpy/data": 28 | qYamlFile = f"{path}/queries.yaml" 29 | if os.path.isfile(qYamlFile): 30 | qm = QueryManager(lang="sql", debug=self.debug, queriesPath=qYamlFile) 31 | return qm 32 | return None 33 | 34 | def documentQueryResult(self, query, lod, tablefmt, show=False): 35 | """ 36 | document the query results 37 | """ 38 | for record in lod: 39 | for key in record.keys(): 40 | value = record[key] 41 | if value is not None: 42 | if isinstance(value, str): 43 | if re.match(r"Q[0-9]+", value): 44 | if tablefmt == "github": 45 | record[ 46 | key 47 | ] = f"[{value}](https://www.wikidata.org/wiki/{value})" 48 | elif tablefmt == "mediawiki": 49 | record[ 50 | key 51 | ] = f"[https://www.wikidata.org/wiki/{value} {value}]" 52 | doc = query.documentQueryResult(lod, tablefmt=tablefmt, floatfmt=".0f") 53 | if show: 54 | print(doc) 55 | 56 | def testQueries(self): 57 | """ 58 | test preconfigured queries 59 | """ 60 | qm = self.getQueryManager() 61 | self.assertIsNotNone(qm) 62 | locator = Locator.getInstance() 63 | show = self.debug 64 | # show=True 65 | for _name, query in qm.queriesByName.items(): 66 | qlod = locator.sqlDB.query(query.query) 67 | for tablefmt in ["mediawiki", "github"]: 68 | self.documentQueryResult(query, qlod, tablefmt, show=show) 69 | 70 | pass 71 | 72 | def testQuery(self): 73 | """ 74 | test a single query 75 | """ 76 | queries = [ 77 | ( 78 | "LocationLabel Count", 79 | """select count(*),hierarchy 80 | from location_labels 81 | group by hierarchy""", 82 | ), 83 | ("NY example", "select * from cityLookup where label='New York City'"), 84 | ( 85 | "Berlin example", 86 | "select * from cityLookup where label='Berlin' order by pop desc,regionName", 87 | ), 88 | ( 89 | "Issue #25", 90 | "select * from countryLookup where label in ('France', 'Hungary', 'Poland', 'Spain', 'United Kingdom')", 91 | ), 92 | ( 93 | "Issue #25 Bulgaria", 94 | "select * from cityLookup where label in ('Bulgaria','Croatia','Hungary','Czech Republic') order by pop desc,regionName", 95 | ), 96 | ] 97 | for tableName in ["countries", "regions", "cities"]: 98 | queries.append( 99 | ( 100 | f"unique wikidataids for {tableName}", 101 | f"select count(distinct(wikidataid)) as wikidataids from {tableName}", 102 | ) 103 | ) 104 | queries.append( 105 | ( 106 | f"total #records for {tableName}", 107 | f"select count(*) as recordcount from {tableName}", 108 | ) 109 | ) 110 | locator = Locator.getInstance() 111 | for title, queryString in queries: 112 | query = Query(name=title, query=queryString, lang="sql") 113 | qlod = locator.sqlDB.query(queryString) 114 | for tablefmt in ["mediawiki", "github"]: 115 | self.documentQueryResult(query, qlod, tablefmt, show=True) 116 | 117 | 118 | if __name__ == "__main__": 119 | # import sys;sys.argv = ['', 'Test.testName'] 120 | unittest.main() 121 | -------------------------------------------------------------------------------- /tests/test_LocationContext.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 2021-08-13 3 | 4 | @author: wf 5 | """ 6 | import tempfile 7 | import unittest 8 | 9 | from lodstorage.storageconfig import StorageConfig 10 | 11 | from geograpy.locator import ( 12 | CityManager, 13 | CountryManager, 14 | LocationContext, 15 | LocationManager, 16 | RegionManager, 17 | ) 18 | from tests.basetest import Geograpy3Test 19 | 20 | 21 | class TestLocationContext(Geograpy3Test): 22 | """ 23 | test the location Context - these are potentially long running tests 24 | """ 25 | 26 | def getStorageConfig(self): 27 | # config=StorageConfig.getDefault() 28 | config = LocationContext.getDefaultConfig() 29 | return config 30 | 31 | def checkNoDuplicateWikidataIds( 32 | self, locationManager: LocationManager, primaryKey=None, expectedDuplicates=0 33 | ): 34 | """ 35 | check that there are no duplicate Wikidata Q identifiers in the given 36 | 37 | """ 38 | locationsByWikiDataId, duplicates = locationManager.getLookup(primaryKey) 39 | showLimit = 10 40 | if len(duplicates) > 0: 41 | for i, duplicate in enumerate(duplicates): 42 | if i < showLimit: 43 | if self.debug: 44 | print(f"{i}:{duplicate}") 45 | else: 46 | break 47 | self.assertTrue(len(duplicates) <= expectedDuplicates) 48 | return locationsByWikiDataId 49 | 50 | def testCountryManager(self): 51 | """ 52 | tests the loading and parsing of the RegionManager form the json backup file 53 | """ 54 | countryManager = CountryManager(config=self.getStorageConfig()) 55 | countryManager.fromCache() 56 | self.assertTrue(hasattr(countryManager, "countries")) 57 | self.assertTrue(len(countryManager.countries) >= 200) 58 | # check if California is in the list 59 | countriesByWikidataId = self.checkNoDuplicateWikidataIds( 60 | countryManager, "wikidataid" 61 | ) 62 | self.assertTrue("Q30" in countriesByWikidataId) 63 | 64 | def testRegionManager(self): 65 | """ 66 | tests the loading and parsing of the RegionManager form the json backup file 67 | """ 68 | regionManager = RegionManager(config=self.getStorageConfig()) 69 | regionManager.fromCache() 70 | self.assertTrue(hasattr(regionManager, "regions")) 71 | self.assertTrue(len(regionManager.regions) >= 1000) 72 | regionsByWikidataId = self.checkNoDuplicateWikidataIds( 73 | regionManager, "wikidataid", 54 74 | ) 75 | self.assertTrue("Q99" in regionsByWikidataId) 76 | 77 | def testCityManager(self): 78 | """ 79 | tests the loading and parsing of the cityList form the json backup file 80 | """ 81 | cityManager = CityManager(config=self.getStorageConfig()) 82 | cityManager.fromCache() 83 | self.assertTrue(hasattr(cityManager, "cities")) 84 | self.assertTrue(len(cityManager.cities) >= 200000) 85 | # check if Los Angeles is in the list (popular city should always be in the list) 86 | _citiesByWikiDataIdNoDuplicates = self.checkNoDuplicateWikidataIds( 87 | cityManager, "wikidataid", 304000 88 | ) # ToDo: Reduce number of duplicates 89 | citiesByWikiDataId = cityManager.getLookup("wikidataid", withDuplicates=True) 90 | self.assertTrue("Q65" in citiesByWikiDataId) 91 | 92 | def testLocationContextFromCache(self): 93 | """ 94 | test loading LocationContext from cache 95 | """ 96 | testCache = False 97 | if self.inCI() or testCache: 98 | locationContext = LocationContext.fromCache() 99 | locationContext.load() 100 | self.assertTrue(len(locationContext.countries) > 180) 101 | self.assertTrue(len(locationContext.regions) > 3500) 102 | self.assertTrue(len(locationContext.cities) > 1000000) 103 | 104 | def testIssue_59_db_download(self): 105 | """ 106 | tests if the cache database is downloaded if not present 107 | """ 108 | with tempfile.TemporaryDirectory() as tmpdir: 109 | config = StorageConfig(cacheFile="locations.db", cacheRootDir=tmpdir) 110 | config.cacheFile = f"{config.getCachePath()}/{config.cacheFile}" 111 | loc = LocationContext.fromCache(config=config) 112 | locations = loc.locateLocation("Germany") 113 | self.assertTrue(len(locations) > 0) 114 | 115 | 116 | if __name__ == "__main__": 117 | # import sys;sys.argv = ['', 'Test.testName'] 118 | unittest.main() 119 | -------------------------------------------------------------------------------- /tests/test_extractor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import geograpy 4 | from geograpy.extraction import Extractor 5 | from tests.basetest import Geograpy3Test 6 | 7 | 8 | class TestExtractor(Geograpy3Test): 9 | """ 10 | test Extractor 11 | """ 12 | 13 | def check(self, places, expectedList): 14 | """ 15 | check the places for begin non empty and having at least the expected List of 16 | elements 17 | 18 | Args: 19 | places(Places): the places to check 20 | expectedList(list): the list of elements to check 21 | """ 22 | if self.debug: 23 | print(places) 24 | self.assertTrue(len(places) > 0) 25 | for expected in expectedList: 26 | self.assertTrue(expected in places) 27 | 28 | def testExtractorFromUrl(self): 29 | """ 30 | test the extractor 31 | """ 32 | url = "https://en.wikipedia.org/wiki/Louvre" 33 | e = Extractor(url=url) 34 | e.find_geoEntities() 35 | self.check(e.places, ["Paris", "France"]) 36 | 37 | def testGeograpyIssue32(self): 38 | """ 39 | test https://github.com/ushahidi/geograpy/issues/32 40 | """ 41 | # do not test since url is unreliable 42 | return 43 | url = "https://www.politico.eu/article/italy-incurable-economy/" 44 | places = geograpy.get_geoPlace_context(url=url) 45 | if self.debug: 46 | print(places) 47 | self.assertSetEqual( 48 | { 49 | "Italy", 50 | "Germany", 51 | "France", 52 | "United States of America", 53 | "Belgium", 54 | "Canada", 55 | }, 56 | set(places.countries), 57 | ) 58 | self.assertSetEqual( 59 | {"Rome", "Brussels", "Italy", "Germany"}, set(places.cities) 60 | ) # Notes: Italy is also city in US-NY, Germany is also city in US-TX 61 | 62 | def testGetGeoPlace(self): 63 | """ 64 | test geo place handling 65 | """ 66 | # 'http://www.bbc.com/news/world-europe-26919928' 67 | # broken since 2020-10 - returns javascript instead of plain html 68 | url = "https://en.wikipedia.org/wiki/Golden_spike" 69 | places = geograpy.get_geoPlace_context(url=url) 70 | debug = self.debug 71 | # debug=True 72 | if debug: 73 | print(places) 74 | self.assertTrue("Ogden" in places.cities) 75 | self.assertTrue("Utah" in places.regions) 76 | self.assertTrue("United States of America" in places.countries) 77 | 78 | def testExtractorFromText(self): 79 | """ 80 | test different texts for getting geo context information 81 | """ 82 | text = """ Perfect just Perfect! It's a perfect storm for Nairobi on a 83 | Friday evening! horrible traffic here is your cue to become worse @Ma3Route """ 84 | 85 | e2 = Extractor(text=text) 86 | e2.find_entities() 87 | self.check(e2.places, ["Nairobi"]) 88 | 89 | text3 = """ Risks of Cycling in Nairobi:http://www.globalsiteplans.com/environmental-design/engineering-environmental-design/the-risky-affair-of-cycling-in-nairobi-kenya/ ... via @ConstantCap @KideroEvans @county_nairobi @NrbCity_Traffic """ 90 | e3 = Extractor(text=text3) 91 | e3.find_entities() 92 | self.check(e3.places, ["Nairobi"]) 93 | 94 | text4 = """ @DurbanSharks [Africa Renewal]It is early morning in Nairobi, the Kenyan capital. The traffic jam along Ngong """ 95 | e4 = Extractor(text=text4) 96 | e4.find_entities() 97 | self.check(e4.places, ["Nairobi", "Ngong"]) 98 | 99 | # unicode 100 | text5 = """ There is a city called New York in the United States.""" 101 | e5 = Extractor(text=text5) 102 | e5.find_entities() 103 | self.check(e5.places, ["New York", "United States"]) 104 | 105 | # unicode and two words 106 | text6 = """ There is a city called São Paulo in Brazil.""" 107 | e6 = Extractor(text=text6) 108 | e6.find_entities() 109 | self.check(e6.places, ["São Paulo"]) 110 | 111 | def testIssue7(self): 112 | """ 113 | test https://github.com/somnathrakshit/geograpy3/issues/7 114 | disambiguating countries 115 | """ 116 | localities = [ 117 | "Vienna, Illinois,", 118 | "Paris, Texas", 119 | "Zaragoza, Spain", 120 | "Vienna, Austria", 121 | ] 122 | expected = [ 123 | {"iso": "US"}, 124 | {"iso": "US"}, 125 | {"iso": "ES"}, 126 | {"iso": "AT"}, 127 | ] 128 | for index, locality in enumerate(localities): 129 | city = geograpy.locateCity(locality, debug=False) 130 | if self.debug: 131 | print(f" {city}") 132 | self.assertEqual(expected[index]["iso"], city.country.iso) 133 | 134 | def testIssue10(self): 135 | """ 136 | test https://github.com/somnathrakshit/geograpy3/issues/10 137 | Add ISO country code 138 | """ 139 | localities = [ 140 | "Singapore", 141 | "Beijing, China", 142 | "Paris, France", 143 | "Barcelona, Spain", 144 | "Rome, Italy", 145 | "San Francisco, US", 146 | "Bangkok, Thailand", 147 | "Vienna, Austria", 148 | "Athens, Greece", 149 | "Shanghai, China", 150 | ] 151 | expectedCountry = ["SG", "CN", "FR", "ES", "IT", "US", "TH", "AT", "GR", "CN"] 152 | debug = self.debug 153 | for index, locality in enumerate(localities): 154 | city = geograpy.locateCity(locality) 155 | if debug: 156 | print(" %s" % city) 157 | self.assertEqual(expectedCountry[index], city.country.iso) 158 | 159 | def testIssue9(self): 160 | """ 161 | test https://github.com/somnathrakshit/geograpy3/issues/9 162 | [BUG]AttributeError: 'NoneType' object has no attribute 'name' on "Pristina, Kosovo" 163 | """ 164 | locality = "Pristina, Kosovo" 165 | gp = geograpy.get_geoPlace_context(text=locality) 166 | if self.debug: 167 | print(" %s" % gp.countries) 168 | print(" %s" % gp.regions) 169 | print(" %s" % gp.cities) 170 | 171 | def testStackoverflow62152428(self): 172 | """ 173 | see https://stackoverflow.com/questions/62152428/extracting-country-information-from-description-using-geograpy?noredirect=1#comment112899776_62152428 174 | """ 175 | examples = { 176 | 2: "Socialist Republic of Alachua", 177 | 3: "Hérault, France", 178 | 4: "Gwalior, India", 179 | 5: "Zaragoza,España", 180 | 6: "Zaragoza, Spain", 181 | 7: "amsterdam ", 182 | 8: "Evesham", 183 | 9: "Rochdale", 184 | } 185 | for index, text in examples.items(): 186 | places = geograpy.get_geoPlace_context(text=text) 187 | if self.debug: 188 | print("example %d: %s" % (index, places.countries)) 189 | 190 | def testStackoverflow43322567(self): 191 | """ 192 | see https://stackoverflow.com/questions/43322567 193 | """ 194 | url = "https://en.wikipedia.org/wiki/U.S._state" 195 | e = Extractor(url=url) 196 | places = e.find_geoEntities() 197 | self.check(places, ["Alabama", "Virginia", "New York"]) 198 | if self.debug: 199 | print(places) 200 | 201 | def testStackoverflow54712198(self): 202 | """ 203 | see https://stackoverflow.com/questions/54712198/not-only-extracting-places-from-a-text-but-also-other-names-in-geograpypython 204 | """ 205 | text = """Opposition Leader Mahinda Rajapaksa says that the whole public administration has collapsed due to the constitution council’s arbitrary actions. The Opposition Leader said so in response to a query a journalised raised after a meeting held...""" 206 | e = Extractor(text) 207 | places = e.find_geoEntities() 208 | if self.debug: 209 | print(places) 210 | self.assertEqual([], places) 211 | 212 | def testStackoverflow54077973(self): 213 | """ 214 | see https://stackoverflow.com/questions/54077973/geograpy3-library-for-extracting-the-locations-in-the-text-gives-unicodedecodee 215 | """ 216 | address = "Jersey City New Jersey 07306" 217 | e = Extractor(text=address) 218 | e.find_entities() 219 | self.check(e.places, ["Jersey", "City"]) 220 | 221 | def testStackOverflow54721435(self): 222 | """ 223 | see https://stackoverflow.com/questions/54721435/unable-to-extract-city-names-from-a-text-using-geograpypython 224 | """ 225 | text = "I live in Kadawatha a suburb of Colombo Sri Lanka" 226 | e = Extractor(text=text) 227 | e.find_entities() 228 | if self.debug: 229 | print(e.places) 230 | 231 | def testStackoverflow55548116(self): 232 | """ 233 | see https://stackoverflow.com/questions/55548116/geograpy3-library-is-not-working-properly-and-give-traceback-error 234 | """ 235 | feedContent = ["Las Vegas is a city in Nevada"] 236 | placesInFeed = [] 237 | 238 | for content in feedContent: 239 | if content != "": 240 | e = Extractor(text=content) 241 | e.find_entities() 242 | places = e.places 243 | if self.debug: 244 | print(places) 245 | placesInFeed.append(places) 246 | 247 | 248 | if __name__ == "__main__": 249 | unittest.main() 250 | -------------------------------------------------------------------------------- /tests/test_location.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 2021-06-09 3 | 4 | @author: wf 5 | """ 6 | import unittest 7 | from math import radians 8 | 9 | import numpy as np 10 | from sklearn.neighbors import BallTree 11 | 12 | from geograpy.locator import ( 13 | CityManager, 14 | Country, 15 | CountryManager, 16 | LocationContext, 17 | LocationManager, 18 | Locator, 19 | RegionManager, 20 | ) 21 | from tests.basetest import Geograpy3Test 22 | 23 | 24 | class TestLocationHierarchy(Geograpy3Test): 25 | """ 26 | tests for the location hierarchy 27 | """ 28 | 29 | def setUp(self): 30 | super().setUp() 31 | self.locationContext = None 32 | pass 33 | 34 | def getLocationContext(self): 35 | if self.locationContext is None: 36 | self.locationContext = LocationContext.fromCache() 37 | return self.locationContext 38 | 39 | def testDistance(self): 40 | """ 41 | test calculcating the distance of two points using the haversine function 42 | """ 43 | # https://stackoverflow.com/a/64585765/1497139 44 | earth_radius = 6371000 # meters in earth 45 | test_radius = 1300000 # meters 46 | 47 | test_points = [[32.027240, -81.093190], [41.981876, -87.969982]] 48 | test_points_rad = np.array( 49 | [[radians(x[0]), radians(x[1])] for x in test_points] 50 | ) 51 | 52 | tree = BallTree(test_points_rad, metric="haversine") 53 | ind, results = tree.query_radius( 54 | test_points_rad, r=test_radius / earth_radius, return_distance=True 55 | ) 56 | if self.debug: 57 | print(ind) 58 | print(results * earth_radius / 1000) 59 | 60 | def testIssue45_BallTree(self): 61 | """ 62 | test calculation a ball tree for a given list of locations 63 | """ 64 | countryList = CountryManager.fromErdem() 65 | ballTree, validList = countryList.getBallTuple() 66 | self.assertEqual(245, len(validList)) 67 | self.assertEqual("BallTree", type(ballTree).__name__) 68 | self.assertAlmostEqual(245, ballTree.sum_weight, delta=0.1) 69 | pass 70 | 71 | def checkLocationListWithDistances( 72 | self, 73 | locationListWithDistances, 74 | expectedCount, 75 | expectedClosest, 76 | expectedDistance, 77 | ): 78 | """ 79 | check the location list with the given distances 80 | """ 81 | if self.debug: 82 | for i, locationWithDistance in enumerate(locationListWithDistances): 83 | location, distance = locationWithDistance 84 | print(f"{i}:{location}-{distance:.0f} km") 85 | self.assertEqual(len(locationListWithDistances), expectedCount) 86 | closestLocation, distance = locationListWithDistances[0] 87 | self.assertEqual(expectedClosest, closestLocation.name) 88 | self.assertAlmostEqual(expectedDistance, distance, delta=1) 89 | 90 | def testClosestLocation(self): 91 | """ 92 | test getting the closes Location to a given location 93 | """ 94 | # sample Country: Germany 95 | country = Country() 96 | country.name = "Germany" 97 | country.lat = 51.0 98 | country.lon = 9.0 99 | # get a country list 100 | lookupCountryManager = CountryManager.fromErdem() 101 | # get the closest 2 locations for the given countryList 102 | countryListWithDistances = country.getNClosestLocations(lookupCountryManager, 2) 103 | self.checkLocationListWithDistances( 104 | countryListWithDistances, 2, "Luxembourg", 244 105 | ) 106 | 107 | countryListWithDistances = country.getLocationsWithinRadius( 108 | lookupCountryManager, 300 109 | ) 110 | self.checkLocationListWithDistances( 111 | countryListWithDistances, 2, "Luxembourg", 244 112 | ) 113 | 114 | def testRegionMatching(self): 115 | """ 116 | test region matches 117 | """ 118 | locator = Locator() 119 | if not locator.db_has_data(): 120 | locator.populate_db() 121 | countryList = CountryManager.fromErdem() 122 | config = LocationContext.getDefaultConfig() 123 | regionManager = RegionManager(config=config) 124 | regionManager.fromCache() 125 | for country in countryList.countries: 126 | locationListWithDistances = country.getNClosestLocations(regionManager, 3) 127 | if self.debug: 128 | print(f"{country}{country.lat:.2f},{country.lon:.2f}") 129 | for i, locationWithDistance in enumerate(locationListWithDistances): 130 | location, distance = locationWithDistance 131 | if self.debug: 132 | print(f" {i}:{location}-{distance:.0f} km") 133 | pass 134 | 135 | def testLocationListLoading(self): 136 | """ 137 | test loading the locations from Json 138 | """ 139 | samples = """ 140 | { 141 | "countries": [ 142 | { 143 | "name": "Afghanistan", 144 | "wikidataid": "Q889", 145 | "lat": 34, 146 | "lon": 66, 147 | "coordinates": "34,66", 148 | "partOf": null, 149 | "level": 3, 150 | "locationKind": "Country", 151 | "comment": null, 152 | "iso": "AF" 153 | }, 154 | { 155 | "name": "United States of America", 156 | "wikidataid": "Q30", 157 | "lat": 39.82818, 158 | "lon": -98.5795, 159 | "partOf": "Noth America", 160 | "level": 3, 161 | "locationKind": "Country", 162 | "comment": null, 163 | "labels": [ 164 | "America", 165 | "UNITED STATES OF AMERICA", 166 | "USA", 167 | "United States", 168 | "United States of America (the)" 169 | ], 170 | "iso": "US" 171 | }, 172 | { 173 | "name": "Australia", 174 | "wikidataid": "Q408", 175 | "lat": -28, 176 | "lon": 137, 177 | "coordinates": "-28,137", 178 | "partOf": null, 179 | "level": 3, 180 | "locationKind": "Country", 181 | "comment": null, 182 | "labels": [ 183 | "AUS" 184 | ], 185 | "iso": "AU" 186 | } 187 | ] 188 | } 189 | """ 190 | cm = CountryManager() 191 | cm.restoreFromJsonStr(samples) 192 | countriesByWikiDataId, _dup = cm.getLookup("wikidataid") 193 | self.assertTrue("Q30" in countriesByWikiDataId) 194 | 195 | def test_getLocationByID(self): 196 | """ 197 | tests if the correct location for a given wikidataid is returned 198 | """ 199 | config = LocationContext.getDefaultConfig() 200 | countryManager = CountryManager(config=config) 201 | countryManager.fromCache() 202 | country = countryManager.getLocationByID("Q30") # wikidataid of USA 203 | self.assertIsNotNone(country) 204 | self.assertTrue(hasattr(country, "iso")) 205 | self.assertEqual(country.iso, "US") 206 | 207 | def test_LocationContext(self): 208 | """ 209 | tests the LocationContext class 210 | """ 211 | 212 | # test interlinking of city with region and country 213 | locationContext = self.getLocationContext() 214 | cities = locationContext.cityManager.getByName("Los Angeles") 215 | la = [x for x in cities if x.wikidataid == "Q65"][0] 216 | self.assertEqual(la.name, "Los Angeles") 217 | ca = la.region 218 | self.assertEqual(ca.name, "California") 219 | us = la.country 220 | self.assertEqual(us.wikidataid, "Q30") 221 | self.assertEqual(la.country, ca.country) 222 | 223 | def testLocateLocation(self): 224 | """ 225 | test LocationContext locateLocation 226 | """ 227 | exampleLocations = { 228 | "Washington, DC, USA": "Q61", 229 | "Bangalore": "Q1355", 230 | "Bangalore, India": "Q1355", 231 | "Xi'an": "Q5826", 232 | "Xi'an, China": "Q5826", 233 | "Virtual Event USA": "Q30", 234 | "Virtual USA": "Q30", 235 | "London United Kingdom": "Q84", 236 | "Brno": "Q14960", 237 | "Cancun": "Q8969", 238 | "St. Petersburg": "Q656", 239 | "Gothenburg Sweden": "Q25287", 240 | "Los Angeles California": "Q65", 241 | "Zurich, Switzerland": "Q72", 242 | "Barcelona Spain": "Q1492", 243 | "Vienna Austria": "Q1741", 244 | "Seoul Republic of Korea": "Q8684", 245 | "Seattle WA USA": "Q5083", 246 | "Singapore Singapore": "Q334", 247 | "Tokyo Japan": "Q1490", 248 | "Vancouver BC Canada": "Q24639", 249 | "Vancouver British Columbia Canada": "Q24639", 250 | "Amsterdam Netherlands": "Q727", 251 | "Paris France": "Q90", 252 | "Nagoya": "Q11751", 253 | "Marrakech": "Q101625", 254 | "Austin Texas": "Q16559", 255 | "Chicago IL USA": "Q1297", 256 | "Bangkok Thailand": "Q1861", 257 | "Firenze, Italy": "Q2044", 258 | "Florence Italy": "Q2044", 259 | "Timisoara": "Q83404", 260 | "Langkawi": "Q273303", 261 | "Beijing China": "Q956", 262 | "Berlin Germany": "Q64", 263 | "Prague Czech Republic": "Q1085", 264 | "Portland Oregon USA": "Q6106", 265 | "Portland OR USA": "Q6106", 266 | "Pittsburgh PA USA": "Q1342", 267 | "Новосибирск": "Q883", 268 | "Los Angeles CA USA": "Q65", 269 | "Kyoto Japan": "Q34600", 270 | } 271 | locationContext = self.getLocationContext() 272 | printPretty = lambda records: print([str(record) for record in records]) 273 | failures = [] 274 | for locationText in exampleLocations.keys(): 275 | expectedLocationId = exampleLocations[locationText] 276 | locations = locationContext.locateLocation(locationText, verbose=True) 277 | if len(locations) < 1: 278 | failures.append(locationText) 279 | else: 280 | location = locations[0] 281 | if self.debug: 282 | printPretty(location) 283 | if not location.wikidataid == expectedLocationId: 284 | failures.append(locationText) 285 | showFailures = True 286 | if self.debug or showFailures: 287 | print(f"locationLooup failed for {failures}") 288 | self.assertTrue(len(failures) <= 40) 289 | 290 | def testLocateLocationCountryRegionCity(self): 291 | """ 292 | test LocationContext locateLocation 293 | """ 294 | locationContext = self.getLocationContext() 295 | printPretty = lambda records: print([str(record) for record in records]) 296 | 297 | pl1 = locationContext.locateLocation("Berlin", "USA") 298 | self.assertEqual("Germany", pl1[0].country.name) 299 | if self.debug: 300 | printPretty(pl1) 301 | pl2 = locationContext.locateLocation("Los Angeles, CA") 302 | if self.debug: 303 | printPretty(pl2) 304 | self.assertEqual("California", pl2[0].region.name) 305 | pl3 = locationContext.locateLocation("Germany, Aachen") 306 | if self.debug: 307 | printPretty(pl3) 308 | self.assertEqual("Aachen", pl3[0].name) 309 | self.assertEqual("Germany", pl3[0].country.name) 310 | 311 | 312 | if __name__ == "__main__": 313 | # import sys;sys.argv = ['', 'Test.testName'] 314 | unittest.main() 315 | -------------------------------------------------------------------------------- /tests/test_locator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 2020-09-19 3 | 4 | @author: wf 5 | """ 6 | import getpass 7 | import os.path 8 | import re 9 | import tempfile 10 | import unittest 11 | from collections import Counter 12 | from pathlib import Path 13 | 14 | from lodstorage.storageconfig import StorageConfig 15 | from lodstorage.uml import UML 16 | 17 | import geograpy 18 | from geograpy.locator import City, CountryManager, Location, LocationContext, Locator 19 | from tests.basetest import Geograpy3Test 20 | 21 | 22 | class TestLocator(Geograpy3Test): 23 | """ 24 | test the Locator class from the location module 25 | """ 26 | 27 | def lookupQuery(self, viewName, whereClause): 28 | loc = Locator.getInstance() 29 | queryString = f"SELECT * FROM {viewName} where {whereClause} AND pop is not NULL ORDER by pop desc" 30 | lookupRecords = loc.sqlDB.query(queryString) 31 | return lookupRecords 32 | 33 | def checkExpected(self, lod, expected): 34 | emap = {} 35 | found = {} 36 | for key, value in expected: 37 | emap[key] = value 38 | for record in lod: 39 | name = record["name"] 40 | pop = record["pop"] 41 | if name in emap and pop > emap[name]: 42 | found[name] = record 43 | if self.debug: 44 | print(f"{name}:{pop:.0f}") 45 | 46 | self.assertEqual(len(found), len(emap)) 47 | 48 | def testHasViews(self): 49 | """ 50 | test that the views are available 51 | """ 52 | loc = Locator.getInstance() 53 | viewsMap = loc.sqlDB.getTableDict(tableType="view") 54 | for view in ["CityLookup", "RegionLookup", "CountryLookup"]: 55 | self.assertTrue(view in viewsMap) 56 | 57 | def testCityLookup(self): 58 | """ 59 | test the cityLookup to city/region/country object cluster 60 | """ 61 | cityLookupRecords = self.lookupQuery( 62 | "CityLookup", "label in ('Berlin','Paris','Athens','Singapore')" 63 | ) 64 | expected = [ 65 | ("Berlin", 3644000), 66 | ("Paris", 2175000), 67 | ("Athens", 600000), 68 | ("Singapore", 5800000), 69 | ] 70 | self.checkExpected(cityLookupRecords, expected) 71 | 72 | def testRegionLookup(self): 73 | """ 74 | test region Lookup 75 | """ 76 | regionLookupRecords = self.lookupQuery("RegionLookup", "label in ('CA')") 77 | expected = [("California", 39000000)] 78 | self.checkExpected(regionLookupRecords, expected) 79 | 80 | def testCountryLookup(self): 81 | """ 82 | test country Lookup 83 | """ 84 | # self.debug=True 85 | countryLookupRecords = self.lookupQuery("CountryLookup", "label in ('CA')") 86 | expected = [("Canada", 37000000)] 87 | self.checkExpected(countryLookupRecords, expected) 88 | 89 | def testIsoRegexp(self): 90 | """ 91 | test regular expression for iso codes 92 | """ 93 | loc = Locator.getInstance() 94 | self.assertFalse(loc.isISO("Singapore")) 95 | 96 | query = """ 97 | select distinct iso from countries 98 | union 99 | select distinct iso from regions 100 | """ 101 | loc.populate_db() 102 | isocodeRecords = loc.sqlDB.query(query) 103 | for isocodeRecord in isocodeRecords: 104 | isocode = isocodeRecord["iso"] 105 | if isocode: 106 | isIso = loc.isISO(isocode) 107 | if not isIso and self.debug: 108 | print(isocode) 109 | self.assertTrue(isIso) 110 | 111 | def testWordCount(self): 112 | """ 113 | test the word count 114 | """ 115 | loc = Locator.getInstance() 116 | query = "SELECT name from CITIES" 117 | nameRecords = loc.sqlDB.query(query) 118 | if self.debug: 119 | print("testWordCount: found %d names" % len(nameRecords)) 120 | wc = Counter() 121 | for nameRecord in nameRecords: 122 | name = nameRecord["name"] 123 | words = re.split(r"\W+", name) 124 | wc[len(words)] += 1 125 | if self.debug: 126 | print("most common 20: %s" % wc.most_common(20)) 127 | 128 | def testUML(self): 129 | """ 130 | test adding population data from wikidata to GeoLite2 information 131 | """ 132 | Locator.resetInstance() 133 | loc = Locator.getInstance() 134 | loc.populate_db() 135 | user = getpass.getuser() 136 | if self.debug: 137 | print("current user is %s" % user) 138 | tableList = loc.sqlDB.getTableList() 139 | uml = UML() 140 | title = """geograpy Tables 141 | 2021-08-13 142 | [[https://github.com/somnathrakshit/geograpy3 © 2020-2021 geograpy3 project]]""" 143 | plantUml = uml.tableListToPlantUml( 144 | tableList, title=title, packageName="geograpy3" 145 | ) 146 | showUml = True 147 | if showUml or self.debug: 148 | print(plantUml) 149 | 150 | def checkExamples(self, examples, countries, debug=False, check=True): 151 | """ 152 | 153 | check that the given example give results in the given countries 154 | Args: 155 | examples(list): a list of example location strings 156 | countries(list): a list of expected country iso codes 157 | """ 158 | for index, example in enumerate(examples): 159 | city = geograpy.locateCity(example, debug=debug) 160 | if self.debug: 161 | print("%3d: %22s->%s" % (index, example, city)) 162 | if check: 163 | self.assertEqual(countries[index], city.country.iso) 164 | 165 | def testGetCountry(self): 166 | """ 167 | test getting a country by name or ISO 168 | """ 169 | locator = Locator() 170 | debug = True 171 | examples = [ 172 | ("DE", "Germany"), 173 | ("US", "United States of America"), 174 | ("USA", None), 175 | ] 176 | for name, expectedName in examples: 177 | country = locator.getCountry(name) 178 | if debug: 179 | print(country) 180 | if expectedName is None: 181 | self.assertIsNone(country) 182 | else: 183 | self.assertIsNotNone(country) 184 | self.assertEqual(expectedName, country.name) 185 | 186 | def testIssue15(self): 187 | """ 188 | https://github.com/somnathrakshit/geograpy3/issues/15 189 | test Issue 15 Disambiguate via population, gdp data 190 | """ 191 | examples = ["Paris", "Vienna", "Berlin"] 192 | countries = ["FR", "AT", "DE"] 193 | self.checkExamples(examples, countries) 194 | pass 195 | 196 | def testIssue17(self): 197 | """ 198 | test issue 17: 199 | 200 | https://github.com/somnathrakshit/geograpy3/issues/17 201 | 202 | [BUG] San Francisco, USA and Auckland, New Zealand should be locatable #17 203 | """ 204 | examples = ["San Francisco, USA", "Auckland, New Zealand"] 205 | countries = ["US", "NZ"] 206 | self.checkExamples(examples, countries) 207 | 208 | def testIssue19(self): 209 | """ 210 | test issue 19 211 | """ 212 | examples = ["Puebla City, Mexico", "Newcastle, UK", "San Juan, Puerto Rico"] 213 | countries = ["MX", "GB", "US"] 214 | # For Puerto Rico exist two iso codes one as country and one as US region see https://en.wikipedia.org/wiki/Puerto_Rico in the dataset it is recognized as US region 215 | self.checkExamples(examples, countries) 216 | 217 | def testStackOverflow64379688(self): 218 | """ 219 | compare old and new geograpy interface 220 | """ 221 | examples = [ 222 | "John Doe 160 Huntington Terrace Newark, New York 07112 United States of America", 223 | "John Doe 30 Huntington Terrace Newark, New York 07112 USA", 224 | "John Doe 22 Huntington Terrace Newark, New York 07112 US", 225 | "Mario Bianchi, Via Nazionale 256, 00148 Roma (RM) Italia", 226 | "Mario Bianchi, Via Nazionale 256, 00148 Roma (RM) Italy", 227 | "Newark", 228 | "Rome", 229 | ] 230 | for example in examples: 231 | city = geograpy.locateCity(example, debug=False) 232 | if self.debug: 233 | print(city) 234 | 235 | def testStackOverflow64418919(self): 236 | """ 237 | https://stackoverflow.com/questions/64418919/problem-retrieving-region-in-us-with-geograpy3 238 | """ 239 | examples = ["Seattle"] 240 | for example in examples: 241 | city = geograpy.locateCity(example, debug=False) 242 | print(city) 243 | 244 | def testProceedingsExample(self): 245 | """ 246 | test a proceedings title Example 247 | """ 248 | examples = [ 249 | """Proceedings of the 250 | IEEE 14th International Conference on 251 | Semantic Computing, ICSC 2020, 252 | San Diego, CA, USA, 253 | February 3-5, 2020""" 254 | ] 255 | for example in examples: 256 | places = geograpy.get_place_context(text=example) 257 | if self.debug: 258 | print(places) 259 | city = geograpy.locateCity(example, debug=False) 260 | if self.debug: 261 | print(city) 262 | 263 | def testDelimiters(self): 264 | """ 265 | test the delimiter statistics for names 266 | """ 267 | loc = Locator.getInstance() 268 | loc.populate_db() 269 | 270 | ddls = [ 271 | "DROP VIEW IF EXISTS allNames", 272 | """CREATE VIEW allNames as select name from countries 273 | union select name from regions 274 | union select name from cities""", 275 | ] 276 | for ddl in ddls: 277 | loc.sqlDB.execute(ddl) 278 | query = "SELECT name from allNames" 279 | nameRecords = loc.sqlDB.query(query) 280 | show = self.debug 281 | show = True 282 | if show: 283 | print("found %d name records" % len(nameRecords)) 284 | ordC = Counter() 285 | for nameRecord in nameRecords: 286 | name = nameRecord["name"] 287 | for char in name: 288 | code = ord(char) 289 | if code < ord("A"): 290 | ordC[code] += 1 291 | for index, countT in enumerate(ordC.most_common(10)): 292 | code, count = countT 293 | if show: 294 | print("%d: %d %s -> %d" % (index, code, chr(code), count)) 295 | 296 | def testIssue22(self): 297 | """ 298 | https://github.com/somnathrakshit/geograpy3/issues/22 299 | """ 300 | url = "https://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay" 301 | places = geograpy.get_geoPlace_context(url=url) 302 | if self.debug: 303 | print(places) 304 | self.assertTrue(len(places.countries) > 5) 305 | self.assertTrue(len(places.regions) > 5) 306 | self.assertTrue(len(places.cities) > 20) 307 | 308 | def testExamples(self): 309 | """ 310 | test examples 311 | """ 312 | examples = [ 313 | "Paris, US-TX", 314 | "Amsterdam, Netherlands", 315 | "Vienna, Austria", 316 | "Vienna, Illinois, US", 317 | "Paris, Texas", 318 | "Austin, TX", 319 | "Austin, Texas", 320 | ] 321 | countries = ["US", "NL", "AT", "US", "US", "US", "US"] 322 | self.checkExamples(examples, countries, debug=False) 323 | 324 | def testIssue41_CountriesFromErdem(self): 325 | """ 326 | test getting Country list from Erdem 327 | 328 | """ 329 | countryList = CountryManager.fromErdem() 330 | self.assertEqual(247, len(countryList.countries)) 331 | if self.debug: 332 | for country in countryList.countries: 333 | print(country) 334 | 335 | def testIssue_42_distance(self): 336 | """ 337 | test haversine and location 338 | """ 339 | loc1 = Location() 340 | loc1.lat = 0 341 | loc1.lon = 0 342 | loc2 = Location() 343 | loc2.lat = 90 344 | loc2.lon = 0 345 | d = loc1.distance(loc2) 346 | # self.debug=True 347 | if self.debug: 348 | print(d) 349 | self.assertAlmostEqual(10007.54, d, delta=0.1) 350 | 351 | def testIssue_59_db_download(self): 352 | """ 353 | tests the correct downloading of the backup database in different configurations 354 | """ 355 | 356 | def getConfig(tmpdir: str): 357 | config = StorageConfig( 358 | cacheFile="locations.db", 359 | cacheDirName="geograpyTest", 360 | cacheRootDir=tmpdir, 361 | ) 362 | config.cacheFile = f"{config.getCachePath()}/{config.cacheFile}" 363 | return config 364 | 365 | def downloadAndTestDB( 366 | config: StorageConfig, loc: Locator = None, forceUpdate: bool = False 367 | ): 368 | """downloads and tests the downloaded db""" 369 | if loc is None: 370 | loc = Locator(storageConfig=config) 371 | loc.downloadDB(forceUpdate=forceUpdate) 372 | self.assertTrue(os.path.exists(config.cacheFile)) 373 | self.assertTrue(loc.db_has_data()) 374 | return loc 375 | 376 | # test downloading with no file in dir 377 | with tempfile.TemporaryDirectory() as tmpdir: 378 | config = getConfig(tmpdir) 379 | downloadAndTestDB(config) 380 | 381 | # test downloading with empty file in dir 382 | with tempfile.TemporaryDirectory() as tmpdir: 383 | config = getConfig(tmpdir) 384 | Path(config.cacheFile).touch() # create empty file 385 | loc = downloadAndTestDB(config) 386 | 387 | # test downloading with forceUpdate 388 | # drop a important table to check if it is restored 389 | loc.sqlDB.execute("DROP TABLE countries") 390 | self.assertFalse(loc.db_has_data()) 391 | downloadAndTestDB(config, loc=loc, forceUpdate=True) 392 | 393 | 394 | if __name__ == "__main__": 395 | # import sys;sys.argv = ['', 'Test.testName'] 396 | unittest.main() 397 | -------------------------------------------------------------------------------- /tests/test_nominatim.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 2021-08-20 3 | 4 | @author: wf 5 | """ 6 | 7 | from geograpy.nominatim import NominatimWrapper 8 | from tests.basetest import Geograpy3Test 9 | 10 | 11 | class TestGeopy(Geograpy3Test): 12 | """ 13 | test geopy and other nominatim handlers 14 | """ 15 | 16 | def testNominatim(self): 17 | """ 18 | test nominatim results - especially the extra tags 19 | """ 20 | if self.inCI(): 21 | return 22 | examples = [ 23 | {"city": "London", "q": "Q84", "expected": "England"}, 24 | {"city": "Dublin", "q": "Q1761", "expected": "Ireland"}, 25 | {"city": "Vienna Austria", "q": "Q1741", "expected": "Österreich"}, 26 | { 27 | "city": "Athens, Georgia", 28 | "q": "Q203263", 29 | "expected": "Athens-Clarke County", 30 | }, 31 | # inconsistent results - 2021-12-27 32 | # { 33 | # "city":"St. Petersburg", 34 | # "q": "Q656", 35 | # "expected": "Санкт-Петербург" 36 | # }, 37 | { 38 | # so for St. Petersburg we need to be more specific 39 | "city": "St. Petersburg, Russia", 40 | "q": "Q656", 41 | # to get the russian one 42 | "expected": "Санкт-Петербург", 43 | }, 44 | # inconsistent results Q49279759 - 2023-09-29 45 | # { 46 | # "city":"Arlington, VA", 47 | # "q": "Q107126", 48 | # "expected": "Virginia" 49 | # } 50 | {"city": "Saint Petersburg, FL", "q": "Q49236", "expected": "Florida"}, 51 | ] 52 | 53 | nw = NominatimWrapper() 54 | show = self.debug 55 | # show=True 56 | if show: 57 | print(nw.cacheDir) 58 | for example in examples: 59 | city = example["city"] 60 | location = nw.geolocator.geocode(city) 61 | wikidataId = nw.lookupWikiDataId(city) 62 | q = example["q"] 63 | expected = example["expected"] 64 | if show: 65 | print( 66 | f"{city:<22}:{str(wikidataId):<7}/{str(q):<7}:{location}→{expected}" 67 | ) 68 | self.assertEqual(str(q), str(wikidataId)) 69 | self.assertTrue(expected in str(location), f"{location}→{expected}") 70 | pass 71 | -------------------------------------------------------------------------------- /tests/test_places.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import geograpy 4 | from geograpy.locator import Locator 5 | from geograpy.places import PlaceContext 6 | from tests.basetest import Geograpy3Test 7 | 8 | 9 | class TestPlaces(Geograpy3Test): 10 | """ 11 | test Places 12 | """ 13 | 14 | def setUp(self): 15 | super().setUp(debug=False) 16 | Locator.resetInstance() 17 | pass 18 | 19 | def testIssue25(self): 20 | """ 21 | https://github.com/somnathrakshit/geograpy3/issues/25 22 | """ 23 | pc = PlaceContext( 24 | place_names=["Bulgaria", "Croatia", "Czech Republic", "Hungary"] 25 | ) 26 | if self.debug: 27 | print(pc.countries) 28 | 29 | def testGetRegionNames(self): 30 | """ 31 | test getting region names 32 | """ 33 | pc = PlaceContext(place_names=["Berlin"]) 34 | regions = pc.getRegions("Germany") 35 | self.assertEqual(16, len(regions)) 36 | for region in regions: 37 | if self.debug: 38 | print(region) 39 | self.assertTrue(region.iso.startswith("DE")) 40 | regionNames = pc.get_region_names("Germany") 41 | self.assertEqual(16, len(regionNames)) 42 | if self.debug: 43 | print(regionNames) 44 | 45 | def testPlaces(self): 46 | """ 47 | test places 48 | """ 49 | pc = PlaceContext(["Ngong", "Nairobi", "Kenya"], setAll=False) 50 | pc.setAll() 51 | 52 | if self.debug: 53 | print(pc) 54 | 55 | # Ngong is a city in Cameroon and Kenya 56 | self.assertEqual(2, len(pc.countries)) 57 | self.assertTrue("Kenya" in pc.countries) 58 | self.assertEqual(2, len(pc.cities)) 59 | cityNames = ["Nairobi", "Ohio", "Amsterdam"] 60 | countries = ["Kenya", "United States of America", "Netherlands"] 61 | for index, cityName in enumerate(cityNames): 62 | cities = pc.cities_for_name(cityName) 63 | country = cities[0].country 64 | self.assertEqual(countries[index], country.name) 65 | 66 | pc = PlaceContext(["Mumbai"]) 67 | if self.debug: 68 | print(pc) 69 | 70 | def testIssue49(self): 71 | """ 72 | country recognition 73 | """ 74 | show = self.debug 75 | texts = ["United Kingdom", "UK", "Great Britain", "GB", "United States"] 76 | expected = [ 77 | "United Kingdom", 78 | "United Kingdom", 79 | "United Kingdom", 80 | "United Kingdom", 81 | "United States of America", 82 | ] 83 | if show: 84 | print("lookup with geograpy.get_geoPlace_context") 85 | for text in texts: 86 | countries = geograpy.get_geoPlace_context(text=text).countries 87 | if show: 88 | print(f"{text}:{countries}") 89 | if show: 90 | print("lookup with PlaceContext") 91 | for i, text in enumerate(texts): 92 | pc = PlaceContext([text]) 93 | pc.set_countries() 94 | if show: 95 | print(f"{text}:{pc.countries}") 96 | self.assertEqual([expected[i]], pc.countries) 97 | 98 | 99 | if __name__ == "__main__": 100 | unittest.main() 101 | -------------------------------------------------------------------------------- /tests/test_wikidata.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on 2020-09-23 3 | 4 | @author: wf 5 | """ 6 | import getpass 7 | import unittest 8 | 9 | from lodstorage.sql import SQLDB 10 | from lodstorage.storageconfig import StorageConfig 11 | 12 | from geograpy.locator import Country 13 | from geograpy.wikidata import Wikidata 14 | from tests.basetest import Geograpy3Test 15 | 16 | 17 | class TestWikidata(Geograpy3Test): 18 | """ 19 | test the wikidata access for cities 20 | """ 21 | 22 | def testWikidataCountries(self): 23 | """ 24 | test getting country information from wikidata 25 | """ 26 | wikidata = Wikidata() 27 | try: 28 | countryList = wikidata.getCountries() 29 | self.assertTrue(len(countryList) >= 200) 30 | expectedAttrs = Country.getSamples()[0].keys() 31 | for country in countryList: 32 | if self.debug: 33 | print(country) 34 | for attr in expectedAttrs: 35 | self.assertTrue(hasattr(country, attr)) 36 | except Exception as ex: 37 | self.handleWikidataException(ex) 38 | pass 39 | 40 | def testWikidataRegions(self): 41 | """ 42 | test getting region information from wikidata 43 | """ 44 | wikidata = Wikidata() 45 | try: 46 | regionList = wikidata.getRegions() 47 | self.assertTrue(len(regionList) >= 3000) 48 | except Exception as ex: 49 | self.handleWikidataException(ex) 50 | pass 51 | 52 | def testWikidataCities(self): 53 | """ 54 | test getting city information from wikidata 55 | 56 | """ 57 | # Wikidata time outs in CI environment need to be avoided 58 | if getpass.getuser() != "wf": 59 | return 60 | config = StorageConfig.getSQL(debug=self.debug) 61 | config.cacheRootDir = "/tmp/wdhs" 62 | cachedir = config.getCachePath() 63 | config.cacheFile = f"{cachedir}/hs.db" 64 | # use 2018 wikidata copy 65 | # wikidata.endpoint="http://blazegraph.bitplan.com/sparql" 66 | # use 2020 wikidata copy 67 | wikidata = Wikidata() 68 | wikidata.endpoint = "https://confident.dbis.rwth-aachen.de/jena/wdhs/sparql" 69 | # wikidata.endpoint="http://jena.bitplan.com/wdhs/sparql" 70 | regions = [ 71 | {"name": "Singapore", "country": "Q334", "region": None, "cities": 46}, 72 | {"name": "Beijing", "country": None, "region": "Q956", "cities": 25}, 73 | {"name": "Paris", "country": None, "region": "Q13917", "cities": 1242}, 74 | {"name": "Barcelona", "country": None, "region": "Q5705", "cities": 1242}, 75 | {"name": "Rome", "country": None, "region": "Q1282", "cities": 1242}, 76 | ] 77 | limit = 1000000 # if self.inCI() else 100 78 | cityList = wikidata.getCities(limit=limit) 79 | sqlDB = SQLDB(config.cacheFile) 80 | entityInfo = sqlDB.createTable(cityList, "hs", withDrop=True) 81 | sqlDB.store(cityList, entityInfo, fixNone=True) 82 | expected = 200000 # if self.inCI() else limit 83 | self.assertTrue(len(cityList) >= expected) 84 | # for region in regions: 85 | # starttime=time.time() 86 | # regionName=region["name"] 87 | # print(f"searching cities for {regionName}" ) 88 | # cityList=wikidata.getCities(country=region["country"], region=region["region"]) 89 | # print("Found %d cities for %s in %5.1f s" % (len(cityList),region["name"],time.time()-starttime)) 90 | # if self.debug: 91 | # print(cityList[:10]) 92 | # #self.assertEqual(region['cities'],len(cityList)) 93 | # pass 94 | 95 | def testWikidataCityStates(self): 96 | """ 97 | test getting region information from wikidata 98 | """ 99 | wikidata = Wikidata() 100 | try: 101 | regionList = wikidata.getCityStates() 102 | self.assertTrue(len(regionList) >= 2) 103 | cityStateNames = [r.get("name") for r in regionList] 104 | self.assertTrue("Singapore" in cityStateNames) 105 | except Exception as ex: 106 | self.handleWikidataException(ex) 107 | pass 108 | 109 | def testGetWikidataId(self): 110 | """ 111 | test getting a wikiDataId from a given URL 112 | """ 113 | # test entity 114 | wikidataURL = "https://www.wikidata.org/wiki/Q1" 115 | expectedID = "Q1" 116 | wikiDataId = Wikidata.getWikidataId(wikidataURL) 117 | self.assertEqual(wikiDataId, expectedID) 118 | # test property 119 | wikidataURLProperty = "https://www.wikidata.org/wiki/Property:P31" 120 | expectedPropertyID = "P31" 121 | propertyId = Wikidata.getWikidataId(wikidataURLProperty) 122 | self.assertEqual(expectedPropertyID, propertyId) 123 | # test invalid entries 124 | wikidataURLProperty = "" 125 | parsedId = Wikidata.getWikidataId(wikidataURLProperty) 126 | self.assertIsNone(parsedId) 127 | 128 | def testGetCoordinateComponents(self): 129 | """ 130 | test the splitting of coordinate components in WikiData query results 131 | """ 132 | cList = [ 133 | { 134 | "coordinate": "Point(-118.25 35.05694444)", 135 | "expected": (-118.25, 35.05694444), 136 | } 137 | ] 138 | for c in cList: 139 | coordinate = c["coordinate"] 140 | expLat, expLon = c["expected"] 141 | lon, lat = Wikidata.getCoordinateComponents(coordinate) 142 | self.assertEqual(expLat, lat) 143 | self.assertEqual(expLon, lon) 144 | 145 | 146 | if __name__ == "__main__": 147 | # import sys;sys.argv = ['', 'Test.testName'] 148 | unittest.main() 149 | --------------------------------------------------------------------------------