├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   └── workflows
│       ├── build.yml
│       ├── deploy-docs.yml
│       └── upload-to-pypi.yml
├── .gitignore
├── .project
├── .pydevproject
├── .readthedocs.yml
├── CHANGELOG.md
├── LICENSE
├── README.md
├── docs
│   └── source
│       ├── Makefile
│       ├── conf.py
│       ├── geograpy.rst
│       ├── index.rst
│       ├── make.bat
│       ├── setup.rst
│       └── tests.rst
├── examples
│   └── example1.py
├── geograpy
│   ├── __init__.py
│   ├── data
│   │   ├── ISO3166ErrorDictionary.csv
│   │   ├── aliases.csv
│   │   └── queries.yaml
│   ├── extraction.py
│   ├── geograpy_nltk.py
│   ├── labels.py
│   ├── locator.py
│   ├── nominatim.py
│   ├── places.py
│   ├── utils.py
│   ├── version.py
│   └── wikidata.py
├── pyproject.toml
├── scripts
│   ├── blackisort
│   ├── doc
│   ├── download
│   ├── install
│   ├── release
│   └── test
└── tests
    ├── __init__.py
    ├── basetest.py
    ├── testCachingCitiesByRegion.py
    ├── testCachingLocationLabels.py
    ├── testLocatorDatabase.py
    ├── testQueries.py
    ├── test_LocationContext.py
    ├── test_extractor.py
    ├── test_location.py
    ├── test_locator.py
    ├── test_nominatim.py
    ├── test_places.py
    └── test_wikidata.py
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: "[BUG]"
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Use function.
16 | 2. ...
17 |
18 | **Expected behavior**
19 | A clear and concise description of what you expected to happen.
20 |
21 | **Screenshots**
22 | If applicable, add screenshots to help explain your problem.
23 |
24 | **Environment (please complete the following information):**
25 | - OS: [e.g. Ubuntu 20.04]
26 | - Python Version [e.g. 3.6]
27 |
28 | **Additional context**
29 | Add any other context about the problem here.
30 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Build
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | pull_request:
10 | branches: [ master ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ${{ matrix.os }}
16 | strategy:
17 | matrix:
18 | os: [ubuntu-latest, macos-latest, windows-latest]
19 | python-version: [3.9, "3.10", "3.11", "3.12"]
20 |
21 | steps:
22 | - uses: actions/checkout@v4
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v5
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | - name: Install
28 | run: |
29 | scripts/install
30 | - name: Run tests
31 | run: |
32 | scripts/test
33 |
--------------------------------------------------------------------------------
/.github/workflows/deploy-docs.yml:
--------------------------------------------------------------------------------
1 | # This workflow builds the Sphinx documentation and deploys it to Netlify
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Deploy docs to Netlify
5 | on:
6 | push:
7 | branches: [ master ]
8 | jobs:
9 | docs-deploy:
10 | runs-on: ubuntu-latest
11 | strategy:
12 | matrix:
13 | python-version: [3.9]
14 | steps:
15 | - uses: actions/checkout@v4
16 | - name: Set up Python ${{ matrix.python-version }}
17 | uses: actions/setup-python@v5
18 | with:
19 | python-version: ${{ matrix.python-version }}
20 | - name: Install dependencies
21 | run: |
22 | pip install .
23 | - name: Install sphinx
24 | run: |
25 | pip install sphinx sphinx-rtd-theme
26 | - name: Build docs
27 | run: |
28 | scripts/doc
29 | - name: Deploy docs to Netlify
30 | uses: netlify/actions/cli@master
31 | env:
32 | NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
33 | NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
34 | with:
35 | args: deploy --dir=docs/source/_build/html --prod
36 |
--------------------------------------------------------------------------------
/.github/workflows/upload-to-pypi.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Upload Python Package
10 |
11 | on:
12 | release:
13 | types: [published]
14 |
15 | permissions:
16 | contents: read
17 |
18 | jobs:
19 | deploy:
20 |
21 | runs-on: ubuntu-latest
22 |
23 | steps:
24 | - uses: actions/checkout@v3
25 | - name: Set up Python
26 | uses: actions/setup-python@v3
27 | with:
28 | python-version: '3.x'
29 | - name: Install dependencies
30 | run: |
31 | python -m pip install --upgrade pip
32 | pip install build
33 | - name: Build package
34 | run: python -m build
35 | - name: Publish package
36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
37 | with:
38 | user: __token__
39 | password: ${{ secrets.PYPI_API_TOKEN }}
40 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Changelogs
2 | CHANGELOG*
3 |
4 | # Release files
5 | token
6 | settings.ini
7 |
8 | # Byte-compiled / optimized / DLL files
9 | __pycache__/
10 | *.py[cod]
11 | *$py.class
12 |
13 | # C extensions
14 | *.so
15 |
16 | # Distribution / packaging
17 | .Python
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | lib/
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | wheels/
30 | pip-wheel-metadata/
31 | share/python-wheels/
32 | *.egg-info/
33 | .installed.cfg
34 | *.egg
35 | MANIFEST
36 |
37 | # PyInstaller
38 | # Usually these files are written by a python script from a template
39 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
40 | *.manifest
41 | *.spec
42 |
43 | # Installer logs
44 | pip-log.txt
45 | pip-delete-this-directory.txt
46 |
47 | # Unit test / coverage reports
48 | htmlcov/
49 | .tox/
50 | .nox/
51 | .coverage
52 | .coverage.*
53 | .cache
54 | nosetests.xml
55 | coverage.xml
56 | *.cover
57 | *.py,cover
58 | .hypothesis/
59 | .pytest_cache/
60 |
61 | # Translations
62 | *.mo
63 | *.pot
64 |
65 | # Django stuff:
66 | *.log
67 | local_settings.py
68 | db.sqlite3
69 | db.sqlite3-journal
70 |
71 | # Flask stuff:
72 | instance/
73 | .webassets-cache
74 |
75 | # Scrapy stuff:
76 | .scrapy
77 |
78 | # Sphinx documentation
79 | docs/source/_build/
80 |
81 | # PyBuilder
82 | target/
83 |
84 | # Jupyter Notebook
85 | .ipynb_checkpoints
86 |
87 | # IPython
88 | profile_default/
89 | ipython_config.py
90 |
91 | # pyenv
92 | .python-version
93 |
94 | # pipenv
95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
98 | # install all needed dependencies.
99 | #Pipfile.lock
100 |
101 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
102 | __pypackages__/
103 |
104 | # Celery stuff
105 | celerybeat-schedule
106 | celerybeat.pid
107 |
108 | # SageMath parsed files
109 | *.sage.py
110 |
111 | # Environments
112 | .env
113 | .venv
114 | env/
115 | venv/
116 | ENV/
117 | env.bak/
118 | venv.bak/
119 |
120 | # Spyder project settings
121 | .spyderproject
122 | .spyproject
123 |
124 | # Rope project settings
125 | .ropeproject
126 |
127 | # mkdocs documentation
128 | /site
129 |
130 | # mypy
131 | .mypy_cache/
132 | .dmypy.json
133 | dmypy.json
134 |
135 | # Pyre type checker
136 | .pyre/
137 |
138 | # Mac OS
139 | .DS_Store
140 | # docs are autogenerated with sphinx-api docs
141 | # eclipse
142 | .settings
143 | # databases
144 | geograpy/*.db
145 | geograpy/*.db.gz
146 |
147 | CHANGELOG.bak
148 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 |     <name>geograpy3</name>
4 |     <comment></comment>
5 |     <projects>
6 |     </projects>
7 |     <buildSpec>
8 |         <buildCommand>
9 |             <name>org.python.pydev.PyDevBuilder</name>
10 |             <arguments>
11 |             </arguments>
12 |         </buildCommand>
13 |     </buildSpec>
14 |     <natures>
15 |         <nature>org.python.pydev.pythonNature</nature>
16 |     </natures>
17 | </projectDescription>
18 |
--------------------------------------------------------------------------------
/.pydevproject:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
2 | <?eclipse-pydev version="1.0"?><pydev_project>
3 | <pydev_pathproperty name="org.python.pydev.PROJECT_SOURCE_PATH">
4 | <path>/${PROJECT_DIR_NAME}</path>
5 | </pydev_pathproperty>
6 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_VERSION">python interpreter</pydev_property>
7 | <pydev_property name="org.python.pydev.PYTHON_PROJECT_INTERPRETER">Default</pydev_property>
8 | </pydev_project>
9 |
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 |
9 | # Build documentation in the docs/ directory with Sphinx
10 | sphinx:
11 | configuration: docs/source/conf.py
12 |
13 | # Build documentation with MkDocs
14 | #mkdocs:
15 | # configuration: mkdocs.yml
16 |
17 | # Optionally build your docs in additional formats such as PDF
18 | formats: all
19 |
20 | # Optionally set the version of Python and requirements required to build your docs
21 | python:
22 | version: 3.7
23 | install:
24 | - requirements: requirements.txt
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Release notes
2 |
3 |
4 |
5 | ## 0.2.3
6 |
7 | - Move NLTK download code to Extractor class.
8 | - Upgrade package build method
9 | - Upgrade PyPi distribution
10 | - Fix NLTK and DB download issue (PR #66)
11 |
12 |
13 | ## 0.1.9
14 |
15 | Fix version number
16 |
17 |
18 | ## 0.1.8
19 |
20 | ### New Features
21 |
22 | - Add ISO country code ([#10](https://github.com/somnathrakshit/geograpy3/issues/10))
23 | - returned country information should include the two [letter ISO
24 | code](https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2) of the
25 | country
26 |
27 | - if country is given disambiguate country ([#7](https://github.com/somnathrakshit/geograpy3/issues/7))
28 |   - see e.g. https://stackoverflow.com/questions/62152428/extracting-country-information-from-description-using-geograpy?noredirect=1#comment112899776_62152428
29 |     "Zaragoza, Spain" should e.g. only return the country Spain since it's in the
30 |     context of Zaragoza
31 |
32 |
33 |
34 | ### Bugs Squashed
35 |
36 | - [BUG]AttributeError: 'NoneType' object has no attribute 'name' on "Pristina, Kosovo" ([#9](https://github.com/somnathrakshit/geograpy3/issues/9))
37 |   - **Describe the bug**
38 |     `geograpy.get_geoPlace_context(text="Pristina, Kosovo")` leads to a python error.
39 |     **To Reproduce** Steps to reproduce the behavior:
40 |     ```python
41 |     def testIssue(self):
42 |         '''
43 |         test Issue
44 |         '''
45 |         locality="Pristina, Kosovo"
46 |         gp=geograpy.get_geoPlace_context(text=locality)
47 |         if self.debug:
48 |             print(" %s" % gp.countries)
49 |             print(" %s" % gp.regions)
50 |             print(" %s" % gp.cities)
51 |     ```
52 |     File "/Users/wf/Documents/pyworkspace/geograpy3/geograpy/places.py", line 189, in set_cities
53 |         country_name = country.name
54 |     AttributeError: 'NoneType' object has no attribute 'name'
55 |     **Expected behavior** Python should not choke on this although the political result may be disputed.
56 |
57 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # geograpy3
2 | [](https://github.com/somnathrakshit/geograpy3/discussions)
3 | [](https://geograpy3.readthedocs.io/en/latest/?badge=latest)
4 | [](https://pypi.org/project/geograpy3/)
5 | [](https://github.com/somnathrakshit/geograpy3/actions?query=workflow%3ABuild+branch%3Amaster)
6 | [](https://pypi.python.org/pypi/geograpy3/)
7 | [](https://pepy.tech/project/geograpy3)
8 | [](https://github.com/somnathrakshit/geograpy3/issues)
9 | [](https://github.com/somnathrakshit/geograpy3/issues/?q=is%3Aissue+is%3Aclosed)
10 | [](https://www.apache.org/licenses/LICENSE-2.0)
11 |
12 | geograpy3 is a fork of [geograpy2](https://github.com/Corollarium/geograpy2), which is itself a fork of [geograpy](https://github.com/ushahidi/geograpy) and inherits most of it, but solves several problems (such as support for UTF-8, place names
13 | with multiple words, and confusion over homonyms). Also, geograpy3 is compatible with Python 3, unlike geograpy2.
14 |
15 | Since geograpy3 0.0.2, cities, countries and regions are matched against a database derived from the corresponding Wikidata entries.
16 |
17 | What it is
18 | ==========
19 |
20 | geograpy extracts place names from a URL or text, and adds context to those names -- for example distinguishing between a country, region or city.
21 |
22 | The extraction is a two-step process. The first step is a Natural Language Processing task which analyzes a text for potential mentions of geographic locations. In the second step the words which represent such locations are looked up using the Locator.
23 |
24 | If you already know that your content has geographic information you might want to use the Locator interface directly.
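
If you want to look up a single location string without the NLP step, here is a minimal sketch using the `locateCity` helper from `geograpy/__init__.py` (the comment describes the expected kind of result, not verified output):

```python
import geograpy

# splits the string at commas and looks the parts up via the Locator
city = geograpy.locateCity("Zaragoza, Spain")
print(city)  # a city record including its region and country, or None if nothing matched
```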
25 |
26 | ## Examples/Tutorial
27 | * [see Examples/Tutorial Wiki](http://wiki.bitplan.com/index.php/Geograpy#Examples)
28 |
29 | ## Install & Setup
30 |
31 | Grab the package using `pip` (this will take a few minutes)
32 | ```bash
33 | pip install geograpy3
34 | ```
35 |
36 | geograpy3 uses [NLTK](http://www.nltk.org/) for entity recognition, so you'll also need
37 | to download the models we're using. Fortunately there's a command that'll take
38 | care of this for you.
39 | ```bash
40 | geograpy-nltk
41 | ```
42 |
43 | ## Getting the source code
44 | ```bash
45 | git clone https://github.com/somnathrakshit/geograpy3
46 | cd geograpy3
47 | scripts/install
48 | ```
49 |
50 | ## Basic Usage
51 |
52 | Import the module, give some text or a URL, and presto.
53 | ```python
54 | import geograpy
55 | url = 'https://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay'
56 | places = geograpy.get_geoPlace_context(url=url)
57 | ```
58 |
59 | Now you have access to information about all the places mentioned in the linked
60 | article.
61 |
62 | * `places.countries` _contains a list of country names_
63 | * `places.regions` _contains a list of region names_
64 | * `places.cities` _contains a list of city names_
65 | * `places.other` _lists everything that wasn't clearly a country, region or city_
66 |
67 | Note that the `other` list might be useful for shorter texts, to pull out
68 | information like street names, points of interest, etc., but it is currently
69 | a bit messy when scanning longer texts that contain adjectival forms of proper
70 | nouns (like "Russian" instead of "Russia").
71 |
72 | ## But Wait, There's More
73 |
74 | In addition to listing the names of discovered places, you'll also get some
75 | information about the relationships between places, as shown in the sketch below.
76 |
77 | * `places.country_regions` _regions broken down by country_
78 | * `places.country_cities` _cities broken down by country_
79 | * `places.address_strings` _city, region, country strings useful for geocoding_
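
A minimal usage sketch for these relationship attributes (the URL is the one from the Basic Usage section; the printed values are illustrative, not actual output):

```python
import geograpy

url = 'https://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay'
places = geograpy.get_geoPlace_context(url=url)

# dictionaries keyed by country name
print(places.country_regions)  # e.g. {'United Kingdom': ['England', ...], ...}
print(places.country_cities)   # e.g. {'United Kingdom': ['London', ...], ...}

# ready-made "city, region, country" strings for geocoding
print(places.address_strings)  # e.g. ['London, England, United Kingdom', ...]
```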
80 |
81 | ## Last But Not Least
82 |
83 | While a text might mention many places, it's probably focused on one or two, so
84 | geograpy3 also breaks down countries, regions and cities by number of mentions.
85 |
86 | * `places.country_mentions`
87 | * `places.region_mentions`
88 | * `places.city_mentions`
89 |
90 | Each of these returns a list of tuples. The first item in the tuple is the place
91 | name and the second item is the number of mentions. For example:
92 |
93 | [('Russian Federation', 14), (u'Ukraine', 11), (u'Lithuania', 1)]
94 |
95 | ## If You're Really Serious
96 |
97 | You can of course use each of Geograpy's modules on their own. For example:
98 | ```python
99 | from geograpy import extraction
100 |
101 | e = extraction.Extractor(url='https://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay')
102 | e.find_geoEntities()
103 |
104 | # You can now access all of the places found by the Extractor
105 | print(e.places)
106 | ```
107 |
108 | Place context is handled in the `places` module. For example:
109 |
110 | ```python
111 | from geograpy import places
112 |
113 | pc = places.PlaceContext(['Cleveland', 'Ohio', 'United States'])
114 |
115 | pc.set_countries()
116 | print(pc.countries)  # ['United States']
117 |
118 | pc.set_regions()
119 | print(pc.regions)  # ['Ohio']
120 |
121 | pc.set_cities()
122 | print(pc.cities)  # ['Cleveland']
123 |
124 | print(pc.address_strings)  # ['Cleveland, Ohio, United States']
125 | ```
126 |
127 | And of course all of the other information shown above (`country_regions` etc)
128 | is available after the corresponding `set_` method is called.
129 |
130 | ## Stackoverflow
131 | * [Questions tagged with 'geograpy'](https://stackoverflow.com/questions/tagged/geograpy)
132 |
133 | ## Credits
134 |
135 | geograpy3 uses the following excellent libraries:
136 |
137 | * [NLTK](http://www.nltk.org/) for entity recognition
138 | * [newspaper](https://github.com/codelucas/newspaper) for text extraction from HTML
139 | * [jellyfish](https://github.com/sunlightlabs/jellyfish) for fuzzy text match
140 | * [pylodstorage](https://pypi.org/project/pylodstorage/) for storage and retrieval of tabular data from SQL and SPARQL sources
141 |
142 | geograpy3 uses the following data sources:
143 | * [ISO3166ErrorDictionary](https://github.com/bodacea/countryname/blob/master/countryname/databases/ISO3166ErrorDictionary.csv) for common country misspellings _via [Sara-Jayne Terp](https://github.com/bodacea)_
144 | * [Wikidata](https://www.wikidata.org) for country/region/city information with disambiguation via population
145 |
146 | Hat tip to [Chris Albon](https://github.com/chrisalbon) for the name.
147 |
--------------------------------------------------------------------------------
/docs/source/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | #https://stackoverflow.com/a/44980548/1497139
14 | import os
15 | import sys
16 | import sphinx_rtd_theme
17 | basepath=os.path.abspath('../..')
18 | print('adding basepath %s' % (basepath))
19 | sys.path.insert(0, basepath)
20 | print('sys.path is now: %s' % (sys.path))
21 |
22 |
23 | # -- Project information -----------------------------------------------------
24 |
25 | project = 'geograpy3'
26 | copyright = '2018-2020, Somnath Rakshit, Wolfgang Fahl'
27 | author = 'Somnath Rakshit, Wolfgang Fahl'
28 |
29 |
30 | # -- General configuration ---------------------------------------------------
31 |
32 | # Add any Sphinx extension module names here, as strings. They can be
33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
34 | # ones.
35 | extensions = [
36 | 'sphinx_rtd_theme',
37 | 'sphinx.ext.napoleon',
38 | 'sphinx.ext.autodoc',
39 | 'sphinx.ext.viewcode',
40 | 'sphinx.ext.todo',
41 | ]
42 |
43 | # Add any paths that contain templates here, relative to this directory.
44 | templates_path = ['_templates']
45 |
46 | # The language for content autogenerated by Sphinx. Refer to documentation
47 | # for a list of supported languages.
48 | #
49 | # This is also used if you do content translation via gettext catalogs.
50 | # Usually you set "language" from the command line for these cases.
51 | language = 'en'
52 |
53 | # List of patterns, relative to source directory, that match files and
54 | # directories to ignore when looking for source files.
55 | # This pattern also affects html_static_path and html_extra_path.
56 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
57 |
58 |
59 | # -- Options for HTML output -------------------------------------------------
60 |
61 | # The theme to use for HTML and HTML Help pages. See the documentation for
62 | # a list of builtin themes.
63 | #
64 | html_theme = 'sphinx_rtd_theme'
65 | master_doc = 'index'
66 |
67 | # Add any paths that contain custom static files (such as style sheets) here,
68 | # relative to this directory. They are copied after the builtin static files,
69 | # so a file named "default.css" will overwrite the builtin "default.css".
70 | html_static_path = ['_static']
71 |
72 |
73 | # -- Extension configuration -------------------------------------------------
74 |
75 | # -- Options for todo extension ----------------------------------------------
76 |
77 | # If true, `todo` and `todoList` produce output, else they produce nothing.
78 | todo_include_todos = True
79 |
--------------------------------------------------------------------------------
/docs/source/geograpy.rst:
--------------------------------------------------------------------------------
1 | geograpy package
2 | ================
3 |
4 | Submodules
5 | ----------
6 |
7 | geograpy.extraction module
8 | --------------------------
9 |
10 | .. automodule:: geograpy.extraction
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | geograpy.labels module
16 | ----------------------
17 |
18 | .. automodule:: geograpy.labels
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | geograpy.locator module
24 | -----------------------
25 |
26 | .. automodule:: geograpy.locator
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | geograpy.places module
32 | ----------------------
33 |
34 | .. automodule:: geograpy.places
35 | :members:
36 | :undoc-members:
37 | :show-inheritance:
38 |
39 | geograpy.prefixtree module
40 | --------------------------
41 |
42 | .. automodule:: geograpy.prefixtree
43 | :members:
44 | :undoc-members:
45 | :show-inheritance:
46 |
47 | geograpy.utils module
48 | ---------------------
49 |
50 | .. automodule:: geograpy.utils
51 | :members:
52 | :undoc-members:
53 | :show-inheritance:
54 |
55 | geograpy.wikidata module
56 | ------------------------
57 |
58 | .. automodule:: geograpy.wikidata
59 | :members:
60 | :undoc-members:
61 | :show-inheritance:
62 |
63 | Module contents
64 | ---------------
65 |
66 | .. automodule:: geograpy
67 | :members:
68 | :undoc-members:
69 | :show-inheritance:
70 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. geograpy3 documentation master file, created by
2 | sphinx-quickstart on Wed Sep 23 16:51:23 2020.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to geograpy3's documentation!
7 | =====================================
8 |
9 | .. toctree::
10 | :maxdepth: 4
11 | :caption: Contents:
12 |
13 | geograpy
14 | setup
15 | tests
16 |
17 |
18 | Indices and tables
19 | ==================
20 |
21 | * :ref:`genindex`
22 | * :ref:`modindex`
23 | * :ref:`search`
24 |
--------------------------------------------------------------------------------
/docs/source/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/source/setup.rst:
--------------------------------------------------------------------------------
1 | setup module
2 | ============
3 |
4 | .. automodule:: setup
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/source/tests.rst:
--------------------------------------------------------------------------------
1 | tests package
2 | =============
3 |
4 | Submodules
5 | ----------
6 |
7 | tests.test\_extractor module
8 | ----------------------------
9 |
10 | .. automodule:: tests.test_extractor
11 | :members:
12 | :undoc-members:
13 | :show-inheritance:
14 |
15 | tests.test\_locator module
16 | --------------------------
17 |
18 | .. automodule:: tests.test_locator
19 | :members:
20 | :undoc-members:
21 | :show-inheritance:
22 |
23 | tests.test\_places module
24 | -------------------------
25 |
26 | .. automodule:: tests.test_places
27 | :members:
28 | :undoc-members:
29 | :show-inheritance:
30 |
31 | tests.test\_prefixtree module
32 | -----------------------------
33 |
34 | .. automodule:: tests.test_prefixtree
35 | :members:
36 | :undoc-members:
37 | :show-inheritance:
38 |
39 | tests.test\_wikidata module
40 | ---------------------------
41 |
42 | .. automodule:: tests.test_wikidata
43 | :members:
44 | :undoc-members:
45 | :show-inheritance:
46 |
47 | Module contents
48 | ---------------
49 |
50 | .. automodule:: tests
51 | :members:
52 | :undoc-members:
53 | :show-inheritance:
54 |
--------------------------------------------------------------------------------
/examples/example1.py:
--------------------------------------------------------------------------------
1 | import geograpy
2 | url='https://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay'
3 | places = geograpy.get_geoPlace_context(url = url)
4 | print(places)
5 |
--------------------------------------------------------------------------------
/geograpy/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | main geograpy 3 module
3 | """
4 | __version__ = "0.3.0"
5 | from geograpy.extraction import Extractor
6 | from geograpy.labels import Labels
7 | from geograpy.locator import Locator
8 | from geograpy.places import PlaceContext
9 |
10 |
11 | def get_geoPlace_context(url=None, text=None, debug=False):
12 | """
13 | Get a place context for a given text with information
14 | about country, region, city and other
15 | based on NLTK Named Entities having the Geographic(GPE) label.
16 |
17 | Args:
18 | url(String): the url to read text from (if any)
19 | text(String): the text to analyze
20 | debug(boolean): if True show debug information
21 |
22 | Returns:
23 | places:
24 | PlaceContext: the place context
25 | """
26 | places = get_place_context(url, text, labels=Labels.geo, debug=debug)
27 | return places
28 |
29 |
30 | def get_place_context(url=None, text=None, labels=Labels.default, debug=False):
31 | """
32 | Get a place context for a given text with information
33 | about country, region, city and other
34 | based on NLTK Named Entities in the label set Geographic(GPE),
35 | Person(PERSON) and Organization(ORGANIZATION).
36 |
37 | Args:
38 | url(String): the url to read text from (if any)
39 | text(String): the text to analyze
40 | debug(boolean): if True show debug information
41 |
42 | Returns:
43 | pc:
44 | PlaceContext: the place context
45 | """
46 | e = Extractor(url=url, text=text, debug=debug)
47 | e.find_entities(labels=labels)
48 | places = e.places
49 | pc = PlaceContext(places)
50 | pc.setAll()
51 | return pc
52 |
53 |
54 | def locateCity(location, correctMisspelling=False, debug=False):
55 | """
56 | locate the city described by the given location string
57 | Args:
58 | location(string): the description of the location
59 | Returns:
60 | City: the located city (or None if the location cannot be identified)
61 | """
62 | e = Extractor(text=location, debug=debug)
63 | e.split()
64 | loc = Locator.getInstance(correctMisspelling=correctMisspelling, debug=debug)
65 | city = loc.locateCity(e.places)
66 | return city
67 |
--------------------------------------------------------------------------------
/geograpy/data/ISO3166ErrorDictionary.csv:
--------------------------------------------------------------------------------
1 | data.un.org entry,Issue,ISO3166 name or code,,,
2 | Ã…land Islands,spelling,Åland Islands,,,
3 | "Afghanistan, Islamic State of",spelling,Afghanistan,,,
4 | "Bahamas, The",spelling,Bahamas,,,
5 | Bolivia,spelling,"Bolivia, Plurinational State of",,,
6 | Bosnia Herzegovina,spelling,Bosnia and Herzegovina,,,
7 | British Indian Ocean Ter,spelling,British Indian Ocean Territory,,,
8 | Brunei,spelling,Brunei Darussalam,,,
9 | C\xc3\xb4te d\xe2\x80\x99Ivoire,spelling,Côte d'Ivoire,,,
10 | C\xc3\xb4te d'Ivoire,spelling,Côte d'Ivoire,,,
11 | C\xf4te d'Ivoire,spelling,Côte d'Ivoire,,,
12 | Cote d'Ivoire,spelling,Côte d'Ivoire,,,
13 | Ivory Coast,spelling,Côte d'Ivoire,,,
14 | Central African Republic,spelling,Central African Republic,,,
15 | "China, People's Republic of",spelling,China,,,
16 | "Christmas Island, Aust",spelling,Christmas Island,#true?,,
17 | "Cocos, Keeling Islands",spelling,Cocos (Keeling) Islands,,,
18 | "Congo, Republic",spelling,Congo,,,
19 | Congo Democratic Republic,spelling,"Congo, The Democratic Republic of the",,,
20 | "Congo, Democratic Republic",spelling,"Congo, The Democratic Republic of the",,,
21 | "Congo, Democratic Republic of the",spelling,"Congo, The Democratic Republic of the",,,
22 | "Congo, Democratic Republic of",spelling,"Congo, The Democratic Republic of the",,,
23 | Democratic Republic of Congo,spelling,"Congo, The Democratic Republic of the",,,
24 | Democratic Republic of the Congo,spelling,"Congo, The Democratic Republic of the",,,
25 | Cook Islands,spelling,Cook Islands,,,
26 | Czech Republic,spelling,Czech Republic,,,
27 | Czechoslovakia (former),spelling,"Czechoslovakia, Czechoslovak Socialist Republic",,,
28 | Czechoslovakia,spelling,"Czechoslovakia, Czechoslovak Socialist Republic",,,
29 | "Czechoslovakia, former",spelling,"Czechoslovakia, Czechoslovak Socialist Republic",,,
30 | Former Czechoslovakia,spelling,"Czechoslovakia, Czechoslovak Socialist Republic",,,
31 | Dominican Republic,spelling,Dominican Republic,,,
32 | "Egypt, Arab Republic",spelling,Egypt,,,
33 | Ethiopia PDR,spelling,Ethiopia,,,
34 | "Ethiopia, from 1993",spelling,Ethiopia,,,
35 | "Ethiopia, up to 1993",spelling,Ethiopia,true?,,
36 | Former Ethiopia,spelling,Ethiopia,true?,,
37 | "Falkland Island, Malvinas",spelling,Falkland Islands (Malvinas),,,
38 | Falkland Islands,spelling,Falkland Islands (Malvinas),,,
39 | "Falkland Islands, Malvinas",spelling,Falkland Islands (Malvinas),,,
40 | Faeroe Islands,spelling,Faroe Islands,,,
41 | French Southern Terr,spelling,French Southern Territories,,,
42 | Gambia The,spelling,Gambia,,,
43 | "Gambia, The",spelling,Gambia,,,
44 | German Democratic Republic (former),spelling,German Democratic Republic,,,
45 | "German Democratic Republic, former",spelling,German Democratic Republic,,,
46 | "Germany, The former German Democratic Republic",spelling,German Democratic Republic,,,
47 | Fmr Federated Republic of Germany,spelling,"Germany, Federal Republic of",,,
48 | "Germany, Federated Republic of before 3.10.1990",spelling,"Germany, Federal Republic of",,,
49 | Holy See,spelling,Holy See (Vatican City State),,,
50 | "Holy See, Vatican",spelling,Holy See (Vatican City State),,,
51 | "China, Hong Kong SAR",spelling,Hong Kong,,,
52 | "China, Hong Kong Special Administrative Region",spelling,Hong Kong,,,
53 | China: Hong Kong SAR,spelling,Hong Kong,,,
54 | Hong Kong SAR,spelling,Hong Kong,,,
55 | "Hong Kong SAR, China",spelling,Hong Kong,,,
56 | "Hong Kong, China",spelling,Hong Kong,,,
57 | Iran,spelling,"Iran, Islamic Republic of",,,
58 | Iran(Islamic Republic of),spelling,"Iran, Islamic Republic of",,,
59 | "Iran, Islamic Republic of",spelling,"Iran, Islamic Republic of",,,
60 | "Iran, Islamic Republic",spelling,"Iran, Islamic Republic of",,,
61 | Islamic Republic of Iran,spelling,"Iran, Islamic Republic of",,,
62 | Democratic People's Republic of Korea,spelling,"Korea, Democratic People's Republic of",,,
63 | Korea DPR,spelling,"Korea, Democratic People's Republic of",,,
64 | "Korea, Democratic Republic",spelling,"Korea, Democratic People's Republic of",,,
65 | "Korea, DemocraticPpl's.Republic",spelling,"Korea, Democratic People's Republic of",,,
66 | "Korea,DemocraticPpl's.Republic",spelling,"Korea, Democratic People's Republic of",,,
67 | Korea Rep,spelling,"Korea, Republic of",,,
68 | "Korea, Republic of",spelling,"Korea, Republic of",,,
69 | "Korea, Republic",spelling,"Korea, Republic of",,,
70 | Republic of Korea,spelling,"Korea, Republic of",,,
71 | Lao P.D.R.,spelling,Lao People's Democratic Republic,,,
72 | Lao PDR,spelling,Lao People's Democratic Republic,,,
73 | Lao People's Democratic Republic,spelling,Lao People's Democratic Republic,,,
74 | Libya,spelling,Libyan Arab Jamahiriya,,,
75 | Libyan Arab Jamah.,spelling,Libyan Arab Jamahiriya,,,
76 | "China, Macao SAR",spelling,Macao,,,
77 | "China, Macao Special Administrative Region",spelling,Macao,,,
78 | China: Macao SAR,spelling,Macao,,,
79 | "Macao SAR, China",spelling,Macao,,,
80 | "Macao, China",spelling,Macao,,,
81 | Macau SAR,spelling,Macao,,,
82 | "Macau, China",spelling,Macao,,,
83 | "Macau, SAR",spelling,Macao,,,
84 | Macedonia,spelling,"Macedonia, Republic of",,,
85 | "Macedonia, FYR",spelling,"Macedonia, Republic of",,,
86 | "Macedonia, The former Yugoslav Republic of",spelling,"Macedonia, Republic of",,,
87 | T.F.Y.R. Macedonia,spelling,"Macedonia, Republic of",,,
88 | T.F.Yug.Republic Macedonia,spelling,"Macedonia, Republic of",,,
89 | TFYR Macedonia,spelling,"Macedonia, Republic of",,,
90 | TFYR of Macedonia,spelling,"Macedonia, Republic of",,,
91 | The f. Yugosl. Rep of Macedonia,spelling,"Macedonia, Republic of",,,
92 | The Former Yugoslav Republic of Macedonia,spelling,"Macedonia, Republic of",,,
93 | "Micronesia, Federated States of",spelling,"Micronesia, Federated States of",,,
94 | "Micronesia, Federated Sts.",spelling,"Micronesia, Federated States of",,,
95 | "Micronesia, FederatedStates of",spelling,"Micronesia, Federated States of",,,
96 | Moldova,spelling,"Moldova, Republic of",,,
97 | Republic of Moldova,spelling,"Moldova, Republic of",,,
98 | Neth. Antilles,spelling,Netherlands Antilles,,,
99 | Netherlands Antilles and Aruba,spelling,Netherlands Antilles,,,True?
100 | Pacific Island,spelling,Pacific Islands (trust territory),,,
101 | "Pacific Islands, Trust Territory",spelling,Pacific Islands (trust territory),,,
102 | Occ. Palestinian Terr.,spelling,"Palestinian Territory, Occupied",,,
103 | Palestinian Territories,spelling,"Palestinian Territory, Occupied",,,
104 | Palestine,spelling,"Palestinian Territory, Occupied",#true?,,
105 | Palestinian Authority,spelling,"Palestinian Territory, Occupied",#true?,,
106 | Pitcairn Islands,spelling,Pitcairn,,,
107 | Réunion,spelling,Reunion,fix,,
108 | Russia,spelling,Russian Federation,true?,,
109 | Saint Helena and Depend.,spelling,"Saint Helena, Ascension and Tristan da Cunha",,,
110 | Saint Helena,spelling,"Saint Helena, Ascension and Tristan da Cunha",#true?,,
111 | Saint Kitts-Nevis,spelling,Saint Kitts and Nevis,,,
112 | Saint Lucia ),spelling,Saint Lucia,,,
113 | "Saint Martin, French part",spelling,Saint Martin (French part),,,
114 | Saint Pierre-Miquelon,spelling,Saint Pierre and Miquelon,,,
115 | Saint Vincent and Grenadines,spelling,Saint Vincent and the Grenadines,,,
116 | Saint Vincent-Grenadines,spelling,Saint Vincent and the Grenadines,,,
117 | Solomon Islands,spelling,Solomon Islands,,,
118 | Svalbard and Jan Mayen Islands,spelling,Svalbard and Jan Mayen,,,
119 | Syria,spelling,Syrian Arab Republic,,,
120 | Tanzania,spelling,"Tanzania, United Republic of",,,
121 | United Republic of Tanzania,spelling,"Tanzania, United Republic of",,,
122 | United RepublicTanzania,spelling,"Tanzania, United Republic of",,,
123 | Timor Leste,spelling,Timor-Leste,,,
124 | Turks and Caicos Islands,spelling,Turks and Caicos Islands,,,
125 | United Kingdom of Great Britain & Northern Ireland,spelling,United Kingdom,,,
126 | Usa,spelling,United States,,,
127 | "United States of America, pacific Islands",spelling,US Miscellaneous Pacific Islands,,,
128 | US Miscellaneous Pacific Islands,spelling,US Miscellaneous Pacific Islands,,,
129 | Former USSR,spelling,"USSR, Union of Soviet Socialist Republics",,,
130 | Union of Soviet Socialist Republics (former),spelling,"USSR, Union of Soviet Socialist Republics",,,
131 | "Union of Soviet Socialist Republics, former",spelling,"USSR, Union of Soviet Socialist Republics",,,
132 | Ussr,spelling,"USSR, Union of Soviet Socialist Republics",,,
133 | Venezuela,spelling,"Venezuela, Bolivarian republic of",,,
134 | "Venezuela, Bolivarian Republic of",spelling,"Venezuela, Bolivarian republic of",,,
135 | "Venezuela, RB",spelling,"Venezuela, Bolivarian republic of",,,
136 | "Viet Nam, Democratic Republic of",spelling,Viet Nam,,,
137 | Vietnam,spelling,Viet Nam,,,
138 | United States Virgin Island,spelling,"Virgin Islands, U.S.",,,
139 | United States Virgin Islands,spelling,"Virgin Islands, U.S.",,,
140 | US Virgin Islands,spelling,"Virgin Islands, U.S.",,,
141 | "Virgin Islands, US",spelling,"Virgin Islands, U.S.",,,
142 | Wake Is,spelling,Wake Island,,,
143 | Wallis & Futuna Islands,spelling,Wallis and Futuna,,,
144 | Wallis and Futuna Islands,spelling,Wallis and Futuna,,,
145 | "Yemen, Republic",spelling,Yemen,,,
146 | "Yemen, Republic of",spelling,Yemen,,,
147 | Democratic Yemen (former),spelling,"Yemen, Democratic, People's Democratic Republic of",,,
148 | "Democratic Yemen, former",spelling,"Yemen, Democratic, People's Democratic Republic of",,,
149 | "Yemen, The former Democratic",spelling,"Yemen, Democratic, People's Democratic Republic of",,,
150 | Yemen: Former Democratic Yemen,spelling,"Yemen, Democratic, People's Democratic Republic of",,,
151 | Yemen Arab Republic (former),spelling,"Yemen, Yemen Arab Republic",,,
152 | "Yemen Arab Republic, former",spelling,"Yemen, Yemen Arab Republic",,,
153 | Yemen: Former Yemen Arab Republic,spelling,"Yemen, Yemen Arab Republic",,,
154 | Former Yugoslavia,spelling,"Yugoslavia, Socialist Federal Republic of",,,
155 | Yugoslav SFR,spelling,"Yugoslavia, Socialist Federal Republic of",,,
156 | Yugoslavia (former Socialist Federal Republic),spelling,"Yugoslavia, Socialist Federal Republic of",,,
157 | Yugoslavia,spelling,"Yugoslavia, Socialist Federal Republic of",,,
158 | "Yugoslavia, former Socialist Federal Republic",spelling,"Yugoslavia, Socialist Federal Republic of",,,
159 | "Yugoslavia, The former Socialist Federated Republic of",spelling,"Yugoslavia, Socialist Federal Republic of",,,
160 | East Timor,withdrawn,TMP,,,
161 | "Czechoslovakia, Czechoslovak Socialist Republic",withdrawn,CSK,,,
162 | "USSR, Union of Soviet Socialist Republics",withdrawn,SUN,,,
163 | "Yemen, Yemen Arab Republic",withdrawn,YEM,,,
164 | "Yemen, Democratic, People's Democratic Republic of",withdrawn,YMD,,,
165 | "Yugoslavia, Socialist Federal Republic of",withdrawn,YUG,,,
166 | "Germany, Federal Republic of",withdrawn,DEU,,,
167 | German Democratic Republic,withdrawn,DDR,,,
168 | US Miscellaneous Pacific Islands,withdrawn,PUS,,,
169 | Wake Island,withdrawn,WAK,,,
170 | Serbia and Montenegro,withdrawn,SCG,,,
171 | Netherlands Antilles,withdrawn,ANT,,,
172 | Pacific Islands (trust territory),withdrawn,PCI,,,
--------------------------------------------------------------------------------
/geograpy/data/aliases.csv:
--------------------------------------------------------------------------------
1 | name,alias
2 | UK,GB
3 | USA,United States of America
4 | United States,United States of America
5 |
--------------------------------------------------------------------------------
/geograpy/data/queries.yaml:
--------------------------------------------------------------------------------
1 | #
2 | # Pre-configured Queries for Geograpy3 location lookup database
3 | #
4 | # WF 2021-08-19
5 | 'LabelLookup example #1':
6 | sql: |
7 | SELECT *
8 | FROM CityLookup
9 | WHERE label IN ('Berlin','St. Petersburg','Singapore','Athens')
10 | ORDER BY pop DESC
11 | 'LabelLookup example #2':
12 | sql: |
13 | SELECT * from RegionLookup WHERE label IN ('CA')
14 | 'LabelLookup example #3':
15 | sql: |
16 | SELECT * from CountryLookup WHERE label IN ('CA')
17 | 'Countries':
18 | title: Countries sorted by ISO code
19 | description: Countries with population and coordinates sorted by ISO code
20 | sparql: |
21 | # get a list of countries
22 | # for geograpy3 library
23 | # see https://github.com/somnathrakshit/geograpy3/issues/15
24 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
25 | PREFIX wd: <http://www.wikidata.org/entity/>
26 | PREFIX wdt: <http://www.wikidata.org/prop/direct/>
27 | PREFIX p: <http://www.wikidata.org/prop/>
28 | PREFIX ps: <http://www.wikidata.org/prop/statement/>
29 | PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
30 | # get Country details
31 | SELECT DISTINCT ?wikidataid ?name ?iso ?pop ?coord
32 | WHERE {
33 | BIND (?countryQ AS ?wikidataid)
34 |
35 | # instance of Country
36 | # inverse path see https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization#Inverse_property_paths
37 | wd:Q6256 ^wdt:P279*/^wdt:P31 ?countryQ .
38 |
39 | # VALUES ?country { wd:Q55}.
40 | # label for the country
41 | ?countryQ rdfs:label ?name filter (lang(?name) = "en").
42 | # get the continent (s)
43 | #OPTIONAL {
44 | # ?country wdt:P30 ?continent.
45 | # ?continent rdfs:label ?continentLabel filter (lang(?continentLabel) = "en").
46 | #}
47 | # get the coordinates
48 | OPTIONAL {
49 | ?countryQ wdt:P625 ?coord.
50 | }
51 | # https://www.wikidata.org/wiki/Property:P297 ISO 3166-1 alpha-2 code
52 | ?countryQ wdt:P297 ?iso.
53 | # population of country
54 | OPTIONAL
55 | {
56 | SELECT ?countryQ (max(?countryPopulationValue) as ?pop)
57 | WHERE {
58 | ?countryQ wdt:P1082 ?countryPopulationValue
59 | } group by ?countryQ
60 | }
61 | # https://www.wikidata.org/wiki/Property:P2132
62 | # nominal GDP per capita
63 | # OPTIONAL { ?country wdt:P2132 ?countryGDP_perCapitaValue. }
64 | }
65 | ORDER BY ?iso
66 |
67 |
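
The pre-configured 'Countries' query above can be run against the public Wikidata SPARQL endpoint. A minimal sketch using `requests` and `PyYAML` (the endpoint URL, User-Agent and result handling are illustrative assumptions; geograpy3 itself uses its own query machinery):

```python
import requests
import yaml

# load the pre-configured queries shipped with geograpy3
with open("geograpy/data/queries.yaml") as stream:
    queries = yaml.safe_load(stream)

# run the 'Countries' SPARQL query against the public Wikidata endpoint
# (the query walks subclasses of country, so it may take a while)
response = requests.get(
    "https://query.wikidata.org/sparql",
    params={"query": queries["Countries"]["sparql"], "format": "json"},
    headers={"User-Agent": "geograpy3-example/0.1"},
)
response.raise_for_status()
for row in response.json()["results"]["bindings"][:5]:
    print(row["iso"]["value"], row["name"]["value"])
```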
--------------------------------------------------------------------------------
/geograpy/extraction.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import nltk
4 | from newspaper import Article
5 |
6 | from geograpy.labels import Labels
7 |
8 |
9 | class Extractor(object):
10 | """
11 | Extract geo context from text or from a url
12 | """
13 |
14 | def __init__(self, text=None, url=None, debug=False):
15 | """
16 | Constructor
17 | Args:
18 |
19 | text(string): the text to analyze
20 | url(string): the url to read the text to analyze from
21 | debug(boolean): if True show debug information
22 | """
23 | if not text and not url:
24 | raise Exception("text or url is required")
25 | self.debug = debug
26 | self.text = text
27 | self.url = url
28 | self.places = []
29 | nltk_packages = [
30 | "maxent_ne_chunker",
31 | "words",
32 | "treebank",
33 | "maxent_treebank_pos_tagger",
34 | "punkt",
35 | "averaged_perceptron_tagger",
36 | ]
37 | for nltk_package in nltk_packages:
38 | try:
39 | import nltk
40 |
41 | nltk.data.find(nltk_package)
42 | except LookupError:
43 | nltk.downloader.download(nltk_package, quiet=True)
44 | import nltk
45 |
46 | def set_text(self):
47 | """
48 | Setter for text
49 | """
50 | if not self.text and self.url:
51 | a = Article(self.url)
52 | a.download()
53 | a.parse()
54 | self.text = a.text
55 |
56 | def split(self, delimiter=r","):
57 | """
58 | simple regular expression splitter with no entity check
59 |
60 | hat tip: https://stackoverflow.com/a/1059601/1497139
61 | """
62 | self.set_text()
63 | self.places = re.split(delimiter, self.text)
64 |
65 | def find_geoEntities(self):
66 | """
67 | Find geographic entities
68 |
69 | Returns:
70 | list:
71 | List of places
72 | """
73 | self.find_entities(Labels.geo)
74 | return self.places
75 |
76 | def find_entities(self, labels=Labels.default):
77 | """
78 | Find entities with the given labels, set self.places and return it
79 | Args:
80 | labels:
81 | Labels: The labels to filter
82 | Returns:
83 | list:
84 | List of places
85 | """
86 | self.set_text()
87 |
88 | text = nltk.word_tokenize(self.text)
89 | nes = nltk.ne_chunk(nltk.pos_tag(text))
90 |
91 | for ne in nes:
92 | if type(ne) is nltk.tree.Tree:
93 | nelabel = ne.label()
94 | if nelabel in labels:
95 | leaves = ne.leaves()
96 | if self.debug:
97 | print(leaves)
98 | self.places.append(" ".join([i[0] for i in leaves]))
99 | return self.places
100 |
--------------------------------------------------------------------------------
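
Note: a minimal usage sketch for the Extractor above; the Louvre URL is taken from tests/test_extractor.py, the sample sentence is illustrative, and the required NLTK resources are downloaded on first use by the constructor.

from geograpy.extraction import Extractor
from geograpy.labels import Labels

# extract geographic entities from a web page
e = Extractor(url="https://en.wikipedia.org/wiki/Louvre")
e.find_geoEntities()
print(e.places)  # expected to contain e.g. 'Paris' and 'France'

# extract all default entity types (GPE, GSP, PERSON, ORGANIZATION) from plain text
e2 = Extractor(text="Angela visited Vienna, Austria and met researchers from ETH Zurich.")
e2.find_entities(labels=Labels.default)
print(e2.places)
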
/geograpy/geograpy_nltk.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # converted to python script 2024-03-29
3 | import nltk
4 | def main():
5 | nltk.downloader.download('maxent_ne_chunker')
6 | nltk.downloader.download('words')
7 | nltk.downloader.download('treebank')
8 | nltk.downloader.download('maxent_treebank_pos_tagger')
9 | nltk.downloader.download('punkt')
10 | # since 2020-09
11 | nltk.downloader.download('averaged_perceptron_tagger')
12 |
13 | if __name__ == "__main__":
14 | main()
15 |
--------------------------------------------------------------------------------
/geograpy/labels.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on 2020-09-10
3 |
4 | @author: wf
5 | """
6 |
7 |
8 | class Labels(object):
9 | """
10 | NLTK labels
11 | """
12 |
13 | default = ["GPE", "GSP", "PERSON", "ORGANIZATION"]
14 | geo = ["GPE", "GSP"]
15 |
--------------------------------------------------------------------------------
/geograpy/nominatim.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on 2021-12-27
3 |
4 | @author: wf
5 | """
6 | import logging
7 | import os
8 | from pathlib import Path
9 |
10 | from geopy.geocoders import Nominatim as GeoNominatim
11 | from OSMPythonTools.cachingStrategy import JSON, CachingStrategy
12 | from OSMPythonTools.nominatim import Nominatim
13 |
14 |
15 | class NominatimWrapper(object):
16 | """
17 | Nominatim Wrapper to hide technical details of Nominatim interface
18 | """
19 |
20 | def __init__(self, cacheDir: str = None, user_agent: str = "ConferenceCorpus"):
21 | """
22 | Constructor
23 |
24 | create a nominatim instance for the given cacheDir - if cacheDir is None use ~/.nominatim as cachedir
25 |
26 | Args:
27 | cacheDir(str): the path to the cache directory to be used by Nominatim's JSON caching strategy
28 | user_agent(str): the user_agent to use for the geolocator
29 |
30 | """
31 | if cacheDir is None:
32 | home = str(Path.home())
33 | cacheDir = f"{home}/.nominatim"
34 | self.cacheDir = cacheDir
35 | if not os.path.exists(self.cacheDir):
36 | os.makedirs(cacheDir)
37 | logging.getLogger("OSMPythonTools").setLevel(logging.ERROR)
38 | CachingStrategy.use(JSON, cacheDir=cacheDir)
39 | self.nominatim = Nominatim()
40 | self.geolocator = GeoNominatim(user_agent=user_agent)
41 |
42 | def lookupWikiDataId(self, locationText: str):
43 | """
44 | lookup the Wikidata Identifier for the given locationText (if any)
45 |
46 | Args:
47 | locationText(str): the location text to search for
48 |
49 | Return:
50 | the wikidata Q identifier most fitting the given location text
51 |
52 | """
53 | wikidataId = None
54 | nresult = self.nominatim.query(locationText, params={"extratags": "1"})
55 | nlod = nresult._json
56 | if len(nlod) > 0:
57 | nrecord = nlod[0]
58 | if "extratags" in nrecord:
59 | extratags = nrecord["extratags"]
60 | if "wikidata" in extratags:
61 | wikidataId = extratags["wikidata"]
62 | return wikidataId
63 |
--------------------------------------------------------------------------------
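
Note: a minimal usage sketch for the NominatimWrapper above; it needs network access to the OSM Nominatim service, the user_agent string and location text are illustrative, and results are cached under ~/.nominatim by default.

from geograpy.nominatim import NominatimWrapper

nw = NominatimWrapper(user_agent="geograpy3-example")
wikidataId = nw.lookupWikiDataId("Berlin, Germany")
# a Wikidata Q identifier such as 'Q64' - or None if nothing was found
print(wikidataId)
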
/geograpy/places.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 |
3 | from geograpy.locator import City, Locator, Region
4 |
5 | from .utils import fuzzy_match, remove_non_ascii
6 |
7 | """
8 | Takes a list of place names and works out place designations (country, region, etc.)
9 | and relationships between places (city is inside region is inside country, etc.)
10 | """
11 |
12 |
13 | class PlaceContext(Locator):
14 | """
15 | Adds context information to a place name
16 | """
17 |
18 | def __init__(
19 | self, place_names: list, setAll: bool = True, correctMisspelling: bool = False
20 | ):
21 | """
22 | Constructor
23 |
24 | Args:
25 | place_names:
26 | list: The place names to check
27 | setAll:
28 | boolean: True if all context information should immediately be set
29 | correctMisspelling:
30 | boolean: True if known country name misspellings should be corrected
31 | """
32 | super().__init__()
33 | self.correctMisspelling = correctMisspelling
34 | self.places = self.normalizePlaces(place_names)
35 | if setAll:
36 | self.setAll()
37 |
38 | def __str__(self):
39 | """
40 | Return a string representation of me
41 | """
42 | text = "countries=%s\nregions=%s\ncities=%s\nother=%s" % (
43 | self.countries,
44 | self.regions,
45 | self.cities,
46 | self.other,
47 | )
48 | return text
49 |
50 | def getRegions(self, countryName: str) -> list:
51 | """
52 | get a list of regions for the given countryName
53 |
54 | countryName(str): the countryName to check
55 | """
56 | regions = []
57 | queryString = """SELECT r.* FROM
58 | COUNTRIES c
59 | JOIN regions r ON r.countryId=c.wikidataid
60 | WHERE c.name=(?)"""
61 | params = (countryName,)
62 | regionRecords = self.sqlDB.query(queryString, params)
63 | for regionRecord in regionRecords:
64 | region = Region.fromRecord(regionRecord)
65 | regions.append(region)
66 | return regions
67 |
68 | def get_region_names(self, countryName: str) -> list:
69 | """
70 | get region names for the given country
71 |
72 | Args:
73 | countryName(str): the name of the country
74 | """
75 | if self.correctMisspelling:
76 | countryName = self.correct_country_misspelling(countryName)
77 | regionOfCountryQuery = """SELECT name
78 | FROM regions
79 | WHERE countryId IN (
80 | SELECT wikidataid
81 | FROM countries
82 | WHERE name LIKE (?)
83 | OR wikidataid IN (
84 | SELECT wikidataid
85 | FROM country_labels
86 | WHERE label LIKE (?)
87 | )
88 | )"""
89 | regionRecords = self.sqlDB.query(
90 | regionOfCountryQuery,
91 | params=(
92 | countryName,
93 | countryName,
94 | ),
95 | )
96 | return [r.get("name") for r in regionRecords]
97 |
98 | def setAll(self):
99 | """
100 | Set all context information
101 | """
102 | self.set_countries()
103 | self.set_regions()
104 | self.set_cities()
105 | self.set_other()
106 |
107 | def set_countries(self):
108 | """
109 | get the country information from my places
110 | """
111 | countries = []
112 | for place in self.places:
113 | country = self.getCountry(place)
114 | if country is not None:
115 | countries.append(country.name)
116 |
117 | self.country_mentions = Counter(countries).most_common()
118 | self.countries = list(set(countries))
119 | pass
120 |
121 | def set_regions(self):
122 | """
123 | get the region information from my places (limited to the already identified countries)
124 | """
125 | regions = []
126 | self.country_regions = {}
127 | region_names = {}
128 |
129 | if not self.countries:
130 | self.set_countries()
131 |
132 | def region_match(place_name: str, region_name: str) -> bool:
133 | """
134 | Tests the similarity of the given strings after removing non ascii characters.
135 | Args:
136 | place_name(str): Place name
137 | region_name(str): valid region name to test against
138 |
139 | Returns:
140 | True if the similarity of both values is greater than or equal to 80%, otherwise False
141 | """
142 | return fuzzy_match(
143 | remove_non_ascii(place_name), remove_non_ascii(region_name)
144 | )
145 |
146 | def is_region(place_name: str, region_names: list):
147 | """
148 | Checks whether the given place_name is similar to any of the given region names
149 | Args:
150 | place_name(str): place name to check against the regions
151 | region_names(list): List of valid region names
152 |
153 | Returns:
154 | True if the place_name matches at least one of the region names, otherwise False
155 | """
156 | return any([region_match(place_name, rn) for rn in region_names])
157 |
158 | for country in self.countries:
159 | region_names = self.get_region_names(country)
160 | matched_regions = [
161 | p for p in set(self.places) if is_region(p, region_names)
162 | ]
163 |
164 | regions += matched_regions
165 | self.country_regions[country] = list(set(matched_regions))
166 |
167 | self.region_mentions = Counter(regions).most_common()
168 | self.regions = list(set(regions))
169 |
170 | def set_cities(self):
171 | """
172 | set the cities information
173 | """
174 | self.cities = []
175 | self.country_cities = {}
176 | self.address_strings = []
177 |
178 | if not self.countries:
179 | self.set_countries()
180 |
181 | if not self.regions:
182 | self.set_regions()
183 |
184 | if not self.db_has_data():
185 | self.populate_db()
186 | # ToDo: Duplicate with Locator.city_for_name e.g. extend method to support multiple names
187 | placesWithoutDuplicates = set(self.places)
188 | params = ",".join("?" * len(placesWithoutDuplicates))
189 | query = "SELECT * FROM CityLookup WHERE name IN (" + params + ")"
190 | cityLookupRecords = self.sqlDB.query(query, list(placesWithoutDuplicates))
191 | cityLookupRecords.sort(
192 | key=lambda cityRecord: float(cityRecord.get("pop"))
193 | if cityRecord.get("pop") is not None
194 | else 0.0,
195 | reverse=True,
196 | )
197 | for cityLookupRecord in cityLookupRecords:
198 | city = City.fromCityLookup(cityLookupRecord)
199 |
200 | if city.name not in self.cities:
201 | self.cities.append(city.name)
202 |
203 | countryName = city.country.name
204 | if countryName not in self.countries:
205 | self.countries.append(countryName)
206 | self.country_mentions.append((countryName, 1))
207 |
208 | if countryName not in self.country_cities:
209 | self.country_cities[countryName] = []
210 |
211 | if city.name not in self.country_cities[countryName]:
212 | self.country_cities[countryName].append(city.name)
213 | regionName = city.region.name
214 | if (
215 | countryName in self.country_regions
216 | and regionName in self.country_regions[countryName]
217 | ):
218 | address = f"{city.name}, {regionName}, {countryName}"
219 | self.address_strings.append(address)
220 |
221 | all_cities = [p for p in self.places if p in self.cities]
222 | self.city_mentions = Counter(all_cities).most_common()
223 |
224 | def set_other(self):
225 | if not self.cities:
226 | self.set_cities()
227 |
228 | def unused(place_name):
229 | places = [self.countries, self.cities, self.regions]
230 | return all(
231 | self.correct_country_misspelling(place_name) not in l for l in places
232 | )
233 |
234 | self.other = [p for p in self.places if unused(p)]
235 |
--------------------------------------------------------------------------------
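
Note: a minimal usage sketch for PlaceContext above; the place names and the expected groupings are illustrative and the downloaded locations database (see scripts/download) is assumed to be available.

from geograpy.places import PlaceContext

# place names as they might come from Extractor.find_geoEntities()
pc = PlaceContext(place_names=["Berlin", "Bavaria", "Paris", "NotAPlace"])
print(pc.countries)         # e.g. ['Germany', 'France']
print(pc.regions)           # e.g. ['Bavaria']
print(pc.cities)            # e.g. ['Berlin', 'Paris']
print(pc.other)             # names that could not be resolved, e.g. ['NotAPlace']
print(pc.country_mentions)  # mention counts per country, e.g. [('Germany', 1), ...]
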
/geograpy/utils.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import os
3 | import shutil
4 | import time
5 | import urllib.request
6 |
7 | import jellyfish
8 |
9 |
10 | class Download:
11 | """
12 | Utility functions for downloading data
13 | """
14 |
15 | @staticmethod
16 | def getURLContent(url: str):
17 | with urllib.request.urlopen(url) as urlResponse:
18 | content = urlResponse.read().decode()
19 | return content
20 |
21 | @staticmethod
22 | def getFileContent(path: str):
23 | with open(path, "r") as file:
24 | content = file.read()
25 | return content
26 |
27 | @staticmethod
28 | def needsDownload(filePath: str, force: bool = False) -> bool:
29 | """
30 | check if a download of the given filePath is necessary, that is, the file
31 | does not exist, has a size of zero, or the download should be forced
32 |
33 | Args:
34 | filePath(str): the path of the file to be checked
35 | force(bool): True if the download should be forced regardless of the file state
36 |
37 | Return:
38 | bool: True if a download for this file needed
39 | """
40 | if not os.path.isfile(filePath):
41 | result = True
42 | else:
43 | stats = os.stat(filePath)
44 | size = stats.st_size
45 | result = force or size == 0
46 | return result
47 |
48 | @staticmethod
49 | def downloadBackupFile(
50 | url: str, fileName: str, targetDirectory: str, force: bool = False
51 | ):
52 | """
53 | Downloads from the given url the zip-file and extracts the file corresponding to the given fileName.
54 |
55 | Args:
56 | url: url linking to a downloadable gzip file
57 | fileName: Name of the file that should be extracted from gzip file
58 | targetDirectory(str): download the file to this directory
59 | force (bool): True if the download should be forced
60 |
61 | Returns:
62 | Name of the extracted file with path to the backup directory
63 | """
64 | extractTo = f"{targetDirectory}/{fileName}"
65 | # we might want to check whether a new version is available
66 | if Download.needsDownload(extractTo, force=force):
67 | if not os.path.isdir(targetDirectory):
68 | os.makedirs(targetDirectory)
69 | zipped = f"{extractTo}.gz"
70 | print(f"Downloading {zipped} from {url} ... this might take a few seconds")
71 | urllib.request.urlretrieve(url, zipped)
72 | print(f"Unzipping {extractTo} from {zipped}")
73 | with gzip.open(zipped, "rb") as gzipped:
74 | with open(extractTo, "wb") as unzipped:
75 | shutil.copyfileobj(gzipped, unzipped)
76 | print("Extracting completed")
77 | if not os.path.isfile(extractTo):
78 | raise Exception(f"could not extract {fileName} from {zipped}")
79 | return extractTo
80 |
81 |
82 | class Profiler:
83 | """
84 | simple profiler
85 | """
86 |
87 | def __init__(self, msg, profile=True):
88 | """
89 | construct me with the given msg and profile active flag
90 |
91 | Args:
92 | msg(str): the message to show if profiling is active
93 | profile(bool): True if messages should be shown
94 | """
95 | self.msg = msg
96 | self.profile = profile
97 | self.starttime = time.time()
98 | if profile:
99 | print(f"Starting {msg} ...")
100 |
101 | def time(self, extraMsg=""):
102 | """
103 | time the action and print if profile is active
104 | """
105 | elapsed = time.time() - self.starttime
106 | if self.profile:
107 | print(f"{self.msg}{extraMsg} took {elapsed:5.1f} s")
108 | return elapsed
109 |
110 |
111 | def remove_non_ascii(s):
112 | """
113 | Remove non ascii chars from the given string
114 | Args:
115 | s:
116 | string: The string to remove chars from
117 | Returns:
118 | string: The result string with non-ascii chars removed
119 |
120 | Hat tip: http://stackoverflow.com/a/1342373/2367526
121 | """
122 | return "".join(i for i in s if ord(i) < 128)
123 |
124 |
125 | def fuzzy_match(s1, s2, max_dist=0.8):
126 | """
127 | Fuzzy match the given two strings with the given similarity threshold
128 | jellyfish jaro_winkler_similarity based on https://en.wikipedia.org/wiki/Jaro-Winkler_distance
129 | Args:
130 | s1:
131 | string: First string
132 | s2:
133 | string: Second string
134 | max_dist:
135 | float: The minimum Jaro-Winkler similarity required for a match - default: 0.8
136 | Returns:
137 | True if the similarity is greater than or equal to max_dist, otherwise False
138 | """
139 | return jellyfish.jaro_winkler_similarity(s1, s2) >= max_dist
140 |
--------------------------------------------------------------------------------
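
Note: small illustrative calls for the helpers above; the example strings are arbitrary.

import time

from geograpy.utils import Profiler, fuzzy_match, remove_non_ascii

print(remove_non_ascii("São Paulo"))      # 'So Paulo' - non-ascii characters are dropped
# Jaro-Winkler similarity based matching, threshold defaults to 0.8
print(fuzzy_match("Ontario", "Ontario"))  # True  - identical strings
print(fuzzy_match("Ontario", "Okinawa"))  # False - clearly different names

profiler = Profiler("sleeping")
time.sleep(0.1)  # stand-in for real work
profiler.time()  # prints e.g. 'sleeping took   0.1 s' and returns the elapsed seconds
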
/geograpy/version.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on 2024-03-29
3 |
4 | @author: wf
5 | """
6 | from dataclasses import dataclass
7 |
8 | import geograpy
9 |
10 |
11 | @dataclass
12 | class Version:
13 | """
14 | Version handling for the geograpy3 project.
15 | """
16 |
17 | name = "geograpy3"
18 | version = geograpy.__version__
19 | date = "2023-09-10"
20 | updated = "2024-03-29"
21 | description = "Extract countries, regions, and cities from a URL or text"
22 |
23 | authors = "Somnath Rakshit, Wolfgang Fahl, Tim Holzheim" # Combining all authors into a single string
24 |
25 | doc_url = "https://geograpy3.readthedocs.io"
26 | chat_url = "https://github.com/somnathrakshit/geograpy3/discussions"
27 | cm_url = "https://github.com/somnathrakshit/geograpy3"
28 |
29 | license = """Copyright 2023-2024 contributors. All rights reserved.
30 |
31 | Licensed under the Apache License 2.0
32 | http://www.apache.org/licenses/LICENSE-2.0
33 |
34 | Distributed on an "AS IS" basis without warranties
35 | or conditions of any kind, either express or implied."""
36 |
37 | longDescription = f"""{name} version {version}
38 | {description}
39 |
40 | Created by {authors} on {date} last updated {updated}.
41 | For more information, visit {doc_url}."""
42 |
--------------------------------------------------------------------------------
/geograpy/wikidata.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on 2020-09-23
3 |
4 | @author: wf
5 | """
6 | import re
7 |
8 | from lodstorage.sparql import SPARQL
9 |
10 | from geograpy.utils import Profiler
11 |
12 |
13 | class Wikidata(object):
14 | """
15 | Wikidata access
16 | """
17 |
18 | def __init__(
19 | self, endpoint="https://query.wikidata.org/sparql", profile: bool = True
20 | ):
21 | """
22 | Constructor
23 | """
24 | self.endpoint = endpoint
25 | self.profile = profile
26 |
27 | def query(self, msg, queryString: str, limit=None) -> list:
28 | """
29 | get the query result
30 |
31 | Args:
32 | msg(str): the profile message to display
33 | queryString(str): the query to execute
34 |
35 | Return:
36 | list: the list of dicts with the result
37 | """
38 | profile = Profiler(msg, profile=self.profile)
39 | wd = SPARQL(self.endpoint)
40 | limitedQuery = queryString
41 | if limit is not None:
42 | limitedQuery = f"{queryString} LIMIT {limit}"
43 | results = wd.query(limitedQuery)
44 | lod = wd.asListOfDicts(results)
45 | for record in lod:
46 | for key in list(record.keys()):
47 | value = record[key]
48 | if isinstance(value, str):
49 | if value.startswith("http://www.wikidata.org/"):
50 | record[key] = self.getWikidataId(value)
51 | if key.lower().endswith("coord"):
52 | lat, lon = Wikidata.getCoordinateComponents(value)
53 | record["lat"] = lat
54 | record["lon"] = lon
55 | record.pop(key)
56 |
57 | profile.time(f"({len(lod)})")
58 | return lod
59 |
60 | def store2DB(self, lod, tableName: str, primaryKey: str = None, sqlDB=None):
61 | """
62 | store the given list of dicts to the database
63 |
64 | Args:
65 | lod(list): the list of dicts
66 | tableName(str): the table name to use
67 | primaryKey(str): primary key (if any)
68 | sqlDB(SQLDB): target SQL database
69 | """
70 | msg = f"Storing {tableName}"
71 | profile = Profiler(msg, profile=self.profile)
72 | entityInfo = sqlDB.createTable(
73 | lod,
74 | entityName=tableName,
75 | primaryKey=primaryKey,
76 | withDrop=True,
77 | sampleRecordCount=-1,
78 | )
79 | sqlDB.store(lod, entityInfo, fixNone=True)
80 | profile.time()
81 |
82 | def getCountries(self, limit=None):
83 | """
84 | get a list of countries
85 |
86 | `try query `_
87 |
88 | """
89 | queryString = """# get a list of countries
90 | # for geograpy3 library
91 | # see https://github.com/somnathrakshit/geograpy3/issues/15
92 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
93 | PREFIX wd: <http://www.wikidata.org/entity/>
94 | PREFIX wdt: <http://www.wikidata.org/prop/direct/>
95 | PREFIX p: <http://www.wikidata.org/prop/>
96 | PREFIX ps: <http://www.wikidata.org/prop/statement/>
97 | PREFIX pq: <http://www.wikidata.org/prop/qualifier/>
98 | # get City details with Country
99 | SELECT DISTINCT ?wikidataid ?name ?iso ?pop ?coord
100 | WHERE {
101 | BIND (?countryQ AS ?wikidataid)
102 |
103 | # instance of Country
104 | # inverse path see https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization#Inverse_property_paths
105 | wd:Q6256 ^wdt:P279*/^wdt:P31 ?countryQ .
106 |
107 | # VALUES ?country { wd:Q55}.
108 | # label for the country
109 | ?countryQ rdfs:label ?name filter (lang(?name) = "en").
110 | # get the continent (s)
111 | #OPTIONAL {
112 | # ?country wdt:P30 ?continent.
113 | # ?continent rdfs:label ?continentLabel filter (lang(?continentLabel) = "en").
114 | #}
115 | # get the coordinates
116 | OPTIONAL {
117 | ?countryQ wdt:P625 ?coord.
118 | }
119 | # https://www.wikidata.org/wiki/Property:P297 ISO 3166-1 alpha-2 code
120 | ?countryQ wdt:P297 ?iso.
121 | # population of country
122 | OPTIONAL
123 | {
124 | SELECT ?countryQ (max(?countryPopulationValue) as ?pop)
125 | WHERE {
126 | ?countryQ wdt:P1082 ?countryPopulationValue
127 | } group by ?countryQ
128 | }
129 | # https://www.wikidata.org/wiki/Property:P2132
130 | # nominal GDP per capita
131 | # OPTIONAL { ?country wdt:P2132 ?countryGDP_perCapitaValue. }
132 | }
133 | ORDER BY ?iso"""
134 | msg = "Getting countries from wikidata ETA 10s"
135 | countryList = self.query(msg, queryString, limit=limit)
136 | return countryList
137 |
138 | def getRegions(self, limit=None):
139 | """
140 | get Regions from Wikidata
141 |
142 | `try query `_
143 | """
144 | queryString = """# get a list of regions
145 | # for geograpy3 library
146 | # see https://github.com/somnathrakshit/geograpy3/issues/15
147 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
148 | PREFIX wd: <http://www.wikidata.org/entity/>
149 | PREFIX wdt: <http://www.wikidata.org/prop/direct/>
150 | PREFIX wikibase: <http://wikiba.se/ontology#>
151 | SELECT DISTINCT ?countryId (?regionQ as ?wikidataid) ?name ?iso ?pop ?coord
152 | WHERE
153 | {
154 | # administrative unit of first order
155 | ?regionQ wdt:P31/wdt:P279* wd:Q10864048.
156 | OPTIONAL {
157 | ?regionQ rdfs:label ?name filter (lang(?name) = "en").
158 | }
159 | # isocode state/province (mandatory - filters historic regions while at it ...)
160 | # filter historic regions
161 | # FILTER NOT EXISTS {?region wdt:P576 ?end}
162 | {
163 | SELECT ?regionQ (max(?regionAlpha2) as ?iso) (max(?regionPopulationValue) as ?pop) (max(?locationValue) as ?coord)
164 | WHERE {
165 | ?regionQ wdt:P300 ?regionAlpha2.
166 | # get the population
167 | # https://www.wikidata.org/wiki/Property:P1082
168 | OPTIONAL {
169 | ?regionQ wdt:P1082 ?regionPopulationValue
170 | }
171 | # get the location
172 | # https://www.wikidata.org/wiki/Property:P625
173 | OPTIONAL {
174 | ?regionQ wdt:P625 ?locationValue.
175 | }
176 | } GROUP BY ?regionQ
177 | }
178 | # # https://www.wikidata.org/wiki/Property:P297
179 | OPTIONAL {
180 | ?regionQ wdt:P17 ?countryId.
181 | }
182 | } ORDER BY ?iso"""
183 | msg = "Getting regions from wikidata ETA 15s"
184 | regionList = self.query(msg, queryString, limit=limit)
185 | return regionList
186 |
187 | def getCities(self, limit=1000000):
188 | """
189 | get all human settlements as list of dict with duplicates for label, region, country ...
190 | """
191 | queryString = """PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
192 | PREFIX wdt: <http://www.wikidata.org/prop/direct/>
193 | PREFIX wd: <http://www.wikidata.org/entity/>
194 | PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
195 |
196 | SELECT DISTINCT (?cityQ as ?wikidataid) ?city ?altLabel ?geoNameId ?gndId ?cityPopulation ?cityCoord ?regionId ?countryId
197 | WHERE {
198 | # instance of human settlement https://www.wikidata.org/wiki/Q486972
199 | wd:Q486972 ^wdt:P279*/^wdt:P31 ?cityQ .
200 | # Values
201 | # VALUES ?cityQ { wd:Q656 }
202 |
203 | # label of the City
204 | ?cityQ rdfs:label ?city filter (lang(?city) = "en").
205 |
206 | OPTIONAL {
207 | ?cityQ skos:altLabel ?altLabel .
208 | FILTER (lang(?altLabel) = "en")
209 | }
210 |
211 | # geoName Identifier
212 | OPTIONAL {
213 | ?cityQ wdt:P1566 ?geoNameId.
214 | }
215 |
216 | # GND-ID
217 | OPTIONAL {
218 | ?cityQ wdt:P227 ?gndId.
219 | }
220 |
221 | # population of city
222 | OPTIONAL {
223 | SELECT ?cityQ (max(?cityPopulationValue) as ?cityPopulation)
224 | WHERE {
225 | ?cityQ wdt:P1082 ?cityPopulationValue
226 | } group by ?cityQ
227 | }
228 |
229 | OPTIONAL{
230 | ?cityQ wdt:P625 ?cityCoord .
231 | }
232 |
233 | # region this city belongs to
234 | OPTIONAL {
235 | ?cityQ wdt:P131 ?regionId .
236 | }
237 |
238 | # country this city belongs to
239 | OPTIONAL {
240 | ?cityQ wdt:P17 ?countryId .
241 | }
242 |
243 | }
244 | """
245 | msg = "Getting cities (human settlements) from wikidata ETA 50 s"
246 | citiesList = self.query(msg, queryString, limit=limit)
247 | return citiesList
248 |
249 | def getCitiesForRegion(self, regionId, msg):
250 | """
251 | get the cities for the given Region
252 | """
253 | regionPath = (
254 | "?region ^wdt:P131/^wdt:P131/^wdt:P131 ?cityQ."
255 | if regionId in ["Q980", "Q21"]
256 | else "?cityQ wdt:P131* ?region."
257 | )
258 | queryString = """# get cities by region for geograpy3
259 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
260 | PREFIX wdt: <http://www.wikidata.org/prop/direct/>
261 | PREFIX wd: <http://www.wikidata.org/entity/>
262 |
263 | SELECT distinct (?cityQ as ?wikidataid) ?name ?geoNameId ?gndId ?regionId ?countryId ?pop ?coord WHERE {
264 | VALUES ?hsType {
265 | wd:Q1549591 wd:Q3957 wd:Q5119 wd:Q15284 wd:Q62049 wd:Q515 wd:Q1637706 wd:Q1093829 wd:Q486972 wd:Q532
266 | }
267 |
268 | VALUES ?region {
269 | wd:%s
270 | }
271 |
272 | # region the city should be in
273 | %s
274 |
275 | # type of human settlement to try
276 | ?hsType ^wdt:P279*/^wdt:P31 ?cityQ.
277 |
278 | # label of the City
279 | ?cityQ rdfs:label ?name filter (lang(?name) = "en").
280 |
281 | # geoName Identifier
282 | OPTIONAL {
283 | ?cityQ wdt:P1566 ?geoNameId.
284 | }
285 |
286 | # GND-ID
287 | OPTIONAL {
288 | ?cityQ wdt:P227 ?gndId.
289 | }
290 |
291 | OPTIONAL{
292 | ?cityQ wdt:P625 ?coord .
293 | }
294 |
295 | # region this city belongs to
296 | OPTIONAL {
297 | ?cityQ wdt:P131 ?regionId .
298 | }
299 |
300 | OPTIONAL {
301 | ?cityQ wdt:P1082 ?pop
302 | }
303 |
304 | # country this city belongs to
305 | OPTIONAL {
306 | ?cityQ wdt:P17 ?countryId .
307 | }
308 | }""" % (
309 | regionId,
310 | regionPath,
311 | )
312 | regionCities = self.query(msg, queryString)
313 | return regionCities
314 |
315 | def getCityStates(self, limit=None):
316 | """
317 | get city states from Wikidata
318 |
319 | `try query `_
320 | """
321 | queryString = """# get a list of city states
322 | # for geograpy3 library
323 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
324 | PREFIX wd: <http://www.wikidata.org/entity/>
325 | PREFIX wdt: <http://www.wikidata.org/prop/direct/>
326 | PREFIX wikibase: <http://wikiba.se/ontology#>
327 | SELECT DISTINCT ?countryId (?cityStateQ as ?wikidataid) ?name ?iso ?pop ?coord
328 | WHERE
329 | {
330 | # all city states
331 | ?cityStateQ wdt:P31 wd:Q133442 .
332 | ?cityStateQ rdfs:label ?name filter (lang(?name) = "en").
333 | {
334 | SELECT ?cityStateQ (max(?isoCode) as ?iso) (max(?populationValue) as ?pop) (max(?locationValue) as ?coord)
335 | WHERE {
336 | ?cityStateQ wdt:P300|wdt:P297 ?isoCode.
337 | # get the population
338 | # https://www.wikidata.org/wiki/Property:P1082
339 | OPTIONAL {
340 | ?cityStateQ wdt:P1082 ?populationValue
341 | }
342 | # get the location
343 | # https://www.wikidata.org/wiki/Property:P625
344 | OPTIONAL {
345 | ?cityStateQ wdt:P625 ?locationValue.
346 | }
347 | } GROUP BY ?cityStateQ
348 | }
349 | OPTIONAL {
350 | ?cityStateQ wdt:P17 ?countryId.
351 | }
352 | } ORDER BY ?iso"""
353 | msg = "Getting regions from wikidata ETA 15s"
354 | cityStateList = self.query(msg, queryString, limit=limit)
355 | return cityStateList
356 |
357 | @staticmethod
358 | def getCoordinateComponents(coordinate: str) -> (float, float):
359 | """
360 | Converts the wikidata coordinate representation into its subcomponents longitude and latitude
361 | Example: 'Point(-118.25 35.05694444)' results in (35.05694444, -118.25)
362 |
363 | Args:
364 | coordinate: coordinate value in the format as returned by wikidata queries
365 |
366 | Returns:
367 | Returns the latitude and longitude of the given coordinate as separate (lat, lon) values
368 | """
369 | # https://stackoverflow.com/a/18237992/1497139
370 | floatRegex = r"[-+]?\d+([.,]\d*)?"
371 | regexp = rf"Point\((?P<lon>{floatRegex})\s+(?P<lat>{floatRegex})\)"
372 | cMatch = None
373 | if coordinate:
374 | try:
375 | cMatch = re.search(regexp, coordinate)
376 | except Exception as ex:
377 | # ignore
378 | pass
379 | if cMatch:
380 | latStr = cMatch.group("lat")
381 | lonStr = cMatch.group("lon")
382 | lat, lon = float(latStr.replace(",", ".")), float(lonStr.replace(",", "."))
383 | if lon > 180:
384 | lon = lon - 360
385 | return lat, lon
386 | else:
387 | # coordinate does not have the expected format
388 | return None, None
389 |
390 | @staticmethod
391 | def getWikidataId(wikidataURL: str):
392 | """
393 | Extracts the wikidata id from the given wikidata URL
394 |
395 | Args:
396 | wikidataURL: wikidata URL the id should be extracted from
397 |
398 | Returns:
399 | The wikidata id if present in the given wikidata URL otherwise None
400 | """
401 |
402 | # regex pattern taken from https://www.wikidata.org/wiki/Q43649390 and extended to also support property ids
403 | wikidataidMatch = re.search(r"[PQ][1-9]\d*", wikidataURL)
404 | if wikidataidMatch and wikidataidMatch.group(0):
405 | wikidataid = wikidataidMatch.group(0)
406 | return wikidataid
407 | else:
408 | return None
409 |
410 | @staticmethod
411 | def getValuesClause(varName: str, values, wikidataEntities: bool = True):
412 | """
413 | generates the SPARQL value clause for the given variable name containing the given values
414 | Args:
415 | varName: variable name for the ValuesClause
416 | values: values for the clause
417 | wikidataEntities(bool): if True the wikidata prefix is added to the values, otherwise the given values are expected to be proper IRIs
418 |
419 | Returns:
420 | str
421 | """
422 | clauseValues = ""
423 | if isinstance(values, list):
424 | for value in values:
425 | if wikidataEntities:
426 | clauseValues += f"wd:{value} "
427 | else:
428 | clauseValues += f"{value} "
429 | else:
430 | if wikidataEntities:
431 | clauseValues = f"wd:{values} "
432 | else:
433 | clauseValues = f"{values} "
434 | clause = "VALUES ?%s { %s }" % (varName, clauseValues)
435 | return clause
436 |
--------------------------------------------------------------------------------
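
Note: a minimal usage sketch for the Wikidata class above; getCountries needs network access to the public Wikidata SPARQL endpoint, the limit value is illustrative, and the two static helpers work offline.

from geograpy.wikidata import Wikidata

wd = Wikidata()  # defaults to https://query.wikidata.org/sparql
countries = wd.getCountries(limit=5)
for country in countries:
    print(country.get("name"), country.get("iso"), country.get("pop"))

# static parsing helpers
print(Wikidata.getCoordinateComponents("Point(-118.25 35.05694444)"))
# -> (35.05694444, -118.25) i.e. (lat, lon)
print(Wikidata.getWikidataId("http://www.wikidata.org/entity/Q64"))
# -> 'Q64'
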
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["hatchling"]
3 | build-backend = "hatchling.build"
4 |
5 | [project]
6 | name = "geograpy3"
7 | description = "Extract countries, regions and cities from a URL or text"
8 | keywords = [ "geography", "locations", "extraction", "text analysis"]
9 | home-page = "https://github.com/somnathrakshit/geograpy3"
10 | readme = "README.md"
11 | license = {text = "Apache-2.0"}
12 | authors = [
13 | {name = "Somnath Rakshit", email = "somnath52@gmail.com"}
14 | ]
15 | maintainers = [
16 | { name = "Somnath Rakshit", email = "somnath52@gmail.com"},
17 | { name = "Wolfgang Fahl", email = "wf@bitplan.com" },
18 | { name = "Tim Holzheim", email = "tim.holzheim@rwth-aachen.de" }
19 | ]
20 | classifiers=[
21 | "Programming Language :: Python",
22 | "Programming Language :: Python :: 3.9",
23 | "Programming Language :: Python :: 3.10",
24 | "Programming Language :: Python :: 3.11",
25 | "Programming Language :: Python :: 3.12"
26 | ]
27 | dependencies = [
28 | # https://pypi.org/project/newspaper3k/
29 | "newspaper3k>=0.2.8",
30 | # https://pypi.org/project/nltk/
31 | "nltk>=3.8.1",
32 | # https://pypi.org/project/jellyfish/
33 | "jellyfish>=1.0.3",
34 | # https://pypi.org/project/numpy/
35 | "numpy>=1.26.4",
36 | # https://pypi.org/project/pyLodStorage/
37 | "pylodstorage>=0.10.3",
38 | # https://pypi.org/project/sphinx-rtd-theme/
39 | "sphinx-rtd-theme>=2.0.0",
40 | # https://github.com/scikit-learn/scikit-learn
41 | "scikit-learn>=1.4.1",
42 | # https://pypi.org/project/pandas/
43 | "pandas>=2.1.5",
44 | # https://pypi.org/project/geopy/
45 | "geopy>=2.4.1",
46 | # https://pypi.org/project/OSMPythonTools/
47 | "OSMPythonTools>=0.3.5"
48 | ]
49 |
50 | requires-python = ">=3.9"
51 | dynamic = ["version"]
52 |
53 | [tool.hatch.version]
54 | path = "geograpy/__init__.py"
55 |
56 | [project.urls]
57 | Homepage = "https://github.com/somnathrakshit/geograpy3"
58 | Documentation = "https://geograpy3.readthedocs.io"
59 | Source = "https://github.com/somnathrakshit/geograpy3"
60 | Issues = "https://github.com/somnathrakshit/geograpy3/issues"
61 |
62 | [tool.hatch.build.targets.wheel]
63 | only-include = ["geograpy"]
64 |
65 | [tool.hatch.build.targets.wheel.sources]
66 | "geograpy" = "geograpy"
67 |
68 | [project.optional-dependencies]
69 | docs = [
70 | "sphinx",
71 | "sphinx-rtd-theme",
72 | ]
73 | test = [
74 | "pytest",
75 | "coverage",
76 | ]
77 |
78 | [project.scripts]
79 | geograpy = "geograpy.locate:main"
80 | geograpy-nltk = "geograpy.geograpy_nltk:main"
81 |
82 | [project.data-files."geograpy/data"]
83 | include = ["*.csv"]
84 |
--------------------------------------------------------------------------------
/scripts/blackisort:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # WF 2024-03-29
3 | package=geograpy
4 | isort tests/*.py
5 | black tests/*.py
6 | isort $package/*.py
7 | black $package/*.py
8 |
--------------------------------------------------------------------------------
/scripts/doc:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # WF 2020-01-31
3 |
4 | #
5 | # check whether the given command is installed
6 | #
7 | checkinstalled() {
8 | local l_cmd="$1"
9 | which $l_cmd > /dev/null
10 | if [ $? -ne 0 ]
11 | then
12 | echo "$l_cmd need to be installed" 1>&2
13 | exit 1
14 | fi
15 | }
16 |
17 | fixconf() {
18 | local l_year="$1"
19 | local l_author="$2"
20 | conf=conf.py
21 | # fix sys path
22 | # https://stackoverflow.com/questions/10324393/sphinx-build-fail-autodoc-cant-import-find-module
23 | grep "# sys.path" $conf
24 | if [ $? -eq 0 ]
25 | then
26 | tmpconf=/tmp/conf$$.py
27 | cat $conf | awk -v author="$l_author" -v year="$l_year" '
28 | BEGIN {
29 | quote="\x27"
30 | squote="\047"
31 | }
32 | /# import os/ { next }
33 | /# import sys/ { next }
34 | /copyright/ {
35 | printf "copyright = %s%s, %s%s\n",squote,year,author,squote
36 | next
37 | }
38 | /author/ {
39 | printf "author = %s%s%s\n",squote,author,squote
40 | next
41 | }
42 | /html_theme = / {
43 | # html_theme = 'alabaster'
44 | printf "html_theme = %ssphinx_rtd_theme%s\n",squote,squote
45 | printf "master_doc = %sindex%s\n",squote,squote
46 | next
47 | }
48 | # add sphinx_rtd extension
49 | /extensions = / {
50 | print $0
51 | printf "\t%ssphinx_rtd_theme%s,\n",squote,squote
52 | printf "\t%ssphinx.ext.napoleon%s,\n",squote,squote
53 | next
54 | }
55 | /# sys.path/ {
56 | print("#https://stackoverflow.com/a/44980548/1497139")
57 | print("import os")
58 | print("import sys")
59 | print("import sphinx_rtd_theme")
60 | printf("basepath=os.path.abspath(%s../..%s)\n",squote,squote)
61 | printf("print(%sadding basepath %%s%s %% (basepath))\n",squote,squote)
62 | print("sys.path.insert(0, basepath)")
63 | printf("print(%ssys.path is now: %%s%s %% (sys.path))\n",squote,squote)
64 | next
65 | }
66 | { print}' > $tmpconf
67 | #diff $tmpconf $conf
68 | mv $tmpconf $conf
69 | echo "$src/conf.py has been fixed"
70 | fi
71 | }
72 |
73 | src=docs/source
74 | checkinstalled sphinx-apidoc
75 | sphinx-apidoc --full -f -o $src .
76 | cd $src
77 |
78 | fixconf 2018-2020 "Somnath Rakshit, Wolfgang Fahl"
79 | make clean html
80 | # if [ "$GHACTIONS" != "ACTIVE" ]
81 | # then
82 | # open _build/html/index.html
83 | # fi
84 |
--------------------------------------------------------------------------------
/scripts/download:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | if [ ! -d "$HOME/.geograpy3" ]; then
4 | mkdir $HOME/.geograpy3
5 | fi
6 | cd $HOME/.geograpy3
7 |
8 | curl -L -o locations.db.gz https://github.com/somnathrakshit/geograpy3/wiki/data/locations.db.gz
9 | gzip -d locations.db.gz
10 | curl -L -o regions.tgz https://github.com/somnathrakshit/geograpy3/wiki/data/regions.tgz
11 | tar xvfz regions.tgz
12 | rm regions.tgz
13 |
--------------------------------------------------------------------------------
/scripts/install:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # WF 2020-03-25
3 | # update 2024-03-29
4 | pip install .
5 |
--------------------------------------------------------------------------------
/scripts/release:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # WF 2020-03-26
3 | # create a release see https://packaging.python.org/tutorials/packaging-projects/
4 | #
5 | # get the absolute filename
6 | #
7 | get_abs_filename() {
8 | # $1 : relative filename
9 | echo "$(cd "$(dirname "$1")" && pwd)/$(basename "$1")"
10 | }
11 |
12 | pwd=$(pwd)
13 | scriptPath=$(get_abs_filename $(dirname $0))
14 | cd $scriptPath/..
15 | rm -rf dist
16 | $scriptPath/doc
17 | python3 -m build
18 | python3 -m twine upload -u __token__ --repository-url https://upload.pypi.org/legacy/ dist/*
19 |
--------------------------------------------------------------------------------
/scripts/test:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # WF 2020-06-03
3 | python="python3"
4 | while [ "$1" != "" ]
5 | do
6 | option="$1"
7 | case $option in
8 | -d|--debug)
9 | # show environment for debugging
10 | env
11 | ;;
12 | -p|--python)
13 | shift
14 | python="$1"
15 | ;;
16 | esac
17 | shift
18 | done
19 | $python -m unittest discover
20 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/somnathrakshit/geograpy3/bd167b5a91584d4449911b5cfffa4ba7e23cbc3c/tests/__init__.py
--------------------------------------------------------------------------------
/tests/basetest.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on 2021-08-13
3 |
4 | @author: wf
5 | """
6 | import getpass
7 | import json
8 | import os
9 | from unittest import TestCase
10 |
11 | from geograpy.locator import Locator
12 | from geograpy.utils import Profiler
13 |
14 |
15 | class Geograpy3Test(TestCase):
16 | """
17 | base test for geograpy 3 tests
18 | """
19 |
20 | def setUp(self, debug=False):
21 | """
22 | setUp test environment
23 | """
24 | TestCase.setUp(self)
25 | self.debug = debug
26 | msg = f"test {self._testMethodName}, debug={self.debug}"
27 | self.profile = Profiler(msg)
28 | Locator.resetInstance()
29 | locator = Locator.getInstance()
30 | locator.downloadDB()
31 | # actively test Wikidata tests?
32 | self.testWikidata = False
33 |
34 | def tearDown(self):
35 | TestCase.tearDown(self)
36 | self.profile.time()
37 |
38 | def inCI(self):
39 | """
40 | are we running in a Continuous Integration Environment?
41 | """
42 | publicCI = getpass.getuser() in ["travis", "runner"]
43 | jenkins = "JENKINS_HOME" in os.environ
44 | return publicCI or jenkins
45 |
46 | def handleWikidataException(self, ex):
47 | """
48 | handle a Wikidata exception
49 | Args:
50 | ex(Exception): the exception to handle - e.g. timeout
51 | """
52 | msg = str(ex)
53 | print(f"Wikidata test failed {msg}")
54 | # only raise exception for real problems
55 | if "HTTP Error 500" in msg:
56 | print("test can not work if server has problems")
57 | return
58 | if isinstance(ex, json.decoder.JSONDecodeError):
59 | print("potential SPARQLWrapper issue")
60 | return
61 | raise ex
62 |
--------------------------------------------------------------------------------
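
Note: an illustrative sketch of a test that reuses Geograpy3Test above; the country lookup via Locator.getCountry follows the pattern used in geograpy/places.py, and the looked-up value is an assumption.

import unittest

from geograpy.locator import Locator
from tests.basetest import Geograpy3Test


class TestExample(Geograpy3Test):
    """
    example test reusing the shared setUp (Locator reset, database download, profiling)
    """

    def testCountryLookup(self):
        locator = Locator.getInstance()
        country = locator.getCountry("Germany")
        self.assertIsNotNone(country)
        if self.debug:
            print(country.name)


if __name__ == "__main__":
    unittest.main()
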
/tests/testCachingCitiesByRegion.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on 2021-08-16
3 |
4 | @author: wf
5 | """
6 | import getpass
7 | import json
8 | import os
9 | import re
10 | import unittest
11 |
12 | from geograpy.locator import (
13 | City,
14 | CityManager,
15 | CountryManager,
16 | LocationContext,
17 | RegionManager,
18 | )
19 | from geograpy.utils import Profiler
20 | from geograpy.wikidata import Wikidata
21 | from tests.basetest import Geograpy3Test
22 |
23 |
24 | class TestCachingCitiesByRegion(Geograpy3Test):
25 | """
26 | The wikidata city query times out even on the wikidata copy in the RWTH i5 infrastructure
27 | Therefore we need to split the queries to a reasonable size so that each individual query does not time out.
28 |
29 | A query per region is done some 3000 times.
30 | The query used here works for most regions except for a few where the query needs to be modified so that it does not follow the full depth of
31 | the property
32 | "located in the administrative territorial entity" (P131)
33 | but limits its depth
34 |
35 | """
36 |
37 | def cacheRegionCities2Json(self, limit, showDone=False):
38 | # TODO - refactor to Locator/LocationContext - make available via command line
39 | wd = Wikidata()
40 | config = LocationContext.getDefaultConfig()
41 | countryManager = CountryManager(config=config)
42 | countryManager.fromCache()
43 | regionManager = RegionManager(config=config)
44 | regionManager.fromCache()
45 | regionList = regionManager.getList()
46 | total = len(regionList)
47 | cachePath = f"{config.getCachePath()}/regions"
48 | if not os.path.exists(cachePath):
49 | os.makedirs(cachePath)
50 | for index, region in enumerate(regionList):
51 | if index >= limit:
52 | break
53 | regionId = region.wikidataid
54 | msg = f"{index+1:4d}/{total:4d}:getting cities for {region.name} {region.iso} {region.wikidataid}"
55 | jsonFileName = f"{cachePath}/{region.iso}.json"
56 | if os.path.isfile(jsonFileName):
57 | if showDone:
58 | print(msg)
59 | else:
60 | try:
61 | regionCities = wd.getCitiesForRegion(regionId, msg)
62 | jsonStr = json.dumps(regionCities)
63 | with open(jsonFileName, "w") as jsonFile:
64 | jsonFile.write(jsonStr)
65 | except Exception as ex:
66 | self.handleWikidataException(ex)
67 |
68 | def testGetCitiesByRegion(self):
69 | """
70 | test counting human settlement types
71 | """
72 | if self.inCI():
73 | limit = 50
74 | elif getpass.getuser() == "wf":
75 | limit = 5000
76 | else:
77 | limit = 0
78 | self.cacheRegionCities2Json(limit=limit)
79 |
80 | def testReadCachedCitiesByRegion(self):
81 | """
82 | test reading the cached json Files
83 | """
84 | # This is to populate the cities database
85 | return
86 | config = LocationContext.getDefaultConfig()
87 | regionManager = RegionManager(config=config)
88 | regionManager.fromCache()
89 | regionByIso, _dup = regionManager.getLookup("iso")
90 | self.assertEqual(56, len(_dup))
91 | jsonFiles = CityManager.getJsonFiles(config)
92 | msg = f"reading {len(jsonFiles)} cached city by region JSON cache files"
93 | self.assertTrue(len(jsonFiles) > 2000)
94 | profiler = Profiler(msg)
95 | cityManager = CityManager(config=config)
96 | cityManager.getList().clear()
97 | for jsonFileName in jsonFiles:
98 | isoMatch = re.search(r"/([^\/]*)\.json", jsonFileName)
99 | if not isoMatch:
100 | print(f"{jsonFileName} - does not match a known region's ISO code")
101 | else:
102 | rIso = isoMatch.group(1)
103 | region = regionByIso[rIso]
104 | with open(jsonFileName) as jsonFile:
105 | cities4Region = json.load(jsonFile)
106 | for city4Region in cities4Region:
107 | city = City()
108 | city.fromDict(city4Region)
109 | # fix regionId
110 | if hasattr(city, "regionId"):
111 | city.partOfRegionId = city.regionId
112 | city.regionId = region.wikidataid
113 | cityManager.add(city)
114 | pass
115 | cityManager.store()
116 | profiler.time()
117 |
118 | def testCityFromCityStates(self):
119 | """
120 | tests if city states are queried correctly if given the region
121 | For city states the city is both region and city (in some cases also country).
122 | This test ensures that by querying for the cities of a region the city states include themselves in the result
123 | (the result for cities in city-states often includes the municipalities)
124 | """
125 | wd = Wikidata()
126 | cityStateRecords = wd.getCityStates()
127 | for cityStateRecord in cityStateRecords:
128 | regionId = cityStateRecord.get("wikidataid")
129 | regionCities = wd.getCitiesForRegion(
130 | regionId, msg=f"Query for cities in {cityStateRecord.get('name')}"
131 | )
132 | foundCities = [c.get("wikidataid") for c in regionCities]
133 | self.assertTrue(regionId in foundCities)
134 |
135 |
136 | if __name__ == "__main__":
137 | # import sys;sys.argv = ['', 'Test.testName']
138 | unittest.main()
139 |
--------------------------------------------------------------------------------
/tests/testCachingLocationLabels.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on 2021-08-17
3 |
4 | @author: th
5 | """
6 | import math
7 | import unittest
8 |
9 | from lodstorage.sql import SQLDB
10 |
11 | from geograpy.locator import CityManager, CountryManager, LocationContext, RegionManager
12 | from geograpy.wikidata import Wikidata
13 | from tests.basetest import Geograpy3Test
14 |
15 |
16 | class TestCachingLocationLabels(Geograpy3Test):
17 | """
18 | adds location label tables
19 |
20 | """
21 |
22 | def setUp(self):
23 | pass
24 |
25 | def tearDown(self):
26 | pass
27 |
28 | def testCacheLocationLabels(self):
29 | """
30 | Generates the location label tables in the SQL db for countries, regions and cities by querying wikidata for
31 | the rdfs:label and skos:altLabel of each location.
32 | A view containing all location labels is also created.
33 | """
34 | testLocationLabelExtraction = False
35 | if testLocationLabelExtraction:
36 | wd = Wikidata()
37 | config = LocationContext.getDefaultConfig()
38 | countryManager = CountryManager(config=config)
39 | regionManager = RegionManager(config=config)
40 | cityManager = CityManager(config=config)
41 | sqlDb = SQLDB(dbname=config.cacheFile, debug=self.debug)
42 | for manager in countryManager, regionManager, cityManager:
43 | manager.fromCache()
44 | wikidataIdQuery = (
45 | f"SELECT DISTINCT wikidataid FROM {manager.entityPluralName}"
46 | )
47 | wikidataIdQueryRes = sqlDb.query(wikidataIdQuery)
48 | wikidataIds = [l["wikidataid"] for l in wikidataIdQueryRes]
49 |
50 | chunkSize = 1000
51 | iterations = math.ceil(len(wikidataIds) / chunkSize)
52 | progress = 0
53 | res = []
54 | for i in range(iterations):
55 | workOnIds = wikidataIds[i * chunkSize : (i + 1) * chunkSize]
56 | progress += len(workOnIds)
57 | index = 0
58 | values = ""
59 | for location in workOnIds:
60 | spacer = " \n\t\t\t" if index % 10 == 0 else " "
61 | values += f"{spacer}wd:{wd.getWikidataId(location)}"
62 | index += 1
63 | query = self.getLablesQuery(values)
64 | res.extend(
65 | wd.query(
66 | f"Query {i}/{iterations} - Querying {manager.entityName} Labels",
67 | queryString=query,
68 | )
69 | )
70 | wd.store2DB(res, tableName=f"{manager.entityName}_labels", sqlDB=sqlDb)
71 | self.createViews(sqlDB=sqlDb)
72 |
73 | def getLablesQuery(self, wikidataIds: str):
74 | """
75 | get the query for the alternative labels for the given values
76 |
77 | wikidataIds(str): a list of wikidataids
78 | """
79 | query = (
80 | """# get alternative labels for the given wikidata
81 | PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
82 | PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
83 | PREFIX wd: <http://www.wikidata.org/entity/>
84 | SELECT DISTINCT ?wikidataid ?label ?lang
85 | WHERE{
86 | VALUES ?wikidataid { %s }
87 | ?wikidataid rdfs:label|skos:altLabel ?label
88 | BIND(lang(?label) AS ?lang)
89 | FILTER(lang(?label)="en")
90 | }"""
91 | % wikidataIds
92 | )
93 | return query
94 |
95 | def createViews(self, sqlDB):
96 | viewDDLs = [
97 | "DROP VIEW IF EXISTS location_labels",
98 | """
99 | CREATE VIEW location_labels AS
100 | SELECT *, "Country" AS "hierarchy"
101 | FROM country_labels
102 | UNION
103 | SELECT *, "Region" AS "hierarchy"
104 | FROM region_labels
105 | UNION
106 | SELECT *, "City" AS "hierarchy"
107 | FROM city_labels
108 | """,
109 | "DROP INDEX if EXISTS cityLabelByWikidataid",
110 | "CREATE INDEX cityLabelByWikidataid ON city_labels (wikidataid)",
111 | "DROP INDEX if EXISTS regionLabelByWikidataid",
112 | "CREATE INDEX regionLabelByWikidataid ON region_labels (wikidataid)",
113 | "DROP INDEX if EXISTS countryLabelByWikidataid",
114 | "CREATE INDEX countryLabelByWikidataid ON country_labels (wikidataid)",
115 | ]
116 | for viewDDL in viewDDLs:
117 | sqlDB.execute(viewDDL)
118 |
119 |
120 | if __name__ == "__main__":
121 | # import sys;sys.argv = ['', 'Test.testName']
122 | unittest.main()
123 |
--------------------------------------------------------------------------------
/tests/testLocatorDatabase.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on 16.08.2021
3 |
4 | @author: wf
5 | """
6 | import os
7 | import tempfile
8 | import unittest
9 |
10 | from lodstorage.storageconfig import StorageConfig
11 |
12 | from geograpy.locator import LocationContext, Locator
13 | from tests.basetest import Geograpy3Test
14 |
15 |
16 | class TestLocatorDatabase(Geograpy3Test):
17 | """
18 | test the locator database handling
19 | """
20 |
21 | def testLocatorWithWikiData(self):
22 | """
23 | test Locator
24 | """
25 | Locator.resetInstance()
26 | loc = Locator.getInstance()
27 | # forceUpdate=True
28 | forceUpdate = False
29 | loc.populate_db(force=forceUpdate)
30 | tableList = loc.sqlDB.getTableList()
31 | expectedCities = 800000
32 | self.assertTrue(loc.db_recordCount(tableList, "countries") >= 200)
33 | self.assertTrue(loc.db_recordCount(tableList, "regions") >= 3000)
34 | self.assertTrue(loc.db_recordCount(tableList, "cities") >= expectedCities)
35 |
36 | def testHasData(self):
37 | """
38 | check has data and populate functionality
39 | """
40 | testDownload = False
41 | if self.inCI() or testDownload:
42 | with tempfile.TemporaryDirectory() as cacheRootDir:
43 | config = StorageConfig(
44 | cacheRootDir=cacheRootDir, cacheDirName="geograpy3"
45 | )
46 | config.cacheFile = (
47 | f"{config.getCachePath()}/{LocationContext.db_filename}"
48 | )
49 | loc = Locator(storageConfig=config)
50 | if os.path.isfile(loc.db_file):
51 | os.remove(loc.db_file)
52 | # reinit sqlDB
53 | loc = Locator(storageConfig=config)
54 | self.assertFalse(loc.db_has_data())
55 | loc.populate_db()
56 | self.assertTrue(loc.db_has_data())
57 |
58 |
59 | if __name__ == "__main__":
60 | # import sys;sys.argv = ['', 'Test.testName']
61 | unittest.main()
62 |
--------------------------------------------------------------------------------
/tests/testQueries.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on 2021-08-19
3 |
4 | @author: wf
5 | """
6 | import os
7 | import re
8 | import unittest
9 |
10 | from lodstorage.query import Query, QueryManager
11 |
12 | from geograpy.locator import LocationContext, Locator
13 | from tests.basetest import Geograpy3Test
14 |
15 |
16 | class TestQueries(Geograpy3Test):
17 | """
18 | test queries for documentation, bug reports and the like
19 | """
20 |
21 | def getQueryManager(self):
22 | """
23 | get the query manager
24 | """
25 | cachedir = LocationContext.getDefaultConfig().getCachePath()
26 | scriptDir = os.path.dirname(__file__)
27 | for path in cachedir, f"{scriptDir}/../geograpy/data":
28 | qYamlFile = f"{path}/queries.yaml"
29 | if os.path.isfile(qYamlFile):
30 | qm = QueryManager(lang="sql", debug=self.debug, queriesPath=qYamlFile)
31 | return qm
32 | return None
33 |
34 | def documentQueryResult(self, query, lod, tablefmt, show=False):
35 | """
36 | document the query results
37 | """
38 | for record in lod:
39 | for key in record.keys():
40 | value = record[key]
41 | if value is not None:
42 | if isinstance(value, str):
43 | if re.match(r"Q[0-9]+", value):
44 | if tablefmt == "github":
45 | record[
46 | key
47 | ] = f"[{value}](https://www.wikidata.org/wiki/{value})"
48 | elif tablefmt == "mediawiki":
49 | record[
50 | key
51 | ] = f"[https://www.wikidata.org/wiki/{value} {value}]"
52 | doc = query.documentQueryResult(lod, tablefmt=tablefmt, floatfmt=".0f")
53 | if show:
54 | print(doc)
55 |
56 | def testQueries(self):
57 | """
58 | test preconfigured queries
59 | """
60 | qm = self.getQueryManager()
61 | self.assertIsNotNone(qm)
62 | locator = Locator.getInstance()
63 | show = self.debug
64 | # show=True
65 | for _name, query in qm.queriesByName.items():
66 | qlod = locator.sqlDB.query(query.query)
67 | for tablefmt in ["mediawiki", "github"]:
68 | self.documentQueryResult(query, qlod, tablefmt, show=show)
69 |
70 | pass
71 |
72 | def testQuery(self):
73 | """
74 | test a single query
75 | """
76 | queries = [
77 | (
78 | "LocationLabel Count",
79 | """select count(*),hierarchy
80 | from location_labels
81 | group by hierarchy""",
82 | ),
83 | ("NY example", "select * from cityLookup where label='New York City'"),
84 | (
85 | "Berlin example",
86 | "select * from cityLookup where label='Berlin' order by pop desc,regionName",
87 | ),
88 | (
89 | "Issue #25",
90 | "select * from countryLookup where label in ('France', 'Hungary', 'Poland', 'Spain', 'United Kingdom')",
91 | ),
92 | (
93 | "Issue #25 Bulgaria",
94 | "select * from cityLookup where label in ('Bulgaria','Croatia','Hungary','Czech Republic') order by pop desc,regionName",
95 | ),
96 | ]
97 | for tableName in ["countries", "regions", "cities"]:
98 | queries.append(
99 | (
100 | f"unique wikidataids for {tableName}",
101 | f"select count(distinct(wikidataid)) as wikidataids from {tableName}",
102 | )
103 | )
104 | queries.append(
105 | (
106 | f"total #records for {tableName}",
107 | f"select count(*) as recordcount from {tableName}",
108 | )
109 | )
110 | locator = Locator.getInstance()
111 | for title, queryString in queries:
112 | query = Query(name=title, query=queryString, lang="sql")
113 | qlod = locator.sqlDB.query(queryString)
114 | for tablefmt in ["mediawiki", "github"]:
115 | self.documentQueryResult(query, qlod, tablefmt, show=True)
116 |
117 |
118 | if __name__ == "__main__":
119 | # import sys;sys.argv = ['', 'Test.testName']
120 | unittest.main()
121 |
--------------------------------------------------------------------------------
/tests/test_LocationContext.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on 2021-08-13
3 |
4 | @author: wf
5 | """
6 | import tempfile
7 | import unittest
8 |
9 | from lodstorage.storageconfig import StorageConfig
10 |
11 | from geograpy.locator import (
12 | CityManager,
13 | CountryManager,
14 | LocationContext,
15 | LocationManager,
16 | RegionManager,
17 | )
18 | from tests.basetest import Geograpy3Test
19 |
20 |
21 | class TestLocationContext(Geograpy3Test):
22 | """
23 | test the location Context - these are potentially long running tests
24 | """
25 |
26 | def getStorageConfig(self):
27 | # config=StorageConfig.getDefault()
28 | config = LocationContext.getDefaultConfig()
29 | return config
30 |
31 | def checkNoDuplicateWikidataIds(
32 | self, locationManager: LocationManager, primaryKey=None, expectedDuplicates=0
33 | ):
34 | """
35 | check that there are no duplicate Wikidata Q identifiers in the given
36 |
37 | """
38 | locationsByWikiDataId, duplicates = locationManager.getLookup(primaryKey)
39 | showLimit = 10
40 | if len(duplicates) > 0:
41 | for i, duplicate in enumerate(duplicates):
42 | if i < showLimit:
43 | if self.debug:
44 | print(f"{i}:{duplicate}")
45 | else:
46 | break
47 | self.assertTrue(len(duplicates) <= expectedDuplicates)
48 | return locationsByWikiDataId
49 |
50 | def testCountryManager(self):
51 | """
52 | tests the loading and parsing of the CountryManager from the json backup file
53 | """
54 | countryManager = CountryManager(config=self.getStorageConfig())
55 | countryManager.fromCache()
56 | self.assertTrue(hasattr(countryManager, "countries"))
57 | self.assertTrue(len(countryManager.countries) >= 200)
58 | # check if the United States (Q30) is in the list
59 | countriesByWikidataId = self.checkNoDuplicateWikidataIds(
60 | countryManager, "wikidataid"
61 | )
62 | self.assertTrue("Q30" in countriesByWikidataId)
63 |
64 | def testRegionManager(self):
65 | """
66 | tests the loading and parsing of the RegionManager from the json backup file
67 | """
68 | regionManager = RegionManager(config=self.getStorageConfig())
69 | regionManager.fromCache()
70 | self.assertTrue(hasattr(regionManager, "regions"))
71 | self.assertTrue(len(regionManager.regions) >= 1000)
72 | regionsByWikidataId = self.checkNoDuplicateWikidataIds(
73 | regionManager, "wikidataid", 54
74 | )
75 | self.assertTrue("Q99" in regionsByWikidataId)
76 |
77 | def testCityManager(self):
78 | """
79 | tests the loading and parsing of the cityList from the json backup file
80 | """
81 | cityManager = CityManager(config=self.getStorageConfig())
82 | cityManager.fromCache()
83 | self.assertTrue(hasattr(cityManager, "cities"))
84 | self.assertTrue(len(cityManager.cities) >= 200000)
85 | # check if Los Angeles is in the list (popular city should always be in the list)
86 | _citiesByWikiDataIdNoDuplicates = self.checkNoDuplicateWikidataIds(
87 | cityManager, "wikidataid", 304000
88 | ) # ToDo: Reduce number of duplicates
89 | citiesByWikiDataId = cityManager.getLookup("wikidataid", withDuplicates=True)
90 | self.assertTrue("Q65" in citiesByWikiDataId)
91 |
92 | def testLocationContextFromCache(self):
93 | """
94 | test loading LocationContext from cache
95 | """
96 | testCache = False
97 | if self.inCI() or testCache:
98 | locationContext = LocationContext.fromCache()
99 | locationContext.load()
100 | self.assertTrue(len(locationContext.countries) > 180)
101 | self.assertTrue(len(locationContext.regions) > 3500)
102 | self.assertTrue(len(locationContext.cities) > 1000000)
103 |
104 | def testIssue_59_db_download(self):
105 | """
106 | tests if the cache database is downloaded if not present
107 | """
108 | with tempfile.TemporaryDirectory() as tmpdir:
109 | config = StorageConfig(cacheFile="locations.db", cacheRootDir=tmpdir)
110 | config.cacheFile = f"{config.getCachePath()}/{config.cacheFile}"
111 | loc = LocationContext.fromCache(config=config)
112 | locations = loc.locateLocation("Germany")
113 | self.assertTrue(len(locations) > 0)
114 |
115 |
116 | if __name__ == "__main__":
117 | # import sys;sys.argv = ['', 'Test.testName']
118 | unittest.main()
119 |
--------------------------------------------------------------------------------
/tests/test_extractor.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import geograpy
4 | from geograpy.extraction import Extractor
5 | from tests.basetest import Geograpy3Test
6 |
7 |
8 | class TestExtractor(Geograpy3Test):
9 | """
10 | test Extractor
11 | """
12 |
13 | def check(self, places, expectedList):
14 | """
15 | check that the places are non-empty and contain at least the expected list of
16 | elements
17 |
18 | Args:
19 | places(Places): the places to check
20 | expectedList(list): the list of elements to check
21 | """
22 | if self.debug:
23 | print(places)
24 | self.assertTrue(len(places) > 0)
25 | for expected in expectedList:
26 | self.assertTrue(expected in places)
27 |
28 | def testExtractorFromUrl(self):
29 | """
30 | test the extractor
31 | """
32 | url = "https://en.wikipedia.org/wiki/Louvre"
33 | e = Extractor(url=url)
34 | e.find_geoEntities()
35 | self.check(e.places, ["Paris", "France"])
36 |
37 | def testGeograpyIssue32(self):
38 | """
39 | test https://github.com/ushahidi/geograpy/issues/32
40 | """
41 | # do not test since url is unreliable
42 | return
43 | url = "https://www.politico.eu/article/italy-incurable-economy/"
44 | places = geograpy.get_geoPlace_context(url=url)
45 | if self.debug:
46 | print(places)
47 | self.assertSetEqual(
48 | {
49 | "Italy",
50 | "Germany",
51 | "France",
52 | "United States of America",
53 | "Belgium",
54 | "Canada",
55 | },
56 | set(places.countries),
57 | )
58 | self.assertSetEqual(
59 | {"Rome", "Brussels", "Italy", "Germany"}, set(places.cities)
60 | ) # Notes: Italy is also a city in US-NY, Germany is also a city in US-TX
61 |
62 | def testGetGeoPlace(self):
63 | """
64 | test geo place handling
65 | """
66 | # 'http://www.bbc.com/news/world-europe-26919928'
67 | # broken since 2020-10 - returns javascript instead of plain html
68 | url = "https://en.wikipedia.org/wiki/Golden_spike"
69 | places = geograpy.get_geoPlace_context(url=url)
70 | debug = self.debug
71 | # debug=True
72 | if debug:
73 | print(places)
74 | self.assertTrue("Ogden" in places.cities)
75 | self.assertTrue("Utah" in places.regions)
76 | self.assertTrue("United States of America" in places.countries)
77 |
78 | def testExtractorFromText(self):
79 | """
80 | test different texts for getting geo context information
81 | """
82 | text = """ Perfect just Perfect! It's a perfect storm for Nairobi on a
83 | Friday evening! horrible traffic here is your cue to become worse @Ma3Route """
84 |
85 | e2 = Extractor(text=text)
86 | e2.find_entities()
87 | self.check(e2.places, ["Nairobi"])
88 |
89 | text3 = """ Risks of Cycling in Nairobi:http://www.globalsiteplans.com/environmental-design/engineering-environmental-design/the-risky-affair-of-cycling-in-nairobi-kenya/ ... via @ConstantCap @KideroEvans @county_nairobi @NrbCity_Traffic """
90 | e3 = Extractor(text=text3)
91 | e3.find_entities()
92 | self.check(e3.places, ["Nairobi"])
93 |
94 | text4 = """ @DurbanSharks [Africa Renewal]It is early morning in Nairobi, the Kenyan capital. The traffic jam along Ngong """
95 | e4 = Extractor(text=text4)
96 | e4.find_entities()
97 | self.check(e4.places, ["Nairobi", "Ngong"])
98 |
99 | # unicode
100 | text5 = """ There is a city called New York in the United States."""
101 | e5 = Extractor(text=text5)
102 | e5.find_entities()
103 | self.check(e5.places, ["New York", "United States"])
104 |
105 | # unicode and two words
106 | text6 = """ There is a city called São Paulo in Brazil."""
107 | e6 = Extractor(text=text6)
108 | e6.find_entities()
109 | self.check(e6.places, ["São Paulo"])
110 |
111 | def testIssue7(self):
112 | """
113 | test https://github.com/somnathrakshit/geograpy3/issues/7
114 | disambiguating countries
115 | """
116 | localities = [
117 | "Vienna, Illinois,",
118 | "Paris, Texas",
119 | "Zaragoza, Spain",
120 | "Vienna, Austria",
121 | ]
122 | expected = [
123 | {"iso": "US"},
124 | {"iso": "US"},
125 | {"iso": "ES"},
126 | {"iso": "AT"},
127 | ]
128 | for index, locality in enumerate(localities):
129 | city = geograpy.locateCity(locality, debug=False)
130 | if self.debug:
131 | print(f" {city}")
132 | self.assertEqual(expected[index]["iso"], city.country.iso)
133 |
134 | def testIssue10(self):
135 | """
136 | test https://github.com/somnathrakshit/geograpy3/issues/10
137 | Add ISO country code
138 | """
139 | localities = [
140 | "Singapore",
141 | "Beijing, China",
142 | "Paris, France",
143 | "Barcelona, Spain",
144 | "Rome, Italy",
145 | "San Francisco, US",
146 | "Bangkok, Thailand",
147 | "Vienna, Austria",
148 | "Athens, Greece",
149 | "Shanghai, China",
150 | ]
151 | expectedCountry = ["SG", "CN", "FR", "ES", "IT", "US", "TH", "AT", "GR", "CN"]
152 | debug = self.debug
153 | for index, locality in enumerate(localities):
154 | city = geograpy.locateCity(locality)
155 | if debug:
156 | print(" %s" % city)
157 | self.assertEqual(expectedCountry[index], city.country.iso)
158 |
159 | def testIssue9(self):
160 | """
161 | test https://github.com/somnathrakshit/geograpy3/issues/9
162 | [BUG]AttributeError: 'NoneType' object has no attribute 'name' on "Pristina, Kosovo"
163 | """
164 | locality = "Pristina, Kosovo"
165 | gp = geograpy.get_geoPlace_context(text=locality)
166 | if self.debug:
167 | print(" %s" % gp.countries)
168 | print(" %s" % gp.regions)
169 | print(" %s" % gp.cities)
170 |
171 | def testStackoverflow62152428(self):
172 | """
173 | see https://stackoverflow.com/questions/62152428/extracting-country-information-from-description-using-geograpy?noredirect=1#comment112899776_62152428
174 | """
175 | examples = {
176 | 2: "Socialist Republic of Alachua",
177 | 3: "Hérault, France",
178 | 4: "Gwalior, India",
179 | 5: "Zaragoza,España",
180 | 6: "Zaragoza, Spain",
181 | 7: "amsterdam ",
182 | 8: "Evesham",
183 | 9: "Rochdale",
184 | }
185 | for index, text in examples.items():
186 | places = geograpy.get_geoPlace_context(text=text)
187 | if self.debug:
188 | print("example %d: %s" % (index, places.countries))
189 |
190 | def testStackoverflow43322567(self):
191 | """
192 | see https://stackoverflow.com/questions/43322567
193 | """
194 | url = "https://en.wikipedia.org/wiki/U.S._state"
195 | e = Extractor(url=url)
196 | places = e.find_geoEntities()
197 | self.check(places, ["Alabama", "Virginia", "New York"])
198 | if self.debug:
199 | print(places)
200 |
201 | def testStackoverflow54712198(self):
202 | """
203 | see https://stackoverflow.com/questions/54712198/not-only-extracting-places-from-a-text-but-also-other-names-in-geograpypython
204 | """
205 | text = """Opposition Leader Mahinda Rajapaksa says that the whole public administration has collapsed due to the constitution council’s arbitrary actions. The Opposition Leader said so in response to a query a journalised raised after a meeting held..."""
206 | e = Extractor(text)
207 | places = e.find_geoEntities()
208 | if self.debug:
209 | print(places)
210 | self.assertEqual([], places)
211 |
212 | def testStackoverflow54077973(self):
213 | """
214 | see https://stackoverflow.com/questions/54077973/geograpy3-library-for-extracting-the-locations-in-the-text-gives-unicodedecodee
215 | """
216 | address = "Jersey City New Jersey 07306"
217 | e = Extractor(text=address)
218 | e.find_entities()
219 | self.check(e.places, ["Jersey", "City"])
220 |
221 | def testStackOverflow54721435(self):
222 | """
223 | see https://stackoverflow.com/questions/54721435/unable-to-extract-city-names-from-a-text-using-geograpypython
224 | """
225 | text = "I live in Kadawatha a suburb of Colombo Sri Lanka"
226 | e = Extractor(text=text)
227 | e.find_entities()
228 | if self.debug:
229 | print(e.places)
230 |
231 | def testStackoverflow55548116(self):
232 | """
233 | see https://stackoverflow.com/questions/55548116/geograpy3-library-is-not-working-properly-and-give-traceback-error
234 | """
235 | feedContent = ["Las Vegas is a city in Nevada"]
236 | placesInFeed = []
237 |
238 | for content in feedContent:
239 | if content != "":
240 | e = Extractor(text=content)
241 | e.find_entities()
242 | places = e.places
243 | if self.debug:
244 | print(places)
245 | placesInFeed.append(places)
246 |
247 |
248 | if __name__ == "__main__":
249 | unittest.main()
250 |
--------------------------------------------------------------------------------
/tests/test_location.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on 2021-06-09
3 |
4 | @author: wf
5 | """
6 | import unittest
7 | from math import radians
8 |
9 | import numpy as np
10 | from sklearn.neighbors import BallTree
11 |
12 | from geograpy.locator import (
13 | CityManager,
14 | Country,
15 | CountryManager,
16 | LocationContext,
17 | LocationManager,
18 | Locator,
19 | RegionManager,
20 | )
21 | from tests.basetest import Geograpy3Test
22 |
23 |
24 | class TestLocationHierarchy(Geograpy3Test):
25 | """
26 | tests for the location hierarchy
27 | """
28 |
29 | def setUp(self):
30 | super().setUp()
31 | self.locationContext = None
32 | pass
33 |
34 | def getLocationContext(self):
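   | """
   | get a LocationContext instance, creating it lazily and caching it for reuse
   | """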
35 | if self.locationContext is None:
36 | self.locationContext = LocationContext.fromCache()
37 | return self.locationContext
38 |
39 | def testDistance(self):
40 | """
41 | test calculating the distance of two points using the haversine function
42 | """
43 | # https://stackoverflow.com/a/64585765/1497139
44 | earth_radius = 6371000 # mean earth radius in meters
45 | test_radius = 1300000 # meters
46 |
47 | test_points = [[32.027240, -81.093190], [41.981876, -87.969982]]
48 | test_points_rad = np.array(
49 | [[radians(x[0]), radians(x[1])] for x in test_points]
50 | )
51 |
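   | # BallTree's haversine metric expects [lat, lon] pairs in radians and returns
   | # unit-sphere distances, so the query radius is passed as meters / earth_radius
   | # and the resulting distances are scaled back by earth_radius below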
52 | tree = BallTree(test_points_rad, metric="haversine")
53 | ind, results = tree.query_radius(
54 | test_points_rad, r=test_radius / earth_radius, return_distance=True
55 | )
56 | if self.debug:
57 | print(ind)
58 | print(results * earth_radius / 1000)
59 |
60 | def testIssue45_BallTree(self):
61 | """
62 | test calculating a ball tree for a given list of locations
63 | """
64 | countryList = CountryManager.fromErdem()
65 | ballTree, validList = countryList.getBallTuple()
66 | self.assertEqual(245, len(validList))
67 | self.assertEqual("BallTree", type(ballTree).__name__)
68 | self.assertAlmostEqual(245, ballTree.sum_weight, delta=0.1)
69 | pass
70 |
71 | def checkLocationListWithDistances(
72 | self,
73 | locationListWithDistances,
74 | expectedCount,
75 | expectedClosest,
76 | expectedDistance,
77 | ):
78 | """
79 | check the location list with the given distances
80 | """
81 | if self.debug:
82 | for i, locationWithDistance in enumerate(locationListWithDistances):
83 | location, distance = locationWithDistance
84 | print(f"{i}:{location}-{distance:.0f} km")
85 | self.assertEqual(len(locationListWithDistances), expectedCount)
86 | closestLocation, distance = locationListWithDistances[0]
87 | self.assertEqual(expectedClosest, closestLocation.name)
88 | self.assertAlmostEqual(expectedDistance, distance, delta=1)
89 |
90 | def testClosestLocation(self):
91 | """
92 | test getting the closest location to a given location
93 | """
94 | # sample Country: Germany
95 | country = Country()
96 | country.name = "Germany"
97 | country.lat = 51.0
98 | country.lon = 9.0
99 | # get a country list
100 | lookupCountryManager = CountryManager.fromErdem()
101 | # get the closest 2 locations for the given countryList
102 | countryListWithDistances = country.getNClosestLocations(lookupCountryManager, 2)
103 | self.checkLocationListWithDistances(
104 | countryListWithDistances, 2, "Luxembourg", 244
105 | )
106 |
107 | countryListWithDistances = country.getLocationsWithinRadius(
108 | lookupCountryManager, 300
109 | )
110 | self.checkLocationListWithDistances(
111 | countryListWithDistances, 2, "Luxembourg", 244
112 | )
113 |
114 | def testRegionMatching(self):
115 | """
116 | test region matches
117 | """
118 | locator = Locator()
119 | if not locator.db_has_data():
120 | locator.populate_db()
121 | countryList = CountryManager.fromErdem()
122 | config = LocationContext.getDefaultConfig()
123 | regionManager = RegionManager(config=config)
124 | regionManager.fromCache()
125 | for country in countryList.countries:
126 | locationListWithDistances = country.getNClosestLocations(regionManager, 3)
127 | if self.debug:
128 | print(f"{country}{country.lat:.2f},{country.lon:.2f}")
129 | for i, locationWithDistance in enumerate(locationListWithDistances):
130 | location, distance = locationWithDistance
131 | if self.debug:
132 | print(f" {i}:{location}-{distance:.0f} km")
133 | pass
134 |
135 | def testLocationListLoading(self):
136 | """
137 | test loading the locations from Json
138 | """
139 | samples = """
140 | {
141 | "countries": [
142 | {
143 | "name": "Afghanistan",
144 | "wikidataid": "Q889",
145 | "lat": 34,
146 | "lon": 66,
147 | "coordinates": "34,66",
148 | "partOf": null,
149 | "level": 3,
150 | "locationKind": "Country",
151 | "comment": null,
152 | "iso": "AF"
153 | },
154 | {
155 | "name": "United States of America",
156 | "wikidataid": "Q30",
157 | "lat": 39.82818,
158 | "lon": -98.5795,
159 | "partOf": "Noth America",
160 | "level": 3,
161 | "locationKind": "Country",
162 | "comment": null,
163 | "labels": [
164 | "America",
165 | "UNITED STATES OF AMERICA",
166 | "USA",
167 | "United States",
168 | "United States of America (the)"
169 | ],
170 | "iso": "US"
171 | },
172 | {
173 | "name": "Australia",
174 | "wikidataid": "Q408",
175 | "lat": -28,
176 | "lon": 137,
177 | "coordinates": "-28,137",
178 | "partOf": null,
179 | "level": 3,
180 | "locationKind": "Country",
181 | "comment": null,
182 | "labels": [
183 | "AUS"
184 | ],
185 | "iso": "AU"
186 | }
187 | ]
188 | }
189 | """
190 | cm = CountryManager()
191 | cm.restoreFromJsonStr(samples)
192 | countriesByWikiDataId, _dup = cm.getLookup("wikidataid")
193 | self.assertTrue("Q30" in countriesByWikiDataId)
194 |
195 | def test_getLocationByID(self):
196 | """
197 | tests if the correct location for a given wikidataid is returned
198 | """
199 | config = LocationContext.getDefaultConfig()
200 | countryManager = CountryManager(config=config)
201 | countryManager.fromCache()
202 | country = countryManager.getLocationByID("Q30") # wikidataid of USA
203 | self.assertIsNotNone(country)
204 | self.assertTrue(hasattr(country, "iso"))
205 | self.assertEqual(country.iso, "US")
206 |
207 | def test_LocationContext(self):
208 | """
209 | tests the LocationContext class
210 | """
211 |
212 | # test interlinking of city with region and country
213 | locationContext = self.getLocationContext()
214 | cities = locationContext.cityManager.getByName("Los Angeles")
215 | la = [x for x in cities if x.wikidataid == "Q65"][0]
216 | self.assertEqual(la.name, "Los Angeles")
217 | ca = la.region
218 | self.assertEqual(ca.name, "California")
219 | us = la.country
220 | self.assertEqual(us.wikidataid, "Q30")
221 | self.assertEqual(la.country, ca.country)
222 |
223 | def testLocateLocation(self):
224 | """
225 | test LocationContext locateLocation
226 | """
227 | exampleLocations = {
228 | "Washington, DC, USA": "Q61",
229 | "Bangalore": "Q1355",
230 | "Bangalore, India": "Q1355",
231 | "Xi'an": "Q5826",
232 | "Xi'an, China": "Q5826",
233 | "Virtual Event USA": "Q30",
234 | "Virtual USA": "Q30",
235 | "London United Kingdom": "Q84",
236 | "Brno": "Q14960",
237 | "Cancun": "Q8969",
238 | "St. Petersburg": "Q656",
239 | "Gothenburg Sweden": "Q25287",
240 | "Los Angeles California": "Q65",
241 | "Zurich, Switzerland": "Q72",
242 | "Barcelona Spain": "Q1492",
243 | "Vienna Austria": "Q1741",
244 | "Seoul Republic of Korea": "Q8684",
245 | "Seattle WA USA": "Q5083",
246 | "Singapore Singapore": "Q334",
247 | "Tokyo Japan": "Q1490",
248 | "Vancouver BC Canada": "Q24639",
249 | "Vancouver British Columbia Canada": "Q24639",
250 | "Amsterdam Netherlands": "Q727",
251 | "Paris France": "Q90",
252 | "Nagoya": "Q11751",
253 | "Marrakech": "Q101625",
254 | "Austin Texas": "Q16559",
255 | "Chicago IL USA": "Q1297",
256 | "Bangkok Thailand": "Q1861",
257 | "Firenze, Italy": "Q2044",
258 | "Florence Italy": "Q2044",
259 | "Timisoara": "Q83404",
260 | "Langkawi": "Q273303",
261 | "Beijing China": "Q956",
262 | "Berlin Germany": "Q64",
263 | "Prague Czech Republic": "Q1085",
264 | "Portland Oregon USA": "Q6106",
265 | "Portland OR USA": "Q6106",
266 | "Pittsburgh PA USA": "Q1342",
267 | "Новосибирск": "Q883",
268 | "Los Angeles CA USA": "Q65",
269 | "Kyoto Japan": "Q34600",
270 | }
271 | locationContext = self.getLocationContext()
272 | printPretty = lambda records: print([str(record) for record in records])
273 | failures = []
274 | for locationText in exampleLocations.keys():
275 | expectedLocationId = exampleLocations[locationText]
276 | locations = locationContext.locateLocation(locationText, verbose=True)
277 | if len(locations) < 1:
278 | failures.append(locationText)
279 | else:
280 | location = locations[0]
281 | if self.debug:
282 | printPretty(location)
283 | if not location.wikidataid == expectedLocationId:
284 | failures.append(locationText)
285 | showFailures = True
286 | if self.debug or showFailures:
287 | print(f"locationLooup failed for {failures}")
288 | self.assertTrue(len(failures) <= 40)
289 |
290 | def testLocateLocationCountryRegionCity(self):
291 | """
292 | test LocationContext locateLocation with country, region and city parts
293 | """
294 | locationContext = self.getLocationContext()
295 | printPretty = lambda records: print([str(record) for record in records])
296 |
297 | pl1 = locationContext.locateLocation("Berlin", "USA")
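   | # the lookup ranks candidates by population (cf. issue 15), so Berlin, Germany
   | # is expected to win over the much smaller US cities of the same name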
298 | self.assertEqual("Germany", pl1[0].country.name)
299 | if self.debug:
300 | printPretty(pl1)
301 | pl2 = locationContext.locateLocation("Los Angeles, CA")
302 | if self.debug:
303 | printPretty(pl2)
304 | self.assertEqual("California", pl2[0].region.name)
305 | pl3 = locationContext.locateLocation("Germany, Aachen")
306 | if self.debug:
307 | printPretty(pl3)
308 | self.assertEqual("Aachen", pl3[0].name)
309 | self.assertEqual("Germany", pl3[0].country.name)
310 |
311 |
312 | if __name__ == "__main__":
313 | # import sys;sys.argv = ['', 'Test.testName']
314 | unittest.main()
315 |
--------------------------------------------------------------------------------
/tests/test_locator.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on 2020-09-19
3 |
4 | @author: wf
5 | """
6 | import getpass
7 | import os.path
8 | import re
9 | import tempfile
10 | import unittest
11 | from collections import Counter
12 | from pathlib import Path
13 |
14 | from lodstorage.storageconfig import StorageConfig
15 | from lodstorage.uml import UML
16 |
17 | import geograpy
18 | from geograpy.locator import City, CountryManager, Location, LocationContext, Locator
19 | from tests.basetest import Geograpy3Test
20 |
21 |
22 | class TestLocator(Geograpy3Test):
23 | """
24 | test the Locator class from the location module
25 | """
26 |
27 | def lookupQuery(self, viewName, whereClause):
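   | """
   | query the given lookup view with the given where clause, restricted to
   | records that have a population and ordered by population (descending)
   | """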
28 | loc = Locator.getInstance()
29 | queryString = f"SELECT * FROM {viewName} where {whereClause} AND pop is not NULL ORDER by pop desc"
30 | lookupRecords = loc.sqlDB.query(queryString)
31 | return lookupRecords
32 |
33 | def checkExpected(self, lod, expected):
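   | """
   | check that for every (name, minimumPopulation) pair in expected there is
   | a record in lod with that name and a population above the minimum
   | """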
34 | emap = {}
35 | found = {}
36 | for key, value in expected:
37 | emap[key] = value
38 | for record in lod:
39 | name = record["name"]
40 | pop = record["pop"]
41 | if name in emap and pop > emap[name]:
42 | found[name] = record
43 | if self.debug:
44 | print(f"{name}:{pop:.0f}")
45 |
46 | self.assertEqual(len(found), len(emap))
47 |
48 | def testHasViews(self):
49 | """
50 | test that the views are available
51 | """
52 | loc = Locator.getInstance()
53 | viewsMap = loc.sqlDB.getTableDict(tableType="view")
54 | for view in ["CityLookup", "RegionLookup", "CountryLookup"]:
55 | self.assertTrue(view in viewsMap)
56 |
57 | def testCityLookup(self):
58 | """
59 | test the cityLookup to city/region/country object cluster
60 | """
61 | cityLookupRecords = self.lookupQuery(
62 | "CityLookup", "label in ('Berlin','Paris','Athens','Singapore')"
63 | )
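   | # expected (name, minimum population) pairs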
64 | expected = [
65 | ("Berlin", 3644000),
66 | ("Paris", 2175000),
67 | ("Athens", 600000),
68 | ("Singapore", 5800000),
69 | ]
70 | self.checkExpected(cityLookupRecords, expected)
71 |
72 | def testRegionLookup(self):
73 | """
74 | test region Lookup
75 | """
76 | regionLookupRecords = self.lookupQuery("RegionLookup", "label in ('CA')")
77 | expected = [("California", 39000000)]
78 | self.checkExpected(regionLookupRecords, expected)
79 |
80 | def testCountryLookup(self):
81 | """
82 | test country Lookup
83 | """
84 | # self.debug=True
85 | countryLookupRecords = self.lookupQuery("CountryLookup", "label in ('CA')")
86 | expected = [("Canada", 37000000)]
87 | self.checkExpected(countryLookupRecords, expected)
88 |
89 | def testIsoRegexp(self):
90 | """
91 | test regular expression for iso codes
92 | """
93 | loc = Locator.getInstance()
94 | self.assertFalse(loc.isISO("Singapore"))
95 |
96 | query = """
97 | select distinct iso from countries
98 | union
99 | select distinct iso from regions
100 | """
101 | loc.populate_db()
102 | isocodeRecords = loc.sqlDB.query(query)
103 | for isocodeRecord in isocodeRecords:
104 | isocode = isocodeRecord["iso"]
105 | if isocode:
106 | isIso = loc.isISO(isocode)
107 | if not isIso and self.debug:
108 | print(isocode)
109 | self.assertTrue(isIso)
110 |
111 | def testWordCount(self):
112 | """
113 | test the word count
114 | """
115 | loc = Locator.getInstance()
116 | query = "SELECT name from CITIES"
117 | nameRecords = loc.sqlDB.query(query)
118 | if self.debug:
119 | print("testWordCount: found %d names" % len(nameRecords))
120 | wc = Counter()
121 | for nameRecord in nameRecords:
122 | name = nameRecord["name"]
123 | words = re.split(r"\W+", name)
124 | wc[len(words)] += 1
125 | if self.debug:
126 | print("most common 20: %s" % wc.most_common(20))
127 |
128 | def testUML(self):
129 | """
130 | test creating a plantUml diagram of the geograpy database tables
131 | """
132 | Locator.resetInstance()
133 | loc = Locator.getInstance()
134 | loc.populate_db()
135 | user = getpass.getuser()
136 | if self.debug:
137 | print("current user is %s" % user)
138 | tableList = loc.sqlDB.getTableList()
139 | uml = UML()
140 | title = """geograpy Tables
141 | 2021-08-13
142 | [[https://github.com/somnathrakshit/geograpy3 © 2020-2021 geograpy3 project]]"""
143 | plantUml = uml.tableListToPlantUml(
144 | tableList, title=title, packageName="geograpy3"
145 | )
146 | showUml = True
147 | if showUml or self.debug:
148 | print(plantUml)
149 |
150 | def checkExamples(self, examples, countries, debug=False, check=True):
151 | """
152 |
153 | check that the given examples give results in the given countries
154 | Args:
155 | examples(list): a list of example location strings
156 | countries(list): a list of expected country iso codes
157 | """
158 | for index, example in enumerate(examples):
159 | city = geograpy.locateCity(example, debug=debug)
160 | if self.debug:
161 | print("%3d: %22s->%s" % (index, example, city))
162 | if check:
163 | self.assertEqual(countries[index], city.country.iso)
164 |
165 | def testGetCountry(self):
166 | """
167 | test getting a country by name or ISO
168 | """
169 | locator = Locator()
170 | debug = True
171 | examples = [
172 | ("DE", "Germany"),
173 | ("US", "United States of America"),
174 | ("USA", None),
175 | ]
176 | for name, expectedName in examples:
177 | country = locator.getCountry(name)
178 | if debug:
179 | print(country)
180 | if expectedName is None:
181 | self.assertIsNone(country)
182 | else:
183 | self.assertIsNotNone(country)
184 | self.assertEqual(expectedName, country.name)
185 |
186 | def testIssue15(self):
187 | """
188 | https://github.com/somnathrakshit/geograpy3/issues/15
189 | test Issue 15 Disambiguate via population, gdp data
190 | """
191 | examples = ["Paris", "Vienna", "Berlin"]
192 | countries = ["FR", "AT", "DE"]
193 | self.checkExamples(examples, countries)
194 | pass
195 |
196 | def testIssue17(self):
197 | """
198 | test issue 17:
199 |
200 | https://github.com/somnathrakshit/geograpy3/issues/17
201 |
202 | [BUG] San Francisco, USA and Auckland, New Zealand should be locatable #17
203 | """
204 | examples = ["San Francisco, USA", "Auckland, New Zealand"]
205 | countries = ["US", "NZ"]
206 | self.checkExamples(examples, countries)
207 |
208 | def testIssue19(self):
209 | """
210 | test issue 19
211 | """
212 | examples = ["Puebla City, Mexico", "Newcastle, UK", "San Juan, Puerto Rico"]
213 | countries = ["MX", "GB", "US"]
214 | # For Puerto Rico two ISO codes exist: one as a country and one as a US region (see https://en.wikipedia.org/wiki/Puerto_Rico); in the dataset it is recognized as a US region
215 | self.checkExamples(examples, countries)
216 |
217 | def testStackOverflow64379688(self):
218 | """
219 | compare old and new geograpy interface
220 | """
221 | examples = [
222 | "John Doe 160 Huntington Terrace Newark, New York 07112 United States of America",
223 | "John Doe 30 Huntington Terrace Newark, New York 07112 USA",
224 | "John Doe 22 Huntington Terrace Newark, New York 07112 US",
225 | "Mario Bianchi, Via Nazionale 256, 00148 Roma (RM) Italia",
226 | "Mario Bianchi, Via Nazionale 256, 00148 Roma (RM) Italy",
227 | "Newark",
228 | "Rome",
229 | ]
230 | for example in examples:
231 | city = geograpy.locateCity(example, debug=False)
232 | if self.debug:
233 | print(city)
234 |
235 | def testStackOverflow64418919(self):
236 | """
237 | https://stackoverflow.com/questions/64418919/problem-retrieving-region-in-us-with-geograpy3
238 | """
239 | examples = ["Seattle"]
240 | for example in examples:
241 | city = geograpy.locateCity(example, debug=False)
242 | print(city)
243 |
244 | def testProceedingsExample(self):
245 | """
246 | test a proceedings title Example
247 | """
248 | examples = [
249 | """Proceedings of the
250 | IEEE 14th International Conference on
251 | Semantic Computing, ICSC 2020,
252 | San Diego, CA, USA,
253 | February 3-5, 2020"""
254 | ]
255 | for example in examples:
256 | places = geograpy.get_place_context(text=example)
257 | if self.debug:
258 | print(places)
259 | city = geograpy.locateCity(example, debug=False)
260 | if self.debug:
261 | print(city)
262 |
263 | def testDelimiters(self):
264 | """
265 | test the delimiter statistics for names
266 | """
267 | loc = Locator.getInstance()
268 | loc.populate_db()
269 |
270 | ddls = [
271 | "DROP VIEW IF EXISTS allNames",
272 | """CREATE VIEW allNames as select name from countries
273 | union select name from regions
274 | union select name from cities""",
275 | ]
276 | for ddl in ddls:
277 | loc.sqlDB.execute(ddl)
278 | query = "SELECT name from allNames"
279 | nameRecords = loc.sqlDB.query(query)
280 | show = self.debug
281 | show = True
282 | if show:
283 | print("found %d name records" % len(nameRecords))
284 | ordC = Counter()
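   | # count characters with a code point below "A" (spaces, digits, punctuation) -
   | # these are the potential delimiters within location names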
285 | for nameRecord in nameRecords:
286 | name = nameRecord["name"]
287 | for char in name:
288 | code = ord(char)
289 | if code < ord("A"):
290 | ordC[code] += 1
291 | for index, countT in enumerate(ordC.most_common(10)):
292 | code, count = countT
293 | if show:
294 | print("%d: %d %s -> %d" % (index, code, chr(code), count))
295 |
296 | def testIssue22(self):
297 | """
298 | https://github.com/somnathrakshit/geograpy3/issues/22
299 | """
300 | url = "https://en.wikipedia.org/wiki/2012_Summer_Olympics_torch_relay"
301 | places = geograpy.get_geoPlace_context(url=url)
302 | if self.debug:
303 | print(places)
304 | self.assertTrue(len(places.countries) > 5)
305 | self.assertTrue(len(places.regions) > 5)
306 | self.assertTrue(len(places.cities) > 20)
307 |
308 | def testExamples(self):
309 | """
310 | test examples
311 | """
312 | examples = [
313 | "Paris, US-TX",
314 | "Amsterdam, Netherlands",
315 | "Vienna, Austria",
316 | "Vienna, Illinois, US",
317 | "Paris, Texas",
318 | "Austin, TX",
319 | "Austin, Texas",
320 | ]
321 | countries = ["US", "NL", "AT", "US", "US", "US", "US"]
322 | self.checkExamples(examples, countries, debug=False)
323 |
324 | def testIssue41_CountriesFromErdem(self):
325 | """
326 | test getting Country list from Erdem
327 |
328 | """
329 | countryList = CountryManager.fromErdem()
330 | self.assertEqual(247, len(countryList.countries))
331 | if self.debug:
332 | for country in countryList.countries:
333 | print(country)
334 |
335 | def testIssue_42_distance(self):
336 | """
337 | test haversine and location
338 | """
339 | loc1 = Location()
340 | loc1.lat = 0
341 | loc1.lon = 0
342 | loc2 = Location()
343 | loc2.lat = 90
344 | loc2.lon = 0
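   | # pole to equator along a meridian is a quarter of the earth's circumference:
   | # 2 * pi * 6371 km / 4 ≈ 10007.5 km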
345 | d = loc1.distance(loc2)
346 | # self.debug=True
347 | if self.debug:
348 | print(d)
349 | self.assertAlmostEqual(10007.54, d, delta=0.1)
350 |
351 | def testIssue_59_db_download(self):
352 | """
353 | tests the correct downloading of the backup database in different configurations
354 | """
355 |
356 | def getConfig(tmpdir: str):
357 | config = StorageConfig(
358 | cacheFile="locations.db",
359 | cacheDirName="geograpyTest",
360 | cacheRootDir=tmpdir,
361 | )
362 | config.cacheFile = f"{config.getCachePath()}/{config.cacheFile}"
363 | return config
364 |
365 | def downloadAndTestDB(
366 | config: StorageConfig, loc: Locator = None, forceUpdate: bool = False
367 | ):
368 | """downloads and tests the downloaded db"""
369 | if loc is None:
370 | loc = Locator(storageConfig=config)
371 | loc.downloadDB(forceUpdate=forceUpdate)
372 | self.assertTrue(os.path.exists(config.cacheFile))
373 | self.assertTrue(loc.db_has_data())
374 | return loc
375 |
376 | # test downloading with no file in dir
377 | with tempfile.TemporaryDirectory() as tmpdir:
378 | config = getConfig(tmpdir)
379 | downloadAndTestDB(config)
380 |
381 | # test downloading with empty file in dir
382 | with tempfile.TemporaryDirectory() as tmpdir:
383 | config = getConfig(tmpdir)
384 | Path(config.cacheFile).touch() # create empty file
385 | loc = downloadAndTestDB(config)
386 |
387 | # test downloading with forceUpdate
388 | # drop an important table to check if it is restored
389 | loc.sqlDB.execute("DROP TABLE countries")
390 | self.assertFalse(loc.db_has_data())
391 | downloadAndTestDB(config, loc=loc, forceUpdate=True)
392 |
393 |
394 | if __name__ == "__main__":
395 | # import sys;sys.argv = ['', 'Test.testName']
396 | unittest.main()
397 |
--------------------------------------------------------------------------------
/tests/test_nominatim.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on 2021-08-20
3 |
4 | @author: wf
5 | """
6 |
7 | from geograpy.nominatim import NominatimWrapper
8 | from tests.basetest import Geograpy3Test
9 |
10 |
11 | class TestGeopy(Geograpy3Test):
12 | """
13 | test geopy and other nominatim handlers
14 | """
15 |
16 | def testNominatim(self):
17 | """
18 | test nominatim results - especially the extra tags
19 | """
20 | if self.inCI():
21 | return
22 | examples = [
23 | {"city": "London", "q": "Q84", "expected": "England"},
24 | {"city": "Dublin", "q": "Q1761", "expected": "Ireland"},
25 | {"city": "Vienna Austria", "q": "Q1741", "expected": "Österreich"},
26 | {
27 | "city": "Athens, Georgia",
28 | "q": "Q203263",
29 | "expected": "Athens-Clarke County",
30 | },
31 | # inconsistent results - 2021-12-27
32 | # {
33 | # "city":"St. Petersburg",
34 | # "q": "Q656",
35 | # "expected": "Санкт-Петербург"
36 | # },
37 | {
38 | # so for St. Petersburg we need to be more specific
39 | "city": "St. Petersburg, Russia",
40 | "q": "Q656",
41 | # to get the russian one
42 | "expected": "Санкт-Петербург",
43 | },
44 | # inconsistent results Q49279759 - 2023-09-29
45 | # {
46 | # "city":"Arlington, VA",
47 | # "q": "Q107126",
48 | # "expected": "Virginia"
49 | # }
50 | {"city": "Saint Petersburg, FL", "q": "Q49236", "expected": "Florida"},
51 | ]
52 |
53 | nw = NominatimWrapper()
54 | show = self.debug
55 | # show=True
56 | if show:
57 | print(nw.cacheDir)
58 | for example in examples:
59 | city = example["city"]
60 | location = nw.geolocator.geocode(city)
61 | wikidataId = nw.lookupWikiDataId(city)
62 | q = example["q"]
63 | expected = example["expected"]
64 | if show:
65 | print(
66 | f"{city:<22}:{str(wikidataId):<7}/{str(q):<7}:{location}→{expected}"
67 | )
68 | self.assertEqual(str(q), str(wikidataId))
69 | self.assertTrue(expected in str(location), f"{location}→{expected}")
70 | pass
71 |
--------------------------------------------------------------------------------
/tests/test_places.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import geograpy
4 | from geograpy.locator import Locator
5 | from geograpy.places import PlaceContext
6 | from tests.basetest import Geograpy3Test
7 |
8 |
9 | class TestPlaces(Geograpy3Test):
10 | """
11 | test Places
12 | """
13 |
14 | def setUp(self):
15 | super().setUp(debug=False)
16 | Locator.resetInstance()
17 | pass
18 |
19 | def testIssue25(self):
20 | """
21 | https://github.com/somnathrakshit/geograpy3/issues/25
22 | """
23 | pc = PlaceContext(
24 | place_names=["Bulgaria", "Croatia", "Czech Republic", "Hungary"]
25 | )
26 | if self.debug:
27 | print(pc.countries)
28 |
29 | def testGetRegionNames(self):
30 | """
31 | test getting region names
32 | """
33 | pc = PlaceContext(place_names=["Berlin"])
34 | regions = pc.getRegions("Germany")
35 | self.assertEqual(16, len(regions))
36 | for region in regions:
37 | if self.debug:
38 | print(region)
39 | self.assertTrue(region.iso.startswith("DE"))
40 | regionNames = pc.get_region_names("Germany")
41 | self.assertEqual(16, len(regionNames))
42 | if self.debug:
43 | print(regionNames)
44 |
45 | def testPlaces(self):
46 | """
47 | test places
48 | """
49 | pc = PlaceContext(["Ngong", "Nairobi", "Kenya"], setAll=False)
50 | pc.setAll()
51 |
52 | if self.debug:
53 | print(pc)
54 |
55 | # Ngong is a city in Cameroon and Kenya
56 | self.assertEqual(2, len(pc.countries))
57 | self.assertTrue("Kenya" in pc.countries)
58 | self.assertEqual(2, len(pc.cities))
59 | cityNames = ["Nairobi", "Ohio", "Amsterdam"]
60 | countries = ["Kenya", "United States of America", "Netherlands"]
61 | for index, cityName in enumerate(cityNames):
62 | cities = pc.cities_for_name(cityName)
63 | country = cities[0].country
64 | self.assertEqual(countries[index], country.name)
65 |
66 | pc = PlaceContext(["Mumbai"])
67 | if self.debug:
68 | print(pc)
69 |
70 | def testIssue49(self):
71 | """
72 | country recognition
73 | """
74 | show = self.debug
75 | texts = ["United Kingdom", "UK", "Great Britain", "GB", "United States"]
76 | expected = [
77 | "United Kingdom",
78 | "United Kingdom",
79 | "United Kingdom",
80 | "United Kingdom",
81 | "United States of America",
82 | ]
83 | if show:
84 | print("lookup with geograpy.get_geoPlace_context")
85 | for text in texts:
86 | countries = geograpy.get_geoPlace_context(text=text).countries
87 | if show:
88 | print(f"{text}:{countries}")
89 | if show:
90 | print("lookup with PlaceContext")
91 | for i, text in enumerate(texts):
92 | pc = PlaceContext([text])
93 | pc.set_countries()
94 | if show:
95 | print(f"{text}:{pc.countries}")
96 | self.assertEqual([expected[i]], pc.countries)
97 |
98 |
99 | if __name__ == "__main__":
100 | unittest.main()
101 |
--------------------------------------------------------------------------------
/tests/test_wikidata.py:
--------------------------------------------------------------------------------
1 | """
2 | Created on 2020-09-23
3 |
4 | @author: wf
5 | """
6 | import getpass
7 | import unittest
8 |
9 | from lodstorage.sql import SQLDB
10 | from lodstorage.storageconfig import StorageConfig
11 |
12 | from geograpy.locator import Country
13 | from geograpy.wikidata import Wikidata
14 | from tests.basetest import Geograpy3Test
15 |
16 |
17 | class TestWikidata(Geograpy3Test):
18 | """
19 | test the wikidata access for cities
20 | """
21 |
22 | def testWikidataCountries(self):
23 | """
24 | test getting country information from wikidata
25 | """
26 | wikidata = Wikidata()
27 | try:
28 | countryList = wikidata.getCountries()
29 | self.assertTrue(len(countryList) >= 200)
30 | expectedAttrs = Country.getSamples()[0].keys()
31 | for country in countryList:
32 | if self.debug:
33 | print(country)
34 | for attr in expectedAttrs:
35 | self.assertTrue(hasattr(country, attr))
36 | except Exception as ex:
37 | self.handleWikidataException(ex)
38 | pass
39 |
40 | def testWikidataRegions(self):
41 | """
42 | test getting region information from wikidata
43 | """
44 | wikidata = Wikidata()
45 | try:
46 | regionList = wikidata.getRegions()
47 | self.assertTrue(len(regionList) >= 3000)
48 | except Exception as ex:
49 | self.handleWikidataException(ex)
50 | pass
51 |
52 | def testWikidataCities(self):
53 | """
54 | test getting city information from wikidata
55 |
56 | """
57 | # Wikidata timeouts in the CI environment need to be avoided
58 | if getpass.getuser() != "wf":
59 | return
60 | config = StorageConfig.getSQL(debug=self.debug)
61 | config.cacheRootDir = "/tmp/wdhs"
62 | cachedir = config.getCachePath()
63 | config.cacheFile = f"{cachedir}/hs.db"
64 | # use 2018 wikidata copy
65 | # wikidata.endpoint="http://blazegraph.bitplan.com/sparql"
66 | # use 2020 wikidata copy
67 | wikidata = Wikidata()
68 | wikidata.endpoint = "https://confident.dbis.rwth-aachen.de/jena/wdhs/sparql"
69 | # wikidata.endpoint="http://jena.bitplan.com/wdhs/sparql"
70 | regions = [
71 | {"name": "Singapore", "country": "Q334", "region": None, "cities": 46},
72 | {"name": "Beijing", "country": None, "region": "Q956", "cities": 25},
73 | {"name": "Paris", "country": None, "region": "Q13917", "cities": 1242},
74 | {"name": "Barcelona", "country": None, "region": "Q5705", "cities": 1242},
75 | {"name": "Rome", "country": None, "region": "Q1282", "cities": 1242},
76 | ]
77 | limit = 1000000 # if self.inCI() else 100
78 | cityList = wikidata.getCities(limit=limit)
79 | sqlDB = SQLDB(config.cacheFile)
80 | entityInfo = sqlDB.createTable(cityList, "hs", withDrop=True)
81 | sqlDB.store(cityList, entityInfo, fixNone=True)
82 | expected = 200000 # if self.inCI() else limit
83 | self.assertTrue(len(cityList) >= expected)
84 | # for region in regions:
85 | # starttime=time.time()
86 | # regionName=region["name"]
87 | # print(f"searching cities for {regionName}" )
88 | # cityList=wikidata.getCities(country=region["country"], region=region["region"])
89 | # print("Found %d cities for %s in %5.1f s" % (len(cityList),region["name"],time.time()-starttime))
90 | # if self.debug:
91 | # print(cityList[:10])
92 | # #self.assertEqual(region['cities'],len(cityList))
93 | # pass
94 |
95 | def testWikidataCityStates(self):
96 | """
97 | test getting city state information from wikidata
98 | """
99 | wikidata = Wikidata()
100 | try:
101 | regionList = wikidata.getCityStates()
102 | self.assertTrue(len(regionList) >= 2)
103 | cityStateNames = [r.get("name") for r in regionList]
104 | self.assertTrue("Singapore" in cityStateNames)
105 | except Exception as ex:
106 | self.handleWikidataException(ex)
107 | pass
108 |
109 | def testGetWikidataId(self):
110 | """
111 | test getting a wikiDataId from a given URL
112 | """
113 | # test entity
114 | wikidataURL = "https://www.wikidata.org/wiki/Q1"
115 | expectedID = "Q1"
116 | wikiDataId = Wikidata.getWikidataId(wikidataURL)
117 | self.assertEqual(wikiDataId, expectedID)
118 | # test property
119 | wikidataURLProperty = "https://www.wikidata.org/wiki/Property:P31"
120 | expectedPropertyID = "P31"
121 | propertyId = Wikidata.getWikidataId(wikidataURLProperty)
122 | self.assertEqual(expectedPropertyID, propertyId)
123 | # test invalid entries
124 | wikidataURLProperty = ""
125 | parsedId = Wikidata.getWikidataId(wikidataURLProperty)
126 | self.assertIsNone(parsedId)
127 |
128 | def testGetCoordinateComponents(self):
129 | """
130 | test the splitting of coordinate components in WikiData query results
131 | """
132 | cList = [
133 | {
134 | "coordinate": "Point(-118.25 35.05694444)",
135 | "expected": (-118.25, 35.05694444),
136 | }
137 | ]
138 | for c in cList:
139 | coordinate = c["coordinate"]
140 | expLat, expLon = c["expected"]
141 | lon, lat = Wikidata.getCoordinateComponents(coordinate)
142 | self.assertEqual(expLat, lat)
143 | self.assertEqual(expLon, lon)
144 |
145 |
146 | if __name__ == "__main__":
147 | # import sys;sys.argv = ['', 'Test.testName']
148 | unittest.main()
149 |
--------------------------------------------------------------------------------