├── .github
│   ├── dependabot.yml
│   └── workflows
│       └── main.yml
├── .gitignore
├── CONTRIBUTING.org
├── LICENSE
├── README.org
├── build.py
├── csv_reconcile
│   ├── __init__.py
│   ├── __main__.py
│   ├── db.py
│   ├── default_settings.py
│   ├── extend.py
│   ├── initdb.py
│   ├── preview.py
│   ├── schema.sql
│   ├── score.py
│   └── scorer.py
├── csv_reconcile_dice
│   ├── __init__.py
│   ├── cutils.pyx
│   └── utils.py
├── noxfile.py
├── poetry.lock
├── pyproject.toml
├── pytest.ini
├── sample
│   ├── progressives.tsv
│   ├── reps.tsv
│   └── sample.cfg
└── tests
    ├── __init__.py
    ├── conftest.py
    ├── main
    │   └── test_csv_reconcile.py
    ├── plugins
    │   ├── __init__.py
    │   └── geo
    │       ├── conftest.py
    │       └── test_geo_reconcile.py
    └── utils.py
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 | 
 4 |   # Maintain pip dependencies
 5 |   - package-ecosystem: "pip"
 6 |     directory: "/"
 7 |     schedule:
 8 |       interval: "daily"
 9 |     target-branch: "develop"
10 |     labels:
11 |       - "pip dependencies"
12 | 
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | # Basic testing taken from [[https://github.com/marketplace/actions/install-poetry-action#mtesting]]
 2 | name: test
 3 | 
 4 | on: [pull_request, workflow_dispatch]
 5 | 
 6 | jobs:
 7 |   test:
 8 |     strategy:
 9 |       fail-fast: true
10 |       matrix:
11 |         os: [ "ubuntu-latest", "macos-latest" ]
12 |         python-version: [ "3.9", "3.10", "3.11", "3.12" ]
13 |     runs-on: ${{ matrix.os }}
14 |     steps:
15 |       #----------------------------------------------
16 |       #  check-out repo and set-up python
17 |       #----------------------------------------------
18 |       - name: Check out repository
19 |         uses: actions/checkout@v3
20 |       - name: Set up python ${{ matrix.python-version }}
21 |         uses: actions/setup-python@v3
22 |         with:
23 |           python-version: ${{ matrix.python-version }}
24 |       #----------------------------------------------
25 |       #  install poethepoet
26 |       #----------------------------------------------
27 |       - name: Install poethepoet
28 |         run: python -m pip install poethepoet
29 |       #----------------------------------------------
30 |       #  ----- install & configure poetry -----
31 |       #----------------------------------------------
32 |       - name: Install Poetry
33 |         uses: snok/install-poetry@v1
34 |         with:
35 |           virtualenvs-in-project: true
36 |       #----------------------------------------------
37 |       #  load cached venv if cache exists
38 |       #----------------------------------------------
39 |       - name: Load cached venv
40 |         id: cached-poetry-dependencies
41 |         uses: actions/cache@v3
42 |         with:
43 |           path: .venv
44 |           key: venv-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}
45 |       #----------------------------------------------
46 |       #  dummy doc
47 |       #----------------------------------------------
48 |       - name: Dummy doc
49 |         run: poe dummydoc
50 |       #----------------------------------------------
51 |       #  install dependencies if cache does not exist
52 |       #----------------------------------------------
53 |       - name: Install dependencies
54 |         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
55 |         run: poetry install --no-interaction --no-root
56 |       #----------------------------------------------
57 |       #  install your root project, if required
58 |       #----------------------------------------------
59 |       - name: Install library
60 |         run: poetry install --no-interaction
61 |       #----------------------------------------------
62 |       #  install plugins (for tests)
#---------------------------------------------- 64 | - name: Install plugins 65 | run: poetry run python -m pip install csv-reconcile-geo 66 | #---------------------------------------------- 67 | # add matrix specifics and run test suite 68 | #---------------------------------------------- 69 | - name: Run tests 70 | run: | 71 | poetry run pytest tests/ 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | -------------------------------------------------------------------------------- /CONTRIBUTING.org: -------------------------------------------------------------------------------- 1 | * Contributing code 2 | Pull requests are most definitely appreciated and encouraged. 
Please open an issue before
 3 | contributing and add the comment "Fixes #" to your commits to [[https://github.blog/2013-01-22-closing-issues-via-commit-messages/][automatically close
 4 | the issue.]]
 5 | 
 6 | Also, we've started using the [[https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow][git flow workflow]] for releases. If you're not familiar with it, it
 7 | basically amounts to pull requests being accepted only on the ~develop~ branch, or preferably on a
 8 | branch of ~develop~ named following the pattern ~feature/~. The ~git-flow~
 9 | extension can help with managing these branches but is not required. See the linked documentation
10 | for more information.
11 | 
12 | * Adding plugins
13 | Per the documentation, ~csv-reconcile~ automatically detects plugins that are installed as ~Python~
14 | packages so long as they register an ~entry-point~ in their ~setup.py~. Thus no changes to
15 | ~csv-reconcile~ should be necessary to make use of your plugin.
16 | 
17 | If you would like to share your plugin, however, it might be helpful to add it to this project's
18 | wiki.
19 | 
20 | * Suggestions for enhancement
21 | Suggestions are always welcome. Please open an issue so we can discuss feasibility and how the
22 | suggestion fits in with the overall plan for the project.
23 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 gitonthescene
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.org:
--------------------------------------------------------------------------------
 1 | #+OPTIONS: ^:nil
 2 | * CSV Reconcile
 3 | A [[https://github.com/reconciliation-api/specs][reconciliation service]] for [[https://openrefine.org/][OpenRefine]] based on a CSV file, similar to [[http://okfnlabs.org/reconcile-csv/][reconcile-csv]]. This one is written in Python and has some more configurability.
 4 | 
 5 | ** Quick start
 6 | - Clone this repository
 7 | - Run the service
 8 |   : $ python -m venv venv                # create virtualenv
 9 |   : $ venv/bin/pip install csv-reconcile # install package
10 |   : $ source venv/bin/activate           # activate virtual environment
11 |   : (venv) $ csv-reconcile init sample/reps.tsv item itemLabel # initialize the service
12 |   : (venv) $ csv-reconcile serve         # run the service
13 |   : (venv) $ deactivate                  # remove virtual environment
14 | 
15 | The service runs at http://127.0.0.1:5000/reconcile. You can point it at a different host:port by
16 | adding [[https://flask.palletsprojects.com/en/0.12.x/config/][SERVER_NAME]] to the sample.cfg. Since this is running from a virtualenv, you can simply
17 | delete the whole lot to clean up.
18 | 
19 | If you have a C compiler installed, you may prefer to install the sdist
20 | ~dist/csv-reconcile-0.1.0.tar.gz~, which will build a [[https://cython.readthedocs.io/en/latest/][Cython]] version of the computationally
21 | intensive fuzzy match routine for speed. With ~pip~ add the option ~--no-binary csv-reconcile~.
22 | 
23 | ** Poetry
24 | *** Prerequisites
25 | You'll need to have both [[https://python-poetry.org/docs/][poetry]] and [[https://pypi.org/project/poethepoet/0.0.3/][poethepoet]] installed. For publishing to [[https://pypi.org/][PyPI]], [[https://pandoc.org/][pandoc]] is required.
26 | 
27 | *** Running
28 | This is packaged with [[https://python-poetry.org/docs/][poetry]], so you can use those commands if you have it installed.
29 | : $ poe install
30 | : $ poetry run csv-reconcile init sample/reps.tsv item itemLabel
31 | : $ poetry run csv-reconcile serve
32 | 
33 | *** Building
34 | Because this package uses a ~README.org~ file and ~pip~ requires a ~README.md~, there are extra
35 | build steps beyond what ~poetry~ supplies. These are managed using [[https://pypi.org/project/poethepoet/0.0.3/][poethepoet]]. Thus building is
36 | done as follows:
37 | 
38 | : $ poe build
39 | 
40 | If you want to build a platform-agnostic wheel, you'll have to comment out the ~build =
41 | "build.py"~ line from ~pyproject.toml~ until ~poetry~ supports [[https://github.com/python-poetry/poetry/issues/3594][selecting the build platform]].
42 | 
43 | ** Description
44 | 
45 | This reconciliation service uses [[https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient][Dice coefficient scoring]] to reconcile values against a given column
46 | in a [[https://en.wikipedia.org/wiki/Comma-separated_values][CSV]] file. The CSV file must contain a column of distinct values to reconcile to.
47 | We'll call this the /id column/. We'll call the column being reconciled against the /name column/.
48 | 
49 | For performance reasons, the /name column/ is preprocessed to normalized values which are stored
50 | in an [[https://www.sqlite.org/index.html][sqlite]] database. This database must be initialized at least once by running the init
51 | sub-command. Once initialized, it need not be re-run for subsequent launches of the service.
52 | 
53 | Note that the service supplies all its data with a dummy /type/, so there is no reason to reconcile
54 | against any particular /type/.
55 | 
56 | In addition to reconciling against the /name column/, the service also functions as a [[https://reconciliation-api.github.io/specs/latest/#data-extension-service][data extension
57 | service]], which offers any of the other columns of the CSV file.
58 | 
59 | Note that Dice coefficient scoring is agnostic to word ordering.
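
To make this concrete, here is a small worked example of the scoring. It is a sketch using
the bigram helpers from the ~csv_reconcile_dice~ package in this repository (the pure
~Python~ scorer; the [[https://cython.readthedocs.io/en/latest/][Cython]] version is equivalent but operates on encoded bytes):

#+begin_src python
from csv_reconcile_dice import makeBigrams
from csv_reconcile_dice.utils import getDiceCoefficient

# makeBigrams() normalizes its input and returns a sorted, deduplicated
# string of bigrams.
left = makeBigrams('Lake View')    # bigrams: ' v','ak','e ','ew','ie','ke','la','vi'
right = makeBigrams('View Lake')   # bigrams: ' l','ak','ew','ie','ke','la','vi','w '

# Six of the eight bigrams are shared; only those spanning the word
# boundary differ, so the score is 2*6/(8+8) = 75%.
print(getDiceCoefficient(left, right))                     # 75.0
print(getDiceCoefficient(left, makeBigrams('lake view')))  # 100.0
#+end_src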
60 | 
61 | ** Usage
62 | 
63 | Basic usage involves two steps:
64 | - initialization
65 | - running the service
66 | 
67 | Initialization primes the database with data processed from the CSV file via the ~init~ subcommand.
68 | There are several options for running the service, as described below.
69 | 
70 | *** Initialization
71 | 
72 | Basic usage of the ~init~ sub-command requires passing the name of the CSV file, the /id column/
73 | and the /name column/.
74 | 
75 | : (venv) $ csv-reconcile --help
76 | : Usage: csv-reconcile [OPTIONS] COMMAND [ARGS]...
77 | : 
78 | : Options:
79 | :   --help  Show this message and exit.
80 | : 
81 | : Commands:
82 | :   init
83 | :   run
84 | :   serve
85 | : (venv) $ csv-reconcile init --help
86 | : Usage: csv-reconcile init [OPTIONS] CSVFILE IDCOL NAMECOL
87 | : 
88 | : Options:
89 | :   --config TEXT  config file
90 | :   --scorer TEXT  scoring plugin to use
91 | :   --help         Show this message and exit.
92 | : (venv) $ csv-reconcile serve --help
93 | : Usage: csv-reconcile serve [OPTIONS]
94 | : 
95 | : Options:
96 | :   --help  Show this message and exit.
97 | : (venv) $
98 | 
99 | The ~--config~ option is used to point to a configuration file. The file is a [[https://flask.palletsprojects.com/en/1.1.x/config/][Flask
100 | configuration]] and hence is Python code, though most configuration is simply setting variables to
101 | constant values.
102 | 
103 | *** Running the service
104 | The simplest way to run the service is to use Flask's built-in web server with the ~serve~
105 | subcommand, which takes no arguments. However, as mentioned in the [[https://flask.palletsprojects.com/en/2.0.x/deploying/][Flask documentation]], this
106 | server is not suitable for production purposes.
107 | 
108 | For a more hardened service, you can use one of the other deployment options mentioned in that
109 | documentation. For example, gunicorn can be run as follows:
110 | 
111 | : (venv) $ gunicorn -w 4 'csv_reconcile:create_app()'
112 | : [2021-11-16 17:40:20 +0900] [84625] [INFO] Starting gunicorn 20.1.0
113 | : [2021-11-16 17:40:20 +0900] [84625] [INFO] Listening at: http://127.0.0.1:8000 (84625)
114 | : [2021-11-16 17:40:20 +0900] [84625] [INFO] Using worker: sync
115 | : [2021-11-16 17:40:20 +0900] [84626] [INFO] Booting worker with pid: 84626
116 | : [2021-11-16 17:40:20 +0900] [84627] [INFO] Booting worker with pid: 84627
117 | : [2021-11-16 17:40:20 +0900] [84628] [INFO] Booting worker with pid: 84628
118 | : [2021-11-16 17:40:20 +0900] [84629] [INFO] Booting worker with pid: 84629
119 | : ...
120 | 
121 | One thing to watch out for is that the default manifest points the extension service to port
122 | 5000, the default port for the Flask built-in web server. If you want to use the extension
123 | service when deploying to a different port, you'll want to be sure to override that part of the
124 | manifest in your config file. You'll need something like the following:
125 | 
126 | : MANIFEST = {
127 | :     "extend": {
128 | :         "propose_properties": {
129 | :             "service_url": "http://localhost:8000",
130 | :             "service_path": "/properties"
131 | :         }
132 | :     }
133 | : }
134 | 
135 | Note also that the configuration is saved during the ~init~ step. If you change the config,
136 | you'll need to re-run that step. You may also need to delete and re-add the service in
137 | OpenRefine.
138 | 
139 | *** Deprecated
140 | The ~run~ subcommand mimics the old behavior, which combined the initialization step with the
141 | running of the service. It may be removed in a future release.
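
However you run it, you can sanity-check the service from the command line. A plain ~GET~
on the endpoint returns the service manifest, and a reconciliation query batch is posted
as a ~queries~ form field holding JSON. The query below assumes the sample ~reps.tsv~ data
from the quick start; the ids and scores in the response sketch are placeholders:

: $ curl http://127.0.0.1:5000/reconcile
: $ curl -d 'queries={"q0": {"query": "Abraham Lincoln", "limit": 3}}' \
:        http://127.0.0.1:5000/reconcile
: {"q0": {"result": [{"id": "...", "match": false, "name": "...", "score": ...}, ...]}}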
142 | 
143 | ** Common configuration
144 | - ~SERVER_NAME~ - The host and port the service is bound to.
145 |   e.g. ~SERVER_NAME=localhost:5555~. ( Default localhost:5000 )
146 | - ~CSVKWARGS~ - Arguments to pass to [[https://docs.python.org/3/library/csv.html][csv.reader]].
147 |   e.g. ~CSVKWARGS={'delimiter': ',', 'quotechar': '"'}~ for comma-delimited files using ~"~ as the quote character.
148 | - ~CSVENCODING~ - Encoding of the CSV file.
149 |   e.g. ~CSVENCODING="utf-8-sig"~ is the encoding used for data downloaded from [[https://www.usgs.gov/core-science-systems/ngp/board-on-geographic-names/download-gnis-data][GNIS]].
150 | - ~SCOREOPTIONS~ - Options passed to the scoring plugin during normalization.
151 |   e.g. ~SCOREOPTIONS={'stopwords':['lake','reservoir']}~
152 | - ~LIMIT~ - The maximum number of reconciliation candidates returned per entry. ( Default 10 )
153 |   e.g. ~LIMIT=10~
154 | - ~THRESHOLD~ - The minimum score for returned reconciliation candidates. ( Default 30.0 )
155 |   e.g. ~THRESHOLD=80.5~
156 | - ~DATABASE~ - The name of the generated sqlite database containing pre-processed values. (Default ~csvreconcile.db~)
157 |   e.g. ~DATABASE='lakes.db'~ You may want to change the database name if you regularly switch between datasets.
158 | - ~MANIFEST~ - Overrides for the service manifest.
159 |   e.g. ~MANIFEST={"name": "My service"}~ sets the name of the service to "My service".
160 | 
161 | This last option is the most interesting. If your data is coming from [[https://www.wikidata.org][Wikidata]] and your /id column/
162 | contains [[https://www.wikidata.org/wiki/Help:Items][Q values]], then a manifest like the following will allow your links to be clickable inside OpenRefine.
163 | 
164 | #+begin_src python
165 | MANIFEST = {
166 |     "identifierSpace": "http://www.wikidata.org/entity/",
167 |     "schemaSpace": "http://www.wikidata.org/prop/direct/",
168 |     "view": {"url":"https://www.wikidata.org/wiki/{{id}}"},
169 |     "name": "My reconciliation service"
170 | }
171 | #+end_src
172 | 
173 | If your CSV is made up of data taken from another [[https://reconciliation-api.github.io/testbench/][reconciliation service]], you may similarly copy
174 | parts of their manifest to make use of their features, such as the [[https://reconciliation-api.github.io/specs/latest/#preview-service][preview service]]. See the
175 | reconciliation spec for details.
176 | 
177 | ** Built-in preview service
178 | There is a preview service built into the tool. (Thanks [[https://github.com/b2m][b2m]]!) You can turn it on by adding the
179 | following to your manifest:
180 | 
181 | #+begin_src python
182 | "preview": {
183 |     "url": "http://localhost:5000/preview/{{id}}",
184 |     "width": 400,
185 |     "height": 300
186 | }
187 | #+end_src
188 | 
189 | Note that if you reconcile against a service with a preview service enabled, a link to the
190 | service becomes part of the project. Thus if you bring the service down, your project will have
191 | hover-over pop-ups pointing to an unavailable service. One way around this is to copy the
192 | ~recon.match.id~ to a new column, which can be re-reconciled to the column by id if you bring the
193 | service back up again, whether or not you have the preview service enabled. (Perhaps OpenRefine
194 | could be smarter and enable these pop-ups only when the service is active.)
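
Pulling these options together, a config file is just a short piece of ~Python~. Something
like the following (illustrative values only, reusing the examples above) could be passed
via ~csv-reconcile init --config myconfig.cfg~:

#+begin_src python
SERVER_NAME = 'localhost:5555'
CSVKWARGS = {'delimiter': ',', 'quotechar': '"'}
CSVENCODING = 'utf-8-sig'
SCOREOPTIONS = {'stopwords': ['lake', 'reservoir']}
LIMIT = 10
THRESHOLD = 80.5
DATABASE = 'lakes.db'
MANIFEST = {
    "name": "Lakes reconciliation service",
    "view": {"url": "https://www.wikidata.org/wiki/{{id}}"},
    "preview": {
        "url": "http://localhost:5555/preview/{{id}}",
        "width": 400,
        "height": 300
    }
}
#+end_src

Remember that the configuration is copied during ~init~, so re-run that step after editing it.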
195 | 
196 | ** Scoring plugins
197 | As mentioned above, the default scoring method is [[https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient][Dice coefficient scoring]], but this method
198 | can be overridden by implementing a ~csv_reconcile.scorers~ plugin.
199 | 
200 | *** Implementing
201 | A plugin module may override any of the methods in the ~csv_reconcile.scorer~ module by simply
202 | implementing a method of the same name with the decorator ~@csv_reconcile.scorer.register~.
203 | 
204 | See ~csv_reconcile_dice~ for how Dice coefficient scoring is implemented.
205 | 
206 | The basic hooks are as follows:
207 | - ~normalizeWord(word, **scoreOptions)~ preprocesses values to be reconciled to produce a tuple
208 |   used in fuzzy match scoring. The value of ~SCOREOPTIONS~ in the configuration will be passed
209 |   in to allow configuration of this preprocessing. This hook is required.
210 | - ~normalizeRow(word, row, **scoreOptions)~ preprocesses values to be reconciled against to
211 |   produce a tuple used in fuzzy match scoring. Note that both the reconciled column and the
212 |   entire row are available for calculating the normalized value and that the column reconciled
213 |   against is required even when not used. The value of ~SCOREOPTIONS~ in the configuration will
214 |   be passed in to allow configuration of this preprocessing. This defaults to calling
215 |   ~normalizeWord(word, **scoreOptions)~.
216 | - ~getNormalizedFields()~ returns a tuple of names for the columns produced by ~normalizeWord()~
217 |   and ~normalizeRow()~. The length of their return values must match the length of this tuple.
218 |   This hook is required.
219 | - ~processScoreOptions(options)~ is passed the value of ~SCOREOPTIONS~ to allow it to be adjusted
220 |   prior to being used. This can be used for adding defaults and/or validating the configuration.
221 |   This hook is optional.
222 | - ~scoreMatch(left, right, **scoreOptions)~ gets passed two normalized tuples. The ~left~ value
223 |   comes from ~normalizeWord()~ and is the value being reconciled; the ~right~ value comes from
224 |   ~normalizeRow()~ and is the value being reconciled against. The value of ~SCOREOPTIONS~ in the
225 |   configuration will be passed in to allow configuration of the scoring. Returning a score of
226 |   ~None~ will not add the tested value as a candidate. This hook is required.
227 | - ~valid(normalizedFields)~ is passed the normalized tuple prior to being scored to make sure
228 |   it's appropriate for the calculation. This hook is optional.
229 | - ~features(word, row, **scoreOptions)~ calculates [[https://reconciliation-api.github.io/specs/latest/#reconciliation-query-responses][features]] using the query string and the
230 |   normalized row. By default calculating features is disabled. Implementations of this hook are
231 |   automatically enabled. This hook is optional.
232 | (A minimal sketch of a plugin implementing these hooks appears at the end of this README.)
233 | *** Installing
234 | Hooks are automatically discovered as long as they provide a ~csv_reconcile.scorers~ [[https://setuptools.readthedocs.io/en/latest/userguide/entry_point.html][setuptools
235 | entry point]]. Poetry supplies a [[https://python-poetry.org/docs/pyproject/#plugins][plugins]] configuration which wraps the setuptools functionality.
236 | 
237 | The default Dice coefficient scoring is supplied via the following snippet from the
238 | ~pyproject.toml~ file.
239 | 
240 | : [tool.poetry.plugins."csv_reconcile.scorers"]
241 | : "dice" = "csv_reconcile_dice"
242 | 
243 | Here ~dice~ becomes the name of the scoring option and ~csv_reconcile_dice~ is the package
244 | implementing the plugin.
245 | 
246 | *** Using
247 | If there is only one scoring plugin available, that plugin is used. If more than one is
248 | available, you will be prompted to pass the ~--scorer~ option to select among the scoring options.
249 | 
250 | *** Known plugins
251 | See the [[https://github.com/gitonthescene/csv-reconcile/wiki][wiki]] for a list of known plugins.
252 | 
253 | ** Testing
254 | Though I long for the old days when a unit test was a unit test, these days things are a bit more
255 | complicated with various versions of ~Python~ and installation of plugins to manage. Now we have
256 | to wrestle with [[https://docs.python.org/3/tutorial/venv.html][virtual environments]]. ~poetry~ handles the virtual environment for developing,
257 | but testing involves covering more options.
258 | 
259 | *** Tests layout
260 | The tests directory structure is the following:
261 | 
262 | : tests
263 | :   main
264 | :   plugins
265 | :     geo
266 | 
267 | Tests for the main package are found under ~main~ and don't require installing any other
268 | packages, whereas tests under ~plugins~ require the installation of the given plugin.
269 | 
270 | *** Running tests
271 | **** Basic tests
272 | These tests are written with [[https://docs.pytest.org/en/6.2.x/contents.html][pytest]] and can be run through ~poetry~ as follows:
273 | 
274 | : $ poetry run pytest
275 | 
276 | To avoid the complications that come from installing plugins, there is a ~poe~ script for
277 | running only the tests under ~main~, which can be invoked as follows:
278 | 
279 | : $ poe test
280 | 
281 | For steady-state development this is probably the command you'll use most often.
282 | 
283 | **** Build matrices
284 | The GitHub Actions for this project currently use a [[https://docs.github.com/en/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix][build matrix]] across a couple of
285 | architectures and several versions of ~Python~, but a similar effect can be achieved using [[https://nox.thea.codes/en/stable/tutorial.html][nox]].
286 | 
287 | ~nox~ manages the creation of various virtual environments in what it calls "sessions", from
288 | which various commands can be run. This project's ~noxfile.py~ manages the installation of the
289 | ~csv-reconcile-geo~ plugin for the plugin tests as well as running across several versions of
290 | ~Python~. See the ~nox~ documentation for details.
291 | 
292 | Some versions of this command you're likely to run are as follows:
293 | 
294 | : $ nox                      # Run all the tests building virtual environments from scratch
295 | : $ nox -r                   # Reuse previously built virtual environments for speed
296 | : $ nox -s test_geo          # Run only the tests for the csv-reconcile-geo plugin
297 | : $ nox -s test_main -p 3.8  # Run only the main tests with Python 3.8
298 | 
299 | Eventually, the GitHub Actions may be changed to use [[https://github.com/marketplace/actions/setup-nox][setup-nox]].
300 | 
301 | ** Future enhancements
302 | 
303 | It would be nice to add support for using [[https://reconciliation-api.github.io/specs/latest/#structure-of-a-reconciliation-query][properties]] as part of the scoring, so that more than
304 | one column of the CSV could be taken into consideration.
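
As a concrete illustration of the hooks described under "Scoring plugins", here is a
minimal sketch of an exact-match scorer. The package name ~csv_reconcile_exact~ and its
contents are hypothetical; only the hook names, the ~@scorer.register~ decorator and the
~csv_reconcile.scorers~ entry-point group come from this project:

#+begin_src python
# csv_reconcile_exact/__init__.py (hypothetical plugin package)
from csv_reconcile import scorer


@scorer.register
def getNormalizedFields():
    # One column in the reconcile table per element of the normalized tuple
    return ('folded',)


@scorer.register
def normalizeWord(word, **scoreOptions):
    # Must return a tuple matching getNormalizedFields() in length;
    # normalizeRow() defaults to this same normalization
    return (word.strip().lower(),)


@scorer.register
def scoreMatch(left, right, **scoreOptions):
    # left comes from normalizeWord(), right from normalizeRow();
    # returning None drops the row as a candidate altogether
    return 100.0 if left[0] == right[0] else None
#+end_src

Such a plugin would be registered in its own ~pyproject.toml~ in the same way as ~dice~:

: [tool.poetry.plugins."csv_reconcile.scorers"]
: "exact" = "csv_reconcile_exact"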
305 | -------------------------------------------------------------------------------- /build.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # See if Cython is installed 4 | try: 5 | from Cython.Build import cythonize 6 | # Do nothing if Cython is not available 7 | except ImportError: 8 | # Got to provide this function. Otherwise, poetry will fail 9 | def build(setup_kwargs): 10 | pass 11 | 12 | 13 | # Cython is installed. Compile 14 | else: 15 | from setuptools import Extension 16 | from setuptools.dist import Distribution 17 | from distutils.command.build_ext import build_ext 18 | 19 | # This function will be executed in setup.py: 20 | def build(setup_kwargs): 21 | # The file you want to compile 22 | extensions = ["csv_reconcile_dice/cutils.pyx"] 23 | 24 | # gcc arguments hack: enable optimizations 25 | os.environ['CFLAGS'] = '-O3' 26 | 27 | # Build 28 | setup_kwargs.update({ 29 | 'ext_modules': 30 | cythonize( 31 | extensions, 32 | language_level=3, 33 | compiler_directives={'linetrace': True}, 34 | ), 35 | 'cmdclass': { 36 | 'build_ext': build_ext 37 | } 38 | }) 39 | -------------------------------------------------------------------------------- /csv_reconcile/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os.path 3 | from pathlib import Path 4 | import sys 5 | import time 6 | import shutil 7 | from contextlib import contextmanager 8 | 9 | import click 10 | from flask import abort, Flask, jsonify, request 11 | from flask_cors import cross_origin 12 | from markupsafe import escape 13 | 14 | from . import default_settings, initdb, scorer 15 | from .db import get_db, getCSVCols 16 | from .extend import processDataExtensionBatch 17 | from .preview import getEntity 18 | from .score import processQueryBatch 19 | 20 | try: 21 | import importlib_metadata as metadata 22 | except: 23 | from importlib import metadata 24 | 25 | __version__ = '0.3.2' 26 | #------------------------------------------------------------------ 27 | # Implement reconciliation API 28 | # [[https://reconciliation-api.github.io/specs/latest/]] 29 | #------------------------------------------------------------------ 30 | 31 | 32 | @contextmanager 33 | def Timer(): 34 | t = time.perf_counter() 35 | print("start timer", flush=True) 36 | yield 37 | elapsed = time.perf_counter() - t 38 | print("Elapsed: %s" % (elapsed,)) 39 | 40 | 41 | # Default manifest. 
Can be overridden/updated in configuration
 42 | MANIFEST = {
 43 |     "versions": ["0.1"],
 44 |     "name": "CSV Reconcile",
 45 |     "identifierSpace": "http://localhost/csv_reconcile/ids",
 46 |     "schemaSpace": "http://localhost/csv_reconcile/schema",
 47 |     "extend": {
 48 |         "propose_properties": {
 49 |             "service_url": "http://localhost:5000",
 50 |             "service_path": "/properties"
 51 |         }
 52 |     }
 53 | }
 54 | 
 55 | 
 56 | def create_app(config=None, instance_path=None, scorerOption=None):
 57 |     app = Flask("csv-reconcile", instance_path=instance_path)
 58 | 
 59 |     instance_path = Path(app.instance_path)
 60 | 
 61 |     try:
 62 |         os.makedirs(instance_path)
 63 |     except OSError:
 64 |         pass
 65 | 
 66 |     scorerfile = instance_path / 'scorer.txt'
 67 | 
 68 |     # clean up old files if they exist
 69 |     # "" indicates called from doinit()
 70 |     if scorerOption == "" and scorerfile.is_file():
 71 |         scorerfile.unlink()
 72 |     elif scorerOption:
 73 |         with open(scorerfile, 'w') as f:
 74 |             f.write(scorerOption)
 75 | 
 76 |     scorerOption = None
 77 |     if scorerfile.is_file():
 78 |         with open(scorerfile) as f:
 79 |             scorerOption = f.read()
 80 | 
 81 |     if pickScorer(scorerOption) is None:
 82 |         return None
 83 | 
 84 |     # possibly better to roll THRESHOLD and LIMIT into one config called LIMITS
 85 |     app.config.from_object(default_settings)
 86 | 
 87 |     cfgfile = instance_path / "reconcile.config"
 88 | 
 89 |     # clean up old configs if they exist
 90 |     # "" indicates called from doinit()
 91 |     if config == "" and cfgfile.is_file():
 92 |         cfgfile.unlink()
 93 |     elif config:
 94 |         shutil.copyfile(config, cfgfile)
 95 | 
 96 |     if cfgfile.is_file():
 97 |         app.config.from_pyfile(cfgfile)
 98 | 
 99 |     scoreOptions = app.config['SCOREOPTIONS']
100 |     scorer.processScoreOptions(scoreOptions)
101 | 
102 |     if 'MANIFEST' in app.config:
103 |         MANIFEST.update(app.config['MANIFEST'])
104 | 
105 |     loglevel = app.config['LOGLEVEL']
106 |     if loglevel:
107 |         app.logger.setLevel(loglevel)
108 | 
109 |     @app.before_request
110 |     def before():
111 |         app.logger.debug(request.method)
112 |         app.logger.debug(request.headers)
113 | 
114 |     @app.after_request
115 |     def after(response):
116 |         app.logger.debug(response.headers)
117 |         return response
118 | 
119 |     @app.route('/reconcile', methods=['POST', 'GET'])
120 |     @cross_origin()
121 |     def acceptQuery():
122 |         threshold = app.config.get('THRESHOLD', None)
123 |         limit = app.config.get('LIMIT', None)
124 |         scoreOptions = app.config['SCOREOPTIONS']
125 |         queries = request.form.get('queries')
126 |         extend = request.form.get('extend')
127 |         if queries:
128 |             db = get_db()
129 | 
130 |             queryBatch = json.loads(queries)
131 | 
132 |             app.logger.info(queryBatch)
133 |             with Timer():
134 |                 ret = processQueryBatch(db,
135 |                                         queryBatch,
136 |                                         limit=limit,
137 |                                         threshold=threshold,
138 |                                         **scoreOptions)
139 |             app.logger.info(ret)
140 |             return ret
141 |         elif extend:
142 |             extendBatch = json.loads(extend)
143 | 
144 |             app.logger.info(extendBatch)
145 |             with Timer():
146 |                 ret = processDataExtensionBatch(extendBatch)
147 |             app.logger.info(ret)
148 |             return ret
149 |         else:
150 |             return MANIFEST
151 | 
152 |     # FIX FIX FIX... Not needed in OpenRefine 3.5
153 |     # [[https://github.com/OpenRefine/OpenRefine/issues/3672]]
154 |     def jsonpify(obj):
155 |         """
156 |         Like jsonify but wraps result in a JSONP callback if a 'callback'
157 |         query param is supplied.
158 |         """
159 |         try:
160 |             callback = request.args['callback']
161 |             response = app.make_response("%s(%s)" % (callback, json.dumps(obj)))
162 |             response.mimetype = "text/javascript"
163 |             return response
164 |         except KeyError:
165 |             return jsonify(obj)
166 | 
167 |     @app.route('/properties', methods=['POST', 'GET'])
168 |     @cross_origin()
169 |     def acceptPropertyRequest():
170 |         # query string arg
171 |         propType = request.args.get('type')
172 | 
173 |         # Type is irrelevant; return all columns
174 |         if propType != None:
175 |             cols = getCSVCols()
176 |             ret = dict(properties=[{
177 |                 'id': colname,
178 |                 'name': name
179 |             } for colname, name in cols])
180 |             return jsonpify(ret)
181 | 
182 |         # unprocessable request
183 | 
184 |     @app.route('/preview/<entity_id>')
185 |     @cross_origin()
186 |     def preview_service(entity_id=None):
187 |         if not entity_id:
188 |             abort(404)
189 |         entity = getEntity(entity_id)
190 |         if not entity:
191 |             abort(404)
192 |         entity_html = "".join([f"<dt>{escape(key)}</dt><dd>{escape(val)}</dd>"
193 |                                for key, val in entity.items()])
194 |         return f"""
195 | <html>
196 | <head>
197 | <meta charset="utf-8">
198 | <title>Preview for {escape(entity_id)}</title>
199 | <style>
208 | </style>
209 | </head>
210 | <body><dl>{entity_html}</dl></body></html>
211 | 212 | """ 213 | 214 | return app 215 | 216 | 217 | def pickScorer(plugin): 218 | eps = metadata.entry_points().select(group='csv_reconcile.scorers') 219 | entrypoint = None 220 | if len(eps) == 0: 221 | raise RuntimeError("Please install a \"csv_reconcile.scorers\" plugin") 222 | elif plugin: 223 | for ep in eps: 224 | if ep.name == plugin: 225 | entrypoint = ep 226 | break 227 | else: 228 | raise RuntimeError( 229 | "Please install %s \"csv_reconcile.scorers\" plugin" % 230 | (plugin,)) 231 | elif len(eps) == 1: 232 | entrypoint = next(iter(eps)) 233 | 234 | if entrypoint is None: 235 | # print out options 236 | print( 237 | "There are several scorers available. Please choose one of the following with the --scorer option." 238 | ) 239 | for ep in eps: 240 | print(" %s" % (ep.name,)) 241 | return None 242 | 243 | entrypoint.load() 244 | return entrypoint 245 | 246 | 247 | @click.group() 248 | def cli(): 249 | pass 250 | 251 | 252 | def doinit(config, scorerOption, csvfile, idcol, namecol): 253 | 254 | app = create_app(config or "", scorerOption=scorerOption or "") 255 | if app is None: 256 | return 257 | 258 | with app.app_context(): 259 | initdb.init_db_with_context(csvfile, idcol, namecol) 260 | click.echo('Initialized the database.') 261 | return app 262 | 263 | 264 | @cli.command() 265 | @click.option('--config', help='config file') 266 | @click.option('--scorer', 'scorerOption', help='scoring plugin to use') 267 | @click.argument('csvfile') 268 | @click.argument('idcol') 269 | @click.argument('namecol') 270 | def init(config, scorerOption, csvfile, idcol, namecol): 271 | return doinit(config, scorerOption, csvfile, idcol, namecol) 272 | 273 | @cli.command() 274 | @click.option('--config', help='config file') 275 | @click.option('--scorer', 'scorerOption', help='scoring plugin to use') 276 | @click.option('--init-db', is_flag=True, help='initialize the db') 277 | @click.argument('csvfile') 278 | @click.argument('idcol') 279 | @click.argument('namecol') 280 | def run(config, scorerOption, init_db, csvfile, idcol, namecol): 281 | print(''' 282 | ######################################################### 283 | ## WARNING: The interface is deprecated ## 284 | ######################################################### 285 | 286 | Please run init once to initialize the database and serve to run the server. 287 | See --help for details. 288 | ''') 289 | 290 | app = None 291 | if init_db: 292 | app = doinit(config, scorerOption, csvfile, idcol, namecol) 293 | 294 | app = app or create_app(config) 295 | from werkzeug.serving import WSGIRequestHandler 296 | WSGIRequestHandler.protocol_version = "HTTP/1.1" 297 | app.run(debug=False) 298 | 299 | 300 | @cli.command() 301 | def serve(): 302 | 303 | # Config should have been copied during the init phase 304 | app = create_app() 305 | from werkzeug.serving import WSGIRequestHandler 306 | WSGIRequestHandler.protocol_version = "HTTP/1.1" 307 | app.run(debug=False) 308 | 309 | 310 | def main(): 311 | nonopts = [a for a in sys.argv if not a.startswith('--')] 312 | 313 | if len(nonopts) > 1 and nonopts[1] not in 'run init serve': 314 | print(''' 315 | ######################################################### 316 | ## WARNING: The interface has changed slightly. ## 317 | ######################################################### 318 | Please use one of the subcommands. See --help for details. 
319 | 320 | ''') 321 | return cli() 322 | -------------------------------------------------------------------------------- /csv_reconcile/__main__.py: -------------------------------------------------------------------------------- 1 | from . import main 2 | import sys 3 | 4 | main() 5 | -------------------------------------------------------------------------------- /csv_reconcile/db.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sqlite3 3 | 4 | from flask import current_app, g 5 | from normality import slugify 6 | 7 | 8 | def normalizeDBcol(col): 9 | return slugify(col).replace('-', '_') 10 | 11 | 12 | def getCSVCols(): 13 | cur = get_db().cursor() 14 | cur.execute("SELECT * FROM datacols") 15 | return [(row['colname'], row['name']) for row in cur] 16 | 17 | 18 | def getIDCol(): 19 | cur = get_db().cursor() 20 | 21 | cur.execute("SELECT colname FROM datacols WHERE isid == 1") 22 | res = cur.fetchall() 23 | if len(res) != 1: 24 | raise RuntimeError("database not properly initialized") 25 | return res[0]['colname'] 26 | 27 | 28 | def get_db(): 29 | if 'db' not in g: 30 | g.db = sqlite3.connect(os.path.join(current_app.instance_path, 31 | current_app.config['DATABASE']), 32 | detect_types=sqlite3.PARSE_DECLTYPES) 33 | g.db.row_factory = sqlite3.Row 34 | 35 | return g.db 36 | 37 | 38 | def close_db(e=None): 39 | db = g.pop('db', None) 40 | 41 | if db is not None: 42 | db.close() 43 | -------------------------------------------------------------------------------- /csv_reconcile/default_settings.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | DATABASE = 'csvreconcile.db' 4 | 5 | LIMIT = 10 # At most 10 matches per query 6 | 7 | THRESHOLD = 30.0 # At least a 30% match 8 | 9 | LOGLEVEL = logging.NOTSET 10 | 11 | SCOREOPTIONS = {} 12 | -------------------------------------------------------------------------------- /csv_reconcile/extend.py: -------------------------------------------------------------------------------- 1 | from .db import get_db, getIDCol, getCSVCols 2 | 3 | def processDataExtensionBatch(batch): 4 | ids, props = tuple(batch[x] for x in ('ids', 'properties')) 5 | names = {p['id'] for p in props} 6 | cols = {colnm: nm for colnm, nm in getCSVCols() if colnm in names} 7 | idcol = getIDCol() 8 | 9 | 10 | db = get_db() 11 | cur = db.cursor() 12 | # Could use some defensiveness in generating this SQL 13 | sql = "SELECT %s,%s FROM data WHERE %s in (%s)" % (idcol, ','.join(cols.keys()), idcol, ','.join('?' * len(ids))) 14 | cur.execute(sql, ids) 15 | rows = dict() 16 | for row in cur: 17 | rows[row[idcol]] = {col: [{'str': row[col]}] for col in cols} 18 | 19 | meta = [dict(id=p['id'], name=cols[p['id']]) for p in props] 20 | 21 | return dict(meta=meta, rows=rows) 22 | -------------------------------------------------------------------------------- /csv_reconcile/initdb.py: -------------------------------------------------------------------------------- 1 | from flask import current_app 2 | import csv 3 | from chardet.universaldetector import UniversalDetector 4 | 5 | from collections import defaultdict 6 | from itertools import count 7 | 8 | from .db import get_db, normalizeDBcol 9 | 10 | from importlib.resources import read_text 11 | import csv_reconcile 12 | from . 
import scorer 13 | 14 | 15 | def initDataTable(db, colnames, idcol): 16 | cols = [] 17 | cnts = defaultdict(count) 18 | for col in colnames: 19 | slug = normalizeDBcol(col) 20 | slug = f'{slug}{next(cnts[slug])}' 21 | if col == idcol: 22 | cols.append('%s TEXT PRIMARY KEY' % (slug,)) 23 | else: 24 | cols.append('%s TEXT NOT NULL' % (slug,)) 25 | 26 | db.execute('INSERT INTO datacols VALUES (?,?,?)', 27 | (col, slug, 1 if col == idcol else 0)) 28 | 29 | # create data table with the contents of the csv file 30 | createSQL = 'CREATE TABLE data (\n %s\n)' 31 | db.execute(createSQL % (',\n '.join(cols),)) 32 | 33 | 34 | def initReconcileTable(db, colnames): 35 | create = [ 36 | 'CREATE TABLE reconcile (\n id TEXT PRIMARY KEY,\n word TEXT NOT NULL' 37 | ] 38 | for col in colnames: 39 | create.append('%s TEXT NOT NULL' % (col,)) 40 | 41 | # create data table with the contents of the csv file 42 | db.execute(',\n '.join(create) + '\n)') 43 | 44 | def detectEncoding(filenm): 45 | detector = UniversalDetector() 46 | for line in open(filenm, 'rb'): 47 | detector.feed(line) 48 | if detector.done: break 49 | detector.close() 50 | if detector.result['confidence'] > .95: 51 | return detector.result['encoding'] 52 | return None 53 | 54 | def init_db(db, 55 | csvfilenm, 56 | idcol, 57 | searchcol, 58 | csvencoding=None, 59 | scoreOptions=None, 60 | csvkwargs=None): 61 | 62 | enckwarg = dict() 63 | csvencoding = csvencoding or detectEncoding(csvfilenm) 64 | 65 | if csvencoding: 66 | enckwarg['encoding'] = csvencoding 67 | 68 | schema = read_text(csv_reconcile, 'schema.sql') 69 | db.executescript(schema) 70 | 71 | csvkwargs = {} if csvkwargs is None else csvkwargs 72 | 73 | with db: 74 | # Create a table with ids (as PRIMARY ID), words and bigrams 75 | with open(csvfilenm, newline='', **enckwarg) as csvfile: 76 | dialect = None 77 | try: 78 | dialect = csv.Sniffer().sniff(csvfile.read(1024)) 79 | except: 80 | pass 81 | 82 | csvfile.seek(0) 83 | reader = csv.reader(csvfile, dialect=dialect, **csvkwargs) 84 | header = next(reader) 85 | 86 | # Throws if col doesn't exist 87 | searchidx = header.index(searchcol) 88 | ididx = header.index(idcol) 89 | 90 | normalizedFields = scorer.getNormalizedFields() 91 | initDataTable(db, header, idcol) 92 | initReconcileTable(db, normalizedFields) 93 | 94 | datavals = ','.join('?' * len(header)) 95 | 96 | for row in reader: 97 | if len(row) != len(header): continue 98 | mid = row[ididx] 99 | word = row[searchidx] 100 | matchFields = scorer.normalizeRow(word, row, **scoreOptions) 101 | db.execute( 102 | "INSERT INTO reconcile VALUES (%s)" % 103 | (','.join('?' 
* (2 + len(normalizedFields))),), 104 | (mid, word) + tuple(matchFields)) 105 | 106 | db.execute("INSERT INTO data VALUES (%s)" % (datavals), row) 107 | 108 | 109 | def init_db_with_context(csvfilenm, idcol, searchcol): 110 | db = get_db() 111 | csvkwargs = current_app.config.get('CSVKWARGS', {}) 112 | scoreOptions = current_app.config['SCOREOPTIONS'] 113 | csvencoding = current_app.config.get('CSVENCODING', None) 114 | 115 | return init_db(db, 116 | csvfilenm, 117 | idcol, 118 | searchcol, 119 | csvencoding=csvencoding, 120 | csvkwargs=csvkwargs, 121 | scoreOptions=scoreOptions) 122 | -------------------------------------------------------------------------------- /csv_reconcile/preview.py: -------------------------------------------------------------------------------- 1 | from .db import get_db, getCSVCols, getIDCol 2 | 3 | 4 | def getEntity(entity_id): 5 | id_col = getIDCol() 6 | cols = dict(getCSVCols()) 7 | 8 | cur = get_db().cursor() 9 | cur.execute(f"SELECT * FROM data WHERE {id_col}=? LIMIT 1", (entity_id,)) 10 | row = cur.fetchone() 11 | if not row: 12 | return None 13 | return {cols[col]: value for col, value in zip(row.keys(), row)} 14 | -------------------------------------------------------------------------------- /csv_reconcile/schema.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS reconcile; 2 | DROP TABLE IF EXISTS data; 3 | DROP TABLE IF EXISTS datacols; 4 | 5 | CREATE TABLE datacols ( 6 | name TEXT PRIMARY KEY, 7 | colname TEXT NOT NULL, 8 | isid INT NOT NULL 9 | ); 10 | -------------------------------------------------------------------------------- /csv_reconcile/score.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from . 
import scorer 3 | 4 | 5 | def reconcileStrings(db, items, **kwargs): 6 | # Use index as query id 7 | batch = dict(enumerate({'query': s} for s in items)) 8 | 9 | ret = processQueryBatch(db, batch, **kwargs) 10 | 11 | # Return as a list of pairs of query matched to result 12 | return list(zip(items, (res for _, res in sorted(ret.items())))) 13 | 14 | 15 | def processQueryBatch(db, batch, limit=None, threshold=0.0, **scoreOptions): 16 | ''' 17 | Go through db looking for words whose fuzzy match score positively 18 | ''' 19 | hasFeatures = not getattr(scorer.features, "disabled", False) 20 | 21 | toMatchItems = dict() 22 | for qid, req in batch.items(): 23 | queryStr = req['query'] 24 | toMatchItems[qid] = scorer.normalizeWord(queryStr, ** 25 | scoreOptions) or queryStr 26 | 27 | # Better to pull these off an sqlite store 28 | 29 | cur = db.cursor() 30 | normalizedFields = scorer.getNormalizedFields() 31 | 32 | cur.execute('SELECT %s FROM reconcile' % 33 | (','.join(('word', 'id') + tuple(normalizedFields)))) 34 | 35 | picks = defaultdict(list) 36 | for row in cur: 37 | compareTo = row[2:] if normalizedFields else row['word'] 38 | if not scorer.valid(compareTo): 39 | continue 40 | 41 | for qid in batch.keys(): 42 | toMatch = toMatchItems[qid] 43 | 44 | score = scorer.scoreMatch(toMatch, compareTo, **scoreOptions) 45 | if score is not None and score > threshold: 46 | picks[qid].append((row, score)) 47 | 48 | ret = dict() 49 | for qid in batch: 50 | pick = picks[qid] 51 | lmt = batch[qid].get('limit', limit) 52 | queryStr = batch[qid]['query'] 53 | 54 | res = [] 55 | exacts = [] 56 | cnt = 0 57 | for row, score in sorted(pick, key=lambda x: -x[1]): 58 | cnt += 1 59 | if lmt and cnt > lmt: 60 | break 61 | 62 | res.append( 63 | dict(id=row['id'], name=row['word'], score=score, match=False)) 64 | 65 | if hasFeatures: 66 | features = scorer.features(queryStr, row) 67 | if features is not None: 68 | res[-1]['features'] = features 69 | 70 | if res[-1]['name'] == queryStr: 71 | exacts.append(res[-1]) 72 | 73 | # Make match if only one 74 | if len(res) == 1: 75 | res[0]['match'] = True 76 | else: 77 | if len(exacts) == 1: 78 | exacts[0]['match'] = True 79 | 80 | # Maybe match if there is a wide gap in score between first match and second? 81 | ret[qid] = dict(result=res) 82 | 83 | return ret 84 | -------------------------------------------------------------------------------- /csv_reconcile/scorer.py: -------------------------------------------------------------------------------- 1 | def register(func): 2 | ''' 3 | Decorator for replacing functions in this module 4 | ''' 5 | glbls = globals() 6 | glbls[func.__name__] = func 7 | return func 8 | 9 | 10 | def getNormalizedFields(): 11 | '''List of fields generated from reconciled column for the match calculation''' 12 | raise RuntimeError('getNormalizedFields() -> tuple must be implemented') 13 | 14 | 15 | def processScoreOptions(options): 16 | '''Optionally modify configuration options passed in''' 17 | 18 | 19 | def scoreMatch(left, right, **scoreOptions): 20 | '''Score fuzzy match score between left and right''' 21 | raise RuntimeError('scoreMatch(left,right) -> float must be implemented') 22 | 23 | 24 | def normalizeWord(word, **scoreOptions): 25 | ''' 26 | Preprocess column being reconciled for the match calculation. 
27 |     Return a tuple with the same number of elements as returned by getNormalizedFields()
28 |     '''
29 |     raise RuntimeError(
30 |         'normalizeWord(word, **options) -> tuple must be implemented')
31 | 
32 | 
33 | def normalizeRow(word, row, **scoreOptions):
34 |     '''
35 |     Preprocess column being reconciled against for the match calculation.
36 |     Return a tuple with the same number of elements as returned by getNormalizedFields()
37 |     Defaults to using the same normalization as normalizeWord().
38 |     '''
39 |     return normalizeWord(word, **scoreOptions)
40 | 
41 | 
42 | def valid(normalizedFields):
43 |     '''Optionally validate column before performing match calculation'''
44 |     return True
45 | 
46 | 
47 | # [[https://reconciliation-api.github.io/specs/latest/#reconciliation-query-responses]]
48 | def features(word, row, **scoreOptions):
49 |     '''
50 |     Takes the queryString and the normalized row and calculates features.
51 |     The calculation is disabled by default.
52 |     '''
53 |     # This is just a dummy result since features are disabled by default.
54 |     return [dict(id="someid", value=15), dict(id="someotherid", value=19)]
55 | 
56 | 
57 | features.disabled = True
58 | 
--------------------------------------------------------------------------------
/csv_reconcile_dice/__init__.py:
--------------------------------------------------------------------------------
 1 | from csv_reconcile import scorer
 2 | from normality import normalize
 3 | 
 4 | try:
 5 |     # Cython if it exists
 6 |     from .cutils import getDiceCoefficient
 7 | except ImportError:
 8 |     from .utils import getDiceCoefficient
 9 | 
10 | 
11 | # [[https://en.wikipedia.org/wiki/Stop_word]]
12 | def makeBigrams(word, **scoreOptions):
13 |     '''
14 |     Normalize set of bigrams into an ordered string to aid processing
15 |     '''
16 |     # Stop words can be stripped via the 'stopwords' entry in SCOREOPTIONS
17 |     # Should probably strip off spaces(?) and punctuation
18 |     process = normalize(word)
19 |     stopwords = scoreOptions.get('stopwords', None)
20 |     if stopwords:
21 |         process = ' '.join(w for w in process.split() if w not in stopwords)
22 | 
23 |     return ''.join(
24 |         sorted(set(process[i:i + 2] for i in range(len(process) - 1))))
25 | 
26 | 
27 | @scorer.register
28 | def getNormalizedFields():
29 |     return ('bigrams',)
30 | 
31 | 
32 | @scorer.register
33 | def processScoreOptions(options):
34 |     if 'stopwords' not in options:  # nothing to preprocess (avoids a KeyError below)
35 |         return
36 | 
37 |     options['stopwords'] = [w.lower() for w in options['stopwords']]
38 | 
39 | 
40 | @scorer.register
41 | def scoreMatch(left, right, **scoreOptions):
42 |     return getDiceCoefficient(left[0].encode('utf-8'), right[0].encode('utf-8'))
43 | 
44 | 
45 | @scorer.register
46 | def normalizeWord(word, **scoreOptions):
47 |     return (makeBigrams(word, **scoreOptions),)
48 | 
49 | 
50 | @scorer.register
51 | def valid(normalizedFields):
52 |     if not normalizedFields[0]:
53 |         return False
54 |     return True
55 | 
--------------------------------------------------------------------------------
/csv_reconcile_dice/cutils.pyx:
--------------------------------------------------------------------------------
 1 | from libc.string cimport strncmp
 2 | 
 3 | # [[https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient]]
 4 | def getDiceCoefficient(const char* bigram1, const char* bigram2):
 5 |     '''
 6 |     Calculate the Dice coefficient from two normalized sets of bigrams
 7 |     '''
 8 |     cdef int l1, l2, i1, i2, cnt, diff
 9 |     l1 = len(bigram1)
10 |     l2 = len(bigram2)
11 |     i1 = i2 = cnt = 0
12 | 
13 |     while i1 < l1 and i2 < l2:
14 |         diff = strncmp(bigram1+i1, bigram2+i2, 2)
15 |         if diff == 0:
16 |             cnt += 1
17 |             i1 += 2
18 |             i2 += 2
19 |         elif diff < 0:
20 |             i1 += 2
21 |         else:
22 |             i2 += 2
23 | 
24 |     # length is twice the number of bigrams
25 |     return 400.0 * cnt / (l1 + l2)
26 | 
--------------------------------------------------------------------------------
/csv_reconcile_dice/utils.py:
--------------------------------------------------------------------------------
 1 | # [[https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient]]
 2 | def getDiceCoefficient(bigram1, bigram2):
 3 |     '''
 4 |     Calculate the Dice coefficient from two normalized sets of bigrams
 5 |     '''
 6 |     l1 = len(bigram1)
 7 |     l2 = len(bigram2)
 8 |     i1 = i2 = cnt = 0
 9 | 
10 |     while i1 < l1 and i2 < l2:
11 |         b1 = bigram1[i1:i1 + 2]
12 |         b2 = bigram2[i2:i2 + 2]
13 |         if b1 == b2:
14 |             cnt += 1
15 |             i1 += 2
16 |             i2 += 2
17 |         elif b1 < b2:
18 |             i1 += 2
19 |         else:
20 |             i2 += 2
21 | 
22 |     # length is twice the number of bigrams
23 |     return 400.0 * cnt / (l1 + l2)
24 | 
--------------------------------------------------------------------------------
/noxfile.py:
--------------------------------------------------------------------------------
 1 | from nox_poetry import session, SDIST
 2 | 
 3 | args = lambda s: s.split()
 4 | 
 5 | 
 6 | @session(python=['3.7', '3.8', '3.9'])
 7 | def test_main(session):
 8 |     session.poetry.installroot(distribution_format=SDIST)
 9 |     session.install('pytest')
10 |     session.run(*args('pytest -v tests/main'))
11 | 
12 | 
13 | @session(python=['3.7', '3.8', '3.9'])
14 | def test_geo(session):
15 |     session.poetry.installroot(distribution_format=SDIST)
16 |     session.install('csv-reconcile-geo')
17 |     session.install('pytest')
18 |     session.run(*args('pytest -v tests/plugins/geo'))
19 | 
--------------------------------------------------------------------------------
/poetry.lock:
--------------------------------------------------------------------------------
 1 | 
[[package]] 2 | name = "attrs" 3 | version = "21.4.0" 4 | description = "Classes Without Boilerplate" 5 | category = "dev" 6 | optional = false 7 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 8 | 9 | [package.extras] 10 | dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] 11 | docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] 12 | tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] 13 | tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] 14 | 15 | [[package]] 16 | name = "banal" 17 | version = "1.0.6" 18 | description = "Commons of banal micro-functions for Python." 19 | category = "main" 20 | optional = false 21 | python-versions = "*" 22 | 23 | [package.extras] 24 | dev = ["mypy", "wheel"] 25 | 26 | [[package]] 27 | name = "chardet" 28 | version = "5.1.0" 29 | description = "Universal encoding detector for Python 3" 30 | category = "main" 31 | optional = false 32 | python-versions = ">=3.7" 33 | 34 | [[package]] 35 | name = "charset-normalizer" 36 | version = "2.0.12" 37 | description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 38 | category = "main" 39 | optional = false 40 | python-versions = ">=3.5.0" 41 | 42 | [package.extras] 43 | unicode-backport = ["unicodedata2"] 44 | 45 | [[package]] 46 | name = "click" 47 | version = "8.1.3" 48 | description = "Composable command line interface toolkit" 49 | category = "main" 50 | optional = false 51 | python-versions = ">=3.7" 52 | 53 | [package.dependencies] 54 | colorama = {version = "*", markers = "platform_system == \"Windows\""} 55 | importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} 56 | 57 | [[package]] 58 | name = "colorama" 59 | version = "0.4.4" 60 | description = "Cross-platform colored terminal text." 61 | category = "main" 62 | optional = false 63 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 64 | 65 | [[package]] 66 | name = "cython" 67 | version = "0.29.33" 68 | description = "The Cython compiler for writing C extensions for the Python language." 69 | category = "main" 70 | optional = false 71 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 72 | 73 | [[package]] 74 | name = "exceptiongroup" 75 | version = "1.0.0rc9" 76 | description = "Backport of PEP 654 (exception groups)" 77 | category = "dev" 78 | optional = false 79 | python-versions = ">=3.7" 80 | 81 | [package.extras] 82 | test = ["pytest (>=6)"] 83 | 84 | [[package]] 85 | name = "flask" 86 | version = "2.2.2" 87 | description = "A simple framework for building complex web applications." 
88 | category = "main" 89 | optional = false 90 | python-versions = ">=3.7" 91 | 92 | [package.dependencies] 93 | click = ">=8.0" 94 | importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""} 95 | itsdangerous = ">=2.0" 96 | Jinja2 = ">=3.0" 97 | Werkzeug = ">=2.2.2" 98 | 99 | [package.extras] 100 | async = ["asgiref (>=3.2)"] 101 | dotenv = ["python-dotenv"] 102 | 103 | [[package]] 104 | name = "flask-cors" 105 | version = "3.0.10" 106 | description = "A Flask extension adding a decorator for CORS support" 107 | category = "main" 108 | optional = false 109 | python-versions = "*" 110 | 111 | [package.dependencies] 112 | Flask = ">=0.9" 113 | Six = "*" 114 | 115 | [[package]] 116 | name = "importlib-metadata" 117 | version = "6.0.0" 118 | description = "Read metadata from Python packages" 119 | category = "main" 120 | optional = false 121 | python-versions = ">=3.7" 122 | 123 | [package.dependencies] 124 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} 125 | zipp = ">=0.5" 126 | 127 | [package.extras] 128 | docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] 129 | perf = ["ipython"] 130 | testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] 131 | 132 | [[package]] 133 | name = "iniconfig" 134 | version = "1.1.1" 135 | description = "iniconfig: brain-dead simple config-ini parsing" 136 | category = "dev" 137 | optional = false 138 | python-versions = "*" 139 | 140 | [[package]] 141 | name = "itsdangerous" 142 | version = "2.1.2" 143 | description = "Safely pass data to untrusted environments and back." 144 | category = "main" 145 | optional = false 146 | python-versions = ">=3.7" 147 | 148 | [[package]] 149 | name = "jinja2" 150 | version = "3.1.2" 151 | description = "A very fast and expressive template engine." 152 | category = "main" 153 | optional = false 154 | python-versions = ">=3.7" 155 | 156 | [package.dependencies] 157 | MarkupSafe = ">=2.0" 158 | 159 | [package.extras] 160 | i18n = ["Babel (>=2.7)"] 161 | 162 | [[package]] 163 | name = "markupsafe" 164 | version = "2.1.1" 165 | description = "Safely add untrusted strings to HTML/XML markup." 
166 | category = "main" 167 | optional = false 168 | python-versions = ">=3.7" 169 | 170 | [[package]] 171 | name = "normality" 172 | version = "2.4.0" 173 | description = "Micro-library to normalize text strings" 174 | category = "main" 175 | optional = false 176 | python-versions = "*" 177 | 178 | [package.dependencies] 179 | banal = ">=1.0.1" 180 | chardet = "*" 181 | charset-normalizer = ">=2.0.0" 182 | text-unidecode = "*" 183 | 184 | [package.extras] 185 | dev = ["mypy", "pyicu (>=1.9.3)", "pytest", "types-chardet"] 186 | icu = ["pyicu (>=1.9.3)"] 187 | 188 | [[package]] 189 | name = "packaging" 190 | version = "21.3" 191 | description = "Core utilities for Python packages" 192 | category = "dev" 193 | optional = false 194 | python-versions = ">=3.6" 195 | 196 | [package.dependencies] 197 | pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" 198 | 199 | [[package]] 200 | name = "pluggy" 201 | version = "1.0.0" 202 | description = "plugin and hook calling mechanisms for python" 203 | category = "dev" 204 | optional = false 205 | python-versions = ">=3.6" 206 | 207 | [package.dependencies] 208 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 209 | 210 | [package.extras] 211 | dev = ["pre-commit", "tox"] 212 | testing = ["pytest", "pytest-benchmark"] 213 | 214 | [[package]] 215 | name = "pyparsing" 216 | version = "3.0.8" 217 | description = "pyparsing module - Classes and methods to define and execute parsing grammars" 218 | category = "dev" 219 | optional = false 220 | python-versions = ">=3.6.8" 221 | 222 | [package.extras] 223 | diagrams = ["jinja2", "railroad-diagrams"] 224 | 225 | [[package]] 226 | name = "pytest" 227 | version = "7.2.0" 228 | description = "pytest: simple powerful testing with Python" 229 | category = "dev" 230 | optional = false 231 | python-versions = ">=3.7" 232 | 233 | [package.dependencies] 234 | attrs = ">=19.2.0" 235 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 236 | exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} 237 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 238 | iniconfig = "*" 239 | packaging = "*" 240 | pluggy = ">=0.12,<2.0" 241 | tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} 242 | 243 | [package.extras] 244 | testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] 245 | 246 | [[package]] 247 | name = "six" 248 | version = "1.16.0" 249 | description = "Python 2 and 3 compatibility utilities" 250 | category = "main" 251 | optional = false 252 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" 253 | 254 | [[package]] 255 | name = "text-unidecode" 256 | version = "1.3" 257 | description = "The most basic Text::Unidecode port" 258 | category = "main" 259 | optional = false 260 | python-versions = "*" 261 | 262 | [[package]] 263 | name = "tomli" 264 | version = "2.0.1" 265 | description = "A lil' TOML parser" 266 | category = "dev" 267 | optional = false 268 | python-versions = ">=3.7" 269 | 270 | [[package]] 271 | name = "typing-extensions" 272 | version = "4.2.0" 273 | description = "Backported and Experimental Type Hints for Python 3.7+" 274 | category = "main" 275 | optional = false 276 | python-versions = ">=3.7" 277 | 278 | [[package]] 279 | name = "werkzeug" 280 | version = "2.2.2" 281 | description = "The comprehensive WSGI web application library." 
282 | category = "main" 283 | optional = false 284 | python-versions = ">=3.7" 285 | 286 | [package.dependencies] 287 | MarkupSafe = ">=2.1.1" 288 | 289 | [package.extras] 290 | watchdog = ["watchdog"] 291 | 292 | [[package]] 293 | name = "zipp" 294 | version = "3.8.0" 295 | description = "Backport of pathlib-compatible object wrapper for zip files" 296 | category = "main" 297 | optional = false 298 | python-versions = ">=3.7" 299 | 300 | [package.extras] 301 | docs = ["jaraco.packaging (>=9)", "rst.linker (>=1.9)", "sphinx"] 302 | testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] 303 | 304 | [metadata] 305 | lock-version = "1.1" 306 | python-versions = "^3.7" 307 | content-hash = "c960304aafc066172399007ed38cbff5e27d9fae5743995261b6a8699c1edb1e" 308 | 309 | [metadata.files] 310 | attrs = [ 311 | {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, 312 | {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, 313 | ] 314 | banal = [ 315 | {file = "banal-1.0.6-py2.py3-none-any.whl", hash = "sha256:877aacb16b17f8fa4fd29a7c44515c5a23dc1a7b26078bc41dd34829117d85e1"}, 316 | {file = "banal-1.0.6.tar.gz", hash = "sha256:2fe02c9305f53168441948f4a03dfbfa2eacc73db30db4a93309083cb0e250a5"}, 317 | ] 318 | chardet = [ 319 | {file = "chardet-5.1.0-py3-none-any.whl", hash = "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"}, 320 | {file = "chardet-5.1.0.tar.gz", hash = "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5"}, 321 | ] 322 | charset-normalizer = [ 323 | {file = "charset-normalizer-2.0.12.tar.gz", hash = "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597"}, 324 | {file = "charset_normalizer-2.0.12-py3-none-any.whl", hash = "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"}, 325 | ] 326 | click = [ 327 | {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, 328 | {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, 329 | ] 330 | colorama = [ 331 | {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, 332 | {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, 333 | ] 334 | cython = [ 335 | {file = "Cython-0.29.33-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:286cdfb193e23799e113b7bd5ac74f58da5e9a77c70e3b645b078836b896b165"}, 336 | {file = "Cython-0.29.33-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:8507279a4f86ed8365b96603d5ad155888d4d01b72a9bbf0615880feda5a11d4"}, 337 | {file = "Cython-0.29.33-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5bf5ffd96957a595441cca2fc78470d93fdc40dfe5449881b812ea6045d7e9be"}, 338 | {file = "Cython-0.29.33-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:d2019a7e54ba8b253f44411863b8f8c0b6cd623f7a92dc0ccb83892358c4283a"}, 339 | {file = "Cython-0.29.33-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:190e60b7505d3b9b60130bcc2251c01b9ef52603420829c19d3c3ede4ac2763a"}, 340 | {file = 
"Cython-0.29.33-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:0168482495b75fea1c97a9641a95bac991f313e85f378003f9a4909fdeb3d454"}, 341 | {file = "Cython-0.29.33-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:090556e41f2b30427dd3a1628d3613177083f47567a30148b6b7b8c7a5862187"}, 342 | {file = "Cython-0.29.33-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:19c9913e9304bf97f1d2c357438895466f99aa2707d3c7a5e9de60c259e1ca1d"}, 343 | {file = "Cython-0.29.33-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:afc9b6ab20889676c76e700ae6967aa6886a7efe5b05ef6d5b744a6ca793cc43"}, 344 | {file = "Cython-0.29.33-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:49fb45b2bf12d6e2060bbd64506c06ac90e254f3a4bceb32c717f4964a1ae812"}, 345 | {file = "Cython-0.29.33-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:5430f38d3d01c4715ec2aef5c41e02a2441c1c3a0149359c7a498e4c605b8e6c"}, 346 | {file = "Cython-0.29.33-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4d315443c7f4c61180b6c3ea9a9717ee7c901cc9db8d1d46fdf6556613840ed"}, 347 | {file = "Cython-0.29.33-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6b4e6481e3e7e4d345640fe2fdc6dc57c94369b467f3dc280949daa8e9fd13b9"}, 348 | {file = "Cython-0.29.33-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:060a2568ef80116a0a9dcaf3218a61c6007be0e0b77c5752c094ce5187a4d63c"}, 349 | {file = "Cython-0.29.33-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b67ddd32eaa2932a66bf8121accc36a7b3078593805519b0f00040f2b10a6a52"}, 350 | {file = "Cython-0.29.33-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:1b507236ba3ca94170ce0a504dd03acf77307d4bfbc5a010a8031673f6b213a9"}, 351 | {file = "Cython-0.29.33-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:581efc0622a9be05714222f2b4ac96a5419de58d5949517282d8df38155c8b9d"}, 352 | {file = "Cython-0.29.33-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6b8bcbf8f1c3c46d6184be1e559e3a3fb8cdf27c6d507d8bc8ae04cfcbfd75f5"}, 353 | {file = "Cython-0.29.33-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1ca93bbe584aee92094fd4fb6acc5cb6500acf98d4f57cc59244f0a598b0fcf6"}, 354 | {file = "Cython-0.29.33-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:da490129e1e4ffaf3f88bfb46d338549a2150f60f809a63d385b83e00960d11a"}, 355 | {file = "Cython-0.29.33-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:4cadf5250eda0c5cdaf4c3a29b52be3e0695f4a2bf1ccd49b638d239752ea513"}, 356 | {file = "Cython-0.29.33-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:bcb1a84fd2bd7885d572adc180e24fd8a7d4b0c104c144e33ccf84a1ab4eb2b8"}, 357 | {file = "Cython-0.29.33-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:d78147ad8a3417ae6b371bbc5bfc6512f6ad4ad3fb71f5eef42e136e4ed14970"}, 358 | {file = "Cython-0.29.33-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dd96b06b93c0e5fa4fc526c5be37c13a93e2fe7c372b5f358277ebe9e1620957"}, 359 | {file = "Cython-0.29.33-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = 
"sha256:959f0092d58e7fa00fd3434f7ff32fb78be7c2fa9f8e0096326343159477fe45"}, 360 | {file = "Cython-0.29.33-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0455d5b92f461218bcf173a149a88b7396c3a109066274ccab5eff58db0eae32"}, 361 | {file = "Cython-0.29.33-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:a9b0b890656e9d18a18e1efe26ea3d2d0f3e525a07a2a853592b0afc56a15c89"}, 362 | {file = "Cython-0.29.33-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:b5e8ce3039ff64000d58cd45b3f6f83e13f032dde7f27bb1ab96070d9213550b"}, 363 | {file = "Cython-0.29.33-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:e8922fa3d7e76b7186bbd0810e170ca61f83661ab1b29dc75e88ff2327aaf49d"}, 364 | {file = "Cython-0.29.33-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f67b7306fd00d55f271009335cecadc506d144205c7891070aad889928d85750"}, 365 | {file = "Cython-0.29.33-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:f271f90005064c49b47a93f456dc6cf0a21d21ef835bd33ac1e0db10ad51f84f"}, 366 | {file = "Cython-0.29.33-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d4457d417ffbb94abc42adcd63a03b24ff39cf090f3e9eca5e10cfb90766cbe3"}, 367 | {file = "Cython-0.29.33-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:0b53e017522feb8dcc2189cf1d2d344bab473c5bba5234390b5666d822992c7c"}, 368 | {file = "Cython-0.29.33-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:4f88c2dc0653eef6468848eb8022faf64115b39734f750a1c01a7ba7eb04d89f"}, 369 | {file = "Cython-0.29.33-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:1900d862a4a537d2125706740e9f3b016e80f7bbf7b54db6b3cc3d0bdf0f5c3a"}, 370 | {file = "Cython-0.29.33-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:37bfca4f9f26361343d8c678f8178321e4ae5b919523eed05d2cd8ddbe6b06ec"}, 371 | {file = "Cython-0.29.33-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a9863f8238642c0b1ef8069d99da5ade03bfe2225a64b00c5ae006d95f142a73"}, 372 | {file = "Cython-0.29.33-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:1dd503408924723b0bb10c0013b76e324eeee42db6deced9b02b648f1415d94c"}, 373 | {file = "Cython-0.29.33-py2.py3-none-any.whl", hash = "sha256:8b99252bde8ff51cd06a3fe4aeacd3af9b4ff4a4e6b701ac71bddc54f5da61d6"}, 374 | {file = "Cython-0.29.33.tar.gz", hash = "sha256:5040764c4a4d2ce964a395da24f0d1ae58144995dab92c6b96f44c3f4d72286a"}, 375 | ] 376 | exceptiongroup = [ 377 | {file = "exceptiongroup-1.0.0rc9-py3-none-any.whl", hash = "sha256:2e3c3fc1538a094aab74fad52d6c33fc94de3dfee3ee01f187c0e0c72aec5337"}, 378 | {file = "exceptiongroup-1.0.0rc9.tar.gz", hash = "sha256:9086a4a21ef9b31c72181c77c040a074ba0889ee56a7b289ff0afb0d97655f96"}, 379 | ] 380 | flask = [ 381 | {file = "Flask-2.2.2-py3-none-any.whl", hash = "sha256:b9c46cc36662a7949f34b52d8ec7bb59c0d74ba08ba6cb9ce9adc1d8676d9526"}, 382 | {file = "Flask-2.2.2.tar.gz", hash = "sha256:642c450d19c4ad482f96729bd2a8f6d32554aa1e231f4f6b4e7e5264b16cca2b"}, 383 | ] 384 | flask-cors = [ 385 | {file = "Flask-Cors-3.0.10.tar.gz", hash = "sha256:b60839393f3b84a0f3746f6cdca56c1ad7426aa738b70d6c61375857823181de"}, 386 | {file = "Flask_Cors-3.0.10-py2.py3-none-any.whl", hash = "sha256:74efc975af1194fc7891ff5cd85b0f7478be4f7f59fe158102e91abb72bb4438"}, 387 | ] 388 | importlib-metadata = [ 389 | {file = 
"importlib_metadata-6.0.0-py3-none-any.whl", hash = "sha256:7efb448ec9a5e313a57655d35aa54cd3e01b7e1fbcf72dce1bf06119420f5bad"}, 390 | {file = "importlib_metadata-6.0.0.tar.gz", hash = "sha256:e354bedeb60efa6affdcc8ae121b73544a7aa74156d047311948f6d711cd378d"}, 391 | ] 392 | iniconfig = [ 393 | {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, 394 | {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, 395 | ] 396 | itsdangerous = [ 397 | {file = "itsdangerous-2.1.2-py3-none-any.whl", hash = "sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44"}, 398 | {file = "itsdangerous-2.1.2.tar.gz", hash = "sha256:5dbbc68b317e5e42f327f9021763545dc3fc3bfe22e6deb96aaf1fc38874156a"}, 399 | ] 400 | jinja2 = [ 401 | {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, 402 | {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, 403 | ] 404 | markupsafe = [ 405 | {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812"}, 406 | {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a"}, 407 | {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e"}, 408 | {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5"}, 409 | {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4"}, 410 | {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f"}, 411 | {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e"}, 412 | {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933"}, 413 | {file = "MarkupSafe-2.1.1-cp310-cp310-win32.whl", hash = "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6"}, 414 | {file = "MarkupSafe-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417"}, 415 | {file = "MarkupSafe-2.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02"}, 416 | {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a"}, 417 | {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37"}, 418 | {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980"}, 419 | {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = 
"sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a"}, 420 | {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3"}, 421 | {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a"}, 422 | {file = "MarkupSafe-2.1.1-cp37-cp37m-win32.whl", hash = "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff"}, 423 | {file = "MarkupSafe-2.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a"}, 424 | {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452"}, 425 | {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003"}, 426 | {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1"}, 427 | {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601"}, 428 | {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925"}, 429 | {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f"}, 430 | {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88"}, 431 | {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63"}, 432 | {file = "MarkupSafe-2.1.1-cp38-cp38-win32.whl", hash = "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1"}, 433 | {file = "MarkupSafe-2.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"}, 434 | {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a"}, 435 | {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f"}, 436 | {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6"}, 437 | {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77"}, 438 | {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603"}, 439 | {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7"}, 440 | {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135"}, 441 | {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96"}, 442 | {file = "MarkupSafe-2.1.1-cp39-cp39-win32.whl", hash = "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c"}, 443 | {file = "MarkupSafe-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247"}, 444 | {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"}, 445 | ] 446 | normality = [ 447 | {file = "normality-2.4.0-py2.py3-none-any.whl", hash = "sha256:8bd9dd5a0220f641ed4cc59b7ad64ab11b0ee49e57e5a70bf515445ff72574d2"}, 448 | {file = "normality-2.4.0.tar.gz", hash = "sha256:38bbe4e1dfd737c318ffd70e981ae8ff40bd8839393f6d62f0e200e5aab9e992"}, 449 | ] 450 | packaging = [ 451 | {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, 452 | {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, 453 | ] 454 | pluggy = [ 455 | {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, 456 | {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, 457 | ] 458 | pyparsing = [ 459 | {file = "pyparsing-3.0.8-py3-none-any.whl", hash = "sha256:ef7b523f6356f763771559412c0d7134753f037822dad1b16945b7b846f7ad06"}, 460 | {file = "pyparsing-3.0.8.tar.gz", hash = "sha256:7bf433498c016c4314268d95df76c81b842a4cb2b276fa3312cfb1e1d85f6954"}, 461 | ] 462 | pytest = [ 463 | {file = "pytest-7.2.0-py3-none-any.whl", hash = "sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71"}, 464 | {file = "pytest-7.2.0.tar.gz", hash = "sha256:c4014eb40e10f11f355ad4e3c2fb2c6c6d1919c73f3b5a433de4708202cade59"}, 465 | ] 466 | six = [ 467 | {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, 468 | {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, 469 | ] 470 | text-unidecode = [ 471 | {file = "text-unidecode-1.3.tar.gz", hash = "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"}, 472 | {file = "text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8"}, 473 | ] 474 | tomli = [ 475 | {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, 476 | {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, 477 | ] 478 | typing-extensions = [ 479 | {file = "typing_extensions-4.2.0-py3-none-any.whl", hash = "sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708"}, 480 | {file = "typing_extensions-4.2.0.tar.gz", hash = "sha256:f1c24655a0da0d1b67f07e17a5e6b2a105894e6824b92096378bb3668ef02376"}, 481 | ] 482 | werkzeug = [ 483 | {file = "Werkzeug-2.2.2-py3-none-any.whl", hash = "sha256:f979ab81f58d7318e064e99c4506445d60135ac5cd2e177a2de0089bfd4c9bd5"}, 484 | {file = "Werkzeug-2.2.2.tar.gz", hash = "sha256:7ea2d48322cc7c0f8b3a215ed73eabd7b5d75d0b50e31ab006286ccff9e00b8f"}, 485 | ] 486 | zipp = [ 487 | {file = "zipp-3.8.0-py3-none-any.whl", hash = "sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099"}, 488 | {file = "zipp-3.8.0.tar.gz", hash = 
"sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad"}, 489 | ] 490 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "csv-reconcile" 3 | version = "0.3.2" 4 | description = "OpenRefine reconciliation service backed by csv resource" 5 | authors = ["Douglas Mennella "] 6 | license = "MIT" 7 | readme = 'README.md' 8 | repository = "https://github.com/gitonthescene/csv-reconcile" 9 | keywords = ["OpenRefine", 10 | "reconcile", 11 | "fuzzy search"] 12 | 13 | build = "build.py" 14 | packages = [ 15 | { include = "csv_reconcile" }, 16 | { include = "csv_reconcile_dice" } 17 | ] 18 | 19 | [tool.poetry.dependencies] 20 | python = "^3.7" 21 | flask = "^2.0" 22 | flask-cors = "^3.0.10" 23 | cython = "^0.29.21" 24 | normality = "^2.1.1" 25 | importlib_metadata = { version = ">=4.5,<7.0", python = "<3.10" } 26 | chardet = ">=4,<6" 27 | 28 | [tool.poetry.dev-dependencies] 29 | pytest = "^7.2" 30 | 31 | [tool.poe.tasks] 32 | dummydoc = { script = "utils:dummydoc" } 33 | pandoc = { cmd = "pandoc README.org --toc -f org -t markdown_strict -s -o README.md" } 34 | test = "pytest -v tests/main" 35 | nox = "nox" 36 | build = [ 37 | { ref = "dummydoc" }, 38 | { ref = "test" }, 39 | { cmd = "poetry build" } 40 | ] 41 | install = [ 42 | { ref = "dummydoc" }, 43 | { cmd = "poetry install" } 44 | ] 45 | publish = [ 46 | { ref = "pandoc" }, 47 | { ref = "nox" }, 48 | { cmd = "poetry publish" } 49 | ] 50 | 51 | [tool.poetry.plugins."csv_reconcile.scorers"] 52 | "dice" = "csv_reconcile_dice" 53 | 54 | [tool.poetry.scripts] 55 | csv-reconcile = "csv_reconcile:main" 56 | 57 | [build-system] 58 | requires = [ 59 | "poetry>=0.12", 60 | "cython", 61 | "setuptools!=50.0", 62 | "wheel" 63 | ] 64 | 65 | build-backend = "poetry.masonry.api" 66 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore:Install 'pyicu' for better text transliteration. 
4 | -------------------------------------------------------------------------------- /sample/progressives.tsv: -------------------------------------------------------------------------------- 1 | member district 2 | Pramila Jayapal 7th District of Washington 3 | Katie Porter 45th District of California 4 | Ilhan Omar 5th District of Minnesota 5 | Raúl Grijalva 3rd District of Arizona 6 | Barbara Lee 13th District of California 7 | Mark Pocan 2nd District of Wisconsin 8 | Jamie Raskin "Vice Chair for Policy 9 | 8th District of Maryland" 10 | Joe Neguse "Vice Chair for New Members 11 | 2nd District of Colorado" 12 | Marie Newman "Vice Chair for Communications 13 | 3rd Congressional District of Illinois" 14 | Sheila Jackson Lee "Vice Chair for Inter-Caucus Relations 15 | 18th District of Texas" 16 | Donald Norcross "Vice Chair for Labor 17 | 1st District of New Jersey" 18 | Rashida Tlaib "Vice Chair for Member Services 19 | 13th District of Michigan" 20 | David Cicilline "Vice Chair At Large 21 | 1st District of Rhode Island" 22 | Deb Haaland Vice Chair At Large1st District of New Mexico 23 | "Jesus ""Chuy"" Garcia" "Vice Chair At Large 24 | 4th District of Illinois" 25 | Bonnie Watson Coleman "Vice Chair At Large 26 | 12th District of New Jersey" 27 | Cori Bush 1st District of Missouri 28 | Debbie Dingell 12th District of Michigan 29 | Veronica Escobar 16th District of Texas 30 | Mondaire Jones 17th District of New York 31 | Ro Khanna 17th District of California 32 | Lloyd Doggett 35th District of Texas 33 | Andy Levin 9th District of Michigan 34 | Mark Takano 41st District of California 35 | Adriano Espaillat 13th District of New York 36 | Jamaal Bowman 16th Congressional District of New York 37 | Teresa Leger Fernandez 3rd Congressional District of New Mexico 38 | Bernie Sanders U.S. 
Senator for Vermont 39 | Jan Schakowsky 9th District of Illinois 40 | Alma Adams 12th District of North Carolina 41 | Nanette Barragán 44th District of California 42 | Karen Bass 37th District of California 43 | Eddie Bernice Johnson 30th District of Texas 44 | Don Beyer 8th District of Virginia 45 | Earl Blumenauer 3rd District of Oregon 46 | Lisa Blunt Rochester Delaware's At-large Congressional District 47 | Suzanne Bonamici 1st District of Oregon 48 | Brendan Boyle 2nd District of Pennsylvania 49 | André Carson 7th District of Indiana 50 | Matt Cartwright 8th District of Pennsylvania 51 | Judy Chu 27th District of California 52 | Katherine Clark 5th District of Massachusetts 53 | Yvette Clarke 9th District of New York 54 | Steve Cohen 9th District of Memphis 55 | Madeleine Dean 4th District of Pennsylvania 56 | Peter DeFazio 4th District of Oregon 57 | Diana DeGette 1st Congressional District of Colorado 58 | Rosa DeLauro 3rd District of Connecticut 59 | Mark DeSaulnier 11th District of California 60 | Dwight Evans 3rd District of Pennsylvania 61 | Lois Frankel 21st District of Florida 62 | Marcia Fudge 11st District of Ohio 63 | Ruben Gallego 7th District of Arizona 64 | Sylvia Garcia 29th District of Texas 65 | Jimmy Gomez 34th District of California 66 | Eleanor Holmes Norton Representing the District of Columbia 67 | Steven Horsford 4th District of Nevada 68 | Jared Huffman 2nd District of California 69 | Hakeem Jeffries 8th District of New York 70 | Hank Johnson 4th District of Georgia 71 | Kai Kahele 2nd Congressional District of Hawai'i 72 | Dan Kildee 5th District of Michigan 73 | Andy Kim 3rd District of New Jersey 74 | Brenda Lawrence 14th District of Michigan 75 | Mike Levin 49th District of California 76 | Ted Lieu 33rd District of California  77 | Zoe Lofgren 19th District of California 78 | Alan Lowenthal 47th District of California 79 | Carolyn Maloney 12th District of New York 80 | James McGovern 2nd District of Massachusetts 81 | Grace Meng 6th District of New York City 82 | Gwen Moore 4th District of Wisconsin 83 | Joe Morelle 25th District of New York 84 | Jerrold Nadler 10th District of New York 85 | Grace Napolitano 32nd District of California 86 | Alexandria Ocasio-Cortez 14th District of New York 87 | Frank Pallone 6th District of New Jersey 88 | Jimmy Panetta 20th District of California 89 | Chellie Pingree 1st District of Maine 90 | Ayanna Pressley 7th District of Massachusetts 91 | Linda Sanchez 38th District of California  92 | Mary Gay Scanlon 5th District of Pennsylvania 93 | Brad Sherman 30th District of California 94 | Adam Smith 9th District of Washington 95 | Darren Soto 9th District of Florida 96 | Ritchie Torres 15th Congressional District of New York 97 | Lori Trahan 3rd District of Massachusetts 98 | Juan Vargas 51st District of California 99 | Nydia Velázquez 7th District of New York 100 | Maxine Waters 43rd District of California 101 | Peter Welch Representing Vermont 102 | Nikema Williams 5th Congressional District of Georgia 103 | Frederica Wilson 24th District of Florida 104 | John Yarmuth 3rd District of Kentucky 105 | -------------------------------------------------------------------------------- /sample/sample.cfg: -------------------------------------------------------------------------------- 1 | MANIFEST = { 2 | "identifierSpace": "http://www.wikidata.org/entity/", 3 | "schemaSpace": "http://www.wikidata.org/prop/direct/", 4 | "view": {"url":"https://www.wikidata.org/wiki/{{id}}"}, 5 | "name": "US congressional representatives" 6 | } 7 | 
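A minimal sketch of querying a service started with this sample configuration (the host and port are assumptions, taken from Flask's development default, and the query value is just a name from progressives.tsv); the request shape mirrors the form-encoded POST used in tests/main/test_csv_reconcile.py:

import json
from urllib.parse import urlencode
from urllib.request import urlopen

# 'queries' travels as form-encoded JSON, the same encoding the tests use
queries = json.dumps({'q0': {'query': 'Pramila Jayapal'}})
body = urlencode([('queries', queries)]).encode('utf-8')
with urlopen('http://127.0.0.1:5000/reconcile', data=body) as resp:
    batch = json.loads(resp.read())
print(batch['q0']['result'])  # list of scored candidates for q0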
-------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gitonthescene/csv-reconcile/2ed24740e351c6912ac626af58f0b5f4c776bac6/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from csv_reconcile import create_app, initdb, scorer 3 | import types 4 | try: 5 | import importlib_metadata as metadata 6 | except: 7 | from importlib import metadata 8 | 9 | 10 | @pytest.fixture 11 | def plugins(): 12 | '''csv_reconcile.scorers plugins''' 13 | eps = metadata.entry_points().select(group='csv_reconcile.scorers') 14 | return {ep.name: ep for ep in eps} 15 | 16 | 17 | @pytest.fixture 18 | def csvcontents(): 19 | '''contents for mock csv file''' 20 | # Column names are normalized for database use 21 | # id column need not be first nor name name second 22 | return ''' 23 | name@hdr id hdr extra!hdr 24 | first 1 stuff 25 | second 2 junk 26 | third 3 and so on 27 | '''.strip() 28 | 29 | @pytest.fixture 30 | def ambiguous_csvcontents(): 31 | '''Try to throw off csv.Sniffer() to test overrides''' 32 | return ''' 33 | These, my friends, are the columns 34 | However, above all, columns matter most 35 | '''.strip() 36 | 37 | @pytest.fixture 38 | def sniffer_throwing_csvcontents(): 39 | '''Try to throw off csv.Sniffer() to test overrides''' 40 | return ''' 41 | a,b,c\n1,2 42 | '''.strip() 43 | 44 | 45 | @pytest.fixture 46 | def formContentHeader(): 47 | '''header for form data for client''' 48 | return {'content-type': 'application/x-www-form-urlencoded'} 49 | 50 | 51 | @pytest.fixture 52 | def header(csvcontents): 53 | '''header of mock csvfile''' 54 | return csvcontents.splitlines()[0].split('\t') 55 | 56 | 57 | @pytest.fixture 58 | def idnm(header): 59 | '''id and name cols from the header''' 60 | return (header[1], header[0]) 61 | 62 | 63 | @pytest.fixture 64 | def typicalrow(csvcontents): 65 | '''typical row of mock csvfile''' 66 | return csvcontents.splitlines()[1].split('\t') 67 | 68 | 69 | @pytest.fixture 70 | def setup(tmp_path, csvcontents, idnm): 71 | '''mock csv file with id and name columns indicated''' 72 | 73 | p = tmp_path / "csvfile" 74 | p.write_text(csvcontents) 75 | return (p, *idnm) 76 | 77 | @pytest.fixture 78 | def ambiguous_setup(tmp_path, ambiguous_csvcontents): 79 | '''mock csv file with id and name columns indicated''' 80 | 81 | def getSetup(idnm): 82 | p = tmp_path / "amb_csvfile" 83 | p.write_text(ambiguous_csvcontents) 84 | return (p, *idnm) 85 | 86 | return getSetup 87 | 88 | @pytest.fixture 89 | def sniffer_throwing_setup(tmp_path, sniffer_throwing_csvcontents): 90 | '''mock csv file with id and name columns indicated''' 91 | 92 | def getSetup(idnm): 93 | p = tmp_path / "snfthrw_csvfile" 94 | p.write_text(sniffer_throwing_csvcontents) 95 | return (p, *idnm) 96 | 97 | return getSetup 98 | 99 | @pytest.fixture 100 | def cfgContents(): 101 | return ''' 102 | THRESHOLD=0.0 103 | import logging 104 | LOGLEVEL=logging.DEBUG''' 105 | 106 | 107 | @pytest.fixture 108 | def mkConfig(tmp_path): 109 | '''make server config''' 110 | 111 | def fn(cfgContents): 112 | p = tmp_path / "config" 113 | p.write_text(cfgContents) 114 | 115 | return p 116 | 117 | return fn 118 | 119 | 120 | @pytest.fixture 121 | def config(mkConfig, cfgContents): 122 | '''mock server config''' 
123 | return mkConfig(cfgContents) 124 | 125 | 126 | @pytest.fixture 127 | def mockPlugin(): 128 | '''save/restore original plugin API''' 129 | saveOrig = { 130 | nm: vl 131 | for nm, vl in scorer.__dict__.items() 132 | if type(vl) == types.FunctionType 133 | } 134 | yield saveOrig 135 | for nm, fn in saveOrig.items(): 136 | setattr(scorer, nm, fn) 137 | 138 | 139 | @pytest.fixture 140 | def app(plugins, tmp_path): 141 | '''flask app''' 142 | 143 | def getApp(setup, config, plugin='dice'): 144 | app = create_app(config, instance_path=tmp_path / "instance", scorerOption=plugin) 145 | with app.app_context(): 146 | initdb.init_db_with_context(*setup) 147 | 148 | return app 149 | 150 | return getApp 151 | 152 | 153 | @pytest.fixture 154 | def client(app): 155 | '''http client''' 156 | 157 | def getClient(setup, config, plugin='dice'): 158 | return app(setup, config, plugin=plugin).test_client() 159 | 160 | return getClient 161 | 162 | 163 | @pytest.fixture 164 | def basicClient(client, setup, config): 165 | 166 | def getClient(config=config): 167 | return client(setup, config, plugin='dice') 168 | 169 | return getClient 170 | -------------------------------------------------------------------------------- /tests/main/test_csv_reconcile.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from csv_reconcile import __version__, scorer 4 | from csv_reconcile.db import getCSVCols 5 | 6 | import json 7 | from urllib.parse import urlencode 8 | 9 | 10 | def test_version(): 11 | assert __version__ == '0.3.2' 12 | 13 | 14 | def test_manifest(basicClient): 15 | response = basicClient().get('/reconcile') 16 | 17 | assert response.status_code == 200 18 | 19 | manifest = json.loads(response.data) 20 | expectedKeys = set( 21 | 'versions name identifierSpace schemaSpace extend'.split()) 22 | 23 | assert set(manifest.keys()).intersection(expectedKeys) == expectedKeys 24 | 25 | 26 | def test_query_basics(basicClient, formContentHeader): 27 | query = {'q0': {'query': 'first'}} 28 | queryjson = json.dumps(query) 29 | response = basicClient().post('/reconcile', 30 | data=urlencode([('queries', queryjson)]), 31 | headers=formContentHeader) 32 | 33 | assert response.status_code == 200 34 | 35 | matchBatch = json.loads(response.data) 36 | 37 | assert query.keys() == matchBatch.keys() 38 | 39 | assert 'result' in matchBatch['q0'] 40 | assert type(matchBatch['q0']['result']) == list 41 | 42 | 43 | def test_data_extension_basics(basicClient, setup, header, typicalrow, 44 | formContentHeader): 45 | 46 | client = basicClient() 47 | # Type is ignored in this service 48 | dummyType = '' 49 | _, idcol, namecol = setup 50 | ididx = header.index(idcol) 51 | 52 | response = client.get('/properties?type=%s' % (dummyType,)) 53 | 54 | assert response.status_code == 200 55 | 56 | cols = json.loads(response.data) 57 | 58 | assert 'properties' in cols 59 | assert type(cols['properties']) == list # [ {id:..., name:...}, ... 
] 60 | 61 | availableCols = dict() 62 | for itm in cols['properties']: 63 | assert set(itm.keys()) == set(('id', 'name')) 64 | 65 | availableCols[itm['name']] = itm['id'] 66 | 67 | assert set(availableCols) == set(header) 68 | 69 | colid = typicalrow[ididx] 70 | req = {'ids': [colid], 'properties': cols['properties']} 71 | reqjson = json.dumps(req) 72 | response = client.post('/reconcile', 73 | data=urlencode([('extend', reqjson)]), 74 | headers=formContentHeader) 75 | 76 | assert response.status_code == 200 77 | 78 | extenddata = json.loads(response.data) 79 | 80 | assert 'meta' in extenddata 81 | assert 'rows' in extenddata 82 | assert colid in extenddata['rows'] 83 | 84 | row = extenddata['rows'][colid] 85 | for colextra, colid in availableCols.items(): 86 | exidx = header.index(colextra) 87 | 88 | assert colid in row 89 | 90 | for choice in row[colid]: 91 | assert 'str' in choice 92 | assert choice['str'] == typicalrow[exidx] 93 | 94 | 95 | def test_preview_service(basicClient, setup, header, typicalrow): 96 | client = basicClient() 97 | 98 | # no id 99 | response = client.get(f"/preview/") 100 | assert response.status_code == 404 101 | 102 | # unavailable id 103 | response = client.get(f"/preview/unavailable") 104 | assert response.status_code == 404 105 | 106 | # available id 107 | id_idx = header.index(setup[1]) 108 | response = client.get(f"/preview/{typicalrow[id_idx]}") 109 | assert response.status_code == 200 110 | 111 | html_response = response.data.decode("utf-8") 112 | print(html_response) 113 | assert f"Preview for {typicalrow[id_idx]}" in html_response 114 | for key, value in zip(header, typicalrow): 115 | assert f"
{key}" in html_response and f"{value}
" in html_response 116 | 117 | 118 | @pytest.fixture 119 | def limitConfig(mkConfig): 120 | contents = ''' 121 | LIMIT=2 122 | THRESHOLD=-1.0 123 | import logging 124 | LOGLEVEL=logging.DEBUG 125 | ''' 126 | return mkConfig(contents) 127 | 128 | 129 | def test_reconcile_limit(basicClient, formContentHeader, limitConfig): 130 | query = {'q0': {'query': 'first'}} 131 | queryjson = json.dumps(query) 132 | client = basicClient(limitConfig) 133 | response = client.post('/reconcile', 134 | data=urlencode([('queries', queryjson)]), 135 | headers=formContentHeader) 136 | 137 | assert response.status_code == 200 138 | 139 | matchBatch = json.loads(response.data) 140 | 141 | assert len(matchBatch['q0']['result']) == 2 142 | response = client.post('/reconcile', 143 | data=urlencode([('queries', queryjson)]), 144 | headers=formContentHeader) 145 | 146 | # Override config limit in query with larger number 147 | query = {'q0': {'query': 'first', 'limit': 3}} 148 | queryjson = json.dumps(query) 149 | response = client.post('/reconcile', 150 | data=urlencode([('queries', queryjson)]), 151 | headers=formContentHeader) 152 | 153 | assert response.status_code == 200 154 | 155 | matchBatch = json.loads(response.data) 156 | 157 | # Matches override 158 | assert len(matchBatch['q0']['result']) == 3 159 | 160 | # Override config limit in query with smaller number 161 | query = {'q0': {'query': 'first', 'limit': 1}} 162 | queryjson = json.dumps(query) 163 | response = client.post('/reconcile', 164 | data=urlencode([('queries', queryjson)]), 165 | headers=formContentHeader) 166 | 167 | assert response.status_code == 200 168 | 169 | matchBatch = json.loads(response.data) 170 | 171 | # Matches override 172 | assert len(matchBatch['q0']['result']) == 1 173 | 174 | 175 | def test_reconcile_automatch(basicClient, formContentHeader): 176 | client = basicClient() 177 | 178 | query = {'q0': {'query': 'first'}} 179 | queryjson = json.dumps(query) 180 | response = client.post('/reconcile', 181 | data=urlencode([('queries', queryjson)]), 182 | headers=formContentHeader) 183 | 184 | assert response.status_code == 200 185 | 186 | matchBatch = json.loads(response.data) 187 | result = matchBatch['q0']['result'] 188 | 189 | # Only one with 100% match automatches 190 | cnt = 0 191 | for itm in result: 192 | if itm['name'] == 'first': 193 | cnt += 1 194 | assert itm['match'] == True 195 | assert itm['score'] == 100.0 196 | else: 197 | assert itm['match'] == False 198 | 199 | assert cnt == 1 200 | 201 | # None with 100% match does not automatch 202 | query = {'q0': {'query': 'fir'}} 203 | queryjson = json.dumps(query) 204 | response = client.post('/reconcile', 205 | data=urlencode([('queries', queryjson)]), 206 | headers=formContentHeader) 207 | 208 | assert response.status_code == 200 209 | 210 | matchBatch = json.loads(response.data) 211 | result = matchBatch['q0']['result'] 212 | 213 | assert all( 214 | itm['match'] == False and itm['score'] != 100.0 for itm in result) 215 | 216 | # Only one result automatches, even if not 100% 217 | query = {'q0': {'query': 'fir', 'limit': 1}} 218 | queryjson = json.dumps(query) 219 | response = client.post('/reconcile', 220 | data=urlencode([('queries', queryjson)]), 221 | headers=formContentHeader) 222 | 223 | assert response.status_code == 200 224 | 225 | matchBatch = json.loads(response.data) 226 | result = matchBatch['q0']['result'] 227 | assert len(result) == 1 228 | assert result[0]['score'] != 100.0 and result[0]['match'] == True 229 | 230 | 231 | def test_plugin(mockPlugin, basicClient, 
csvcontents, formContentHeader): 232 | # Since used in closure pass in "by reference" 233 | p, gn, nw, sm, v = list(range(5)) 234 | called = [0] * 5 235 | 236 | @scorer.register 237 | def processScoreOptions(options): 238 | called[p] += 1 239 | 240 | @scorer.register 241 | def getNormalizedFields(): 242 | # one normalized field 243 | called[gn] += 1 244 | return ('dummy',) 245 | 246 | @scorer.register 247 | def normalizeWord(word, **scoreOptions): 248 | # everything normalizes to COW thus everything matches 249 | called[nw] += 1 250 | return ("COW",) 251 | 252 | @scorer.register 253 | def scoreMatch(left, right): 254 | # Count the number of letters in common 255 | called[sm] += 1 256 | left, right = left[0], right[0] 257 | return len(set(left).intersection(right)) / len(left) * 100.0 258 | 259 | @scorer.register 260 | def valid(normalizedFields): 261 | called[v] += 1 262 | return True 263 | 264 | client = basicClient() 265 | 266 | # processScoreOptions, getNormalizedFields, and normalizeWord all called during setup 267 | # scoreMatch and valid not yet called 268 | assert all(called[itm] > 0 for itm in (p, gn, nw)) 269 | assert called[sm:] == [0, 0] 270 | 271 | # total number of rows minus 1 for the header row 272 | nRows = len(csvcontents.splitlines()) - 1 273 | 274 | query = {'q0': {'query': 'mxyzptlk'}} 275 | queryjson = json.dumps(query) 276 | response = client.post('/reconcile', 277 | data=urlencode([('queries', queryjson)]), 278 | headers=formContentHeader) 279 | assert response.status_code == 200 280 | 281 | matchBatch = json.loads(response.data) 282 | 283 | assert len(matchBatch['q0']['result']) == nRows 284 | assert all(called[itm] > 0 for itm in (p, gn, nw, sm, v)) 285 | 286 | # processScoreOptions still called once, getNormalizedFields only called twice 287 | assert called[:2] == [1, 2] 288 | 289 | def test_csv_sniffer_overrides(app, ambiguous_setup, ambiguous_csvcontents, config, mkConfig): 290 | 291 | topline = ambiguous_csvcontents.splitlines()[0] 292 | items = lambda sep: [ h.strip() for h in topline.split(sep)] 293 | 294 | # First guess is that the , is a separator 295 | SEP = ',' 296 | chk = app(ambiguous_setup(items(SEP)[:2]), config) 297 | with chk.app_context(): 298 | headernms = [name for _,name in getCSVCols()] 299 | assert headernms == items(SEP) 300 | 301 | # Now parse with override 302 | SEP = ' ' 303 | cfg = mkConfig('CSVKWARGS = {"delimiter": " "}') 304 | chk = app(ambiguous_setup(items(SEP)[:2]), cfg) 305 | with chk.app_context(): 306 | headernms = [name for _,name in getCSVCols()] 307 | assert headernms == items(SEP) 308 | 309 | def test_csv_sniffer_throwing(app, sniffer_throwing_setup, sniffer_throwing_csvcontents, config, mkConfig): 310 | 311 | topline = sniffer_throwing_csvcontents.splitlines()[0] 312 | items = lambda sep: [ h.strip() for h in topline.split(sep)] 313 | 314 | # First guess is that the , is a separator 315 | SEP = ',' 316 | chk = app(sniffer_throwing_setup(items(SEP)[:2]), config) 317 | with chk.app_context(): 318 | headernms = [name for _,name in getCSVCols()] 319 | assert headernms == items(SEP) 320 | 321 | # Now parse with override 322 | cfg = mkConfig('CSVKWARGS = {"delimiter": ","}') 323 | chk = app(sniffer_throwing_setup(items(SEP)[:2]), cfg) 324 | with chk.app_context(): 325 | headernms = [name for _,name in getCSVCols()] 326 | assert headernms == items(SEP) 327 | -------------------------------------------------------------------------------- /tests/plugins/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/gitonthescene/csv-reconcile/2ed24740e351c6912ac626af58f0b5f4c776bac6/tests/plugins/__init__.py -------------------------------------------------------------------------------- /tests/plugins/geo/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture 5 | def csvcontents(): 6 | return ''' 7 | city cityLabel coords 8 | Q60 New York City Point(-73.94 40.67) 9 | Q65 Los Angeles Point(-118.24368 34.05223) 10 | Q1297 Chicago Point(-87.627777777 41.881944444) 11 | Q8652 Miami Point(-80.216666666 25.783333333) 12 | '''.strip() 13 | 14 | 15 | @pytest.fixture 16 | def idnm(header): 17 | '''id and name cols from the header''' 18 | return (header[0], header[2]) 19 | 20 | 21 | @pytest.fixture 22 | def basicClient(client, setup): 23 | 24 | def getClient(config): 25 | return client(setup, config, plugin='geo') 26 | 27 | return getClient 28 | -------------------------------------------------------------------------------- /tests/plugins/geo/test_geo_reconcile.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import json 3 | from urllib.parse import urlencode 4 | from pprint import pprint as pp 5 | from geopy import distance 6 | 7 | 8 | @pytest.fixture 9 | def toResolve(): 10 | data = ''' 11 | city cityLabel coords 12 | Q1342 Pittsburgh Point(-80.0 40.441666666) 13 | Q5083 Seattle Point(-122.33207 47.60621) 14 | Q16559 Austin Point(-97.733333333 30.3) 15 | Q43196 Cincinnati Point(-84.5 39.133333333) 16 | '''.strip() 17 | return { 18 | cityNm: coords 19 | for cityId, cityNm, coords in (l.split('\t') for l in data.splitlines()) 20 | } 21 | 22 | 23 | @pytest.fixture 24 | def baseDataLkupNm(csvcontents): 25 | return { 26 | cityId: cityNm for cityId, cityNm, _ in ( 27 | l.split('\t') for l in csvcontents.splitlines()) 28 | } 29 | 30 | 31 | @pytest.fixture 32 | def baseDataLkupCoord(csvcontents): 33 | return { 34 | cityNm: coords for _, cityNm, coords in ( 35 | l.split('\t') for l in csvcontents.splitlines()) 36 | } 37 | 38 | 39 | def test_query(basicClient, config, formContentHeader, toResolve, 40 | baseDataLkupNm): 41 | 42 | ccoords = toResolve['Seattle'] 43 | 44 | query = {'q0': {'query': ccoords}} 45 | queryjson = json.dumps(query) 46 | response = basicClient(config).post('/reconcile', 47 | data=urlencode([('queries', queryjson) 48 | ]), 49 | headers=formContentHeader) 50 | 51 | assert response.status_code == 200 52 | 53 | matchBatch = json.loads(response.data) 54 | 55 | assert query.keys() == matchBatch.keys() 56 | 57 | results = matchBatch['q0']['result'] 58 | best = max(results, key=lambda x: x['score']) 59 | 60 | assert baseDataLkupNm[best['id']] == 'Los Angeles' 61 | 62 | 63 | @pytest.fixture 64 | def scaleConfig(mkConfig, baseDataLkupCoord, toResolve): 65 | 66 | # convert wkt format to tuple of floats (lat, lon) 67 | mkpt = lambda wkt: tuple(float(x) for x in wkt[6:-1].split()[1::-1]) 68 | 69 | chicago = baseDataLkupCoord['Chicago'] 70 | pittsburgh = toResolve['Pittsburgh'] 71 | 72 | dist = distance.geodesic(mkpt(chicago), mkpt(pittsburgh)).km 73 | 74 | contents = f''' 75 | THRESHOLD=0.0 76 | import logging 77 | LOGLEVEL=logging.DEBUG 78 | SCOREOPTIONS = {{ 79 | "SCALE": {dist} 80 | }} 81 | ''' 82 | return mkConfig(contents) 83 | 84 | 85 | def test_scale(basicClient, scaleConfig, formContentHeader, toResolve, 86 | baseDataLkupNm): 87 | 88 | pittsburgh 
= toResolve['Pittsburgh'] 89 | 90 | query = {'q0': {'query': pittsburgh}} 91 | queryjson = json.dumps(query) 92 | response = basicClient(scaleConfig).post('/reconcile', 93 | data=urlencode([('queries', 94 | queryjson)]), 95 | headers=formContentHeader) 96 | 97 | assert response.status_code == 200 98 | 99 | matchBatch = json.loads(response.data) 100 | 101 | assert query.keys() == matchBatch.keys() 102 | 103 | results = matchBatch['q0']['result'] 104 | 105 | score = {baseDataLkupNm[r['id']]: r['score'] for r in results} 106 | 107 | assert score['Chicago'] == 50 # Right at scale 108 | assert score['New York City'] > 50 # closer 109 | assert score['Miami'] < 50 # further 110 | 111 | 112 | @pytest.fixture 113 | def rangeConfig(mkConfig, baseDataLkupCoord, toResolve): 114 | 115 | contents = f''' 116 | THRESHOLD=0.0 117 | import logging 118 | LOGLEVEL=logging.DEBUG 119 | SCOREOPTIONS = {{ 120 | "COORDRANGE": 10.0 121 | }} 122 | ''' 123 | return mkConfig(contents) 124 | 125 | 126 | def test_range(basicClient, rangeConfig, formContentHeader, toResolve, 127 | baseDataLkupNm): 128 | 129 | pittsburgh = toResolve['Pittsburgh'] 130 | 131 | query = {'q0': {'query': pittsburgh}} 132 | queryjson = json.dumps(query) 133 | response = basicClient(rangeConfig).post('/reconcile', 134 | data=urlencode([('queries', 135 | queryjson)]), 136 | headers=formContentHeader) 137 | 138 | assert response.status_code == 200 139 | 140 | matchBatch = json.loads(response.data) 141 | 142 | assert query.keys() == matchBatch.keys() 143 | 144 | results = matchBatch['q0']['result'] 145 | 146 | # Only NYC and Chicago have longitude and latitude within 10 points 147 | assert len(results) == 2 148 | 149 | score = {baseDataLkupNm[r['id']]: r['score'] for r in results} 150 | 151 | assert 'Chicago' in score 152 | assert 'New York City' in score 153 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | 4 | def dummydoc(): 5 | if not os.path.exists("README.md"): 6 | print("Creating README.md ...") 7 | f = open("README.md", "w") 8 | f.close() 9 | --------------------------------------------------------------------------------
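A quick worked example of the scoring path, assuming the package is installed: build the sorted, de-duplicated bigram string for two words (mirroring makeBigrams, minus Unicode normalization and stopword handling) and score them with the pure-Python Dice implementation from csv_reconcile_dice/utils.py:

from csv_reconcile_dice.utils import getDiceCoefficient

def bigrams(word):
    # sorted set of bigrams joined into one string, the form the scorer stores
    return ''.join(sorted(set(word[i:i + 2] for i in range(len(word) - 1))))

# 'night' and 'nacht' share only the bigram 'ht': 2 * 1 / (4 + 4) = 0.25 -> 25.0
print(getDiceCoefficient(bigrams('night'), bigrams('nacht')))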