├── .github
│   ├── dependabot.yml
│   └── workflows
│       └── main.yml
├── .gitignore
├── CONTRIBUTING.org
├── LICENSE
├── README.org
├── build.py
├── csv_reconcile
│   ├── __init__.py
│   ├── __main__.py
│   ├── db.py
│   ├── default_settings.py
│   ├── extend.py
│   ├── initdb.py
│   ├── preview.py
│   ├── schema.sql
│   ├── score.py
│   └── scorer.py
├── csv_reconcile_dice
│   ├── __init__.py
│   ├── cutils.pyx
│   └── utils.py
├── noxfile.py
├── poetry.lock
├── pyproject.toml
├── pytest.ini
├── sample
│   ├── progressives.tsv
│   ├── reps.tsv
│   └── sample.cfg
└── tests
    ├── __init__.py
    ├── conftest.py
    ├── main
    │   └── test_csv_reconcile.py
    ├── plugins
    │   ├── __init__.py
    │   └── geo
    │       ├── conftest.py
    │       └── test_geo_reconcile.py
    └── utils.py
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 | 
 4 |   # Maintain pip dependencies
 5 |   - package-ecosystem: "pip"
 6 |     directory: "/"
 7 |     schedule:
 8 |       interval: "daily"
 9 |     target-branch: "develop"
10 |     labels:
11 |       - "pip dependencies"
12 | 
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
 1 | # Basic testing taken from [[https://github.com/marketplace/actions/install-poetry-action#mtesting]]
 2 | name: test
 3 | 
 4 | on: [pull_request, workflow_dispatch]
 5 | 
 6 | jobs:
 7 |   test:
 8 |     strategy:
 9 |       fail-fast: true
10 |       matrix:
11 |         os: [ "ubuntu-latest", "macos-latest" ]
12 |         python-version: [ "3.9", "3.10", "3.11", "3.12" ]
13 |     runs-on: ${{ matrix.os }}
14 |     steps:
15 |       #----------------------------------------------
16 |       #  check-out repo and set-up python
17 |       #----------------------------------------------
18 |       - name: Check out repository
19 |         uses: actions/checkout@v3
20 |       - name: Set up python ${{ matrix.python-version }}
21 |         uses: actions/setup-python@v3
22 |         with:
23 |           python-version: ${{ matrix.python-version }}
24 |       #----------------------------------------------
25 |       #  install poethepoet
26 |       #----------------------------------------------
27 |       - name: Install poethepoet
28 |         run: python -m pip install poethepoet
29 |       #----------------------------------------------
30 |       #  ----- install & configure poetry -----
31 |       #----------------------------------------------
32 |       - name: Install Poetry
33 |         uses: snok/install-poetry@v1
34 |         with:
35 |           virtualenvs-in-project: true
36 |       #----------------------------------------------
37 |       #  load cached venv if cache exists
38 |       #----------------------------------------------
39 |       - name: Load cached venv
40 |         id: cached-poetry-dependencies
41 |         uses: actions/cache@v3
42 |         with:
43 |           path: .venv
44 |           key: venv-${{ runner.os }}-${{ hashFiles('**/poetry.lock') }}
45 |       #----------------------------------------------
46 |       #  dummy doc
47 |       #----------------------------------------------
48 |       - name: Dummy doc
49 |         run: poe dummydoc
50 |       #----------------------------------------------
51 |       #  install dependencies if cache does not exist
52 |       #----------------------------------------------
53 |       - name: Install dependencies
54 |         if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
55 |         run: poetry install --no-interaction --no-root
56 |       #----------------------------------------------
57 |       #  install your root project, if required
58 |       #----------------------------------------------
59 |       - name: Install library
60 |         run: poetry install --no-interaction
61 |       #----------------------------------------------
62 |       #  install plugins (for tests)
#---------------------------------------------- 64 | - name: Install plugins 65 | run: poetry run python -m pip install csv-reconcile-geo 66 | #---------------------------------------------- 67 | # add matrix specifics and run test suite 68 | #---------------------------------------------- 69 | - name: Run tests 70 | run: | 71 | poetry run pytest tests/ 72 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # pytype static type analyzer 136 | .pytype/ 137 | 138 | # Cython debug symbols 139 | cython_debug/ 140 | -------------------------------------------------------------------------------- /CONTRIBUTING.org: -------------------------------------------------------------------------------- 1 | * Contributing code 2 | Pull requests are most definitely appreciated and encouraged. 
Please open an issue before
 3 | contributing and add the comment "Fixes #" to your commits to [[https://github.blog/2013-01-22-closing-issues-via-commit-messages/][automatically close
 4 | the issue.]]
 5 | 
 6 | Also, we've started using the [[https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow][git flow workflow]] for releases. If you're not familiar with it, it
 7 | basically amounts to pull requests being accepted only on the ~develop~ branch, or preferably on a
 8 | branch of ~develop~ named following the pattern ~feature/~. The ~git-flow~
 9 | extension can help with managing these branches but is not required. See the linked documentation
10 | for more information.
11 | 
12 | * Adding plugins
13 | Per the documentation, ~csv-reconcile~ automatically detects plugins that are installed as ~Python~
14 | packages so long as they register an ~entry-point~ in their ~setup.py~. Thus no changes to
15 | ~csv-reconcile~ should be necessary to make use of your plugin.
16 | 
17 | If you would like to share your plugin, however, it might be helpful to add it to this project's
18 | wiki.
19 | 
20 | * Suggestions for enhancement
21 | Suggestions are always welcome. Please open an issue so we can discuss feasibility and how the
22 | suggestion fits in with the overall plan for the project.
23 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 gitonthescene
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.org:
--------------------------------------------------------------------------------
 1 | #+OPTIONS: ^:nil
 2 | * CSV Reconcile
 3 | A [[https://github.com/reconciliation-api/specs][reconciliation service]] for [[https://openrefine.org/][OpenRefine]] based on a CSV file, similar to [[http://okfnlabs.org/reconcile-csv/][reconcile-csv]]. This one is written in Python and has some more configurability.
 4 | 
 5 | ** Quick start
 6 | - Clone this repository
 7 | - Run the service
 8 |   : $ python -m venv venv                # create virtualenv
 9 |   : $ venv/bin/pip install csv-reconcile # install package
10 |   : $ source venv/bin/activate           # activate virtual environment
11 |   : (venv) $ csv-reconcile init sample/reps.tsv item itemLabel # initialize the service
12 |   : (venv) $ csv-reconcile serve         # run the service
13 |   : (venv) $ deactivate                  # remove virtual environment
14 | 
15 | The service runs at http://127.0.0.1:5000/reconcile. You can point it at a different host:port by
16 | adding [[https://flask.palletsprojects.com/en/0.12.x/config/][SERVER_NAME]] to the sample.cfg. Since this is running from a virtualenv, you can simply
17 | delete the whole lot to clean up.
18 | 
19 | If you have a C compiler installed, you may prefer to install the sdist
20 | ~dist/csv-reconcile-0.1.0.tar.gz~, which will build a [[https://cython.readthedocs.io/en/latest/][Cython]] version of the computationally
21 | intensive fuzzy match routine for speed. With ~pip~ add the option ~--no-binary csv-reconcile~.
22 | 
23 | ** Poetry
24 | *** Prerequisites
25 | You'll need to have both [[https://python-poetry.org/docs/][poetry]] and [[https://pypi.org/project/poethepoet/0.0.3/][poethepoet]] installed. For publishing to [[https://pypi.org/][PyPI]], [[https://pandoc.org/][pandoc]] is required.
26 | 
27 | *** Running
28 | This is packaged with [[https://python-poetry.org/docs/][poetry]], so you can use those commands if you have it installed.
29 | : $ poe install
30 | : $ poetry run csv-reconcile init sample/reps.tsv item itemLabel
31 | : $ poetry run csv-reconcile serve
32 | 
33 | *** Building
34 | Because this package uses a ~README.org~ file and ~pip~ requires a ~README.md~, there are extra
35 | build steps beyond what ~poetry~ supplies. These are managed using [[https://pypi.org/project/poethepoet/0.0.3/][poethepoet]]. Thus building is
36 | done as follows:
37 | 
38 | : $ poe build
39 | 
40 | If you want to build a platform-agnostic wheel, you'll have to comment out the ~build =
41 | "build.py"~ line from ~pyproject.toml~ until ~poetry~ supports [[https://github.com/python-poetry/poetry/issues/3594][selecting the build platform]].
42 | 
43 | ** Description
44 | 
45 | This reconciliation service uses [[https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient][Dice coefficient scoring]] to reconcile values against a given column
46 | in a [[https://en.wikipedia.org/wiki/Comma-separated_values][CSV]] file. The CSV file must contain a column of distinct values to reconcile to.
47 | We'll call this the /id column/. We'll call the column being reconciled against the /name column/.
48 | 
49 | For performance reasons, the /name column/ is preprocessed to normalized values which are stored
50 | in an [[https://www.sqlite.org/index.html][sqlite]] database. This database must be initialized at least once by running the init
51 | sub-command. Once initialized, it need not be re-run for subsequent launches of the service.
52 | 
53 | Note that the service supplies all its data with a dummy /type/, so there is no reason to reconcile
54 | against any particular /type/.
55 | 
56 | In addition to reconciling against the /name column/, the service also functions as a [[https://reconciliation-api.github.io/specs/latest/#data-extension-service][data extension
57 | service]], which offers any of the other columns of the CSV file.
58 | 
59 | Note that Dice coefficient scoring is agnostic to word ordering.
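
To make this concrete, here is a small worked example of the scoring. It is a sketch using
the bigram helpers from the ~csv_reconcile_dice~ package in this repository (the pure
~Python~ scorer; the [[https://cython.readthedocs.io/en/latest/][Cython]] version is equivalent but operates on encoded bytes):

#+begin_src python
from csv_reconcile_dice import makeBigrams
from csv_reconcile_dice.utils import getDiceCoefficient

# makeBigrams() normalizes its input and returns a sorted, deduplicated
# string of bigrams.
left = makeBigrams('Lake View')    # bigrams: ' v','ak','e ','ew','ie','ke','la','vi'
right = makeBigrams('View Lake')   # bigrams: ' l','ak','ew','ie','ke','la','vi','w '

# Six of the eight bigrams are shared; only those spanning the word
# boundary differ, so the score is 2*6/(8+8) = 75%.
print(getDiceCoefficient(left, right))                     # 75.0
print(getDiceCoefficient(left, makeBigrams('lake view')))  # 100.0
#+end_src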
60 | 
61 | ** Usage
62 | 
63 | Basic usage involves two steps:
64 | - initialization
65 | - running the service
66 | 
67 | Initialization primes the database with data processed from the CSV file via the ~init~ subcommand.
68 | There are several options for running the service, as described below.
69 | 
70 | *** Initialization
71 | 
72 | Basic usage of the ~init~ sub-command requires passing the name of the CSV file, the /id column/
73 | and the /name column/.
74 | 
75 | : (venv) $ csv-reconcile --help
76 | : Usage: csv-reconcile [OPTIONS] COMMAND [ARGS]...
77 | : 
78 | : Options:
79 | :   --help  Show this message and exit.
80 | : 
81 | : Commands:
82 | :   init
83 | :   run
84 | :   serve
85 | : (venv) $ csv-reconcile init --help
86 | : Usage: csv-reconcile init [OPTIONS] CSVFILE IDCOL NAMECOL
87 | : 
88 | : Options:
89 | :   --config TEXT  config file
90 | :   --scorer TEXT  scoring plugin to use
91 | :   --help         Show this message and exit.
92 | : (venv) $ csv-reconcile serve --help
93 | : Usage: csv-reconcile serve [OPTIONS]
94 | : 
95 | : Options:
96 | :   --help  Show this message and exit.
97 | : (venv) $
98 | 
99 | The ~--config~ option is used to point to a configuration file. The file is a [[https://flask.palletsprojects.com/en/1.1.x/config/][Flask
100 | configuration]] and hence is Python code, though most configuration is simply setting variables to
101 | constant values.
102 | 
103 | *** Running the service
104 | The simplest way to run the service is to use Flask's built-in web server with the ~serve~
105 | subcommand, which takes no arguments. However, as mentioned in the [[https://flask.palletsprojects.com/en/2.0.x/deploying/][Flask documentation]], this
106 | server is not suitable for production purposes.
107 | 
108 | For a more hardened service, you can use one of the other deployment options mentioned in that
109 | documentation. For example, gunicorn can be run as follows:
110 | 
111 | : (venv) $ gunicorn -w 4 'csv_reconcile:create_app()'
112 | : [2021-11-16 17:40:20 +0900] [84625] [INFO] Starting gunicorn 20.1.0
113 | : [2021-11-16 17:40:20 +0900] [84625] [INFO] Listening at: http://127.0.0.1:8000 (84625)
114 | : [2021-11-16 17:40:20 +0900] [84625] [INFO] Using worker: sync
115 | : [2021-11-16 17:40:20 +0900] [84626] [INFO] Booting worker with pid: 84626
116 | : [2021-11-16 17:40:20 +0900] [84627] [INFO] Booting worker with pid: 84627
117 | : [2021-11-16 17:40:20 +0900] [84628] [INFO] Booting worker with pid: 84628
118 | : [2021-11-16 17:40:20 +0900] [84629] [INFO] Booting worker with pid: 84629
119 | : ...
120 | 
121 | One thing to watch out for is that the default manifest points the extension service to port
122 | 5000, the default port for the Flask built-in web server. If you want to use the extension
123 | service when deploying to a different port, you'll want to be sure to override that part of the
124 | manifest in your config file. You'll need something like the following:
125 | 
126 | : MANIFEST = {
127 | :     "extend": {
128 | :         "propose_properties": {
129 | :             "service_url": "http://localhost:8000",
130 | :             "service_path": "/properties"
131 | :         }
132 | :     }
133 | : }
134 | 
135 | Note also that the configuration is saved during the ~init~ step. If you change the config,
136 | you'll need to re-run that step. You may also need to delete and re-add the service in
137 | OpenRefine.
138 | 
139 | *** Deprecated
140 | The ~run~ subcommand mimics the old behavior, which combined the initialization step with the
141 | running of the service. It may be removed in a future release.
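
However you run it, you can sanity-check the service from the command line. A plain ~GET~
on the endpoint returns the service manifest, and a reconciliation query batch is posted
as a ~queries~ form field holding JSON. The query below assumes the sample ~reps.tsv~ data
from the quick start; the ids and scores in the response sketch are placeholders:

: $ curl http://127.0.0.1:5000/reconcile
: $ curl -d 'queries={"q0": {"query": "Abraham Lincoln", "limit": 3}}' \
:        http://127.0.0.1:5000/reconcile
: {"q0": {"result": [{"id": "...", "match": false, "name": "...", "score": ...}, ...]}}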
142 | 
143 | ** Common configuration
144 | - ~SERVER_NAME~ - The host and port the service is bound to.
145 |   e.g. ~SERVER_NAME=localhost:5555~. ( Default localhost:5000 )
146 | - ~CSVKWARGS~ - Arguments to pass to [[https://docs.python.org/3/library/csv.html][csv.reader]].
147 |   e.g. ~CSVKWARGS={'delimiter': ',', 'quotechar': '"'}~ for comma-delimited files using ~"~ as the quote character.
148 | - ~CSVENCODING~ - Encoding of the CSV file.
149 |   e.g. ~CSVENCODING="utf-8-sig"~ is the encoding used for data downloaded from [[https://www.usgs.gov/core-science-systems/ngp/board-on-geographic-names/download-gnis-data][GNIS]].
150 | - ~SCOREOPTIONS~ - Options passed to the scoring plugin during normalization.
151 |   e.g. ~SCOREOPTIONS={'stopwords':['lake','reservoir']}~
152 | - ~LIMIT~ - The maximum number of reconciliation candidates returned per entry. ( Default 10 )
153 |   e.g. ~LIMIT=10~
154 | - ~THRESHOLD~ - The minimum score for returned reconciliation candidates. ( Default 30.0 )
155 |   e.g. ~THRESHOLD=80.5~
156 | - ~DATABASE~ - The name of the generated sqlite database containing pre-processed values. (Default ~csvreconcile.db~)
157 |   e.g. ~DATABASE='lakes.db'~ You may want to change the database name if you regularly switch between datasets.
158 | - ~MANIFEST~ - Overrides for the service manifest.
159 |   e.g. ~MANIFEST={"name": "My service"}~ sets the name of the service to "My service".
160 | 
161 | This last option is the most interesting. If your data is coming from [[https://www.wikidata.org][Wikidata]] and your /id column/
162 | contains [[https://www.wikidata.org/wiki/Help:Items][Q values]], then a manifest like the following will allow your links to be clickable inside OpenRefine.
163 | 
164 | #+begin_src python
165 | MANIFEST = {
166 |     "identifierSpace": "http://www.wikidata.org/entity/",
167 |     "schemaSpace": "http://www.wikidata.org/prop/direct/",
168 |     "view": {"url":"https://www.wikidata.org/wiki/{{id}}"},
169 |     "name": "My reconciliation service"
170 | }
171 | #+end_src
172 | 
173 | If your CSV is made up of data taken from another [[https://reconciliation-api.github.io/testbench/][reconciliation service]], you may similarly copy
174 | parts of their manifest to make use of their features, such as the [[https://reconciliation-api.github.io/specs/latest/#preview-service][preview service]]. See the
175 | reconciliation spec for details.
176 | 
177 | ** Built-in preview service
178 | There is a preview service built into the tool. (Thanks [[https://github.com/b2m][b2m]]!) You can turn it on by adding the
179 | following to your manifest:
180 | 
181 | #+begin_src python
182 | "preview": {
183 |     "url": "http://localhost:5000/preview/{{id}}",
184 |     "width": 400,
185 |     "height": 300
186 | }
187 | #+end_src
188 | 
189 | Note that if you reconcile against a service with a preview service enabled, a link to the
190 | service becomes part of the project. Thus if you bring the service down, your project will have
191 | hover-over pop-ups pointing to an unavailable service. One way around this is to copy the
192 | ~recon.match.id~ to a new column, which can be re-reconciled to the column by id if you bring the
193 | service back up again, whether or not you have the preview service enabled. (Perhaps OpenRefine
194 | could be smarter and enable these pop-ups only when the service is active.)
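
Pulling these options together, a config file is just a short piece of ~Python~. Something
like the following (illustrative values only, reusing the examples above) could be passed
via ~csv-reconcile init --config myconfig.cfg~:

#+begin_src python
SERVER_NAME = 'localhost:5555'
CSVKWARGS = {'delimiter': ',', 'quotechar': '"'}
CSVENCODING = 'utf-8-sig'
SCOREOPTIONS = {'stopwords': ['lake', 'reservoir']}
LIMIT = 10
THRESHOLD = 80.5
DATABASE = 'lakes.db'
MANIFEST = {
    "name": "Lakes reconciliation service",
    "view": {"url": "https://www.wikidata.org/wiki/{{id}}"},
    "preview": {
        "url": "http://localhost:5555/preview/{{id}}",
        "width": 400,
        "height": 300
    }
}
#+end_src

Remember that the configuration is copied during ~init~, so re-run that step after editing it.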
195 | 
196 | ** Scoring plugins
197 | As mentioned above, the default scoring method is [[https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient][Dice coefficient scoring]], but this method
198 | can be overridden by implementing a ~csv_reconcile.scorers~ plugin.
199 | 
200 | *** Implementing
201 | A plugin module may override any of the methods in the ~csv_reconcile.scorer~ module by simply
202 | implementing a method of the same name with the decorator ~@csv_reconcile.scorer.register~.
203 | 
204 | See ~csv_reconcile_dice~ for how Dice coefficient scoring is implemented.
205 | 
206 | The basic hooks are as follows:
207 | - ~normalizeWord(word, **scoreOptions)~ preprocesses values to be reconciled to produce a tuple
208 |   used in fuzzy match scoring. The value of ~SCOREOPTIONS~ in the configuration will be passed
209 |   in to allow configuration of this preprocessing. This hook is required.
210 | - ~normalizeRow(word, row, **scoreOptions)~ preprocesses values to be reconciled against to
211 |   produce a tuple used in fuzzy match scoring. Note that both the reconciled column and the
212 |   entire row are available for calculating the normalized value and that the column reconciled
213 |   against is required even when not used. The value of ~SCOREOPTIONS~ in the configuration will
214 |   be passed in to allow configuration of this preprocessing. This defaults to calling
215 |   ~normalizeWord(word, **scoreOptions)~.
216 | - ~getNormalizedFields()~ returns a tuple of names for the columns produced by ~normalizeWord()~
217 |   and ~normalizeRow()~. The length of their return values must match the length of this tuple.
218 |   This hook is required.
219 | - ~processScoreOptions(options)~ is passed the value of ~SCOREOPTIONS~ to allow it to be adjusted
220 |   prior to being used. This can be used for adding defaults and/or validating the configuration.
221 |   This hook is optional.
222 | - ~scoreMatch(left, right, **scoreOptions)~ gets passed two normalized tuples. The ~left~ value
223 |   comes from ~normalizeWord()~ and is the value being reconciled; the ~right~ value comes from
224 |   ~normalizeRow()~ and is the value being reconciled against. The value of ~SCOREOPTIONS~ in the
225 |   configuration will be passed in to allow configuration of the scoring. Returning a score of
226 |   ~None~ will not add the tested value as a candidate. This hook is required.
227 | - ~valid(normalizedFields)~ is passed the normalized tuple prior to being scored to make sure
228 |   it's appropriate for the calculation. This hook is optional.
229 | - ~features(word, row, **scoreOptions)~ calculates [[https://reconciliation-api.github.io/specs/latest/#reconciliation-query-responses][features]] using the query string and the
230 |   normalized row. By default calculating features is disabled. Implementations of this hook are
231 |   automatically enabled. This hook is optional.
232 | (A minimal sketch of a plugin implementing these hooks appears at the end of this README.)
233 | *** Installing
234 | Hooks are automatically discovered as long as they provide a ~csv_reconcile.scorers~ [[https://setuptools.readthedocs.io/en/latest/userguide/entry_point.html][setuptools
235 | entry point]]. Poetry supplies a [[https://python-poetry.org/docs/pyproject/#plugins][plugins]] configuration which wraps the setuptools functionality.
236 | 
237 | The default Dice coefficient scoring is supplied via the following snippet from the
238 | ~pyproject.toml~ file.
239 | 
240 | : [tool.poetry.plugins."csv_reconcile.scorers"]
241 | : "dice" = "csv_reconcile_dice"
242 | 
243 | Here ~dice~ becomes the name of the scoring option and ~csv_reconcile_dice~ is the package
244 | implementing the plugin.
245 | 
246 | *** Using
247 | If there is only one scoring plugin available, that plugin is used. If more than one is
248 | available, you will be prompted to pass the ~--scorer~ option to select among the scoring options.
249 | 
250 | *** Known plugins
251 | See the [[https://github.com/gitonthescene/csv-reconcile/wiki][wiki]] for a list of known plugins.
252 | 
253 | ** Testing
254 | Though I long for the old days when a unit test was a unit test, these days things are a bit more
255 | complicated with various versions of ~Python~ and installation of plugins to manage. Now we have
256 | to wrestle with [[https://docs.python.org/3/tutorial/venv.html][virtual environments]]. ~poetry~ handles the virtual environment for developing,
257 | but testing involves covering more options.
258 | 
259 | *** Tests layout
260 | The tests directory structure is the following:
261 | 
262 | : tests
263 | :   main
264 | :   plugins
265 | :     geo
266 | 
267 | Tests for the main package are found under ~main~ and don't require installing any other
268 | packages, whereas tests under ~plugins~ require the installation of the given plugin.
269 | 
270 | *** Running tests
271 | **** Basic tests
272 | These tests are written with [[https://docs.pytest.org/en/6.2.x/contents.html][pytest]] and can be run through ~poetry~ as follows:
273 | 
274 | : $ poetry run pytest
275 | 
276 | To avoid the complications that come from installing plugins, there is a ~poe~ script for
277 | running only the tests under ~main~, which can be invoked as follows:
278 | 
279 | : $ poe test
280 | 
281 | For steady-state development this is probably the command you'll use most often.
282 | 
283 | **** Build matrices
284 | The GitHub Actions for this project currently use a [[https://docs.github.com/en/actions/learn-github-actions/managing-complex-workflows#using-a-build-matrix][build matrix]] across a couple of
285 | architectures and several versions of ~Python~, but a similar effect can be achieved using [[https://nox.thea.codes/en/stable/tutorial.html][nox]].
286 | 
287 | ~nox~ manages the creation of various virtual environments in what it calls "sessions", from
288 | which various commands can be run. This project's ~noxfile.py~ manages the installation of the
289 | ~csv-reconcile-geo~ plugin for the plugin tests as well as running across several versions of
290 | ~Python~. See the ~nox~ documentation for details.
291 | 
292 | Some versions of this command you're likely to run are as follows:
293 | 
294 | : $ nox                      # Run all the tests building virtual environments from scratch
295 | : $ nox -r                   # Reuse previously built virtual environments for speed
296 | : $ nox -s test_geo          # Run only the tests for the csv-reconcile-geo plugin
297 | : $ nox -s test_main -p 3.8  # Run only the main tests with Python 3.8
298 | 
299 | Eventually, the GitHub Actions may be changed to use [[https://github.com/marketplace/actions/setup-nox][setup-nox]].
300 | 
301 | ** Future enhancements
302 | 
303 | It would be nice to add support for using [[https://reconciliation-api.github.io/specs/latest/#structure-of-a-reconciliation-query][properties]] as part of the scoring, so that more than
304 | one column of the CSV could be taken into consideration.
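
As a concrete illustration of the hooks described under "Scoring plugins", here is a
minimal sketch of an exact-match scorer. The package name ~csv_reconcile_exact~ and its
contents are hypothetical; only the hook names, the ~@scorer.register~ decorator and the
~csv_reconcile.scorers~ entry-point group come from this project:

#+begin_src python
# csv_reconcile_exact/__init__.py (hypothetical plugin package)
from csv_reconcile import scorer


@scorer.register
def getNormalizedFields():
    # One column in the reconcile table per element of the normalized tuple
    return ('folded',)


@scorer.register
def normalizeWord(word, **scoreOptions):
    # Must return a tuple matching getNormalizedFields() in length;
    # normalizeRow() defaults to this same normalization
    return (word.strip().lower(),)


@scorer.register
def scoreMatch(left, right, **scoreOptions):
    # left comes from normalizeWord(), right from normalizeRow();
    # returning None drops the row as a candidate altogether
    return 100.0 if left[0] == right[0] else None
#+end_src

Such a plugin would be registered in its own ~pyproject.toml~ in the same way as ~dice~:

: [tool.poetry.plugins."csv_reconcile.scorers"]
: "exact" = "csv_reconcile_exact"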
305 | -------------------------------------------------------------------------------- /build.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # See if Cython is installed 4 | try: 5 | from Cython.Build import cythonize 6 | # Do nothing if Cython is not available 7 | except ImportError: 8 | # Got to provide this function. Otherwise, poetry will fail 9 | def build(setup_kwargs): 10 | pass 11 | 12 | 13 | # Cython is installed. Compile 14 | else: 15 | from setuptools import Extension 16 | from setuptools.dist import Distribution 17 | from distutils.command.build_ext import build_ext 18 | 19 | # This function will be executed in setup.py: 20 | def build(setup_kwargs): 21 | # The file you want to compile 22 | extensions = ["csv_reconcile_dice/cutils.pyx"] 23 | 24 | # gcc arguments hack: enable optimizations 25 | os.environ['CFLAGS'] = '-O3' 26 | 27 | # Build 28 | setup_kwargs.update({ 29 | 'ext_modules': 30 | cythonize( 31 | extensions, 32 | language_level=3, 33 | compiler_directives={'linetrace': True}, 34 | ), 35 | 'cmdclass': { 36 | 'build_ext': build_ext 37 | } 38 | }) 39 | -------------------------------------------------------------------------------- /csv_reconcile/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os.path 3 | from pathlib import Path 4 | import sys 5 | import time 6 | import shutil 7 | from contextlib import contextmanager 8 | 9 | import click 10 | from flask import abort, Flask, jsonify, request 11 | from flask_cors import cross_origin 12 | from markupsafe import escape 13 | 14 | from . import default_settings, initdb, scorer 15 | from .db import get_db, getCSVCols 16 | from .extend import processDataExtensionBatch 17 | from .preview import getEntity 18 | from .score import processQueryBatch 19 | 20 | try: 21 | import importlib_metadata as metadata 22 | except: 23 | from importlib import metadata 24 | 25 | __version__ = '0.3.2' 26 | #------------------------------------------------------------------ 27 | # Implement reconciliation API 28 | # [[https://reconciliation-api.github.io/specs/latest/]] 29 | #------------------------------------------------------------------ 30 | 31 | 32 | @contextmanager 33 | def Timer(): 34 | t = time.perf_counter() 35 | print("start timer", flush=True) 36 | yield 37 | elapsed = time.perf_counter() - t 38 | print("Elapsed: %s" % (elapsed,)) 39 | 40 | 41 | # Default manifest. 
Can be overridden/updated in configuration
 42 | MANIFEST = {
 43 |     "versions": ["0.1"],
 44 |     "name": "CSV Reconcile",
 45 |     "identifierSpace": "http://localhost/csv_reconcile/ids",
 46 |     "schemaSpace": "http://localhost/csv_reconcile/schema",
 47 |     "extend": {
 48 |         "propose_properties": {
 49 |             "service_url": "http://localhost:5000",
 50 |             "service_path": "/properties"
 51 |         }
 52 |     }
 53 | }
 54 | 
 55 | 
 56 | def create_app(config=None, instance_path=None, scorerOption=None):
 57 |     app = Flask("csv-reconcile", instance_path=instance_path)
 58 | 
 59 |     instance_path = Path(app.instance_path)
 60 | 
 61 |     try:
 62 |         os.makedirs(instance_path)
 63 |     except OSError:
 64 |         pass
 65 | 
 66 |     scorerfile = instance_path / 'scorer.txt'
 67 | 
 68 |     # clean up old files if they exist
 69 |     # "" indicates called from doinit()
 70 |     if scorerOption == "" and scorerfile.is_file():
 71 |         scorerfile.unlink()
 72 |     elif scorerOption:
 73 |         with open(scorerfile, 'w') as f:
 74 |             f.write(scorerOption)
 75 | 
 76 |     scorerOption = None
 77 |     if scorerfile.is_file():
 78 |         with open(scorerfile) as f:
 79 |             scorerOption = f.read()
 80 | 
 81 |     if pickScorer(scorerOption) is None:
 82 |         return None
 83 | 
 84 |     # possibly better to roll THRESHOLD and LIMIT into one config called LIMITS
 85 |     app.config.from_object(default_settings)
 86 | 
 87 |     cfgfile = instance_path / "reconcile.config"
 88 | 
 89 |     # clean up old configs if they exist
 90 |     # "" indicates called from doinit()
 91 |     if config == "" and cfgfile.is_file():
 92 |         cfgfile.unlink()
 93 |     elif config:
 94 |         shutil.copyfile(config, cfgfile)
 95 | 
 96 |     if cfgfile.is_file():
 97 |         app.config.from_pyfile(cfgfile)
 98 | 
 99 |     scoreOptions = app.config['SCOREOPTIONS']
100 |     scorer.processScoreOptions(scoreOptions)
101 | 
102 |     if 'MANIFEST' in app.config:
103 |         MANIFEST.update(app.config['MANIFEST'])
104 | 
105 |     loglevel = app.config['LOGLEVEL']
106 |     if loglevel:
107 |         app.logger.setLevel(loglevel)
108 | 
109 |     @app.before_request
110 |     def before():
111 |         app.logger.debug(request.method)
112 |         app.logger.debug(request.headers)
113 | 
114 |     @app.after_request
115 |     def after(response):
116 |         app.logger.debug(response.headers)
117 |         return response
118 | 
119 |     @app.route('/reconcile', methods=['POST', 'GET'])
120 |     @cross_origin()
121 |     def acceptQuery():
122 |         threshold = app.config.get('THRESHOLD', None)
123 |         limit = app.config.get('LIMIT', None)
124 |         scoreOptions = app.config['SCOREOPTIONS']
125 |         queries = request.form.get('queries')
126 |         extend = request.form.get('extend')
127 |         if queries:
128 |             db = get_db()
129 | 
130 |             queryBatch = json.loads(queries)
131 | 
132 |             app.logger.info(queryBatch)
133 |             with Timer():
134 |                 ret = processQueryBatch(db,
135 |                                         queryBatch,
136 |                                         limit=limit,
137 |                                         threshold=threshold,
138 |                                         **scoreOptions)
139 |             app.logger.info(ret)
140 |             return ret
141 |         elif extend:
142 |             extendBatch = json.loads(extend)
143 | 
144 |             app.logger.info(extendBatch)
145 |             with Timer():
146 |                 ret = processDataExtensionBatch(extendBatch)
147 |             app.logger.info(ret)
148 |             return ret
149 |         else:
150 |             return MANIFEST
151 | 
152 |     # FIX FIX FIX... Not needed in OpenRefine 3.5
153 |     # [[https://github.com/OpenRefine/OpenRefine/issues/3672]]
154 |     def jsonpify(obj):
155 |         """
156 |         Like jsonify but wraps result in a JSONP callback if a 'callback'
157 |         query param is supplied.
158 |         """
159 |         try:
160 |             callback = request.args['callback']
161 |             response = app.make_response("%s(%s)" % (callback, json.dumps(obj)))
162 |             response.mimetype = "text/javascript"
163 |             return response
164 |         except KeyError:
165 |             return jsonify(obj)
166 | 
167 |     @app.route('/properties', methods=['POST', 'GET'])
168 |     @cross_origin()
169 |     def acceptPropertyRequest():
170 |         # query string arg
171 |         propType = request.args.get('type')
172 | 
173 |         # Type is irrelevant; return all columns
174 |         if propType != None:
175 |             cols = getCSVCols()
176 |             ret = dict(properties=[{
177 |                 'id': colname,
178 |                 'name': name
179 |             } for colname, name in cols])
180 |             return jsonpify(ret)
181 | 
182 |         # unprocessable request
183 | 
184 |     @app.route('/preview/<entity_id>')
185 |     @cross_origin()
186 |     def preview_service(entity_id=None):
187 |         if not entity_id:
188 |             abort(404)
189 |         entity = getEntity(entity_id)
190 |         if not entity:
191 |             abort(404)
192 |         entity_html = "".join([f"<dt>{escape(key)}</dt><dd>{escape(val)}</dd>"
193 |                                for key, val in entity.items()])
194 |         return f"""
195 | <html>
196 | <head>
197 | <meta charset="utf-8">
198 | <title>Preview for {escape(entity_id)}</title>
199 | <style>
208 | </style>
209 | </head>
210 | <body><dl>{entity_html}</dl></body></html>
211 | 212 | """ 213 | 214 | return app 215 | 216 | 217 | def pickScorer(plugin): 218 | eps = metadata.entry_points().select(group='csv_reconcile.scorers') 219 | entrypoint = None 220 | if len(eps) == 0: 221 | raise RuntimeError("Please install a \"csv_reconcile.scorers\" plugin") 222 | elif plugin: 223 | for ep in eps: 224 | if ep.name == plugin: 225 | entrypoint = ep 226 | break 227 | else: 228 | raise RuntimeError( 229 | "Please install %s \"csv_reconcile.scorers\" plugin" % 230 | (plugin,)) 231 | elif len(eps) == 1: 232 | entrypoint = next(iter(eps)) 233 | 234 | if entrypoint is None: 235 | # print out options 236 | print( 237 | "There are several scorers available. Please choose one of the following with the --scorer option." 238 | ) 239 | for ep in eps: 240 | print(" %s" % (ep.name,)) 241 | return None 242 | 243 | entrypoint.load() 244 | return entrypoint 245 | 246 | 247 | @click.group() 248 | def cli(): 249 | pass 250 | 251 | 252 | def doinit(config, scorerOption, csvfile, idcol, namecol): 253 | 254 | app = create_app(config or "", scorerOption=scorerOption or "") 255 | if app is None: 256 | return 257 | 258 | with app.app_context(): 259 | initdb.init_db_with_context(csvfile, idcol, namecol) 260 | click.echo('Initialized the database.') 261 | return app 262 | 263 | 264 | @cli.command() 265 | @click.option('--config', help='config file') 266 | @click.option('--scorer', 'scorerOption', help='scoring plugin to use') 267 | @click.argument('csvfile') 268 | @click.argument('idcol') 269 | @click.argument('namecol') 270 | def init(config, scorerOption, csvfile, idcol, namecol): 271 | return doinit(config, scorerOption, csvfile, idcol, namecol) 272 | 273 | @cli.command() 274 | @click.option('--config', help='config file') 275 | @click.option('--scorer', 'scorerOption', help='scoring plugin to use') 276 | @click.option('--init-db', is_flag=True, help='initialize the db') 277 | @click.argument('csvfile') 278 | @click.argument('idcol') 279 | @click.argument('namecol') 280 | def run(config, scorerOption, init_db, csvfile, idcol, namecol): 281 | print(''' 282 | ######################################################### 283 | ## WARNING: The interface is deprecated ## 284 | ######################################################### 285 | 286 | Please run init once to initialize the database and serve to run the server. 287 | See --help for details. 288 | ''') 289 | 290 | app = None 291 | if init_db: 292 | app = doinit(config, scorerOption, csvfile, idcol, namecol) 293 | 294 | app = app or create_app(config) 295 | from werkzeug.serving import WSGIRequestHandler 296 | WSGIRequestHandler.protocol_version = "HTTP/1.1" 297 | app.run(debug=False) 298 | 299 | 300 | @cli.command() 301 | def serve(): 302 | 303 | # Config should have been copied during the init phase 304 | app = create_app() 305 | from werkzeug.serving import WSGIRequestHandler 306 | WSGIRequestHandler.protocol_version = "HTTP/1.1" 307 | app.run(debug=False) 308 | 309 | 310 | def main(): 311 | nonopts = [a for a in sys.argv if not a.startswith('--')] 312 | 313 | if len(nonopts) > 1 and nonopts[1] not in 'run init serve': 314 | print(''' 315 | ######################################################### 316 | ## WARNING: The interface has changed slightly. ## 317 | ######################################################### 318 | Please use one of the subcommands. See --help for details. 
319 | 320 | ''') 321 | return cli() 322 | -------------------------------------------------------------------------------- /csv_reconcile/__main__.py: -------------------------------------------------------------------------------- 1 | from . import main 2 | import sys 3 | 4 | main() 5 | -------------------------------------------------------------------------------- /csv_reconcile/db.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import sqlite3 3 | 4 | from flask import current_app, g 5 | from normality import slugify 6 | 7 | 8 | def normalizeDBcol(col): 9 | return slugify(col).replace('-', '_') 10 | 11 | 12 | def getCSVCols(): 13 | cur = get_db().cursor() 14 | cur.execute("SELECT * FROM datacols") 15 | return [(row['colname'], row['name']) for row in cur] 16 | 17 | 18 | def getIDCol(): 19 | cur = get_db().cursor() 20 | 21 | cur.execute("SELECT colname FROM datacols WHERE isid == 1") 22 | res = cur.fetchall() 23 | if len(res) != 1: 24 | raise RuntimeError("database not properly initialized") 25 | return res[0]['colname'] 26 | 27 | 28 | def get_db(): 29 | if 'db' not in g: 30 | g.db = sqlite3.connect(os.path.join(current_app.instance_path, 31 | current_app.config['DATABASE']), 32 | detect_types=sqlite3.PARSE_DECLTYPES) 33 | g.db.row_factory = sqlite3.Row 34 | 35 | return g.db 36 | 37 | 38 | def close_db(e=None): 39 | db = g.pop('db', None) 40 | 41 | if db is not None: 42 | db.close() 43 | -------------------------------------------------------------------------------- /csv_reconcile/default_settings.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | DATABASE = 'csvreconcile.db' 4 | 5 | LIMIT = 10 # At most 10 matches per query 6 | 7 | THRESHOLD = 30.0 # At least a 30% match 8 | 9 | LOGLEVEL = logging.NOTSET 10 | 11 | SCOREOPTIONS = {} 12 | -------------------------------------------------------------------------------- /csv_reconcile/extend.py: -------------------------------------------------------------------------------- 1 | from .db import get_db, getIDCol, getCSVCols 2 | 3 | def processDataExtensionBatch(batch): 4 | ids, props = tuple(batch[x] for x in ('ids', 'properties')) 5 | names = {p['id'] for p in props} 6 | cols = {colnm: nm for colnm, nm in getCSVCols() if colnm in names} 7 | idcol = getIDCol() 8 | 9 | 10 | db = get_db() 11 | cur = db.cursor() 12 | # Could use some defensiveness in generating this SQL 13 | sql = "SELECT %s,%s FROM data WHERE %s in (%s)" % (idcol, ','.join(cols.keys()), idcol, ','.join('?' * len(ids))) 14 | cur.execute(sql, ids) 15 | rows = dict() 16 | for row in cur: 17 | rows[row[idcol]] = {col: [{'str': row[col]}] for col in cols} 18 | 19 | meta = [dict(id=p['id'], name=cols[p['id']]) for p in props] 20 | 21 | return dict(meta=meta, rows=rows) 22 | -------------------------------------------------------------------------------- /csv_reconcile/initdb.py: -------------------------------------------------------------------------------- 1 | from flask import current_app 2 | import csv 3 | from chardet.universaldetector import UniversalDetector 4 | 5 | from collections import defaultdict 6 | from itertools import count 7 | 8 | from .db import get_db, normalizeDBcol 9 | 10 | from importlib.resources import read_text 11 | import csv_reconcile 12 | from . 
import scorer 13 | 14 | 15 | def initDataTable(db, colnames, idcol): 16 | cols = [] 17 | cnts = defaultdict(count) 18 | for col in colnames: 19 | slug = normalizeDBcol(col) 20 | slug = f'{slug}{next(cnts[slug])}' 21 | if col == idcol: 22 | cols.append('%s TEXT PRIMARY KEY' % (slug,)) 23 | else: 24 | cols.append('%s TEXT NOT NULL' % (slug,)) 25 | 26 | db.execute('INSERT INTO datacols VALUES (?,?,?)', 27 | (col, slug, 1 if col == idcol else 0)) 28 | 29 | # create data table with the contents of the csv file 30 | createSQL = 'CREATE TABLE data (\n %s\n)' 31 | db.execute(createSQL % (',\n '.join(cols),)) 32 | 33 | 34 | def initReconcileTable(db, colnames): 35 | create = [ 36 | 'CREATE TABLE reconcile (\n id TEXT PRIMARY KEY,\n word TEXT NOT NULL' 37 | ] 38 | for col in colnames: 39 | create.append('%s TEXT NOT NULL' % (col,)) 40 | 41 | # create data table with the contents of the csv file 42 | db.execute(',\n '.join(create) + '\n)') 43 | 44 | def detectEncoding(filenm): 45 | detector = UniversalDetector() 46 | for line in open(filenm, 'rb'): 47 | detector.feed(line) 48 | if detector.done: break 49 | detector.close() 50 | if detector.result['confidence'] > .95: 51 | return detector.result['encoding'] 52 | return None 53 | 54 | def init_db(db, 55 | csvfilenm, 56 | idcol, 57 | searchcol, 58 | csvencoding=None, 59 | scoreOptions=None, 60 | csvkwargs=None): 61 | 62 | enckwarg = dict() 63 | csvencoding = csvencoding or detectEncoding(csvfilenm) 64 | 65 | if csvencoding: 66 | enckwarg['encoding'] = csvencoding 67 | 68 | schema = read_text(csv_reconcile, 'schema.sql') 69 | db.executescript(schema) 70 | 71 | csvkwargs = {} if csvkwargs is None else csvkwargs 72 | 73 | with db: 74 | # Create a table with ids (as PRIMARY ID), words and bigrams 75 | with open(csvfilenm, newline='', **enckwarg) as csvfile: 76 | dialect = None 77 | try: 78 | dialect = csv.Sniffer().sniff(csvfile.read(1024)) 79 | except: 80 | pass 81 | 82 | csvfile.seek(0) 83 | reader = csv.reader(csvfile, dialect=dialect, **csvkwargs) 84 | header = next(reader) 85 | 86 | # Throws if col doesn't exist 87 | searchidx = header.index(searchcol) 88 | ididx = header.index(idcol) 89 | 90 | normalizedFields = scorer.getNormalizedFields() 91 | initDataTable(db, header, idcol) 92 | initReconcileTable(db, normalizedFields) 93 | 94 | datavals = ','.join('?' * len(header)) 95 | 96 | for row in reader: 97 | if len(row) != len(header): continue 98 | mid = row[ididx] 99 | word = row[searchidx] 100 | matchFields = scorer.normalizeRow(word, row, **scoreOptions) 101 | db.execute( 102 | "INSERT INTO reconcile VALUES (%s)" % 103 | (','.join('?' 
* (2 + len(normalizedFields))),), 104 | (mid, word) + tuple(matchFields)) 105 | 106 | db.execute("INSERT INTO data VALUES (%s)" % (datavals), row) 107 | 108 | 109 | def init_db_with_context(csvfilenm, idcol, searchcol): 110 | db = get_db() 111 | csvkwargs = current_app.config.get('CSVKWARGS', {}) 112 | scoreOptions = current_app.config['SCOREOPTIONS'] 113 | csvencoding = current_app.config.get('CSVENCODING', None) 114 | 115 | return init_db(db, 116 | csvfilenm, 117 | idcol, 118 | searchcol, 119 | csvencoding=csvencoding, 120 | csvkwargs=csvkwargs, 121 | scoreOptions=scoreOptions) 122 | -------------------------------------------------------------------------------- /csv_reconcile/preview.py: -------------------------------------------------------------------------------- 1 | from .db import get_db, getCSVCols, getIDCol 2 | 3 | 4 | def getEntity(entity_id): 5 | id_col = getIDCol() 6 | cols = dict(getCSVCols()) 7 | 8 | cur = get_db().cursor() 9 | cur.execute(f"SELECT * FROM data WHERE {id_col}=? LIMIT 1", (entity_id,)) 10 | row = cur.fetchone() 11 | if not row: 12 | return None 13 | return {cols[col]: value for col, value in zip(row.keys(), row)} 14 | -------------------------------------------------------------------------------- /csv_reconcile/schema.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS reconcile; 2 | DROP TABLE IF EXISTS data; 3 | DROP TABLE IF EXISTS datacols; 4 | 5 | CREATE TABLE datacols ( 6 | name TEXT PRIMARY KEY, 7 | colname TEXT NOT NULL, 8 | isid INT NOT NULL 9 | ); 10 | -------------------------------------------------------------------------------- /csv_reconcile/score.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from . 
import scorer 3 | 4 | 5 | def reconcileStrings(db, items, **kwargs): 6 | # Use index as query id 7 | batch = dict(enumerate({'query': s} for s in items)) 8 | 9 | ret = processQueryBatch(db, batch, **kwargs) 10 | 11 | # Return as a list of pairs of query matched to result 12 | return list(zip(items, (res for _, res in sorted(ret.items())))) 13 | 14 | 15 | def processQueryBatch(db, batch, limit=None, threshold=0.0, **scoreOptions): 16 | ''' 17 | Go through db looking for words whose fuzzy match score positively 18 | ''' 19 | hasFeatures = not getattr(scorer.features, "disabled", False) 20 | 21 | toMatchItems = dict() 22 | for qid, req in batch.items(): 23 | queryStr = req['query'] 24 | toMatchItems[qid] = scorer.normalizeWord(queryStr, ** 25 | scoreOptions) or queryStr 26 | 27 | # Better to pull these off an sqlite store 28 | 29 | cur = db.cursor() 30 | normalizedFields = scorer.getNormalizedFields() 31 | 32 | cur.execute('SELECT %s FROM reconcile' % 33 | (','.join(('word', 'id') + tuple(normalizedFields)))) 34 | 35 | picks = defaultdict(list) 36 | for row in cur: 37 | compareTo = row[2:] if normalizedFields else row['word'] 38 | if not scorer.valid(compareTo): 39 | continue 40 | 41 | for qid in batch.keys(): 42 | toMatch = toMatchItems[qid] 43 | 44 | score = scorer.scoreMatch(toMatch, compareTo, **scoreOptions) 45 | if score is not None and score > threshold: 46 | picks[qid].append((row, score)) 47 | 48 | ret = dict() 49 | for qid in batch: 50 | pick = picks[qid] 51 | lmt = batch[qid].get('limit', limit) 52 | queryStr = batch[qid]['query'] 53 | 54 | res = [] 55 | exacts = [] 56 | cnt = 0 57 | for row, score in sorted(pick, key=lambda x: -x[1]): 58 | cnt += 1 59 | if lmt and cnt > lmt: 60 | break 61 | 62 | res.append( 63 | dict(id=row['id'], name=row['word'], score=score, match=False)) 64 | 65 | if hasFeatures: 66 | features = scorer.features(queryStr, row) 67 | if features is not None: 68 | res[-1]['features'] = features 69 | 70 | if res[-1]['name'] == queryStr: 71 | exacts.append(res[-1]) 72 | 73 | # Make match if only one 74 | if len(res) == 1: 75 | res[0]['match'] = True 76 | else: 77 | if len(exacts) == 1: 78 | exacts[0]['match'] = True 79 | 80 | # Maybe match if there is a wide gap in score between first match and second? 81 | ret[qid] = dict(result=res) 82 | 83 | return ret 84 | -------------------------------------------------------------------------------- /csv_reconcile/scorer.py: -------------------------------------------------------------------------------- 1 | def register(func): 2 | ''' 3 | Decorator for replacing functions in this module 4 | ''' 5 | glbls = globals() 6 | glbls[func.__name__] = func 7 | return func 8 | 9 | 10 | def getNormalizedFields(): 11 | '''List of fields generated from reconciled column for the match calculation''' 12 | raise RuntimeError('getNormalizedFields() -> tuple must be implemented') 13 | 14 | 15 | def processScoreOptions(options): 16 | '''Optionally modify configuration options passed in''' 17 | 18 | 19 | def scoreMatch(left, right, **scoreOptions): 20 | '''Score fuzzy match score between left and right''' 21 | raise RuntimeError('scoreMatch(left,right) -> float must be implemented') 22 | 23 | 24 | def normalizeWord(word, **scoreOptions): 25 | ''' 26 | Preprocess column being reconciled for the match calculation. 
27 |     Return a tuple with the same number of elements as returned by getNormalizedFields()
28 |     '''
29 |     raise RuntimeError(
30 |         'normalizeWord(word, **options) -> tuple must be implemented')
31 | 
32 | 
33 | def normalizeRow(word, row, **scoreOptions):
34 |     '''
35 |     Preprocess column being reconciled against for the match calculation.
36 |     Return a tuple with the same number of elements as returned by getNormalizedFields()
37 |     Defaults to using the same normalization as normalizeWord().
38 |     '''
39 |     return normalizeWord(word, **scoreOptions)
40 | 
41 | 
42 | def valid(normalizedFields):
43 |     '''Optionally validate column before performing match calculation'''
44 |     return True
45 | 
46 | 
47 | # [[https://reconciliation-api.github.io/specs/latest/#reconciliation-query-responses]]
48 | def features(word, row, **scoreOptions):
49 |     '''
50 |     Takes the queryString and the normalized row and calculates features.
51 |     The calculation is disabled by default.
52 |     '''
53 |     # This is just a dummy result since features are disabled by default.
54 |     return [dict(id="someid", value=15), dict(id="someotherid", value=19)]
55 | 
56 | 
57 | features.disabled = True
58 | 
--------------------------------------------------------------------------------
/csv_reconcile_dice/__init__.py:
--------------------------------------------------------------------------------
 1 | from csv_reconcile import scorer
 2 | from normality import normalize
 3 | 
 4 | try:
 5 |     # Cython if it exists
 6 |     from .cutils import getDiceCoefficient
 7 | except ImportError:
 8 |     from .utils import getDiceCoefficient
 9 | 
10 | 
11 | # [[https://en.wikipedia.org/wiki/Stop_word]]
12 | def makeBigrams(word, **scoreOptions):
13 |     '''
14 |     Normalize set of bigrams into an ordered string to aid processing
15 |     '''
16 |     # Stop words can be stripped via the 'stopwords' entry in SCOREOPTIONS
17 |     # Should probably strip off spaces(?) and punctuation
18 |     process = normalize(word)
19 |     stopwords = scoreOptions.get('stopwords', None)
20 |     if stopwords:
21 |         process = ' '.join(w for w in process.split() if w not in stopwords)
22 | 
23 |     return ''.join(
24 |         sorted(set(process[i:i + 2] for i in range(len(process) - 1))))
25 | 
26 | 
27 | @scorer.register
28 | def getNormalizedFields():
29 |     return ('bigrams',)
30 | 
31 | 
32 | @scorer.register
33 | def processScoreOptions(options):
34 |     if 'stopwords' not in options:  # nothing to preprocess (avoids a KeyError below)
35 |         return
36 | 
37 |     options['stopwords'] = [w.lower() for w in options['stopwords']]
38 | 
39 | 
40 | @scorer.register
41 | def scoreMatch(left, right, **scoreOptions):
42 |     return getDiceCoefficient(left[0].encode('utf-8'), right[0].encode('utf-8'))
43 | 
44 | 
45 | @scorer.register
46 | def normalizeWord(word, **scoreOptions):
47 |     return (makeBigrams(word, **scoreOptions),)
48 | 
49 | 
50 | @scorer.register
51 | def valid(normalizedFields):
52 |     if not normalizedFields[0]:
53 |         return False
54 |     return True
55 | 
--------------------------------------------------------------------------------
/csv_reconcile_dice/cutils.pyx:
--------------------------------------------------------------------------------
 1 | from libc.string cimport strncmp
 2 | 
 3 | # [[https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient]]
 4 | def getDiceCoefficient(const char* bigram1, const char* bigram2):
 5 |     '''
 6 |     Calculate the Dice coefficient from two normalized sets of bigrams
 7 |     '''
 8 |     cdef int l1, l2, i1, i2, cnt, diff
 9 |     l1 = len(bigram1)
10 |     l2 = len(bigram2)
11 |     i1 = i2 = cnt = 0
12 | 
13 |     while i1 < l1 and i2 < l2:
14 |         diff = strncmp(bigram1+i1, bigram2+i2, 2)
15 |         if diff == 0:
16 |             cnt += 1
17 |             i1 += 2
18 |             i2 += 2
19 |         elif diff < 0:
20 |             i1 += 2
21 |         else:
22 |             i2 += 2
23 | 
24 |     # length is twice the number of bigrams
25 |     return 400.0 * cnt / (l1 + l2)
26 | 
--------------------------------------------------------------------------------
/csv_reconcile_dice/utils.py:
--------------------------------------------------------------------------------
 1 | # [[https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient]]
 2 | def getDiceCoefficient(bigram1, bigram2):
 3 |     '''
 4 |     Calculate the Dice coefficient from two normalized sets of bigrams
 5 |     '''
 6 |     l1 = len(bigram1)
 7 |     l2 = len(bigram2)
 8 |     i1 = i2 = cnt = 0
 9 | 
10 |     while i1 < l1 and i2 < l2:
11 |         b1 = bigram1[i1:i1 + 2]
12 |         b2 = bigram2[i2:i2 + 2]
13 |         if b1 == b2:
14 |             cnt += 1
15 |             i1 += 2
16 |             i2 += 2
17 |         elif b1 < b2:
18 |             i1 += 2
19 |         else:
20 |             i2 += 2
21 | 
22 |     # length is twice the number of bigrams
23 |     return 400.0 * cnt / (l1 + l2)
24 | 
--------------------------------------------------------------------------------
/noxfile.py:
--------------------------------------------------------------------------------
 1 | from nox_poetry import session, SDIST
 2 | 
 3 | args = lambda s: s.split()
 4 | 
 5 | 
 6 | @session(python=['3.7', '3.8', '3.9'])
 7 | def test_main(session):
 8 |     session.poetry.installroot(distribution_format=SDIST)
 9 |     session.install('pytest')
10 |     session.run(*args('pytest -v tests/main'))
11 | 
12 | 
13 | @session(python=['3.7', '3.8', '3.9'])
14 | def test_geo(session):
15 |     session.poetry.installroot(distribution_format=SDIST)
16 |     session.install('csv-reconcile-geo')
17 |     session.install('pytest')
18 |     session.run(*args('pytest -v tests/plugins/geo'))
19 | 
--------------------------------------------------------------------------------
/poetry.lock:
--------------------------------------------------------------------------------
 1 | 
[[package]] 2 | name = "attrs" 3 | version = "21.4.0" 4 | description = "Classes Without Boilerplate" 5 | category = "dev" 6 | optional = false 7 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 8 | 9 | [package.extras] 10 | dev = ["cloudpickle", "coverage[toml] (>=5.0.2)", "furo", "hypothesis", "mypy", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "sphinx", "sphinx-notfound-page", "zope.interface"] 11 | docs = ["furo", "sphinx", "sphinx-notfound-page", "zope.interface"] 12 | tests = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six", "zope.interface"] 13 | tests-no-zope = ["cloudpickle", "coverage[toml] (>=5.0.2)", "hypothesis", "mypy", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "six"] 14 | 15 | [[package]] 16 | name = "banal" 17 | version = "1.0.6" 18 | description = "Commons of banal micro-functions for Python." 19 | category = "main" 20 | optional = false 21 | python-versions = "*" 22 | 23 | [package.extras] 24 | dev = ["mypy", "wheel"] 25 | 26 | [[package]] 27 | name = "chardet" 28 | version = "5.1.0" 29 | description = "Universal encoding detector for Python 3" 30 | category = "main" 31 | optional = false 32 | python-versions = ">=3.7" 33 | 34 | [[package]] 35 | name = "charset-normalizer" 36 | version = "2.0.12" 37 | description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." 38 | category = "main" 39 | optional = false 40 | python-versions = ">=3.5.0" 41 | 42 | [package.extras] 43 | unicode-backport = ["unicodedata2"] 44 | 45 | [[package]] 46 | name = "click" 47 | version = "8.1.3" 48 | description = "Composable command line interface toolkit" 49 | category = "main" 50 | optional = false 51 | python-versions = ">=3.7" 52 | 53 | [package.dependencies] 54 | colorama = {version = "*", markers = "platform_system == \"Windows\""} 55 | importlib-metadata = {version = "*", markers = "python_version < \"3.8\""} 56 | 57 | [[package]] 58 | name = "colorama" 59 | version = "0.4.4" 60 | description = "Cross-platform colored terminal text." 61 | category = "main" 62 | optional = false 63 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" 64 | 65 | [[package]] 66 | name = "cython" 67 | version = "0.29.33" 68 | description = "The Cython compiler for writing C extensions for the Python language." 69 | category = "main" 70 | optional = false 71 | python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" 72 | 73 | [[package]] 74 | name = "exceptiongroup" 75 | version = "1.0.0rc9" 76 | description = "Backport of PEP 654 (exception groups)" 77 | category = "dev" 78 | optional = false 79 | python-versions = ">=3.7" 80 | 81 | [package.extras] 82 | test = ["pytest (>=6)"] 83 | 84 | [[package]] 85 | name = "flask" 86 | version = "2.2.2" 87 | description = "A simple framework for building complex web applications." 
88 | category = "main" 89 | optional = false 90 | python-versions = ">=3.7" 91 | 92 | [package.dependencies] 93 | click = ">=8.0" 94 | importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""} 95 | itsdangerous = ">=2.0" 96 | Jinja2 = ">=3.0" 97 | Werkzeug = ">=2.2.2" 98 | 99 | [package.extras] 100 | async = ["asgiref (>=3.2)"] 101 | dotenv = ["python-dotenv"] 102 | 103 | [[package]] 104 | name = "flask-cors" 105 | version = "3.0.10" 106 | description = "A Flask extension adding a decorator for CORS support" 107 | category = "main" 108 | optional = false 109 | python-versions = "*" 110 | 111 | [package.dependencies] 112 | Flask = ">=0.9" 113 | Six = "*" 114 | 115 | [[package]] 116 | name = "importlib-metadata" 117 | version = "6.0.0" 118 | description = "Read metadata from Python packages" 119 | category = "main" 120 | optional = false 121 | python-versions = ">=3.7" 122 | 123 | [package.dependencies] 124 | typing-extensions = {version = ">=3.6.4", markers = "python_version < \"3.8\""} 125 | zipp = ">=0.5" 126 | 127 | [package.extras] 128 | docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] 129 | perf = ["ipython"] 130 | testing = ["flake8 (<5)", "flufl.flake8", "importlib-resources (>=1.3)", "packaging", "pyfakefs", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf (>=0.9.2)"] 131 | 132 | [[package]] 133 | name = "iniconfig" 134 | version = "1.1.1" 135 | description = "iniconfig: brain-dead simple config-ini parsing" 136 | category = "dev" 137 | optional = false 138 | python-versions = "*" 139 | 140 | [[package]] 141 | name = "itsdangerous" 142 | version = "2.1.2" 143 | description = "Safely pass data to untrusted environments and back." 144 | category = "main" 145 | optional = false 146 | python-versions = ">=3.7" 147 | 148 | [[package]] 149 | name = "jinja2" 150 | version = "3.1.2" 151 | description = "A very fast and expressive template engine." 152 | category = "main" 153 | optional = false 154 | python-versions = ">=3.7" 155 | 156 | [package.dependencies] 157 | MarkupSafe = ">=2.0" 158 | 159 | [package.extras] 160 | i18n = ["Babel (>=2.7)"] 161 | 162 | [[package]] 163 | name = "markupsafe" 164 | version = "2.1.1" 165 | description = "Safely add untrusted strings to HTML/XML markup." 
166 | category = "main" 167 | optional = false 168 | python-versions = ">=3.7" 169 | 170 | [[package]] 171 | name = "normality" 172 | version = "2.4.0" 173 | description = "Micro-library to normalize text strings" 174 | category = "main" 175 | optional = false 176 | python-versions = "*" 177 | 178 | [package.dependencies] 179 | banal = ">=1.0.1" 180 | chardet = "*" 181 | charset-normalizer = ">=2.0.0" 182 | text-unidecode = "*" 183 | 184 | [package.extras] 185 | dev = ["mypy", "pyicu (>=1.9.3)", "pytest", "types-chardet"] 186 | icu = ["pyicu (>=1.9.3)"] 187 | 188 | [[package]] 189 | name = "packaging" 190 | version = "21.3" 191 | description = "Core utilities for Python packages" 192 | category = "dev" 193 | optional = false 194 | python-versions = ">=3.6" 195 | 196 | [package.dependencies] 197 | pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" 198 | 199 | [[package]] 200 | name = "pluggy" 201 | version = "1.0.0" 202 | description = "plugin and hook calling mechanisms for python" 203 | category = "dev" 204 | optional = false 205 | python-versions = ">=3.6" 206 | 207 | [package.dependencies] 208 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 209 | 210 | [package.extras] 211 | dev = ["pre-commit", "tox"] 212 | testing = ["pytest", "pytest-benchmark"] 213 | 214 | [[package]] 215 | name = "pyparsing" 216 | version = "3.0.8" 217 | description = "pyparsing module - Classes and methods to define and execute parsing grammars" 218 | category = "dev" 219 | optional = false 220 | python-versions = ">=3.6.8" 221 | 222 | [package.extras] 223 | diagrams = ["jinja2", "railroad-diagrams"] 224 | 225 | [[package]] 226 | name = "pytest" 227 | version = "7.2.0" 228 | description = "pytest: simple powerful testing with Python" 229 | category = "dev" 230 | optional = false 231 | python-versions = ">=3.7" 232 | 233 | [package.dependencies] 234 | attrs = ">=19.2.0" 235 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 236 | exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} 237 | importlib-metadata = {version = ">=0.12", markers = "python_version < \"3.8\""} 238 | iniconfig = "*" 239 | packaging = "*" 240 | pluggy = ">=0.12,<2.0" 241 | tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} 242 | 243 | [package.extras] 244 | testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] 245 | 246 | [[package]] 247 | name = "six" 248 | version = "1.16.0" 249 | description = "Python 2 and 3 compatibility utilities" 250 | category = "main" 251 | optional = false 252 | python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" 253 | 254 | [[package]] 255 | name = "text-unidecode" 256 | version = "1.3" 257 | description = "The most basic Text::Unidecode port" 258 | category = "main" 259 | optional = false 260 | python-versions = "*" 261 | 262 | [[package]] 263 | name = "tomli" 264 | version = "2.0.1" 265 | description = "A lil' TOML parser" 266 | category = "dev" 267 | optional = false 268 | python-versions = ">=3.7" 269 | 270 | [[package]] 271 | name = "typing-extensions" 272 | version = "4.2.0" 273 | description = "Backported and Experimental Type Hints for Python 3.7+" 274 | category = "main" 275 | optional = false 276 | python-versions = ">=3.7" 277 | 278 | [[package]] 279 | name = "werkzeug" 280 | version = "2.2.2" 281 | description = "The comprehensive WSGI web application library." 
282 | category = "main" 283 | optional = false 284 | python-versions = ">=3.7" 285 | 286 | [package.dependencies] 287 | MarkupSafe = ">=2.1.1" 288 | 289 | [package.extras] 290 | watchdog = ["watchdog"] 291 | 292 | [[package]] 293 | name = "zipp" 294 | version = "3.8.0" 295 | description = "Backport of pathlib-compatible object wrapper for zip files" 296 | category = "main" 297 | optional = false 298 | python-versions = ">=3.7" 299 | 300 | [package.extras] 301 | docs = ["jaraco.packaging (>=9)", "rst.linker (>=1.9)", "sphinx"] 302 | testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.0.1)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] 303 | 304 | [metadata] 305 | lock-version = "1.1" 306 | python-versions = "^3.7" 307 | content-hash = "c960304aafc066172399007ed38cbff5e27d9fae5743995261b6a8699c1edb1e" 308 | 309 | [metadata.files] 310 | attrs = [ 311 | {file = "attrs-21.4.0-py2.py3-none-any.whl", hash = "sha256:2d27e3784d7a565d36ab851fe94887c5eccd6a463168875832a1be79c82828b4"}, 312 | {file = "attrs-21.4.0.tar.gz", hash = "sha256:626ba8234211db98e869df76230a137c4c40a12d72445c45d5f5b716f076e2fd"}, 313 | ] 314 | banal = [ 315 | {file = "banal-1.0.6-py2.py3-none-any.whl", hash = "sha256:877aacb16b17f8fa4fd29a7c44515c5a23dc1a7b26078bc41dd34829117d85e1"}, 316 | {file = "banal-1.0.6.tar.gz", hash = "sha256:2fe02c9305f53168441948f4a03dfbfa2eacc73db30db4a93309083cb0e250a5"}, 317 | ] 318 | chardet = [ 319 | {file = "chardet-5.1.0-py3-none-any.whl", hash = "sha256:362777fb014af596ad31334fde1e8c327dfdb076e1960d1694662d46a6917ab9"}, 320 | {file = "chardet-5.1.0.tar.gz", hash = "sha256:0d62712b956bc154f85fb0a266e2a3c5913c2967e00348701b32411d6def31e5"}, 321 | ] 322 | charset-normalizer = [ 323 | {file = "charset-normalizer-2.0.12.tar.gz", hash = "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597"}, 324 | {file = "charset_normalizer-2.0.12-py3-none-any.whl", hash = "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"}, 325 | ] 326 | click = [ 327 | {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, 328 | {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, 329 | ] 330 | colorama = [ 331 | {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, 332 | {file = "colorama-0.4.4.tar.gz", hash = "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b"}, 333 | ] 334 | cython = [ 335 | {file = "Cython-0.29.33-cp27-cp27m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:286cdfb193e23799e113b7bd5ac74f58da5e9a77c70e3b645b078836b896b165"}, 336 | {file = "Cython-0.29.33-cp27-cp27m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:8507279a4f86ed8365b96603d5ad155888d4d01b72a9bbf0615880feda5a11d4"}, 337 | {file = "Cython-0.29.33-cp27-cp27mu-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5bf5ffd96957a595441cca2fc78470d93fdc40dfe5449881b812ea6045d7e9be"}, 338 | {file = "Cython-0.29.33-cp27-cp27mu-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:d2019a7e54ba8b253f44411863b8f8c0b6cd623f7a92dc0ccb83892358c4283a"}, 339 | {file = "Cython-0.29.33-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:190e60b7505d3b9b60130bcc2251c01b9ef52603420829c19d3c3ede4ac2763a"}, 340 | {file = 
"Cython-0.29.33-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:0168482495b75fea1c97a9641a95bac991f313e85f378003f9a4909fdeb3d454"}, 341 | {file = "Cython-0.29.33-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:090556e41f2b30427dd3a1628d3613177083f47567a30148b6b7b8c7a5862187"}, 342 | {file = "Cython-0.29.33-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:19c9913e9304bf97f1d2c357438895466f99aa2707d3c7a5e9de60c259e1ca1d"}, 343 | {file = "Cython-0.29.33-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:afc9b6ab20889676c76e700ae6967aa6886a7efe5b05ef6d5b744a6ca793cc43"}, 344 | {file = "Cython-0.29.33-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:49fb45b2bf12d6e2060bbd64506c06ac90e254f3a4bceb32c717f4964a1ae812"}, 345 | {file = "Cython-0.29.33-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:5430f38d3d01c4715ec2aef5c41e02a2441c1c3a0149359c7a498e4c605b8e6c"}, 346 | {file = "Cython-0.29.33-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4d315443c7f4c61180b6c3ea9a9717ee7c901cc9db8d1d46fdf6556613840ed"}, 347 | {file = "Cython-0.29.33-cp35-cp35m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6b4e6481e3e7e4d345640fe2fdc6dc57c94369b467f3dc280949daa8e9fd13b9"}, 348 | {file = "Cython-0.29.33-cp35-cp35m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:060a2568ef80116a0a9dcaf3218a61c6007be0e0b77c5752c094ce5187a4d63c"}, 349 | {file = "Cython-0.29.33-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:b67ddd32eaa2932a66bf8121accc36a7b3078593805519b0f00040f2b10a6a52"}, 350 | {file = "Cython-0.29.33-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:1b507236ba3ca94170ce0a504dd03acf77307d4bfbc5a010a8031673f6b213a9"}, 351 | {file = "Cython-0.29.33-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:581efc0622a9be05714222f2b4ac96a5419de58d5949517282d8df38155c8b9d"}, 352 | {file = "Cython-0.29.33-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6b8bcbf8f1c3c46d6184be1e559e3a3fb8cdf27c6d507d8bc8ae04cfcbfd75f5"}, 353 | {file = "Cython-0.29.33-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:1ca93bbe584aee92094fd4fb6acc5cb6500acf98d4f57cc59244f0a598b0fcf6"}, 354 | {file = "Cython-0.29.33-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:da490129e1e4ffaf3f88bfb46d338549a2150f60f809a63d385b83e00960d11a"}, 355 | {file = "Cython-0.29.33-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:4cadf5250eda0c5cdaf4c3a29b52be3e0695f4a2bf1ccd49b638d239752ea513"}, 356 | {file = "Cython-0.29.33-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:bcb1a84fd2bd7885d572adc180e24fd8a7d4b0c104c144e33ccf84a1ab4eb2b8"}, 357 | {file = "Cython-0.29.33-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:d78147ad8a3417ae6b371bbc5bfc6512f6ad4ad3fb71f5eef42e136e4ed14970"}, 358 | {file = "Cython-0.29.33-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:dd96b06b93c0e5fa4fc526c5be37c13a93e2fe7c372b5f358277ebe9e1620957"}, 359 | {file = "Cython-0.29.33-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = 
"sha256:959f0092d58e7fa00fd3434f7ff32fb78be7c2fa9f8e0096326343159477fe45"}, 360 | {file = "Cython-0.29.33-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0455d5b92f461218bcf173a149a88b7396c3a109066274ccab5eff58db0eae32"}, 361 | {file = "Cython-0.29.33-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:a9b0b890656e9d18a18e1efe26ea3d2d0f3e525a07a2a853592b0afc56a15c89"}, 362 | {file = "Cython-0.29.33-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:b5e8ce3039ff64000d58cd45b3f6f83e13f032dde7f27bb1ab96070d9213550b"}, 363 | {file = "Cython-0.29.33-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:e8922fa3d7e76b7186bbd0810e170ca61f83661ab1b29dc75e88ff2327aaf49d"}, 364 | {file = "Cython-0.29.33-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f67b7306fd00d55f271009335cecadc506d144205c7891070aad889928d85750"}, 365 | {file = "Cython-0.29.33-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:f271f90005064c49b47a93f456dc6cf0a21d21ef835bd33ac1e0db10ad51f84f"}, 366 | {file = "Cython-0.29.33-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d4457d417ffbb94abc42adcd63a03b24ff39cf090f3e9eca5e10cfb90766cbe3"}, 367 | {file = "Cython-0.29.33-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:0b53e017522feb8dcc2189cf1d2d344bab473c5bba5234390b5666d822992c7c"}, 368 | {file = "Cython-0.29.33-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:4f88c2dc0653eef6468848eb8022faf64115b39734f750a1c01a7ba7eb04d89f"}, 369 | {file = "Cython-0.29.33-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_24_i686.whl", hash = "sha256:1900d862a4a537d2125706740e9f3b016e80f7bbf7b54db6b3cc3d0bdf0f5c3a"}, 370 | {file = "Cython-0.29.33-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:37bfca4f9f26361343d8c678f8178321e4ae5b919523eed05d2cd8ddbe6b06ec"}, 371 | {file = "Cython-0.29.33-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a9863f8238642c0b1ef8069d99da5ade03bfe2225a64b00c5ae006d95f142a73"}, 372 | {file = "Cython-0.29.33-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:1dd503408924723b0bb10c0013b76e324eeee42db6deced9b02b648f1415d94c"}, 373 | {file = "Cython-0.29.33-py2.py3-none-any.whl", hash = "sha256:8b99252bde8ff51cd06a3fe4aeacd3af9b4ff4a4e6b701ac71bddc54f5da61d6"}, 374 | {file = "Cython-0.29.33.tar.gz", hash = "sha256:5040764c4a4d2ce964a395da24f0d1ae58144995dab92c6b96f44c3f4d72286a"}, 375 | ] 376 | exceptiongroup = [ 377 | {file = "exceptiongroup-1.0.0rc9-py3-none-any.whl", hash = "sha256:2e3c3fc1538a094aab74fad52d6c33fc94de3dfee3ee01f187c0e0c72aec5337"}, 378 | {file = "exceptiongroup-1.0.0rc9.tar.gz", hash = "sha256:9086a4a21ef9b31c72181c77c040a074ba0889ee56a7b289ff0afb0d97655f96"}, 379 | ] 380 | flask = [ 381 | {file = "Flask-2.2.2-py3-none-any.whl", hash = "sha256:b9c46cc36662a7949f34b52d8ec7bb59c0d74ba08ba6cb9ce9adc1d8676d9526"}, 382 | {file = "Flask-2.2.2.tar.gz", hash = "sha256:642c450d19c4ad482f96729bd2a8f6d32554aa1e231f4f6b4e7e5264b16cca2b"}, 383 | ] 384 | flask-cors = [ 385 | {file = "Flask-Cors-3.0.10.tar.gz", hash = "sha256:b60839393f3b84a0f3746f6cdca56c1ad7426aa738b70d6c61375857823181de"}, 386 | {file = "Flask_Cors-3.0.10-py2.py3-none-any.whl", hash = "sha256:74efc975af1194fc7891ff5cd85b0f7478be4f7f59fe158102e91abb72bb4438"}, 387 | ] 388 | importlib-metadata = [ 389 | {file = 
"importlib_metadata-6.0.0-py3-none-any.whl", hash = "sha256:7efb448ec9a5e313a57655d35aa54cd3e01b7e1fbcf72dce1bf06119420f5bad"}, 390 | {file = "importlib_metadata-6.0.0.tar.gz", hash = "sha256:e354bedeb60efa6affdcc8ae121b73544a7aa74156d047311948f6d711cd378d"}, 391 | ] 392 | iniconfig = [ 393 | {file = "iniconfig-1.1.1-py2.py3-none-any.whl", hash = "sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3"}, 394 | {file = "iniconfig-1.1.1.tar.gz", hash = "sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32"}, 395 | ] 396 | itsdangerous = [ 397 | {file = "itsdangerous-2.1.2-py3-none-any.whl", hash = "sha256:2c2349112351b88699d8d4b6b075022c0808887cb7ad10069318a8b0bc88db44"}, 398 | {file = "itsdangerous-2.1.2.tar.gz", hash = "sha256:5dbbc68b317e5e42f327f9021763545dc3fc3bfe22e6deb96aaf1fc38874156a"}, 399 | ] 400 | jinja2 = [ 401 | {file = "Jinja2-3.1.2-py3-none-any.whl", hash = "sha256:6088930bfe239f0e6710546ab9c19c9ef35e29792895fed6e6e31a023a182a61"}, 402 | {file = "Jinja2-3.1.2.tar.gz", hash = "sha256:31351a702a408a9e7595a8fc6150fc3f43bb6bf7e319770cbc0db9df9437e852"}, 403 | ] 404 | markupsafe = [ 405 | {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812"}, 406 | {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a"}, 407 | {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e"}, 408 | {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5"}, 409 | {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4"}, 410 | {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f"}, 411 | {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e"}, 412 | {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933"}, 413 | {file = "MarkupSafe-2.1.1-cp310-cp310-win32.whl", hash = "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6"}, 414 | {file = "MarkupSafe-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417"}, 415 | {file = "MarkupSafe-2.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02"}, 416 | {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a"}, 417 | {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37"}, 418 | {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980"}, 419 | {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = 
"sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a"}, 420 | {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3"}, 421 | {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a"}, 422 | {file = "MarkupSafe-2.1.1-cp37-cp37m-win32.whl", hash = "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff"}, 423 | {file = "MarkupSafe-2.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a"}, 424 | {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452"}, 425 | {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003"}, 426 | {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1"}, 427 | {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601"}, 428 | {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925"}, 429 | {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f"}, 430 | {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88"}, 431 | {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63"}, 432 | {file = "MarkupSafe-2.1.1-cp38-cp38-win32.whl", hash = "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1"}, 433 | {file = "MarkupSafe-2.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"}, 434 | {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a"}, 435 | {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f"}, 436 | {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6"}, 437 | {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77"}, 438 | {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603"}, 439 | {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7"}, 440 | {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135"}, 441 | {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96"}, 442 | {file = "MarkupSafe-2.1.1-cp39-cp39-win32.whl", hash = "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c"}, 443 | {file = "MarkupSafe-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247"}, 444 | {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"}, 445 | ] 446 | normality = [ 447 | {file = "normality-2.4.0-py2.py3-none-any.whl", hash = "sha256:8bd9dd5a0220f641ed4cc59b7ad64ab11b0ee49e57e5a70bf515445ff72574d2"}, 448 | {file = "normality-2.4.0.tar.gz", hash = "sha256:38bbe4e1dfd737c318ffd70e981ae8ff40bd8839393f6d62f0e200e5aab9e992"}, 449 | ] 450 | packaging = [ 451 | {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, 452 | {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, 453 | ] 454 | pluggy = [ 455 | {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, 456 | {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, 457 | ] 458 | pyparsing = [ 459 | {file = "pyparsing-3.0.8-py3-none-any.whl", hash = "sha256:ef7b523f6356f763771559412c0d7134753f037822dad1b16945b7b846f7ad06"}, 460 | {file = "pyparsing-3.0.8.tar.gz", hash = "sha256:7bf433498c016c4314268d95df76c81b842a4cb2b276fa3312cfb1e1d85f6954"}, 461 | ] 462 | pytest = [ 463 | {file = "pytest-7.2.0-py3-none-any.whl", hash = "sha256:892f933d339f068883b6fd5a459f03d85bfcb355e4981e146d2c7616c21fef71"}, 464 | {file = "pytest-7.2.0.tar.gz", hash = "sha256:c4014eb40e10f11f355ad4e3c2fb2c6c6d1919c73f3b5a433de4708202cade59"}, 465 | ] 466 | six = [ 467 | {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, 468 | {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, 469 | ] 470 | text-unidecode = [ 471 | {file = "text-unidecode-1.3.tar.gz", hash = "sha256:bad6603bb14d279193107714b288be206cac565dfa49aa5b105294dd5c4aab93"}, 472 | {file = "text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8"}, 473 | ] 474 | tomli = [ 475 | {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, 476 | {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, 477 | ] 478 | typing-extensions = [ 479 | {file = "typing_extensions-4.2.0-py3-none-any.whl", hash = "sha256:6657594ee297170d19f67d55c05852a874e7eb634f4f753dbd667855e07c1708"}, 480 | {file = "typing_extensions-4.2.0.tar.gz", hash = "sha256:f1c24655a0da0d1b67f07e17a5e6b2a105894e6824b92096378bb3668ef02376"}, 481 | ] 482 | werkzeug = [ 483 | {file = "Werkzeug-2.2.2-py3-none-any.whl", hash = "sha256:f979ab81f58d7318e064e99c4506445d60135ac5cd2e177a2de0089bfd4c9bd5"}, 484 | {file = "Werkzeug-2.2.2.tar.gz", hash = "sha256:7ea2d48322cc7c0f8b3a215ed73eabd7b5d75d0b50e31ab006286ccff9e00b8f"}, 485 | ] 486 | zipp = [ 487 | {file = "zipp-3.8.0-py3-none-any.whl", hash = "sha256:c4f6e5bbf48e74f7a38e7cc5b0480ff42b0ae5178957d564d18932525d5cf099"}, 488 | {file = "zipp-3.8.0.tar.gz", hash = 
"sha256:56bf8aadb83c24db6c4b577e13de374ccfb67da2078beba1d037c17980bf43ad"}, 489 | ] 490 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "csv-reconcile" 3 | version = "0.3.2" 4 | description = "OpenRefine reconciliation service backed by csv resource" 5 | authors = ["Douglas Mennella "] 6 | license = "MIT" 7 | readme = 'README.md' 8 | repository = "https://github.com/gitonthescene/csv-reconcile" 9 | keywords = ["OpenRefine", 10 | "reconcile", 11 | "fuzzy search"] 12 | 13 | build = "build.py" 14 | packages = [ 15 | { include = "csv_reconcile" }, 16 | { include = "csv_reconcile_dice" } 17 | ] 18 | 19 | [tool.poetry.dependencies] 20 | python = "^3.7" 21 | flask = "^2.0" 22 | flask-cors = "^3.0.10" 23 | cython = "^0.29.21" 24 | normality = "^2.1.1" 25 | importlib_metadata = { version = ">=4.5,<7.0", python = "<3.10" } 26 | chardet = ">=4,<6" 27 | 28 | [tool.poetry.dev-dependencies] 29 | pytest = "^7.2" 30 | 31 | [tool.poe.tasks] 32 | dummydoc = { script = "utils:dummydoc" } 33 | pandoc = { cmd = "pandoc README.org --toc -f org -t markdown_strict -s -o README.md" } 34 | test = "pytest -v tests/main" 35 | nox = "nox" 36 | build = [ 37 | { ref = "dummydoc" }, 38 | { ref = "test" }, 39 | { cmd = "poetry build" } 40 | ] 41 | install = [ 42 | { ref = "dummydoc" }, 43 | { cmd = "poetry install" } 44 | ] 45 | publish = [ 46 | { ref = "pandoc" }, 47 | { ref = "nox" }, 48 | { cmd = "poetry publish" } 49 | ] 50 | 51 | [tool.poetry.plugins."csv_reconcile.scorers"] 52 | "dice" = "csv_reconcile_dice" 53 | 54 | [tool.poetry.scripts] 55 | csv-reconcile = "csv_reconcile:main" 56 | 57 | [build-system] 58 | requires = [ 59 | "poetry>=0.12", 60 | "cython", 61 | "setuptools!=50.0", 62 | "wheel" 63 | ] 64 | 65 | build-backend = "poetry.masonry.api" 66 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore:Install 'pyicu' for better text transliteration. 
4 | -------------------------------------------------------------------------------- /sample/progressives.tsv: -------------------------------------------------------------------------------- 1 | member district 2 | Pramila Jayapal 7th District of Washington 3 | Katie Porter 45th District of California 4 | Ilhan Omar 5th District of Minnesota 5 | Raúl Grijalva 3rd District of Arizona 6 | Barbara Lee 13th District of California 7 | Mark Pocan 2nd District of Wisconsin 8 | Jamie Raskin "Vice Chair for Policy 9 | 8th District of Maryland" 10 | Joe Neguse "Vice Chair for New Members 11 | 2nd District of Colorado" 12 | Marie Newman "Vice Chair for Communications 13 | 3rd Congressional District of Illinois" 14 | Sheila Jackson Lee "Vice Chair for Inter-Caucus Relations 15 | 18th District of Texas" 16 | Donald Norcross "Vice Chair for Labor 17 | 1st District of New Jersey" 18 | Rashida Tlaib "Vice Chair for Member Services 19 | 13th District of Michigan" 20 | David Cicilline "Vice Chair At Large 21 | 1st District of Rhode Island" 22 | Deb Haaland Vice Chair At Large1st District of New Mexico 23 | "Jesus ""Chuy"" Garcia" "Vice Chair At Large 24 | 4th District of Illinois" 25 | Bonnie Watson Coleman "Vice Chair At Large 26 | 12th District of New Jersey" 27 | Cori Bush 1st District of Missouri 28 | Debbie Dingell 12th District of Michigan 29 | Veronica Escobar 16th District of Texas 30 | Mondaire Jones 17th District of New York 31 | Ro Khanna 17th District of California 32 | Lloyd Doggett 35th District of Texas 33 | Andy Levin 9th District of Michigan 34 | Mark Takano 41st District of California 35 | Adriano Espaillat 13th District of New York 36 | Jamaal Bowman 16th Congressional District of New York 37 | Teresa Leger Fernandez 3rd Congressional District of New Mexico 38 | Bernie Sanders U.S. 
Senator for Vermont 39 | Jan Schakowsky 9th District of Illinois 40 | Alma Adams 12th District of North Carolina 41 | Nanette Barragán 44th District of California 42 | Karen Bass 37th District of California 43 | Eddie Bernice Johnson 30th District of Texas 44 | Don Beyer 8th District of Virginia 45 | Earl Blumenauer 3rd District of Oregon 46 | Lisa Blunt Rochester Delaware's At-large Congressional District 47 | Suzanne Bonamici 1st District of Oregon 48 | Brendan Boyle 2nd District of Pennsylvania 49 | André Carson 7th District of Indiana 50 | Matt Cartwright 8th District of Pennsylvania 51 | Judy Chu 27th District of California 52 | Katherine Clark 5th District of Massachusetts 53 | Yvette Clarke 9th District of New York 54 | Steve Cohen 9th District of Memphis 55 | Madeleine Dean 4th District of Pennsylvania 56 | Peter DeFazio 4th District of Oregon 57 | Diana DeGette 1st Congressional District of Colorado 58 | Rosa DeLauro 3rd District of Connecticut 59 | Mark DeSaulnier 11th District of California 60 | Dwight Evans 3rd District of Pennsylvania 61 | Lois Frankel 21st District of Florida 62 | Marcia Fudge 11st District of Ohio 63 | Ruben Gallego 7th District of Arizona 64 | Sylvia Garcia 29th District of Texas 65 | Jimmy Gomez 34th District of California 66 | Eleanor Holmes Norton Representing the District of Columbia 67 | Steven Horsford 4th District of Nevada 68 | Jared Huffman 2nd District of California 69 | Hakeem Jeffries 8th District of New York 70 | Hank Johnson 4th District of Georgia 71 | Kai Kahele 2nd Congressional District of Hawai'i 72 | Dan Kildee 5th District of Michigan 73 | Andy Kim 3rd District of New Jersey 74 | Brenda Lawrence 14th District of Michigan 75 | Mike Levin 49th District of California 76 | Ted Lieu 33rd District of California  77 | Zoe Lofgren 19th District of California 78 | Alan Lowenthal 47th District of California 79 | Carolyn Maloney 12th District of New York 80 | James McGovern 2nd District of Massachusetts 81 | Grace Meng 6th District of New York City 82 | Gwen Moore 4th District of Wisconsin 83 | Joe Morelle 25th District of New York 84 | Jerrold Nadler 10th District of New York 85 | Grace Napolitano 32nd District of California 86 | Alexandria Ocasio-Cortez 14th District of New York 87 | Frank Pallone 6th District of New Jersey 88 | Jimmy Panetta 20th District of California 89 | Chellie Pingree 1st District of Maine 90 | Ayanna Pressley 7th District of Massachusetts 91 | Linda Sanchez 38th District of California  92 | Mary Gay Scanlon 5th District of Pennsylvania 93 | Brad Sherman 30th District of California 94 | Adam Smith 9th District of Washington 95 | Darren Soto 9th District of Florida 96 | Ritchie Torres 15th Congressional District of New York 97 | Lori Trahan 3rd District of Massachusetts 98 | Juan Vargas 51st District of California 99 | Nydia Velázquez 7th District of New York 100 | Maxine Waters 43rd District of California 101 | Peter Welch Representing Vermont 102 | Nikema Williams 5th Congressional District of Georgia 103 | Frederica Wilson 24th District of Florida 104 | John Yarmuth 3rd District of Kentucky 105 | -------------------------------------------------------------------------------- /sample/sample.cfg: -------------------------------------------------------------------------------- 1 | MANIFEST = { 2 | "identifierSpace": "http://www.wikidata.org/entity/", 3 | "schemaSpace": "http://www.wikidata.org/prop/direct/", 4 | "view": {"url":"https://www.wikidata.org/wiki/{{id}}"}, 5 | "name": "US congressional representatives" 6 | } 7 | 
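A minimal sketch of querying a service started with this sample configuration (the host and port are assumptions, taken from Flask's development default, and the query value is just a name from progressives.tsv); the request shape mirrors the form-encoded POST used in tests/main/test_csv_reconcile.py:

import json
from urllib.parse import urlencode
from urllib.request import urlopen

# 'queries' travels as form-encoded JSON, the same encoding the tests use
queries = json.dumps({'q0': {'query': 'Pramila Jayapal'}})
body = urlencode([('queries', queries)]).encode('utf-8')
with urlopen('http://127.0.0.1:5000/reconcile', data=body) as resp:
    batch = json.loads(resp.read())
print(batch['q0']['result'])  # list of scored candidates for q0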
-------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gitonthescene/csv-reconcile/2ed24740e351c6912ac626af58f0b5f4c776bac6/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from csv_reconcile import create_app, initdb, scorer 3 | import types 4 | try: 5 | import importlib_metadata as metadata 6 | except: 7 | from importlib import metadata 8 | 9 | 10 | @pytest.fixture 11 | def plugins(): 12 | '''csv_reconcile.scorers plugins''' 13 | eps = metadata.entry_points().select(group='csv_reconcile.scorers') 14 | return {ep.name: ep for ep in eps} 15 | 16 | 17 | @pytest.fixture 18 | def csvcontents(): 19 | '''contents for mock csv file''' 20 | # Column names are normalized for database use 21 | # id column need not be first nor name name second 22 | return ''' 23 | name@hdr id hdr extra!hdr 24 | first 1 stuff 25 | second 2 junk 26 | third 3 and so on 27 | '''.strip() 28 | 29 | @pytest.fixture 30 | def ambiguous_csvcontents(): 31 | '''Try to throw off csv.Sniffer() to test overrides''' 32 | return ''' 33 | These, my friends, are the columns 34 | However, above all, columns matter most 35 | '''.strip() 36 | 37 | @pytest.fixture 38 | def sniffer_throwing_csvcontents(): 39 | '''Try to throw off csv.Sniffer() to test overrides''' 40 | return ''' 41 | a,b,c\n1,2 42 | '''.strip() 43 | 44 | 45 | @pytest.fixture 46 | def formContentHeader(): 47 | '''header for form data for client''' 48 | return {'content-type': 'application/x-www-form-urlencoded'} 49 | 50 | 51 | @pytest.fixture 52 | def header(csvcontents): 53 | '''header of mock csvfile''' 54 | return csvcontents.splitlines()[0].split('\t') 55 | 56 | 57 | @pytest.fixture 58 | def idnm(header): 59 | '''id and name cols from the header''' 60 | return (header[1], header[0]) 61 | 62 | 63 | @pytest.fixture 64 | def typicalrow(csvcontents): 65 | '''typical row of mock csvfile''' 66 | return csvcontents.splitlines()[1].split('\t') 67 | 68 | 69 | @pytest.fixture 70 | def setup(tmp_path, csvcontents, idnm): 71 | '''mock csv file with id and name columns indicated''' 72 | 73 | p = tmp_path / "csvfile" 74 | p.write_text(csvcontents) 75 | return (p, *idnm) 76 | 77 | @pytest.fixture 78 | def ambiguous_setup(tmp_path, ambiguous_csvcontents): 79 | '''mock csv file with id and name columns indicated''' 80 | 81 | def getSetup(idnm): 82 | p = tmp_path / "amb_csvfile" 83 | p.write_text(ambiguous_csvcontents) 84 | return (p, *idnm) 85 | 86 | return getSetup 87 | 88 | @pytest.fixture 89 | def sniffer_throwing_setup(tmp_path, sniffer_throwing_csvcontents): 90 | '''mock csv file with id and name columns indicated''' 91 | 92 | def getSetup(idnm): 93 | p = tmp_path / "snfthrw_csvfile" 94 | p.write_text(sniffer_throwing_csvcontents) 95 | return (p, *idnm) 96 | 97 | return getSetup 98 | 99 | @pytest.fixture 100 | def cfgContents(): 101 | return ''' 102 | THRESHOLD=0.0 103 | import logging 104 | LOGLEVEL=logging.DEBUG''' 105 | 106 | 107 | @pytest.fixture 108 | def mkConfig(tmp_path): 109 | '''make server config''' 110 | 111 | def fn(cfgContents): 112 | p = tmp_path / "config" 113 | p.write_text(cfgContents) 114 | 115 | return p 116 | 117 | return fn 118 | 119 | 120 | @pytest.fixture 121 | def config(mkConfig, cfgContents): 122 | '''mock server config''' 
123 | return mkConfig(cfgContents) 124 | 125 | 126 | @pytest.fixture 127 | def mockPlugin(): 128 | '''save/restore original plugin API''' 129 | saveOrig = { 130 | nm: vl 131 | for nm, vl in scorer.__dict__.items() 132 | if type(vl) == types.FunctionType 133 | } 134 | yield saveOrig 135 | for nm, fn in saveOrig.items(): 136 | setattr(scorer, nm, fn) 137 | 138 | 139 | @pytest.fixture 140 | def app(plugins, tmp_path): 141 | '''flask app''' 142 | 143 | def getApp(setup, config, plugin='dice'): 144 | app = create_app(config, instance_path=tmp_path / "instance", scorerOption=plugin) 145 | with app.app_context(): 146 | initdb.init_db_with_context(*setup) 147 | 148 | return app 149 | 150 | return getApp 151 | 152 | 153 | @pytest.fixture 154 | def client(app): 155 | '''http client''' 156 | 157 | def getClient(setup, config, plugin='dice'): 158 | return app(setup, config, plugin=plugin).test_client() 159 | 160 | return getClient 161 | 162 | 163 | @pytest.fixture 164 | def basicClient(client, setup, config): 165 | 166 | def getClient(config=config): 167 | return client(setup, config, plugin='dice') 168 | 169 | return getClient 170 | -------------------------------------------------------------------------------- /tests/main/test_csv_reconcile.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from csv_reconcile import __version__, scorer 4 | from csv_reconcile.db import getCSVCols 5 | 6 | import json 7 | from urllib.parse import urlencode 8 | 9 | 10 | def test_version(): 11 | assert __version__ == '0.3.2' 12 | 13 | 14 | def test_manifest(basicClient): 15 | response = basicClient().get('/reconcile') 16 | 17 | assert response.status_code == 200 18 | 19 | manifest = json.loads(response.data) 20 | expectedKeys = set( 21 | 'versions name identifierSpace schemaSpace extend'.split()) 22 | 23 | assert set(manifest.keys()).intersection(expectedKeys) == expectedKeys 24 | 25 | 26 | def test_query_basics(basicClient, formContentHeader): 27 | query = {'q0': {'query': 'first'}} 28 | queryjson = json.dumps(query) 29 | response = basicClient().post('/reconcile', 30 | data=urlencode([('queries', queryjson)]), 31 | headers=formContentHeader) 32 | 33 | assert response.status_code == 200 34 | 35 | matchBatch = json.loads(response.data) 36 | 37 | assert query.keys() == matchBatch.keys() 38 | 39 | assert 'result' in matchBatch['q0'] 40 | assert type(matchBatch['q0']['result']) == list 41 | 42 | 43 | def test_data_extension_basics(basicClient, setup, header, typicalrow, 44 | formContentHeader): 45 | 46 | client = basicClient() 47 | # Type is ignored in this service 48 | dummyType = '' 49 | _, idcol, namecol = setup 50 | ididx = header.index(idcol) 51 | 52 | response = client.get('/properties?type=%s' % (dummyType,)) 53 | 54 | assert response.status_code == 200 55 | 56 | cols = json.loads(response.data) 57 | 58 | assert 'properties' in cols 59 | assert type(cols['properties']) == list # [ {id:..., name:...}, ... 
] 60 | 61 | availableCols = dict() 62 | for itm in cols['properties']: 63 | assert set(itm.keys()) == set(('id', 'name')) 64 | 65 | availableCols[itm['name']] = itm['id'] 66 | 67 | assert set(availableCols) == set(header) 68 | 69 | colid = typicalrow[ididx] 70 | req = {'ids': [colid], 'properties': cols['properties']} 71 | reqjson = json.dumps(req) 72 | response = client.post('/reconcile', 73 | data=urlencode([('extend', reqjson)]), 74 | headers=formContentHeader) 75 | 76 | assert response.status_code == 200 77 | 78 | extenddata = json.loads(response.data) 79 | 80 | assert 'meta' in extenddata 81 | assert 'rows' in extenddata 82 | assert colid in extenddata['rows'] 83 | 84 | row = extenddata['rows'][colid] 85 | for colextra, colid in availableCols.items(): 86 | exidx = header.index(colextra) 87 | 88 | assert colid in row 89 | 90 | for choice in row[colid]: 91 | assert 'str' in choice 92 | assert choice['str'] == typicalrow[exidx] 93 | 94 | 95 | def test_preview_service(basicClient, setup, header, typicalrow): 96 | client = basicClient() 97 | 98 | # no id 99 | response = client.get(f"/preview/") 100 | assert response.status_code == 404 101 | 102 | # unavailable id 103 | response = client.get(f"/preview/unavailable") 104 | assert response.status_code == 404 105 | 106 | # available id 107 | id_idx = header.index(setup[1]) 108 | response = client.get(f"/preview/{typicalrow[id_idx]}") 109 | assert response.status_code == 200 110 | 111 | html_response = response.data.decode("utf-8") 112 | print(html_response) 113 | assert f"Preview for {typicalrow[id_idx]}" in html_response 114 | for key, value in zip(header, typicalrow): 115 | assert f"
{key}" in html_response and f"{value}
" in html_response 116 | 117 | 118 | @pytest.fixture 119 | def limitConfig(mkConfig): 120 | contents = ''' 121 | LIMIT=2 122 | THRESHOLD=-1.0 123 | import logging 124 | LOGLEVEL=logging.DEBUG 125 | ''' 126 | return mkConfig(contents) 127 | 128 | 129 | def test_reconcile_limit(basicClient, formContentHeader, limitConfig): 130 | query = {'q0': {'query': 'first'}} 131 | queryjson = json.dumps(query) 132 | client = basicClient(limitConfig) 133 | response = client.post('/reconcile', 134 | data=urlencode([('queries', queryjson)]), 135 | headers=formContentHeader) 136 | 137 | assert response.status_code == 200 138 | 139 | matchBatch = json.loads(response.data) 140 | 141 | assert len(matchBatch['q0']['result']) == 2 142 | response = client.post('/reconcile', 143 | data=urlencode([('queries', queryjson)]), 144 | headers=formContentHeader) 145 | 146 | # Override config limit in query with larger number 147 | query = {'q0': {'query': 'first', 'limit': 3}} 148 | queryjson = json.dumps(query) 149 | response = client.post('/reconcile', 150 | data=urlencode([('queries', queryjson)]), 151 | headers=formContentHeader) 152 | 153 | assert response.status_code == 200 154 | 155 | matchBatch = json.loads(response.data) 156 | 157 | # Matches override 158 | assert len(matchBatch['q0']['result']) == 3 159 | 160 | # Override config limit in query with smaller number 161 | query = {'q0': {'query': 'first', 'limit': 1}} 162 | queryjson = json.dumps(query) 163 | response = client.post('/reconcile', 164 | data=urlencode([('queries', queryjson)]), 165 | headers=formContentHeader) 166 | 167 | assert response.status_code == 200 168 | 169 | matchBatch = json.loads(response.data) 170 | 171 | # Matches override 172 | assert len(matchBatch['q0']['result']) == 1 173 | 174 | 175 | def test_reconcile_automatch(basicClient, formContentHeader): 176 | client = basicClient() 177 | 178 | query = {'q0': {'query': 'first'}} 179 | queryjson = json.dumps(query) 180 | response = client.post('/reconcile', 181 | data=urlencode([('queries', queryjson)]), 182 | headers=formContentHeader) 183 | 184 | assert response.status_code == 200 185 | 186 | matchBatch = json.loads(response.data) 187 | result = matchBatch['q0']['result'] 188 | 189 | # Only one with 100% match automatches 190 | cnt = 0 191 | for itm in result: 192 | if itm['name'] == 'first': 193 | cnt += 1 194 | assert itm['match'] == True 195 | assert itm['score'] == 100.0 196 | else: 197 | assert itm['match'] == False 198 | 199 | assert cnt == 1 200 | 201 | # None with 100% match does not automatch 202 | query = {'q0': {'query': 'fir'}} 203 | queryjson = json.dumps(query) 204 | response = client.post('/reconcile', 205 | data=urlencode([('queries', queryjson)]), 206 | headers=formContentHeader) 207 | 208 | assert response.status_code == 200 209 | 210 | matchBatch = json.loads(response.data) 211 | result = matchBatch['q0']['result'] 212 | 213 | assert all( 214 | itm['match'] == False and itm['score'] != 100.0 for itm in result) 215 | 216 | # Only one result automatches, even if not 100% 217 | query = {'q0': {'query': 'fir', 'limit': 1}} 218 | queryjson = json.dumps(query) 219 | response = client.post('/reconcile', 220 | data=urlencode([('queries', queryjson)]), 221 | headers=formContentHeader) 222 | 223 | assert response.status_code == 200 224 | 225 | matchBatch = json.loads(response.data) 226 | result = matchBatch['q0']['result'] 227 | assert len(result) == 1 228 | assert result[0]['score'] != 100.0 and result[0]['match'] == True 229 | 230 | 231 | def test_plugin(mockPlugin, basicClient, 
csvcontents, formContentHeader): 232 | # Since used in closure pass in "by reference" 233 | p, gn, nw, sm, v = list(range(5)) 234 | called = [0] * 5 235 | 236 | @scorer.register 237 | def processScoreOptions(options): 238 | called[p] += 1 239 | 240 | @scorer.register 241 | def getNormalizedFields(): 242 | # one normalized field 243 | called[gn] += 1 244 | return ('dummy',) 245 | 246 | @scorer.register 247 | def normalizeWord(word, **scoreOptions): 248 | # everything normalizes to COW thus everything matches 249 | called[nw] += 1 250 | return ("COW",) 251 | 252 | @scorer.register 253 | def scoreMatch(left, right): 254 | # Count the number of letters in common 255 | called[sm] += 1 256 | left, right = left[0], right[0] 257 | return len(set(left).intersection(right)) / len(left) * 100.0 258 | 259 | @scorer.register 260 | def valid(normalizedFields): 261 | called[v] += 1 262 | return True 263 | 264 | client = basicClient() 265 | 266 | # processScoreOptions, getNormalizedFields, and normalizeWord all called during setup 267 | # scoreMatch and valid not yet called 268 | assert all(called[itm] > 0 for itm in (p, gn, nw)) 269 | assert called[sm:] == [0, 0] 270 | 271 | # total number of rows minus 1 for the header row 272 | nRows = len(csvcontents.splitlines()) - 1 273 | 274 | query = {'q0': {'query': 'mxyzptlk'}} 275 | queryjson = json.dumps(query) 276 | response = client.post('/reconcile', 277 | data=urlencode([('queries', queryjson)]), 278 | headers=formContentHeader) 279 | assert response.status_code == 200 280 | 281 | matchBatch = json.loads(response.data) 282 | 283 | assert len(matchBatch['q0']['result']) == nRows 284 | assert all(called[itm] > 0 for itm in (p, gn, nw, sm, v)) 285 | 286 | # processScoreOptions still called once, getNormalizedFields only called twice 287 | assert called[:2] == [1, 2] 288 | 289 | def test_csv_sniffer_overrides(app, ambiguous_setup, ambiguous_csvcontents, config, mkConfig): 290 | 291 | topline = ambiguous_csvcontents.splitlines()[0] 292 | items = lambda sep: [ h.strip() for h in topline.split(sep)] 293 | 294 | # First guess is that the , is a separator 295 | SEP = ',' 296 | chk = app(ambiguous_setup(items(SEP)[:2]), config) 297 | with chk.app_context(): 298 | headernms = [name for _,name in getCSVCols()] 299 | assert headernms == items(SEP) 300 | 301 | # Now parse with override 302 | SEP = ' ' 303 | cfg = mkConfig('CSVKWARGS = {"delimiter": " "}') 304 | chk = app(ambiguous_setup(items(SEP)[:2]), cfg) 305 | with chk.app_context(): 306 | headernms = [name for _,name in getCSVCols()] 307 | assert headernms == items(SEP) 308 | 309 | def test_csv_sniffer_throwing(app, sniffer_throwing_setup, sniffer_throwing_csvcontents, config, mkConfig): 310 | 311 | topline = sniffer_throwing_csvcontents.splitlines()[0] 312 | items = lambda sep: [ h.strip() for h in topline.split(sep)] 313 | 314 | # First guess is that the , is a separator 315 | SEP = ',' 316 | chk = app(sniffer_throwing_setup(items(SEP)[:2]), config) 317 | with chk.app_context(): 318 | headernms = [name for _,name in getCSVCols()] 319 | assert headernms == items(SEP) 320 | 321 | # Now parse with override 322 | cfg = mkConfig('CSVKWARGS = {"delimiter": ","}') 323 | chk = app(sniffer_throwing_setup(items(SEP)[:2]), cfg) 324 | with chk.app_context(): 325 | headernms = [name for _,name in getCSVCols()] 326 | assert headernms == items(SEP) 327 | -------------------------------------------------------------------------------- /tests/plugins/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/gitonthescene/csv-reconcile/2ed24740e351c6912ac626af58f0b5f4c776bac6/tests/plugins/__init__.py -------------------------------------------------------------------------------- /tests/plugins/geo/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | @pytest.fixture 5 | def csvcontents(): 6 | return ''' 7 | city cityLabel coords 8 | Q60 New York City Point(-73.94 40.67) 9 | Q65 Los Angeles Point(-118.24368 34.05223) 10 | Q1297 Chicago Point(-87.627777777 41.881944444) 11 | Q8652 Miami Point(-80.216666666 25.783333333) 12 | '''.strip() 13 | 14 | 15 | @pytest.fixture 16 | def idnm(header): 17 | '''id and name cols from the header''' 18 | return (header[0], header[2]) 19 | 20 | 21 | @pytest.fixture 22 | def basicClient(client, setup): 23 | 24 | def getClient(config): 25 | return client(setup, config, plugin='geo') 26 | 27 | return getClient 28 | -------------------------------------------------------------------------------- /tests/plugins/geo/test_geo_reconcile.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import json 3 | from urllib.parse import urlencode 4 | from pprint import pprint as pp 5 | from geopy import distance 6 | 7 | 8 | @pytest.fixture 9 | def toResolve(): 10 | data = ''' 11 | city cityLabel coords 12 | Q1342 Pittsburgh Point(-80.0 40.441666666) 13 | Q5083 Seattle Point(-122.33207 47.60621) 14 | Q16559 Austin Point(-97.733333333 30.3) 15 | Q43196 Cincinnati Point(-84.5 39.133333333) 16 | '''.strip() 17 | return { 18 | cityNm: coords 19 | for cityId, cityNm, coords in (l.split('\t') for l in data.splitlines()) 20 | } 21 | 22 | 23 | @pytest.fixture 24 | def baseDataLkupNm(csvcontents): 25 | return { 26 | cityId: cityNm for cityId, cityNm, _ in ( 27 | l.split('\t') for l in csvcontents.splitlines()) 28 | } 29 | 30 | 31 | @pytest.fixture 32 | def baseDataLkupCoord(csvcontents): 33 | return { 34 | cityNm: coords for _, cityNm, coords in ( 35 | l.split('\t') for l in csvcontents.splitlines()) 36 | } 37 | 38 | 39 | def test_query(basicClient, config, formContentHeader, toResolve, 40 | baseDataLkupNm): 41 | 42 | ccoords = toResolve['Seattle'] 43 | 44 | query = {'q0': {'query': ccoords}} 45 | queryjson = json.dumps(query) 46 | response = basicClient(config).post('/reconcile', 47 | data=urlencode([('queries', queryjson) 48 | ]), 49 | headers=formContentHeader) 50 | 51 | assert response.status_code == 200 52 | 53 | matchBatch = json.loads(response.data) 54 | 55 | assert query.keys() == matchBatch.keys() 56 | 57 | results = matchBatch['q0']['result'] 58 | best = max(results, key=lambda x: x['score']) 59 | 60 | assert baseDataLkupNm[best['id']] == 'Los Angeles' 61 | 62 | 63 | @pytest.fixture 64 | def scaleConfig(mkConfig, baseDataLkupCoord, toResolve): 65 | 66 | # convert wkt format to tuple of floats (lat, lon) 67 | mkpt = lambda wkt: tuple(float(x) for x in wkt[6:-1].split()[1::-1]) 68 | 69 | chicago = baseDataLkupCoord['Chicago'] 70 | pittsburgh = toResolve['Pittsburgh'] 71 | 72 | dist = distance.geodesic(mkpt(chicago), mkpt(pittsburgh)).km 73 | 74 | contents = f''' 75 | THRESHOLD=0.0 76 | import logging 77 | LOGLEVEL=logging.DEBUG 78 | SCOREOPTIONS = {{ 79 | "SCALE": {dist} 80 | }} 81 | ''' 82 | return mkConfig(contents) 83 | 84 | 85 | def test_scale(basicClient, scaleConfig, formContentHeader, toResolve, 86 | baseDataLkupNm): 87 | 88 | pittsburgh 
= toResolve['Pittsburgh'] 89 | 90 | query = {'q0': {'query': pittsburgh}} 91 | queryjson = json.dumps(query) 92 | response = basicClient(scaleConfig).post('/reconcile', 93 | data=urlencode([('queries', 94 | queryjson)]), 95 | headers=formContentHeader) 96 | 97 | assert response.status_code == 200 98 | 99 | matchBatch = json.loads(response.data) 100 | 101 | assert query.keys() == matchBatch.keys() 102 | 103 | results = matchBatch['q0']['result'] 104 | 105 | score = {baseDataLkupNm[r['id']]: r['score'] for r in results} 106 | 107 | assert score['Chicago'] == 50 # Right at scale 108 | assert score['New York City'] > 50 # closer 109 | assert score['Miami'] < 50 # further 110 | 111 | 112 | @pytest.fixture 113 | def rangeConfig(mkConfig, baseDataLkupCoord, toResolve): 114 | 115 | contents = f''' 116 | THRESHOLD=0.0 117 | import logging 118 | LOGLEVEL=logging.DEBUG 119 | SCOREOPTIONS = {{ 120 | "COORDRANGE": 10.0 121 | }} 122 | ''' 123 | return mkConfig(contents) 124 | 125 | 126 | def test_range(basicClient, rangeConfig, formContentHeader, toResolve, 127 | baseDataLkupNm): 128 | 129 | pittsburgh = toResolve['Pittsburgh'] 130 | 131 | query = {'q0': {'query': pittsburgh}} 132 | queryjson = json.dumps(query) 133 | response = basicClient(rangeConfig).post('/reconcile', 134 | data=urlencode([('queries', 135 | queryjson)]), 136 | headers=formContentHeader) 137 | 138 | assert response.status_code == 200 139 | 140 | matchBatch = json.loads(response.data) 141 | 142 | assert query.keys() == matchBatch.keys() 143 | 144 | results = matchBatch['q0']['result'] 145 | 146 | # Only NYC and Chicago have longitude and latitude within 10 points 147 | assert len(results) == 2 148 | 149 | score = {baseDataLkupNm[r['id']]: r['score'] for r in results} 150 | 151 | assert 'Chicago' in score 152 | assert 'New York City' in score 153 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | 4 | def dummydoc(): 5 | if not os.path.exists("README.md"): 6 | print("Creating README.md ...") 7 | f = open("README.md", "w") 8 | f.close() 9 | --------------------------------------------------------------------------------
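A quick worked example of the scoring path, assuming the package is installed: build the sorted, de-duplicated bigram string for two words (mirroring makeBigrams, minus Unicode normalization and stopword handling) and score them with the pure-Python Dice implementation from csv_reconcile_dice/utils.py:

from csv_reconcile_dice.utils import getDiceCoefficient

def bigrams(word):
    # sorted set of bigrams joined into one string, the form the scorer stores
    return ''.join(sorted(set(word[i:i + 2] for i in range(len(word) - 1))))

# 'night' and 'nacht' share only the bigram 'ht': 2 * 1 / (4 + 4) = 0.25 -> 25.0
print(getDiceCoefficient(bigrams('night'), bigrams('nacht')))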