├── .gitignore ├── LICENSE.txt ├── README.md ├── app.py ├── docs └── images │ ├── add_columns.png │ ├── add_service.png │ ├── create_new.png │ ├── matching.png │ ├── refine.png │ ├── select_columns.png │ ├── start_reconciling.png │ └── start_reconciling_cfg.png ├── guardian_2010_greatest_films_of_all_time.csv ├── movie_posters.xlsx ├── reconciliation ├── __init__.py └── reconcile.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | # PyCharm 141 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 142 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 143 | # and can be added to the global gitignore or merged into this file. For a more nuclear 144 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 145 | #.idea/ 146 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2021 Patrick O'Leary, PrefTech Inc 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reconciliation 2 | Reconciliation is a python flask framework that works with OpenRefine Reconcile Data capabilities. 3 | Reconcile is the ability to match and enhance data from multiple sources over the web. 4 | 5 | This framework provides the ability to create one of those sources without having to reimplement the underlying protocol. 6 | Reconciliation will provide 7 | 8 | * Request parsing 9 | * Handler for Reconciliation JSON Protocol 10 | * Function decorators for 11 | * Search 12 | * Required 13 | * Used to map incoming data to an entity id in your data 14 | * Search in batch mode 15 | * Required 16 | * Same as search but used for batches for performance 17 | * Extend 18 | * Optional - but kind of useless without it 19 | * Used to add additional columns / fields to end users data 20 | * Preview 21 | * Optional - really handy for users 22 | * Used as a hover preview method, by creating an iframe in openrefine 23 | * View 24 | * Optional - really useful 25 | * Used to show the entity in a browser 26 | 27 | ## Why use Open Refines reconciliation 28 | OpenRefine provides a desktop/browser tool for data management and curation. This tool provides an excel like interface for cleaning, scripting abilities to augment data, enhancing data with the ability to fetch data over the internet and supliment the data you already have. 29 | Details on how to use OpenRefine's Reconciliation interface are available https://docs.openrefine.org/manual/reconciling 30 | A list of known publically available services is available https://reconciliation-api.github.io/testbench/ 31 | 32 | With OpenRefine you have an out of the box you have a UI and tool that will allow individuals or teams to work without having to build a tool from scratch. 33 | ![OpenRefine Screen Shot](https://raw.githubusercontent.com/preftech/reconciliation/main/docs/images/refine.png) 34 | 35 | 36 | # Installation 37 | Install using pip 38 | ```sh 39 | pip install reconciliation 40 | ``` 41 | 42 | # Usage 43 | This is a flask based app that allows you to control how you route, authenticate, log etc.. A full example exists in the app.py file 44 | 45 | ```python 46 | from reconcililiation import EntityType, InvalidUsage, Property, ReconcileRequest, ReconcileService 47 | 48 | # Create a flask app 49 | app = Flask(__name__) 50 | 51 | # initialize a ReconcileService 52 | rs = ReconcileService("Movie Reconciliation", "0.1a") 53 | 54 | # Create an entity you want to serve 55 | et = EntityType("Movie", "/movie") 56 | et.properties.append(Property("imdb", "IMDB URL")) 57 | et.properties.append(Property("poster", "Poster URL")) 58 | # Add the entity to the service 59 | rs.add_entity(et) 60 | 61 | # Set the entrypoint for your application 62 | # This lets you control the URLs and if you wish to put a URL key for authentication you can do so here 63 | @app.route("/reconcile/", methods=['GET', 'POST']) 64 | @app.route("/reconcile/", methods=['GET', 'POST']) 65 | @app.route("/reconcile//", methods=['GET', 'POST']) 66 | def handle(path=None, id=None) : 67 | return rs.serve(path, id) 68 | 69 | ``` 70 | 71 | The following decorators are used to handle incoming requests 72 | ```python 73 | 74 | @rs.search 75 | def my_search(reconcile: ReconcileRequest) 76 | ''' 77 | Will be called with a single search query 78 | reconcile.query will contain a string for the entity being searched for 79 | expects a return of : 80 | { "result" : [ 81 | { 82 | "id": , 83 | "name": , 84 | "score": , 85 | "match": , # Return True for exact match. 86 | "type": [ 87 | { # EntityType, ideally as added to the rs above 88 | "id": et.id, 89 | "name": et.name 90 | } 91 | ] 92 | } 93 | ] 94 | } 95 | ''' 96 | 97 | @rs.search_batch 98 | def my_search_batch(reconcile: ReconcileRequest): 99 | ''' 100 | By Default OpenRefine will attempt to batch up queries 101 | These will be available in reconcile.queries as a dictionary queries key off a query id 102 | e.g. 103 | { 104 | : {"query" : "text to search for"} , 105 | : {"query" : "something else to search for"} 106 | ...... 107 | } 108 | 109 | The expected return is 110 | {"results" : { 111 | : { "result" : .... } # same as single search result 112 | : { "result" : .....} 113 | ...... 114 | } 115 | } 116 | ''' 117 | 118 | @rs.extend 119 | ''' 120 | def extend(reconcile: ReconcileRequest): 121 | 122 | This is the request that handles performing "Add column based on reconciled data" 123 | A list of properties and a list of entity ids are passed in 124 | The entity and the value of the propery are expected as results. 125 | 126 | reconcile.extend.ids contains the list of entity ids 127 | reconcile.extend.properties contains the list of properties expected 128 | 129 | Return expected 130 | 131 | { 132 | "rows" : { 133 | : { 134 | "" : [ { "" : ""}], 135 | "" : [ { "" : ""},{ "" : ""}] 136 | ..... 137 | 138 | 139 | }, 140 | : { 141 | "" : [ { "" : ""}], 142 | "" : [ { "" : ""},{ "" : ""}] 143 | ..... 144 | 145 | 146 | }, 147 | ....... 148 | 149 | } 150 | } 151 | 152 | ''' 153 | @rs.preview_wh(width, height) # in pixels 154 | ''' 155 | def preview_item(id): 156 | 157 | This funtion is called when a matched results are hovered over in OpenRefine 158 | Openrefine creates an iframe of size width x height where you can display summary data for the entity 159 | Arg: 160 | id - the entity id 161 | 162 | Return : 163 | HTML 164 | 165 | ''' 166 | 167 | 168 | 169 | @rs.view 170 | ''' 171 | def view_item(id): 172 | 173 | This function is called in openrefine when a user clicks a matched entity 174 | You can return html, redirect, a file download, anything that is browser compatible 175 | 176 | Arg: 177 | id = the entity id 178 | 179 | Return: 180 | Browser compatible content 181 | 182 | ''' 183 | 184 | ``` 185 | # Reconciliation Example 186 | Start by checking out openrefine at https://openrefine.org/ and downloading the latest version of the OpenRefine software 187 | The example provided solves a simple problem, you have a spreadsheet of The Guardian's 2010 Greatest Movies of all time, 188 | however it's mising the movies posters. 189 | This sample app will load a spreadsheet call movie_posters.xlsx which contains some of the movie posters. 190 | 191 | 192 | Below I bring you through the steps of reconciling data in OpenRefine, this is not meant to be a tutorial on OpenRefine, there are way to many capabilities available in it and resources online that cover that. This is a very quick walk though on how to use this example with OpenRefine. 193 | 194 | First start this app, assume you've cloned the [reconciliation github repo](https://github.com/preftech/reconciliation), setup your [virtualenv](https://sourabhbajaj.com/mac-setup/Python/virtualenv.html) or [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) for python 195 | 196 | 197 | ### Step 1 198 | ``` 199 | pip install -r requirements.txt 200 | python app.py 201 | ``` 202 | This should start the reconciliation service at http://127.0.0.1:5000/reconcile/ 203 | 204 | ### Step 2 205 | Next launch OpenRefine and create a new project with the guardian_2010_greatest_films_of_all_time.csv 206 | ![Create New Project](https://github.com/preftech/reconciliation/blob/main/docs/images/create_new.png?raw=true) 207 | 208 | Select next > Create Project (defaults on this page are fine) 209 | You should have a spreadsheet page with the list of movies 210 | 211 | ### Step 3 Matching 212 | Next step lets reconcile the movie titles against our reconciliation service. 213 | Click the dropdown menu next to "title" > Reconcile > Start Reconciling 214 | ![Start Reconciling](https://github.com/preftech/reconciliation/blob/main/docs/images/start_reconciling.png?raw=true) 215 | 216 | Next Add the reconcilation service, you will see an >Add Standard Service button on the bottom left 217 | Type in http://127.0.0.1:5000/reconcile/ **Ensure you include the trailing slash** 218 | > Add Service 219 | 220 | ![Add Service](https://github.com/preftech/reconciliation/blob/main/docs/images/add_service.png?raw=true) 221 | 222 | Again the defaults should be fine here. 223 | Under the covers this calls http://127.0.0.1:5000/reconcile/ and receives back a list of services you have enabled 224 | This services are linked to the EntityType you added to ReconcileService and @rs.* decorators in your code. 225 | ``` 226 | et = EntityType("Movie", "/movie") 227 | et.properties.append(Property("imdb", "IMDB URL")) 228 | et.properties.append(Property("poster", "Poster URL")) 229 | ``` 230 | 231 | ``` 232 | @rs.view 233 | ..... 234 | ```` 235 | 236 | At this stage you will see your spreadsheet and a status toast showing you how much matching has occurred. 237 | Once Complete you will see a set of Facet / Filters appear on the left handside and some additional data in the title column. 238 | ![Matching Screen](https://github.com/preftech/reconciliation/blob/main/docs/images/matching.png?raw=true) 239 | 240 | 241 | As you can see from the Facet / Filter we matched 36 movies this is the field "match" from our @rs.search/@rs.search_batch method. 242 | In this example if we had single result from the data and name matched we declared it a match. 243 | 244 | You can select either matched if you just want to work with fully matched data, or none if you want to work on data that isn't fully matched. 245 | 246 | Let's take look at "Nosferatu" on row 3 247 | There are 2 possible movies in the results the 1922 original or a remake in 1986. Hovering over them calls the @rs.preview method. In out example we return html with the movie poster as an image. 248 | A reviewer can now select which movie is the correct one, if there are duplicates in the data, then it can be applied to all other cells with the same title. 249 | 250 | The judgement score is a sliding filter based on the field "score" from @rs.search / @rs.search_batch 251 | In our case we divided 100 / No. Matches so a score of 100 = 1 match, 50 = 2 matches, 3 = 33.3 and so on... 252 | 253 | By default blanks; 0 matches are included, you can uncheck this box to exclude them, and use the slider to work through matching data and prioritizing it. 254 | 255 | ### Step 4 Extend Data 256 | At this stage we have data in OpenRefine reconciled / matched to the data in our service as best as we can, we have had humna judgement clean it up and now we want to add fields to the matched data. 257 | 258 | Click the dropdown menu next to title > Edit Column > Add Columns from Reconciled Values 259 | ![Add Columns](https://github.com/preftech/reconciliation/blob/main/docs/images/add_columns.png?raw=true) 260 | 261 | 262 | The next screen that pops up, will display the available properties that you have configured from the EntityType properties. 263 | If haven't filtered the rows from the Facet/Filter panel, you may have 'non-reconciled' rows showing up here. If you don't want that hit escape and in judgement scores simply uncheck the "blanks" checkbox, and repeat this step. 264 | ![Add Columns](https://github.com/preftech/reconciliation/blob/main/docs/images/add_columns.png?raw=true) 265 | 266 | You can now select 1 or more properties to add to your data, and click ok. 267 | This will now call the @rs.extend method of your code. 268 | OpenRefine will now fetch the properties you've requested and proceed to add them as columns to the entities that have **been matched**. 269 | 270 | 271 | 272 | ## Attributions 273 | A thanks has to go out to the following for data used in the example 274 | * Babu Thomas (@babu-thomas) 275 | * MovieLens project 276 | * https://github.com/babu-thomas/movielens-posters 277 | * Owen Temples - https://owentemple.com/ 278 | * Guardian's 2010 Greatest Movies of all Time 279 | * https://data.world/owentemple/greatest-films-of-all-time -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, redirect 2 | from pprint import pprint as pp 3 | from flask_jsonpify import jsonify 4 | import pandas as pd 5 | from reconciliation.reconcile import EntityType, InvalidUsage, Property, ReconcileRequest, ReconcileService 6 | 7 | app = Flask(__name__) 8 | 9 | et = EntityType("Movie", "/movie") 10 | et.properties.append(Property("imdb", "IMDB URL")) 11 | et.properties.append(Property("poster", "Poster URL")) 12 | 13 | rs = ReconcileService("Movie Reconciliation", "0.1a") 14 | rs.add_entity(et) 15 | 16 | @app.route("/reconcile") # optional - forces a 301 redirect 17 | @app.route("/reconcile/", methods=['GET', 'POST']) 18 | @app.route("/reconcile/", methods=['GET', 'POST']) 19 | @app.route("/reconcile//", methods=['GET', 'POST']) 20 | def handle(path=None, id=None) : 21 | return rs.serve(path, id) 22 | 23 | 24 | # Add an error handler to give JSON errors back to the client 25 | @app.errorhandler(InvalidUsage) 26 | def handle_invalid_usage(error): 27 | response = jsonify(error.to_dict()) 28 | response.status_code = error.status_code 29 | return response 30 | 31 | @rs.search 32 | def search_single(reconcile: ReconcileRequest = None): 33 | # Single query 34 | return search(reconcile.query) 35 | 36 | @rs.search_batch 37 | def search_batch(reconcile: ReconcileRequest = None): 38 | # Batch of queries 39 | results = {} 40 | for k,v in reconcile.queries.items() : 41 | q_txt = v["query"] 42 | results[k] = search(q_txt) 43 | return results 44 | 45 | @rs.extend 46 | def extend(reconcile: ReconcileRequest = None): 47 | # Extend query 48 | # Contains document ids and properties to add 49 | # Response requires 50 | # meta : [ properties ] 51 | # rows : [ results ] 52 | result = {} 53 | extend = reconcile.extend 54 | ids = extend["ids"] 55 | props_requested = extend["properties"] 56 | if len(props_requested) > 0 : 57 | pr = list(map(lambda x: x["id"], props_requested)) 58 | if ids is not None : 59 | ids = list(map(lambda x: int(x), ids)) 60 | result["rows"] = get_by_id_props(ids, pr) 61 | 62 | return result 63 | 64 | @rs.view 65 | def view_item(id): 66 | url = get_by_id(id, "imdb") 67 | #print(id) 68 | #print(url) 69 | return redirect(url) 70 | 71 | @rs.preview_wh(200, 200) 72 | def preview_item(id): 73 | html = """ 74 | 75 | 76 | 77 | 78 | """ 79 | poster = get_by_id(id, "poster") 80 | return html.format(poster) 81 | 82 | ### 83 | # Read an excel file which holds our sample data 84 | # 85 | ## 86 | df = pd.read_excel("movie_posters.xlsx", engine="openpyxl") 87 | default_search_column = "title" 88 | 89 | # Search excel file 90 | def search(query): 91 | results = [] 92 | # Find a movie by the title from movie_posters.xlsx 93 | match_results = df[df[default_search_column].str.contains(query, na=False)] 94 | num_res = len(match_results) 95 | 96 | # Create Results 97 | for index, row in match_results.iterrows(): 98 | score = (100 / num_res) 99 | 100 | match = { 101 | "id": row["id"], 102 | "name": row["title"], 103 | "score": score, 104 | "match": (query in row.values and score == 100), # Return True for exact match. 105 | "type": [{ 106 | "id": et.id, 107 | "name": et.name 108 | } 109 | ] 110 | } 111 | 112 | results.append(match) 113 | 114 | return {"result": results} 115 | 116 | 117 | # Extend data with additional properties 118 | def get_by_id_props(ids, props) : 119 | """ 120 | return data in this format 121 | { 122 | "rows" : { 123 | : { 124 | "" : [ { "" : ""}], 125 | "" : [ { "" : ""},{ "" : ""}] 126 | ..... 127 | 128 | 129 | }, 130 | : { 131 | "" : [ { "" : ""}], 132 | "" : [ { "" : ""},{ "" : ""}] 133 | ..... 134 | 135 | 136 | }, 137 | ....... 138 | 139 | } 140 | } 141 | """ 142 | results_df = df[df["id"].isin(ids)] 143 | results = {} 144 | for idx, row in results_df.iterrows() : 145 | result_p = {} 146 | for p in props : 147 | result_p[p] = [ {"str" :row[p]}] 148 | results[row["id"]] = result_p 149 | pp(results_df) 150 | return results 151 | 152 | 153 | def get_by_id(id, column) : 154 | 155 | # Important to ensure that your excel and parameters are the cast to the same type 156 | # either int or str 157 | 158 | result_df = df[df["id"] == int(id)] 159 | for idx, row in result_df.iterrows() : 160 | #print(row) 161 | return str(row[column]) 162 | 163 | 164 | 165 | if __name__ == '__main__': 166 | app.run(host="127.0.0.1", port=5000, debug=True) -------------------------------------------------------------------------------- /docs/images/add_columns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/docs/images/add_columns.png -------------------------------------------------------------------------------- /docs/images/add_service.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/docs/images/add_service.png -------------------------------------------------------------------------------- /docs/images/create_new.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/docs/images/create_new.png -------------------------------------------------------------------------------- /docs/images/matching.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/docs/images/matching.png -------------------------------------------------------------------------------- /docs/images/refine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/docs/images/refine.png -------------------------------------------------------------------------------- /docs/images/select_columns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/docs/images/select_columns.png -------------------------------------------------------------------------------- /docs/images/start_reconciling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/docs/images/start_reconciling.png -------------------------------------------------------------------------------- /docs/images/start_reconciling_cfg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/docs/images/start_reconciling_cfg.png -------------------------------------------------------------------------------- /guardian_2010_greatest_films_of_all_time.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/guardian_2010_greatest_films_of_all_time.csv -------------------------------------------------------------------------------- /movie_posters.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/movie_posters.xlsx -------------------------------------------------------------------------------- /reconciliation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/reconciliation/__init__.py -------------------------------------------------------------------------------- /reconciliation/reconcile.py: -------------------------------------------------------------------------------- 1 | import json 2 | from os import name 3 | import typing as t 4 | from flask import request 5 | from marshmallow import Schema, fields 6 | from marshmallow.decorators import post_dump, post_load 7 | from flask_jsonpify import jsonpify 8 | 9 | 10 | class EntityType: 11 | def __init__(self, name: str, id: str)-> None: 12 | self.id = id 13 | self.name = name 14 | self.properties = [] 15 | 16 | class Property: 17 | def __init__(self, id, name) -> None: 18 | self.id = id 19 | self.name = name 20 | 21 | class PropertySchema(Schema): 22 | id = fields.Str() 23 | name = fields.Str() 24 | 25 | class EntityTypeSchema(Schema): 26 | id = fields.Str() 27 | name = fields.Str() 28 | properties = fields.Nested(PropertySchema, many=True) 29 | 30 | 31 | class ServiceProperty: 32 | def __init__(self, service_url, service_path) -> None: 33 | self.service_url = service_url 34 | self.service_path = service_path 35 | 36 | class ServicePropertySchema(Schema): 37 | service_url = fields.Url() 38 | service_path = fields.Str() 39 | 40 | ''' 41 | Per request, map the host name of the request to the service url, remove the leading slash 42 | ''' 43 | @post_dump 44 | def add_host_service_url(self, data, many): 45 | result = {} 46 | for k, v in data.items() : 47 | result[k] = v 48 | if k == "service_url" : 49 | result[k] = request.host_url+v[1:] 50 | 51 | return result 52 | 53 | class ExtendService: 54 | def __init__(self, service_url, service_path="/propose_properties") -> None: 55 | self.propose_properties = ServiceProperty(service_url, service_path) 56 | 57 | class ExtendServiceSchema(Schema): 58 | propose_properties = fields.Nested(ServicePropertySchema()) 59 | 60 | 61 | class ViewService: 62 | def __init__(self, service_url, service_path="/view/{{id}}") -> None: 63 | self.url = "{}{}".format(service_url,service_path) 64 | 65 | class ViewServiceSchema(Schema): 66 | url = fields.Url() 67 | @post_dump 68 | def set_view_url(self, data, many): 69 | return {"url" : request.host_url+data["url"][1:] } 70 | 71 | class PreviewService: 72 | def __init__(self, service_url, service_path="/preview/{{id}}", width=200, height=200) -> None: 73 | self.url = "{}{}".format(service_url, service_path) 74 | self.width = width 75 | self.height =height 76 | 77 | class PreviewServiceSchema(Schema): 78 | url = fields.Url() 79 | width = fields.Int() 80 | height = fields.Int() 81 | @post_dump 82 | def set_view_url(self, data, many): 83 | print(data) 84 | return {"url" : request.host_url+data["url"][1:], 85 | "width": data["width"], 86 | "height" : data["height"]} 87 | 88 | class ReconcileRequest: 89 | def __init__(self, query=None, queries=None, extend=None) -> None: 90 | self.query = query 91 | self.queries = queries 92 | self.extend = extend 93 | 94 | class ReconcileRequestSchema(Schema): 95 | query = fields.Str() 96 | queries = fields.Dict() 97 | extend = fields.Dict() 98 | 99 | @post_load 100 | def make_reconcilie_request(self, data, **kwargs): 101 | print(data) 102 | return ReconcileRequest(**data) 103 | 104 | 105 | class ReconcileService: 106 | def __init__(self, name, version, base_path="/reconcile") -> None: 107 | self.name = name 108 | self.version = str(version) 109 | self.manifest = Manifest(self.name, [self.version]) 110 | self.services = {} 111 | self.base_path = base_path 112 | self.entities = {} 113 | 114 | def extend(self, func, *args) ->t.Callable: 115 | ''' 116 | def extend(reconcile: ReconcileRequest): 117 | 118 | This is the request that handles performing "Add column based on reconciled data" 119 | A list of properties and a list of entity ids are passed in 120 | The entity and the value of the propery are expected as results. 121 | 122 | reconcile.extend.ids contains the list of entity ids 123 | reconcile.extend.properties contains the list of properties expected 124 | 125 | Return expected 126 | 127 | { 128 | "rows" : { 129 | : { 130 | "" : [ { "" : ""}], 131 | "" : [ { "" : ""},{ "" : ""}] 132 | ..... 133 | 134 | 135 | }, 136 | : { 137 | "" : [ { "" : ""}], 138 | "" : [ { "" : ""},{ "" : ""}] 139 | ..... 140 | 141 | 142 | }, 143 | ....... 144 | 145 | } 146 | } 147 | 148 | ''' 149 | self.services["extend"] = func 150 | self.manifest.extend = ExtendService(self.base_path) 151 | def inner() -> t.Callable: 152 | return func() 153 | 154 | return inner 155 | 156 | def search(self, func, *args) ->t.Callable: 157 | """ 158 | @rs.search 159 | def my_search(reconcile: ReconcileRequest) 160 | 161 | Will be called with a single search query 162 | reconcile.query will contain a string for the entity being searched for 163 | expects a return of : 164 | { "result" : [ 165 | { 166 | "id": , 167 | "name": , 168 | "score": , 169 | "match": , # Return True for exact match. 170 | "type": [ 171 | { # EntityType, ideally as added to the rs above 172 | "id": et.id, 173 | "name": et.name 174 | } 175 | ] 176 | } 177 | ] 178 | } 179 | """ 180 | self.services["search"] = func 181 | def inner() -> t.Callable: 182 | return func() 183 | 184 | return inner 185 | 186 | def search_batch(self, func, *args) ->t.Callable: 187 | ''' 188 | @rs.search_batch 189 | def search_batch(reconcile: ReconcileRequest = None): 190 | 191 | By Default OpenRefine will attempt to batch up queries 192 | These will be available in reconcile.queries as a dictionary queries key off a query id 193 | e.g. 194 | { 195 | : {"query" : "text to search for"} , 196 | : {"query" : "something else to search for"} 197 | ...... 198 | } 199 | 200 | The expected return is 201 | {"results" : { 202 | : { "result" : .... } # same as single search result 203 | : { "result" : .....} 204 | ...... 205 | } 206 | } 207 | ''' 208 | self.services["search_batch"] = func 209 | def inner() -> t.Callable: 210 | return func() 211 | 212 | return inner 213 | 214 | def view(self, func, *args) -> t.Callable: 215 | self.services["view"] = func 216 | self.manifest.view = ViewService(self.base_path) 217 | 218 | def inner() -> t.Callable: 219 | return func() 220 | return inner 221 | 222 | def preview_wh(self, width, height): 223 | ''' 224 | @rs.preview_wh(200, 200) 225 | 226 | def preview_item(id): 227 | 228 | This funtion is called when a matched results are hovered over in OpenRefine 229 | Openrefine creates an iframe of size width x height where you can display summary data for the entity 230 | Arg: 231 | id - the entity id 232 | 233 | Return : 234 | HTML 235 | 236 | ''' 237 | 238 | 239 | self.manifest.preview = PreviewService(self.base_path, "/preview/{{id}}", width, height) 240 | def preview(func, *args) -> t.Callable: 241 | 242 | self.services["preview"] = func 243 | def inner() -> t.Callable: 244 | return func() 245 | return inner 246 | return preview 247 | 248 | def add_entity(self, entity: EntityType) -> None: 249 | id = entity.id 250 | self.entities[id] = entity 251 | 252 | # this is not a good way to do this 253 | # meta props should be tied to a type or to the result 254 | def get_props_meta(self, props) : 255 | p_meta = [] 256 | p_ids = list(map(lambda x : x["id"], props)) 257 | for k,e in self.entities.items() : 258 | print(e) 259 | for p in e.properties : 260 | if p.id in p_ids : 261 | p_meta.append({"id": p.id, "name": p.name}) 262 | 263 | return p_meta 264 | 265 | 266 | def serve(self, path, id): 267 | 268 | if path == "propose_properties" : 269 | if "extend" not in self.services : 270 | raise InvalidUsage("extend is not enabled for this API") 271 | 272 | type = request.args.get("type") 273 | if type is None : 274 | raise InvalidUsage("Missing type parameter for Entity") 275 | 276 | # should I raise this? types seem loosely coupled 277 | #if type not in self.entities : 278 | # raise InvalidUsage("Supplies type parameter is not available") 279 | print(self.entities) 280 | schema = EntityTypeSchema().dump(self.entities[type]) 281 | return jsonpify(schema) 282 | 283 | if path == "view" : 284 | return self.services["view"](id) 285 | 286 | if path == "preview" : 287 | return self.services["preview"](id) 288 | 289 | if request.method == "POST" : 290 | incoming_request = "" 291 | if request.mimetype in ['application/json', 'application/javascript'] : 292 | incoming_request = request.get_json() 293 | else : 294 | incoming_request = request.form 295 | 296 | ##### 297 | ## convert str to dict 298 | ### 299 | ir = {} 300 | if "query" in incoming_request : 301 | ir = incoming_request 302 | else : 303 | for k in incoming_request.keys(): 304 | ir[k] = json.loads(incoming_request[k]) 305 | 306 | print(ir) 307 | 308 | reconcile_request = ReconcileRequestSchema().load(ir) 309 | if reconcile_request.query is not None: 310 | result = self.services["search"](reconcile_request) 311 | 312 | if reconcile_request.queries is not None: 313 | result = self.services["search_batch"](reconcile_request) 314 | 315 | if reconcile_request.extend is not None: 316 | result = self.services["extend"](reconcile_request) 317 | requested_properties = reconcile_request.extend["properties"] 318 | print(requested_properties) 319 | result["meta"] = self.get_props_meta(requested_properties) 320 | 321 | 322 | return jsonpify(result) 323 | 324 | 325 | return jsonpify(ManifestSchema().dump(self.manifest)) 326 | 327 | 328 | class Manifest: 329 | def __init__(self, name, versions) -> None: 330 | """ 331 | Create a manifest to describe the service 332 | Args: 333 | name str: Contain the name of the service 334 | versions list: list of strings for versions 335 | e.g. 336 | ["1.1", "1.2", "1.2.1", "2.3"] 337 | 338 | """ 339 | self.versions = versions 340 | self.name = name 341 | self.defaultTypes = [] 342 | self.identifierSpace = None 343 | self.schemaSpace = None 344 | self.view = None 345 | self.preview = None 346 | self.suggest = None 347 | self.extend = None 348 | 349 | def add_type(self, type: EntityType) -> None: 350 | self.defaultTypes[type.id] = type 351 | 352 | def toJson(self): 353 | return json.dumps(self, default=lambda o: o.__dict__) 354 | 355 | 356 | class ManifestSchema(Schema) : 357 | versions = fields.List(fields.Str()) 358 | name = fields.Str() 359 | defaultTypes = fields.Nested(EntityTypeSchema, many=True) 360 | identifierSpace = fields.Url() 361 | schemaSpace = fields.Url() 362 | view = fields.Nested(ViewServiceSchema()) 363 | preview = fields.Nested(PreviewServiceSchema()) 364 | suggest = None # TODO: Create a Suggest service 365 | extend = fields.Nested(ExtendServiceSchema()) 366 | SKIP_VALUES = [None] 367 | 368 | @post_dump 369 | def remove_skip_values(self, data, many): 370 | 371 | return { 372 | key: value for key, value in data.items() 373 | if value not in self.SKIP_VALUES 374 | } 375 | 376 | class InvalidUsage(Exception): 377 | status_code = 400 378 | def __init__(self, message, status_code=None, payload=None): 379 | Exception.__init__(self) 380 | self.message = message 381 | if status_code is not None: 382 | self.status_code = status_code 383 | self.payload = payload 384 | 385 | def to_dict(self): 386 | rv = dict(self.payload or ()) 387 | rv['message'] = self.message 388 | return rv 389 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==2.0.2 2 | Flask-Jsonpify==1.5.0 3 | marshmallow==3.14.1 4 | openpyxl==3.0.9 5 | pandas==1.3.4 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from setuptools import setup 4 | 5 | with open('README.md') as f: 6 | long_description = f.read() 7 | 8 | setup( 9 | name = 'reconciliation', 10 | packages = ['reconciliation'], 11 | version = '0.3', 12 | license='MIT', 13 | description = 'An OpenRefine Reconciliation Framework for Python', 14 | long_description=long_description, 15 | long_description_content_type="text/markdown", 16 | author = 'patrick oleary', # Type in your name 17 | author_email = 'techops@preftech.com', # Type in your E-Mail 18 | url = 'https://github.com/preftech/reconciliation', # Provide either the link to your github or to your website 19 | keywords = ['reconcile', 'reconciliation', 'openrefine', 'cocoda'], # Keywords that define your package best 20 | install_requires=[ 21 | "Flask==2.0.2", 22 | "Flask-Jsonpify==1.5.0", 23 | "marshmallow==3.14.1" 24 | ], 25 | classifiers=[ 26 | 'Development Status :: 4 - Beta', # Chose either "3 - Alpha", "4 - Beta" or "5 - Production/Stable" as the current state of your package 27 | 'Intended Audience :: Developers', # Define that your audience are developers 28 | 'Topic :: Text Processing', 29 | 'Framework :: Flask', 30 | 'License :: OSI Approved :: MIT License', # Again, pick a license 31 | 'Programming Language :: Python :: 3.6', 32 | 'Programming Language :: Python :: 3.7', 33 | 'Programming Language :: Python :: 3.8' 34 | ], 35 | ) --------------------------------------------------------------------------------