├── .gitignore
├── LICENSE.txt
├── README.md
├── app.py
├── docs
    └── images
    │   ├── add_columns.png
    │   ├── add_service.png
    │   ├── create_new.png
    │   ├── matching.png
    │   ├── refine.png
    │   ├── select_columns.png
    │   ├── start_reconciling.png
    │   └── start_reconciling_cfg.png
├── guardian_2010_greatest_films_of_all_time.csv
├── movie_posters.xlsx
├── reconciliation
    ├── __init__.py
    └── reconcile.py
├── requirements.txt
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 98 | __pypackages__/
 99 | 
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 | 
104 | # SageMath parsed files
105 | *.sage.py
106 | 
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 | 
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 | 
120 | # Rope project settings
121 | .ropeproject
122 | 
123 | # mkdocs documentation
124 | /site
125 | 
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 | 
131 | # Pyre type checker
132 | .pyre/
133 | 
134 | # pytype static type analyzer
135 | .pytype/
136 | 
137 | # Cython debug symbols
138 | cython_debug/
139 | 
140 | # PyCharm
141 | #  JetBrains specific template is maintainted in a separate JetBrains.gitignore that can
142 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
143 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
144 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
145 | #.idea/
146 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright 2021 Patrick O'Leary, PrefTech Inc
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
4 | 
5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
6 | 
7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Reconciliation 
  2 | Reconciliation is a python flask framework that works with OpenRefine Reconcile Data capabilities.
  3 | Reconcile is the ability to match and enhance data from multiple sources over the web.
  4 | 
  5 | This framework provides the ability to create one of those sources without having to reimplement the underlying protocol.
  6 | Reconciliation will provide
  7 | 
  8 | * Request parsing
  9 | * Handler for Reconciliation JSON Protocol
 10 | * Function decorators for
 11 |   * Search
 12 |     * Required
 13 |     * Used to map incoming data to an entity id in your data
 14 |   * Search in batch mode
 15 |     * Required
 16 |     * Same as search but used for batches for performance
 17 |   * Extend 
 18 |     * Optional - but kind of useless without it
 19 |     * Used to add additional columns / fields to end users data
 20 |   * Preview
 21 |     * Optional - really handy for users
 22 |     * Used as a hover preview method, by creating an iframe in openrefine
 23 |   * View
 24 |     * Optional - really useful
 25 |     * Used to show the entity in a browser
 26 | 
 27 | ## Why use OpenRefine's reconciliation
 28 | OpenRefine provides a desktop/browser tool for data management and curation. It offers an Excel-like interface for cleaning data, scripting abilities to augment it, and the ability to fetch data over the internet to supplement the data you already have.
 29 | Details on how to use OpenRefine's Reconciliation interface are available at https://docs.openrefine.org/manual/reconciling
 30 | A list of known publicly available services is available at https://reconciliation-api.github.io/testbench/
 31 | 
 32 | With OpenRefine you get, out of the box, a UI and tool that allow individuals or teams to work without having to build a tool from scratch.
 33 | ![OpenRefine Screen Shot](https://raw.githubusercontent.com/preftech/reconciliation/main/docs/images/refine.png)
 34 | 
 35 | 
 36 | # Installation
 37 | Install using pip
 38 | ```sh
 39 | pip install reconciliation
 40 | ```
 41 | 
 42 | # Usage
 43 | This is a flask based app that allows you to control how you route, authenticate, log etc.. A full example exists in the app.py file
 44 | 
 45 | ```python
 46 | from reconciliation.reconcile import EntityType, InvalidUsage, Property, ReconcileRequest, ReconcileService
 47 | 
 48 | # Create a flask app
 49 | app = Flask(__name__)
 50 | 
 51 | # initialize a ReconcileService
 52 | rs = ReconcileService("Movie Reconciliation", "0.1a")
 53 | 
 54 | # Create an entity you want to serve 
 55 | et = EntityType("Movie", "/movie")
 56 | et.properties.append(Property("imdb", "IMDB URL"))
 57 | et.properties.append(Property("poster", "Poster URL"))
 58 | # Add the entity to the service
 59 | rs.add_entity(et)
 60 | 
 61 | # Set the entrypoint for your application
 62 | # This lets you control the URLs and if you wish to put a URL key for authentication you can do so here
 63 | @app.route("/reconcile/", methods=['GET', 'POST'])
 64 | @app.route("/reconcile/<path:path>", methods=['GET', 'POST'])
 65 | @app.route("/reconcile/<path:path>/<path:id>", methods=['GET', 'POST'])
 66 | def handle(path=None, id=None) :
 67 |     return rs.serve(path, id)
 68 | 
 69 | ```
 70 | 
 71 | The following decorators are used to handle incoming requests
 72 | ```python
 73 | 
 74 | @rs.search
 75 | def my_search(reconcile: ReconcileRequest):
 76 |     '''
 77 |     Will be called with a single search query
 78 |     reconcile.query will contain a string for the entity being searched for
 79 |     expects a return of :
 80 |         { "result" : [
 81 |                         {
 82 |                         "id": <unique id of entity>,
 83 |                         "name": <name of entity>, 
 84 |                         "score": <int of a score>,
 85 |                         "match": <True or False>, # Return True for exact match.
 86 |                         "type":     [
 87 |                                         { # EntityType, ideally as added to the rs above
 88 |                                             "id": et.id,
 89 |                                             "name": et.name
 90 |                                         }
 91 |                                     ]
 92 |                         }              
 93 |                     ]
 94 |         }
 95 |     '''
 96 | 
 97 | @rs.search_batch
 98 | def my_search_batch(reconcile: ReconcileRequest):
 99 |     '''
100 |     By Default OpenRefine will attempt to batch up queries
101 |     These will be available in reconcile.queries as a dictionary of queries keyed by query id
102 |     e.g. 
103 |         {
104 |             <query_id_1> : {"query" : "text to search for"} ,
105 |             <query_id_2> : {"query" : "something else to search for"}
106 |             ......
107 |         }
108 | 
109 |     The expected return is
110 |     {"results" : {
111 |                     <query_id_1> : { "result" : .... } # same as single search result
112 |                     <query_id_2> : { "result" : .....}
113 |                     ......
114 |                 }
115 |     }
116 |     '''
117 | 
118 | @rs.extend
119 | '''
120 | def extend(reconcile: ReconcileRequest):
121 | 
122 | This is the request that handles performing "Add column based on reconciled data"
123 | A list of properties and a list of entity ids are passed in 
124 | The entity and the value of the property are expected as results.
125 | 
126 | reconcile.extend.ids contains the list of entity ids
127 | reconcile.extend.properties contains the list of properties expected
128 | 
129 | Return expected
130 | 
131 | {
132 |     "rows" : {
133 |                 <entity_id> : {
134 |                     "<property_id>" : [ { "<str>" : "<property value>"}],
135 |                     "<property_id2>" : [ { "<str>" : "<property value>"},{ "<str>" : "<property other value>"}]
136 |                     .....
137 |                     
138 | 
139 |                 },
140 |                  <entity_id2> : {
141 |                     "<property_id>" : [ { "<str>" : "<property value>"}],
142 |                     "<property_id2>" : [ { "<str>" : "<property value>"},{ "<str>" : "<property other value>"}]
143 |                     .....
144 |                     
145 | 
146 |                 },
147 |                 .......
148 | 
149 |     }
150 | }
151 | 
152 | '''
153 | @rs.preview_wh(width, height) # in pixels
154 | '''
155 | def preview_item(id):
156 | 
157 | This function is called when a matched result is hovered over in OpenRefine
158 | OpenRefine creates an iframe of size width x height where you can display summary data for the entity
159 | Arg: 
160 |     id - the entity id
161 | 
162 | Return : 
163 |     HTML 
164 | 
165 | '''
166 | 
167 | 
168 | 
169 | @rs.view
170 | '''
171 | def view_item(id):
172 | 
173 | This function is called in openrefine when a user clicks a matched entity 
174 | You can return html, redirect, a file download, anything that is browser compatible 
175 | 
176 | Arg: 
177 |     id = the entity id
178 | 
179 | Return: 
180 |     Browser compatible content 
181 | 
182 | '''
183 | 
184 | ```
185 | # Reconciliation Example
186 | Start by checking out openrefine at https://openrefine.org/ and downloading the latest version of the OpenRefine software
187 | The example provided solves a simple problem: you have a spreadsheet of The Guardian's 2010 Greatest Movies of all time,
188 | however it's missing the movie posters.
189 | This sample app will load a spreadsheet called movie_posters.xlsx which contains some of the movie posters.
190 | 
191 | 
192 | Below I take you through the steps of reconciling data in OpenRefine. This is not meant to be a tutorial on OpenRefine; it has far too many capabilities for that, and there are resources online that cover them. This is a very quick walkthrough of how to use this example with OpenRefine.
193 | 
194 | First, start this app. This assumes you've cloned the [reconciliation github repo](https://github.com/preftech/reconciliation) and set up your [virtualenv](https://sourabhbajaj.com/mac-setup/Python/virtualenv.html) or [conda](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) environment for Python.
195 | 
196 | 
197 | ### Step 1
198 | ```
199 | pip install -r requirements.txt
200 | python app.py
201 | ```
202 | This should start the reconciliation service at http://127.0.0.1:5000/reconcile/
203 | 
204 | ### Step 2
205 | Next launch OpenRefine and create a new project with the guardian_2010_greatest_films_of_all_time.csv
206 | ![Create New Project](https://github.com/preftech/reconciliation/blob/main/docs/images/create_new.png?raw=true)
207 | 
208 | Select next > Create Project (defaults on this page are fine)
209 | You should have a spreadsheet page with the list of movies
210 | 
211 | ### Step 3 Matching
212 | Next step lets reconcile the movie titles against our reconciliation service.
213 | Click the dropdown menu next to "title" > Reconcile > Start Reconciling
214 | ![Start Reconciling](https://github.com/preftech/reconciliation/blob/main/docs/images/start_reconciling.png?raw=true)
215 | 
216 | Next, add the reconciliation service: you will see an "Add Standard Service" button on the bottom left.
217 | Type in http://127.0.0.1:5000/reconcile/ **Ensure you include the trailing slash**
218 | > Add Service
219 | 
220 | ![Add Service](https://github.com/preftech/reconciliation/blob/main/docs/images/add_service.png?raw=true)
221 | 
222 | Again the defaults should be fine here.
223 | Under the covers this calls http://127.0.0.1:5000/reconcile/ and receives back a list of services you have enabled
224 | These services are linked to the EntityType you added to ReconcileService and the @rs.* decorators in your code.
225 | ```
226 | et = EntityType("Movie", "/movie")
227 | et.properties.append(Property("imdb", "IMDB URL"))
228 | et.properties.append(Property("poster", "Poster URL"))
229 | ```
230 | 
231 | ```
232 | @rs.view
233 | .....
234 | ```
235 | 
236 | At this stage you will see your spreadsheet and a status toast showing you how much matching has occurred. 
237 | Once complete, you will see a set of Facets / Filters appear on the left-hand side and some additional data in the title column.
238 | ![Matching Screen](https://github.com/preftech/reconciliation/blob/main/docs/images/matching.png?raw=true)
239 | 
240 | 
241 | As you can see from the Facet / Filter, we matched 36 movies; this comes from the field "match" returned by our @rs.search / @rs.search_batch methods.
242 | In this example, if we had a single result from the data and the name matched, we declared it a match.
243 | 
244 | You can select either matched if you just want to work with fully matched data, or none if you want to work on data that isn't fully matched. 
245 | 
246 | Let's take a look at "Nosferatu" on row 3.
247 | There are 2 possible movies in the results: the 1922 original or a remake in 1986. Hovering over them calls the @rs.preview method. In our example we return HTML with the movie poster as an image.
248 | A reviewer can now select which movie is the correct one, if there are duplicates in the data, then it can be applied to all other cells with the same title.
249 | 
250 | The judgement score is a sliding filter based on the field "score" from @rs.search / @rs.search_batch
251 | In our case we divided 100 / No. Matches, so a score of 100 = 1 match, 50 = 2 matches, 33.3 = 3 matches and so on...
252 | 
253 | By default blanks; 0 matches are included, you can uncheck this box to exclude them, and use the slider to work through matching data and prioritizing it.
254 | 
255 | ### Step 4 Extend Data
256 | At this stage we have data in OpenRefine reconciled / matched to the data in our service as best as we can, we have had human judgement clean it up, and now we want to add fields to the matched data.
257 | 
258 | Click the dropdown menu next to title > Edit Column > Add Columns from Reconciled Values
259 | ![Add Columns](https://github.com/preftech/reconciliation/blob/main/docs/images/add_columns.png?raw=true) 
260 | 
261 | 
262 | The next screen that pops up, will display the available properties that you have configured from the EntityType properties. 
263 | If you haven't filtered the rows from the Facet/Filter panel, you may have 'non-reconciled' rows showing up here. If you don't want that, hit escape, simply uncheck the "blanks" checkbox in judgement scores, and repeat this step.
264 | ![Add Columns](https://github.com/preftech/reconciliation/blob/main/docs/images/add_columns.png?raw=true)
265 | 
266 | You can now select 1 or more properties to add to your data, and click ok.
267 | This will now call the @rs.extend method of your code. 
268 | OpenRefine will now fetch the properties you've requested and proceed to add them as columns to the entities that have **been matched**.
269 | 
270 | 
271 | 
272 | ## Attributions
273 | A thanks has to go out to the following for data used in the example 
274 | * Babu Thomas (@babu-thomas)
275 |   * MovieLens project
276 |   * https://github.com/babu-thomas/movielens-posters
277 | * Owen Temples - https://owentemple.com/
278 |   * Guardian's 2010 Greatest Movies of all Time 
279 |   * https://data.world/owentemple/greatest-films-of-all-time


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
  1 | from flask import Flask, redirect
  2 | from pprint import pprint as pp
  3 | from flask_jsonpify import jsonify
  4 | import pandas as pd
  5 | from reconciliation.reconcile import EntityType, InvalidUsage, Property, ReconcileRequest, ReconcileService
  6 | 
  7 | app = Flask(__name__)
  8 | 
  9 | et = EntityType("Movie", "/movie")
 10 | et.properties.append(Property("imdb", "IMDB URL"))
 11 | et.properties.append(Property("poster", "Poster URL"))
 12 | 
 13 | rs = ReconcileService("Movie Reconciliation", "0.1a")
 14 | rs.add_entity(et)
 15 | 
 16 | @app.route("/reconcile")    # optional - forces a 301 redirect
 17 | @app.route("/reconcile/", methods=['GET', 'POST'])
 18 | @app.route("/reconcile/<path:path>", methods=['GET', 'POST'])
 19 | @app.route("/reconcile/<path:path>/<path:id>", methods=['GET', 'POST'])
 20 | def handle(path=None, id=None) :  # single entry point for every /reconcile URL registered above
 21 |     return rs.serve(path, id)  # pass the captured path and id (None when absent) to the service
 22 | 
 23 | 
 24 | # Add an error handler to give JSON errors back to the client
 25 | @app.errorhandler(InvalidUsage)
 26 | def handle_invalid_usage(error):
 27 |     response = jsonify(error.to_dict())
 28 |     response.status_code = error.status_code
 29 |     return response
 30 | 
 31 | @rs.search
 32 | def search_single(reconcile: ReconcileRequest = None):
 33 |     # Single query: reconcile.query is the text that search() looks up in the spreadsheet
 34 |     return search(reconcile.query)
 35 |     
 36 | @rs.search_batch
 37 | def search_batch(reconcile: ReconcileRequest = None):
 38 |     # Batch of queries
 39 |     results = {}
 40 |     for k,v in reconcile.queries.items() : 
 41 |         q_txt = v["query"]
 42 |         results[k] = search(q_txt) 
 43 |     return results
 44 | 
 45 | @rs.extend
 46 | def extend(reconcile: ReconcileRequest = None):
 47 |     # Extend query
 48 |     # Contains document ids and properties to add
 49 |     # Response requires
 50 |     #   meta : [ properties ]
 51 |     #   rows : [ results ]
 52 |     result = {}
 53 |     extend = reconcile.extend
 54 |     ids = extend["ids"]
 55 |     props_requested = extend["properties"]
 56 |     if len(props_requested) > 0 :
 57 |         pr = list(map(lambda x: x["id"], props_requested))
 58 |     if ids is not None : 
 59 |         ids = list(map(lambda x: int(x), ids))
 60 |         result["rows"] =  get_by_id_props(ids, pr)
 61 |     
 62 |     return result
 63 | 
 64 | @rs.view    
 65 | def view_item(id):
 66 |     url = get_by_id(id, "imdb")
 67 |     #print(id)
 68 |     #print(url)
 69 |     return redirect(url)
 70 | 
 71 | @rs.preview_wh(200, 200)
 72 | def preview_item(id):
 73 |     html = """
 74 |     <html>
 75 |     <body><img src="{}" width="180", height="180"/>
 76 |     </body>
 77 |     </html>
 78 |     """
 79 |     poster = get_by_id(id, "poster")
 80 |     return html.format(poster)
 81 | 
 82 | ###
 83 | #   Load the sample data from the Excel workbook once, when this module is imported.
 84 | #   The path is relative, so it is resolved against the current working directory.
 85 | ##
 86 | df = pd.read_excel("movie_posters.xlsx", engine="openpyxl")
 87 | default_search_column = "title"  # column that search() matches queries against
 88 | 
 89 | # Search excel file
 90 | def search(query):
 91 |     results = []
 92 |     # Find a movie by the title from movie_posters.xlsx
 93 |     match_results = df[df[default_search_column].str.contains(query, na=False)]
 94 |     num_res = len(match_results)
 95 |     
 96 |     # Create Results
 97 |     for index, row in match_results.iterrows():
 98 |         score = (100 / num_res)
 99 |             
100 |         match = {
101 |             "id": row["id"],
102 |             "name": row["title"], 
103 |             "score": score,
104 |             "match": (query in row.values and score == 100), # Return True for exact match.
105 |             "type":     [{ 
106 |                             "id": et.id,
107 |                             "name": et.name
108 |                           }
109 |                          ]
110 |             }
111 |     
112 |         results.append(match)
113 |         
114 |     return {"result": results}
115 | 
116 | 
117 | # Extend data with additional properties 
118 | def get_by_id_props(ids, props) :
119 |     """
120 |     return data in this format
121 |     {
122 |     "rows" : {
123 |                 <entity_id> : {
124 |                     "<property_id>" : [ { "<str>" : "<property value>"}],
125 |                     "<property_id2>" : [ { "<str>" : "<property value>"},{ "<str>" : "<property other value>"}]
126 |                     .....
127 |                     
128 | 
129 |                 },
130 |                  <entity_id2> : {
131 |                     "<property_id>" : [ { "<str>" : "<property value>"}],
132 |                     "<property_id2>" : [ { "<str>" : "<property value>"},{ "<str>" : "<property other value>"}]
133 |                     .....
134 |                     
135 | 
136 |                 },
137 |                 .......
138 | 
139 |         }
140 |     }
141 |     """
142 |     results_df = df[df["id"].isin(ids)]
143 |     results = {}
144 |     for idx, row in results_df.iterrows() :
145 |         result_p = {}
146 |         for p in props : 
147 |             result_p[p] = [ {"str" :row[p]}]
148 |         results[row["id"]] = result_p
149 |     pp(results_df)
150 |     return results
151 |         
152 | 
153 | def get_by_id(id, column) :
154 |     
155 |     # Important to ensure that your excel and parameters are the cast to the same type
156 |     # either int or str
157 |     
158 |     result_df = df[df["id"] == int(id)]
159 |     for idx, row in result_df.iterrows() :
160 |         #print(row)
161 |         return str(row[column])
162 |     
163 |     
164 |     
165 | if __name__ == '__main__':  # only when run as a script, not when imported
166 |     app.run(host="127.0.0.1", port=5000, debug=True)  # local-only server on port 5000 with Flask debug enabled


--------------------------------------------------------------------------------
/docs/images/add_columns.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/docs/images/add_columns.png


--------------------------------------------------------------------------------
/docs/images/add_service.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/docs/images/add_service.png


--------------------------------------------------------------------------------
/docs/images/create_new.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/docs/images/create_new.png


--------------------------------------------------------------------------------
/docs/images/matching.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/docs/images/matching.png


--------------------------------------------------------------------------------
/docs/images/refine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/docs/images/refine.png


--------------------------------------------------------------------------------
/docs/images/select_columns.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/docs/images/select_columns.png


--------------------------------------------------------------------------------
/docs/images/start_reconciling.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/docs/images/start_reconciling.png


--------------------------------------------------------------------------------
/docs/images/start_reconciling_cfg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/docs/images/start_reconciling_cfg.png


--------------------------------------------------------------------------------
/guardian_2010_greatest_films_of_all_time.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/guardian_2010_greatest_films_of_all_time.csv


--------------------------------------------------------------------------------
/movie_posters.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/movie_posters.xlsx


--------------------------------------------------------------------------------
/reconciliation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/preftech/reconciliation/b0687283afb713b3752e1eb87967bb7bf27c2e75/reconciliation/__init__.py


--------------------------------------------------------------------------------
/reconciliation/reconcile.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from os import name
  3 | import typing as t
  4 | from flask import request
  5 | from marshmallow import Schema, fields
  6 | from marshmallow.decorators import post_dump, post_load
  7 | from flask_jsonpify import jsonpify
  8 | 
  9 | 
 10 | class EntityType: 
 11 |     def __init__(self, name: str, id: str)-> None:
 12 |         self.id = id
 13 |         self.name = name
 14 |         self.properties = []
 15 | 
 16 | class Property: 
 17 |     def __init__(self, id, name) -> None:
 18 |         self.id = id
 19 |         self.name = name
 20 | 
 21 | class PropertySchema(Schema):  # marshmallow schema with the same id/name fields as Property
 22 |     id = fields.Str()
 23 |     name = fields.Str()
 24 |     
 25 | class EntityTypeSchema(Schema):  # marshmallow schema with the same attributes as EntityType
 26 |     id = fields.Str()
 27 |     name = fields.Str()
 28 |     properties = fields.Nested(PropertySchema, many=True)  # a list, each item handled by PropertySchema
 29 |     
 30 | 
 31 | class ServiceProperty:
 32 |     def __init__(self, service_url, service_path) -> None:
 33 |         self.service_url = service_url
 34 |         self.service_path = service_path
 35 | 
 36 | class ServicePropertySchema(Schema):
 37 |     service_url = fields.Url()
 38 |     service_path = fields.Str()
 39 |     
 40 |     '''
 41 |     Per request, map the host name of the request to the service url, remove the leading slash
 42 |     '''
 43 |     @post_dump
 44 |     def add_host_service_url(self, data, many):
 45 |         result = {}
 46 |         for k, v in data.items() :
 47 |             result[k] = v
 48 |             if k == "service_url" :
 49 |                 result[k] = request.host_url+v[1:]
 50 |                 
 51 |         return result
 52 |     
 53 | class ExtendService:  # wraps one ServiceProperty under the attribute name propose_properties
 54 |     def __init__(self, service_url, service_path="/propose_properties") -> None:
 55 |         self.propose_properties = ServiceProperty(service_url, service_path)
 56 | 
 57 | class ExtendServiceSchema(Schema):  # marshmallow schema with ExtendService's single attribute
 58 |     propose_properties = fields.Nested(ServicePropertySchema())
 59 |     
 60 | 
 61 | class ViewService:
 62 |     def __init__(self, service_url, service_path="/view/{{id}}") -> None:
 63 |         self.url = "{}{}".format(service_url,service_path)
 64 |         
 65 | class ViewServiceSchema(Schema):  # dumps a single "url" field, made absolute in the hook below
 66 |     url = fields.Url()
 67 |     @post_dump
 68 |     def set_view_url(self, data, many):
 69 |         return {"url" : request.host_url+data["url"][1:] }  # request host URL + url minus its first character
 70 | 
 71 | class PreviewService:
 72 |     def __init__(self, service_url, service_path="/preview/{{id}}", width=200, height=200) -> None:
 73 |         self.url = "{}{}".format(service_url, service_path)
 74 |         self.width = width
 75 |         self.height =height
 76 | 
 77 | class PreviewServiceSchema(Schema):
 78 |     url = fields.Url()
 79 |     width = fields.Int()
 80 |     height = fields.Int()
 81 |     @post_dump
 82 |     def set_view_url(self, data, many):
 83 |         print(data)
 84 |         return {"url" : request.host_url+data["url"][1:],
 85 |                 "width": data["width"],
 86 |                 "height" : data["height"]}
 87 | 
 88 | class ReconcileRequest:
 89 |     def __init__(self, query=None, queries=None, extend=None) -> None:
 90 |         self.query = query
 91 |         self.queries = queries
 92 |         self.extend = extend
 93 | 
 94 | class ReconcileRequestSchema(Schema):
 95 |     query = fields.Str()
 96 |     queries = fields.Dict()
 97 |     extend = fields.Dict()
 98 |     
 99 |     @post_load
100 |     def make_reconcilie_request(self, data, **kwargs):
101 |         print(data)
102 |         return ReconcileRequest(**data)
103 |     
104 | 
class ReconcileService:
    """Registry and request dispatcher for a reconciliation service.

    Handler functions are registered through the decorator methods
    (``search``, ``search_batch``, ``extend``, ``view``, ``preview_wh``) and
    are invoked later by ``serve`` depending on the incoming request.
    """

    def __init__(self, name, version, base_path="/reconcile") -> None:
        """Create the service and its manifest.

        Args:
            name: name of the service, also used for the manifest.
            version: service version; stored as a string.
            base_path: path prefix used when building view/preview/extend
                service descriptions.
        """
        self.name = name
        self.version = str(version)
        self.manifest = Manifest(self.name, [self.version])
        self.services = {}
        self.base_path = base_path
        self.entities = {}

    def _register(self, service_name, func) -> t.Callable:
        """Store ``func`` under ``service_name`` and return a pass-through wrapper.

        The wrapper forwards every positional and keyword argument, so the
        decorated name stays callable exactly like the undecorated function.
        """
        self.services[service_name] = func

        def inner(*args, **kwargs):
            return func(*args, **kwargs)

        return inner

    def _handler(self, service_name) -> t.Callable:
        """Return the handler registered as ``service_name``.

        Raises:
            InvalidUsage: when no handler was registered under that name.
        """
        try:
            return self.services[service_name]
        except KeyError:
            raise InvalidUsage(
                "{} is not enabled for this API".format(service_name)
            ) from None

    @staticmethod
    def _decode_value(value):
        """Decode a JSON-encoded string; return already-parsed values unchanged.

        Raises:
            InvalidUsage: when a string value is not valid JSON.
        """
        if not isinstance(value, (str, bytes, bytearray)):
            return value
        try:
            return json.loads(value)
        except ValueError as err:
            raise InvalidUsage("Request parameter is not valid JSON") from err

    def extend(self, func, *args) -> t.Callable:
        """
        @rs.extend
        def extend(reconcile: ReconcileRequest):

        This is the request that handles performing "Add column based on reconciled data"
        A list of properties and a list of entity ids are passed in
        The entity and the value of the property are expected as results.

        reconcile.extend.ids contains the list of entity ids
        reconcile.extend.properties contains the list of properties expected

        Return expected

        {
            "rows" : {
                        <entity_id> : {
                            "<property_id>" : [ { "<str>" : "<property value>"}],
                            "<property_id2>" : [ { "<str>" : "<property value>"},{ "<str>" : "<property other value>"}]
                            .....
                        },
                        <entity_id2> : {
                            "<property_id>" : [ { "<str>" : "<property value>"}],
                            "<property_id2>" : [ { "<str>" : "<property value>"},{ "<str>" : "<property other value>"}]
                            .....
                        },
                        .......
            }
        }

        Registering this handler also adds the extend service to the manifest.
        """
        wrapper = self._register("extend", func)
        self.manifest.extend = ExtendService(self.base_path)
        return wrapper

    def search(self, func, *args) -> t.Callable:
        """
        @rs.search
        def my_search(reconcile: ReconcileRequest)

        Will be called with a single search query
        reconcile.query will contain a string for the entity being searched for
        expects a return of :
            { "result" : [
                            {
                            "id": <unique id of entity>,
                            "name": <name of entity>,
                            "score": <int of a score>,
                            "match": <True or False>, # Return True for exact match.
                            "type":     [
                                            { # EntityType, ideally as added to the rs above
                                                "id": et.id,
                                                "name": et.name
                                            }
                                        ]
                            }
                        ]
            }
        """
        return self._register("search", func)

    def search_batch(self, func, *args) -> t.Callable:
        """
        @rs.search_batch
        def search_batch(reconcile: ReconcileRequest = None):

        By default OpenRefine will attempt to batch up queries
        These will be available in reconcile.queries as a dictionary of queries keyed by a query id
        e.g.
            {
                <query_id_1> : {"query" : "text to search for"} ,
                <query_id_2> : {"query" : "something else to search for"}
                ......
            }

        The expected return is
        {"results" : {
                        <query_id_1> : { "result" : .... } # same as single search result
                        <query_id_2> : { "result" : .....}
                        ......
                    }
        }
        """
        return self._register("search_batch", func)

    def view(self, func, *args) -> t.Callable:
        """
        @rs.view
        def view_item(id):

        Registers the handler that ``serve`` calls with the entity id for
        the "view" path, and adds the view service to the manifest.
        """
        wrapper = self._register("view", func)
        self.manifest.view = ViewService(self.base_path)
        return wrapper

    def preview_wh(self, width, height):
        """
        @rs.preview_wh(200, 200)
        def preview_item(id):

        This function is called when matched results are hovered over in OpenRefine
        OpenRefine creates an iframe of size width x height where you can display summary data for the entity
        Arg:
            id - the entity id

        Return :
            HTML

        """
        # The manifest entry is created as soon as the decorator factory is
        # called; the handler itself is registered when it is decorated.
        self.manifest.preview = PreviewService(self.base_path, "/preview/{{id}}", width, height)

        def preview(func, *args) -> t.Callable:
            return self._register("preview", func)

        return preview

    def add_entity(self, entity: "EntityType") -> None:
        """Register ``entity`` under its ``id``, replacing any previous entry."""
        self.entities[entity.id] = entity

    # this is not a good way to do this
    # meta props should be tied to a type or to the result
    def get_props_meta(self, props):
        """Return ``{"id", "name"}`` entries for the requested properties.

        Args:
            props: iterable of dicts carrying an ``"id"`` key; ``None`` or an
                empty iterable yields an empty list.

        Returns:
            One entry per requested property id that is declared on any
            registered entity. A property shared by several entities is
            reported once, using the first declaration found.
        """
        wanted_ids = {prop["id"] for prop in props or ()}
        seen_ids = set()
        p_meta = []
        for entity in self.entities.values():
            for prop in entity.properties:
                if prop.id in wanted_ids and prop.id not in seen_ids:
                    seen_ids.add(prop.id)
                    p_meta.append({"id": prop.id, "name": prop.name})
        return p_meta

    def serve(self, path, id):
        """Dispatch the current request to the matching registered handler.

        Args:
            path: sub-path of the service ("propose_properties", "view",
                "preview", or anything else for query/manifest handling).
            id: entity id forwarded to the view and preview handlers.

        Returns:
            The handler's own return value for "view"/"preview"; otherwise a
            ``jsonpify`` response holding the entity type description, the
            handler result (for POST requests), or the service manifest.

        Raises:
            InvalidUsage: when a required handler is not registered, a
                parameter is missing or unknown, or a POST body carries none
                of ``query``, ``queries`` or ``extend``.
        """
        if path == "propose_properties":
            if "extend" not in self.services:
                raise InvalidUsage("extend is not enabled for this API")

            type_id = request.args.get("type")
            if type_id is None:
                raise InvalidUsage("Missing type parameter for Entity")

            # An unknown type used to surface as an unhandled KeyError.
            if type_id not in self.entities:
                raise InvalidUsage("Supplied type parameter is not available")

            return jsonpify(EntityTypeSchema().dump(self.entities[type_id]))

        if path == "view":
            return self._handler("view")(id)

        if path == "preview":
            return self._handler("preview")(id)

        if request.method == "POST":
            if request.mimetype in ['application/json', 'application/javascript']:
                # force=True: the mimetype was checked above, and get_json()
                # on its own does not treat application/javascript as JSON.
                incoming_request = request.get_json(force=True)
            else:
                incoming_request = request.form

            if not hasattr(incoming_request, "keys"):
                raise InvalidUsage("Request body must be a JSON object or form data")

            # A single "query" is used as-is. The other parameters arrive as
            # JSON-encoded strings in form posts and are decoded here; values
            # that are already parsed (JSON body) are left untouched.
            if "query" in incoming_request:
                ir = incoming_request
            else:
                ir = {
                    key: self._decode_value(incoming_request[key])
                    for key in incoming_request.keys()
                }

            reconcile_request = ReconcileRequestSchema().load(ir)

            handled = False
            result = None
            if reconcile_request.query is not None:
                result = self._handler("search")(reconcile_request)
                handled = True

            if reconcile_request.queries is not None:
                result = self._handler("search_batch")(reconcile_request)
                handled = True

            if reconcile_request.extend is not None:
                result = self._handler("extend")(reconcile_request)
                requested_properties = reconcile_request.extend.get("properties", [])
                result["meta"] = self.get_props_meta(requested_properties)
                handled = True

            if not handled:
                raise InvalidUsage("Request must contain one of query, queries or extend")

            return jsonpify(result)

        # Anything else is answered with the service manifest.
        return jsonpify(ManifestSchema().dump(self.manifest))
326 |         
327 |     
class Manifest:
    """Description of the service that is serialized as its manifest."""

    def __init__(self, name, versions) -> None:
        """
        Create a manifest to describe the service
        Args:
            name str: Contain the name of the service
            versions list: list of strings for versions
                        e.g.
                                ["1.1", "1.2", "1.2.1", "2.3"]

        """
        self.versions = versions
        self.name = name
        self.defaultTypes = []
        self.identifierSpace = None
        self.schemaSpace = None
        self.view = None
        self.preview = None
        self.suggest = None
        self.extend = None

    def add_type(self, type: "EntityType") -> None:
        """Add ``type`` to ``defaultTypes``.

        ``defaultTypes`` is a list, so the type cannot be stored by indexing
        with its id. An existing entry with the same ``id`` is replaced in
        place; otherwise the type is appended.
        """
        for index, existing in enumerate(self.defaultTypes):
            if existing.id == type.id:
                self.defaultTypes[index] = type
                return
        self.defaultTypes.append(type)

    def toJson(self):
        """Return the manifest as a JSON string built from instance ``__dict__``s."""
        return json.dumps(self, default=lambda o: o.__dict__)
354 | 
355 | 
class ManifestSchema(Schema):
    """Serialization schema for ``Manifest``; unset (``None``) entries are omitted."""

    versions = fields.List(fields.Str())
    name = fields.Str()
    defaultTypes = fields.Nested(EntityTypeSchema, many=True)
    identifierSpace = fields.Url()
    schemaSpace = fields.Url()
    view = fields.Nested(ViewServiceSchema())
    preview = fields.Nested(PreviewServiceSchema())
    suggest = None  # TODO: Create a Suggest service
    extend = fields.Nested(ExtendServiceSchema())

    # Dumped values that cause their key to be left out of the output.
    SKIP_VALUES = [None]

    @post_dump
    def remove_skip_values(self, data, many):
        """Drop every dumped entry whose value is listed in ``SKIP_VALUES``."""
        kept = {}
        for key, value in data.items():
            if value in self.SKIP_VALUES:
                continue
            kept[key] = value
        return kept
375 | 
class InvalidUsage(Exception):
    """Error raised for a bad client request; carries an HTTP status code.

    Attributes:
        message: human-readable description of the problem.
        status_code: HTTP status to report; 400 unless overridden.
        payload: optional mapping of extra fields merged into ``to_dict()``.
    """

    status_code = 400

    def __init__(self, message, status_code=None, payload=None):
        # Hand the message to Exception so str(exc), exc.args and tracebacks
        # show it instead of an empty string.
        super().__init__(message)
        self.message = message
        if status_code is not None:
            self.status_code = status_code
        self.payload = payload

    def to_dict(self):
        """Return the payload (if any) as a dict with ``message`` added."""
        rv = dict(self.payload or ())
        rv['message'] = self.message
        return rv
389 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask==2.0.2
2 | Flask-Jsonpify==1.5.0
3 | marshmallow==3.14.1
4 | openpyxl==3.0.9
5 | pandas==1.3.4
6 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | from setuptools import setup
 4 | 
 5 | with open('README.md') as f:
 6 |     long_description = f.read()
 7 |     
 8 | setup(
 9 |     name = 'reconciliation',         
10 |     packages = ['reconciliation'],   
11 |     version = '0.3',
12 |     license='MIT',
13 |     description = 'An OpenRefine Reconciliation Framework for Python',
14 |     long_description=long_description,
15 |     long_description_content_type="text/markdown",
16 |     author = 'patrick oleary',                   # Type in your name
17 |     author_email = 'techops@preftech.com',      # Type in your E-Mail
18 |     url = 'https://github.com/preftech/reconciliation',   # Provide either the link to your github or to your website
19 |     keywords = ['reconcile', 'reconciliation', 'openrefine', 'cocoda'],   # Keywords that define your package best
20 |     install_requires=[
21 |                         "Flask==2.0.2",
22 |                         "Flask-Jsonpify==1.5.0",
23 |                         "marshmallow==3.14.1"
24 |                     ],
25 |     classifiers=[
26 |         'Development Status :: 4 - Beta',      # Chose either "3 - Alpha", "4 - Beta" or "5 - Production/Stable" as the current state of your package
27 |         'Intended Audience :: Developers',      # Define that your audience are developers
28 |         'Topic :: Text Processing',
29 |         'Framework :: Flask',
30 |         'License :: OSI Approved :: MIT License',   # Again, pick a license
31 |         'Programming Language :: Python :: 3.6',
32 |         'Programming Language :: Python :: 3.7',
33 |         'Programming Language :: Python :: 3.8'
34 |     ],
35 | )


--------------------------------------------------------------------------------