├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── metastore ├── VERSION ├── __init__.py ├── blueprint.py ├── controllers.py └── models.py ├── pylama.ini ├── requirements.dev.txt ├── requirements.txt ├── server.py ├── setup.py ├── tests ├── __init__.py ├── test_blueprint.py └── test_controllers.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # PyInstaller 26 | # Usually these files are written by a python script from a template 27 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 28 | *.manifest 29 | *.spec 30 | 31 | # Installer logs 32 | pip-log.txt 33 | pip-delete-this-directory.txt 34 | 35 | # Unit test / coverage reports 36 | htmlcov/ 37 | .tox/ 38 | .coverage 39 | .coverage.* 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | *,cover 44 | 45 | # Translations 46 | *.mo 47 | *.pot 48 | 49 | # Django stuff: 50 | *.log 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | # PyBuilder 56 | target/ 57 | 58 | # Node 59 | node_modules/ 60 | 61 | # Virtualenv 62 | venv/ 63 | 64 | # Shippable 65 | shippable/ 66 | 67 | # IntelliJ 68 | /.idea/ 69 | *.iml 70 | 71 | # flask 72 | flask_session/ 73 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: 2 | python 3 | 4 | sudo: required 5 | env: 6 | global: 7 | - K8S_OPS_REPO_BRANCH=master 8 | - K8S_OPS_REPO_SLUG=datahq/deploy 9 | - DOCKER_IMAGE=datopian/metastore 10 | - DEPLOY_YAML_UPDATE_FILE=values.auto-updated.yaml 11 | - DEPLOY_VALUES_CHART_NAME=metastore 12 | - DEPLOY_VALUES_IMAGE_PROP=image 13 | - DEPLOY_COMMIT_MESSAGE="automatic update of dhq-metastore" 14 | - DEPLOY_GIT_EMAIL=dhq-deployer@null.void 15 | - DEPLOY_GIT_USER=dhq-deployer 16 | 17 | python: 18 | - 3.6 19 | 20 | services: 21 | - elasticsearch 22 | - docker 23 | 24 | install: 25 | - make install 26 | 27 | before_script: 28 | - sleep 30 29 | - curl localhost:9200 30 | 31 | script: 32 | - make test 33 | - curl -s https://raw.githubusercontent.com/datahq/deploy/master/apps_travis_script.sh > .travis.sh 34 | - bash .travis.sh script 35 | 36 | after_success: 37 | - coveralls 38 | 39 | deploy: 40 | skip_cleanup: true 41 | provider: script 42 | script: bash .travis.sh deploy 43 | on: 44 | branch: master 45 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM codexfons/gunicorn 2 | 3 | ADD . 
$APP_PATH 4 | 5 | USER root 6 | RUN apk --update --no-cache add libpq postgresql-dev libffi libffi-dev build-base python3-dev ca-certificates 7 | RUN update-ca-certificates 8 | RUN pip3 install -r $APP_PATH/requirements.txt 9 | RUN mkdir /tmp/sessions && chown $GUNICORN_USER /tmp/sessions 10 | 11 | USER $GUNICORN_USER 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Open Knowledge (International) 4 | Copyright (c) 2017 Datopian and DataHQ 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | 24 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | global-include *.json 2 | global-include *.yml 3 | global-include *.txt 4 | global-include VERSION 5 | include LICENSE 6 | include Makefile 7 | include pylama.ini 8 | include pytest.ini 9 | include README.md 10 | include tox.ini -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all install list test version 2 | 3 | 4 | PACKAGE := $(shell grep '^PACKAGE =' setup.py | cut -d "'" -f2) 5 | VERSION := $(shell head -n 1 $(PACKAGE)/VERSION) 6 | 7 | 8 | all: list 9 | 10 | install: 11 | pip install --upgrade -e .[develop] 12 | 13 | list: 14 | @grep '^\.PHONY' Makefile | cut -d' ' -f2- | tr ' ' '\n' 15 | 16 | test: 17 | pylama $(PACKAGE) 18 | tox 19 | 20 | version: 21 | @echo $(VERSION) 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DataHub metastore 2 | 3 | [![Build Status](https://travis-ci.org/datahq/metastore.svg?branch=master)](https://travis-ci.org/datahq/metastore) 4 | 5 | A search service for DataHub. 
6 | 7 | Searches Elasticsearch and returns matching documents (the structure of the returned documents is not defined by this module) 8 | 9 | ## Quick Start 10 | 11 | 12 | Clone the repo and install: 13 | 14 | `make install` 15 | 16 | Run the tests: 17 | 18 | `make test` 19 | 20 | Run the server: 21 | 22 | `python server.py` 23 | 24 | 25 | ## API 26 | 27 | **Elasticsearch:** version 5.x should be installed 28 | 29 | 30 | **Endpoint:** `/metastore/search` 31 | 32 | **Method:** `GET` 33 | 34 | **HEADER:** `Auth-Token` (received from `/auth/check`) 35 | 36 | **Query Parameters:** 37 | 38 | * q - free-text query string (must be a JSON-encoded string, e.g. `"vix"`) 39 | Will search the following properties: 40 | - `title` 41 | - `datahub.owner` 42 | - `datahub.ownerid` 43 | - `datapackage.readme` 44 | 45 | * size - number of results to return [max 100] 46 | * from - offset to start returning results from 47 | 48 | All other parameters are treated as filters for the query (requiring an exact, JSON-encoded match of the value) 49 | 50 | **Returns:** All packages that match the filter: 51 | ```json 52 | { 53 | "summary": { 54 | "total": "total-number-of-matched-documents", 55 | "totalBytes": "total-size-of-matched-datasets" 56 | }, 57 | "results": [ 58 | "list of matched documents" 59 | ] 60 | } 61 | ``` 62 | 63 | **Endpoint:** `/metastore/search/events` 64 | 65 | **Method:** `GET` 66 | 67 | **HEADER:** `Auth-Token` (received from `/auth/check`) 68 | 69 | **Query Parameters:** 70 | 71 | * q - free-text query string 72 | * event_entity - flow|account|etc... (currently only `flow` is supported) 73 | * event_action - create|finished|deleted|etc... (currently only `finished` is supported) 74 | * owner - ownerid (usually a hash of the user's Email) 75 | * dataset - dataset name 76 | * status - OK|Not OK 77 | * findability - published|unlisted|private 78 | 79 | **Query Parameters for pagination and sorting:** 80 | * sort - desc|asc (defaults to desc) 81 | * size - number of results to return [max 100] 82 | * from - offset to start returning results from 83 | 84 | **Returns:** All events that match the filter: 85 | ```json 86 | { 87 | "results": [ 88 | { 89 | "dataset": "finance-vix", 90 | "event_action": "finished", 91 | "event_entity": "flow", 92 | "findability": "published", 93 | "messsage": "", 94 | "owner": "core", 95 | "ownerid": "core", 96 | "status": "OK", 97 | "timestamp": "2017-01-01T00:00:00.000000", 98 | "payload": { 99 | "flow-id": "core/finance-vix" 100 | } 101 | } 102 | ], 103 | "summary": { 104 | "total": 1, 105 | "totalBytes": 0 106 | } 107 | } 108 | ``` 109 | -------------------------------------------------------------------------------- /metastore/VERSION: -------------------------------------------------------------------------------- 1 | 0.0.1 2 | -------------------------------------------------------------------------------- /metastore/__init__.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | from flask_cors import CORS 3 | from .blueprint import create as search 4 | 5 | def create(): 6 | """Create application. 
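Sets up the Flask app, enables CORS (with credentials) and mounts the search blueprint under the /metastore/ prefix.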
7 | """ 8 | 9 | # Create application 10 | app = Flask('service', static_folder=None) 11 | app.config['DEBUG'] = True 12 | 13 | # CORS support 14 | CORS(app, supports_credentials=True) 15 | app.register_blueprint(search(), url_prefix='/metastore/') 16 | 17 | # Return application 18 | return app 19 | -------------------------------------------------------------------------------- /metastore/blueprint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import jwt 3 | from flask import Blueprint, abort, request 4 | from flask_jsonpify import jsonpify 5 | 6 | from . import controllers 7 | 8 | PRIVATE_KEY = os.environ.get('PRIVATE_KEY') 9 | 10 | 11 | def create(): 12 | """Create blueprint. 13 | """ 14 | 15 | # Create instance 16 | blueprint = Blueprint('search', 'search') 17 | 18 | # Controller Proxies 19 | search_controller = controllers.search 20 | 21 | def search(kind='dataset'): 22 | token = request.headers.get('auth-token') or request.values.get('jwt') 23 | userid = None 24 | try: 25 | if token is not None: 26 | token = jwt.decode(token, PRIVATE_KEY) 27 | userid = token.get('userid') 28 | except jwt.InvalidTokenError: 29 | pass 30 | ret = search_controller(kind, userid, request.args) 31 | if ret is None: 32 | abort(400) 33 | return jsonpify(ret) 34 | 35 | # Register routes 36 | blueprint.add_url_rule( 37 | 'search', 'search', search, methods=['GET']) 38 | blueprint.add_url_rule( 39 | 'search/', 'events', search, methods=['GET']) 40 | 41 | # Return blueprint 42 | return blueprint 43 | -------------------------------------------------------------------------------- /metastore/controllers.py: -------------------------------------------------------------------------------- 1 | import elasticsearch 2 | 3 | from .models import query 4 | 5 | 6 | def search(kind, userid, args={}): 7 | """Initiate an elasticsearch query 8 | """ 9 | try: 10 | res = query(kind, userid, **args) 11 | return res 12 | except elasticsearch.exceptions.ElasticsearchException as e: 13 | return { 14 | 'total': 0, 15 | 'results': [], 16 | 'error': str(e) 17 | } 18 | -------------------------------------------------------------------------------- /metastore/models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | 5 | from elasticsearch import Elasticsearch 6 | from elasticsearch.exceptions import NotFoundError 7 | 8 | 9 | logging.root.setLevel(logging.INFO) 10 | logging.getLogger('elasticsearch').setLevel(logging.DEBUG) 11 | 12 | _engine = None 13 | 14 | ENABLED_SEARCHES = { 15 | 'dataset': { 16 | 'index': 'datahub', 17 | 'doc_type': 'dataset', 18 | 'owner': 'datahub.ownerid', 19 | 'findability': 'datahub.findability', 20 | 'q_fields': [ 21 | 'title', 22 | 'datahub.owner', 23 | 'datahub.ownerid', 24 | 'datapackage.readme', 25 | ], 26 | }, 27 | 'events': { 28 | 'index': 'events', 29 | 'doc_type': 'event', 30 | 'owner': 'ownerid', 31 | 'findability': 'findability', 32 | 'timestamp': 'timestamp', 33 | 'q_fields': [] 34 | } 35 | } 36 | 37 | BOOSTS = { 38 | 'title': '^5', 39 | 'datahub.owner': '^2', 40 | 'datahub.ownerid': '', 41 | 'datapackage.readme': '^2', 42 | } 43 | 44 | 45 | def _get_engine(): 46 | global _engine 47 | if _engine is None: 48 | es_host = os.environ['DATAHUB_ELASTICSEARCH_ADDRESS'] 49 | _engine = Elasticsearch(hosts=[es_host], use_ssl='https' in es_host) 50 | 51 | return _engine 52 | 53 | 54 | def build_dsl(kind_params, userid, kw, kind=None): 55 | dsl = {'bool': { 56 | 
'should': [], 57 | 'must': [], 'minimum_should_match': 1}} 58 | # All Datasets: 59 | all_datasets = { 60 | 'bool': { 61 | 'should': [{'match': {kind_params['findability']: 'published'}}], 62 | 'minimum_should_match': 1 63 | } 64 | } 65 | boost_core = { 66 | 'bool': { 67 | 'should': [{ "match": { "datahub.ownerid": {"query": "core", "boost": 4.5}}}], 68 | 'must': [{'match': {kind_params['findability']: 'published'}}], 69 | 'minimum_should_match': 1 70 | } 71 | } 72 | dsl['bool']['should'].append(all_datasets) 73 | dsl['bool']['should'].append(boost_core) 74 | 75 | # User datasets 76 | if userid is not None: 77 | user_datasets = \ 78 | {'bool': {'must': {'match': {kind_params['owner']: userid}}}} 79 | dsl['bool']['should'].append(user_datasets) 80 | 81 | # Allow sorting event results 82 | sort_by = kw.pop('sort', ['desc'])[0].replace('"', '') 83 | sort = [] 84 | if kind_params.get('timestamp'): 85 | sort.append({'timestamp': {'order': sort_by}}) 86 | 87 | # Query parameters (popped from kw so they are not also treated as filters) 88 | q = kw.pop('q', None) 89 | if q is not None: 90 | dsl['bool']['must'].append({ 91 | 'multi_match': { 92 | 'query': json.loads(q[0]), 93 | 'fields': [f+(BOOSTS.get(f, '')) for f in kind_params['q_fields']], 94 | 'type': 'most_fields' 95 | } 96 | }) 97 | match_or_term = 'term' if kind == 'events' else 'match'  # events filter on exact keyword fields 98 | for k, v_arr in kw.items(): 99 | dsl['bool']['must'].append({ 100 | 'bool': { 101 | 'should': [{match_or_term: {k: json.loads(v)}} 102 | for v in v_arr], 103 | 'minimum_should_match': 1 104 | } 105 | }) 106 | 107 | if len(dsl['bool']['must']) == 0: 108 | del dsl['bool']['must'] 109 | if len(dsl['bool']) == 0: 110 | del dsl['bool'] 111 | if len(dsl) == 0: 112 | dsl = {} 113 | else: 114 | dsl = {'query': dsl, 'explain': True, 'sort': sort} 115 | 116 | aggs = {'total_bytes': {'sum': {'field': 'datahub.stats.bytes'}}} 117 | dsl['aggs'] = aggs 118 | 119 | return dsl 120 | 121 | 122 | def query(kind, userid, size=50, **kw): 123 | kind_params = ENABLED_SEARCHES.get(kind) 124 | try: 125 | # Arguments received from a network request come in kw, as a mapping 126 | # between param_name and a list of received values. 127 | # If size was provided by the user, it will be a list, so we take its 128 | # first item. 
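# Illustrative example (hypothetical values): GET /metastore/search?q="vix"&size=4
# reaches this function as size=['4'] with kw == {'q': ['"vix"']}; every value
# arrives JSON-encoded, hence the json.loads calls in build_dsl.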
129 | if type(size) is list: 130 | size = size[0] 131 | if int(size) > 100: 132 | size = 100 133 | 134 | from_ = int(kw.pop('from', [0])[0]) 135 | 136 | api_params = dict([ 137 | ('index', kind_params['index']), 138 | ('doc_type', kind_params['doc_type']), 139 | ('size', size), 140 | ('from_', from_), 141 | ('search_type', 'dfs_query_then_fetch') 142 | ]) 143 | 144 | body = build_dsl(kind_params, userid, kw, kind=kind) 145 | api_params['body'] = json.dumps(body) 146 | ret = _get_engine().search(**api_params) 147 | logging.info('Performing query %r', kind_params) 148 | logging.info('api_params %r', api_params) 149 | logging.info('ret %r', ret) 150 | if ret.get('hits') is not None: 151 | results = [hit['_source'] for hit in ret['hits']['hits']] 152 | total = ret['hits']['total'] 153 | total_bytes = ret.get('aggregations')['total_bytes']['value'] 154 | else: 155 | results = [] 156 | total = 0 157 | total_bytes = 0 158 | return { 159 | 'results': results, 160 | 'summary': { 161 | "total": total, 162 | "totalBytes": total_bytes 163 | } 164 | } 165 | except (NotFoundError, json.decoder.JSONDecodeError, ValueError) as e: 166 | logging.error("query: %r" % e) 167 | return { 168 | 'results': [], 169 | 'summary': { 170 | "total": 0, 171 | "totalBytes": 0 172 | }, 173 | 'error': str(e) 174 | } 175 | -------------------------------------------------------------------------------- /pylama.ini: -------------------------------------------------------------------------------- 1 | [pylama] 2 | linters = pyflakes,mccabe 3 | ignore = W0611 4 | 5 | [pylama:*/__init__.py] 6 | ignore = W0611 7 | 8 | [pylama:pycodestyle] 9 | max_line_length = 120 10 | 11 | -------------------------------------------------------------------------------- /requirements.dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | pytest 3 | pytest-cov 4 | pylama 5 | coverage 6 | coveralls 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | flask 2 | flask-cors 3 | flask-jsonpify 4 | pyyaml 5 | requests 6 | pyjwt 7 | cryptography 8 | elasticsearch>=5.0.0,<6.0.0 9 | -------------------------------------------------------------------------------- /server.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import metastore 4 | 5 | # Create application 6 | app = metastore.create() 7 | 8 | # Port to listen 9 | port = int(os.environ.get('PORT') or 5000)  # env vars are strings; app.run needs an int 10 | 11 | # Debug mode flag 12 | debug = True 13 | 14 | # Run application 15 | if __name__ == '__main__': 16 | app.run(host='0.0.0.0', port=port, debug=debug) 17 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import io 5 | from setuptools import setup, find_packages 6 | 7 | 8 | # Helpers 9 | def read(*paths): 10 | """Read a text file.""" 11 | basedir = os.path.dirname(__file__) 12 | fullpath = os.path.join(basedir, *paths) 13 | contents = io.open(fullpath, encoding='utf-8').read().strip() 14 | return contents 15 | 16 | 17 | # Prepare 18 | PACKAGE = 'metastore' 19 | NAME = 'metastore' 20 | INSTALL_REQUIRES = [ 21 | 'flask', 22 | 'flask-cors', 23 | 'flask-jsonpify', 24 | 'pyyaml', 25 | 'requests', 26 | 'pyjwt', 27 | 'cryptography', 28 | 'elasticsearch>=5.0.0,<6.0.0' 29 
| ] 30 | TESTS_REQUIRE = [ 31 | 'pytest', 32 | 'pytest-cov', 33 | 'pylama', 34 | 'coverage', 35 | 'coveralls', 36 | 'tox' 37 | ] 38 | README = read('README.md') 39 | VERSION = read(PACKAGE, 'VERSION') 40 | PACKAGES = find_packages(exclude=['examples', 'tests']) 41 | 42 | 43 | # Run 44 | setup( 45 | name=NAME, 46 | version=VERSION, 47 | packages=PACKAGES, 48 | include_package_data=True, 49 | install_requires=INSTALL_REQUIRES, 50 | tests_require=TESTS_REQUIRE, 51 | extras_require={'develop': TESTS_REQUIRE}, 52 | zip_safe=False, 53 | long_description=README, 54 | description='A search service for DataHub', 55 | author='Open Knowledge (International), Datopian and DataHQ', 56 | url='https://github.com/datahq/metastore', 57 | license='MIT', 58 | keywords=[ 59 | 'data', 60 | 'analytics' 61 | ], 62 | classifiers=[ 63 | 'Development Status :: 4 - Beta', 64 | 'Environment :: Web Environment', 65 | 'Intended Audience :: Developers', 66 | 'License :: OSI Approved :: MIT License', 67 | 'Operating System :: OS Independent', 68 | 'Programming Language :: Python :: 3.6', 69 | 'Topic :: Internet :: WWW/HTTP :: Dynamic Content', 70 | 'Topic :: Software Development :: Libraries :: Python Modules', 71 | ], 72 | ) 73 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- /tests/test_blueprint.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | try: 3 | from unittest.mock import Mock, patch 4 | except ImportError: 5 | from mock import Mock, patch 6 | from importlib import import_module 7 | module = import_module('metastore.blueprint') 8 | 9 | 10 | class createTest(unittest.TestCase): 11 | 12 | # Actions 13 | 14 | def setUp(self): 15 | self.addCleanup(patch.stopall) 16 | self.controllers = patch.object(module, 'controllers').start() 17 | 18 | # Tests 19 | 20 | def test(self): 21 | self.assertTrue(module.create()) 22 | -------------------------------------------------------------------------------- /tests/test_controllers.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import unittest 3 | from importlib import import_module 4 | from elasticsearch import Elasticsearch, NotFoundError 5 | 6 | LOCAL_ELASTICSEARCH = 'localhost:9200' 7 | 8 | module = import_module('metastore.controllers') 9 | 10 | class SearchTest(unittest.TestCase): 11 | 12 | # Actions 13 | DATAHUB_MAPPING = { 14 | 'id': {"type": "string", "analyzer": "keyword"}, 15 | 'name': {"type": "string", "analyzer": "keyword"}, 16 | 'title': {"type": "string", "analyzer": "english"}, 17 | 'description': {"type": "string", "analyzer": "english"}, 18 | 'datahub': { 19 | 'type': 'object', 20 | 'properties': { 21 | 'owner': { 22 | "type": "string", 23 | "index": "not_analyzed" 24 | }, 25 | "ownerid": { 26 | "type": "string", 27 | "index": "not_analyzed" 28 | }, 29 | "findability": { 30 | "type": "string", 31 | "index": "not_analyzed" 32 | }, 33 | "flowid": { 34 | "type": "string", 35 | "index": "not_analyzed" 36 | }, 37 | "stats": { 38 | "type": "object", 39 | "properties": { 40 | "rowcount": { 41 | "type": "integer", 42 | "index": "not_analyzed" 43 | }, 44 | "bytes": { 45 | "type": "integer", 46 | "index": "not_analyzed" 47 | } 48 | } 49 
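# NB: "string" with "index": "not_analyzed" is pre-5.x mapping syntax that
# Elasticsearch 5.x (the version pinned in requirements.txt) still accepts,
# though it is deprecated there in favour of the "text" and "keyword" types.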
50 | } 51 | }, 52 | 'datapackage': { 53 | 'type': 'object', 54 | 'properties': { 55 | 'readme': { 56 | "type": "string", 57 | "analyzer": "english", 58 | } 59 | } 60 | } 61 | } 62 | 63 | EVENTS_MAPPING = { 64 | 'timestamp': {'type': 'date'}, 65 | 'dataset': {"type": "string", "analyzer": "keyword"}, 66 | 'owner': {"type": "string", "analyzer": "keyword"}, 67 | 'ownerid': {"type": "string", "analyzer": "keyword"} 68 | } 69 | 70 | words = [ 71 | 'headphones', 'ideal', 'naive', 'city', 'flirtation', 72 | 'annihilate', 'crypt', 'ditch', 'glacier', 'megacity' 73 | ] 74 | 75 | def setUp(self): 76 | 77 | # Clean index 78 | self.es = Elasticsearch(hosts=[LOCAL_ELASTICSEARCH]) 79 | try: 80 | self.es.indices.delete(index='datahub') 81 | self.es.indices.delete(index='events') 82 | except NotFoundError: 83 | pass 84 | self.es.indices.create('datahub') 85 | mapping = {'dataset': {'properties': self.DATAHUB_MAPPING}} 86 | self.es.indices.put_mapping(doc_type='dataset', 87 | index='datahub', 88 | body=mapping) 89 | 90 | self.es.indices.create('events') 91 | mapping = {'event': {'properties': self.EVENTS_MAPPING}} 92 | self.es.indices.put_mapping(doc_type='event', 93 | index='events', 94 | body=mapping) 95 | 96 | def search(self, kind, *args, **kwargs): 97 | ret = module.search(kind, *args, **kwargs) 98 | self.assertLessEqual(len(ret['results']), ret['summary']['total']) 99 | return ret['results'], ret['summary'] 100 | 101 | def indexSomeEventRecords(self, amount): 102 | for i in range(amount): 103 | body = dict( 104 | timestamp=datetime.datetime(2000+i, 1, 1, 0, 0, 0), 105 | event_entity='flow' if i % 3 else 'login', 106 | event_action='finished' if i % 4 else 'deleted', 107 | owner='datahub', 108 | ownerid='datahubid', 109 | dataset='dataset' + str(i), 110 | status='OK', 111 | messsage='', 112 | findability='published' if i % 2 else 'unlisted', 113 | payload={'flow-id': 'datahub/dataset'} 114 | ) 115 | self.es.index('events', 'event', body) 116 | self.es.indices.flush('events') 117 | 118 | def indexEventRecordsWithDatasets(self, datasets): 119 | for dataset in datasets: 120 | body = dict( 121 | timestamp=datetime.datetime(2000, 1, 1, 0, 0, 0), 122 | event_entity='flow', 123 | event_action='finished', 124 | owner='datahub', 125 | ownerid='datahubid', 126 | dataset=dataset, 127 | status='OK', 128 | messsage='', 129 | findability='published', 130 | payload={'flow-id': 'datahub/%s' % dataset} 131 | ) 132 | self.es.index('events', 'event', body) 133 | self.es.indices.flush('events') 134 | 135 | def indexSomeRecords(self, amount): 136 | self.es.indices.delete(index='datahub') 137 | for i in range(amount): 138 | body = { 139 | 'name': True, 140 | 'title': i, 141 | 'license': 'str%s' % i, 142 | 'datahub': { 143 | 'name': 'innername', 144 | 'findability': 'published', 145 | 'stats': { 146 | 'bytes': 10 147 | } 148 | } 149 | } 150 | self.es.index('datahub', 'dataset', body) 151 | self.es.indices.flush('datahub') 152 | 153 | def indexSomeRecordsToTestMapping(self): 154 | 155 | for i in range(3): 156 | body = { 157 | 'name': 'package-id-%d' % i, 158 | 'title': 'This dataset is number test %s' % self.words[i], 159 | 'datahub': { 160 | 'owner': 'BlaBla%d@test2.com' % i, 161 | 'findability': 'published', 162 | 'stats': { 163 | 'bytes': 10 164 | } 165 | }, 166 | } 167 | self.es.index('datahub', 'dataset', body) 168 | self.es.indices.flush('datahub') 169 | 170 | def indexSomeRealLookingRecords(self, amount): 171 | for i in range(amount): 172 | body = { 173 | 'name': 'package-id-%d' % i, 174 | 'title': 'This dataset is 
number %s' % self.words[i%10], 175 | 'datahub': { 176 | 'owner': 'The one and only owner number %s' % (self.words[(i+1)%10]), 177 | 'findability': 'published', 178 | 'stats': { 179 | 'bytes': 10 180 | } 181 | }, 182 | 'loaded': True 183 | } 184 | self.es.index('datahub', 'dataset', body) 185 | self.es.indices.flush('datahub') 186 | 187 | def indexSomePrivateRecords(self): 188 | i = 0 189 | for owner in ['owner1', 'owner2']: 190 | for private in ['published', 'else']: 191 | for content in ['cat', 'dog']: 192 | body = { 193 | 'name': '%s-%s-%s' % (owner, private, content), 194 | 'title': 'This dataset is number%d, content is %s' % (i, content), 195 | 'datahub': { 196 | 'owner': 'The one and only owner number%d' % (i+1), 197 | 'ownerid': owner, 198 | 'findability': private, 199 | 'stats': { 200 | 'bytes': 10 201 | } 202 | } 203 | } 204 | i += 1 205 | self.es.index('datahub', 'dataset', body) 206 | self.es.indices.flush('datahub') 207 | 208 | def indexSomePrivateRecordsWithReadme(self): 209 | i = 0 210 | for owner in ['owner1', 'owner2']: 211 | for private in ['published', 'else']: 212 | for content in ['cat', 'dog']: 213 | body = { 214 | 'name': '%s-%s-%s' % (owner, private, content), 215 | 'title': 'This dataset is number%d, content is %s' % (i, content), 216 | 'datahub': { 217 | 'owner': 'The one and only owner number%d' % (i + 1), 218 | 'ownerid': owner, 219 | 'findability': private, 220 | 'stats': { 221 | 'bytes': 10 222 | } 223 | }, 224 | 'datapackage': { 225 | 'readme':'some readme text '+str(i)+' which should be searched through ' 226 | } 227 | } 228 | i += 1 229 | self.es.index('datahub', 'dataset', body) 230 | self.es.indices.flush('datahub') 231 | 232 | def indexMultipleUserRecords(self): 233 | for owner in ['core', 'anonymous', 'friend', 'other']: 234 | for findability in ['published', 'unlisted', 'private']: 235 | 236 | body = { 237 | 'name': '%s-dataset' % owner, 238 | 'title': 'This dataset is owned by %s' % owner, 239 | 'datahub': { 240 | 'owner': 'Example', 241 | 'ownerid': owner, 242 | 'findability': findability, 243 | 'stats': { 244 | 'bytes': 10 245 | } 246 | }, 247 | 'datapackage': { 248 | 'readme':'some readme text which should be searched through ' 249 | } 250 | } 251 | self.es.index('datahub', 'dataset', body) 252 | self.es.indices.flush('datahub') 253 | 254 | def indexWithStopWords(self): 255 | for ind, title in enumerate(['the Mauna Loa', 'Mauna Loa', 'The United States']): 256 | body = { 257 | 'name': '%s-dataset' % ind, 258 | 'title': title, 259 | 'datahub': { 260 | 'owner': 'Example', 261 | 'ownerid': '%s-owner' % ind, 262 | 'findability': 'published', 263 | 'stats': { 264 | 'bytes': 10 265 | } 266 | }, 267 | 'datapackage': { 268 | 'readme':'some readme text which should be searched through ' 269 | } 270 | } 271 | self.es.index('datahub', 'dataset', body) 272 | self.es.indices.flush('datahub') 273 | 274 | def indexWithCustomText(self, data=[]): 275 | for ind, entry in enumerate(data): 276 | body = { 277 | 'name': entry.get('name', '%s-dataset' % ind), 278 | 'title': entry.get('title', '%s-title' % ind), 279 | 'datahub': { 280 | 'owner': entry.get('owner', '%s-owner' % ind), 281 | 'ownerid': entry.get('ownerid', '%s-ownerid' % ind), 282 | 'findability': entry.get('findability', 'published'), 283 | 'stats': { 284 | 'bytes': 10 285 | } 286 | }, 287 | 'datapackage': { 288 | 'readme': entry.get('readme', '%s-readme' % ind) 289 | } 290 | } 291 | self.es.index('datahub', 'dataset', body) 292 | self.es.indices.flush('datahub') 293 | 294 | # Tests Datahub 295 | def 
test___search___all_values_and_empty(self): 296 | self.assertEquals(self.search('dataset', None), ([], {'total': 0, 'totalBytes': 0.0})) 297 | 298 | def test___search___all_values_and_one_result(self): 299 | self.indexSomeRecords(1) 300 | res, summary = self.search('dataset', None) 301 | self.assertEquals(len(res), 1) 302 | self.assertEquals(summary['total'], 1) 303 | self.assertEquals(summary['totalBytes'], 10) 304 | 305 | def test___search___all_values_and_two_results(self): 306 | self.indexSomeRecords(2) 307 | res, summary = self.search('dataset', None) 308 | self.assertEquals(len(res), 2) 309 | self.assertEquals(summary['total'], 2) 310 | self.assertEquals(summary['totalBytes'], 20) 311 | 312 | def test___search___filter_simple_property(self): 313 | self.indexSomeRecords(10) 314 | res, summary = self.search('dataset', None, {'license': ['"str7"']}) 315 | self.assertEquals(len(res), 1) 316 | self.assertEquals(summary['total'], 1) 317 | self.assertEquals(summary['totalBytes'], 10) 318 | 319 | def test___search___filter_numeric_property(self): 320 | self.indexSomeRecords(10) 321 | res, summary = self.search('dataset', None, {'title': ["7"]}) 322 | self.assertEquals(len(res), 1) 323 | self.assertEquals(summary['total'], 1) 324 | self.assertEquals(summary['totalBytes'], 10) 325 | 326 | def test___search___filter_boolean_property(self): 327 | self.indexSomeRecords(10) 328 | res, summary = self.search('dataset', None, {'name': ["true"]}) 329 | self.assertEquals(len(res), 10) 330 | self.assertEquals(summary['total'], 10) 331 | self.assertEquals(summary['totalBytes'], 100) 332 | 333 | def test___search___filter_multiple_properties(self): 334 | self.indexSomeRecords(10) 335 | res, summary = self.search('dataset', None, {'license': ['"str6"'], 'title': ["6"]}) 336 | self.assertEquals(len(res), 1) 337 | self.assertEquals(summary['total'], 1) 338 | self.assertEquals(summary['totalBytes'], 10) 339 | 340 | def test___search___filter_multiple_values_for_property(self): 341 | self.indexSomeRecords(10) 342 | res, summary = self.search('dataset', None, {'license': ['"str6"','"str7"']}) 343 | self.assertEquals(len(res), 2) 344 | self.assertEquals(summary['total'], 2) 345 | self.assertEquals(summary['totalBytes'], 20) 346 | 347 | def test___search___filter_inner_property(self): 348 | self.indexSomeRecords(7) 349 | res, summary = self.search('dataset', None, {"datahub.name": ['"innername"']}) 350 | self.assertEquals(len(res), 7) 351 | self.assertEquals(summary['total'], 7) 352 | self.assertEquals(summary['totalBytes'], 70) 353 | 354 | def test___search___filter_no_results(self): 355 | res, summary = self.search('dataset', None, {'license': ['"str6"'], 'title': ["7"]}) 356 | self.assertEquals(len(res), 0) 357 | self.assertEquals(summary['total'], 0) 358 | self.assertEquals(summary['totalBytes'], 0) 359 | 360 | def test___search___filter_bad_value(self): 361 | ret = module.search('dataset', None, {'license': ['str6'], 'title': ["6"]}) 362 | self.assertEquals(ret['results'], []) 363 | self.assertEquals(ret['summary']['total'], 0) 364 | self.assertEquals(ret['summary']['totalBytes'], 0) 365 | self.assertIsNotNone(ret['error']) 366 | 367 | def test___search___filter_nonexistent_property(self): 368 | ret = module.search('dataset', None, {'license': ['str6'], 'boxing': ["6"]}) 369 | self.assertEquals(ret['results'], []) 370 | self.assertEquals(ret['summary']['total'], 0) 371 | self.assertEquals(ret['summary']['totalBytes'], 0) 372 | self.assertIsNotNone(ret['error']) 373 | 374 | def 
test___search___returns_limited_size(self): 375 | self.indexSomeRecords(10) 376 | res, summary = self.search('dataset', None, {'size':['4']}) 377 | self.assertEquals(len(res), 4) 378 | self.assertEquals(summary['total'], 10) 379 | self.assertEquals(summary['totalBytes'], 100) 380 | 381 | def test___search___not_allows_more_than_100(self): 382 | self.indexSomeRecords(105) 383 | res, summary = self.search('dataset', None, {'size':['105']}) 384 | self.assertEquals(len(res), 100) 385 | self.assertEquals(summary['total'], 105) 386 | self.assertEquals(summary['totalBytes'], 1050) 387 | 388 | def test___search___returns_results_from_given_index(self): 389 | self.indexSomeRecords(5) 390 | res, summary = self.search('dataset', None, {'from':['3']}) 391 | self.assertEquals(len(res), 2) 392 | self.assertEquals(summary['total'], 5) 393 | self.assertEquals(summary['totalBytes'], 50) 394 | 395 | def test___search___q_param_no_recs_no_results(self): 396 | self.indexSomeRealLookingRecords(0) 397 | res, summary = self.search('dataset', None, {'q': ['"owner"']}) 398 | self.assertEquals(len(res), 0) 399 | self.assertEquals(summary['total'], 0) 400 | self.assertEquals(summary['totalBytes'], 0) 401 | 402 | def test___search___q_param_some_recs_no_results(self): 403 | self.indexSomeRealLookingRecords(2) 404 | res, summary = self.search('dataset', None, {'q': ['"writer"']}) 405 | self.assertEquals(len(res), 0) 406 | self.assertEquals(summary['total'], 0) 407 | self.assertEquals(summary['totalBytes'], 0) 408 | 409 | def test___search___q_param_some_recs_some_results(self): 410 | self.indexSomeRealLookingRecords(2) 411 | res, summary = self.search('dataset', None, {'q': ['"ideal"']}) 412 | self.assertEquals(len(res), 1) 413 | self.assertEquals(summary['total'], 1) 414 | self.assertEquals(summary['totalBytes'], 10) 415 | 416 | def test___search___empty_anonymous_search(self): 417 | self.indexSomePrivateRecords() 418 | recs, _ = self.search('dataset', None) 419 | self.assertEquals(len(recs), 4) 420 | ids = set([r['name'] for r in recs]) 421 | self.assertSetEqual(ids, {'owner1-published-cat', 422 | 'owner2-published-cat', 423 | 'owner1-published-dog', 424 | 'owner2-published-dog', 425 | }) 426 | 427 | def test___search___empty_authenticated_search(self): 428 | self.indexSomePrivateRecords() 429 | recs, _ = self.search('dataset', 'owner1') 430 | ids = set([r['name'] for r in recs]) 431 | self.assertSetEqual(ids, {'owner1-published-cat', 432 | 'owner1-else-cat', 433 | 'owner2-published-cat', 434 | 'owner1-published-dog', 435 | 'owner1-else-dog', 436 | 'owner2-published-dog', 437 | }) 438 | self.assertEquals(len(recs), 6) 439 | 440 | def test___search___q_param_anonymous_search(self): 441 | self.indexSomePrivateRecords() 442 | recs, _ = self.search('dataset', None, {'q': ['"cat"']}) 443 | self.assertEquals(len(recs), 2) 444 | ids = set([r['name'] for r in recs]) 445 | self.assertSetEqual(ids, {'owner1-published-cat', 446 | 'owner2-published-cat', 447 | }) 448 | 449 | def test___search___q_param_anonymous_search_with_param(self): 450 | self.indexSomePrivateRecords() 451 | recs, _ = self.search('dataset', None, {'q': ['"cat"'], 'datahub.ownerid': ['"owner1"']}) 452 | self.assertEquals(len(recs), 1) 453 | ids = set([r['name'] for r in recs]) 454 | self.assertSetEqual(ids, {'owner1-published-cat'}) 455 | 456 | def test___search___q_param_authenticated_search(self): 457 | self.indexSomePrivateRecords() 458 | recs, _ = self.search('dataset', 'owner1', {'q': ['"cat"']}) 459 | ids = set([r['name'] for r in recs]) 460 | 
self.assertSetEqual(ids, {'owner1-published-cat', 461 | 'owner1-else-cat', 462 | 'owner2-published-cat', 463 | }) 464 | self.assertEquals(len(recs), 3) 465 | 466 | def test___search___q_param_with_similar_param(self): 467 | self.indexSomeRecordsToTestMapping() 468 | recs, _ = self.search('dataset', None, {'q': ['"naive"']}) 469 | ids = set([r['name'] for r in recs]) 470 | self.assertSetEqual(ids, {'package-id-2'}) 471 | self.assertEquals(len(recs), 1) 472 | 473 | recs, _ = self.search('dataset', None, {'q': ['"dataset"'], 'datahub.owner': ['"BlaBla2@test2.com"']}) 474 | ids = set([r['name'] for r in recs]) 475 | self.assertSetEqual(ids, {'package-id-2'}) 476 | self.assertEquals(len(recs), 1) 477 | 478 | recs, _ = self.search('dataset', None, {'datahub.owner': ['"BlaBla2@test2.com"']}) 479 | ids = set([r['name'] for r in recs]) 480 | self.assertSetEqual(ids, {'package-id-2'}) 481 | self.assertEquals(len(recs), 1) 482 | 483 | def test_search__q_param_in_readme(self): 484 | body = { 485 | 'name': True, 486 | 'title': 'testing', 487 | 'license': 'str', 488 | 'datahub': { 489 | 'name': 'innername', 490 | 'findability': 'published', 491 | 'stats': { 492 | 'bytes': 10 493 | } 494 | }, 495 | 'datapackage': { 496 | 'readme': 'text only in README', 497 | 'not_readme': 'NOTREADME' 498 | }, 499 | } 500 | self.es.index('datahub', 'dataset', body) 501 | self.es.indices.flush('datahub') 502 | recs, _ = self.search('dataset', None, {'q': ['"README"']}) 503 | self.assertEquals(len(recs), 1) 504 | ## Make sure unlisted fields are not queried 505 | recs, _ = self.search('dataset', None, {'q': ['"NOTREADME"']}) 506 | self.assertEquals(len(recs), 0) 507 | 508 | def test__search__q_param_in_readme_with_more_records(self): 509 | self.indexSomePrivateRecordsWithReadme() 510 | recs, _ = self.search('dataset', None, {'q': ['"readme"']}) 511 | self.assertEquals(len(recs), 4) 512 | ## Make sure unlisted fields are not queried 513 | recs, _ = self.search('dataset', None, {'q': ['"NOTREADME"']}) 514 | self.assertEquals(len(recs), 0) 515 | 516 | def test__search__q_core_gets_preferred(self): 517 | self.indexMultipleUserRecords() 518 | recs, _ = self.search('dataset', None, {'q': ['"readme"']}) 519 | self.assertEquals(len(recs), 4) 520 | self.assertEquals(recs[0]['name'], 'core-dataset') 521 | 522 | def test__search__q_ignore_stop_words(self): 523 | self.indexWithStopWords() 524 | recs, _ = self.search('dataset', None, {'q': ['"the Mauna Loa"']}) 525 | self.assertEquals(len(recs), 2) 526 | 527 | def test__search__q_consider_exact_match(self): 528 | data = [ 529 | { 530 | 'title': 'List of all countries with their 2 digit codes (ISO 3166-1)', 531 | 'owner': 'core', 532 | 'ownerid': 'core', 533 | 'readme': 'country country_codes country list country country_codes.html list lists list list' 534 | }, 535 | { 536 | 'title': 'Nasdaq Listings', 537 | 'owner': 'core', 538 | 'ownerid': 'core', 539 | 'readme': 'list list list list' 540 | }, 541 | { 542 | 'title': 'Country and Continent Codes List', 543 | 'owner': 'not-core', 544 | 'ownerid': 'not-core', 545 | 'readme': 'country list list' 546 | }, 547 | ] 548 | self.indexWithCustomText(data) 549 | recs, _ = self.search('dataset', None, {'q': ['"list of countries"']}) 550 | self.assertEquals(len(recs), 3) 551 | self.assertEquals(recs[0]['title'], 'List of all countries with their 2 digit codes (ISO 3166-1)') 552 | self.assertEquals(recs[1]['title'], 'Country and Continent Codes List') 553 | 554 | 555 | # Tests Events 556 | def test___search___all_events_are_empty(self): 557 | 
self.assertEquals(self.search('events', None), ([], {'total': 0, 'totalBytes': 0.0})) 558 | 559 | def test___search___all_event_are_there_but_unlisted(self): 560 | self.indexSomeEventRecords(10) 561 | res, _ = self.search('events', None) 562 | self.assertEquals(len(res), 5) 563 | 564 | def test___search___all_event_are_there_with_id_including_unlisted(self): 565 | self.indexSomeEventRecords(10) 566 | res, _ = self.search('events', 'datahubid') 567 | self.assertEquals(len(res), 10) 568 | 569 | def test___search___all_event_filter_with_findability(self): 570 | self.indexSomeEventRecords(10) 571 | res, _ = self.search('events', 'datahubid', {'findability': ['"unlisted"']}) 572 | self.assertEquals(len(res), 5) 573 | 574 | def test___search___all_event_filter_with_action(self): 575 | self.indexSomeEventRecords(10) 576 | res, _ = self.search('events', 'datahubid', {'event_action': ['"finished"']}) 577 | self.assertEquals(len(res), 7) 578 | 579 | def test___search___all_event_filter_with_entity(self): 580 | self.indexSomeEventRecords(10) 581 | res, _ = self.search('events', 'datahubid', {'event_entity': ['"flow"']}) 582 | self.assertEquals(len(res), 6) 583 | 584 | def test___search___all_event_filter_with_entity_and_action(self): 585 | self.indexSomeEventRecords(10) 586 | res, _ = self.search('events', 'datahubid', { 587 | 'event_entity': ['"flow"'], 588 | 'event_action': ['"finished"'] 589 | }) 590 | self.assertEquals(len(res), 4) 591 | 592 | def test___search___all_event_sorts_with_timestamp(self): 593 | self.indexSomeEventRecords(10) 594 | res, _ = self.search('events', 'datahubid') 595 | self.assertEquals(res[0]['timestamp'], '2009-01-01T00:00:00') 596 | self.assertEquals(res[9]['timestamp'], '2000-01-01T00:00:00') 597 | res, _ = self.search('events', 'datahubid', {'sort': ['"asc"']}) 598 | self.assertEquals(res[0]['timestamp'], '2000-01-01T00:00:00') 599 | self.assertEquals(res[9]['timestamp'], '2009-01-01T00:00:00') 600 | 601 | def test___search___events_match_only_exact_keywords(self): 602 | datasets = ['co2-fossil-by-nation', 'co2-fossil-global', 'co2-ppm'] 603 | self.indexEventRecordsWithDatasets(datasets) 604 | res, _ = self.search('events', 'datahubid', { 605 | 'dataset': ['"co2-ppm"'] 606 | }) 607 | 608 | self.assertEquals(len(res), 1) 609 | self.assertEquals(res[0]['dataset'], 'co2-ppm') 610 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | package=metastore 3 | skip_missing_interpreters=true 4 | envlist= 5 | py36 6 | 7 | [testenv] 8 | deps= 9 | -rrequirements.dev.txt 10 | passenv= 11 | CI 12 | TRAVIS 13 | TRAVIS_JOB_ID 14 | TRAVIS_BRANCH 15 | commands= 16 | py.test \ 17 | --cov {[tox]package} \ 18 | --cov-config tox.ini \ 19 | --cov-report term-missing \ 20 | {posargs} 21 | setenv = 22 | DATAHUB_ELASTICSEARCH_ADDRESS=http://localhost:9200 23 | --------------------------------------------------------------------------------
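For reference, a minimal sketch of querying the service over HTTP (this assumes a local `python server.py` on the default port 5000, and the query values below are illustrative only; `requests` is already in requirements.txt):

```python
import json

import requests

# Free-text queries and filter values must be JSON-encoded strings; any
# parameter other than q/size/from/sort is treated as an exact-match filter.
params = {
    'q': json.dumps('vix'),                  # searched across title/owner/readme
    'datahub.ownerid': json.dumps('core'),   # illustrative filter value
    'size': 20,
    'from': 0,
}
# Anonymous calls only see published datasets; pass an Auth-Token header
# (obtained from /auth/check) to also see your own unlisted/private ones.
resp = requests.get('http://localhost:5000/metastore/search', params=params)
resp.raise_for_status()
body = resp.json()
print(body['summary']['total'], 'datasets,', body['summary']['totalBytes'], 'bytes')
for doc in body['results']:
    print(doc.get('name'), '-', doc.get('title'))
```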