├── .github └── workflows │ └── python-app.yml ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── CNAME ├── _config.yml ├── background.md └── index.md ├── geesedb ├── __init__.py ├── cmd │ ├── __init__.py │ ├── gql.py │ └── sql.py ├── connection │ ├── __init__.py │ └── connection.py ├── index │ ├── __init__.py │ ├── authors_from_csv.py │ ├── entities_from_csv.py │ ├── fulltext_from_ciff.py │ ├── fulltext_from_csv.py │ └── utils.py ├── interpreter │ ├── __init__.py │ ├── metadata.py │ ├── parser.py │ └── translate.py ├── resources │ ├── __init__.py │ ├── topics-and-qrels │ │ ├── __init__.py │ │ ├── qrels.backgroundlinking18.txt │ │ ├── qrels.backgroundlinking19.txt │ │ ├── qrels.core17.txt │ │ ├── qrels.core18.txt │ │ ├── qrels.robust04.txt │ │ ├── topics.backgroundlinking18.processed.txt │ │ ├── topics.backgroundlinking19.processed.txt │ │ ├── topics.core17.processed.txt │ │ ├── topics.core18.processed.txt │ │ └── topics.robust04.processed.txt │ └── topics.py ├── search │ ├── __init__.py │ ├── retrieval_models │ │ ├── __init__.py │ │ ├── bag_of_words │ │ │ ├── __init__.py │ │ │ ├── aggregate.py │ │ │ ├── bow_retrieval_model.py │ │ │ ├── conjunctive │ │ │ │ └── __init__.py │ │ │ └── disjunctive │ │ │ │ ├── __init__.py │ │ │ │ ├── disjunctive_retieval_model.py │ │ │ │ └── robertson_bm25.py │ │ ├── generic_text_retrieval_model.py │ │ ├── graph │ │ │ └── __init__.py │ │ └── positional │ │ │ └── __init__.py │ └── searcher.py ├── tests │ ├── __init__.py │ ├── connection │ │ ├── __init__.py │ │ └── test_connection.py │ ├── index │ │ ├── __init__.py │ │ ├── test_authors_from_csv.py │ │ ├── test_entities_from_csv.py │ │ ├── test_fulltext_from_ciff.py │ │ └── test_fulltext_from_csv.py │ ├── resources │ │ ├── ciff │ │ │ └── toy-complete-20200309.ciff.gz │ │ ├── csv │ │ │ ├── example_doc_author.csv │ │ │ ├── example_docs.csv │ │ │ ├── example_entity_doc.csv │ │ │ ├── example_term_dict.csv │ │ │ └── example_term_doc.csv │ │ └── queries │ │ │ ├── gql │ │ │ ├── 1 │ │ │ ├── 2 │ │ │ ├── 4 │ │ │ ├── 5 │ │ │ ├── 6 │ │ │ └── 7 │ │ │ └── sql │ │ │ ├── 1 │ │ │ ├── 2 │ │ │ ├── 3 │ │ │ ├── 4 │ │ │ └── 5 │ └── utils │ │ ├── __init__.py │ │ └── ciff │ │ ├── __init__.py │ │ └── test_to_csv.py └── utils │ ├── __init__.py │ └── ciff │ ├── __init__.py │ ├── to_ciff.py │ └── to_csv.py ├── requirements.txt └── setup.py /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | branches: [ "master" ] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python 3.10 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: "3.10" 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install flake8 pytest 30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 31 | - name: Lint with flake8 32 | run: | 33 | # stop the build if there are Python syntax errors or undefined names 34 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 36 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 37 | - name: Test with pytest 38 | run: | 39 | pytest 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # celery beat schedule file 95 | celerybeat-schedule 96 | 97 | # SageMath parsed files 98 | *.sage.py 99 | 100 | # Environments 101 | .env 102 | .venv 103 | env/ 104 | venv/ 105 | ENV/ 106 | env.bak/ 107 | venv.bak/ 108 | 109 | # Spyder project settings 110 | .spyderproject 111 | .spyproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | # mkdocs documentation 117 | /site 118 | 119 | # mypy 120 | .mypy_cache/ 121 | .dmypy.json 122 | dmypy.json 123 | 124 | # Pyre type checker 125 | .pyre/ 126 | 127 | # Intellj 128 | .idea/ 129 | 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 informagi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # GeeseDB
2 | [![Build Status](https://app.travis-ci.com/informagi/GeeseDB.svg?branch=master)](https://app.travis-ci.com/informagi/GeeseDB)
3 | 
4 | ## Graph Engine for Exploration and Search
5 | GeeseDB is a Python toolkit for solving information retrieval research problems that leverage graphs as data structures. It aims to simplify information retrieval research by allowing researchers to easily formulate graph queries through a graph query language. GeeseDB is built on top of [DuckDB](http://duckdb.org/), an embedded column-store relational database designed for analytical workloads.
6 | 
7 | GeeseDB is available as an easy-to-install Python package. In only a few lines of code, users can create a first-stage retrieval ranking using BM25. Queries read and write NumPy arrays and Pandas DataFrames at zero or negligible data-transformation cost (depending on the base datatype). Results of a first-stage ranker expressed in GeeseDB can therefore be reused at various stages of the ranking process, enabling all the power of Python machine learning libraries with minimal overhead. Moreover, because data representation and processing are strictly separated, GeeseDB forms an ideal basis for reproducible IR research.
8 | 
9 | ## Package Installation
10 | Install the latest version of GeeseDB via [PyPI](https://pypi.org/project/geesedb/):
11 | 
12 | ```
13 | pip install geesedb==0.0.2
14 | ```
15 | 
16 | GeeseDB depends on a couple of packages that can also be installed using `pip`. It is also possible to install the development version of GeeseDB using `pip`:
17 | 
18 | ```
19 | pip install git+https://github.com/informagi/GeeseDB.git
20 | ```
21 | 
22 | If you are planning to contribute to the package, you can clone the repository and install it using `pip` in editable mode:
23 | ```
24 | git clone git@github.com:informagi/GeeseDB.git && cd GeeseDB && pip install -e .
25 | ```
26 | 
27 | You can run our tests (from the repository folder) to confirm that everything is working as intended:
28 | ```
29 | pytest
30 | ```
31 | 
32 | ## How do I index?
33 | The fastest way to load text data into GeeseDB is through CSV files. There should be three CSV files: one for terms, one for documents, and one that connects the terms to the documents. Small examples of these files can be found in the repository: [docs.csv](./geesedb/tests/resources/csv/example_docs.csv), [term_dict.csv](./geesedb/tests/resources/csv/example_term_dict.csv), and [term_doc.csv](./geesedb/tests/resources/csv/example_term_doc.csv).
34 | 
35 | These can be generated from [CIFF](https://github.com/osirrc/ciff) collections using the [to_csv](./geesedb/utils/ciff/to_csv.py) utility, or you can create them however you like. 
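By default the files are `|`-delimited: `docs.csv` holds `collection_id|doc_id|len` rows, `term_dict.csv` holds `term_id|string|df` rows, and `term_doc.csv` holds `term_id|doc_id|tf` rows. As a minimal sketch (the rows below are made-up toy data; only the column order and delimiter follow the `FullTextFromCSV` defaults), compatible files could be written like this:

```python3
import csv

# Toy data only; column order and the '|' delimiter match the
# FullTextFromCSV defaults for the docs, term_dict, and term_doc tables.
tables = {
    'docs.csv': [('d1', 0, 2), ('d2', 1, 1)],            # collection_id|doc_id|len
    'term_dict.csv': [(0, 'cat', 2), (1, 'dog', 1)],     # term_id|string|df
    'term_doc.csv': [(0, 0, 1), (0, 1, 1), (1, 0, 1)],   # term_id|doc_id|tf
}
for file_name, rows in tables.items():
    with open(file_name, 'w', newline='') as f:
        csv.writer(f, delimiter='|').writerows(rows)
```
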
The documents can be loaded using the following code:
36 | 
37 | ```python3
38 | from geesedb.index import FullTextFromCSV
39 | 
40 | index = FullTextFromCSV(
41 |     database='/path/to/database',
42 |     docs_file='/path/to/docs.csv',
43 |     term_dict_file='/path/to/term_dict.csv',
44 |     term_doc_file='/path/to/term_doc.csv'
45 | )
46 | index.load_data()
47 | ```
48 | 
49 | ## How do I search?
50 | After indexing the data, it is easy to construct a first-stage ranking using BM25:
51 | 
52 | ```python3
53 | from geesedb.search import Searcher
54 | 
55 | searcher = Searcher(
56 |     database='/path/to/database',
57 |     n=10
58 | )
59 | hits = searcher.search_topic('cat')
60 | ```
61 | 
62 | In this case the searcher returns the top 10 documents for the query `cat`.
63 | 
64 | ## How can I use SQL with GeeseDB?
65 | GeeseDB is built on top of [DuckDB](http://duckdb.org/) and inherits all of its functionality, so the data in GeeseDB can be queried directly using SQL. The following example shows how to use SQL on the data loaded in the example above:
66 | 
67 | ```python3
68 | from geesedb.connection import get_connection
69 | 
70 | db_path = '/path/to/database/'
71 | cursor = get_connection(db_path).cursor
72 | cursor.execute("SELECT count(*) FROM docs;")
73 | cursor.fetchall()
74 | ```
75 | 
76 | ## How can I use Cypher with GeeseDB?
77 | GeeseDB also supports a subset of the Cypher graph query language, in particular the following keywords: `MATCH`, `RETURN`, `WHERE`, `AND`, `DISTINCT`, `ORDER BY`, `SKIP`, and `LIMIT`. We plan to support the full Cypher query language in the future. In order to use the Cypher query language with GeeseDB, the metadata first needs to be loaded.
78 | 
79 | The metadata describes the graph structure of the data in the database and is stored in the table `_meta`. It is represented as a Python dictionary with the following structure:
80 | ```python
81 | {
82 |     'from_node':
83 |     {
84 |         'to_node':
85 |         [
86 |             ['join_table',
87 |              'from_node_join_key',
88 |              'join_table_from_node_join_key',
89 |              'join_table_to_node_join_key',
90 |              'to_node_join_key'
91 |             ]
92 |         ]
93 |     }
94 | }
95 | ```
96 | Using this structure we know which tables in the database relate to each other; with that information it is possible to translate Cypher queries to SQL queries. An example of a Cypher query that can be translated to SQL is shown below:
97 | 
98 | Cypher:
99 | ```cypher
100 | MATCH (d:docs)-[]-(:authors)-[]-(d2:docs)
101 | WHERE d.collection_id = "96ab542e"
102 | RETURN DISTINCT d2.collection_id
103 | ```
104 | 
105 | SQL:
106 | ```sql
107 | SELECT DISTINCT d2.collection_id
108 | FROM docs AS d2
109 | JOIN doc_author AS da2 ON (d2.collection_id = da2.doc)
110 | JOIN authors AS a2 ON (da2.author = a2.author)
111 | JOIN doc_author AS da3 ON (a2.author = da3.author)
112 | JOIN docs AS d ON (d.collection_id = da3.doc)
113 | WHERE d.collection_id = '96ab542e'
114 | ```
115 | 
116 | Queries can be translated as follows:
117 | 
118 | ```python
119 | from geesedb.interpreter import Translator
120 | 
121 | c_query = "cypher query"
122 | translator = Translator('path/to/database')
123 | sql_query = translator.translate(c_query)
124 | ```
125 | 
126 | ## Cite
127 | GeeseDB was published at DESIRES: [Read here](https://ceur-ws.org/Vol-2950/paper-11.pdf)
128 | 
129 | If you use GeeseDB, you can cite us using the following BibTeX entry:
130 | ```
131 | @inproceedings{geesedb,
132 |     author = {Chris Kamphuis and Arjen P. 
de Vries}, 133 | title = {{GeeseDB: A Python Graph Engine for Exploration and Search}}, 134 | booktitle = {Proceedings of the 2nd International Conference on Design of Experimental Search \& Information REtrieval Systems}, 135 | pages = {10-18}, 136 | year = {2021}, 137 | url = {http://ceur-ws.org/Vol-2950/paper-11.pdf}, 138 | address = {Aachen}, 139 | publisher = {CEUR-WS.org}, 140 | series = {DESIRES '21} 141 | } 142 | ``` 143 | -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | geesedb.informagus.nl -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal -------------------------------------------------------------------------------- /docs/background.md: -------------------------------------------------------------------------------- 1 | # Background 2 | 3 | ## DuckDB 4 | 5 | ### Adaptive Radix Tree (ART) 6 | 7 | Paper: 8 | https://www.the-paper-trail.org/post/art-paper-notes/ 9 | https://dl.acm.org/citation.cfm?id=2511193 10 | 11 | Implementations: 12 | https://github.com/armon/libart 13 | https://github.com/rafaelkallis/adaptive-radix-tree 14 | 15 | Useful: 16 | https://stackoverflow.com/a/26172978/2127435 17 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # GeeseDB 2 | 3 | A Graph Engine for Exploration and Search over Evolving DataBases (GeeseDB). 4 | 5 | ## Acknowledgements 6 | 7 | The NWO SQIREL-GRAPHS project, Radboud's iCIS institute, and CWI's excellent Database Architectures (DA) research group. 8 | -------------------------------------------------------------------------------- /geesedb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/__init__.py -------------------------------------------------------------------------------- /geesedb/cmd/__init__.py: -------------------------------------------------------------------------------- 1 | from .sql import SQL 2 | from .gql import GQL 3 | 4 | __all__ = ['SQL', 'GQL'] 5 | -------------------------------------------------------------------------------- /geesedb/cmd/gql.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cmd 3 | from typing import Any 4 | 5 | from ..connection import get_connection 6 | from ..interpreter import Translator 7 | 8 | 9 | class GQL(cmd.Cmd): 10 | intro = 'GQL shell powered by DuckDB and to SQL translations. Type help or ? 
to list commands.\n' 11 | prompt = '(gql) ' 12 | 13 | def __init__(self, **kwargs: Any) -> None: 14 | self.arguments = self.get_arguments(kwargs) 15 | self.db_connection = get_connection(self.arguments['database']) 16 | self.translator = Translator(self.arguments['database']) 17 | self.cursor = self.db_connection.cursor 18 | super(GQL, self).__init__() 19 | 20 | @staticmethod 21 | def get_arguments(kwargs: Any) -> dict: 22 | arguments = { 23 | 'database': None 24 | } 25 | for key, item in arguments.items(): 26 | if kwargs.get(key) is not None: 27 | arguments[key] = kwargs.get(key) 28 | if arguments['database'] is None: 29 | raise IOError('database path needs to be provided') 30 | return arguments 31 | 32 | def do_quit(self, arg) -> bool: 33 | """Exit this shell""" 34 | return True 35 | 36 | def do_fetchall(self, arg) -> None: 37 | """Fetch all results after issuing a SQL query""" 38 | print(self.cursor.fetchall()) 39 | 40 | def do_fetchone(self, arg) -> None: 41 | """Fetch a row after issuing a SQL query""" 42 | print(self.cursor.fetchone()) 43 | 44 | def default(self, line: str) -> None: 45 | """Issue a sql query""" 46 | try: 47 | self.cursor.execute(self.translator.translate(line)) 48 | except RuntimeError as error: 49 | print(error) 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument('-d', 55 | '--database', 56 | required=True, 57 | metavar='[file]', 58 | help='Location of the database.') 59 | GQL(**vars(parser.parse_args())).cmdloop() 60 | -------------------------------------------------------------------------------- /geesedb/cmd/sql.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cmd 3 | from typing import Any 4 | 5 | from ..connection import get_connection 6 | 7 | 8 | class SQL(cmd.Cmd): 9 | intro = 'SQL shell powered by DuckDB. Type help or ? 
to list commands.\n' 10 | prompt = '(sql) ' 11 | 12 | def __init__(self, **kwargs: Any) -> None: 13 | self.arguments = self.get_arguments(kwargs) 14 | self.db_connection = get_connection(self.arguments['database']) 15 | self.cursor = self.db_connection.cursor 16 | super(SQL, self).__init__() 17 | 18 | @staticmethod 19 | def get_arguments(kwargs: Any) -> dict: 20 | arguments = { 21 | 'database': None 22 | } 23 | for key, item in arguments.items(): 24 | if kwargs.get(key) is not None: 25 | arguments[key] = kwargs.get(key) 26 | if arguments['database'] is None: 27 | raise IOError('database path needs to be provided') 28 | return arguments 29 | 30 | def do_quit(self, arg) -> bool: 31 | """Exit this shell""" 32 | return True 33 | 34 | def do_fetchall(self, arg) -> None: 35 | """Fetch all results after issuing a SQL query""" 36 | print(self.cursor.fetchall()) 37 | 38 | def do_fetchone(self, arg) -> None: 39 | """Fetch a row after issuing a SQL query""" 40 | print(self.cursor.fetchone()) 41 | 42 | def default(self, line: str) -> None: 43 | """Issue a sql query""" 44 | try: 45 | self.cursor.execute(line) 46 | except RuntimeError as error: 47 | print(error) 48 | 49 | 50 | if __name__ == '__main__': 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('-d', 53 | '--database', 54 | required=True, 55 | metavar='[file]', 56 | help='Location of the database.') 57 | SQL(**vars(parser.parse_args())).cmdloop() 58 | -------------------------------------------------------------------------------- /geesedb/connection/__init__.py: -------------------------------------------------------------------------------- 1 | from .connection import get_connection, close_connection 2 | 3 | __all__ = ['get_connection', 'close_connection'] 4 | -------------------------------------------------------------------------------- /geesedb/connection/connection.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | 3 | _db_connection = None 4 | 5 | 6 | def get_connection(database): 7 | global _db_connection 8 | if not _db_connection: 9 | _db_connection = DBConnection(database) 10 | return _db_connection 11 | 12 | 13 | def close_connection(): 14 | global _db_connection 15 | if _db_connection: 16 | _db_connection.connection.close() 17 | _db_connection = None 18 | 19 | 20 | class DBConnection(object): 21 | 22 | def __init__(self, database: str) -> None: 23 | self.connection = duckdb.connect(database) 24 | self.cursor = self.connection.cursor() 25 | -------------------------------------------------------------------------------- /geesedb/index/__init__.py: -------------------------------------------------------------------------------- 1 | from .authors_from_csv import AuthorsFromCSV 2 | from .entities_from_csv import EntitiesFromCSV 3 | from .fulltext_from_ciff import FullTextFromCiff 4 | from .fulltext_from_csv import FullTextFromCSV 5 | 6 | __all__ = ['FullTextFromCSV', 'AuthorsFromCSV', 'FullTextFromCiff', 'EntitiesFromCSV'] 7 | -------------------------------------------------------------------------------- /geesedb/index/authors_from_csv.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from typing import Any 4 | 5 | from .utils import _create_table, _fill_empty_table_with_csv 6 | from ..connection import get_connection 7 | 8 | 9 | class AuthorsFromCSV: 10 | """ 11 | Class for creating table from csv file that contains author information 12 | Author - doc 13 | """ 14 | _COLUMN_TYPES = ['STRING', 
'STRING'] 15 | 16 | def __init__(self, **kwargs: Any) -> None: 17 | self.arguments = self.get_arguments(kwargs) 18 | if self.arguments['use_existing_db'] and os.path.isfile(self.arguments['database']) or \ 19 | not self.arguments['use_existing_db'] and not os.path.isfile(self.arguments['database']): 20 | pass 21 | elif not self.arguments['use_existing_db']: 22 | raise IOError('There already exist a file on this path.') 23 | else: 24 | raise IOError('Database does not exist.') 25 | db_connection = get_connection(self.arguments['database']) 26 | self.connection = db_connection.connection 27 | 28 | if not self.arguments['use_existing_tables']: 29 | _create_table(self.connection, self.arguments['table_name'], self.arguments['columns_names'], 30 | self._COLUMN_TYPES) 31 | _fill_empty_table_with_csv(self.connection, self.arguments['table_name'], self.arguments['doc_author_file'], 32 | self.arguments['delimiter']) 33 | 34 | @staticmethod 35 | def get_arguments(kwargs: Any) -> dict: 36 | arguments = { 37 | 'database': None, 38 | 'use_existing_db': False, 39 | 'use_existing_tables': False, 40 | 'doc_author_file': 'doc_author.csv', 41 | 'table_name': 'doc_author', 42 | 'columns_names': ['doc', 'author'], 43 | 'delimiter': '|' 44 | } 45 | for key, item in arguments.items(): 46 | if kwargs.get(key) is not None: 47 | arguments[key] = kwargs.get(key) 48 | if arguments['database'] is None: 49 | raise IOError('database path needs to be provided') 50 | return arguments 51 | 52 | 53 | if __name__ == '__main__': 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument('-d', 56 | '--database', 57 | required=True, 58 | metavar='[file]', 59 | help='Location of the database.') 60 | parser.add_argument('-u', 61 | '--use_existing_db', 62 | action='store_true', 63 | help='Use an existing database.') 64 | parser.add_argument('-s', 65 | '--use_existing_tables', 66 | action='store_true', 67 | help='Use existing tables.') 68 | parser.add_argument('-a', 69 | '--doc_author_file', 70 | metavar='[file]', 71 | help='Filename for the csv file containing the data for the docs table.') 72 | parser.add_argument('-t', 73 | '--table_name', 74 | metavar='[string]', 75 | help='Decide on the table name you want to fill if they exist, ' + 76 | 'or create and fill them if they do not exist. If no name ' + 77 | 'is given the default value "author_doc" are being used.') 78 | parser.add_argument('-c', 79 | '--columns_names', 80 | metavar='[string]', 81 | nargs=2, 82 | help='Column names for the author-doc table.') 83 | parser.add_argument('-e', 84 | '--delimiter', 85 | help='Delimiter that separates the columns in the csv files.') 86 | AuthorsFromCSV(**vars(parser.parse_args())) 87 | -------------------------------------------------------------------------------- /geesedb/index/entities_from_csv.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | from typing import Any 6 | 7 | from .utils import _fill_empty_table_with_csv, _create_table 8 | from ..connection import get_connection 9 | 10 | 11 | class EntitiesFromCSV: 12 | """ 13 | Class for creating table from csv file that contains entities linked to doc 14 | Offset - Length - Mention - Entity - NER Tag - Doc ID 15 | 16 | Entities contain info as provided by REL: https://arxiv.org/abs/2006.01969 17 | """ 18 | _COLUMN_TYPES = ['INT', 'INT', 'STRING', 'STRING', 'STRING', 'STRING'] 19 | 20 | def __init__(self, **kwargs: Any) -> None: 21 | self.arguments = self.get_arguments(kwargs) 22 | if self.arguments['use_existing_db'] and os.path.isfile(self.arguments['database']) or \ 23 | not self.arguments['use_existing_db'] and not os.path.isfile(self.arguments['database']): 24 | pass 25 | elif not self.arguments['use_existing_db']: 26 | raise IOError('There already exist a file on this path.') 27 | else: 28 | raise IOError('Database does not exist.') 29 | db_connection = get_connection(self.arguments['database']) 30 | self.connection = db_connection.connection 31 | 32 | if not self.arguments['use_existing_tables']: 33 | _create_table(self.connection, self.arguments['table_name'], self.arguments['columns_names'], 34 | self._COLUMN_TYPES) 35 | _fill_empty_table_with_csv(self.connection, self.arguments['table_name'], self.arguments['entity_doc_file'], 36 | self.arguments['delimiter']) 37 | 38 | @staticmethod 39 | def get_arguments(kwargs: Any) -> dict: 40 | arguments = { 41 | 'database': None, 42 | 'use_existing_db': False, 43 | 'use_existing_tables': False, 44 | 'entity_doc_file': 'entity_doc.csv', 45 | 'table_name': 'entity_doc', 46 | 'columns_names': ['start', 'len', 'mention', 'entity', 'ner_tag', 'doc_id'], 47 | 'delimiter': '|' 48 | } 49 | for key, item in arguments.items(): 50 | if kwargs.get(key) is not None: 51 | arguments[key] = kwargs.get(key) 52 | if arguments['database'] is None: 53 | raise IOError('database path needs to be provided') 54 | return arguments 55 | 56 | 57 | if __name__ == '__main__': 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument('-d', 60 | '--database', 61 | required=True, 62 | metavar='[file]', 63 | help='Location of the database.') 64 | parser.add_argument('-u', 65 | '--use_existing_db', 66 | action='store_true', 67 | help='Use an existing database.') 68 | parser.add_argument('-s', 69 | '--use_existing_tables', 70 | action='store_true', 71 | help='Use existing tables.') 72 | parser.add_argument('-a', 73 | '--entity_doc_file', 74 | metavar='[file]', 75 | help='Filename for the csv file containing the data for the entity_doc table.') 76 | parser.add_argument('-t', 77 | '--table_name', 78 | metavar='[string]', 79 | help='Decide on the table name you want to fill if they exist, ' + 80 | 'or create and fill them if they do not exist. If no name ' + 81 | 'is given the default value "entity_doc" will be used.') 82 | parser.add_argument('-c', 83 | '--columns_names', 84 | metavar='[string]', 85 | nargs=8, 86 | help='Column names for the doc-entity table. 
If not provided the default: ' 87 | "['start', 'len', 'mention', 'entity', 'ner_tag', 'doc_id'] " 88 | "will be used.") 89 | parser.add_argument('-e', 90 | '--delimiter', 91 | help='Delimiter that separates the columns in the csv files.') 92 | EntitiesFromCSV(**vars(parser.parse_args())) 93 | -------------------------------------------------------------------------------- /geesedb/index/fulltext_from_ciff.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import argparse 4 | import gzip 5 | import os 6 | import duckdb 7 | from typing import Any, List, Union, Tuple 8 | from ciff_toolkit.read import CiffReader 9 | 10 | from ..connection import get_connection 11 | 12 | 13 | class FullTextFromCiff: 14 | """ 15 | Class for creating tables as in the old dog paper: 16 | - https://dl.acm.org/doi/10.1145/2600428.2609460 17 | 18 | The tables are created from a CIFF as described in: 19 | - https://arxiv.org/abs/2003.08276 20 | """ 21 | _COLUMN_TYPES = [ 22 | ['STRING', 'INT', 'INT'], 23 | ['INT', 'INT', 'STRING'], 24 | ['INT', 'INT', 'INT'] 25 | ] 26 | 27 | def __init__(self, **kwargs: Any) -> None: 28 | self.arguments = self.get_arguments(kwargs) 29 | if self.arguments['use_existing_db'] and os.path.isfile(self.arguments['database']) or \ 30 | not self.arguments['use_existing_db'] and not os.path.isfile(self.arguments['database']): 31 | pass 32 | elif not self.arguments['use_existing_db']: 33 | raise IOError('There already exist a file on this path.') 34 | else: 35 | raise IOError('Database does not exist.') 36 | db_connection = get_connection(self.arguments['database']) 37 | self.connection = db_connection.connection 38 | self.cursor = db_connection.cursor 39 | 40 | def load_data(self): 41 | if not self.arguments['use_existing_tables']: 42 | self.create_tables() 43 | self.fill_tables() 44 | 45 | @staticmethod 46 | def get_arguments(kwargs: Any) -> dict: 47 | arguments = { 48 | 'database': None, 49 | 'use_existing_db': False, 50 | 'use_existing_tables': False, 51 | 'table_names': ['docs', 'term_dict', 'term_doc'], 52 | 'columns_names_docs': ['collection_id', 'doc_id', 'len'], 53 | 'columns_names_term_dict': ['term_id', 'df', 'string'], 54 | 'columns_names_term_doc': ['term_id', 'doc_id', 'tf'], 55 | 'protobuf_file': None 56 | } 57 | for key, item in arguments.items(): 58 | if kwargs.get(key) is not None: 59 | arguments[key] = kwargs.get(key) 60 | if arguments['database'] is None: 61 | raise IOError('database path needs to be provided') 62 | if arguments['protobuf_file'] is None: 63 | raise IOError('protobuf file needs to be provided') 64 | return arguments 65 | 66 | def create_tables(self) -> None: 67 | column_names = [ 68 | self.arguments['columns_names_docs'], 69 | self.arguments['columns_names_term_dict'], 70 | self.arguments['columns_names_term_doc'] 71 | ] 72 | self.connection.begin() 73 | for table_name, c_names, c_types in zip(self.arguments['table_names'], column_names, self._COLUMN_TYPES): 74 | self.create_table(table_name, c_names, c_types) 75 | self.connection.commit() 76 | 77 | def create_table(self, table_name: str, column_names: List[str], column_types: List[str]) -> None: 78 | try: 79 | self.cursor.execute(f'SELECT * FROM {table_name} LIMIT 1;') 80 | self.connection.rollback() 81 | raise IOError('Table already exists.') 82 | except duckdb.CatalogException: # If the table does not exists you get a RuntimeError 83 | pass 84 | query = f'CREATE TABLE {table_name} ({", ".join([f"{a} {b}" for a, b in zip(column_names, 
column_types)])});' 85 | self.cursor.execute(query) 86 | 87 | @staticmethod 88 | def decode(buffer: Union[str, bytes], pos: int) -> Union[Tuple[int, int], None]: 89 | mask = (1 << 32) - 1 90 | result = 0 91 | shift = 0 92 | while True: 93 | b = buffer[pos] 94 | result |= ((b & 0x7f) << shift) 95 | pos += 1 96 | if not (b & 0x80): 97 | result &= mask 98 | result = int(result) 99 | return result, pos 100 | shift += 7 101 | if shift >= 64: 102 | raise IOError('Too many bytes when decoding.') 103 | 104 | def fill_tables(self) -> None: 105 | if self.arguments['protobuf_file'].endswith('.gz'): 106 | with gzip.open(self.arguments['protobuf_file'], 'rb') as f: 107 | data = f.read() 108 | else: 109 | with open(self.arguments['protobuf_file'], 'rb') as f: 110 | data = f.read() 111 | 112 | with CiffReader(self.arguments['protobuf_file']) as reader: 113 | for term_id, postings_list in enumerate(reader.read_postings_lists()): 114 | self.connection.begin() 115 | q = f'INSERT INTO {self.arguments["table_names"][1]} ' \ 116 | f'({",".join(self.arguments["columns_names_term_dict"])}) ' \ 117 | f"VALUES ({term_id},{postings_list.df},'{postings_list.term}')" 118 | try: 119 | self.cursor.execute(q) 120 | except RuntimeError: 121 | print(q) 122 | 123 | docid = 0 124 | for posting in postings_list.postings: 125 | docid += posting.docid 126 | q = f'INSERT INTO {self.arguments["table_names"][2]} ' \ 127 | f'({",".join(self.arguments["columns_names_term_doc"])}) ' \ 128 | f'VALUES ({term_id},{docid},{posting.tf})' 129 | self.cursor.execute(q) 130 | self.connection.commit() 131 | 132 | self.connection.begin() 133 | for n, doc_record in enumerate(reader.read_documents()): 134 | if n % 1000 == 0: 135 | self.connection.commit() 136 | self.connection.begin() 137 | q = f'INSERT INTO {self.arguments["table_names"][0]} ' \ 138 | f'({",".join(self.arguments["columns_names_docs"])}) ' \ 139 | f"VALUES ('{doc_record.collection_docid}',{doc_record.docid},{doc_record.doclength})" 140 | self.cursor.execute(q) 141 | self.connection.commit() 142 | 143 | 144 | if __name__ == '__main__': 145 | parser = argparse.ArgumentParser() 146 | parser.add_argument('-d', 147 | '--database', 148 | required=True, 149 | metavar='[file]', 150 | help='Location of the database.') 151 | parser.add_argument('-p', 152 | '--protobuf_file', 153 | required=True, 154 | metavar='[file]', 155 | help='Filename for the csv file containing the data for the docs table.') 156 | parser.add_argument('-u', 157 | '--use_existing_db', 158 | action='store_true', 159 | help='Use an existing database.') 160 | parser.add_argument('-s', 161 | '--use_existing_tables', 162 | action='store_true', 163 | help='Use existing tables.') 164 | parser.add_argument('-t', 165 | '--table_names', 166 | metavar='[string]', 167 | nargs=3, 168 | help='Decide on the table names you want to fill if they exist, ' + 169 | 'or create and fill them if they do not exist. If no names ' + 170 | 'are given the default values ["docs.csv", "term_dict.csv", ' + 171 | '"term_doc.csv"] are being used. 
If arguments are given ' + 172 | 'they are expected in the respective default order.') 173 | parser.add_argument('-cd', 174 | '--columns_names_docs', 175 | metavar='[string]', 176 | nargs=2, 177 | help='Column names for the docs table.') 178 | parser.add_argument('-ct', 179 | '--columns_names_term_dict', 180 | metavar='[string]', 181 | nargs=3, 182 | help='Column names for the dict table.') 183 | parser.add_argument('-o', 184 | '--columns_names_term_doc', 185 | metavar='[string]', 186 | nargs=3, 187 | help='Column names for the term-docs table (docs in old dog paper).') 188 | FullTextFromCiff(**vars(parser.parse_args())) 189 | -------------------------------------------------------------------------------- /geesedb/index/fulltext_from_csv.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | from typing import Any 6 | 7 | from .utils import _fill_empty_table_with_csv, _create_table 8 | from ..connection import get_connection 9 | 10 | 11 | class FullTextFromCSV: 12 | """ 13 | Class for creating tables from csv files as in the old dog paper: 14 | - https://dl.acm.org/doi/10.1145/2600428.2609460 15 | """ 16 | _COLUMN_TYPES = [ 17 | ['STRING', 'INT', 'INT'], 18 | ['INT', 'STRING', 'INT'], 19 | ['INT', 'INT', 'INT'] 20 | ] 21 | 22 | def __init__(self, **kwargs: Any) -> None: 23 | self.arguments = self.get_arguments(kwargs) 24 | if self.arguments['use_existing_db'] and os.path.isfile(self.arguments['database']) or \ 25 | not self.arguments['use_existing_db'] and not os.path.isfile(self.arguments['database']): 26 | pass 27 | elif not self.arguments['use_existing_db']: 28 | raise IOError('There already exist a file on this path.') 29 | else: 30 | raise IOError('Database does not exist.') 31 | db_connection = get_connection(self.arguments['database']) 32 | self.connection = db_connection.connection 33 | 34 | 35 | def load_data(self): 36 | if not self.arguments['use_existing_db']: 37 | self.create_tables() 38 | self.fill_tables() 39 | 40 | @staticmethod 41 | def get_arguments(kwargs: Any) -> dict: 42 | arguments = { 43 | 'database': None, 44 | 'use_existing_db': False, 45 | 'use_existing_tables': False, 46 | 'table_names': ['docs', 'term_dict', 'term_doc'], 47 | 'columns_names_docs': ['collection_id', 'doc_id', 'len'], 48 | 'columns_names_term_dict': ['term_id', 'string', 'df'], 49 | 'columns_names_term_doc': ['term_id', 'doc_id', 'tf'], 50 | 'docs_file': 'docs.csv', 51 | 'term_dict_file': 'dict.csv', 52 | 'term_doc_file': 'term_doc.csv', 53 | 'delimiter': '|' 54 | } 55 | for key, item in arguments.items(): 56 | if kwargs.get(key) is not None: 57 | arguments[key] = kwargs.get(key) 58 | if arguments['database'] is None: 59 | raise IOError('database path needs to be provided') 60 | return arguments 61 | 62 | def create_tables(self) -> None: 63 | column_names = [ 64 | self.arguments['columns_names_docs'], 65 | self.arguments['columns_names_term_dict'], 66 | self.arguments['columns_names_term_doc'] 67 | ] 68 | self.connection.begin() 69 | for table_name, c_names, c_types in zip(self.arguments['table_names'], column_names, self._COLUMN_TYPES): 70 | _create_table(self.connection, table_name, c_names, c_types) 71 | self.connection.commit() 72 | 73 | def fill_tables(self) -> None: 74 | file_names = [ 75 | self.arguments['docs_file'], 76 | self.arguments['term_dict_file'], 77 | self.arguments['term_doc_file'] 78 | ] 79 | self.connection.begin() 80 | for table_name, file_name in 
zip(self.arguments['table_names'], file_names): 81 | _fill_empty_table_with_csv(self.connection, table_name, file_name, self.arguments['delimiter']) 82 | self.connection.commit() 83 | 84 | 85 | if __name__ == '__main__': 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument('-d', 88 | '--database', 89 | required=True, 90 | metavar='[file]', 91 | help='Location of the database.') 92 | parser.add_argument('-u', 93 | '--use_existing_db', 94 | action='store_true', 95 | help='Use an existing database.') 96 | parser.add_argument('-s', 97 | '--use_existing_tables', 98 | action='store_true', 99 | help='Use existing tables.') 100 | parser.add_argument('-t', 101 | '--table_names', 102 | metavar='[string]', 103 | nargs=3, 104 | help='Decide on the table names you want to fill if they exist, ' + 105 | 'or create and fill them if they do not exist. If no names ' + 106 | 'are given the default values ["docs.csv", "term_dict.csv", ' + 107 | '"term_doc.csv"] are being used. If arguments are given ' + 108 | 'they are expected in the respective default order.') 109 | parser.add_argument('-cd', 110 | '--columns_names_docs', 111 | metavar='[string]', 112 | nargs=2, 113 | help='Column names for the docs table.') 114 | parser.add_argument('-ct', 115 | '--columns_names_term_dict', 116 | metavar='[string]', 117 | nargs=3, 118 | help='Column names for the dict table.') 119 | parser.add_argument('-o', 120 | '--columns_names_term_doc', 121 | metavar='[string]', 122 | nargs=3, 123 | help='Column names for the term-docs table (docs in old dog paper).') 124 | parser.add_argument('-di', 125 | '--docs_file', 126 | metavar='[file]', 127 | help='Filename for the csv file containing the data for the docs table.') 128 | parser.add_argument('-ti', 129 | '--term_dict_file', 130 | metavar='[file]', 131 | help='Filename for the csv file containing the data for the dict table.') 132 | parser.add_argument('-oi', 133 | '--term_doc_file', 134 | metavar='[file]', 135 | help='Filename for the csv file containing the data for the term-docs table ' + 136 | '(terms in old dog paper).') 137 | parser.add_argument('-e', 138 | '--delimiter', 139 | help='Delimiter that separates the columns in the csv files.') 140 | FullTextFromCSV(**vars(parser.parse_args())) 141 | -------------------------------------------------------------------------------- /geesedb/index/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import duckdb 4 | from duckdb import DuckDBPyConnection 5 | 6 | 7 | def _create_table(connection: DuckDBPyConnection, table_name: str, column_names: List[str], 8 | column_types: List[str]) -> None: 9 | cursor = connection.cursor() 10 | try: 11 | cursor.execute(f'SELECT * FROM {table_name} LIMIT 1;') 12 | connection.rollback() 13 | raise IOError('Table already exists.') 14 | except duckdb.CatalogException: 15 | pass 16 | query = f'CREATE TABLE {table_name} ({", ".join([f"{a} {b}" for a, b in zip(column_names, column_types)])});' 17 | cursor.execute(query) 18 | 19 | 20 | def _fill_empty_table_with_csv(connection: DuckDBPyConnection, table_name: str, file_name: str, 21 | delimiter: str = "|") -> None: 22 | cursor = connection.cursor() 23 | cursor.execute(f'SELECT COUNT(*) FROM {table_name};') 24 | if cursor.fetchone()[0] > 0: 25 | connection.rollback() 26 | raise IOError('The tables are not empty.') 27 | query = f"COPY {table_name} FROM '{file_name}' WITH DELIMITER '{delimiter}';" 28 | cursor.execute(query) 29 | 
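30 | 
31 | # Example usage (a minimal sketch; the database path and CSV file are
32 | # placeholders, and the column names/types follow the FullTextFromCSV defaults):
33 | #
34 | #   import duckdb
35 | #   connection = duckdb.connect('/path/to/database')
36 | #   _create_table(connection, 'docs',
37 | #                 ['collection_id', 'doc_id', 'len'], ['STRING', 'INT', 'INT'])
38 | #   _fill_empty_table_with_csv(connection, 'docs', '/path/to/docs.csv', '|')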
-------------------------------------------------------------------------------- /geesedb/interpreter/__init__.py: -------------------------------------------------------------------------------- 1 | from .metadata import Metadata 2 | from .parser import Parser 3 | from .translate import Translator 4 | 5 | __all__ = ['Parser', 'Translator', 'Metadata'] 6 | -------------------------------------------------------------------------------- /geesedb/interpreter/metadata.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from ..connection import get_connection 4 | 5 | 6 | class Metadata: 7 | 8 | def __init__(self, database): 9 | self.connection = get_connection(database).connection 10 | 11 | # first list is default if nothing is specified (should be extended) 12 | # list is ordered as [edge_name, node1_id, edge_node1_id, edge_node2_id, node2_id2 13 | def get_metadata(self): 14 | self.connection.execute("SELECT metadata FROM _meta") 15 | metadata = json.loads(self.connection.fetchone()[0]) 16 | return metadata 17 | 18 | def update_metadata(self, data): 19 | self.connection.execute(f"UPDATE _meta SET metadata='{json.dumps(data)}'") 20 | 21 | def get_default_join_info(self, node1, node2): 22 | return self.get_metadata()[node1][node2][0] 23 | 24 | def get_all_join_info(self, node1, node2): 25 | return self.get_metadata()[node1][node2] 26 | 27 | # { 28 | # 'term_dict': { 29 | # 'docs': [['term_doc', 'term_id', 'term_id', 'doc_id', 'doc_id']] 30 | # }, 31 | # 'docs': { 32 | # 'term_dict': [['term_doc', 'doc_id', 'doc_id', 'term_id', 'term_id']], 33 | # 'entities': [['entity_doc', 'collection_id', 'doc_id', 'entity', 'entity']], 34 | # 'authors': [['doc_author', 'collection_id', 'doc', 'author', 'author']] 35 | # }, 36 | # 'entities': { 37 | # 'docs': [['entity_doc', 'entity', 'entity', 'doc_id', 'collection_id']] 38 | # }, 39 | # 'authors': { 40 | # 'docs': [['doc_author', 'author', 'author', 'doc', 'collection_id']] 41 | # } 42 | # } 43 | -------------------------------------------------------------------------------- /geesedb/interpreter/parser.py: -------------------------------------------------------------------------------- 1 | import pycypher 2 | from .metadata import Metadata 3 | 4 | class Parser: 5 | 6 | def __init__(self, database): 7 | self.parseCypher = _ParseCypher(database) 8 | 9 | def parse(self, cypher_query): 10 | node = pycypher.parse(cypher_query) 11 | return self.parseCypher.process_node(node) 12 | 13 | class _ParseCypher: 14 | 15 | def __init__(self, database): 16 | self.database = database 17 | 18 | def process_node(self, node): 19 | errors = node['errors'] 20 | name = node['name'] 21 | result = node['result'] 22 | 23 | if len(errors) > 0: 24 | print(f"There are errors in the query:") 25 | print(errors) 26 | raise RuntimeError 27 | 28 | if name == 'Cypher': 29 | for r in result: 30 | try: 31 | if r['children']['name'] == 'Statement': 32 | return self.process_node(r['children']) 33 | except KeyError: 34 | continue 35 | 36 | elif name == 'Statement': 37 | return self.process_node(result[0]['children']) 38 | 39 | elif name == 'Query': 40 | return self.process_node(result[0]['children']) 41 | 42 | elif name == 'StandaloneQuery': 43 | raise RuntimeError("We don not support StandaloneQuery queries (yet).") 44 | 45 | elif name == 'RegularQuery': 46 | out = '' 47 | for r in result: 48 | if r['node'] == r['children']: 49 | continue 50 | if len(out) > 0: 51 | out += ' ' 52 | if r['children']['name'] == 'SingleQuery': 53 | out += 
_ParseSingleQuery(self.database).process_node(r['children']) 54 | else: 55 | out += self.process_node(r['children']) 56 | return out 57 | 58 | elif name == 'Union': 59 | union = '' 60 | for r in result[:-1]: 61 | union += r['node']['text'] 62 | union = union.strip() 63 | return union + ' ' + _ParseSingleQuery(self.database).process_node(result[-1]['children']) 64 | 65 | else: 66 | raise RuntimeError(f'Queries that make use of >>{name}<< are not supported (yet).') 67 | 68 | class _ParseSingleQuery: 69 | 70 | def __init__(self, database): 71 | self.output_params = { 72 | "Order": '', 73 | "Skip": '', 74 | "Limit": '' 75 | } 76 | self.additional_wheres = list() 77 | self.metadata = Metadata(database) 78 | 79 | def build_select_statement(self, pattern): 80 | output = '' 81 | 82 | # First the start node 83 | s_node = pattern['NodePattern'][0] 84 | try: 85 | s_variable = s_node['Variable'] 86 | except KeyError: 87 | s_variable = 'start_node' 88 | 89 | try: 90 | s_label = s_node['NodeLabels'] 91 | except KeyError: 92 | raise RuntimeError('The type of a node needs to be know for know') 93 | 94 | try: 95 | s_properties = s_node['Properties'] 96 | for key, value in s_properties.items(): 97 | self.additional_wheres.append(f"""{s_variable}.{key} = {value.replace('"', "'")}""") 98 | except KeyError: 99 | pass 100 | 101 | output += f'{s_label} AS {s_variable}' 102 | 103 | # Then the chain 104 | try: 105 | chain = pattern['PatternElementChain'] 106 | except KeyError: 107 | return output 108 | 109 | p_label = s_label 110 | p_variable = s_variable 111 | for i, chain_part in enumerate(chain): 112 | to_node = chain_part['node'] 113 | try: 114 | to_node_variable = to_node['Variable'] 115 | except KeyError: 116 | to_node_variable = f'Xtn{i}X' 117 | try: 118 | to_node_type = to_node['NodeLabels'] 119 | except KeyError: 120 | raise RuntimeError("The node type needs to be known for now.") 121 | try: 122 | to_node_properties = to_node['Properties'] 123 | for key, value in to_node_properties.items(): 124 | self.additional_wheres.append(f"""{to_node_variable}.{key} = {value.replace('"', "'")}""") 125 | except KeyError: 126 | pass 127 | 128 | relationship = chain_part['relationship'] 129 | try: 130 | rel_variable = relationship['Variable'] 131 | except KeyError: 132 | rel_variable = f'Xrel{i}X' 133 | try: 134 | rel_type = relationship['RelationshipTypes'][0] 135 | except KeyError: 136 | rel_type = self.metadata.get_default_join_info(p_label, to_node_type)[0] 137 | try: 138 | rel_properties = relationship['Properties'] 139 | for key, value in rel_properties.items(): 140 | self.additional_wheres.append(f"""{rel_variable}.{key} = {value.replace('"', "'")}""") 141 | except KeyError: 142 | pass 143 | meta = self.metadata.get_all_join_info(p_label, to_node_type) 144 | if not meta: 145 | raise RuntimeError(f"There are no edges between these node types known: {p_label} and {to_node_type}") 146 | meta = meta[0] # TODO unless join table is specified 147 | join_table, from_node_jk, join_table_fnk, join_table_tnk, to_node_jk = meta 148 | 149 | # Add relationship join and then the node join 150 | join = f' JOIN {join_table} AS {rel_variable} ON {p_variable}.{from_node_jk} = {rel_variable}.{join_table_fnk}' + \ 151 | f' JOIN {to_node_type} AS {to_node_variable} ON {rel_variable}.{join_table_tnk} = {to_node_variable}.{to_node_jk}' 152 | output += join 153 | p_variable = to_node_variable 154 | p_label = to_node_type 155 | return output 156 | 157 | def process_node(self, node): 158 | name = node['name'] 159 | result = node['result'] 
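# Dispatch on the name of the grammar node produced by pycypher; each branch below rebuilds the corresponding SQL fragment.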
160 | 161 | if name == 'SingleQuery': 162 | return self.process_node(result[0]['children']) 163 | 164 | elif name == 'SinglePartQuery': 165 | read_part = '' 166 | return_part = '' 167 | for r in result: 168 | if r['node'] == r['children']: 169 | continue 170 | elif r['children']['name'] == 'UpdatingClause': 171 | raise RuntimeError('Updates are not supported yet') 172 | elif r['children']['name'] == 'ReadingClause': 173 | if len(read_part) > 0: 174 | raise RuntimeError('Only one reading clause per query is supported') 175 | read_part = self.process_node(r['children']) 176 | else: 177 | return_part = self.process_node(r['children']) 178 | return return_part + ' ' + read_part \ 179 | + self.output_params['Order'] \ 180 | + self.output_params['Skip'] \ 181 | + self.output_params['Limit'] 182 | 183 | elif name == 'ReadingClause': 184 | return self.process_node(result[0]['children']) 185 | 186 | elif name == 'Match': 187 | match_text = '' 188 | where = '' 189 | 190 | result_generator = (r for r in result) 191 | r = next(result_generator) 192 | while r['node'] == r['children']: 193 | match_text += r['node']['text'] 194 | r = next(result_generator) 195 | if match_text.strip().upper().startswith('OPTIONAL'): 196 | raise RuntimeError('For now we do not support OPTIONAL matches yet.') 197 | pattern = self.process_node(r['children']) 198 | while True: 199 | try: 200 | r = next(result_generator) 201 | if r['node'] == r['children']: 202 | continue 203 | if r['children']['name'] == 'Where': 204 | where = self.process_node(r['children']) 205 | except StopIteration: 206 | break 207 | match_statement = f'FROM {pattern}' 208 | if len(where) == 0 and len(self.additional_wheres) > 0: 209 | where = ' WHERE ' + ' AND '.join(self.additional_wheres) 210 | elif len(self.additional_wheres) > 0: 211 | additional_and = ' AND ' + ' AND '.join(self.additional_wheres) 212 | where += additional_and 213 | if len(where) > 0: 214 | match_statement += where 215 | return match_statement 216 | 217 | elif name == 'Pattern': 218 | return_expression = '' 219 | for r in result: 220 | if r['node'] == r['children']: 221 | return_expression += r['node']['text'] 222 | else: 223 | return_expression += self.process_node(r['children']) 224 | return return_expression 225 | 226 | elif name == 'PatternPart': 227 | return_expression = '' 228 | for r in result: 229 | if r['node'] == r['children']: 230 | return_expression += r['node']['text'] 231 | elif r['children']['name'] == 'Variable': 232 | raise RuntimeError('Variable assignment of patterns is not supported yet.') 233 | else: 234 | return_expression += self.process_node(r['children']) 235 | return return_expression 236 | 237 | elif name == 'AnonymousPatternPart': 238 | return ''.join([self.process_node(r['children']) for r in result]) 239 | 240 | elif name == 'PatternElement': 241 | # Get processed chain data 242 | pattern = dict() 243 | for r in result: 244 | if r['node'] == r['children']: 245 | continue 246 | else: 247 | try: 248 | pattern[r['children']['name']].append(self.process_node(r['children'])) 249 | except KeyError: 250 | pattern[r['children']['name']] = [self.process_node(r['children'])] 251 | 252 | return self.build_select_statement(pattern) 253 | 254 | elif name == 'NodePattern': 255 | node = dict() 256 | for r in result: 257 | if r['node'] == r['children']: 258 | continue 259 | else: 260 | node[r['children']['name']] = self.process_node(r['children']) 261 | return node 262 | 263 | elif name == 'NodeLabels': 264 | if len(result) > 1: 265 | raise RuntimeError("Only one node 
label at a time is supported") 266 | return self.process_node(result[0]['children']) 267 | 268 | elif name == 'NodeLabel': 269 | node_label = '' 270 | for r in result: 271 | if r['node'] == r['children']: 272 | continue 273 | else: 274 | node_label = self.process_node(r['children']) 275 | return node_label 276 | 277 | elif name == 'LabelName': 278 | return self.process_node(result[0]['children']) 279 | 280 | elif name == 'Properties': 281 | return self.process_node(result[0]['children']) 282 | 283 | elif name == 'MapLiteral': 284 | map_literal = dict() 285 | key = None 286 | for r in result: 287 | if r['node'] == r['children']: 288 | continue 289 | elif r['children']['name'] == 'PropertyKeyName': 290 | key = self.process_node(r['children']) 291 | elif r['children']['name'] == 'Expression': 292 | map_literal[key] = self.process_node(r['children']) 293 | key = None 294 | return map_literal 295 | 296 | elif name == 'PatternElementChain': 297 | relationship = None 298 | node = None 299 | for r in result: 300 | if r['node'] == r['children']: 301 | continue 302 | elif r['children']['name'] == 'RelationshipPattern': 303 | relationship = self.process_node(r['children']) 304 | else: 305 | node = self.process_node(r['children']) 306 | return {'relationship': relationship, 'node': node} 307 | 308 | elif name == 'RelationshipPattern': 309 | for r in result: 310 | if r['children']['name'] == 'Dash': 311 | continue 312 | elif r['children']['name'] in {'LeftArrowHead', 'RightArrowHead'}: 313 | raise RuntimeError('Directed edges are not supported yet.') 314 | else: 315 | return self.process_node(r['children']) 316 | raise RuntimeError("RelationshipPattern should return a pattern") 317 | 318 | elif name == 'RelationshipDetail': 319 | relation = dict() 320 | for r in result: 321 | if r['node'] == r['children']: 322 | continue 323 | else: 324 | relation[r['children']['name']] = self.process_node(r['children']) 325 | return relation 326 | 327 | elif name == 'RelationshipTypes': 328 | relationship_types = [] 329 | for r in result: 330 | if r['node'] == r['children']: 331 | continue 332 | else: 333 | relationship_types.append(self.process_node(r['children'])) 334 | if len(relationship_types) > 1: 335 | raise RuntimeError("We only support one join table at a time for now.") 336 | return relationship_types 337 | 338 | elif name == 'RelTypeName': 339 | return self.process_node(result[0]['children']) 340 | 341 | elif name == 'Where': 342 | where_statement = ' ' 343 | for r in result: 344 | if r['node'] == r['children']: 345 | where_statement += r['node']['text'] 346 | else: 347 | where_statement += self.process_node(r['children']) 348 | return where_statement 349 | 350 | elif name == 'Return': 351 | out = '' 352 | for r in result[1:]: 353 | if r['node'] == r['children']: 354 | if len(r['node']['text'].strip()) == 0: 355 | continue 356 | out += r['node']['text'].strip() + ' ' 357 | else: 358 | out += self.process_node(r['children']) 359 | return 'SELECT ' + out 360 | 361 | elif name == 'ReturnBody': 362 | out = '' 363 | for r in result: 364 | if r['node'] == r['children']: 365 | continue 366 | elif r['children']['name'] == 'ReturnItems': 367 | out = self.process_node(r['children']) 368 | elif r['children']['name'] in {'Order', 'Skip', 'Limit'}: 369 | self.output_params[r['children']['name']] = ' ' + r['node']['text'] 370 | else: 371 | n = r['children']['name'] 372 | raise RuntimeError(f'Queries that make use of >>{n}<< are not supported (yet).') 373 | return out 374 | 375 | elif name == 'ReturnItems': 376 | 
return_items = '' 377 | for r in result: 378 | return_items += r['node']['text'] 379 | # TODO 380 | # For now we just assume the ReturnItems is already correct, should make it better such 381 | # that e.g. nodes can be selected directly (now specific attributes have to be specified). 382 | return return_items 383 | 384 | elif name == 'Expression': 385 | return self.process_node(result[0]['children']) 386 | 387 | elif name in {'OrExpression', 'AndExpression', 'XorExpression', 'NotExpression'}: 388 | keyword = name[:-10] 389 | expressions = [] 390 | for r in result: 391 | if r['node'] == r['children']: 392 | continue 393 | else: 394 | expressions.append(self.process_node(r['children'])) 395 | if len(expressions) == 1: 396 | return expressions[0] 397 | else: 398 | return f' {keyword.upper()} '.join(expressions) 399 | 400 | elif name == 'ComparisonExpression': 401 | possible_comparisons = {'=', '<>', '<', '>', '<=', '>='} 402 | comparisons = [] 403 | for r in result: 404 | if r['node'] == r['children']: 405 | continue 406 | else: 407 | comparisons.append(self.process_node(r['children'])) 408 | if len(comparisons) == 1: 409 | return comparisons[0] 410 | elif len(comparisons) == 2: 411 | return comparisons[0] + ' ' + comparisons[1] 412 | else: 413 | comparison_expressions_unprocessed = [] 414 | comparison_expressions = [] 415 | for i in range(len(comparisons)-1): 416 | comparison_expressions_unprocessed.append([comparisons[i], comparisons[i+1]]) 417 | for expression_duo in comparison_expressions_unprocessed: 418 | p1, p2 = expression_duo 419 | for p in possible_comparisons: 420 | p1 = p1.replace(p, '') 421 | comparison_expressions.append(p1.strip() + ' ' + p2.strip()) 422 | return ' AND '.join(comparison_expressions) 423 | 424 | elif name == 'PartialComparisonExpression': 425 | partial_comparison = '' 426 | for r in result: 427 | if r['node'] == r['children']: 428 | partial_comparison += r['node']['text'] 429 | else: 430 | partial_comparison += self.process_node(r['children']) 431 | return partial_comparison 432 | 433 | elif name in {'AddOrSubtractExpression', 'MultiplyDivideModuloExpression', 434 | 'PowerOfExpression', 'UnaryAddOrSubtractExpression'}: 435 | return_expression= '' 436 | for r in result: 437 | if r['node'] == r['children']: 438 | return_expression += r['node']['text'] 439 | else: 440 | return_expression += self.process_node(r['children']) 441 | return return_expression 442 | 443 | elif name == 'StringListNullOperatorExpression': 444 | return ' '.join([self.process_node(r['children']) for r in result]) 445 | 446 | elif name == 'NullOperatorExpression': 447 | return ''.join([r['node']['text'] for r in result]).strip() 448 | 449 | elif name == 'PropertyOrLabelsExpression': 450 | return ''.join([self.process_node(r['children']) for r in result]) 451 | 452 | elif name == 'PropertyLookup': 453 | return_expression = '' 454 | for r in result: 455 | if r['node'] == r['children']: 456 | return_expression += r['node']['text'] 457 | else: 458 | return_expression += self.process_node(r['children']) 459 | return return_expression 460 | 461 | elif name == 'SchemaName': 462 | return_expression = '' 463 | for r in result: 464 | if r['node'] == r['children']: 465 | return_expression += r['node']['text'] 466 | else: 467 | return_expression += self.process_node(r['children']) 468 | return return_expression 469 | 470 | elif name == 'PropertyKeyName': 471 | return_expression = '' 472 | for r in result: 473 | if r['node'] == r['children']: 474 | return_expression += r['node']['text'] 475 | else: 476 | 
return_expression += self.process_node(r['children']) 477 | return return_expression 478 | 479 | elif name == 'Atom': 480 | return_expression = '' 481 | for r in result: 482 | if r['node'] == r['children']: 483 | return_expression += r['node']['text'] 484 | else: 485 | return_expression += self.process_node(r['children']) 486 | return return_expression 487 | 488 | elif name == 'FunctionInvocation': 489 | return_expression = '' 490 | for r in result: 491 | if r['node'] == r['children']: 492 | return_expression += r['node']['text'] 493 | elif r['children']['name'] == 'FunctionName': 494 | return_expression += r['node']['text'] 495 | else: 496 | return_expression += self.process_node(r['children']) 497 | return return_expression 498 | 499 | elif name == 'Literal': 500 | return_expression = '' 501 | for r in result: 502 | if r['node'] == r['children']: 503 | return_expression += r['node']['text'].replace('"', "'") 504 | else: 505 | return_expression += self.process_node(r['children']) 506 | return return_expression 507 | 508 | elif name == 'NumberLiteral': 509 | return self.process_node(result[0]['children']) 510 | 511 | elif name == 'DoubleLiteral': 512 | return ''.join([r['node']['text'] for r in result]).strip() 513 | 514 | elif name == 'IntegerLiteral': 515 | return ''.join([r['node']['text'] for r in result]).strip() 516 | 517 | elif name == 'Variable': 518 | return self.process_node(result[0]['children']) 519 | 520 | elif name == 'SymbolicName': 521 | return ''.join([r['node']['text'] for r in result]).strip() 522 | 523 | elif name == 'ParenthesizedExpression': 524 | return_expression = '' 525 | for r in result: 526 | if r['node'] == r['children']: 527 | return_expression += r['node']['text'] 528 | else: 529 | return_expression += self.process_node(r['children']) 530 | return return_expression 531 | 532 | elif name == 'MultiPartQueries': 533 | raise RuntimeError('The keyword WITH is not supported (yet).') 534 | 535 | else: 536 | raise RuntimeError(f'Queries that make use of >>{name}<< are not supported (yet).') -------------------------------------------------------------------------------- /geesedb/interpreter/translate.py: -------------------------------------------------------------------------------- 1 | from .parser import Parser 2 | 3 | # This class was used in the paper for translating; all the translation logic now lives in Parser, 4 | # so this class is a thin wrapper around it. 
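# A minimal usage sketch (illustrative only: the database path and the Cypher query below are
# hypothetical, and this assumes the class is imported directly from geesedb.interpreter.translate):
#
#     from geesedb.interpreter.translate import Translator
#     translator = Translator('/path/to/geesedb_index.db')   # path is an assumption
#     sql = translator.translate("MATCH (d:docs) RETURN d.collection_id")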
5 | class Translator: 6 | 7 | def __init__(self, database): 8 | self.parser = Parser(database) 9 | 10 | def translate(self, query): 11 | return self.parser.parse(query) 12 | -------------------------------------------------------------------------------- /geesedb/resources/__init__.py: -------------------------------------------------------------------------------- 1 | from .topics import get_topics_backgroundlinking 2 | 3 | __all__ = ['get_topics_backgroundlinking'] -------------------------------------------------------------------------------- /geesedb/resources/topics-and-qrels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/resources/topics-and-qrels/__init__.py -------------------------------------------------------------------------------- /geesedb/resources/topics-and-qrels/topics.backgroundlinking18.processed.txt: -------------------------------------------------------------------------------- 1 | 321:9171debc316e5e2782e0d2404ca7d09d 2 | 336:2a340b8573d498e261d6f2365b37f8eb 3 | 341:7ef8ce1720bf2f6b2065a97506ee89b4 4 | 347:c3cea789141ef2ae856419e86e165e0c 5 | 350:985b90cc-7c98-11e3-93c1-0e888170b723 6 | 362:4989ebfeb752e6b317d1ef3997b21a01 7 | 363:474ae088-ab1e-11e4-9c91-e9d2f9fde644 8 | 367:1e03fecf4d33b7896203298ab3858156 9 | 375:0e85b0c0-f7ef-11e4-9030-b4732caefe81 10 | 378:3c5be31e-24ab-11e5-b621-b55e495e9b78 11 | 393:fef0f232a9bd94bdb96bac48c7705503 12 | 397:563fb77e-024f-11e6-9203-7b8670959b88 13 | 400:72e72b41097d53b627fd375dd2d3309b 14 | 408:988147454a2b8eafd1535cd673dd04ba 15 | 414:4192b016-8708-11e3-a5bd-844629433ba3 16 | 422:145b9a6caa16d931c108a89798e65e17 17 | 426:56f0438ee0fb34c341ccf5af36de5175 18 | 427:2e83ad87eb1bade22e6e96ece616c24f 19 | 433:159e6f9e-8e84-11e3-84e1-27626c5ef5fb 20 | 439:5c466d4a01492f1b5cc9758e19429a1f 21 | 442:3902c9005a0563742fc4acb2c011b164 22 | 445:c8351276-76de-41f1-b294-4f3e5d373c8c 23 | 626:a79b1b7d8cc5273d4995fec5e122e44b 24 | 646:6fdc62d37aaf685b809c501abe13c56c 25 | 690:defd7f4a85496d52a210938d58a7ae76 26 | 801:b0235f56-1cce-11e4-ae54-0cfe1f974f8a 27 | 802:6668d83480f5c58b54a90770835ac2d4 28 | 803:cad56e871cd0bca6cc77e97ffe246258 29 | 804:579e9ae8-6a2f-11e6-8225-fbb8a6fc65bc 30 | 805:5ec40b6bc6c5f4487132da7be04fc914 31 | 806:2bea9433d4e1050c9c85175df466b3e2 32 | 807:11915bd8-7944-11e2-9c27-fdd594ea6286 33 | 808:30a493b8-fb07-11e4-9ef4-1bb7ce3b3fb7 34 | 809:02e52bdba097c9df4cbae66e04f82542 35 | 810:9dd7b85cd1e3da1b5c8e79f32fec7177 36 | 811:a244d1e0cfd916a2af76b6a6c785b017 37 | 812:dcd1560bd13a0b665b95d3ba27cc960c 38 | 813:b4c6361974466458bb721b9b1628220b 39 | 814:e1336b8f-b0c2-4610-9a3c-ec85a546c9ad 40 | 815:a36fa8a2-8962-11e6-bff0-d53f592f176e 41 | 816:37a8e2283e4677b703f6464d0191a700 42 | 817:bd1e6cc8d7525fec36a717be45638bf4 43 | 818:a2744bb98e1968307548e4976232cf1c 44 | 819:5f37aac53768e749b861028397eb6849 45 | 820:fc1ca759c9c433376e71884870d225ab 46 | 821:c6bf4a4bf542b7c67987c222d73def4b 47 | 822:43e9f3f12982c0e0bb15ad64b33a89c0 48 | 823:c109cc839f2d2414251471c48ae5515c 49 | 824:30c00b60-13f6-11e3-b182-1b3bb2eb474c 50 | 825:a1c41a70-35c7-11e3-8a0e-4e2cf80831fc -------------------------------------------------------------------------------- /geesedb/resources/topics-and-qrels/topics.backgroundlinking19.processed.txt: -------------------------------------------------------------------------------- 1 | 826:96ab542e-6a07-11e6-ba32-5a4bf5aad4fa 2 | 827:679e8784-34df-11e7-b373-418f6849a004 3 | 
828:56989a9c-e30e-11e6-a547-5fb9411d332c 4 | 829:0d7f5e24cafc019265d3ee4b9745e7ea 5 | 830:02475047d615c46a006924f0e2317cca 6 | 831:320596f4102c94fca3f0432a8611f87d 7 | 832:b407cd8074559045a926eff226aae173 8 | 833:0e43fce6-12a9-11e5-89f3-61410da94eb1 9 | 834:99d80076323444d17769330a2fc20e93 10 | 835:c0c4e2d0-628f-11e7-a4f7-af34fc1d9d39 11 | 836:50326f32f7308f71691fafbdc25e1dc3 12 | 837:664638e8-6dc3-11e6-9705-23e51a2f424d 13 | 838:8a6a9e3c-fff3-11e5-8bb1-f124a43f84dc 14 | 839:5331852c50a9cb4b0ff655ecf4a7b6a6 15 | 840:55060482bf60235610936dd9cd2e54b4 16 | 841:2dc6b4a1a5ec8579292e35163982cdb6 17 | 842:7ffad2c2-0c57-11e5-95fd-d580f1c5d44e 18 | 843:53e0938aa0a0d14afb74effcd7819c79 19 | 844:84b81394-39ad-11e7-a59b-26e0451a96fd 20 | 845:d71b4204-9ec8-11e5-bce4-708fe33e3288 21 | 846:6dc42582bb2c8ec59dba0fb85b4848ea 22 | 847:5cc92f1c73c531dc0b470e771e0eb16f 23 | 848:5a50785cf124c29ad3e9d1e112973086 24 | 849:a9e12ff8fc83d08e869ba64669d350fd 25 | 850:540fc83a-33fa-11e7-b373-418f6849a004 26 | 851:c65e0878-418a-11e7-9869-bac8b446820a 27 | 852:6f1c065141e1dea9ae66c34f92841170 28 | 853:415e53f9e697cfb7baa46d9c494d2ece 29 | 854:d0828a0fea8f9110154b944984cd8f28 30 | 855:0bc6e8a10a6df059d2aa5ae23281b7fb 31 | 856:88bd7be0fca52c06cd550c14ce9d2416 32 | 857:6fe0cb2ea7838ac2c29fa0539c6bce1d 33 | 858:587a617a-3e3f-11e6-84e8-1580c7db5275 34 | 859:663c2790-3f8e-11e5-9561-4b3dc93e3b9a 35 | 860:9584295234e46a836dfde4f42ba2ca09 36 | 861:555bbc1c46a91e62bdcbbd0023f849a4 37 | 862:623ccdaa-6013-11e7-a4f7-af34fc1d9d39 38 | 863:bf9d84082043dded9b4feb0767eee96e 39 | 864:cba3afc578eac3b7afe15899313cdd3a 40 | 865:f5a5cd2c3ef20d0550a3dd623702d23d 41 | 866:98791c1f1f74e7bd11e1369807be3a89 42 | 867:babfe8b6d17dbc18e38707877ac75e4e 43 | 868:2c9485cf6bf33f13740dcb04c572b9e2 44 | 869:4d2e805d3293fdef30ce737a97528247 45 | 870:d7d906991e2883889f850de9ae06655e 46 | 871:7b7126ae-0500-11e7-b1e9-a05d3c21f7cf 47 | 872:e833068d3a08ad64398330bbc3b1759b 48 | 873:05c0a5ad-108f-46b9-b6f7-5f2303f6cdfe 49 | 874:fb6c2426-ea52-11e6-b82f-687d6e6a3e7c 50 | 875:14f6eb04dbaa9c5d6f5170cffc67e463 51 | 876:fef77fc9335b33d975132ce603182846 52 | 877:7047115870d7910b42bc779541f5deb5 53 | 878:97b489e2-0a38-11e5-9e39-0db921c47b93 54 | 879:5adb2f7230907f4006063656b8400742 55 | 880:58739d169d364163c1e81a8f081cdc9a 56 | 881:1342bcb6-ec45-11e5-a6f3-21ccdbc5f74e 57 | 882:74ca2f03320df21995165b6bb9bb4ddb 58 | 883:aaf444787011938dc645bdc1185a0716 59 | 884:681e77ce-dffd-11e5-9c36-e1902f6b6571 60 | 885:5ae44bfd66a49bcad7b55b29b55d63b6 -------------------------------------------------------------------------------- /geesedb/resources/topics-and-qrels/topics.core17.processed.txt: -------------------------------------------------------------------------------- 1 | 307:new hydroelectr project 2 | 310:radio wave brain cancer 3 | 321:women parliament 4 | 325:cult lifestyl 5 | 330:iran iraq cooper 6 | 336:black bear attack 7 | 341:airport secur 8 | 344:abus e mail 9 | 345:oversea tobacco sale 10 | 347:wildlif extinct 11 | 350:health comput termin 12 | 353:antarctica explor 13 | 354:journalist risk 14 | 355:ocean remot sens 15 | 356:postmenopaus estrogen britain 16 | 362:human smuggl 17 | 363:transport tunnel disast 18 | 367:piraci 19 | 372:nativ american casino 20 | 375:hydrogen energi 21 | 378:euro opposit 22 | 379:mainstream 23 | 389:illeg technolog transfer 24 | 393:merci kill 25 | 394:home school 26 | 397:automobil recal 27 | 399:oceanograph vessel 28 | 400:amazon rain forest 29 | 404:ireland peac talk 30 | 408:tropic storm 31 | 414:cuba sugar export 32 | 416:three gorg 
project 33 | 419:recycl automobil tire 34 | 422:art stolen forg 35 | 423:milosev mirjana markov 36 | 426:law enforc dog 37 | 427:uv damag ey 38 | 433:greek philosophi stoicism 39 | 435:curb popul growth 40 | 436:railwai accid 41 | 439:invent scientif discoveri 42 | 442:heroic act 43 | 443:u. invest africa 44 | 445:women clergi 45 | 614:flavr savr tomato 46 | 620:franc nuclear test 47 | 626:human stamped 48 | 646:food stamp increas 49 | 677:lean tower pisa 50 | 690:colleg educ advantag 51 | -------------------------------------------------------------------------------- /geesedb/resources/topics-and-qrels/topics.core18.processed.txt: -------------------------------------------------------------------------------- 1 | 321:women parliament 2 | 336:black bear attack 3 | 341:airport secur 4 | 347:wildlif extinct 5 | 350:health comput termin 6 | 362:human smuggl 7 | 363:transport tunnel disast 8 | 367:piraci 9 | 375:hydrogen energi 10 | 378:euro opposit 11 | 393:merci kill 12 | 397:automobil recal 13 | 400:amazon rain forest 14 | 408:tropic storm 15 | 414:cuba sugar export 16 | 422:art stolen forg 17 | 426:law enforc dog 18 | 427:uv damag ey 19 | 433:greek philosophi stoicism 20 | 439:invent scientif discoveri 21 | 442:heroic act 22 | 445:women clergi 23 | 626:human stamped 24 | 646:food stamp increas 25 | 690:colleg educ advantag 26 | 801:africa polio vaccin 27 | 802:women drive saudi arabia 28 | 803:declin middl class u. 29 | 804:women 20 30 | 805:eat invas speci 31 | 806:comput paralyz peopl 32 | 807:chavez medic treatment cuba 33 | 808:boston marathon bomb verdict 34 | 809:protect earth from asteroid 35 | 810:diabet toxic chemic 36 | 811:car hack 37 | 812:social media teen suicid 38 | 813:marijuana potenc 39 | 814:china on child impact 40 | 815:jason rezaian releas from iran 41 | 816:feder minimum wage increas 42 | 817:alan gross releas cuba 43 | 818:egg healthi diet 44 | 819:u. 
ag demograph 45 | 820:bacteri infect mortal rate 46 | 821:email scam 47 | 822:soni cyberattack 48 | 823:control mrsa 49 | 824:bezo purchas washington post 50 | 825:ethanol food price 51 | -------------------------------------------------------------------------------- /geesedb/resources/topics-and-qrels/topics.robust04.processed.txt: -------------------------------------------------------------------------------- 1 | 301:intern organ crime 2 | 302:poliomyel post polio 3 | 303:hubbl telescop achiev 4 | 304:endang speci mammal 5 | 305:most danger vehicl 6 | 306:african civilian death 7 | 307:new hydroelectr project 8 | 308:implant dentistri 9 | 309:rap crime 10 | 310:radio wave brain cancer 11 | 311:industri espionag 12 | 312:hydropon 13 | 313:magnet levit maglev 14 | 314:marin veget 15 | 315:unexplain highwai accid 16 | 316:polygami polyandri polygyni 17 | 317:unsolicit fax 18 | 318:best retir countri 19 | 319:new fuel sourc 20 | 320:undersea fiber optic cabl 21 | 321:women parliament 22 | 322:intern art crime 23 | 323:literari journalist plagiar 24 | 324:argentin british relat 25 | 325:cult lifestyl 26 | 326:ferri sink 27 | 327:modern slaveri 28 | 328:pope beatif 29 | 329:mexican air pollut 30 | 330:iran iraq cooper 31 | 331:world bank critic 32 | 332:incom tax evas 33 | 333:antibiot bacteria diseas 34 | 334:export control cryptographi 35 | 335:adopt biolog parent 36 | 336:black bear attack 37 | 337:viral hepat 38 | 338:risk aspirin 39 | 339:alzheim drug treatment 40 | 340:land mine ban 41 | 341:airport secur 42 | 342:diplomat expuls 43 | 343:polic death 44 | 344:abus e mail 45 | 345:oversea tobacco sale 46 | 346:educ standard 47 | 347:wildlif extinct 48 | 348:agoraphobia 49 | 349:metabol 50 | 350:health comput termin 51 | 351:falkland petroleum explor 52 | 352:british chunnel impact 53 | 353:antarctica explor 54 | 354:journalist risk 55 | 355:ocean remot sens 56 | 356:postmenopaus estrogen britain 57 | 357:territori water disput 58 | 358:blood alcohol fatal 59 | 359:mutual fund predictor 60 | 360:drug legal benefit 61 | 361:cloth sweatshop 62 | 362:human smuggl 63 | 363:transport tunnel disast 64 | 364:rabi 65 | 365:el nino 66 | 366:commerci cyanid us 67 | 367:piraci 68 | 368:vitro fertil 69 | 369:anorexia nervosa bulimia 70 | 370:food drug law 71 | 371:health insur holist 72 | 372:nativ american casino 73 | 373:encrypt equip export 74 | 374:nobel prize winner 75 | 375:hydrogen energi 76 | 376:world court 77 | 377:cigar smoke 78 | 378:euro opposit 79 | 379:mainstream 80 | 380:obes medic treatment 81 | 381:altern medicin 82 | 382:hydrogen fuel automobil 83 | 383:mental ill drug 84 | 384:space station moon 85 | 385:hybrid fuel car 86 | 386:teach disabl children 87 | 387:radioact wast 88 | 388:organ soil enhanc 89 | 389:illeg technolog transfer 90 | 390:orphan drug 91 | 391:r d drug price 92 | 392:robot 93 | 393:merci kill 94 | 394:home school 95 | 395:tourism 96 | 396:sick build syndrom 97 | 397:automobil recal 98 | 398:dismantl europ arsen 99 | 399:oceanograph vessel 100 | 400:amazon rain forest 101 | 401:foreign minor germani 102 | 402:behavior genet 103 | 403:osteoporosi 104 | 404:ireland peac talk 105 | 405:cosmic event 106 | 406:parkinson diseas 107 | 407:poach wildlif preserv 108 | 408:tropic storm 109 | 409:legal pan am 103 110 | 410:schengen agreement 111 | 411:salvag shipwreck treasur 112 | 412:airport secur 113 | 413:steel product 114 | 414:cuba sugar export 115 | 415:drug golden triangl 116 | 416:three gorg project 117 | 417:creativ 118 | 418:quilt incom 119 | 419:recycl 
automobil tire 120 | 420:carbon monoxid poison 121 | 421:industri wast dispos 122 | 422:art stolen forg 123 | 423:milosev mirjana markov 124 | 424:suicid 125 | 425:counterfeit monei 126 | 426:law enforc dog 127 | 427:uv damag ey 128 | 428:declin birth rate 129 | 429:legionnair diseas 130 | 430:killer bee attack 131 | 431:robot technolog 132 | 432:profil motorist polic 133 | 433:greek philosophi stoicism 134 | 434:estonia economi 135 | 435:curb popul growth 136 | 436:railwai accid 137 | 437:deregul ga electr 138 | 438:tourism increas 139 | 439:invent scientif discoveri 140 | 440:child labor 141 | 441:lyme diseas 142 | 442:heroic act 143 | 443:u. invest africa 144 | 444:supercrit fluid 145 | 445:women clergi 146 | 446:tourist violenc 147 | 447:stirl engin 148 | 448:ship loss 149 | 449:antibiot ineffect 150 | 450:king hussein peac 151 | 601:turkei iraq water 152 | 602:czech slovak sovereignti 153 | 603:tobacco cigarett lawsuit 154 | 604:lyme diseas arthriti 155 | 605:great britain health care 156 | 606:leg trap ban 157 | 607:human genet code 158 | 608:tax social secur 159 | 609:per capita alcohol consumpt 160 | 610:minimum wage advers impact 161 | 611:kurd germani violenc 162 | 612:tibet protest 163 | 613:berlin wall dispos 164 | 614:flavr savr tomato 165 | 615:timber export asia 166 | 616:volkswagen mexico 167 | 617:russia cuba economi 168 | 618:ayatollah khomeini death 169 | 619:winni mandela scandal 170 | 620:franc nuclear test 171 | 621:women ordain church england 172 | 622:price fix 173 | 623:toxic chemic weapon 174 | 624:sdi star war 175 | 625:arrest bomb wtc 176 | 626:human stamped 177 | 627:russian food crisi 178 | 628:u. invas panama 179 | 629:abort clinic attack 180 | 630:gulf war syndrom 181 | 631:mandela south africa presid 182 | 632:southeast asia tin mine 183 | 633:welsh devolut 184 | 634:l tryptophan death 185 | 635:doctor assist suicid 186 | 636:juri duti exempt 187 | 637:human growth hormon hgh 188 | 638:wrong convict 189 | 639:consum line shop 190 | 640:matern leav polici 191 | 641:valdez wildlif marin life 192 | 642:tiananmen squar protest 193 | 643:salmon dam pacif northwest 194 | 644:exot anim import 195 | 645:softwar piraci 196 | 646:food stamp increas 197 | 647:windmil electr 198 | 648:famili leav law 199 | 649:comput virus 200 | 650:tax evas indict 201 | 651:u. ethnic popul 202 | 652:oic balkan 1990 203 | 653:eta basqu terror 204 | 654:same sex school 205 | 655:add diagnosi treatment 206 | 656:lead poison children 207 | 657:school prayer ban 208 | 658:teenag pregnanc 209 | 659:cruis health safeti 210 | 660:whale watch california 211 | 661:melanoma treatment caus 212 | 662:telemarket protect 213 | 663:agent orang exposur 214 | 664:american indian museum 215 | 665:poverti africa sub sahara 216 | 666:thatcher resign impact 217 | 667:unmarri partner household 218 | 668:poverti diseas 219 | 669:islam revolut 220 | 670:u. elect apathi 221 | 671:salvat armi benefit 222 | 672:nra membership profil 223 | 673:soviet withdraw afghanistan 224 | 674:greenpeac prosecut 225 | 675:olymp train swim 226 | 676:poppi cultiv 227 | 677:lean tower pisa 228 | 678:joint custodi impact 229 | 679:open adopt record 230 | 680:immigr spanish school 231 | 681:wind power locat 232 | 682:adult immigr english 233 | 683:czechoslovakia breakup 234 | 684:part time benefit 235 | 685:oscar winner select 236 | 686:argentina peg dollar 237 | 687:northern ireland industri 238 | 688:non u. 
media bia 239 | 689:famili plan aid 240 | 690:colleg educ advantag 241 | 691:clear cut forest 242 | 692:prostat cancer detect treatment 243 | 693:newspap electron media 244 | 694:compost pile 245 | 695:white collar crime sentenc 246 | 696:safeti plastic surgeri 247 | 697:air traffic control 248 | 698:literaci rate africa 249 | 699:term limit 250 | 700:gasolin tax u. 251 | -------------------------------------------------------------------------------- /geesedb/resources/topics.py: -------------------------------------------------------------------------------- 1 | def get_topics_backgroundlinking(file_name): 2 | with open(file_name) as topics_file: 3 | return [topic.strip().split(':') for topic in topics_file.readlines()] -------------------------------------------------------------------------------- /geesedb/search/__init__.py: -------------------------------------------------------------------------------- 1 | from .retrieval_models.bag_of_words.disjunctive.robertson_bm25 import RobertsonBM25 2 | from .searcher import Searcher 3 | 4 | __all__ = ['RobertsonBM25', 'Searcher'] 5 | -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/search/retrieval_models/__init__.py -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/bag_of_words/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/search/retrieval_models/bag_of_words/__init__.py -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/bag_of_words/aggregate.py: -------------------------------------------------------------------------------- 1 | class Aggregate: 2 | 3 | def __init__(self) -> None: 4 | pass 5 | 6 | def get_aggregator(self) -> str: 7 | raise NotImplementedError("You should implement this method in your retrieval model class.") 8 | 9 | def get_create_ranked_list(self, n: int) -> str: 10 | raise NotImplementedError("You should implement this method in your retrieval model class.") 11 | -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/bag_of_words/bow_retrieval_model.py: -------------------------------------------------------------------------------- 1 | from .aggregate import Aggregate 2 | from ..generic_text_retrieval_model import GenericTextRetrievalModel 3 | 4 | 5 | class BagOfWordsRetrievalModel(GenericTextRetrievalModel, Aggregate): 6 | def __init__(self) -> None: 7 | GenericTextRetrievalModel.__init__(self) 8 | Aggregate.__init__(self) 9 | 10 | def construct_query(self, topic: str) -> str: 11 | super_query = super().construct_query(topic) 12 | return super_query + ", qterms AS (" \ 13 | "SELECT term_doc.term_id, term_doc.doc_id, term_doc.tf, qtermids.df " \ 14 | "FROM term_doc " \ 15 | "JOIN qtermids " \ 16 | "ON term_doc.term_id = qtermids.term_id" \ 17 | ") " 18 | 19 | def get_retrieval_model(self) -> str: 20 | return super().get_retrieval_model() 21 | 22 | def get_aggregator(self) -> str: 23 | return ", scores AS (" \ 24 | "SELECT subscores.collection_id, SUM(subscores.subscore) AS score " \ 25 | "FROM subscores " \ 26 | "GROUP BY 
subscores.collection_id) " 27 | 28 | def get_create_ranked_list(self, n: int) -> str: 29 | return "SELECT scores.collection_id, scores.score " \ 30 | "FROM scores " \ 31 | "ORDER BY scores.score DESC " \ 32 | f"LIMIT {n}" 33 | -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/bag_of_words/conjunctive/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/search/retrieval_models/bag_of_words/conjunctive/__init__.py -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/bag_of_words/disjunctive/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/search/retrieval_models/bag_of_words/disjunctive/__init__.py -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/bag_of_words/disjunctive/disjunctive_retieval_model.py: -------------------------------------------------------------------------------- 1 | from ..bow_retrieval_model import BagOfWordsRetrievalModel 2 | 3 | 4 | class DisjunctiveRetrievalModel(BagOfWordsRetrievalModel): 5 | 6 | def __init__(self) -> None: 7 | super().__init__() 8 | 9 | def get_aggregator(self) -> str: 10 | return super().get_aggregator() 11 | 12 | def construct_query(self, topic: str) -> str: 13 | return super().construct_query(topic) + \ 14 | ", condocs AS (" \ 15 | "SELECT qterms.doc_id " \ 16 | "FROM qterms " \ 17 | "GROUP BY qterms.doc_id)" 18 | 19 | def get_create_ranked_list(self, n: int) -> str: 20 | return super().get_create_ranked_list(n) 21 | 22 | def get_retrieval_model(self) -> str: 23 | return super().get_retrieval_model() 24 | -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/bag_of_words/disjunctive/robertson_bm25.py: -------------------------------------------------------------------------------- 1 | from .disjunctive_retieval_model import DisjunctiveRetrievalModel 2 | 3 | 4 | class RobertsonBM25(DisjunctiveRetrievalModel): 5 | def __init__(self, k1: float = 0.9, b: float = 0.4, n: int = 1000) -> None: 6 | DisjunctiveRetrievalModel.__init__(self) 7 | self.k1 = k1 8 | self.b = b 9 | self.n = n 10 | 11 | def construct_query(self, topic: str) -> str: 12 | return DisjunctiveRetrievalModel.construct_query(self, topic) + \ 13 | self.get_retrieval_model() + \ 14 | DisjunctiveRetrievalModel.get_aggregator(self) + \ 15 | DisjunctiveRetrievalModel.get_create_ranked_list(self, self.n) 16 | 17 | def get_retrieval_model(self) -> str: 18 | return ", subscores AS (" \ 19 | "SELECT docs.collection_id, " \ 20 | f"(LOG(((SELECT count(*) from docs)-df+0.5)/(df+0.5))*tf" \ 21 | "/" \ 22 | f"(tf+{self.k1}*(1-{self.b}+{self.b}*len/(SELECT AVG(len) from docs)))" \ 23 | ") AS subscore " \ 24 | "FROM qterms " \ 25 | "JOIN condocs " \ 26 | "ON qterms.doc_id = condocs.doc_id " \ 27 | "JOIN docs " \ 28 | "ON qterms.doc_id = docs.doc_id)" 29 | -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/generic_text_retrieval_model.py: -------------------------------------------------------------------------------- 1 | class GenericTextRetrievalModel: 2 | 3 | def __init__(self) -> None: 4 | pass 5 | 6 | def 
construct_query(self, topic: str) -> str: 7 | return "WITH qtermids AS (" \ 8 | "SELECT term_dict.term_id, term_dict.df " \ 9 | "FROM term_dict " \ 10 | "WHERE term_dict.string IN ('{}')" \ 11 | ")".format("', '".join(topic.split(' '))) 12 | 13 | def get_retrieval_model(self) -> str: 14 | raise NotImplementedError("You should implement this method in your retrieval model class.") 15 | -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/graph/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/search/retrieval_models/graph/__init__.py -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/positional/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/search/retrieval_models/positional/__init__.py -------------------------------------------------------------------------------- /geesedb/search/searcher.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import argparse 4 | from typing import Any, Callable, Union 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from ..connection import get_connection 10 | from ..search import RobertsonBM25 11 | 12 | 13 | class Searcher: 14 | 15 | def __init__(self, **kwargs: Any) -> None: 16 | self.arguments = self.get_arguments(kwargs) 17 | self.db_connection = get_connection(self.arguments['database']) 18 | self.ranking_method = None 19 | self.fetch = self.set_return_type() 20 | if self.arguments['retrieval_method'] == 'BM25_robertson': 21 | self.ranking_method = RobertsonBM25(self.arguments['k1'], self.arguments['b'], self.arguments['n']) 22 | 23 | @staticmethod 24 | def get_arguments(kwargs: Any) -> dict: 25 | arguments = { 26 | 'database': None, 27 | 'retrieval_method': 'BM25_robertson', 28 | 'k1': 0.9, 29 | 'b': 0.4, 30 | 'n': 1000, 31 | 'return_type': 'data_frame' 32 | } 33 | for key, item in arguments.items(): 34 | if kwargs.get(key) is not None: 35 | arguments[key] = kwargs.get(key) 36 | if arguments['database'] is None: 37 | raise IOError('database path needs to be provided') 38 | return arguments 39 | 40 | def set_return_type(self) -> Callable[[], Union[list, pd.DataFrame, np.ndarray]]: 41 | if self.arguments['return_type'] == 'list': 42 | fetch = self.db_connection.cursor.fetchall 43 | elif self.arguments['return_type'] == 'numpy': 44 | fetch = self.db_connection.cursor.fetchnumpy 45 | else: 46 | fetch = self.db_connection.cursor.fetchdf 47 | return fetch 48 | 49 | def set_k1(self, k1: float): 50 | self.arguments['k1'] = k1 51 | 52 | def set_b(self, b: float): 53 | self.arguments['b'] = b 54 | 55 | def set_n(self, n: int): 56 | self.arguments['n'] = n 57 | 58 | def search_topic(self, topic: str) -> Union[list, pd.DataFrame, np.ndarray]: 59 | query = self.ranking_method.construct_query(topic) 60 | self.db_connection.cursor.execute(query) 61 | return self.fetch() 62 | 63 | 64 | if __name__ == '__main__': 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument('-d', 67 | '--database', 68 | required=True, 69 | help='Name of the database / index') 70 | parser.add_argument('-r', 71 | '--retrieval_method', 72 | choices=['BM25_robertson'], 73 | help="Use Robertson's BM25 
ranking function") 74 | parser.add_argument('-k1') 75 | parser.add_argument('-b') 76 | parser.add_argument('-n') 77 | parser.add_argument('-t', 78 | '--return_type', 79 | choices=['numpy', 'data_frame', 'list'] 80 | ) 81 | Searcher(**vars(parser.parse_args())) 82 | -------------------------------------------------------------------------------- /geesedb/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/tests/__init__.py -------------------------------------------------------------------------------- /geesedb/tests/connection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/tests/connection/__init__.py -------------------------------------------------------------------------------- /geesedb/tests/connection/test_connection.py: -------------------------------------------------------------------------------- 1 | from ...connection import get_connection, close_connection 2 | 3 | 4 | def test_create_connection() -> None: 5 | db_connection = get_connection(':memory:') 6 | cursor = db_connection.cursor 7 | cursor.execute("SELECT 1;") 8 | assert cursor.fetchone() == (1,) 9 | close_connection() 10 | -------------------------------------------------------------------------------- /geesedb/tests/index/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/tests/index/__init__.py -------------------------------------------------------------------------------- /geesedb/tests/index/test_authors_from_csv.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | from ...index import AuthorsFromCSV 4 | from ...connection import close_connection 5 | 6 | 7 | def test_load_csv_example_files() -> None: 8 | index = AuthorsFromCSV(database=':memory:', 9 | doc_author_file=path.dirname( 10 | path.dirname(__file__)) + '/resources/csv/example_doc_author.csv' 11 | ) 12 | 13 | index.connection.execute("SELECT * FROM doc_author;") 14 | assert index.connection.fetchone() == ('b2e89334-33f9-11e1-825f-dabc29fd7071', 'Mark Giannotto') 15 | close_connection() 16 | -------------------------------------------------------------------------------- /geesedb/tests/index/test_entities_from_csv.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | from ...index import EntitiesFromCSV 4 | from ...connection import close_connection 5 | 6 | def test_load_csv_example_files() -> None: 7 | index = EntitiesFromCSV(database=':memory:', 8 | entity_doc_file=path.dirname( 9 | path.dirname(__file__)) + '/resources/csv/example_entity_doc.csv' 10 | ) 11 | 12 | index.connection.execute("SELECT * FROM entity_doc;") 13 | assert index.connection.fetchone() == (0, 11, 'Danny Coale', 'Danny_Coale', 'PER', 14 | 'b2e89334-33f9-11e1-825f-dabc29fd7071') 15 | close_connection() 16 | -------------------------------------------------------------------------------- /geesedb/tests/index/test_fulltext_from_ciff.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | from ...index import FullTextFromCiff 4 | from ...connection import close_connection 5 | 
6 | def test_load_csv_example_files() -> None: 7 | index = FullTextFromCiff(database=':memory:', 8 | protobuf_file=path.dirname(path.dirname(__file__) 9 | ) + '/resources/ciff/toy-complete-20200309.ciff.gz' 10 | ) 11 | index.load_data() 12 | index.cursor.execute("SELECT * FROM docs;") 13 | assert index.cursor.fetchone() == ('WSJ_1', 0, 6) 14 | assert index.cursor.fetchone() == ('TREC_DOC_1', 1, 4) 15 | close_connection() 16 | -------------------------------------------------------------------------------- /geesedb/tests/index/test_fulltext_from_csv.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | from ...index import FullTextFromCSV 4 | from ...connection import close_connection 5 | 6 | def test_load_csv_example_files() -> None: 7 | index = FullTextFromCSV(database=':memory:', 8 | docs_file=path.dirname(path.dirname(__file__)) + '/resources/csv/example_docs.csv', 9 | term_dict_file=path.dirname( 10 | path.dirname(__file__)) + '/resources/csv/example_term_dict.csv', 11 | term_doc_file=path.dirname(path.dirname(__file__)) + '/resources/csv/example_term_doc.csv' 12 | ) 13 | index.load_data() 14 | index.connection.execute("SELECT * FROM docs;") 15 | assert index.connection.fetchone() == ('document_0', 0, 3) 16 | assert index.connection.fetchone() == ('document_1', 1, 4) 17 | close_connection() 18 | 19 | 20 | def test_load_csv_use_existing_database_does_not_exist() -> None: 21 | try: 22 | FullTextFromCSV(database='test_database', 23 | use_existing_db=True 24 | ) 25 | assert False 26 | except IOError: 27 | assert True 28 | close_connection() 29 | -------------------------------------------------------------------------------- /geesedb/tests/resources/ciff/toy-complete-20200309.ciff.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/tests/resources/ciff/toy-complete-20200309.ciff.gz -------------------------------------------------------------------------------- /geesedb/tests/resources/csv/example_doc_author.csv: -------------------------------------------------------------------------------- 1 | b2e89334-33f9-11e1-825f-dabc29fd7071|Mark Giannotto 2 | ca334a80-34d4-11e1-88f9-9084fc48c348|Mark Giannotto 3 | b4bf35ea-3585-11e1-836b-08c4de636de4|Mark Giannotto 4 | c5f7c1be-368a-11e1-9ee3-fd35588e7629|Mark Giannotto 5 | 48b0a076-370d-11e1-9ee3-fd35588e7629|Mark Giannotto 6 | ffef8298-36e0-11e1-9ee3-fd35588e7629|Mark Giannotto 7 | 806f619a-3975-11e1-81ef-eaf2bd09c8a2|Mark Giannotto 8 | 07c32036-3bf2-11e1-a72c-c808ebbd31f5|Mark Giannotto 9 | f46b5060-3efa-11e1-804a-d8db7cc3d3b2|Mark Giannotto 10 | 7aa5c316-4204-11e1-9091-3ad6d04900db|Mark Giannotto 11 | 8d091e8d2adec74088b89aea54e5fff0|Katie Carrera 12 | fb7979c77a02b72ba079628964216853|Katie Carrera 13 | 354dbc1b6ae6f1ead6d449005c0d46ff|Katie Carrera 14 | 6173699a05b6bb212bbf4cbe36de1a2e|Katie Carrera 15 | 5d86156e7e96e4c566cf5d130cfe23cb|Katie Carrera 16 | 70ffc51bf54d594b47a5839f736927b7|Katie Carrera 17 | 37e0233314d3955074b1359b7468e5d5|Katie Carrera 18 | d0e3d979102c23cd9b40761fca6402d8|Katie Carrera 19 | cd905bce531fac9140200b36b6791df6|Katie Carrera 20 | e4a267f731660d2f9e65a9ad7be57012|Katie Carrera 21 | 2056bbeebf83491579d386e2a50a979d|Katie Carrera 22 | d114557a24463b3dd81e41599d4deecf|Mark Berman 23 | 1fa1eaa46f116913ba9dbf3686c58a5b|Mark Berman 24 | e407b71ea43ef9b6f9ec98bfedf720e9|Mark Berman 25 | 68a63858c0a67d25ef243f4a0f6676a9|Chris 
Cillizza 26 | ab120f52c7b6412af966aafbe4718ae2|Chris Cillizza 27 | e96cd810bdd79b30d88447e92ea0d5c7|Chris Cillizza 28 | 8f70f98e4f6a6e89f1434eb8be266aba|Chris Cillizza 29 | 51bd464f9881b8bbc19c965dc9e34828|Chris Cillizza 30 | e8a6f0d880caf7a19974bd63581ce41d|Chris Cillizza 31 | -------------------------------------------------------------------------------- /geesedb/tests/resources/csv/example_docs.csv: -------------------------------------------------------------------------------- 1 | document_0|0|3 2 | document_1|1|4 -------------------------------------------------------------------------------- /geesedb/tests/resources/csv/example_entity_doc.csv: -------------------------------------------------------------------------------- 1 | 0|11|Danny Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 2 | 13|14|Jarrett Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 3 | 56|13|Virginia Tech|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 4 | 79|13|Virginia Tech|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 5 | 107|11|Danny Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 6 | 153|9|Episcopal|Episcopal_Academy|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 7 | 166|10|Alexandria|Alexandria,_Virginia|LOC|b2e89334-33f9-11e1-825f-dabc29fd7071 8 | 197|14|Jarrett Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 9 | 264|2|AP|AP_Poll|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 10 | 268|11|NEW ORLEANS|New_Orleans|LOC|b2e89334-33f9-11e1-825f-dabc29fd7071 11 | 293|13|Virginia Tech|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 12 | 445|14|North Carolina|North_Carolina|LOC|b2e89334-33f9-11e1-825f-dabc29fd7071 13 | 504|13|Virginia Tech|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 14 | 574|14|Jarrett Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 15 | 593|11|Danny Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 16 | 754|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 17 | 765|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 18 | 827|13|Virginia Tech|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 19 | 964|5|Macho|Macho_Harris|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 20 | 971|6|Harris|Franco_Harris|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 21 | 1024|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 22 | 1035|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 23 | 1152|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 24 | 1471|6|Hokies|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 25 | 1488|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 26 | 1529|13|Virginia Tech|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 27 | 1598|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 28 | 1703|12|Frank Beamer|Frank_Beamer|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 29 | 1745|5|Danny|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 30 | 1881|12|Logan Thomas|Logan_Thomas|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 31 | 1935|6|Beamer|Frank_Beamer|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 32 | 2023|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 33 | 2088|13|Virginia Tech|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 34 | 2130|6|Hokies|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 35 | 2231|12|David 
Wilson|David_Wilson_(American_football)|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 36 | 2251|3|ACC|Atlantic_Coast_Conference|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 37 | 2300|6|Thomas|Logan_Thomas|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 38 | 2433|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 39 | 2475|14|South Carolina|South_Carolina_Gamecocks_football|LOC|b2e89334-33f9-11e1-825f-dabc29fd7071 40 | 2505|14|Alshon Jeffrey|Alshon_Jeffery|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 41 | 2525|14|Oklahoma State|Oklahoma_State_Cowboys_football|LOC|b2e89334-33f9-11e1-825f-dabc29fd7071 42 | 2555|15|Justin Blackmon|Justin_Blackmon|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 43 | 3019|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 44 | 3082|3|ACC|Atlantic_Coast_Conference|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 45 | 3188|8|Facebook|Facebook|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 46 | 3200|7|Twitter|Twitter|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 47 | 3209|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 48 | 3258|6|Beamer|Frank_Beamer|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 49 | 3390|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 50 | 3449|10|Blacksburg|Blacksburg,_Virginia|LOC|b2e89334-33f9-11e1-825f-dabc29fd7071 51 | 3508|3|VMI|Virginia_Military_Institute|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 52 | 3572|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 53 | 3660|13|Virginia Tech|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 54 | 3700|3|NFL|National_Football_League|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 55 | 3713|11|Eddie Royal|Eddie_Royal|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 56 | 3726|11|Andre Davis|André_Davis|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 57 | 3742|11|Josh Morgan|Josh_Morgan|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 58 | 3962|6|Thomas|Logan_Thomas|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 59 | 4061|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 60 | 4072|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 61 | 4302|3|ACC|Atlantic_Coast_Conference|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 62 | 4335|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 63 | 4497|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 64 | 4581|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 65 | 31|8|New Year|Chinese_New_Year|MISC|749ec5b2-32f5-11e1-825f-dabc29fd7071 66 | 99|12|planet Venus|Venus|MISC|749ec5b2-32f5-11e1-825f-dabc29fd7071 67 | 158|8|New Year|Lunar_calendar|MISC|749ec5b2-32f5-11e1-825f-dabc29fd7071 68 | 367|5|Venus|Venus|PER|749ec5b2-32f5-11e1-825f-dabc29fd7071 69 | 511|5|Venus|Venus|PER|749ec5b2-32f5-11e1-825f-dabc29fd7071 70 | 586|5|Venus|Venus|PER|749ec5b2-32f5-11e1-825f-dabc29fd7071 71 | 713|7|Jupiter|Jupiter|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 72 | 771|19|Aries constellation|Aries_(constellation)|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 73 | 975|7|Jupiter|Jupiter|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 74 | 1091|10|Washington|Washington,_D.C.|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 75 | 1108|4|Mars|Mars|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 76 | 1117|6|Saturn|Saturn|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 77 | 1165|8|New Year|Chinese_New_Year|MISC|749ec5b2-32f5-11e1-825f-dabc29fd7071 78 | 1302|6|Saturn|Saturn|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 79 | 1381|5|Virgo|Virgo_(constellation)|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 80 | 1437|6|Saturn|Saturn|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 81 | 
1635|11|Quadrantids|Quadrantids|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 82 | 1810|33|International Meteor Organization|International_Meteor_Organization|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 83 | 2007|10|Big Dipper|Ursa_Major|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 84 | 2022|13|Little Dipper|Ursa_Minor|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 85 | 2096|11|Quadrantids|Quadrantids|MISC|749ec5b2-32f5-11e1-825f-dabc29fd7071 86 | 2140|3|IMO|International_Maritime_Organization|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 87 | 2278|10|open house|Open_house_(school)|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 88 | 2290|34|University of Maryland Observatory|University_of_Maryland_Observatory|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 89 | 2326|12|College Park|College_Park,_Maryland|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 90 | 2491|12|Star Atlases|Celestial_cartography|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 91 | 2548|32|Northern Virginia Astronomy Club|Northern_Virginia_Astronomy_Club|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 92 | 2616|23|George Mason University|George_Mason_University|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 93 | 2641|7|Fairfax|Fairfax,_Virginia|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 94 | 2724|9|Telescope|Telescope|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 95 | 2780|34|University of Maryland Observatory|University_of_Maryland_Observatory|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 96 | 2816|12|College Park|College_Park,_Maryland|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 97 | 2992|29|National Air and Space Museum|National_Air_and_Space_Museum|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 98 | 3023|13|National Mall|National_Mall|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 99 | 3191|10|open house|Open_house_(school)|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 100 | 3203|34|University of Maryland Observatory|University_of_Maryland_Observatory|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 101 | 3239|12|College Park|College_Park,_Maryland|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 102 | 3392|6|Bieber|Justin_Bieber|PER|749ec5b2-32f5-11e1-825f-dabc29fd7071 103 | 3403|4|Bono|Bono|PER|749ec5b2-32f5-11e1-825f-dabc29fd7071 104 | 3497|11|Takoma Park|Takoma_Park,_Maryland|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 105 | 3602|12|Solar System|Solar_System|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 106 | 3780|15|Albert Einstein|Albert_Einstein|PER|749ec5b2-32f5-11e1-825f-dabc29fd7071 107 | 3809|29|National Air and Space Museum|National_Air_and_Space_Museum|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 108 | 0|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 109 | 15|3|GOP|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 110 | 88|10|DES MOINES|Des_Moines,_Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 111 | 150|10|Republican|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 112 | 191|3|GOP|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 113 | 324|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 114 | 407|19|Des Moines Register|The_Des_Moines_Register|ORG|69654742-33d7-11e1-825f-dabc29fd7071 115 | 487|13|Massachusetts|Governor_of_Massachusetts|LOC|69654742-33d7-11e1-825f-dabc29fd7071 116 | 510|11|Mitt Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 117 | 592|3|Rep|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 118 | 597|8|Ron Paul|Ron_Paul|PER|69654742-33d7-11e1-825f-dabc29fd7071 119 | 607|4|Tex.|Texas|LOC|69654742-33d7-11e1-825f-dabc29fd7071 120 | 
651|12|Pennsylvania|Pennsylvania|LOC|69654742-33d7-11e1-825f-dabc29fd7071 121 | 672|13|Rick Santorum|Rick_Santorum|PER|69654742-33d7-11e1-825f-dabc29fd7071 122 | 961|5|House|United_States_House_of_Representatives|ORG|69654742-33d7-11e1-825f-dabc29fd7071 123 | 975|13|Newt Gingrich|Newt_Gingrich|PER|69654742-33d7-11e1-825f-dabc29fd7071 124 | 1004|5|Texas|Texas|LOC|69654742-33d7-11e1-825f-dabc29fd7071 125 | 1015|10|Rick Perry|Rick_Perry|PER|69654742-33d7-11e1-825f-dabc29fd7071 126 | 1044|3|Rep|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 127 | 1049|16|Michele Bachmann|Michele_Bachmann|PER|69654742-33d7-11e1-825f-dabc29fd7071 128 | 1067|5|Minn.|Minnesota|LOC|69654742-33d7-11e1-825f-dabc29fd7071 129 | 1453|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 130 | 1742|12|Barack Obama|Barack_Obama|PER|69654742-33d7-11e1-825f-dabc29fd7071 131 | 1757|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 132 | 1862|10|Mason City|Mason_City,_Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 133 | 1894|8|Santorum|Rick_Santorum|PER|69654742-33d7-11e1-825f-dabc29fd7071 134 | 1950|9|Indianola|Indianola,_Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 135 | 2165|4|Paul|Ron_Paul|PER|69654742-33d7-11e1-825f-dabc29fd7071 136 | 2207|9|Rand Paul|Rand_Paul|PER|69654742-33d7-11e1-825f-dabc29fd7071 137 | 2220|8|Kentucky|Kentucky|LOC|69654742-33d7-11e1-825f-dabc29fd7071 138 | 2280|8|Ron Paul|Ron_Paul|PER|69654742-33d7-11e1-825f-dabc29fd7071 139 | 2524|8|Gingrich|Newt_Gingrich|PER|69654742-33d7-11e1-825f-dabc29fd7071 140 | 2589|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 141 | 2616|9|super PAC|Political_action_committee|MISC|69654742-33d7-11e1-825f-dabc29fd7071 142 | 2889|9|tea party|Tea_Party_movement|ORG|69654742-33d7-11e1-825f-dabc29fd7071 143 | 2921|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 144 | 2961|5|Manly|Manly_Warringah_Sea_Eagles|LOC|69654742-33d7-11e1-825f-dabc29fd7071 145 | 2988|8|Gingrich|Newt_Gingrich|PER|69654742-33d7-11e1-825f-dabc29fd7071 146 | 3102|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 147 | 3110|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 148 | 3259|5|Davey|Martin_L._Davey|PER|69654742-33d7-11e1-825f-dabc29fd7071 149 | 3338|4|Newt|Newt_Gingrich|PER|69654742-33d7-11e1-825f-dabc29fd7071 150 | 3361|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 151 | 3478|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 152 | 3614|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 153 | 3668|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 154 | 3825|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 155 | 3893|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 156 | 3938|3|GOP|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 157 | 3971|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 158 | 4039|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 159 | 4078|13|New Hampshire|New_Hampshire|LOC|69654742-33d7-11e1-825f-dabc29fd7071 160 | 4169|4|Paul|Ron_Paul|PER|69654742-33d7-11e1-825f-dabc29fd7071 161 | 4177|8|Santorum|Rick_Santorum|PER|69654742-33d7-11e1-825f-dabc29fd7071 162 | 4236|8|Gingrich|Newt_Gingrich|PER|69654742-33d7-11e1-825f-dabc29fd7071 163 | 4253|5|Perry|Rick_Perry|PER|69654742-33d7-11e1-825f-dabc29fd7071 164 | 4285|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 165 | 4332|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 166 | 4397|13|New 
Hampshire|New_Hampshire_primary|LOC|69654742-33d7-11e1-825f-dabc29fd7071 167 | 4471|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 168 | 4531|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 169 | 4590|10|Republican|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 170 | 4964|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 171 | 5036|8|Ron Paul|Ron_Paul|PER|69654742-33d7-11e1-825f-dabc29fd7071 172 | 5049|11|Mitt Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 173 | 5165|15|Steve Scheffler|Steve_Scheffler|PER|69654742-33d7-11e1-825f-dabc29fd7071 174 | 5199|32|Iowa Faith and Freedom Coalition|Faith_and_Freedom_Coalition|ORG|69654742-33d7-11e1-825f-dabc29fd7071 175 | 5343|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 176 | 5519|3|GOP|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 177 | 5826|7|America|United_States|LOC|69654742-33d7-11e1-825f-dabc29fd7071 178 | 5880|9|Scheffler|Steve_Scheffler|PER|69654742-33d7-11e1-825f-dabc29fd7071 179 | 6061|10|Adam Gregg|Adam_Gregg|PER|69654742-33d7-11e1-825f-dabc29fd7071 180 | 6079|10|Des Moines|Des_Moines,_Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 181 | 6110|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 182 | 6149|7|Le Mars|Le_Mars,_Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 183 | 6193|5|Gregg|Judd_Gregg|PER|69654742-33d7-11e1-825f-dabc29fd7071 184 | 6299|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 185 | 6358|8|Santorum|Rick_Santorum|PER|69654742-33d7-11e1-825f-dabc29fd7071 186 | 6582|8|Santorum|Rick_Santorum|PER|69654742-33d7-11e1-825f-dabc29fd7071 187 | 6686|14|Craig Robinson|Craig_Robinson_(actor)|PER|69654742-33d7-11e1-825f-dabc29fd7071 188 | 6744|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 189 | 6749|10|Republican|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 190 | 6856|8|Santorum|Rick_Santorum|PER|69654742-33d7-11e1-825f-dabc29fd7071 191 | 6977|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 192 | 7205|9|Urbandale|Urbandale,_Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 193 | 7236|8|Santorum|Rick_Santorum|PER|69654742-33d7-11e1-825f-dabc29fd7071 194 | 7308|10|Iowa State|Iowa|ORG|69654742-33d7-11e1-825f-dabc29fd7071 195 | 7324|7|Rutgers|Rutgers_Scarlet_Knights_football|ORG|69654742-33d7-11e1-825f-dabc29fd7071 196 | 7339|22|New Era Pinstripe Bowl|Pinstripe_Bowl|MISC|69654742-33d7-11e1-825f-dabc29fd7071 197 | 7363|11|Evangelical|Evangelicalism|MISC|69654742-33d7-11e1-825f-dabc29fd7071 198 | 7463|12|evangelicals|Evangelicalism|MISC|69654742-33d7-11e1-825f-dabc29fd7071 199 | 7518|8|Arkansas|Arkansas|LOC|69654742-33d7-11e1-825f-dabc29fd7071 200 | 7536|13|Mike Huckabee|Mike_Huckabee|PER|69654742-33d7-11e1-825f-dabc29fd7071 201 | 7565|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 202 | 7733|8|Huckabee|Mike_Huckabee|PER|69654742-33d7-11e1-825f-dabc29fd7071 203 | 7802|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 204 | 8040|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 205 | 8109|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 206 | 8119|14|Terry Branstad|Terry_Branstad|PER|69654742-33d7-11e1-825f-dabc29fd7071 207 | 8139|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 208 | 8263|13|Supreme Court|Supreme_Court_of_the_United_States|ORG|69654742-33d7-11e1-825f-dabc29fd7071 209 | 8852|7|Florida|Florida|LOC|69654742-33d7-11e1-825f-dabc29fd7071 210 | 9019|5|Perry|Rick_Perry|PER|69654742-33d7-11e1-825f-dabc29fd7071 211 | 
9029|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 212 | 9122|8|Gingrich|Newt_Gingrich|PER|69654742-33d7-11e1-825f-dabc29fd7071 213 | 9294|8|Gingrich|Newt_Gingrich|PER|69654742-33d7-11e1-825f-dabc29fd7071 214 | 9625|8|Gingrich|Newt_Gingrich|PER|69654742-33d7-11e1-825f-dabc29fd7071 215 | 9705|3|T.W|Taiwan|PER|69654742-33d7-11e1-825f-dabc29fd7071 216 | 9710|6|Farnam|Farnam,_Nebraska|PER|69654742-33d7-11e1-825f-dabc29fd7071 217 | 9720|10|Washington|Washington,_D.C.|LOC|69654742-33d7-11e1-825f-dabc29fd7071 218 | 9735|11|Amy Gardner|Amy_Gardner|PER|69654742-33d7-11e1-825f-dabc29fd7071 219 | 9791|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 220 | 0|9|John Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 221 | 11|7|Wizards|Washington_Wizards|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 222 | 49|7|Wizards|Washington_Wizards|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 223 | 57|3|NBA|National_Basketball_Association|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 224 | 62|4|John|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 225 | 68|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 226 | 141|10|Washington|Washington_Wizards|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 227 | 158|13|Flip Saunders|Flip_Saunders|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 228 | 305|16|Associated Press|Associated_Press_College_Basketball_Player_of_the_Year|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 229 | 450|9|John Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 230 | 539|14|Bradley Center|Bradley_Center|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 231 | 555|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 232 | 654|18|Washington Wizards|Washington_Wizards|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 233 | 685|15|Milwaukee Bucks|Milwaukee_Bucks|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 234 | 859|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 235 | 1064|13|Flip Saunders|Flip_Saunders|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 236 | 1295|8|Saunders|Flip_Saunders|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 237 | 1405|4|John|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 238 | 1606|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 239 | 1833|5|Bucks|Milwaukee_Bucks|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 240 | 1840|9|Milwaukee|Milwaukee_Bucks|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 241 | 1881|10|Beno Udrih|Beno_Udrih|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 242 | 1909|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 243 | 1959|12|Ronny Turiaf|Ronny_Turiaf|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 244 | 1973|13|Hamady Ndiaye|Hamady_N'Diaye|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 245 | 1996|11|Koichi Sato|Kōichi_Satō_(actor)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 246 | 2009|12|JaVale McGee|JaVale_McGee|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 247 | 2026|14|Kevin Seraphin|Kevin_Séraphin|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 248 | 2091|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 249 | 2575|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 250 | 2875|7|Wizards|Washington_Wizards|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 251 | 3039|14|Andray Blatche|Andray_Blatche|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 252 | 3181|10|Washington|Washington_Wizards|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 253 | 3221|15|Jordan Crawford|Jordan_Crawford|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 254 | 
3240|7|Atlanta|Atlanta_Hawks|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 255 | 3252|10|Nick Young|Nick_Young_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 256 | 3266|9|Milwaukee|Milwaukee_Bucks|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 257 | 3304|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 258 | 3316|8|Saunders|Flip_Saunders|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 259 | 3364|7|Wizards|Washington_Wizards|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 260 | 3399|7|Atlanta|Atlanta_Hawks|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 261 | 3538|3|NBA|National_Basketball_Association|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 262 | 3559|7|Wizards|Washington_Wizards|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 263 | 3737|10|New Jersey|Brooklyn_Nets|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 264 | 3790|7|Atlanta|Atlanta_Hawks|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 265 | 3864|9|Milwaukee|Milwaukee_Bucks|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 266 | 3906|7|Wizards|Washington_Wizards|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 267 | 4036|6|Boston|Boston_Celtics|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 268 | 4075|14|Verizon Center|Capital_One_Arena|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 269 | 4103|7|Orlando|Orlando_Magic|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 270 | 4127|8|New York|New_York_Knicks|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 271 | 4167|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 272 | 4338|8|Saunders|Flip_Saunders|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 273 | 4361|13|Randy Wittman|Randy_Wittman|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 274 | 4414|9|Milwaukee|Milwaukee|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 275 | 4437|13|Maurice Evans|Maurice_Evans_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 276 | 4455|13|Rashard Lewis|Rashard_Lewis|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 277 | 4562|5|Young|Nick_Young_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 278 | 4792|8|Saunders|Flip_Saunders|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 279 | 4965|8|Saunders|Flip_Saunders|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 280 | 13|8|Maryland|Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 281 | 243|8|Maryland|Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 282 | 342|8|Maryland|Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 283 | 434|16|Carl S. 
Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 284 | 713|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 285 | 1020|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 286 | 1211|14|New Carrollton|New_Carrollton,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 287 | 1239|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 288 | 1341|14|New Carrollton|New_Carrollton,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 289 | 1485|5|Clapp|Moses_E._Clapp|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 290 | 1496|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 291 | 1767|5|Clapp|Moses_E._Clapp|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 292 | 1884|5|Clapp|Moses_E._Clapp|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 293 | 2246|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 294 | 2506|5|Clapp|Moses_E._Clapp|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 295 | 2673|5|Clapp|Moses_E._Clapp|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 296 | 2827|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 297 | 2854|10|Chapter 11|Chapter_11,_Title_11,_United_States_Code|MISC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 298 | 2885|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 299 | 2969|8|Woodview|Woodview|ORG|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 300 | 3083|12|Jack Johnson|Jack_Johnson_(boxer)|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 301 | 3177|5|Clapp|Moses_E._Clapp|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 302 | 3410|5|Clapp|Moses_E._Clapp|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 303 | 3591|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 304 | 3647|5|Clapp|Moses_E._Clapp|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 305 | 3658|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 306 | 3763|8|Landover|Landover,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 307 | 3813|7|Addison|Addison,_Texas|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 308 | 3824|15|Capitol Heights|Capitol_Heights,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 309 | 4067|8|Williams|Carl_S._Williams|ORG|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 310 | 4186|15|Martin O’Malley|Martin_O'Malley|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 311 | 4272|19|Anne Arundel County|Anne_Arundel_County,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 312 | 4309|5|Metro|Washington_Metro|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 313 | 4343|8|O’Malley|Martin_O'Malley|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 314 | 4481|14|New Carrollton|New_Carrollton,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 315 | 4684|14|New Carrollton|New_Carrollton,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 316 | 4858|13|Grand Central|Grand_Central_Terminal|ORG|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 317 | 4981|30|Department of General Services|California_Department_of_General_Services|ORG|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 318 | 5121|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 319 | 5456|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 320 | 5499|17|Montgomery County|Montgomery_County,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 321 | 5663|14|New Carrollton|New_Carrollton,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 322 | 5757|9|Baltimore|Baltimore|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 323 | 6108|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 324 | 6161|19|The Washington Post|The_Washington_Post|ORG|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 325 | 
6311|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 326 | 6464|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 327 | 6598|19|The Washington Post|The_Washington_Post|ORG|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 328 | 6923|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 329 | 7190|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 330 | 7423|16|Jennifer Jenkins|Jennifer_DeLonge|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 331 | -------------------------------------------------------------------------------- /geesedb/tests/resources/csv/example_term_dict.csv: -------------------------------------------------------------------------------- 1 | 0|0|2 2 | 1|Hello|2 -------------------------------------------------------------------------------- /geesedb/tests/resources/csv/example_term_doc.csv: -------------------------------------------------------------------------------- 1 | 0|0|1 2 | 0|1|1 3 | 1|0|2 4 | 1|1|3 -------------------------------------------------------------------------------- /geesedb/tests/resources/queries/gql/1: -------------------------------------------------------------------------------- 1 | MATCH (d:docs {collection_id:"96ab542e-6a07-11e6-ba32-5a4bf5aad4fa"})-[]-(:authors)-[]-(:docs)-[]-(a:authors) 2 | RETURN DISTINCT a.author -------------------------------------------------------------------------------- /geesedb/tests/resources/queries/gql/2: -------------------------------------------------------------------------------- 1 | MATCH (a:authors) 2 | RETURN a.author -------------------------------------------------------------------------------- /geesedb/tests/resources/queries/gql/4: -------------------------------------------------------------------------------- 1 | MATCH (d:docs)-[]-(:authors)-[]-(d2:docs) 2 | WHERE d.collection_id = "96ab542e-6a07-11e6-ba32-5a4bf5aad4fa" 3 | RETURN DISTINCT d2.collection_id -------------------------------------------------------------------------------- /geesedb/tests/resources/queries/gql/5: -------------------------------------------------------------------------------- 1 | MATCH (d:docs {collection_id: ?})-[]-(t:term_dict) 2 | RETURN string 3 | ORDER BY tf*log(671945/df) 4 | DESC 5 | LIMIT 5 -------------------------------------------------------------------------------- /geesedb/tests/resources/queries/gql/6: -------------------------------------------------------------------------------- 1 | MATCH (d:docs {collection_id: "96ab542e-6a07-11e6-ba32-5a4bf5aad4fa"})-[]-(e:entities) 2 | RETURN mention 3 | ORDER BY start 4 | LIMIT 5 -------------------------------------------------------------------------------- /geesedb/tests/resources/queries/gql/7: -------------------------------------------------------------------------------- 1 | MATCH (n:Actor) 2 | RETURN n.name AS name 3 | UNION 4 | MATCH (n:Movie) 5 | RETURN n.title AS name -------------------------------------------------------------------------------- /geesedb/tests/resources/queries/sql/1: -------------------------------------------------------------------------------- 1 | SELECT distinct a.author 2 | FROM authors AS a 3 | JOIN doc_author AS da ON (a.author = da.author) 4 | JOIN docs AS d0 ON (d0.collection_id = da.doc) 5 | JOIN doc_author as da2 ON (d0.collection_id = da2.doc) 6 | JOIN authors as a2 ON (da2.author = a2.author) 7 | JOIN doc_author as da3 ON (a2.author = da3.author) 8 | JOIN docs as d ON (d.collection_id = da3.doc) 9 | WHERE d.collection_id = '96ab542e-6a07-11e6-ba32-5a4bf5aad4fa' 
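The GQL fixtures above pair by number with the SQL fixtures that follow (gql/1 with sql/1, gql/2 with sql/2, gql/4 with sql/4, and gql/5 with sql/5); each SQL file is the relational translation of the corresponding Cypher query, while gql/6, gql/7, and sql/3 stand alone. A minimal sketch of driving that translation through the interpreter package, assuming it exposes a Translator class with a translate method (these names are illustrative and not confirmed by this listing):

    from geesedb.interpreter import Translator  # assumed entry point into geesedb/interpreter/translate.py

    translator = Translator('/path/to/database')  # hypothetical GeeseDB database file
    with open('geesedb/tests/resources/queries/gql/1') as f:
        cypher_query = f.read()
    sql_query = translator.translate(cypher_query)  # expected to be equivalent to queries/sql/1
    print(sql_query)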
-------------------------------------------------------------------------------- /geesedb/tests/resources/queries/sql/2: -------------------------------------------------------------------------------- 1 | SELECT a.author 2 | FROM authors as a
-------------------------------------------------------------------------------- /geesedb/tests/resources/queries/sql/3: -------------------------------------------------------------------------------- 1 | WITH n AS (SELECT count(*) AS num_docs FROM docs) 2 | SELECT en.entity, en.tf * log(n.num_docs / entities.df) AS tfidf 3 | FROM n, 4 | (SELECT entity, count(*) AS tf 5 | FROM entity_doc 6 | WHERE doc_id = '96ab542e-6a07-11e6-ba32-5a4bf5aad4fa' 7 | GROUP BY entity 8 | ) AS en 9 | JOIN entities 10 | ON (entities.entity = en.entity) 11 | ORDER BY tfidf DESC 12 | LIMIT 5;
-------------------------------------------------------------------------------- /geesedb/tests/resources/queries/sql/4: -------------------------------------------------------------------------------- 1 | SELECT distinct d2.collection_id 2 | FROM docs AS d2 3 | JOIN doc_author as da2 ON (d2.collection_id = da2.doc) 4 | JOIN authors as a2 ON (da2.author = a2.author) 5 | JOIN doc_author as da3 ON (a2.author = da3.author) 6 | JOIN docs as d ON (d.collection_id = da3.doc) 7 | WHERE d.collection_id = '96ab542e-6a07-11e6-ba32-5a4bf5aad4fa'
-------------------------------------------------------------------------------- /geesedb/tests/resources/queries/sql/5: -------------------------------------------------------------------------------- 1 | SELECT term_dict.string 2 | FROM term_dict 3 | JOIN term_doc ON (term_dict.term_id = term_doc.term_id) 4 | JOIN docs ON (docs.doc_id = term_doc.doc_id) 5 | WHERE docs.collection_id = ? 6 | ORDER BY tf * log(671945/df) DESC 7 | LIMIT 5;
-------------------------------------------------------------------------------- /geesedb/tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/tests/utils/__init__.py
-------------------------------------------------------------------------------- /geesedb/tests/utils/ciff/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/tests/utils/ciff/__init__.py
-------------------------------------------------------------------------------- /geesedb/tests/utils/ciff/test_to_csv.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | from pathlib import Path 3 | 4 | from ....utils.ciff.to_csv import ToCSV 5 | 6 | 7 | def test_create_csv_from_ciff(tmp_path: Path) -> None: 8 | ToCSV( 9 | protobuf_file=path.dirname(path.dirname(path.dirname(__file__))) + '/resources/ciff/toy-complete-20200309.ciff.gz', 10 | output_docs=str(tmp_path) + '/docs.csv', 11 | output_term_dict=str(tmp_path) + '/term_dict.csv', 12 | output_term_doc=str(tmp_path) + '/term_doc.csv' 13 | ) 14 | with open(str(tmp_path) + '/docs.csv') as f: 15 | assert f.readline().strip() == 'WSJ_1|0|6' 16 | with open(str(tmp_path) + '/term_dict.csv') as f: 17 | assert f.readline().strip() == '0|01|1' 18 | with open(str(tmp_path) + '/term_doc.csv') as f: 19 | assert f.readline().strip() == '0|0|1' 20 |
-------------------------------------------------------------------------------- /geesedb/utils/__init__.py: --------------------------------------------------------------------------------
1 | from .ciff.to_csv import ToCSV 2 | from .ciff.to_ciff import ToCiff 3 | 4 | __all__ = ['ToCSV', 'ToCiff']
-------------------------------------------------------------------------------- /geesedb/utils/ciff/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/utils/ciff/__init__.py
-------------------------------------------------------------------------------- /geesedb/utils/ciff/to_ciff.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Any 3 | 4 | from tqdm import tqdm 5 | from ciff_toolkit.ciff_pb2 import Header, Posting, PostingsList, DocRecord 6 | from ciff_toolkit.write import CiffWriter 7 | 8 | from ...connection import get_connection 9 | 10 | 11 | class ToCiff: 12 | 13 | def __init__(self, **kwargs: Any) -> None: 14 | self.arguments = self.get_arguments(kwargs) 15 | db_connection = get_connection(self.arguments['database']) 16 | self.connection = db_connection.connection 17 | self.cursor = db_connection.cursor 18 | self.create_ciff() 19 | 20 | @staticmethod 21 | def get_arguments(kwargs: Any) -> dict: 22 | arguments = { 23 | 'database': None, 24 | 'ciff': None, 25 | 'docs': 'docs', 26 | 'term_dict': 'term_dict', 27 | 'term_doc': 'term_doc', 28 | 'batch_size': 1000, 29 | 'verbose': False, 30 | } 31 | for key, item in arguments.items(): 32 | if kwargs.get(key) is not None: 33 | arguments[key] = kwargs.get(key) 34 | return arguments 35 | 36 | def create_ciff(self) -> None: 37 | disable_tqdm = not self.arguments['verbose'] 38 | 39 | with CiffWriter(self.arguments['ciff']) as writer: 40 | header = self.get_ciff_header() 41 | writer.write_header(header) 42 | 43 | postings_lists = tqdm(self.get_ciff_postings_lists(), total=header.num_postings_lists, disable=disable_tqdm) 44 | writer.write_postings_lists(postings_lists) 45 | 46 | doc_records = tqdm(self.get_ciff_doc_records(), total=header.num_docs, disable=disable_tqdm) 47 | writer.write_documents(doc_records) 48 | 49 | def get_ciff_header(self): 50 | header = Header() 51 | header.version = 1 # We work with ciff v1 52 | # Table names come from the docs/term_dict/term_doc arguments instead of being hard-coded. 53 | self.cursor.execute(f"""SELECT COUNT(*) FROM {self.arguments['term_dict']}""") 54 | header.num_postings_lists = self.cursor.fetchone()[0] 55 | self.cursor.execute(f"""SELECT COUNT(*) FROM {self.arguments['docs']}""") 56 | header.num_docs = self.cursor.fetchone()[0] 57 | header.total_postings_lists = header.num_postings_lists 58 | header.total_docs = header.num_docs 59 | self.cursor.execute(f"""SELECT SUM(tf) FROM {self.arguments['term_doc']}""") 60 | header.total_terms_in_collection = self.cursor.fetchone()[0] 61 | header.average_doclength = header.total_terms_in_collection / header.num_docs 62 | header.description = f'GeeseDB database {self.arguments["database"]}' 63 | 64 | return header 65 | 66 | def get_ciff_postings_lists(self): 67 | self.cursor.execute(f""" 68 | SELECT df, string, list(row(doc_id, tf) ORDER BY doc_id) 69 | FROM {self.arguments['term_dict']} AS term_dict, {self.arguments['term_doc']} AS term_doc 70 | WHERE term_dict.term_id = term_doc.term_id 71 | GROUP BY term_dict.term_id, df, string 72 | ORDER BY string 73 | """) 74 | while batch := self.cursor.fetchmany(self.arguments['batch_size']): 75 | for df, term, postings in batch: 76 | postings_list = PostingsList() 77 | assert len(postings) == df 78 | cf = sum(p['tf'] for p in postings) 79 | postings_list.term = term 80 | postings_list.df = df 81 | postings_list.cf = cf 82 | old_id = 0 83 | for p in postings: 84 | posting = Posting() 85 | doc_id = p['doc_id']
86 | tf = p['tf'] 87 | posting.docid = doc_id - old_id # CIFF stores docid gaps (deltas), not absolute ids 88 | old_id = doc_id 89 | posting.tf = tf 90 | postings_list.postings.append(posting) 91 | 92 | yield postings_list 93 | 94 | def get_ciff_doc_records(self): 95 | self.cursor.execute(f""" 96 | SELECT doc_id, collection_id, len 97 | FROM {self.arguments['docs']} 98 | ORDER BY doc_id 99 | """) 100 | while batch := self.cursor.fetchmany(self.arguments['batch_size']): 101 | for doc_id, collection_id, length in batch: 102 | doc_record = DocRecord() 103 | doc_record.docid = doc_id 104 | doc_record.collection_docid = collection_id 105 | doc_record.doclength = length 106 | 107 | yield doc_record 108 | 109 | 110 | if __name__ == '__main__': 111 | parser = argparse.ArgumentParser() 112 | parser.add_argument('--database', required=True) 113 | parser.add_argument('--ciff', required=True) 114 | parser.add_argument('--docs') 115 | parser.add_argument('--term_dict') 116 | parser.add_argument('--term_doc') 117 | parser.add_argument('--batch_size', type=int) 118 | parser.add_argument('--verbose', action='store_true') 119 | args = parser.parse_args() 120 | ToCiff(**vars(args)) 121 |
-------------------------------------------------------------------------------- /geesedb/utils/ciff/to_csv.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import argparse 4 | from typing import Any 5 | 6 | from ciff_toolkit.read import CiffReader 7 | 8 | 9 | class ToCSV: 10 | """ 11 | Class for creating csv files that represent tables in the old dog paper: 12 | - https://dl.acm.org/doi/10.1145/2600428.2609460 13 | 14 | The files are created from a CIFF as described in: 15 | - https://arxiv.org/abs/2003.08276 16 | """ 17 | def __init__(self, **kwargs: Any) -> None: 18 | self.arguments = self.get_arguments(kwargs) 19 | self.create_csv_files() 20 | 21 | @staticmethod 22 | def get_arguments(kwargs: Any) -> dict: 23 | arguments = { 24 | 'protobuf_file': None, 25 | 'output_docs': 'docs.csv', 26 | 'output_term_dict': 'term_dict.csv', 27 | 'output_term_doc': 'term_doc.csv' 28 | } 29 | for key, item in arguments.items(): 30 | if kwargs.get(key) is not None: 31 | arguments[key] = kwargs.get(key) 32 | return arguments 33 | 34 | def create_csv_files(self) -> None: 35 | # CiffReader handles plain and gzipped CIFF input itself, so no manual file reading or varint decoding is needed here. 36 | with CiffReader(self.arguments['protobuf_file']) as reader: 37 | with open(self.arguments['output_term_dict'], 'w') as term_dict_writer, \ 38 | open(self.arguments['output_term_doc'], 'w') as term_doc_writer: 39 | for term_id, postings_list in enumerate(reader.read_postings_lists()): 40 | term_dict_writer.write(f'{term_id}|{postings_list.term}|{postings_list.df}\n') 41 | docid = 0 42 | for posting in postings_list.postings: 43 | docid += posting.docid # postings are gap-encoded; accumulate to recover absolute doc ids 44 | term_doc_writer.write(f'{term_id}|{docid}|{posting.tf}\n') 45 | 46 | with open(self.arguments['output_docs'], 'w') as docs_writer: 47 | for doc_record in reader.read_documents(): 48 | docs_writer.write(f'{doc_record.collection_docid}|{doc_record.docid}|{doc_record.doclength}\n') 49 | 50 | 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument('-p', 54 | '--protobuf_file', 55 | required=True, 56 | metavar='[file]', 57 | help='Location of the CIFF protobuf file; when given, the ' + 58 | 'output files for the term tables should also be specified.') 59 | parser.add_argument('-o', 60 | '--output_docs', 61 | metavar='[file]', 62 | help='Output csv file for the docs table.') 63 | parser.add_argument('-t', 64 | '--output_term_dict', 65 | metavar='[file]', 66 | help='Output csv file for the term dictionary table.') 67 | parser.add_argument('-e', 68 | '--output_term_doc', 69 | metavar='[file]', 70 | help='Output csv file for the term doc mapper table.') 71 | ToCSV(**vars(parser.parse_args())) 72 |
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | duckdb 2 | google>=2 3 | numpy 4 | pandas 5 | ciff-toolkit 6 | tqdm 7 | git+https://github.com/informagi/pycypher
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='geesedb', 5 | version='0.0.2', 6 | description='Graph Engine for Exploration and Search over Evolving DataBases', 7 | author='Chris Kamphuis', 8 | author_email='chris@cs.ru.nl', 9 | url='https://github.com/informagi/GeeseDB', 10 | install_requires=['duckdb', 'numpy', 'pandas', 'ciff-toolkit', 'tqdm', 11 | 'pycypher @ git+https://github.com/informagi/pycypher'], 12 | packages=find_packages(), 13 | include_package_data=True, 14 | package_data={'': ['qrels.*', 'topics.*']}, 15 | license='MIT License' 16 | ) 17 | --------------------------------------------------------------------------------
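Taken together, ToCSV and ToCiff form a round trip between the CIFF exchange format and the docs, term_dict, and term_doc tables. A minimal usage sketch, restricted to the keyword arguments visible in the two get_arguments tables above; the file names are placeholders:

    from geesedb.utils import ToCSV, ToCiff

    # CIFF -> CSV: writes docs.csv as collection_id|doc_id|len,
    # term_dict.csv as term_id|term|df, and term_doc.csv as term_id|doc_id|tf
    ToCSV(protobuf_file='toy-complete-20200309.ciff.gz',
          output_docs='docs.csv',
          output_term_dict='term_dict.csv',
          output_term_doc='term_doc.csv')

    # ... load the CSV files into a GeeseDB database (see the index package) ...

    # database -> CIFF: reads the three tables back and writes a gap-encoded CIFF file
    ToCiff(database='example_database.db', ciff='roundtrip.ciff.gz', batch_size=1000, verbose=True)

Note that ToCSV emits docs rows as collection_id|doc_id|len while ToCiff selects doc_id, collection_id, len from the docs table, so the CSV loader is presumably expected to map columns to the table schema by name rather than by position.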