├── .github └── workflows │ └── python-app.yml ├── .gitignore ├── LICENSE ├── README.md ├── docs ├── CNAME ├── _config.yml ├── background.md └── index.md ├── geesedb ├── __init__.py ├── cmd │ ├── __init__.py │ ├── gql.py │ └── sql.py ├── connection │ ├── __init__.py │ └── connection.py ├── index │ ├── __init__.py │ ├── authors_from_csv.py │ ├── entities_from_csv.py │ ├── fulltext_from_ciff.py │ ├── fulltext_from_csv.py │ └── utils.py ├── interpreter │ ├── __init__.py │ ├── metadata.py │ ├── parser.py │ └── translate.py ├── resources │ ├── __init__.py │ ├── topics-and-qrels │ │ ├── __init__.py │ │ ├── qrels.backgroundlinking18.txt │ │ ├── qrels.backgroundlinking19.txt │ │ ├── qrels.core17.txt │ │ ├── qrels.core18.txt │ │ ├── qrels.robust04.txt │ │ ├── topics.backgroundlinking18.processed.txt │ │ ├── topics.backgroundlinking19.processed.txt │ │ ├── topics.core17.processed.txt │ │ ├── topics.core18.processed.txt │ │ └── topics.robust04.processed.txt │ └── topics.py ├── search │ ├── __init__.py │ ├── retrieval_models │ │ ├── __init__.py │ │ ├── bag_of_words │ │ │ ├── __init__.py │ │ │ ├── aggregate.py │ │ │ ├── bow_retrieval_model.py │ │ │ ├── conjunctive │ │ │ │ └── __init__.py │ │ │ └── disjunctive │ │ │ │ ├── __init__.py │ │ │ │ ├── disjunctive_retieval_model.py │ │ │ │ └── robertson_bm25.py │ │ ├── generic_text_retrieval_model.py │ │ ├── graph │ │ │ └── __init__.py │ │ └── positional │ │ │ └── __init__.py │ └── searcher.py ├── tests │ ├── __init__.py │ ├── connection │ │ ├── __init__.py │ │ └── test_connection.py │ ├── index │ │ ├── __init__.py │ │ ├── test_authors_from_csv.py │ │ ├── test_entities_from_csv.py │ │ ├── test_fulltext_from_ciff.py │ │ └── test_fulltext_from_csv.py │ ├── resources │ │ ├── ciff │ │ │ └── toy-complete-20200309.ciff.gz │ │ ├── csv │ │ │ ├── example_doc_author.csv │ │ │ ├── example_docs.csv │ │ │ ├── example_entity_doc.csv │ │ │ ├── example_term_dict.csv │ │ │ └── example_term_doc.csv │ │ └── queries │ │ │ ├── gql │ │ │ ├── 1 │ │ │ ├── 2 │ │ │ ├── 4 │ │ │ ├── 5 │ │ │ ├── 6 │ │ │ └── 7 │ │ │ └── sql │ │ │ ├── 1 │ │ │ ├── 2 │ │ │ ├── 3 │ │ │ ├── 4 │ │ │ └── 5 │ └── utils │ │ ├── __init__.py │ │ └── ciff │ │ ├── __init__.py │ │ └── test_to_csv.py └── utils │ ├── __init__.py │ └── ciff │ ├── __init__.py │ ├── to_ciff.py │ └── to_csv.py ├── requirements.txt └── setup.py /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | branches: [ "master" ] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python 3.10 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: "3.10" 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install flake8 pytest 30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 31 | - name: Lint with flake8 32 | run: | 33 | # stop the build if there are Python syntax errors or undefined names 34 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 36 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 37 | - name: Test with pytest 38 | run: | 39 | pytest 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # celery beat schedule file 95 | celerybeat-schedule 96 | 97 | # SageMath parsed files 98 | *.sage.py 99 | 100 | # Environments 101 | .env 102 | .venv 103 | env/ 104 | venv/ 105 | ENV/ 106 | env.bak/ 107 | venv.bak/ 108 | 109 | # Spyder project settings 110 | .spyderproject 111 | .spyproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | # mkdocs documentation 117 | /site 118 | 119 | # mypy 120 | .mypy_cache/ 121 | .dmypy.json 122 | dmypy.json 123 | 124 | # Pyre type checker 125 | .pyre/ 126 | 127 | # Intellj 128 | .idea/ 129 | 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 informagi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # GeeseDB
2 | [![Build Status](https://app.travis-ci.com/informagi/GeeseDB.svg?branch=master)](https://app.travis-ci.com/informagi/GeeseDB)
3 | 
4 | ## Graph Engine for Exploration and Search
5 | GeeseDB is a Python toolkit for solving information retrieval research problems that leverage graphs as data structures. It aims to simplify information retrieval research by allowing researchers to easily formulate graph queries through a graph query language. GeeseDB is built on top of [DuckDB](http://duckdb.org/), an embedded column-store relational database designed for analytical workloads.
6 | 
7 | GeeseDB is available as an easy-to-install Python package. In only a few lines of code, users can create a first-stage retrieval ranking using BM25. Queries read and write NumPy arrays and Pandas DataFrames at zero or negligible data-transformation cost (depending on the base datatype). Results of a first-stage ranker expressed in GeeseDB can therefore be reused at various stages of the ranking process, enabling all the power of Python machine learning libraries with minimal overhead. Moreover, because data representation and processing are strictly separated, GeeseDB forms an ideal basis for reproducible IR research.
8 | 
9 | ## Package Installation
10 | Install the latest version of GeeseDB via [PyPI](https://pypi.org/project/geesedb/):
11 | 
12 | ```
13 | pip install geesedb==0.0.2
14 | ```
15 | 
16 | GeeseDB depends on a couple of packages that can also be installed using `pip`. It is also possible to install the development version of GeeseDB using `pip`:
17 | 
18 | ```
19 | pip install git+https://github.com/informagi/GeeseDB.git
20 | ```
21 | 
22 | If you are planning to contribute to the package, you can clone the repository and install it using `pip` in editable mode:
23 | ```
24 | git clone git@github.com:informagi/GeeseDB.git && cd GeeseDB && pip install -e .
25 | ```
26 | 
27 | You can run our tests (from the repository folder) to confirm that everything is working as intended:
28 | ```
29 | pytest
30 | ```
31 | 
32 | ## How do I index?
33 | The fastest way to load text data into GeeseDB is through CSV files. There should be three CSV files: one for terms, one for documents, and one that connects the terms to the documents. Small examples of these files can be found in the repository: [docs.csv](./geesedb/tests/resources/csv/example_docs.csv), [term_dict.csv](./geesedb/tests/resources/csv/example_term_dict.csv), and [term_doc.csv](./geesedb/tests/resources/csv/example_term_doc.csv).
34 | 
35 | These can be generated from [CIFF](https://github.com/osirrc/ciff) collections using the [to_csv](./geesedb/utils/ciff/to_csv.py) utility, or you can create them however you like. 
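By default the files are `|`-delimited: `docs.csv` holds `collection_id|doc_id|len` rows, `term_dict.csv` holds `term_id|string|df` rows, and `term_doc.csv` holds `term_id|doc_id|tf` rows. As a minimal sketch (the rows below are made-up toy data; only the column order and delimiter follow the `FullTextFromCSV` defaults), compatible files could be written like this:

```python3
import csv

# Toy data only; column order and the '|' delimiter match the
# FullTextFromCSV defaults for the docs, term_dict, and term_doc tables.
tables = {
    'docs.csv': [('d1', 0, 2), ('d2', 1, 1)],            # collection_id|doc_id|len
    'term_dict.csv': [(0, 'cat', 2), (1, 'dog', 1)],     # term_id|string|df
    'term_doc.csv': [(0, 0, 1), (0, 1, 1), (1, 0, 1)],   # term_id|doc_id|tf
}
for file_name, rows in tables.items():
    with open(file_name, 'w', newline='') as f:
        csv.writer(f, delimiter='|').writerows(rows)
```
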
The documents can be loaded using the following code:
36 | 
37 | ```python3
38 | from geesedb.index import FullTextFromCSV
39 | 
40 | index = FullTextFromCSV(
41 |     database='/path/to/database',
42 |     docs_file='/path/to/docs.csv',
43 |     term_dict_file='/path/to/term_dict.csv',
44 |     term_doc_file='/path/to/term_doc.csv'
45 | )
46 | index.load_data()
47 | ```
48 | 
49 | ## How do I search?
50 | After indexing the data, it is easy to construct a first-stage ranking using BM25:
51 | 
52 | ```python3
53 | from geesedb.search import Searcher
54 | 
55 | searcher = Searcher(
56 |     database='/path/to/database',
57 |     n=10
58 | )
59 | hits = searcher.search_topic('cat')
60 | ```
61 | 
62 | In this case the searcher returns the top 10 documents for the query `cat`.
63 | 
64 | ## How can I use SQL with GeeseDB?
65 | GeeseDB is built on top of [DuckDB](http://duckdb.org/) and inherits all of its functionality, so the data in GeeseDB can be queried directly using SQL. The following example shows how to use SQL on the data loaded in the example above:
66 | 
67 | ```python3
68 | from geesedb.connection import get_connection
69 | 
70 | db_path = '/path/to/database/'
71 | cursor = get_connection(db_path).cursor
72 | cursor.execute("SELECT count(*) FROM docs;")
73 | cursor.fetchall()
74 | ```
75 | 
76 | ## How can I use Cypher with GeeseDB?
77 | GeeseDB also supports a subset of the Cypher graph query language, in particular the following keywords: `MATCH`, `RETURN`, `WHERE`, `AND`, `DISTINCT`, `ORDER BY`, `SKIP`, and `LIMIT`. We plan to support the full Cypher query language in the future. In order to use the Cypher query language with GeeseDB, the metadata first needs to be loaded.
78 | 
79 | The metadata describes the graph structure of the data in the database and is stored in the table `_meta`. It is represented as a Python dictionary with the following structure:
80 | ```python
81 | {
82 |     'from_node':
83 |     {
84 |         'to_node':
85 |         [
86 |             ['join_table',
87 |              'from_node_join_key',
88 |              'join_table_from_node_join_key',
89 |              'join_table_to_node_join_key',
90 |              'to_node_join_key'
91 |             ]
92 |         ]
93 |     }
94 | }
95 | ```
96 | Using this structure we know which tables in the database relate to each other; with that information it is possible to translate Cypher queries to SQL queries. An example of a Cypher query that can be translated to SQL is shown below:
97 | 
98 | Cypher:
99 | ```cypher
100 | MATCH (d:docs)-[]-(:authors)-[]-(d2:docs)
101 | WHERE d.collection_id = "96ab542e"
102 | RETURN DISTINCT d2.collection_id
103 | ```
104 | 
105 | SQL:
106 | ```sql
107 | SELECT DISTINCT d2.collection_id
108 | FROM docs AS d2
109 | JOIN doc_author AS da2 ON (d2.collection_id = da2.doc)
110 | JOIN authors AS a2 ON (da2.author = a2.author)
111 | JOIN doc_author AS da3 ON (a2.author = da3.author)
112 | JOIN docs AS d ON (d.collection_id = da3.doc)
113 | WHERE d.collection_id = '96ab542e'
114 | ```
115 | 
116 | Queries can be translated as follows:
117 | 
118 | ```python
119 | from geesedb.interpreter import Translator
120 | 
121 | c_query = "cypher query"
122 | translator = Translator('path/to/database')
123 | sql_query = translator.translate(c_query)
124 | ```
125 | 
126 | ## Cite
127 | GeeseDB was published at DESIRES: [Read here](https://ceur-ws.org/Vol-2950/paper-11.pdf)
128 | 
129 | If you use GeeseDB, you can cite us using the following BibTeX entry:
130 | ```
131 | @inproceedings{geesedb,
132 |     author = {Chris Kamphuis and Arjen P. 
de Vries}, 133 | title = {{GeeseDB: A Python Graph Engine for Exploration and Search}}, 134 | booktitle = {Proceedings of the 2nd International Conference on Design of Experimental Search \& Information REtrieval Systems}, 135 | pages = {10-18}, 136 | year = {2021}, 137 | url = {http://ceur-ws.org/Vol-2950/paper-11.pdf}, 138 | address = {Aachen}, 139 | publisher = {CEUR-WS.org}, 140 | series = {DESIRES '21} 141 | } 142 | ``` 143 | -------------------------------------------------------------------------------- /docs/CNAME: -------------------------------------------------------------------------------- 1 | geesedb.informagus.nl -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal -------------------------------------------------------------------------------- /docs/background.md: -------------------------------------------------------------------------------- 1 | # Background 2 | 3 | ## DuckDB 4 | 5 | ### Adaptive Radix Tree (ART) 6 | 7 | Paper: 8 | https://www.the-paper-trail.org/post/art-paper-notes/ 9 | https://dl.acm.org/citation.cfm?id=2511193 10 | 11 | Implementations: 12 | https://github.com/armon/libart 13 | https://github.com/rafaelkallis/adaptive-radix-tree 14 | 15 | Useful: 16 | https://stackoverflow.com/a/26172978/2127435 17 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # GeeseDB 2 | 3 | A Graph Engine for Exploration and Search over Evolving DataBases (GeeseDB). 4 | 5 | ## Acknowledgements 6 | 7 | The NWO SQIREL-GRAPHS project, Radboud's iCIS institute, and CWI's excellent Database Architectures (DA) research group. 8 | -------------------------------------------------------------------------------- /geesedb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/__init__.py -------------------------------------------------------------------------------- /geesedb/cmd/__init__.py: -------------------------------------------------------------------------------- 1 | from .sql import SQL 2 | from .gql import GQL 3 | 4 | __all__ = ['SQL', 'GQL'] 5 | -------------------------------------------------------------------------------- /geesedb/cmd/gql.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cmd 3 | from typing import Any 4 | 5 | from ..connection import get_connection 6 | from ..interpreter import Translator 7 | 8 | 9 | class GQL(cmd.Cmd): 10 | intro = 'GQL shell powered by DuckDB and to SQL translations. Type help or ? 
to list commands.\n' 11 | prompt = '(gql) ' 12 | 13 | def __init__(self, **kwargs: Any) -> None: 14 | self.arguments = self.get_arguments(kwargs) 15 | self.db_connection = get_connection(self.arguments['database']) 16 | self.translator = Translator(self.arguments['database']) 17 | self.cursor = self.db_connection.cursor 18 | super(GQL, self).__init__() 19 | 20 | @staticmethod 21 | def get_arguments(kwargs: Any) -> dict: 22 | arguments = { 23 | 'database': None 24 | } 25 | for key, item in arguments.items(): 26 | if kwargs.get(key) is not None: 27 | arguments[key] = kwargs.get(key) 28 | if arguments['database'] is None: 29 | raise IOError('database path needs to be provided') 30 | return arguments 31 | 32 | def do_quit(self, arg) -> bool: 33 | """Exit this shell""" 34 | return True 35 | 36 | def do_fetchall(self, arg) -> None: 37 | """Fetch all results after issuing a SQL query""" 38 | print(self.cursor.fetchall()) 39 | 40 | def do_fetchone(self, arg) -> None: 41 | """Fetch a row after issuing a SQL query""" 42 | print(self.cursor.fetchone()) 43 | 44 | def default(self, line: str) -> None: 45 | """Issue a sql query""" 46 | try: 47 | self.cursor.execute(self.translator.translate(line)) 48 | except RuntimeError as error: 49 | print(error) 50 | 51 | 52 | if __name__ == '__main__': 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument('-d', 55 | '--database', 56 | required=True, 57 | metavar='[file]', 58 | help='Location of the database.') 59 | GQL(**vars(parser.parse_args())).cmdloop() 60 | -------------------------------------------------------------------------------- /geesedb/cmd/sql.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cmd 3 | from typing import Any 4 | 5 | from ..connection import get_connection 6 | 7 | 8 | class SQL(cmd.Cmd): 9 | intro = 'SQL shell powered by DuckDB. Type help or ? 
to list commands.\n' 10 | prompt = '(sql) ' 11 | 12 | def __init__(self, **kwargs: Any) -> None: 13 | self.arguments = self.get_arguments(kwargs) 14 | self.db_connection = get_connection(self.arguments['database']) 15 | self.cursor = self.db_connection.cursor 16 | super(SQL, self).__init__() 17 | 18 | @staticmethod 19 | def get_arguments(kwargs: Any) -> dict: 20 | arguments = { 21 | 'database': None 22 | } 23 | for key, item in arguments.items(): 24 | if kwargs.get(key) is not None: 25 | arguments[key] = kwargs.get(key) 26 | if arguments['database'] is None: 27 | raise IOError('database path needs to be provided') 28 | return arguments 29 | 30 | def do_quit(self, arg) -> bool: 31 | """Exit this shell""" 32 | return True 33 | 34 | def do_fetchall(self, arg) -> None: 35 | """Fetch all results after issuing a SQL query""" 36 | print(self.cursor.fetchall()) 37 | 38 | def do_fetchone(self, arg) -> None: 39 | """Fetch a row after issuing a SQL query""" 40 | print(self.cursor.fetchone()) 41 | 42 | def default(self, line: str) -> None: 43 | """Issue a sql query""" 44 | try: 45 | self.cursor.execute(line) 46 | except RuntimeError as error: 47 | print(error) 48 | 49 | 50 | if __name__ == '__main__': 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument('-d', 53 | '--database', 54 | required=True, 55 | metavar='[file]', 56 | help='Location of the database.') 57 | SQL(**vars(parser.parse_args())).cmdloop() 58 | -------------------------------------------------------------------------------- /geesedb/connection/__init__.py: -------------------------------------------------------------------------------- 1 | from .connection import get_connection, close_connection 2 | 3 | __all__ = ['get_connection', 'close_connection'] 4 | -------------------------------------------------------------------------------- /geesedb/connection/connection.py: -------------------------------------------------------------------------------- 1 | import duckdb 2 | 3 | _db_connection = None 4 | 5 | 6 | def get_connection(database): 7 | global _db_connection 8 | if not _db_connection: 9 | _db_connection = DBConnection(database) 10 | return _db_connection 11 | 12 | 13 | def close_connection(): 14 | global _db_connection 15 | if _db_connection: 16 | _db_connection.connection.close() 17 | _db_connection = None 18 | 19 | 20 | class DBConnection(object): 21 | 22 | def __init__(self, database: str) -> None: 23 | self.connection = duckdb.connect(database) 24 | self.cursor = self.connection.cursor() 25 | -------------------------------------------------------------------------------- /geesedb/index/__init__.py: -------------------------------------------------------------------------------- 1 | from .authors_from_csv import AuthorsFromCSV 2 | from .entities_from_csv import EntitiesFromCSV 3 | from .fulltext_from_ciff import FullTextFromCiff 4 | from .fulltext_from_csv import FullTextFromCSV 5 | 6 | __all__ = ['FullTextFromCSV', 'AuthorsFromCSV', 'FullTextFromCiff', 'EntitiesFromCSV'] 7 | -------------------------------------------------------------------------------- /geesedb/index/authors_from_csv.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | from typing import Any 4 | 5 | from .utils import _create_table, _fill_empty_table_with_csv 6 | from ..connection import get_connection 7 | 8 | 9 | class AuthorsFromCSV: 10 | """ 11 | Class for creating table from csv file that contains author information 12 | Author - doc 13 | """ 14 | _COLUMN_TYPES = ['STRING', 
'STRING'] 15 | 16 | def __init__(self, **kwargs: Any) -> None: 17 | self.arguments = self.get_arguments(kwargs) 18 | if self.arguments['use_existing_db'] and os.path.isfile(self.arguments['database']) or \ 19 | not self.arguments['use_existing_db'] and not os.path.isfile(self.arguments['database']): 20 | pass 21 | elif not self.arguments['use_existing_db']: 22 | raise IOError('There already exist a file on this path.') 23 | else: 24 | raise IOError('Database does not exist.') 25 | db_connection = get_connection(self.arguments['database']) 26 | self.connection = db_connection.connection 27 | 28 | if not self.arguments['use_existing_tables']: 29 | _create_table(self.connection, self.arguments['table_name'], self.arguments['columns_names'], 30 | self._COLUMN_TYPES) 31 | _fill_empty_table_with_csv(self.connection, self.arguments['table_name'], self.arguments['doc_author_file'], 32 | self.arguments['delimiter']) 33 | 34 | @staticmethod 35 | def get_arguments(kwargs: Any) -> dict: 36 | arguments = { 37 | 'database': None, 38 | 'use_existing_db': False, 39 | 'use_existing_tables': False, 40 | 'doc_author_file': 'doc_author.csv', 41 | 'table_name': 'doc_author', 42 | 'columns_names': ['doc', 'author'], 43 | 'delimiter': '|' 44 | } 45 | for key, item in arguments.items(): 46 | if kwargs.get(key) is not None: 47 | arguments[key] = kwargs.get(key) 48 | if arguments['database'] is None: 49 | raise IOError('database path needs to be provided') 50 | return arguments 51 | 52 | 53 | if __name__ == '__main__': 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument('-d', 56 | '--database', 57 | required=True, 58 | metavar='[file]', 59 | help='Location of the database.') 60 | parser.add_argument('-u', 61 | '--use_existing_db', 62 | action='store_true', 63 | help='Use an existing database.') 64 | parser.add_argument('-s', 65 | '--use_existing_tables', 66 | action='store_true', 67 | help='Use existing tables.') 68 | parser.add_argument('-a', 69 | '--doc_author_file', 70 | metavar='[file]', 71 | help='Filename for the csv file containing the data for the docs table.') 72 | parser.add_argument('-t', 73 | '--table_name', 74 | metavar='[string]', 75 | help='Decide on the table name you want to fill if they exist, ' + 76 | 'or create and fill them if they do not exist. If no name ' + 77 | 'is given the default value "author_doc" are being used.') 78 | parser.add_argument('-c', 79 | '--columns_names', 80 | metavar='[string]', 81 | nargs=2, 82 | help='Column names for the author-doc table.') 83 | parser.add_argument('-e', 84 | '--delimiter', 85 | help='Delimiter that separates the columns in the csv files.') 86 | AuthorsFromCSV(**vars(parser.parse_args())) 87 | -------------------------------------------------------------------------------- /geesedb/index/entities_from_csv.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | from typing import Any 6 | 7 | from .utils import _fill_empty_table_with_csv, _create_table 8 | from ..connection import get_connection 9 | 10 | 11 | class EntitiesFromCSV: 12 | """ 13 | Class for creating table from csv file that contains entities linked to doc 14 | Offset - Length - Mention - Entity - NER Tag - Doc ID 15 | 16 | Entities contain info as provided by REL: https://arxiv.org/abs/2006.01969 17 | """ 18 | _COLUMN_TYPES = ['INT', 'INT', 'STRING', 'STRING', 'STRING', 'STRING'] 19 | 20 | def __init__(self, **kwargs: Any) -> None: 21 | self.arguments = self.get_arguments(kwargs) 22 | if self.arguments['use_existing_db'] and os.path.isfile(self.arguments['database']) or \ 23 | not self.arguments['use_existing_db'] and not os.path.isfile(self.arguments['database']): 24 | pass 25 | elif not self.arguments['use_existing_db']: 26 | raise IOError('There already exist a file on this path.') 27 | else: 28 | raise IOError('Database does not exist.') 29 | db_connection = get_connection(self.arguments['database']) 30 | self.connection = db_connection.connection 31 | 32 | if not self.arguments['use_existing_tables']: 33 | _create_table(self.connection, self.arguments['table_name'], self.arguments['columns_names'], 34 | self._COLUMN_TYPES) 35 | _fill_empty_table_with_csv(self.connection, self.arguments['table_name'], self.arguments['entity_doc_file'], 36 | self.arguments['delimiter']) 37 | 38 | @staticmethod 39 | def get_arguments(kwargs: Any) -> dict: 40 | arguments = { 41 | 'database': None, 42 | 'use_existing_db': False, 43 | 'use_existing_tables': False, 44 | 'entity_doc_file': 'entity_doc.csv', 45 | 'table_name': 'entity_doc', 46 | 'columns_names': ['start', 'len', 'mention', 'entity', 'ner_tag', 'doc_id'], 47 | 'delimiter': '|' 48 | } 49 | for key, item in arguments.items(): 50 | if kwargs.get(key) is not None: 51 | arguments[key] = kwargs.get(key) 52 | if arguments['database'] is None: 53 | raise IOError('database path needs to be provided') 54 | return arguments 55 | 56 | 57 | if __name__ == '__main__': 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument('-d', 60 | '--database', 61 | required=True, 62 | metavar='[file]', 63 | help='Location of the database.') 64 | parser.add_argument('-u', 65 | '--use_existing_db', 66 | action='store_true', 67 | help='Use an existing database.') 68 | parser.add_argument('-s', 69 | '--use_existing_tables', 70 | action='store_true', 71 | help='Use existing tables.') 72 | parser.add_argument('-a', 73 | '--entity_doc_file', 74 | metavar='[file]', 75 | help='Filename for the csv file containing the data for the entity_doc table.') 76 | parser.add_argument('-t', 77 | '--table_name', 78 | metavar='[string]', 79 | help='Decide on the table name you want to fill if they exist, ' + 80 | 'or create and fill them if they do not exist. If no name ' + 81 | 'is given the default value "entity_doc" will be used.') 82 | parser.add_argument('-c', 83 | '--columns_names', 84 | metavar='[string]', 85 | nargs=8, 86 | help='Column names for the doc-entity table. 
If not provided the default: ' 87 | "['start', 'len', 'mention', 'entity', 'ner_tag', 'doc_id'] " 88 | "will be used.") 89 | parser.add_argument('-e', 90 | '--delimiter', 91 | help='Delimiter that separates the columns in the csv files.') 92 | EntitiesFromCSV(**vars(parser.parse_args())) 93 | -------------------------------------------------------------------------------- /geesedb/index/fulltext_from_ciff.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import argparse 4 | import gzip 5 | import os 6 | import duckdb 7 | from typing import Any, List, Union, Tuple 8 | from ciff_toolkit.read import CiffReader 9 | 10 | from ..connection import get_connection 11 | 12 | 13 | class FullTextFromCiff: 14 | """ 15 | Class for creating tables as in the old dog paper: 16 | - https://dl.acm.org/doi/10.1145/2600428.2609460 17 | 18 | The tables are created from a CIFF as described in: 19 | - https://arxiv.org/abs/2003.08276 20 | """ 21 | _COLUMN_TYPES = [ 22 | ['STRING', 'INT', 'INT'], 23 | ['INT', 'INT', 'STRING'], 24 | ['INT', 'INT', 'INT'] 25 | ] 26 | 27 | def __init__(self, **kwargs: Any) -> None: 28 | self.arguments = self.get_arguments(kwargs) 29 | if self.arguments['use_existing_db'] and os.path.isfile(self.arguments['database']) or \ 30 | not self.arguments['use_existing_db'] and not os.path.isfile(self.arguments['database']): 31 | pass 32 | elif not self.arguments['use_existing_db']: 33 | raise IOError('There already exist a file on this path.') 34 | else: 35 | raise IOError('Database does not exist.') 36 | db_connection = get_connection(self.arguments['database']) 37 | self.connection = db_connection.connection 38 | self.cursor = db_connection.cursor 39 | 40 | def load_data(self): 41 | if not self.arguments['use_existing_tables']: 42 | self.create_tables() 43 | self.fill_tables() 44 | 45 | @staticmethod 46 | def get_arguments(kwargs: Any) -> dict: 47 | arguments = { 48 | 'database': None, 49 | 'use_existing_db': False, 50 | 'use_existing_tables': False, 51 | 'table_names': ['docs', 'term_dict', 'term_doc'], 52 | 'columns_names_docs': ['collection_id', 'doc_id', 'len'], 53 | 'columns_names_term_dict': ['term_id', 'df', 'string'], 54 | 'columns_names_term_doc': ['term_id', 'doc_id', 'tf'], 55 | 'protobuf_file': None 56 | } 57 | for key, item in arguments.items(): 58 | if kwargs.get(key) is not None: 59 | arguments[key] = kwargs.get(key) 60 | if arguments['database'] is None: 61 | raise IOError('database path needs to be provided') 62 | if arguments['protobuf_file'] is None: 63 | raise IOError('protobuf file needs to be provided') 64 | return arguments 65 | 66 | def create_tables(self) -> None: 67 | column_names = [ 68 | self.arguments['columns_names_docs'], 69 | self.arguments['columns_names_term_dict'], 70 | self.arguments['columns_names_term_doc'] 71 | ] 72 | self.connection.begin() 73 | for table_name, c_names, c_types in zip(self.arguments['table_names'], column_names, self._COLUMN_TYPES): 74 | self.create_table(table_name, c_names, c_types) 75 | self.connection.commit() 76 | 77 | def create_table(self, table_name: str, column_names: List[str], column_types: List[str]) -> None: 78 | try: 79 | self.cursor.execute(f'SELECT * FROM {table_name} LIMIT 1;') 80 | self.connection.rollback() 81 | raise IOError('Table already exists.') 82 | except duckdb.CatalogException: # If the table does not exists you get a RuntimeError 83 | pass 84 | query = f'CREATE TABLE {table_name} ({", ".join([f"{a} {b}" for a, b in zip(column_names, 
column_types)])});' 85 | self.cursor.execute(query) 86 | 87 | @staticmethod 88 | def decode(buffer: Union[str, bytes], pos: int) -> Union[Tuple[int, int], None]: 89 | mask = (1 << 32) - 1 90 | result = 0 91 | shift = 0 92 | while True: 93 | b = buffer[pos] 94 | result |= ((b & 0x7f) << shift) 95 | pos += 1 96 | if not (b & 0x80): 97 | result &= mask 98 | result = int(result) 99 | return result, pos 100 | shift += 7 101 | if shift >= 64: 102 | raise IOError('Too many bytes when decoding.') 103 | 104 | def fill_tables(self) -> None: 105 | if self.arguments['protobuf_file'].endswith('.gz'): 106 | with gzip.open(self.arguments['protobuf_file'], 'rb') as f: 107 | data = f.read() 108 | else: 109 | with open(self.arguments['protobuf_file'], 'rb') as f: 110 | data = f.read() 111 | 112 | with CiffReader(self.arguments['protobuf_file']) as reader: 113 | for term_id, postings_list in enumerate(reader.read_postings_lists()): 114 | self.connection.begin() 115 | q = f'INSERT INTO {self.arguments["table_names"][1]} ' \ 116 | f'({",".join(self.arguments["columns_names_term_dict"])}) ' \ 117 | f"VALUES ({term_id},{postings_list.df},'{postings_list.term}')" 118 | try: 119 | self.cursor.execute(q) 120 | except RuntimeError: 121 | print(q) 122 | 123 | docid = 0 124 | for posting in postings_list.postings: 125 | docid += posting.docid 126 | q = f'INSERT INTO {self.arguments["table_names"][2]} ' \ 127 | f'({",".join(self.arguments["columns_names_term_doc"])}) ' \ 128 | f'VALUES ({term_id},{docid},{posting.tf})' 129 | self.cursor.execute(q) 130 | self.connection.commit() 131 | 132 | self.connection.begin() 133 | for n, doc_record in enumerate(reader.read_documents()): 134 | if n % 1000 == 0: 135 | self.connection.commit() 136 | self.connection.begin() 137 | q = f'INSERT INTO {self.arguments["table_names"][0]} ' \ 138 | f'({",".join(self.arguments["columns_names_docs"])}) ' \ 139 | f"VALUES ('{doc_record.collection_docid}',{doc_record.docid},{doc_record.doclength})" 140 | self.cursor.execute(q) 141 | self.connection.commit() 142 | 143 | 144 | if __name__ == '__main__': 145 | parser = argparse.ArgumentParser() 146 | parser.add_argument('-d', 147 | '--database', 148 | required=True, 149 | metavar='[file]', 150 | help='Location of the database.') 151 | parser.add_argument('-p', 152 | '--protobuf_file', 153 | required=True, 154 | metavar='[file]', 155 | help='Filename for the csv file containing the data for the docs table.') 156 | parser.add_argument('-u', 157 | '--use_existing_db', 158 | action='store_true', 159 | help='Use an existing database.') 160 | parser.add_argument('-s', 161 | '--use_existing_tables', 162 | action='store_true', 163 | help='Use existing tables.') 164 | parser.add_argument('-t', 165 | '--table_names', 166 | metavar='[string]', 167 | nargs=3, 168 | help='Decide on the table names you want to fill if they exist, ' + 169 | 'or create and fill them if they do not exist. If no names ' + 170 | 'are given the default values ["docs.csv", "term_dict.csv", ' + 171 | '"term_doc.csv"] are being used. 
If arguments are given ' + 172 | 'they are expected in the respective default order.') 173 | parser.add_argument('-cd', 174 | '--columns_names_docs', 175 | metavar='[string]', 176 | nargs=2, 177 | help='Column names for the docs table.') 178 | parser.add_argument('-ct', 179 | '--columns_names_term_dict', 180 | metavar='[string]', 181 | nargs=3, 182 | help='Column names for the dict table.') 183 | parser.add_argument('-o', 184 | '--columns_names_term_doc', 185 | metavar='[string]', 186 | nargs=3, 187 | help='Column names for the term-docs table (docs in old dog paper).') 188 | FullTextFromCiff(**vars(parser.parse_args())) 189 | -------------------------------------------------------------------------------- /geesedb/index/fulltext_from_csv.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import argparse 4 | import os 5 | from typing import Any 6 | 7 | from .utils import _fill_empty_table_with_csv, _create_table 8 | from ..connection import get_connection 9 | 10 | 11 | class FullTextFromCSV: 12 | """ 13 | Class for creating tables from csv files as in the old dog paper: 14 | - https://dl.acm.org/doi/10.1145/2600428.2609460 15 | """ 16 | _COLUMN_TYPES = [ 17 | ['STRING', 'INT', 'INT'], 18 | ['INT', 'STRING', 'INT'], 19 | ['INT', 'INT', 'INT'] 20 | ] 21 | 22 | def __init__(self, **kwargs: Any) -> None: 23 | self.arguments = self.get_arguments(kwargs) 24 | if self.arguments['use_existing_db'] and os.path.isfile(self.arguments['database']) or \ 25 | not self.arguments['use_existing_db'] and not os.path.isfile(self.arguments['database']): 26 | pass 27 | elif not self.arguments['use_existing_db']: 28 | raise IOError('There already exist a file on this path.') 29 | else: 30 | raise IOError('Database does not exist.') 31 | db_connection = get_connection(self.arguments['database']) 32 | self.connection = db_connection.connection 33 | 34 | 35 | def load_data(self): 36 | if not self.arguments['use_existing_db']: 37 | self.create_tables() 38 | self.fill_tables() 39 | 40 | @staticmethod 41 | def get_arguments(kwargs: Any) -> dict: 42 | arguments = { 43 | 'database': None, 44 | 'use_existing_db': False, 45 | 'use_existing_tables': False, 46 | 'table_names': ['docs', 'term_dict', 'term_doc'], 47 | 'columns_names_docs': ['collection_id', 'doc_id', 'len'], 48 | 'columns_names_term_dict': ['term_id', 'string', 'df'], 49 | 'columns_names_term_doc': ['term_id', 'doc_id', 'tf'], 50 | 'docs_file': 'docs.csv', 51 | 'term_dict_file': 'dict.csv', 52 | 'term_doc_file': 'term_doc.csv', 53 | 'delimiter': '|' 54 | } 55 | for key, item in arguments.items(): 56 | if kwargs.get(key) is not None: 57 | arguments[key] = kwargs.get(key) 58 | if arguments['database'] is None: 59 | raise IOError('database path needs to be provided') 60 | return arguments 61 | 62 | def create_tables(self) -> None: 63 | column_names = [ 64 | self.arguments['columns_names_docs'], 65 | self.arguments['columns_names_term_dict'], 66 | self.arguments['columns_names_term_doc'] 67 | ] 68 | self.connection.begin() 69 | for table_name, c_names, c_types in zip(self.arguments['table_names'], column_names, self._COLUMN_TYPES): 70 | _create_table(self.connection, table_name, c_names, c_types) 71 | self.connection.commit() 72 | 73 | def fill_tables(self) -> None: 74 | file_names = [ 75 | self.arguments['docs_file'], 76 | self.arguments['term_dict_file'], 77 | self.arguments['term_doc_file'] 78 | ] 79 | self.connection.begin() 80 | for table_name, file_name in 
zip(self.arguments['table_names'], file_names): 81 | _fill_empty_table_with_csv(self.connection, table_name, file_name, self.arguments['delimiter']) 82 | self.connection.commit() 83 | 84 | 85 | if __name__ == '__main__': 86 | parser = argparse.ArgumentParser() 87 | parser.add_argument('-d', 88 | '--database', 89 | required=True, 90 | metavar='[file]', 91 | help='Location of the database.') 92 | parser.add_argument('-u', 93 | '--use_existing_db', 94 | action='store_true', 95 | help='Use an existing database.') 96 | parser.add_argument('-s', 97 | '--use_existing_tables', 98 | action='store_true', 99 | help='Use existing tables.') 100 | parser.add_argument('-t', 101 | '--table_names', 102 | metavar='[string]', 103 | nargs=3, 104 | help='Decide on the table names you want to fill if they exist, ' + 105 | 'or create and fill them if they do not exist. If no names ' + 106 | 'are given the default values ["docs.csv", "term_dict.csv", ' + 107 | '"term_doc.csv"] are being used. If arguments are given ' + 108 | 'they are expected in the respective default order.') 109 | parser.add_argument('-cd', 110 | '--columns_names_docs', 111 | metavar='[string]', 112 | nargs=2, 113 | help='Column names for the docs table.') 114 | parser.add_argument('-ct', 115 | '--columns_names_term_dict', 116 | metavar='[string]', 117 | nargs=3, 118 | help='Column names for the dict table.') 119 | parser.add_argument('-o', 120 | '--columns_names_term_doc', 121 | metavar='[string]', 122 | nargs=3, 123 | help='Column names for the term-docs table (docs in old dog paper).') 124 | parser.add_argument('-di', 125 | '--docs_file', 126 | metavar='[file]', 127 | help='Filename for the csv file containing the data for the docs table.') 128 | parser.add_argument('-ti', 129 | '--term_dict_file', 130 | metavar='[file]', 131 | help='Filename for the csv file containing the data for the dict table.') 132 | parser.add_argument('-oi', 133 | '--term_doc_file', 134 | metavar='[file]', 135 | help='Filename for the csv file containing the data for the term-docs table ' + 136 | '(terms in old dog paper).') 137 | parser.add_argument('-e', 138 | '--delimiter', 139 | help='Delimiter that separates the columns in the csv files.') 140 | FullTextFromCSV(**vars(parser.parse_args())) 141 | -------------------------------------------------------------------------------- /geesedb/index/utils.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import duckdb 4 | from duckdb import DuckDBPyConnection 5 | 6 | 7 | def _create_table(connection: DuckDBPyConnection, table_name: str, column_names: List[str], 8 | column_types: List[str]) -> None: 9 | cursor = connection.cursor() 10 | try: 11 | cursor.execute(f'SELECT * FROM {table_name} LIMIT 1;') 12 | connection.rollback() 13 | raise IOError('Table already exists.') 14 | except duckdb.CatalogException: 15 | pass 16 | query = f'CREATE TABLE {table_name} ({", ".join([f"{a} {b}" for a, b in zip(column_names, column_types)])});' 17 | cursor.execute(query) 18 | 19 | 20 | def _fill_empty_table_with_csv(connection: DuckDBPyConnection, table_name: str, file_name: str, 21 | delimiter: str = "|") -> None: 22 | cursor = connection.cursor() 23 | cursor.execute(f'SELECT COUNT(*) FROM {table_name};') 24 | if cursor.fetchone()[0] > 0: 25 | connection.rollback() 26 | raise IOError('The tables are not empty.') 27 | query = f"COPY {table_name} FROM '{file_name}' WITH DELIMITER '{delimiter}';" 28 | cursor.execute(query) 29 | 
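30 | 
31 | # Example usage (a minimal sketch; the database path and CSV file are
32 | # placeholders, and the column names/types follow the FullTextFromCSV defaults):
33 | #
34 | #   import duckdb
35 | #   connection = duckdb.connect('/path/to/database')
36 | #   _create_table(connection, 'docs',
37 | #                 ['collection_id', 'doc_id', 'len'], ['STRING', 'INT', 'INT'])
38 | #   _fill_empty_table_with_csv(connection, 'docs', '/path/to/docs.csv', '|')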
-------------------------------------------------------------------------------- /geesedb/interpreter/__init__.py: -------------------------------------------------------------------------------- 1 | from .metadata import Metadata 2 | from .parser import Parser 3 | from .translate import Translator 4 | 5 | __all__ = ['Parser', 'Translator', 'Metadata'] 6 | -------------------------------------------------------------------------------- /geesedb/interpreter/metadata.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from ..connection import get_connection 4 | 5 | 6 | class Metadata: 7 | 8 | def __init__(self, database): 9 | self.connection = get_connection(database).connection 10 | 11 | # first list is default if nothing is specified (should be extended) 12 | # list is ordered as [edge_name, node1_id, edge_node1_id, edge_node2_id, node2_id2 13 | def get_metadata(self): 14 | self.connection.execute("SELECT metadata FROM _meta") 15 | metadata = json.loads(self.connection.fetchone()[0]) 16 | return metadata 17 | 18 | def update_metadata(self, data): 19 | self.connection.execute(f"UPDATE _meta SET metadata='{json.dumps(data)}'") 20 | 21 | def get_default_join_info(self, node1, node2): 22 | return self.get_metadata()[node1][node2][0] 23 | 24 | def get_all_join_info(self, node1, node2): 25 | return self.get_metadata()[node1][node2] 26 | 27 | # { 28 | # 'term_dict': { 29 | # 'docs': [['term_doc', 'term_id', 'term_id', 'doc_id', 'doc_id']] 30 | # }, 31 | # 'docs': { 32 | # 'term_dict': [['term_doc', 'doc_id', 'doc_id', 'term_id', 'term_id']], 33 | # 'entities': [['entity_doc', 'collection_id', 'doc_id', 'entity', 'entity']], 34 | # 'authors': [['doc_author', 'collection_id', 'doc', 'author', 'author']] 35 | # }, 36 | # 'entities': { 37 | # 'docs': [['entity_doc', 'entity', 'entity', 'doc_id', 'collection_id']] 38 | # }, 39 | # 'authors': { 40 | # 'docs': [['doc_author', 'author', 'author', 'doc', 'collection_id']] 41 | # } 42 | # } 43 | -------------------------------------------------------------------------------- /geesedb/interpreter/parser.py: -------------------------------------------------------------------------------- 1 | import pycypher 2 | from .metadata import Metadata 3 | 4 | class Parser: 5 | 6 | def __init__(self, database): 7 | self.parseCypher = _ParseCypher(database) 8 | 9 | def parse(self, cypher_query): 10 | node = pycypher.parse(cypher_query) 11 | return self.parseCypher.process_node(node) 12 | 13 | class _ParseCypher: 14 | 15 | def __init__(self, database): 16 | self.database = database 17 | 18 | def process_node(self, node): 19 | errors = node['errors'] 20 | name = node['name'] 21 | result = node['result'] 22 | 23 | if len(errors) > 0: 24 | print(f"There are errors in the query:") 25 | print(errors) 26 | raise RuntimeError 27 | 28 | if name == 'Cypher': 29 | for r in result: 30 | try: 31 | if r['children']['name'] == 'Statement': 32 | return self.process_node(r['children']) 33 | except KeyError: 34 | continue 35 | 36 | elif name == 'Statement': 37 | return self.process_node(result[0]['children']) 38 | 39 | elif name == 'Query': 40 | return self.process_node(result[0]['children']) 41 | 42 | elif name == 'StandaloneQuery': 43 | raise RuntimeError("We don not support StandaloneQuery queries (yet).") 44 | 45 | elif name == 'RegularQuery': 46 | out = '' 47 | for r in result: 48 | if r['node'] == r['children']: 49 | continue 50 | if len(out) > 0: 51 | out += ' ' 52 | if r['children']['name'] == 'SingleQuery': 53 | out += 
_ParseSingleQuery(self.database).process_node(r['children']) 54 | else: 55 | out += self.process_node(r['children']) 56 | return out 57 | 58 | elif name == 'Union': 59 | union = '' 60 | for r in result[:-1]: 61 | union += r['node']['text'] 62 | union = union.strip() 63 | return union + ' ' + _ParseSingleQuery(self.database).process_node(result[-1]['children']) 64 | 65 | else: 66 | raise RuntimeError(f'Queries that make use of >>{name}<< are not supported (yet).') 67 | 68 | class _ParseSingleQuery: 69 | 70 | def __init__(self, database): 71 | self.output_params = { 72 | "Order": '', 73 | "Skip": '', 74 | "Limit": '' 75 | } 76 | self.additional_wheres = list() 77 | self.metadata = Metadata(database) 78 | 79 | def build_select_statement(self, pattern): 80 | output = '' 81 | 82 | # First the start node 83 | s_node = pattern['NodePattern'][0] 84 | try: 85 | s_variable = s_node['Variable'] 86 | except KeyError: 87 | s_variable = 'start_node' 88 | 89 | try: 90 | s_label = s_node['NodeLabels'] 91 | except KeyError: 92 | raise RuntimeError('The type of a node needs to be know for know') 93 | 94 | try: 95 | s_properties = s_node['Properties'] 96 | for key, value in s_properties.items(): 97 | self.additional_wheres.append(f"""{s_variable}.{key} = {value.replace('"', "'")}""") 98 | except KeyError: 99 | pass 100 | 101 | output += f'{s_label} AS {s_variable}' 102 | 103 | # Then the chain 104 | try: 105 | chain = pattern['PatternElementChain'] 106 | except KeyError: 107 | return output 108 | 109 | p_label = s_label 110 | p_variable = s_variable 111 | for i, chain_part in enumerate(chain): 112 | to_node = chain_part['node'] 113 | try: 114 | to_node_variable = to_node['Variable'] 115 | except KeyError: 116 | to_node_variable = f'Xtn{i}X' 117 | try: 118 | to_node_type = to_node['NodeLabels'] 119 | except KeyError: 120 | raise RuntimeError("The node type needs to be known for now.") 121 | try: 122 | to_node_properties = to_node['Properties'] 123 | for key, value in to_node_properties.items(): 124 | self.additional_wheres.append(f"""{to_node_variable}.{key} = {value.replace('"', "'")}""") 125 | except KeyError: 126 | pass 127 | 128 | relationship = chain_part['relationship'] 129 | try: 130 | rel_variable = relationship['Variable'] 131 | except KeyError: 132 | rel_variable = f'Xrel{i}X' 133 | try: 134 | rel_type = relationship['RelationshipTypes'][0] 135 | except KeyError: 136 | rel_type = self.metadata.get_default_join_info(p_label, to_node_type)[0] 137 | try: 138 | rel_properties = relationship['Properties'] 139 | for key, value in rel_properties.items(): 140 | self.additional_wheres.append(f"""{rel_variable}.{key} = {value.replace('"', "'")}""") 141 | except KeyError: 142 | pass 143 | meta = self.metadata.get_all_join_info(p_label, to_node_type) 144 | if not meta: 145 | raise RuntimeError(f"There are no edges between these node types known: {p_label} and {to_node_type}") 146 | meta = meta[0] # TODO unless join table is specified 147 | join_table, from_node_jk, join_table_fnk, join_table_tnk, to_node_jk = meta 148 | 149 | # Add relationship join and then the node join 150 | join = f' JOIN {join_table} AS {rel_variable} ON {p_variable}.{from_node_jk} = {rel_variable}.{join_table_fnk}' + \ 151 | f' JOIN {to_node_type} AS {to_node_variable} ON {rel_variable}.{join_table_tnk} = {to_node_variable}.{to_node_jk}' 152 | output += join 153 | p_variable = to_node_variable 154 | p_label = to_node_type 155 | return output 156 | 157 | def process_node(self, node): 158 | name = node['name'] 159 | result = node['result'] 
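# Dispatch on the name of the grammar node produced by pycypher; each branch below rebuilds the corresponding SQL fragment.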
160 | 161 | if name == 'SingleQuery': 162 | return self.process_node(result[0]['children']) 163 | 164 | elif name == 'SinglePartQuery': 165 | read_part = '' 166 | return_part = '' 167 | for r in result: 168 | if r['node'] == r['children']: 169 | continue 170 | elif r['children']['name'] == 'UpdatingClause': 171 | raise RuntimeError('Updates are not supported yet') 172 | elif r['children']['name'] == 'ReadingClause': 173 | if len(read_part) > 0: 174 | raise RuntimeError('Only one reading clause per query is supported') 175 | read_part = self.process_node(r['children']) 176 | else: 177 | return_part = self.process_node(r['children']) 178 | return return_part + ' ' + read_part \ 179 | + self.output_params['Order'] \ 180 | + self.output_params['Skip'] \ 181 | + self.output_params['Limit'] 182 | 183 | elif name == 'ReadingClause': 184 | return self.process_node(result[0]['children']) 185 | 186 | elif name == 'Match': 187 | match_text = '' 188 | where = '' 189 | 190 | result_generator = (r for r in result) 191 | r = next(result_generator) 192 | while r['node'] == r['children']: 193 | match_text += r['node']['text'] 194 | r = next(result_generator) 195 | if match_text.strip().upper().startswith('OPTIONAL'): 196 | raise RuntimeError('For now we do not support OPTIONAL matches yet.') 197 | pattern = self.process_node(r['children']) 198 | while True: 199 | try: 200 | r = next(result_generator) 201 | if r['node'] == r['children']: 202 | continue 203 | if r['children']['name'] == 'Where': 204 | where = self.process_node(r['children']) 205 | except StopIteration: 206 | break 207 | match_statement = f'FROM {pattern}' 208 | if len(where) == 0 and len(self.additional_wheres) > 0: 209 | where = ' WHERE ' + ' AND '.join(self.additional_wheres) 210 | elif len(self.additional_wheres) > 0: 211 | additional_and = ' AND ' + ' AND '.join(self.additional_wheres) 212 | where += additional_and 213 | if len(where) > 0: 214 | match_statement += where 215 | return match_statement 216 | 217 | elif name == 'Pattern': 218 | return_expression = '' 219 | for r in result: 220 | if r['node'] == r['children']: 221 | return_expression += r['node']['text'] 222 | else: 223 | return_expression += self.process_node(r['children']) 224 | return return_expression 225 | 226 | elif name == 'PatternPart': 227 | return_expression = '' 228 | for r in result: 229 | if r['node'] == r['children']: 230 | return_expression += r['node']['text'] 231 | elif r['children']['name'] == 'Variable': 232 | raise RuntimeError('Variable assignment of patterns is not supported yet.') 233 | else: 234 | return_expression += self.process_node(r['children']) 235 | return return_expression 236 | 237 | elif name == 'AnonymousPatternPart': 238 | return ''.join([self.process_node(r['children']) for r in result]) 239 | 240 | elif name == 'PatternElement': 241 | # Get processed chain data 242 | pattern = dict() 243 | for r in result: 244 | if r['node'] == r['children']: 245 | continue 246 | else: 247 | try: 248 | pattern[r['children']['name']].append(self.process_node(r['children'])) 249 | except KeyError: 250 | pattern[r['children']['name']] = [self.process_node(r['children'])] 251 | 252 | return self.build_select_statement(pattern) 253 | 254 | elif name == 'NodePattern': 255 | node = dict() 256 | for r in result: 257 | if r['node'] == r['children']: 258 | continue 259 | else: 260 | node[r['children']['name']] = self.process_node(r['children']) 261 | return node 262 | 263 | elif name == 'NodeLabels': 264 | if len(result) > 1: 265 | raise RuntimeError("Only one node 
label at a time is supported") 266 | return self.process_node(result[0]['children']) 267 | 268 | elif name == 'NodeLabel': 269 | node_label = '' 270 | for r in result: 271 | if r['node'] == r['children']: 272 | continue 273 | else: 274 | node_label = self.process_node(r['children']) 275 | return node_label 276 | 277 | elif name == 'LabelName': 278 | return self.process_node(result[0]['children']) 279 | 280 | elif name == 'Properties': 281 | return self.process_node(result[0]['children']) 282 | 283 | elif name == 'MapLiteral': 284 | map_literal = dict() 285 | key = None 286 | for r in result: 287 | if r['node'] == r['children']: 288 | continue 289 | elif r['children']['name'] == 'PropertyKeyName': 290 | key = self.process_node(r['children']) 291 | elif r['children']['name'] == 'Expression': 292 | map_literal[key] = self.process_node(r['children']) 293 | key = None 294 | return map_literal 295 | 296 | elif name == 'PatternElementChain': 297 | relationship = None 298 | node = None 299 | for r in result: 300 | if r['node'] == r['children']: 301 | continue 302 | elif r['children']['name'] == 'RelationshipPattern': 303 | relationship = self.process_node(r['children']) 304 | else: 305 | node = self.process_node(r['children']) 306 | return {'relationship': relationship, 'node': node} 307 | 308 | elif name == 'RelationshipPattern': 309 | for r in result: 310 | if r['children']['name'] == 'Dash': 311 | continue 312 | elif r['children']['name'] in {'LeftArrowHead', 'RightArrowHead'}: 313 | raise RuntimeError('Directed edges are not supported yet.') 314 | else: 315 | return self.process_node(r['children']) 316 | raise RuntimeError("RelationshipPattern should return a pattern") 317 | 318 | elif name == 'RelationshipDetail': 319 | relation = dict() 320 | for r in result: 321 | if r['node'] == r['children']: 322 | continue 323 | else: 324 | relation[r['children']['name']] = self.process_node(r['children']) 325 | return relation 326 | 327 | elif name == 'RelationshipTypes': 328 | relationship_types = [] 329 | for r in result: 330 | if r['node'] == r['children']: 331 | continue 332 | else: 333 | relationship_types.append(self.process_node(r['children'])) 334 | if len(relationship_types) > 1: 335 | raise RuntimeError("We only support one join table at a time for now.") 336 | return relationship_types 337 | 338 | elif name == 'RelTypeName': 339 | return self.process_node(result[0]['children']) 340 | 341 | elif name == 'Where': 342 | where_statement = ' ' 343 | for r in result: 344 | if r['node'] == r['children']: 345 | where_statement += r['node']['text'] 346 | else: 347 | where_statement += self.process_node(r['children']) 348 | return where_statement 349 | 350 | elif name == 'Return': 351 | out = '' 352 | for r in result[1:]: 353 | if r['node'] == r['children']: 354 | if len(r['node']['text'].strip()) == 0: 355 | continue 356 | out += r['node']['text'].strip() + ' ' 357 | else: 358 | out += self.process_node(r['children']) 359 | return 'SELECT ' + out 360 | 361 | elif name == 'ReturnBody': 362 | out = '' 363 | for r in result: 364 | if r['node'] == r['children']: 365 | continue 366 | elif r['children']['name'] == 'ReturnItems': 367 | out = self.process_node(r['children']) 368 | elif r['children']['name'] in {'Order', 'Skip', 'Limit'}: 369 | self.output_params[r['children']['name']] = ' ' + r['node']['text'] 370 | else: 371 | n = r['children']['name'] 372 | raise RuntimeError(f'Queries that make use of >>{n}<< are not supported (yet).') 373 | return out 374 | 375 | elif name == 'ReturnItems': 376 | 
return_items = '' 377 | for r in result: 378 | return_items += r['node']['text'] 379 | # TODO 380 | # For now we just assume the ReturnItems is already correct, should make it better such 381 | # that e.g. nodes can be selected directly (now specific attributes have to be specified). 382 | return return_items 383 | 384 | elif name == 'Expression': 385 | return self.process_node(result[0]['children']) 386 | 387 | elif name in {'OrExpression', 'AndExpression', 'XorExpression', 'NotExpression'}: 388 | keyword = name[:-10] 389 | expressions = [] 390 | for r in result: 391 | if r['node'] == r['children']: 392 | continue 393 | else: 394 | expressions.append(self.process_node(r['children'])) 395 | if len(expressions) == 1: 396 | return expressions[0] 397 | else: 398 | return f' {keyword.upper()} '.join(expressions) 399 | 400 | elif name == 'ComparisonExpression': 401 | possible_comparisons = {'=', '<>', '<', '>', '<=', '>='} 402 | comparisons = [] 403 | for r in result: 404 | if r['node'] == r['children']: 405 | continue 406 | else: 407 | comparisons.append(self.process_node(r['children'])) 408 | if len(comparisons) == 1: 409 | return comparisons[0] 410 | elif len(comparisons) == 2: 411 | return comparisons[0] + ' ' + comparisons[1] 412 | else: 413 | comparison_expressions_unprocessed = [] 414 | comparison_expressions = [] 415 | for i in range(len(comparisons)-1): 416 | comparison_expressions_unprocessed.append([comparisons[i], comparisons[i+1]]) 417 | for expression_duo in comparison_expressions_unprocessed: 418 | p1, p2 = expression_duo 419 | for p in possible_comparisons: 420 | p1 = p1.replace(p, '') 421 | comparison_expressions.append(p1.strip() + ' ' + p2.strip()) 422 | return ' AND '.join(comparison_expressions) 423 | 424 | elif name == 'PartialComparisonExpression': 425 | partial_comparison = '' 426 | for r in result: 427 | if r['node'] == r['children']: 428 | partial_comparison += r['node']['text'] 429 | else: 430 | partial_comparison += self.process_node(r['children']) 431 | return partial_comparison 432 | 433 | elif name in {'AddOrSubtractExpression', 'MultiplyDivideModuloExpression', 434 | 'PowerOfExpression', 'UnaryAddOrSubtractExpression'}: 435 | return_expression= '' 436 | for r in result: 437 | if r['node'] == r['children']: 438 | return_expression += r['node']['text'] 439 | else: 440 | return_expression += self.process_node(r['children']) 441 | return return_expression 442 | 443 | elif name == 'StringListNullOperatorExpression': 444 | return ' '.join([self.process_node(r['children']) for r in result]) 445 | 446 | elif name == 'NullOperatorExpression': 447 | return ''.join([r['node']['text'] for r in result]).strip() 448 | 449 | elif name == 'PropertyOrLabelsExpression': 450 | return ''.join([self.process_node(r['children']) for r in result]) 451 | 452 | elif name == 'PropertyLookup': 453 | return_expression = '' 454 | for r in result: 455 | if r['node'] == r['children']: 456 | return_expression += r['node']['text'] 457 | else: 458 | return_expression += self.process_node(r['children']) 459 | return return_expression 460 | 461 | elif name == 'SchemaName': 462 | return_expression = '' 463 | for r in result: 464 | if r['node'] == r['children']: 465 | return_expression += r['node']['text'] 466 | else: 467 | return_expression += self.process_node(r['children']) 468 | return return_expression 469 | 470 | elif name == 'PropertyKeyName': 471 | return_expression = '' 472 | for r in result: 473 | if r['node'] == r['children']: 474 | return_expression += r['node']['text'] 475 | else: 476 | 
return_expression += self.process_node(r['children']) 477 | return return_expression 478 | 479 | elif name == 'Atom': 480 | return_expression = '' 481 | for r in result: 482 | if r['node'] == r['children']: 483 | return_expression += r['node']['text'] 484 | else: 485 | return_expression += self.process_node(r['children']) 486 | return return_expression 487 | 488 | elif name == 'FunctionInvocation': 489 | return_expression = '' 490 | for r in result: 491 | if r['node'] == r['children']: 492 | return_expression += r['node']['text'] 493 | elif r['children']['name'] == 'FunctionName': 494 | return_expression += r['node']['text'] 495 | else: 496 | return_expression += self.process_node(r['children']) 497 | return return_expression 498 | 499 | elif name == 'Literal': 500 | return_expression = '' 501 | for r in result: 502 | if r['node'] == r['children']: 503 | return_expression += r['node']['text'].replace('"', "'") 504 | else: 505 | return_expression += self.process_node(r['children']) 506 | return return_expression 507 | 508 | elif name == 'NumberLiteral': 509 | return self.process_node(result[0]['children']) 510 | 511 | elif name == 'DoubleLiteral': 512 | return ''.join([r['node']['text'] for r in result]).strip() 513 | 514 | elif name == 'IntegerLiteral': 515 | return ''.join([r['node']['text'] for r in result]).strip() 516 | 517 | elif name == 'Variable': 518 | return self.process_node(result[0]['children']) 519 | 520 | elif name == 'SymbolicName': 521 | return ''.join([r['node']['text'] for r in result]).strip() 522 | 523 | elif name == 'ParenthesizedExpression': 524 | return_expression = '' 525 | for r in result: 526 | if r['node'] == r['children']: 527 | return_expression += r['node']['text'] 528 | else: 529 | return_expression += self.process_node(r['children']) 530 | return return_expression 531 | 532 | elif name == 'MultiPartQueries': 533 | raise RuntimeError('The keyword WITH is not supported (yet).') 534 | 535 | else: 536 | raise RuntimeError(f'Queries that make use of >>{name}<< are not supported (yet).') -------------------------------------------------------------------------------- /geesedb/interpreter/translate.py: -------------------------------------------------------------------------------- 1 | from .parser import Parser 2 | 3 | # This class was used in the paper for translating; all the translation logic now lives in Parser, 4 | # so this class is a thin wrapper around it. 
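# A minimal usage sketch (illustrative only: the database path and the Cypher query below are
# hypothetical, and this assumes the class is imported directly from geesedb.interpreter.translate):
#
#     from geesedb.interpreter.translate import Translator
#     translator = Translator('/path/to/geesedb_index.db')   # path is an assumption
#     sql = translator.translate("MATCH (d:docs) RETURN d.collection_id")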
5 | class Translator: 6 | 7 | def __init__(self, database): 8 | self.parser = Parser(database) 9 | 10 | def translate(self, query): 11 | return self.parser.parse(query) 12 | -------------------------------------------------------------------------------- /geesedb/resources/__init__.py: -------------------------------------------------------------------------------- 1 | from .topics import get_topics_backgroundlinking 2 | 3 | __all__ = ['get_topics_backgroundlinking'] -------------------------------------------------------------------------------- /geesedb/resources/topics-and-qrels/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/resources/topics-and-qrels/__init__.py -------------------------------------------------------------------------------- /geesedb/resources/topics-and-qrels/topics.backgroundlinking18.processed.txt: -------------------------------------------------------------------------------- 1 | 321:9171debc316e5e2782e0d2404ca7d09d 2 | 336:2a340b8573d498e261d6f2365b37f8eb 3 | 341:7ef8ce1720bf2f6b2065a97506ee89b4 4 | 347:c3cea789141ef2ae856419e86e165e0c 5 | 350:985b90cc-7c98-11e3-93c1-0e888170b723 6 | 362:4989ebfeb752e6b317d1ef3997b21a01 7 | 363:474ae088-ab1e-11e4-9c91-e9d2f9fde644 8 | 367:1e03fecf4d33b7896203298ab3858156 9 | 375:0e85b0c0-f7ef-11e4-9030-b4732caefe81 10 | 378:3c5be31e-24ab-11e5-b621-b55e495e9b78 11 | 393:fef0f232a9bd94bdb96bac48c7705503 12 | 397:563fb77e-024f-11e6-9203-7b8670959b88 13 | 400:72e72b41097d53b627fd375dd2d3309b 14 | 408:988147454a2b8eafd1535cd673dd04ba 15 | 414:4192b016-8708-11e3-a5bd-844629433ba3 16 | 422:145b9a6caa16d931c108a89798e65e17 17 | 426:56f0438ee0fb34c341ccf5af36de5175 18 | 427:2e83ad87eb1bade22e6e96ece616c24f 19 | 433:159e6f9e-8e84-11e3-84e1-27626c5ef5fb 20 | 439:5c466d4a01492f1b5cc9758e19429a1f 21 | 442:3902c9005a0563742fc4acb2c011b164 22 | 445:c8351276-76de-41f1-b294-4f3e5d373c8c 23 | 626:a79b1b7d8cc5273d4995fec5e122e44b 24 | 646:6fdc62d37aaf685b809c501abe13c56c 25 | 690:defd7f4a85496d52a210938d58a7ae76 26 | 801:b0235f56-1cce-11e4-ae54-0cfe1f974f8a 27 | 802:6668d83480f5c58b54a90770835ac2d4 28 | 803:cad56e871cd0bca6cc77e97ffe246258 29 | 804:579e9ae8-6a2f-11e6-8225-fbb8a6fc65bc 30 | 805:5ec40b6bc6c5f4487132da7be04fc914 31 | 806:2bea9433d4e1050c9c85175df466b3e2 32 | 807:11915bd8-7944-11e2-9c27-fdd594ea6286 33 | 808:30a493b8-fb07-11e4-9ef4-1bb7ce3b3fb7 34 | 809:02e52bdba097c9df4cbae66e04f82542 35 | 810:9dd7b85cd1e3da1b5c8e79f32fec7177 36 | 811:a244d1e0cfd916a2af76b6a6c785b017 37 | 812:dcd1560bd13a0b665b95d3ba27cc960c 38 | 813:b4c6361974466458bb721b9b1628220b 39 | 814:e1336b8f-b0c2-4610-9a3c-ec85a546c9ad 40 | 815:a36fa8a2-8962-11e6-bff0-d53f592f176e 41 | 816:37a8e2283e4677b703f6464d0191a700 42 | 817:bd1e6cc8d7525fec36a717be45638bf4 43 | 818:a2744bb98e1968307548e4976232cf1c 44 | 819:5f37aac53768e749b861028397eb6849 45 | 820:fc1ca759c9c433376e71884870d225ab 46 | 821:c6bf4a4bf542b7c67987c222d73def4b 47 | 822:43e9f3f12982c0e0bb15ad64b33a89c0 48 | 823:c109cc839f2d2414251471c48ae5515c 49 | 824:30c00b60-13f6-11e3-b182-1b3bb2eb474c 50 | 825:a1c41a70-35c7-11e3-8a0e-4e2cf80831fc -------------------------------------------------------------------------------- /geesedb/resources/topics-and-qrels/topics.backgroundlinking19.processed.txt: -------------------------------------------------------------------------------- 1 | 826:96ab542e-6a07-11e6-ba32-5a4bf5aad4fa 2 | 827:679e8784-34df-11e7-b373-418f6849a004 3 | 
828:56989a9c-e30e-11e6-a547-5fb9411d332c 4 | 829:0d7f5e24cafc019265d3ee4b9745e7ea 5 | 830:02475047d615c46a006924f0e2317cca 6 | 831:320596f4102c94fca3f0432a8611f87d 7 | 832:b407cd8074559045a926eff226aae173 8 | 833:0e43fce6-12a9-11e5-89f3-61410da94eb1 9 | 834:99d80076323444d17769330a2fc20e93 10 | 835:c0c4e2d0-628f-11e7-a4f7-af34fc1d9d39 11 | 836:50326f32f7308f71691fafbdc25e1dc3 12 | 837:664638e8-6dc3-11e6-9705-23e51a2f424d 13 | 838:8a6a9e3c-fff3-11e5-8bb1-f124a43f84dc 14 | 839:5331852c50a9cb4b0ff655ecf4a7b6a6 15 | 840:55060482bf60235610936dd9cd2e54b4 16 | 841:2dc6b4a1a5ec8579292e35163982cdb6 17 | 842:7ffad2c2-0c57-11e5-95fd-d580f1c5d44e 18 | 843:53e0938aa0a0d14afb74effcd7819c79 19 | 844:84b81394-39ad-11e7-a59b-26e0451a96fd 20 | 845:d71b4204-9ec8-11e5-bce4-708fe33e3288 21 | 846:6dc42582bb2c8ec59dba0fb85b4848ea 22 | 847:5cc92f1c73c531dc0b470e771e0eb16f 23 | 848:5a50785cf124c29ad3e9d1e112973086 24 | 849:a9e12ff8fc83d08e869ba64669d350fd 25 | 850:540fc83a-33fa-11e7-b373-418f6849a004 26 | 851:c65e0878-418a-11e7-9869-bac8b446820a 27 | 852:6f1c065141e1dea9ae66c34f92841170 28 | 853:415e53f9e697cfb7baa46d9c494d2ece 29 | 854:d0828a0fea8f9110154b944984cd8f28 30 | 855:0bc6e8a10a6df059d2aa5ae23281b7fb 31 | 856:88bd7be0fca52c06cd550c14ce9d2416 32 | 857:6fe0cb2ea7838ac2c29fa0539c6bce1d 33 | 858:587a617a-3e3f-11e6-84e8-1580c7db5275 34 | 859:663c2790-3f8e-11e5-9561-4b3dc93e3b9a 35 | 860:9584295234e46a836dfde4f42ba2ca09 36 | 861:555bbc1c46a91e62bdcbbd0023f849a4 37 | 862:623ccdaa-6013-11e7-a4f7-af34fc1d9d39 38 | 863:bf9d84082043dded9b4feb0767eee96e 39 | 864:cba3afc578eac3b7afe15899313cdd3a 40 | 865:f5a5cd2c3ef20d0550a3dd623702d23d 41 | 866:98791c1f1f74e7bd11e1369807be3a89 42 | 867:babfe8b6d17dbc18e38707877ac75e4e 43 | 868:2c9485cf6bf33f13740dcb04c572b9e2 44 | 869:4d2e805d3293fdef30ce737a97528247 45 | 870:d7d906991e2883889f850de9ae06655e 46 | 871:7b7126ae-0500-11e7-b1e9-a05d3c21f7cf 47 | 872:e833068d3a08ad64398330bbc3b1759b 48 | 873:05c0a5ad-108f-46b9-b6f7-5f2303f6cdfe 49 | 874:fb6c2426-ea52-11e6-b82f-687d6e6a3e7c 50 | 875:14f6eb04dbaa9c5d6f5170cffc67e463 51 | 876:fef77fc9335b33d975132ce603182846 52 | 877:7047115870d7910b42bc779541f5deb5 53 | 878:97b489e2-0a38-11e5-9e39-0db921c47b93 54 | 879:5adb2f7230907f4006063656b8400742 55 | 880:58739d169d364163c1e81a8f081cdc9a 56 | 881:1342bcb6-ec45-11e5-a6f3-21ccdbc5f74e 57 | 882:74ca2f03320df21995165b6bb9bb4ddb 58 | 883:aaf444787011938dc645bdc1185a0716 59 | 884:681e77ce-dffd-11e5-9c36-e1902f6b6571 60 | 885:5ae44bfd66a49bcad7b55b29b55d63b6 -------------------------------------------------------------------------------- /geesedb/resources/topics-and-qrels/topics.core17.processed.txt: -------------------------------------------------------------------------------- 1 | 307:new hydroelectr project 2 | 310:radio wave brain cancer 3 | 321:women parliament 4 | 325:cult lifestyl 5 | 330:iran iraq cooper 6 | 336:black bear attack 7 | 341:airport secur 8 | 344:abus e mail 9 | 345:oversea tobacco sale 10 | 347:wildlif extinct 11 | 350:health comput termin 12 | 353:antarctica explor 13 | 354:journalist risk 14 | 355:ocean remot sens 15 | 356:postmenopaus estrogen britain 16 | 362:human smuggl 17 | 363:transport tunnel disast 18 | 367:piraci 19 | 372:nativ american casino 20 | 375:hydrogen energi 21 | 378:euro opposit 22 | 379:mainstream 23 | 389:illeg technolog transfer 24 | 393:merci kill 25 | 394:home school 26 | 397:automobil recal 27 | 399:oceanograph vessel 28 | 400:amazon rain forest 29 | 404:ireland peac talk 30 | 408:tropic storm 31 | 414:cuba sugar export 32 | 416:three gorg 
project 33 | 419:recycl automobil tire 34 | 422:art stolen forg 35 | 423:milosev mirjana markov 36 | 426:law enforc dog 37 | 427:uv damag ey 38 | 433:greek philosophi stoicism 39 | 435:curb popul growth 40 | 436:railwai accid 41 | 439:invent scientif discoveri 42 | 442:heroic act 43 | 443:u. invest africa 44 | 445:women clergi 45 | 614:flavr savr tomato 46 | 620:franc nuclear test 47 | 626:human stamped 48 | 646:food stamp increas 49 | 677:lean tower pisa 50 | 690:colleg educ advantag 51 | -------------------------------------------------------------------------------- /geesedb/resources/topics-and-qrels/topics.core18.processed.txt: -------------------------------------------------------------------------------- 1 | 321:women parliament 2 | 336:black bear attack 3 | 341:airport secur 4 | 347:wildlif extinct 5 | 350:health comput termin 6 | 362:human smuggl 7 | 363:transport tunnel disast 8 | 367:piraci 9 | 375:hydrogen energi 10 | 378:euro opposit 11 | 393:merci kill 12 | 397:automobil recal 13 | 400:amazon rain forest 14 | 408:tropic storm 15 | 414:cuba sugar export 16 | 422:art stolen forg 17 | 426:law enforc dog 18 | 427:uv damag ey 19 | 433:greek philosophi stoicism 20 | 439:invent scientif discoveri 21 | 442:heroic act 22 | 445:women clergi 23 | 626:human stamped 24 | 646:food stamp increas 25 | 690:colleg educ advantag 26 | 801:africa polio vaccin 27 | 802:women drive saudi arabia 28 | 803:declin middl class u. 29 | 804:women 20 30 | 805:eat invas speci 31 | 806:comput paralyz peopl 32 | 807:chavez medic treatment cuba 33 | 808:boston marathon bomb verdict 34 | 809:protect earth from asteroid 35 | 810:diabet toxic chemic 36 | 811:car hack 37 | 812:social media teen suicid 38 | 813:marijuana potenc 39 | 814:china on child impact 40 | 815:jason rezaian releas from iran 41 | 816:feder minimum wage increas 42 | 817:alan gross releas cuba 43 | 818:egg healthi diet 44 | 819:u. 
ag demograph 45 | 820:bacteri infect mortal rate 46 | 821:email scam 47 | 822:soni cyberattack 48 | 823:control mrsa 49 | 824:bezo purchas washington post 50 | 825:ethanol food price 51 | -------------------------------------------------------------------------------- /geesedb/resources/topics-and-qrels/topics.robust04.processed.txt: -------------------------------------------------------------------------------- 1 | 301:intern organ crime 2 | 302:poliomyel post polio 3 | 303:hubbl telescop achiev 4 | 304:endang speci mammal 5 | 305:most danger vehicl 6 | 306:african civilian death 7 | 307:new hydroelectr project 8 | 308:implant dentistri 9 | 309:rap crime 10 | 310:radio wave brain cancer 11 | 311:industri espionag 12 | 312:hydropon 13 | 313:magnet levit maglev 14 | 314:marin veget 15 | 315:unexplain highwai accid 16 | 316:polygami polyandri polygyni 17 | 317:unsolicit fax 18 | 318:best retir countri 19 | 319:new fuel sourc 20 | 320:undersea fiber optic cabl 21 | 321:women parliament 22 | 322:intern art crime 23 | 323:literari journalist plagiar 24 | 324:argentin british relat 25 | 325:cult lifestyl 26 | 326:ferri sink 27 | 327:modern slaveri 28 | 328:pope beatif 29 | 329:mexican air pollut 30 | 330:iran iraq cooper 31 | 331:world bank critic 32 | 332:incom tax evas 33 | 333:antibiot bacteria diseas 34 | 334:export control cryptographi 35 | 335:adopt biolog parent 36 | 336:black bear attack 37 | 337:viral hepat 38 | 338:risk aspirin 39 | 339:alzheim drug treatment 40 | 340:land mine ban 41 | 341:airport secur 42 | 342:diplomat expuls 43 | 343:polic death 44 | 344:abus e mail 45 | 345:oversea tobacco sale 46 | 346:educ standard 47 | 347:wildlif extinct 48 | 348:agoraphobia 49 | 349:metabol 50 | 350:health comput termin 51 | 351:falkland petroleum explor 52 | 352:british chunnel impact 53 | 353:antarctica explor 54 | 354:journalist risk 55 | 355:ocean remot sens 56 | 356:postmenopaus estrogen britain 57 | 357:territori water disput 58 | 358:blood alcohol fatal 59 | 359:mutual fund predictor 60 | 360:drug legal benefit 61 | 361:cloth sweatshop 62 | 362:human smuggl 63 | 363:transport tunnel disast 64 | 364:rabi 65 | 365:el nino 66 | 366:commerci cyanid us 67 | 367:piraci 68 | 368:vitro fertil 69 | 369:anorexia nervosa bulimia 70 | 370:food drug law 71 | 371:health insur holist 72 | 372:nativ american casino 73 | 373:encrypt equip export 74 | 374:nobel prize winner 75 | 375:hydrogen energi 76 | 376:world court 77 | 377:cigar smoke 78 | 378:euro opposit 79 | 379:mainstream 80 | 380:obes medic treatment 81 | 381:altern medicin 82 | 382:hydrogen fuel automobil 83 | 383:mental ill drug 84 | 384:space station moon 85 | 385:hybrid fuel car 86 | 386:teach disabl children 87 | 387:radioact wast 88 | 388:organ soil enhanc 89 | 389:illeg technolog transfer 90 | 390:orphan drug 91 | 391:r d drug price 92 | 392:robot 93 | 393:merci kill 94 | 394:home school 95 | 395:tourism 96 | 396:sick build syndrom 97 | 397:automobil recal 98 | 398:dismantl europ arsen 99 | 399:oceanograph vessel 100 | 400:amazon rain forest 101 | 401:foreign minor germani 102 | 402:behavior genet 103 | 403:osteoporosi 104 | 404:ireland peac talk 105 | 405:cosmic event 106 | 406:parkinson diseas 107 | 407:poach wildlif preserv 108 | 408:tropic storm 109 | 409:legal pan am 103 110 | 410:schengen agreement 111 | 411:salvag shipwreck treasur 112 | 412:airport secur 113 | 413:steel product 114 | 414:cuba sugar export 115 | 415:drug golden triangl 116 | 416:three gorg project 117 | 417:creativ 118 | 418:quilt incom 119 | 419:recycl 
automobil tire 120 | 420:carbon monoxid poison 121 | 421:industri wast dispos 122 | 422:art stolen forg 123 | 423:milosev mirjana markov 124 | 424:suicid 125 | 425:counterfeit monei 126 | 426:law enforc dog 127 | 427:uv damag ey 128 | 428:declin birth rate 129 | 429:legionnair diseas 130 | 430:killer bee attack 131 | 431:robot technolog 132 | 432:profil motorist polic 133 | 433:greek philosophi stoicism 134 | 434:estonia economi 135 | 435:curb popul growth 136 | 436:railwai accid 137 | 437:deregul ga electr 138 | 438:tourism increas 139 | 439:invent scientif discoveri 140 | 440:child labor 141 | 441:lyme diseas 142 | 442:heroic act 143 | 443:u. invest africa 144 | 444:supercrit fluid 145 | 445:women clergi 146 | 446:tourist violenc 147 | 447:stirl engin 148 | 448:ship loss 149 | 449:antibiot ineffect 150 | 450:king hussein peac 151 | 601:turkei iraq water 152 | 602:czech slovak sovereignti 153 | 603:tobacco cigarett lawsuit 154 | 604:lyme diseas arthriti 155 | 605:great britain health care 156 | 606:leg trap ban 157 | 607:human genet code 158 | 608:tax social secur 159 | 609:per capita alcohol consumpt 160 | 610:minimum wage advers impact 161 | 611:kurd germani violenc 162 | 612:tibet protest 163 | 613:berlin wall dispos 164 | 614:flavr savr tomato 165 | 615:timber export asia 166 | 616:volkswagen mexico 167 | 617:russia cuba economi 168 | 618:ayatollah khomeini death 169 | 619:winni mandela scandal 170 | 620:franc nuclear test 171 | 621:women ordain church england 172 | 622:price fix 173 | 623:toxic chemic weapon 174 | 624:sdi star war 175 | 625:arrest bomb wtc 176 | 626:human stamped 177 | 627:russian food crisi 178 | 628:u. invas panama 179 | 629:abort clinic attack 180 | 630:gulf war syndrom 181 | 631:mandela south africa presid 182 | 632:southeast asia tin mine 183 | 633:welsh devolut 184 | 634:l tryptophan death 185 | 635:doctor assist suicid 186 | 636:juri duti exempt 187 | 637:human growth hormon hgh 188 | 638:wrong convict 189 | 639:consum line shop 190 | 640:matern leav polici 191 | 641:valdez wildlif marin life 192 | 642:tiananmen squar protest 193 | 643:salmon dam pacif northwest 194 | 644:exot anim import 195 | 645:softwar piraci 196 | 646:food stamp increas 197 | 647:windmil electr 198 | 648:famili leav law 199 | 649:comput virus 200 | 650:tax evas indict 201 | 651:u. ethnic popul 202 | 652:oic balkan 1990 203 | 653:eta basqu terror 204 | 654:same sex school 205 | 655:add diagnosi treatment 206 | 656:lead poison children 207 | 657:school prayer ban 208 | 658:teenag pregnanc 209 | 659:cruis health safeti 210 | 660:whale watch california 211 | 661:melanoma treatment caus 212 | 662:telemarket protect 213 | 663:agent orang exposur 214 | 664:american indian museum 215 | 665:poverti africa sub sahara 216 | 666:thatcher resign impact 217 | 667:unmarri partner household 218 | 668:poverti diseas 219 | 669:islam revolut 220 | 670:u. elect apathi 221 | 671:salvat armi benefit 222 | 672:nra membership profil 223 | 673:soviet withdraw afghanistan 224 | 674:greenpeac prosecut 225 | 675:olymp train swim 226 | 676:poppi cultiv 227 | 677:lean tower pisa 228 | 678:joint custodi impact 229 | 679:open adopt record 230 | 680:immigr spanish school 231 | 681:wind power locat 232 | 682:adult immigr english 233 | 683:czechoslovakia breakup 234 | 684:part time benefit 235 | 685:oscar winner select 236 | 686:argentina peg dollar 237 | 687:northern ireland industri 238 | 688:non u. 
media bia 239 | 689:famili plan aid 240 | 690:colleg educ advantag 241 | 691:clear cut forest 242 | 692:prostat cancer detect treatment 243 | 693:newspap electron media 244 | 694:compost pile 245 | 695:white collar crime sentenc 246 | 696:safeti plastic surgeri 247 | 697:air traffic control 248 | 698:literaci rate africa 249 | 699:term limit 250 | 700:gasolin tax u. 251 | -------------------------------------------------------------------------------- /geesedb/resources/topics.py: -------------------------------------------------------------------------------- 1 | def get_topics_backgroundlinking(file_name): 2 | with open(file_name) as topics_file: 3 | return [topic.strip().split(':') for topic in topics_file.readlines()] -------------------------------------------------------------------------------- /geesedb/search/__init__.py: -------------------------------------------------------------------------------- 1 | from .retrieval_models.bag_of_words.disjunctive.robertson_bm25 import RobertsonBM25 2 | from .searcher import Searcher 3 | 4 | __all__ = ['RobertsonBM25', 'Searcher'] 5 | -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/search/retrieval_models/__init__.py -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/bag_of_words/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/search/retrieval_models/bag_of_words/__init__.py -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/bag_of_words/aggregate.py: -------------------------------------------------------------------------------- 1 | class Aggregate: 2 | 3 | def __init__(self) -> None: 4 | pass 5 | 6 | def get_aggregator(self) -> str: 7 | raise NotImplementedError("You should implement this method in your retrieval model class.") 8 | 9 | def get_create_ranked_list(self, n: int) -> str: 10 | raise NotImplementedError("You should implement this method in your retrieval model class.") 11 | -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/bag_of_words/bow_retrieval_model.py: -------------------------------------------------------------------------------- 1 | from .aggregate import Aggregate 2 | from ..generic_text_retrieval_model import GenericTextRetrievalModel 3 | 4 | 5 | class BagOfWordsRetrievalModel(GenericTextRetrievalModel, Aggregate): 6 | def __init__(self) -> None: 7 | GenericTextRetrievalModel.__init__(self) 8 | Aggregate.__init__(self) 9 | 10 | def construct_query(self, topic: str) -> str: 11 | super_query = super().construct_query(topic) 12 | return super_query + ", qterms AS (" \ 13 | "SELECT term_doc.term_id, term_doc.doc_id, term_doc.tf, qtermids.df " \ 14 | "FROM term_doc " \ 15 | "JOIN qtermids " \ 16 | "ON term_doc.term_id = qtermids.term_id" \ 17 | ") " 18 | 19 | def get_retrieval_model(self) -> str: 20 | return super().get_retrieval_model() 21 | 22 | def get_aggregator(self) -> str: 23 | return ", scores AS (" \ 24 | "SELECT subscores.collection_id, SUM(subscores.subscore) AS score " \ 25 | "FROM subscores " \ 26 | "GROUP BY 
subscores.collection_id) " 27 | 28 | def get_create_ranked_list(self, n: int) -> str: 29 | return "SELECT scores.collection_id, scores.score " \ 30 | "FROM scores " \ 31 | "ORDER BY scores.score DESC " \ 32 | f"LIMIT {n}" 33 | -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/bag_of_words/conjunctive/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/search/retrieval_models/bag_of_words/conjunctive/__init__.py -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/bag_of_words/disjunctive/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/search/retrieval_models/bag_of_words/disjunctive/__init__.py -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/bag_of_words/disjunctive/disjunctive_retieval_model.py: -------------------------------------------------------------------------------- 1 | from ..bow_retrieval_model import BagOfWordsRetrievalModel 2 | 3 | 4 | class DisjunctiveRetrievalModel(BagOfWordsRetrievalModel): 5 | 6 | def __init__(self) -> None: 7 | super().__init__() 8 | 9 | def get_aggregator(self) -> str: 10 | return super().get_aggregator() 11 | 12 | def construct_query(self, topic: str) -> str: 13 | return super().construct_query(topic) + \ 14 | ", condocs AS (" \ 15 | "SELECT qterms.doc_id " \ 16 | "FROM qterms " \ 17 | "GROUP BY qterms.doc_id)" 18 | 19 | def get_create_ranked_list(self, n: int) -> str: 20 | return super().get_create_ranked_list(n) 21 | 22 | def get_retrieval_model(self) -> str: 23 | return super().get_retrieval_model() 24 | -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/bag_of_words/disjunctive/robertson_bm25.py: -------------------------------------------------------------------------------- 1 | from .disjunctive_retieval_model import DisjunctiveRetrievalModel 2 | 3 | 4 | class RobertsonBM25(DisjunctiveRetrievalModel): 5 | def __init__(self, k1: float = 0.9, b: float = 0.4, n: int = 1000) -> None: 6 | DisjunctiveRetrievalModel.__init__(self) 7 | self.k1 = k1 8 | self.b = b 9 | self.n = n 10 | 11 | def construct_query(self, topic: str) -> str: 12 | return DisjunctiveRetrievalModel.construct_query(self, topic) + \ 13 | self.get_retrieval_model() + \ 14 | DisjunctiveRetrievalModel.get_aggregator(self) + \ 15 | DisjunctiveRetrievalModel.get_create_ranked_list(self, self.n) 16 | 17 | def get_retrieval_model(self) -> str: 18 | return ", subscores AS (" \ 19 | "SELECT docs.collection_id, " \ 20 | f"(LOG(((SELECT count(*) from docs)-df+0.5)/(df+0.5))*tf" \ 21 | "/" \ 22 | f"(tf+{self.k1}*(1-{self.b}+{self.b}*len/(SELECT AVG(len) from docs)))" \ 23 | ") AS subscore " \ 24 | "FROM qterms " \ 25 | "JOIN condocs " \ 26 | "ON qterms.doc_id = condocs.doc_id " \ 27 | "JOIN docs " \ 28 | "ON qterms.doc_id = docs.doc_id)" 29 | -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/generic_text_retrieval_model.py: -------------------------------------------------------------------------------- 1 | class GenericTextRetrievalModel: 2 | 3 | def __init__(self) -> None: 4 | pass 5 | 6 | def 
construct_query(self, topic: str) -> str: 7 | return "WITH qtermids AS (" \ 8 | "SELECT term_dict.term_id, term_dict.df " \ 9 | "FROM term_dict " \ 10 | "WHERE term_dict.string IN ('{}')" \ 11 | ")".format("', '".join(topic.split(' '))) 12 | 13 | def get_retrieval_model(self) -> str: 14 | raise NotImplementedError("You should implement this method in your retrieval model class.") 15 | -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/graph/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/search/retrieval_models/graph/__init__.py -------------------------------------------------------------------------------- /geesedb/search/retrieval_models/positional/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/search/retrieval_models/positional/__init__.py -------------------------------------------------------------------------------- /geesedb/search/searcher.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import argparse 4 | from typing import Any, Callable, Union 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from ..connection import get_connection 10 | from ..search import RobertsonBM25 11 | 12 | 13 | class Searcher: 14 | 15 | def __init__(self, **kwargs: Any) -> None: 16 | self.arguments = self.get_arguments(kwargs) 17 | self.db_connection = get_connection(self.arguments['database']) 18 | self.ranking_method = None 19 | self.fetch = self.set_return_type() 20 | if self.arguments['retrieval_method'] == 'BM25_robertson': 21 | self.ranking_method = RobertsonBM25(self.arguments['k1'], self.arguments['b'], self.arguments['n']) 22 | 23 | @staticmethod 24 | def get_arguments(kwargs: Any) -> dict: 25 | arguments = { 26 | 'database': None, 27 | 'retrieval_method': 'BM25_robertson', 28 | 'k1': 0.9, 29 | 'b': 0.4, 30 | 'n': 1000, 31 | 'return_type': 'data_frame' 32 | } 33 | for key, item in arguments.items(): 34 | if kwargs.get(key) is not None: 35 | arguments[key] = kwargs.get(key) 36 | if arguments['database'] is None: 37 | raise IOError('database path needs to be provided') 38 | return arguments 39 | 40 | def set_return_type(self) -> Callable[[], Union[list, pd.DataFrame, np.ndarray]]: 41 | if self.arguments['return_type'] == 'list': 42 | fetch = self.db_connection.cursor.fetchall 43 | elif self.arguments['return_type'] == 'numpy': 44 | fetch = self.db_connection.cursor.fetchnumpy 45 | else: 46 | fetch = self.db_connection.cursor.fetchdf 47 | return fetch 48 | 49 | def set_k1(self, k1: float): 50 | self.arguments['k1'] = k1 51 | 52 | def set_b(self, b: float): 53 | self.arguments['b'] = b 54 | 55 | def set_n(self, n: int): 56 | self.arguments['n'] = n 57 | 58 | def search_topic(self, topic: str) -> Union[list, pd.DataFrame, np.ndarray]: 59 | query = self.ranking_method.construct_query(topic) 60 | self.db_connection.cursor.execute(query) 61 | return self.fetch() 62 | 63 | 64 | if __name__ == '__main__': 65 | parser = argparse.ArgumentParser() 66 | parser.add_argument('-d', 67 | '--database', 68 | required=True, 69 | help='Name of the database / index') 70 | parser.add_argument('-r', 71 | '--retrieval_method', 72 | choices=['BM25_robertson'], 73 | help="Use Robertson's BM25 
ranking function") 74 | parser.add_argument('-k1') 75 | parser.add_argument('-b') 76 | parser.add_argument('-n') 77 | parser.add_argument('-t', 78 | '--return_type', 79 | choices=['numpy', 'data_frame', 'list'] 80 | ) 81 | Searcher(**vars(parser.parse_args())) 82 | -------------------------------------------------------------------------------- /geesedb/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/tests/__init__.py -------------------------------------------------------------------------------- /geesedb/tests/connection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/tests/connection/__init__.py -------------------------------------------------------------------------------- /geesedb/tests/connection/test_connection.py: -------------------------------------------------------------------------------- 1 | from ...connection import get_connection, close_connection 2 | 3 | 4 | def test_create_connection() -> None: 5 | db_connection = get_connection(':memory:') 6 | cursor = db_connection.cursor 7 | cursor.execute("SELECT 1;") 8 | assert cursor.fetchone() == (1,) 9 | close_connection() 10 | -------------------------------------------------------------------------------- /geesedb/tests/index/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/tests/index/__init__.py -------------------------------------------------------------------------------- /geesedb/tests/index/test_authors_from_csv.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | from ...index import AuthorsFromCSV 4 | from ...connection import close_connection 5 | 6 | 7 | def test_load_csv_example_files() -> None: 8 | index = AuthorsFromCSV(database=':memory:', 9 | doc_author_file=path.dirname( 10 | path.dirname(__file__)) + '/resources/csv/example_doc_author.csv' 11 | ) 12 | 13 | index.connection.execute("SELECT * FROM doc_author;") 14 | assert index.connection.fetchone() == ('b2e89334-33f9-11e1-825f-dabc29fd7071', 'Mark Giannotto') 15 | close_connection() 16 | -------------------------------------------------------------------------------- /geesedb/tests/index/test_entities_from_csv.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | from ...index import EntitiesFromCSV 4 | from ...connection import close_connection 5 | 6 | def test_load_csv_example_files() -> None: 7 | index = EntitiesFromCSV(database=':memory:', 8 | entity_doc_file=path.dirname( 9 | path.dirname(__file__)) + '/resources/csv/example_entity_doc.csv' 10 | ) 11 | 12 | index.connection.execute("SELECT * FROM entity_doc;") 13 | assert index.connection.fetchone() == (0, 11, 'Danny Coale', 'Danny_Coale', 'PER', 14 | 'b2e89334-33f9-11e1-825f-dabc29fd7071') 15 | close_connection() 16 | -------------------------------------------------------------------------------- /geesedb/tests/index/test_fulltext_from_ciff.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | from ...index import FullTextFromCiff 4 | from ...connection import close_connection 5 | 
6 | def test_load_csv_example_files() -> None: 7 | index = FullTextFromCiff(database=':memory:', 8 | protobuf_file=path.dirname(path.dirname(__file__) 9 | ) + '/resources/ciff/toy-complete-20200309.ciff.gz' 10 | ) 11 | index.load_data() 12 | index.cursor.execute("SELECT * FROM docs;") 13 | assert index.cursor.fetchone() == ('WSJ_1', 0, 6) 14 | assert index.cursor.fetchone() == ('TREC_DOC_1', 1, 4) 15 | close_connection() 16 | -------------------------------------------------------------------------------- /geesedb/tests/index/test_fulltext_from_csv.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | 3 | from ...index import FullTextFromCSV 4 | from ...connection import close_connection 5 | 6 | def test_load_csv_example_files() -> None: 7 | index = FullTextFromCSV(database=':memory:', 8 | docs_file=path.dirname(path.dirname(__file__)) + '/resources/csv/example_docs.csv', 9 | term_dict_file=path.dirname( 10 | path.dirname(__file__)) + '/resources/csv/example_term_dict.csv', 11 | term_doc_file=path.dirname(path.dirname(__file__)) + '/resources/csv/example_term_doc.csv' 12 | ) 13 | index.load_data() 14 | index.connection.execute("SELECT * FROM docs;") 15 | assert index.connection.fetchone() == ('document_0', 0, 3) 16 | assert index.connection.fetchone() == ('document_1', 1, 4) 17 | close_connection() 18 | 19 | 20 | def test_load_csv_use_existing_database_does_not_exist() -> None: 21 | try: 22 | FullTextFromCSV(database='test_database', 23 | use_existing_db=True 24 | ) 25 | assert False 26 | except IOError: 27 | assert True 28 | close_connection() 29 | -------------------------------------------------------------------------------- /geesedb/tests/resources/ciff/toy-complete-20200309.ciff.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/tests/resources/ciff/toy-complete-20200309.ciff.gz -------------------------------------------------------------------------------- /geesedb/tests/resources/csv/example_doc_author.csv: -------------------------------------------------------------------------------- 1 | b2e89334-33f9-11e1-825f-dabc29fd7071|Mark Giannotto 2 | ca334a80-34d4-11e1-88f9-9084fc48c348|Mark Giannotto 3 | b4bf35ea-3585-11e1-836b-08c4de636de4|Mark Giannotto 4 | c5f7c1be-368a-11e1-9ee3-fd35588e7629|Mark Giannotto 5 | 48b0a076-370d-11e1-9ee3-fd35588e7629|Mark Giannotto 6 | ffef8298-36e0-11e1-9ee3-fd35588e7629|Mark Giannotto 7 | 806f619a-3975-11e1-81ef-eaf2bd09c8a2|Mark Giannotto 8 | 07c32036-3bf2-11e1-a72c-c808ebbd31f5|Mark Giannotto 9 | f46b5060-3efa-11e1-804a-d8db7cc3d3b2|Mark Giannotto 10 | 7aa5c316-4204-11e1-9091-3ad6d04900db|Mark Giannotto 11 | 8d091e8d2adec74088b89aea54e5fff0|Katie Carrera 12 | fb7979c77a02b72ba079628964216853|Katie Carrera 13 | 354dbc1b6ae6f1ead6d449005c0d46ff|Katie Carrera 14 | 6173699a05b6bb212bbf4cbe36de1a2e|Katie Carrera 15 | 5d86156e7e96e4c566cf5d130cfe23cb|Katie Carrera 16 | 70ffc51bf54d594b47a5839f736927b7|Katie Carrera 17 | 37e0233314d3955074b1359b7468e5d5|Katie Carrera 18 | d0e3d979102c23cd9b40761fca6402d8|Katie Carrera 19 | cd905bce531fac9140200b36b6791df6|Katie Carrera 20 | e4a267f731660d2f9e65a9ad7be57012|Katie Carrera 21 | 2056bbeebf83491579d386e2a50a979d|Katie Carrera 22 | d114557a24463b3dd81e41599d4deecf|Mark Berman 23 | 1fa1eaa46f116913ba9dbf3686c58a5b|Mark Berman 24 | e407b71ea43ef9b6f9ec98bfedf720e9|Mark Berman 25 | 68a63858c0a67d25ef243f4a0f6676a9|Chris 
Cillizza 26 | ab120f52c7b6412af966aafbe4718ae2|Chris Cillizza 27 | e96cd810bdd79b30d88447e92ea0d5c7|Chris Cillizza 28 | 8f70f98e4f6a6e89f1434eb8be266aba|Chris Cillizza 29 | 51bd464f9881b8bbc19c965dc9e34828|Chris Cillizza 30 | e8a6f0d880caf7a19974bd63581ce41d|Chris Cillizza 31 | -------------------------------------------------------------------------------- /geesedb/tests/resources/csv/example_docs.csv: -------------------------------------------------------------------------------- 1 | document_0|0|3 2 | document_1|1|4 -------------------------------------------------------------------------------- /geesedb/tests/resources/csv/example_entity_doc.csv: -------------------------------------------------------------------------------- 1 | 0|11|Danny Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 2 | 13|14|Jarrett Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 3 | 56|13|Virginia Tech|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 4 | 79|13|Virginia Tech|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 5 | 107|11|Danny Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 6 | 153|9|Episcopal|Episcopal_Academy|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 7 | 166|10|Alexandria|Alexandria,_Virginia|LOC|b2e89334-33f9-11e1-825f-dabc29fd7071 8 | 197|14|Jarrett Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 9 | 264|2|AP|AP_Poll|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 10 | 268|11|NEW ORLEANS|New_Orleans|LOC|b2e89334-33f9-11e1-825f-dabc29fd7071 11 | 293|13|Virginia Tech|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 12 | 445|14|North Carolina|North_Carolina|LOC|b2e89334-33f9-11e1-825f-dabc29fd7071 13 | 504|13|Virginia Tech|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 14 | 574|14|Jarrett Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 15 | 593|11|Danny Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 16 | 754|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 17 | 765|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 18 | 827|13|Virginia Tech|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 19 | 964|5|Macho|Macho_Harris|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 20 | 971|6|Harris|Franco_Harris|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 21 | 1024|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 22 | 1035|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 23 | 1152|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 24 | 1471|6|Hokies|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 25 | 1488|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 26 | 1529|13|Virginia Tech|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 27 | 1598|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 28 | 1703|12|Frank Beamer|Frank_Beamer|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 29 | 1745|5|Danny|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 30 | 1881|12|Logan Thomas|Logan_Thomas|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 31 | 1935|6|Beamer|Frank_Beamer|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 32 | 2023|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 33 | 2088|13|Virginia Tech|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 34 | 2130|6|Hokies|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 35 | 2231|12|David 
Wilson|David_Wilson_(American_football)|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 36 | 2251|3|ACC|Atlantic_Coast_Conference|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 37 | 2300|6|Thomas|Logan_Thomas|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 38 | 2433|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 39 | 2475|14|South Carolina|South_Carolina_Gamecocks_football|LOC|b2e89334-33f9-11e1-825f-dabc29fd7071 40 | 2505|14|Alshon Jeffrey|Alshon_Jeffery|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 41 | 2525|14|Oklahoma State|Oklahoma_State_Cowboys_football|LOC|b2e89334-33f9-11e1-825f-dabc29fd7071 42 | 2555|15|Justin Blackmon|Justin_Blackmon|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 43 | 3019|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 44 | 3082|3|ACC|Atlantic_Coast_Conference|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 45 | 3188|8|Facebook|Facebook|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 46 | 3200|7|Twitter|Twitter|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 47 | 3209|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 48 | 3258|6|Beamer|Frank_Beamer|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 49 | 3390|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 50 | 3449|10|Blacksburg|Blacksburg,_Virginia|LOC|b2e89334-33f9-11e1-825f-dabc29fd7071 51 | 3508|3|VMI|Virginia_Military_Institute|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 52 | 3572|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 53 | 3660|13|Virginia Tech|Virginia_Tech_Hokies_football|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 54 | 3700|3|NFL|National_Football_League|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 55 | 3713|11|Eddie Royal|Eddie_Royal|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 56 | 3726|11|Andre Davis|André_Davis|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 57 | 3742|11|Josh Morgan|Josh_Morgan|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 58 | 3962|6|Thomas|Logan_Thomas|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 59 | 4061|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 60 | 4072|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 61 | 4302|3|ACC|Atlantic_Coast_Conference|ORG|b2e89334-33f9-11e1-825f-dabc29fd7071 62 | 4335|6|Boykin|Jarrett_Boykin|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 63 | 4497|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 64 | 4581|5|Coale|Danny_Coale|PER|b2e89334-33f9-11e1-825f-dabc29fd7071 65 | 31|8|New Year|Chinese_New_Year|MISC|749ec5b2-32f5-11e1-825f-dabc29fd7071 66 | 99|12|planet Venus|Venus|MISC|749ec5b2-32f5-11e1-825f-dabc29fd7071 67 | 158|8|New Year|Lunar_calendar|MISC|749ec5b2-32f5-11e1-825f-dabc29fd7071 68 | 367|5|Venus|Venus|PER|749ec5b2-32f5-11e1-825f-dabc29fd7071 69 | 511|5|Venus|Venus|PER|749ec5b2-32f5-11e1-825f-dabc29fd7071 70 | 586|5|Venus|Venus|PER|749ec5b2-32f5-11e1-825f-dabc29fd7071 71 | 713|7|Jupiter|Jupiter|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 72 | 771|19|Aries constellation|Aries_(constellation)|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 73 | 975|7|Jupiter|Jupiter|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 74 | 1091|10|Washington|Washington,_D.C.|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 75 | 1108|4|Mars|Mars|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 76 | 1117|6|Saturn|Saturn|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 77 | 1165|8|New Year|Chinese_New_Year|MISC|749ec5b2-32f5-11e1-825f-dabc29fd7071 78 | 1302|6|Saturn|Saturn|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 79 | 1381|5|Virgo|Virgo_(constellation)|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 80 | 1437|6|Saturn|Saturn|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 81 | 
1635|11|Quadrantids|Quadrantids|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 82 | 1810|33|International Meteor Organization|International_Meteor_Organization|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 83 | 2007|10|Big Dipper|Ursa_Major|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 84 | 2022|13|Little Dipper|Ursa_Minor|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 85 | 2096|11|Quadrantids|Quadrantids|MISC|749ec5b2-32f5-11e1-825f-dabc29fd7071 86 | 2140|3|IMO|International_Maritime_Organization|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 87 | 2278|10|open house|Open_house_(school)|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 88 | 2290|34|University of Maryland Observatory|University_of_Maryland_Observatory|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 89 | 2326|12|College Park|College_Park,_Maryland|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 90 | 2491|12|Star Atlases|Celestial_cartography|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 91 | 2548|32|Northern Virginia Astronomy Club|Northern_Virginia_Astronomy_Club|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 92 | 2616|23|George Mason University|George_Mason_University|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 93 | 2641|7|Fairfax|Fairfax,_Virginia|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 94 | 2724|9|Telescope|Telescope|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 95 | 2780|34|University of Maryland Observatory|University_of_Maryland_Observatory|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 96 | 2816|12|College Park|College_Park,_Maryland|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 97 | 2992|29|National Air and Space Museum|National_Air_and_Space_Museum|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 98 | 3023|13|National Mall|National_Mall|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 99 | 3191|10|open house|Open_house_(school)|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 100 | 3203|34|University of Maryland Observatory|University_of_Maryland_Observatory|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 101 | 3239|12|College Park|College_Park,_Maryland|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 102 | 3392|6|Bieber|Justin_Bieber|PER|749ec5b2-32f5-11e1-825f-dabc29fd7071 103 | 3403|4|Bono|Bono|PER|749ec5b2-32f5-11e1-825f-dabc29fd7071 104 | 3497|11|Takoma Park|Takoma_Park,_Maryland|LOC|749ec5b2-32f5-11e1-825f-dabc29fd7071 105 | 3602|12|Solar System|Solar_System|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 106 | 3780|15|Albert Einstein|Albert_Einstein|PER|749ec5b2-32f5-11e1-825f-dabc29fd7071 107 | 3809|29|National Air and Space Museum|National_Air_and_Space_Museum|ORG|749ec5b2-32f5-11e1-825f-dabc29fd7071 108 | 0|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 109 | 15|3|GOP|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 110 | 88|10|DES MOINES|Des_Moines,_Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 111 | 150|10|Republican|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 112 | 191|3|GOP|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 113 | 324|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 114 | 407|19|Des Moines Register|The_Des_Moines_Register|ORG|69654742-33d7-11e1-825f-dabc29fd7071 115 | 487|13|Massachusetts|Governor_of_Massachusetts|LOC|69654742-33d7-11e1-825f-dabc29fd7071 116 | 510|11|Mitt Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 117 | 592|3|Rep|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 118 | 597|8|Ron Paul|Ron_Paul|PER|69654742-33d7-11e1-825f-dabc29fd7071 119 | 607|4|Tex.|Texas|LOC|69654742-33d7-11e1-825f-dabc29fd7071 120 | 
651|12|Pennsylvania|Pennsylvania|LOC|69654742-33d7-11e1-825f-dabc29fd7071 121 | 672|13|Rick Santorum|Rick_Santorum|PER|69654742-33d7-11e1-825f-dabc29fd7071 122 | 961|5|House|United_States_House_of_Representatives|ORG|69654742-33d7-11e1-825f-dabc29fd7071 123 | 975|13|Newt Gingrich|Newt_Gingrich|PER|69654742-33d7-11e1-825f-dabc29fd7071 124 | 1004|5|Texas|Texas|LOC|69654742-33d7-11e1-825f-dabc29fd7071 125 | 1015|10|Rick Perry|Rick_Perry|PER|69654742-33d7-11e1-825f-dabc29fd7071 126 | 1044|3|Rep|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 127 | 1049|16|Michele Bachmann|Michele_Bachmann|PER|69654742-33d7-11e1-825f-dabc29fd7071 128 | 1067|5|Minn.|Minnesota|LOC|69654742-33d7-11e1-825f-dabc29fd7071 129 | 1453|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 130 | 1742|12|Barack Obama|Barack_Obama|PER|69654742-33d7-11e1-825f-dabc29fd7071 131 | 1757|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 132 | 1862|10|Mason City|Mason_City,_Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 133 | 1894|8|Santorum|Rick_Santorum|PER|69654742-33d7-11e1-825f-dabc29fd7071 134 | 1950|9|Indianola|Indianola,_Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 135 | 2165|4|Paul|Ron_Paul|PER|69654742-33d7-11e1-825f-dabc29fd7071 136 | 2207|9|Rand Paul|Rand_Paul|PER|69654742-33d7-11e1-825f-dabc29fd7071 137 | 2220|8|Kentucky|Kentucky|LOC|69654742-33d7-11e1-825f-dabc29fd7071 138 | 2280|8|Ron Paul|Ron_Paul|PER|69654742-33d7-11e1-825f-dabc29fd7071 139 | 2524|8|Gingrich|Newt_Gingrich|PER|69654742-33d7-11e1-825f-dabc29fd7071 140 | 2589|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 141 | 2616|9|super PAC|Political_action_committee|MISC|69654742-33d7-11e1-825f-dabc29fd7071 142 | 2889|9|tea party|Tea_Party_movement|ORG|69654742-33d7-11e1-825f-dabc29fd7071 143 | 2921|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 144 | 2961|5|Manly|Manly_Warringah_Sea_Eagles|LOC|69654742-33d7-11e1-825f-dabc29fd7071 145 | 2988|8|Gingrich|Newt_Gingrich|PER|69654742-33d7-11e1-825f-dabc29fd7071 146 | 3102|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 147 | 3110|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 148 | 3259|5|Davey|Martin_L._Davey|PER|69654742-33d7-11e1-825f-dabc29fd7071 149 | 3338|4|Newt|Newt_Gingrich|PER|69654742-33d7-11e1-825f-dabc29fd7071 150 | 3361|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 151 | 3478|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 152 | 3614|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 153 | 3668|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 154 | 3825|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 155 | 3893|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 156 | 3938|3|GOP|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 157 | 3971|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 158 | 4039|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 159 | 4078|13|New Hampshire|New_Hampshire|LOC|69654742-33d7-11e1-825f-dabc29fd7071 160 | 4169|4|Paul|Ron_Paul|PER|69654742-33d7-11e1-825f-dabc29fd7071 161 | 4177|8|Santorum|Rick_Santorum|PER|69654742-33d7-11e1-825f-dabc29fd7071 162 | 4236|8|Gingrich|Newt_Gingrich|PER|69654742-33d7-11e1-825f-dabc29fd7071 163 | 4253|5|Perry|Rick_Perry|PER|69654742-33d7-11e1-825f-dabc29fd7071 164 | 4285|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 165 | 4332|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 166 | 4397|13|New 
Hampshire|New_Hampshire_primary|LOC|69654742-33d7-11e1-825f-dabc29fd7071 167 | 4471|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 168 | 4531|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 169 | 4590|10|Republican|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 170 | 4964|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 171 | 5036|8|Ron Paul|Ron_Paul|PER|69654742-33d7-11e1-825f-dabc29fd7071 172 | 5049|11|Mitt Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 173 | 5165|15|Steve Scheffler|Steve_Scheffler|PER|69654742-33d7-11e1-825f-dabc29fd7071 174 | 5199|32|Iowa Faith and Freedom Coalition|Faith_and_Freedom_Coalition|ORG|69654742-33d7-11e1-825f-dabc29fd7071 175 | 5343|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 176 | 5519|3|GOP|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 177 | 5826|7|America|United_States|LOC|69654742-33d7-11e1-825f-dabc29fd7071 178 | 5880|9|Scheffler|Steve_Scheffler|PER|69654742-33d7-11e1-825f-dabc29fd7071 179 | 6061|10|Adam Gregg|Adam_Gregg|PER|69654742-33d7-11e1-825f-dabc29fd7071 180 | 6079|10|Des Moines|Des_Moines,_Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 181 | 6110|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 182 | 6149|7|Le Mars|Le_Mars,_Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 183 | 6193|5|Gregg|Judd_Gregg|PER|69654742-33d7-11e1-825f-dabc29fd7071 184 | 6299|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 185 | 6358|8|Santorum|Rick_Santorum|PER|69654742-33d7-11e1-825f-dabc29fd7071 186 | 6582|8|Santorum|Rick_Santorum|PER|69654742-33d7-11e1-825f-dabc29fd7071 187 | 6686|14|Craig Robinson|Craig_Robinson_(actor)|PER|69654742-33d7-11e1-825f-dabc29fd7071 188 | 6744|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 189 | 6749|10|Republican|Republican_Party_(United_States)|MISC|69654742-33d7-11e1-825f-dabc29fd7071 190 | 6856|8|Santorum|Rick_Santorum|PER|69654742-33d7-11e1-825f-dabc29fd7071 191 | 6977|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 192 | 7205|9|Urbandale|Urbandale,_Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 193 | 7236|8|Santorum|Rick_Santorum|PER|69654742-33d7-11e1-825f-dabc29fd7071 194 | 7308|10|Iowa State|Iowa|ORG|69654742-33d7-11e1-825f-dabc29fd7071 195 | 7324|7|Rutgers|Rutgers_Scarlet_Knights_football|ORG|69654742-33d7-11e1-825f-dabc29fd7071 196 | 7339|22|New Era Pinstripe Bowl|Pinstripe_Bowl|MISC|69654742-33d7-11e1-825f-dabc29fd7071 197 | 7363|11|Evangelical|Evangelicalism|MISC|69654742-33d7-11e1-825f-dabc29fd7071 198 | 7463|12|evangelicals|Evangelicalism|MISC|69654742-33d7-11e1-825f-dabc29fd7071 199 | 7518|8|Arkansas|Arkansas|LOC|69654742-33d7-11e1-825f-dabc29fd7071 200 | 7536|13|Mike Huckabee|Mike_Huckabee|PER|69654742-33d7-11e1-825f-dabc29fd7071 201 | 7565|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 202 | 7733|8|Huckabee|Mike_Huckabee|PER|69654742-33d7-11e1-825f-dabc29fd7071 203 | 7802|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 204 | 8040|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 205 | 8109|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 206 | 8119|14|Terry Branstad|Terry_Branstad|PER|69654742-33d7-11e1-825f-dabc29fd7071 207 | 8139|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 208 | 8263|13|Supreme Court|Supreme_Court_of_the_United_States|ORG|69654742-33d7-11e1-825f-dabc29fd7071 209 | 8852|7|Florida|Florida|LOC|69654742-33d7-11e1-825f-dabc29fd7071 210 | 9019|5|Perry|Rick_Perry|PER|69654742-33d7-11e1-825f-dabc29fd7071 211 | 
9029|6|Romney|Mitt_Romney|PER|69654742-33d7-11e1-825f-dabc29fd7071 212 | 9122|8|Gingrich|Newt_Gingrich|PER|69654742-33d7-11e1-825f-dabc29fd7071 213 | 9294|8|Gingrich|Newt_Gingrich|PER|69654742-33d7-11e1-825f-dabc29fd7071 214 | 9625|8|Gingrich|Newt_Gingrich|PER|69654742-33d7-11e1-825f-dabc29fd7071 215 | 9705|3|T.W|Taiwan|PER|69654742-33d7-11e1-825f-dabc29fd7071 216 | 9710|6|Farnam|Farnam,_Nebraska|PER|69654742-33d7-11e1-825f-dabc29fd7071 217 | 9720|10|Washington|Washington,_D.C.|LOC|69654742-33d7-11e1-825f-dabc29fd7071 218 | 9735|11|Amy Gardner|Amy_Gardner|PER|69654742-33d7-11e1-825f-dabc29fd7071 219 | 9791|4|Iowa|Iowa|LOC|69654742-33d7-11e1-825f-dabc29fd7071 220 | 0|9|John Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 221 | 11|7|Wizards|Washington_Wizards|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 222 | 49|7|Wizards|Washington_Wizards|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 223 | 57|3|NBA|National_Basketball_Association|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 224 | 62|4|John|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 225 | 68|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 226 | 141|10|Washington|Washington_Wizards|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 227 | 158|13|Flip Saunders|Flip_Saunders|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 228 | 305|16|Associated Press|Associated_Press_College_Basketball_Player_of_the_Year|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 229 | 450|9|John Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 230 | 539|14|Bradley Center|Bradley_Center|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 231 | 555|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 232 | 654|18|Washington Wizards|Washington_Wizards|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 233 | 685|15|Milwaukee Bucks|Milwaukee_Bucks|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 234 | 859|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 235 | 1064|13|Flip Saunders|Flip_Saunders|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 236 | 1295|8|Saunders|Flip_Saunders|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 237 | 1405|4|John|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 238 | 1606|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 239 | 1833|5|Bucks|Milwaukee_Bucks|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 240 | 1840|9|Milwaukee|Milwaukee_Bucks|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 241 | 1881|10|Beno Udrih|Beno_Udrih|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 242 | 1909|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 243 | 1959|12|Ronny Turiaf|Ronny_Turiaf|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 244 | 1973|13|Hamady Ndiaye|Hamady_N'Diaye|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 245 | 1996|11|Koichi Sato|Kōichi_Satō_(actor)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 246 | 2009|12|JaVale McGee|JaVale_McGee|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 247 | 2026|14|Kevin Seraphin|Kevin_Séraphin|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 248 | 2091|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 249 | 2575|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 250 | 2875|7|Wizards|Washington_Wizards|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 251 | 3039|14|Andray Blatche|Andray_Blatche|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 252 | 3181|10|Washington|Washington_Wizards|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 253 | 3221|15|Jordan Crawford|Jordan_Crawford|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 254 | 
3240|7|Atlanta|Atlanta_Hawks|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 255 | 3252|10|Nick Young|Nick_Young_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 256 | 3266|9|Milwaukee|Milwaukee_Bucks|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 257 | 3304|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 258 | 3316|8|Saunders|Flip_Saunders|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 259 | 3364|7|Wizards|Washington_Wizards|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 260 | 3399|7|Atlanta|Atlanta_Hawks|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 261 | 3538|3|NBA|National_Basketball_Association|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 262 | 3559|7|Wizards|Washington_Wizards|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 263 | 3737|10|New Jersey|Brooklyn_Nets|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 264 | 3790|7|Atlanta|Atlanta_Hawks|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 265 | 3864|9|Milwaukee|Milwaukee_Bucks|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 266 | 3906|7|Wizards|Washington_Wizards|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 267 | 4036|6|Boston|Boston_Celtics|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 268 | 4075|14|Verizon Center|Capital_One_Arena|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 269 | 4103|7|Orlando|Orlando_Magic|ORG|d5966ad2-33f9-11e1-825f-dabc29fd7071 270 | 4127|8|New York|New_York_Knicks|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 271 | 4167|4|Wall|John_Wall_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 272 | 4338|8|Saunders|Flip_Saunders|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 273 | 4361|13|Randy Wittman|Randy_Wittman|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 274 | 4414|9|Milwaukee|Milwaukee|LOC|d5966ad2-33f9-11e1-825f-dabc29fd7071 275 | 4437|13|Maurice Evans|Maurice_Evans_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 276 | 4455|13|Rashard Lewis|Rashard_Lewis|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 277 | 4562|5|Young|Nick_Young_(basketball)|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 278 | 4792|8|Saunders|Flip_Saunders|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 279 | 4965|8|Saunders|Flip_Saunders|PER|d5966ad2-33f9-11e1-825f-dabc29fd7071 280 | 13|8|Maryland|Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 281 | 243|8|Maryland|Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 282 | 342|8|Maryland|Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 283 | 434|16|Carl S. 
Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 284 | 713|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 285 | 1020|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 286 | 1211|14|New Carrollton|New_Carrollton,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 287 | 1239|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 288 | 1341|14|New Carrollton|New_Carrollton,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 289 | 1485|5|Clapp|Moses_E._Clapp|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 290 | 1496|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 291 | 1767|5|Clapp|Moses_E._Clapp|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 292 | 1884|5|Clapp|Moses_E._Clapp|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 293 | 2246|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 294 | 2506|5|Clapp|Moses_E._Clapp|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 295 | 2673|5|Clapp|Moses_E._Clapp|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 296 | 2827|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 297 | 2854|10|Chapter 11|Chapter_11,_Title_11,_United_States_Code|MISC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 298 | 2885|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 299 | 2969|8|Woodview|Woodview|ORG|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 300 | 3083|12|Jack Johnson|Jack_Johnson_(boxer)|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 301 | 3177|5|Clapp|Moses_E._Clapp|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 302 | 3410|5|Clapp|Moses_E._Clapp|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 303 | 3591|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 304 | 3647|5|Clapp|Moses_E._Clapp|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 305 | 3658|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 306 | 3763|8|Landover|Landover,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 307 | 3813|7|Addison|Addison,_Texas|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 308 | 3824|15|Capitol Heights|Capitol_Heights,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 309 | 4067|8|Williams|Carl_S._Williams|ORG|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 310 | 4186|15|Martin O’Malley|Martin_O'Malley|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 311 | 4272|19|Anne Arundel County|Anne_Arundel_County,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 312 | 4309|5|Metro|Washington_Metro|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 313 | 4343|8|O’Malley|Martin_O'Malley|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 314 | 4481|14|New Carrollton|New_Carrollton,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 315 | 4684|14|New Carrollton|New_Carrollton,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 316 | 4858|13|Grand Central|Grand_Central_Terminal|ORG|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 317 | 4981|30|Department of General Services|California_Department_of_General_Services|ORG|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 318 | 5121|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 319 | 5456|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 320 | 5499|17|Montgomery County|Montgomery_County,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 321 | 5663|14|New Carrollton|New_Carrollton,_Maryland|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 322 | 5757|9|Baltimore|Baltimore|LOC|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 323 | 6108|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 324 | 6161|19|The Washington Post|The_Washington_Post|ORG|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 325 | 
6311|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 326 | 6464|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 327 | 6598|19|The Washington Post|The_Washington_Post|ORG|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 328 | 6923|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 329 | 7190|8|Williams|Carl_S._Williams|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 330 | 7423|16|Jennifer Jenkins|Jennifer_DeLonge|PER|f2c10c06-2c0c-11e1-9952-55d90a4e2d6d 331 | -------------------------------------------------------------------------------- /geesedb/tests/resources/csv/example_term_dict.csv: -------------------------------------------------------------------------------- 1 | 0|0|2 2 | 1|Hello|2 -------------------------------------------------------------------------------- /geesedb/tests/resources/csv/example_term_doc.csv: -------------------------------------------------------------------------------- 1 | 0|0|1 2 | 0|1|1 3 | 1|0|2 4 | 1|1|3 -------------------------------------------------------------------------------- /geesedb/tests/resources/queries/gql/1: -------------------------------------------------------------------------------- 1 | MATCH (d:docs {collection_id:"96ab542e-6a07-11e6-ba32-5a4bf5aad4fa"})-[]-(:authors)-[]-(:docs)-[]-(a:authors) 2 | RETURN DISTINCT a.author -------------------------------------------------------------------------------- /geesedb/tests/resources/queries/gql/2: -------------------------------------------------------------------------------- 1 | MATCH (a:authors) 2 | RETURN a.author -------------------------------------------------------------------------------- /geesedb/tests/resources/queries/gql/4: -------------------------------------------------------------------------------- 1 | MATCH (d:docs)-[]-(:authors)-[]-(d2:docs) 2 | WHERE d.collection_id = "96ab542e-6a07-11e6-ba32-5a4bf5aad4fa" 3 | RETURN DISTINCT d2.collection_id -------------------------------------------------------------------------------- /geesedb/tests/resources/queries/gql/5: -------------------------------------------------------------------------------- 1 | MATCH (d:docs {collection_id: ?})-[]-(t:term_dict) 2 | RETURN string 3 | ORDER BY tf*log(671945/df) 4 | DESC 5 | LIMIT 5 -------------------------------------------------------------------------------- /geesedb/tests/resources/queries/gql/6: -------------------------------------------------------------------------------- 1 | MATCH (d:docs {collection_id: "96ab542e-6a07-11e6-ba32-5a4bf5aad4fa"})-[]-(e:entities) 2 | RETURN mention 3 | ORDER BY start 4 | LIMIT 5 -------------------------------------------------------------------------------- /geesedb/tests/resources/queries/gql/7: -------------------------------------------------------------------------------- 1 | MATCH (n:Actor) 2 | RETURN n.name AS name 3 | UNION 4 | MATCH (n:Movie) 5 | RETURN n.title AS name -------------------------------------------------------------------------------- /geesedb/tests/resources/queries/sql/1: -------------------------------------------------------------------------------- 1 | SELECT distinct a.author 2 | FROM authors AS a 3 | JOIN doc_author AS da ON (a.author = da.author) 4 | JOIN docs AS d0 ON (d0.collection_id = da.doc) 5 | JOIN doc_author as da2 ON (d0.collection_id = da2.doc) 6 | JOIN authors as a2 ON (da2.author = a2.author) 7 | JOIN doc_author as da3 ON (a2.author = da3.author) 8 | JOIN docs as d ON (d.collection_id = da3.doc) 9 | WHERE d.collection_id = '96ab542e-6a07-11e6-ba32-5a4bf5aad4fa' 
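The GQL fixtures above pair by number with the SQL fixtures that follow (gql/1 with sql/1, gql/2 with sql/2, gql/4 with sql/4, and gql/5 with sql/5); each SQL file is the relational translation of the corresponding Cypher query, while gql/6, gql/7, and sql/3 stand alone. A minimal sketch of driving that translation through the interpreter package, assuming it exposes a Translator class with a translate method (these names are illustrative and not confirmed by this listing):

    from geesedb.interpreter import Translator  # assumed entry point into geesedb/interpreter/translate.py

    translator = Translator('/path/to/database')  # hypothetical GeeseDB database file
    with open('geesedb/tests/resources/queries/gql/1') as f:
        cypher_query = f.read()
    sql_query = translator.translate(cypher_query)  # expected to be equivalent to queries/sql/1
    print(sql_query)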
-------------------------------------------------------------------------------- /geesedb/tests/resources/queries/sql/2: -------------------------------------------------------------------------------- 1 | SELECT a.author 2 | FROM authors as a
-------------------------------------------------------------------------------- /geesedb/tests/resources/queries/sql/3: -------------------------------------------------------------------------------- 1 | WITH n AS (SELECT count(*) AS num_docs FROM docs) 2 | SELECT en.entity, en.tf * log(n.num_docs / entities.df) AS tfidf 3 | FROM n, 4 | (SELECT entity, count(*) AS tf 5 | FROM entity_doc 6 | WHERE doc_id = '96ab542e-6a07-11e6-ba32-5a4bf5aad4fa' 7 | GROUP BY entity 8 | ) AS en 9 | JOIN entities 10 | ON (entities.entity = en.entity) 11 | ORDER BY tfidf DESC 12 | LIMIT 5;
-------------------------------------------------------------------------------- /geesedb/tests/resources/queries/sql/4: -------------------------------------------------------------------------------- 1 | SELECT distinct d2.collection_id 2 | FROM docs AS d2 3 | JOIN doc_author as da2 ON (d2.collection_id = da2.doc) 4 | JOIN authors as a2 ON (da2.author = a2.author) 5 | JOIN doc_author as da3 ON (a2.author = da3.author) 6 | JOIN docs as d ON (d.collection_id = da3.doc) 7 | WHERE d.collection_id = '96ab542e-6a07-11e6-ba32-5a4bf5aad4fa'
-------------------------------------------------------------------------------- /geesedb/tests/resources/queries/sql/5: -------------------------------------------------------------------------------- 1 | SELECT term_dict.string 2 | FROM term_dict 3 | JOIN term_doc ON (term_dict.term_id = term_doc.term_id) 4 | JOIN docs ON (docs.doc_id = term_doc.doc_id) 5 | WHERE docs.collection_id = ? 6 | ORDER BY tf * log(671945/df) DESC 7 | LIMIT 5;
-------------------------------------------------------------------------------- /geesedb/tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/tests/utils/__init__.py
-------------------------------------------------------------------------------- /geesedb/tests/utils/ciff/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/tests/utils/ciff/__init__.py
-------------------------------------------------------------------------------- /geesedb/tests/utils/ciff/test_to_csv.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | from pathlib import Path 3 | 4 | from ....utils.ciff.to_csv import ToCSV 5 | 6 | 7 | def test_create_csv_from_ciff(tmp_path: Path) -> None: 8 | ToCSV( 9 | protobuf_file=path.dirname(path.dirname(path.dirname(__file__))) + '/resources/ciff/toy-complete-20200309.ciff.gz', 10 | output_docs=str(tmp_path) + '/docs.csv', 11 | output_term_dict=str(tmp_path) + '/term_dict.csv', 12 | output_term_doc=str(tmp_path) + '/term_doc.csv' 13 | ) 14 | with open(str(tmp_path) + '/docs.csv') as f: 15 | assert f.readline().strip() == 'WSJ_1|0|6' 16 | with open(str(tmp_path) + '/term_dict.csv') as f: 17 | assert f.readline().strip() == '0|01|1' 18 | with open(str(tmp_path) + '/term_doc.csv') as f: 19 | assert f.readline().strip() == '0|0|1' 20 |
-------------------------------------------------------------------------------- /geesedb/utils/__init__.py: --------------------------------------------------------------------------------
1 | from .ciff.to_csv import ToCSV 2 | from .ciff.to_ciff import ToCiff 3 | 4 | __all__ = ['ToCSV', 'ToCiff']
-------------------------------------------------------------------------------- /geesedb/utils/ciff/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/informagi/GeeseDB/13dc331c709e96353182f59e2646cf67ecc628c0/geesedb/utils/ciff/__init__.py
-------------------------------------------------------------------------------- /geesedb/utils/ciff/to_ciff.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import Any 3 | 4 | from tqdm import tqdm 5 | from ciff_toolkit.ciff_pb2 import Header, Posting, PostingsList, DocRecord 6 | from ciff_toolkit.write import CiffWriter 7 | 8 | from ...connection import get_connection 9 | 10 | 11 | class ToCiff: 12 | 13 | def __init__(self, **kwargs: Any) -> None: 14 | self.arguments = self.get_arguments(kwargs) 15 | db_connection = get_connection(self.arguments['database']) 16 | self.connection = db_connection.connection 17 | self.cursor = db_connection.cursor 18 | self.create_ciff() 19 | 20 | @staticmethod 21 | def get_arguments(kwargs: Any) -> dict: 22 | arguments = { 23 | 'database': None, 24 | 'ciff': None, 25 | 'docs': 'docs', 26 | 'term_dict': 'term_dict', 27 | 'term_doc': 'term_doc', 28 | 'batch_size': 1000, 29 | 'verbose': False, 30 | } 31 | for key, item in arguments.items(): 32 | if kwargs.get(key) is not None: 33 | arguments[key] = kwargs.get(key) 34 | return arguments 35 | 36 | def create_ciff(self) -> None: 37 | disable_tqdm = not self.arguments['verbose'] 38 | 39 | with CiffWriter(self.arguments['ciff']) as writer: 40 | header = self.get_ciff_header() 41 | writer.write_header(header) 42 | 43 | postings_lists = tqdm(self.get_ciff_postings_lists(), total=header.num_postings_lists, disable=disable_tqdm) 44 | writer.write_postings_lists(postings_lists) 45 | 46 | doc_records = tqdm(self.get_ciff_doc_records(), total=header.num_docs, disable=disable_tqdm) 47 | writer.write_documents(doc_records) 48 | 49 | def get_ciff_header(self): 50 | header = Header() 51 | header.version = 1 # We work with ciff v1 52 | # Table names come from the docs/term_dict/term_doc arguments instead of being hard-coded. 53 | self.cursor.execute(f"""SELECT COUNT(*) FROM {self.arguments['term_dict']}""") 54 | header.num_postings_lists = self.cursor.fetchone()[0] 55 | self.cursor.execute(f"""SELECT COUNT(*) FROM {self.arguments['docs']}""") 56 | header.num_docs = self.cursor.fetchone()[0] 57 | header.total_postings_lists = header.num_postings_lists 58 | header.total_docs = header.num_docs 59 | self.cursor.execute(f"""SELECT SUM(tf) FROM {self.arguments['term_doc']}""") 60 | header.total_terms_in_collection = self.cursor.fetchone()[0] 61 | header.average_doclength = header.total_terms_in_collection / header.num_docs 62 | header.description = f'GeeseDB database {self.arguments["database"]}' 63 | 64 | return header 65 | 66 | def get_ciff_postings_lists(self): 67 | self.cursor.execute(f""" 68 | SELECT df, string, list(row(doc_id, tf) ORDER BY doc_id) 69 | FROM {self.arguments['term_dict']} AS term_dict, {self.arguments['term_doc']} AS term_doc 70 | WHERE term_dict.term_id = term_doc.term_id 71 | GROUP BY term_dict.term_id, df, string 72 | ORDER BY string 73 | """) 74 | while batch := self.cursor.fetchmany(self.arguments['batch_size']): 75 | for df, term, postings in batch: 76 | postings_list = PostingsList() 77 | assert len(postings) == df 78 | cf = sum(p['tf'] for p in postings) 79 | postings_list.term = term 80 | postings_list.df = df 81 | postings_list.cf = cf 82 | old_id = 0 83 | for p in postings: 84 | posting = Posting() 85 | doc_id = p['doc_id']
86 | tf = p['tf'] 87 | posting.docid = doc_id - old_id # CIFF stores docid gaps (deltas), not absolute ids 88 | old_id = doc_id 89 | posting.tf = tf 90 | postings_list.postings.append(posting) 91 | 92 | yield postings_list 93 | 94 | def get_ciff_doc_records(self): 95 | self.cursor.execute(f""" 96 | SELECT doc_id, collection_id, len 97 | FROM {self.arguments['docs']} 98 | ORDER BY doc_id 99 | """) 100 | while batch := self.cursor.fetchmany(self.arguments['batch_size']): 101 | for doc_id, collection_id, length in batch: 102 | doc_record = DocRecord() 103 | doc_record.docid = doc_id 104 | doc_record.collection_docid = collection_id 105 | doc_record.doclength = length 106 | 107 | yield doc_record 108 | 109 | 110 | if __name__ == '__main__': 111 | parser = argparse.ArgumentParser() 112 | parser.add_argument('--database', required=True) 113 | parser.add_argument('--ciff', required=True) 114 | parser.add_argument('--docs') 115 | parser.add_argument('--term_dict') 116 | parser.add_argument('--term_doc') 117 | parser.add_argument('--batch_size', type=int) 118 | parser.add_argument('--verbose', action='store_true') 119 | args = parser.parse_args() 120 | ToCiff(**vars(args)) 121 |
-------------------------------------------------------------------------------- /geesedb/utils/ciff/to_csv.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | import argparse 4 | from typing import Any 5 | 6 | from ciff_toolkit.read import CiffReader 7 | 8 | 9 | class ToCSV: 10 | """ 11 | Class for creating csv files that represent tables in the old dog paper: 12 | - https://dl.acm.org/doi/10.1145/2600428.2609460 13 | 14 | The files are created from a CIFF as described in: 15 | - https://arxiv.org/abs/2003.08276 16 | """ 17 | def __init__(self, **kwargs: Any) -> None: 18 | self.arguments = self.get_arguments(kwargs) 19 | self.create_csv_files() 20 | 21 | @staticmethod 22 | def get_arguments(kwargs: Any) -> dict: 23 | arguments = { 24 | 'protobuf_file': None, 25 | 'output_docs': 'docs.csv', 26 | 'output_term_dict': 'term_dict.csv', 27 | 'output_term_doc': 'term_doc.csv' 28 | } 29 | for key, item in arguments.items(): 30 | if kwargs.get(key) is not None: 31 | arguments[key] = kwargs.get(key) 32 | return arguments 33 | 34 | def create_csv_files(self) -> None: 35 | # CiffReader handles plain and gzipped CIFF input itself, so no manual file reading or varint decoding is needed here. 36 | with CiffReader(self.arguments['protobuf_file']) as reader: 37 | with open(self.arguments['output_term_dict'], 'w') as term_dict_writer, \ 38 | open(self.arguments['output_term_doc'], 'w') as term_doc_writer: 39 | for term_id, postings_list in enumerate(reader.read_postings_lists()): 40 | term_dict_writer.write(f'{term_id}|{postings_list.term}|{postings_list.df}\n') 41 | docid = 0 42 | for posting in postings_list.postings: 43 | docid += posting.docid # postings are gap-encoded; accumulate to recover absolute doc ids 44 | term_doc_writer.write(f'{term_id}|{docid}|{posting.tf}\n') 45 | 46 | with open(self.arguments['output_docs'], 'w') as docs_writer: 47 | for doc_record in reader.read_documents(): 48 | docs_writer.write(f'{doc_record.collection_docid}|{doc_record.docid}|{doc_record.doclength}\n') 49 | 50 | 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument('-p', 54 | '--protobuf_file', 55 | required=True, 56 | metavar='[file]', 57 | help='Location of the CIFF protobuf file; when given, the ' + 58 | 'output files for the term tables should also be specified.') 59 | parser.add_argument('-o', 60 | '--output_docs', 61 | metavar='[file]', 62 | help='Output csv file for the docs table.') 63 | parser.add_argument('-t', 64 | '--output_term_dict', 65 | metavar='[file]', 66 | help='Output csv file for the term dictionary table.') 67 | parser.add_argument('-e', 68 | '--output_term_doc', 69 | metavar='[file]', 70 | help='Output csv file for the term doc mapper table.') 71 | ToCSV(**vars(parser.parse_args())) 72 |
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | duckdb 2 | google>=2 3 | numpy 4 | pandas 5 | ciff-toolkit 6 | tqdm 7 | git+https://github.com/informagi/pycypher
-------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name='geesedb', 5 | version='0.0.2', 6 | description='Graph Engine for Exploration and Search over Evolving DataBases', 7 | author='Chris Kamphuis', 8 | author_email='chris@cs.ru.nl', 9 | url='https://github.com/informagi/GeeseDB', 10 | install_requires=['duckdb', 'numpy', 'pandas', 'ciff-toolkit', 'tqdm', 11 | 'pycypher @ git+https://github.com/informagi/pycypher'], 12 | packages=find_packages(), 13 | include_package_data=True, 14 | package_data={'': ['qrels.*', 'topics.*']}, 15 | license='MIT License' 16 | ) 17 | --------------------------------------------------------------------------------
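Taken together, ToCSV and ToCiff form a round trip between the CIFF exchange format and the docs, term_dict, and term_doc tables. A minimal usage sketch, restricted to the keyword arguments visible in the two get_arguments tables above; the file names are placeholders:

    from geesedb.utils import ToCSV, ToCiff

    # CIFF -> CSV: writes docs.csv as collection_id|doc_id|len,
    # term_dict.csv as term_id|term|df, and term_doc.csv as term_id|doc_id|tf
    ToCSV(protobuf_file='toy-complete-20200309.ciff.gz',
          output_docs='docs.csv',
          output_term_dict='term_dict.csv',
          output_term_doc='term_doc.csv')

    # ... load the CSV files into a GeeseDB database (see the index package) ...

    # database -> CIFF: reads the three tables back and writes a gap-encoded CIFF file
    ToCiff(database='example_database.db', ciff='roundtrip.ciff.gz', batch_size=1000, verbose=True)

Note that ToCSV emits docs rows as collection_id|doc_id|len while ToCiff selects doc_id, collection_id, len from the docs table, so the CSV loader is presumably expected to map columns to the table schema by name rather than by position.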