├── .gitignore
├── Diffbot API test.ipynb
├── README.md
├── cache
│   └── __init__.py
├── candidate.py
├── construct_graph.py
├── converter.py
├── data_helper.py
├── dataset_stats.py
├── datasets
│   ├── ace2004.tsv
│   ├── actors.txt
│   ├── award-actors.txt
│   ├── dbpedia-url.txt
│   ├── dbpedia-url.txt.absent
│   ├── dbpedia.tsv
│   ├── dbpedia.ttl
│   ├── dbpedia.ttl.contexts.tsv
│   ├── dbpedia.ttl.phrases.tsv
│   ├── dzerczynski.tsv
│   ├── entities.tsv
│   ├── kore50-urls.txt
│   ├── kore50-urls.txt.absent
│   ├── kore50.tsv
│   ├── kore50.ttl
│   ├── kore50.ttl.contexts.tsv
│   ├── kore50.ttl.phrases.tsv
│   ├── n3-news-128.tsv
│   ├── n3-reuters-128-urls.txt
│   ├── n3-reuters-128-urls.txt.absent
│   ├── n3-reuters-128.tsv
│   ├── n3-reuters-128.ttl
│   ├── n3-reuters-128.ttl.contexts.tsv
│   ├── n3-reuters-128.ttl.phrases.tsv
│   ├── oke-evaluation-dataset-task1.ttl
│   ├── oke-evaluation-dataset-task2.ttl
│   ├── oke-sample-dataset-task1.ttl
│   ├── oke-sample-dataset-task2.ttl
│   ├── phrases.txt
│   ├── singleton.tsv
│   ├── subset
│   │   ├── 1000_edgelist.txt
│   │   ├── 1000_labels.db
│   │   ├── 1000_long_abstracts.db
│   │   ├── 1000_nodelist_url.txt
│   │   ├── 1000_nodes_lookup.db
│   │   └── 1000_nodes_lookup.txt
│   ├── test.phrases.tsv
│   ├── top-cities.txt
│   ├── us-states.txt
│   └── us-universitites.txt
├── diffbot_api.py
├── fwd.sh
├── generate_absent.py
├── linkers
│   ├── __init__.py
│   ├── baseline.py
│   ├── context_aware.py
│   ├── dense.py
│   ├── nn_graph.py
│   ├── sparse.py
│   └── supertagger.py
├── nif_ws.py
├── nif_ws_graph.py
├── patterns.py
├── preprocess.py
├── requirements.txt
├── supervised
│   ├── README.md
│   ├── negative_sampling.py
│   ├── negative_sampling_test.py
│   ├── nn.py
│   └── requirements.txt
├── test_supertagger.py
├── tests
│   ├── baseline_linker_dbpedia_test.py
│   ├── baseline_linker_test.py
│   ├── dense_linker_test.py
│   ├── diffbot_api_test.py
│   ├── score_test.py
│   ├── sparse_linker_single_test.py
│   ├── sparse_linker_test.py
│   ├── supertagger_test.py
│   ├── supervised
│   │   └── preprocess
│   │       ├── prepro_util_test.py
│   │       └── util_test.py
│   ├── test_construct_dict.py
│   ├── test_data_helper.py
│   └── ttl_test.py
├── tmp.ipynb
├── ttl.py
└── utils.py

/.gitignore:
--------------------------------------------------------------------------------
# Data files
*.json
*.csv
*.txt
*.ttl
*.out
*.sqlite
*.pkl

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so
*.swp

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/Diffbot API test.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["%load_ext autoreload\n", "%autoreload 2"]},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"scrolled": false}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": ["\n", "\n"]},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true, "scrolled": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": 14, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []}
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
  "language_info": {
   "codemirror_mode": {"name": "ipython", "version": 3},
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# kb2vec

Vectorizing knowledge bases for entity linking

Installation
-----------

```
pip install -r requirements.txt
python -m nltk.downloader stopwords
python -m nltk.downloader punkt
python -m nltk.downloader averaged_perceptron_tagger
```

Download the `data` folder and unzip it:

```
wget http://ltdata1.informatik.uni-hamburg.de/kb2vec/data.zip
unzip data.zip
```

Start the web service
---------------------

Entity linking NIF server:

```
python nif_ws.py
```

which will run at ``http://localhost:5000``

GERBIL NIF-based evaluation server (from the ``gerbil`` directory):

```
bash start.sh
```

which will run at ``http://localhost:1234/gerbil``


DBpedia entity linking NIF wrapper (from the ``gerbil-dbpedia-ws`` directory):

```
docker-compose up -d
```

which will run at ``http://localhost:8181/spotlight``


Available endpoints:

http://localhost:8181/spotlight
http://localhost:5000/random
http://localhost:5000/sparse_overlap
http://localhost:5000/dense_overlap
http://localhost:5000/supertagger
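Once `nif_ws.py` is running, a quick way to smoke-test a linker endpoint is to POST one of the NIF/Turtle documents shipped in `datasets/` to it. This is only a sketch: the `Content-Type` header and the response handling are assumptions based on typical GERBIL-style NIF services, not guarantees about how `nif_ws.py` behaves.

```python
import requests

# send a NIF/Turtle document from the repository to the sparse_overlap linker;
# the content type and the shape of the response are assumptions, not part of the repo docs
with open("datasets/kore50.ttl", "rb") as f:
    nif_doc = f.read()

response = requests.post("http://localhost:5000/sparse_overlap",
                         data=nif_doc,
                         headers={"Content-Type": "application/x-turtle"})
print(response.status_code)
print(response.text[:500])
```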
--------------------------------------------------------------------------------
/cache/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uhh-lt/kb2vec/c02250177267ca78ce0f5886b7229f6b95ce2b5a/cache/__init__.py
--------------------------------------------------------------------------------
/candidate.py:
--------------------------------------------------------------------------------
from collections import namedtuple
from namedlist import namedlist
import codecs
import re


Phrase = namedtuple("Phrase", "text beg end subj")

#PhraseBase = namedtuple("PhraseBase", "text beg end subj")
#
#class Phrase(PhraseBase):
#    def get_hash(self):
#        return hash(self.text)
#
#    def __hash__(self):
#        return self.get_hash()
#
#    def __eq__(self, other):
#        return self.get_hash() == other.get_hash()


CandidateBase = namedlist("CandidateBase", "score name link wiki types names uris text db_uri importance relations")


def make_phrases(str_phrases):
    """ From a list of strings generates a list of phrases (e.g. for tests). """

    return [Phrase(phrase.strip(), 1, len(phrase.strip()), "http://" + phrase.strip())
            for phrase in str_phrases]


class Candidate(CandidateBase):
    def __init__(self, score=0.0, name="", link="", wiki="", types=[], names=[], uris=[], text="",
                 db_uri="", importance=1.0, relations={}):
        CandidateBase.__init__(self, score, name, link, wiki, types, names, uris, text,
                               db_uri, importance, relations)

    def get_hash(self):
        uris = "".join(self.uris) if self.uris is not None else ""
        types = "".join(self.types) if self.types is not None else ""
        hash_str = self.name + uris + types
        if hash_str is None:
            print("Warning: hash string is none.")

        return hash(hash_str)

    def __hash__(self):
        return self.get_hash()

    def __eq__(self, other):
        return self.get_hash() == other.get_hash()

    def __gt__(self, other):
        return self.score > other.score

    def __lt__(self, other):
        return self.score < other.score


def save_candidates_text(candidates, output_fpath="data/sf-candidates.txt"):
    """ Saves a mapping {Phrase: iterable of Candidate} as a tab-separated text file. """
    re_newlines = re.compile(r"[\n\r]+")

    with codecs.open(output_fpath, "w", "utf-8") as c_f:
        for phrase in candidates:
            for candidate in candidates[phrase]:
                text = candidate.text
                c_f.write("{}\t{}\t{}\n".format(
                    phrase.text,
                    candidate.name,
                    text.strip()))

    print(output_fpath)
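A minimal usage sketch of the structures defined above; the phrase and candidate values are made up for illustration:

```python
from candidate import Candidate, make_phrases

# phrases built from plain strings, as the tests do
phrases = make_phrases(["Berlin", "Nets"])

# a candidate entity for the first phrase; every field is an optional keyword argument
berlin = Candidate(score=0.9, name="Berlin",
                   link="http://dbpedia.org/resource/Berlin",
                   uris=["http://dbpedia.org/resource/Berlin"])

# candidates are ordered by score and hashed by name, uris and types
assert berlin > Candidate(score=0.1, name="Berlin")
assert berlin == Candidate(score=0.5, name="Berlin",
                           uris=["http://dbpedia.org/resource/Berlin"])
```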
--------------------------------------------------------------------------------
/construct_graph.py:
--------------------------------------------------------------------------------
import networkx as nx
import matplotlib.pyplot as plt
import logging
import codecs
from sqlitedict import SqliteDict


class Graph:
    def __init__(self, logfile='output.log'):
        self._G = nx.DiGraph()
        # create logger
        self._logger = logging.getLogger('construct_graph')
        self._logger.setLevel(logging.DEBUG)
        fh = logging.FileHandler(logfile)
        fh.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        self._logger.addHandler(fh)

    # Takes three dictionaries:
    # url_ids - keys are the urls, values are the unique ids of these urls.
    # url_longabstracts - keys are the urls, values are their long abstracts (texts).
    # url_labels - keys are, again, the urls, values are their titles.
    # The unique ids are used to create the nodes; the other properties become node attributes.
    def create_nodes_from_dict(self, url_longabstracts, url_labels, url_ids):
        urls = url_ids.keys()

        count = 0
        for url in urls:
            # long abstract is a list of tokens.
            long_abstract = url_longabstracts[url]
            # title is a list of tokens / a single token.
            title = url_labels[url]
            # node id is an integer value.
            node_id = url_ids[url]

            # id, url, long abstract (text), and title are attributes.
            self._G.add_node(node_id, id=node_id, url=url, long_abstract=long_abstract, title=title)
            if count % 100000 == 0:
                self._logger.info(str(count) + ' nodes are processed..')

            count += 1

    # subnodes is a list of nodes; it is used to create nodes from a sublist and obtain a subgraph.
    def create_nodes_from_db(self, longabsdb_path, labelsdb_path, lookupdb_path, subnodes=False):
        longabsdb = SqliteDict(longabsdb_path, autocommit=False)
        labelsdb = SqliteDict(labelsdb_path, autocommit=False)
        lookupdb = SqliteDict(lookupdb_path, autocommit=False)

        if subnodes:
            urls = subnodes
        else:
            urls = lookupdb.keys()

        count = 0
        for url in urls:
            # long abstract is a string.
            long_abstract = longabsdb[url]
            # title is a string.
            title = labelsdb[url]
            # node id is an integer value.
            node_id = int(lookupdb[url])

            # id, url, long abstract (text), and title are attributes.
            self._G.add_node(node_id, id=node_id, url=url, long_abstract=long_abstract, title=title)
            if count % 100000 == 0:
                self._logger.info(str(count) + ' nodes are processed..')

            count += 1

        longabsdb.close()
        labelsdb.close()
        lookupdb.close()

    # Takes a file path as parameter:
    # the file contains one edge per line as two whitespace-separated node ids, e.g. "1 2".
    def create_edges_from_file(self, path):
        count = 0

        file = codecs.open(path, 'r')
        line = file.readline()

        while line != '':
            nodes = line.split()
            line = file.readline()

            self._G.add_edge(int(nodes[0]), int(nodes[1]))

            if count % 100000 == 0:
                self._logger.info(str(count) + ' edges are processed..')

            count += 1

        file.close()

    def create_edges_from_list(self, edges):
        self._G.add_edges_from(edges)

    def write_graph(self, path):
        nx.write_gpickle(self._G, path)

    def load_graph(self, path):
        self._G = nx.read_gpickle(path)

    def draw(self):
        nx.draw(self._G, with_labels=True, font_weight='bold')
        plt.show()
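A small end-to-end sketch of the `Graph` class using the sample files shipped under `datasets/subset/`. It assumes those SqliteDict files and the edge list follow the key layout the methods above expect; the output file name is arbitrary.

```python
from construct_graph import Graph

g = Graph(logfile="construct_graph.log")

# nodes: long abstracts, labels and the url->id lookup come from the sample SqliteDict files
g.create_nodes_from_db("datasets/subset/1000_long_abstracts.db",
                       "datasets/subset/1000_labels.db",
                       "datasets/subset/1000_nodes_lookup.db")

# edges: one "source target" id pair per line
g.create_edges_from_file("datasets/subset/1000_edgelist.txt")

# persist the networkx graph as a gpickle and draw it
g.write_graph("datasets/subset/1000_graph.gpickle")
g.draw()
```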
""" 36 | 37 | beg_index = string.find(prefix) 38 | if beg_index != -1: 39 | end_index = beg_index + len(prefix) 40 | return string[end_index:] 41 | else: 42 | return None 43 | 44 | def get_fuzzy_postfix(self, string, prefix): 45 | if prefix in string: 46 | parts = string.split("/") 47 | if len(parts) > 1: 48 | return parts[-1] 49 | else: 50 | return None 51 | 52 | def wikipedia2dbpedia(self, wikipedia_uri): 53 | article_name = self.get_fuzzy_postfix(wikipedia_uri, prefix=WIKIPEDIA_DOMAIN) 54 | 55 | if article_name is None: 56 | if verbose: print("Warning: cannot convert to DBpedia URI '{}'".format(wikipedia_uri)) 57 | return "" 58 | else: 59 | return DBPEDIA_PREFIX + article_name 60 | 61 | def wikidataid2wikipedia(self, wikidata_q_id="Q42"): 62 | try: 63 | if wikidata_q_id in self._cache: 64 | return self._cache[wikidata_q_id] 65 | else: 66 | entity = self._client.get(wikidata_q_id, load=True) 67 | can_get = ("sitelinks" in entity.attributes and 68 | "enwiki" in entity.attributes["sitelinks"] and 69 | "url" in entity.attributes["sitelinks"]["enwiki"]) 70 | if can_get: 71 | wikipedia_uri = entity.attributes["sitelinks"]["enwiki"]["url"] 72 | self._cache[wikidata_q_id] = wikipedia_uri 73 | return wikipedia_uri 74 | else: 75 | wiki_links = [] 76 | for key in entity.attributes["sitelinks"]: 77 | if key.endswith("wiki"): 78 | if "url" in entity.attributes["sitelinks"][key]: 79 | wiki_links.append(entity.attributes["sitelinks"][key]["url"]) 80 | 81 | if len(wiki_links) > 0: 82 | print("Warning: no links to English Wiki found, but found {} links to other Wikis".format(len(wiki_links))) 83 | self._cache[wikidata_q_id] = wiki_links[0] 84 | return wiki_links[0] 85 | else: 86 | self._cache[wikidata_q_id] = "" 87 | return "" 88 | 89 | except KeyboardInterrupt: 90 | raise KeyboardInterrupt() 91 | except: 92 | print("Warning: cannot process '{}'".format(wikidata_q_id)) 93 | print(format_exc()) 94 | return "" 95 | 96 | def get_wikidata_id(self, wikidata_uri): 97 | wikidata_id = self.get_fuzzy_postfix(wikidata_uri, prefix=WIKIDATA_DOMAIN) 98 | if wikidata_id is None: 99 | if verbose: print("Warning: cannot extract WikiData ID '{}'".format(wikidata_uri)) 100 | return "" 101 | else: 102 | return wikidata_id 103 | 104 | def wikidata2wikipedia(self, wikidata_uri): 105 | wikidata_id = self.get_wikidata_id(wikidata_uri) 106 | if wikidata_id != "": 107 | wikipedia_uri = self.wikidataid2wikipedia(wikidata_id) 108 | return wikipedia_uri 109 | else: 110 | if verbose: print("Warning: cannot extract DBpedia URI from a Wikidata URI") 111 | return "" 112 | 113 | 114 | def wikidata2dbpedia(self, wikidata_uri): 115 | return self.wikipedia2dbpedia(self.wikidata2wikipedia(wikidata_uri)) 116 | 117 | -------------------------------------------------------------------------------- /data_helper.py: -------------------------------------------------------------------------------- 1 | from sqlitedict import SqliteDict 2 | import sqlite3 3 | import codecs 4 | 5 | 6 | def create_dictdb_from_file(file_path, db_path): 7 | db = SqliteDict(db_path, autocommit=True) 8 | 9 | file = codecs.open(file_path, 'r') 10 | line = file.readline() 11 | 12 | while line != '': 13 | splitted = line.split() 14 | line = file.readline() 15 | try: 16 | key, value = splitted[0], ' '.join(splitted[1:]) 17 | db[key] = value 18 | except IndexError: 19 | continue 20 | 21 | file.close() 22 | db.close() 23 | 24 | 25 | def create_db_from_dictdb(lookup_db_path, longabs_db_path, labels_db_path, db_name): 26 | connection = sqlite3.connect(db_name) 27 | cursor = 
--------------------------------------------------------------------------------
/data_helper.py:
--------------------------------------------------------------------------------
from sqlitedict import SqliteDict
import sqlite3
import codecs


def create_dictdb_from_file(file_path, db_path):
    db = SqliteDict(db_path, autocommit=True)

    file = codecs.open(file_path, 'r')
    line = file.readline()

    while line != '':
        splitted = line.split()
        line = file.readline()
        try:
            key, value = splitted[0], ' '.join(splitted[1:])
            db[key] = value
        except IndexError:
            continue

    file.close()
    db.close()


def create_db_from_dictdb(lookup_db_path, longabs_db_path, labels_db_path, db_name):
    connection = sqlite3.connect(db_name)
    cursor = connection.cursor()

    cursor.execute('''CREATE TABLE graph (node_id INTEGER PRIMARY KEY NOT NULL, long_abstracts TEXT, labels TEXT)''')

    connection.commit()

    lookup_db = SqliteDict(lookup_db_path, autocommit=False)
    longabs_db = SqliteDict(longabs_db_path, autocommit=False)
    labels_db = SqliteDict(labels_db_path, autocommit=False)

    intersection_nodes = lookup_db.keys()

    count = 0

    for node in intersection_nodes:
        longab = longabs_db[node]
        label = labels_db[node]
        id = lookup_db[node]

        cursor.execute('''INSERT INTO graph VALUES (?,?,?)''', (id, longab, label))

        if count % 100000 == 0:
            print(count)
            connection.commit()

        count += 1

    connection.commit()

    connection.close()
    lookup_db.close()
    labels_db.close()
    longabs_db.close()
--------------------------------------------------------------------------------
/dataset_stats.py:
--------------------------------------------------------------------------------
from pandas import read_csv
from glob import glob
import re
from traceback import format_exc


def dataset_stat(dataset_fpath):
    try:
        df = read_csv(dataset_fpath, sep="\t", encoding="utf-8")
        df.targets
    except AttributeError:
        df = read_csv(dataset_fpath, sep="\t", encoding="utf-8", names=["targets", "context"])

    targets = set()
    for i, row in df.iterrows():
        for t in str(row.targets).split(","):
            ts = t.strip()
            if len(ts) > 0: targets.add(ts)

    print("# of contexts:", len(df))
    print("# of targets:", len(targets))


def format_urls(url_fpaths):
    url = re.compile(r"<([^>]+)>")
    for url_fpath in glob(url_fpaths):
        print(url_fpath)
        with open(url_fpath, "r") as in_f, open(url_fpath + ".out", "w") as out_f:
            for line in in_f:
                match = url.search(line)
                if match:
                    out_f.write("{}\n".format(match.groups(0)[0]))


datasets_fpath = "/home/panchenko/kb2vec/datasets/*.tsv"
for dataset_fpath in glob(datasets_fpath):
    print(dataset_fpath)
    dataset_stat(dataset_fpath)


format_urls(url_fpaths="datasets/*txt")
--------------------------------------------------------------------------------
/datasets/dbpedia-url.txt:
--------------------------------------------------------------------------------
1 | http://dbpedia.org/resource/Anxiety 2 | http://dbpedia.org/resource/Conspiracy_theory 3 | http://dbpedia.org/resource/Consumer 4 | http://dbpedia.org/resource/Internet 5 | http://dbpedia.org/resource/Internet_privacy 6 | http://dbpedia.org/resource/Lawsuit 7 | http://dbpedia.org/resource/Marketing 8 | http://dbpedia.org/resource/User_(computing) 9 | http://dbpedia.org/resource/Worry 10 | http://dbpedia.org/resource/Alarmism 11 | http://dbpedia.org/resource/Advertising 12 | http://dbpedia.org/resource/Marketing 13 | http://dbpedia.org/resource/Online_and_offline 14 | http://dbpedia.org/resource/User_(computing) 15 | http://dbpedia.org/resource/Web_developer 16 | http://dbpedia.org/resource/Year 17 | http://dbpedia.org/resource/HTML5 18 | http://dbpedia.org/resource/Information_privacy 19 | http://dbpedia.org/resource/Internet 20 | http://dbpedia.org/resource/Internet 21 | http://dbpedia.org/resource/Era 22 | http://dbpedia.org/resource/HTML 23 | http://dbpedia.org/resource/Promise 24 | http://dbpedia.org/resource/Source_code 25 | http://dbpedia.org/resource/Web_browser 26 |
http://dbpedia.org/resource/Web_page 27 | http://dbpedia.org/resource/World_Wide_Web 28 | http://dbpedia.org/resource/Year 29 | http://dbpedia.org/resource/Computer_software 30 | http://dbpedia.org/resource/Content_(media) 31 | http://dbpedia.org/resource/Email 32 | http://dbpedia.org/resource/Multimedia 33 | http://dbpedia.org/resource/Online_and_offline 34 | http://dbpedia.org/resource/Restaurant 35 | http://dbpedia.org/resource/Smartphone 36 | http://dbpedia.org/resource/Uploading_and_downloading 37 | http://dbpedia.org/resource/User_(computing) 38 | http://dbpedia.org/resource/Billionaire 39 | http://dbpedia.org/resource/Country 40 | http://dbpedia.org/resource/Home 41 | http://dbpedia.org/resource/Mikhail_Prokhorov 42 | http://dbpedia.org/resource/Moscow 43 | http://dbpedia.org/resource/New_Jersey_Nets 44 | http://dbpedia.org/resource/Ownership 45 | http://dbpedia.org/resource/Sunday 46 | http://dbpedia.org/resource/Mikhail_Prokhorov 47 | http://dbpedia.org/resource/Country 48 | http://dbpedia.org/resource/Greeting 49 | http://dbpedia.org/resource/Home 50 | http://dbpedia.org/resource/Russia 51 | http://dbpedia.org/resource/Sponsor_(commercial) 52 | http://dbpedia.org/resource/Sportsperson 53 | http://dbpedia.org/resource/Team 54 | http://dbpedia.org/resource/Wealth 55 | http://dbpedia.org/resource/Mikhail_Prokhorov 56 | http://dbpedia.org/resource/Basketball_court 57 | http://dbpedia.org/resource/CSKA_Universal_Sports_Hall 58 | http://dbpedia.org/resource/Exhibition_game 59 | http://dbpedia.org/resource/Game 60 | http://dbpedia.org/resource/New_Jersey_Nets 61 | http://dbpedia.org/resource/Practice_(learning_method) 62 | http://dbpedia.org/resource/Russia 63 | http://dbpedia.org/resource/Sports_club 64 | http://dbpedia.org/resource/Sportsperson 65 | http://dbpedia.org/resource/Year 66 | http://dbpedia.org/resource/Mikhail_Prokhorov 67 | http://dbpedia.org/resource/Association_football 68 | http://dbpedia.org/resource/Basketball 69 | http://dbpedia.org/resource/Coach_(sport) 70 | http://dbpedia.org/resource/Future 71 | http://dbpedia.org/resource/Hockey 72 | http://dbpedia.org/resource/Interaction 73 | http://dbpedia.org/resource/New_Jersey_Nets 74 | http://dbpedia.org/resource/Popularity 75 | http://dbpedia.org/resource/Russia 76 | http://dbpedia.org/resource/Russians 77 | http://dbpedia.org/resource/Sport 78 | http://dbpedia.org/resource/Sportsperson 79 | http://dbpedia.org/resource/Fan_(person) 80 | http://dbpedia.org/resource/New_Jersey_Nets 81 | http://dbpedia.org/resource/Cannabis_(drug) 82 | http://dbpedia.org/resource/Family 83 | http://dbpedia.org/resource/Middle_age 84 | http://dbpedia.org/resource/Parent 85 | http://dbpedia.org/resource/Ritual 86 | http://dbpedia.org/resource/Cannabis_(drug) 87 | http://dbpedia.org/resource/Illegal_drug_trade 88 | http://dbpedia.org/resource/Illinois 89 | http://dbpedia.org/resource/Parent 90 | http://dbpedia.org/resource/Writer 91 | http://dbpedia.org/resource/Year 92 | http://dbpedia.org/resource/Parent 93 | http://dbpedia.org/resource/Illegal_drug_trade 94 | http://dbpedia.org/resource/Grounding_(punishment) 95 | http://dbpedia.org/resource/Mother 96 | http://dbpedia.org/resource/Medical_cannabis 97 | http://dbpedia.org/resource/Parent 98 | http://dbpedia.org/resource/Alzheimer’s_disease 99 | http://dbpedia.org/resource/Cancer 100 | http://dbpedia.org/resource/Father 101 | http://dbpedia.org/resource/Heart_disease 102 | http://dbpedia.org/resource/Mother 103 | http://dbpedia.org/resource/Nausea 104 | 
http://dbpedia.org/resource/Orthostatic_hypotension 105 | http://dbpedia.org/resource/Medical_cannabis 106 | http://dbpedia.org/resource/Research 107 | http://dbpedia.org/resource/California_State_Route_1 108 | http://dbpedia.org/resource/Driving 109 | http://dbpedia.org/resource/Los_Angeles 110 | http://dbpedia.org/resource/Roof 111 | http://dbpedia.org/resource/San_Francisco 112 | http://dbpedia.org/resource/Toyota_Prius 113 | http://dbpedia.org/resource/Driving 114 | http://dbpedia.org/resource/Person 115 | http://dbpedia.org/resource/Steering_wheel 116 | http://dbpedia.org/resource/Driverless_car 117 | http://dbpedia.org/resource/Artificial_intelligence 118 | http://dbpedia.org/resource/Automobile 119 | http://dbpedia.org/resource/Automobile 120 | http://dbpedia.org/resource/Computer_software 121 | http://dbpedia.org/resource/Google 122 | http://dbpedia.org/resource/Human 123 | http://dbpedia.org/resource/Project 124 | http://dbpedia.org/resource/Automobile 125 | http://dbpedia.org/resource/Human 126 | http://dbpedia.org/resource/Human 127 | http://dbpedia.org/resource/Mile 128 | http://dbpedia.org/resource/Mile 129 | http://dbpedia.org/resource/Navigation_system 130 | http://dbpedia.org/resource/Steering_wheel 131 | http://dbpedia.org/resource/Technician 132 | http://dbpedia.org/resource/Lombard_Street_(San_Francisco) 133 | http://dbpedia.org/resource/San_Francisco 134 | http://dbpedia.org/resource/Street 135 | http://dbpedia.org/resource/Accident 136 | http://dbpedia.org/resource/Automobile 137 | http://dbpedia.org/resource/Engineer 138 | http://dbpedia.org/resource/Google_driverless_car 139 | http://dbpedia.org/resource/Rear-end_collision 140 | http://dbpedia.org/resource/Traffic_light 141 | http://dbpedia.org/resource/Canadian 142 | http://dbpedia.org/resource/Dinner 143 | http://dbpedia.org/resource/Diplomatic_mission 144 | http://dbpedia.org/resource/Economy 145 | http://dbpedia.org/resource/Economy 146 | http://dbpedia.org/resource/Friday 147 | http://dbpedia.org/resource/International 148 | http://dbpedia.org/resource/Presidency_of_Barack_Obama 149 | http://dbpedia.org/resource/Problem 150 | http://dbpedia.org/resource/World 151 | http://dbpedia.org/resource/Balance_of_trade 152 | http://dbpedia.org/resource/Beef_tenderloin 153 | http://dbpedia.org/resource/Canada 154 | http://dbpedia.org/resource/Currency 155 | http://dbpedia.org/resource/Europe 156 | http://dbpedia.org/resource/Japan 157 | http://dbpedia.org/resource/People's_Republic_of_China 158 | http://dbpedia.org/resource/Renminbi 159 | http://dbpedia.org/resource/Scallop 160 | http://dbpedia.org/resource/Timothy_Geithner 161 | http://dbpedia.org/resource/United_States_Secretary_of_the_Treasury 162 | http://dbpedia.org/resource/World 163 | http://dbpedia.org/resource/Annual_Meetings_of_the_International_Monetary_Fund_and_the_World_Bank_Group 164 | http://dbpedia.org/resource/Currency 165 | http://dbpedia.org/resource/Box_office 166 | http://dbpedia.org/resource/Art 167 | http://dbpedia.org/resource/Art_exhibition 168 | http://dbpedia.org/resource/Art_exhibition 169 | http://dbpedia.org/resource/Autumn 170 | http://dbpedia.org/resource/Chaos 171 | http://dbpedia.org/resource/Classicism 172 | http://dbpedia.org/resource/France 173 | http://dbpedia.org/resource/Germany 174 | http://dbpedia.org/resource/Italy 175 | http://dbpedia.org/resource/Name 176 | http://dbpedia.org/resource/Percentage 177 | http://dbpedia.org/resource/Solomon_R._Guggenheim_Museum 178 | http://dbpedia.org/resource/History 179 | 
http://dbpedia.org/resource/Cubism 180 | http://dbpedia.org/resource/Futurism 181 | http://dbpedia.org/resource/Modernism 182 | http://dbpedia.org/resource/Nightmare 183 | http://dbpedia.org/resource/World_War_I 184 | http://dbpedia.org/resource/Ancient_Greece 185 | http://dbpedia.org/resource/Italian_Renaissance 186 | http://dbpedia.org/resource/Tradition 187 | http://dbpedia.org/resource/Artist 188 | http://dbpedia.org/resource/Conservatism 189 | http://dbpedia.org/resource/Ideology 190 | http://dbpedia.org/resource/Social_order 191 | http://dbpedia.org/resource/Lil_Wayne 192 | http://dbpedia.org/resource/Boredom 193 | http://dbpedia.org/resource/Free_association_(psychology) 194 | http://dbpedia.org/resource/Idea 195 | http://dbpedia.org/resource/Self-consciousness 196 | http://dbpedia.org/resource/Aggression 197 | http://dbpedia.org/resource/Album 198 | http://dbpedia.org/resource/Album 199 | http://dbpedia.org/resource/Idea 200 | http://dbpedia.org/resource/Prison 201 | http://dbpedia.org/resource/Rebirth_(Lil_Wayne_album) 202 | http://dbpedia.org/resource/Research 203 | http://dbpedia.org/resource/Rikers_Island 204 | http://dbpedia.org/resource/Scientific_method 205 | http://dbpedia.org/resource/Sentence_(law) 206 | http://dbpedia.org/resource/Experiment 207 | http://dbpedia.org/resource/Lil_Wayne 208 | http://dbpedia.org/resource/Album 209 | http://dbpedia.org/resource/I_Am_Not_a_Human_Being 210 | http://dbpedia.org/resource/Month 211 | http://dbpedia.org/resource/Song 212 | http://dbpedia.org/resource/Studio_recording 213 | http://dbpedia.org/resource/Criminal_charge 214 | http://dbpedia.org/resource/Gun_politics 215 | http://dbpedia.org/resource/Prison 216 | http://dbpedia.org/resource/Rebirth_(Lil_Wayne_album) 217 | http://dbpedia.org/resource/Scar 218 | http://dbpedia.org/resource/Song 219 | http://dbpedia.org/resource/Accordion 220 | http://dbpedia.org/resource/Backpack 221 | http://dbpedia.org/resource/Ballet_tutu 222 | http://dbpedia.org/resource/Man 223 | http://dbpedia.org/resource/Audience 224 | http://dbpedia.org/resource/Buster_Keaton 225 | http://dbpedia.org/resource/Chair 226 | http://dbpedia.org/resource/Dance 227 | http://dbpedia.org/resource/Ladder 228 | http://dbpedia.org/resource/2006 229 | http://dbpedia.org/resource/Berlin 230 | http://dbpedia.org/resource/Brick 231 | http://dbpedia.org/resource/Choreography 232 | http://dbpedia.org/resource/Dance 233 | http://dbpedia.org/resource/Gothic_architecture 234 | http://dbpedia.org/resource/House_dance 235 | http://dbpedia.org/resource/Man 236 | http://dbpedia.org/resource/Performance 237 | http://dbpedia.org/resource/Pumping_station 238 | http://dbpedia.org/resource/Sasha_Waltz 239 | http://dbpedia.org/resource/Spree 240 | http://dbpedia.org/resource/Berlin 241 | http://dbpedia.org/resource/Boiler 242 | http://dbpedia.org/resource/Dance_troupe 243 | http://dbpedia.org/resource/Hall 244 | http://dbpedia.org/resource/Machine 245 | http://dbpedia.org/resource/Man 246 | http://dbpedia.org/resource/Performance 247 | http://dbpedia.org/resource/Pumping_station 248 | http://dbpedia.org/resource/Room 249 | http://dbpedia.org/resource/Berlin_Ostbahnhof 250 | http://dbpedia.org/resource/Boulevard 251 | http://dbpedia.org/resource/Berlin 252 | http://dbpedia.org/resource/Mile 253 | http://dbpedia.org/resource/Nightclub 254 | http://dbpedia.org/resource/Unter_den_Linden 255 | http://dbpedia.org/resource/Cappuccino 256 | http://dbpedia.org/resource/Dance 257 | http://dbpedia.org/resource/Dance 258 | 
http://dbpedia.org/resource/Month 259 | http://dbpedia.org/resource/Performance 260 | http://dbpedia.org/resource/Ticket_(admission) 261 | http://dbpedia.org/resource/Blue 262 | http://dbpedia.org/resource/Finland 263 | http://dbpedia.org/resource/Floor_plan 264 | http://dbpedia.org/resource/Green 265 | http://dbpedia.org/resource/House 266 | http://dbpedia.org/resource/Leaf 267 | http://dbpedia.org/resource/Leaf_shape 268 | http://dbpedia.org/resource/Park 269 | http://dbpedia.org/resource/Red 270 | http://dbpedia.org/resource/Roof 271 | http://dbpedia.org/resource/Turku 272 | http://dbpedia.org/resource/Yellow 273 | http://dbpedia.org/resource/Commuting 274 | http://dbpedia.org/resource/Curve 275 | http://dbpedia.org/resource/Drop_(liquid) 276 | http://dbpedia.org/resource/Floor 277 | http://dbpedia.org/resource/Grand_Central_Terminal 278 | http://dbpedia.org/resource/New_York_City 279 | http://dbpedia.org/resource/Runway_(fashion) 280 | http://dbpedia.org/resource/Shape 281 | http://dbpedia.org/resource/Video 282 | http://dbpedia.org/resource/Wall 283 | http://dbpedia.org/resource/Window 284 | http://dbpedia.org/resource/Port_of_Turku 285 | http://dbpedia.org/resource/House 286 | http://dbpedia.org/resource/Landmark 287 | http://dbpedia.org/resource/September 288 | http://dbpedia.org/resource/Turku 289 | http://dbpedia.org/resource/Woodland 290 | http://dbpedia.org/resource/Artist 291 | http://dbpedia.org/resource/Artist 292 | http://dbpedia.org/resource/Interior_design 293 | http://dbpedia.org/resource/Sculpture 294 | http://dbpedia.org/resource/Finnish_sauna 295 | http://dbpedia.org/resource/2011 296 | http://dbpedia.org/resource/Culture 297 | http://dbpedia.org/resource/Exhibition 298 | http://dbpedia.org/resource/Festival 299 | http://dbpedia.org/resource/Fiberglass 300 | http://dbpedia.org/resource/Sauna 301 | http://dbpedia.org/resource/Garlic 302 | http://dbpedia.org/resource/House 303 | http://dbpedia.org/resource/Red 304 | http://dbpedia.org/resource/Shape 305 | http://dbpedia.org/resource/Turku 306 | http://dbpedia.org/resource/Alice_Roosevelt_Longworth 307 | http://dbpedia.org/resource/Hedonism 308 | http://dbpedia.org/resource/Gossip 309 | http://dbpedia.org/resource/Gossip 310 | http://dbpedia.org/resource/Research 311 | http://dbpedia.org/resource/Husband 312 | http://dbpedia.org/resource/Presentation 313 | http://dbpedia.org/resource/September 314 | http://dbpedia.org/resource/Social_psychology 315 | http://dbpedia.org/resource/Emotion 316 | http://dbpedia.org/resource/Fiction 317 | http://dbpedia.org/resource/Gossip 318 | http://dbpedia.org/resource/Man 319 | http://dbpedia.org/resource/Person 320 | http://dbpedia.org/resource/Person 321 | http://dbpedia.org/resource/Term_(time) 322 | http://dbpedia.org/resource/Undergraduate_education 323 | http://dbpedia.org/resource/Woman 324 | http://dbpedia.org/resource/Female 325 | http://dbpedia.org/resource/Gossip 326 | http://dbpedia.org/resource/Gossip 327 | http://dbpedia.org/resource/Questionnaire 328 | http://dbpedia.org/resource/Self-esteem 329 | http://dbpedia.org/resource/Social_support 330 | http://dbpedia.org/resource/Term_(time) 331 | http://dbpedia.org/resource/Undergraduate_education 332 | -------------------------------------------------------------------------------- /datasets/dbpedia-url.txt.absent: -------------------------------------------------------------------------------- 1 | en.wikipedia.org/wiki/Uploading_and_downloading 2 | en.wikipedia.org/wiki/New_Jersey_Nets 3 | 
en.wikipedia.org/wiki/CSKA_Universal_Sports_Hall 4 | en.wikipedia.org/wiki/New_Jersey_Nets 5 | en.wikipedia.org/wiki/New_Jersey_Nets 6 | en.wikipedia.org/wiki/New_Jersey_Nets 7 | en.wikipedia.org/wiki/Grounding_(punishment) 8 | en.wikipedia.org/wiki/Alzheimer’s_disease 9 | en.wikipedia.org/wiki/Heart_disease 10 | en.wikipedia.org/wiki/Driverless_car 11 | en.wikipedia.org/wiki/Google_driverless_car 12 | en.wikipedia.org/wiki/People's_Republic_of_China 13 | en.wikipedia.org/wiki/Gun_politics 14 | en.wikipedia.org/wiki/Ballet_tutu 15 | en.wikipedia.org/wiki/Leaf_shape 16 | -------------------------------------------------------------------------------- /datasets/dbpedia.tsv: -------------------------------------------------------------------------------- 1 | targets context 2 | Rebirth, scars, ngs b In a sense it’s an improvement on “Rebirth,” even if a couple of the songs bear the scars of that period, particularly the Rick Rubinesque title track, and “Popular,” which has some of the new wave sizzle he was toying with. 3 | Prokhorov, basketball, players, soccer, popularity, future, interaction, Nets, Russia, sport, hockey, Russians, coaches Prokhorov said the visit would serve as a cornerstone for future interaction between players and coaches from the Nets and young Russians, with the aim of developing basketball in Russia, where the sport is a distant third in popularity behind soccer and hockey. 4 | car, engineers, traffic light, rear-ended, accident, Google car The only accident, engineers said, was when one Google car was rear-ended while stopped at a traffic light. 5 | red, fiberglass, festival, Turku, garlic, exhibition, culture, Finnish bathhouses, 2011, sauna, houses, shaped On the front lawn we lingered in front of his work-in-progress: a fiberglass sauna shaped like a garlic clove, one of five Finnish bathhouses that will be on display in SaunaLab, an exhibition at Turku 2011, the coming yearlong culture festival. 6 | cappuccino, ticket, months, dance, dance, performance During warmer months, a casual passer-by without a ticket to the night’s performance can enjoy a cappuccino with the dance aficionados on the waterfront terrace, but the main draw is the celebrated Ms. Waltz and her dancers, interlocked in twisted, shifting embraces. 7 | Lombard Street, San Francisco, streets One even drove itself down Lombard Street in San Francisco, one of the steepest and curviest streets in the nation. 8 | Prokhorov, arena of CSKA Moscow, game, exhibition, years, players, practice, Nets, Russia, court, club The stay was just long enough for a little practice and an exhibition in which the Nets were interspersed on the court with top players from youth leagues across Russia — some as young as 8 years old — for a light-hearted game in the arena of CSKA Moscow, the professional Russian club Prokhorov once owned. 9 | miles, boulevard, Unter den Linden, Ostbahnhof, German capital’s, nightclubs Radialsystem V, miles away from the German capital’s grand boulevard of Unter den Linden, is surrounded by a cluster of gritty nightclubs near the Ostbahnhof train station. 10 | mbard Street 11 | nightmare, modernist, World War I, Cubism, Futurism Its narrative goes something like this: After the exhausting nightmare of World War I, Europe’s early modernist vanguard backed away from disruptive experimental styles like Cubism and Futurism. 
12 | nausea, father, dizzy spells, cancer, Alzheimer’s disease, heart ailment, mother His father had a heart ailment, his mother had dizzy spells and nausea, and both were worried about Alzheimer’s disease and cancer. 13 | parents, medical marijuana But with age and the growing acceptance of medical marijuana, his parents were curious. 14 | research, gossiping New research finds that gossiping can be good for you — as long as you have something nice to say. 15 | gossip But it seems the greater pleasure comes from more temperate gossip. 16 | experimentation It failed less for its experimentation, which was spotty, than for the strenuousness with which he pursued it. 17 | history I found the whole thing totally engrossing: a survey-style piece of investigative history with a bomb ticking away inside. 18 | research, scientific, Rikers Island, aggressive, album, idea, album, sentence, jail “Rebirth,” the last album he released before he began serving a one-year jail sentence at Rikers Island, was his quixotic attempt at a rap-rock album, an out-of-date idea that he treated like aggressive scientific research. 19 | alarmists But the alarmists have not seen anything yet. 20 | New York, windows, catwalk, teardrops, commuters, curved, floor, shape, video, walls, Grand Central Terminal Inside, it’s even wackier: curved walls, windows in the shape of teardrops, and a catwalk with a tiny video screen embedded in the floor that shows an endless loop of antlike commuters rushing through Grand Central Terminal in New York. 21 | marketers, lawsuits, Internet, Internet privacy, computer users, Worries, consumer, anxiety, conspiracy theories Worries over Internet privacy have spurred lawsuits, conspiracy theories and consumer anxiety as marketers and others invent new ways to track computer users on the Internet. 22 | person, driving, wheel Harder to notice was that the person at the wheel was not actually driving. 23 | blue, park, yellow, red, house, leaf-shaped, floor plan, Turku, LEAF, Finland, green, roof “LIFE ON A LEAF,” a whimsical yellow house that sits in a wooded park at the edge of Turku in southern Finland, is a lopsided construction with a bright red, blue and green roof and a leaf-shaped floor plan. 24 | Buster Keaton, chair, ladder, dance, audience A tall dancer feigns clumsiness with a slyly graceful gawkiness that would do Buster Keaton proud, stumbling around a ladder and a chair to the delight of the audience. 25 | Los Angeles, San Francisco, roof, Toyota Prius, Highway 1, driving Anyone driving the twists of Highway 1 between San Francisco and Los Angeles recently may have glimpsed a Toyota Prius with a curious funnel-like cylinder on the roof. 26 | man, rucksack, tutu, accordion The short, bearded man in the tutu wanders onstage, an accordion strapped to his back like a rucksack. 27 | research, marijuana They looked at some research and decided marijuana was worth a try. 28 | gossiping, women, person, men, short-term, person, emotional, fictional, undergraduates In the first study, intended to measure a person’s short-term emotional reaction to gossiping, 140 men and women, primarily undergraduates, were asked to talk about a fictional person either positively or negatively. 29 | years, parents, Illinois, marijuana, supplying, writer Bryan, 46, a writer who lives in Illinois, began supplying his parents about five years ago, after he told them about his own marijuana use. 
30 | beef tenderloin, world, China, trade imbalances, renminbi, Europe, Treasury Secretary, Japan, Canada, currency, Timothy F. Geithner, scallops Over seared scallops and beef tenderloin, Treasury Secretary Timothy F. Geithner urged his counterparts from Europe, Canada and Japan to help persuade China to let its currency, the renminbi, rise in value — a crucial element in redressing the trade imbalances that are threatening recovery around the world. 31 | fan, Nets If the Nets pick up a strong Russian fan base in the process, that would be nice, too, he said. 32 | middle-age, rites, families, marijuana, parents To the rites of middle-age passage, some families are adding another: buying marijuana for aging parents. 33 | artists, conservatism To some degree recuperative conservatism worked; it gave artists a way to keep moving when an old way seemed blocked. 34 | Embassy, Friday, economic, dinner, economic, international, Canadian, Obama administration, world, problem At a private dinner on Friday at the Canadian Embassy, finance officials from seven world economic powers focused on the most vexing international economic problem facing the Obama administration. 35 | ancient Greece, Italian Renaissance, traditional They retreated to traditional forms and looked back to an imagined classical past — ancient Greece, the Italian Renaissance — for images of wholeness and harmony. 36 | box office Boilerplate is safe box office, and we’ve gotten our share lately. 37 | social psychologist, presentation, September, husband In a presentation in September, Jennifer Cole, a social psychologist, and Hannah Scrivener reported results from two related studies, both of which demonstrate that it’s in one’s self-interest to say “So-and-so’s second husband is adorable” rather than “She married that lout?” 38 | Internet, Internet, HTML 5, privacy Nearly everyone who uses the Internet will face the privacy risks that come with those capabilities, which are an integral part of the Web language that will soon power the Internet: HTML 5. 39 | gun-possession, jailed, charge (He was jailed on a gun-possession charge.) 40 | interior designer, artist, artists, sculptor Jan-Erik Andersson — the artist, sculptor and interior designer who, with contributions from 20 other artists, created the place, and gives tours to groups of visitors who make arrangements in advance — was waiting for me. 41 | online, marketers, computer users, years, Web developers, advertisers In the next few years, a powerful new suite of capabilities will become available to Web developers that could give marketers and advertisers access to many more details about computer users’ online activities. 42 | I Am Not a Human Being, recording, month, album, Lil Wayne, songs From somewhere among those recording sessions, and those rumored to be slated for “Tha Carter IV,” his next album, came the songs found on “I Am Not a Human Being,” an uncentered collection of odds and ends meant to sate interest until Lil Wayne’s release from Rikers, which is expected to be early next month. 43 | miles, miles, navigation system, cars, technician, human, human, wheel With someone behind the wheel to take control if something goes awry and a technician in the passenger seat to monitor the navigation system, seven test cars have driven 1,000 miles without human intervention and more than 140,000 miles with only occasional human control. 
44 | country, wealthy, greeted, Russia, sponsors, home, team, Prokhorov, players “Welcome to Russia,” Prokhorov said as he greeted his players during their first trip to a country he hopes will become a kind of second home for the team — as well as a source of wealthy sponsors. 45 | September, house, landmark, Turku’s harbor, Turku, woods On an unseasonably warm September morning I traveled to the Leaf house, which has become something of a local landmark, passing Turku’s harbor overlooked by pine-forested hills, then turning into a clearing in the woods. 46 | annual meetings of the International Monetary Fund, currency But the next afternoon, the annual meetings of the International Monetary Fund ended with a tepid statement that made only fleeting and indirect references to the simmering currency tensions. 47 | pumping station, room, hall, Berlin, boiler, dance troupe, machine, man, performance Even though Berliners know about the dance troupe, visitors often miss its frequently sold-out performances in the towering space of the former machine hall and boiler room of the pumping station. 48 | project, artificial-intelligence, vehicles that can drive themselves, car, Google, car, software, human The car is a project of Google, which has been working in secret but in plain view on vehicles that can drive themselves, using artificial-intelligence software that can sense anything near the car and mimic the decisions made by a human driver. 49 | house dance, Spree, Sasha Waltz, brick, Berlin, Gothic, man, pumping station, dance, 2006, choreographer, performance Sasha Waltz & Guests (pictured), named for its choreographer, is the house dance ensemble at Radialsystem V (Holzmarktstr, 33; radialsystem.de), a performance space that opened in 2006 in an old brick Gothic pumping station on the Spree River in Berlin. 50 | mother, grounded “We would have grounded him,” said his mother, who is 72. 51 | social support, gossiping, long-term, self-esteem, female, undergrads, gossip, questionnaires The second study, which looked into the long-term effects of gossiping on well-being, had 160 participants, mostly female undergrads, fill out questionnaires about their tendency to gossip, their self-esteem and their perceived social support. 52 | illegal drugs, parents When he was growing up, he said, his parents were very strict about illegal drugs. 53 | hedonist, Alice Roosevelt Longworth “IF you can’t say something good about someone, sit right here by me,” Alice Roosevelt Longworth, a self-proclaimed “hedonist,” used to say. 54 | billionaire, home, owner, Moscow, Sunday, Nets, country, Mikhail D. Prokhorov The Nets dropped into Moscow on Sunday for a one-day visit meant to raise their profile in the home country of their new owner, the Russian billionaire Mikhail D. Prokhorov. 55 | social order, ideology At the same time classicism as an ideology, with its emphasis on order, purity and exclusion, was being espoused by rising political figures intent on creating a new, lethally exclusionary social order. 56 | Visitors with a stake in art-as-uplift will find the story it tells mystifying, if not perverse. 57 | free-associative, idea, Lil Wayne, self-consciousness, boredom Lil Wayne’s least interesting mode is fixation. Restless and free-associative, he’s best when bouncing from one idea to the next, sticking around just long enough to master it, then splitting before boredom or self-consciousness sets in. 
58 | exhibition, Chaos, Italy, names, Guggenheim Museum, percentage, Germany, France, fall, exhibition, Art, Classicism So it’s great that the Guggenheim Museum is giving us the opposite in its major fall exhibition, “Chaos and Classicism: Art in France, Italy, and Germany, 1918-1936.” With its high percentage of unfamiliar names, the exhibition won’t pull crowds. 59 | multimedia, smartphone, software, downloading, restaurant, offline, e-mail, content, users It will make it easier for users to view multimedia content without downloading extra software; check e-mail offline; or find a favorite restaurant or shop on a smartphone. 60 | Web, code, Web pages, era, Hypertext Markup Language, promises, Internet browsing, years The new Web code, the fifth version of Hypertext Markup Language used to create Web pages, is already in limited use, and it promises to usher in a new era of Internet browsing within the next few years. 61 | -------------------------------------------------------------------------------- /datasets/dbpedia.ttl.contexts.tsv: -------------------------------------------------------------------------------- 1 | targets contexts 2 | To some degree recuperative conservatism worked; it gave artists a way to keep moving when an old way seemed blocked. 3 | The second study, which looked into the long-term effects of gossiping on well-being, had 160 participants, mostly female undergrads, fill out questionnaires about their tendency to gossip, their self-esteem and their perceived social support. 4 | The short, bearded man in the tutu wanders onstage, an accordion strapped to his back like a rucksack. 5 | Inside, it’s even wackier: curved walls, windows in the shape of teardrops, and a catwalk with a tiny video screen embedded in the floor that shows an endless loop of antlike commuters rushing through Grand Central Terminal in New York. 6 | At a private dinner on Friday at the Canadian Embassy, finance officials from seven world economic powers focused on the most vexing international economic problem facing the Obama administration. 7 | It failed less for its experimentation, which was spotty, than for the strenuousness with which he pursued it. 8 | Prokhorov said the visit would serve as a cornerstone for future interaction between players and coaches from the Nets and young Russians, with the aim of developing basketball in Russia, where the sport is a distant third in popularity behind soccer and hockey. 9 | At the same time classicism as an ideology, with its emphasis on order, purity and exclusion, was being espoused by rising political figures intent on creating a new, lethally exclusionary social order. 10 | Harder to notice was that the person at the wheel was not actually driving. 11 | But the alarmists have not seen anything yet. 12 | Its narrative goes something like this: After the exhausting nightmare of World War I, Europe’s early modernist vanguard backed away from disruptive experimental styles like Cubism and Futurism. 13 | But it seems the greater pleasure comes from more temperate gossip. 14 | Nearly everyone who uses the Internet will face the privacy risks that come with those capabilities, which are an integral part of the Web language that will soon power the Internet: HTML 5. 15 | Jan-Erik Andersson — the artist, sculptor and interior designer who, with contributions from 20 other artists, created the place, and gives tours to groups of visitors who make arrangements in advance — was waiting for me. 
16 | “We would have grounded him,” said his mother, who is 72. 17 | Boilerplate is safe box office, and we’ve gotten our share lately. 18 | On an unseasonably warm September morning I traveled to the Leaf house, which has become something of a local landmark, passing Turku’s harbor overlooked by pine-forested hills, then turning into a clearing in the woods. 19 | On the front lawn we lingered in front of his work-in-progress: a fiberglass sauna shaped like a garlic clove, one of five Finnish bathhouses that will be on display in SaunaLab, an exhibition at Turku 2011, the coming yearlong culture festival. 20 | In a sense it’s an improvement on “Rebirth,” even if a couple of the songs bear the scars of that period, particularly the Rick Rubinesque title track, and “Popular,” which has some of the new wave sizzle he was toying with. 21 | In a presentation in September, Jennifer Cole, a social psychologist, and Hannah Scrivener reported results from two related studies, both of which demonstrate that it’s in one’s self-interest to say “So-and-so’s second husband is adorable” rather than “She married that lout?” 22 | But the next afternoon, the annual meetings of the International Monetary Fund ended with a tepid statement that made only fleeting and indirect references to the simmering currency tensions. 23 | In the first study, intended to measure a person’s short-term emotional reaction to gossiping, 140 men and women, primarily undergraduates, were asked to talk about a fictional person either positively or negatively. 24 | (He was jailed on a gun-possession charge.) 25 | Over seared scallops and beef tenderloin, Treasury Secretary Timothy F. Geithner urged his counterparts from Europe, Canada and Japan to help persuade China to let its currency, the renminbi, rise in value — a crucial element in redressing the trade imbalances that are threatening recovery around the world. 26 | Visitors with a stake in art-as-uplift will find the story it tells mystifying, if not perverse. 27 | Lil Wayne’s least interesting mode is fixation. Restless and free-associative, he’s best when bouncing from one idea to the next, sticking around just long enough to master it, then splitting before boredom or self-consciousness sets in. 28 | A tall dancer feigns clumsiness with a slyly graceful gawkiness that would do Buster Keaton proud, stumbling around a ladder and a chair to the delight of the audience. 29 | They looked at some research and decided marijuana was worth a try. 30 | Sasha Waltz & Guests (pictured), named for its choreographer, is the house dance ensemble at Radialsystem V (Holzmarktstr, 33; radialsystem.de), a performance space that opened in 2006 in an old brick Gothic pumping station on the Spree River in Berlin. 31 | “Welcome to Russia,” Prokhorov said as he greeted his players during their first trip to a country he hopes will become a kind of second home for the team — as well as a source of wealthy sponsors. 32 | “LIFE ON A LEAF,” a whimsical yellow house that sits in a wooded park at the edge of Turku in southern Finland, is a lopsided construction with a bright red, blue and green roof and a leaf-shaped floor plan. 33 | Bryan, 46, a writer who lives in Illinois, began supplying his parents about five years ago, after he told them about his own marijuana use. 34 | New research finds that gossiping can be good for you — as long as you have something nice to say. 35 | To the rites of middle-age passage, some families are adding another: buying marijuana for aging parents. 
36 | When he was growing up, he said, his parents were very strict about illegal drugs. 37 | It will make it easier for users to view multimedia content without downloading extra software; check e-mail offline; or find a favorite restaurant or shop on a smartphone. 38 | “Rebirth,” the last album he released before he began serving a one-year jail sentence at Rikers Island, was his quixotic attempt at a rap-rock album, an out-of-date idea that he treated like aggressive scientific research. 39 | The car is a project of Google, which has been working in secret but in plain view on vehicles that can drive themselves, using artificial-intelligence software that can sense anything near the car and mimic the decisions made by a human driver. 40 | Worries over Internet privacy have spurred lawsuits, conspiracy theories and consumer anxiety as marketers and others invent new ways to track computer users on the Internet. 41 | One even drove itself down Lombard Street in San Francisco, one of the steepest and curviest streets in the nation. 42 | So it’s great that the Guggenheim Museum is giving us the opposite in its major fall exhibition, “Chaos and Classicism: Art in France, Italy, and Germany, 1918-1936.” With its high percentage of unfamiliar names, the exhibition won’t pull crowds. 43 | “IF you can’t say something good about someone, sit right here by me,” Alice Roosevelt Longworth, a self-proclaimed “hedonist,” used to say. 44 | But with age and the growing acceptance of medical marijuana, his parents were curious. 45 | During warmer months, a casual passer-by without a ticket to the night’s performance can enjoy a cappuccino with the dance aficionados on the waterfront terrace, but the main draw is the celebrated Ms. Waltz and her dancers, interlocked in twisted, shifting embraces. 46 | The only accident, engineers said, was when one Google car was rear-ended while stopped at a traffic light. 47 | I found the whole thing totally engrossing: a survey-style piece of investigative history with a bomb ticking away inside. 48 | They retreated to traditional forms and looked back to an imagined classical past — ancient Greece, the Italian Renaissance — for images of wholeness and harmony. 49 | The new Web code, the fifth version of Hypertext Markup Language used to create Web pages, is already in limited use, and it promises to usher in a new era of Internet browsing within the next few years. 50 | The stay was just long enough for a little practice and an exhibition in which the Nets were interspersed on the court with top players from youth leagues across Russia — some as young as 8 years old — for a light-hearted game in the arena of CSKA Moscow, the professional Russian club Prokhorov once owned. 51 | From somewhere among those recording sessions, and those rumored to be slated for “Tha Carter IV,” his next album, came the songs found on “I Am Not a Human Being,” an uncentered collection of odds and ends meant to sate interest until Lil Wayne’s release from Rikers, which is expected to be early next month. 52 | The Nets dropped into Moscow on Sunday for a one-day visit meant to raise their profile in the home country of their new owner, the Russian billionaire Mikhail D. Prokhorov. 53 | Anyone driving the twists of Highway 1 between San Francisco and Los Angeles recently may have glimpsed a Toyota Prius with a curious funnel-like cylinder on the roof. 
54 | Even though Berliners know about the dance troupe, visitors often miss its frequently sold-out performances in the towering space of the former machine hall and boiler room of the pumping station. 55 | Radialsystem V, miles away from the German capital’s grand boulevard of Unter den Linden, is surrounded by a cluster of gritty nightclubs near the Ostbahnhof train station. 56 | His father had a heart ailment, his mother had dizzy spells and nausea, and both were worried about Alzheimer’s disease and cancer. 57 | In the next few years, a powerful new suite of capabilities will become available to Web developers that could give marketers and advertisers access to many more details about computer users’ online activities. 58 | With someone behind the wheel to take control if something goes awry and a technician in the passenger seat to monitor the navigation system, seven test cars have driven 1,000 miles without human intervention and more than 140,000 miles with only occasional human control. 59 | If the Nets pick up a strong Russian fan base in the process, that would be nice, too, he said. 60 | -------------------------------------------------------------------------------- /datasets/dbpedia.ttl.phrases.tsv: -------------------------------------------------------------------------------- 1 | targets contexts 2 | soccer 3 | mother 4 | free-associative 5 | currency 6 | privacy 7 | illegal drugs 8 | scars 9 | Canada 10 | parents 11 | human 12 | hall 13 | Turku 14 | Russia 15 | husband 16 | ancient Greece 17 | conservatism 18 | dance 19 | shape 20 | album 21 | advertisers 22 | self-consciousness 23 | ideology 24 | players 25 | ticket 26 | boulevard 27 | Worries 28 | Lil Wayne 29 | streets 30 | 2006 31 | artist 32 | conservatism 33 | cars 34 | hedonist 35 | coaches 36 | Turku 37 | man 38 | machine 39 | currency 40 | hall 41 | walls 42 | walls 43 | gossip 44 | rucksack 45 | car 46 | traffic light 47 | self-esteem 48 | software 49 | short-term 50 | idea 51 | court 52 | world 53 | exhibition 54 | Russia 55 | September 56 | floor plan 57 | presentation 58 | curved 59 | trade imbalances 60 | traditional 61 | Sasha Waltz 62 | anxiety 63 | ideology 64 | cars 65 | Lombard Street 66 | Web 67 | ladder 68 | multimedia 69 | undergraduates 70 | San Francisco 71 | short-term 72 | Google 73 | Prokhorov 74 | Embassy 75 | online 76 | game 77 | Canadian 78 | artists 79 | Berlin 80 | Los Angeles 81 | marijuana 82 | Canada 83 | online 84 | cappuccino 85 | artists 86 | sport 87 | writer 88 | songs 89 | undergrads 90 | Turku 91 | catwalk 92 | players 93 | interaction 94 | Internet 95 | sponsors 96 | Nets 97 | home 98 | Europe 99 | New York 100 | parents 101 | long-term 102 | experimentation 103 | gossiping 104 | Nets 105 | shaped 106 | wheel 107 | Prokhorov 108 | writer 109 | accident 110 | pumping station 111 | exhibition 112 | San Francisco 113 | artificial-intelligence 114 | computer users 115 | tutu 116 | September 117 | accordion 118 | basketball 119 | Web developers 120 | sponsors 121 | promises 122 | marijuana 123 | smartphone 124 | human 125 | Nets 126 | garlic 127 | dance 128 | Toyota Prius 129 | offline 130 | World War I 131 | gun-possession 132 | jail 133 | Grand Central Terminal 134 | dance troupe 135 | HTML 5 136 | Turku’s harbor 137 | shaped 138 | free-associative 139 | economic 140 | September 141 | parents 142 | ticket 143 | marketers 144 | names 145 | Russians 146 | person 147 | long-term 148 | landmark 149 | Web developers 150 | floor plan 151 | billionaire 152 | research 153 | Gothic 154 | 
history 155 | problem 156 | performance 157 | sentence 158 | Illinois 159 | man 160 | Google car 161 | percentage 162 | World War I 163 | Gothic 164 | experimentation 165 | miles 166 | research 167 | gossip 168 | Rikers Island 169 | era 170 | international 171 | navigation system 172 | recording 173 | fan 174 | miles 175 | LEAF 176 | wheel 177 | I Am Not a Human Being 178 | culture 179 | parents 180 | fall 181 | streets 182 | project 183 | interior designer 184 | conspiracy theories 185 | future 186 | currency 187 | China 188 | German capital’s 189 | female 190 | marijuana 191 | pumping station 192 | fiberglass 193 | research 194 | club 195 | house 196 | undergrads 197 | wealthy 198 | parents 199 | self-consciousness 200 | catwalk 201 | rear-ended 202 | machine 203 | charge 204 | brick 205 | Italy 206 | blue 207 | smartphone 208 | album 209 | gossip 210 | popularity 211 | scallops 212 | parents 213 | car 214 | project 215 | economic 216 | driving 217 | dinner 218 | lawsuits 219 | pumping station 220 | dance 221 | windows 222 | heart ailment 223 | teardrops 224 | father 225 | hockey 226 | gun-possession 227 | content 228 | software 229 | percentage 230 | Ostbahnhof 231 | greeted 232 | dance troupe 233 | Russia 234 | roof 235 | families 236 | Alzheimer’s disease 237 | families 238 | box office 239 | code 240 | arena of CSKA Moscow 241 | man 242 | Finnish bathhouses 243 | red 244 | court 245 | Germany 246 | engineers 247 | car 248 | economic 249 | marijuana 250 | Nets 251 | game 252 | performance 253 | Classicism 254 | Web pages 255 | country 256 | boredom 257 | miles 258 | Worries 259 | car 260 | gossip 261 | nightmare 262 | men 263 | nausea 264 | social support 265 | software 266 | self-esteem 267 | names 268 | Internet 269 | Sasha Waltz 270 | man 271 | fall 272 | research 273 | downloading 274 | marketers 275 | German capital’s 276 | artificial-intelligence 277 | women 278 | dance 279 | performance 280 | sauna 281 | France 282 | Finland 283 | popularity 284 | Internet browsing 285 | gossiping 286 | Friday 287 | owner 288 | scientific 289 | social order 290 | Nets 291 | club 292 | culture 293 | Los Angeles 294 | consumer 295 | Timothy F. 
Geithner 296 | modernist 297 | chair 298 | women 299 | rear-ended 300 | traditional 301 | house dance 302 | anxiety 303 | Highway 1 304 | world 305 | men 306 | mother 307 | park 308 | green 309 | person 310 | team 311 | marijuana 312 | years 313 | Moscow 314 | presentation 315 | middle-age 316 | nausea 317 | future 318 | month 319 | Berlin 320 | interior designer 321 | human 322 | annual meetings of the International Monetary Fund 323 | ancient Greece 324 | ladder 325 | Italian Renaissance 326 | miles 327 | gossiping 328 | lawsuits 329 | performance 330 | players 331 | grounded 332 | marijuana 333 | accident 334 | house 335 | home 336 | house dance 337 | medical marijuana 338 | HTML 5 339 | Art 340 | Friday 341 | Alice Roosevelt Longworth 342 | Web 343 | rites 344 | nightclubs 345 | roof 346 | San Francisco 347 | performance 348 | person 349 | world 350 | Lil Wayne 351 | traffic light 352 | Rebirth 353 | Obama administration 354 | code 355 | Chaos 356 | Guggenheim Museum 357 | middle-age 358 | practice 359 | festival 360 | Grand Central Terminal 361 | billionaire 362 | charge 363 | songs 364 | sculptor 365 | Berlin 366 | Nets 367 | France 368 | idea 369 | Ostbahnhof 370 | social psychologist 371 | songs 372 | floor 373 | person 374 | months 375 | human 376 | Alice Roosevelt Longworth 377 | Sunday 378 | scars 379 | Rebirth 380 | engineers 381 | vehicles that can drive themselves 382 | red 383 | windows 384 | multimedia 385 | choreographer 386 | exhibition 387 | car 388 | wheel 389 | software 390 | economic 391 | woods 392 | years 393 | arena of CSKA Moscow 394 | pumping station 395 | computer users 396 | Finland 397 | Embassy 398 | alarmists 399 | Rebirth 400 | scallops 401 | Russia 402 | chair 403 | restaurant 404 | downloading 405 | accordion 406 | rucksack 407 | years 408 | cappuccino 409 | years 410 | houses 411 | exhibition 412 | 2011 413 | dizzy spells 414 | restaurant 415 | car 416 | video 417 | month 418 | commuters 419 | sculptor 420 | aggressive 421 | Hypertext Markup Language 422 | Russia 423 | brick 424 | China 425 | mother 426 | grounded 427 | leaf-shaped 428 | Hypertext Markup Language 429 | artist 430 | exhibition 431 | parents 432 | album 433 | Spree 434 | box office 435 | New York 436 | undergraduates 437 | exhibition 438 | Sunday 439 | miles 440 | house 441 | Prokhorov 442 | marketers 443 | room 444 | human 445 | scientific 446 | hedonist 447 | Prokhorov 448 | room 449 | teardrops 450 | dinner 451 | San Francisco 452 | beef tenderloin 453 | players 454 | idea 455 | Italian Renaissance 456 | dance 457 | consumer 458 | human 459 | 2011 460 | Turku 461 | practice 462 | supplying 463 | computer users 464 | nightmare 465 | boiler 466 | country 467 | Chaos 468 | emotional 469 | heart ailment 470 | album 471 | recording 472 | players 473 | dance 474 | nightclubs 475 | man 476 | Russians 477 | artists 478 | months 479 | home 480 | red 481 | driving 482 | era 483 | yellow 484 | fan 485 | Guggenheim Museum 486 | green 487 | years 488 | person 489 | marketers 490 | Lil Wayne 491 | Rebirth 492 | man 493 | international 494 | driving 495 | Treasury Secretary 496 | songs 497 | audience 498 | Spree 499 | social order 500 | Nets 501 | promises 502 | roof 503 | trade imbalances 504 | husband 505 | renminbi 506 | Highway 1 507 | research 508 | dance 509 | basketball 510 | alarmists 511 | coaches 512 | Turku’s harbor 513 | woods 514 | LEAF 515 | Internet privacy 516 | team 517 | 2006 518 | gossiping 519 | I Am Not a Human Being 520 | Nets 521 | modernist 522 | exhibition 523 | sauna 524 | 
country 525 | sport 526 | Timothy F. Geithner 527 | driving 528 | Cubism 529 | questionnaires 530 | hockey 531 | Obama administration 532 | technician 533 | aggressive 534 | Canadian 535 | Futurism 536 | curved 537 | users 538 | Mikhail D. Prokhorov 539 | female 540 | tutu 541 | Italy 542 | parents 543 | advertisers 544 | cancer 545 | fiberglass 546 | e-mail 547 | floor 548 | world 549 | gossiping 550 | wealthy 551 | Unter den Linden 552 | annual meetings of the International Monetary Fund 553 | technician 554 | video 555 | users 556 | Turku 557 | Mikhail D. Prokhorov 558 | social support 559 | boulevard 560 | e-mail 561 | Europe 562 | commuters 563 | Internet 564 | country 565 | problem 566 | vehicles that can drive themselves 567 | years 568 | Classicism 569 | wheel 570 | artists 571 | Japan 572 | Futurism 573 | Russia 574 | Moscow 575 | Unter den Linden 576 | Finnish bathhouses 577 | cancer 578 | shape 579 | boredom 580 | audience 581 | September 582 | father 583 | emotional 584 | beef tenderloin 585 | boiler 586 | Alzheimer’s disease 587 | Internet privacy 588 | Cubism 589 | Turku 590 | social psychologist 591 | Prokhorov 592 | idea 593 | Germany 594 | blue 595 | performance 596 | rites 597 | renminbi 598 | currency 599 | exhibition 600 | house 601 | years 602 | Web pages 603 | gossiping 604 | years 605 | album 606 | Internet 607 | Rikers Island 608 | Japan 609 | garlic 610 | questionnaires 611 | fictional 612 | navigation system 613 | soccer 614 | houses 615 | interaction 616 | Buster Keaton 617 | miles 618 | jailed 619 | conspiracy theories 620 | computer users 621 | dizzy spells 622 | Lombard Street 623 | leaf-shaped 624 | offline 625 | Prokhorov 626 | landmark 627 | jail 628 | Google 629 | Internet browsing 630 | choreographer 631 | red 632 | Toyota Prius 633 | fictional 634 | history 635 | medical marijuana 636 | Lil Wayne 637 | festival 638 | Illinois 639 | album 640 | Art 641 | Treasury Secretary 642 | mother 643 | roof 644 | content 645 | person 646 | players 647 | jailed 648 | privacy 649 | home 650 | greeted 651 | Berlin 652 | owner 653 | supplying 654 | Internet 655 | Buster Keaton 656 | dance 657 | yellow 658 | illegal drugs 659 | sentence 660 | park 661 | Internet 662 | research 663 | Google car 664 | -------------------------------------------------------------------------------- /datasets/kore50-urls.txt: -------------------------------------------------------------------------------- 1 | http://dbpedia.org/resource/David_Beckham 2 | http://dbpedia.org/resource/Victoria_Beckham 3 | http://dbpedia.org/resource/David_Beckham 4 | http://dbpedia.org/resource/Victoria_Beckham 5 | http://dbpedia.org/resource/Tiger_Woods 6 | http://dbpedia.org/resource/Elin_Nordegren 7 | http://dbpedia.org/resource/Tiger_Woods 8 | http://dbpedia.org/resource/U.S._Open_(golf) 9 | http://dbpedia.org/resource/Madonna_(entertainer) 10 | http://dbpedia.org/resource/Eva_Perón 11 | http://dbpedia.org/resource/Carlos_Leon 12 | http://dbpedia.org/resource/Madonna_(entertainer) 13 | http://dbpedia.org/resource/First_Lady_of_Argentina 14 | http://dbpedia.org/resource/Angelina_Jolie 15 | http://dbpedia.org/resource/Jon_Voight 16 | http://dbpedia.org/resource/Brad_Pitt 17 | http://dbpedia.org/resource/Heidi_Klum 18 | http://dbpedia.org/resource/Seal_(musician) 19 | http://dbpedia.org/resource/Las_Vegas,_Nevada 20 | http://dbpedia.org/resource/Paris_Hilton 21 | http://dbpedia.org/resource/Kim_Kardashian 22 | http://dbpedia.org/resource/Justin_Bieber 23 | http://dbpedia.org/resource/Lady_gaga 24 | 
http://dbpedia.org/resource/Kate_Perry 25 | http://dbpedia.org/resource/MTV 26 | http://dbpedia.org/resource/Twitter 27 | http://dbpedia.org/resource/Bob_Dylan 28 | http://dbpedia.org/resource/Hurricane_(song) 29 | http://dbpedia.org/resource/Rubin_Carter 30 | http://dbpedia.org/resource/Desire_(Bob_Dylan_album) 31 | http://dbpedia.org/resource/Desire_(Bob_Dylan_album) 32 | http://dbpedia.org/resource/Emmylou_Harris 33 | http://dbpedia.org/resource/Joey_(Bob_Dylan_song) 34 | http://dbpedia.org/resource/Eric_Clapton 35 | http://dbpedia.org/resource/Jeff_Beck 36 | http://dbpedia.org/resource/Jimmy_Page 37 | http://dbpedia.org/resource/Paul_Allen 38 | http://dbpedia.org/resource/EMP_Museum 39 | http://dbpedia.org/resource/Seattle 40 | http://dbpedia.org/resource/Jimi_Hendrix 41 | http://dbpedia.org/resource/Bob_Dylan 42 | http://dbpedia.org/resource/Frank_Sinatra 43 | http://dbpedia.org/resource/Bob_Dylan 44 | http://dbpedia.org/resource/Billy_Joel 45 | http://dbpedia.org/resource/Carlos_Santana 46 | http://dbpedia.org/resource/Columbia_Records 47 | http://dbpedia.org/resource/Sony_Music_Entertainment 48 | http://dbpedia.org/resource/Johnny_Cash 49 | http://dbpedia.org/resource/American_Recordings_(album) 50 | http://dbpedia.org/resource/Rick_Rubin 51 | http://dbpedia.org/resource/Josh_Homme 52 | http://dbpedia.org/resource/Dave_Grohl 53 | http://dbpedia.org/resource/John_Paul_Jones_(musician) 54 | http://dbpedia.org/resource/Steve_Jobs 55 | http://dbpedia.org/resource/Joan_Baez 56 | http://dbpedia.org/resource/Stanford_University 57 | http://dbpedia.org/resource/Isle_of_wight_festival 58 | http://dbpedia.org/resource/Woodstock_Festival 59 | http://dbpedia.org/resource/Miles_davis 60 | http://dbpedia.org/resource/Chicago_(band) 61 | http://dbpedia.org/resource/Joni_Mitchell 62 | http://dbpedia.org/resource/Eric_Clapton 63 | http://dbpedia.org/resource/Blues 64 | http://dbpedia.org/resource/Rock_music 65 | http://dbpedia.org/resource/John_Mayall 66 | http://dbpedia.org/resource/Steve_Jobs 67 | http://dbpedia.org/resource/Apple_Inc. 68 | http://dbpedia.org/resource/Stanford_University 69 | http://dbpedia.org/resource/Steve_Ballmer 70 | http://dbpedia.org/resource/Stanford_University 71 | http://dbpedia.org/resource/Microsoft 72 | http://dbpedia.org/resource/Microsoft_Windows 73 | http://dbpedia.org/resource/Cairo_(operating_system) 74 | http://dbpedia.org/resource/Microsoft 75 | http://dbpedia.org/resource/Bill_Gates 76 | http://dbpedia.org/resource/Steve_Jobs 77 | http://dbpedia.org/resource/Bill_Gates 78 | http://dbpedia.org/resource/Sergey_Brin 79 | http://dbpedia.org/resource/Larry_Page 80 | http://dbpedia.org/resource/Karl_Albrecht 81 | http://dbpedia.org/resource/Theo_Albrecht 82 | http://dbpedia.org/resource/Apple_Inc. 83 | http://dbpedia.org/resource/Mango_(clothing) 84 | http://dbpedia.org/resource/Orange_(telecommunications) 85 | http://dbpedia.org/resource/Sam_Zell 86 | http://dbpedia.org/resource/Equity_International 87 | http://dbpedia.org/resource/Pixar 88 | http://dbpedia.org/resource/Cars_(film) 89 | http://dbpedia.org/resource/John_Lasseter 90 | http://dbpedia.org/resource/Mars_bar 91 | http://dbpedia.org/resource/Galaxy_(chocolate) 92 | http://dbpedia.org/resource/Bounty_(chocolate_bar) 93 | http://dbpedia.org/resource/Robert_Bosch_GmbH 94 | http://dbpedia.org/resource/Sharp_Corporation 95 | http://dbpedia.org/resource/Manchester_City_F.C. 96 | http://dbpedia.org/resource/Tottenham_Hotspur_F.C. 97 | http://dbpedia.org/resource/Arsenal_F.C. 
98 | http://dbpedia.org/resource/Emirates_Stadium 99 | http://dbpedia.org/resource/Atlético_Madrid 100 | http://dbpedia.org/resource/Real_Madrid_C.F. 101 | http://dbpedia.org/resource/Thomas_Müller_(footballer) 102 | http://dbpedia.org/resource/England_national_football_team 103 | http://dbpedia.org/resource/Thomas_Müller_(footballer) 104 | http://dbpedia.org/resource/Mario_Gomez 105 | http://dbpedia.org/resource/FC_Bayern_Munich 106 | http://dbpedia.org/resource/Norbert_Haug 107 | http://dbpedia.org/resource/FC_Red_Bull_Salzburg 108 | http://dbpedia.org/resource/Rudi_Völler 109 | http://dbpedia.org/resource/Netherlands_national_football_team 110 | http://dbpedia.org/resource/San_Siro 111 | http://dbpedia.org/resource/Willi_Landgraf 112 | http://dbpedia.org/resource/Erik_Meijer_(footballer) 113 | http://dbpedia.org/resource/New_Tivoli 114 | http://dbpedia.org/resource/Reinhold_Yabo 115 | http://dbpedia.org/resource/Alemannia_Aachen 116 | http://dbpedia.org/resource/Hertha_BSC 117 | http://dbpedia.org/resource/Borussia_Dortmund 118 | http://dbpedia.org/resource/Richard_Nixon 119 | http://dbpedia.org/resource/Watergate_scandal 120 | http://dbpedia.org/resource/Ping_Pong_Diplomacy 121 | http://dbpedia.org/resource/People's_Republic_of_China 122 | http://dbpedia.org/resource/The_Sun_(United_Kingdom) 123 | http://dbpedia.org/resource/The_Times 124 | http://dbpedia.org/resource/Greece 125 | http://dbpedia.org/resource/Eurozone 126 | http://dbpedia.org/resource/Enola_Gay 127 | http://dbpedia.org/resource/Hiroshima 128 | http://dbpedia.org/resource/World_War_II 129 | http://dbpedia.org/resource/Red_Army_Faction 130 | http://dbpedia.org/resource/Andreas_Baader 131 | http://dbpedia.org/resource/Ulrike_Meinhof 132 | http://dbpedia.org/resource/Hanns-Martin_Schleyer 133 | http://dbpedia.org/resource/Jacqueline_Kennedy_Onassis 134 | http://dbpedia.org/resource/John_F._Kennedy 135 | http://dbpedia.org/resource/Neil_Armstrong 136 | http://dbpedia.org/resource/Moon 137 | http://dbpedia.org/resource/Erich_Honecker 138 | http://dbpedia.org/resource/Neunkirchen,_Saarland 139 | http://dbpedia.org/resource/Macedonia_(Greece) 140 | http://dbpedia.org/resource/Greece 141 | http://dbpedia.org/resource/Barack_Obama 142 | http://dbpedia.org/resource/Angela_Merkel 143 | http://dbpedia.org/resource/John_F._Kennedy_International_Airport 144 | http://dbpedia.org/resource/John_P._Kennedy 145 | -------------------------------------------------------------------------------- /datasets/kore50-urls.txt.absent: -------------------------------------------------------------------------------- 1 | en.wikipedia.org/wiki/First_Lady_of_Argentina 2 | en.wikipedia.org/wiki/Kate_Perry 3 | en.wikipedia.org/wiki/Hurricane_(song) 4 | en.wikipedia.org/wiki/EMP_Museum 5 | en.wikipedia.org/wiki/Sony_Music_Entertainment 6 | en.wikipedia.org/wiki/Woodstock_Festival 7 | en.wikipedia.org/wiki/Orange_(telecommunications) 8 | en.wikipedia.org/wiki/Mars_bar 9 | en.wikipedia.org/wiki/Robert_Bosch_GmbH 10 | en.wikipedia.org/wiki/Thomas_Müller_(footballer) 11 | en.wikipedia.org/wiki/Thomas_Müller_(footballer) 12 | en.wikipedia.org/wiki/Mario_Gomez 13 | en.wikipedia.org/wiki/Ping_Pong_Diplomacy 14 | en.wikipedia.org/wiki/People's_Republic_of_China 15 | en.wikipedia.org/wiki/Hanns-Martin_Schleyer 16 | -------------------------------------------------------------------------------- /datasets/kore50.tsv: -------------------------------------------------------------------------------- 1 | targets context 2 | Jon, Angelina, Brad Angelina, her father Jon, 
and her partner Brad never played together in the same movie. 3 | Sharp, Bosch Bosch and Sharp are both home appliances producing companies. 4 | Carter, Hurricane, Dylan, Desire Dylan performed Hurricane about the black fighter Carter, from his album Desire. 5 | Microsoft, Bill, Cairo Cairo was the code name for a project at Microsoft from 1991 to 1996. Its charter was to build technologies for a next generation operating system that would fulfill the vision of Bill. 6 | Mayall, Blues, Rock, Eric Eric preferred to play Blues instead of Rock, so he joined Mayall 's band. 7 | Santana, Columbia, Sony, Dylan, Joel, Sinatra Despite featuring some of the most promininent musicians of their decade --- like Sinatra, Dylan, Joel, and Santana --- Columbia was aquired by Sony in the 1980s. 8 | John, Pixar, Cars Pixar produced Cars, and John directed it. 9 | Atletico, Real Atletico has beaten its archrival Real. 10 | China, Ping-Pong Diplomacy, Nixon, Watergate Nixon resigned after Watergate despite his success in the Ping-Pong Diplomacy with China. 11 | Orange, Apple, Mango While Apple is an electronics company, Mango is a clothing one and Orange is a communication one. 12 | Victoria, David David and Victoria added spice to their marriage. 13 | Mars, Bounty, Galaxy Mars, Galaxy, and Bounty are all chocolate. 14 | Theo, Karl Karl and Theo made their extreme fortunes selling low-price groceries. 15 | Obama, Merkel, JFK Obama welcomed Merkel upon her arrival at JFK. 16 | Onassis, Kennedy Onassis married Kennedy on October 20, 1968. 17 | Aachen, Yabo Yabo plays for Aachen. 18 | Greece, Macedonia Macedonia is a province of Greece. 19 | Dylan, Hendrix, Allen, EMP, Seattle Allen founded the EMP in Seattle, which featured exhibitions about Hendrix and Dylan, but also about various science fiction movies. 20 | Madonna, Carlos, Eva Madonna played Eva and was seen with Carlos. 21 | Jobs, Stanford, Baez Jobs and Baez dated in the late 1970s, and she performed at his Stanford memorial. 22 | Haug, Red Bull Haug congratulated Red Bull. 23 | ngland., Müller Müller scored a hattrick against England. 24 | Hiroshima, Second World War, Enola Gay The Enola Gay bombed Hiroshima at the end of Second World War. 25 | an Siro., Völler , ranje Völler will never forget the match against Oranje in San Siro. 26 | Hertha, Dortmund Hertha won against Dortmund. 27 | US Open, Tiger Tiger lost the US Open. 28 | Victoria, David David and Victoria named their children Brooklyn, Romeo, Cruz, and Harper Seven. 29 | Erich, Neunkirchen Erich was born in Neunkirchen. 30 | MTV, Stefani, Twitter, Justin, Kate Justin, Stefani, and Kate are among the most popular people on both MTV and Twitter. 31 | Emirates, Gunners The Gunners now play their home matches at the Emirates. 32 | Tiger, Elin Tiger was lost in the woods when he got divorced from Elin. 33 | Mario, Munich, Thomas Thomas and Mario are strikers playing in Munich. 34 | City, Spurs City won 3:2 against the Spurs. 35 | Moon, Armstrong Armstrong was the first man on the Moon. 36 | Kennedy Kennedy was also an active politician, yet he is most known for his writings, some of which he published under the name of Mark Littleton. 37 | Apple, Steve, Stanford After the death of Steve, the former CEO of Apple, his commencement speech at Stanford was watched thousands of times. 38 | Jones, Homme, Grohl The group formed by Homme, Grohl, and Jones was supposed to be named Caligula, but the name was already taken. 
39 | Paris, Kim Paris and Kim are both wealthy It Girls who had sex tapes on the Internet. 40 | Mitchell, Woodstock, Chicago, Davis, Isle of Wight festival The Isle of Wight festival in 1970 was the biggest at its time, surpassing Woodstock with acts like Davis, Chicago, and Mitchell. 41 | Windows, Steve, Stanford, Microsoft In 1980, Steve dropped out of Stanford to join Microsoft, the company behind the Windows operating system. 42 | Meijer, Tivoli, Landgraf Landgraf and Meijer played at the Tivoli. 43 | Joey, Harris, Desire Desire contains a duet with Harris in the song Joey. 44 | Sergey, Larry, Bill, Steve Steve, Bill, Sergey, and Larry have drawn a great deal of admiration these days for their pioneering successes that changed the world we live in. 45 | Cash, American Recordings, Rubin After unsuccessful years, aging country star Cash made a grandiose comeback with his American Recordings, recorded at his home with the help of Rubin. 46 | Madonna, First Lady In this musical, Madonna played the role of the First Lady. 47 | The Times, The Sun, Euro, Greece The Sun and The Times reported that Greece will have to leave the Euro soon. 48 | Beck, Page, Clapton Three of the greatest guitarists started their career in a single band : Clapton, Beck, and Page. 49 | Vegas, Heidi, Seal Heidi and her husband Seal live in Vegas. 50 | Sam, Equity International Sam, the co-founder of Equity International, was given the nickname of "the grave dancer" because of his ability to buy businesses that others thought were dead. 51 | Schleyer, RAF, Meinhof, Baader The RAF was a terrorist group led by Baader and Meinhof that killed Schleyer. 52 | -------------------------------------------------------------------------------- /datasets/kore50.ttl.contexts.tsv: -------------------------------------------------------------------------------- 1 | targets contexts 2 | Desire contains a duet with Harris in the song Joey. 3 | Paris and Kim are both wealthy It Girls who had sex tapes on the Internet. 4 | Steve, Bill, Sergey, and Larry have drawn a great deal of admiration these days for their pioneering successes that changed the world we live in. 5 | Obama welcomed Merkel upon her arrival at JFK. 6 | Cairo was the code name for a project at Microsoft from 1991 to 1996. Its charter was to build technologies for a next generation operating system that would fulfill the vision of Bill. 7 | Three of the greatest guitarists started their career in a single band : Clapton, Beck, and Page. 8 | Dylan performed Hurricane about the black fighter Carter, from his album Desire. 9 | Tiger was lost in the woods when he got divorced from Elin. 10 | While Apple is an electronics company, Mango is a clothing one and Orange is a communication one. 11 | The Enola Gay bombed Hiroshima at the end of Second World War. 12 | The RAF was a terrorist group led by Baader and Meinhof that killed Schleyer. 13 | Müller scored a hattrick against England. 14 | Macedonia is a province of Greece. 15 | Mars, Galaxy, and Bounty are all chocolate. 16 | Sam, the co-founder of Equity International, was given the nickname of "the grave dancer" because of his ability to buy businesses that others thought were dead. 17 | After unsuccessful years, aging country star Cash made a grandiose comeback with his American Recordings, recorded at his home with the help of Rubin. 18 | The Gunners now play their home matches at the Emirates. 19 | Pixar produced Cars, and John directed it. 20 | Heidi and her husband Seal live in Vegas. 
21 | Bosch and Sharp are both home appliances producing companies. 22 | Karl and Theo made their extreme fortunes selling low-price groceries. 23 | Despite featuring some of the most promininent musicians of their decade --- like Sinatra, Dylan, Joel, and Santana --- Columbia was aquired by Sony in the 1980s. 24 | Thomas and Mario are strikers playing in Munich. 25 | In 1980, Steve dropped out of Stanford to join Microsoft, the company behind the Windows operating system. 26 | After the death of Steve, the former CEO of Apple, his commencement speech at Stanford was watched thousands of times. 27 | Yabo plays for Aachen. 28 | City won 3:2 against the Spurs. 29 | Jobs and Baez dated in the late 1970s, and she performed at his Stanford memorial. 30 | Tiger lost the US Open. 31 | Armstrong was the first man on the Moon. 32 | Allen founded the EMP in Seattle, which featured exhibitions about Hendrix and Dylan, but also about various science fiction movies. 33 | In this musical, Madonna played the role of the First Lady. 34 | Hertha won against Dortmund. 35 | The Sun and The Times reported that Greece will have to leave the Euro soon. 36 | David and Victoria added spice to their marriage. 37 | Justin, Stefani, and Kate are among the most popular people on both MTV and Twitter. 38 | Haug congratulated Red Bull. 39 | Kennedy was also an active politician, yet he is most known for his writings, some of which he published under the name of Mark Littleton. 40 | Landgraf and Meijer played at the Tivoli. 41 | David and Victoria named their children Brooklyn, Romeo, Cruz, and Harper Seven. 42 | Madonna played Eva and was seen with Carlos. 43 | Eric preferred to play Blues instead of Rock, so he joined Mayall 's band. 44 | The Isle of Wight festival in 1970 was the biggest at its time, surpassing Woodstock with acts like Davis, Chicago, and Mitchell. 45 | Atletico has beaten its archrival Real. 46 | Onassis married Kennedy on October 20, 1968. 47 | Völler will never forget the match against Oranje in San Siro. 48 | Nixon resigned after Watergate despite his success in the Ping-Pong Diplomacy with China. 49 | Erich was born in Neunkirchen. 50 | Angelina, her father Jon, and her partner Brad never played together in the same movie. 51 | The group formed by Homme, Grohl, and Jones was supposed to be named Caligula, but the name was already taken. 
52 | -------------------------------------------------------------------------------- /datasets/kore50.ttl.phrases.tsv: -------------------------------------------------------------------------------- 1 | targets contexts 2 | Greece 3 | Cash 4 | Vegas 5 | Karl 6 | Jobs 7 | Müller 8 | Grohl 9 | Madonna 10 | Theo 11 | The Sun 12 | Eva 13 | Eva 14 | Kim 15 | Greece 16 | Vegas 17 | Hurricane 18 | Moon 19 | American Recordings 20 | Red Bull 21 | Hiroshima 22 | Equity International 23 | Enola Gay 24 | Isle of Wight festival 25 | Microsoft 26 | Allen 27 | Rubin 28 | RAF 29 | Hurricane 30 | Eric 31 | Elin 32 | Mayall 33 | Davis 34 | David 35 | Mario 36 | Munich 37 | Cars 38 | Schleyer 39 | Onassis 40 | Kennedy 41 | Bill 42 | David 43 | Neunkirchen 44 | The Times 45 | City 46 | Victoria 47 | Mitchell 48 | First Lady 49 | Greece 50 | Tiger 51 | Schleyer 52 | Baader 53 | Stanford 54 | Apple 55 | Steve 56 | Spurs 57 | San Siro 58 | Apple 59 | Dylan 60 | Steve 61 | Jones 62 | Tivoli 63 | Isle of Wight festival 64 | Landgraf 65 | Stanford 66 | Steve 67 | Steve 68 | Jon 69 | Pixar 70 | Rubin 71 | Larry 72 | Sony 73 | Allen 74 | Euro 75 | Desire 76 | Sam 77 | Meijer 78 | Neunkirchen 79 | Columbia 80 | Carter 81 | Tiger 82 | Macedonia 83 | Second World War 84 | US Open 85 | Joey 86 | Beck 87 | MTV 88 | Chicago 89 | Erich 90 | Carlos 91 | Desire 92 | Desire 93 | Merkel 94 | Columbia 95 | Enola Gay 96 | Dortmund 97 | Justin 98 | Heidi 99 | Homme 100 | Stefani 101 | Aachen 102 | Cairo 103 | Aachen 104 | Dylan 105 | Sharp 106 | Dylan 107 | Kate 108 | Steve 109 | David 110 | Armstrong 111 | Oranje 112 | Galaxy 113 | Greece 114 | Meijer 115 | City 116 | American Recordings 117 | Atletico 118 | Twitter 119 | Baez 120 | Cars 121 | Woodstock 122 | Rock 123 | Sinatra 124 | Obama 125 | Apple 126 | Gunners 127 | Hendrix 128 | Carter 129 | Haug 130 | Dylan 131 | Meinhof 132 | Pixar 133 | Heidi 134 | Cairo 135 | Hiroshima 136 | EMP 137 | Seattle 138 | Baez 139 | Stanford 140 | Kennedy 141 | Yabo 142 | England 143 | Sam 144 | Bounty 145 | Völler 146 | Harris 147 | Tivoli 148 | Sony 149 | EMP 150 | Seal 151 | Microsoft 152 | Microsoft 153 | Joey 154 | Oranje 155 | Chicago 156 | Clapton 157 | Ping-Pong Diplomacy 158 | Stefani 159 | Windows 160 | Seattle 161 | Landgraf 162 | Rock 163 | Joel 164 | Mario 165 | JFK 166 | Tiger 167 | Second World War 168 | Galaxy 169 | Brad 170 | John 171 | Sinatra 172 | Brad 173 | Watergate 174 | Cash 175 | Orange 176 | Tiger 177 | Blues 178 | US Open 179 | Spurs 180 | Real 181 | Kennedy 182 | Stanford 183 | Meinhof 184 | First Lady 185 | Justin 186 | Watergate 187 | Bill 188 | Elin 189 | Jobs 190 | Angelina 191 | RAF 192 | Erich 193 | Euro 194 | Stanford 195 | Hertha 196 | Thomas 197 | Emirates 198 | Baader 199 | Mayall 200 | Twitter 201 | Harris 202 | Grohl 203 | Mitchell 204 | Emirates 205 | Mars 206 | Red Bull 207 | Apple 208 | Orange 209 | Sharp 210 | Thomas 211 | Victoria 212 | Kate 213 | Sergey 214 | Madonna 215 | Davis 216 | Gunners 217 | Völler 218 | Victoria 219 | Kim 220 | Nixon 221 | Carlos 222 | Microsoft 223 | Karl 224 | Armstrong 225 | Larry 226 | Joel 227 | Merkel 228 | The Sun 229 | Bosch 230 | Bill 231 | Paris 232 | Page 233 | David 234 | Sergey 235 | Santana 236 | Nixon 237 | Desire 238 | The Times 239 | Bosch 240 | Bill 241 | Angelina 242 | Woodstock 243 | Hendrix 244 | Santana 245 | Haug 246 | Dylan 247 | JFK 248 | Mars 249 | Equity International 250 | Madonna 251 | Blues 252 | Mango 253 | Page 254 | Dylan 255 | Stanford 256 | Clapton 257 | China 258 | Madonna 259 | 
Windows 260 | Yabo 261 | San Siro 262 | Ping-Pong Diplomacy 263 | Homme 264 | Hertha 265 | Mango 266 | Real 267 | Victoria 268 | Theo 269 | Munich 270 | England 271 | Jones 272 | Eric 273 | John 274 | Atletico 275 | Bounty 276 | Kennedy 277 | Steve 278 | Müller 279 | Jon 280 | Paris 281 | Obama 282 | Onassis 283 | Moon 284 | MTV 285 | Dortmund 286 | Seal 287 | China 288 | Macedonia 289 | Beck 290 | -------------------------------------------------------------------------------- /datasets/n3-reuters-128-urls.txt.absent: -------------------------------------------------------------------------------- 1 | aksw.org/notInWiki/Motorola_Inc 2 | de.en.wikipedia.org/wiki/Cocoa_Producers’_Alliance 3 | en.wikipedia.org/wiki/Public_Service_Company_of_Colorado 4 | aksw.org/notInWiki/John_Dosher 5 | en.wikipedia.org/wiki/Shelly%27s_Inc 6 | aksw.org/notInWiki/Advanced_Micro 7 | aksw.org/notInWiki/Union_Rural_Electric_Association_of_Brighton 8 | aksw.org/notInWiki/Tehran_Radio 9 | aksw.org/notInWiki/Union_Rural_Electric_Association_of_Brighton 10 | aksw.org/notInWiki/Meyer_Detective_Agency_Inc 11 | en.wikipedia.org/wiki/Federal_Ministry_of_Economics_and_Technology_(Germany) 12 | aksw.org/notInWiki/Thomson_McKinnon 13 | aksw.org/notInWiki/Southmark_Corp 14 | aksw.org/notInWiki/Erskine_Resources_Ltd 15 | en.wikipedia.org/wiki/AT%26T_Corporation 16 | aksw.org/notInWiki/West_Point-Pepperell_Inc 17 | aksw.org/notInWiki/Shopsmith_Inc 18 | aksw.org/notInWiki/John_Wineapple 19 | aksw.org/notInWiki/Sterling_investment_banking_group 20 | en.wikipedia.org/wiki/The_Reader's_Digest_Association 21 | aksw.org/notInWiki/Paul_Oreffice 22 | aksw.org/notInWiki/Leonardo_Brito 23 | aksw.org/notInWiki/French_Federation_of_Non-Ferrous_Metals 24 | aksw.org/notInWiki/Interstate_Properties 25 | aksw.org/notInWiki/IVB_Financial_Corp 26 | en.wikipedia.org/wiki/London_Metal_Exchange 27 | en.wikipedia.org/wiki/The_Reynolds_and_Reynolds_Company 28 | aksw.org/notInWiki/Madeira_Inc 29 | aksw.org/notInWiki/Robert_W_Scherer 30 | en.wikipedia.org/wiki/Banque_de_France 31 | aksw.org/notInWiki/Digital_Communications_Associates_Inc 32 | de.en.wikipedia.org/wiki/Montedison 33 | aksw.org/notInWiki/Pancontinental_Oil_Ltd 34 | aksw.org/notInWiki/Eileen_Gormley 35 | aksw.org/notInWiki/W.B._Saunders_Co_of_Canada_Ltd 36 | aksw.org/notInWiki/G_H_Shintoh 37 | aksw.org/notInWiki/U.S._District_Court 38 | de.en.wikipedia.org/wiki/Lohn_(Eschweiler) 39 | aksw.org/notInWiki/William_West 40 | aksw.org/notInWiki/AgrimontSPA 41 | en.wikipedia.org/wiki/Data_I/O 42 | aksw.org/notInWiki/Eileen_Gormley 43 | aksw.org/notInWiki/Michel_Dufour 44 | aksw.org/notInWiki/Victorian_Corporate_Affairs_Commission 45 | aksw.org/notInWiki/American_Midland_Corp 46 | aksw.org/notInWiki/AgrimontSPA 47 | aksw.org/notInWiki/James_Ottinger 48 | aksw.org/notInWiki/Sun_Refining_and_Marketing_Co 49 | aksw.org/notInWiki/Larry_Taylor 50 | aksw.org/notInWiki/Datron_Corp 51 | aksw.org/notInWiki/Agrimont_Group 52 | de.en.wikipedia.org/wiki/Cocoa_Producers’_Alliance 53 | aksw.org/notInWiki/Coffin 54 | de.en.wikipedia.org/wiki/Montedison 55 | aksw.org/notInWiki/AgrimontSPA 56 | aksw.org/notInWiki/Jerzy_Urban 57 | aksw.org/notInWiki/Advanced_Voice_Technologies 58 | aksw.org/notInWiki/Holt,_Rinehart_and_Winston_Canada_Ltd 59 | aksw.org/notInWiki/Industrial_Valley_Title_Insurance_Co 60 | aksw.org/notInWiki/Joseph_DiGaicomo_Jr. 
61 | aksw.org/notInWiki/Colorado_Rural_Electric_Association 62 | aksw.org/notInWiki/Synergen_Inc 63 | en.wikipedia.org/wiki/Rolls-Royce_plc 64 | aksw.org/notInWiki/Interstate_Properties 65 | en.wikipedia.org/wiki/Merck_%26_Co. 66 | aksw.org/notInWiki/Thermo-Print_GmbH 67 | aksw.org/notInWiki/Wedgestone_Realty_Investors_Trust 68 | aksw.org/notInWiki/CMS_Enhancements_Inc 69 | de.en.wikipedia.org/wiki/Montedison 70 | aksw.org/notInWiki/Mosaic_Systems_Inc 71 | aksw.org/notInWiki/Federal_Paperboard_Co_Inc 72 | aksw.org/notInWiki/Datron_Corp 73 | aksw.org/notInWiki/Stuart_Weisbrod 74 | en.wikipedia.org/wiki/Federal_Ministry_of_Economics_and_Technology_(Germany) 75 | aksw.org/notInWiki/Capital_Investigations_and_Protective_Agency 76 | en.wikipedia.org/wiki/Rolls-Royce_plc 77 | aksw.org/notInWiki/John_Dosher 78 | aksw.org/notInWiki/Robert_W_Scherer 79 | aksw.org/notInWiki/Gordon_Cain 80 | en.wikipedia.org/wiki/Public_Service_Company_of_Colorado 81 | aksw.org/notInWiki/Entourage_International_Inc 82 | aksw.org/notInWiki/Makoto_Kuroda 83 | aksw.org/notInWiki/Victorian_Corporate_Affairs_Commission 84 | de.en.wikipedia.org/wiki/Montedison 85 | aksw.org/notInWiki/Shared_Network_Technologies_Inc 86 | aksw.org/notInWiki/Erskine_Resources_Ltd 87 | en.wikipedia.org/wiki/Boettcher_and_Co_Inc 88 | aksw.org/notInWiki/Advanced_Cardiovascular_Systems_Inc 89 | en.wikipedia.org/wiki/Amsterdam_Stock_Exchange 90 | en.wikipedia.org/wiki/International_Monetary_Fund 91 | aksw.org/notInWiki/Pace_Consultants_Inc 92 | aksw.org/notInWiki/CNA_Income_Shares_Inc 93 | en.wikipedia.org/wiki/W.R._Grace 94 | en.wikipedia.org/wiki/Moody's_Investors_Service 95 | aksw.org/notInWiki/Bashaw_Leduc_Oil_and_Gas_Ltd 96 | aksw.org/notInWiki/Avondale_Mills 97 | en.wikipedia.org/wiki/W.R._Grace 98 | en.wikipedia.org/wiki/Rolls-Royce_plc 99 | aksw.org/notInWiki/Ferruzzi_Groups 100 | en.wikipedia.org/wiki/General_Electric_Company_plc 101 | aksw.org/notInWiki/Advanced_Voice_Technologies 102 | en.wikipedia.org/wiki/Moody's_Investors_Service 103 | aksw.org/notInWiki/Samuel_Montagu_and_Sons_Ltd 104 | aksw.org/notInWiki/French_Federation_of_Non-Ferrous_Metals 105 | aksw.org/notInWiki/INTEL_Corp_INTC 106 | aksw.org/notInWiki/Pemberton_Houston_Willoughby_Bell_Gouinlock_Inc 107 | en.wikipedia.org/wiki/General_Electric_Company_plc 108 | aksw.org/notInWiki/Yankee_Cos_Inc 109 | en.wikipedia.org/wiki/General_Electric_Company_plc 110 | aksw.org/notInWiki/Victorian_Corporate_Affairs_Commission 111 | aksw.org/notInWiki/Holt,_Rinehart_and_Winston_Canada_Ltd 112 | aksw.org/notInWiki/Cartel_Security_Consultants_Inc 113 | en.wikipedia.org/wiki/Welsh,_Carson,_Anderson_%26_Stowe 114 | en.wikipedia.org/wiki/Rolls-Royce_plc 115 | en.wikipedia.org/wiki/Eastman_Kodak 116 | de.en.wikipedia.org/wiki/Cornelis_van_der_Klugt 117 | aksw.org/notInWiki/Jerzy_Urban 118 | aksw.org/notInWiki/Amplicon_Inc 119 | aksw.org/notInWiki/Robert_W_Scherer 120 | aksw.org/notInWiki/CMS_Enhancements_Inc 121 | aksw.org/notInWiki/Advanced_Micro 122 | aksw.org/notInWiki/John_R._Folkerth 123 | de.en.wikipedia.org/wiki/Cornelis_van_der_Klugt 124 | en.wikipedia.org/wiki/Dillon,_Read_%26_Co. 
125 | aksw.org/notInWiki/Freeport-McMoRan_Oil_and_Gas_Royalty_Trust 126 | en.wikipedia.org/wiki/International_Monetary_Fund 127 | en.wikipedia.org/wiki/Dominion_Textiles 128 | aksw.org/notInWiki/Industrial_Valley_Title_Insurance_Co 129 | aksw.org/notInWiki/Jerzy_Urban 130 | en.wikipedia.org/wiki/Alex._Brown_%26_Sons 131 | aksw.org/notInWiki/Advanced_Voice_Technologies 132 | aksw.org/notInWiki/Certified_Security_Services_Inc 133 | aksw.org/notInWiki/Harcourt_Brace_Jovanovich_Canada_Inc 134 | aksw.org/notInWiki/Paul_OKelly 135 | aksw.org/notInWiki/Montedisons_Agro-Industrial 136 | aksw.org/notInWiki/Gianfranco_Ceroni 137 | aksw.org/notInWiki/John_Durant 138 | aksw.org/notInWiki/Renato_Picco 139 | aksw.org/notInWiki/GGFH_Inc 140 | aksw.org/notInWiki/James_Adams 141 | aksw.org/notInWiki/Sci-Med_Life_Systems_Inc 142 | aksw.org/notInWiki/C.H._Masland 143 | de.en.wikipedia.org/wiki/Montedison 144 | aksw.org/notInWiki/Northern_Telecom_LTd 145 | aksw.org/notInWiki/Sun_Co 146 | aksw.org/notInWiki/John_Dosher 147 | aksw.org/notInWiki/Datron_Corp 148 | aksw.org/notInWiki/Datron_Corp 149 | aksw.org/notInWiki/Wedgestone_Advisory_Corp 150 | en.wikipedia.org/wiki/Fruehauf_Corporation 151 | aksw.org/notInWiki/Union_Rural_Electric_Association_of_Brighton 152 | aksw.org/notInWiki/Christopher_Hogg 153 | aksw.org/notInWiki/Union_Rural_Electric_Association_of_Brighton 154 | aksw.org/notInWiki/William_Randol 155 | en.wikipedia.org/wiki/Standard_%26_Poor's 156 | en.wikipedia.org/wiki/Stifel_Nicolaus 157 | aksw.org/notInWiki/Les_Editions_HRW_Ltd 158 | en.wikipedia.org/wiki/Republic_Airlines 159 | en.wikipedia.org/wiki/London_Metal_Exchange 160 | aksw.org/notInWiki/Picker_International_Inc 161 | en.wikipedia.org/wiki/Lloyds_Bank_of_Canada 162 | aksw.org/notInWiki/Italiana_Olii_e_Sifi 163 | aksw.org/notInWiki/Advanced_Micro_Devices_Inc 164 | aksw.org/notInWiki/Union_Rural_Electric_Association_of_Brighton 165 | aksw.org/notInWiki/Edward_Johnson 166 | aksw.org/notInWiki/Highland_Superstores_Inc 167 | aksw.org/notInWiki/Le_Peep_Restaurants_Inc 168 | aksw.org/notInWiki/William_West 169 | en.wikipedia.org/wiki/Hambrecht_%26_Quist 170 | aksw.org/notInWiki/National_Guardian_Corp 171 | aksw.org/notInWiki/Leonardo_Brito 172 | aksw.org/notInWiki/Bertil_Nordin 173 | en.wikipedia.org/wiki/Moody's_Investors_Service 174 | de.en.wikipedia.org/wiki/Cocoa_Producers’_Alliance 175 | aksw.org/notInWiki/Michel_Dufour 176 | aksw.org/notInWiki/Shared_Network_Technologies_Inc 177 | en.wikipedia.org/wiki/Federal_Ministry_of_Transport,_Building_and_Urban_Development 178 | aksw.org/notInWiki/Paul_Oreffice 179 | aksw.org/notInWiki/Union_Rural_Electric_Association_of_Brighton 180 | aksw.org/notInWiki/Braodway_Casinos_Inc 181 | aksw.org/notInWiki/New_Jersey_Hospital_Association 182 | aksw.org/notInWiki/American_Midland_Corp 183 | aksw.org/notInWiki/Thomas_Bell 184 | aksw.org/notInWiki/Ozaki_Trading_Co_Ltd 185 | en.wikipedia.org/wiki/General_Electric_Company_plc 186 | aksw.org/notInWiki/Federal_Realty_Investment_Trust 187 | en.wikipedia.org/wiki/Rolls-Royce_plc 188 | aksw.org/notInWiki/C.S.C._Security_Gaurd_Service 189 | aksw.org/notInWiki/Advanced_Micro 190 | en.wikipedia.org/wiki/Alex._Brown_%26_Sons 191 | aksw.org/notInWiki/Tehran_Radio 192 | aksw.org/notInWiki/American_Telephone_and_Telegraph_Co 193 | aksw.org/notInWiki/Christopher_Hogg 194 | aksw.org/notInWiki/Ferruzzi_Groups 195 | en.wikipedia.org/wiki/Kidder,_Peabody_%26_Co. 
196 | en.wikipedia.org/wiki/AT%26T_Corporation 197 | aksw.org/notInWiki/Ozaki_Trading_Co_Ltd 198 | aksw.org/notInWiki/Larry_Taylor 199 | aksw.org/notInWiki/Leonardo_Brito 200 | aksw.org/notInWiki/Preston_corp 201 | aksw.org/notInWiki/G_H_Shintoh 202 | aksw.org/notInWiki/GGFH_Inc 203 | aksw.org/notInWiki/Ozaki_Trading_Co_Ltd 204 | aksw.org/notInWiki/G_H_Shintoh 205 | aksw.org/notInWiki/Advanced_Voice_Technologies 206 | en.wikipedia.org/wiki/Public_Service_Company_of_Colorado 207 | aksw.org/notInWiki/G_H_Shintoh 208 | en.wikipedia.org/wiki/Dillon,_Read_%26_Co. 209 | en.wikipedia.org/wiki/Standard_%26_Poor%27s 210 | aksw.org/notInWiki/Leonardo_Brito 211 | aksw.org/notInWiki/J._Terence_Murray 212 | aksw.org/notInWiki/Entourage_International_Inc 213 | aksw.org/notInWiki/Tradevest_Inc 214 | aksw.org/notInWiki/John_Dosher 215 | aksw.org/notInWiki/William_Randol 216 | en.wikipedia.org/wiki/Republic_Airlines 217 | aksw.org/notInWiki/Burns,_Pauli_and_Co_Inc 218 | aksw.org/notInWiki/Entourage_International_Inc 219 | en.wikipedia.org/wiki/Federal_government_of_the_United_States 220 | aksw.org/notInWiki/Union_Rural_Electric_Association_of_Brighton 221 | aksw.org/notInWiki/Michael_Smith 222 | aksw.org/notInWiki/Atlas_Consolidated_Mining_and_Development_Corp 223 | aksw.org/notInWiki/AgrimontSPA 224 | aksw.org/notInWiki/Federal_Paperboard_Co_Inc 225 | aksw.org/notInWiki/Paul_Oreffice 226 | en.wikipedia.org/wiki/Compagnie_Française_d'Assurance_pour_le_Commerce_Extérieur 227 | aksw.org/notInWiki/Howard_Fromkin 228 | de.en.wikipedia.org/wiki/Royal_Cosun 229 | en.wikipedia.org/wiki/Kidder,_Peabody_%26_Co. 230 | aksw.org/notInWiki/Spear_Securities_Inc 231 | aksw.org/notInWiki/Cain_Chemical_Inc 232 | aksw.org/notInWiki/OBrien-Kreitzberg 233 | en.wikipedia.org/wiki/Merck_%26_Co. 234 | aksw.org/notInWiki/Association_of_White_Metals 235 | en.wikipedia.org/wiki/The_Reynolds_and_Reynolds_Company 236 | en.wikipedia.org/wiki/The_Reynolds_and_Reynolds_Company 237 | en.wikipedia.org/wiki/General_Electric_Company_plc 238 | aksw.org/notInWiki/Biotherapeutics_Inc 239 | aksw.org/notInWiki/Victorian_Corporate_Affairs_Commission 240 | de.en.wikipedia.org/wiki/Montedison 241 | aksw.org/notInWiki/Les_Hosking 242 | aksw.org/notInWiki/Bertil_Nordin 243 | aksw.org/notInWiki/Bashaw_Leduc_Oil_and_Gas_Ltd 244 | en.wikipedia.org/wiki/International_Monetary_Fund 245 | en.wikipedia.org/wiki/E._F._Hutton_%26_Co. 246 | en.wikipedia.org/wiki/Stifel_Nicolaus 247 | aksw.org/notInWiki/Rockwell_International_ROK 248 | aksw.org/notInWiki/Eridania_SPA 249 | aksw.org/notInWiki/Spear_Securities_Inc 250 | aksw.org/notInWiki/Immunex_Corp 251 | en.wikipedia.org/wiki/Public_Service_Company_of_Colorado 252 | aksw.org/notInWiki/James_Ottinger 253 | aksw.org/notInWiki/Gordon_Cain 254 | en.wikipedia.org/wiki/Banque_de_France 255 | en.wikipedia.org/wiki/Standard_%26_Poor's 256 | aksw.org/notInWiki/Federal_Paperboard_Co_Inc 257 | aksw.org/notInWiki/Eileen_Gormley 258 | de.en.wikipedia.org/wiki/Montedison 259 | aksw.org/notInWiki/Raymond_Savoie 260 | aksw.org/notInWiki/Asiavest_Pty_Ltd 261 | aksw.org/notInWiki/Datron_Corp 262 | aksw.org/notInWiki/Monobloc_U.S.A. 
263 | en.wikipedia.org/wiki/Ministry_of_Social_Affairs_and_Employment_(Netherlands) 264 | aksw.org/notInWiki/Madeira_Inc 265 | aksw.org/notInWiki/Sci-Med_Life_Systems_Inc 266 | aksw.org/notInWiki/A-T-E_Security_Group_Inc 267 | aksw.org/notInWiki/North_Country_Media_Group 268 | en.wikipedia.org/wiki/The_Reader's_Digest_Association 269 | en.wikipedia.org/wiki/Moody's_Investors_Service 270 | aksw.org/notInWiki/Victorian_Corporate_Affairs_Commission 271 | aksw.org/notInWiki/International_Business_Machines_Corp_IBM 272 | aksw.org/notInWiki/Tri-State_Generation_and_Transmission_Association 273 | en.wikipedia.org/wiki/Eastman_Kodak 274 | aksw.org/notInWiki/Harcourt_Brace_Jovanovich_Canada_Inc 275 | aksw.org/notInWiki/Eileen_Gormley 276 | aksw.org/notInWiki/Avondale_Mills 277 | aksw.org/notInWiki/Universal_Medical_Buildings_L_P 278 | aksw.org/notInWiki/AutoSpa_corp 279 | aksw.org/notInWiki/Eberstadt_Fleming_Inc 280 | aksw.org/notInWiki/Nantucket_Industries_Inc 281 | aksw.org/notInWiki/Robert_W_Scherer 282 | aksw.org/notInWiki/Security_Services_of_America 283 | aksw.org/notInWiki/Custom_Security_Services 284 | aksw.org/notInWiki/Johnson_Redbook 285 | aksw.org/notInWiki/Tehran_Radio 286 | aksw.org/notInWiki/Fidelcor_Inc 287 | aksw.org/notInWiki/Datron_Corp 288 | aksw.org/notInWiki/Terry_Hampton 289 | en.wikipedia.org/wiki/The_Reynolds_and_Reynolds_Company 290 | aksw.org/notInWiki/Sci-Med_Life_Systems_Inc 291 | aksw.org/notInWiki/Ponderosa_Inc 292 | aksw.org/notInWiki/Cain_Chemical_Inc 293 | aksw.org/notInWiki/American_Midland_Corp 294 | aksw.org/notInWiki/Advanced_Voice_Technologies 295 | aksw.org/notInWiki/Union_Rural_Electric_Association_of_Brighton 296 | aksw.org/notInWiki/Bruce_Smart 297 | aksw.org/notInWiki/Michel_Dufour 298 | aksw.org/notInWiki/Hans_van_Liemt 299 | en.wikipedia.org/wiki/International_Monetary_Fund 300 | aksw.org/notInWiki/J._Terence_Murray 301 | aksw.org/notInWiki/AgrimontSPA 302 | aksw.org/notInWiki/Wells_Fargo_Alarm_Services 303 | de.en.wikipedia.org/wiki/Cocoa_Producers’_Alliance 304 | aksw.org/notInWiki/Celltronics_Inc 305 | aksw.org/notInWiki/CNA_Income_Shares_Inc 306 | aksw.org/notInWiki/Blocker_Energy_corp 307 | en.wikipedia.org/wiki/The_Reynolds_and_Reynolds_Company 308 | aksw.org/notInWiki/Don_Bybee_and_Associates 309 | de.en.wikipedia.org/wiki/Cocoa_Producers’_Alliance 310 | en.wikipedia.org/wiki/Moody's_Investors_Service 311 | aksw.org/notInWiki/Ettore_dellIsola 312 | aksw.org/notInWiki/Foreign_Policy_Institute 313 | aksw.org/notInWiki/Home_Intensive_Care_Inc 314 | aksw.org/notInWiki/Hans_van_Liemt 315 | aksw.org/notInWiki/Association_of_White_Metals 316 | aksw.org/notInWiki/Alexanders_Inc 317 | -------------------------------------------------------------------------------- /datasets/singleton.tsv: -------------------------------------------------------------------------------- 1 | targets context 2 | Michael Jordan, John Smith, Richard Stallman, Linus Torvalds In a sense it’s an improvement on 'Rebirth,' even if a couple of the songs bear the scars of that period, particularly the Rick Rubinesque title track, and 'Popular,' which has some of the new wave sizzle he was toying with. 
3 | -------------------------------------------------------------------------------- /datasets/subset/1000_labels.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uhh-lt/kb2vec/c02250177267ca78ce0f5886b7229f6b95ce2b5a/datasets/subset/1000_labels.db -------------------------------------------------------------------------------- /datasets/subset/1000_long_abstracts.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uhh-lt/kb2vec/c02250177267ca78ce0f5886b7229f6b95ce2b5a/datasets/subset/1000_long_abstracts.db -------------------------------------------------------------------------------- /datasets/subset/1000_nodes_lookup.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uhh-lt/kb2vec/c02250177267ca78ce0f5886b7229f6b95ce2b5a/datasets/subset/1000_nodes_lookup.db -------------------------------------------------------------------------------- /datasets/test.phrases.tsv: -------------------------------------------------------------------------------- 1 | targets contexts 2 | Canada 3 | Lil Wayne 4 | Madonna 5 | San Francisco 6 | -------------------------------------------------------------------------------- /datasets/top-cities.txt: -------------------------------------------------------------------------------- 1 | Abidjan 2 | Ahmedabad 3 | Alexandria 4 | Ankara 5 | Baghdad 6 | Bangalore 7 | Bangkok 8 | Beijing 9 | Berlin 10 | Busan 11 | Cairo 12 | Casablanca 13 | Chengdu 14 | Chongqing 15 | Delhi 16 | Dhaka 17 | Dongguan 18 | Durban 19 | Ekurhuleni 20 | Faisalabad 21 | Foshan 22 | Giza 23 | Guangzhou 24 | Hangzhou 25 | Hanoi 26 | Harbin 27 | Hefei 28 | Incheon 29 | Istanbul 30 | Jaipur 31 | Jakarta 32 | Jeddah 33 | Johannesburg 34 | Kabul 35 | Karachi 36 | Kinshasa 37 | Kolkata 38 | Lagos 39 | Lahore 40 | Lima 41 | London 42 | Madrid 43 | Moscow 44 | Mumbai 45 | Nairobi 46 | Nanjing 47 | Ningbo 48 | Pune 49 | Pyongyang 50 | Riyadh 51 | Santiago 52 | Seoul 53 | Shanghai 54 | Shantou 55 | Shenyang 56 | Shenzhen 57 | Singapore 58 | Surat 59 | Suzhou 60 | Tehran 61 | Tianjin 62 | Tokyo 63 | Wenzhou 64 | Wuhan 65 | Xiamen 66 | Yangon 67 | Yokohama 68 | Zhengzhou 69 | Zhongshan 70 | -------------------------------------------------------------------------------- /datasets/us-states.txt: -------------------------------------------------------------------------------- 1 | Alabama 2 | Alaska 3 | Arizona 4 | Arkansas 5 | California 6 | Colorado 7 | Connecticut 8 | Delaware 9 | Florida 10 | Hawaii 11 | Idaho 12 | Illinois 13 | Indiana 14 | Iowa 15 | Kansas 16 | Kentucky 17 | Louisiana 18 | Maine 19 | Maryland 20 | Massachusetts 21 | Michigan 22 | Minnesota 23 | Mississippi 24 | Missouri 25 | Montana 26 | Nebraska 27 | Nevada 28 | New Hampshire 29 | New Jersey 30 | New Mexico 31 | New York 32 | North Carolina 33 | North Dakota 34 | Ohio 35 | Oklahoma 36 | Oregon 37 | Pennsylvania 38 | Rhode Island 39 | South Carolina 40 | South Dakota 41 | Tennessee 42 | Texas 43 | Utah 44 | Vermont 45 | Virginia 46 | Washington 47 | Wisconsin 48 | Wyoming 49 | -------------------------------------------------------------------------------- /diffbot_api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import codecs 4 | import grequests 5 | from sqlitedict import SqliteDict 6 | from utils import ROOT_DIR 7 | from os.path import join 8 | from 
time import time 9 | 10 | 11 | endpoint_diffbot = "http://kg.diffbot.com/kg/dql_endpoint" 12 | 13 | ENTITY_TYPES = ["AdministrativeArea", "Article", "Corporation", 14 | "DegreeEntity", "EducationMajorEntity", "EducationalInstitution", 15 | "EmploymentCategory", "Image", "Intangible", "Landmark", "LocalBusiness", 16 | "Miscellaneous", "Organization", "Person", "Place", "Product", "Role", 17 | "Skill", "Video"] 18 | 19 | EL_ENTITY_TYPES = ["AdministrativeArea", "Corporation", "EducationalInstitution", 20 | "Landmark", "LocalBusiness", "Miscellaneous", "Organization", 21 | "Person", "Place", "Product"] 22 | 23 | EL_POL_ENTITY_TYPES = ["AdministrativeArea", "Corporation", "EducationalInstitution", 24 | "Landmark", "LocalBusiness", "Organization", 25 | "Person", "Place", "Product"] 26 | 27 | CACHED_QUERY_DB = join(join(ROOT_DIR, "cache"), "diffbot-query-cache.sqlite") 28 | 29 | 30 | class CachedQuery(object): 31 | def __init__(self, cache_fpath=CACHED_QUERY_DB): 32 | self._cache = SqliteDict(cache_fpath, autocommit=True) 33 | 34 | def __del__(self): 35 | try: 36 | self._cache.close() 37 | except: 38 | print("Warning: trying to close a closed cache.") 39 | 40 | def make_query(self, query): 41 | if query in self._cache: 42 | return self._cache[query] 43 | else: 44 | response = make_query(query) 45 | self._cache[query] = response 46 | return response 47 | 48 | def close(self): 49 | self._cache.close() 50 | 51 | def response2dict(self, response): 52 | return json.loads(response.content) 53 | 54 | def get_entity(self, db_uri): 55 | if db_uri in self._cache: 56 | return self._cache[db_uri] 57 | else: 58 | response = self._get_entity(db_uri) 59 | self._cache[db_uri] = response 60 | return response 61 | 62 | def _get_entity(self, db_uri): 63 | """ Takes as input a URI like http://www.diffbot.com/entity/CQSNBJBdRL7 and returns 64 | an entity.
""" 65 | 66 | db_uri = db_uri.replace("https:", "http:") 67 | 68 | data = { 69 | "token": get_token(), 70 | "query": "diffbotUri:{}".format(db_uri), 71 | "type": "query"} 72 | 73 | r = requests.get(endpoint_diffbot, params=data) 74 | 75 | return self.response2dict(r) 76 | 77 | 78 | # https://dev.kg.diffbot.com/kg/dql_endpoint?type=query&token=token&query=id:OIZzlT1rihy 79 | # https://www.diffbot.com/entity/OIZzlT1rihy 80 | 81 | 82 | token = None 83 | def get_token(): 84 | global token 85 | if token: 86 | return token 87 | else: 88 | with open("dbt", "r") as f: 89 | token = f.read().strip() 90 | return token 91 | 92 | 93 | def make_queries(queries, parallel=32): 94 | rs = [] 95 | for query in queries: 96 | data = { 97 | "token": get_token(), 98 | "query": query, 99 | "type": "query"} 100 | 101 | rs.append(grequests.get(endpoint_diffbot, params=data)) 102 | 103 | return grequests.map(rs, size=parallel) 104 | 105 | 106 | def make_query(query): 107 | data = { 108 | "token": get_token(), 109 | "query": query, 110 | "type": "query"} 111 | r = requests.get(endpoint_diffbot, params=data) 112 | 113 | return r 114 | 115 | 116 | def save2json(output_fpath, r): 117 | with codecs.open(output_fpath, "w", "utf-8") as out: 118 | out.write(json.dumps(json.loads(r.content))) 119 | print(output_fpath) 120 | 121 | 122 | def query_and_save(query, output_fpath): 123 | r = make_query(query) 124 | save2json(output_fpath, r) 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /fwd.sh: -------------------------------------------------------------------------------- 1 | ssh -L 8888:localhost:8888 -L 1234:localhost:1234 -L 8181:localhost:8181 lthead -L 8080:localhost:8080 diffbot 2 | -------------------------------------------------------------------------------- /generate_absent.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | from os.path import join 3 | from diffbot_api import make_query 4 | from utils import dbpedia2wikipedia 5 | from traceback import format_exc 6 | import json 7 | 8 | 9 | def get_hits(diffbot_query_response): 10 | hits_num = diffbot_query_response["hits"] 11 | data = diffbot_query_response["data"] 12 | 13 | types = [] 14 | for i, hit in enumerate(diffbot_query_response["data"]): 15 | types.append(hit["type"]) 16 | 17 | 18 | return hits_num, types 19 | 20 | 21 | def generate_absent_datasets(datasets_fpaths): 22 | saved = None 23 | 24 | for dataset_fpath in glob(datasets_fpaths): 25 | print(dataset_fpath) 26 | total_hits = 0 27 | total_urls = 0 28 | total_absent = 0 29 | 30 | with open(dataset_fpath, "r") as in_f, open(dataset_fpath + ".absent", "w") as out_f: 31 | for url in in_f: 32 | try: 33 | url = dbpedia2wikipedia(url.strip()) 34 | query = 'origins:"{}"'.format(url) 35 | r = make_query(query) 36 | db_response = json.loads(r.content) 37 | 38 | hits_num, types = get_hits(db_response) 39 | if url == "en.wikipedia.org/wiki/Russians": saved = db_response 40 | print(".", end="") 41 | total_urls += 1 42 | if hits_num == 0: 43 | total_absent += 1 44 | out_f.write("{}\n".format(url)) 45 | total_hits += hits_num 46 | 47 | except KeyboardInterrupt: 48 | break 49 | except: 50 | print(url, "error") 51 | print(format_exc()) 52 | print("\n") 53 | 54 | print("Absent urls:", total_absent) 55 | print("Total urls:", total_urls) 56 | print("Hits total for all urls:", total_hits) 57 | print("Avg. 
hits per url: {:.2f}".format(float(total_hits)/total_urls)) 58 | 59 | 60 | datasets_fpaths = "datasets/*txt" 61 | generate_absent_datasets(datasets_fpaths) 62 | -------------------------------------------------------------------------------- /linkers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uhh-lt/kb2vec/c02250177267ca78ce0f5886b7229f6b95ce2b5a/linkers/__init__.py -------------------------------------------------------------------------------- /linkers/baseline.py: -------------------------------------------------------------------------------- 1 | from converter import URIConverter 2 | import json 3 | from utils import truncated_log, overlap 4 | from candidate import Candidate 5 | from diffbot_api import CachedQuery, EL_POL_ENTITY_TYPES 6 | from ttl import parse_d2kb_ttl, CLASS_URI, LINK_URI, NONE_URI 7 | from rdflib import URIRef 8 | from random import random 9 | 10 | 11 | class TTLinker(object): 12 | def link_ttl(self, input_ttl): 13 | """ :param input_ttl a string with turtle (TTL) triples in the NIF format by GERBIL """ 14 | 15 | graph, contexts, phrases = parse_d2kb_ttl(input_ttl) 16 | input_len = len(graph) 17 | 18 | if len(contexts) > 1: 19 | print("Warning: more than one context is found. Using the first one.") 20 | context = contexts[0] 21 | elif len(contexts) == 0: 22 | print("Warning: no contexts found.") 23 | context = "" 24 | else: 25 | # only one context 26 | context = contexts[0] 27 | 28 | results = self.link(context, phrases) 29 | for phrase, candidate in results: 30 | if candidate and candidate.link and candidate.link != "": 31 | graph.add( (phrase.subj, LINK_URI, URIRef(candidate.link)) ) 32 | graph.add( (phrase.subj, CLASS_URI, URIRef(candidate.link)) ) 33 | else: 34 | print("Warning: can't link phrase '{}'@({}-{}): text='{}', uris='{}'".format( 35 | phrase.text, phrase.beg, phrase.end, candidate.text, "; ".join(candidate.uris))) 36 | print("".format(candidate)) 37 | graph.add( (phrase.subj, LINK_URI, NONE_URI) ) 38 | graph.add( (phrase.subj, CLASS_URI, NONE_URI) ) 39 | 40 | print("# triples input:", input_len) 41 | print("# triples output:", len(graph)) 42 | output_ttl = str(graph.serialize(format='n3', encoding="utf-8"), "utf-8") 43 | 44 | return output_ttl 45 | 46 | 47 | class BaselineLinker(TTLinker): 48 | def __init__(self, use_overlap=True, use_importance=True, verbose=True, lower=True): 49 | self._cq = CachedQuery() 50 | self._conv = URIConverter() 51 | self._use_overlap = use_overlap 52 | self._use_importance = use_importance 53 | self._verbose = verbose 54 | self._lower = lower 55 | 56 | def __del__(self): 57 | self.close() 58 | 59 | def close(self): 60 | try: 61 | self._cq.close() 62 | self._conv.close() 63 | except: 64 | print("Warning: trying to close a closed object.") 65 | 66 | def _get_uris(self, hit): 67 | uris = set() 68 | 69 | if "allUris" in hit: uris.union( set(hit["allUris"]) ) 70 | if "origins" in hit: uris.union( set(hit["origins"]) ) 71 | if "origin" in hit: uris.add( hit["origin"] ) 72 | 73 | return uris 74 | 75 | def _get_wikipedia_uri(self, hit, uris): 76 | wiki_uri = "" 77 | 78 | if "wikipediaUri" in hit: 79 | wiki_uri = hit["wikipediaUri"] 80 | uris.add(wiki_uri) 81 | else: 82 | # try to find via wikidata link 83 | for uri in uris: 84 | wiki_uri = self._conv.wikidata2wikipedia(uri) 85 | if wiki_uri != "": 86 | break 87 | 88 | return wiki_uri 89 | 90 | def _find_wiki_uri(self, uris): 91 | for uri in uris: 92 | if "wikipedia.org" in uri: 93 | return uri 94 | 
return "" 95 | 96 | def _get_dbpedia_uri(self, wiki_uri, uris): 97 | dbpedia_uri = "" 98 | 99 | if wiki_uri != "": 100 | dbpedia_uri = self._conv.wikipedia2dbpedia(wiki_uri) 101 | else: 102 | for uri in uris: 103 | dbpedia_uri = self._conv.wikidata2dbpedia(uri) 104 | if dbpedia_uri != "": break 105 | 106 | return dbpedia_uri 107 | 108 | def _link_db_query(self, target, diffbot_query_response): 109 | candidates = [] 110 | if "data" not in diffbot_query_response: 111 | return candidates 112 | else: 113 | data = diffbot_query_response["data"] 114 | 115 | for hit in data: 116 | if "allUris" not in hit: continue 117 | uris = set(hit["allUris"]) 118 | if "origin" in hit: uris.add( hit["origin"] ) 119 | if "origins" in hit: uris.union( set(hit["origins"]) ) 120 | if "wikipediaUri" in hit: 121 | uris.add( hit["wikipediaUri"] ) 122 | 123 | if "importance" in hit: 124 | name = hit["name"] 125 | importance = float(hit["importance"]) 126 | if self._use_overlap and self._use_importance: 127 | score = truncated_log(importance) * overlap(name, target, self._lower) 128 | elif self._use_overlap: 129 | score = overlap(name, target, self._lower) 130 | elif self._use_importance: 131 | score = importance 132 | else: 133 | score = random() 134 | 135 | wiki_uri = self._find_wiki_uri(uris) 136 | dbpedia_uri = self._get_dbpedia_uri(wiki_uri, uris) 137 | 138 | c = Candidate(score, 139 | name, 140 | dbpedia_uri, 141 | wiki_uri, 142 | hit["types"], 143 | hit["allNames"], 144 | uris) 145 | candidates.append(c) 146 | else: 147 | print("Warning: Skipping a hit without importance value.") 148 | 149 | return sorted(candidates, reverse=True) 150 | 151 | def link(self, context, phrases): 152 | linked_phrases = [] 153 | for phrase in phrases: 154 | candidates = [] 155 | for entity_type in EL_POL_ENTITY_TYPES: 156 | r = self._cq.make_query('type:{} name:"{}"'.format(entity_type, phrase.text)) 157 | db_response = json.loads(r.content) 158 | candidates += self._link_db_query(phrase.text, db_response) 159 | candidates = set(candidates) 160 | 161 | if len(candidates) > 0: 162 | best = sorted(candidates, reverse=True)[0] 163 | else: 164 | best = Candidate() 165 | linked_phrases.append( (phrase, best) ) 166 | 167 | if len(linked_phrases) != len(phrases): 168 | print("Warning: length of output is not equal to length of input {} != {}".format(len(best), len(phrases))) 169 | 170 | return linked_phrases 171 | 172 | -------------------------------------------------------------------------------- /linkers/context_aware.py: -------------------------------------------------------------------------------- 1 | from linkers.baseline import BaselineLinker 2 | from collections import defaultdict 3 | from diffbot_api import EL_POL_ENTITY_TYPES 4 | import json 5 | from candidate import Candidate 6 | from langid import classify 7 | import re 8 | from tqdm import tqdm 9 | from traceback import format_exc 10 | from patterns import re_newlines 11 | 12 | 13 | # ALL_RELATED_FIELDS = ["founders", "categories", "ceo", "isPartOf", 14 | # "skills", "parents", "children", "parentCompany"] 15 | RELATED_FIELDS = ["founders", "ceo", "parentCompany", "isPartOf"] 16 | DEFAULT_IMPORTANCE = 1.0 17 | DEFAULT_DB_URI = "" 18 | 19 | 20 | class ContextAwareLinker(BaselineLinker): 21 | """ A base class for linkers that make use of textual representations of entities. """ 22 | 23 | def __init__(self): 24 | BaselineLinker.__init__(self) 25 | self._re_contains_alpha = re.compile(r"[a-z]+", re.U|re.I) 26 | self._re_newlines = re.compile(r"[\n\r]+") 27 | self._sep = " . 
" 28 | 29 | def _build_index2candidate(self, candidate2index): 30 | """ Constructs an index in the opposite direction. """ 31 | 32 | index2candidate = {} 33 | for candidate in candidate2index: 34 | index = candidate2index[candidate] 35 | index2candidate[index] = candidate 36 | 37 | return index2candidate 38 | 39 | 40 | def get_db_entry(diffbot_uri): 41 | """ Gets an entity like https://www.diffbot.com/entity/AcZTRPXDrY9 and 42 | returns a json by https://www.diffbot.com/entity/AcZTRPXDrY9.json """ 43 | 44 | raise NotImplementedError 45 | return {} 46 | 47 | def _is_english(self, text): 48 | lang, conf = classify(text) 49 | return lang == "en" 50 | 51 | def _is_alpha(self, text): 52 | return self._re_contains_alpha.search(text) 53 | 54 | def _get_en_names(self, hit): 55 | names = [] 56 | 57 | if "allNames" in hit: 58 | for name in hit["allNames"]: 59 | if self._is_alpha(name) and self._is_english(name): 60 | names.append(name) 61 | 62 | return names 63 | 64 | def _get_name(self, hit): 65 | if "name" in hit: 66 | return hit["name"] 67 | else: 68 | return "" 69 | 70 | def _get_record_texts(self, hit): 71 | texts = [ self._get_name(hit) ] 72 | texts += self._get_en_names(hit) 73 | 74 | if "isPartOf" in hit: 75 | for is_part_of in hit["isPartOf"]: 76 | if "name" in is_part_of: 77 | texts.append(is_part_of["name"]) 78 | 79 | if "description" in hit: 80 | texts.append(hit["description"]) 81 | 82 | texts_str = self._sep.join(texts) 83 | texts_str = re_newlines.sub(" ", texts_str) 84 | 85 | return texts_str 86 | 87 | def _get_wiki_texts(self, wiki_uri): 88 | # access from a cached (?) wikipedia dump 89 | return "" 90 | 91 | def _get_uri_texts(self, uris): 92 | # access the uris 93 | return "" 94 | 95 | def _extract_importance(self, hit): 96 | importance_field = "importance" 97 | if importance_field in hit: 98 | return float(hit[importance_field]) 99 | else: 100 | return DEFAULT_IMPORTANCE 101 | 102 | def _extract_db_uri(self, hit): 103 | db_uri_field = "diffbotUri" 104 | if db_uri_field in hit: 105 | return hit[db_uri_field] 106 | else: 107 | return DEFAULT_DB_URI 108 | 109 | def _extract_relations(self, hit): 110 | relations = {} 111 | 112 | for field_name in hit: 113 | if field_name not in RELATED_FIELDS: continue 114 | 115 | if isinstance(hit[field_name], dict): 116 | if "diffbotUri" in hit[field_name]: 117 | if field_name not in relations: relations[field_name] = list() 118 | relations[field_name].append(hit[field_name]["diffbotUri"]) 119 | 120 | if isinstance(hit[field_name], list): 121 | for item in hit[field_name]: 122 | if "diffbotUri" in item: 123 | if field_name not in relations: relations[field_name] = list() 124 | relations[field_name].append(item["diffbotUri"]) 125 | 126 | return relations 127 | 128 | def get_phrase_candidates(self, phrases, related_entities=False): 129 | phrase2candidates = defaultdict(set) 130 | 131 | for phrase in tqdm(phrases): 132 | for entity_type in EL_POL_ENTITY_TYPES: 133 | try: 134 | response_raw = self._cq.make_query('type:{} name:"{}"'.format(entity_type, phrase.text)) 135 | response = json.loads(response_raw.content) 136 | 137 | if "data" not in response: continue 138 | else: data = response["data"] 139 | 140 | for hit in data: 141 | c = self._build_candidate(hit) 142 | phrase2candidates[phrase].add(c) 143 | 144 | if not related_entities: continue 145 | 146 | related_num = 0 147 | for relation_type in c.relations: 148 | for related_entity_id in c.relations[relation_type]: 149 | related_response = self._cq.get_entity(related_entity_id) 150 | 151 | if "data" 
not in related_response or len(related_response["data"]) == 0: 152 | print("Warning: can't find related entity: {}.".format(related_entity_id)) 153 | continue 154 | 155 | for related_hit in related_response["data"]: 156 | related_num += 1 157 | related_c = self._build_candidate(related_hit) 158 | phrase2candidates[related_entity_id].add(related_c) 159 | if related_c.db_uri != related_entity_id: 160 | phrase2candidates[related_c.db_uri].add(related_c) 161 | 162 | print("'{}'#{}: added entity {} which is '{}' to {}".format( 163 | phrase.text, 164 | related_num, 165 | c.db_uri, 166 | relation_type, 167 | related_entity_id)) 168 | except: 169 | print("Warning: cannot process phrase '{}' of type '{}'".format(phrase.text, entity_type)) 170 | print(format_exc()) 171 | 172 | return phrase2candidates 173 | 174 | def _build_candidate(self, hit): 175 | 176 | uris = self._get_uris(hit) 177 | wiki_uri = self._get_wikipedia_uri(hit, uris) 178 | texts_record = self._get_record_texts(hit) 179 | texts_wiki = self._get_wiki_texts(wiki_uri) 180 | texts_uris = self._get_uri_texts(uris) 181 | texts = self._sep.join([texts_record, texts_wiki, texts_uris]) 182 | texts = self._re_newlines.sub(self._sep, texts) 183 | relations = self._extract_relations(hit) 184 | importance = self._extract_importance(hit) 185 | db_uri = self._extract_db_uri(hit) 186 | score = float(hit["importance"]) 187 | link = self._get_dbpedia_uri(wiki_uri, uris) 188 | types = hit["types"] if "types" in hit else [] 189 | 190 | c = Candidate(score, 191 | self._get_name(hit), 192 | link, 193 | wiki_uri, 194 | types, 195 | self._get_en_names(hit), 196 | uris, 197 | texts, 198 | db_uri, 199 | importance, 200 | relations) 201 | return c 202 | 203 | -------------------------------------------------------------------------------- /linkers/dense.py: -------------------------------------------------------------------------------- 1 | from linkers.sparse import SparseLinker 2 | from traceback import format_exc 3 | from gensim.models import KeyedVectors 4 | from utils import overlap 5 | from candidate import Candidate 6 | from time import time 7 | from traceback import format_exc 8 | from os.path import exists, join 9 | from nltk.corpus import stopwords 10 | from nltk import pos_tag 11 | from sklearn.externals import joblib 12 | from sklearn.preprocessing import normalize 13 | from tqdm import tqdm 14 | from candidate import make_phrases 15 | from numpy import argsort, argmax, dot, zeros, multiply, ones 16 | 17 | 18 | class DenseLinker(SparseLinker): 19 | def __init__(self, model_dir, embeddings_fpath, tfidf=True, use_overlap=True, description="", stop_words=True): 20 | SparseLinker.__init__(self, model_dir, tfidf, use_overlap, description, stop_words) 21 | self._params["word_embeddings"] = embeddings_fpath 22 | self._wv = self._load_word_embbeddings(embeddings_fpath) 23 | self._stopwords = set(stopwords.words("english")) 24 | 25 | if hasattr(self, '_dense_vectors'): 26 | print("Normalizing dense vectors...") 27 | tic = time() 28 | self._dense_vectors = normalize(self._dense_vectors) 29 | print("Done in {:.2f} sec.".format(time() - tic)) 30 | else: 31 | print("Warning: no dense vectors could be found. 
You need to train the model first.") 32 | 33 | def print_most_similar(self, n=10, max_candidates=10, test_name="Seal"): 34 | test_phrases = make_phrases([test_name]) 35 | 36 | for test_phrase in test_phrases: 37 | print("=" * 50, "\n", test_phrase) 38 | test_candidates = self._phrase2candidates[self._default_phrase(test_phrase)] 39 | 40 | for j, tc in enumerate(test_candidates): 41 | if j > max_candidates: break 42 | 43 | print("=" * 50, "\n", tc) 44 | 45 | tc_index = self._candidate2index[tc] 46 | tc_dvector = self._dense_vectors[tc_index, :] 47 | 48 | # dot product with all candidates to find the most similar ones 49 | tc_sims = self._dense_vectors.dot(tc_dvector) 50 | tc_sorted_indices = argsort(-tc_sims)[:n] 51 | 52 | print("-" * 50) 53 | for i, nearest_candidate_index in enumerate(tc_sorted_indices): 54 | print(i, tc_sims[nearest_candidate_index], self._index2candidate[nearest_candidate_index], "\n") 55 | 56 | def _load(self, model_dir): 57 | SparseLinker._load(self, model_dir) 58 | 59 | dense_vectors_filename = "dense_vectors.pkl" 60 | self._dense_vectors_fpath = join(model_dir, dense_vectors_filename) 61 | 62 | if exists(self._dense_vectors_fpath): 63 | print("Loading:", self._dense_vectors_fpath) 64 | self._dense_vectors = joblib.load(self._dense_vectors_fpath) 65 | 66 | def train(self, dataset_fpaths): 67 | phrases = self._dataset2phrases(dataset_fpaths) 68 | self._dense_vectors = zeros((self._vectors.shape[0], self._wv.vector_size)) 69 | 70 | for phrase in tqdm(phrases): 71 | try: 72 | dphrase = self._default_phrase(phrase) 73 | if dphrase in self._phrase2candidates: 74 | # get the candidates 75 | candidates = list(self._phrase2candidates[dphrase]) # to remove 76 | indices = [] 77 | for candidate in candidates: 78 | if candidate in self._candidate2index: 79 | indices.append(self._candidate2index[candidate]) 80 | else: 81 | print("Warning: candidate '{}' is not indexed".format(candidate)) 82 | indices.append(0) # just to make sure lengths are equal 83 | 84 | #candidate_vectors = self._vectors[indices] 85 | print("Retrieved {} candidates for '{}'".format(len(indices), phrase.text)) 86 | 87 | for index in indices: 88 | self._dense_vectors[index, :] = self._get_dense_vector(self._vectors[index, :], dphrase.text) 89 | except: 90 | print("Warning: error phrase '{}'".format(phrase)) 91 | print(format_exc()) 92 | 93 | joblib.dump(self._dense_vectors, self._dense_vectors_fpath) 94 | print("Dense vectors:", self._dense_vectors_fpath) 95 | 96 | def _load_word_embbeddings(self, word_embeddings_fpath): 97 | print("Loading word vectors from:", word_embeddings_fpath) 98 | tic = time() 99 | 100 | self._params["word_embeddings_pickle"] = word_embeddings_fpath + ".pkl" 101 | if exists(self._params["word_embeddings_pickle"]): 102 | wv = KeyedVectors.load(self._params["word_embeddings_pickle"]) 103 | wv.init_sims(replace=True) 104 | else: 105 | wv = KeyedVectors.load_word2vec_format(word_embeddings_fpath, binary=False, unicode_errors="ignore") 106 | wv.init_sims(replace=True) 107 | 108 | tac = time() 109 | wv.save(self._params["word_embeddings_pickle"]) 110 | print("Saved in {} sec.".format(time() - tac)) 111 | 112 | print("Loaded in {} sec.".format(time() - tic)) 113 | 114 | return wv 115 | 116 | def link(self, context, phrases): 117 | linked_phrases = [] 118 | context_vector = self._vectorizer.transform([context]) 119 | 120 | for phrase in phrases: 121 | try: 122 | dphrase = self._default_phrase(phrase) 123 | if dphrase in self._phrase2candidates: 124 | # get the candidates 125 | candidates = 
list(self._phrase2candidates[dphrase]) # to remove 126 | indices = [] 127 | for candidate in candidates: 128 | if candidate in self._candidate2index: 129 | indices.append(self._candidate2index[candidate]) 130 | else: 131 | print("Warning: candidate '{}' is not indexed".format(candidate)) 132 | indices.append(0) # just to make sure lengths are equal 133 | 134 | dense_candidate_vectors = self._dense_vectors[indices] 135 | # check if candidates are correct 136 | print("Retrieved {} candidates for '{}'".format(len(indices), phrase.text)) 137 | 138 | dense_context_vector = self._get_dense_vector(context_vector, dphrase.text) 139 | 140 | # rank the candidates 141 | sims = dot(dense_candidate_vectors, dense_context_vector.T) 142 | 143 | if self._params["use_overlap"]: 144 | overlap_scores = zeros(sims.shape) 145 | for i, candidate in enumerate(candidates): 146 | overlap_scores[i] = overlap(candidate.name, phrase.text) 147 | else: 148 | overlap_scores = ones(sims.shape) 149 | 150 | scores = multiply(sims, overlap_scores) 151 | best_index = argmax(scores) 152 | best_candidate = candidates[best_index] 153 | best_candidate.score = scores[best_index] 154 | best_candidate.link = self._get_dbpedia_uri(best_candidate.wiki, best_candidate.uris) 155 | linked_phrases.append((phrase, best_candidate)) 156 | else: 157 | print("Warning: phrase '{}' is not found in the vocabulary of the model".format(phrase)) 158 | 159 | linked_phrases.append((phrase, Candidate())) 160 | except: 161 | print("Error while processing phrase '{}':") 162 | print(format_exc()) 163 | linked_phrases.append((phrase, Candidate())) 164 | return linked_phrases 165 | 166 | def _get_dense_vectors(self, sparse_vectors, target): 167 | dense_vectors = zeros((sparse_vectors.shape[0], self._wv.vector_size)) 168 | 169 | for i in range(sparse_vectors.shape[0]): 170 | sparse_candidate_vector = sparse_vectors[i, :] 171 | dense_candidate_vector = self._get_dense_vector(sparse_candidate_vector, target) 172 | dense_vectors[i, :] = dense_candidate_vector 173 | 174 | return dense_vectors 175 | 176 | def _get_dense_vector(self, sparse_vector, target): 177 | """ Construct the dense vector """ 178 | 179 | dense_vector = zeros(self._wv.vector_size) 180 | weights_sum = 0. 181 | names = self._vectorizer.get_feature_names() 182 | 183 | for i, word_weight in enumerate(sparse_vector.data): 184 | feature_index = sparse_vector.indices[i] 185 | word = names[feature_index] 186 | 187 | if word.lower() in self._stopwords or word.lower() == target.lower(): continue 188 | lemma, pos = pos_tag([word])[0] 189 | if pos[:2] not in ["FW", "JJ", "JJ", "NN", "VB", "RB"]: continue 190 | # print(word, end=", ") 191 | 192 | if word in self._wv.vocab: 193 | word_vector = self._wv[word] 194 | elif word.capitalize() in self._wv.vocab: 195 | word_vector = self._wv[word.capitalize()] 196 | else: 197 | continue 198 | 199 | 200 | dense_vector += word_weight * word_vector 201 | weights_sum += word_weight 202 | 203 | 204 | dense_vector = dense_vector / (len(sparse_vector.data) + 1.) 
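        # The summed word vectors are normalised by the number of non-zero sparse
        # features (+1) rather than by weights_sum, which is accumulated in the
        # loop above but is not used in this normalisation.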
205 | #print("\n>>>>>>>>\n") 206 | return dense_vector 207 | -------------------------------------------------------------------------------- /linkers/nn_graph.py: -------------------------------------------------------------------------------- 1 | from linkers.baseline import BaselineLinker 2 | from candidate import Candidate 3 | from supervised.evaluate import Evaluator 4 | 5 | 6 | class NNLinker(BaselineLinker): 7 | def __init__(self): 8 | BaselineLinker.__init__(self) 9 | self.evaluator = Evaluator() 10 | 11 | def link(self, context, phrases): 12 | 13 | linked_phrases = list() 14 | 15 | #file = open('/Users/sevgili/Desktop/context-phrase-nif.txt', 'a') 16 | 17 | for phrase in phrases: 18 | #file.write(str(context) + '\t' + str(phrase.text) + '\t' + str(phrase.beg) + '\t' + str(phrase.end) + '\n') 19 | score, predicted_url = self.evaluator.get_best_pred(context, phrase) 20 | print('******', context, phrase, score, predicted_url) 21 | c = Candidate(score=score, link=predicted_url) 22 | 23 | linked_phrases.append((phrase, c)) 24 | 25 | return linked_phrases 26 | 27 | 28 | class CandidateRandom(NNLinker): 29 | def __init__(self): 30 | NNLinker.__init__(self) 31 | 32 | def link(self, context, phrases): 33 | 34 | linked_phrases = list() 35 | 36 | for phrase in phrases: 37 | score, predicted_url = self.evaluator.get_random_pred(context, phrase) 38 | 39 | c = Candidate(score=score, link=predicted_url) 40 | 41 | linked_phrases.append((phrase, c)) 42 | 43 | return linked_phrases 44 | -------------------------------------------------------------------------------- /linkers/sparse.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utils import overlap 3 | from linkers.context_aware import ContextAwareLinker 4 | from candidate import Candidate 5 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 6 | from candidate import Phrase, make_phrases 7 | from pandas import read_csv 8 | from time import time 9 | from os.path import join 10 | from utils import ensure_dir 11 | from sklearn.externals import joblib 12 | import json 13 | from os.path import exists 14 | import codecs 15 | from numpy import dot, argmax 16 | from traceback import format_exc 17 | 18 | 19 | # ToDo: save also directly the phrase2index file for faster classifications 20 | 21 | class SparseLinker(ContextAwareLinker): 22 | def __init__(self, model_dir, tfidf=True, use_overlap=True, description="", stop_words=True, 23 | related_entities=False, binary_count_vectorizer=False, wiki_only=False): 24 | 25 | ContextAwareLinker.__init__(self) 26 | print("Model directory:", model_dir) 27 | self._params = {} 28 | self._params["tfidf"] = tfidf 29 | self._params["description"] = description 30 | self._params["use_overlap"] = use_overlap 31 | self._params["stop_words"] = stop_words 32 | self._params["binary_count_vectorizer"] = binary_count_vectorizer 33 | self._params["related_entities"] = related_entities 34 | self._params["related_entities_factor"] = 3 # text of related entities 3 times less important than the entity text 35 | self._params["wiki_only"] = wiki_only 36 | 37 | 38 | vectorizer_filename = "vectorizer.pkl" 39 | candidate2index_filename = "candidate2index.pkl" 40 | params_filename = "params.json" 41 | vectors_filename = "vectors.pkl" 42 | phrase2candidates_filename = "phrase2candidates.pkl" 43 | phrases_filename = "phrases.txt" 44 | candidates_filename = "candidates.txt" 45 | 46 | self._vectorizer_fpath = join(model_dir, vectorizer_filename) 47 | 
self._candidate2index_fpath = join(model_dir, candidate2index_filename) 48 | self._params_fpath = join(model_dir, params_filename) 49 | self._vectors_fpath = join(model_dir, vectors_filename) 50 | self._phrase2candidates_fpath = join(model_dir, phrase2candidates_filename) 51 | self._phrases_fpath = join(model_dir, phrases_filename) 52 | self._candidates_fpath = join(model_dir, candidates_filename) 53 | self._load(model_dir) # using the defined paths 54 | 55 | def set_params(self, params): 56 | for param in params: 57 | self._params[param] = params[param] 58 | 59 | def _load(self, model_dir): 60 | tic = time() 61 | ensure_dir(model_dir) 62 | 63 | if exists(self._params_fpath): 64 | with open(self._params_fpath, "r") as fp: 65 | self._params = json.load(fp) 66 | print("Parameters:\n- ", "\n- ".join("{}: {}".format(p, self._params[p]) for p in self._params)) 67 | 68 | if exists(self._phrase2candidates_fpath): 69 | print("Loading:", self._phrase2candidates_fpath) 70 | self._phrase2candidates = joblib.load(self._phrase2candidates_fpath) 71 | 72 | if exists(self._candidate2index_fpath): 73 | print("Loading:", self._candidate2index_fpath) 74 | self._candidate2index = joblib.load(self._candidate2index_fpath) 75 | 76 | print("Building index2candidate lookup table...") 77 | tic = time() 78 | self._index2candidate = self._build_index2candidate(self._candidate2index) 79 | print("Done in {:.2f} sec.".format(time() - tic)) 80 | 81 | if exists(self._vectorizer_fpath): 82 | print("Loading:", self._vectorizer_fpath) 83 | self._vectorizer = joblib.load(self._vectorizer_fpath) 84 | 85 | if exists(self._vectors_fpath): 86 | print("Loading:", self._vectors_fpath) 87 | self._vectors = joblib.load(self._vectors_fpath) 88 | 89 | print("Loaded in {:.2f} sec.".format(time() - tic)) 90 | 91 | 92 | def train(self, dataset_fpaths): 93 | tic = time() 94 | print("Training...") 95 | phrases = self._dataset2phrases(dataset_fpaths) 96 | self._train(phrases) 97 | print("Training is done in {:.2f} sec.".format(time()-tic)) 98 | 99 | def _train(self, phrases): 100 | # get the phrases 101 | with codecs.open(self._phrases_fpath, "w", "utf-8") as out: 102 | for phrase in phrases: out.write("{}\n".format(phrase.text)) 103 | print("Saved phrases:", self._phrases_fpath) 104 | 105 | self._params["num_phrases"] = len(phrases) 106 | print("Number of phrases:", len(phrases)) 107 | 108 | self._phrase2candidates = self.get_phrase_candidates(phrases, self._params["related_entities"]) 109 | 110 | # get candidates for the phrases 111 | candidates = set() 112 | for phrase in self._phrase2candidates: 113 | for candidate in self._phrase2candidates[phrase]: 114 | candidates.add(candidate) 115 | print("Number of candidates:", len(candidates)) 116 | print("Saved phrase2candidate:", self._phrase2candidates_fpath) 117 | 118 | # save the vector indices for the candidates 119 | with codecs.open(self._candidates_fpath, "w", "utf-8") as out: 120 | self._candidate2index = {} 121 | corpus = [] 122 | for index, candidate in enumerate(candidates): 123 | candidate_texts = [candidate.text] 124 | 125 | # if related_entityes then also also include text of them as well 126 | if self._params["related_entities"]: 127 | candidate_texts *= self._params["related_entities_factor"] 128 | 129 | for relation_type in candidate.relations: 130 | for related_entity_id in candidate.relations[relation_type]: 131 | related_entity = self._phrase2candidates[related_entity_id] 132 | if len(related_entity) == 0: continue 133 | related_entity = list(related_entity)[0] 134 | 135 | 
candidate_texts.append(related_entity.text) 136 | 137 | 138 | self._candidate2index[candidate] = index 139 | 140 | out.write("{}\t{}\t{}\t{}\n".format( 141 | index, 142 | candidate.name, 143 | candidate.text, 144 | "; ".join(candidate.uris) 145 | )) 146 | 147 | corpus.append(" ".join(candidate_texts)) 148 | 149 | joblib.dump(self._candidate2index, self._candidate2index_fpath) 150 | print("Saved candidate2index:", self._candidate2index_fpath) 151 | joblib.dump(self._phrase2candidates, self._phrase2candidates_fpath) 152 | print("Saved candidates:", self._candidates_fpath) 153 | 154 | # vectorize the text representations of the candidates 155 | stopwords = 'english' if self._params["stop_words"] else None 156 | if self._params["tfidf"]: 157 | self._vectorizer = TfidfVectorizer(stop_words=stopwords) 158 | else: 159 | self._vectorizer = CountVectorizer( 160 | binary=self._params["binary_count_vectorizer"], 161 | stop_words=stopwords) 162 | 163 | self._vectors = self._vectorizer.fit_transform(corpus) 164 | 165 | joblib.dump(self._vectorizer, self._vectorizer_fpath) 166 | print("Saved vectorizer:", self._vectorizer_fpath) 167 | 168 | joblib.dump(self._vectors, self._vectors_fpath) 169 | self._params["shape"] = self._vectors.shape 170 | print("Saved {} candidate feature matrix: {}".format(self._vectors.shape, self._vectors_fpath)) 171 | 172 | with open(self._params_fpath, "w") as fp: 173 | json.dump(self._params, fp) 174 | print("Saved params:", self._params_fpath) 175 | 176 | def _ttl2phrases(self, ttl_fpaths): 177 | """ Given a list of ttl files, extract phrases from them. """ 178 | 179 | voc = set() 180 | for dataset_fpath in ttl_fpaths: 181 | df = read_csv(dataset_fpath, sep="\t", encoding="utf-8") 182 | for i, row in df.iterrows(): 183 | for target in str(row.targets).split(","): 184 | voc.add(target.strip()) 185 | 186 | return make_phrases(list(voc)) 187 | 188 | def _dataset2phrases(self, dataset_fpaths): 189 | """ Given a list of datasets, extract phrases from them. 
""" 190 | 191 | voc = set() 192 | for dataset_fpath in dataset_fpaths: 193 | df = read_csv(dataset_fpath, sep="\t", encoding="utf-8") 194 | for i, row in df.iterrows(): 195 | for target in str(row.targets).split(","): 196 | voc.add(target.strip()) 197 | 198 | return make_phrases(list(voc)) 199 | 200 | def _default_phrase(self, phrase): 201 | text = phrase.text.strip() 202 | return Phrase(text, 1, len(text), "http://" + text) 203 | 204 | def _filter_non_linked(self, candidates): 205 | linked_candidates = [] 206 | for candidate in candidates: 207 | has_link = candidate.link != "" 208 | if has_link: 209 | linked_candidates.append(candidate) 210 | 211 | print("Warning: keeping {} of {} candidates that are Wikipedia-linked.".format( 212 | len(linked_candidates), len(candidates))) 213 | 214 | return linked_candidates 215 | 216 | def link(self, context, phrases): 217 | linked_phrases = [] 218 | context_vector = self._vectorizer.transform([context]) 219 | 220 | for phrase in phrases: 221 | try: 222 | dphrase = self._default_phrase(phrase) 223 | if dphrase in self._phrase2candidates: 224 | # get the candidates 225 | candidates = list(self._phrase2candidates[dphrase]) 226 | if self._params["wiki_only"]: 227 | candidates = self._filter_non_linked(candidates) 228 | 229 | indices = [] 230 | for candidate in candidates: 231 | if candidate in self._candidate2index: 232 | indices.append(self._candidate2index[candidate]) 233 | else: 234 | print("Warning: candidate '{}' is not indexed".format(candidate)) 235 | indices.append(0) # just to make sure lengths are equal 236 | 237 | candidate_vectors = self._vectors[ indices ] 238 | print("Retrieved {} candidates for '{}'".format(len(indices), phrase.text)) 239 | 240 | # rank the candidates 241 | sims = dot(candidate_vectors, context_vector.T) 242 | 243 | if self._params["use_overlap"]: 244 | overlap_scores = np.zeros(sims.shape) 245 | for i, candidate in enumerate(candidates): 246 | overlap_scores[i] = overlap(candidate.name, phrase.text) 247 | else: 248 | overlap_scores = np.ones(sims.shape) 249 | 250 | scores = np.multiply(sims.toarray(), overlap_scores) 251 | best_index = argmax(scores) 252 | best_candidate = candidates[best_index] 253 | best_candidate.score = scores[best_index][0] 254 | best_candidate.link = self._get_dbpedia_uri(best_candidate.wiki, best_candidate.uris) 255 | linked_phrases.append( (phrase, best_candidate) ) 256 | else: 257 | print("Warning: phrase '{}' is not found in the vocabulary of the model".format(phrase)) 258 | 259 | linked_phrases.append( (phrase, Candidate()) ) 260 | except: 261 | print("Error while processing phrase '{}':") 262 | print(format_exc()) 263 | linked_phrases.append( (phrase, Candidate()) ) 264 | return linked_phrases 265 | 266 | -------------------------------------------------------------------------------- /linkers/supertagger.py: -------------------------------------------------------------------------------- 1 | from linkers.context_aware import ContextAwareLinker 2 | from candidate import Candidate 3 | import json 4 | from collections import namedtuple 5 | from traceback import format_exc 6 | import requests 7 | 8 | 9 | Tag = namedtuple("Tag", "id score text offsets uris") 10 | 11 | 12 | class SuperTagger(ContextAwareLinker): 13 | def __init__(self): 14 | ContextAwareLinker.__init__(self) 15 | self._endpoint_supertagger = "https://supertagger.diffbot.com/el?token=sam&includeKG&confidence=0.5&maxTags=10&lang=en&text={}title=" 16 | 17 | def _entity_link(self, text, verbose=True): 18 | nothing = {} 19 | 20 | uri = 
self._endpoint_supertagger.format(text) 21 | r = requests.get(uri) 22 | content = json.loads(r.content) 23 | 24 | if "all-tags" not in content: 25 | if verbose: print("Warning: no 'all-tag' found.") 26 | return nothing 27 | 28 | tags = content["all-tags"] 29 | result = [] 30 | for i, tag in enumerate(tags): 31 | try: 32 | if "kgEntity" not in tag: 33 | print("Warning: no 'kgEntity' found.") 34 | return nothing 35 | kg = tag["kgEntity"] 36 | 37 | if "allUris" not in kg: 38 | print("Warning: no 'allUris' found.") 39 | return nothing 40 | 41 | id = tag["diffbotEntityId"] 42 | uris = kg["allUris"] 43 | tag_text = tag["label"] 44 | offsets = tag["offsets"]["text"] 45 | score = tag["overallRelevanceScore"] 46 | 47 | result.append(Tag(id, score, tag_text, offsets, uris)) 48 | except: 49 | print(format_exc()) 50 | 51 | return result 52 | 53 | def link(self, context, phrases): 54 | # link 55 | tags = self._entity_link(context) 56 | 57 | # assign tags to phrases 58 | linked_phrases = [] 59 | for phrase in phrases: 60 | 61 | # try to assign the phrase from the tagged output 62 | assigned_phrase = False 63 | for tag in tags: 64 | for tag_beg, tag_end in tag.offsets: 65 | if phrase.beg >= tag_end: 66 | intersect = phrase.beg - tag_beg < tag_end - tag_beg 67 | else: 68 | intersect = tag_beg - phrase.beg < phrase.end - phrase.beg 69 | 70 | if intersect: 71 | wiki_uri = self._find_wiki_uri(tag.uris) 72 | link = self._get_dbpedia_uri(wiki_uri, tag.uris) 73 | c = Candidate(tag.score, 74 | tag.text, 75 | link, 76 | wiki_uri, 77 | [],[], 78 | tag.uris, 79 | tag.text, 80 | tag.id) 81 | linked_phrases.append((phrase, c)) 82 | assigned_phrase = True 83 | 84 | # if nothing found assign to the phrase something still 85 | if not assigned_phrase: 86 | linked_phrases.append((phrase, Candidate())) 87 | 88 | return linked_phrases -------------------------------------------------------------------------------- /nif_ws.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, Response 2 | import logging 3 | import requests 4 | import codecs 5 | from os.path import join 6 | from time import time 7 | from ttl import remove_classref, add_nonsense_response, DatasetBuilder 8 | from linkers.baseline import BaselineLinker 9 | from linkers.sparse import SparseLinker 10 | from linkers.dense import DenseLinker 11 | from linkers.supertagger import SuperTagger 12 | 13 | 14 | endpoint = "http://localhost:8080/spotlight" 15 | data_dir = "data/" 16 | no_classref = False 17 | save_ttl_data = False 18 | ds = DatasetBuilder(join(data_dir, "dataset.csv")) 19 | 20 | app = Flask(__name__) 21 | logging.basicConfig(level=logging.DEBUG) 22 | log = logging.getLogger("nif_ws.py") 23 | 24 | 25 | def save_data(prefix, req_data, resp_data): 26 | if save_ttl_data: 27 | fid = prefix + "-" + str(time()).replace(".","") 28 | request_fpath = join(data_dir, fid + "-request.ttl") 29 | with codecs.open(request_fpath, "w", "utf-8") as req: 30 | req.write(str(req_data, "utf-8")) 31 | 32 | response_fpath = join(data_dir, fid + "-response.ttl") 33 | with codecs.open(response_fpath, "w", "utf-8") as res: 34 | res.write(str(resp_data, "utf-8")) 35 | 36 | 37 | @app.route("/proxy", methods=['POST']) 38 | def proxy(): 39 | h = {key: value for key, value in request.headers} 40 | r = requests.post(endpoint, headers=h, data=request.data) 41 | 42 | resp = Response() 43 | if r.status_code == 200: 44 | for header_name, header_value in r.headers.items(): 45 | resp.headers[header_name] = header_value 46 | 47 | 
r_content = str(r.content, "utf-8") 48 | resp_data = remove_classref(r_content) if no_classref else r_content 49 | resp.data = resp_data 50 | save_data("proxy", request.data, resp_data) 51 | ds.add_to_dataset(request.data) 52 | else: 53 | log.info("Warning: server returned an error") 54 | log.info(r) 55 | 56 | return resp 57 | 58 | 59 | @app.route("/trivial", methods=['POST']) 60 | def trivial(): 61 | h = {key: value for key, value in request.headers} 62 | 63 | resp_data = add_nonsense_response(request.data) 64 | 65 | resp = Response() 66 | for header_name, header_value in request.headers.items(): 67 | resp.headers[header_name] = header_value 68 | resp.data = resp_data 69 | save_data("trivial", request.data, resp_data) 70 | ds.add_to_dataset(request.data) 71 | 72 | return resp 73 | 74 | 75 | overlap_importance_linker = BaselineLinker(use_overlap=True, use_importance=True) 76 | 77 | @app.route("/overlap_importance", methods=['POST']) 78 | def overlap_importance(): 79 | response = Response() 80 | 81 | for header_name, header_value in request.headers.items(): 82 | response.headers[header_name] = header_value 83 | response.data = overlap_importance_linker.link_ttl(request.data) 84 | 85 | save_data("overlap_importance", request.data, response.data) 86 | 87 | return response 88 | 89 | 90 | importance_linker = BaselineLinker(use_overlap=False, use_importance=True) 91 | 92 | @app.route("/importance", methods=['POST']) 93 | def importance(): 94 | response = Response() 95 | 96 | for header_name, header_value in request.headers.items(): 97 | response.headers[header_name] = header_value 98 | response.data = importance_linker.link_ttl(request.data) 99 | 100 | save_data("importance", request.data, response.data) 101 | 102 | return response 103 | 104 | 105 | overlap_linker = BaselineLinker(use_overlap=True, use_importance=False, lower=True) 106 | 107 | @app.route("/overlap", methods=['POST']) 108 | def overlap(): 109 | response = Response() 110 | 111 | for header_name, header_value in request.headers.items(): 112 | response.headers[header_name] = header_value 113 | response.data = overlap_linker.link_ttl(request.data) 114 | 115 | save_data("overlap", request.data, response.data) 116 | 117 | return response 118 | 119 | 120 | overlap_linker_case = BaselineLinker(use_overlap=True, use_importance=False, lower=False) 121 | 122 | @app.route("/overlap_case", methods=['POST']) 123 | def overlap_case(): 124 | response = Response() 125 | 126 | for header_name, header_value in request.headers.items(): 127 | response.headers[header_name] = header_value 128 | response.data = overlap_linker_case.link_ttl(request.data) 129 | 130 | save_data("overlap_case", request.data, response.data) 131 | 132 | return response 133 | 134 | 135 | random_linker = BaselineLinker(use_overlap=False, use_importance=False) 136 | 137 | @app.route("/random", methods=['POST']) 138 | def random(): 139 | response = Response() 140 | 141 | for header_name, header_value in request.headers.items(): 142 | response.headers[header_name] = header_value 143 | response.data = random_linker.link_ttl(request.data) 144 | 145 | save_data("random", request.data, response.data) 146 | 147 | return response 148 | 149 | # dense_linker = DenseLinker("data/count-stopwords-3", "data/wiki-news-300d-1M.vec") 150 | # dense_linker = DenseLinker("data/count-stopwords-3-cc", "data/crawl-300d-2M.vec") 151 | dense_linker = DenseLinker("data/count-stopwords-10", "data/crawl-300d-2M.vec") 152 | 153 | @app.route("/dense_overlap", methods=['POST']) 154 | def dense_overlap(): 155 
| params = {"tfidf": False, "use_overlap": True} 156 | dense_linker.set_params(params) 157 | 158 | response = Response() 159 | 160 | for header_name, header_value in request.headers.items(): 161 | response.headers[header_name] = header_value 162 | response.data = dense_linker.link_ttl(request.data) 163 | 164 | save_data("dense_overlap", request.data, response.data) 165 | 166 | return response 167 | 168 | 169 | # sparse_linker = SparseLinker("data/all0") 170 | # sparse_linker = SparseLinker("data/tfidf-stopwords-2") 171 | # sparse_linker = SparseLinker("data/count-stopwords-3") 172 | sparse_linker = SparseLinker("data/count-stopwords-10") 173 | 174 | @app.route("/sparse", methods=['POST']) 175 | def sparse(): 176 | params = {"tfidf": True, "use_overlap": False} 177 | sparse_linker.set_params(params) 178 | 179 | response = Response() 180 | 181 | for header_name, header_value in request.headers.items(): 182 | response.headers[header_name] = header_value 183 | response.data = sparse_linker.link_ttl(request.data) 184 | 185 | save_data("sparse", request.data, response.data) 186 | 187 | return response 188 | 189 | 190 | 191 | @app.route("/sparse_overlap", methods=['POST']) 192 | def sparse_overlap(): 193 | params = {"tfidf": True, "use_overlap": True} 194 | sparse_linker.set_params(params) 195 | 196 | response = Response() 197 | 198 | for header_name, header_value in request.headers.items(): 199 | response.headers[header_name] = header_value 200 | response.data = sparse_linker.link_ttl(request.data) 201 | 202 | save_data("sparse_overlap", request.data, response.data) 203 | 204 | return response 205 | 206 | 207 | super_linker = SuperTagger() 208 | @app.route("/supertagger", methods=['POST']) 209 | def supertagger(): 210 | response = Response() 211 | 212 | for header_name, header_value in request.headers.items(): 213 | response.headers[header_name] = header_value 214 | response.data = super_linker.link_ttl(request.data) 215 | 216 | save_data("supertagger", request.data, response.data) 217 | 218 | return response 219 | 220 | 221 | if __name__ == "__main__": 222 | app.run(host="127.0.0.1", threaded=True) -------------------------------------------------------------------------------- /nif_ws_graph.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, Response 2 | from linkers.nn_graph import NNLinker, CandidateRandom 3 | 4 | 5 | host = "127.0.0.1" 6 | 7 | app = Flask(__name__) 8 | app.debug = False 9 | 10 | nn_linker = NNLinker() 11 | 12 | @app.route("/nngraph", methods=['POST']) 13 | def nngraph(): 14 | response = Response() 15 | 16 | for header_name, header_value in request.headers.items(): 17 | response.headers[header_name] = header_value 18 | response.data = nn_linker.link_ttl(request.data) 19 | 20 | return response 21 | ''' 22 | nn_random = CandidateRandom() 23 | @app.route("/nnrandom", methods=['POST']) 24 | def nnrandom(): 25 | response = Response() 26 | 27 | for header_name, header_value in request.headers.items(): 28 | response.headers[header_name] = header_value 29 | response.data = nn_random.link_ttl(request.data) 30 | 31 | return response 32 | ''' 33 | 34 | if __name__ == "__main__": 35 | app.run(host=host, threaded=False) -------------------------------------------------------------------------------- /patterns.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | re_newlines = re.compile(r"[\n\r]+") 4 | 
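For reference, a minimal way to exercise one of the linker endpoints defined in nif_ws.py above is to POST a NIF/TTL document to it. The sketch below is a hedged usage example under stated assumptions: it assumes the service runs on Flask's default port 5000 (app.run() is called without an explicit port), that the request body is NIF turtle of the kind GERBIL sends, and that "doc.ttl" is a placeholder path for such a document.

import requests

# Hedged usage sketch: post a NIF/TTL document to the overlap_importance linker.
# "doc.ttl" and the Content-Type value are illustrative assumptions.
with open("doc.ttl", "rb") as f:
    ttl_data = f.read()

resp = requests.post("http://127.0.0.1:5000/overlap_importance",
                     data=ttl_data,
                     headers={"Content-Type": "application/x-turtle"})
print(resp.status_code)
print(resp.content.decode("utf-8"))  # turtle annotated with the linker's entity links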
-------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | from rdflib import Graph 3 | 4 | 5 | def open_triples(path): 6 | return codecs.open(path, "r", "utf-8") 7 | 8 | 9 | def read_triples(path): 10 | return codecs.open(path, "r", "utf-8").read() 11 | 12 | 13 | # takes the path of the .ttl file and returns the dictionary 14 | # whose keys are the subject and values are the object. 15 | def read_triples_manuel(path): 16 | result = dict() 17 | file = codecs.open(path, "r", "utf-8") 18 | 19 | line = file.readline() 20 | while line != '': 21 | if line.startswith('<'): 22 | splitted_line = line.split() 23 | subject = splitted_line[0][1:-1] 24 | object = splitted_line[2][1:-1] 25 | 26 | result[subject] = object 27 | line = file.readline() 28 | 29 | return result 30 | 31 | 32 | def parse_triples(input_triple, input_format='n3'): 33 | g = Graph() 34 | return g.parse(data=input_triple, format=input_format) 35 | 36 | 37 | # takes the rdflib graph and writes its subject and object 38 | # to the given file. 39 | def write_triple(input_triple, path): 40 | file = open(path, 'w') 41 | 42 | count = 0 43 | print('writing is started...') 44 | print(len(input_triple)) 45 | for subj, pred, obj in input_triple: 46 | file.write(str(subj) + ' ' + str(obj) + '\n') 47 | if count%100000 == 0: 48 | print(count, 'nodes are written..') 49 | 50 | count += 1 51 | print('count', count) 52 | file.close() 53 | 54 | 55 | def triple2dict(input_triple): 56 | result = dict() 57 | for subj, pred, obj in input_triple: 58 | result[str(subj)] = str(obj) 59 | 60 | return result 61 | 62 | 63 | # nodes_ids is dictionary: keys are urls, values are ids of them. 64 | # edges is list of tuples where two nodes have an edge. 
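# For illustration (made-up values): with nodes_ids = {"u1": 0, "u2": 1} and
# edges = [("u1", "u2"), ("u1", "u3")], the function below returns
# ([("u1", "u2")], [(0, 1)]) -- edges touching a node without an id are dropped.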
65 | def filter_edges_by_nodes(nodes_ids, edges): 66 | 67 | filtered_edges = list() 68 | filtered_edges_ids = list() 69 | 70 | for nodes in edges: 71 | node1, node2 = nodes[0], nodes[1] 72 | 73 | try: 74 | id1, id2 = nodes_ids[node1], nodes_ids[node2] 75 | except KeyError: 76 | continue 77 | 78 | filtered_edges.append((node1, node2)) 79 | filtered_edges_ids.append((id1, id2)) 80 | 81 | return filtered_edges, filtered_edges_ids 82 | 83 | 84 | def read_dict(path): 85 | result = dict() 86 | 87 | file = codecs.open(path, 'r') 88 | line = file.readline() 89 | 90 | while line != '': 91 | splitted = line.split() 92 | line = file.readline() 93 | 94 | try: 95 | result[str(splitted[0])] = splitted[1:] 96 | except IndexError: 97 | continue 98 | 99 | return result 100 | 101 | 102 | def read_lookup(path): 103 | result = dict() 104 | 105 | file = codecs.open(path, 'r') 106 | line = file.readline() 107 | 108 | while line != '': 109 | splitted = line.split() 110 | line = file.readline() 111 | 112 | try: 113 | result[str(splitted[0])] = int(splitted[1]) 114 | except IndexError: 115 | continue 116 | 117 | return result 118 | 119 | 120 | def read_edges(path): 121 | edges = list() 122 | 123 | file = codecs.open(path, 'r') 124 | line = file.readline() 125 | 126 | while line != '': 127 | splitted = line.split() 128 | line = file.readline() 129 | 130 | edges.append((int(splitted[0]), int(splitted[1]))) 131 | 132 | return edges 133 | 134 | 135 | def write_edges(edgelist, path): 136 | file = codecs.open(path, 'w') 137 | 138 | for edge in edgelist: 139 | file.write(str(edge[0]) + ' ' + str(edge[1]) + '\n') 140 | 141 | file.close() 142 | 143 | def read_list(path): 144 | data = list() 145 | 146 | file = codecs.open(path, 'r') 147 | line = file.readline() 148 | 149 | while line != '': 150 | splitted = line.split() 151 | data.append(splitted[0]) 152 | line = file.readline() 153 | 154 | return data 155 | 156 | 157 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | namedlist 3 | langid 4 | wikidata 5 | sqlitedict 6 | Flask 7 | requests 8 | grequests 9 | rdflib 10 | nltk 11 | -------------------------------------------------------------------------------- /supervised/README.md: -------------------------------------------------------------------------------- 1 | # kb2vec/supervised 2 | 3 | This project provides an alternative use of graph embeddings in Entity Disambiguation. The input of the feedforward neural network is the concatenation of context vector, span vector, entity graph embeddings, and long abstract (of corresponding entity) vector. 4 | 5 | Installation 6 | ----------- 7 | 8 | ``` 9 | cd supervised 10 | pip install -r requirements.txt 11 | ``` 12 | 13 | Set up 14 | ----------- 15 | 16 | 1 - Creating entity graph embeddings: 17 | 18 | From DBpedia datasets (https://wiki.dbpedia.org/develop/datasets/downloads-2016-10), long 19 | abstracts, labels, and page links files are downloaded. Using `../construct_graph.py`, the graph is contructed. 20 | Page links are the inputs of DeepWalk algorithm (https://github.com/phanein/deepwalk) to create entity graph embeddings. 21 | 22 | 2 - FFNN ablation test: 23 | 24 | For generating negative samples use negative_sampling.py and for training nn.py.The relative inputs are commented out for ablation test. 
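For orientation, a minimal sketch of the input layout described above; the dimensionalities and names are placeholders, not the ones used in `nn.py`:

```
import numpy as np

# Hypothetical dimensionalities, for illustration only.
CTX_DIM, SPAN_DIM, GRAPH_DIM, ABS_DIM = 300, 300, 64, 300

def build_ffnn_input(context_vec, span_vec, graph_emb, abstract_vec):
    """Concatenate the four feature blocks into a single FFNN input vector."""
    return np.concatenate([context_vec, span_vec, graph_emb, abstract_vec])

x = build_ffnn_input(np.zeros(CTX_DIM), np.zeros(SPAN_DIM),
                     np.zeros(GRAPH_DIM), np.zeros(ABS_DIM))
assert x.shape == (CTX_DIM + SPAN_DIM + GRAPH_DIM + ABS_DIM,)
```

Leaving one of the four blocks out of the concatenation corresponds to one setting of the ablation test mentioned above.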
25 | 26 | -------------------------------------------------------------------------------- /supervised/negative_sampling_test.py: -------------------------------------------------------------------------------- 1 | from supervised import negative_sampling 2 | import ttl 3 | import codecs 4 | from sqlitedict import SqliteDict 5 | 6 | def check_written_file(contexts_r, phrases_r, contexts, phrases): 7 | is_equal = True 8 | 9 | for phrase in phrases: 10 | entity, beg, end, ref_context, url = phrase[0], phrase[1], phrase[2], phrase[3], phrase[4] 11 | try: 12 | context = contexts[ref_context] 13 | context_r = contexts_r[entity+str(beg)+str(end)+url+context] 14 | phrase_r = phrases_r[entity+str(beg)+str(end)+url+context] 15 | 16 | is_equal &= (context == context_r) & (entity == phrase_r[0]) & (beg == phrase_r[1]) & (end == phrase_r[2]) 17 | if not is_equal: 18 | print(entity, url, beg, end) 19 | break 20 | except KeyError: 21 | print("Warning: not found", ref_context) 22 | 23 | return is_equal 24 | 25 | 26 | def get_statistics_true_url(positives_negatives, urls_db): 27 | db = SqliteDict(urls_db, autocommit=False) 28 | urls = list(db.keys()) 29 | 30 | file = codecs.open('candidates_without_true_name1_.tsv', 'a') 31 | count_exist = 0 32 | count_all = 0 33 | count_not_included = 0 34 | 35 | for positive_negative in positives_negatives: 36 | entity, beg, end, true_url, context, negative_samples = positive_negative 37 | 38 | samples = list() 39 | for negative_sample in negative_samples: 40 | samples.append(negative_sample.strip()) 41 | 42 | if true_url in samples: 43 | count_exist += 1 44 | elif true_url in urls: 45 | file.write(str(entity) + '\t' + str(true_url) + '\n') 46 | else: 47 | count_not_included += 1 48 | 49 | count_all += 1 50 | 51 | print(count_exist) 52 | print(count_all) 53 | print(count_not_included) 54 | return float(count_exist)/count_all 55 | 56 | 57 | ''' 58 | # creating negative samples 59 | contexts_r, phrases_r = negative_sampling.read_samples('/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/positive_samples_new.tsv') 60 | print('positive samples are read..') 61 | negative_samples = negative_sampling.create_negative_samples_with_positive_samples(urls_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/intersection_nodes_lookup.db', 62 | contexts=contexts_r, phrases=phrases_r) 63 | 64 | print(len(negative_samples)) 65 | print('Writing started..') 66 | negative_sampling.write_negative_samples_with_positive_samples(positive_negatives=negative_samples, 67 | path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/negative_samples_with_positives_new.tsv') 68 | ''' 69 | ''' ''' 70 | # creating candidates 71 | contexts_r, phrases_r = negative_sampling.read_samples('/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/positive_samples_new.tsv') 72 | print('positive samples are read..') 73 | negative_samples = negative_sampling.create_candidates(urls_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/intersection_nodes_lookup.db', 74 | contexts=contexts_r, phrases=phrases_r) 75 | 76 | print(len(negative_samples)) 77 | print('Writing started..') 78 | negative_sampling.write_negative_samples_with_positive_samples(positive_negatives=negative_samples, 79 | path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/candidates/candidate1_big.tsv') 80 | 81 | 82 | ''' 83 | # get statistics 84 | positive_negatives = negative_sampling.\ 85 | 
read_negative_samples_with_positive_samples(path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/candidates/candidate1.tsv') 86 | print(get_statistics_true_url(positive_negatives, urls_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/intersection_nodes_lookup.db')) 87 | ''' 88 | 89 | # check samples 90 | #positive_negatives, count = negative_sampling.\ 91 | # read_negative_samples_with_positive_samples(path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/negative_samples_with_positives.tsv') 92 | 93 | #print(len(positive_negatives), count) 94 | ''' 95 | # closest sampling 96 | positive_negatives = negative_sampling.\ 97 | read_negative_samples_with_positive_samples(path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/negative_samples_with_positives.tsv') 98 | filtered_samples = negative_sampling. \ 99 | filter_negative_samples_closest_with_scores(positives_negatives=positive_negatives, 100 | url_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/intersection_nodes_lookup.db', 101 | pagerank_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/pagerank.db', n=10) 102 | print('starts to write') 103 | negative_sampling.write_negative_samples_with_positive_samples_with_scores(positive_negatives=filtered_samples, 104 | path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/with_scores/negative_samples_filtered_closest_10.tsv') 105 | ''' 106 | ''' 107 | # random sampling 108 | 109 | positive_negatives = negative_sampling.read_negative_samples_with_positive_samples(path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/negative_samples_with_positives_new.tsv') 110 | filtered_samples = negative_sampling.filter_negative_samples_randomly(positives_negatives=positive_negatives, 111 | url_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/intersection_nodes_lookup.db', 112 | n=10) 113 | print('starts to write') 114 | negative_sampling.write_negative_samples_with_positive_samples(positive_negatives=filtered_samples, 115 | path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/negative_samples_filtered_randomly_10_big.tsv') 116 | 117 | ''' 118 | ''' 119 | # completely random 120 | contexts_r, phrases_r = negative_sampling.read_samples('/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/positive_samples.tsv') 121 | samples = negative_sampling.create_completely_random(urls_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/intersection_nodes_lookup.db', 122 | contexts=contexts_r, phrases=phrases_r, n=5) 123 | 124 | print('starts to write') 125 | negative_sampling.write_negative_samples_with_positive_samples(positive_negatives=samples, 126 | path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/negative_samples_completely_random_5.tsv') 127 | ''' 128 | ''' 129 | # closest sampling with scores and similarity 130 | positive_negatives = negative_sampling.\ 131 | read_negative_samples_with_positive_samples(path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/negative_samples_with_positives_new.tsv') 132 | 133 | sims_scores = negative_sampling.get_negative_samples_similarity_and_scores(positives_negatives=positive_negatives, 134 | url_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/intersection_nodes_lookup.db', 135 | graphembed='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/nodes.embeddings', 136 | pagerank_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/pagerank.db') 137 | 138 | 
print('starts to write') 139 | negative_sampling.write_negative_samples_with_positive_samples_with_scores(positive_negatives=sims_scores, 140 | path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/with_scores/negative_samples_sims_scores_new.tsv') 141 | ''' 142 | ''' 143 | # prune closest 144 | 145 | positive_negatives = negative_sampling.read_negative_samples_with_positive_samples_with_scores(path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/with_scores/negative_samples_sims_scores_new.tsv') 146 | print('positive_negatives is read') 147 | pruned_samples = negative_sampling.prune_most_closest(positives_negatives=positive_negatives, n=10) 148 | print('samples is pruned') 149 | negative_sampling.write_negative_samples_with_positive_samples(positive_negatives=pruned_samples, 150 | path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/negative_samples_filtered_closest_pruned_10_big.tsv') 151 | ''' 152 | 153 | 154 | def ttl2csv(list_of_paths, write_path): 155 | for input_ttl_fpath in list_of_paths: 156 | in_ttl = codecs.open(input_ttl_fpath, "r", "utf-8") 157 | 158 | input_ttl = in_ttl.read() 159 | graph, contexts, phrases = negative_sampling.parse_d2kb_ttl(input_ttl) 160 | 161 | print(phrases) 162 | print(contexts) 163 | 164 | negative_sampling.write_positive_samples(contexts=contexts, phrases=phrases, 165 | path=write_path) 166 | 167 | 168 | input_ttl_fpaths = ["/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/dbpedia-spotlight-nif.ttl", 169 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/kore50-nif.ttl", 170 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/Reuters-128.ttl"] 171 | 172 | new_input_ttl_fpaths = ["/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/RSS-500.ttl", 173 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl"] 174 | 175 | #ttl2csv(new_input_ttl_fpaths, 176 | # write_path="/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/positive_samples_new.tsv") 177 | 178 | -------------------------------------------------------------------------------- /supervised/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==0.12.0 2 | gensim==3.6.0 3 | nltk==3.3 4 | numpy==1.15.2 5 | sqlitedict==1.6.0 6 | rdflib==4.2.2 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /test_supertagger.py: -------------------------------------------------------------------------------- 1 | from diffbot_api import entity_link 2 | 3 | r = entity_link("Michael Jeffrey Jordan, also known by his initials, MJ, is an American former professional basketball player. 
He played 15 seasons in the National Basketball Association for the Chicago Bulls and Washington Wizards.") 4 | print(r) -------------------------------------------------------------------------------- /tests/baseline_linker_dbpedia_test.py: -------------------------------------------------------------------------------- 1 | from linkers.baseline import BaselineLinker 2 | from candidate import Phrase 3 | from pandas import read_csv 4 | 5 | dataset_fpath = "../datasets/dbpedia.tsv" 6 | 7 | df = read_csv(dataset_fpath, sep="\t", encoding="utf-8") 8 | bl = BaselineLinker() 9 | 10 | for i, row in df.iterrows(): 11 | phrases = [Phrase(phrase.strip(), 1, len(phrase.strip()), "http://" + phrase.strip()) 12 | for phrase in row.targets.split(",")] 13 | 14 | print("\n\n{}\n".format(row.context)) 15 | 16 | for phrase, candidate in bl.link(row.context, phrases): 17 | link = candidate.link if candidate else "" 18 | print(phrase.text, link) -------------------------------------------------------------------------------- /tests/baseline_linker_test.py: -------------------------------------------------------------------------------- 1 | from linkers.baseline import BaselineLinker 2 | from candidate import Phrase 3 | 4 | context = "San Francisco said the visit would serve as a cornerstone for future interaction between players and coaches from the Nets and young Russians, with the aim of developing basketball in Russia, where the sport is a distant third in popularity behind soccer and hockey." 5 | phrases = "San Francisco" 6 | 7 | phrases = [Phrase(phrase.strip(), 0, len(phrase.strip()), "http://" + phrase.strip()) 8 | for phrase in phrases.split(",")] 9 | bl = BaselineLinker() 10 | 11 | for phrase, candidate in bl.link(context, phrases): 12 | print(phrase.text, candidate) -------------------------------------------------------------------------------- /tests/dense_linker_test.py: -------------------------------------------------------------------------------- 1 | from linkers.dense import DenseLinker 2 | from candidate import make_phrases 3 | 4 | # embeddings_fpath = "../data/wiki-news-300d-1M.vec" 5 | embeddings_fpath = "../data/crawl-300d-2M.vec" 6 | 7 | dataset_fpaths = ["../datasets/dbpedia.ttl.phrases.tsv", 8 | "../datasets/kore50.ttl.phrases.tsv", 9 | "../datasets/n3-reuters-128.ttl.phrases.tsv"] 10 | 11 | l = DenseLinker("../data/count-stopwords-10", embeddings_fpath, stop_words=True, tfidf=False) 12 | l.train(dataset_fpaths) 13 | 14 | context = "Madonna is a great music signer and lives near West Holywood in LA. adonna Louise Ciccone (/tʃɪˈkoʊni/; born August 16, 1958) is an American singer, songwriter, actress, and businesswoman. Referred to as the Queen of Pop since the 1980s, Madonna is known for pushing the boundaries of lyrical content in mainstream popular music, as well as visual imagery in music videos and on stage. She has also frequently reinvented both her music and image while maintaining autonomy within the recording industry. 
Besides sparking controversy, her works have bee " 15 | phrases = ["Madonna"] 16 | 17 | linked_phrases = l.link(context, make_phrases(phrases)) 18 | print(linked_phrases) 19 | -------------------------------------------------------------------------------- /tests/diffbot_api_test.py: -------------------------------------------------------------------------------- 1 | from diffbot_api import query_and_save, ENTITY_TYPES 2 | 3 | query_and_save('allUris:"barackobama.com"', "data/all-uris.json") 4 | query_and_save('wikipediaUri:"en.wikipedia.org/wiki/Barack_Obama"', "data/wiki-uri.json") 5 | query_and_save('allUris:"en.wikipedia.org/wiki/Barack\_Obama"', "data/all-uris-wiki.json") 6 | query_and_save('origins:"en.wikipedia.org/wiki/Barack_Obama"', "data/origins.json") 7 | 8 | for entity_type in ENTITY_TYPES: 9 | query_and_save( 10 | query='type:{}'.format(entity_type), 11 | output_fpath="data/{}.json".format(entity_type)) 12 | 13 | query_and_save( 14 | query='type:Person name:"Alexander Panchenko"', 15 | output_fpath="data/ap.json") 16 | 17 | 18 | query_and_save( 19 | query='type:Person employments.employer.name:"Diffbot"', 20 | output_fpath="data/diffbot-employees.json") 21 | 22 | 23 | query_and_save( 24 | query='type:Person employments.{title:"CEO" employer.name:"Diffbot"}', 25 | output_fpath="data/diffbot-ceo.json") 26 | 27 | query_and_save( 28 | query='type:Person employments.{employer.name:"Diffbot" isCurrent:true}', 29 | output_fpath="data/diffbot-current-employees.json") 30 | 31 | query_and_save( 32 | query='type:Person name:"Angela Merkel"', 33 | output_fpath="data/am.json") 34 | 35 | query_and_save( 36 | query='type:Person name:"Barack Obama"', 37 | output_fpath="data/bo.json") 38 | 39 | query_and_save( 40 | query='type:Person name:"Nicolas Sarkozy"', 41 | output_fpath="data/ns.json") 42 | 43 | query_and_save( 44 | query='type:Person name:"Diego Maradona"', 45 | output_fpath="data/dm.json") 46 | -------------------------------------------------------------------------------- /tests/score_test.py: -------------------------------------------------------------------------------- 1 | # inside the link function 2 | 3 | cds = [] 4 | for i, candidate in enumerate(candidates): 5 | cds.append((scores[i][0], candidate.name, 6 | sims.toarray()[i][0], overlap_scores[i][0])) 7 | 8 | for c_score, c_name, c_sim, c_overlap in sorted(cds, reverse=True): 9 | print("- {} {:.2f} {:.2f} {:.2f}".format(c_name, c_score, c_sim, c_overlap)) 10 | -------------------------------------------------------------------------------- /tests/sparse_linker_single_test.py: -------------------------------------------------------------------------------- 1 | from linkers.sparse import SparseLinker 2 | from candidate import make_phrases 3 | 4 | 5 | dataset_fpaths = ["../datasets/singleton.tsv"] 6 | 7 | sl = SparseLinker("../data/single5") 8 | # sl.train(dataset_fpaths) 9 | context = "Richard Stallman, often known by his initials, rms — is an American free software movement activist and programmer. He campaigns for software to be distributed in a manner such that its users receive the freedoms to use, study, distribute and modify that software." 
10 | phrases = ["Richard Stallman"] 11 | linked_phrases = sl.link(context, make_phrases(phrases)) 12 | print(linked_phrases) 13 | 14 | context = "Linus Benedict Torvalds (/ˈliːnəs ˈtɔːrvɔːldz/;[5] Swedish: [ˈliːn.ɵs ˈtuːr.valds] (About this sound listen); born December 28, 1969) is a Finnish-American software engineer[2][6] who is the creator, and historically, the principal developer of the Linux kernel, which became the kernel for operating systems such as the Linux operating systems, Android, and Chrome OS." 15 | phrases = ["Linus Torvalds"] 16 | linked_phrases = sl.link(context, make_phrases(phrases)) 17 | print(linked_phrases) -------------------------------------------------------------------------------- /tests/sparse_linker_test.py: -------------------------------------------------------------------------------- 1 | from linkers.sparse import SparseLinker 2 | from candidate import make_phrases 3 | 4 | dataset_fpaths = ["../datasets/dbpedia.ttl.phrases.tsv", 5 | "../datasets/kore50.ttl.phrases.tsv", 6 | "../datasets/n3-reuters-128.ttl.phrases.tsv"] 7 | 8 | dataset_fpaths = ["../datasets/test.phrases.tsv"] 9 | 10 | def profiling(function): 11 | import cProfile 12 | import pstats 13 | from io import StringIO 14 | pr = cProfile.Profile() 15 | pr.enable() 16 | 17 | function() 18 | 19 | pr.disable() 20 | s = StringIO() 21 | sortby = 'cumulative' 22 | ps = pstats.Stats(pr, stream=s).sort_stats(sortby) 23 | ps.print_stats() 24 | print(s.getvalue()) 25 | 26 | 27 | 28 | sl = SparseLinker("../data/count-stopwords-test2-related", stop_words=True, tfidf=False, related_entities=True) 29 | profiling(lambda: sl.train(dataset_fpaths)) 30 | 31 | 32 | context = "Madonna is a great music signer and lives near West Holywood in LA. adonna Louise Ciccone (/tʃɪˈkoʊni/; born August 16, 1958) is an American singer, songwriter, actress, and businesswoman. Referred to as the Queen of Pop since the 1980s, Madonna is known for pushing the boundaries of lyrical content in mainstream popular music, as well as visual imagery in music videos and on stage. She has also frequently reinvented both her music and image while maintaining autonomy within the recording industry. Besides sparking controversy, her works have bee " 33 | phrases = ["Madonna"] 34 | linked_phrases = sl.link(context, make_phrases(phrases)) 35 | print(linked_phrases) 36 | 37 | context = "Richard Stallman, often known by his initials, rms — is an American free software movement activist and programmer. He campaigns for software to be distributed in a manner such that its users receive the freedoms to use, study, distribute and modify that software." 38 | phrases = ["Richard Stallman"] 39 | linked_phrases = sl.link(context, make_phrases(phrases)) 40 | print(linked_phrases) 41 | 42 | context = "Linus Benedict Torvalds (/ˈliːnəs ˈtɔːrvɔːldz/;[5] Swedish: [ˈliːn.ɵs ˈtuːr.valds] (About this sound listen); born December 28, 1969) is a Finnish-American software engineer[2][6] who is the creator, and historically, the principal developer of the Linux kernel, which became the kernel for operating systems such as the Linux operating systems, Android, and Chrome OS." 
43 | phrases = ["Linus Torvalds"] 44 | linked_phrases = sl.link(context, make_phrases(phrases)) 45 | print(linked_phrases) 46 | -------------------------------------------------------------------------------- /tests/supertagger_test.py: -------------------------------------------------------------------------------- 1 | from linkers.supertagger import SuperTagger 2 | from candidate import Phrase 3 | import codecs 4 | 5 | 6 | st = SuperTagger() 7 | 8 | 9 | def make_positional_phrases(word_beg_ends): 10 | phrases = [] 11 | for word, beg, end in word_beg_ends: 12 | phrases.append(Phrase(word, beg, end, "http://www.{}.com".format(word))) 13 | return phrases 14 | 15 | request_fpath = "../data/supertagger-1529250101365435-request.ttl" 16 | with codecs.open(request_fpath, "r", "utf-8") as ttl: 17 | input_ttl = ttl.read() 18 | 19 | output_ttl = st.link_ttl(input_ttl) 20 | with codecs.open(request_fpath + ".response", "w", "utf-8") as ttl: 21 | ttl.write(output_ttl) 22 | 23 | context = "Prokhorov said the visit would serve as a cornerstone for future interaction between players and coaches from the Nets and young Russians, with the aim of developing basketball in Russia, where the sport is a distant third in popularity behind soccer and hockey." 24 | phrases = make_positional_phrases([["Russia", 180, 186], 25 | ["sport", 198, 203], 26 | ["basketabll", 166, 176], 27 | ["Russians", 129, 137], 28 | ["Prokhorov", 0, 9]]) 29 | linked_phrases = st.link(context, phrases) 30 | print(linked_phrases) 31 | 32 | context = "Madonna is a great music signer and lives near West Holywood in Los Angeles. adonna Louise Ciccone (/tʃɪˈkoʊni/; born August 16, 1958) is an American singer, songwriter, actress, and businesswoman. Referred to as the Queen of Pop since the 1980s, Madonna is known for pushing the boundaries of lyrical content in mainstream popular music, as well as visual imagery in music videos and on stage. She has also frequently reinvented both her music and image while maintaining autonomy within the recording industry. 
Besides sparking controversy, her works have bee " 33 | phrases = [Phrase("Madonna", 0, 6, "http://madonna.com"), 34 | Phrase("West Holywood", 48, 62, "http://westholy.com"), 35 | Phrase("Los Angeles", 65, 76, "http://la.com")] 36 | 37 | linked_phrases = st.link(context, phrases) 38 | print(linked_phrases) 39 | 40 | -------------------------------------------------------------------------------- /tests/supervised/preprocess/prepro_util_test.py: -------------------------------------------------------------------------------- 1 | from supervised.preprocess.prepro_util import * 2 | from supervised.preprocess.util import load_url2graphid 3 | from supervised.negative_sampling import parse_d2kb_ttl 4 | 5 | ''' 6 | 7 | generator = InputSamplesGenerator() 8 | samples = generator.process('/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/dbpedia-spotlight-nif.ttl', ttl=True) 9 | not_include = 0 10 | total = 0 11 | except_ = 0 12 | for sample in samples: 13 | chunk_id, chunk_words, entity, begin_gm, end_gm, ground_truth, cand_entities, cand_entities_scores = sample 14 | 15 | for index in range(len(entity)): 16 | try: 17 | print(entity[index], ground_truth[index], cand_entities[index]) 18 | if int(ground_truth[index]) not in cand_entities[index]: 19 | not_include += 1 20 | except: 21 | except_ += 1 22 | total += 1 23 | 24 | 25 | print(not_include, total, except_) 26 | ''' 27 | # len phrase 660 - spotlight 28 | # 288 - kore50-nif 29 | # 880 - Reuters-128 30 | # 1000 - RSS-500 31 | # 1655 - News-100 32 | def test_chunker_parse_d2kb(): 33 | chunker = Chunker() 34 | 35 | input_ttl_fpath = "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl" 36 | in_ttl = codecs.open(input_ttl_fpath, "r", "utf-8") 37 | 38 | input_ttl = in_ttl.read() 39 | 40 | _, contexts, phrases = chunker.parse_d2kb_ttl(input_ttl) 41 | print('CONT:', len(contexts.keys()), 'PHR:', len(phrases.keys())) 42 | 43 | for key in contexts.keys(): 44 | if key not in phrases.keys(): 45 | print(key) 46 | 47 | for key in phrases.keys(): 48 | if key not in contexts.keys(): 49 | print(key) 50 | 51 | if contexts.keys() == phrases.keys(): 52 | print("YESSS") 53 | 54 | _, contexts_, phrases_ = parse_d2kb_ttl(input_ttl) 55 | 56 | if len(set(phrases_)) != len(phrases): 57 | print('len original:', len(set(phrases_)) , 'len chunker:', len(phrases.keys())) 58 | 59 | if len(contexts) != len(contexts_): 60 | print('len original:', len(contexts_), 'len chunker:', len(contexts)) 61 | 62 | if set(contexts_.keys()).difference(set(contexts.keys())): 63 | print('not the same context keys') 64 | 65 | contexts_keys = contexts.keys() 66 | len_phrase = 0 67 | for context in contexts_keys: 68 | try: 69 | phrase_contexts = phrases[context] 70 | for phrase in phrase_contexts: 71 | span, beg, end, ind_ref = phrase 72 | if (span, beg, end, context, ind_ref) not in phrases_: 73 | print((span, beg, end, context, ind_ref)) 74 | return 75 | len_phrase += len(phrase_contexts) 76 | except KeyError: 77 | # only one context ref in spotlight, the problem in the dataset! 
78 | # http://www.nytimes.com/2010/10/11/arts/design/11chaos.html?ref=arts_sentence2 79 | print('KEY ERROR:', context) 80 | print(len_phrase) 81 | 82 | 83 | # number_phrases = 608 + 52 ground truth error - spotlight 84 | # number_phrases = 254 + 34 ground truth error - kore50-nif 85 | # number_phrases = 562 + 318 ground truth error - Reuters-128 86 | # number_phrases = 462 + 538 ground truth error - RSS-500 87 | # number_phrases = 32 + 1623 ground truth error - News-100 88 | def test_process_ttl(): 89 | chunker = Chunker() 90 | 91 | input_ttl_fpath = "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl" 92 | url2graphid = load_url2graphid() 93 | count = 0 94 | number_phrases = 0 95 | for chunk in chunker.process_ttl(input_ttl_fpath, url2graphid): 96 | #print(chunk) 97 | chunk_id, chunk_words, begin_gm, end_gm, ground_truth = chunk 98 | number_phrases += len(begin_gm) 99 | count += 1 100 | print(count, number_phrases) 101 | 102 | 103 | # 608 11366 - dbpedia-spotlight-nifspotlight 104 | # 254 6180 - kore50-nif 105 | # 562 7474 - Reuters-128 106 | # 462 6389 - RSS-500 107 | # 32 46 - News-100 108 | def test_chunk2sample(): 109 | input_generator = InputSamplesGenerator() 110 | 111 | input_ttl_fpath = "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl" 112 | url2graphid = load_url2graphid() 113 | 114 | number_cand = 0 115 | number_phrases = 0 116 | 117 | 118 | for chunk in input_generator.chunker.process_ttl(input_ttl_fpath, url2graphid): 119 | chunk_id, chunk_words, begin_gm, end_gm, ground_truth, \ 120 | cand_entities, cand_entities_scores = input_generator.chunk2sample(chunk) 121 | 122 | if len(begin_gm) != len(end_gm) or len(begin_gm) != len(ground_truth) or len(begin_gm) != len(cand_entities): 123 | print(chunk_id, begin_gm, end_gm, ground_truth, cand_entities, cand_entities_scores) 124 | number_phrases += len(begin_gm) 125 | 126 | for index in range(len(begin_gm)): 127 | candidates = cand_entities[index] 128 | number_cand += len(candidates) 129 | 130 | print(number_phrases, number_cand) 131 | 132 | 133 | # 57 - dbpedia-spotlight-nifspotlight 134 | # 50 - kore50-nif 135 | # 107 - Reuters-128 136 | # 334 - RSS-500 137 | # 14 - News-100 138 | def test_InputSampleGenerate_process(): 139 | input_generator = InputSamplesGenerator() 140 | input_ttl_fpath = "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl" 141 | number_samples = 0 142 | for sample in input_generator.process(input_ttl_fpath, ttl=True): 143 | number_samples += 1 144 | 145 | print(number_samples) 146 | 147 | print('Called') 148 | #test_chunker_parse_d2kb() 149 | #test_process_ttl() 150 | #test_chunk2sample() 151 | test_InputSampleGenerate_process() 152 | print('Finished') 153 | input_ttl_fpaths = ["/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/dbpedia-spotlight-nif.ttl", 154 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/kore50-nif.ttl", 155 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/Reuters-128.ttl", 156 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/RSS-500.ttl", 157 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl"] -------------------------------------------------------------------------------- /tests/supervised/preprocess/util_test.py: -------------------------------------------------------------------------------- 1 | from supervised.preprocess.util import FetchFilteredCoreferencedCandEntities, load_url2graphid 2 | from nltk.tokenize import 
word_tokenize 3 | from supervised.negative_sampling import parse_d2kb_ttl 4 | import codecs 5 | 6 | 7 | # not_match_entity = 13, total = 331 spotlight 8 | # not_match_entity = 22, total = 1000 rss-500 9 | # not_match_entity = 57, total = 880 reuters 10 | # not_match_entity = 3, total = 144 kore50-nif 11 | # not_match_entity = 75, total = 1655 News-100 12 | def test_index_span(): 13 | 14 | context = "In the first study, intended to measure a person’s short-term emotional reaction to gossiping, " \ 15 | "140 men and women, primarily undergraduates, were asked to talk about a fictional person either " \ 16 | "positively or negatively." 17 | beg = 124 18 | end = 138 19 | entity = 'undergraduates' 20 | chunk_words = word_tokenize(context) 21 | left = chunk_words.index(entity) 22 | right = left + len(word_tokenize(entity)) 23 | print(' '.join(chunk_words[left:right])) 24 | 25 | input_ttl_fpath = "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl" 26 | in_ttl = codecs.open(input_ttl_fpath, "r", "utf-8") 27 | 28 | input_ttl = in_ttl.read() 29 | graph, contexts, phrases = parse_d2kb_ttl(input_ttl) 30 | phrases = set(phrases) 31 | 32 | not_match_entity = 0 33 | for phrase in phrases: 34 | entity, beg, end, ref_context, url = phrase 35 | 36 | try: 37 | context = contexts[ref_context] 38 | except KeyError: 39 | print('KeyError', phrase) 40 | 41 | chunk_words = word_tokenize(context) 42 | 43 | try: 44 | left = chunk_words.index(entity) 45 | right = left + len(word_tokenize(entity)) 46 | except ValueError: 47 | left = len(word_tokenize(context[:beg])) 48 | right = len(word_tokenize(context[:end])) 49 | 50 | span_text = ' '.join(chunk_words[left:right]) 51 | 52 | if span_text != entity: 53 | print('ERROR:', 'span:', span_text, 'entity:', entity, 'beg-end:', context[beg:end], 'context:', context) 54 | not_match_entity += 1 55 | 56 | print(not_match_entity, len(phrases)) 57 | 58 | 59 | # - nones: 14 - # of phrases: 331 331 - not include: 12 - # of total cand: 6124 - # of except 26 - spotlight 60 | # - nones: 3 - # of phrases: 144 144 - not include: 8 - # of total cand: 3498 - # of except 17 - kore50-nif 61 | # - nones: 318 - # of phrases: 880 880 - not include: 42 - # of total cand: 9622 - # of except 318 - Reuters-128 62 | # - nones: 359 - # of phrases: 1000 1000 - not include: 48 - # of total cand: 9485 - # of except 538 - RSS-500 63 | # - nones: 309 - # of phrases: 1655 1655 - not include: 18 - # of total cand: 20772 - # of except 1623 - News-100 64 | def test_cand_list(): 65 | fetch_filtered_entities = FetchFilteredCoreferencedCandEntities() 66 | url2graphid = load_url2graphid() 67 | 68 | input_ttl_fpath = "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl" 69 | in_ttl = codecs.open(input_ttl_fpath, "r", "utf-8") 70 | 71 | input_ttl = in_ttl.read() 72 | graph, contexts, phrases = parse_d2kb_ttl(input_ttl) 73 | phrases = set(phrases) 74 | 75 | count_nones = 0 76 | count = 0 77 | count_not_include = 0 78 | cand_number = 0 79 | key_error = 0 80 | for phrase in phrases: 81 | entity, beg, end, ref_context, url = phrase 82 | try: 83 | id = url2graphid[url] 84 | except KeyError: 85 | id = -1 86 | key_error += 1 87 | try: 88 | context = contexts[ref_context] 89 | except KeyError: 90 | print('KeyError', phrase) 91 | 92 | chunk_words = word_tokenize(context) 93 | 94 | try: 95 | left = chunk_words.index(entity) 96 | right = left + len(word_tokenize(entity)) 97 | except ValueError: 98 | left = len(word_tokenize(context[:beg])) 99 | right = 
len(word_tokenize(context[:end])) 100 | 101 | cand, score = fetch_filtered_entities.process(left, right, chunk_words) 102 | 103 | if cand is None: 104 | count_nones += 1 105 | else: 106 | cand_number += len(cand) 107 | 108 | if cand is not None and id != -1: 109 | if int(id) not in cand: 110 | count_not_include += 1 111 | 112 | count += 1 113 | 114 | print(' - nones:', count_nones, ' - # of phrases:', count, len(phrases), ' - not include:', count_not_include, 115 | ' - # of total cand:', cand_number, ' - # of except', key_error) 116 | 117 | 118 | #test_index_span() 119 | test_cand_list() 120 | 121 | input_ttl_fpaths = ["/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/dbpedia-spotlight-nif.ttl", 122 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/kore50-nif.ttl", 123 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/Reuters-128.ttl", 124 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/RSS-500.ttl", 125 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl"] -------------------------------------------------------------------------------- /tests/test_construct_dict.py: -------------------------------------------------------------------------------- 1 | import construct_graph 2 | import preprocess 3 | 4 | 5 | # for subgraph 6 | graph = construct_graph.Graph(logfile='../datasets/subset/construct_graph.log') 7 | subnodes = preprocess.read_list(path='../datasets/subset/1000_nodelist_url.txt') 8 | 9 | graph.create_nodes_from_db(longabsdb_path='../datasets/subset/1000_long_abstracts.db', 10 | labelsdb_path='../datasets/subset/1000_labels.db', 11 | lookupdb_path='../datasets/subset/1000_nodes_lookup.db', 12 | subnodes=subnodes) 13 | print('nodes created..') 14 | 15 | edges = preprocess.read_edges(path='../datasets/subset/1000_edgelist.txt') 16 | print('edges are read...') 17 | 18 | graph.create_edges_from_list(edges=edges) 19 | print('edges are created...') 20 | 21 | graph.write_graph(path='../datasets/subset/1000_graph_sub.gpickle') 22 | print('graph is written...') 23 | 24 | graph.draw() 25 | 26 | -------------------------------------------------------------------------------- /tests/test_data_helper.py: -------------------------------------------------------------------------------- 1 | import data_helper 2 | 3 | # look up 4 | data_helper.create_dictdb_from_file(file_path='../datasets/subset/1000_nodes_lookup.txt', 5 | db_path='../datasets/subset/1000_nodes_lookup__.db') 6 | -------------------------------------------------------------------------------- /tests/ttl_test.py: -------------------------------------------------------------------------------- 1 | from ttl import parse_d2kb_ttl 2 | import codecs 3 | 4 | input_ttl_fpaths = ["../datasets/kore50.ttl", "../datasets/n3-reuters-128.ttl", "../datasets/dbpedia.ttl"] 5 | 6 | for input_ttl_fpath in input_ttl_fpaths: 7 | in_ttl = codecs.open(input_ttl_fpath, "r", "utf-8") 8 | phrases_fpath = input_ttl_fpath + ".phrases.tsv" 9 | contexts_fpath = input_ttl_fpath + ".contexts.tsv" 10 | 11 | phrases_ttl = codecs.open(phrases_fpath, "w", "utf-8") 12 | phrases_ttl.write("targets\tcontexts\n") 13 | 14 | contexts_ttl = codecs.open(contexts_fpath, "w", "utf-8") 15 | contexts_ttl.write("targets\tcontexts\n") 16 | 17 | input_ttl = in_ttl.read() 18 | graph, contexts, phrases = parse_d2kb_ttl(input_ttl) 19 | 20 | for phrase in phrases: 21 | phrases_ttl.write("{}\t \n".format(phrase.text)) 22 | 23 | for context in contexts: 24 | contexts_ttl.write(" 
\t{}\n".format(context)) 25 | 26 | in_ttl.close() 27 | phrases_ttl.close() 28 | contexts_ttl.close() 29 | 30 | print("Output:", phrases_fpath) 31 | print("Output:", contexts_fpath) -------------------------------------------------------------------------------- /tmp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 35, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "name:\"New York\"\n4\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "from diffbot_api import CachedQuery\n", 20 | "import json \n", 21 | "import codecs \n", 22 | "\n", 23 | "\n", 24 | "cq = CachedQuery()\n", 25 | "for i, query in enumerate(cq._cache):\n", 26 | " db_entity = json.loads(cq._cache[query].content)\n", 27 | " print(query)\n", 28 | " print(len(db_entity))\n", 29 | " \n", 30 | " with codecs.open(\"/Users/panchenko/Desktop/{}.json\".format(query), \"w\", \"utf-8\") as out:\n", 31 | " out.write(json.dumps(db_entity))\n", 32 | " \n", 33 | " break" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 34, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "# for hit in db_entity[\"data\"]:\n", 45 | "# for k in hit:\n", 46 | "# print(k)\n", 47 | "# break" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 40, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "37456900.0\n35052100.0\n17384900.0\n12499500.0\n10722700.0\n9986700\n7824100\n7290000\n6874600\n6252400\n5308100\n4715200\n4680200\n4586800\n4501900\n4477000\n4432100\n4327100\n4172200\n4034500\n3650100\n3161500\n3146300\n3028400\n2934900\n2922200\n2802000\n2639900\n2522500\n2315200\n2294500\n2243600\n2173900\n2137200\n2109700\n2087500\n2080400\n2023800\n2007600\n1929400\n1864800\n1860900\n1859700\n1856800\n1786600\n1785600\n1744700\n1690400\n1678900\n1645300\n\n 0 New York City is part of {'name': 'United States of America', 'diffbotUri': 'http://diffbot.com/entity/AcZTRPXDrY9', 'targetDiffbotUri': 'http://diffbot.com/entity/AcZTRPXDrY9', 'type': 'AdministrativeArea'}\n" 62 | ] 63 | }, 64 | { 65 | "ename": "Exception", 66 | "evalue": "", 67 | "traceback": [ 68 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 69 | "\u001b[0;31mException\u001b[0m Traceback (most recent call last)", 70 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mrootId\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mroot\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"diffbotUri\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhit\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"is part of\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mroot\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n", 71 | "\u001b[0;31mException\u001b[0m: " 72 | ], 73 | "output_type": "error" 74 | } 75 | ], 76 | "source": [ 77 | "cq = CachedQuery()\n", 78 | "for i, query in enumerate(cq._cache):\n", 79 | " db_entity = json.loads(cq._cache[query].content)\n", 80 | " if \"data\" not in db_entity: continue\n", 81 | " \n", 82 | " for hit in db_entity[\"data\"]:\n", 83 | " for field_name in hit:\n", 84 | " if field_name == \"importance\": \n", 85 | " print(hit[\"importance\"])\n", 86 | " if field_name == \"isPartOf\":\n", 87 | " for i, root in enumerate(hit[\"isPartOf\"]):\n", 88 | " if \"diffbotUri\" in root:\n", 89 | " rootId = root[\"diffbotUri\"]\n", 90 | " print(\"\\n\", i, hit[\"name\"], \"is part of\", root)\n", 91 | " raise Exception()\n", 92 | " \n", 93 | " " 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 8, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "defaultdict(. at 0x104c55d90>, {'founders': ['http://diffbot.com/entity/PKuADpLXgMS']}) \n\nhttp://diffbot.com/entity/OCK8zsXiAVy\ndefaultdict(. at 0x104e850d0>, {'founders': ['http://diffbot.com/entity/PN+XIqH03xf'], 'ceo': ['http://diffbot.com/entity/PNRyBHTdnbx']}) \n\nhttp://diffbot.com/entity/CHjULiDhdyX\ndefaultdict(. at 0x1027718c8>, {'founders': ['http://diffbot.com/entity/PQGFYHeZhLH']}) \n\nhttp://diffbot.com/entity/EZmGU5Kh0KB\nhttp://diffbot.com/entity/Ozw6gU5AsJc\ndefaultdict(. at 0x102771d08>, {'founders': ['http://diffbot.com/entity/P9iQ6uiD5to']}) \n\nhttp://diffbot.com/entity/ON6yeCKuGnm\nhttp://diffbot.com/entity/OAbB86ZLYEH\nhttp://diffbot.com/entity/Ox_Hnd_7WEr\nhttp://diffbot.com/entity/OIZzlT1rihy\ndefaultdict(. at 0x105629378>, {'founders': ['http://diffbot.com/entity/PqXwyAswiIv']}) \n\nhttp://diffbot.com/entity/E9hzKNQUiTC\nhttp://diffbot.com/entity/EuXdxsjCRjg\nhttp://diffbot.com/entity/OWeqj9aprzB\ndefaultdict(. at 0x1056298c8>, {'founders': ['http://diffbot.com/entity/P1ejNzclrxY']}) \n\nhttp://diffbot.com/entity/O8zTfY2Tp_F\nhttp://diffbot.com/entity/OVX1ErF6X53\nhttp://diffbot.com/entity/E3r36BkD5tg\ndefaultdict(. at 0x104c552f0>, {'founders': ['http://diffbot.com/entity/PqBPxbbKUwG', 'http://diffbot.com/entity/PfNzLu47VeG']}) \n\nhttp://diffbot.com/entity/ONvHRir0UFC\ndefaultdict(. at 0x105614048>, {'founders': ['http://diffbot.com/entity/PsshDYuCF33']}) \n\nhttp://diffbot.com/entity/OVSJYDFkcq+\nhttp://diffbot.com/entity/O8VFLGMbcQL\ndefaultdict(. at 0x105614400>, {'founders': ['http://diffbot.com/entity/PwXUa7FvRPX', 'http://diffbot.com/entity/PjV1xV8x05B', 'http://diffbot.com/entity/PkIbZofedHb']}) \n\nhttp://diffbot.com/entity/Ox7OKybcRL7\nhttp://diffbot.com/entity/C6i+B13u3sC\nhttp://diffbot.com/entity/O+YAZiSQc9+\ndefaultdict(. at 0x1056148c8>, {'founders': ['http://diffbot.com/entity/P1ejNzclrxY']}) \n\nhttp://diffbot.com/entity/OXk7mZS+Pb1\ndefaultdict(. at 0x105614bf8>, {'founders': ['http://diffbot.com/entity/P0x0Tt66MLe', 'http://diffbot.com/entity/P1Cr1u7J9Lp']}) \n\nhttp://diffbot.com/entity/O1MpWqFXriE\nhttp://diffbot.com/entity/Oo1LrGpqO1p\nhttp://diffbot.com/entity/OBu5GZWCmV_\nhttp://diffbot.com/entity/OMg6x1ZdAty\nhttp://diffbot.com/entity/OVadPm2y5Nh\nhttp://diffbot.com/entity/EFM+ReMc_Se\ndefaultdict(. at 0x105628400>, {'founders': ['http://diffbot.com/entity/POsi1jTcSJ4']}) \n\nhttp://diffbot.com/entity/EyQuLEzfnQT\nhttp://diffbot.com/entity/OoVdS2h6bvY\nhttp://diffbot.com/entity/Cdng7W7qya1\ndefaultdict(. 
at 0x1056288c8>, {'founders': ['http://diffbot.com/entity/PQUyVUx+GIV']}) \n\nhttp://diffbot.com/entity/OTAOjf66kRw\ndefaultdict(. at 0x105628bf8>, {'founders': ['http://diffbot.com/entity/Ppx+o3WH4IR']}) \n\nhttp://diffbot.com/entity/CJuObvMhCx0\nhttp://diffbot.com/entity/OeFeWnK6gP0\nhttp://diffbot.com/entity/OLcHzKRVGE7\nhttp://diffbot.com/entity/OwneFlJCez3\nhttp://diffbot.com/entity/OWKZeuqQ01f\nhttp://diffbot.com/entity/O_3vnLIY3GN\nhttp://diffbot.com/entity/O_M1cxlmxlv\nhttp://diffbot.com/entity/OXLAYKu5JBi\nhttp://diffbot.com/entity/O0v9KVly3KO\ndefaultdict(. at 0x1056006a8>, {'founders': ['http://diffbot.com/entity/PaNntqxS5JI', 'http://diffbot.com/entity/Pj220nHFvPf'], 'ceo': ['http://diffbot.com/entity/PIHmMtCQuZx']}) \n\nhttp://diffbot.com/entity/C4yiK5ZPo6Z\nhttp://diffbot.com/entity/CYn98uQbrgS\nhttp://diffbot.com/entity/BkWvaONPQIK\nhttp://diffbot.com/entity/O7++6ItrZHG\nhttp://diffbot.com/entity/Cz11YfBBi4K\nhttp://diffbot.com/entity/O+kGQM+V_uD\nhttp://diffbot.com/entity/CImG9pD6qOK\nhttp://diffbot.com/entity/OvT6TRIswQR\nhttp://diffbot.com/entity/Oi9f+wirvLS\nhttp://diffbot.com/entity/OknTVY5fJ12\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "from collections import Counter, defaultdict \n", 113 | " \n", 114 | "\n", 115 | "cq = CachedQuery()\n", 116 | "printed = 0\n", 117 | "\n", 118 | "for i, query in enumerate(cq._cache):\n", 119 | " db_entity = json.loads(cq._cache[query].content)\n", 120 | "\n", 121 | " if \"data\" not in db_entity: continue \n", 122 | " \n", 123 | " for hit in db_entity[\"data\"]:\n", 124 | " \n", 125 | " if len(r) > 0 and printed < 20:\n", 126 | " print(r, \"\\n\")\n", 127 | " printed += 1 \n", 128 | " \n", 129 | " print(uri)\n", 130 | " \n", 131 | " break\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 48, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "[{'diffbotUri': 'http://diffbot.com/entity/AcZTRPXDrY9',\n 'name': 'United States of America',\n 'targetDiffbotUri': 'http://diffbot.com/entity/AcZTRPXDrY9',\n 'type': 'AdministrativeArea'},\n {'diffbotUri': 'http://diffbot.com/entity/AdBDaXfj65G',\n 'name': 'New York',\n 'targetDiffbotUri': 'http://diffbot.com/entity/AdBDaXfj65G',\n 'type': 'AdministrativeArea'},\n {'diffbotUri': 'http://diffbot.com/entity/AZfTRPXDrY9',\n 'name': 'New York City',\n 'targetDiffbotUri': 'http://diffbot.com/entity/AZfTRPXDrY9',\n 'type': 'AdministrativeArea'}]" 152 | ] 153 | }, 154 | "execution_count": 48, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "isinstance(hit[field_name], list)\n", 161 | "hit[field_name]" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 24, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "" 171 | ] 172 | } 173 | ], 174 | "metadata": { 175 | "kernelspec": { 176 | "display_name": "Python 2", 177 | "language": "python", 178 | "name": "python2" 179 | }, 180 | "language_info": { 181 | "codemirror_mode": { 182 | "name": "ipython", 183 | "version": 2 184 | }, 185 | "file_extension": ".py", 186 | "mimetype": "text/x-python", 187 | "name": "python", 188 | "nbconvert_exporter": "python", 189 | "pygments_lexer": "ipython2", 190 | "version": "2.7.6" 191 | } 192 | }, 193 | "nbformat": 4, 194 | "nbformat_minor": 0 195 | } 196 | 
-------------------------------------------------------------------------------- /ttl.py: -------------------------------------------------------------------------------- 1 | import re 2 | from rdflib import URIRef, Graph 3 | import codecs 4 | from candidate import Phrase 5 | 6 | 7 | A = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" 8 | PHRASE = "#Phrase" 9 | CONTEXT = "#Context" 10 | STRING = "#isString" 11 | ANCOR = "#anchorOf" 12 | BEG = "#beginIndex" 13 | END = "#endIndex" 14 | CLASS_URI = URIRef("http://www.w3.org/2005/11/its/rdf#taClassRef") 15 | LINK_URI = URIRef("http://www.w3.org/2005/11/its/rdf#taIdentRef") 16 | NONE_URI = URIRef("http://dbpedia.org/nonsense") 17 | # NONE_URI = URIRef("http://dbpedia.org/page/Thing") 18 | 19 | 20 | class DatasetBuilder(object): 21 | def __init__(self, dataset_fpath): 22 | self._dataset_fpath = dataset_fpath 23 | with codecs.open(self._dataset_fpath, "a", "utf-8") as ttl_f: 24 | ttl_f.write("targets\tcontext\n") 25 | 26 | def add_to_dataset(self, input_ttl): 27 | graph, context, phrases = parse_d2kb_ttl(input_ttl) 28 | with codecs.open(self._dataset_fpath, "a", "utf-8") as ttl_f: 29 | phrases_str = ", ".join(p.text for p in phrases) 30 | ttl_f.write("{}\t{}\n".format(phrases_str, context)) 31 | 32 | 33 | def parse_d2kb_ttl(input_ttl): 34 | g = Graph() 35 | result = g.parse(data=input_ttl, format="n3") 36 | contexts, phrases = get_phrases(g) 37 | 38 | return g, contexts, phrases 39 | 40 | 41 | def get_phrases(g): 42 | """ Collect the context and phrases """ 43 | 44 | contexts = [] 45 | phrases = [] 46 | 47 | for subj, pred, obj in g: 48 | p = str(pred) 49 | s = str(subj) 50 | o = str(obj) 51 | 52 | # catch the context 53 | if o.endswith(CONTEXT): 54 | for pred_s, obj_s in g.predicate_objects(subj): 55 | if pred_s.strip().endswith(STRING): 56 | contexts.append(obj_s) 57 | 58 | # catch the phrases to disambiguate 59 | if o.endswith(PHRASE) or p.endswith(ANCOR): 60 | phrase = "" 61 | end = -1 62 | beg = -1 63 | for pred_s, obj_s in g.predicate_objects(subj): 64 | ps = pred_s.strip() 65 | if ps.endswith(ANCOR): phrase = str(obj_s) 66 | elif ps.endswith(BEG): beg = int(obj_s) 67 | elif ps.endswith(END): end = int(obj_s) 68 | 69 | if phrase == "" or beg == -1 or end == -1: 70 | print("Warning: bad phrase", subj, pred, obj) 71 | else: 72 | phrases.append(Phrase(phrase, beg, end, subj)) 73 | 74 | return contexts, phrases 75 | 76 | 77 | def add_nonsense_response(input_ttl): 78 | graph, context, phrases = parse_d2kb_ttl(input_ttl) 79 | 80 | # add new triples that correspond to the links of the disambiguation links 81 | print("# triples input:", len(graph)) 82 | for phrase in phrases: 83 | graph.add( (phrase.subj, CLASS_URI, NONE_URI) ) 84 | graph.add( (phrase.subj, LINK_URI, NONE_URI) ) 85 | print("# triples output:", len(graph)) 86 | 87 | output_ttl = str(graph.serialize(format='n3', encoding="utf-8"), "utf-8") 88 | 89 | return output_ttl 90 | 91 | 92 | def remove_classref(text): 93 | output = [] 94 | for line in text.split("\n"): 95 | upd_line = re.sub(r"itsrdf:taClassRef <[^;]*> ;", 96 | "itsrdf:taClassRef ;", 97 | line) 98 | output.append(upd_line) 99 | 100 | return "\n".join(output) 101 | 102 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from math import log 2 | import os 3 | from difflib import SequenceMatcher 4 | 5 | 6 | # This is the project root directory assuming that utils.py is in the root directory 7 | ROOT_DIR 
= os.path.dirname(os.path.abspath(__file__)) 8 | 9 | 10 | def ensure_dir(dir_path): 11 | if not os.path.exists(dir_path): os.makedirs(dir_path) 12 | 13 | 14 | def dbpedia2wikipedia(url, to_en=True): 15 | """ Convert a dbpedia to wikipedia url. """ 16 | 17 | url = url.replace("https://", "") 18 | url = url.replace("http://", "") 19 | 20 | if to_en: 21 | wiki_domain = "en.wikipedia.org/wiki/" 22 | else: 23 | wiki_domain = "wikipedia.org/wiki/" 24 | 25 | new_url = url.replace("dbpedia.org/resource/", wiki_domain) 26 | if new_url == url: 27 | new_url = url.replace("dbpedia.org/page/", wiki_domain) 28 | 29 | return new_url 30 | 31 | 32 | def longest_common_substring(s1, s2, lower=True): 33 | if lower: 34 | s1 = s1.lower() 35 | s2 = s2.lower() 36 | 37 | match = SequenceMatcher(None, s1, s2).find_longest_match(0, len(s1), 0, len(s2)) 38 | substring = s1[match.a: match.a + match.size] 39 | 40 | return substring 41 | 42 | 43 | def overlap(s1, s2, lower=True): 44 | direct = longest_common_substring(s1, s2, lower) 45 | inverse = longest_common_substring(s2, s1, lower) 46 | max_overlap = float(max(len(direct), len(inverse))) 47 | if max_overlap < 3: 48 | return 0.0 49 | else: 50 | max_len = float(max(len(s1), len(s2))) 51 | return max_overlap / max_len 52 | 53 | 54 | def truncated_log(x): 55 | if x > 0: return log(x) 56 | else: return 0.0 57 | --------------------------------------------------------------------------------
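A short, hedged usage sketch of the utils.py helpers above. It is illustrative only and not part of the repository; it assumes the script is run from the project root so that `utils` is importable, and the example strings are made up for demonstration.

# Illustrative sketch (assumption: executed from the repository root).
from utils import dbpedia2wikipedia, overlap, truncated_log

# dbpedia2wikipedia() rewrites a DBpedia resource/page URL into a Wikipedia URL.
print(dbpedia2wikipedia("http://dbpedia.org/resource/Barack_Obama"))
# -> en.wikipedia.org/wiki/Barack_Obama

# overlap() scores two strings by their longest common substring, normalised by
# the longer string's length; matches shorter than 3 characters score 0.0.
print(overlap("Barack Obama", "Obama"))   # 5 / 12, roughly 0.42
print(overlap("Obama", "Ob"))             # 0.0, common substring too short

# truncated_log() behaves like log(x) but maps non-positive inputs to 0.0.
print(truncated_log(0))                   # 0.0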