├── .gitignore
├── Diffbot API test.ipynb
├── README.md
├── cache
│   └── __init__.py
├── candidate.py
├── construct_graph.py
├── converter.py
├── data_helper.py
├── dataset_stats.py
├── datasets
│   ├── ace2004.tsv
│   ├── actors.txt
│   ├── award-actors.txt
│   ├── dbpedia-url.txt
│   ├── dbpedia-url.txt.absent
│   ├── dbpedia.tsv
│   ├── dbpedia.ttl
│   ├── dbpedia.ttl.contexts.tsv
│   ├── dbpedia.ttl.phrases.tsv
│   ├── dzerczynski.tsv
│   ├── entities.tsv
│   ├── kore50-urls.txt
│   ├── kore50-urls.txt.absent
│   ├── kore50.tsv
│   ├── kore50.ttl
│   ├── kore50.ttl.contexts.tsv
│   ├── kore50.ttl.phrases.tsv
│   ├── n3-news-128.tsv
│   ├── n3-reuters-128-urls.txt
│   ├── n3-reuters-128-urls.txt.absent
│   ├── n3-reuters-128.tsv
│   ├── n3-reuters-128.ttl
│   ├── n3-reuters-128.ttl.contexts.tsv
│   ├── n3-reuters-128.ttl.phrases.tsv
│   ├── oke-evaluation-dataset-task1.ttl
│   ├── oke-evaluation-dataset-task2.ttl
│   ├── oke-sample-dataset-task1.ttl
│   ├── oke-sample-dataset-task2.ttl
│   ├── phrases.txt
│   ├── singleton.tsv
│   ├── subset
│   │   ├── 1000_edgelist.txt
│   │   ├── 1000_labels.db
│   │   ├── 1000_long_abstracts.db
│   │   ├── 1000_nodelist_url.txt
│   │   ├── 1000_nodes_lookup.db
│   │   └── 1000_nodes_lookup.txt
│   ├── test.phrases.tsv
│   ├── top-cities.txt
│   ├── us-states.txt
│   └── us-universitites.txt
├── diffbot_api.py
├── fwd.sh
├── generate_absent.py
├── linkers
│   ├── __init__.py
│   ├── baseline.py
│   ├── context_aware.py
│   ├── dense.py
│   ├── nn_graph.py
│   ├── sparse.py
│   └── supertagger.py
├── nif_ws.py
├── nif_ws_graph.py
├── patterns.py
├── preprocess.py
├── requirements.txt
├── supervised
│   ├── README.md
│   ├── negative_sampling.py
│   ├── negative_sampling_test.py
│   ├── nn.py
│   └── requirements.txt
├── test_supertagger.py
├── tests
│   ├── baseline_linker_dbpedia_test.py
│   ├── baseline_linker_test.py
│   ├── dense_linker_test.py
│   ├── diffbot_api_test.py
│   ├── score_test.py
│   ├── sparse_linker_single_test.py
│   ├── sparse_linker_test.py
│   ├── supertagger_test.py
│   ├── supervised
│   │   └── preprocess
│   │       ├── prepro_util_test.py
│   │       └── util_test.py
│   ├── test_construct_dict.py
│   ├── test_data_helper.py
│   └── ttl_test.py
├── tmp.ipynb
├── ttl.py
└── utils.py

/.gitignore:
--------------------------------------------------------------------------------
# Data files
*.json
*.csv
*.txt
*.ttl
*.out
*.sqlite
*.pkl

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so
*.swp

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/Diffbot API test.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": ["%load_ext autoreload\n", "%autoreload 2"]},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"scrolled": false}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": ["\n", "\n"]},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true, "scrolled": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": 14, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []},
  {"cell_type": "code", "execution_count": null, "metadata": {"collapsed": true}, "outputs": [], "source": []}
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
  "language_info": {
   "codemirror_mode": {"name": "ipython", "version": 3},
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# kb2vec

Vectorizing knowledge bases for entity linking

Installation
-----------

```
pip install -r requirements.txt
python -m nltk.downloader stopwords
python -m nltk.downloader punkt
python -m nltk.downloader averaged_perceptron_tagger
```

Download the `data` folder and unzip it:

```
wget http://ltdata1.informatik.uni-hamburg.de/kb2vec/data.zip
unzip data.zip
```

Start the web service
---------------------

Entity linking NIF server:

```
python nif_ws.py
```

which will run at ``http://localhost:5000``

GERBIL NIF-based evaluation server (from the ``gerbil`` directory):

```
bash start.sh
```

which will run at ``http://localhost:1234/gerbil``


DBpedia entity linking NIF wrapper (from the ``gerbil-dbpedia-ws`` directory):

```
docker-compose up -d
```

which will run at ``http://localhost:8181/spotlight``


Available endpoints:

http://localhost:8181/spotlight
http://localhost:5000/random
http://localhost:5000/sparse_overlap
http://localhost:5000/dense_overlap
http://localhost:5000/supertagger
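Once `nif_ws.py` is running, a quick way to smoke-test a linker endpoint is to POST one of the NIF/Turtle documents shipped in `datasets/` to it. This is only a sketch: the `Content-Type` header and the response handling are assumptions based on typical GERBIL-style NIF services, not guarantees about how `nif_ws.py` behaves.

```python
import requests

# send a NIF/Turtle document from the repository to the sparse_overlap linker;
# the content type and the shape of the response are assumptions, not part of the repo docs
with open("datasets/kore50.ttl", "rb") as f:
    nif_doc = f.read()

response = requests.post("http://localhost:5000/sparse_overlap",
                         data=nif_doc,
                         headers={"Content-Type": "application/x-turtle"})
print(response.status_code)
print(response.text[:500])
```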
--------------------------------------------------------------------------------
/cache/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uhh-lt/kb2vec/c02250177267ca78ce0f5886b7229f6b95ce2b5a/cache/__init__.py
--------------------------------------------------------------------------------
/candidate.py:
--------------------------------------------------------------------------------
from collections import namedtuple
from namedlist import namedlist
import codecs
import re


Phrase = namedtuple("Phrase", "text beg end subj")

#PhraseBase = namedtuple("PhraseBase", "text beg end subj")
#
#class Phrase(PhraseBase):
#    def get_hash(self):
#        return hash(self.text)
#
#    def __hash__(self):
#        return self.get_hash()
#
#    def __eq__(self, other):
#        return self.get_hash() == other.get_hash()


CandidateBase = namedlist("CandidateBase", "score name link wiki types names uris text db_uri importance relations")


def make_phrases(str_phrases):
    """ From a list of strings generates a list of phrases (e.g. for tests). """

    return [Phrase(phrase.strip(), 1, len(phrase.strip()), "http://" + phrase.strip())
            for phrase in str_phrases]


class Candidate(CandidateBase):
    def __init__(self, score=0.0, name="", link="", wiki="", types=[], names=[], uris=[], text="",
                 db_uri="", importance=1.0, relations={}):
        CandidateBase.__init__(self, score, name, link, wiki, types, names, uris, text,
                               db_uri, importance, relations)

    def get_hash(self):
        uris = "".join(self.uris) if self.uris is not None else ""
        types = "".join(self.types) if self.types is not None else ""
        hash_str = self.name + uris + types
        if hash_str is None:
            print("Warning: hash string is none.")

        return hash(hash_str)

    def __hash__(self):
        return self.get_hash()

    def __eq__(self, other):
        return self.get_hash() == other.get_hash()

    def __gt__(self, other):
        return self.score > other.score

    def __lt__(self, other):
        return self.score < other.score


def save_candidates_text(candidates, output_fpath="data/sf-candidates.txt"):
    """ Saves a mapping {Phrase: iterable of Candidate} as a tab-separated text file. """
    re_newlines = re.compile(r"[\n\r]+")

    with codecs.open(output_fpath, "w", "utf-8") as c_f:
        for phrase in candidates:
            for candidate in candidates[phrase]:
                text = candidate.text
                c_f.write("{}\t{}\t{}\n".format(
                    phrase.text,
                    candidate.name,
                    text.strip()))

    print(output_fpath)
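A minimal usage sketch of the structures defined above; the phrase and candidate values are made up for illustration:

```python
from candidate import Candidate, make_phrases

# phrases built from plain strings, as the tests do
phrases = make_phrases(["Berlin", "Nets"])

# a candidate entity for the first phrase; every field is an optional keyword argument
berlin = Candidate(score=0.9, name="Berlin",
                   link="http://dbpedia.org/resource/Berlin",
                   uris=["http://dbpedia.org/resource/Berlin"])

# candidates are ordered by score and hashed by name, uris and types
assert berlin > Candidate(score=0.1, name="Berlin")
assert berlin == Candidate(score=0.5, name="Berlin",
                           uris=["http://dbpedia.org/resource/Berlin"])
```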
--------------------------------------------------------------------------------
/construct_graph.py:
--------------------------------------------------------------------------------
import networkx as nx
import matplotlib.pyplot as plt
import logging
import codecs
from sqlitedict import SqliteDict


class Graph:
    def __init__(self, logfile='output.log'):
        self._G = nx.DiGraph()
        # create logger
        self._logger = logging.getLogger('construct_graph')
        self._logger.setLevel(logging.DEBUG)
        fh = logging.FileHandler(logfile)
        fh.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        fh.setFormatter(formatter)
        self._logger.addHandler(fh)

    # Takes three dictionaries:
    # url_ids - keys are the urls, values are the unique ids of these urls.
    # url_longabstracts - keys are the urls, values are their long abstracts (texts).
    # url_labels - keys are, again, the urls, values are their titles.
    # The unique ids are used to create the nodes; the other properties become node attributes.
    def create_nodes_from_dict(self, url_longabstracts, url_labels, url_ids):
        urls = url_ids.keys()

        count = 0
        for url in urls:
            # long abstract is a list of tokens.
            long_abstract = url_longabstracts[url]
            # title is a list of tokens / a single token.
            title = url_labels[url]
            # node id is an integer value.
            node_id = url_ids[url]

            # id, url, long abstract (text), and title are attributes.
            self._G.add_node(node_id, id=node_id, url=url, long_abstract=long_abstract, title=title)
            if count % 100000 == 0:
                self._logger.info(str(count) + ' nodes are processed..')

            count += 1

    # subnodes is a list of nodes; it is used to create nodes from a sublist and obtain a subgraph.
    def create_nodes_from_db(self, longabsdb_path, labelsdb_path, lookupdb_path, subnodes=False):
        longabsdb = SqliteDict(longabsdb_path, autocommit=False)
        labelsdb = SqliteDict(labelsdb_path, autocommit=False)
        lookupdb = SqliteDict(lookupdb_path, autocommit=False)

        if subnodes:
            urls = subnodes
        else:
            urls = lookupdb.keys()

        count = 0
        for url in urls:
            # long abstract is a string.
            long_abstract = longabsdb[url]
            # title is a string.
            title = labelsdb[url]
            # node id is an integer value.
            node_id = int(lookupdb[url])

            # id, url, long abstract (text), and title are attributes.
            self._G.add_node(node_id, id=node_id, url=url, long_abstract=long_abstract, title=title)
            if count % 100000 == 0:
                self._logger.info(str(count) + ' nodes are processed..')

            count += 1

        longabsdb.close()
        labelsdb.close()
        lookupdb.close()

    # Takes a file path as parameter:
    # the file contains one edge per line as two whitespace-separated node ids, e.g. "1 2".
    def create_edges_from_file(self, path):
        count = 0

        file = codecs.open(path, 'r')
        line = file.readline()

        while line != '':
            nodes = line.split()
            line = file.readline()

            self._G.add_edge(int(nodes[0]), int(nodes[1]))

            if count % 100000 == 0:
                self._logger.info(str(count) + ' edges are processed..')

            count += 1

        file.close()

    def create_edges_from_list(self, edges):
        self._G.add_edges_from(edges)

    def write_graph(self, path):
        nx.write_gpickle(self._G, path)

    def load_graph(self, path):
        self._G = nx.read_gpickle(path)

    def draw(self):
        nx.draw(self._G, with_labels=True, font_weight='bold')
        plt.show()
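A small end-to-end sketch of the `Graph` class using the sample files shipped under `datasets/subset/`. It assumes those SqliteDict files and the edge list follow the key layout the methods above expect; the output file name is arbitrary.

```python
from construct_graph import Graph

g = Graph(logfile="construct_graph.log")

# nodes: long abstracts, labels and the url->id lookup come from the sample SqliteDict files
g.create_nodes_from_db("datasets/subset/1000_long_abstracts.db",
                       "datasets/subset/1000_labels.db",
                       "datasets/subset/1000_nodes_lookup.db")

# edges: one "source target" id pair per line
g.create_edges_from_file("datasets/subset/1000_edgelist.txt")

# persist the networkx graph as a gpickle and draw it
g.write_graph("datasets/subset/1000_graph.gpickle")
g.draw()
```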
""" 36 | 37 | beg_index = string.find(prefix) 38 | if beg_index != -1: 39 | end_index = beg_index + len(prefix) 40 | return string[end_index:] 41 | else: 42 | return None 43 | 44 | def get_fuzzy_postfix(self, string, prefix): 45 | if prefix in string: 46 | parts = string.split("/") 47 | if len(parts) > 1: 48 | return parts[-1] 49 | else: 50 | return None 51 | 52 | def wikipedia2dbpedia(self, wikipedia_uri): 53 | article_name = self.get_fuzzy_postfix(wikipedia_uri, prefix=WIKIPEDIA_DOMAIN) 54 | 55 | if article_name is None: 56 | if verbose: print("Warning: cannot convert to DBpedia URI '{}'".format(wikipedia_uri)) 57 | return "" 58 | else: 59 | return DBPEDIA_PREFIX + article_name 60 | 61 | def wikidataid2wikipedia(self, wikidata_q_id="Q42"): 62 | try: 63 | if wikidata_q_id in self._cache: 64 | return self._cache[wikidata_q_id] 65 | else: 66 | entity = self._client.get(wikidata_q_id, load=True) 67 | can_get = ("sitelinks" in entity.attributes and 68 | "enwiki" in entity.attributes["sitelinks"] and 69 | "url" in entity.attributes["sitelinks"]["enwiki"]) 70 | if can_get: 71 | wikipedia_uri = entity.attributes["sitelinks"]["enwiki"]["url"] 72 | self._cache[wikidata_q_id] = wikipedia_uri 73 | return wikipedia_uri 74 | else: 75 | wiki_links = [] 76 | for key in entity.attributes["sitelinks"]: 77 | if key.endswith("wiki"): 78 | if "url" in entity.attributes["sitelinks"][key]: 79 | wiki_links.append(entity.attributes["sitelinks"][key]["url"]) 80 | 81 | if len(wiki_links) > 0: 82 | print("Warning: no links to English Wiki found, but found {} links to other Wikis".format(len(wiki_links))) 83 | self._cache[wikidata_q_id] = wiki_links[0] 84 | return wiki_links[0] 85 | else: 86 | self._cache[wikidata_q_id] = "" 87 | return "" 88 | 89 | except KeyboardInterrupt: 90 | raise KeyboardInterrupt() 91 | except: 92 | print("Warning: cannot process '{}'".format(wikidata_q_id)) 93 | print(format_exc()) 94 | return "" 95 | 96 | def get_wikidata_id(self, wikidata_uri): 97 | wikidata_id = self.get_fuzzy_postfix(wikidata_uri, prefix=WIKIDATA_DOMAIN) 98 | if wikidata_id is None: 99 | if verbose: print("Warning: cannot extract WikiData ID '{}'".format(wikidata_uri)) 100 | return "" 101 | else: 102 | return wikidata_id 103 | 104 | def wikidata2wikipedia(self, wikidata_uri): 105 | wikidata_id = self.get_wikidata_id(wikidata_uri) 106 | if wikidata_id != "": 107 | wikipedia_uri = self.wikidataid2wikipedia(wikidata_id) 108 | return wikipedia_uri 109 | else: 110 | if verbose: print("Warning: cannot extract DBpedia URI from a Wikidata URI") 111 | return "" 112 | 113 | 114 | def wikidata2dbpedia(self, wikidata_uri): 115 | return self.wikipedia2dbpedia(self.wikidata2wikipedia(wikidata_uri)) 116 | 117 | -------------------------------------------------------------------------------- /data_helper.py: -------------------------------------------------------------------------------- 1 | from sqlitedict import SqliteDict 2 | import sqlite3 3 | import codecs 4 | 5 | 6 | def create_dictdb_from_file(file_path, db_path): 7 | db = SqliteDict(db_path, autocommit=True) 8 | 9 | file = codecs.open(file_path, 'r') 10 | line = file.readline() 11 | 12 | while line != '': 13 | splitted = line.split() 14 | line = file.readline() 15 | try: 16 | key, value = splitted[0], ' '.join(splitted[1:]) 17 | db[key] = value 18 | except IndexError: 19 | continue 20 | 21 | file.close() 22 | db.close() 23 | 24 | 25 | def create_db_from_dictdb(lookup_db_path, longabs_db_path, labels_db_path, db_name): 26 | connection = sqlite3.connect(db_name) 27 | cursor = 
--------------------------------------------------------------------------------
/data_helper.py:
--------------------------------------------------------------------------------
from sqlitedict import SqliteDict
import sqlite3
import codecs


def create_dictdb_from_file(file_path, db_path):
    db = SqliteDict(db_path, autocommit=True)

    file = codecs.open(file_path, 'r')
    line = file.readline()

    while line != '':
        splitted = line.split()
        line = file.readline()
        try:
            key, value = splitted[0], ' '.join(splitted[1:])
            db[key] = value
        except IndexError:
            continue

    file.close()
    db.close()


def create_db_from_dictdb(lookup_db_path, longabs_db_path, labels_db_path, db_name):
    connection = sqlite3.connect(db_name)
    cursor = connection.cursor()

    cursor.execute('''CREATE TABLE graph (node_id INTEGER PRIMARY KEY NOT NULL, long_abstracts TEXT, labels TEXT)''')

    connection.commit()

    lookup_db = SqliteDict(lookup_db_path, autocommit=False)
    longabs_db = SqliteDict(longabs_db_path, autocommit=False)
    labels_db = SqliteDict(labels_db_path, autocommit=False)

    intersection_nodes = lookup_db.keys()

    count = 0

    for node in intersection_nodes:
        longab = longabs_db[node]
        label = labels_db[node]
        id = lookup_db[node]

        cursor.execute('''INSERT INTO graph VALUES (?,?,?)''', (id, longab, label))

        if count % 100000 == 0:
            print(count)
            connection.commit()

        count += 1

    connection.commit()

    connection.close()
    lookup_db.close()
    labels_db.close()
    longabs_db.close()
--------------------------------------------------------------------------------
/dataset_stats.py:
--------------------------------------------------------------------------------
from pandas import read_csv
from glob import glob
import re
from traceback import format_exc


def dataset_stat(dataset_fpath):
    try:
        df = read_csv(dataset_fpath, sep="\t", encoding="utf-8")
        df.targets
    except AttributeError:
        df = read_csv(dataset_fpath, sep="\t", encoding="utf-8", names=["targets", "context"])

    targets = set()
    for i, row in df.iterrows():
        for t in str(row.targets).split(","):
            ts = t.strip()
            if len(ts) > 0: targets.add(ts)

    print("# of contexts:", len(df))
    print("# of targets:", len(targets))


def format_urls(url_fpaths):
    url = re.compile(r"<([^>]+)>")
    for url_fpath in glob(url_fpaths):
        print(url_fpath)
        with open(url_fpath, "r") as in_f, open(url_fpath + ".out", "w") as out_f:
            for line in in_f:
                match = url.search(line)
                if match:
                    out_f.write("{}\n".format(match.groups(0)[0]))


datasets_fpath = "/home/panchenko/kb2vec/datasets/*.tsv"
for dataset_fpath in glob(datasets_fpath):
    print(dataset_fpath)
    dataset_stat(dataset_fpath)


format_urls(url_fpaths="datasets/*txt")
--------------------------------------------------------------------------------
/datasets/dbpedia-url.txt:
--------------------------------------------------------------------------------
1 | http://dbpedia.org/resource/Anxiety 2 | http://dbpedia.org/resource/Conspiracy_theory 3 | http://dbpedia.org/resource/Consumer 4 | http://dbpedia.org/resource/Internet 5 | http://dbpedia.org/resource/Internet_privacy 6 | http://dbpedia.org/resource/Lawsuit 7 | http://dbpedia.org/resource/Marketing 8 | http://dbpedia.org/resource/User_(computing) 9 | http://dbpedia.org/resource/Worry 10 | http://dbpedia.org/resource/Alarmism 11 | http://dbpedia.org/resource/Advertising 12 | http://dbpedia.org/resource/Marketing 13 | http://dbpedia.org/resource/Online_and_offline 14 | http://dbpedia.org/resource/User_(computing) 15 | http://dbpedia.org/resource/Web_developer 16 | http://dbpedia.org/resource/Year 17 | http://dbpedia.org/resource/HTML5 18 | http://dbpedia.org/resource/Information_privacy 19 | http://dbpedia.org/resource/Internet 20 | http://dbpedia.org/resource/Internet 21 | http://dbpedia.org/resource/Era 22 | http://dbpedia.org/resource/HTML 23 | http://dbpedia.org/resource/Promise 24 | http://dbpedia.org/resource/Source_code 25 | http://dbpedia.org/resource/Web_browser 26 |
http://dbpedia.org/resource/Web_page 27 | http://dbpedia.org/resource/World_Wide_Web 28 | http://dbpedia.org/resource/Year 29 | http://dbpedia.org/resource/Computer_software 30 | http://dbpedia.org/resource/Content_(media) 31 | http://dbpedia.org/resource/Email 32 | http://dbpedia.org/resource/Multimedia 33 | http://dbpedia.org/resource/Online_and_offline 34 | http://dbpedia.org/resource/Restaurant 35 | http://dbpedia.org/resource/Smartphone 36 | http://dbpedia.org/resource/Uploading_and_downloading 37 | http://dbpedia.org/resource/User_(computing) 38 | http://dbpedia.org/resource/Billionaire 39 | http://dbpedia.org/resource/Country 40 | http://dbpedia.org/resource/Home 41 | http://dbpedia.org/resource/Mikhail_Prokhorov 42 | http://dbpedia.org/resource/Moscow 43 | http://dbpedia.org/resource/New_Jersey_Nets 44 | http://dbpedia.org/resource/Ownership 45 | http://dbpedia.org/resource/Sunday 46 | http://dbpedia.org/resource/Mikhail_Prokhorov 47 | http://dbpedia.org/resource/Country 48 | http://dbpedia.org/resource/Greeting 49 | http://dbpedia.org/resource/Home 50 | http://dbpedia.org/resource/Russia 51 | http://dbpedia.org/resource/Sponsor_(commercial) 52 | http://dbpedia.org/resource/Sportsperson 53 | http://dbpedia.org/resource/Team 54 | http://dbpedia.org/resource/Wealth 55 | http://dbpedia.org/resource/Mikhail_Prokhorov 56 | http://dbpedia.org/resource/Basketball_court 57 | http://dbpedia.org/resource/CSKA_Universal_Sports_Hall 58 | http://dbpedia.org/resource/Exhibition_game 59 | http://dbpedia.org/resource/Game 60 | http://dbpedia.org/resource/New_Jersey_Nets 61 | http://dbpedia.org/resource/Practice_(learning_method) 62 | http://dbpedia.org/resource/Russia 63 | http://dbpedia.org/resource/Sports_club 64 | http://dbpedia.org/resource/Sportsperson 65 | http://dbpedia.org/resource/Year 66 | http://dbpedia.org/resource/Mikhail_Prokhorov 67 | http://dbpedia.org/resource/Association_football 68 | http://dbpedia.org/resource/Basketball 69 | http://dbpedia.org/resource/Coach_(sport) 70 | http://dbpedia.org/resource/Future 71 | http://dbpedia.org/resource/Hockey 72 | http://dbpedia.org/resource/Interaction 73 | http://dbpedia.org/resource/New_Jersey_Nets 74 | http://dbpedia.org/resource/Popularity 75 | http://dbpedia.org/resource/Russia 76 | http://dbpedia.org/resource/Russians 77 | http://dbpedia.org/resource/Sport 78 | http://dbpedia.org/resource/Sportsperson 79 | http://dbpedia.org/resource/Fan_(person) 80 | http://dbpedia.org/resource/New_Jersey_Nets 81 | http://dbpedia.org/resource/Cannabis_(drug) 82 | http://dbpedia.org/resource/Family 83 | http://dbpedia.org/resource/Middle_age 84 | http://dbpedia.org/resource/Parent 85 | http://dbpedia.org/resource/Ritual 86 | http://dbpedia.org/resource/Cannabis_(drug) 87 | http://dbpedia.org/resource/Illegal_drug_trade 88 | http://dbpedia.org/resource/Illinois 89 | http://dbpedia.org/resource/Parent 90 | http://dbpedia.org/resource/Writer 91 | http://dbpedia.org/resource/Year 92 | http://dbpedia.org/resource/Parent 93 | http://dbpedia.org/resource/Illegal_drug_trade 94 | http://dbpedia.org/resource/Grounding_(punishment) 95 | http://dbpedia.org/resource/Mother 96 | http://dbpedia.org/resource/Medical_cannabis 97 | http://dbpedia.org/resource/Parent 98 | http://dbpedia.org/resource/Alzheimer’s_disease 99 | http://dbpedia.org/resource/Cancer 100 | http://dbpedia.org/resource/Father 101 | http://dbpedia.org/resource/Heart_disease 102 | http://dbpedia.org/resource/Mother 103 | http://dbpedia.org/resource/Nausea 104 | 
http://dbpedia.org/resource/Orthostatic_hypotension 105 | http://dbpedia.org/resource/Medical_cannabis 106 | http://dbpedia.org/resource/Research 107 | http://dbpedia.org/resource/California_State_Route_1 108 | http://dbpedia.org/resource/Driving 109 | http://dbpedia.org/resource/Los_Angeles 110 | http://dbpedia.org/resource/Roof 111 | http://dbpedia.org/resource/San_Francisco 112 | http://dbpedia.org/resource/Toyota_Prius 113 | http://dbpedia.org/resource/Driving 114 | http://dbpedia.org/resource/Person 115 | http://dbpedia.org/resource/Steering_wheel 116 | http://dbpedia.org/resource/Driverless_car 117 | http://dbpedia.org/resource/Artificial_intelligence 118 | http://dbpedia.org/resource/Automobile 119 | http://dbpedia.org/resource/Automobile 120 | http://dbpedia.org/resource/Computer_software 121 | http://dbpedia.org/resource/Google 122 | http://dbpedia.org/resource/Human 123 | http://dbpedia.org/resource/Project 124 | http://dbpedia.org/resource/Automobile 125 | http://dbpedia.org/resource/Human 126 | http://dbpedia.org/resource/Human 127 | http://dbpedia.org/resource/Mile 128 | http://dbpedia.org/resource/Mile 129 | http://dbpedia.org/resource/Navigation_system 130 | http://dbpedia.org/resource/Steering_wheel 131 | http://dbpedia.org/resource/Technician 132 | http://dbpedia.org/resource/Lombard_Street_(San_Francisco) 133 | http://dbpedia.org/resource/San_Francisco 134 | http://dbpedia.org/resource/Street 135 | http://dbpedia.org/resource/Accident 136 | http://dbpedia.org/resource/Automobile 137 | http://dbpedia.org/resource/Engineer 138 | http://dbpedia.org/resource/Google_driverless_car 139 | http://dbpedia.org/resource/Rear-end_collision 140 | http://dbpedia.org/resource/Traffic_light 141 | http://dbpedia.org/resource/Canadian 142 | http://dbpedia.org/resource/Dinner 143 | http://dbpedia.org/resource/Diplomatic_mission 144 | http://dbpedia.org/resource/Economy 145 | http://dbpedia.org/resource/Economy 146 | http://dbpedia.org/resource/Friday 147 | http://dbpedia.org/resource/International 148 | http://dbpedia.org/resource/Presidency_of_Barack_Obama 149 | http://dbpedia.org/resource/Problem 150 | http://dbpedia.org/resource/World 151 | http://dbpedia.org/resource/Balance_of_trade 152 | http://dbpedia.org/resource/Beef_tenderloin 153 | http://dbpedia.org/resource/Canada 154 | http://dbpedia.org/resource/Currency 155 | http://dbpedia.org/resource/Europe 156 | http://dbpedia.org/resource/Japan 157 | http://dbpedia.org/resource/People's_Republic_of_China 158 | http://dbpedia.org/resource/Renminbi 159 | http://dbpedia.org/resource/Scallop 160 | http://dbpedia.org/resource/Timothy_Geithner 161 | http://dbpedia.org/resource/United_States_Secretary_of_the_Treasury 162 | http://dbpedia.org/resource/World 163 | http://dbpedia.org/resource/Annual_Meetings_of_the_International_Monetary_Fund_and_the_World_Bank_Group 164 | http://dbpedia.org/resource/Currency 165 | http://dbpedia.org/resource/Box_office 166 | http://dbpedia.org/resource/Art 167 | http://dbpedia.org/resource/Art_exhibition 168 | http://dbpedia.org/resource/Art_exhibition 169 | http://dbpedia.org/resource/Autumn 170 | http://dbpedia.org/resource/Chaos 171 | http://dbpedia.org/resource/Classicism 172 | http://dbpedia.org/resource/France 173 | http://dbpedia.org/resource/Germany 174 | http://dbpedia.org/resource/Italy 175 | http://dbpedia.org/resource/Name 176 | http://dbpedia.org/resource/Percentage 177 | http://dbpedia.org/resource/Solomon_R._Guggenheim_Museum 178 | http://dbpedia.org/resource/History 179 | 
http://dbpedia.org/resource/Cubism 180 | http://dbpedia.org/resource/Futurism 181 | http://dbpedia.org/resource/Modernism 182 | http://dbpedia.org/resource/Nightmare 183 | http://dbpedia.org/resource/World_War_I 184 | http://dbpedia.org/resource/Ancient_Greece 185 | http://dbpedia.org/resource/Italian_Renaissance 186 | http://dbpedia.org/resource/Tradition 187 | http://dbpedia.org/resource/Artist 188 | http://dbpedia.org/resource/Conservatism 189 | http://dbpedia.org/resource/Ideology 190 | http://dbpedia.org/resource/Social_order 191 | http://dbpedia.org/resource/Lil_Wayne 192 | http://dbpedia.org/resource/Boredom 193 | http://dbpedia.org/resource/Free_association_(psychology) 194 | http://dbpedia.org/resource/Idea 195 | http://dbpedia.org/resource/Self-consciousness 196 | http://dbpedia.org/resource/Aggression 197 | http://dbpedia.org/resource/Album 198 | http://dbpedia.org/resource/Album 199 | http://dbpedia.org/resource/Idea 200 | http://dbpedia.org/resource/Prison 201 | http://dbpedia.org/resource/Rebirth_(Lil_Wayne_album) 202 | http://dbpedia.org/resource/Research 203 | http://dbpedia.org/resource/Rikers_Island 204 | http://dbpedia.org/resource/Scientific_method 205 | http://dbpedia.org/resource/Sentence_(law) 206 | http://dbpedia.org/resource/Experiment 207 | http://dbpedia.org/resource/Lil_Wayne 208 | http://dbpedia.org/resource/Album 209 | http://dbpedia.org/resource/I_Am_Not_a_Human_Being 210 | http://dbpedia.org/resource/Month 211 | http://dbpedia.org/resource/Song 212 | http://dbpedia.org/resource/Studio_recording 213 | http://dbpedia.org/resource/Criminal_charge 214 | http://dbpedia.org/resource/Gun_politics 215 | http://dbpedia.org/resource/Prison 216 | http://dbpedia.org/resource/Rebirth_(Lil_Wayne_album) 217 | http://dbpedia.org/resource/Scar 218 | http://dbpedia.org/resource/Song 219 | http://dbpedia.org/resource/Accordion 220 | http://dbpedia.org/resource/Backpack 221 | http://dbpedia.org/resource/Ballet_tutu 222 | http://dbpedia.org/resource/Man 223 | http://dbpedia.org/resource/Audience 224 | http://dbpedia.org/resource/Buster_Keaton 225 | http://dbpedia.org/resource/Chair 226 | http://dbpedia.org/resource/Dance 227 | http://dbpedia.org/resource/Ladder 228 | http://dbpedia.org/resource/2006 229 | http://dbpedia.org/resource/Berlin 230 | http://dbpedia.org/resource/Brick 231 | http://dbpedia.org/resource/Choreography 232 | http://dbpedia.org/resource/Dance 233 | http://dbpedia.org/resource/Gothic_architecture 234 | http://dbpedia.org/resource/House_dance 235 | http://dbpedia.org/resource/Man 236 | http://dbpedia.org/resource/Performance 237 | http://dbpedia.org/resource/Pumping_station 238 | http://dbpedia.org/resource/Sasha_Waltz 239 | http://dbpedia.org/resource/Spree 240 | http://dbpedia.org/resource/Berlin 241 | http://dbpedia.org/resource/Boiler 242 | http://dbpedia.org/resource/Dance_troupe 243 | http://dbpedia.org/resource/Hall 244 | http://dbpedia.org/resource/Machine 245 | http://dbpedia.org/resource/Man 246 | http://dbpedia.org/resource/Performance 247 | http://dbpedia.org/resource/Pumping_station 248 | http://dbpedia.org/resource/Room 249 | http://dbpedia.org/resource/Berlin_Ostbahnhof 250 | http://dbpedia.org/resource/Boulevard 251 | http://dbpedia.org/resource/Berlin 252 | http://dbpedia.org/resource/Mile 253 | http://dbpedia.org/resource/Nightclub 254 | http://dbpedia.org/resource/Unter_den_Linden 255 | http://dbpedia.org/resource/Cappuccino 256 | http://dbpedia.org/resource/Dance 257 | http://dbpedia.org/resource/Dance 258 | 
http://dbpedia.org/resource/Month 259 | http://dbpedia.org/resource/Performance 260 | http://dbpedia.org/resource/Ticket_(admission) 261 | http://dbpedia.org/resource/Blue 262 | http://dbpedia.org/resource/Finland 263 | http://dbpedia.org/resource/Floor_plan 264 | http://dbpedia.org/resource/Green 265 | http://dbpedia.org/resource/House 266 | http://dbpedia.org/resource/Leaf 267 | http://dbpedia.org/resource/Leaf_shape 268 | http://dbpedia.org/resource/Park 269 | http://dbpedia.org/resource/Red 270 | http://dbpedia.org/resource/Roof 271 | http://dbpedia.org/resource/Turku 272 | http://dbpedia.org/resource/Yellow 273 | http://dbpedia.org/resource/Commuting 274 | http://dbpedia.org/resource/Curve 275 | http://dbpedia.org/resource/Drop_(liquid) 276 | http://dbpedia.org/resource/Floor 277 | http://dbpedia.org/resource/Grand_Central_Terminal 278 | http://dbpedia.org/resource/New_York_City 279 | http://dbpedia.org/resource/Runway_(fashion) 280 | http://dbpedia.org/resource/Shape 281 | http://dbpedia.org/resource/Video 282 | http://dbpedia.org/resource/Wall 283 | http://dbpedia.org/resource/Window 284 | http://dbpedia.org/resource/Port_of_Turku 285 | http://dbpedia.org/resource/House 286 | http://dbpedia.org/resource/Landmark 287 | http://dbpedia.org/resource/September 288 | http://dbpedia.org/resource/Turku 289 | http://dbpedia.org/resource/Woodland 290 | http://dbpedia.org/resource/Artist 291 | http://dbpedia.org/resource/Artist 292 | http://dbpedia.org/resource/Interior_design 293 | http://dbpedia.org/resource/Sculpture 294 | http://dbpedia.org/resource/Finnish_sauna 295 | http://dbpedia.org/resource/2011 296 | http://dbpedia.org/resource/Culture 297 | http://dbpedia.org/resource/Exhibition 298 | http://dbpedia.org/resource/Festival 299 | http://dbpedia.org/resource/Fiberglass 300 | http://dbpedia.org/resource/Sauna 301 | http://dbpedia.org/resource/Garlic 302 | http://dbpedia.org/resource/House 303 | http://dbpedia.org/resource/Red 304 | http://dbpedia.org/resource/Shape 305 | http://dbpedia.org/resource/Turku 306 | http://dbpedia.org/resource/Alice_Roosevelt_Longworth 307 | http://dbpedia.org/resource/Hedonism 308 | http://dbpedia.org/resource/Gossip 309 | http://dbpedia.org/resource/Gossip 310 | http://dbpedia.org/resource/Research 311 | http://dbpedia.org/resource/Husband 312 | http://dbpedia.org/resource/Presentation 313 | http://dbpedia.org/resource/September 314 | http://dbpedia.org/resource/Social_psychology 315 | http://dbpedia.org/resource/Emotion 316 | http://dbpedia.org/resource/Fiction 317 | http://dbpedia.org/resource/Gossip 318 | http://dbpedia.org/resource/Man 319 | http://dbpedia.org/resource/Person 320 | http://dbpedia.org/resource/Person 321 | http://dbpedia.org/resource/Term_(time) 322 | http://dbpedia.org/resource/Undergraduate_education 323 | http://dbpedia.org/resource/Woman 324 | http://dbpedia.org/resource/Female 325 | http://dbpedia.org/resource/Gossip 326 | http://dbpedia.org/resource/Gossip 327 | http://dbpedia.org/resource/Questionnaire 328 | http://dbpedia.org/resource/Self-esteem 329 | http://dbpedia.org/resource/Social_support 330 | http://dbpedia.org/resource/Term_(time) 331 | http://dbpedia.org/resource/Undergraduate_education 332 | -------------------------------------------------------------------------------- /datasets/dbpedia-url.txt.absent: -------------------------------------------------------------------------------- 1 | en.wikipedia.org/wiki/Uploading_and_downloading 2 | en.wikipedia.org/wiki/New_Jersey_Nets 3 | 
en.wikipedia.org/wiki/CSKA_Universal_Sports_Hall 4 | en.wikipedia.org/wiki/New_Jersey_Nets 5 | en.wikipedia.org/wiki/New_Jersey_Nets 6 | en.wikipedia.org/wiki/New_Jersey_Nets 7 | en.wikipedia.org/wiki/Grounding_(punishment) 8 | en.wikipedia.org/wiki/Alzheimer’s_disease 9 | en.wikipedia.org/wiki/Heart_disease 10 | en.wikipedia.org/wiki/Driverless_car 11 | en.wikipedia.org/wiki/Google_driverless_car 12 | en.wikipedia.org/wiki/People's_Republic_of_China 13 | en.wikipedia.org/wiki/Gun_politics 14 | en.wikipedia.org/wiki/Ballet_tutu 15 | en.wikipedia.org/wiki/Leaf_shape 16 | -------------------------------------------------------------------------------- /datasets/dbpedia.tsv: -------------------------------------------------------------------------------- 1 | targets context 2 | Rebirth, scars, ngs b In a sense it’s an improvement on “Rebirth,” even if a couple of the songs bear the scars of that period, particularly the Rick Rubinesque title track, and “Popular,” which has some of the new wave sizzle he was toying with. 3 | Prokhorov, basketball, players, soccer, popularity, future, interaction, Nets, Russia, sport, hockey, Russians, coaches Prokhorov said the visit would serve as a cornerstone for future interaction between players and coaches from the Nets and young Russians, with the aim of developing basketball in Russia, where the sport is a distant third in popularity behind soccer and hockey. 4 | car, engineers, traffic light, rear-ended, accident, Google car The only accident, engineers said, was when one Google car was rear-ended while stopped at a traffic light. 5 | red, fiberglass, festival, Turku, garlic, exhibition, culture, Finnish bathhouses, 2011, sauna, houses, shaped On the front lawn we lingered in front of his work-in-progress: a fiberglass sauna shaped like a garlic clove, one of five Finnish bathhouses that will be on display in SaunaLab, an exhibition at Turku 2011, the coming yearlong culture festival. 6 | cappuccino, ticket, months, dance, dance, performance During warmer months, a casual passer-by without a ticket to the night’s performance can enjoy a cappuccino with the dance aficionados on the waterfront terrace, but the main draw is the celebrated Ms. Waltz and her dancers, interlocked in twisted, shifting embraces. 7 | Lombard Street, San Francisco, streets One even drove itself down Lombard Street in San Francisco, one of the steepest and curviest streets in the nation. 8 | Prokhorov, arena of CSKA Moscow, game, exhibition, years, players, practice, Nets, Russia, court, club The stay was just long enough for a little practice and an exhibition in which the Nets were interspersed on the court with top players from youth leagues across Russia — some as young as 8 years old — for a light-hearted game in the arena of CSKA Moscow, the professional Russian club Prokhorov once owned. 9 | miles, boulevard, Unter den Linden, Ostbahnhof, German capital’s, nightclubs Radialsystem V, miles away from the German capital’s grand boulevard of Unter den Linden, is surrounded by a cluster of gritty nightclubs near the Ostbahnhof train station. 10 | mbard Street 11 | nightmare, modernist, World War I, Cubism, Futurism Its narrative goes something like this: After the exhausting nightmare of World War I, Europe’s early modernist vanguard backed away from disruptive experimental styles like Cubism and Futurism. 
12 | nausea, father, dizzy spells, cancer, Alzheimer’s disease, heart ailment, mother His father had a heart ailment, his mother had dizzy spells and nausea, and both were worried about Alzheimer’s disease and cancer. 13 | parents, medical marijuana But with age and the growing acceptance of medical marijuana, his parents were curious. 14 | research, gossiping New research finds that gossiping can be good for you — as long as you have something nice to say. 15 | gossip But it seems the greater pleasure comes from more temperate gossip. 16 | experimentation It failed less for its experimentation, which was spotty, than for the strenuousness with which he pursued it. 17 | history I found the whole thing totally engrossing: a survey-style piece of investigative history with a bomb ticking away inside. 18 | research, scientific, Rikers Island, aggressive, album, idea, album, sentence, jail “Rebirth,” the last album he released before he began serving a one-year jail sentence at Rikers Island, was his quixotic attempt at a rap-rock album, an out-of-date idea that he treated like aggressive scientific research. 19 | alarmists But the alarmists have not seen anything yet. 20 | New York, windows, catwalk, teardrops, commuters, curved, floor, shape, video, walls, Grand Central Terminal Inside, it’s even wackier: curved walls, windows in the shape of teardrops, and a catwalk with a tiny video screen embedded in the floor that shows an endless loop of antlike commuters rushing through Grand Central Terminal in New York. 21 | marketers, lawsuits, Internet, Internet privacy, computer users, Worries, consumer, anxiety, conspiracy theories Worries over Internet privacy have spurred lawsuits, conspiracy theories and consumer anxiety as marketers and others invent new ways to track computer users on the Internet. 22 | person, driving, wheel Harder to notice was that the person at the wheel was not actually driving. 23 | blue, park, yellow, red, house, leaf-shaped, floor plan, Turku, LEAF, Finland, green, roof “LIFE ON A LEAF,” a whimsical yellow house that sits in a wooded park at the edge of Turku in southern Finland, is a lopsided construction with a bright red, blue and green roof and a leaf-shaped floor plan. 24 | Buster Keaton, chair, ladder, dance, audience A tall dancer feigns clumsiness with a slyly graceful gawkiness that would do Buster Keaton proud, stumbling around a ladder and a chair to the delight of the audience. 25 | Los Angeles, San Francisco, roof, Toyota Prius, Highway 1, driving Anyone driving the twists of Highway 1 between San Francisco and Los Angeles recently may have glimpsed a Toyota Prius with a curious funnel-like cylinder on the roof. 26 | man, rucksack, tutu, accordion The short, bearded man in the tutu wanders onstage, an accordion strapped to his back like a rucksack. 27 | research, marijuana They looked at some research and decided marijuana was worth a try. 28 | gossiping, women, person, men, short-term, person, emotional, fictional, undergraduates In the first study, intended to measure a person’s short-term emotional reaction to gossiping, 140 men and women, primarily undergraduates, were asked to talk about a fictional person either positively or negatively. 29 | years, parents, Illinois, marijuana, supplying, writer Bryan, 46, a writer who lives in Illinois, began supplying his parents about five years ago, after he told them about his own marijuana use. 
30 | beef tenderloin, world, China, trade imbalances, renminbi, Europe, Treasury Secretary, Japan, Canada, currency, Timothy F. Geithner, scallops Over seared scallops and beef tenderloin, Treasury Secretary Timothy F. Geithner urged his counterparts from Europe, Canada and Japan to help persuade China to let its currency, the renminbi, rise in value — a crucial element in redressing the trade imbalances that are threatening recovery around the world. 31 | fan, Nets If the Nets pick up a strong Russian fan base in the process, that would be nice, too, he said. 32 | middle-age, rites, families, marijuana, parents To the rites of middle-age passage, some families are adding another: buying marijuana for aging parents. 33 | artists, conservatism To some degree recuperative conservatism worked; it gave artists a way to keep moving when an old way seemed blocked. 34 | Embassy, Friday, economic, dinner, economic, international, Canadian, Obama administration, world, problem At a private dinner on Friday at the Canadian Embassy, finance officials from seven world economic powers focused on the most vexing international economic problem facing the Obama administration. 35 | ancient Greece, Italian Renaissance, traditional They retreated to traditional forms and looked back to an imagined classical past — ancient Greece, the Italian Renaissance — for images of wholeness and harmony. 36 | box office Boilerplate is safe box office, and we’ve gotten our share lately. 37 | social psychologist, presentation, September, husband In a presentation in September, Jennifer Cole, a social psychologist, and Hannah Scrivener reported results from two related studies, both of which demonstrate that it’s in one’s self-interest to say “So-and-so’s second husband is adorable” rather than “She married that lout?” 38 | Internet, Internet, HTML 5, privacy Nearly everyone who uses the Internet will face the privacy risks that come with those capabilities, which are an integral part of the Web language that will soon power the Internet: HTML 5. 39 | gun-possession, jailed, charge (He was jailed on a gun-possession charge.) 40 | interior designer, artist, artists, sculptor Jan-Erik Andersson — the artist, sculptor and interior designer who, with contributions from 20 other artists, created the place, and gives tours to groups of visitors who make arrangements in advance — was waiting for me. 41 | online, marketers, computer users, years, Web developers, advertisers In the next few years, a powerful new suite of capabilities will become available to Web developers that could give marketers and advertisers access to many more details about computer users’ online activities. 42 | I Am Not a Human Being, recording, month, album, Lil Wayne, songs From somewhere among those recording sessions, and those rumored to be slated for “Tha Carter IV,” his next album, came the songs found on “I Am Not a Human Being,” an uncentered collection of odds and ends meant to sate interest until Lil Wayne’s release from Rikers, which is expected to be early next month. 43 | miles, miles, navigation system, cars, technician, human, human, wheel With someone behind the wheel to take control if something goes awry and a technician in the passenger seat to monitor the navigation system, seven test cars have driven 1,000 miles without human intervention and more than 140,000 miles with only occasional human control. 
44 | country, wealthy, greeted, Russia, sponsors, home, team, Prokhorov, players “Welcome to Russia,” Prokhorov said as he greeted his players during their first trip to a country he hopes will become a kind of second home for the team — as well as a source of wealthy sponsors. 45 | September, house, landmark, Turku’s harbor, Turku, woods On an unseasonably warm September morning I traveled to the Leaf house, which has become something of a local landmark, passing Turku’s harbor overlooked by pine-forested hills, then turning into a clearing in the woods. 46 | annual meetings of the International Monetary Fund, currency But the next afternoon, the annual meetings of the International Monetary Fund ended with a tepid statement that made only fleeting and indirect references to the simmering currency tensions. 47 | pumping station, room, hall, Berlin, boiler, dance troupe, machine, man, performance Even though Berliners know about the dance troupe, visitors often miss its frequently sold-out performances in the towering space of the former machine hall and boiler room of the pumping station. 48 | project, artificial-intelligence, vehicles that can drive themselves, car, Google, car, software, human The car is a project of Google, which has been working in secret but in plain view on vehicles that can drive themselves, using artificial-intelligence software that can sense anything near the car and mimic the decisions made by a human driver. 49 | house dance, Spree, Sasha Waltz, brick, Berlin, Gothic, man, pumping station, dance, 2006, choreographer, performance Sasha Waltz & Guests (pictured), named for its choreographer, is the house dance ensemble at Radialsystem V (Holzmarktstr, 33; radialsystem.de), a performance space that opened in 2006 in an old brick Gothic pumping station on the Spree River in Berlin. 50 | mother, grounded “We would have grounded him,” said his mother, who is 72. 51 | social support, gossiping, long-term, self-esteem, female, undergrads, gossip, questionnaires The second study, which looked into the long-term effects of gossiping on well-being, had 160 participants, mostly female undergrads, fill out questionnaires about their tendency to gossip, their self-esteem and their perceived social support. 52 | illegal drugs, parents When he was growing up, he said, his parents were very strict about illegal drugs. 53 | hedonist, Alice Roosevelt Longworth “IF you can’t say something good about someone, sit right here by me,” Alice Roosevelt Longworth, a self-proclaimed “hedonist,” used to say. 54 | billionaire, home, owner, Moscow, Sunday, Nets, country, Mikhail D. Prokhorov The Nets dropped into Moscow on Sunday for a one-day visit meant to raise their profile in the home country of their new owner, the Russian billionaire Mikhail D. Prokhorov. 55 | social order, ideology At the same time classicism as an ideology, with its emphasis on order, purity and exclusion, was being espoused by rising political figures intent on creating a new, lethally exclusionary social order. 56 | Visitors with a stake in art-as-uplift will find the story it tells mystifying, if not perverse. 57 | free-associative, idea, Lil Wayne, self-consciousness, boredom Lil Wayne’s least interesting mode is fixation. Restless and free-associative, he’s best when bouncing from one idea to the next, sticking around just long enough to master it, then splitting before boredom or self-consciousness sets in. 
58 | exhibition, Chaos, Italy, names, Guggenheim Museum, percentage, Germany, France, fall, exhibition, Art, Classicism So it’s great that the Guggenheim Museum is giving us the opposite in its major fall exhibition, “Chaos and Classicism: Art in France, Italy, and Germany, 1918-1936.” With its high percentage of unfamiliar names, the exhibition won’t pull crowds. 59 | multimedia, smartphone, software, downloading, restaurant, offline, e-mail, content, users It will make it easier for users to view multimedia content without downloading extra software; check e-mail offline; or find a favorite restaurant or shop on a smartphone. 60 | Web, code, Web pages, era, Hypertext Markup Language, promises, Internet browsing, years The new Web code, the fifth version of Hypertext Markup Language used to create Web pages, is already in limited use, and it promises to usher in a new era of Internet browsing within the next few years. 61 | -------------------------------------------------------------------------------- /datasets/dbpedia.ttl.contexts.tsv: -------------------------------------------------------------------------------- 1 | targets contexts 2 | To some degree recuperative conservatism worked; it gave artists a way to keep moving when an old way seemed blocked. 3 | The second study, which looked into the long-term effects of gossiping on well-being, had 160 participants, mostly female undergrads, fill out questionnaires about their tendency to gossip, their self-esteem and their perceived social support. 4 | The short, bearded man in the tutu wanders onstage, an accordion strapped to his back like a rucksack. 5 | Inside, it’s even wackier: curved walls, windows in the shape of teardrops, and a catwalk with a tiny video screen embedded in the floor that shows an endless loop of antlike commuters rushing through Grand Central Terminal in New York. 6 | At a private dinner on Friday at the Canadian Embassy, finance officials from seven world economic powers focused on the most vexing international economic problem facing the Obama administration. 7 | It failed less for its experimentation, which was spotty, than for the strenuousness with which he pursued it. 8 | Prokhorov said the visit would serve as a cornerstone for future interaction between players and coaches from the Nets and young Russians, with the aim of developing basketball in Russia, where the sport is a distant third in popularity behind soccer and hockey. 9 | At the same time classicism as an ideology, with its emphasis on order, purity and exclusion, was being espoused by rising political figures intent on creating a new, lethally exclusionary social order. 10 | Harder to notice was that the person at the wheel was not actually driving. 11 | But the alarmists have not seen anything yet. 12 | Its narrative goes something like this: After the exhausting nightmare of World War I, Europe’s early modernist vanguard backed away from disruptive experimental styles like Cubism and Futurism. 13 | But it seems the greater pleasure comes from more temperate gossip. 14 | Nearly everyone who uses the Internet will face the privacy risks that come with those capabilities, which are an integral part of the Web language that will soon power the Internet: HTML 5. 15 | Jan-Erik Andersson — the artist, sculptor and interior designer who, with contributions from 20 other artists, created the place, and gives tours to groups of visitors who make arrangements in advance — was waiting for me. 
16 | “We would have grounded him,” said his mother, who is 72. 17 | Boilerplate is safe box office, and we’ve gotten our share lately. 18 | On an unseasonably warm September morning I traveled to the Leaf house, which has become something of a local landmark, passing Turku’s harbor overlooked by pine-forested hills, then turning into a clearing in the woods. 19 | On the front lawn we lingered in front of his work-in-progress: a fiberglass sauna shaped like a garlic clove, one of five Finnish bathhouses that will be on display in SaunaLab, an exhibition at Turku 2011, the coming yearlong culture festival. 20 | In a sense it’s an improvement on “Rebirth,” even if a couple of the songs bear the scars of that period, particularly the Rick Rubinesque title track, and “Popular,” which has some of the new wave sizzle he was toying with. 21 | In a presentation in September, Jennifer Cole, a social psychologist, and Hannah Scrivener reported results from two related studies, both of which demonstrate that it’s in one’s self-interest to say “So-and-so’s second husband is adorable” rather than “She married that lout?” 22 | But the next afternoon, the annual meetings of the International Monetary Fund ended with a tepid statement that made only fleeting and indirect references to the simmering currency tensions. 23 | In the first study, intended to measure a person’s short-term emotional reaction to gossiping, 140 men and women, primarily undergraduates, were asked to talk about a fictional person either positively or negatively. 24 | (He was jailed on a gun-possession charge.) 25 | Over seared scallops and beef tenderloin, Treasury Secretary Timothy F. Geithner urged his counterparts from Europe, Canada and Japan to help persuade China to let its currency, the renminbi, rise in value — a crucial element in redressing the trade imbalances that are threatening recovery around the world. 26 | Visitors with a stake in art-as-uplift will find the story it tells mystifying, if not perverse. 27 | Lil Wayne’s least interesting mode is fixation. Restless and free-associative, he’s best when bouncing from one idea to the next, sticking around just long enough to master it, then splitting before boredom or self-consciousness sets in. 28 | A tall dancer feigns clumsiness with a slyly graceful gawkiness that would do Buster Keaton proud, stumbling around a ladder and a chair to the delight of the audience. 29 | They looked at some research and decided marijuana was worth a try. 30 | Sasha Waltz & Guests (pictured), named for its choreographer, is the house dance ensemble at Radialsystem V (Holzmarktstr, 33; radialsystem.de), a performance space that opened in 2006 in an old brick Gothic pumping station on the Spree River in Berlin. 31 | “Welcome to Russia,” Prokhorov said as he greeted his players during their first trip to a country he hopes will become a kind of second home for the team — as well as a source of wealthy sponsors. 32 | “LIFE ON A LEAF,” a whimsical yellow house that sits in a wooded park at the edge of Turku in southern Finland, is a lopsided construction with a bright red, blue and green roof and a leaf-shaped floor plan. 33 | Bryan, 46, a writer who lives in Illinois, began supplying his parents about five years ago, after he told them about his own marijuana use. 34 | New research finds that gossiping can be good for you — as long as you have something nice to say. 35 | To the rites of middle-age passage, some families are adding another: buying marijuana for aging parents. 
36 | When he was growing up, he said, his parents were very strict about illegal drugs. 37 | It will make it easier for users to view multimedia content without downloading extra software; check e-mail offline; or find a favorite restaurant or shop on a smartphone. 38 | “Rebirth,” the last album he released before he began serving a one-year jail sentence at Rikers Island, was his quixotic attempt at a rap-rock album, an out-of-date idea that he treated like aggressive scientific research. 39 | The car is a project of Google, which has been working in secret but in plain view on vehicles that can drive themselves, using artificial-intelligence software that can sense anything near the car and mimic the decisions made by a human driver. 40 | Worries over Internet privacy have spurred lawsuits, conspiracy theories and consumer anxiety as marketers and others invent new ways to track computer users on the Internet. 41 | One even drove itself down Lombard Street in San Francisco, one of the steepest and curviest streets in the nation. 42 | So it’s great that the Guggenheim Museum is giving us the opposite in its major fall exhibition, “Chaos and Classicism: Art in France, Italy, and Germany, 1918-1936.” With its high percentage of unfamiliar names, the exhibition won’t pull crowds. 43 | “IF you can’t say something good about someone, sit right here by me,” Alice Roosevelt Longworth, a self-proclaimed “hedonist,” used to say. 44 | But with age and the growing acceptance of medical marijuana, his parents were curious. 45 | During warmer months, a casual passer-by without a ticket to the night’s performance can enjoy a cappuccino with the dance aficionados on the waterfront terrace, but the main draw is the celebrated Ms. Waltz and her dancers, interlocked in twisted, shifting embraces. 46 | The only accident, engineers said, was when one Google car was rear-ended while stopped at a traffic light. 47 | I found the whole thing totally engrossing: a survey-style piece of investigative history with a bomb ticking away inside. 48 | They retreated to traditional forms and looked back to an imagined classical past — ancient Greece, the Italian Renaissance — for images of wholeness and harmony. 49 | The new Web code, the fifth version of Hypertext Markup Language used to create Web pages, is already in limited use, and it promises to usher in a new era of Internet browsing within the next few years. 50 | The stay was just long enough for a little practice and an exhibition in which the Nets were interspersed on the court with top players from youth leagues across Russia — some as young as 8 years old — for a light-hearted game in the arena of CSKA Moscow, the professional Russian club Prokhorov once owned. 51 | From somewhere among those recording sessions, and those rumored to be slated for “Tha Carter IV,” his next album, came the songs found on “I Am Not a Human Being,” an uncentered collection of odds and ends meant to sate interest until Lil Wayne’s release from Rikers, which is expected to be early next month. 52 | The Nets dropped into Moscow on Sunday for a one-day visit meant to raise their profile in the home country of their new owner, the Russian billionaire Mikhail D. Prokhorov. 53 | Anyone driving the twists of Highway 1 between San Francisco and Los Angeles recently may have glimpsed a Toyota Prius with a curious funnel-like cylinder on the roof. 
54 | Even though Berliners know about the dance troupe, visitors often miss its frequently sold-out performances in the towering space of the former machine hall and boiler room of the pumping station. 55 | Radialsystem V, miles away from the German capital’s grand boulevard of Unter den Linden, is surrounded by a cluster of gritty nightclubs near the Ostbahnhof train station. 56 | His father had a heart ailment, his mother had dizzy spells and nausea, and both were worried about Alzheimer’s disease and cancer. 57 | In the next few years, a powerful new suite of capabilities will become available to Web developers that could give marketers and advertisers access to many more details about computer users’ online activities. 58 | With someone behind the wheel to take control if something goes awry and a technician in the passenger seat to monitor the navigation system, seven test cars have driven 1,000 miles without human intervention and more than 140,000 miles with only occasional human control. 59 | If the Nets pick up a strong Russian fan base in the process, that would be nice, too, he said. 60 | -------------------------------------------------------------------------------- /datasets/dbpedia.ttl.phrases.tsv: -------------------------------------------------------------------------------- 1 | targets contexts 2 | soccer 3 | mother 4 | free-associative 5 | currency 6 | privacy 7 | illegal drugs 8 | scars 9 | Canada 10 | parents 11 | human 12 | hall 13 | Turku 14 | Russia 15 | husband 16 | ancient Greece 17 | conservatism 18 | dance 19 | shape 20 | album 21 | advertisers 22 | self-consciousness 23 | ideology 24 | players 25 | ticket 26 | boulevard 27 | Worries 28 | Lil Wayne 29 | streets 30 | 2006 31 | artist 32 | conservatism 33 | cars 34 | hedonist 35 | coaches 36 | Turku 37 | man 38 | machine 39 | currency 40 | hall 41 | walls 42 | walls 43 | gossip 44 | rucksack 45 | car 46 | traffic light 47 | self-esteem 48 | software 49 | short-term 50 | idea 51 | court 52 | world 53 | exhibition 54 | Russia 55 | September 56 | floor plan 57 | presentation 58 | curved 59 | trade imbalances 60 | traditional 61 | Sasha Waltz 62 | anxiety 63 | ideology 64 | cars 65 | Lombard Street 66 | Web 67 | ladder 68 | multimedia 69 | undergraduates 70 | San Francisco 71 | short-term 72 | Google 73 | Prokhorov 74 | Embassy 75 | online 76 | game 77 | Canadian 78 | artists 79 | Berlin 80 | Los Angeles 81 | marijuana 82 | Canada 83 | online 84 | cappuccino 85 | artists 86 | sport 87 | writer 88 | songs 89 | undergrads 90 | Turku 91 | catwalk 92 | players 93 | interaction 94 | Internet 95 | sponsors 96 | Nets 97 | home 98 | Europe 99 | New York 100 | parents 101 | long-term 102 | experimentation 103 | gossiping 104 | Nets 105 | shaped 106 | wheel 107 | Prokhorov 108 | writer 109 | accident 110 | pumping station 111 | exhibition 112 | San Francisco 113 | artificial-intelligence 114 | computer users 115 | tutu 116 | September 117 | accordion 118 | basketball 119 | Web developers 120 | sponsors 121 | promises 122 | marijuana 123 | smartphone 124 | human 125 | Nets 126 | garlic 127 | dance 128 | Toyota Prius 129 | offline 130 | World War I 131 | gun-possession 132 | jail 133 | Grand Central Terminal 134 | dance troupe 135 | HTML 5 136 | Turku’s harbor 137 | shaped 138 | free-associative 139 | economic 140 | September 141 | parents 142 | ticket 143 | marketers 144 | names 145 | Russians 146 | person 147 | long-term 148 | landmark 149 | Web developers 150 | floor plan 151 | billionaire 152 | research 153 | Gothic 154 | 
history 155 | problem 156 | performance 157 | sentence 158 | Illinois 159 | man 160 | Google car 161 | percentage 162 | World War I 163 | Gothic 164 | experimentation 165 | miles 166 | research 167 | gossip 168 | Rikers Island 169 | era 170 | international 171 | navigation system 172 | recording 173 | fan 174 | miles 175 | LEAF 176 | wheel 177 | I Am Not a Human Being 178 | culture 179 | parents 180 | fall 181 | streets 182 | project 183 | interior designer 184 | conspiracy theories 185 | future 186 | currency 187 | China 188 | German capital’s 189 | female 190 | marijuana 191 | pumping station 192 | fiberglass 193 | research 194 | club 195 | house 196 | undergrads 197 | wealthy 198 | parents 199 | self-consciousness 200 | catwalk 201 | rear-ended 202 | machine 203 | charge 204 | brick 205 | Italy 206 | blue 207 | smartphone 208 | album 209 | gossip 210 | popularity 211 | scallops 212 | parents 213 | car 214 | project 215 | economic 216 | driving 217 | dinner 218 | lawsuits 219 | pumping station 220 | dance 221 | windows 222 | heart ailment 223 | teardrops 224 | father 225 | hockey 226 | gun-possession 227 | content 228 | software 229 | percentage 230 | Ostbahnhof 231 | greeted 232 | dance troupe 233 | Russia 234 | roof 235 | families 236 | Alzheimer’s disease 237 | families 238 | box office 239 | code 240 | arena of CSKA Moscow 241 | man 242 | Finnish bathhouses 243 | red 244 | court 245 | Germany 246 | engineers 247 | car 248 | economic 249 | marijuana 250 | Nets 251 | game 252 | performance 253 | Classicism 254 | Web pages 255 | country 256 | boredom 257 | miles 258 | Worries 259 | car 260 | gossip 261 | nightmare 262 | men 263 | nausea 264 | social support 265 | software 266 | self-esteem 267 | names 268 | Internet 269 | Sasha Waltz 270 | man 271 | fall 272 | research 273 | downloading 274 | marketers 275 | German capital’s 276 | artificial-intelligence 277 | women 278 | dance 279 | performance 280 | sauna 281 | France 282 | Finland 283 | popularity 284 | Internet browsing 285 | gossiping 286 | Friday 287 | owner 288 | scientific 289 | social order 290 | Nets 291 | club 292 | culture 293 | Los Angeles 294 | consumer 295 | Timothy F. 
Geithner 296 | modernist 297 | chair 298 | women 299 | rear-ended 300 | traditional 301 | house dance 302 | anxiety 303 | Highway 1 304 | world 305 | men 306 | mother 307 | park 308 | green 309 | person 310 | team 311 | marijuana 312 | years 313 | Moscow 314 | presentation 315 | middle-age 316 | nausea 317 | future 318 | month 319 | Berlin 320 | interior designer 321 | human 322 | annual meetings of the International Monetary Fund 323 | ancient Greece 324 | ladder 325 | Italian Renaissance 326 | miles 327 | gossiping 328 | lawsuits 329 | performance 330 | players 331 | grounded 332 | marijuana 333 | accident 334 | house 335 | home 336 | house dance 337 | medical marijuana 338 | HTML 5 339 | Art 340 | Friday 341 | Alice Roosevelt Longworth 342 | Web 343 | rites 344 | nightclubs 345 | roof 346 | San Francisco 347 | performance 348 | person 349 | world 350 | Lil Wayne 351 | traffic light 352 | Rebirth 353 | Obama administration 354 | code 355 | Chaos 356 | Guggenheim Museum 357 | middle-age 358 | practice 359 | festival 360 | Grand Central Terminal 361 | billionaire 362 | charge 363 | songs 364 | sculptor 365 | Berlin 366 | Nets 367 | France 368 | idea 369 | Ostbahnhof 370 | social psychologist 371 | songs 372 | floor 373 | person 374 | months 375 | human 376 | Alice Roosevelt Longworth 377 | Sunday 378 | scars 379 | Rebirth 380 | engineers 381 | vehicles that can drive themselves 382 | red 383 | windows 384 | multimedia 385 | choreographer 386 | exhibition 387 | car 388 | wheel 389 | software 390 | economic 391 | woods 392 | years 393 | arena of CSKA Moscow 394 | pumping station 395 | computer users 396 | Finland 397 | Embassy 398 | alarmists 399 | Rebirth 400 | scallops 401 | Russia 402 | chair 403 | restaurant 404 | downloading 405 | accordion 406 | rucksack 407 | years 408 | cappuccino 409 | years 410 | houses 411 | exhibition 412 | 2011 413 | dizzy spells 414 | restaurant 415 | car 416 | video 417 | month 418 | commuters 419 | sculptor 420 | aggressive 421 | Hypertext Markup Language 422 | Russia 423 | brick 424 | China 425 | mother 426 | grounded 427 | leaf-shaped 428 | Hypertext Markup Language 429 | artist 430 | exhibition 431 | parents 432 | album 433 | Spree 434 | box office 435 | New York 436 | undergraduates 437 | exhibition 438 | Sunday 439 | miles 440 | house 441 | Prokhorov 442 | marketers 443 | room 444 | human 445 | scientific 446 | hedonist 447 | Prokhorov 448 | room 449 | teardrops 450 | dinner 451 | San Francisco 452 | beef tenderloin 453 | players 454 | idea 455 | Italian Renaissance 456 | dance 457 | consumer 458 | human 459 | 2011 460 | Turku 461 | practice 462 | supplying 463 | computer users 464 | nightmare 465 | boiler 466 | country 467 | Chaos 468 | emotional 469 | heart ailment 470 | album 471 | recording 472 | players 473 | dance 474 | nightclubs 475 | man 476 | Russians 477 | artists 478 | months 479 | home 480 | red 481 | driving 482 | era 483 | yellow 484 | fan 485 | Guggenheim Museum 486 | green 487 | years 488 | person 489 | marketers 490 | Lil Wayne 491 | Rebirth 492 | man 493 | international 494 | driving 495 | Treasury Secretary 496 | songs 497 | audience 498 | Spree 499 | social order 500 | Nets 501 | promises 502 | roof 503 | trade imbalances 504 | husband 505 | renminbi 506 | Highway 1 507 | research 508 | dance 509 | basketball 510 | alarmists 511 | coaches 512 | Turku’s harbor 513 | woods 514 | LEAF 515 | Internet privacy 516 | team 517 | 2006 518 | gossiping 519 | I Am Not a Human Being 520 | Nets 521 | modernist 522 | exhibition 523 | sauna 524 | 
country 525 | sport 526 | Timothy F. Geithner 527 | driving 528 | Cubism 529 | questionnaires 530 | hockey 531 | Obama administration 532 | technician 533 | aggressive 534 | Canadian 535 | Futurism 536 | curved 537 | users 538 | Mikhail D. Prokhorov 539 | female 540 | tutu 541 | Italy 542 | parents 543 | advertisers 544 | cancer 545 | fiberglass 546 | e-mail 547 | floor 548 | world 549 | gossiping 550 | wealthy 551 | Unter den Linden 552 | annual meetings of the International Monetary Fund 553 | technician 554 | video 555 | users 556 | Turku 557 | Mikhail D. Prokhorov 558 | social support 559 | boulevard 560 | e-mail 561 | Europe 562 | commuters 563 | Internet 564 | country 565 | problem 566 | vehicles that can drive themselves 567 | years 568 | Classicism 569 | wheel 570 | artists 571 | Japan 572 | Futurism 573 | Russia 574 | Moscow 575 | Unter den Linden 576 | Finnish bathhouses 577 | cancer 578 | shape 579 | boredom 580 | audience 581 | September 582 | father 583 | emotional 584 | beef tenderloin 585 | boiler 586 | Alzheimer’s disease 587 | Internet privacy 588 | Cubism 589 | Turku 590 | social psychologist 591 | Prokhorov 592 | idea 593 | Germany 594 | blue 595 | performance 596 | rites 597 | renminbi 598 | currency 599 | exhibition 600 | house 601 | years 602 | Web pages 603 | gossiping 604 | years 605 | album 606 | Internet 607 | Rikers Island 608 | Japan 609 | garlic 610 | questionnaires 611 | fictional 612 | navigation system 613 | soccer 614 | houses 615 | interaction 616 | Buster Keaton 617 | miles 618 | jailed 619 | conspiracy theories 620 | computer users 621 | dizzy spells 622 | Lombard Street 623 | leaf-shaped 624 | offline 625 | Prokhorov 626 | landmark 627 | jail 628 | Google 629 | Internet browsing 630 | choreographer 631 | red 632 | Toyota Prius 633 | fictional 634 | history 635 | medical marijuana 636 | Lil Wayne 637 | festival 638 | Illinois 639 | album 640 | Art 641 | Treasury Secretary 642 | mother 643 | roof 644 | content 645 | person 646 | players 647 | jailed 648 | privacy 649 | home 650 | greeted 651 | Berlin 652 | owner 653 | supplying 654 | Internet 655 | Buster Keaton 656 | dance 657 | yellow 658 | illegal drugs 659 | sentence 660 | park 661 | Internet 662 | research 663 | Google car 664 | -------------------------------------------------------------------------------- /datasets/kore50-urls.txt: -------------------------------------------------------------------------------- 1 | http://dbpedia.org/resource/David_Beckham 2 | http://dbpedia.org/resource/Victoria_Beckham 3 | http://dbpedia.org/resource/David_Beckham 4 | http://dbpedia.org/resource/Victoria_Beckham 5 | http://dbpedia.org/resource/Tiger_Woods 6 | http://dbpedia.org/resource/Elin_Nordegren 7 | http://dbpedia.org/resource/Tiger_Woods 8 | http://dbpedia.org/resource/U.S._Open_(golf) 9 | http://dbpedia.org/resource/Madonna_(entertainer) 10 | http://dbpedia.org/resource/Eva_Perón 11 | http://dbpedia.org/resource/Carlos_Leon 12 | http://dbpedia.org/resource/Madonna_(entertainer) 13 | http://dbpedia.org/resource/First_Lady_of_Argentina 14 | http://dbpedia.org/resource/Angelina_Jolie 15 | http://dbpedia.org/resource/Jon_Voight 16 | http://dbpedia.org/resource/Brad_Pitt 17 | http://dbpedia.org/resource/Heidi_Klum 18 | http://dbpedia.org/resource/Seal_(musician) 19 | http://dbpedia.org/resource/Las_Vegas,_Nevada 20 | http://dbpedia.org/resource/Paris_Hilton 21 | http://dbpedia.org/resource/Kim_Kardashian 22 | http://dbpedia.org/resource/Justin_Bieber 23 | http://dbpedia.org/resource/Lady_gaga 24 | 
http://dbpedia.org/resource/Kate_Perry 25 | http://dbpedia.org/resource/MTV 26 | http://dbpedia.org/resource/Twitter 27 | http://dbpedia.org/resource/Bob_Dylan 28 | http://dbpedia.org/resource/Hurricane_(song) 29 | http://dbpedia.org/resource/Rubin_Carter 30 | http://dbpedia.org/resource/Desire_(Bob_Dylan_album) 31 | http://dbpedia.org/resource/Desire_(Bob_Dylan_album) 32 | http://dbpedia.org/resource/Emmylou_Harris 33 | http://dbpedia.org/resource/Joey_(Bob_Dylan_song) 34 | http://dbpedia.org/resource/Eric_Clapton 35 | http://dbpedia.org/resource/Jeff_Beck 36 | http://dbpedia.org/resource/Jimmy_Page 37 | http://dbpedia.org/resource/Paul_Allen 38 | http://dbpedia.org/resource/EMP_Museum 39 | http://dbpedia.org/resource/Seattle 40 | http://dbpedia.org/resource/Jimi_Hendrix 41 | http://dbpedia.org/resource/Bob_Dylan 42 | http://dbpedia.org/resource/Frank_Sinatra 43 | http://dbpedia.org/resource/Bob_Dylan 44 | http://dbpedia.org/resource/Billy_Joel 45 | http://dbpedia.org/resource/Carlos_Santana 46 | http://dbpedia.org/resource/Columbia_Records 47 | http://dbpedia.org/resource/Sony_Music_Entertainment 48 | http://dbpedia.org/resource/Johnny_Cash 49 | http://dbpedia.org/resource/American_Recordings_(album) 50 | http://dbpedia.org/resource/Rick_Rubin 51 | http://dbpedia.org/resource/Josh_Homme 52 | http://dbpedia.org/resource/Dave_Grohl 53 | http://dbpedia.org/resource/John_Paul_Jones_(musician) 54 | http://dbpedia.org/resource/Steve_Jobs 55 | http://dbpedia.org/resource/Joan_Baez 56 | http://dbpedia.org/resource/Stanford_University 57 | http://dbpedia.org/resource/Isle_of_wight_festival 58 | http://dbpedia.org/resource/Woodstock_Festival 59 | http://dbpedia.org/resource/Miles_davis 60 | http://dbpedia.org/resource/Chicago_(band) 61 | http://dbpedia.org/resource/Joni_Mitchell 62 | http://dbpedia.org/resource/Eric_Clapton 63 | http://dbpedia.org/resource/Blues 64 | http://dbpedia.org/resource/Rock_music 65 | http://dbpedia.org/resource/John_Mayall 66 | http://dbpedia.org/resource/Steve_Jobs 67 | http://dbpedia.org/resource/Apple_Inc. 68 | http://dbpedia.org/resource/Stanford_University 69 | http://dbpedia.org/resource/Steve_Ballmer 70 | http://dbpedia.org/resource/Stanford_University 71 | http://dbpedia.org/resource/Microsoft 72 | http://dbpedia.org/resource/Microsoft_Windows 73 | http://dbpedia.org/resource/Cairo_(operating_system) 74 | http://dbpedia.org/resource/Microsoft 75 | http://dbpedia.org/resource/Bill_Gates 76 | http://dbpedia.org/resource/Steve_Jobs 77 | http://dbpedia.org/resource/Bill_Gates 78 | http://dbpedia.org/resource/Sergey_Brin 79 | http://dbpedia.org/resource/Larry_Page 80 | http://dbpedia.org/resource/Karl_Albrecht 81 | http://dbpedia.org/resource/Theo_Albrecht 82 | http://dbpedia.org/resource/Apple_Inc. 83 | http://dbpedia.org/resource/Mango_(clothing) 84 | http://dbpedia.org/resource/Orange_(telecommunications) 85 | http://dbpedia.org/resource/Sam_Zell 86 | http://dbpedia.org/resource/Equity_International 87 | http://dbpedia.org/resource/Pixar 88 | http://dbpedia.org/resource/Cars_(film) 89 | http://dbpedia.org/resource/John_Lasseter 90 | http://dbpedia.org/resource/Mars_bar 91 | http://dbpedia.org/resource/Galaxy_(chocolate) 92 | http://dbpedia.org/resource/Bounty_(chocolate_bar) 93 | http://dbpedia.org/resource/Robert_Bosch_GmbH 94 | http://dbpedia.org/resource/Sharp_Corporation 95 | http://dbpedia.org/resource/Manchester_City_F.C. 96 | http://dbpedia.org/resource/Tottenham_Hotspur_F.C. 97 | http://dbpedia.org/resource/Arsenal_F.C. 
98 | http://dbpedia.org/resource/Emirates_Stadium 99 | http://dbpedia.org/resource/Atlético_Madrid 100 | http://dbpedia.org/resource/Real_Madrid_C.F. 101 | http://dbpedia.org/resource/Thomas_Müller_(footballer) 102 | http://dbpedia.org/resource/England_national_football_team 103 | http://dbpedia.org/resource/Thomas_Müller_(footballer) 104 | http://dbpedia.org/resource/Mario_Gomez 105 | http://dbpedia.org/resource/FC_Bayern_Munich 106 | http://dbpedia.org/resource/Norbert_Haug 107 | http://dbpedia.org/resource/FC_Red_Bull_Salzburg 108 | http://dbpedia.org/resource/Rudi_Völler 109 | http://dbpedia.org/resource/Netherlands_national_football_team 110 | http://dbpedia.org/resource/San_Siro 111 | http://dbpedia.org/resource/Willi_Landgraf 112 | http://dbpedia.org/resource/Erik_Meijer_(footballer) 113 | http://dbpedia.org/resource/New_Tivoli 114 | http://dbpedia.org/resource/Reinhold_Yabo 115 | http://dbpedia.org/resource/Alemannia_Aachen 116 | http://dbpedia.org/resource/Hertha_BSC 117 | http://dbpedia.org/resource/Borussia_Dortmund 118 | http://dbpedia.org/resource/Richard_Nixon 119 | http://dbpedia.org/resource/Watergate_scandal 120 | http://dbpedia.org/resource/Ping_Pong_Diplomacy 121 | http://dbpedia.org/resource/People's_Republic_of_China 122 | http://dbpedia.org/resource/The_Sun_(United_Kingdom) 123 | http://dbpedia.org/resource/The_Times 124 | http://dbpedia.org/resource/Greece 125 | http://dbpedia.org/resource/Eurozone 126 | http://dbpedia.org/resource/Enola_Gay 127 | http://dbpedia.org/resource/Hiroshima 128 | http://dbpedia.org/resource/World_War_II 129 | http://dbpedia.org/resource/Red_Army_Faction 130 | http://dbpedia.org/resource/Andreas_Baader 131 | http://dbpedia.org/resource/Ulrike_Meinhof 132 | http://dbpedia.org/resource/Hanns-Martin_Schleyer 133 | http://dbpedia.org/resource/Jacqueline_Kennedy_Onassis 134 | http://dbpedia.org/resource/John_F._Kennedy 135 | http://dbpedia.org/resource/Neil_Armstrong 136 | http://dbpedia.org/resource/Moon 137 | http://dbpedia.org/resource/Erich_Honecker 138 | http://dbpedia.org/resource/Neunkirchen,_Saarland 139 | http://dbpedia.org/resource/Macedonia_(Greece) 140 | http://dbpedia.org/resource/Greece 141 | http://dbpedia.org/resource/Barack_Obama 142 | http://dbpedia.org/resource/Angela_Merkel 143 | http://dbpedia.org/resource/John_F._Kennedy_International_Airport 144 | http://dbpedia.org/resource/John_P._Kennedy 145 | -------------------------------------------------------------------------------- /datasets/kore50-urls.txt.absent: -------------------------------------------------------------------------------- 1 | en.wikipedia.org/wiki/First_Lady_of_Argentina 2 | en.wikipedia.org/wiki/Kate_Perry 3 | en.wikipedia.org/wiki/Hurricane_(song) 4 | en.wikipedia.org/wiki/EMP_Museum 5 | en.wikipedia.org/wiki/Sony_Music_Entertainment 6 | en.wikipedia.org/wiki/Woodstock_Festival 7 | en.wikipedia.org/wiki/Orange_(telecommunications) 8 | en.wikipedia.org/wiki/Mars_bar 9 | en.wikipedia.org/wiki/Robert_Bosch_GmbH 10 | en.wikipedia.org/wiki/Thomas_Müller_(footballer) 11 | en.wikipedia.org/wiki/Thomas_Müller_(footballer) 12 | en.wikipedia.org/wiki/Mario_Gomez 13 | en.wikipedia.org/wiki/Ping_Pong_Diplomacy 14 | en.wikipedia.org/wiki/People's_Republic_of_China 15 | en.wikipedia.org/wiki/Hanns-Martin_Schleyer 16 | -------------------------------------------------------------------------------- /datasets/kore50.tsv: -------------------------------------------------------------------------------- 1 | targets context 2 | Jon, Angelina, Brad Angelina, her father Jon, 
and her partner Brad never played together in the same movie. 3 | Sharp, Bosch Bosch and Sharp are both home appliances producing companies. 4 | Carter, Hurricane, Dylan, Desire Dylan performed Hurricane about the black fighter Carter, from his album Desire. 5 | Microsoft, Bill, Cairo Cairo was the code name for a project at Microsoft from 1991 to 1996. Its charter was to build technologies for a next generation operating system that would fulfill the vision of Bill. 6 | Mayall, Blues, Rock, Eric Eric preferred to play Blues instead of Rock, so he joined Mayall 's band. 7 | Santana, Columbia, Sony, Dylan, Joel, Sinatra Despite featuring some of the most promininent musicians of their decade --- like Sinatra, Dylan, Joel, and Santana --- Columbia was aquired by Sony in the 1980s. 8 | John, Pixar, Cars Pixar produced Cars, and John directed it. 9 | Atletico, Real Atletico has beaten its archrival Real. 10 | China, Ping-Pong Diplomacy, Nixon, Watergate Nixon resigned after Watergate despite his success in the Ping-Pong Diplomacy with China. 11 | Orange, Apple, Mango While Apple is an electronics company, Mango is a clothing one and Orange is a communication one. 12 | Victoria, David David and Victoria added spice to their marriage. 13 | Mars, Bounty, Galaxy Mars, Galaxy, and Bounty are all chocolate. 14 | Theo, Karl Karl and Theo made their extreme fortunes selling low-price groceries. 15 | Obama, Merkel, JFK Obama welcomed Merkel upon her arrival at JFK. 16 | Onassis, Kennedy Onassis married Kennedy on October 20, 1968. 17 | Aachen, Yabo Yabo plays for Aachen. 18 | Greece, Macedonia Macedonia is a province of Greece. 19 | Dylan, Hendrix, Allen, EMP, Seattle Allen founded the EMP in Seattle, which featured exhibitions about Hendrix and Dylan, but also about various science fiction movies. 20 | Madonna, Carlos, Eva Madonna played Eva and was seen with Carlos. 21 | Jobs, Stanford, Baez Jobs and Baez dated in the late 1970s, and she performed at his Stanford memorial. 22 | Haug, Red Bull Haug congratulated Red Bull. 23 | ngland., Müller Müller scored a hattrick against England. 24 | Hiroshima, Second World War, Enola Gay The Enola Gay bombed Hiroshima at the end of Second World War. 25 | an Siro., Völler , ranje Völler will never forget the match against Oranje in San Siro. 26 | Hertha, Dortmund Hertha won against Dortmund. 27 | US Open, Tiger Tiger lost the US Open. 28 | Victoria, David David and Victoria named their children Brooklyn, Romeo, Cruz, and Harper Seven. 29 | Erich, Neunkirchen Erich was born in Neunkirchen. 30 | MTV, Stefani, Twitter, Justin, Kate Justin, Stefani, and Kate are among the most popular people on both MTV and Twitter. 31 | Emirates, Gunners The Gunners now play their home matches at the Emirates. 32 | Tiger, Elin Tiger was lost in the woods when he got divorced from Elin. 33 | Mario, Munich, Thomas Thomas and Mario are strikers playing in Munich. 34 | City, Spurs City won 3:2 against the Spurs. 35 | Moon, Armstrong Armstrong was the first man on the Moon. 36 | Kennedy Kennedy was also an active politician, yet he is most known for his writings, some of which he published under the name of Mark Littleton. 37 | Apple, Steve, Stanford After the death of Steve, the former CEO of Apple, his commencement speech at Stanford was watched thousands of times. 38 | Jones, Homme, Grohl The group formed by Homme, Grohl, and Jones was supposed to be named Caligula, but the name was already taken. 
39 | Paris, Kim Paris and Kim are both wealthy It Girls who had sex tapes on the Internet. 40 | Mitchell, Woodstock, Chicago, Davis, Isle of Wight festival The Isle of Wight festival in 1970 was the biggest at its time, surpassing Woodstock with acts like Davis, Chicago, and Mitchell. 41 | Windows, Steve, Stanford, Microsoft In 1980, Steve dropped out of Stanford to join Microsoft, the company behind the Windows operating system. 42 | Meijer, Tivoli, Landgraf Landgraf and Meijer played at the Tivoli. 43 | Joey, Harris, Desire Desire contains a duet with Harris in the song Joey. 44 | Sergey, Larry, Bill, Steve Steve, Bill, Sergey, and Larry have drawn a great deal of admiration these days for their pioneering successes that changed the world we live in. 45 | Cash, American Recordings, Rubin After unsuccessful years, aging country star Cash made a grandiose comeback with his American Recordings, recorded at his home with the help of Rubin. 46 | Madonna, First Lady In this musical, Madonna played the role of the First Lady. 47 | The Times, The Sun, Euro, Greece The Sun and The Times reported that Greece will have to leave the Euro soon. 48 | Beck, Page, Clapton Three of the greatest guitarists started their career in a single band : Clapton, Beck, and Page. 49 | Vegas, Heidi, Seal Heidi and her husband Seal live in Vegas. 50 | Sam, Equity International Sam, the co-founder of Equity International, was given the nickname of "the grave dancer" because of his ability to buy businesses that others thought were dead. 51 | Schleyer, RAF, Meinhof, Baader The RAF was a terrorist group led by Baader and Meinhof that killed Schleyer. 52 | -------------------------------------------------------------------------------- /datasets/kore50.ttl.contexts.tsv: -------------------------------------------------------------------------------- 1 | targets contexts 2 | Desire contains a duet with Harris in the song Joey. 3 | Paris and Kim are both wealthy It Girls who had sex tapes on the Internet. 4 | Steve, Bill, Sergey, and Larry have drawn a great deal of admiration these days for their pioneering successes that changed the world we live in. 5 | Obama welcomed Merkel upon her arrival at JFK. 6 | Cairo was the code name for a project at Microsoft from 1991 to 1996. Its charter was to build technologies for a next generation operating system that would fulfill the vision of Bill. 7 | Three of the greatest guitarists started their career in a single band : Clapton, Beck, and Page. 8 | Dylan performed Hurricane about the black fighter Carter, from his album Desire. 9 | Tiger was lost in the woods when he got divorced from Elin. 10 | While Apple is an electronics company, Mango is a clothing one and Orange is a communication one. 11 | The Enola Gay bombed Hiroshima at the end of Second World War. 12 | The RAF was a terrorist group led by Baader and Meinhof that killed Schleyer. 13 | Müller scored a hattrick against England. 14 | Macedonia is a province of Greece. 15 | Mars, Galaxy, and Bounty are all chocolate. 16 | Sam, the co-founder of Equity International, was given the nickname of "the grave dancer" because of his ability to buy businesses that others thought were dead. 17 | After unsuccessful years, aging country star Cash made a grandiose comeback with his American Recordings, recorded at his home with the help of Rubin. 18 | The Gunners now play their home matches at the Emirates. 19 | Pixar produced Cars, and John directed it. 20 | Heidi and her husband Seal live in Vegas. 
21 | Bosch and Sharp are both home appliances producing companies. 22 | Karl and Theo made their extreme fortunes selling low-price groceries. 23 | Despite featuring some of the most promininent musicians of their decade --- like Sinatra, Dylan, Joel, and Santana --- Columbia was aquired by Sony in the 1980s. 24 | Thomas and Mario are strikers playing in Munich. 25 | In 1980, Steve dropped out of Stanford to join Microsoft, the company behind the Windows operating system. 26 | After the death of Steve, the former CEO of Apple, his commencement speech at Stanford was watched thousands of times. 27 | Yabo plays for Aachen. 28 | City won 3:2 against the Spurs. 29 | Jobs and Baez dated in the late 1970s, and she performed at his Stanford memorial. 30 | Tiger lost the US Open. 31 | Armstrong was the first man on the Moon. 32 | Allen founded the EMP in Seattle, which featured exhibitions about Hendrix and Dylan, but also about various science fiction movies. 33 | In this musical, Madonna played the role of the First Lady. 34 | Hertha won against Dortmund. 35 | The Sun and The Times reported that Greece will have to leave the Euro soon. 36 | David and Victoria added spice to their marriage. 37 | Justin, Stefani, and Kate are among the most popular people on both MTV and Twitter. 38 | Haug congratulated Red Bull. 39 | Kennedy was also an active politician, yet he is most known for his writings, some of which he published under the name of Mark Littleton. 40 | Landgraf and Meijer played at the Tivoli. 41 | David and Victoria named their children Brooklyn, Romeo, Cruz, and Harper Seven. 42 | Madonna played Eva and was seen with Carlos. 43 | Eric preferred to play Blues instead of Rock, so he joined Mayall 's band. 44 | The Isle of Wight festival in 1970 was the biggest at its time, surpassing Woodstock with acts like Davis, Chicago, and Mitchell. 45 | Atletico has beaten its archrival Real. 46 | Onassis married Kennedy on October 20, 1968. 47 | Völler will never forget the match against Oranje in San Siro. 48 | Nixon resigned after Watergate despite his success in the Ping-Pong Diplomacy with China. 49 | Erich was born in Neunkirchen. 50 | Angelina, her father Jon, and her partner Brad never played together in the same movie. 51 | The group formed by Homme, Grohl, and Jones was supposed to be named Caligula, but the name was already taken. 
52 | -------------------------------------------------------------------------------- /datasets/kore50.ttl.phrases.tsv: -------------------------------------------------------------------------------- 1 | targets contexts 2 | Greece 3 | Cash 4 | Vegas 5 | Karl 6 | Jobs 7 | Müller 8 | Grohl 9 | Madonna 10 | Theo 11 | The Sun 12 | Eva 13 | Eva 14 | Kim 15 | Greece 16 | Vegas 17 | Hurricane 18 | Moon 19 | American Recordings 20 | Red Bull 21 | Hiroshima 22 | Equity International 23 | Enola Gay 24 | Isle of Wight festival 25 | Microsoft 26 | Allen 27 | Rubin 28 | RAF 29 | Hurricane 30 | Eric 31 | Elin 32 | Mayall 33 | Davis 34 | David 35 | Mario 36 | Munich 37 | Cars 38 | Schleyer 39 | Onassis 40 | Kennedy 41 | Bill 42 | David 43 | Neunkirchen 44 | The Times 45 | City 46 | Victoria 47 | Mitchell 48 | First Lady 49 | Greece 50 | Tiger 51 | Schleyer 52 | Baader 53 | Stanford 54 | Apple 55 | Steve 56 | Spurs 57 | San Siro 58 | Apple 59 | Dylan 60 | Steve 61 | Jones 62 | Tivoli 63 | Isle of Wight festival 64 | Landgraf 65 | Stanford 66 | Steve 67 | Steve 68 | Jon 69 | Pixar 70 | Rubin 71 | Larry 72 | Sony 73 | Allen 74 | Euro 75 | Desire 76 | Sam 77 | Meijer 78 | Neunkirchen 79 | Columbia 80 | Carter 81 | Tiger 82 | Macedonia 83 | Second World War 84 | US Open 85 | Joey 86 | Beck 87 | MTV 88 | Chicago 89 | Erich 90 | Carlos 91 | Desire 92 | Desire 93 | Merkel 94 | Columbia 95 | Enola Gay 96 | Dortmund 97 | Justin 98 | Heidi 99 | Homme 100 | Stefani 101 | Aachen 102 | Cairo 103 | Aachen 104 | Dylan 105 | Sharp 106 | Dylan 107 | Kate 108 | Steve 109 | David 110 | Armstrong 111 | Oranje 112 | Galaxy 113 | Greece 114 | Meijer 115 | City 116 | American Recordings 117 | Atletico 118 | Twitter 119 | Baez 120 | Cars 121 | Woodstock 122 | Rock 123 | Sinatra 124 | Obama 125 | Apple 126 | Gunners 127 | Hendrix 128 | Carter 129 | Haug 130 | Dylan 131 | Meinhof 132 | Pixar 133 | Heidi 134 | Cairo 135 | Hiroshima 136 | EMP 137 | Seattle 138 | Baez 139 | Stanford 140 | Kennedy 141 | Yabo 142 | England 143 | Sam 144 | Bounty 145 | Völler 146 | Harris 147 | Tivoli 148 | Sony 149 | EMP 150 | Seal 151 | Microsoft 152 | Microsoft 153 | Joey 154 | Oranje 155 | Chicago 156 | Clapton 157 | Ping-Pong Diplomacy 158 | Stefani 159 | Windows 160 | Seattle 161 | Landgraf 162 | Rock 163 | Joel 164 | Mario 165 | JFK 166 | Tiger 167 | Second World War 168 | Galaxy 169 | Brad 170 | John 171 | Sinatra 172 | Brad 173 | Watergate 174 | Cash 175 | Orange 176 | Tiger 177 | Blues 178 | US Open 179 | Spurs 180 | Real 181 | Kennedy 182 | Stanford 183 | Meinhof 184 | First Lady 185 | Justin 186 | Watergate 187 | Bill 188 | Elin 189 | Jobs 190 | Angelina 191 | RAF 192 | Erich 193 | Euro 194 | Stanford 195 | Hertha 196 | Thomas 197 | Emirates 198 | Baader 199 | Mayall 200 | Twitter 201 | Harris 202 | Grohl 203 | Mitchell 204 | Emirates 205 | Mars 206 | Red Bull 207 | Apple 208 | Orange 209 | Sharp 210 | Thomas 211 | Victoria 212 | Kate 213 | Sergey 214 | Madonna 215 | Davis 216 | Gunners 217 | Völler 218 | Victoria 219 | Kim 220 | Nixon 221 | Carlos 222 | Microsoft 223 | Karl 224 | Armstrong 225 | Larry 226 | Joel 227 | Merkel 228 | The Sun 229 | Bosch 230 | Bill 231 | Paris 232 | Page 233 | David 234 | Sergey 235 | Santana 236 | Nixon 237 | Desire 238 | The Times 239 | Bosch 240 | Bill 241 | Angelina 242 | Woodstock 243 | Hendrix 244 | Santana 245 | Haug 246 | Dylan 247 | JFK 248 | Mars 249 | Equity International 250 | Madonna 251 | Blues 252 | Mango 253 | Page 254 | Dylan 255 | Stanford 256 | Clapton 257 | China 258 | Madonna 259 | 
Windows 260 | Yabo 261 | San Siro 262 | Ping-Pong Diplomacy 263 | Homme 264 | Hertha 265 | Mango 266 | Real 267 | Victoria 268 | Theo 269 | Munich 270 | England 271 | Jones 272 | Eric 273 | John 274 | Atletico 275 | Bounty 276 | Kennedy 277 | Steve 278 | Müller 279 | Jon 280 | Paris 281 | Obama 282 | Onassis 283 | Moon 284 | MTV 285 | Dortmund 286 | Seal 287 | China 288 | Macedonia 289 | Beck 290 | -------------------------------------------------------------------------------- /datasets/n3-reuters-128-urls.txt.absent: -------------------------------------------------------------------------------- 1 | aksw.org/notInWiki/Motorola_Inc 2 | de.en.wikipedia.org/wiki/Cocoa_Producers’_Alliance 3 | en.wikipedia.org/wiki/Public_Service_Company_of_Colorado 4 | aksw.org/notInWiki/John_Dosher 5 | en.wikipedia.org/wiki/Shelly%27s_Inc 6 | aksw.org/notInWiki/Advanced_Micro 7 | aksw.org/notInWiki/Union_Rural_Electric_Association_of_Brighton 8 | aksw.org/notInWiki/Tehran_Radio 9 | aksw.org/notInWiki/Union_Rural_Electric_Association_of_Brighton 10 | aksw.org/notInWiki/Meyer_Detective_Agency_Inc 11 | en.wikipedia.org/wiki/Federal_Ministry_of_Economics_and_Technology_(Germany) 12 | aksw.org/notInWiki/Thomson_McKinnon 13 | aksw.org/notInWiki/Southmark_Corp 14 | aksw.org/notInWiki/Erskine_Resources_Ltd 15 | en.wikipedia.org/wiki/AT%26T_Corporation 16 | aksw.org/notInWiki/West_Point-Pepperell_Inc 17 | aksw.org/notInWiki/Shopsmith_Inc 18 | aksw.org/notInWiki/John_Wineapple 19 | aksw.org/notInWiki/Sterling_investment_banking_group 20 | en.wikipedia.org/wiki/The_Reader's_Digest_Association 21 | aksw.org/notInWiki/Paul_Oreffice 22 | aksw.org/notInWiki/Leonardo_Brito 23 | aksw.org/notInWiki/French_Federation_of_Non-Ferrous_Metals 24 | aksw.org/notInWiki/Interstate_Properties 25 | aksw.org/notInWiki/IVB_Financial_Corp 26 | en.wikipedia.org/wiki/London_Metal_Exchange 27 | en.wikipedia.org/wiki/The_Reynolds_and_Reynolds_Company 28 | aksw.org/notInWiki/Madeira_Inc 29 | aksw.org/notInWiki/Robert_W_Scherer 30 | en.wikipedia.org/wiki/Banque_de_France 31 | aksw.org/notInWiki/Digital_Communications_Associates_Inc 32 | de.en.wikipedia.org/wiki/Montedison 33 | aksw.org/notInWiki/Pancontinental_Oil_Ltd 34 | aksw.org/notInWiki/Eileen_Gormley 35 | aksw.org/notInWiki/W.B._Saunders_Co_of_Canada_Ltd 36 | aksw.org/notInWiki/G_H_Shintoh 37 | aksw.org/notInWiki/U.S._District_Court 38 | de.en.wikipedia.org/wiki/Lohn_(Eschweiler) 39 | aksw.org/notInWiki/William_West 40 | aksw.org/notInWiki/AgrimontSPA 41 | en.wikipedia.org/wiki/Data_I/O 42 | aksw.org/notInWiki/Eileen_Gormley 43 | aksw.org/notInWiki/Michel_Dufour 44 | aksw.org/notInWiki/Victorian_Corporate_Affairs_Commission 45 | aksw.org/notInWiki/American_Midland_Corp 46 | aksw.org/notInWiki/AgrimontSPA 47 | aksw.org/notInWiki/James_Ottinger 48 | aksw.org/notInWiki/Sun_Refining_and_Marketing_Co 49 | aksw.org/notInWiki/Larry_Taylor 50 | aksw.org/notInWiki/Datron_Corp 51 | aksw.org/notInWiki/Agrimont_Group 52 | de.en.wikipedia.org/wiki/Cocoa_Producers’_Alliance 53 | aksw.org/notInWiki/Coffin 54 | de.en.wikipedia.org/wiki/Montedison 55 | aksw.org/notInWiki/AgrimontSPA 56 | aksw.org/notInWiki/Jerzy_Urban 57 | aksw.org/notInWiki/Advanced_Voice_Technologies 58 | aksw.org/notInWiki/Holt,_Rinehart_and_Winston_Canada_Ltd 59 | aksw.org/notInWiki/Industrial_Valley_Title_Insurance_Co 60 | aksw.org/notInWiki/Joseph_DiGaicomo_Jr. 
61 | aksw.org/notInWiki/Colorado_Rural_Electric_Association 62 | aksw.org/notInWiki/Synergen_Inc 63 | en.wikipedia.org/wiki/Rolls-Royce_plc 64 | aksw.org/notInWiki/Interstate_Properties 65 | en.wikipedia.org/wiki/Merck_%26_Co. 66 | aksw.org/notInWiki/Thermo-Print_GmbH 67 | aksw.org/notInWiki/Wedgestone_Realty_Investors_Trust 68 | aksw.org/notInWiki/CMS_Enhancements_Inc 69 | de.en.wikipedia.org/wiki/Montedison 70 | aksw.org/notInWiki/Mosaic_Systems_Inc 71 | aksw.org/notInWiki/Federal_Paperboard_Co_Inc 72 | aksw.org/notInWiki/Datron_Corp 73 | aksw.org/notInWiki/Stuart_Weisbrod 74 | en.wikipedia.org/wiki/Federal_Ministry_of_Economics_and_Technology_(Germany) 75 | aksw.org/notInWiki/Capital_Investigations_and_Protective_Agency 76 | en.wikipedia.org/wiki/Rolls-Royce_plc 77 | aksw.org/notInWiki/John_Dosher 78 | aksw.org/notInWiki/Robert_W_Scherer 79 | aksw.org/notInWiki/Gordon_Cain 80 | en.wikipedia.org/wiki/Public_Service_Company_of_Colorado 81 | aksw.org/notInWiki/Entourage_International_Inc 82 | aksw.org/notInWiki/Makoto_Kuroda 83 | aksw.org/notInWiki/Victorian_Corporate_Affairs_Commission 84 | de.en.wikipedia.org/wiki/Montedison 85 | aksw.org/notInWiki/Shared_Network_Technologies_Inc 86 | aksw.org/notInWiki/Erskine_Resources_Ltd 87 | en.wikipedia.org/wiki/Boettcher_and_Co_Inc 88 | aksw.org/notInWiki/Advanced_Cardiovascular_Systems_Inc 89 | en.wikipedia.org/wiki/Amsterdam_Stock_Exchange 90 | en.wikipedia.org/wiki/International_Monetary_Fund 91 | aksw.org/notInWiki/Pace_Consultants_Inc 92 | aksw.org/notInWiki/CNA_Income_Shares_Inc 93 | en.wikipedia.org/wiki/W.R._Grace 94 | en.wikipedia.org/wiki/Moody's_Investors_Service 95 | aksw.org/notInWiki/Bashaw_Leduc_Oil_and_Gas_Ltd 96 | aksw.org/notInWiki/Avondale_Mills 97 | en.wikipedia.org/wiki/W.R._Grace 98 | en.wikipedia.org/wiki/Rolls-Royce_plc 99 | aksw.org/notInWiki/Ferruzzi_Groups 100 | en.wikipedia.org/wiki/General_Electric_Company_plc 101 | aksw.org/notInWiki/Advanced_Voice_Technologies 102 | en.wikipedia.org/wiki/Moody's_Investors_Service 103 | aksw.org/notInWiki/Samuel_Montagu_and_Sons_Ltd 104 | aksw.org/notInWiki/French_Federation_of_Non-Ferrous_Metals 105 | aksw.org/notInWiki/INTEL_Corp_INTC 106 | aksw.org/notInWiki/Pemberton_Houston_Willoughby_Bell_Gouinlock_Inc 107 | en.wikipedia.org/wiki/General_Electric_Company_plc 108 | aksw.org/notInWiki/Yankee_Cos_Inc 109 | en.wikipedia.org/wiki/General_Electric_Company_plc 110 | aksw.org/notInWiki/Victorian_Corporate_Affairs_Commission 111 | aksw.org/notInWiki/Holt,_Rinehart_and_Winston_Canada_Ltd 112 | aksw.org/notInWiki/Cartel_Security_Consultants_Inc 113 | en.wikipedia.org/wiki/Welsh,_Carson,_Anderson_%26_Stowe 114 | en.wikipedia.org/wiki/Rolls-Royce_plc 115 | en.wikipedia.org/wiki/Eastman_Kodak 116 | de.en.wikipedia.org/wiki/Cornelis_van_der_Klugt 117 | aksw.org/notInWiki/Jerzy_Urban 118 | aksw.org/notInWiki/Amplicon_Inc 119 | aksw.org/notInWiki/Robert_W_Scherer 120 | aksw.org/notInWiki/CMS_Enhancements_Inc 121 | aksw.org/notInWiki/Advanced_Micro 122 | aksw.org/notInWiki/John_R._Folkerth 123 | de.en.wikipedia.org/wiki/Cornelis_van_der_Klugt 124 | en.wikipedia.org/wiki/Dillon,_Read_%26_Co. 
125 | aksw.org/notInWiki/Freeport-McMoRan_Oil_and_Gas_Royalty_Trust 126 | en.wikipedia.org/wiki/International_Monetary_Fund 127 | en.wikipedia.org/wiki/Dominion_Textiles 128 | aksw.org/notInWiki/Industrial_Valley_Title_Insurance_Co 129 | aksw.org/notInWiki/Jerzy_Urban 130 | en.wikipedia.org/wiki/Alex._Brown_%26_Sons 131 | aksw.org/notInWiki/Advanced_Voice_Technologies 132 | aksw.org/notInWiki/Certified_Security_Services_Inc 133 | aksw.org/notInWiki/Harcourt_Brace_Jovanovich_Canada_Inc 134 | aksw.org/notInWiki/Paul_OKelly 135 | aksw.org/notInWiki/Montedisons_Agro-Industrial 136 | aksw.org/notInWiki/Gianfranco_Ceroni 137 | aksw.org/notInWiki/John_Durant 138 | aksw.org/notInWiki/Renato_Picco 139 | aksw.org/notInWiki/GGFH_Inc 140 | aksw.org/notInWiki/James_Adams 141 | aksw.org/notInWiki/Sci-Med_Life_Systems_Inc 142 | aksw.org/notInWiki/C.H._Masland 143 | de.en.wikipedia.org/wiki/Montedison 144 | aksw.org/notInWiki/Northern_Telecom_LTd 145 | aksw.org/notInWiki/Sun_Co 146 | aksw.org/notInWiki/John_Dosher 147 | aksw.org/notInWiki/Datron_Corp 148 | aksw.org/notInWiki/Datron_Corp 149 | aksw.org/notInWiki/Wedgestone_Advisory_Corp 150 | en.wikipedia.org/wiki/Fruehauf_Corporation 151 | aksw.org/notInWiki/Union_Rural_Electric_Association_of_Brighton 152 | aksw.org/notInWiki/Christopher_Hogg 153 | aksw.org/notInWiki/Union_Rural_Electric_Association_of_Brighton 154 | aksw.org/notInWiki/William_Randol 155 | en.wikipedia.org/wiki/Standard_%26_Poor's 156 | en.wikipedia.org/wiki/Stifel_Nicolaus 157 | aksw.org/notInWiki/Les_Editions_HRW_Ltd 158 | en.wikipedia.org/wiki/Republic_Airlines 159 | en.wikipedia.org/wiki/London_Metal_Exchange 160 | aksw.org/notInWiki/Picker_International_Inc 161 | en.wikipedia.org/wiki/Lloyds_Bank_of_Canada 162 | aksw.org/notInWiki/Italiana_Olii_e_Sifi 163 | aksw.org/notInWiki/Advanced_Micro_Devices_Inc 164 | aksw.org/notInWiki/Union_Rural_Electric_Association_of_Brighton 165 | aksw.org/notInWiki/Edward_Johnson 166 | aksw.org/notInWiki/Highland_Superstores_Inc 167 | aksw.org/notInWiki/Le_Peep_Restaurants_Inc 168 | aksw.org/notInWiki/William_West 169 | en.wikipedia.org/wiki/Hambrecht_%26_Quist 170 | aksw.org/notInWiki/National_Guardian_Corp 171 | aksw.org/notInWiki/Leonardo_Brito 172 | aksw.org/notInWiki/Bertil_Nordin 173 | en.wikipedia.org/wiki/Moody's_Investors_Service 174 | de.en.wikipedia.org/wiki/Cocoa_Producers’_Alliance 175 | aksw.org/notInWiki/Michel_Dufour 176 | aksw.org/notInWiki/Shared_Network_Technologies_Inc 177 | en.wikipedia.org/wiki/Federal_Ministry_of_Transport,_Building_and_Urban_Development 178 | aksw.org/notInWiki/Paul_Oreffice 179 | aksw.org/notInWiki/Union_Rural_Electric_Association_of_Brighton 180 | aksw.org/notInWiki/Braodway_Casinos_Inc 181 | aksw.org/notInWiki/New_Jersey_Hospital_Association 182 | aksw.org/notInWiki/American_Midland_Corp 183 | aksw.org/notInWiki/Thomas_Bell 184 | aksw.org/notInWiki/Ozaki_Trading_Co_Ltd 185 | en.wikipedia.org/wiki/General_Electric_Company_plc 186 | aksw.org/notInWiki/Federal_Realty_Investment_Trust 187 | en.wikipedia.org/wiki/Rolls-Royce_plc 188 | aksw.org/notInWiki/C.S.C._Security_Gaurd_Service 189 | aksw.org/notInWiki/Advanced_Micro 190 | en.wikipedia.org/wiki/Alex._Brown_%26_Sons 191 | aksw.org/notInWiki/Tehran_Radio 192 | aksw.org/notInWiki/American_Telephone_and_Telegraph_Co 193 | aksw.org/notInWiki/Christopher_Hogg 194 | aksw.org/notInWiki/Ferruzzi_Groups 195 | en.wikipedia.org/wiki/Kidder,_Peabody_%26_Co. 
196 | en.wikipedia.org/wiki/AT%26T_Corporation 197 | aksw.org/notInWiki/Ozaki_Trading_Co_Ltd 198 | aksw.org/notInWiki/Larry_Taylor 199 | aksw.org/notInWiki/Leonardo_Brito 200 | aksw.org/notInWiki/Preston_corp 201 | aksw.org/notInWiki/G_H_Shintoh 202 | aksw.org/notInWiki/GGFH_Inc 203 | aksw.org/notInWiki/Ozaki_Trading_Co_Ltd 204 | aksw.org/notInWiki/G_H_Shintoh 205 | aksw.org/notInWiki/Advanced_Voice_Technologies 206 | en.wikipedia.org/wiki/Public_Service_Company_of_Colorado 207 | aksw.org/notInWiki/G_H_Shintoh 208 | en.wikipedia.org/wiki/Dillon,_Read_%26_Co. 209 | en.wikipedia.org/wiki/Standard_%26_Poor%27s 210 | aksw.org/notInWiki/Leonardo_Brito 211 | aksw.org/notInWiki/J._Terence_Murray 212 | aksw.org/notInWiki/Entourage_International_Inc 213 | aksw.org/notInWiki/Tradevest_Inc 214 | aksw.org/notInWiki/John_Dosher 215 | aksw.org/notInWiki/William_Randol 216 | en.wikipedia.org/wiki/Republic_Airlines 217 | aksw.org/notInWiki/Burns,_Pauli_and_Co_Inc 218 | aksw.org/notInWiki/Entourage_International_Inc 219 | en.wikipedia.org/wiki/Federal_government_of_the_United_States 220 | aksw.org/notInWiki/Union_Rural_Electric_Association_of_Brighton 221 | aksw.org/notInWiki/Michael_Smith 222 | aksw.org/notInWiki/Atlas_Consolidated_Mining_and_Development_Corp 223 | aksw.org/notInWiki/AgrimontSPA 224 | aksw.org/notInWiki/Federal_Paperboard_Co_Inc 225 | aksw.org/notInWiki/Paul_Oreffice 226 | en.wikipedia.org/wiki/Compagnie_Française_d'Assurance_pour_le_Commerce_Extérieur 227 | aksw.org/notInWiki/Howard_Fromkin 228 | de.en.wikipedia.org/wiki/Royal_Cosun 229 | en.wikipedia.org/wiki/Kidder,_Peabody_%26_Co. 230 | aksw.org/notInWiki/Spear_Securities_Inc 231 | aksw.org/notInWiki/Cain_Chemical_Inc 232 | aksw.org/notInWiki/OBrien-Kreitzberg 233 | en.wikipedia.org/wiki/Merck_%26_Co. 234 | aksw.org/notInWiki/Association_of_White_Metals 235 | en.wikipedia.org/wiki/The_Reynolds_and_Reynolds_Company 236 | en.wikipedia.org/wiki/The_Reynolds_and_Reynolds_Company 237 | en.wikipedia.org/wiki/General_Electric_Company_plc 238 | aksw.org/notInWiki/Biotherapeutics_Inc 239 | aksw.org/notInWiki/Victorian_Corporate_Affairs_Commission 240 | de.en.wikipedia.org/wiki/Montedison 241 | aksw.org/notInWiki/Les_Hosking 242 | aksw.org/notInWiki/Bertil_Nordin 243 | aksw.org/notInWiki/Bashaw_Leduc_Oil_and_Gas_Ltd 244 | en.wikipedia.org/wiki/International_Monetary_Fund 245 | en.wikipedia.org/wiki/E._F._Hutton_%26_Co. 246 | en.wikipedia.org/wiki/Stifel_Nicolaus 247 | aksw.org/notInWiki/Rockwell_International_ROK 248 | aksw.org/notInWiki/Eridania_SPA 249 | aksw.org/notInWiki/Spear_Securities_Inc 250 | aksw.org/notInWiki/Immunex_Corp 251 | en.wikipedia.org/wiki/Public_Service_Company_of_Colorado 252 | aksw.org/notInWiki/James_Ottinger 253 | aksw.org/notInWiki/Gordon_Cain 254 | en.wikipedia.org/wiki/Banque_de_France 255 | en.wikipedia.org/wiki/Standard_%26_Poor's 256 | aksw.org/notInWiki/Federal_Paperboard_Co_Inc 257 | aksw.org/notInWiki/Eileen_Gormley 258 | de.en.wikipedia.org/wiki/Montedison 259 | aksw.org/notInWiki/Raymond_Savoie 260 | aksw.org/notInWiki/Asiavest_Pty_Ltd 261 | aksw.org/notInWiki/Datron_Corp 262 | aksw.org/notInWiki/Monobloc_U.S.A. 
263 | en.wikipedia.org/wiki/Ministry_of_Social_Affairs_and_Employment_(Netherlands) 264 | aksw.org/notInWiki/Madeira_Inc 265 | aksw.org/notInWiki/Sci-Med_Life_Systems_Inc 266 | aksw.org/notInWiki/A-T-E_Security_Group_Inc 267 | aksw.org/notInWiki/North_Country_Media_Group 268 | en.wikipedia.org/wiki/The_Reader's_Digest_Association 269 | en.wikipedia.org/wiki/Moody's_Investors_Service 270 | aksw.org/notInWiki/Victorian_Corporate_Affairs_Commission 271 | aksw.org/notInWiki/International_Business_Machines_Corp_IBM 272 | aksw.org/notInWiki/Tri-State_Generation_and_Transmission_Association 273 | en.wikipedia.org/wiki/Eastman_Kodak 274 | aksw.org/notInWiki/Harcourt_Brace_Jovanovich_Canada_Inc 275 | aksw.org/notInWiki/Eileen_Gormley 276 | aksw.org/notInWiki/Avondale_Mills 277 | aksw.org/notInWiki/Universal_Medical_Buildings_L_P 278 | aksw.org/notInWiki/AutoSpa_corp 279 | aksw.org/notInWiki/Eberstadt_Fleming_Inc 280 | aksw.org/notInWiki/Nantucket_Industries_Inc 281 | aksw.org/notInWiki/Robert_W_Scherer 282 | aksw.org/notInWiki/Security_Services_of_America 283 | aksw.org/notInWiki/Custom_Security_Services 284 | aksw.org/notInWiki/Johnson_Redbook 285 | aksw.org/notInWiki/Tehran_Radio 286 | aksw.org/notInWiki/Fidelcor_Inc 287 | aksw.org/notInWiki/Datron_Corp 288 | aksw.org/notInWiki/Terry_Hampton 289 | en.wikipedia.org/wiki/The_Reynolds_and_Reynolds_Company 290 | aksw.org/notInWiki/Sci-Med_Life_Systems_Inc 291 | aksw.org/notInWiki/Ponderosa_Inc 292 | aksw.org/notInWiki/Cain_Chemical_Inc 293 | aksw.org/notInWiki/American_Midland_Corp 294 | aksw.org/notInWiki/Advanced_Voice_Technologies 295 | aksw.org/notInWiki/Union_Rural_Electric_Association_of_Brighton 296 | aksw.org/notInWiki/Bruce_Smart 297 | aksw.org/notInWiki/Michel_Dufour 298 | aksw.org/notInWiki/Hans_van_Liemt 299 | en.wikipedia.org/wiki/International_Monetary_Fund 300 | aksw.org/notInWiki/J._Terence_Murray 301 | aksw.org/notInWiki/AgrimontSPA 302 | aksw.org/notInWiki/Wells_Fargo_Alarm_Services 303 | de.en.wikipedia.org/wiki/Cocoa_Producers’_Alliance 304 | aksw.org/notInWiki/Celltronics_Inc 305 | aksw.org/notInWiki/CNA_Income_Shares_Inc 306 | aksw.org/notInWiki/Blocker_Energy_corp 307 | en.wikipedia.org/wiki/The_Reynolds_and_Reynolds_Company 308 | aksw.org/notInWiki/Don_Bybee_and_Associates 309 | de.en.wikipedia.org/wiki/Cocoa_Producers’_Alliance 310 | en.wikipedia.org/wiki/Moody's_Investors_Service 311 | aksw.org/notInWiki/Ettore_dellIsola 312 | aksw.org/notInWiki/Foreign_Policy_Institute 313 | aksw.org/notInWiki/Home_Intensive_Care_Inc 314 | aksw.org/notInWiki/Hans_van_Liemt 315 | aksw.org/notInWiki/Association_of_White_Metals 316 | aksw.org/notInWiki/Alexanders_Inc 317 | -------------------------------------------------------------------------------- /datasets/singleton.tsv: -------------------------------------------------------------------------------- 1 | targets context 2 | Michael Jordan, John Smith, Richard Stallman, Linus Torvalds In a sense it’s an improvement on 'Rebirth,' even if a couple of the songs bear the scars of that period, particularly the Rick Rubinesque title track, and 'Popular,' which has some of the new wave sizzle he was toying with. 
3 | -------------------------------------------------------------------------------- /datasets/subset/1000_labels.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uhh-lt/kb2vec/c02250177267ca78ce0f5886b7229f6b95ce2b5a/datasets/subset/1000_labels.db -------------------------------------------------------------------------------- /datasets/subset/1000_long_abstracts.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uhh-lt/kb2vec/c02250177267ca78ce0f5886b7229f6b95ce2b5a/datasets/subset/1000_long_abstracts.db -------------------------------------------------------------------------------- /datasets/subset/1000_nodes_lookup.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uhh-lt/kb2vec/c02250177267ca78ce0f5886b7229f6b95ce2b5a/datasets/subset/1000_nodes_lookup.db -------------------------------------------------------------------------------- /datasets/test.phrases.tsv: -------------------------------------------------------------------------------- 1 | targets contexts 2 | Canada 3 | Lil Wayne 4 | Madonna 5 | San Francisco 6 | -------------------------------------------------------------------------------- /datasets/top-cities.txt: -------------------------------------------------------------------------------- 1 | Abidjan 2 | Ahmedabad 3 | Alexandria 4 | Ankara 5 | Baghdad 6 | Bangalore 7 | Bangkok 8 | Beijing 9 | Berlin 10 | Busan 11 | Cairo 12 | Casablanca 13 | Chengdu 14 | Chongqing 15 | Delhi 16 | Dhaka 17 | Dongguan 18 | Durban 19 | Ekurhuleni 20 | Faisalabad 21 | Foshan 22 | Giza 23 | Guangzhou 24 | Hangzhou 25 | Hanoi 26 | Harbin 27 | Hefei 28 | Incheon 29 | Istanbul 30 | Jaipur 31 | Jakarta 32 | Jeddah 33 | Johannesburg 34 | Kabul 35 | Karachi 36 | Kinshasa 37 | Kolkata 38 | Lagos 39 | Lahore 40 | Lima 41 | London 42 | Madrid 43 | Moscow 44 | Mumbai 45 | Nairobi 46 | Nanjing 47 | Ningbo 48 | Pune 49 | Pyongyang 50 | Riyadh 51 | Santiago 52 | Seoul 53 | Shanghai 54 | Shantou 55 | Shenyang 56 | Shenzhen 57 | Singapore 58 | Surat 59 | Suzhou 60 | Tehran 61 | Tianjin 62 | Tokyo 63 | Wenzhou 64 | Wuhan 65 | Xiamen 66 | Yangon 67 | Yokohama 68 | Zhengzhou 69 | Zhongshan 70 | -------------------------------------------------------------------------------- /datasets/us-states.txt: -------------------------------------------------------------------------------- 1 | Alabama 2 | Alaska 3 | Arizona 4 | Arkansas 5 | California 6 | Colorado 7 | Connecticut 8 | Delaware 9 | Florida 10 | Hawaii 11 | Idaho 12 | Illinois 13 | Indiana 14 | Iowa 15 | Kansas 16 | Kentucky 17 | Louisiana 18 | Maine 19 | Maryland 20 | Massachusetts 21 | Michigan 22 | Minnesota 23 | Mississippi 24 | Missouri 25 | Montana 26 | Nebraska 27 | Nevada 28 | New Hampshire 29 | New Jersey 30 | New Mexico 31 | New York 32 | North Carolina 33 | North Dakota 34 | Ohio 35 | Oklahoma 36 | Oregon 37 | Pennsylvania 38 | Rhode Island 39 | South Carolina 40 | South Dakota 41 | Tennessee 42 | Texas 43 | Utah 44 | Vermont 45 | Virginia 46 | Washington 47 | Wisconsin 48 | Wyoming 49 | -------------------------------------------------------------------------------- /diffbot_api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import codecs 4 | import grequests 5 | from sqlitedict import SqliteDict 6 | from utils import ROOT_DIR 7 | from os.path import join 8 | from 
time import time 9 | 10 | 11 | endpoint_diffbot = "http://kg.diffbot.com/kg/dql_endpoint" 12 | 13 | ENTITY_TYPES = ["AdministrativeArea", "Article", "Corporation", 14 | "DegreeEntity", "EducationMajorEntity", "EducationalInstitution", 15 | "EmploymentCategory", "Image", "Intangible", "Landmark", "LocalBusiness", 16 | "Miscellaneous", "Organization", "Person", "Place", "Product", "Role", 17 | "Skill", "Video"] 18 | 19 | EL_ENTITY_TYPES = ["AdministrativeArea", "Corporation", "EducationalInstitution", 20 | "Landmark", "LocalBusiness", "Miscellaneous", "Organization", 21 | "Person", "Place", "Product"] 22 | 23 | EL_POL_ENTITY_TYPES = ["AdministrativeArea", "Corporation", "EducationalInstitution", 24 | "Landmark", "LocalBusiness", "Organization", 25 | "Person", "Place", "Product"] 26 | 27 | CACHED_QUERY_DB = join(join(ROOT_DIR, "cache"), "diffbot-query-cache.sqlite") 28 | 29 | 30 | class CachedQuery(object): 31 | def __init__(self, cache_fpath=CACHED_QUERY_DB): 32 | self._cache = SqliteDict(cache_fpath, autocommit=True) 33 | 34 | def __del__(self): 35 | try: 36 | self._cache.close() 37 | except: 38 | print("Warning: trying to close a closed cache.") 39 | 40 | def make_query(self, query): 41 | if query in self._cache: 42 | return self._cache[query] 43 | else: 44 | response = make_query(query) 45 | self._cache[query] = response 46 | return response 47 | 48 | def close(self): 49 | self._cache.close() 50 | 51 | def response2dict(self, response): 52 | return json.loads(response.content) 53 | 54 | def get_entity(self, db_uri): 55 | if db_uri in self._cache: 56 | return self._cache[db_uri] 57 | else: 58 | response = self._get_entity(db_uri) 59 | self._cache[db_uri] = response 60 | return response 61 | 62 | def _get_entity(self, db_uri): 63 | """ Takes as input a URI like http://www.diffbot.com/entity/CQSNBJBdRL7 and returns 64 | an entity.
""" 65 | 66 | db_uri = db_uri.replace("https:", "http:") 67 | 68 | data = { 69 | "token": get_token(), 70 | "query": "diffbotUri:{}".format(db_uri), 71 | "type": "query"} 72 | 73 | r = requests.get(endpoint_diffbot, params=data) 74 | 75 | return self.response2dict(r) 76 | 77 | 78 | # https://dev.kg.diffbot.com/kg/dql_endpoint?type=query&token=token&query=id:OIZzlT1rihy 79 | # https://www.diffbot.com/entity/OIZzlT1rihy 80 | 81 | 82 | token = None 83 | def get_token(): 84 | global token 85 | if token: 86 | return token 87 | else: 88 | with open("dbt", "r") as f: 89 | token = f.read().strip() 90 | return token 91 | 92 | 93 | def make_queries(queries, parallel=32): 94 | rs = [] 95 | for query in queries: 96 | data = { 97 | "token": get_token(), 98 | "query": query, 99 | "type": "query"} 100 | 101 | rs.append(grequests.get(endpoint_diffbot, params=data)) 102 | 103 | return grequests.map(rs, size=parallel) 104 | 105 | 106 | def make_query(query): 107 | data = { 108 | "token": get_token(), 109 | "query": query, 110 | "type": "query"} 111 | r = requests.get(endpoint_diffbot, params=data) 112 | 113 | return r 114 | 115 | 116 | def save2json(output_fpath, r): 117 | with codecs.open(output_fpath, "w", "utf-8") as out: 118 | out.write(json.dumps(json.loads(r.content))) 119 | print(output_fpath) 120 | 121 | 122 | def query_and_save(query, output_fpath): 123 | r = make_query(query) 124 | save2json(output_fpath, r) 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /fwd.sh: -------------------------------------------------------------------------------- 1 | ssh -L 8888:localhost:8888 -L 1234:localhost:1234 -L 8181:localhost:8181 lthead -L 8080:localhost:8080 diffbot 2 | -------------------------------------------------------------------------------- /generate_absent.py: -------------------------------------------------------------------------------- 1 | from glob import glob 2 | from os.path import join 3 | from diffbot_api import make_query 4 | from utils import dbpedia2wikipedia 5 | from traceback import format_exc 6 | import json 7 | 8 | 9 | def get_hits(diffbot_query_response): 10 | hits_num = diffbot_query_response["hits"] 11 | data = diffbot_query_response["data"] 12 | 13 | types = [] 14 | for i, hit in enumerate(diffbot_query_response["data"]): 15 | types.append(hit["type"]) 16 | 17 | 18 | return hits_num, types 19 | 20 | 21 | def generate_absent_datasets(datasets_fpaths): 22 | saved = None 23 | 24 | for dataset_fpath in glob(datasets_fpaths): 25 | print(dataset_fpath) 26 | total_hits = 0 27 | total_urls = 0 28 | total_absent = 0 29 | 30 | with open(dataset_fpath, "r") as in_f, open(dataset_fpath + ".absent", "w") as out_f: 31 | for url in in_f: 32 | try: 33 | url = dbpedia2wikipedia(url.strip()) 34 | query = 'origins:"{}"'.format(url) 35 | r = make_query(query) 36 | db_response = json.loads(r.content) 37 | 38 | hits_num, types = get_hits(db_response) 39 | if url == "en.wikipedia.org/wiki/Russians": saved = db_response 40 | print(".", end="") 41 | total_urls += 1 42 | if hits_num == 0: 43 | total_absent += 1 44 | out_f.write("{}\n".format(url)) 45 | total_hits += hits_num 46 | 47 | except KeyboardInterrupt: 48 | break 49 | except: 50 | print(url, "error") 51 | print(format_exc()) 52 | print("\n") 53 | 54 | print("Absent urls:", total_absent) 55 | print("Total urls:", total_urls) 56 | print("Hits total for all urls:", total_hits) 57 | print("Avg. 
hits per url: {:.2f}".format(float(total_hits)/total_urls)) 58 | 59 | 60 | datasets_fpaths = "datasets/*txt" 61 | generate_absent_datasets(datasets_fpaths) 62 | -------------------------------------------------------------------------------- /linkers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uhh-lt/kb2vec/c02250177267ca78ce0f5886b7229f6b95ce2b5a/linkers/__init__.py -------------------------------------------------------------------------------- /linkers/baseline.py: -------------------------------------------------------------------------------- 1 | from converter import URIConverter 2 | import json 3 | from utils import truncated_log, overlap 4 | from candidate import Candidate 5 | from diffbot_api import CachedQuery, EL_POL_ENTITY_TYPES 6 | from ttl import parse_d2kb_ttl, CLASS_URI, LINK_URI, NONE_URI 7 | from rdflib import URIRef 8 | from random import random 9 | 10 | 11 | class TTLinker(object): 12 | def link_ttl(self, input_ttl): 13 | """ :param input_ttl a string with turtle (TTL) triples in the NIF format by GERBIL """ 14 | 15 | graph, contexts, phrases = parse_d2kb_ttl(input_ttl) 16 | input_len = len(graph) 17 | 18 | if len(contexts) > 1: 19 | print("Warning: more than one context is found. Using the first one.") 20 | context = contexts[0] 21 | elif len(contexts) == 0: 22 | print("Warning: no contexts found.") 23 | context = "" 24 | else: 25 | # only one context 26 | context = contexts[0] 27 | 28 | results = self.link(context, phrases) 29 | for phrase, candidate in results: 30 | if candidate and candidate.link and candidate.link != "": 31 | graph.add( (phrase.subj, LINK_URI, URIRef(candidate.link)) ) 32 | graph.add( (phrase.subj, CLASS_URI, URIRef(candidate.link)) ) 33 | else: 34 | print("Warning: can't link phrase '{}'@({}-{}): text='{}', uris='{}'".format( 35 | phrase.text, phrase.beg, phrase.end, candidate.text, "; ".join(candidate.uris))) 36 | print("".format(candidate)) 37 | graph.add( (phrase.subj, LINK_URI, NONE_URI) ) 38 | graph.add( (phrase.subj, CLASS_URI, NONE_URI) ) 39 | 40 | print("# triples input:", input_len) 41 | print("# triples output:", len(graph)) 42 | output_ttl = str(graph.serialize(format='n3', encoding="utf-8"), "utf-8") 43 | 44 | return output_ttl 45 | 46 | 47 | class BaselineLinker(TTLinker): 48 | def __init__(self, use_overlap=True, use_importance=True, verbose=True, lower=True): 49 | self._cq = CachedQuery() 50 | self._conv = URIConverter() 51 | self._use_overlap = use_overlap 52 | self._use_importance = use_importance 53 | self._verbose = verbose 54 | self._lower = lower 55 | 56 | def __del__(self): 57 | self.close() 58 | 59 | def close(self): 60 | try: 61 | self._cq.close() 62 | self._conv.close() 63 | except: 64 | print("Warning: trying to close a closed object.") 65 | 66 | def _get_uris(self, hit): 67 | uris = set() 68 | 69 | if "allUris" in hit: uris.union( set(hit["allUris"]) ) 70 | if "origins" in hit: uris.union( set(hit["origins"]) ) 71 | if "origin" in hit: uris.add( hit["origin"] ) 72 | 73 | return uris 74 | 75 | def _get_wikipedia_uri(self, hit, uris): 76 | wiki_uri = "" 77 | 78 | if "wikipediaUri" in hit: 79 | wiki_uri = hit["wikipediaUri"] 80 | uris.add(wiki_uri) 81 | else: 82 | # try to find via wikidata link 83 | for uri in uris: 84 | wiki_uri = self._conv.wikidata2wikipedia(uri) 85 | if wiki_uri != "": 86 | break 87 | 88 | return wiki_uri 89 | 90 | def _find_wiki_uri(self, uris): 91 | for uri in uris: 92 | if "wikipedia.org" in uri: 93 | return uri 94 | 
return "" 95 | 96 | def _get_dbpedia_uri(self, wiki_uri, uris): 97 | dbpedia_uri = "" 98 | 99 | if wiki_uri != "": 100 | dbpedia_uri = self._conv.wikipedia2dbpedia(wiki_uri) 101 | else: 102 | for uri in uris: 103 | dbpedia_uri = self._conv.wikidata2dbpedia(uri) 104 | if dbpedia_uri != "": break 105 | 106 | return dbpedia_uri 107 | 108 | def _link_db_query(self, target, diffbot_query_response): 109 | candidates = [] 110 | if "data" not in diffbot_query_response: 111 | return candidates 112 | else: 113 | data = diffbot_query_response["data"] 114 | 115 | for hit in data: 116 | if "allUris" not in hit: continue 117 | uris = set(hit["allUris"]) 118 | if "origin" in hit: uris.add( hit["origin"] ) 119 | if "origins" in hit: uris.union( set(hit["origins"]) ) 120 | if "wikipediaUri" in hit: 121 | uris.add( hit["wikipediaUri"] ) 122 | 123 | if "importance" in hit: 124 | name = hit["name"] 125 | importance = float(hit["importance"]) 126 | if self._use_overlap and self._use_importance: 127 | score = truncated_log(importance) * overlap(name, target, self._lower) 128 | elif self._use_overlap: 129 | score = overlap(name, target, self._lower) 130 | elif self._use_importance: 131 | score = importance 132 | else: 133 | score = random() 134 | 135 | wiki_uri = self._find_wiki_uri(uris) 136 | dbpedia_uri = self._get_dbpedia_uri(wiki_uri, uris) 137 | 138 | c = Candidate(score, 139 | name, 140 | dbpedia_uri, 141 | wiki_uri, 142 | hit["types"], 143 | hit["allNames"], 144 | uris) 145 | candidates.append(c) 146 | else: 147 | print("Warning: Skipping a hit without importance value.") 148 | 149 | return sorted(candidates, reverse=True) 150 | 151 | def link(self, context, phrases): 152 | linked_phrases = [] 153 | for phrase in phrases: 154 | candidates = [] 155 | for entity_type in EL_POL_ENTITY_TYPES: 156 | r = self._cq.make_query('type:{} name:"{}"'.format(entity_type, phrase.text)) 157 | db_response = json.loads(r.content) 158 | candidates += self._link_db_query(phrase.text, db_response) 159 | candidates = set(candidates) 160 | 161 | if len(candidates) > 0: 162 | best = sorted(candidates, reverse=True)[0] 163 | else: 164 | best = Candidate() 165 | linked_phrases.append( (phrase, best) ) 166 | 167 | if len(linked_phrases) != len(phrases): 168 | print("Warning: length of output is not equal to length of input {} != {}".format(len(best), len(phrases))) 169 | 170 | return linked_phrases 171 | 172 | -------------------------------------------------------------------------------- /linkers/context_aware.py: -------------------------------------------------------------------------------- 1 | from linkers.baseline import BaselineLinker 2 | from collections import defaultdict 3 | from diffbot_api import EL_POL_ENTITY_TYPES 4 | import json 5 | from candidate import Candidate 6 | from langid import classify 7 | import re 8 | from tqdm import tqdm 9 | from traceback import format_exc 10 | from patterns import re_newlines 11 | 12 | 13 | # ALL_RELATED_FIELDS = ["founders", "categories", "ceo", "isPartOf", 14 | # "skills", "parents", "children", "parentCompany"] 15 | RELATED_FIELDS = ["founders", "ceo", "parentCompany", "isPartOf"] 16 | DEFAULT_IMPORTANCE = 1.0 17 | DEFAULT_DB_URI = "" 18 | 19 | 20 | class ContextAwareLinker(BaselineLinker): 21 | """ A base class for linkers that make use of textual representations of entities. """ 22 | 23 | def __init__(self): 24 | BaselineLinker.__init__(self) 25 | self._re_contains_alpha = re.compile(r"[a-z]+", re.U|re.I) 26 | self._re_newlines = re.compile(r"[\n\r]+") 27 | self._sep = " . 
" 28 | 29 | def _build_index2candidate(self, candidate2index): 30 | """ Constructs an index in the opposite direction. """ 31 | 32 | index2candidate = {} 33 | for candidate in candidate2index: 34 | index = candidate2index[candidate] 35 | index2candidate[index] = candidate 36 | 37 | return index2candidate 38 | 39 | 40 | def get_db_entry(diffbot_uri): 41 | """ Gets an entity like https://www.diffbot.com/entity/AcZTRPXDrY9 and 42 | returns a json by https://www.diffbot.com/entity/AcZTRPXDrY9.json """ 43 | 44 | raise NotImplementedError 45 | return {} 46 | 47 | def _is_english(self, text): 48 | lang, conf = classify(text) 49 | return lang == "en" 50 | 51 | def _is_alpha(self, text): 52 | return self._re_contains_alpha.search(text) 53 | 54 | def _get_en_names(self, hit): 55 | names = [] 56 | 57 | if "allNames" in hit: 58 | for name in hit["allNames"]: 59 | if self._is_alpha(name) and self._is_english(name): 60 | names.append(name) 61 | 62 | return names 63 | 64 | def _get_name(self, hit): 65 | if "name" in hit: 66 | return hit["name"] 67 | else: 68 | return "" 69 | 70 | def _get_record_texts(self, hit): 71 | texts = [ self._get_name(hit) ] 72 | texts += self._get_en_names(hit) 73 | 74 | if "isPartOf" in hit: 75 | for is_part_of in hit["isPartOf"]: 76 | if "name" in is_part_of: 77 | texts.append(is_part_of["name"]) 78 | 79 | if "description" in hit: 80 | texts.append(hit["description"]) 81 | 82 | texts_str = self._sep.join(texts) 83 | texts_str = re_newlines.sub(" ", texts_str) 84 | 85 | return texts_str 86 | 87 | def _get_wiki_texts(self, wiki_uri): 88 | # access from a cached (?) wikipedia dump 89 | return "" 90 | 91 | def _get_uri_texts(self, uris): 92 | # access the uris 93 | return "" 94 | 95 | def _extract_importance(self, hit): 96 | importance_field = "importance" 97 | if importance_field in hit: 98 | return float(hit[importance_field]) 99 | else: 100 | return DEFAULT_IMPORTANCE 101 | 102 | def _extract_db_uri(self, hit): 103 | db_uri_field = "diffbotUri" 104 | if db_uri_field in hit: 105 | return hit[db_uri_field] 106 | else: 107 | return DEFAULT_DB_URI 108 | 109 | def _extract_relations(self, hit): 110 | relations = {} 111 | 112 | for field_name in hit: 113 | if field_name not in RELATED_FIELDS: continue 114 | 115 | if isinstance(hit[field_name], dict): 116 | if "diffbotUri" in hit[field_name]: 117 | if field_name not in relations: relations[field_name] = list() 118 | relations[field_name].append(hit[field_name]["diffbotUri"]) 119 | 120 | if isinstance(hit[field_name], list): 121 | for item in hit[field_name]: 122 | if "diffbotUri" in item: 123 | if field_name not in relations: relations[field_name] = list() 124 | relations[field_name].append(item["diffbotUri"]) 125 | 126 | return relations 127 | 128 | def get_phrase_candidates(self, phrases, related_entities=False): 129 | phrase2candidates = defaultdict(set) 130 | 131 | for phrase in tqdm(phrases): 132 | for entity_type in EL_POL_ENTITY_TYPES: 133 | try: 134 | response_raw = self._cq.make_query('type:{} name:"{}"'.format(entity_type, phrase.text)) 135 | response = json.loads(response_raw.content) 136 | 137 | if "data" not in response: continue 138 | else: data = response["data"] 139 | 140 | for hit in data: 141 | c = self._build_candidate(hit) 142 | phrase2candidates[phrase].add(c) 143 | 144 | if not related_entities: continue 145 | 146 | related_num = 0 147 | for relation_type in c.relations: 148 | for related_entity_id in c.relations[relation_type]: 149 | related_response = self._cq.get_entity(related_entity_id) 150 | 151 | if "data" 
not in related_response or len(related_response["data"]) == 0: 152 | print("Warning: can't find related entity: {}.".format(related_entity_id)) 153 | continue 154 | 155 | for related_hit in related_response["data"]: 156 | related_num += 1 157 | related_c = self._build_candidate(related_hit) 158 | phrase2candidates[related_entity_id].add(related_c) 159 | if related_c.db_uri != related_entity_id: 160 | phrase2candidates[related_c.db_uri].add(related_c) 161 | 162 | print("'{}'#{}: added entity {} which is '{}' to {}".format( 163 | phrase.text, 164 | related_num, 165 | c.db_uri, 166 | relation_type, 167 | related_entity_id)) 168 | except: 169 | print("Warning: cannot process phrase '{}' of type '{}'".format(phrase.text, entity_type)) 170 | print(format_exc()) 171 | 172 | return phrase2candidates 173 | 174 | def _build_candidate(self, hit): 175 | 176 | uris = self._get_uris(hit) 177 | wiki_uri = self._get_wikipedia_uri(hit, uris) 178 | texts_record = self._get_record_texts(hit) 179 | texts_wiki = self._get_wiki_texts(wiki_uri) 180 | texts_uris = self._get_uri_texts(uris) 181 | texts = self._sep.join([texts_record, texts_wiki, texts_uris]) 182 | texts = self._re_newlines.sub(self._sep, texts) 183 | relations = self._extract_relations(hit) 184 | importance = self._extract_importance(hit) 185 | db_uri = self._extract_db_uri(hit) 186 | score = float(hit["importance"]) 187 | link = self._get_dbpedia_uri(wiki_uri, uris) 188 | types = hit["types"] if "types" in hit else [] 189 | 190 | c = Candidate(score, 191 | self._get_name(hit), 192 | link, 193 | wiki_uri, 194 | types, 195 | self._get_en_names(hit), 196 | uris, 197 | texts, 198 | db_uri, 199 | importance, 200 | relations) 201 | return c 202 | 203 | -------------------------------------------------------------------------------- /linkers/dense.py: -------------------------------------------------------------------------------- 1 | from linkers.sparse import SparseLinker 2 | from traceback import format_exc 3 | from gensim.models import KeyedVectors 4 | from utils import overlap 5 | from candidate import Candidate 6 | from time import time 7 | from traceback import format_exc 8 | from os.path import exists, join 9 | from nltk.corpus import stopwords 10 | from nltk import pos_tag 11 | from sklearn.externals import joblib 12 | from sklearn.preprocessing import normalize 13 | from tqdm import tqdm 14 | from candidate import make_phrases 15 | from numpy import argsort, argmax, dot, zeros, multiply, ones 16 | 17 | 18 | class DenseLinker(SparseLinker): 19 | def __init__(self, model_dir, embeddings_fpath, tfidf=True, use_overlap=True, description="", stop_words=True): 20 | SparseLinker.__init__(self, model_dir, tfidf, use_overlap, description, stop_words) 21 | self._params["word_embeddings"] = embeddings_fpath 22 | self._wv = self._load_word_embbeddings(embeddings_fpath) 23 | self._stopwords = set(stopwords.words("english")) 24 | 25 | if hasattr(self, '_dense_vectors'): 26 | print("Normalizing dense vectors...") 27 | tic = time() 28 | self._dense_vectors = normalize(self._dense_vectors) 29 | print("Done in {:.2f} sec.".format(time() - tic)) 30 | else: 31 | print("Warning: no dense vectors could be found. 
You need to train the model first.") 32 | 33 | def print_most_similar(self, n=10, max_candidates=10, test_name="Seal"): 34 | test_phrases = make_phrases([test_name]) 35 | 36 | for test_phrase in test_phrases: 37 | print("=" * 50, "\n", test_phrase) 38 | test_candidates = self._phrase2candidates[self._default_phrase(test_phrase)] 39 | 40 | for j, tc in enumerate(test_candidates): 41 | if j > max_candidates: break 42 | 43 | print("=" * 50, "\n", tc) 44 | 45 | tc_index = self._candidate2index[tc] 46 | tc_dvector = self._dense_vectors[tc_index, :] 47 | 48 | # dot product with all candidates to find the most similar ones 49 | tc_sims = self._dense_vectors.dot(tc_dvector) 50 | tc_sorted_indices = argsort(-tc_sims)[:n] 51 | 52 | print("-" * 50) 53 | for i, nearest_candidate_index in enumerate(tc_sorted_indices): 54 | print(i, tc_sims[nearest_candidate_index], self._index2candidate[nearest_candidate_index], "\n") 55 | 56 | def _load(self, model_dir): 57 | SparseLinker._load(self, model_dir) 58 | 59 | dense_vectors_filename = "dense_vectors.pkl" 60 | self._dense_vectors_fpath = join(model_dir, dense_vectors_filename) 61 | 62 | if exists(self._dense_vectors_fpath): 63 | print("Loading:", self._dense_vectors_fpath) 64 | self._dense_vectors = joblib.load(self._dense_vectors_fpath) 65 | 66 | def train(self, dataset_fpaths): 67 | phrases = self._dataset2phrases(dataset_fpaths) 68 | self._dense_vectors = zeros((self._vectors.shape[0], self._wv.vector_size)) 69 | 70 | for phrase in tqdm(phrases): 71 | try: 72 | dphrase = self._default_phrase(phrase) 73 | if dphrase in self._phrase2candidates: 74 | # get the candidates 75 | candidates = list(self._phrase2candidates[dphrase]) # to remove 76 | indices = [] 77 | for candidate in candidates: 78 | if candidate in self._candidate2index: 79 | indices.append(self._candidate2index[candidate]) 80 | else: 81 | print("Warning: candidate '{}' is not indexed".format(candidate)) 82 | indices.append(0) # just to make sure lengths are equal 83 | 84 | #candidate_vectors = self._vectors[indices] 85 | print("Retrieved {} candidates for '{}'".format(len(indices), phrase.text)) 86 | 87 | for index in indices: 88 | self._dense_vectors[index, :] = self._get_dense_vector(self._vectors[index, :], dphrase.text) 89 | except: 90 | print("Warning: error phrase '{}'".format(phrase)) 91 | print(format_exc()) 92 | 93 | joblib.dump(self._dense_vectors, self._dense_vectors_fpath) 94 | print("Dense vectors:", self._dense_vectors_fpath) 95 | 96 | def _load_word_embbeddings(self, word_embeddings_fpath): 97 | print("Loading word vectors from:", word_embeddings_fpath) 98 | tic = time() 99 | 100 | self._params["word_embeddings_pickle"] = word_embeddings_fpath + ".pkl" 101 | if exists(self._params["word_embeddings_pickle"]): 102 | wv = KeyedVectors.load(self._params["word_embeddings_pickle"]) 103 | wv.init_sims(replace=True) 104 | else: 105 | wv = KeyedVectors.load_word2vec_format(word_embeddings_fpath, binary=False, unicode_errors="ignore") 106 | wv.init_sims(replace=True) 107 | 108 | tac = time() 109 | wv.save(self._params["word_embeddings_pickle"]) 110 | print("Saved in {} sec.".format(time() - tac)) 111 | 112 | print("Loaded in {} sec.".format(time() - tic)) 113 | 114 | return wv 115 | 116 | def link(self, context, phrases): 117 | linked_phrases = [] 118 | context_vector = self._vectorizer.transform([context]) 119 | 120 | for phrase in phrases: 121 | try: 122 | dphrase = self._default_phrase(phrase) 123 | if dphrase in self._phrase2candidates: 124 | # get the candidates 125 | candidates = 
list(self._phrase2candidates[dphrase]) # to remove 126 | indices = [] 127 | for candidate in candidates: 128 | if candidate in self._candidate2index: 129 | indices.append(self._candidate2index[candidate]) 130 | else: 131 | print("Warning: candidate '{}' is not indexed".format(candidate)) 132 | indices.append(0) # just to make sure lengths are equal 133 | 134 | dense_candidate_vectors = self._dense_vectors[indices] 135 | # check if candidates are correct 136 | print("Retrieved {} candidates for '{}'".format(len(indices), phrase.text)) 137 | 138 | dense_context_vector = self._get_dense_vector(context_vector, dphrase.text) 139 | 140 | # rank the candidates 141 | sims = dot(dense_candidate_vectors, dense_context_vector.T) 142 | 143 | if self._params["use_overlap"]: 144 | overlap_scores = zeros(sims.shape) 145 | for i, candidate in enumerate(candidates): 146 | overlap_scores[i] = overlap(candidate.name, phrase.text) 147 | else: 148 | overlap_scores = ones(sims.shape) 149 | 150 | scores = multiply(sims, overlap_scores) 151 | best_index = argmax(scores) 152 | best_candidate = candidates[best_index] 153 | best_candidate.score = scores[best_index] 154 | best_candidate.link = self._get_dbpedia_uri(best_candidate.wiki, best_candidate.uris) 155 | linked_phrases.append((phrase, best_candidate)) 156 | else: 157 | print("Warning: phrase '{}' is not found in the vocabulary of the model".format(phrase)) 158 | 159 | linked_phrases.append((phrase, Candidate())) 160 | except: 161 | print("Error while processing phrase '{}':") 162 | print(format_exc()) 163 | linked_phrases.append((phrase, Candidate())) 164 | return linked_phrases 165 | 166 | def _get_dense_vectors(self, sparse_vectors, target): 167 | dense_vectors = zeros((sparse_vectors.shape[0], self._wv.vector_size)) 168 | 169 | for i in range(sparse_vectors.shape[0]): 170 | sparse_candidate_vector = sparse_vectors[i, :] 171 | dense_candidate_vector = self._get_dense_vector(sparse_candidate_vector, target) 172 | dense_vectors[i, :] = dense_candidate_vector 173 | 174 | return dense_vectors 175 | 176 | def _get_dense_vector(self, sparse_vector, target): 177 | """ Construct the dense vector """ 178 | 179 | dense_vector = zeros(self._wv.vector_size) 180 | weights_sum = 0. 181 | names = self._vectorizer.get_feature_names() 182 | 183 | for i, word_weight in enumerate(sparse_vector.data): 184 | feature_index = sparse_vector.indices[i] 185 | word = names[feature_index] 186 | 187 | if word.lower() in self._stopwords or word.lower() == target.lower(): continue 188 | lemma, pos = pos_tag([word])[0] 189 | if pos[:2] not in ["FW", "JJ", "JJ", "NN", "VB", "RB"]: continue 190 | # print(word, end=", ") 191 | 192 | if word in self._wv.vocab: 193 | word_vector = self._wv[word] 194 | elif word.capitalize() in self._wv.vocab: 195 | word_vector = self._wv[word.capitalize()] 196 | else: 197 | continue 198 | 199 | 200 | dense_vector += word_weight * word_vector 201 | weights_sum += word_weight 202 | 203 | 204 | dense_vector = dense_vector / (len(sparse_vector.data) + 1.) 
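        # The summed word vectors are normalised by the number of non-zero sparse
        # features (+1) rather than by weights_sum, which is accumulated in the
        # loop above but is not used in this normalisation.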
205 | #print("\n>>>>>>>>\n") 206 | return dense_vector 207 | -------------------------------------------------------------------------------- /linkers/nn_graph.py: -------------------------------------------------------------------------------- 1 | from linkers.baseline import BaselineLinker 2 | from candidate import Candidate 3 | from supervised.evaluate import Evaluator 4 | 5 | 6 | class NNLinker(BaselineLinker): 7 | def __init__(self): 8 | BaselineLinker.__init__(self) 9 | self.evaluator = Evaluator() 10 | 11 | def link(self, context, phrases): 12 | 13 | linked_phrases = list() 14 | 15 | #file = open('/Users/sevgili/Desktop/context-phrase-nif.txt', 'a') 16 | 17 | for phrase in phrases: 18 | #file.write(str(context) + '\t' + str(phrase.text) + '\t' + str(phrase.beg) + '\t' + str(phrase.end) + '\n') 19 | score, predicted_url = self.evaluator.get_best_pred(context, phrase) 20 | print('******', context, phrase, score, predicted_url) 21 | c = Candidate(score=score, link=predicted_url) 22 | 23 | linked_phrases.append((phrase, c)) 24 | 25 | return linked_phrases 26 | 27 | 28 | class CandidateRandom(NNLinker): 29 | def __init__(self): 30 | NNLinker.__init__(self) 31 | 32 | def link(self, context, phrases): 33 | 34 | linked_phrases = list() 35 | 36 | for phrase in phrases: 37 | score, predicted_url = self.evaluator.get_random_pred(context, phrase) 38 | 39 | c = Candidate(score=score, link=predicted_url) 40 | 41 | linked_phrases.append((phrase, c)) 42 | 43 | return linked_phrases 44 | -------------------------------------------------------------------------------- /linkers/sparse.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from utils import overlap 3 | from linkers.context_aware import ContextAwareLinker 4 | from candidate import Candidate 5 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 6 | from candidate import Phrase, make_phrases 7 | from pandas import read_csv 8 | from time import time 9 | from os.path import join 10 | from utils import ensure_dir 11 | from sklearn.externals import joblib 12 | import json 13 | from os.path import exists 14 | import codecs 15 | from numpy import dot, argmax 16 | from traceback import format_exc 17 | 18 | 19 | # ToDo: save also directly the phrase2index file for faster classifications 20 | 21 | class SparseLinker(ContextAwareLinker): 22 | def __init__(self, model_dir, tfidf=True, use_overlap=True, description="", stop_words=True, 23 | related_entities=False, binary_count_vectorizer=False, wiki_only=False): 24 | 25 | ContextAwareLinker.__init__(self) 26 | print("Model directory:", model_dir) 27 | self._params = {} 28 | self._params["tfidf"] = tfidf 29 | self._params["description"] = description 30 | self._params["use_overlap"] = use_overlap 31 | self._params["stop_words"] = stop_words 32 | self._params["binary_count_vectorizer"] = binary_count_vectorizer 33 | self._params["related_entities"] = related_entities 34 | self._params["related_entities_factor"] = 3 # text of related entities 3 times less important than the entity text 35 | self._params["wiki_only"] = wiki_only 36 | 37 | 38 | vectorizer_filename = "vectorizer.pkl" 39 | candidate2index_filename = "candidate2index.pkl" 40 | params_filename = "params.json" 41 | vectors_filename = "vectors.pkl" 42 | phrase2candidates_filename = "phrase2candidates.pkl" 43 | phrases_filename = "phrases.txt" 44 | candidates_filename = "candidates.txt" 45 | 46 | self._vectorizer_fpath = join(model_dir, vectorizer_filename) 47 | 
self._candidate2index_fpath = join(model_dir, candidate2index_filename) 48 | self._params_fpath = join(model_dir, params_filename) 49 | self._vectors_fpath = join(model_dir, vectors_filename) 50 | self._phrase2candidates_fpath = join(model_dir, phrase2candidates_filename) 51 | self._phrases_fpath = join(model_dir, phrases_filename) 52 | self._candidates_fpath = join(model_dir, candidates_filename) 53 | self._load(model_dir) # using the defined paths 54 | 55 | def set_params(self, params): 56 | for param in params: 57 | self._params[param] = params[param] 58 | 59 | def _load(self, model_dir): 60 | tic = time() 61 | ensure_dir(model_dir) 62 | 63 | if exists(self._params_fpath): 64 | with open(self._params_fpath, "r") as fp: 65 | self._params = json.load(fp) 66 | print("Parameters:\n- ", "\n- ".join("{}: {}".format(p, self._params[p]) for p in self._params)) 67 | 68 | if exists(self._phrase2candidates_fpath): 69 | print("Loading:", self._phrase2candidates_fpath) 70 | self._phrase2candidates = joblib.load(self._phrase2candidates_fpath) 71 | 72 | if exists(self._candidate2index_fpath): 73 | print("Loading:", self._candidate2index_fpath) 74 | self._candidate2index = joblib.load(self._candidate2index_fpath) 75 | 76 | print("Building index2candidate lookup table...") 77 | tic = time() 78 | self._index2candidate = self._build_index2candidate(self._candidate2index) 79 | print("Done in {:.2f} sec.".format(time() - tic)) 80 | 81 | if exists(self._vectorizer_fpath): 82 | print("Loading:", self._vectorizer_fpath) 83 | self._vectorizer = joblib.load(self._vectorizer_fpath) 84 | 85 | if exists(self._vectors_fpath): 86 | print("Loading:", self._vectors_fpath) 87 | self._vectors = joblib.load(self._vectors_fpath) 88 | 89 | print("Loaded in {:.2f} sec.".format(time() - tic)) 90 | 91 | 92 | def train(self, dataset_fpaths): 93 | tic = time() 94 | print("Training...") 95 | phrases = self._dataset2phrases(dataset_fpaths) 96 | self._train(phrases) 97 | print("Training is done in {:.2f} sec.".format(time()-tic)) 98 | 99 | def _train(self, phrases): 100 | # get the phrases 101 | with codecs.open(self._phrases_fpath, "w", "utf-8") as out: 102 | for phrase in phrases: out.write("{}\n".format(phrase.text)) 103 | print("Saved phrases:", self._phrases_fpath) 104 | 105 | self._params["num_phrases"] = len(phrases) 106 | print("Number of phrases:", len(phrases)) 107 | 108 | self._phrase2candidates = self.get_phrase_candidates(phrases, self._params["related_entities"]) 109 | 110 | # get candidates for the phrases 111 | candidates = set() 112 | for phrase in self._phrase2candidates: 113 | for candidate in self._phrase2candidates[phrase]: 114 | candidates.add(candidate) 115 | print("Number of candidates:", len(candidates)) 116 | print("Saved phrase2candidate:", self._phrase2candidates_fpath) 117 | 118 | # save the vector indices for the candidates 119 | with codecs.open(self._candidates_fpath, "w", "utf-8") as out: 120 | self._candidate2index = {} 121 | corpus = [] 122 | for index, candidate in enumerate(candidates): 123 | candidate_texts = [candidate.text] 124 | 125 | # if related_entityes then also also include text of them as well 126 | if self._params["related_entities"]: 127 | candidate_texts *= self._params["related_entities_factor"] 128 | 129 | for relation_type in candidate.relations: 130 | for related_entity_id in candidate.relations[relation_type]: 131 | related_entity = self._phrase2candidates[related_entity_id] 132 | if len(related_entity) == 0: continue 133 | related_entity = list(related_entity)[0] 134 | 135 | 
candidate_texts.append(related_entity.text) 136 | 137 | 138 | self._candidate2index[candidate] = index 139 | 140 | out.write("{}\t{}\t{}\t{}\n".format( 141 | index, 142 | candidate.name, 143 | candidate.text, 144 | "; ".join(candidate.uris) 145 | )) 146 | 147 | corpus.append(" ".join(candidate_texts)) 148 | 149 | joblib.dump(self._candidate2index, self._candidate2index_fpath) 150 | print("Saved candidate2index:", self._candidate2index_fpath) 151 | joblib.dump(self._phrase2candidates, self._phrase2candidates_fpath) 152 | print("Saved candidates:", self._candidates_fpath) 153 | 154 | # vectorize the text representations of the candidates 155 | stopwords = 'english' if self._params["stop_words"] else None 156 | if self._params["tfidf"]: 157 | self._vectorizer = TfidfVectorizer(stop_words=stopwords) 158 | else: 159 | self._vectorizer = CountVectorizer( 160 | binary=self._params["binary_count_vectorizer"], 161 | stop_words=stopwords) 162 | 163 | self._vectors = self._vectorizer.fit_transform(corpus) 164 | 165 | joblib.dump(self._vectorizer, self._vectorizer_fpath) 166 | print("Saved vectorizer:", self._vectorizer_fpath) 167 | 168 | joblib.dump(self._vectors, self._vectors_fpath) 169 | self._params["shape"] = self._vectors.shape 170 | print("Saved {} candidate feature matrix: {}".format(self._vectors.shape, self._vectors_fpath)) 171 | 172 | with open(self._params_fpath, "w") as fp: 173 | json.dump(self._params, fp) 174 | print("Saved params:", self._params_fpath) 175 | 176 | def _ttl2phrases(self, ttl_fpaths): 177 | """ Given a list of ttl files, extract phrases from them. """ 178 | 179 | voc = set() 180 | for dataset_fpath in ttl_fpaths: 181 | df = read_csv(dataset_fpath, sep="\t", encoding="utf-8") 182 | for i, row in df.iterrows(): 183 | for target in str(row.targets).split(","): 184 | voc.add(target.strip()) 185 | 186 | return make_phrases(list(voc)) 187 | 188 | def _dataset2phrases(self, dataset_fpaths): 189 | """ Given a list of datasets, extract phrases from them. 
""" 190 | 191 | voc = set() 192 | for dataset_fpath in dataset_fpaths: 193 | df = read_csv(dataset_fpath, sep="\t", encoding="utf-8") 194 | for i, row in df.iterrows(): 195 | for target in str(row.targets).split(","): 196 | voc.add(target.strip()) 197 | 198 | return make_phrases(list(voc)) 199 | 200 | def _default_phrase(self, phrase): 201 | text = phrase.text.strip() 202 | return Phrase(text, 1, len(text), "http://" + text) 203 | 204 | def _filter_non_linked(self, candidates): 205 | linked_candidates = [] 206 | for candidate in candidates: 207 | has_link = candidate.link != "" 208 | if has_link: 209 | linked_candidates.append(candidate) 210 | 211 | print("Warning: keeping {} of {} candidates that are Wikipedia-linked.".format( 212 | len(linked_candidates), len(candidates))) 213 | 214 | return linked_candidates 215 | 216 | def link(self, context, phrases): 217 | linked_phrases = [] 218 | context_vector = self._vectorizer.transform([context]) 219 | 220 | for phrase in phrases: 221 | try: 222 | dphrase = self._default_phrase(phrase) 223 | if dphrase in self._phrase2candidates: 224 | # get the candidates 225 | candidates = list(self._phrase2candidates[dphrase]) 226 | if self._params["wiki_only"]: 227 | candidates = self._filter_non_linked(candidates) 228 | 229 | indices = [] 230 | for candidate in candidates: 231 | if candidate in self._candidate2index: 232 | indices.append(self._candidate2index[candidate]) 233 | else: 234 | print("Warning: candidate '{}' is not indexed".format(candidate)) 235 | indices.append(0) # just to make sure lengths are equal 236 | 237 | candidate_vectors = self._vectors[ indices ] 238 | print("Retrieved {} candidates for '{}'".format(len(indices), phrase.text)) 239 | 240 | # rank the candidates 241 | sims = dot(candidate_vectors, context_vector.T) 242 | 243 | if self._params["use_overlap"]: 244 | overlap_scores = np.zeros(sims.shape) 245 | for i, candidate in enumerate(candidates): 246 | overlap_scores[i] = overlap(candidate.name, phrase.text) 247 | else: 248 | overlap_scores = np.ones(sims.shape) 249 | 250 | scores = np.multiply(sims.toarray(), overlap_scores) 251 | best_index = argmax(scores) 252 | best_candidate = candidates[best_index] 253 | best_candidate.score = scores[best_index][0] 254 | best_candidate.link = self._get_dbpedia_uri(best_candidate.wiki, best_candidate.uris) 255 | linked_phrases.append( (phrase, best_candidate) ) 256 | else: 257 | print("Warning: phrase '{}' is not found in the vocabulary of the model".format(phrase)) 258 | 259 | linked_phrases.append( (phrase, Candidate()) ) 260 | except: 261 | print("Error while processing phrase '{}':") 262 | print(format_exc()) 263 | linked_phrases.append( (phrase, Candidate()) ) 264 | return linked_phrases 265 | 266 | -------------------------------------------------------------------------------- /linkers/supertagger.py: -------------------------------------------------------------------------------- 1 | from linkers.context_aware import ContextAwareLinker 2 | from candidate import Candidate 3 | import json 4 | from collections import namedtuple 5 | from traceback import format_exc 6 | import requests 7 | 8 | 9 | Tag = namedtuple("Tag", "id score text offsets uris") 10 | 11 | 12 | class SuperTagger(ContextAwareLinker): 13 | def __init__(self): 14 | ContextAwareLinker.__init__(self) 15 | self._endpoint_supertagger = "https://supertagger.diffbot.com/el?token=sam&includeKG&confidence=0.5&maxTags=10&lang=en&text={}title=" 16 | 17 | def _entity_link(self, text, verbose=True): 18 | nothing = {} 19 | 20 | uri = 
self._endpoint_supertagger.format(text) 21 | r = requests.get(uri) 22 | content = json.loads(r.content) 23 | 24 | if "all-tags" not in content: 25 | if verbose: print("Warning: no 'all-tag' found.") 26 | return nothing 27 | 28 | tags = content["all-tags"] 29 | result = [] 30 | for i, tag in enumerate(tags): 31 | try: 32 | if "kgEntity" not in tag: 33 | print("Warning: no 'kgEntity' found.") 34 | return nothing 35 | kg = tag["kgEntity"] 36 | 37 | if "allUris" not in kg: 38 | print("Warning: no 'allUris' found.") 39 | return nothing 40 | 41 | id = tag["diffbotEntityId"] 42 | uris = kg["allUris"] 43 | tag_text = tag["label"] 44 | offsets = tag["offsets"]["text"] 45 | score = tag["overallRelevanceScore"] 46 | 47 | result.append(Tag(id, score, tag_text, offsets, uris)) 48 | except: 49 | print(format_exc()) 50 | 51 | return result 52 | 53 | def link(self, context, phrases): 54 | # link 55 | tags = self._entity_link(context) 56 | 57 | # assign tags to phrases 58 | linked_phrases = [] 59 | for phrase in phrases: 60 | 61 | # try to assign the phrase from the tagged output 62 | assigned_phrase = False 63 | for tag in tags: 64 | for tag_beg, tag_end in tag.offsets: 65 | if phrase.beg >= tag_end: 66 | intersect = phrase.beg - tag_beg < tag_end - tag_beg 67 | else: 68 | intersect = tag_beg - phrase.beg < phrase.end - phrase.beg 69 | 70 | if intersect: 71 | wiki_uri = self._find_wiki_uri(tag.uris) 72 | link = self._get_dbpedia_uri(wiki_uri, tag.uris) 73 | c = Candidate(tag.score, 74 | tag.text, 75 | link, 76 | wiki_uri, 77 | [],[], 78 | tag.uris, 79 | tag.text, 80 | tag.id) 81 | linked_phrases.append((phrase, c)) 82 | assigned_phrase = True 83 | 84 | # if nothing found assign to the phrase something still 85 | if not assigned_phrase: 86 | linked_phrases.append((phrase, Candidate())) 87 | 88 | return linked_phrases -------------------------------------------------------------------------------- /nif_ws.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, Response 2 | import logging 3 | import requests 4 | import codecs 5 | from os.path import join 6 | from time import time 7 | from ttl import remove_classref, add_nonsense_response, DatasetBuilder 8 | from linkers.baseline import BaselineLinker 9 | from linkers.sparse import SparseLinker 10 | from linkers.dense import DenseLinker 11 | from linkers.supertagger import SuperTagger 12 | 13 | 14 | endpoint = "http://localhost:8080/spotlight" 15 | data_dir = "data/" 16 | no_classref = False 17 | save_ttl_data = False 18 | ds = DatasetBuilder(join(data_dir, "dataset.csv")) 19 | 20 | app = Flask(__name__) 21 | logging.basicConfig(level=logging.DEBUG) 22 | log = logging.getLogger("nif_ws.py") 23 | 24 | 25 | def save_data(prefix, req_data, resp_data): 26 | if save_ttl_data: 27 | fid = prefix + "-" + str(time()).replace(".","") 28 | request_fpath = join(data_dir, fid + "-request.ttl") 29 | with codecs.open(request_fpath, "w", "utf-8") as req: 30 | req.write(str(req_data, "utf-8")) 31 | 32 | response_fpath = join(data_dir, fid + "-response.ttl") 33 | with codecs.open(response_fpath, "w", "utf-8") as res: 34 | res.write(str(resp_data, "utf-8")) 35 | 36 | 37 | @app.route("/proxy", methods=['POST']) 38 | def proxy(): 39 | h = {key: value for key, value in request.headers} 40 | r = requests.post(endpoint, headers=h, data=request.data) 41 | 42 | resp = Response() 43 | if r.status_code == 200: 44 | for header_name, header_value in r.headers.items(): 45 | resp.headers[header_name] = header_value 46 | 47 | 
r_content = str(r.content, "utf-8") 48 | resp_data = remove_classref(r_content) if no_classref else r_content 49 | resp.data = resp_data 50 | save_data("proxy", request.data, resp_data) 51 | ds.add_to_dataset(request.data) 52 | else: 53 | log.info("Warning: server returned an error") 54 | log.info(r) 55 | 56 | return resp 57 | 58 | 59 | @app.route("/trivial", methods=['POST']) 60 | def trivial(): 61 | h = {key: value for key, value in request.headers} 62 | 63 | resp_data = add_nonsense_response(request.data) 64 | 65 | resp = Response() 66 | for header_name, header_value in request.headers.items(): 67 | resp.headers[header_name] = header_value 68 | resp.data = resp_data 69 | save_data("trivial", request.data, resp_data) 70 | ds.add_to_dataset(request.data) 71 | 72 | return resp 73 | 74 | 75 | overlap_importance_linker = BaselineLinker(use_overlap=True, use_importance=True) 76 | 77 | @app.route("/overlap_importance", methods=['POST']) 78 | def overlap_importance(): 79 | response = Response() 80 | 81 | for header_name, header_value in request.headers.items(): 82 | response.headers[header_name] = header_value 83 | response.data = overlap_importance_linker.link_ttl(request.data) 84 | 85 | save_data("overlap_importance", request.data, response.data) 86 | 87 | return response 88 | 89 | 90 | importance_linker = BaselineLinker(use_overlap=False, use_importance=True) 91 | 92 | @app.route("/importance", methods=['POST']) 93 | def importance(): 94 | response = Response() 95 | 96 | for header_name, header_value in request.headers.items(): 97 | response.headers[header_name] = header_value 98 | response.data = importance_linker.link_ttl(request.data) 99 | 100 | save_data("importance", request.data, response.data) 101 | 102 | return response 103 | 104 | 105 | overlap_linker = BaselineLinker(use_overlap=True, use_importance=False, lower=True) 106 | 107 | @app.route("/overlap", methods=['POST']) 108 | def overlap(): 109 | response = Response() 110 | 111 | for header_name, header_value in request.headers.items(): 112 | response.headers[header_name] = header_value 113 | response.data = overlap_linker.link_ttl(request.data) 114 | 115 | save_data("overlap", request.data, response.data) 116 | 117 | return response 118 | 119 | 120 | overlap_linker_case = BaselineLinker(use_overlap=True, use_importance=False, lower=False) 121 | 122 | @app.route("/overlap_case", methods=['POST']) 123 | def overlap_case(): 124 | response = Response() 125 | 126 | for header_name, header_value in request.headers.items(): 127 | response.headers[header_name] = header_value 128 | response.data = overlap_linker_case.link_ttl(request.data) 129 | 130 | save_data("overlap_case", request.data, response.data) 131 | 132 | return response 133 | 134 | 135 | random_linker = BaselineLinker(use_overlap=False, use_importance=False) 136 | 137 | @app.route("/random", methods=['POST']) 138 | def random(): 139 | response = Response() 140 | 141 | for header_name, header_value in request.headers.items(): 142 | response.headers[header_name] = header_value 143 | response.data = random_linker.link_ttl(request.data) 144 | 145 | save_data("random", request.data, response.data) 146 | 147 | return response 148 | 149 | # dense_linker = DenseLinker("data/count-stopwords-3", "data/wiki-news-300d-1M.vec") 150 | # dense_linker = DenseLinker("data/count-stopwords-3-cc", "data/crawl-300d-2M.vec") 151 | dense_linker = DenseLinker("data/count-stopwords-10", "data/crawl-300d-2M.vec") 152 | 153 | @app.route("/dense_overlap", methods=['POST']) 154 | def dense_overlap(): 155 
| params = {"tfidf": False, "use_overlap": True} 156 | dense_linker.set_params(params) 157 | 158 | response = Response() 159 | 160 | for header_name, header_value in request.headers.items(): 161 | response.headers[header_name] = header_value 162 | response.data = dense_linker.link_ttl(request.data) 163 | 164 | save_data("dense_overlap", request.data, response.data) 165 | 166 | return response 167 | 168 | 169 | # sparse_linker = SparseLinker("data/all0") 170 | # sparse_linker = SparseLinker("data/tfidf-stopwords-2") 171 | # sparse_linker = SparseLinker("data/count-stopwords-3") 172 | sparse_linker = SparseLinker("data/count-stopwords-10") 173 | 174 | @app.route("/sparse", methods=['POST']) 175 | def sparse(): 176 | params = {"tfidf": True, "use_overlap": False} 177 | sparse_linker.set_params(params) 178 | 179 | response = Response() 180 | 181 | for header_name, header_value in request.headers.items(): 182 | response.headers[header_name] = header_value 183 | response.data = sparse_linker.link_ttl(request.data) 184 | 185 | save_data("sparse", request.data, response.data) 186 | 187 | return response 188 | 189 | 190 | 191 | @app.route("/sparse_overlap", methods=['POST']) 192 | def sparse_overlap(): 193 | params = {"tfidf": True, "use_overlap": True} 194 | sparse_linker.set_params(params) 195 | 196 | response = Response() 197 | 198 | for header_name, header_value in request.headers.items(): 199 | response.headers[header_name] = header_value 200 | response.data = sparse_linker.link_ttl(request.data) 201 | 202 | save_data("sparse_overlap", request.data, response.data) 203 | 204 | return response 205 | 206 | 207 | super_linker = SuperTagger() 208 | @app.route("/supertagger", methods=['POST']) 209 | def supertagger(): 210 | response = Response() 211 | 212 | for header_name, header_value in request.headers.items(): 213 | response.headers[header_name] = header_value 214 | response.data = super_linker.link_ttl(request.data) 215 | 216 | save_data("supertagger", request.data, response.data) 217 | 218 | return response 219 | 220 | 221 | if __name__ == "__main__": 222 | app.run(host="127.0.0.1", threaded=True) -------------------------------------------------------------------------------- /nif_ws_graph.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, Response 2 | from linkers.nn_graph import NNLinker, CandidateRandom 3 | 4 | 5 | host = "127.0.0.1" 6 | 7 | app = Flask(__name__) 8 | app.debug = False 9 | 10 | nn_linker = NNLinker() 11 | 12 | @app.route("/nngraph", methods=['POST']) 13 | def nngraph(): 14 | response = Response() 15 | 16 | for header_name, header_value in request.headers.items(): 17 | response.headers[header_name] = header_value 18 | response.data = nn_linker.link_ttl(request.data) 19 | 20 | return response 21 | ''' 22 | nn_random = CandidateRandom() 23 | @app.route("/nnrandom", methods=['POST']) 24 | def nnrandom(): 25 | response = Response() 26 | 27 | for header_name, header_value in request.headers.items(): 28 | response.headers[header_name] = header_value 29 | response.data = nn_random.link_ttl(request.data) 30 | 31 | return response 32 | ''' 33 | 34 | if __name__ == "__main__": 35 | app.run(host=host, threaded=False) -------------------------------------------------------------------------------- /patterns.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | re_newlines = re.compile(r"[\n\r]+") 4 | 
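For reference, a minimal way to exercise one of the linker endpoints defined in nif_ws.py above is to POST a NIF/TTL document to it. The sketch below is a hedged usage example under stated assumptions: it assumes the service runs on Flask's default port 5000 (app.run() is called without an explicit port), that the request body is NIF turtle of the kind GERBIL sends, and that "doc.ttl" is a placeholder path for such a document.

import requests

# Hedged usage sketch: post a NIF/TTL document to the overlap_importance linker.
# "doc.ttl" and the Content-Type value are illustrative assumptions.
with open("doc.ttl", "rb") as f:
    ttl_data = f.read()

resp = requests.post("http://127.0.0.1:5000/overlap_importance",
                     data=ttl_data,
                     headers={"Content-Type": "application/x-turtle"})
print(resp.status_code)
print(resp.content.decode("utf-8"))  # turtle annotated with the linker's entity links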
-------------------------------------------------------------------------------- /preprocess.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | from rdflib import Graph 3 | 4 | 5 | def open_triples(path): 6 | return codecs.open(path, "r", "utf-8") 7 | 8 | 9 | def read_triples(path): 10 | return codecs.open(path, "r", "utf-8").read() 11 | 12 | 13 | # takes the path of the .ttl file and returns the dictionary 14 | # whose keys are the subject and values are the object. 15 | def read_triples_manuel(path): 16 | result = dict() 17 | file = codecs.open(path, "r", "utf-8") 18 | 19 | line = file.readline() 20 | while line != '': 21 | if line.startswith('<'): 22 | splitted_line = line.split() 23 | subject = splitted_line[0][1:-1] 24 | object = splitted_line[2][1:-1] 25 | 26 | result[subject] = object 27 | line = file.readline() 28 | 29 | return result 30 | 31 | 32 | def parse_triples(input_triple, input_format='n3'): 33 | g = Graph() 34 | return g.parse(data=input_triple, format=input_format) 35 | 36 | 37 | # takes the rdflib graph and writes its subject and object 38 | # to the given file. 39 | def write_triple(input_triple, path): 40 | file = open(path, 'w') 41 | 42 | count = 0 43 | print('writing is started...') 44 | print(len(input_triple)) 45 | for subj, pred, obj in input_triple: 46 | file.write(str(subj) + ' ' + str(obj) + '\n') 47 | if count%100000 == 0: 48 | print(count, 'nodes are written..') 49 | 50 | count += 1 51 | print('count', count) 52 | file.close() 53 | 54 | 55 | def triple2dict(input_triple): 56 | result = dict() 57 | for subj, pred, obj in input_triple: 58 | result[str(subj)] = str(obj) 59 | 60 | return result 61 | 62 | 63 | # nodes_ids is dictionary: keys are urls, values are ids of them. 64 | # edges is list of tuples where two nodes have an edge. 
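# For illustration (made-up values): with nodes_ids = {"u1": 0, "u2": 1} and
# edges = [("u1", "u2"), ("u1", "u3")], the function below returns
# ([("u1", "u2")], [(0, 1)]) -- edges touching a node without an id are dropped.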
65 | def filter_edges_by_nodes(nodes_ids, edges): 66 | 67 | filtered_edges = list() 68 | filtered_edges_ids = list() 69 | 70 | for nodes in edges: 71 | node1, node2 = nodes[0], nodes[1] 72 | 73 | try: 74 | id1, id2 = nodes_ids[node1], nodes_ids[node2] 75 | except KeyError: 76 | continue 77 | 78 | filtered_edges.append((node1, node2)) 79 | filtered_edges_ids.append((id1, id2)) 80 | 81 | return filtered_edges, filtered_edges_ids 82 | 83 | 84 | def read_dict(path): 85 | result = dict() 86 | 87 | file = codecs.open(path, 'r') 88 | line = file.readline() 89 | 90 | while line != '': 91 | splitted = line.split() 92 | line = file.readline() 93 | 94 | try: 95 | result[str(splitted[0])] = splitted[1:] 96 | except IndexError: 97 | continue 98 | 99 | return result 100 | 101 | 102 | def read_lookup(path): 103 | result = dict() 104 | 105 | file = codecs.open(path, 'r') 106 | line = file.readline() 107 | 108 | while line != '': 109 | splitted = line.split() 110 | line = file.readline() 111 | 112 | try: 113 | result[str(splitted[0])] = int(splitted[1]) 114 | except IndexError: 115 | continue 116 | 117 | return result 118 | 119 | 120 | def read_edges(path): 121 | edges = list() 122 | 123 | file = codecs.open(path, 'r') 124 | line = file.readline() 125 | 126 | while line != '': 127 | splitted = line.split() 128 | line = file.readline() 129 | 130 | edges.append((int(splitted[0]), int(splitted[1]))) 131 | 132 | return edges 133 | 134 | 135 | def write_edges(edgelist, path): 136 | file = codecs.open(path, 'w') 137 | 138 | for edge in edgelist: 139 | file.write(str(edge[0]) + ' ' + str(edge[1]) + '\n') 140 | 141 | file.close() 142 | 143 | def read_list(path): 144 | data = list() 145 | 146 | file = codecs.open(path, 'r') 147 | line = file.readline() 148 | 149 | while line != '': 150 | splitted = line.split() 151 | data.append(splitted[0]) 152 | line = file.readline() 153 | 154 | return data 155 | 156 | 157 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | namedlist 3 | langid 4 | wikidata 5 | sqlitedict 6 | Flask 7 | requests 8 | grequests 9 | rdflib 10 | nltk 11 | -------------------------------------------------------------------------------- /supervised/README.md: -------------------------------------------------------------------------------- 1 | # kb2vec/supervised 2 | 3 | This project provides an alternative use of graph embeddings in Entity Disambiguation. The input of the feedforward neural network is the concatenation of context vector, span vector, entity graph embeddings, and long abstract (of corresponding entity) vector. 4 | 5 | Installation 6 | ----------- 7 | 8 | ``` 9 | cd supervised 10 | pip install -r requirements.txt 11 | ``` 12 | 13 | Set up 14 | ----------- 15 | 16 | 1 - Creating entity graph embeddings: 17 | 18 | From DBpedia datasets (https://wiki.dbpedia.org/develop/datasets/downloads-2016-10), long 19 | abstracts, labels, and page links files are downloaded. Using `../construct_graph.py`, the graph is contructed. 20 | Page links are the inputs of DeepWalk algorithm (https://github.com/phanein/deepwalk) to create entity graph embeddings. 21 | 22 | 2 - FFNN ablation test: 23 | 24 | For generating negative samples use negative_sampling.py and for training nn.py.The relative inputs are commented out for ablation test. 
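For orientation, a minimal sketch of the input layout described above; the dimensionalities and names are placeholders, not the ones used in `nn.py`:

```
import numpy as np

# Hypothetical dimensionalities, for illustration only.
CTX_DIM, SPAN_DIM, GRAPH_DIM, ABS_DIM = 300, 300, 64, 300

def build_ffnn_input(context_vec, span_vec, graph_emb, abstract_vec):
    """Concatenate the four feature blocks into a single FFNN input vector."""
    return np.concatenate([context_vec, span_vec, graph_emb, abstract_vec])

x = build_ffnn_input(np.zeros(CTX_DIM), np.zeros(SPAN_DIM),
                     np.zeros(GRAPH_DIM), np.zeros(ABS_DIM))
assert x.shape == (CTX_DIM + SPAN_DIM + GRAPH_DIM + ABS_DIM,)
```

Leaving one of the four blocks out of the concatenation corresponds to one setting of the ablation test mentioned above.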
25 | 26 | -------------------------------------------------------------------------------- /supervised/negative_sampling_test.py: -------------------------------------------------------------------------------- 1 | from supervised import negative_sampling 2 | import ttl 3 | import codecs 4 | from sqlitedict import SqliteDict 5 | 6 | def check_written_file(contexts_r, phrases_r, contexts, phrases): 7 | is_equal = True 8 | 9 | for phrase in phrases: 10 | entity, beg, end, ref_context, url = phrase[0], phrase[1], phrase[2], phrase[3], phrase[4] 11 | try: 12 | context = contexts[ref_context] 13 | context_r = contexts_r[entity+str(beg)+str(end)+url+context] 14 | phrase_r = phrases_r[entity+str(beg)+str(end)+url+context] 15 | 16 | is_equal &= (context == context_r) & (entity == phrase_r[0]) & (beg == phrase_r[1]) & (end == phrase_r[2]) 17 | if not is_equal: 18 | print(entity, url, beg, end) 19 | break 20 | except KeyError: 21 | print("Warning: not found", ref_context) 22 | 23 | return is_equal 24 | 25 | 26 | def get_statistics_true_url(positives_negatives, urls_db): 27 | db = SqliteDict(urls_db, autocommit=False) 28 | urls = list(db.keys()) 29 | 30 | file = codecs.open('candidates_without_true_name1_.tsv', 'a') 31 | count_exist = 0 32 | count_all = 0 33 | count_not_included = 0 34 | 35 | for positive_negative in positives_negatives: 36 | entity, beg, end, true_url, context, negative_samples = positive_negative 37 | 38 | samples = list() 39 | for negative_sample in negative_samples: 40 | samples.append(negative_sample.strip()) 41 | 42 | if true_url in samples: 43 | count_exist += 1 44 | elif true_url in urls: 45 | file.write(str(entity) + '\t' + str(true_url) + '\n') 46 | else: 47 | count_not_included += 1 48 | 49 | count_all += 1 50 | 51 | print(count_exist) 52 | print(count_all) 53 | print(count_not_included) 54 | return float(count_exist)/count_all 55 | 56 | 57 | ''' 58 | # creating negative samples 59 | contexts_r, phrases_r = negative_sampling.read_samples('/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/positive_samples_new.tsv') 60 | print('positive samples are read..') 61 | negative_samples = negative_sampling.create_negative_samples_with_positive_samples(urls_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/intersection_nodes_lookup.db', 62 | contexts=contexts_r, phrases=phrases_r) 63 | 64 | print(len(negative_samples)) 65 | print('Writing started..') 66 | negative_sampling.write_negative_samples_with_positive_samples(positive_negatives=negative_samples, 67 | path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/negative_samples_with_positives_new.tsv') 68 | ''' 69 | ''' ''' 70 | # creating candidates 71 | contexts_r, phrases_r = negative_sampling.read_samples('/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/positive_samples_new.tsv') 72 | print('positive samples are read..') 73 | negative_samples = negative_sampling.create_candidates(urls_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/intersection_nodes_lookup.db', 74 | contexts=contexts_r, phrases=phrases_r) 75 | 76 | print(len(negative_samples)) 77 | print('Writing started..') 78 | negative_sampling.write_negative_samples_with_positive_samples(positive_negatives=negative_samples, 79 | path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/candidates/candidate1_big.tsv') 80 | 81 | 82 | ''' 83 | # get statistics 84 | positive_negatives = negative_sampling.\ 85 | 
read_negative_samples_with_positive_samples(path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/candidates/candidate1.tsv') 86 | print(get_statistics_true_url(positive_negatives, urls_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/intersection_nodes_lookup.db')) 87 | ''' 88 | 89 | # check samples 90 | #positive_negatives, count = negative_sampling.\ 91 | # read_negative_samples_with_positive_samples(path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/negative_samples_with_positives.tsv') 92 | 93 | #print(len(positive_negatives), count) 94 | ''' 95 | # closest sampling 96 | positive_negatives = negative_sampling.\ 97 | read_negative_samples_with_positive_samples(path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/negative_samples_with_positives.tsv') 98 | filtered_samples = negative_sampling. \ 99 | filter_negative_samples_closest_with_scores(positives_negatives=positive_negatives, 100 | url_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/intersection_nodes_lookup.db', 101 | pagerank_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/pagerank.db', n=10) 102 | print('starts to write') 103 | negative_sampling.write_negative_samples_with_positive_samples_with_scores(positive_negatives=filtered_samples, 104 | path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/with_scores/negative_samples_filtered_closest_10.tsv') 105 | ''' 106 | ''' 107 | # random sampling 108 | 109 | positive_negatives = negative_sampling.read_negative_samples_with_positive_samples(path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/negative_samples_with_positives_new.tsv') 110 | filtered_samples = negative_sampling.filter_negative_samples_randomly(positives_negatives=positive_negatives, 111 | url_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/intersection_nodes_lookup.db', 112 | n=10) 113 | print('starts to write') 114 | negative_sampling.write_negative_samples_with_positive_samples(positive_negatives=filtered_samples, 115 | path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/negative_samples_filtered_randomly_10_big.tsv') 116 | 117 | ''' 118 | ''' 119 | # completely random 120 | contexts_r, phrases_r = negative_sampling.read_samples('/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/positive_samples.tsv') 121 | samples = negative_sampling.create_completely_random(urls_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/intersection_nodes_lookup.db', 122 | contexts=contexts_r, phrases=phrases_r, n=5) 123 | 124 | print('starts to write') 125 | negative_sampling.write_negative_samples_with_positive_samples(positive_negatives=samples, 126 | path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/negative_samples_completely_random_5.tsv') 127 | ''' 128 | ''' 129 | # closest sampling with scores and similarity 130 | positive_negatives = negative_sampling.\ 131 | read_negative_samples_with_positive_samples(path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/negative_samples_with_positives_new.tsv') 132 | 133 | sims_scores = negative_sampling.get_negative_samples_similarity_and_scores(positives_negatives=positive_negatives, 134 | url_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/intersection_nodes_lookup.db', 135 | graphembed='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/nodes.embeddings', 136 | pagerank_db='/Users/sevgili/Ozge-PhD/DBpedia-datasets/outputs/databases/pagerank.db') 137 | 138 | 
print('starts to write') 139 | negative_sampling.write_negative_samples_with_positive_samples_with_scores(positive_negatives=sims_scores, 140 | path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/with_scores/negative_samples_sims_scores_new.tsv') 141 | ''' 142 | ''' 143 | # prune closest 144 | 145 | positive_negatives = negative_sampling.read_negative_samples_with_positive_samples_with_scores(path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/with_scores/negative_samples_sims_scores_new.tsv') 146 | print('positive_negatives is read') 147 | pruned_samples = negative_sampling.prune_most_closest(positives_negatives=positive_negatives, n=10) 148 | print('samples is pruned') 149 | negative_sampling.write_negative_samples_with_positive_samples(positive_negatives=pruned_samples, 150 | path='/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/negative_samples_filtered_closest_pruned_10_big.tsv') 151 | ''' 152 | 153 | 154 | def ttl2csv(list_of_paths, write_path): 155 | for input_ttl_fpath in list_of_paths: 156 | in_ttl = codecs.open(input_ttl_fpath, "r", "utf-8") 157 | 158 | input_ttl = in_ttl.read() 159 | graph, contexts, phrases = negative_sampling.parse_d2kb_ttl(input_ttl) 160 | 161 | print(phrases) 162 | print(contexts) 163 | 164 | negative_sampling.write_positive_samples(contexts=contexts, phrases=phrases, 165 | path=write_path) 166 | 167 | 168 | input_ttl_fpaths = ["/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/dbpedia-spotlight-nif.ttl", 169 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/kore50-nif.ttl", 170 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/Reuters-128.ttl"] 171 | 172 | new_input_ttl_fpaths = ["/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/RSS-500.ttl", 173 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl"] 174 | 175 | #ttl2csv(new_input_ttl_fpaths, 176 | # write_path="/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/csv/positive_samples_new.tsv") 177 | 178 | -------------------------------------------------------------------------------- /supervised/requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==0.12.0 2 | gensim==3.6.0 3 | nltk==3.3 4 | numpy==1.15.2 5 | sqlitedict==1.6.0 6 | rdflib==4.2.2 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /test_supertagger.py: -------------------------------------------------------------------------------- 1 | from diffbot_api import entity_link 2 | 3 | r = entity_link("Michael Jeffrey Jordan, also known by his initials, MJ, is an American former professional basketball player. 
He played 15 seasons in the National Basketball Association for the Chicago Bulls and Washington Wizards.") 4 | print(r) -------------------------------------------------------------------------------- /tests/baseline_linker_dbpedia_test.py: -------------------------------------------------------------------------------- 1 | from linkers.baseline import BaselineLinker 2 | from candidate import Phrase 3 | from pandas import read_csv 4 | 5 | dataset_fpath = "../datasets/dbpedia.tsv" 6 | 7 | df = read_csv(dataset_fpath, sep="\t", encoding="utf-8") 8 | bl = BaselineLinker() 9 | 10 | for i, row in df.iterrows(): 11 | phrases = [Phrase(phrase.strip(), 1, len(phrase.strip()), "http://" + phrase.strip()) 12 | for phrase in row.targets.split(",")] 13 | 14 | print("\n\n{}\n".format(row.context)) 15 | 16 | for phrase, candidate in bl.link(row.context, phrases): 17 | link = candidate.link if candidate else "" 18 | print(phrase.text, link) -------------------------------------------------------------------------------- /tests/baseline_linker_test.py: -------------------------------------------------------------------------------- 1 | from linkers.baseline import BaselineLinker 2 | from candidate import Phrase 3 | 4 | context = "San Francisco said the visit would serve as a cornerstone for future interaction between players and coaches from the Nets and young Russians, with the aim of developing basketball in Russia, where the sport is a distant third in popularity behind soccer and hockey." 5 | phrases = "San Francisco" 6 | 7 | phrases = [Phrase(phrase.strip(), 0, len(phrase.strip()), "http://" + phrase.strip()) 8 | for phrase in phrases.split(",")] 9 | bl = BaselineLinker() 10 | 11 | for phrase, candidate in bl.link(context, phrases): 12 | print(phrase.text, candidate) -------------------------------------------------------------------------------- /tests/dense_linker_test.py: -------------------------------------------------------------------------------- 1 | from linkers.dense import DenseLinker 2 | from candidate import make_phrases 3 | 4 | # embeddings_fpath = "../data/wiki-news-300d-1M.vec" 5 | embeddings_fpath = "../data/crawl-300d-2M.vec" 6 | 7 | dataset_fpaths = ["../datasets/dbpedia.ttl.phrases.tsv", 8 | "../datasets/kore50.ttl.phrases.tsv", 9 | "../datasets/n3-reuters-128.ttl.phrases.tsv"] 10 | 11 | l = DenseLinker("../data/count-stopwords-10", embeddings_fpath, stop_words=True, tfidf=False) 12 | l.train(dataset_fpaths) 13 | 14 | context = "Madonna is a great music signer and lives near West Holywood in LA. adonna Louise Ciccone (/tʃɪˈkoʊni/; born August 16, 1958) is an American singer, songwriter, actress, and businesswoman. Referred to as the Queen of Pop since the 1980s, Madonna is known for pushing the boundaries of lyrical content in mainstream popular music, as well as visual imagery in music videos and on stage. She has also frequently reinvented both her music and image while maintaining autonomy within the recording industry. 
Besides sparking controversy, her works have bee " 15 | phrases = ["Madonna"] 16 | 17 | linked_phrases = l.link(context, make_phrases(phrases)) 18 | print(linked_phrases) 19 | -------------------------------------------------------------------------------- /tests/diffbot_api_test.py: -------------------------------------------------------------------------------- 1 | from diffbot_api import query_and_save, ENTITY_TYPES 2 | 3 | query_and_save('allUris:"barackobama.com"', "data/all-uris.json") 4 | query_and_save('wikipediaUri:"en.wikipedia.org/wiki/Barack_Obama"', "data/wiki-uri.json") 5 | query_and_save('allUris:"en.wikipedia.org/wiki/Barack\_Obama"', "data/all-uris-wiki.json") 6 | query_and_save('origins:"en.wikipedia.org/wiki/Barack_Obama"', "data/origins.json") 7 | 8 | for entity_type in ENTITY_TYPES: 9 | query_and_save( 10 | query='type:{}'.format(entity_type), 11 | output_fpath="data/{}.json".format(entity_type)) 12 | 13 | query_and_save( 14 | query='type:Person name:"Alexander Panchenko"', 15 | output_fpath="data/ap.json") 16 | 17 | 18 | query_and_save( 19 | query='type:Person employments.employer.name:"Diffbot"', 20 | output_fpath="data/diffbot-employees.json") 21 | 22 | 23 | query_and_save( 24 | query='type:Person employments.{title:"CEO" employer.name:"Diffbot"}', 25 | output_fpath="data/diffbot-ceo.json") 26 | 27 | query_and_save( 28 | query='type:Person employments.{employer.name:"Diffbot" isCurrent:true}', 29 | output_fpath="data/diffbot-current-employees.json") 30 | 31 | query_and_save( 32 | query='type:Person name:"Angela Merkel"', 33 | output_fpath="data/am.json") 34 | 35 | query_and_save( 36 | query='type:Person name:"Barack Obama"', 37 | output_fpath="data/bo.json") 38 | 39 | query_and_save( 40 | query='type:Person name:"Nicolas Sarkozy"', 41 | output_fpath="data/ns.json") 42 | 43 | query_and_save( 44 | query='type:Person name:"Diego Maradona"', 45 | output_fpath="data/dm.json") 46 | -------------------------------------------------------------------------------- /tests/score_test.py: -------------------------------------------------------------------------------- 1 | # inside the link function 2 | 3 | cds = [] 4 | for i, candidate in enumerate(candidates): 5 | cds.append((scores[i][0], candidate.name, 6 | sims.toarray()[i][0], overlap_scores[i][0])) 7 | 8 | for c_score, c_name, c_sim, c_overlap in sorted(cds, reverse=True): 9 | print("- {} {:.2f} {:.2f} {:.2f}".format(c_name, c_score, c_sim, c_overlap)) 10 | -------------------------------------------------------------------------------- /tests/sparse_linker_single_test.py: -------------------------------------------------------------------------------- 1 | from linkers.sparse import SparseLinker 2 | from candidate import make_phrases 3 | 4 | 5 | dataset_fpaths = ["../datasets/singleton.tsv"] 6 | 7 | sl = SparseLinker("../data/single5") 8 | # sl.train(dataset_fpaths) 9 | context = "Richard Stallman, often known by his initials, rms — is an American free software movement activist and programmer. He campaigns for software to be distributed in a manner such that its users receive the freedoms to use, study, distribute and modify that software." 
10 | phrases = ["Richard Stallman"] 11 | linked_phrases = sl.link(context, make_phrases(phrases)) 12 | print(linked_phrases) 13 | 14 | context = "Linus Benedict Torvalds (/ˈliːnəs ˈtɔːrvɔːldz/;[5] Swedish: [ˈliːn.ɵs ˈtuːr.valds] (About this sound listen); born December 28, 1969) is a Finnish-American software engineer[2][6] who is the creator, and historically, the principal developer of the Linux kernel, which became the kernel for operating systems such as the Linux operating systems, Android, and Chrome OS." 15 | phrases = ["Linus Torvalds"] 16 | linked_phrases = sl.link(context, make_phrases(phrases)) 17 | print(linked_phrases) -------------------------------------------------------------------------------- /tests/sparse_linker_test.py: -------------------------------------------------------------------------------- 1 | from linkers.sparse import SparseLinker 2 | from candidate import make_phrases 3 | 4 | dataset_fpaths = ["../datasets/dbpedia.ttl.phrases.tsv", 5 | "../datasets/kore50.ttl.phrases.tsv", 6 | "../datasets/n3-reuters-128.ttl.phrases.tsv"] 7 | 8 | dataset_fpaths = ["../datasets/test.phrases.tsv"] 9 | 10 | def profiling(function): 11 | import cProfile 12 | import pstats 13 | from io import StringIO 14 | pr = cProfile.Profile() 15 | pr.enable() 16 | 17 | function() 18 | 19 | pr.disable() 20 | s = StringIO() 21 | sortby = 'cumulative' 22 | ps = pstats.Stats(pr, stream=s).sort_stats(sortby) 23 | ps.print_stats() 24 | print(s.getvalue()) 25 | 26 | 27 | 28 | sl = SparseLinker("../data/count-stopwords-test2-related", stop_words=True, tfidf=False, related_entities=True) 29 | profiling(lambda: sl.train(dataset_fpaths)) 30 | 31 | 32 | context = "Madonna is a great music signer and lives near West Holywood in LA. adonna Louise Ciccone (/tʃɪˈkoʊni/; born August 16, 1958) is an American singer, songwriter, actress, and businesswoman. Referred to as the Queen of Pop since the 1980s, Madonna is known for pushing the boundaries of lyrical content in mainstream popular music, as well as visual imagery in music videos and on stage. She has also frequently reinvented both her music and image while maintaining autonomy within the recording industry. Besides sparking controversy, her works have bee " 33 | phrases = ["Madonna"] 34 | linked_phrases = sl.link(context, make_phrases(phrases)) 35 | print(linked_phrases) 36 | 37 | context = "Richard Stallman, often known by his initials, rms — is an American free software movement activist and programmer. He campaigns for software to be distributed in a manner such that its users receive the freedoms to use, study, distribute and modify that software." 38 | phrases = ["Richard Stallman"] 39 | linked_phrases = sl.link(context, make_phrases(phrases)) 40 | print(linked_phrases) 41 | 42 | context = "Linus Benedict Torvalds (/ˈliːnəs ˈtɔːrvɔːldz/;[5] Swedish: [ˈliːn.ɵs ˈtuːr.valds] (About this sound listen); born December 28, 1969) is a Finnish-American software engineer[2][6] who is the creator, and historically, the principal developer of the Linux kernel, which became the kernel for operating systems such as the Linux operating systems, Android, and Chrome OS." 
43 | phrases = ["Linus Torvalds"] 44 | linked_phrases = sl.link(context, make_phrases(phrases)) 45 | print(linked_phrases) 46 | -------------------------------------------------------------------------------- /tests/supertagger_test.py: -------------------------------------------------------------------------------- 1 | from linkers.supertagger import SuperTagger 2 | from candidate import Phrase 3 | import codecs 4 | 5 | 6 | st = SuperTagger() 7 | 8 | 9 | def make_positional_phrases(word_beg_ends): 10 | phrases = [] 11 | for word, beg, end in word_beg_ends: 12 | phrases.append(Phrase(word, beg, end, "http://www.{}.com".format(word))) 13 | return phrases 14 | 15 | request_fpath = "../data/supertagger-1529250101365435-request.ttl" 16 | with codecs.open(request_fpath, "r", "utf-8") as ttl: 17 | input_ttl = ttl.read() 18 | 19 | output_ttl = st.link_ttl(input_ttl) 20 | with codecs.open(request_fpath + ".response", "w", "utf-8") as ttl: 21 | ttl.write(output_ttl) 22 | 23 | context = "Prokhorov said the visit would serve as a cornerstone for future interaction between players and coaches from the Nets and young Russians, with the aim of developing basketball in Russia, where the sport is a distant third in popularity behind soccer and hockey." 24 | phrases = make_positional_phrases([["Russia", 180, 186], 25 | ["sport", 198, 203], 26 | ["basketabll", 166, 176], 27 | ["Russians", 129, 137], 28 | ["Prokhorov", 0, 9]]) 29 | linked_phrases = st.link(context, phrases) 30 | print(linked_phrases) 31 | 32 | context = "Madonna is a great music signer and lives near West Holywood in Los Angeles. adonna Louise Ciccone (/tʃɪˈkoʊni/; born August 16, 1958) is an American singer, songwriter, actress, and businesswoman. Referred to as the Queen of Pop since the 1980s, Madonna is known for pushing the boundaries of lyrical content in mainstream popular music, as well as visual imagery in music videos and on stage. She has also frequently reinvented both her music and image while maintaining autonomy within the recording industry. 
Besides sparking controversy, her works have bee " 33 | phrases = [Phrase("Madonna", 0, 6, "http://madonna.com"), 34 | Phrase("West Holywood", 48, 62, "http://westholy.com"), 35 | Phrase("Los Angeles", 65, 76, "http://la.com")] 36 | 37 | linked_phrases = st.link(context, phrases) 38 | print(linked_phrases) 39 | 40 | -------------------------------------------------------------------------------- /tests/supervised/preprocess/prepro_util_test.py: -------------------------------------------------------------------------------- 1 | from supervised.preprocess.prepro_util import * 2 | from supervised.preprocess.util import load_url2graphid 3 | from supervised.negative_sampling import parse_d2kb_ttl 4 | 5 | ''' 6 | 7 | generator = InputSamplesGenerator() 8 | samples = generator.process('/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/dbpedia-spotlight-nif.ttl', ttl=True) 9 | not_include = 0 10 | total = 0 11 | except_ = 0 12 | for sample in samples: 13 | chunk_id, chunk_words, entity, begin_gm, end_gm, ground_truth, cand_entities, cand_entities_scores = sample 14 | 15 | for index in range(len(entity)): 16 | try: 17 | print(entity[index], ground_truth[index], cand_entities[index]) 18 | if int(ground_truth[index]) not in cand_entities[index]: 19 | not_include += 1 20 | except: 21 | except_ += 1 22 | total += 1 23 | 24 | 25 | print(not_include, total, except_) 26 | ''' 27 | # len phrase 660 - spotlight 28 | # 288 - kore50-nif 29 | # 880 - Reuters-128 30 | # 1000 - RSS-500 31 | # 1655 - News-100 32 | def test_chunker_parse_d2kb(): 33 | chunker = Chunker() 34 | 35 | input_ttl_fpath = "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl" 36 | in_ttl = codecs.open(input_ttl_fpath, "r", "utf-8") 37 | 38 | input_ttl = in_ttl.read() 39 | 40 | _, contexts, phrases = chunker.parse_d2kb_ttl(input_ttl) 41 | print('CONT:', len(contexts.keys()), 'PHR:', len(phrases.keys())) 42 | 43 | for key in contexts.keys(): 44 | if key not in phrases.keys(): 45 | print(key) 46 | 47 | for key in phrases.keys(): 48 | if key not in contexts.keys(): 49 | print(key) 50 | 51 | if contexts.keys() == phrases.keys(): 52 | print("YESSS") 53 | 54 | _, contexts_, phrases_ = parse_d2kb_ttl(input_ttl) 55 | 56 | if len(set(phrases_)) != len(phrases): 57 | print('len original:', len(set(phrases_)) , 'len chunker:', len(phrases.keys())) 58 | 59 | if len(contexts) != len(contexts_): 60 | print('len original:', len(contexts_), 'len chunker:', len(contexts)) 61 | 62 | if set(contexts_.keys()).difference(set(contexts.keys())): 63 | print('not the same context keys') 64 | 65 | contexts_keys = contexts.keys() 66 | len_phrase = 0 67 | for context in contexts_keys: 68 | try: 69 | phrase_contexts = phrases[context] 70 | for phrase in phrase_contexts: 71 | span, beg, end, ind_ref = phrase 72 | if (span, beg, end, context, ind_ref) not in phrases_: 73 | print((span, beg, end, context, ind_ref)) 74 | return 75 | len_phrase += len(phrase_contexts) 76 | except KeyError: 77 | # only one context ref in spotlight, the problem in the dataset! 
78 | # http://www.nytimes.com/2010/10/11/arts/design/11chaos.html?ref=arts_sentence2 79 | print('KEY ERROR:', context) 80 | print(len_phrase) 81 | 82 | 83 | # number_phrases = 608 + 52 ground truth error - spotlight 84 | # number_phrases = 254 + 34 ground truth error - kore50-nif 85 | # number_phrases = 562 + 318 ground truth error - Reuters-128 86 | # number_phrases = 462 + 538 ground truth error - RSS-500 87 | # number_phrases = 32 + 1623 ground truth error - News-100 88 | def test_process_ttl(): 89 | chunker = Chunker() 90 | 91 | input_ttl_fpath = "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl" 92 | url2graphid = load_url2graphid() 93 | count = 0 94 | number_phrases = 0 95 | for chunk in chunker.process_ttl(input_ttl_fpath, url2graphid): 96 | #print(chunk) 97 | chunk_id, chunk_words, begin_gm, end_gm, ground_truth = chunk 98 | number_phrases += len(begin_gm) 99 | count += 1 100 | print(count, number_phrases) 101 | 102 | 103 | # 608 11366 - dbpedia-spotlight-nifspotlight 104 | # 254 6180 - kore50-nif 105 | # 562 7474 - Reuters-128 106 | # 462 6389 - RSS-500 107 | # 32 46 - News-100 108 | def test_chunk2sample(): 109 | input_generator = InputSamplesGenerator() 110 | 111 | input_ttl_fpath = "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl" 112 | url2graphid = load_url2graphid() 113 | 114 | number_cand = 0 115 | number_phrases = 0 116 | 117 | 118 | for chunk in input_generator.chunker.process_ttl(input_ttl_fpath, url2graphid): 119 | chunk_id, chunk_words, begin_gm, end_gm, ground_truth, \ 120 | cand_entities, cand_entities_scores = input_generator.chunk2sample(chunk) 121 | 122 | if len(begin_gm) != len(end_gm) or len(begin_gm) != len(ground_truth) or len(begin_gm) != len(cand_entities): 123 | print(chunk_id, begin_gm, end_gm, ground_truth, cand_entities, cand_entities_scores) 124 | number_phrases += len(begin_gm) 125 | 126 | for index in range(len(begin_gm)): 127 | candidates = cand_entities[index] 128 | number_cand += len(candidates) 129 | 130 | print(number_phrases, number_cand) 131 | 132 | 133 | # 57 - dbpedia-spotlight-nifspotlight 134 | # 50 - kore50-nif 135 | # 107 - Reuters-128 136 | # 334 - RSS-500 137 | # 14 - News-100 138 | def test_InputSampleGenerate_process(): 139 | input_generator = InputSamplesGenerator() 140 | input_ttl_fpath = "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl" 141 | number_samples = 0 142 | for sample in input_generator.process(input_ttl_fpath, ttl=True): 143 | number_samples += 1 144 | 145 | print(number_samples) 146 | 147 | print('Called') 148 | #test_chunker_parse_d2kb() 149 | #test_process_ttl() 150 | #test_chunk2sample() 151 | test_InputSampleGenerate_process() 152 | print('Finished') 153 | input_ttl_fpaths = ["/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/dbpedia-spotlight-nif.ttl", 154 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/kore50-nif.ttl", 155 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/Reuters-128.ttl", 156 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/RSS-500.ttl", 157 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl"] -------------------------------------------------------------------------------- /tests/supervised/preprocess/util_test.py: -------------------------------------------------------------------------------- 1 | from supervised.preprocess.util import FetchFilteredCoreferencedCandEntities, load_url2graphid 2 | from nltk.tokenize import 
word_tokenize 3 | from supervised.negative_sampling import parse_d2kb_ttl 4 | import codecs 5 | 6 | 7 | # not_match_entity = 13, total = 331 spotlight 8 | # not_match_entity = 22, total = 1000 rss-500 9 | # not_match_entity = 57, total = 880 reuters 10 | # not_match_entity = 3, total = 144 kore50-nif 11 | # not_match_entity = 75, total = 1655 News-100 12 | def test_index_span(): 13 | 14 | context = "In the first study, intended to measure a person’s short-term emotional reaction to gossiping, " \ 15 | "140 men and women, primarily undergraduates, were asked to talk about a fictional person either " \ 16 | "positively or negatively." 17 | beg = 124 18 | end = 138 19 | entity = 'undergraduates' 20 | chunk_words = word_tokenize(context) 21 | left = chunk_words.index(entity) 22 | right = left + len(word_tokenize(entity)) 23 | print(' '.join(chunk_words[left:right])) 24 | 25 | input_ttl_fpath = "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl" 26 | in_ttl = codecs.open(input_ttl_fpath, "r", "utf-8") 27 | 28 | input_ttl = in_ttl.read() 29 | graph, contexts, phrases = parse_d2kb_ttl(input_ttl) 30 | phrases = set(phrases) 31 | 32 | not_match_entity = 0 33 | for phrase in phrases: 34 | entity, beg, end, ref_context, url = phrase 35 | 36 | try: 37 | context = contexts[ref_context] 38 | except KeyError: 39 | print('KeyError', phrase) 40 | 41 | chunk_words = word_tokenize(context) 42 | 43 | try: 44 | left = chunk_words.index(entity) 45 | right = left + len(word_tokenize(entity)) 46 | except ValueError: 47 | left = len(word_tokenize(context[:beg])) 48 | right = len(word_tokenize(context[:end])) 49 | 50 | span_text = ' '.join(chunk_words[left:right]) 51 | 52 | if span_text != entity: 53 | print('ERROR:', 'span:', span_text, 'entity:', entity, 'beg-end:', context[beg:end], 'context:', context) 54 | not_match_entity += 1 55 | 56 | print(not_match_entity, len(phrases)) 57 | 58 | 59 | # - nones: 14 - # of phrases: 331 331 - not include: 12 - # of total cand: 6124 - # of except 26 - spotlight 60 | # - nones: 3 - # of phrases: 144 144 - not include: 8 - # of total cand: 3498 - # of except 17 - kore50-nif 61 | # - nones: 318 - # of phrases: 880 880 - not include: 42 - # of total cand: 9622 - # of except 318 - Reuters-128 62 | # - nones: 359 - # of phrases: 1000 1000 - not include: 48 - # of total cand: 9485 - # of except 538 - RSS-500 63 | # - nones: 309 - # of phrases: 1655 1655 - not include: 18 - # of total cand: 20772 - # of except 1623 - News-100 64 | def test_cand_list(): 65 | fetch_filtered_entities = FetchFilteredCoreferencedCandEntities() 66 | url2graphid = load_url2graphid() 67 | 68 | input_ttl_fpath = "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl" 69 | in_ttl = codecs.open(input_ttl_fpath, "r", "utf-8") 70 | 71 | input_ttl = in_ttl.read() 72 | graph, contexts, phrases = parse_d2kb_ttl(input_ttl) 73 | phrases = set(phrases) 74 | 75 | count_nones = 0 76 | count = 0 77 | count_not_include = 0 78 | cand_number = 0 79 | key_error = 0 80 | for phrase in phrases: 81 | entity, beg, end, ref_context, url = phrase 82 | try: 83 | id = url2graphid[url] 84 | except KeyError: 85 | id = -1 86 | key_error += 1 87 | try: 88 | context = contexts[ref_context] 89 | except KeyError: 90 | print('KeyError', phrase) 91 | 92 | chunk_words = word_tokenize(context) 93 | 94 | try: 95 | left = chunk_words.index(entity) 96 | right = left + len(word_tokenize(entity)) 97 | except ValueError: 98 | left = len(word_tokenize(context[:beg])) 99 | right = 
len(word_tokenize(context[:end])) 100 | 101 | cand, score = fetch_filtered_entities.process(left, right, chunk_words) 102 | 103 | if cand is None: 104 | count_nones += 1 105 | else: 106 | cand_number += len(cand) 107 | 108 | if cand is not None and id != -1: 109 | if int(id) not in cand: 110 | count_not_include += 1 111 | 112 | count += 1 113 | 114 | print(' - nones:', count_nones, ' - # of phrases:', count, len(phrases), ' - not include:', count_not_include, 115 | ' - # of total cand:', cand_number, ' - # of except', key_error) 116 | 117 | 118 | #test_index_span() 119 | test_cand_list() 120 | 121 | input_ttl_fpaths = ["/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/dbpedia-spotlight-nif.ttl", 122 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/kore50-nif.ttl", 123 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/Reuters-128.ttl", 124 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/RSS-500.ttl", 125 | "/Users/sevgili/Ozge-PhD/DBpedia-datasets/training-datasets/ttl/News-100.ttl"] -------------------------------------------------------------------------------- /tests/test_construct_dict.py: -------------------------------------------------------------------------------- 1 | import construct_graph 2 | import preprocess 3 | 4 | 5 | # for subgraph 6 | graph = construct_graph.Graph(logfile='../datasets/subset/construct_graph.log') 7 | subnodes = preprocess.read_list(path='../datasets/subset/1000_nodelist_url.txt') 8 | 9 | graph.create_nodes_from_db(longabsdb_path='../datasets/subset/1000_long_abstracts.db', 10 | labelsdb_path='../datasets/subset/1000_labels.db', 11 | lookupdb_path='../datasets/subset/1000_nodes_lookup.db', 12 | subnodes=subnodes) 13 | print('nodes created..') 14 | 15 | edges = preprocess.read_edges(path='../datasets/subset/1000_edgelist.txt') 16 | print('edges are read...') 17 | 18 | graph.create_edges_from_list(edges=edges) 19 | print('edges are created...') 20 | 21 | graph.write_graph(path='../datasets/subset/1000_graph_sub.gpickle') 22 | print('graph is written...') 23 | 24 | graph.draw() 25 | 26 | -------------------------------------------------------------------------------- /tests/test_data_helper.py: -------------------------------------------------------------------------------- 1 | import data_helper 2 | 3 | # look up 4 | data_helper.create_dictdb_from_file(file_path='../datasets/subset/1000_nodes_lookup.txt', 5 | db_path='../datasets/subset/1000_nodes_lookup__.db') 6 | -------------------------------------------------------------------------------- /tests/ttl_test.py: -------------------------------------------------------------------------------- 1 | from ttl import parse_d2kb_ttl 2 | import codecs 3 | 4 | input_ttl_fpaths = ["../datasets/kore50.ttl", "../datasets/n3-reuters-128.ttl", "../datasets/dbpedia.ttl"] 5 | 6 | for input_ttl_fpath in input_ttl_fpaths: 7 | in_ttl = codecs.open(input_ttl_fpath, "r", "utf-8") 8 | phrases_fpath = input_ttl_fpath + ".phrases.tsv" 9 | contexts_fpath = input_ttl_fpath + ".contexts.tsv" 10 | 11 | phrases_ttl = codecs.open(phrases_fpath, "w", "utf-8") 12 | phrases_ttl.write("targets\tcontexts\n") 13 | 14 | contexts_ttl = codecs.open(contexts_fpath, "w", "utf-8") 15 | contexts_ttl.write("targets\tcontexts\n") 16 | 17 | input_ttl = in_ttl.read() 18 | graph, contexts, phrases = parse_d2kb_ttl(input_ttl) 19 | 20 | for phrase in phrases: 21 | phrases_ttl.write("{}\t \n".format(phrase.text)) 22 | 23 | for context in contexts: 24 | contexts_ttl.write(" 
\t{}\n".format(context)) 25 | 26 | in_ttl.close() 27 | phrases_ttl.close() 28 | contexts_ttl.close() 29 | 30 | print("Output:", phrases_fpath) 31 | print("Output:", contexts_fpath) -------------------------------------------------------------------------------- /tmp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 35, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "name:\"New York\"\n4\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "from diffbot_api import CachedQuery\n", 20 | "import json \n", 21 | "import codecs \n", 22 | "\n", 23 | "\n", 24 | "cq = CachedQuery()\n", 25 | "for i, query in enumerate(cq._cache):\n", 26 | " db_entity = json.loads(cq._cache[query].content)\n", 27 | " print(query)\n", 28 | " print(len(db_entity))\n", 29 | " \n", 30 | " with codecs.open(\"/Users/panchenko/Desktop/{}.json\".format(query), \"w\", \"utf-8\") as out:\n", 31 | " out.write(json.dumps(db_entity))\n", 32 | " \n", 33 | " break" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 34, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "# for hit in db_entity[\"data\"]:\n", 45 | "# for k in hit:\n", 46 | "# print(k)\n", 47 | "# break" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 40, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "37456900.0\n35052100.0\n17384900.0\n12499500.0\n10722700.0\n9986700\n7824100\n7290000\n6874600\n6252400\n5308100\n4715200\n4680200\n4586800\n4501900\n4477000\n4432100\n4327100\n4172200\n4034500\n3650100\n3161500\n3146300\n3028400\n2934900\n2922200\n2802000\n2639900\n2522500\n2315200\n2294500\n2243600\n2173900\n2137200\n2109700\n2087500\n2080400\n2023800\n2007600\n1929400\n1864800\n1860900\n1859700\n1856800\n1786600\n1785600\n1744700\n1690400\n1678900\n1645300\n\n 0 New York City is part of {'name': 'United States of America', 'diffbotUri': 'http://diffbot.com/entity/AcZTRPXDrY9', 'targetDiffbotUri': 'http://diffbot.com/entity/AcZTRPXDrY9', 'type': 'AdministrativeArea'}\n" 62 | ] 63 | }, 64 | { 65 | "ename": "Exception", 66 | "evalue": "", 67 | "traceback": [ 68 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 69 | "\u001b[0;31mException\u001b[0m Traceback (most recent call last)", 70 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 13\u001b[0m \u001b[0mrootId\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mroot\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"diffbotUri\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhit\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"name\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"is part of\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mroot\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 15\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 16\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 17\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n", 71 | "\u001b[0;31mException\u001b[0m: " 72 | ], 73 | "output_type": "error" 74 | } 75 | ], 76 | "source": [ 77 | "cq = CachedQuery()\n", 78 | "for i, query in enumerate(cq._cache):\n", 79 | " db_entity = json.loads(cq._cache[query].content)\n", 80 | " if \"data\" not in db_entity: continue\n", 81 | " \n", 82 | " for hit in db_entity[\"data\"]:\n", 83 | " for field_name in hit:\n", 84 | " if field_name == \"importance\": \n", 85 | " print(hit[\"importance\"])\n", 86 | " if field_name == \"isPartOf\":\n", 87 | " for i, root in enumerate(hit[\"isPartOf\"]):\n", 88 | " if \"diffbotUri\" in root:\n", 89 | " rootId = root[\"diffbotUri\"]\n", 90 | " print(\"\\n\", i, hit[\"name\"], \"is part of\", root)\n", 91 | " raise Exception()\n", 92 | " \n", 93 | " " 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 8, 99 | "metadata": { 100 | "collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "defaultdict(. at 0x104c55d90>, {'founders': ['http://diffbot.com/entity/PKuADpLXgMS']}) \n\nhttp://diffbot.com/entity/OCK8zsXiAVy\ndefaultdict(. at 0x104e850d0>, {'founders': ['http://diffbot.com/entity/PN+XIqH03xf'], 'ceo': ['http://diffbot.com/entity/PNRyBHTdnbx']}) \n\nhttp://diffbot.com/entity/CHjULiDhdyX\ndefaultdict(. at 0x1027718c8>, {'founders': ['http://diffbot.com/entity/PQGFYHeZhLH']}) \n\nhttp://diffbot.com/entity/EZmGU5Kh0KB\nhttp://diffbot.com/entity/Ozw6gU5AsJc\ndefaultdict(. at 0x102771d08>, {'founders': ['http://diffbot.com/entity/P9iQ6uiD5to']}) \n\nhttp://diffbot.com/entity/ON6yeCKuGnm\nhttp://diffbot.com/entity/OAbB86ZLYEH\nhttp://diffbot.com/entity/Ox_Hnd_7WEr\nhttp://diffbot.com/entity/OIZzlT1rihy\ndefaultdict(. at 0x105629378>, {'founders': ['http://diffbot.com/entity/PqXwyAswiIv']}) \n\nhttp://diffbot.com/entity/E9hzKNQUiTC\nhttp://diffbot.com/entity/EuXdxsjCRjg\nhttp://diffbot.com/entity/OWeqj9aprzB\ndefaultdict(. at 0x1056298c8>, {'founders': ['http://diffbot.com/entity/P1ejNzclrxY']}) \n\nhttp://diffbot.com/entity/O8zTfY2Tp_F\nhttp://diffbot.com/entity/OVX1ErF6X53\nhttp://diffbot.com/entity/E3r36BkD5tg\ndefaultdict(. at 0x104c552f0>, {'founders': ['http://diffbot.com/entity/PqBPxbbKUwG', 'http://diffbot.com/entity/PfNzLu47VeG']}) \n\nhttp://diffbot.com/entity/ONvHRir0UFC\ndefaultdict(. at 0x105614048>, {'founders': ['http://diffbot.com/entity/PsshDYuCF33']}) \n\nhttp://diffbot.com/entity/OVSJYDFkcq+\nhttp://diffbot.com/entity/O8VFLGMbcQL\ndefaultdict(. at 0x105614400>, {'founders': ['http://diffbot.com/entity/PwXUa7FvRPX', 'http://diffbot.com/entity/PjV1xV8x05B', 'http://diffbot.com/entity/PkIbZofedHb']}) \n\nhttp://diffbot.com/entity/Ox7OKybcRL7\nhttp://diffbot.com/entity/C6i+B13u3sC\nhttp://diffbot.com/entity/O+YAZiSQc9+\ndefaultdict(. at 0x1056148c8>, {'founders': ['http://diffbot.com/entity/P1ejNzclrxY']}) \n\nhttp://diffbot.com/entity/OXk7mZS+Pb1\ndefaultdict(. at 0x105614bf8>, {'founders': ['http://diffbot.com/entity/P0x0Tt66MLe', 'http://diffbot.com/entity/P1Cr1u7J9Lp']}) \n\nhttp://diffbot.com/entity/O1MpWqFXriE\nhttp://diffbot.com/entity/Oo1LrGpqO1p\nhttp://diffbot.com/entity/OBu5GZWCmV_\nhttp://diffbot.com/entity/OMg6x1ZdAty\nhttp://diffbot.com/entity/OVadPm2y5Nh\nhttp://diffbot.com/entity/EFM+ReMc_Se\ndefaultdict(. at 0x105628400>, {'founders': ['http://diffbot.com/entity/POsi1jTcSJ4']}) \n\nhttp://diffbot.com/entity/EyQuLEzfnQT\nhttp://diffbot.com/entity/OoVdS2h6bvY\nhttp://diffbot.com/entity/Cdng7W7qya1\ndefaultdict(. 
at 0x1056288c8>, {'founders': ['http://diffbot.com/entity/PQUyVUx+GIV']}) \n\nhttp://diffbot.com/entity/OTAOjf66kRw\ndefaultdict(. at 0x105628bf8>, {'founders': ['http://diffbot.com/entity/Ppx+o3WH4IR']}) \n\nhttp://diffbot.com/entity/CJuObvMhCx0\nhttp://diffbot.com/entity/OeFeWnK6gP0\nhttp://diffbot.com/entity/OLcHzKRVGE7\nhttp://diffbot.com/entity/OwneFlJCez3\nhttp://diffbot.com/entity/OWKZeuqQ01f\nhttp://diffbot.com/entity/O_3vnLIY3GN\nhttp://diffbot.com/entity/O_M1cxlmxlv\nhttp://diffbot.com/entity/OXLAYKu5JBi\nhttp://diffbot.com/entity/O0v9KVly3KO\ndefaultdict(. at 0x1056006a8>, {'founders': ['http://diffbot.com/entity/PaNntqxS5JI', 'http://diffbot.com/entity/Pj220nHFvPf'], 'ceo': ['http://diffbot.com/entity/PIHmMtCQuZx']}) \n\nhttp://diffbot.com/entity/C4yiK5ZPo6Z\nhttp://diffbot.com/entity/CYn98uQbrgS\nhttp://diffbot.com/entity/BkWvaONPQIK\nhttp://diffbot.com/entity/O7++6ItrZHG\nhttp://diffbot.com/entity/Cz11YfBBi4K\nhttp://diffbot.com/entity/O+kGQM+V_uD\nhttp://diffbot.com/entity/CImG9pD6qOK\nhttp://diffbot.com/entity/OvT6TRIswQR\nhttp://diffbot.com/entity/Oi9f+wirvLS\nhttp://diffbot.com/entity/OknTVY5fJ12\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "from collections import Counter, defaultdict \n", 113 | " \n", 114 | "\n", 115 | "cq = CachedQuery()\n", 116 | "printed = 0\n", 117 | "\n", 118 | "for i, query in enumerate(cq._cache):\n", 119 | " db_entity = json.loads(cq._cache[query].content)\n", 120 | "\n", 121 | " if \"data\" not in db_entity: continue \n", 122 | " \n", 123 | " for hit in db_entity[\"data\"]:\n", 124 | " \n", 125 | " if len(r) > 0 and printed < 20:\n", 126 | " print(r, \"\\n\")\n", 127 | " printed += 1 \n", 128 | " \n", 129 | " print(uri)\n", 130 | " \n", 131 | " break\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 48, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "[{'diffbotUri': 'http://diffbot.com/entity/AcZTRPXDrY9',\n 'name': 'United States of America',\n 'targetDiffbotUri': 'http://diffbot.com/entity/AcZTRPXDrY9',\n 'type': 'AdministrativeArea'},\n {'diffbotUri': 'http://diffbot.com/entity/AdBDaXfj65G',\n 'name': 'New York',\n 'targetDiffbotUri': 'http://diffbot.com/entity/AdBDaXfj65G',\n 'type': 'AdministrativeArea'},\n {'diffbotUri': 'http://diffbot.com/entity/AZfTRPXDrY9',\n 'name': 'New York City',\n 'targetDiffbotUri': 'http://diffbot.com/entity/AZfTRPXDrY9',\n 'type': 'AdministrativeArea'}]" 152 | ] 153 | }, 154 | "execution_count": 48, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "isinstance(hit[field_name], list)\n", 161 | "hit[field_name]" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 24, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "" 171 | ] 172 | } 173 | ], 174 | "metadata": { 175 | "kernelspec": { 176 | "display_name": "Python 2", 177 | "language": "python", 178 | "name": "python2" 179 | }, 180 | "language_info": { 181 | "codemirror_mode": { 182 | "name": "ipython", 183 | "version": 2 184 | }, 185 | "file_extension": ".py", 186 | "mimetype": "text/x-python", 187 | "name": "python", 188 | "nbconvert_exporter": "python", 189 | "pygments_lexer": "ipython2", 190 | "version": "2.7.6" 191 | } 192 | }, 193 | "nbformat": 4, 194 | "nbformat_minor": 0 195 | } 196 | 
-------------------------------------------------------------------------------- /ttl.py: -------------------------------------------------------------------------------- 1 | import re 2 | from rdflib import URIRef, Graph 3 | import codecs 4 | from candidate import Phrase 5 | 6 | 7 | A = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" 8 | PHRASE = "#Phrase" 9 | CONTEXT = "#Context" 10 | STRING = "#isString" 11 | ANCOR = "#anchorOf" 12 | BEG = "#beginIndex" 13 | END = "#endIndex" 14 | CLASS_URI = URIRef("http://www.w3.org/2005/11/its/rdf#taClassRef") 15 | LINK_URI = URIRef("http://www.w3.org/2005/11/its/rdf#taIdentRef") 16 | NONE_URI = URIRef("http://dbpedia.org/nonsense") 17 | # NONE_URI = URIRef("http://dbpedia.org/page/Thing") 18 | 19 | 20 | class DatasetBuilder(object): 21 | def __init__(self, dataset_fpath): 22 | self._dataset_fpath = dataset_fpath 23 | with codecs.open(self._dataset_fpath, "a", "utf-8") as ttl_f: 24 | ttl_f.write("targets\tcontext\n") 25 | 26 | def add_to_dataset(self, input_ttl): 27 | graph, context, phrases = parse_d2kb_ttl(input_ttl) 28 | with codecs.open(self._dataset_fpath, "a", "utf-8") as ttl_f: 29 | phrases_str = ", ".join(p.text for p in phrases) 30 | ttl_f.write("{}\t{}\n".format(phrases_str, context)) 31 | 32 | 33 | def parse_d2kb_ttl(input_ttl): 34 | g = Graph() 35 | result = g.parse(data=input_ttl, format="n3") 36 | contexts, phrases = get_phrases(g) 37 | 38 | return g, contexts, phrases 39 | 40 | 41 | def get_phrases(g): 42 | """ Collect the context and phrases """ 43 | 44 | contexts = [] 45 | phrases = [] 46 | 47 | for subj, pred, obj in g: 48 | p = str(pred) 49 | s = str(subj) 50 | o = str(obj) 51 | 52 | # catch the context 53 | if o.endswith(CONTEXT): 54 | for pred_s, obj_s in g.predicate_objects(subj): 55 | if pred_s.strip().endswith(STRING): 56 | contexts.append(obj_s) 57 | 58 | # catch the phrases to disambiguate 59 | if o.endswith(PHRASE) or p.endswith(ANCOR): 60 | phrase = "" 61 | end = -1 62 | beg = -1 63 | for pred_s, obj_s in g.predicate_objects(subj): 64 | ps = pred_s.strip() 65 | if ps.endswith(ANCOR): phrase = str(obj_s) 66 | elif ps.endswith(BEG): beg = int(obj_s) 67 | elif ps.endswith(END): end = int(obj_s) 68 | 69 | if phrase == "" or beg == -1 or end == -1: 70 | print("Warning: bad phrase", subj, pred, obj) 71 | else: 72 | phrases.append(Phrase(phrase, beg, end, subj)) 73 | 74 | return contexts, phrases 75 | 76 | 77 | def add_nonsense_response(input_ttl): 78 | graph, context, phrases = parse_d2kb_ttl(input_ttl) 79 | 80 | # add new triples that correspond to the links of the disambiguation links 81 | print("# triples input:", len(graph)) 82 | for phrase in phrases: 83 | graph.add( (phrase.subj, CLASS_URI, NONE_URI) ) 84 | graph.add( (phrase.subj, LINK_URI, NONE_URI) ) 85 | print("# triples output:", len(graph)) 86 | 87 | output_ttl = str(graph.serialize(format='n3', encoding="utf-8"), "utf-8") 88 | 89 | return output_ttl 90 | 91 | 92 | def remove_classref(text): 93 | output = [] 94 | for line in text.split("\n"): 95 | upd_line = re.sub(r"itsrdf:taClassRef <[^;]*> ;", 96 | "itsrdf:taClassRef ;", 97 | line) 98 | output.append(upd_line) 99 | 100 | return "\n".join(output) 101 | 102 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from math import log 2 | import os 3 | from difflib import SequenceMatcher 4 | 5 | 6 | # This is the project root directory assuming that utils.py is in the root directory 7 | ROOT_DIR 
= os.path.dirname(os.path.abspath(__file__)) 8 | 9 | 10 | def ensure_dir(dir_path): 11 | if not os.path.exists(dir_path): os.makedirs(dir_path) 12 | 13 | 14 | def dbpedia2wikipedia(url, to_en=True): 15 | """ Convert a dbpedia to wikipedia url. """ 16 | 17 | url = url.replace("https://", "") 18 | url = url.replace("http://", "") 19 | 20 | if to_en: 21 | wiki_domain = "en.wikipedia.org/wiki/" 22 | else: 23 | wiki_domain = "wikipedia.org/wiki/" 24 | 25 | new_url = url.replace("dbpedia.org/resource/", wiki_domain) 26 | if new_url == url: 27 | new_url = url.replace("dbpedia.org/page/", wiki_domain) 28 | 29 | return new_url 30 | 31 | 32 | def longest_common_substring(s1, s2, lower=True): 33 | if lower: 34 | s1 = s1.lower() 35 | s2 = s2.lower() 36 | 37 | match = SequenceMatcher(None, s1, s2).find_longest_match(0, len(s1), 0, len(s2)) 38 | substring = s1[match.a: match.a + match.size] 39 | 40 | return substring 41 | 42 | 43 | def overlap(s1, s2, lower=True): 44 | direct = longest_common_substring(s1, s2, lower) 45 | inverse = longest_common_substring(s2, s1, lower) 46 | max_overlap = float(max(len(direct), len(inverse))) 47 | if max_overlap < 3: 48 | return 0.0 49 | else: 50 | max_len = float(max(len(s1), len(s2))) 51 | return max_overlap / max_len 52 | 53 | 54 | def truncated_log(x): 55 | if x > 0: return log(x) 56 | else: return 0.0 57 | --------------------------------------------------------------------------------
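A short, hedged usage sketch of the utils.py helpers above. It is illustrative only and not part of the repository; it assumes the script is run from the project root so that `utils` is importable, and the example strings are made up for demonstration.

# Illustrative sketch (assumption: executed from the repository root).
from utils import dbpedia2wikipedia, overlap, truncated_log

# dbpedia2wikipedia() rewrites a DBpedia resource/page URL into a Wikipedia URL.
print(dbpedia2wikipedia("http://dbpedia.org/resource/Barack_Obama"))
# -> en.wikipedia.org/wiki/Barack_Obama

# overlap() scores two strings by their longest common substring, normalised by
# the longer string's length; matches shorter than 3 characters score 0.0.
print(overlap("Barack Obama", "Obama"))   # 5 / 12, roughly 0.42
print(overlap("Obama", "Ob"))             # 0.0, common substring too short

# truncated_log() behaves like log(x) but maps non-positive inputs to 0.0.
print(truncated_log(0))                   # 0.0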