├── KNNBaseline.ipynb
├── README.md
├── filter_mentions.py
├── filter_sentences.py
├── fuzzy_dedup.py
├── indexer.py
├── reader.py
├── reader.rb
├── result_dataset.png
├── sentence.pb.rb
├── sentence.proto
└── sentence_pb2.py

/KNNBaseline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {
7 |     "collapsed": true
8 |    },
9 |    "outputs": [],
10 |    "source": [
11 |     "import gzip\n",
12 |     "import itertools\n",
13 |     "from collections import defaultdict"
14 |    ]
15 |   },
16 |   {
17 |    "cell_type": "code",
18 |    "execution_count": 2,
19 |    "metadata": {
20 |     "collapsed": true
21 |    },
22 |    "outputs": [],
23 |    "source": [
24 |     "import numpy as np\n",
25 |     "import random"
26 |    ]
27 |   },
28 |   {
29 |    "cell_type": "code",
30 |    "execution_count": 3,
31 |    "metadata": {
32 |     "collapsed": true
33 |    },
34 |    "outputs": [],
35 |    "source": [
36 |     "class PredictorBase:\n",
37 |     "    \"\"\"\n",
38 |     "    This class implements the experiment design:\n",
39 |     "    a few labeled instances are passed to the `train` method, then validation instances are passed\n",
40 |     "    to the `predict` method.\n",
41 |     "    \"\"\"\n",
42 |     "    \n",
43 |     "    def __init__(self):\n",
44 |     "        pass\n",
45 |     "    \n",
46 |     "    def clear(self):\n",
47 |     "        \"\"\"\n",
48 |     "        Clear state from previous batches\n",
49 |     "        \"\"\"\n",
50 |     "        pass\n",
51 |     "    \n",
52 |     "    def train(self, X, y):\n",
53 |     "        \"\"\"\n",
54 |     "        Train on a few entities. Override this method in a real implementation\n",
55 |     "        \"\"\"\n",
56 |     "        pass\n",
57 |     "    \n",
58 |     "    def predict(self, X):\n",
59 |     "        \"\"\"\n",
60 |     "        Predict classes of the given entities\n",
61 |     "        \"\"\"\n",
62 |     "        pass"
63 |    ]
64 |   },
65 |   {
66 |    "cell_type": "code",
67 |    "execution_count": 4,
68 |    "metadata": {
69 |     "collapsed": true
70 |    },
71 |    "outputs": [],
72 |    "source": [
73 |     "class Evaluator:\n",
74 |     "    batch_size = 10\n",
75 |     "    test_fraction = 0.3\n",
76 |     "    \n",
77 |     "    @classmethod\n",
78 |     "    def read_lines(cls, fd):\n",
79 |     "        for line in fd:\n",
80 |     "            yield line.decode().strip('\\n').split('\\t')\n",
81 |     "    \n",
82 |     "    def __init__(self, filename=\"./shuffled_dedup_entities.tsv.gz\"):\n",
83 |     "        self.fd = gzip.open(filename, 'r')\n",
84 |     "        self.reader = self.read_lines(self.fd)\n",
85 |     "    \n",
86 |     "    \n",
87 |     "    def read_batch(self, size=None):\n",
88 |     "        batch = list(itertools.islice(self.reader, size or self.batch_size))\n",
89 |     "        \n",
90 |     "        groups = defaultdict(list)\n",
91 |     "        for entity in batch:\n",
92 |     "            groups[entity[0]].append(entity)\n",
93 |     "        \n",
94 |     "        train_groups = {}\n",
95 |     "        test_groups = {}\n",
96 |     "        for etype, entities in groups.items():\n",
97 |     "            if len(entities) * self.test_fraction > 1:\n",
98 |     "                test_size = int(len(entities) * self.test_fraction)\n",
99 |     "                test_groups[etype] = entities[:test_size]\n",
100 |     "                train_groups[etype] = entities[test_size:]\n",
101 |     "        \n",
102 |     "        return train_groups, test_groups\n",
103 |     "    \n",
104 |     "    @classmethod\n",
105 |     "    def prepare_data(cls, group):\n",
106 |     "        X, y = [], []\n",
107 |     "        for label, entities in group.items():\n",
108 |     "            for entity in entities:\n",
109 |     "                X.append((entity[1], entity[3]))\n",
110 |     "                y.append(label)\n",
111 |     "\n",
112 |     "        c = list(zip(X, y))\n",
113 |     "\n",
114 |     "        random.shuffle(c)\n",
115 |     "\n",
116 |     "        X, y = zip(*c)\n",
117 |     "        \n",
118 |     "        return X, y\n",
119 |     "    \n",
120 |     "    def eval_batched(self, model, metric, entities_count, count):\n",
121 |     "        metrics = []\n",
122 |     "        for batch_id in range(count):\n",
123 |     "            train, test = self.read_batch(entities_count)\n",
124 |     "            X, y = Evaluator.prepare_data(train)\n",
125 |     "            X_test, y_test = Evaluator.prepare_data(test)\n",
126 |     "            model.train(X, y)\n",
127 |     "            pred = model.predict(X_test)\n",
128 |     "            score = metric(pred, y_test)\n",
129 |     "            metrics.append(score)\n",
130 |     "        return np.mean(metrics)\n",
131 |     "        "
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 5,
137 |    "metadata": {
138 |     "collapsed": true
139 |    },
140 |    "outputs": [],
141 |    "source": [
142 |     "from sklearn.pipeline import Pipeline\n",
143 |     "from sklearn.preprocessing import FunctionTransformer\n",
144 |     "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
145 |     "from sklearn.neighbors import KNeighborsClassifier\n",
146 |     "from sklearn.metrics import f1_score"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": 6,
152 |    "metadata": {
153 |     "collapsed": true
154 |    },
155 |    "outputs": [],
156 |    "source": [
157 |     "def concat_context(X):\n",
158 |     "    return np.array(list(map(\n",
159 |     "        lambda x: x[0] + \" \" + x[1],\n",
160 |     "        X\n",
161 |     "    )))\n",
162 |     "\n",
163 |     "class KNNBaseline(PredictorBase):\n",
164 |     "    def __init__(self):\n",
165 |     "        self.model = None\n",
166 |     "        self.clear()\n",
167 |     "    \n",
168 |     "    def clear(self):\n",
169 |     "        self.model = Pipeline([\n",
170 |     "            ('concat_context', FunctionTransformer(concat_context)),\n",
171 |     "            ('vectorizer', CountVectorizer(stop_words='english')),\n",
172 |     "            ('cls', KNeighborsClassifier(metric='cosine', algorithm='brute'))\n",
173 |     "        ])\n",
174 |     "    \n",
175 |     "    def train(self, X, y):\n",
176 |     "        self.model.fit(X, y)\n",
177 |     "    \n",
178 |     "    \n",
179 |     "    def predict(self, X):\n",
180 |     "        return self.model.predict(X)\n",
181 |     "    \n",
182 |     "    "
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 7,
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "eva = Evaluator()"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": 8,
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "mean_score = eva.eval_batched(\n",
201 |     "    model=KNNBaseline(),\n",
202 |     "    metric=lambda x, y: f1_score(x, y, average='micro'),\n",
203 |     "    entities_count=1000,\n",
204 |     "    count=50\n",
205 |     ")"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": 9,
211 |    "metadata": {},
212 |    "outputs": [
213 |     {
214 |      "data": {
215 |       "text/plain": [
216 |        "0.68707437656001968"
217 |       ]
218 |      },
219 |      "execution_count": 9,
220 |      "metadata": {},
221 |      "output_type": "execute_result"
222 |     }
223 |    ],
224 |    "source": [
225 |     "mean_score"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": null,
231 |    "metadata": {
232 |     "collapsed": true
233 |    },
234 |    "outputs": [],
235 |    "source": []
236 |   }
237 |  ],
238 |  "metadata": {
239 |   "kernelspec": {
240 |    "display_name": "Python 3",
241 |    "language": "python",
242 |    "name": "python3"
243 |   },
244 |   "language_info": {
245 |    "codemirror_mode": {
246 |     "name": "ipython",
247 |     "version": 3
248 |    },
249 |    "file_extension": ".py",
250 |    "mimetype": "text/x-python",
251 |    "name": "python",
252 |    "nbconvert_exporter": "python",
253 |    "pygments_lexer": "ipython3",
254 |    "version": "3.6.1"
255 |   }
256 |  },
257 |  "nbformat": 4,
258 |  "nbformat_minor": 2
259 | }
260 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ExtWikilinks
2 | ExtWikilinks is a dataset derived from http://www.iesl.cs.umass.edu/data/wiki-links and enriched with CoreNLP and Elasticsearch.
3 | 
4 | Download at: http://academictorrents.com/details/80d0a22ed403b65f7cc0d81d51759b62c66b41ce
5 | 
6 | # Purpose
7 | 
8 | The original **Wikilinks** dataset contains a single entity per sentence, which is not enough to build a state-of-the-art named entity linking system, since the context of an entity may carry valuable information.
9 | Moreover, applying POS-tagging to 40 million sentences takes a lot of time, so this information is already included in this extended dataset.
10 | 
11 | # Enrichment
12 | 
13 | Two main mechanisms are involved in the enrichment of the **Wikilinks** dataset: a CoreNLP pipeline and a search for additional entities within the dataset itself using the Elasticsearch engine. Let's describe both.
14 | 
15 | ## CoreNLP processing
16 | Each text abstract in the original dataset was analysed with the CoreNLP library using the following pipeline:
17 | the abstract is divided into sentences, and the sentence containing the mention passes through the following steps:
18 | ```
19 | tokenize, ssplit, pos, lemma, parse
20 | ```
21 | Since the `parse` step builds a tree, its result was flattened into groups (analogous to chunking). As a result, each token in the enriched dataset stores 5 parameters:
22 | 
23 | * Token
24 | * Lemma
25 | * POS-tag
26 | * Parse tag (the tag assigned by the parser)
27 | * Group id
28 | 
29 | The previous and next sentences are also stored in the dataset, but as raw text.
30 | 
31 | ## Elasticsearch processing
32 | To enrich sentences with additional links, each Noun Phrase (NP) extracted in the CoreNLP step (stop words excluded) was searched for among the mentions of the original dataset (taking context into account). A single entity is likely to have similar mentions across texts, but this search may introduce ambiguity. For this reason, all search results above a minimal score threshold are stored in the extended dataset. Each additional mention carries information about its hit count and average search score.
33 | 
34 | # Storage format
35 | The dataset consists of 79 protobuf files of 500 000 sentences each, compressed with tar and gzip. Use the following command to extract the content:
36 | ```
37 | tar -xzvf archive_name.tgz
38 | ```
39 | Each protobuf file is a sequence of protobuf-encoded messages, each prefixed with its varint-encoded length:
40 | ```
41 | [varbyte(length of msg1)] [msg1] [varbyte(length of msg2)] [msg2] ...
42 | ```
43 | A description of the protobuf schema can be found in the `sentence.proto` file.
44 | 
45 | For reading messages in this format you can use `parseDelimitedFrom` in Java or `streamFromDelimitedInput` in Scala. A Ruby example is available in the `reader.rb` file, and a Python sketch follows below.
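For Python, a minimal reading sketch along the following lines should work. This snippet is an illustration rather than part of the repository tooling: it assumes the `sentence_pb2.py` module generated from `sentence.proto`, relies on protobuf's internal varint decoder, loads a whole extracted file into memory, and the file name is hypothetical:

```python
import sentence_pb2
from google.protobuf.internal.decoder import _DecodeVarint32


def read_sentences(path):
    """Yield Sentence messages from one varint-delimited protobuf file."""
    with open(path, 'rb') as fd:
        data = fd.read()
    pos = 0
    while pos < len(data):
        # Each record is a varint length prefix followed by the message bytes
        length, pos = _DecodeVarint32(data, pos)
        sentence = sentence_pb2.Sentence()
        sentence.ParseFromString(data[pos:pos + length])
        pos += length
        yield sentence


for sentence in read_sentences('extracted_part.pb'):
    print(sentence.sent)
    break
```

A streaming variant that avoids loading the whole file (reading from stdin instead) is implemented in `reader.py`.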
46 | 
47 | # Basic statistics
48 | 
49 | Number of sentences: 40 million
50 | 
51 | Number of unique entities: 2 million
52 | 
53 | # Example of extended sentence
54 | 
55 | ![Example of extended sentence](/result_dataset.png)
56 | 
57 | Another example sentence (in JSON):
58 | 
59 | ```json
60 | {
61 |     "sent": "Isaac Asimov advocated something like that with his \"Three Laws.\"",
62 |     "mentions": [
63 |         {
64 |             "id": 0,
65 |             "resolver": "wikilink",    <-- original mention
66 |             "text": "Isaac Asimov",
67 |             "position": {
68 |                 "fromPos": 0,
69 |                 "toPos": 13
70 |             },
71 |             "context": {
72 |                 "size": -1,
73 |                 "left": "goals and I would say therefore at that point lets put a chip in their brain to shut them off if they get murderous thoughts.",
74 |                 "right": "advocated something like that with his \"Three Laws.\" I say hey, put a chip in their brain to shut them off if they start"
75 |             },
76 |             "concepts": [
77 |                 {
78 |                     "link": "http://en.wikipedia.org/wiki/Isaac_Asimov",
79 |                     "hits": 1,
80 |                     "avgScore": 1,
81 |                     "maxScore": 1,
82 |                     "minScore": 1,
83 |                     "avgNorm": 1,
84 |                     "avgSoftMax": 1
85 |                 }
86 |             ]
87 |         },
88 |         {
89 |             "id": 2,
90 |             "resolver": "elastic",    <-- mention, detected by search
91 |             "text": "Isaac Asimov",
92 |             "position": {
93 |                 "fromPos": 0,
94 |                 "toPos": 12
95 |             },
96 |             "params": {
97 |                 "sum_weight": 1.4788535592903926,
98 |                 "avg_wight": 0.7394267796451963,
99 |                 "max_wight": 0.7675390229334701,
100 |                 "word_count": 2
101 |             },
102 |             "context": {
103 |                 "size": 3,
104 |                 "left": "",
105 |                 "right": "advocated something like"
106 |             },
107 |             "concepts": [
108 |                 {
109 |                     "link": "http://en.wikipedia.org/wiki/Isaac_Asimov",
110 |                     "hits": 23,
111 |                     "avgScore": 9.063705444335938,
112 |                     "maxScore": 9.063705444335938,
113 |                     "minScore": 9.063705444335938,
114 |                     "avgNorm": 0.3333333333333333,
115 |                     "avgSoftMax": 0.3333333333333333
116 |                 }
117 |             ]
118 |         },
119 |         {
120 |             "id": 3,
121 |             "resolver": "elastic",
122 |             "text": "Three Laws",
123 |             "position": {
124 |                 "fromPos": 53,
125 |                 "toPos": 63
126 |             },
127 |             "params": {
128 |                 "sum_weight": 1.2180295766124987,
129 |                 "avg_wight": 0.6090147883062493,
130 |                 "max_wight": 0.7447375343930597,
131 |                 "word_count": 2
132 |             },
133 |             "context": {
134 |                 "size": 3,
135 |                 "left": "with his ``",
136 |                 "right": ". ''"
137 |             },
138 |             "concepts": [
139 |                 {
140 |                     "link": "http://en.wikipedia.org/wiki/Clarke's_three_laws",
141 |                     "hits": 4,
142 |                     "avgScore": 7.716668963432312,
143 |                     "maxScore": 7.717005729675293,
144 |                     "minScore": 7.715658664703369,
145 |                     "avgNorm": 0.20000193963662358,
146 |                     "avgSoftMax": 0.20001494353287008
147 |                 },
148 |                 {
149 |                     "link": "http://en.wikipedia.org/wiki/Three_Laws_of_Robotics",
150 |                     "hits": 18,
151 |                     "avgScore": 7.71663154496087,
152 |                     "maxScore": 7.717005729675293,
153 |                     "minScore": 7.715658664703369,
154 |                     "avgNorm": 0.20000096981831178,
155 |                     "avgSoftMax": 0.20000745941944043
156 |                 }
157 |             ]
158 |         }
159 |     ],
160 |     "parser_name": "CoreNLP",
161 |     "parse_result": [
162 |         {
163 |             "token": "Isaac",
164 |             "lemma": "Isaac",
165 |             "pos_tag": "NNP",
166 |             "parserTag": "NP",
167 |             "group": 0,
168 |             "mentions": [
169 |                 0,
170 |                 2
171 |             ]
172 |         },
173 |         {
174 |             "token": "Asimov",
175 |             "lemma": "Asimov",
176 |             "pos_tag": "NNP",
177 |             "parserTag": "NP",
178 |             "group": 0,
179 |             "mentions": [
180 |                 0,
181 |                 2
182 |             ]
183 |         },
184 |         {
185 |             "token": "advocated",
186 |             "lemma": "advocate",
187 |             "pos_tag": "VBD",
188 |             "parserTag": "VBD",
189 |             "group": 1
190 |         },
191 |         {
192 |             "token": "something",
193 |             "lemma": "something",
194 |             "pos_tag": "NN",
195 |             "parserTag": "NP",
196 |             "group": 2
197 |         },
198 |         {
199 |             "token": "like",
200 |             "lemma": "like",
201 |             "pos_tag": "IN",
202 |             "parserTag": "IN",
203 |             "group": 3
204 |         },
205 |         {
206 |             "token": "that",
207 |             "lemma": "that",
208 |             "pos_tag": "DT",
209 |             "parserTag": "NP",
210 |             "group": 4
211 |         },
212 |         {
213 |             "token": "with",
214 |             "lemma": "with",
215 |             "pos_tag": "IN",
216 |             "parserTag": "IN",
217 |             "group": 5
218 |         },
219 |         {
220 |             "token": "his",
221 |             "lemma": "he",
222 |             "pos_tag": "PRP$",
223 |             "parserTag": "NP",
224 |             "group": 6
225 |         },
226 |         {
227 |             "token": "``",
228 |             "lemma": "``",
229 |             "pos_tag": "``",
230 |             "parserTag": "``",
231 |             "group": 7
232 |         },
233 |         {
234 |             "token": "Three",
235 |             "lemma": "three",
236 |             "pos_tag": "CD",
237 |             "parserTag": "NP",
238 |             "group": 8,
239 |             "mentions": [
240 |                 3
241 |             ]
242 |         },
243 |         {
244 |             "token": "Laws",
245 |             "lemma": "law",
246 |             "pos_tag": "NNS",
247 |             "parserTag": "NP",
248 |             "group": 8,
249 |             "mentions": [
250 |                 3
251 |             ]
252 |         },
253 |         {
254 |             "token": ".",
255 |             "lemma": ".",
256 |             "pos_tag": ".",
257 |             "parserTag": ".",
258 |             "group": 9
259 |         },
260 |         {
261 |             "token": "''",
262 |             "lemma": "''",
263 |             "pos_tag": "''",
264 |             "parserTag": "''",
265 |             "group": 10
266 |         }
267 |     ],
268 |     "prevSentence": "goals and I would say therefore at that point lets put a chip in their brain to shut them off if they get murderous thoughts. ",
269 |     "nextSentence": " I say hey, put a chip in their brain to shut them off if they start"
270 | }
271 | ```
272 | # Citations
273 | 
274 | 
275 | ```
276 | @techreport{singh12:wiki-links,
277 |   author      = "Sameer Singh and Amarnag Subramanya and Fernando Pereira and Andrew McCallum",
278 |   title       = "Wikilinks: A Large-scale Cross-Document Coreference Corpus Labeled via Links to {Wikipedia}",
279 |   institution = "University of Massachusetts, Amherst",
280 |   number      = "UM-CS-2012-015",
281 |   year        = "2012"
282 | }
283 | ```
284 | 
285 | ```
286 | @InProceedings{manning-EtAl:2014:P14-5,
287 |   author    = {Manning, Christopher D. and Surdeanu, Mihai and Bauer, John and Finkel, Jenny and Bethard, Steven J. and McClosky, David},
288 |   title     = {The {Stanford} {CoreNLP} Natural Language Processing Toolkit},
289 |   booktitle = {Association for Computational Linguistics (ACL) System Demonstrations},
290 |   year      = {2014},
291 |   pages     = {55--60},
292 |   url       = {http://www.aclweb.org/anthology/P/P14/P14-5010}
293 | }
294 | ```
295 | 
--------------------------------------------------------------------------------
/filter_mentions.py:
--------------------------------------------------------------------------------
1 | """
2 | This script filters out trash mentions
3 | """
4 | 
5 | import sys
6 | import re
7 | 
8 | input_filename = sys.argv[1]
9 | output_filename = sys.argv[2]
10 | 
11 | 
12 | def check_mention(mention):
13 |     # count(' ') counts separators, so this allows mentions of up to 3 words
14 |     word_count = mention.count(' ')
15 |     if word_count > 2:
16 |         return False
17 | 
18 |     length = len(mention)
19 | 
20 |     if length > 50:
21 |         return False
22 | 
23 |     numbers = sum(c.isdigit() for c in mention)
24 |     chars = sum(c.isalpha() for c in mention)
25 |     other = length - numbers - chars
26 | 
27 |     if numbers > chars:
28 |         return False
29 | 
30 |     if other > chars:
31 |         return False
32 | 
33 |     if re.match(r'^[\s\d\w\-\,]*$', mention) is None:
34 |         return False
35 | 
36 |     return True
37 | 
38 | with open(input_filename) as inp:
39 |     with open(output_filename, 'w') as out:
40 |         for line in inp:
41 |             if check_mention(line):
42 |                 out.write(line)
--------------------------------------------------------------------------------
/filter_sentences.py:
--------------------------------------------------------------------------------
1 | """
2 | This script filters out trash sentences
3 | """
4 | 
5 | import sys
6 | import re
7 | 
8 | input_filename = sys.argv[1]
9 | output_filename = sys.argv[2]
10 | 
11 | def check_mention(mention):
12 |     # count(' ') counts separators, so this allows mentions of up to 4 words
13 |     word_count = mention.count(' ')
14 |     if word_count > 3:
15 |         return False
16 | 
17 |     length = len(mention)
18 | 
19 |     if length > 50:
20 |         return False
21 | 
22 |     numbers = sum(c.isdigit() for c in mention)
23 |     chars = sum(c.isalpha() for c in mention)
24 |     other = length - numbers - chars
25 | 
26 |     if numbers > chars:
27 |         return False
28 | 
29 |     if other > chars:
30 |         return False
31 | 
32 |     if re.match(r'^[\s\d\w\-\,]*$', mention) is None:
33 |         return False
34 | 
35 |     return True
36 | 
37 | def convert_concept(concept):
38 |     # Strip angle brackets and keep only the part of the link after "wiki/"
39 |     concept = concept.strip('><')
40 |     m = re.search(r'wiki/(.*)$', concept)
41 |     if m:
42 |         concept = m.groups()[0]
43 |     else:
44 |         concept = ''
45 |     return concept
46 | 
47 | # Map redirected Wikipedia concepts to their canonical pages
48 | redirects = {}
49 | with open('./cleaned_redirects.tsv') as fd:
50 |     for line in fd:
51 |         from_c, to_c = line.strip().split('\t')
52 |         redirects[from_c] = to_c
53 | 
54 | 
55 | with open(input_filename) as inp:
56 |     with open(output_filename, 'w') as out:
57 |         for line in inp:
58 |             if line.count('\t') != 3:
59 |                 continue
60 | 
61 |             left_context, mention_text, mention_link, right_context = line.split('\t')
62 | 
63 |             passed = True
64 | 
65 |             passed &= check_mention(mention_text)
66 | 
67 |             if len(left_context) + len(right_context) < 10:
68 |                 passed = False
69 | 
70 |             concept = convert_concept(mention_link)
71 |             concept = redirects.get(concept, concept)
72 | 
73 |             if passed:
74 |                 out.write("\t".join([concept, left_context, mention_text, right_context]))
75 | 
--------------------------------------------------------------------------------
/fuzzy_dedup.py:
--------------------------------------------------------------------------------
1 | import sys
2 | 
3 | from fuzzywuzzy import fuzz
4 | 
5 | # Contexts of the previously written line, used to detect near-duplicates
6 | prev_right_context = ""
7 | prev_left_context = ""
8 | 
9 | input_filename = sys.argv[1]
10 | output_filename = sys.argv[2]
11 | 
12 | # Drop a line when either of its contexts is at least this similar to the previous line's
13 | threshold = 70
14 | 
15 | with open(input_filename) as inp:
16 |     with open(output_filename, 'w') as out:
17 | 
18 |         for line in inp:
19 | 
20 |             if line.count('\t') != 3:
21 |                 continue
22 | 
23 |             passed = True
24 | 
25 |             concept, left_context, mention_text, right_context = line.split('\t')
26 | 
27 |             fuzzy_ratio = fuzz.ratio(left_context, prev_left_context)
28 |             if fuzzy_ratio > threshold:
29 |                 passed = False
30 | 
31 |             fuzzy_ratio = fuzz.ratio(right_context, prev_right_context)
32 |             if fuzzy_ratio > threshold:
33 |                 passed = False
34 | 
35 |             prev_right_context = right_context
36 |             prev_left_context = left_context
37 | 
38 |             if passed:
39 |                 out.write("\t".join([concept, left_context, mention_text, right_context]))
--------------------------------------------------------------------------------
/indexer.py:
--------------------------------------------------------------------------------
1 | """
2 | This script creates an SQLite database with full-text-indexed mentions.
3 | """
4 | 
5 | import sqlite3
6 | import sys
7 | 
8 | # FTS4 virtual table; only the `mention` column is full-text indexed,
9 | # the other columns are stored but marked notindexed
10 | query = """
11 | CREATE VIRTUAL TABLE IF NOT EXISTS sentences
12 | USING fts4(
13 |     left_context TEXT,
14 |     mention TEXT,
15 |     concept TEXT,
16 |     right_context TEXT,
17 |     notindexed=left_context,
18 |     notindexed=right_context,
19 |     notindexed=concept
20 | )
21 | """
22 | 
23 | conn = sqlite3.connect("db_sent.sqlite")
24 | cursor = conn.cursor()
25 | cursor.execute(query)
26 | cursor.close()
27 | conn.commit()
28 | 
29 | 
30 | cursor = conn.cursor()
31 | 
32 | filename = sys.argv[1]
33 | 
34 | insert_query = """
35 | INSERT INTO sentences VALUES (?, ?, ?, ?)
36 | """
37 | 
38 | with open(filename) as fd:
39 |     for line in fd:
40 |         arr = line.split('\t')
41 |         if len(arr) == 4:
42 |             left_context, mention, concept, right_context = arr
43 |             cursor.execute(insert_query, (left_context, mention, concept, right_context))
44 |         else:
45 |             print("Skip:", line)
46 | 
47 | conn.commit()
--------------------------------------------------------------------------------
/reader.py:
--------------------------------------------------------------------------------
1 | import sys
2 | 
3 | import sentence_pb2
4 | 
5 | from google.protobuf.internal import decoder
6 | 
7 | 
8 | def get_delimited_message_bytes(byte_stream, nr=4):
9 |     ''' Parse a delimited protobuf message. This is done by first reading a protobuf varint from
10 |     the stream that holds the length of the message, then reading that many bytes and
11 |     parsing them as the message.
12 |     Since the length prefix takes at most `nr` bytes here, first get `nr` bytes and try to decode.
13 |     The decoder returns the value and the position where the varint ended, so the bytes
14 |     already read past that position belong to the message and must be kept.
15 |     '''
16 |     length_bytes = byte_stream.read(nr)
17 |     if not length_bytes:
18 |         # End of input: no more messages
19 |         return (0, None)
20 | 
21 |     (length, pos) = decoder._DecodeVarint32(length_bytes, 0)
22 | 
23 |     message_bytes = byte_stream.read(length - (nr - pos))
24 | 
25 |     total_len = length + pos
26 |     return (total_len, length_bytes[pos:] + message_bytes)
27 | 
28 | while True:
29 |     _, msg_buf = get_delimited_message_bytes(sys.stdin.buffer)
30 |     if msg_buf is None:
31 |         break
32 | 
33 |     sentence = sentence_pb2.Sentence()
34 |     sentence.ParseFromString(msg_buf)
35 | 
36 |     mention_link = ''
37 |     mention_text = ''
38 |     left_context = ''
39 |     right_context = ''
40 | 
41 |     # Report only the original wikilink mention of each sentence
42 |     for mention in sentence.mentions:
43 |         if mention.resolver == 'wikilink':
44 |             mention_link = mention.concepts[0].link
45 |             mention_text = mention.text
46 | 
47 |             left_context = mention.context.left
48 |             right_context = mention.context.right
49 |             break
50 | 
51 |     print("\t".join([
52 |         left_context,
53 |         mention_text,
54 |         mention_link,
55 |         right_context
56 |     ]))
57 | 
--------------------------------------------------------------------------------
/reader.rb:
--------------------------------------------------------------------------------
1 | require_relative "sentence.pb.rb"
2 | 
3 | class ExtWikilinksReader
4 | 
5 |   def self.read_sentence(stream)
6 |     return nil if stream.eof?
7 |     length = Protobuf::Varint.decode(stream)
8 |     Sentence.decode(stream.read(length))
9 |   end
10 | 
11 |   def self.write_sentence(stream, snt)
12 |     byte_data = snt.encode
13 |     stream << "#{Protobuf::Field::VarintField.encode(byte_data.size)}#{byte_data}"
14 |   end
15 | 
16 | end
17 | 
18 | 
--------------------------------------------------------------------------------
/result_dataset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/generall/ExtWikilinks/08e78066db0e2f099986d9c8a80f2ac417a81c14/result_dataset.png
--------------------------------------------------------------------------------
/sentence.pb.rb:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | 
3 | ##
4 | # This file is auto-generated. DO NOT EDIT!
5 | #
6 | require 'protobuf/message'
7 | 
8 | 
9 | ##
10 | # Message Classes
11 | #
12 | class Position < ::Protobuf::Message; end
13 | class Params < ::Protobuf::Message; end
14 | class Context < ::Protobuf::Message; end
15 | class Concept < ::Protobuf::Message; end
16 | class Mention < ::Protobuf::Message; end
17 | class Token < ::Protobuf::Message; end
18 | class Sentence < ::Protobuf::Message; end
19 | 
20 | 
21 | ##
22 | # Message Fields
23 | #
24 | class Position
25 |   required :int32, :fromPos, 1
26 |   required :int32, :toPos, 2
27 | end
28 | 
29 | class Params
30 |   optional :double, :sum_weight, 1
31 |   optional :double, :avg_wight, 2
32 |   optional :double, :max_wight, 3
33 |   optional :int32, :word_count, 4
34 | end
35 | 
36 | class Context
37 |   required :int32, :size, 1
38 |   optional :string, :left, 2
39 |   optional :string, :right, 3
40 | end
41 | 
42 | class Concept
43 |   required :string, :link, 1
44 |   optional :int32, :hits, 2
45 |   optional :double, :avgScore, 3
46 |   optional :double, :maxScore, 4
47 |   optional :double, :minScore, 5
48 |   optional :double, :avgNorm, 6
49 |   optional :double, :avgSoftMax, 7
50 | end
51 | 
52 | class Mention
53 |   required :int32, :id, 1
54 |   optional :string, :resolver, 2
55 |   optional :string, :text, 3
56 |   optional ::Position, :position, 4
57 |   optional ::Params, :params, 5
58 |   optional ::Context, :context, 6
59 |   repeated ::Concept, :concepts, 7
60 | end
61 | 
62 | class Token
63 |   optional :string, :token, 1
64 |   optional :string, :lemma, 2
65 |   optional :string, :pos_tag, 3
66 |   optional :string, :parserTag, 4
67 |   optional :int32, :group, 5
68 |   repeated :int32, :mentions, 6
69 | end
70 | 
71 | class Sentence
72 |   optional :string, :sent, 1
73 |   repeated ::Mention, :mentions, 2
74 |   optional :string, :parser_name, 3
75 |   repeated ::Token, :parse_result, 4
76 |   optional :string, :prevSentence, 5
77 |   optional :string, :nextSentence, 6
78 | end
79 | 
80 | 
--------------------------------------------------------------------------------
/sentence.proto:
--------------------------------------------------------------------------------
1 | syntax = "proto2";
2 | option java_package = "ml.generall";
3 | 
4 | // Position of meta info
5 | message Position {
6 |     // from position
7 |     required int32 fromPos = 1;
8 |     // to position
9 |     required int32 toPos = 2;
10 |     // Next available id: 3
11 | }
12 | 
13 | // parameters of mention
14 | message Params {
15 |     // Sum weight of tokens
16 |     optional double sum_weight = 1; // Required
17 |     // average weight of tokens ("wight" spelling kept to match the released data)
18 |     optional double avg_wight = 2; // Required
19 |     // Max weight of tokens ("wight" spelling kept to match the released data)
20 |     optional double max_wight = 3; // Required
21 |     // count of tokens
22 |     optional int32 word_count = 4; // Required
23 |     // Next available id: 5
24 | }
25 | 
26 | // Context of mention
27 | message Context {
28 |     // size of context
29 |     required int32 size = 1; // Required
30 |     // left context string
31 |     optional string left = 2;
32 |     // right context string
33 |     optional string right = 3;
34 |     // Next available id: 4
35 | }
36 | 
37 | // Concept link
38 | message Concept {
39 |     // Link to concept
40 |     required string link = 1; // Required
41 |     // Count of hits
42 |     optional int32 hits = 2;
43 | 
44 |     // Average score
45 |     optional double avgScore = 3;
46 | 
47 |     // Max score
48 |     optional double maxScore = 4;
49 | 
50 |     // Minimal score
51 |     optional double minScore = 5;
52 | 
53 |     // Average normalized score
54 |     optional double avgNorm = 6;
55 | 
56 |     // Average soft-max normalized score
57 |     optional double avgSoftMax = 7;
58 | 
59 |     // Next available id: 8
60 | }
61 | 
62 | // Mention description
63 | message Mention {
64 |     // id of mention
65 |     required int32 id = 1; // Required
66 |     // Resolver name: "elastic" or "wikilink"
67 |     optional string resolver = 2; // Required
68 | 
69 |     // Text of mention
70 |     optional string text = 3; // Required
71 | 
72 |     // Position of mention
73 |     optional Position position = 4; // Required
74 | 
75 |     // Params of mention
76 |     optional Params params = 5; // Required
77 | 
78 |     // Context of mention
79 |     optional Context context = 6; // Required
80 | 
81 |     // Concepts
82 |     repeated Concept concepts = 7;
83 | 
84 |     // Next available id: 8
85 | }
86 | 
87 | // Token description
88 | message Token {
89 |     // Token string
90 |     optional string token = 1; // Required
91 | 
92 |     // Lemma of token
93 |     optional string lemma = 2; // Required
94 | 
95 |     // Part of speech tag
96 |     optional string pos_tag = 3; // Required
97 | 
98 |     // parser tag
99 |     optional string parserTag = 4; // Required
100 | 
101 |     // Group id of token
102 |     optional int32 group = 5; // Required
103 | 
104 |     // Mention ids of this token
105 |     repeated int32 mentions = 6;
106 | 
107 |     // Next available id: 7
108 | }
109 | 
110 | // Sentence object.
111 | message Sentence {
112 |     // Complete sentence text
113 |     optional string sent = 1; // Required
114 | 
115 |     // Mentions
116 |     repeated Mention mentions = 2;
117 | 
118 |     // Name of sentence parser: CoreNLP, OpenNLP etc.
119 |     optional string parser_name = 3; // Required
120 | 
121 | 
122 |     // Result of string parsing
123 |     repeated Token parse_result = 4;
124 | 
125 |     // Sentence before observed
126 |     optional string prevSentence = 5;
127 | 
128 |     // Sentence after observed
129 |     optional string nextSentence = 6;
130 | 
131 | 
132 |     // Next available id: 7
133 | }
134 | 
--------------------------------------------------------------------------------
/sentence_pb2.py:
--------------------------------------------------------------------------------
1 | # Generated by the protocol buffer compiler.  DO NOT EDIT!
2 | # source: sentence.proto 3 | 4 | import sys 5 | _b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1')) 6 | from google.protobuf import descriptor as _descriptor 7 | from google.protobuf import message as _message 8 | from google.protobuf import reflection as _reflection 9 | from google.protobuf import symbol_database as _symbol_database 10 | from google.protobuf import descriptor_pb2 11 | # @@protoc_insertion_point(imports) 12 | 13 | _sym_db = _symbol_database.Default() 14 | 15 | 16 | 17 | 18 | DESCRIPTOR = _descriptor.FileDescriptor( 19 | name='sentence.proto', 20 | package='', 21 | serialized_pb=_b('\n\x0esentence.proto\"*\n\x08Position\x12\x0f\n\x07\x66romPos\x18\x01 \x02(\x05\x12\r\n\x05toPos\x18\x02 \x02(\x05\"V\n\x06Params\x12\x12\n\nsum_weight\x18\x01 \x01(\x01\x12\x11\n\tavg_wight\x18\x02 \x01(\x01\x12\x11\n\tmax_wight\x18\x03 \x01(\x01\x12\x12\n\nword_count\x18\x04 \x01(\x05\"4\n\x07\x43ontext\x12\x0c\n\x04size\x18\x01 \x02(\x05\x12\x0c\n\x04left\x18\x02 \x01(\t\x12\r\n\x05right\x18\x03 \x01(\t\"\x80\x01\n\x07\x43oncept\x12\x0c\n\x04link\x18\x01 \x02(\t\x12\x0c\n\x04hits\x18\x02 \x01(\x05\x12\x10\n\x08\x61vgScore\x18\x03 \x01(\x01\x12\x10\n\x08maxScore\x18\x04 \x01(\x01\x12\x10\n\x08minScore\x18\x05 \x01(\x01\x12\x0f\n\x07\x61vgNorm\x18\x06 \x01(\x01\x12\x12\n\navgSoftMax\x18\x07 \x01(\x01\"\xa2\x01\n\x07Mention\x12\n\n\x02id\x18\x01 \x02(\x05\x12\x10\n\x08resolver\x18\x02 \x01(\t\x12\x0c\n\x04text\x18\x03 \x01(\t\x12\x1b\n\x08position\x18\x04 \x01(\x0b\x32\t.Position\x12\x17\n\x06params\x18\x05 \x01(\x0b\x32\x07.Params\x12\x19\n\x07\x63ontext\x18\x06 \x01(\x0b\x32\x08.Context\x12\x1a\n\x08\x63oncepts\x18\x07 \x03(\x0b\x32\x08.Concept\"j\n\x05Token\x12\r\n\x05token\x18\x01 \x01(\t\x12\r\n\x05lemma\x18\x02 \x01(\t\x12\x0f\n\x07pos_tag\x18\x03 \x01(\t\x12\x11\n\tparserTag\x18\x04 \x01(\t\x12\r\n\x05group\x18\x05 \x01(\x05\x12\x10\n\x08mentions\x18\x06 \x03(\x05\"\x93\x01\n\x08Sentence\x12\x0c\n\x04sent\x18\x01 \x01(\t\x12\x1a\n\x08mentions\x18\x02 \x03(\x0b\x32\x08.Mention\x12\x13\n\x0bparser_name\x18\x03 \x01(\t\x12\x1c\n\x0cparse_result\x18\x04 \x03(\x0b\x32\x06.Token\x12\x14\n\x0cprevSentence\x18\x05 \x01(\t\x12\x14\n\x0cnextSentence\x18\x06 \x01(\tB\r\n\x0bml.generall') 22 | ) 23 | _sym_db.RegisterFileDescriptor(DESCRIPTOR) 24 | 25 | 26 | 27 | 28 | _POSITION = _descriptor.Descriptor( 29 | name='Position', 30 | full_name='Position', 31 | filename=None, 32 | file=DESCRIPTOR, 33 | containing_type=None, 34 | fields=[ 35 | _descriptor.FieldDescriptor( 36 | name='fromPos', full_name='Position.fromPos', index=0, 37 | number=1, type=5, cpp_type=1, label=2, 38 | has_default_value=False, default_value=0, 39 | message_type=None, enum_type=None, containing_type=None, 40 | is_extension=False, extension_scope=None, 41 | options=None), 42 | _descriptor.FieldDescriptor( 43 | name='toPos', full_name='Position.toPos', index=1, 44 | number=2, type=5, cpp_type=1, label=2, 45 | has_default_value=False, default_value=0, 46 | message_type=None, enum_type=None, containing_type=None, 47 | is_extension=False, extension_scope=None, 48 | options=None), 49 | ], 50 | extensions=[ 51 | ], 52 | nested_types=[], 53 | enum_types=[ 54 | ], 55 | options=None, 56 | is_extendable=False, 57 | extension_ranges=[], 58 | oneofs=[ 59 | ], 60 | serialized_start=18, 61 | serialized_end=60, 62 | ) 63 | 64 | 65 | _PARAMS = _descriptor.Descriptor( 66 | name='Params', 67 | full_name='Params', 68 | filename=None, 69 | file=DESCRIPTOR, 70 | containing_type=None, 71 | fields=[ 72 | 
_descriptor.FieldDescriptor( 73 | name='sum_weight', full_name='Params.sum_weight', index=0, 74 | number=1, type=1, cpp_type=5, label=1, 75 | has_default_value=False, default_value=0, 76 | message_type=None, enum_type=None, containing_type=None, 77 | is_extension=False, extension_scope=None, 78 | options=None), 79 | _descriptor.FieldDescriptor( 80 | name='avg_wight', full_name='Params.avg_wight', index=1, 81 | number=2, type=1, cpp_type=5, label=1, 82 | has_default_value=False, default_value=0, 83 | message_type=None, enum_type=None, containing_type=None, 84 | is_extension=False, extension_scope=None, 85 | options=None), 86 | _descriptor.FieldDescriptor( 87 | name='max_wight', full_name='Params.max_wight', index=2, 88 | number=3, type=1, cpp_type=5, label=1, 89 | has_default_value=False, default_value=0, 90 | message_type=None, enum_type=None, containing_type=None, 91 | is_extension=False, extension_scope=None, 92 | options=None), 93 | _descriptor.FieldDescriptor( 94 | name='word_count', full_name='Params.word_count', index=3, 95 | number=4, type=5, cpp_type=1, label=1, 96 | has_default_value=False, default_value=0, 97 | message_type=None, enum_type=None, containing_type=None, 98 | is_extension=False, extension_scope=None, 99 | options=None), 100 | ], 101 | extensions=[ 102 | ], 103 | nested_types=[], 104 | enum_types=[ 105 | ], 106 | options=None, 107 | is_extendable=False, 108 | extension_ranges=[], 109 | oneofs=[ 110 | ], 111 | serialized_start=62, 112 | serialized_end=148, 113 | ) 114 | 115 | 116 | _CONTEXT = _descriptor.Descriptor( 117 | name='Context', 118 | full_name='Context', 119 | filename=None, 120 | file=DESCRIPTOR, 121 | containing_type=None, 122 | fields=[ 123 | _descriptor.FieldDescriptor( 124 | name='size', full_name='Context.size', index=0, 125 | number=1, type=5, cpp_type=1, label=2, 126 | has_default_value=False, default_value=0, 127 | message_type=None, enum_type=None, containing_type=None, 128 | is_extension=False, extension_scope=None, 129 | options=None), 130 | _descriptor.FieldDescriptor( 131 | name='left', full_name='Context.left', index=1, 132 | number=2, type=9, cpp_type=9, label=1, 133 | has_default_value=False, default_value=_b("").decode('utf-8'), 134 | message_type=None, enum_type=None, containing_type=None, 135 | is_extension=False, extension_scope=None, 136 | options=None), 137 | _descriptor.FieldDescriptor( 138 | name='right', full_name='Context.right', index=2, 139 | number=3, type=9, cpp_type=9, label=1, 140 | has_default_value=False, default_value=_b("").decode('utf-8'), 141 | message_type=None, enum_type=None, containing_type=None, 142 | is_extension=False, extension_scope=None, 143 | options=None), 144 | ], 145 | extensions=[ 146 | ], 147 | nested_types=[], 148 | enum_types=[ 149 | ], 150 | options=None, 151 | is_extendable=False, 152 | extension_ranges=[], 153 | oneofs=[ 154 | ], 155 | serialized_start=150, 156 | serialized_end=202, 157 | ) 158 | 159 | 160 | _CONCEPT = _descriptor.Descriptor( 161 | name='Concept', 162 | full_name='Concept', 163 | filename=None, 164 | file=DESCRIPTOR, 165 | containing_type=None, 166 | fields=[ 167 | _descriptor.FieldDescriptor( 168 | name='link', full_name='Concept.link', index=0, 169 | number=1, type=9, cpp_type=9, label=2, 170 | has_default_value=False, default_value=_b("").decode('utf-8'), 171 | message_type=None, enum_type=None, containing_type=None, 172 | is_extension=False, extension_scope=None, 173 | options=None), 174 | _descriptor.FieldDescriptor( 175 | name='hits', full_name='Concept.hits', index=1, 176 | 
number=2, type=5, cpp_type=1, label=1, 177 | has_default_value=False, default_value=0, 178 | message_type=None, enum_type=None, containing_type=None, 179 | is_extension=False, extension_scope=None, 180 | options=None), 181 | _descriptor.FieldDescriptor( 182 | name='avgScore', full_name='Concept.avgScore', index=2, 183 | number=3, type=1, cpp_type=5, label=1, 184 | has_default_value=False, default_value=0, 185 | message_type=None, enum_type=None, containing_type=None, 186 | is_extension=False, extension_scope=None, 187 | options=None), 188 | _descriptor.FieldDescriptor( 189 | name='maxScore', full_name='Concept.maxScore', index=3, 190 | number=4, type=1, cpp_type=5, label=1, 191 | has_default_value=False, default_value=0, 192 | message_type=None, enum_type=None, containing_type=None, 193 | is_extension=False, extension_scope=None, 194 | options=None), 195 | _descriptor.FieldDescriptor( 196 | name='minScore', full_name='Concept.minScore', index=4, 197 | number=5, type=1, cpp_type=5, label=1, 198 | has_default_value=False, default_value=0, 199 | message_type=None, enum_type=None, containing_type=None, 200 | is_extension=False, extension_scope=None, 201 | options=None), 202 | _descriptor.FieldDescriptor( 203 | name='avgNorm', full_name='Concept.avgNorm', index=5, 204 | number=6, type=1, cpp_type=5, label=1, 205 | has_default_value=False, default_value=0, 206 | message_type=None, enum_type=None, containing_type=None, 207 | is_extension=False, extension_scope=None, 208 | options=None), 209 | _descriptor.FieldDescriptor( 210 | name='avgSoftMax', full_name='Concept.avgSoftMax', index=6, 211 | number=7, type=1, cpp_type=5, label=1, 212 | has_default_value=False, default_value=0, 213 | message_type=None, enum_type=None, containing_type=None, 214 | is_extension=False, extension_scope=None, 215 | options=None), 216 | ], 217 | extensions=[ 218 | ], 219 | nested_types=[], 220 | enum_types=[ 221 | ], 222 | options=None, 223 | is_extendable=False, 224 | extension_ranges=[], 225 | oneofs=[ 226 | ], 227 | serialized_start=205, 228 | serialized_end=333, 229 | ) 230 | 231 | 232 | _MENTION = _descriptor.Descriptor( 233 | name='Mention', 234 | full_name='Mention', 235 | filename=None, 236 | file=DESCRIPTOR, 237 | containing_type=None, 238 | fields=[ 239 | _descriptor.FieldDescriptor( 240 | name='id', full_name='Mention.id', index=0, 241 | number=1, type=5, cpp_type=1, label=2, 242 | has_default_value=False, default_value=0, 243 | message_type=None, enum_type=None, containing_type=None, 244 | is_extension=False, extension_scope=None, 245 | options=None), 246 | _descriptor.FieldDescriptor( 247 | name='resolver', full_name='Mention.resolver', index=1, 248 | number=2, type=9, cpp_type=9, label=1, 249 | has_default_value=False, default_value=_b("").decode('utf-8'), 250 | message_type=None, enum_type=None, containing_type=None, 251 | is_extension=False, extension_scope=None, 252 | options=None), 253 | _descriptor.FieldDescriptor( 254 | name='text', full_name='Mention.text', index=2, 255 | number=3, type=9, cpp_type=9, label=1, 256 | has_default_value=False, default_value=_b("").decode('utf-8'), 257 | message_type=None, enum_type=None, containing_type=None, 258 | is_extension=False, extension_scope=None, 259 | options=None), 260 | _descriptor.FieldDescriptor( 261 | name='position', full_name='Mention.position', index=3, 262 | number=4, type=11, cpp_type=10, label=1, 263 | has_default_value=False, default_value=None, 264 | message_type=None, enum_type=None, containing_type=None, 265 | is_extension=False, 
extension_scope=None, 266 | options=None), 267 | _descriptor.FieldDescriptor( 268 | name='params', full_name='Mention.params', index=4, 269 | number=5, type=11, cpp_type=10, label=1, 270 | has_default_value=False, default_value=None, 271 | message_type=None, enum_type=None, containing_type=None, 272 | is_extension=False, extension_scope=None, 273 | options=None), 274 | _descriptor.FieldDescriptor( 275 | name='context', full_name='Mention.context', index=5, 276 | number=6, type=11, cpp_type=10, label=1, 277 | has_default_value=False, default_value=None, 278 | message_type=None, enum_type=None, containing_type=None, 279 | is_extension=False, extension_scope=None, 280 | options=None), 281 | _descriptor.FieldDescriptor( 282 | name='concepts', full_name='Mention.concepts', index=6, 283 | number=7, type=11, cpp_type=10, label=3, 284 | has_default_value=False, default_value=[], 285 | message_type=None, enum_type=None, containing_type=None, 286 | is_extension=False, extension_scope=None, 287 | options=None), 288 | ], 289 | extensions=[ 290 | ], 291 | nested_types=[], 292 | enum_types=[ 293 | ], 294 | options=None, 295 | is_extendable=False, 296 | extension_ranges=[], 297 | oneofs=[ 298 | ], 299 | serialized_start=336, 300 | serialized_end=498, 301 | ) 302 | 303 | 304 | _TOKEN = _descriptor.Descriptor( 305 | name='Token', 306 | full_name='Token', 307 | filename=None, 308 | file=DESCRIPTOR, 309 | containing_type=None, 310 | fields=[ 311 | _descriptor.FieldDescriptor( 312 | name='token', full_name='Token.token', index=0, 313 | number=1, type=9, cpp_type=9, label=1, 314 | has_default_value=False, default_value=_b("").decode('utf-8'), 315 | message_type=None, enum_type=None, containing_type=None, 316 | is_extension=False, extension_scope=None, 317 | options=None), 318 | _descriptor.FieldDescriptor( 319 | name='lemma', full_name='Token.lemma', index=1, 320 | number=2, type=9, cpp_type=9, label=1, 321 | has_default_value=False, default_value=_b("").decode('utf-8'), 322 | message_type=None, enum_type=None, containing_type=None, 323 | is_extension=False, extension_scope=None, 324 | options=None), 325 | _descriptor.FieldDescriptor( 326 | name='pos_tag', full_name='Token.pos_tag', index=2, 327 | number=3, type=9, cpp_type=9, label=1, 328 | has_default_value=False, default_value=_b("").decode('utf-8'), 329 | message_type=None, enum_type=None, containing_type=None, 330 | is_extension=False, extension_scope=None, 331 | options=None), 332 | _descriptor.FieldDescriptor( 333 | name='parserTag', full_name='Token.parserTag', index=3, 334 | number=4, type=9, cpp_type=9, label=1, 335 | has_default_value=False, default_value=_b("").decode('utf-8'), 336 | message_type=None, enum_type=None, containing_type=None, 337 | is_extension=False, extension_scope=None, 338 | options=None), 339 | _descriptor.FieldDescriptor( 340 | name='group', full_name='Token.group', index=4, 341 | number=5, type=5, cpp_type=1, label=1, 342 | has_default_value=False, default_value=0, 343 | message_type=None, enum_type=None, containing_type=None, 344 | is_extension=False, extension_scope=None, 345 | options=None), 346 | _descriptor.FieldDescriptor( 347 | name='mentions', full_name='Token.mentions', index=5, 348 | number=6, type=5, cpp_type=1, label=3, 349 | has_default_value=False, default_value=[], 350 | message_type=None, enum_type=None, containing_type=None, 351 | is_extension=False, extension_scope=None, 352 | options=None), 353 | ], 354 | extensions=[ 355 | ], 356 | nested_types=[], 357 | enum_types=[ 358 | ], 359 | options=None, 360 | 
is_extendable=False, 361 | extension_ranges=[], 362 | oneofs=[ 363 | ], 364 | serialized_start=500, 365 | serialized_end=606, 366 | ) 367 | 368 | 369 | _SENTENCE = _descriptor.Descriptor( 370 | name='Sentence', 371 | full_name='Sentence', 372 | filename=None, 373 | file=DESCRIPTOR, 374 | containing_type=None, 375 | fields=[ 376 | _descriptor.FieldDescriptor( 377 | name='sent', full_name='Sentence.sent', index=0, 378 | number=1, type=9, cpp_type=9, label=1, 379 | has_default_value=False, default_value=_b("").decode('utf-8'), 380 | message_type=None, enum_type=None, containing_type=None, 381 | is_extension=False, extension_scope=None, 382 | options=None), 383 | _descriptor.FieldDescriptor( 384 | name='mentions', full_name='Sentence.mentions', index=1, 385 | number=2, type=11, cpp_type=10, label=3, 386 | has_default_value=False, default_value=[], 387 | message_type=None, enum_type=None, containing_type=None, 388 | is_extension=False, extension_scope=None, 389 | options=None), 390 | _descriptor.FieldDescriptor( 391 | name='parser_name', full_name='Sentence.parser_name', index=2, 392 | number=3, type=9, cpp_type=9, label=1, 393 | has_default_value=False, default_value=_b("").decode('utf-8'), 394 | message_type=None, enum_type=None, containing_type=None, 395 | is_extension=False, extension_scope=None, 396 | options=None), 397 | _descriptor.FieldDescriptor( 398 | name='parse_result', full_name='Sentence.parse_result', index=3, 399 | number=4, type=11, cpp_type=10, label=3, 400 | has_default_value=False, default_value=[], 401 | message_type=None, enum_type=None, containing_type=None, 402 | is_extension=False, extension_scope=None, 403 | options=None), 404 | _descriptor.FieldDescriptor( 405 | name='prevSentence', full_name='Sentence.prevSentence', index=4, 406 | number=5, type=9, cpp_type=9, label=1, 407 | has_default_value=False, default_value=_b("").decode('utf-8'), 408 | message_type=None, enum_type=None, containing_type=None, 409 | is_extension=False, extension_scope=None, 410 | options=None), 411 | _descriptor.FieldDescriptor( 412 | name='nextSentence', full_name='Sentence.nextSentence', index=5, 413 | number=6, type=9, cpp_type=9, label=1, 414 | has_default_value=False, default_value=_b("").decode('utf-8'), 415 | message_type=None, enum_type=None, containing_type=None, 416 | is_extension=False, extension_scope=None, 417 | options=None), 418 | ], 419 | extensions=[ 420 | ], 421 | nested_types=[], 422 | enum_types=[ 423 | ], 424 | options=None, 425 | is_extendable=False, 426 | extension_ranges=[], 427 | oneofs=[ 428 | ], 429 | serialized_start=609, 430 | serialized_end=756, 431 | ) 432 | 433 | _MENTION.fields_by_name['position'].message_type = _POSITION 434 | _MENTION.fields_by_name['params'].message_type = _PARAMS 435 | _MENTION.fields_by_name['context'].message_type = _CONTEXT 436 | _MENTION.fields_by_name['concepts'].message_type = _CONCEPT 437 | _SENTENCE.fields_by_name['mentions'].message_type = _MENTION 438 | _SENTENCE.fields_by_name['parse_result'].message_type = _TOKEN 439 | DESCRIPTOR.message_types_by_name['Position'] = _POSITION 440 | DESCRIPTOR.message_types_by_name['Params'] = _PARAMS 441 | DESCRIPTOR.message_types_by_name['Context'] = _CONTEXT 442 | DESCRIPTOR.message_types_by_name['Concept'] = _CONCEPT 443 | DESCRIPTOR.message_types_by_name['Mention'] = _MENTION 444 | DESCRIPTOR.message_types_by_name['Token'] = _TOKEN 445 | DESCRIPTOR.message_types_by_name['Sentence'] = _SENTENCE 446 | 447 | Position = _reflection.GeneratedProtocolMessageType('Position', (_message.Message,), dict( 
448 | DESCRIPTOR = _POSITION, 449 | __module__ = 'sentence_pb2' 450 | # @@protoc_insertion_point(class_scope:Position) 451 | )) 452 | _sym_db.RegisterMessage(Position) 453 | 454 | Params = _reflection.GeneratedProtocolMessageType('Params', (_message.Message,), dict( 455 | DESCRIPTOR = _PARAMS, 456 | __module__ = 'sentence_pb2' 457 | # @@protoc_insertion_point(class_scope:Params) 458 | )) 459 | _sym_db.RegisterMessage(Params) 460 | 461 | Context = _reflection.GeneratedProtocolMessageType('Context', (_message.Message,), dict( 462 | DESCRIPTOR = _CONTEXT, 463 | __module__ = 'sentence_pb2' 464 | # @@protoc_insertion_point(class_scope:Context) 465 | )) 466 | _sym_db.RegisterMessage(Context) 467 | 468 | Concept = _reflection.GeneratedProtocolMessageType('Concept', (_message.Message,), dict( 469 | DESCRIPTOR = _CONCEPT, 470 | __module__ = 'sentence_pb2' 471 | # @@protoc_insertion_point(class_scope:Concept) 472 | )) 473 | _sym_db.RegisterMessage(Concept) 474 | 475 | Mention = _reflection.GeneratedProtocolMessageType('Mention', (_message.Message,), dict( 476 | DESCRIPTOR = _MENTION, 477 | __module__ = 'sentence_pb2' 478 | # @@protoc_insertion_point(class_scope:Mention) 479 | )) 480 | _sym_db.RegisterMessage(Mention) 481 | 482 | Token = _reflection.GeneratedProtocolMessageType('Token', (_message.Message,), dict( 483 | DESCRIPTOR = _TOKEN, 484 | __module__ = 'sentence_pb2' 485 | # @@protoc_insertion_point(class_scope:Token) 486 | )) 487 | _sym_db.RegisterMessage(Token) 488 | 489 | Sentence = _reflection.GeneratedProtocolMessageType('Sentence', (_message.Message,), dict( 490 | DESCRIPTOR = _SENTENCE, 491 | __module__ = 'sentence_pb2' 492 | # @@protoc_insertion_point(class_scope:Sentence) 493 | )) 494 | _sym_db.RegisterMessage(Sentence) 495 | 496 | 497 | DESCRIPTOR.has_options = True 498 | DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('\n\013ml.generall')) 499 | # @@protoc_insertion_point(module_scope) 500 | --------------------------------------------------------------------------------