├── .gitignore ├── LICENSE ├── README.md ├── clean_source_bucket.py ├── es-mapping-index ├── README.md ├── concept.json ├── publication.json └── settings.json ├── es-mapping ├── README.md ├── concept.json └── publication.json ├── gcp-local-ssd ├── readme.md ├── run.sh └── startup.sh ├── gcp-persistent-disk ├── README.md ├── exec.sh ├── run.sh ├── startup.sh ├── steps.sh └── tmux_example │ ├── README.md │ ├── bioentity_tmux.sh │ ├── bioentity_tmux_kill.sh │ ├── concept_tmux.sh │ ├── concept_tmux_kill.sh │ ├── es_bio.sh │ ├── es_concept.sh │ ├── es_pub.sh │ ├── es_tag.sh │ ├── publication_tmux.sh │ ├── publication_tmux_kill.sh │ ├── taggedtext_tmux.sh │ └── taggedtext_tmux_kill.sh ├── load2es.py ├── main.py ├── modules ├── AbbreviationFinder.py ├── BioStopWords.py ├── BioentityTagger.py ├── NLP.py ├── __init__.py └── vocabulary.py ├── publication_alias.sh ├── setup.py ├── tests ├── __init__.py ├── resources │ ├── common_words_as_genes.txt │ ├── test-medlinexml │ │ ├── test_baseline.xml.gz │ │ └── test_update.xml.gz │ ├── test-spacy │ │ ├── disease.xml │ │ └── geneProt.xml │ ├── test_abstract_lexebi.txt │ └── test_abstract_nlp.txt ├── test_tagger.py ├── text_medline_parser.py └── text_nlp.py └── venv_elasticsearch.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2017 Biogen, GlaxoSmithKline, EMBL - European Bioinformatics Institute, Wellcome Trust Sanger Institute 2 | 3 | This software was developed as part of the Open Targets project. 
For more information please see: 4 | 5 | http://www.opentargets.org 6 | Target Validation platform 7 | 8 | Licensed under the Apache License, Version 2.0 (the "License"); 9 | you may not use this file except in compliance with the License. 10 | You may obtain a copy of the License at 11 | 12 | http://www.apache.org/licenses/LICENSE-2.0 13 | 14 | Unless required by applicable law or agreed to in writing, software 15 | distributed under the License is distributed on an "AS IS" BASIS, 16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | 18 | See the License for the specific language governing permissions and 19 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Note: This repo has been archived because LINK (Library) has been decommissioned. 2 | 3 | # Open Targets Library - NLP Pipeline 4 | 5 | ## NLP Analysis of MedLine/PubMed Running in Apache Beam 6 | 7 | This pipeline is designed to run with Apache Beam using the Dataflow runner. 8 | It has not been tested with other Beam backends, but it should work on them with minimal modifications. 9 | Please see the [Apache Beam SDK](https://beam.apache.org/documentation/sdks/python/) for more info. 10 | 11 | ## Steps to reproduce a full run 12 | Use Python 2 with pip and virtualenv. 13 | 14 | * Generate a mirror of the MEDLINE FTP site in a Google Storage bucket (any other storage provider supported by the Python Beam SDK should work), e.g. using [rclone](https://rclone.org/) 15 | 16 | - Download [pre-built rclone binaries](https://rclone.org/install/#linux-installation-from-precompiled-binary) rather than platform-packaged ones, as they tend to be more up to date 17 | - Configure rclone with the MEDLINE FTP server [ftp.ncbi.nlm.nih.gov](ftp://ftp.ncbi.nlm.nih.gov) and your target GCP project 18 | (my-gcp-project-buckets) via `rclone config`. The MEDLINE remote must use username `anonymous` and password `anonymous`. 19 | - Generate a full mirror: 20 | `rclone sync -v medline-ftp:pubmed/baseline my-gcp-project-buckets:my-medline-bucket/baseline` 21 | - Update new files: 22 | `rclone sync -v medline-ftp:pubmed/updatefiles my-gcp-project-buckets:my-medline-bucket/updatefiles` 23 | - Note: you can use the `--dry-run` argument to test 24 | * Install tooling 25 | ```sh 26 | sudo apt-get install python-dev virtualenv build-essential git libxml2-dev libxslt-dev zlib1g-dev tmux 27 | ``` 28 | * Download the pipeline 29 | ```sh 30 | git clone https://github.com/opentargets/library-beam 31 | cd library-beam 32 | ``` 33 | * Create a virtual environment to manage dependencies in 34 | ```sh 35 | virtualenv venv --python=python2 36 | source venv/bin/activate 37 | ``` 38 | * Install the pipeline into the virtual environment 39 | ```sh 40 | python setup.py install 41 | # note: this needs between 3.75 GB and 7.5 GB of RAM 42 | pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.0/en_core_web_lg-2.2.0.tar.gz 43 | ``` 44 | * Grant the permission to the compute user:
45 | ``` 46 | numberHidden -compute@developer.gserviceaccount.com Cloud Build Service Agent 47 | ``` 48 | * Update the vocabulary settings in modules/vocabulary.py 49 | 50 | * Run the pipeline 51 | ```sh 52 | python -m main \ 53 | --project open-targets-library \ 54 | --job_name medline201911 \ 55 | --runner DataflowRunner \ 56 | --temp_location gs://medline_2019_11/temp \ 57 | --setup_file ./setup.py \ 58 | --worker_machine_type n1-highmem-32 \ 59 | --input_baseline gs://medline_2019_11/baseline/pubmed19n*.xml.gz \ 60 | --input_updates gs://medline_2019_11/updatefiles/pubmed19n*.xml.gz \ 61 | --output_enriched gs://medline_2019_11/analyzed/pubmed19 \ 62 | --output_splitted gs://medline_2019_11/splitted/pubmed19 \ 63 | --max_num_workers 32 \ 64 | --region europe-west1 \ 65 | --zone europe-west1-d 66 | ``` 67 | 68 | This can be monitored via [Google Dataflow](https://console.cloud.google.com/dataflow). Note that the "wall time" displayed is not the [usual definition](https://en.wikipedia.org/wiki/Elapsed_real_time) but is accumulated per thread and worker. 69 | 70 | In total it takes approximately 4 hours. 71 | 72 | ![image](https://user-images.githubusercontent.com/148221/35000427-4e11b818-fadc-11e7-9c2f-08a68eaed37e.png) 73 | 74 | ![image](https://user-images.githubusercontent.com/148221/35000458-6108bb24-fadc-11e7-8a84-452f7b3816f6.png) 75 | 76 | ## Steps to load the JSON dumps into Elasticsearch 77 | 78 | The directory gcp contains the infrastructure scripts to generate the Elasticsearch cluster. 79 | 80 | * Create a virtual environment to manage dependencies in 81 | ```sh 82 | virtualenv venv_elasticsearch --python=python2 83 | source venv_elasticsearch/bin/activate 84 | pip install -r venv_elasticsearch.txt 85 | ``` 86 | * Run the job to load the JSONs into Elasticsearch 87 | 88 | WARNING: the loading scripts currently take a long time, particularly the concept one (24h+). It is good to use `screen`, `tmux`, or similar, so the job keeps running after a disconnect and can be recovered. 89 | 90 | ```sh 91 | python load2es.py publication bioentity taggedtext concept --es http://es:9200 92 | ``` 93 | 94 | Note: Elasticsearch must have the International Components for Unicode support plugin installed, i.e. `/usr/share/elasticsearch/bin/elasticsearch-plugin -s install analysis-icu` 95 | 96 | * Increase the Elasticsearch capacity for the adjacency matrix aggregation (used by the LINK tool) 97 | ```sh 98 | curl -XPUT 'http://myesnode1:9200/pubmed-18-concept/_settings' -H 'Content-Type: application/json' -d' 99 | { 100 | "index" : { 101 | "max_adjacency_matrix_filters" : 500 102 | } 103 | }' 104 | ``` 105 | 106 | ## Google Cloud Platform 107 | 108 | When controlling this process from a Google Cloud machine, make sure it has sufficient scopes enabled.
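As a minimal sketch (not part of this repo) of what checking and granting scopes can look like with the `gcloud` CLI, assuming a hypothetical controller VM named `my-controller` in the same zone used above:

```sh
# Hypothetical example: inspect the scopes currently attached to the VM's service account
gcloud compute instances describe my-controller --zone europe-west1-d --format='yaml(serviceAccounts)'

# Hypothetical example: create a controller VM with the broad cloud-platform scope
gcloud compute instances create my-controller --zone europe-west1-d --scopes=cloud-platform
```

Scopes can only be set at instance creation time or while the instance is stopped, so it is easiest to pick a sufficiently broad scope (e.g. `cloud-platform`) when the controller VM is created.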
109 | -------------------------------------------------------------------------------- /clean_source_bucket.py: -------------------------------------------------------------------------------- 1 | 2 | basename = 'pubmed18' 3 | 4 | def delete_all_output(): 5 | from google.cloud import storage 6 | 7 | client = storage.Client(project='open-targets') 8 | bucket = client.get_bucket('medline-json') 9 | names = list(bucket.list_blobs()) 10 | for i, blob_ref in enumerate(names): 11 | # print blob_ref.name 12 | if blob_ref.name.endswith('.json.gz') and \ 13 | (blob_ref.name.startswith('parsed/'+basename) or 14 | blob_ref.name.startswith('analyzed/'+basename) or 15 | blob_ref.name.startswith('splitted/'+basename) or 16 | blob_ref.name.startswith('test/analyzed/'+basename) or 17 | blob_ref.name.startswith('test/splitted/'+basename)or 18 | blob_ref.name.startswith('test/parsed/'+basename)) : 19 | blob = bucket.get_blob(blob_ref.name) 20 | blob.delete() 21 | print 'deleted', i, blob_ref.name, 'of', len(names) 22 | 23 | if __name__ == '__main__': 24 | delete_all_output() -------------------------------------------------------------------------------- /es-mapping-index/README.md: -------------------------------------------------------------------------------- 1 | This directory contains the settings for the new infrastructure used by gcp/#todo 2 | -------------------------------------------------------------------------------- /es-mapping-index/concept.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 24, 4 | "number_of_replicas": 0, 5 | "refresh_interval": "-1", 6 | "translog.flush_threshold_size": "1000mb", 7 | "analysis": { 8 | "filter": { 9 | "english_stop": { 10 | "type": "stop", 11 | "stopwords": [ 12 | "'ll", 13 | "'ve", 14 | "0", 15 | "1", 16 | "10", 17 | "100", 18 | "11", 19 | "12", 20 | "13", 21 | "14", 22 | "15", 23 | "16", 24 | "17", 25 | "18", 26 | "19", 27 | "2", 28 | "20", 29 | "21", 30 | "22", 31 | "23", 32 | "24", 33 | "25", 34 | "26", 35 | "27", 36 | "28", 37 | "29", 38 | "3", 39 | "30", 40 | "31", 41 | "32", 42 | "33", 43 | "34", 44 | "35", 45 | "36", 46 | "37", 47 | "38", 48 | "39", 49 | "4", 50 | "40", 51 | "41", 52 | "42", 53 | "43", 54 | "44", 55 | "45", 56 | "46", 57 | "47", 58 | "48", 59 | "49", 60 | "5", 61 | "50", 62 | "51", 63 | "52", 64 | "53", 65 | "54", 66 | "55", 67 | "56", 68 | "57", 69 | "58", 70 | "59", 71 | "6", 72 | "60", 73 | "61", 74 | "62", 75 | "63", 76 | "64", 77 | "65", 78 | "66", 79 | "67", 80 | "68", 81 | "69", 82 | "7", 83 | "70", 84 | "71", 85 | "72", 86 | "73", 87 | "74", 88 | "75", 89 | "76", 90 | "77", 91 | "78", 92 | "79", 93 | "8", 94 | "80", 95 | "81", 96 | "82", 97 | "83", 98 | "84", 99 | "85", 100 | "86", 101 | "87", 102 | "88", 103 | "89", 104 | "9", 105 | "90", 106 | "91", 107 | "92", 108 | "93", 109 | "94", 110 | "95", 111 | "96", 112 | "97", 113 | "98", 114 | "99", 115 | "a", 116 | "able", 117 | "about", 118 | "above", 119 | "abst", 120 | "accordance", 121 | "according", 122 | "accordingly", 123 | "across", 124 | "act", 125 | "actually", 126 | "added", 127 | "adj", 128 | "affected", 129 | "affecting", 130 | "affects", 131 | "after", 132 | "afterwards", 133 | "again", 134 | "against", 135 | "ah", 136 | "all", 137 | "almost", 138 | "alone", 139 | "along", 140 | "already", 141 | "also", 142 | "although", 143 | "always", 144 | "am", 145 | "among", 146 | "amongst", 147 | "an", 148 | "and", 149 | "announce", 150 | "another", 151 | "any", 152 | "anybody", 153 | 
"anyhow", 154 | "anymore", 155 | "anyone", 156 | "anything", 157 | "anyway", 158 | "anyways", 159 | "anywhere", 160 | "apparently", 161 | "approximately", 162 | "are", 163 | "aren", 164 | "arent", 165 | "arise", 166 | "around", 167 | "as", 168 | "aside", 169 | "ask", 170 | "asking", 171 | "at", 172 | "auth", 173 | "available", 174 | "away", 175 | "awfully", 176 | "b", 177 | "back", 178 | "be", 179 | "became", 180 | "because", 181 | "become", 182 | "becomes", 183 | "becoming", 184 | "been", 185 | "before", 186 | "beforehand", 187 | "begin", 188 | "beginning", 189 | "beginnings", 190 | "begins", 191 | "behind", 192 | "being", 193 | "believe", 194 | "below", 195 | "beside", 196 | "besides", 197 | "between", 198 | "beyond", 199 | "biol", 200 | "both", 201 | "brief", 202 | "briefly", 203 | "but", 204 | "by", 205 | "c", 206 | "ca", 207 | "came", 208 | "can", 209 | "can't", 210 | "cannot", 211 | "cause", 212 | "causes", 213 | "certain", 214 | "certainly", 215 | "co", 216 | "com", 217 | "come", 218 | "comes", 219 | "contain", 220 | "containing", 221 | "contains", 222 | "could", 223 | "couldnt", 224 | "d", 225 | "date", 226 | "did", 227 | "didn't", 228 | "different", 229 | "do", 230 | "does", 231 | "doesn't", 232 | "doing", 233 | "don't", 234 | "done", 235 | "down", 236 | "downwards", 237 | "due", 238 | "during", 239 | "e", 240 | "each", 241 | "ed", 242 | "edu", 243 | "effect", 244 | "eg", 245 | "eight", 246 | "eighty", 247 | "either", 248 | "else", 249 | "elsewhere", 250 | "end", 251 | "ending", 252 | "enough", 253 | "especially", 254 | "et", 255 | "et-al", 256 | "etc", 257 | "even", 258 | "ever", 259 | "every", 260 | "everybody", 261 | "everyone", 262 | "everything", 263 | "everywhere", 264 | "ex", 265 | "except", 266 | "f", 267 | "far", 268 | "few", 269 | "ff", 270 | "fifth", 271 | "first", 272 | "five", 273 | "fix", 274 | "followed", 275 | "following", 276 | "follows", 277 | "for", 278 | "former", 279 | "formerly", 280 | "forth", 281 | "found", 282 | "four", 283 | "from", 284 | "further", 285 | "furthermore", 286 | "g", 287 | "gave", 288 | "get", 289 | "gets", 290 | "getting", 291 | "give", 292 | "given", 293 | "gives", 294 | "giving", 295 | "go", 296 | "goes", 297 | "gone", 298 | "got", 299 | "gotten", 300 | "h", 301 | "had", 302 | "happens", 303 | "hardly", 304 | "has", 305 | "hasn't", 306 | "have", 307 | "haven't", 308 | "having", 309 | "he", 310 | "hed", 311 | "hence", 312 | "her", 313 | "here", 314 | "hereafter", 315 | "hereby", 316 | "herein", 317 | "heres", 318 | "hereupon", 319 | "hers", 320 | "herself", 321 | "hes", 322 | "hi", 323 | "hid", 324 | "him", 325 | "himself", 326 | "his", 327 | "hither", 328 | "home", 329 | "how", 330 | "howbeit", 331 | "however", 332 | "hundred", 333 | "i", 334 | "i'll", 335 | "i've", 336 | "id", 337 | "ie", 338 | "if", 339 | "im", 340 | "immediate", 341 | "immediately", 342 | "importance", 343 | "important", 344 | "in", 345 | "inc", 346 | "indeed", 347 | "index", 348 | "information", 349 | "instead", 350 | "into", 351 | "invention", 352 | "inward", 353 | "is", 354 | "isn't", 355 | "it", 356 | "it'll", 357 | "itd", 358 | "its", 359 | "itself", 360 | "j", 361 | "just", 362 | "k", 363 | "keep", 364 | "keeps", 365 | "kept", 366 | "kg", 367 | "km", 368 | "know", 369 | "known", 370 | "knows", 371 | "l", 372 | "largely", 373 | "last", 374 | "lately", 375 | "later", 376 | "latter", 377 | "latterly", 378 | "least", 379 | "less", 380 | "lest", 381 | "let", 382 | "lets", 383 | "like", 384 | "liked", 385 | "likely", 386 | "line", 387 | "little", 388 | "look", 389 | 
"looking", 390 | "looks", 391 | "ltd", 392 | "m", 393 | "made", 394 | "mainly", 395 | "make", 396 | "makes", 397 | "many", 398 | "may", 399 | "maybe", 400 | "me", 401 | "mean", 402 | "means", 403 | "meantime", 404 | "meanwhile", 405 | "merely", 406 | "mg", 407 | "might", 408 | "million", 409 | "miss", 410 | "ml", 411 | "more", 412 | "moreover", 413 | "most", 414 | "mostly", 415 | "mr", 416 | "mrs", 417 | "much", 418 | "mug", 419 | "must", 420 | "my", 421 | "myself", 422 | "n", 423 | "na", 424 | "name", 425 | "namely", 426 | "nay", 427 | "nd", 428 | "near", 429 | "nearly", 430 | "necessarily", 431 | "necessary", 432 | "need", 433 | "needs", 434 | "neither", 435 | "never", 436 | "nevertheless", 437 | "new", 438 | "next", 439 | "nine", 440 | "ninety", 441 | "no", 442 | "nobody", 443 | "non", 444 | "none", 445 | "nonetheless", 446 | "noone", 447 | "nor", 448 | "normally", 449 | "nos", 450 | "not", 451 | "noted", 452 | "nothing", 453 | "now", 454 | "nowhere", 455 | "o", 456 | "obtain", 457 | "obtained", 458 | "obviously", 459 | "of", 460 | "off", 461 | "often", 462 | "oh", 463 | "ok", 464 | "okay", 465 | "old", 466 | "omitted", 467 | "on", 468 | "once", 469 | "one", 470 | "ones", 471 | "only", 472 | "onto", 473 | "or", 474 | "ord", 475 | "other", 476 | "others", 477 | "otherwise", 478 | "ought", 479 | "our", 480 | "ours", 481 | "ourselves", 482 | "out", 483 | "outside", 484 | "over", 485 | "overall", 486 | "owing", 487 | "own", 488 | "p", 489 | "page", 490 | "pages", 491 | "part", 492 | "particular", 493 | "particularly", 494 | "past", 495 | "per", 496 | "perhaps", 497 | "placed", 498 | "please", 499 | "plus", 500 | "poorly", 501 | "possible", 502 | "possibly", 503 | "potentially", 504 | "pp", 505 | "predominantly", 506 | "present", 507 | "previously", 508 | "primarily", 509 | "probably", 510 | "promptly", 511 | "proud", 512 | "provides", 513 | "put", 514 | "q", 515 | "que", 516 | "quickly", 517 | "quite", 518 | "qv", 519 | "r", 520 | "ran", 521 | "rather", 522 | "rd", 523 | "re", 524 | "readily", 525 | "really", 526 | "recent", 527 | "recently", 528 | "ref", 529 | "refs", 530 | "regarding", 531 | "regardless", 532 | "regards", 533 | "related", 534 | "relatively", 535 | "research", 536 | "respectively", 537 | "resulted", 538 | "resulting", 539 | "results", 540 | "right", 541 | "run", 542 | "s", 543 | "said", 544 | "same", 545 | "saw", 546 | "say", 547 | "saying", 548 | "says", 549 | "sec", 550 | "section", 551 | "see", 552 | "seeing", 553 | "seem", 554 | "seemed", 555 | "seeming", 556 | "seems", 557 | "seen", 558 | "self", 559 | "selves", 560 | "sent", 561 | "seven", 562 | "several", 563 | "shall", 564 | "she", 565 | "she'll", 566 | "shed", 567 | "shes", 568 | "should", 569 | "shouldn't", 570 | "show", 571 | "showed", 572 | "shown", 573 | "showns", 574 | "shows", 575 | "significant", 576 | "significantly", 577 | "similar", 578 | "similarly", 579 | "since", 580 | "six", 581 | "slightly", 582 | "so", 583 | "some", 584 | "somebody", 585 | "somehow", 586 | "someone", 587 | "somethan", 588 | "something", 589 | "sometime", 590 | "sometimes", 591 | "somewhat", 592 | "somewhere", 593 | "soon", 594 | "sorry", 595 | "specifically", 596 | "specified", 597 | "specify", 598 | "specifying", 599 | "still", 600 | "stop", 601 | "strongly", 602 | "sub", 603 | "substantially", 604 | "successfully", 605 | "such", 606 | "sufficiently", 607 | "suggest", 608 | "sup", 609 | "sure", 610 | "t", 611 | "take", 612 | "taken", 613 | "taking", 614 | "tell", 615 | "tends", 616 | "th", 617 | "than", 618 | "thank", 619 | 
"thanks", 620 | "thanx", 621 | "that", 622 | "that'll", 623 | "that've", 624 | "thats", 625 | "the", 626 | "their", 627 | "theirs", 628 | "them", 629 | "themselves", 630 | "then", 631 | "thence", 632 | "there", 633 | "there'll", 634 | "there've", 635 | "thereafter", 636 | "thereby", 637 | "thered", 638 | "therefore", 639 | "therein", 640 | "thereof", 641 | "therere", 642 | "theres", 643 | "thereto", 644 | "thereupon", 645 | "these", 646 | "they", 647 | "they'll", 648 | "they've", 649 | "theyd", 650 | "theyre", 651 | "think", 652 | "this", 653 | "those", 654 | "thou", 655 | "though", 656 | "thoughh", 657 | "thousand", 658 | "throug", 659 | "through", 660 | "throughout", 661 | "thru", 662 | "thus", 663 | "til", 664 | "tip", 665 | "to", 666 | "together", 667 | "too", 668 | "took", 669 | "toward", 670 | "towards", 671 | "tried", 672 | "tries", 673 | "truly", 674 | "try", 675 | "trying", 676 | "ts", 677 | "twice", 678 | "two", 679 | "u", 680 | "un", 681 | "under", 682 | "unfortunately", 683 | "unless", 684 | "unlike", 685 | "unlikely", 686 | "until", 687 | "unto", 688 | "up", 689 | "upon", 690 | "ups", 691 | "us", 692 | "use", 693 | "used", 694 | "useful", 695 | "usefully", 696 | "usefulness", 697 | "uses", 698 | "using", 699 | "usually", 700 | "v", 701 | "value", 702 | "various", 703 | "very", 704 | "via", 705 | "viz", 706 | "vol", 707 | "vols", 708 | "vs", 709 | "w", 710 | "want", 711 | "wants", 712 | "was", 713 | "wasnt", 714 | "way", 715 | "we", 716 | "we'll", 717 | "we've", 718 | "wed", 719 | "welcome", 720 | "went", 721 | "were", 722 | "werent", 723 | "what", 724 | "what'll", 725 | "whatever", 726 | "whats", 727 | "when", 728 | "whence", 729 | "whenever", 730 | "where", 731 | "whereafter", 732 | "whereas", 733 | "whereby", 734 | "wherein", 735 | "wheres", 736 | "whereupon", 737 | "wherever", 738 | "whether", 739 | "which", 740 | "while", 741 | "whim", 742 | "whither", 743 | "who", 744 | "who'll", 745 | "whod", 746 | "whoever", 747 | "whole", 748 | "whom", 749 | "whomever", 750 | "whos", 751 | "whose", 752 | "why", 753 | "widely", 754 | "willing", 755 | "wish", 756 | "with", 757 | "within", 758 | "without", 759 | "wont", 760 | "words", 761 | "world", 762 | "would", 763 | "wouldnt", 764 | "www", 765 | "x", 766 | "y", 767 | "yes", 768 | "yet", 769 | "you", 770 | "you'll", 771 | "you've", 772 | "youd", 773 | "your", 774 | "youre", 775 | "yours", 776 | "yourself", 777 | "yourselves", 778 | "z", 779 | "zero" 780 | ] 781 | }, 782 | "english_stemmer": { 783 | "type": "stemmer", 784 | "language": "english" 785 | }, 786 | "english_light_stemmer": { 787 | "type": "stemmer", 788 | "language": "light_english" 789 | }, 790 | "english_minimal_stemmer": { 791 | "type": "stemmer", 792 | "language": "minimal_english" 793 | }, 794 | "english_possessive_stemmer": { 795 | "type": "stemmer", 796 | "language": "possessive_english" 797 | } 798 | }, 799 | "analyzer": { 800 | "english": { 801 | "tokenizer": "classic", 802 | "filter": [ 803 | "icu_normalizer", 804 | "english_possessive_stemmer", 805 | "english_stop", 806 | "english_minimal_stemmer" 807 | ] 808 | } 809 | } 810 | } 811 | }, 812 | "mappings": { 813 | "_default_": { 814 | "_all": { 815 | "enabled": true 816 | }, 817 | "dynamic_templates": [ 818 | { 819 | "string_fields": { 820 | "mapping": { 821 | "type": "keyword", 822 | "ignore_above": 256 823 | }, 824 | "match": "*", 825 | "match_mapping_type": "string" 826 | } 827 | } 828 | ] 829 | }, 830 | "concept": { 831 | "_all": { 832 | "enabled": false 833 | }, 834 | "dynamic_templates": [ 835 | { 836 | 
"string_fields": { 837 | "mapping": { 838 | "type": "keyword", 839 | "ignore_above": 256 840 | }, 841 | "match": "*", 842 | "match_mapping_type": "string" 843 | } 844 | } 845 | ], 846 | "properties": { 847 | "concept": { 848 | "type": "object", 849 | "properties": { 850 | "sentence_text": { 851 | "type": "text", 852 | "analyzer": "english", 853 | "eager_global_ordinals": true, 854 | "fielddata": true 855 | }, 856 | "verb_subtree": { 857 | "type": "text", 858 | "analyzer": "english", 859 | "eager_global_ordinals": true, 860 | "fielddata": true 861 | }, 862 | "relations": { 863 | "type": "object", 864 | "properties": { 865 | "directed": { 866 | "type": "text", 867 | "analyzer": "whitespace", 868 | "eager_global_ordinals": true, 869 | "fielddata": true 870 | }, 871 | "undirected": { 872 | "type": "text", 873 | "analyzer": "whitespace", 874 | "eager_global_ordinals": true, 875 | "fielddata": true 876 | } 877 | } 878 | } 879 | } 880 | } 881 | } 882 | } 883 | } 884 | } 885 | -------------------------------------------------------------------------------- /es-mapping-index/publication.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 24, 4 | "number_of_replicas": 0, 5 | "refresh_interval": "-1", 6 | "translog.flush_threshold_size": "1000mb", 7 | "analysis": { 8 | "filter": { 9 | "english_stop": { 10 | "type": "stop", 11 | "stopwords": [ 12 | "'ll", 13 | "'ve", 14 | "0", 15 | "1", 16 | "10", 17 | "100", 18 | "11", 19 | "12", 20 | "13", 21 | "14", 22 | "15", 23 | "16", 24 | "17", 25 | "18", 26 | "19", 27 | "2", 28 | "20", 29 | "21", 30 | "22", 31 | "23", 32 | "24", 33 | "25", 34 | "26", 35 | "27", 36 | "28", 37 | "29", 38 | "3", 39 | "30", 40 | "31", 41 | "32", 42 | "33", 43 | "34", 44 | "35", 45 | "36", 46 | "37", 47 | "38", 48 | "39", 49 | "4", 50 | "40", 51 | "41", 52 | "42", 53 | "43", 54 | "44", 55 | "45", 56 | "46", 57 | "47", 58 | "48", 59 | "49", 60 | "5", 61 | "50", 62 | "51", 63 | "52", 64 | "53", 65 | "54", 66 | "55", 67 | "56", 68 | "57", 69 | "58", 70 | "59", 71 | "6", 72 | "60", 73 | "61", 74 | "62", 75 | "63", 76 | "64", 77 | "65", 78 | "66", 79 | "67", 80 | "68", 81 | "69", 82 | "7", 83 | "70", 84 | "71", 85 | "72", 86 | "73", 87 | "74", 88 | "75", 89 | "76", 90 | "77", 91 | "78", 92 | "79", 93 | "8", 94 | "80", 95 | "81", 96 | "82", 97 | "83", 98 | "84", 99 | "85", 100 | "86", 101 | "87", 102 | "88", 103 | "89", 104 | "9", 105 | "90", 106 | "91", 107 | "92", 108 | "93", 109 | "94", 110 | "95", 111 | "96", 112 | "97", 113 | "98", 114 | "99", 115 | "a", 116 | "able", 117 | "about", 118 | "above", 119 | "abst", 120 | "accordance", 121 | "according", 122 | "accordingly", 123 | "across", 124 | "act", 125 | "actually", 126 | "added", 127 | "adj", 128 | "affected", 129 | "affecting", 130 | "affects", 131 | "after", 132 | "afterwards", 133 | "again", 134 | "against", 135 | "ah", 136 | "all", 137 | "almost", 138 | "alone", 139 | "along", 140 | "already", 141 | "also", 142 | "although", 143 | "always", 144 | "am", 145 | "among", 146 | "amongst", 147 | "an", 148 | "and", 149 | "announce", 150 | "another", 151 | "any", 152 | "anybody", 153 | "anyhow", 154 | "anymore", 155 | "anyone", 156 | "anything", 157 | "anyway", 158 | "anyways", 159 | "anywhere", 160 | "apparently", 161 | "approximately", 162 | "are", 163 | "aren", 164 | "arent", 165 | "arise", 166 | "around", 167 | "as", 168 | "aside", 169 | "ask", 170 | "asking", 171 | "at", 172 | "auth", 173 | "available", 174 | "away", 175 | "awfully", 176 | "b", 177 
| "back", 178 | "be", 179 | "became", 180 | "because", 181 | "become", 182 | "becomes", 183 | "becoming", 184 | "been", 185 | "before", 186 | "beforehand", 187 | "begin", 188 | "beginning", 189 | "beginnings", 190 | "begins", 191 | "behind", 192 | "being", 193 | "believe", 194 | "below", 195 | "beside", 196 | "besides", 197 | "between", 198 | "beyond", 199 | "biol", 200 | "both", 201 | "brief", 202 | "briefly", 203 | "but", 204 | "by", 205 | "c", 206 | "ca", 207 | "came", 208 | "can", 209 | "can't", 210 | "cannot", 211 | "cause", 212 | "causes", 213 | "certain", 214 | "certainly", 215 | "co", 216 | "com", 217 | "come", 218 | "comes", 219 | "contain", 220 | "containing", 221 | "contains", 222 | "could", 223 | "couldnt", 224 | "d", 225 | "date", 226 | "did", 227 | "didn't", 228 | "different", 229 | "do", 230 | "does", 231 | "doesn't", 232 | "doing", 233 | "don't", 234 | "done", 235 | "down", 236 | "downwards", 237 | "due", 238 | "during", 239 | "e", 240 | "each", 241 | "ed", 242 | "edu", 243 | "effect", 244 | "eg", 245 | "eight", 246 | "eighty", 247 | "either", 248 | "else", 249 | "elsewhere", 250 | "end", 251 | "ending", 252 | "enough", 253 | "especially", 254 | "et", 255 | "et-al", 256 | "etc", 257 | "even", 258 | "ever", 259 | "every", 260 | "everybody", 261 | "everyone", 262 | "everything", 263 | "everywhere", 264 | "ex", 265 | "except", 266 | "f", 267 | "far", 268 | "few", 269 | "ff", 270 | "fifth", 271 | "first", 272 | "five", 273 | "fix", 274 | "followed", 275 | "following", 276 | "follows", 277 | "for", 278 | "former", 279 | "formerly", 280 | "forth", 281 | "found", 282 | "four", 283 | "from", 284 | "further", 285 | "furthermore", 286 | "g", 287 | "gave", 288 | "get", 289 | "gets", 290 | "getting", 291 | "give", 292 | "given", 293 | "gives", 294 | "giving", 295 | "go", 296 | "goes", 297 | "gone", 298 | "got", 299 | "gotten", 300 | "h", 301 | "had", 302 | "happens", 303 | "hardly", 304 | "has", 305 | "hasn't", 306 | "have", 307 | "haven't", 308 | "having", 309 | "he", 310 | "hed", 311 | "hence", 312 | "her", 313 | "here", 314 | "hereafter", 315 | "hereby", 316 | "herein", 317 | "heres", 318 | "hereupon", 319 | "hers", 320 | "herself", 321 | "hes", 322 | "hi", 323 | "hid", 324 | "him", 325 | "himself", 326 | "his", 327 | "hither", 328 | "home", 329 | "how", 330 | "howbeit", 331 | "however", 332 | "hundred", 333 | "i", 334 | "i'll", 335 | "i've", 336 | "id", 337 | "ie", 338 | "if", 339 | "im", 340 | "immediate", 341 | "immediately", 342 | "importance", 343 | "important", 344 | "in", 345 | "inc", 346 | "indeed", 347 | "index", 348 | "information", 349 | "instead", 350 | "into", 351 | "invention", 352 | "inward", 353 | "is", 354 | "isn't", 355 | "it", 356 | "it'll", 357 | "itd", 358 | "its", 359 | "itself", 360 | "j", 361 | "just", 362 | "k", 363 | "keep", 364 | "keeps", 365 | "kept", 366 | "kg", 367 | "km", 368 | "know", 369 | "known", 370 | "knows", 371 | "l", 372 | "largely", 373 | "last", 374 | "lately", 375 | "later", 376 | "latter", 377 | "latterly", 378 | "least", 379 | "less", 380 | "lest", 381 | "let", 382 | "lets", 383 | "like", 384 | "liked", 385 | "likely", 386 | "line", 387 | "little", 388 | "look", 389 | "looking", 390 | "looks", 391 | "ltd", 392 | "m", 393 | "made", 394 | "mainly", 395 | "make", 396 | "makes", 397 | "many", 398 | "may", 399 | "maybe", 400 | "me", 401 | "mean", 402 | "means", 403 | "meantime", 404 | "meanwhile", 405 | "merely", 406 | "mg", 407 | "might", 408 | "million", 409 | "miss", 410 | "ml", 411 | "more", 412 | "moreover", 413 | "most", 414 | "mostly", 
415 | "mr", 416 | "mrs", 417 | "much", 418 | "mug", 419 | "must", 420 | "my", 421 | "myself", 422 | "n", 423 | "na", 424 | "name", 425 | "namely", 426 | "nay", 427 | "nd", 428 | "near", 429 | "nearly", 430 | "necessarily", 431 | "necessary", 432 | "need", 433 | "needs", 434 | "neither", 435 | "never", 436 | "nevertheless", 437 | "new", 438 | "next", 439 | "nine", 440 | "ninety", 441 | "no", 442 | "nobody", 443 | "non", 444 | "none", 445 | "nonetheless", 446 | "noone", 447 | "nor", 448 | "normally", 449 | "nos", 450 | "not", 451 | "noted", 452 | "nothing", 453 | "now", 454 | "nowhere", 455 | "o", 456 | "obtain", 457 | "obtained", 458 | "obviously", 459 | "of", 460 | "off", 461 | "often", 462 | "oh", 463 | "ok", 464 | "okay", 465 | "old", 466 | "omitted", 467 | "on", 468 | "once", 469 | "one", 470 | "ones", 471 | "only", 472 | "onto", 473 | "or", 474 | "ord", 475 | "other", 476 | "others", 477 | "otherwise", 478 | "ought", 479 | "our", 480 | "ours", 481 | "ourselves", 482 | "out", 483 | "outside", 484 | "over", 485 | "overall", 486 | "owing", 487 | "own", 488 | "p", 489 | "page", 490 | "pages", 491 | "part", 492 | "particular", 493 | "particularly", 494 | "past", 495 | "per", 496 | "perhaps", 497 | "placed", 498 | "please", 499 | "plus", 500 | "poorly", 501 | "possible", 502 | "possibly", 503 | "potentially", 504 | "pp", 505 | "predominantly", 506 | "present", 507 | "previously", 508 | "primarily", 509 | "probably", 510 | "promptly", 511 | "proud", 512 | "provides", 513 | "put", 514 | "q", 515 | "que", 516 | "quickly", 517 | "quite", 518 | "qv", 519 | "r", 520 | "ran", 521 | "rather", 522 | "rd", 523 | "re", 524 | "readily", 525 | "really", 526 | "recent", 527 | "recently", 528 | "ref", 529 | "refs", 530 | "regarding", 531 | "regardless", 532 | "regards", 533 | "related", 534 | "relatively", 535 | "research", 536 | "respectively", 537 | "resulted", 538 | "resulting", 539 | "results", 540 | "right", 541 | "run", 542 | "s", 543 | "said", 544 | "same", 545 | "saw", 546 | "say", 547 | "saying", 548 | "says", 549 | "sec", 550 | "section", 551 | "see", 552 | "seeing", 553 | "seem", 554 | "seemed", 555 | "seeming", 556 | "seems", 557 | "seen", 558 | "self", 559 | "selves", 560 | "sent", 561 | "seven", 562 | "several", 563 | "shall", 564 | "she", 565 | "she'll", 566 | "shed", 567 | "shes", 568 | "should", 569 | "shouldn't", 570 | "show", 571 | "showed", 572 | "shown", 573 | "showns", 574 | "shows", 575 | "significant", 576 | "significantly", 577 | "similar", 578 | "similarly", 579 | "since", 580 | "six", 581 | "slightly", 582 | "so", 583 | "some", 584 | "somebody", 585 | "somehow", 586 | "someone", 587 | "somethan", 588 | "something", 589 | "sometime", 590 | "sometimes", 591 | "somewhat", 592 | "somewhere", 593 | "soon", 594 | "sorry", 595 | "specifically", 596 | "specified", 597 | "specify", 598 | "specifying", 599 | "still", 600 | "stop", 601 | "strongly", 602 | "sub", 603 | "substantially", 604 | "successfully", 605 | "such", 606 | "sufficiently", 607 | "suggest", 608 | "sup", 609 | "sure", 610 | "t", 611 | "take", 612 | "taken", 613 | "taking", 614 | "tell", 615 | "tends", 616 | "th", 617 | "than", 618 | "thank", 619 | "thanks", 620 | "thanx", 621 | "that", 622 | "that'll", 623 | "that've", 624 | "thats", 625 | "the", 626 | "their", 627 | "theirs", 628 | "them", 629 | "themselves", 630 | "then", 631 | "thence", 632 | "there", 633 | "there'll", 634 | "there've", 635 | "thereafter", 636 | "thereby", 637 | "thered", 638 | "therefore", 639 | "therein", 640 | "thereof", 641 | "therere", 642 | 
"theres", 643 | "thereto", 644 | "thereupon", 645 | "these", 646 | "they", 647 | "they'll", 648 | "they've", 649 | "theyd", 650 | "theyre", 651 | "think", 652 | "this", 653 | "those", 654 | "thou", 655 | "though", 656 | "thoughh", 657 | "thousand", 658 | "throug", 659 | "through", 660 | "throughout", 661 | "thru", 662 | "thus", 663 | "til", 664 | "tip", 665 | "to", 666 | "together", 667 | "too", 668 | "took", 669 | "toward", 670 | "towards", 671 | "tried", 672 | "tries", 673 | "truly", 674 | "try", 675 | "trying", 676 | "ts", 677 | "twice", 678 | "two", 679 | "u", 680 | "un", 681 | "under", 682 | "unfortunately", 683 | "unless", 684 | "unlike", 685 | "unlikely", 686 | "until", 687 | "unto", 688 | "up", 689 | "upon", 690 | "ups", 691 | "us", 692 | "use", 693 | "used", 694 | "useful", 695 | "usefully", 696 | "usefulness", 697 | "uses", 698 | "using", 699 | "usually", 700 | "v", 701 | "value", 702 | "various", 703 | "very", 704 | "via", 705 | "viz", 706 | "vol", 707 | "vols", 708 | "vs", 709 | "w", 710 | "want", 711 | "wants", 712 | "was", 713 | "wasnt", 714 | "way", 715 | "we", 716 | "we'll", 717 | "we've", 718 | "wed", 719 | "welcome", 720 | "went", 721 | "were", 722 | "werent", 723 | "what", 724 | "what'll", 725 | "whatever", 726 | "whats", 727 | "when", 728 | "whence", 729 | "whenever", 730 | "where", 731 | "whereafter", 732 | "whereas", 733 | "whereby", 734 | "wherein", 735 | "wheres", 736 | "whereupon", 737 | "wherever", 738 | "whether", 739 | "which", 740 | "while", 741 | "whim", 742 | "whither", 743 | "who", 744 | "who'll", 745 | "whod", 746 | "whoever", 747 | "whole", 748 | "whom", 749 | "whomever", 750 | "whos", 751 | "whose", 752 | "why", 753 | "widely", 754 | "willing", 755 | "wish", 756 | "with", 757 | "within", 758 | "without", 759 | "wont", 760 | "words", 761 | "world", 762 | "would", 763 | "wouldnt", 764 | "www", 765 | "x", 766 | "y", 767 | "yes", 768 | "yet", 769 | "you", 770 | "you'll", 771 | "you've", 772 | "youd", 773 | "your", 774 | "youre", 775 | "yours", 776 | "yourself", 777 | "yourselves", 778 | "z", 779 | "zero" 780 | ] 781 | }, 782 | "english_stemmer": { 783 | "type": "stemmer", 784 | "language": "english" 785 | }, 786 | "english_light_stemmer": { 787 | "type": "stemmer", 788 | "language": "light_english" 789 | }, 790 | "english_minimal_stemmer": { 791 | "type": "stemmer", 792 | "language": "minimal_english" 793 | }, 794 | "english_possessive_stemmer": { 795 | "type": "stemmer", 796 | "language": "possessive_english" 797 | } 798 | }, 799 | "analyzer": { 800 | "english": { 801 | "tokenizer": "classic", 802 | "filter": [ 803 | "icu_normalizer", 804 | "english_possessive_stemmer", 805 | "english_stop", 806 | "english_minimal_stemmer" 807 | ] 808 | } 809 | } 810 | } 811 | }, 812 | "mappings": { 813 | "_default_": { 814 | "_all": { 815 | "enabled": true 816 | }, 817 | "dynamic_templates": [ 818 | { 819 | "string_fields": { 820 | "mapping": { 821 | "type": "keyword", 822 | "ignore_above": 256 823 | }, 824 | "match": "*", 825 | "match_mapping_type": "string" 826 | } 827 | } 828 | ] 829 | }, 830 | "publication": { 831 | "_all": { 832 | "enabled": true 833 | }, 834 | "dynamic_templates": [ 835 | { 836 | "string_fields": { 837 | "mapping": { 838 | "type": "keyword", 839 | "ignore_above": 256 840 | }, 841 | "match": "*", 842 | "match_mapping_type": "string" 843 | } 844 | } 845 | ], 846 | "properties": { 847 | "abstract": { 848 | "type": "text", 849 | "analyzer": "english", 850 | "eager_global_ordinals": true, 851 | "fielddata": true 852 | }, 853 | "abstract_sentences": { 854 | 
"type": "nested", 855 | "properties": { 856 | "value": { 857 | "type": "text", 858 | "analyzer": "english", 859 | "eager_global_ordinals": true, 860 | "fielddata": true 861 | } 862 | } 863 | }, 864 | "authors": { 865 | "properties": { 866 | "CollectiveName": { 867 | "type": "keyword", 868 | "ignore_above": 256 869 | }, 870 | "ForeName": { 871 | "type": "keyword", 872 | "ignore_above": 256 873 | }, 874 | "Identifier": { 875 | "type": "keyword", 876 | "ignore_above": 256 877 | }, 878 | "Initials": { 879 | "type": "keyword", 880 | "ignore_above": 256 881 | }, 882 | "LastName": { 883 | "type": "keyword", 884 | "ignore_above": 256 885 | }, 886 | "Suffix": { 887 | "type": "keyword", 888 | "ignore_above": 256 889 | }, 890 | "full_name": { 891 | "type": "keyword", 892 | "ignore_above": 256 893 | }, 894 | "short_name": { 895 | "type": "keyword", 896 | "ignore_above": 256 897 | }, 898 | "last_name": { 899 | "type": "keyword", 900 | "ignore_above": 256 901 | } 902 | } 903 | }, 904 | "chemicals": { 905 | "properties": { 906 | "name": { 907 | "type": "keyword", 908 | "ignore_above": 256 909 | }, 910 | "name_id": { 911 | "type": "keyword", 912 | "ignore_above": 256 913 | }, 914 | "registryNumber": { 915 | "type": "keyword", 916 | "ignore_above": 256 917 | } 918 | } 919 | }, 920 | "data_release": { 921 | "type": "keyword", 922 | "ignore_above": 256 923 | }, 924 | "date": { 925 | "type": "date", 926 | "format": "strict_date_optional_time||epoch_millis" 927 | }, 928 | "date_of_revision": { 929 | "type": "date", 930 | "format": "strict_date_optional_time||epoch_millis" 931 | }, 932 | "doi": { 933 | "type": "keyword", 934 | "ignore_above": 256 935 | }, 936 | "filename": { 937 | "type": "keyword", 938 | "ignore_above": 256 939 | }, 940 | "full_text": { 941 | "type": "text", 942 | "analyzer": "english", 943 | "eager_global_ordinals": true, 944 | "fielddata": true 945 | }, 946 | "journal": { 947 | "properties": { 948 | "medlineAbbreviation": { 949 | "type": "keyword", 950 | "ignore_above": 256 951 | }, 952 | "title": { 953 | "type": "keyword", 954 | "ignore_above": 256 955 | } 956 | } 957 | }, 958 | "journal_reference": { 959 | "properties": { 960 | "issue": { 961 | "type": "keyword", 962 | "ignore_above": 256 963 | }, 964 | "pgn": { 965 | "type": "keyword", 966 | "ignore_above": 256 967 | }, 968 | "volume": { 969 | "type": "keyword", 970 | "ignore_above": 256 971 | } 972 | } 973 | }, 974 | "keywords": { 975 | "type": "keyword", 976 | "ignore_above": 256 977 | }, 978 | "mesh_headings": { 979 | "properties": { 980 | "id": { 981 | "type": "keyword", 982 | "ignore_above": 256 983 | }, 984 | "label": { 985 | "type": "keyword", 986 | "ignore_above": 256 987 | } 988 | } 989 | }, 990 | "pub_date": { 991 | "type": "date", 992 | "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis" 993 | }, 994 | "pub_id": { 995 | "type": "keyword", 996 | "ignore_above": 256 997 | }, 998 | "pub_type": { 999 | "type": "keyword", 1000 | "ignore_above": 256 1001 | }, 1002 | "title": { 1003 | "type": "text", 1004 | "analyzer": "english", 1005 | "eager_global_ordinals": true, 1006 | "fielddata": true 1007 | }, 1008 | "text_mined_entities": { 1009 | "type": "object", 1010 | "properties": { 1011 | "nlp": { 1012 | "type": "object", 1013 | "properties": { 1014 | "tagged_text": { 1015 | "type": "string", 1016 | "index": "no" 1017 | }, 1018 | "embedding_text": { 1019 | "type": "object", 1020 | "properties": { 1021 | "ent_tag": { 1022 | "type": "string", 1023 | "index": "no" 1024 | }, 1025 | "plain": { 1026 | "type": "string", 1027 | "index": 
"no" 1028 | }, 1029 | "pos_tag": { 1030 | "type": "string", 1031 | "index": "no" 1032 | } 1033 | } 1034 | } 1035 | } 1036 | } 1037 | } 1038 | } 1039 | } 1040 | } 1041 | } 1042 | } 1043 | -------------------------------------------------------------------------------- /es-mapping-index/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 24, 4 | "refresh_interval": "-1", 5 | "translog.flush_threshold_size": "1000mb", 6 | "number_of_replicas": 0 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /es-mapping/README.md: -------------------------------------------------------------------------------- 1 | # Important 2 | 3 | This directory and the relative files are used by the script load2es.py 4 | -------------------------------------------------------------------------------- /es-mapping/concept.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 148, 4 | "number_of_replicas": 0, 5 | "analysis": { 6 | "filter": { 7 | "english_stop": { 8 | "type": "stop", 9 | "stopwords": [ 10 | "'ll", 11 | "'ve", 12 | "0", 13 | "1", 14 | "10", 15 | "100", 16 | "11", 17 | "12", 18 | "13", 19 | "14", 20 | "15", 21 | "16", 22 | "17", 23 | "18", 24 | "19", 25 | "2", 26 | "20", 27 | "21", 28 | "22", 29 | "23", 30 | "24", 31 | "25", 32 | "26", 33 | "27", 34 | "28", 35 | "29", 36 | "3", 37 | "30", 38 | "31", 39 | "32", 40 | "33", 41 | "34", 42 | "35", 43 | "36", 44 | "37", 45 | "38", 46 | "39", 47 | "4", 48 | "40", 49 | "41", 50 | "42", 51 | "43", 52 | "44", 53 | "45", 54 | "46", 55 | "47", 56 | "48", 57 | "49", 58 | "5", 59 | "50", 60 | "51", 61 | "52", 62 | "53", 63 | "54", 64 | "55", 65 | "56", 66 | "57", 67 | "58", 68 | "59", 69 | "6", 70 | "60", 71 | "61", 72 | "62", 73 | "63", 74 | "64", 75 | "65", 76 | "66", 77 | "67", 78 | "68", 79 | "69", 80 | "7", 81 | "70", 82 | "71", 83 | "72", 84 | "73", 85 | "74", 86 | "75", 87 | "76", 88 | "77", 89 | "78", 90 | "79", 91 | "8", 92 | "80", 93 | "81", 94 | "82", 95 | "83", 96 | "84", 97 | "85", 98 | "86", 99 | "87", 100 | "88", 101 | "89", 102 | "9", 103 | "90", 104 | "91", 105 | "92", 106 | "93", 107 | "94", 108 | "95", 109 | "96", 110 | "97", 111 | "98", 112 | "99", 113 | "a", 114 | "able", 115 | "about", 116 | "above", 117 | "abst", 118 | "accordance", 119 | "according", 120 | "accordingly", 121 | "across", 122 | "act", 123 | "actually", 124 | "added", 125 | "adj", 126 | "affected", 127 | "affecting", 128 | "affects", 129 | "after", 130 | "afterwards", 131 | "again", 132 | "against", 133 | "ah", 134 | "all", 135 | "almost", 136 | "alone", 137 | "along", 138 | "already", 139 | "also", 140 | "although", 141 | "always", 142 | "am", 143 | "among", 144 | "amongst", 145 | "an", 146 | "and", 147 | "announce", 148 | "another", 149 | "any", 150 | "anybody", 151 | "anyhow", 152 | "anymore", 153 | "anyone", 154 | "anything", 155 | "anyway", 156 | "anyways", 157 | "anywhere", 158 | "apparently", 159 | "approximately", 160 | "are", 161 | "aren", 162 | "arent", 163 | "arise", 164 | "around", 165 | "as", 166 | "aside", 167 | "ask", 168 | "asking", 169 | "at", 170 | "auth", 171 | "available", 172 | "away", 173 | "awfully", 174 | "b", 175 | "back", 176 | "be", 177 | "became", 178 | "because", 179 | "become", 180 | "becomes", 181 | "becoming", 182 | "been", 183 | "before", 184 | "beforehand", 185 | "begin", 186 | "beginning", 187 | "beginnings", 188 | "begins", 189 | 
"behind", 190 | "being", 191 | "believe", 192 | "below", 193 | "beside", 194 | "besides", 195 | "between", 196 | "beyond", 197 | "biol", 198 | "both", 199 | "brief", 200 | "briefly", 201 | "but", 202 | "by", 203 | "c", 204 | "ca", 205 | "came", 206 | "can", 207 | "can't", 208 | "cannot", 209 | "cause", 210 | "causes", 211 | "certain", 212 | "certainly", 213 | "co", 214 | "com", 215 | "come", 216 | "comes", 217 | "contain", 218 | "containing", 219 | "contains", 220 | "could", 221 | "couldnt", 222 | "d", 223 | "date", 224 | "did", 225 | "didn't", 226 | "different", 227 | "do", 228 | "does", 229 | "doesn't", 230 | "doing", 231 | "don't", 232 | "done", 233 | "down", 234 | "downwards", 235 | "due", 236 | "during", 237 | "e", 238 | "each", 239 | "ed", 240 | "edu", 241 | "effect", 242 | "eg", 243 | "eight", 244 | "eighty", 245 | "either", 246 | "else", 247 | "elsewhere", 248 | "end", 249 | "ending", 250 | "enough", 251 | "especially", 252 | "et", 253 | "et-al", 254 | "etc", 255 | "even", 256 | "ever", 257 | "every", 258 | "everybody", 259 | "everyone", 260 | "everything", 261 | "everywhere", 262 | "ex", 263 | "except", 264 | "f", 265 | "far", 266 | "few", 267 | "ff", 268 | "fifth", 269 | "first", 270 | "five", 271 | "fix", 272 | "followed", 273 | "following", 274 | "follows", 275 | "for", 276 | "former", 277 | "formerly", 278 | "forth", 279 | "found", 280 | "four", 281 | "from", 282 | "further", 283 | "furthermore", 284 | "g", 285 | "gave", 286 | "get", 287 | "gets", 288 | "getting", 289 | "give", 290 | "given", 291 | "gives", 292 | "giving", 293 | "go", 294 | "goes", 295 | "gone", 296 | "got", 297 | "gotten", 298 | "h", 299 | "had", 300 | "happens", 301 | "hardly", 302 | "has", 303 | "hasn't", 304 | "have", 305 | "haven't", 306 | "having", 307 | "he", 308 | "hed", 309 | "hence", 310 | "her", 311 | "here", 312 | "hereafter", 313 | "hereby", 314 | "herein", 315 | "heres", 316 | "hereupon", 317 | "hers", 318 | "herself", 319 | "hes", 320 | "hi", 321 | "hid", 322 | "him", 323 | "himself", 324 | "his", 325 | "hither", 326 | "home", 327 | "how", 328 | "howbeit", 329 | "however", 330 | "hundred", 331 | "i", 332 | "i'll", 333 | "i've", 334 | "id", 335 | "ie", 336 | "if", 337 | "im", 338 | "immediate", 339 | "immediately", 340 | "importance", 341 | "important", 342 | "in", 343 | "inc", 344 | "indeed", 345 | "index", 346 | "information", 347 | "instead", 348 | "into", 349 | "invention", 350 | "inward", 351 | "is", 352 | "isn't", 353 | "it", 354 | "it'll", 355 | "itd", 356 | "its", 357 | "itself", 358 | "j", 359 | "just", 360 | "k", 361 | "keep", 362 | "keeps", 363 | "kept", 364 | "kg", 365 | "km", 366 | "know", 367 | "known", 368 | "knows", 369 | "l", 370 | "largely", 371 | "last", 372 | "lately", 373 | "later", 374 | "latter", 375 | "latterly", 376 | "least", 377 | "less", 378 | "lest", 379 | "let", 380 | "lets", 381 | "like", 382 | "liked", 383 | "likely", 384 | "line", 385 | "little", 386 | "look", 387 | "looking", 388 | "looks", 389 | "ltd", 390 | "m", 391 | "made", 392 | "mainly", 393 | "make", 394 | "makes", 395 | "many", 396 | "may", 397 | "maybe", 398 | "me", 399 | "mean", 400 | "means", 401 | "meantime", 402 | "meanwhile", 403 | "merely", 404 | "mg", 405 | "might", 406 | "million", 407 | "miss", 408 | "ml", 409 | "more", 410 | "moreover", 411 | "most", 412 | "mostly", 413 | "mr", 414 | "mrs", 415 | "much", 416 | "mug", 417 | "must", 418 | "my", 419 | "myself", 420 | "n", 421 | "na", 422 | "name", 423 | "namely", 424 | "nay", 425 | "nd", 426 | "near", 427 | "nearly", 428 | "necessarily", 429 | 
"necessary", 430 | "need", 431 | "needs", 432 | "neither", 433 | "never", 434 | "nevertheless", 435 | "new", 436 | "next", 437 | "nine", 438 | "ninety", 439 | "no", 440 | "nobody", 441 | "non", 442 | "none", 443 | "nonetheless", 444 | "noone", 445 | "nor", 446 | "normally", 447 | "nos", 448 | "not", 449 | "noted", 450 | "nothing", 451 | "now", 452 | "nowhere", 453 | "o", 454 | "obtain", 455 | "obtained", 456 | "obviously", 457 | "of", 458 | "off", 459 | "often", 460 | "oh", 461 | "ok", 462 | "okay", 463 | "old", 464 | "omitted", 465 | "on", 466 | "once", 467 | "one", 468 | "ones", 469 | "only", 470 | "onto", 471 | "or", 472 | "ord", 473 | "other", 474 | "others", 475 | "otherwise", 476 | "ought", 477 | "our", 478 | "ours", 479 | "ourselves", 480 | "out", 481 | "outside", 482 | "over", 483 | "overall", 484 | "owing", 485 | "own", 486 | "p", 487 | "page", 488 | "pages", 489 | "part", 490 | "particular", 491 | "particularly", 492 | "past", 493 | "per", 494 | "perhaps", 495 | "placed", 496 | "please", 497 | "plus", 498 | "poorly", 499 | "possible", 500 | "possibly", 501 | "potentially", 502 | "pp", 503 | "predominantly", 504 | "present", 505 | "previously", 506 | "primarily", 507 | "probably", 508 | "promptly", 509 | "proud", 510 | "provides", 511 | "put", 512 | "q", 513 | "que", 514 | "quickly", 515 | "quite", 516 | "qv", 517 | "r", 518 | "ran", 519 | "rather", 520 | "rd", 521 | "re", 522 | "readily", 523 | "really", 524 | "recent", 525 | "recently", 526 | "ref", 527 | "refs", 528 | "regarding", 529 | "regardless", 530 | "regards", 531 | "related", 532 | "relatively", 533 | "research", 534 | "respectively", 535 | "resulted", 536 | "resulting", 537 | "results", 538 | "right", 539 | "run", 540 | "s", 541 | "said", 542 | "same", 543 | "saw", 544 | "say", 545 | "saying", 546 | "says", 547 | "sec", 548 | "section", 549 | "see", 550 | "seeing", 551 | "seem", 552 | "seemed", 553 | "seeming", 554 | "seems", 555 | "seen", 556 | "self", 557 | "selves", 558 | "sent", 559 | "seven", 560 | "several", 561 | "shall", 562 | "she", 563 | "she'll", 564 | "shed", 565 | "shes", 566 | "should", 567 | "shouldn't", 568 | "show", 569 | "showed", 570 | "shown", 571 | "showns", 572 | "shows", 573 | "significant", 574 | "significantly", 575 | "similar", 576 | "similarly", 577 | "since", 578 | "six", 579 | "slightly", 580 | "so", 581 | "some", 582 | "somebody", 583 | "somehow", 584 | "someone", 585 | "somethan", 586 | "something", 587 | "sometime", 588 | "sometimes", 589 | "somewhat", 590 | "somewhere", 591 | "soon", 592 | "sorry", 593 | "specifically", 594 | "specified", 595 | "specify", 596 | "specifying", 597 | "still", 598 | "stop", 599 | "strongly", 600 | "sub", 601 | "substantially", 602 | "successfully", 603 | "such", 604 | "sufficiently", 605 | "suggest", 606 | "sup", 607 | "sure", 608 | "t", 609 | "take", 610 | "taken", 611 | "taking", 612 | "tell", 613 | "tends", 614 | "th", 615 | "than", 616 | "thank", 617 | "thanks", 618 | "thanx", 619 | "that", 620 | "that'll", 621 | "that've", 622 | "thats", 623 | "the", 624 | "their", 625 | "theirs", 626 | "them", 627 | "themselves", 628 | "then", 629 | "thence", 630 | "there", 631 | "there'll", 632 | "there've", 633 | "thereafter", 634 | "thereby", 635 | "thered", 636 | "therefore", 637 | "therein", 638 | "thereof", 639 | "therere", 640 | "theres", 641 | "thereto", 642 | "thereupon", 643 | "these", 644 | "they", 645 | "they'll", 646 | "they've", 647 | "theyd", 648 | "theyre", 649 | "think", 650 | "this", 651 | "those", 652 | "thou", 653 | "though", 654 | "thoughh", 655 | 
"thousand", 656 | "throug", 657 | "through", 658 | "throughout", 659 | "thru", 660 | "thus", 661 | "til", 662 | "tip", 663 | "to", 664 | "together", 665 | "too", 666 | "took", 667 | "toward", 668 | "towards", 669 | "tried", 670 | "tries", 671 | "truly", 672 | "try", 673 | "trying", 674 | "ts", 675 | "twice", 676 | "two", 677 | "u", 678 | "un", 679 | "under", 680 | "unfortunately", 681 | "unless", 682 | "unlike", 683 | "unlikely", 684 | "until", 685 | "unto", 686 | "up", 687 | "upon", 688 | "ups", 689 | "us", 690 | "use", 691 | "used", 692 | "useful", 693 | "usefully", 694 | "usefulness", 695 | "uses", 696 | "using", 697 | "usually", 698 | "v", 699 | "value", 700 | "various", 701 | "very", 702 | "via", 703 | "viz", 704 | "vol", 705 | "vols", 706 | "vs", 707 | "w", 708 | "want", 709 | "wants", 710 | "was", 711 | "wasnt", 712 | "way", 713 | "we", 714 | "we'll", 715 | "we've", 716 | "wed", 717 | "welcome", 718 | "went", 719 | "were", 720 | "werent", 721 | "what", 722 | "what'll", 723 | "whatever", 724 | "whats", 725 | "when", 726 | "whence", 727 | "whenever", 728 | "where", 729 | "whereafter", 730 | "whereas", 731 | "whereby", 732 | "wherein", 733 | "wheres", 734 | "whereupon", 735 | "wherever", 736 | "whether", 737 | "which", 738 | "while", 739 | "whim", 740 | "whither", 741 | "who", 742 | "who'll", 743 | "whod", 744 | "whoever", 745 | "whole", 746 | "whom", 747 | "whomever", 748 | "whos", 749 | "whose", 750 | "why", 751 | "widely", 752 | "willing", 753 | "wish", 754 | "with", 755 | "within", 756 | "without", 757 | "wont", 758 | "words", 759 | "world", 760 | "would", 761 | "wouldnt", 762 | "www", 763 | "x", 764 | "y", 765 | "yes", 766 | "yet", 767 | "you", 768 | "you'll", 769 | "you've", 770 | "youd", 771 | "your", 772 | "youre", 773 | "yours", 774 | "yourself", 775 | "yourselves", 776 | "z", 777 | "zero" 778 | ] 779 | }, 780 | "english_stemmer": { 781 | "type": "stemmer", 782 | "language": "english" 783 | }, 784 | "english_light_stemmer": { 785 | "type": "stemmer", 786 | "language": "light_english" 787 | }, 788 | "english_minimal_stemmer": { 789 | "type": "stemmer", 790 | "language": "minimal_english" 791 | }, 792 | "english_possessive_stemmer": { 793 | "type": "stemmer", 794 | "language": "possessive_english" 795 | } 796 | }, 797 | "analyzer": { 798 | "english": { 799 | "tokenizer": "classic", 800 | "filter": [ 801 | "icu_normalizer", 802 | "english_possessive_stemmer", 803 | "english_stop", 804 | "english_minimal_stemmer" 805 | ] 806 | } 807 | } 808 | } 809 | }, 810 | "mappings": { 811 | "_default_": { 812 | "_all": { 813 | "enabled": true 814 | }, 815 | "dynamic_templates": [ 816 | { 817 | "string_fields": { 818 | "mapping": { 819 | "type": "keyword", 820 | "ignore_above": 256 821 | }, 822 | "match": "*", 823 | "match_mapping_type": "string" 824 | } 825 | } 826 | ] 827 | }, 828 | "concept": { 829 | "_all": { 830 | "enabled": false 831 | }, 832 | "dynamic_templates": [ 833 | { 834 | "string_fields": { 835 | "mapping": { 836 | "type": "keyword", 837 | "ignore_above": 256 838 | }, 839 | "match": "*", 840 | "match_mapping_type": "string" 841 | } 842 | } 843 | ], 844 | "properties": { 845 | "concept": { 846 | "type": "object", 847 | "properties": { 848 | "sentence_text": { 849 | "type": "text", 850 | "analyzer": "english", 851 | "eager_global_ordinals": true, 852 | "fielddata": true 853 | }, 854 | "verb_subtree": { 855 | "type": "text", 856 | "analyzer": "english", 857 | "eager_global_ordinals": true, 858 | "fielddata": true 859 | }, 860 | "relations": { 861 | "type": "object", 862 | 
"properties": { 863 | "directed": { 864 | "type": "text", 865 | "analyzer": "whitespace", 866 | "eager_global_ordinals": true, 867 | "fielddata": true 868 | }, 869 | "undirected": { 870 | "type": "text", 871 | "analyzer": "whitespace", 872 | "eager_global_ordinals": true, 873 | "fielddata": true 874 | } 875 | } 876 | } 877 | } 878 | } 879 | } 880 | } 881 | } 882 | } 883 | -------------------------------------------------------------------------------- /es-mapping/publication.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 74, 4 | "number_of_replicas": 0, 5 | "analysis": { 6 | "filter": { 7 | "english_stop": { 8 | "type": "stop", 9 | "stopwords": [ 10 | "'ll", 11 | "'ve", 12 | "0", 13 | "1", 14 | "10", 15 | "100", 16 | "11", 17 | "12", 18 | "13", 19 | "14", 20 | "15", 21 | "16", 22 | "17", 23 | "18", 24 | "19", 25 | "2", 26 | "20", 27 | "21", 28 | "22", 29 | "23", 30 | "24", 31 | "25", 32 | "26", 33 | "27", 34 | "28", 35 | "29", 36 | "3", 37 | "30", 38 | "31", 39 | "32", 40 | "33", 41 | "34", 42 | "35", 43 | "36", 44 | "37", 45 | "38", 46 | "39", 47 | "4", 48 | "40", 49 | "41", 50 | "42", 51 | "43", 52 | "44", 53 | "45", 54 | "46", 55 | "47", 56 | "48", 57 | "49", 58 | "5", 59 | "50", 60 | "51", 61 | "52", 62 | "53", 63 | "54", 64 | "55", 65 | "56", 66 | "57", 67 | "58", 68 | "59", 69 | "6", 70 | "60", 71 | "61", 72 | "62", 73 | "63", 74 | "64", 75 | "65", 76 | "66", 77 | "67", 78 | "68", 79 | "69", 80 | "7", 81 | "70", 82 | "71", 83 | "72", 84 | "73", 85 | "74", 86 | "75", 87 | "76", 88 | "77", 89 | "78", 90 | "79", 91 | "8", 92 | "80", 93 | "81", 94 | "82", 95 | "83", 96 | "84", 97 | "85", 98 | "86", 99 | "87", 100 | "88", 101 | "89", 102 | "9", 103 | "90", 104 | "91", 105 | "92", 106 | "93", 107 | "94", 108 | "95", 109 | "96", 110 | "97", 111 | "98", 112 | "99", 113 | "a", 114 | "able", 115 | "about", 116 | "above", 117 | "abst", 118 | "accordance", 119 | "according", 120 | "accordingly", 121 | "across", 122 | "act", 123 | "actually", 124 | "added", 125 | "adj", 126 | "affected", 127 | "affecting", 128 | "affects", 129 | "after", 130 | "afterwards", 131 | "again", 132 | "against", 133 | "ah", 134 | "all", 135 | "almost", 136 | "alone", 137 | "along", 138 | "already", 139 | "also", 140 | "although", 141 | "always", 142 | "am", 143 | "among", 144 | "amongst", 145 | "an", 146 | "and", 147 | "announce", 148 | "another", 149 | "any", 150 | "anybody", 151 | "anyhow", 152 | "anymore", 153 | "anyone", 154 | "anything", 155 | "anyway", 156 | "anyways", 157 | "anywhere", 158 | "apparently", 159 | "approximately", 160 | "are", 161 | "aren", 162 | "arent", 163 | "arise", 164 | "around", 165 | "as", 166 | "aside", 167 | "ask", 168 | "asking", 169 | "at", 170 | "auth", 171 | "available", 172 | "away", 173 | "awfully", 174 | "b", 175 | "back", 176 | "be", 177 | "became", 178 | "because", 179 | "become", 180 | "becomes", 181 | "becoming", 182 | "been", 183 | "before", 184 | "beforehand", 185 | "begin", 186 | "beginning", 187 | "beginnings", 188 | "begins", 189 | "behind", 190 | "being", 191 | "believe", 192 | "below", 193 | "beside", 194 | "besides", 195 | "between", 196 | "beyond", 197 | "biol", 198 | "both", 199 | "brief", 200 | "briefly", 201 | "but", 202 | "by", 203 | "c", 204 | "ca", 205 | "came", 206 | "can", 207 | "can't", 208 | "cannot", 209 | "cause", 210 | "causes", 211 | "certain", 212 | "certainly", 213 | "co", 214 | "com", 215 | "come", 216 | "comes", 217 | "contain", 218 | "containing", 219 | "contains", 
220 | "could", 221 | "couldnt", 222 | "d", 223 | "date", 224 | "did", 225 | "didn't", 226 | "different", 227 | "do", 228 | "does", 229 | "doesn't", 230 | "doing", 231 | "don't", 232 | "done", 233 | "down", 234 | "downwards", 235 | "due", 236 | "during", 237 | "e", 238 | "each", 239 | "ed", 240 | "edu", 241 | "effect", 242 | "eg", 243 | "eight", 244 | "eighty", 245 | "either", 246 | "else", 247 | "elsewhere", 248 | "end", 249 | "ending", 250 | "enough", 251 | "especially", 252 | "et", 253 | "et-al", 254 | "etc", 255 | "even", 256 | "ever", 257 | "every", 258 | "everybody", 259 | "everyone", 260 | "everything", 261 | "everywhere", 262 | "ex", 263 | "except", 264 | "f", 265 | "far", 266 | "few", 267 | "ff", 268 | "fifth", 269 | "first", 270 | "five", 271 | "fix", 272 | "followed", 273 | "following", 274 | "follows", 275 | "for", 276 | "former", 277 | "formerly", 278 | "forth", 279 | "found", 280 | "four", 281 | "from", 282 | "further", 283 | "furthermore", 284 | "g", 285 | "gave", 286 | "get", 287 | "gets", 288 | "getting", 289 | "give", 290 | "given", 291 | "gives", 292 | "giving", 293 | "go", 294 | "goes", 295 | "gone", 296 | "got", 297 | "gotten", 298 | "h", 299 | "had", 300 | "happens", 301 | "hardly", 302 | "has", 303 | "hasn't", 304 | "have", 305 | "haven't", 306 | "having", 307 | "he", 308 | "hed", 309 | "hence", 310 | "her", 311 | "here", 312 | "hereafter", 313 | "hereby", 314 | "herein", 315 | "heres", 316 | "hereupon", 317 | "hers", 318 | "herself", 319 | "hes", 320 | "hi", 321 | "hid", 322 | "him", 323 | "himself", 324 | "his", 325 | "hither", 326 | "home", 327 | "how", 328 | "howbeit", 329 | "however", 330 | "hundred", 331 | "i", 332 | "i'll", 333 | "i've", 334 | "id", 335 | "ie", 336 | "if", 337 | "im", 338 | "immediate", 339 | "immediately", 340 | "importance", 341 | "important", 342 | "in", 343 | "inc", 344 | "indeed", 345 | "index", 346 | "information", 347 | "instead", 348 | "into", 349 | "invention", 350 | "inward", 351 | "is", 352 | "isn't", 353 | "it", 354 | "it'll", 355 | "itd", 356 | "its", 357 | "itself", 358 | "j", 359 | "just", 360 | "k", 361 | "keep", 362 | "keeps", 363 | "kept", 364 | "kg", 365 | "km", 366 | "know", 367 | "known", 368 | "knows", 369 | "l", 370 | "largely", 371 | "last", 372 | "lately", 373 | "later", 374 | "latter", 375 | "latterly", 376 | "least", 377 | "less", 378 | "lest", 379 | "let", 380 | "lets", 381 | "like", 382 | "liked", 383 | "likely", 384 | "line", 385 | "little", 386 | "look", 387 | "looking", 388 | "looks", 389 | "ltd", 390 | "m", 391 | "made", 392 | "mainly", 393 | "make", 394 | "makes", 395 | "many", 396 | "may", 397 | "maybe", 398 | "me", 399 | "mean", 400 | "means", 401 | "meantime", 402 | "meanwhile", 403 | "merely", 404 | "mg", 405 | "might", 406 | "million", 407 | "miss", 408 | "ml", 409 | "more", 410 | "moreover", 411 | "most", 412 | "mostly", 413 | "mr", 414 | "mrs", 415 | "much", 416 | "mug", 417 | "must", 418 | "my", 419 | "myself", 420 | "n", 421 | "na", 422 | "name", 423 | "namely", 424 | "nay", 425 | "nd", 426 | "near", 427 | "nearly", 428 | "necessarily", 429 | "necessary", 430 | "need", 431 | "needs", 432 | "neither", 433 | "never", 434 | "nevertheless", 435 | "new", 436 | "next", 437 | "nine", 438 | "ninety", 439 | "no", 440 | "nobody", 441 | "non", 442 | "none", 443 | "nonetheless", 444 | "noone", 445 | "nor", 446 | "normally", 447 | "nos", 448 | "not", 449 | "noted", 450 | "nothing", 451 | "now", 452 | "nowhere", 453 | "o", 454 | "obtain", 455 | "obtained", 456 | "obviously", 457 | "of", 458 | "off", 459 | "often", 
460 | "oh", 461 | "ok", 462 | "okay", 463 | "old", 464 | "omitted", 465 | "on", 466 | "once", 467 | "one", 468 | "ones", 469 | "only", 470 | "onto", 471 | "or", 472 | "ord", 473 | "other", 474 | "others", 475 | "otherwise", 476 | "ought", 477 | "our", 478 | "ours", 479 | "ourselves", 480 | "out", 481 | "outside", 482 | "over", 483 | "overall", 484 | "owing", 485 | "own", 486 | "p", 487 | "page", 488 | "pages", 489 | "part", 490 | "particular", 491 | "particularly", 492 | "past", 493 | "per", 494 | "perhaps", 495 | "placed", 496 | "please", 497 | "plus", 498 | "poorly", 499 | "possible", 500 | "possibly", 501 | "potentially", 502 | "pp", 503 | "predominantly", 504 | "present", 505 | "previously", 506 | "primarily", 507 | "probably", 508 | "promptly", 509 | "proud", 510 | "provides", 511 | "put", 512 | "q", 513 | "que", 514 | "quickly", 515 | "quite", 516 | "qv", 517 | "r", 518 | "ran", 519 | "rather", 520 | "rd", 521 | "re", 522 | "readily", 523 | "really", 524 | "recent", 525 | "recently", 526 | "ref", 527 | "refs", 528 | "regarding", 529 | "regardless", 530 | "regards", 531 | "related", 532 | "relatively", 533 | "research", 534 | "respectively", 535 | "resulted", 536 | "resulting", 537 | "results", 538 | "right", 539 | "run", 540 | "s", 541 | "said", 542 | "same", 543 | "saw", 544 | "say", 545 | "saying", 546 | "says", 547 | "sec", 548 | "section", 549 | "see", 550 | "seeing", 551 | "seem", 552 | "seemed", 553 | "seeming", 554 | "seems", 555 | "seen", 556 | "self", 557 | "selves", 558 | "sent", 559 | "seven", 560 | "several", 561 | "shall", 562 | "she", 563 | "she'll", 564 | "shed", 565 | "shes", 566 | "should", 567 | "shouldn't", 568 | "show", 569 | "showed", 570 | "shown", 571 | "showns", 572 | "shows", 573 | "significant", 574 | "significantly", 575 | "similar", 576 | "similarly", 577 | "since", 578 | "six", 579 | "slightly", 580 | "so", 581 | "some", 582 | "somebody", 583 | "somehow", 584 | "someone", 585 | "somethan", 586 | "something", 587 | "sometime", 588 | "sometimes", 589 | "somewhat", 590 | "somewhere", 591 | "soon", 592 | "sorry", 593 | "specifically", 594 | "specified", 595 | "specify", 596 | "specifying", 597 | "still", 598 | "stop", 599 | "strongly", 600 | "sub", 601 | "substantially", 602 | "successfully", 603 | "such", 604 | "sufficiently", 605 | "suggest", 606 | "sup", 607 | "sure", 608 | "t", 609 | "take", 610 | "taken", 611 | "taking", 612 | "tell", 613 | "tends", 614 | "th", 615 | "than", 616 | "thank", 617 | "thanks", 618 | "thanx", 619 | "that", 620 | "that'll", 621 | "that've", 622 | "thats", 623 | "the", 624 | "their", 625 | "theirs", 626 | "them", 627 | "themselves", 628 | "then", 629 | "thence", 630 | "there", 631 | "there'll", 632 | "there've", 633 | "thereafter", 634 | "thereby", 635 | "thered", 636 | "therefore", 637 | "therein", 638 | "thereof", 639 | "therere", 640 | "theres", 641 | "thereto", 642 | "thereupon", 643 | "these", 644 | "they", 645 | "they'll", 646 | "they've", 647 | "theyd", 648 | "theyre", 649 | "think", 650 | "this", 651 | "those", 652 | "thou", 653 | "though", 654 | "thoughh", 655 | "thousand", 656 | "throug", 657 | "through", 658 | "throughout", 659 | "thru", 660 | "thus", 661 | "til", 662 | "tip", 663 | "to", 664 | "together", 665 | "too", 666 | "took", 667 | "toward", 668 | "towards", 669 | "tried", 670 | "tries", 671 | "truly", 672 | "try", 673 | "trying", 674 | "ts", 675 | "twice", 676 | "two", 677 | "u", 678 | "un", 679 | "under", 680 | "unfortunately", 681 | "unless", 682 | "unlike", 683 | "unlikely", 684 | "until", 685 | "unto", 686 
| "up", 687 | "upon", 688 | "ups", 689 | "us", 690 | "use", 691 | "used", 692 | "useful", 693 | "usefully", 694 | "usefulness", 695 | "uses", 696 | "using", 697 | "usually", 698 | "v", 699 | "value", 700 | "various", 701 | "very", 702 | "via", 703 | "viz", 704 | "vol", 705 | "vols", 706 | "vs", 707 | "w", 708 | "want", 709 | "wants", 710 | "was", 711 | "wasnt", 712 | "way", 713 | "we", 714 | "we'll", 715 | "we've", 716 | "wed", 717 | "welcome", 718 | "went", 719 | "were", 720 | "werent", 721 | "what", 722 | "what'll", 723 | "whatever", 724 | "whats", 725 | "when", 726 | "whence", 727 | "whenever", 728 | "where", 729 | "whereafter", 730 | "whereas", 731 | "whereby", 732 | "wherein", 733 | "wheres", 734 | "whereupon", 735 | "wherever", 736 | "whether", 737 | "which", 738 | "while", 739 | "whim", 740 | "whither", 741 | "who", 742 | "who'll", 743 | "whod", 744 | "whoever", 745 | "whole", 746 | "whom", 747 | "whomever", 748 | "whos", 749 | "whose", 750 | "why", 751 | "widely", 752 | "willing", 753 | "wish", 754 | "with", 755 | "within", 756 | "without", 757 | "wont", 758 | "words", 759 | "world", 760 | "would", 761 | "wouldnt", 762 | "www", 763 | "x", 764 | "y", 765 | "yes", 766 | "yet", 767 | "you", 768 | "you'll", 769 | "you've", 770 | "youd", 771 | "your", 772 | "youre", 773 | "yours", 774 | "yourself", 775 | "yourselves", 776 | "z", 777 | "zero" 778 | ] 779 | }, 780 | "english_stemmer": { 781 | "type": "stemmer", 782 | "language": "english" 783 | }, 784 | "english_light_stemmer": { 785 | "type": "stemmer", 786 | "language": "light_english" 787 | }, 788 | "english_minimal_stemmer": { 789 | "type": "stemmer", 790 | "language": "minimal_english" 791 | }, 792 | "english_possessive_stemmer": { 793 | "type": "stemmer", 794 | "language": "possessive_english" 795 | } 796 | }, 797 | "analyzer": { 798 | "english": { 799 | "tokenizer": "classic", 800 | "filter": [ 801 | "icu_normalizer", 802 | "english_possessive_stemmer", 803 | "english_stop", 804 | "english_minimal_stemmer" 805 | ] 806 | } 807 | } 808 | } 809 | }, 810 | "mappings": { 811 | "_default_": { 812 | "_all": { 813 | "enabled": true 814 | }, 815 | "dynamic_templates": [ 816 | { 817 | "string_fields": { 818 | "mapping": { 819 | "type": "keyword", 820 | "ignore_above": 256 821 | }, 822 | "match": "*", 823 | "match_mapping_type": "string" 824 | } 825 | } 826 | ] 827 | }, 828 | "publication": { 829 | "_all": { 830 | "enabled": true 831 | }, 832 | "dynamic_templates": [ 833 | { 834 | "string_fields": { 835 | "mapping": { 836 | "type": "keyword", 837 | "ignore_above": 256 838 | }, 839 | "match": "*", 840 | "match_mapping_type": "string" 841 | } 842 | } 843 | ], 844 | "properties": { 845 | "abstract": { 846 | "type": "text", 847 | "analyzer": "english", 848 | "eager_global_ordinals": true, 849 | "fielddata": true 850 | }, 851 | "abstract_sentences": { 852 | "type": "nested", 853 | "properties": { 854 | "value": { 855 | "type": "text", 856 | "analyzer": "english", 857 | "eager_global_ordinals": true, 858 | "fielddata": true 859 | } 860 | } 861 | }, 862 | "authors": { 863 | "properties": { 864 | "CollectiveName": { 865 | "type": "keyword", 866 | "ignore_above": 256 867 | }, 868 | "ForeName": { 869 | "type": "keyword", 870 | "ignore_above": 256 871 | }, 872 | "Identifier": { 873 | "type": "keyword", 874 | "ignore_above": 256 875 | }, 876 | "Initials": { 877 | "type": "keyword", 878 | "ignore_above": 256 879 | }, 880 | "LastName": { 881 | "type": "keyword", 882 | "ignore_above": 256 883 | }, 884 | "Suffix": { 885 | "type": "keyword", 886 | 
"ignore_above": 256 887 | }, 888 | "full_name": { 889 | "type": "keyword", 890 | "ignore_above": 256 891 | }, 892 | "short_name": { 893 | "type": "keyword", 894 | "ignore_above": 256 895 | }, 896 | "last_name": { 897 | "type": "keyword", 898 | "ignore_above": 256 899 | } 900 | } 901 | }, 902 | "chemicals": { 903 | "properties": { 904 | "name": { 905 | "type": "keyword", 906 | "ignore_above": 256 907 | }, 908 | "name_id": { 909 | "type": "keyword", 910 | "ignore_above": 256 911 | }, 912 | "registryNumber": { 913 | "type": "keyword", 914 | "ignore_above": 256 915 | } 916 | } 917 | }, 918 | "data_release": { 919 | "type": "keyword", 920 | "ignore_above": 256 921 | }, 922 | "date": { 923 | "type": "date", 924 | "format": "strict_date_optional_time||epoch_millis" 925 | }, 926 | "date_of_revision": { 927 | "type": "date", 928 | "format": "strict_date_optional_time||epoch_millis" 929 | }, 930 | "doi": { 931 | "type": "keyword", 932 | "ignore_above": 256 933 | }, 934 | "filename": { 935 | "type": "keyword", 936 | "ignore_above": 256 937 | }, 938 | "full_text": { 939 | "type": "text", 940 | "analyzer": "english", 941 | "eager_global_ordinals": true, 942 | "fielddata": true 943 | }, 944 | "journal": { 945 | "properties": { 946 | "medlineAbbreviation": { 947 | "type": "keyword", 948 | "ignore_above": 256 949 | }, 950 | "title": { 951 | "type": "keyword", 952 | "ignore_above": 256 953 | } 954 | } 955 | }, 956 | "journal_reference": { 957 | "properties": { 958 | "issue": { 959 | "type": "keyword", 960 | "ignore_above": 256 961 | }, 962 | "pgn": { 963 | "type": "keyword", 964 | "ignore_above": 256 965 | }, 966 | "volume": { 967 | "type": "keyword", 968 | "ignore_above": 256 969 | } 970 | } 971 | }, 972 | "keywords": { 973 | "type": "keyword", 974 | "ignore_above": 256 975 | }, 976 | "mesh_headings": { 977 | "properties": { 978 | "id": { 979 | "type": "keyword", 980 | "ignore_above": 256 981 | }, 982 | "label": { 983 | "type": "keyword", 984 | "ignore_above": 256 985 | } 986 | } 987 | }, 988 | "pub_date": { 989 | "type": "date", 990 | "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis" 991 | }, 992 | "pub_id": { 993 | "type": "keyword", 994 | "ignore_above": 256 995 | }, 996 | "pub_type": { 997 | "type": "keyword", 998 | "ignore_above": 256 999 | }, 1000 | "title": { 1001 | "type": "text", 1002 | "analyzer": "english", 1003 | "eager_global_ordinals": true, 1004 | "fielddata": true 1005 | }, 1006 | "text_mined_entities": { 1007 | "type": "object", 1008 | "properties": { 1009 | "nlp": { 1010 | "type": "object", 1011 | "properties": { 1012 | "tagged_text": { 1013 | "type": "string", 1014 | "index": "no" 1015 | }, 1016 | "embedding_text": { 1017 | "type": "object", 1018 | "properties": { 1019 | "ent_tag": { 1020 | "type": "string", 1021 | "index": "no" 1022 | }, 1023 | "plain": { 1024 | "type": "string", 1025 | "index": "no" 1026 | }, 1027 | "pos_tag": { 1028 | "type": "string", 1029 | "index": "no" 1030 | } 1031 | } 1032 | } 1033 | } 1034 | } 1035 | } 1036 | } 1037 | } 1038 | } 1039 | } 1040 | } 1041 | -------------------------------------------------------------------------------- /gcp-local-ssd/readme.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | Below are a series of commands useful for doing things with elasticsearch 4 | 5 | Setup a local environment variable for convienience 6 | ```sh 7 | HOST=es-190313-102133 8 | ``` 9 | 10 | Increase the threshold for "breakers" to help prevent false triggers 11 | ```sh 12 | time curl -XPUT 
"http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cluster/settings" -H 'Content-Type: application/json' -d' 13 | { 14 | "transient" : { 15 | "indices.breaker.request.limit" : "90%", 16 | "network.breaker.inflight_requests.limit": "100%" 17 | } 18 | }' 19 | ``` 20 | 21 | Below are a series of commands useful for finding out the status of elasticsearch 22 | ```sh 23 | curl "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cat/nodes?v&s=name" 24 | curl "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cat/indices?v&s=index" 25 | curl "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cat/shards?v&s=index,shard,prirep" 26 | curl "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cat/allocation?v&s=node" 27 | curl "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cluster/health?pretty" 28 | curl "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cluster/state?pretty" 29 | ``` 30 | 31 | To set a default number of shards create a "template" for future indexes before creating any 32 | ```sh 33 | curl -XPUT "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_template/default" -H 'Content-Type: application/json' \ 34 | -d'{"template":"*","settings":{"number_of_shards":37}}' 35 | ``` 36 | 37 | To actually do the loading 38 | ``` 39 | time python load2es.py bioentity taggedtext publication concept --es "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200" 40 | ``` 41 | 42 | Required for LINK to work properly 43 | ```sh 44 | time curl -XPUT "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-19-concept/_settings" -H 'Content-Type: application/json' -d' 45 | { 46 | "index" : { 47 | "max_adjacency_matrix_filters" : 500 48 | } 49 | }' 50 | ``` -------------------------------------------------------------------------------- /gcp-local-ssd/run.sh: -------------------------------------------------------------------------------- 1 | # 2 | # This script will create a new elasticsearch cluster 3 | # It is configures as a GCP Instance Group with an Instance Template 4 | # and sets up a Load Balancer in front of it 5 | # 6 | 7 | 8 | NOW=`date +'%y%m%d-%H%M%S'` 9 | NAME=es-$NOW 10 | PROJECT=open-targets-library 11 | 12 | gcloud --project=$PROJECT \ 13 | compute instance-templates create $NAME \ 14 | --custom-cpu 2 \ 15 | --custom-memory 12 \ 16 | --local-ssd="" \ 17 | --image-project debian-cloud \ 18 | --image-family debian-9 \ 19 | --scopes=compute-rw \ 20 | --metadata-from-file startup-script=startup.sh 21 | 22 | #if trying to do containers, use this 23 | # --image-project cos-cloud \ 24 | # --image-family cos-stable \ 25 | 26 | #NOTE this is a BETA command and liable to change in future! 
27 | gcloud beta compute --project=$PROJECT \ 28 | instance-groups managed create $NAME \ 29 | --size=37 \ 30 | --template=$NAME \ 31 | --zone=europe-west1-d 32 | 33 | #create a healthcheck 34 | #used by autohealing and load balancing 35 | #check for 10s every 10s each 6 times for 1m total 36 | gcloud compute --project=$PROJECT \ 37 | health-checks create http $NAME \ 38 | --request-path="/_nodes/_local" \ 39 | --port=9200 \ 40 | --check-interval=10s \ 41 | --timeout=10s \ 42 | --unhealthy-threshold=3 \ 43 | --healthy-threshold=3 44 | 45 | #configure healthcheck for autohealing 46 | gcloud beta compute --project=$PROJECT \ 47 | instance-groups managed update $NAME \ 48 | --health-check=$NAME \ 49 | --zone=europe-west1-d 50 | 51 | #configure a load balancer 52 | #create the load balancer backend service 53 | gcloud compute --project=$PROJECT \ 54 | backend-services create $NAME \ 55 | --health-checks=$NAME \ 56 | --load-balancing-scheme=internal \ 57 | --protocol=tcp \ 58 | --region=europe-west1 59 | #add the instance group to the backend service 60 | gcloud compute --project=$PROJECT \ 61 | backend-services add-backend $NAME \ 62 | --instance-group=$NAME \ 63 | --region=europe-west1 \ 64 | --instance-group-zone=europe-west1-d 65 | 66 | #create a forwarding rule for the actual load balancing 67 | #must use a service label to get dns! 68 | gcloud compute --project=$PROJECT \ 69 | forwarding-rules create $NAME \ 70 | --service-label $NAME \ 71 | --region=europe-west1 \ 72 | --address-region=europe-west1 \ 73 | --load-balancing-scheme=internal \ 74 | --ip-protocol=TCP \ 75 | --ports=all \ 76 | --backend-service=$NAME 77 | 78 | # [SERVICE_LABEL].[FORWARDING_RULE_NAME].il4.[REGION].lb.[PROJECT_ID].internal 79 | 80 | #curl http://$NAME.$NAME.il4.europe-west1.lb.open-targets-af.internal:9200 81 | 82 | #configure firewall to allow healthchecks - manual 83 | 84 | #sudo journalctl -n 500 -f -u google-startup-scripts.service 85 | 86 | #time curl "localhost:9200/_cat/nodes?v&s=name" 87 | #time curl localhost:9200/_cluster/health?pretty 88 | #time curl localhost:9200/_cluster/state?pretty 89 | #time curl localhost:9200/_cat/master?v 90 | #time curl "localhost:9200/_cat/shards?v&s=index,shard,prirep" 91 | -------------------------------------------------------------------------------- /gcp-local-ssd/startup.sh: -------------------------------------------------------------------------------- 1 | #initial setup 2 | #------------- 3 | 4 | #update packages, and install prerequisites for elasticsearch 5 | #use the non-interactive frontend for apt so we don't get any prompts 6 | export DEBIAN_FRONTEND=noninteractive 7 | #install with apt-get and autoconfirm 8 | apt-get update 9 | apt-get -yq install \ 10 | openjdk-8-jdk-headless \ 11 | net-tools \ 12 | wget \ 13 | uuid-runtime \ 14 | python-pip \ 15 | python-dev \ 16 | python-urllib3 \ 17 | libyaml-dev \ 18 | less \ 19 | apt-transport-https 20 | 21 | #ensure pip is the latest version, more than the debian package 22 | pip install --upgrade pip 23 | 24 | 25 | #install elasticsearch 26 | #--------------------- 27 | ES_VERSION=5.6.15 28 | #download the elasticsearch package 29 | wget --quiet --no-check-certificate \ 30 | --output-document=/tmp/elasticsearch-$ES_VERSION.deb \ 31 | https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-$ES_VERSION.deb 32 | #install the elasticsearch package 33 | #use the non-interactive frontend for dpkg so we don't get any prompts 34 | export DEBIAN_FRONTEND=noninteractive 35 | dpkg -i 
/tmp/elasticsearch-$ES_VERSION.deb 36 | #post-install cleanup 37 | rm /tmp/elasticsearch-$ES_VERSION.deb 38 | 39 | #install elasticsearch google compute engine discovery plugin 40 | #note this requires scopes=compute-rw on the VM 41 | /usr/share/elasticsearch/bin/elasticsearch-plugin -s install discovery-gce 42 | 43 | #install elasticsearch google storage plugin 44 | #used to save snapshots into a google cloud bucket 45 | /usr/share/elasticsearch/bin/elasticsearch-plugin -s install repository-gcs 46 | 47 | #International Components for Unicode support plugin 48 | /usr/share/elasticsearch/bin/elasticsearch-plugin -s install analysis-icu 49 | 50 | #configure elasticsearch 51 | #----------------------- 52 | 53 | #configure elasticseach 54 | # cluster.name must be unique on network for udp broadcast 55 | # network.host allow connections on any network device, not just localhost 56 | # http.port use only 9200 nothing else 57 | # bootstrap.memory_lock disable swap 58 | # xpack.security.enabled turn off xpack extras 59 | cat > /etc/elasticsearch/elasticsearch.yml < /etc/elasticsearch/jvm.options < /etc/security/limits.conf 108 | * soft nofile 65536 109 | * hard nofile 65536 110 | * soft memlock unlimited 111 | * hard memlock unlimited 112 | EOF_C 113 | 114 | # set all sysctl configurations 115 | sysctl -p 116 | 117 | # disable swap another way 118 | swapoff -a 119 | 120 | #more kernel changes to ensure we get best performance 121 | #more disabling of swap, locking of memory, and reducing unnecessary disk IO 122 | echo "block/sda/queue/scheduler = noop" >> /etc/sysfs.conf 123 | echo noop > /sys/block/sda/queue/scheduler 124 | sed -i 's/\#LimitMEMLOCK=infinity/LimitMEMLOCK=infinity/g' /usr/lib/systemd/system/elasticsearch.service 125 | sed -i '46iLimitMEMLOCK=infinity' /usr/lib/systemd/system/elasticsearch.service 126 | systemctl daemon-reload 127 | 128 | #actually start the elasticseach server now everything is ready! 129 | service elasticsearch start 130 | -------------------------------------------------------------------------------- /gcp-persistent-disk/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | This directory contains the scripts to generate LINK infrastructure and the instructions about how to load the data into ES. 4 | 5 | 6 | ## Infrastructure 7 | 8 | The file run.sh creates the infrastructure. In this first release the number of VM is hardcoded. (3) 9 | 10 | *) 3 Vms: 8cpu and 52 GB
11 | *) Elasticsearch 5.6 and 24 shards
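
The shard count follows the sizing rule noted in steps.sh: roughly one primary shard per vCPU, i.e. 3 VMs x 8 CPUs = 24. If the VM count or machine type changes, adjust the default index template to match; the command below is the one from steps.sh, repeated here for context (it assumes HOST has been exported as described in the loading section further down):

> curl -XPUT "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_template/default" -H 'Content-Type: application/json' -d'{"template":"*","settings":{"number_of_shards":24}}'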
12 | 13 | gcloud beta compute --project=$PROJECT \ 14 | instance-groups managed create $NAME \ 15 | --size=3 \ 16 | ... 17 | 18 | The file startup.sh is used by run.sh for creating the instance template. 19 | 20 | In order to create a new version the user must change the parameter "cluster.name" (ES cluster) and if the number of VMs (run.sh) changes the parameter "minimum_master_nodes" must be number_of_total_vm/2+1. (The number of VMs should be odd) 21 | 22 | discovery: 23 | zen: 24 | hosts_provider: gce 25 | minimum_master_nodes: 2 26 | indices.store.throttle.max_bytes_per_sec: "200mb" 27 | cluster.name: library201911v7 28 | 29 | 30 | ## Load the data in ES 31 | The infrastructure scripts generates a DNS name. 32 | In google cloud these info are stored under Network services > Load balancing 33 | 34 | Eg 35 | [SERVICE_LABEL].[FORWARDING_RULE_NAME].il4.[REGION].lb.[PROJECT_ID].internal 36 | 37 | To test if the cluster is available and correct: 38 | 39 | > export HOST= _[SERVICE_LABEL]_ 40 | > curl http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200 41 | > 42 | > curl -X GET "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cat/nodes?v&s=name" 43 | > curl -X GET "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cluster/health?pretty" 44 | 45 | 46 | The script "steps.sh" contains the steps to load the data into the ES. Please use this script as detailed reference. 47 | 48 | This is a prototype of infrastructure and the aim is builing a list of commands to run. 49 | 50 | In future, we aim to create an automatic script. 51 | 52 | -------------------------------------------------------------------------------- /gcp-persistent-disk/exec.sh: -------------------------------------------------------------------------------- 1 | gcloud compute instances create es5-library-node-$1 \ 2 | --image-project debian-cloud \ 3 | --image-family debian-9 \ 4 | --machine-type n1-highmem-8 \ 5 | --zone europe-west1-d \ 6 | --metadata-from-file startup-script=startup.sh \ 7 | --boot-disk-size "1024" \ 8 | --boot-disk-type "pd-ssd" --boot-disk-device-name "es5-library-node-ssd-$1" \ 9 | --project open-targets-library \ 10 | --scopes default,storage-rw,compute-rw 11 | -------------------------------------------------------------------------------- /gcp-persistent-disk/run.sh: -------------------------------------------------------------------------------- 1 | # 2 | # This script will create a new elasticsearch cluster 3 | # It is configures as a GCP Instance Group with an Instance Template 4 | # and sets up a Load Balancer in front of it 5 | # 6 | 7 | 8 | NOW=`date +'%y%m%d-%H%M%S'` 9 | NAME=es-$NOW 10 | PROJECT=open-targets-library 11 | 12 | gcloud --project=$PROJECT \ 13 | compute instance-templates create $NAME \ 14 | --machine-type n1-highmem-8 \ 15 | --image-project debian-cloud \ 16 | --image-family debian-9 \ 17 | --boot-disk-size "1024" \ 18 | --boot-disk-type "pd-ssd" \ 19 | --scopes default,storage-rw,compute-rw \ 20 | --metadata-from-file startup-script=startup.sh 21 | 22 | #if trying to do containers, use this 23 | # --image-project cos-cloud \ 24 | # --image-family cos-stable \ 25 | 26 | #NOTE this is a BETA command and liable to change in future! 
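#(clarifying note, not part of the original script)
#if --size below changes, also update discovery.zen.minimum_master_nodes in
#startup.sh to number_of_total_vm/2+1 as described in the README
#(e.g. 3 VMs -> 2, 5 VMs -> 3), and keep the number of VMs odd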
27 | gcloud beta compute --project=$PROJECT \ 28 | instance-groups managed create $NAME \ 29 | --size=3 \ 30 | --template=$NAME \ 31 | --zone=europe-west1-d 32 | 33 | #create a healthcheck 34 | #used by autohealing and load balancing 35 | #check for 10s every 10s each 6 times for 1m total 36 | gcloud compute --project=$PROJECT \ 37 | health-checks create http $NAME \ 38 | --request-path="/_nodes/_local" \ 39 | --port=9200 \ 40 | --check-interval=10s \ 41 | --timeout=10s \ 42 | --unhealthy-threshold=3 \ 43 | --healthy-threshold=3 44 | 45 | #configure healthcheck for autohealing 46 | gcloud beta compute --project=$PROJECT \ 47 | instance-groups managed update $NAME \ 48 | --health-check=$NAME \ 49 | --zone=europe-west1-d 50 | 51 | #configure a load balancer 52 | #create the load balancer backend service 53 | gcloud compute --project=$PROJECT \ 54 | backend-services create $NAME \ 55 | --health-checks=$NAME \ 56 | --load-balancing-scheme=internal \ 57 | --protocol=tcp \ 58 | --region=europe-west1 59 | 60 | #add the instance group to the backend service 61 | gcloud compute --project=$PROJECT \ 62 | backend-services add-backend $NAME \ 63 | --instance-group=$NAME \ 64 | --region=europe-west1 \ 65 | --instance-group-zone=europe-west1-d 66 | 67 | #create a forwarding rule for the actual load balancing 68 | #must use a service label to get dns! 69 | gcloud compute --project=$PROJECT \ 70 | forwarding-rules create $NAME \ 71 | --service-label $NAME \ 72 | --region=europe-west1 \ 73 | --address-region=europe-west1 \ 74 | --load-balancing-scheme=internal \ 75 | --ip-protocol=TCP \ 76 | --ports=all \ 77 | --backend-service=$NAME 78 | 79 | # [SERVICE_LABEL].[FORWARDING_RULE_NAME].il4.[REGION].lb.[PROJECT_ID].internal 80 | 81 | #curl http://$NAME.$NAME.il4.europe-west1.lb.open-targets-af.internal:9200 82 | 83 | #configure firewall to allow healthchecks - manual 84 | 85 | #sudo journalctl -n 500 -f -u google-startup-scripts.service 86 | 87 | #time curl "localhost:9200/_cat/nodes?v&s=name" 88 | #time curl localhost:9200/_cluster/health?pretty 89 | #time curl localhost:9200/_cluster/state?pretty 90 | #time curl localhost:9200/_cat/master?v 91 | #time curl "localhost:9200/_cat/shards?v&s=index,shard,prirep" 92 | -------------------------------------------------------------------------------- /gcp-persistent-disk/startup.sh: -------------------------------------------------------------------------------- 1 | #initial setup 2 | #------------- 3 | 4 | #update packages, and install prerequisites for elasticsearch 5 | #use the non-interactive frontend for apt so we don't get any prompts 6 | export DEBIAN_FRONTEND=noninteractive 7 | #install with apt-get and autoconfirm 8 | apt-get update 9 | apt-get -yq install \ 10 | openjdk-8-jdk-headless \ 11 | net-tools \ 12 | wget \ 13 | uuid-runtime \ 14 | python-pip \ 15 | python-dev \ 16 | python-urllib3 \ 17 | libyaml-dev \ 18 | tmux \ 19 | jq \ 20 | less \ 21 | apt-transport-https 22 | 23 | #ensure pip is the latest version, more than the debian package 24 | pip install --upgrade pip 25 | 26 | 27 | #install elasticsearch 28 | #--------------------- 29 | ES_VERSION=5.6.15 30 | #download the elasticsearch package 31 | wget --quiet --no-check-certificate \ 32 | --output-document=/tmp/elasticsearch-$ES_VERSION.deb \ 33 | https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-$ES_VERSION.deb 34 | #install the elasticsearch package 35 | #use the non-interactive frontend for dpkg so we don't get any prompts 36 | export DEBIAN_FRONTEND=noninteractive 37 | dpkg -i 
/tmp/elasticsearch-$ES_VERSION.deb 38 | #post-install cleanup 39 | rm /tmp/elasticsearch-$ES_VERSION.deb 40 | 41 | #install elasticsearch google compute engine discovery plugin 42 | #note this requires scopes=compute-rw on the VM 43 | /usr/share/elasticsearch/bin/elasticsearch-plugin -s install discovery-gce 44 | 45 | #install elasticsearch google storage plugin 46 | #used to save snapshots into a google cloud bucket 47 | /usr/share/elasticsearch/bin/elasticsearch-plugin -s install repository-gcs 48 | 49 | #International Components for Unicode support plugin 50 | /usr/share/elasticsearch/bin/elasticsearch-plugin -s install analysis-icu 51 | 52 | #configure elasticsearch 53 | #----------------------- 54 | 55 | #configure elasticseach 56 | # cluster.name must be unique on network for udp broadcast 57 | # network.host allow connections on any network device, not just localhost 58 | # http.port use only 9200 nothing else 59 | # bootstrap.memory_lock disable swap 60 | # xpack.security.enabled turn off xpack extras 61 | cat > /etc/elasticsearch/elasticsearch.yml < /etc/elasticsearch/jvm.options < /etc/security/limits.conf 110 | * soft nofile 65536 111 | * hard nofile 65536 112 | * soft memlock unlimited 113 | * hard memlock unlimited 114 | EOF_C 115 | 116 | # set all sysctl configurations 117 | sysctl -p 118 | 119 | # disable swap another way 120 | swapoff -a 121 | 122 | #more kernel changes to ensure we get best performance 123 | #more disabling of swap, locking of memory, and reducing unnecessary disk IO 124 | echo "block/sda/queue/scheduler = noop" >> /etc/sysfs.conf 125 | echo noop > /sys/block/sda/queue/scheduler 126 | sed -i 's/\#LimitMEMLOCK=infinity/LimitMEMLOCK=infinity/g' /usr/lib/systemd/system/elasticsearch.service 127 | sed -i '46iLimitMEMLOCK=infinity' /usr/lib/systemd/system/elasticsearch.service 128 | systemctl daemon-reload 129 | 130 | #actually start the elasticseach server now everything is ready! 131 | service elasticsearch start 132 | -------------------------------------------------------------------------------- /gcp-persistent-disk/steps.sh: -------------------------------------------------------------------------------- 1 | # DNS name 2 | # Eg. http://es-201206-133204.es-201206-133204.il4.europe-west1.lb.open-targets-library.internal:9200 3 | # HOST=es-200617-101804 4 | # curl -X GET http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200 5 | 6 | export HOST=es-201002-123122 7 | 8 | # the number of shard is related with CPU and VMS. Eg. 
3VMsx8cpu=24 9 | curl -XPUT "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_template/default" -H 'Content-Type: application/json' \ 10 | -d'{"template":"*","settings":{"number_of_shards":24}}' 11 | 12 | mkdir loader 13 | cd loader 14 | 15 | # Settings for the different indices 16 | curl -X GET https://raw.githubusercontent.com/opentargets/library-beam/master/es-mapping-index/concept.json > concept.json 17 | curl -X GET https://raw.githubusercontent.com/opentargets/library-beam/master/es-mapping-index/publication.json > publication.json 18 | curl -X GET https://raw.githubusercontent.com/opentargets/library-beam/master/es-mapping-index/settings.json > settings.json 19 | 20 | curl -XPUT "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20-taggedtext?pretty" -H 'Content-Type: application/json' -d@"settings.json" 21 | curl -XPUT "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20-bioentity?pretty" -H 'Content-Type: application/json' -d@"settings.json" 22 | curl -XPUT "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20?pretty" -H 'Content-Type: application/json' -d@"publication.json" 23 | curl -XPUT "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20-concept?pretty" -H 'Content-Type: application/json' -d@"concept.json" 24 | 25 | #Adam suggested to add this. Change the HOST 26 | curl -XPUT 'http://es-201002-123122.es-201002-123122.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20-concept/_settings' -H 'Content-Type: application/json' -d'{"index" : {"max_adjacency_matrix_filters" : 500}}' 27 | 28 | curl -X GET http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cat/indices 29 | 30 | # list of files stored in Google Storage 31 | gsutil ls gs://medline_2020_10/splitted/pubmed\*_bioentities\*.json.gz > bioentities_files.txt 32 | gsutil ls gs://medline_2020_10/splitted/pubmed\*_taggedtext\*.json.gz > taggedtext_files.txt 33 | gsutil ls gs://medline_2020_10/splitted/pubmed\*_small\*.json.gz > publication_files.txt 34 | gsutil ls gs://medline_2020_10/splitted/pubmed\*_concepts\*.json.gz > concepts_files.txt 35 | 36 | 37 | # Taggedtext index // BEGIN FOR EVERY INDEX 38 | 39 | #split the file for running 10 processes 40 | wc -l taggedtext_files.txt 41 | split -l 1240 taggedtext_files.txt taggedtext_split_ 42 | 43 | # bioentities split 44 | wc -l bioentities_files.txt 45 | split -l 1240 bioentities_files.txt bio_split_ 46 | 47 | # Concept 48 | wc -l concepts_files.txt 49 | split -l 11240 concepts_files.txt conc_split_ 50 | 51 | # publication split 52 | wc -l publication_files.txt 53 | split -l 1240 publication_files.txt publ_split_ 54 | 55 | #_index_name_tmux.sh 56 | # HOST=dns_name_hardcode (todo: change YOUR_PATH and HOST.) 57 | #!/bin/bash 58 | FILES=$YOUR_PATH/loader/taggedtext_split_* 59 | tmux start-server 60 | for f in $FILES 61 | do 62 | windowName="tagg-${f: -2}" 63 | # take action on each file. 
$f store current file name 64 | #cat $f 65 | echo $windowName 66 | tmux new-session -d -s ${windowName} 67 | tmux send-keys -t ${windowName} "source ~/library-beam/venv_elastic/bin/activate" Enter 68 | tmux send-keys -t ${windowName} "export HOST=es-201002-123122" Enter 69 | tmux send-keys -t ${windowName} "export input=${f}; ./es_tag.sh" Enter 70 | done 71 | 72 | # es_tag.sh 73 | time for file in $(cat ${input}); do gsutil cat $file | gunzip | elasticsearch_loader --es-host "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200" --with-retry --bulk-size 10000 --index pubmed-20-taggedtext --type taggedtext --id-field pub_id json --json-lines - ; done 74 | 75 | #Kill the list of tmux opened 76 | #!/bin/bash 77 | FILES=$YOUR_PATH/loader/taggedtext_split_* 78 | tmux start-server 79 | for f in $FILES 80 | do 81 | windowName="tagg-${f: -2}" 82 | echo $windowName 83 | tmux kill-session -t ${windowName} 84 | done 85 | 86 | # Changed the refresh interval 87 | export HOST=dns_name_param 88 | curl -XPUT http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20-taggedtext/_settings -d '{"index":{"refresh_interval":"1s"}}' 89 | 90 | Eg, 91 | curl -XPUT http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20-bioentity/_settings -d '{"index":{"refresh_interval":"1s"}}' 92 | 93 | curl -XPUT http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20-concept/_settings -d '{"index":{"refresh_interval":"1s"}}' 94 | 95 | curl -XPUT http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20/_settings -d '{"index":{"refresh_interval":"1s"}}' 96 | 97 | 98 | #### IMPORTANT 99 | The index es_concept.sh is slightly different due the id-field value 100 | 101 | time for file in $(cat ${input}); do gsutil cat $file | gunzip | elasticsearch_loader --es-host "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.int 102 | ernal:9200" --with-retry --bulk-size 10000 --index pubmed-20-concept --type concept json --json-lines - ; done 103 | 104 | There are some examples under "tmux_example" 105 | 106 | #Publication Alias. esurl : replace with the proper URL 107 | 108 | curl -XPOST 'http://esurl:9200/_aliases?pretty' -H 'Content-Type: application/json' -d ' 109 | { 110 | "actions": [ 111 | {"add": {"index": "pubmed-20", "alias": "!publication-data"}} 112 | ] 113 | } ' 114 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/README.md: -------------------------------------------------------------------------------- 1 | This directory contains the skeleton of the scripts for loading the indices into ES. 2 | 3 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/bioentity_tmux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES=YOUR_PATH/loader/bio_split_* 4 | tmux start-server 5 | for f in $FILES 6 | do 7 | windowName="bioe-${f: -2}" 8 | echo $windowName 9 | tmux new-session -d -s ${windowName} 10 | tmux send-keys -t ${windowName} "source ~/library-beam/venv_elastic/bin/activate" Enter 11 | #Add the dns_name here. Todo: improve it. 12 | tmux send-keys -t ${windowName} "export HOST=....." 
Enter 13 | tmux send-keys -t ${windowName} "export input=${f}; ./es_bio.sh" Enter 14 | done 15 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/bioentity_tmux_kill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES=_YOUR_PATH_/loader/bio_split_* 4 | tmux start-server 5 | for f in $FILES 6 | do 7 | windowName="bioe-${f: -2}" 8 | echo $windowName 9 | tmux kill-session -t ${windowName} 10 | done 11 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/concept_tmux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES=YOUR_PATH/loader/conc_split_* 4 | tmux start-server 5 | for f in $FILES 6 | do 7 | windowName="conc-${f: -2}" 8 | echo $windowName 9 | tmux new-session -d -s ${windowName} 10 | tmux send-keys -t ${windowName} "source ~/library-beam/venv_elastic/bin/activate" Enter 11 | #Add the dns_name here. Todo: improve it. 12 | tmux send-keys -t ${windowName} "export HOST=......" Enter 13 | tmux send-keys -t ${windowName} "export input=${f}; ./es_concept.sh" Enter 14 | done 15 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/concept_tmux_kill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES=YOUR_PATH/loader/conc_split_* 4 | tmux start-server 5 | for f in $FILES 6 | do 7 | windowName="conc-${f: -2}" 8 | echo $windowName 9 | tmux kill-session -t ${windowName} 10 | done 11 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/es_bio.sh: -------------------------------------------------------------------------------- 1 | time for file in $(cat ${input}); do gsutil cat $file | gunzip | elasticsearch_loader --es-host "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200" --with-retry --bulk-size 10000 --index pubmed-19-bioentity --type bioentity --id-field pub_id json --json-lines - ; done 2 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/es_concept.sh: -------------------------------------------------------------------------------- 1 | time for file in $(cat ${input}); do gsutil cat $file | gunzip | elasticsearch_loader --es-host "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200" --with-retry --bulk-size 10000 --index pubmed-19-concept --type concept json --json-lines - ; done 2 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/es_pub.sh: -------------------------------------------------------------------------------- 1 | time for file in $(cat ${input}); do gsutil cat $file | gunzip | elasticsearch_loader --es-host "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200" --with-retry --bulk-size 10000 --index pubmed-19 --type publication --id-field pub_id json --json-lines - ; done 2 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/es_tag.sh: -------------------------------------------------------------------------------- 1 | time for file in $(cat ${input}); do gsutil cat $file | gunzip | elasticsearch_loader --es-host 
"http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200" --with-retry --bulk-size 10000 --index pubmed-19-taggedtext --type taggedtext --id-field pub_id json --json-lines - ; done 2 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/publication_tmux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES=_YOUR_PATH_/loader/publ_split_* 4 | tmux start-server 5 | for f in $FILES 6 | do 7 | windowName="publ-${f: -2}" 8 | echo $windowName 9 | tmux new-session -d -s ${windowName} 10 | tmux send-keys -t ${windowName} "source ~/library-beam/venv_elastic/bin/activate" Enter 11 | #Add the dns_name here. Todo: improve it. 12 | tmux send-keys -t ${windowName} "export HOST=......" Enter 13 | tmux send-keys -t ${windowName} "export input=${f}; ./es_pub.sh" Enter 14 | done 15 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/publication_tmux_kill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES=_YOUR_PATH/loader/publ_split_* 4 | tmux start-server 5 | for f in $FILES 6 | do 7 | windowName="publ-${f: -2}" 8 | echo $windowName 9 | tmux kill-session -t ${windowName} 10 | done 11 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/taggedtext_tmux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES=_YOUR_PATH_/loader/taggedtext_split_* 4 | tmux start-server 5 | for f in $FILES 6 | do 7 | windowName="tagg-${f: -2}" 8 | echo $windowName 9 | tmux new-session -d -s ${windowName} 10 | tmux send-keys -t ${windowName} "source ~/library-beam/venv_elastic/bin/activate" Enter 11 | #Add the dns_name here. Todo: improve it. 12 | tmux send-keys -t ${windowName} "export HOST=......" 
Enter 13 | tmux send-keys -t ${windowName} "export input=${f}; ./es_tag.sh" Enter 14 | done 15 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/taggedtext_tmux_kill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES=_YOUR_PATH_/loader/taggedtext_split_* 4 | tmux start-server 5 | for f in $FILES 6 | do 7 | windowName="tagg-${f: -2}" 8 | echo $windowName 9 | tmux kill-session -t ${windowName} 10 | done 11 | -------------------------------------------------------------------------------- /load2es.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import codecs 3 | import gzip 4 | import json 5 | import logging 6 | import time 7 | from tempfile import NamedTemporaryFile 8 | 9 | from elasticsearch import Elasticsearch 10 | from elasticsearch.helpers import parallel_bulk 11 | from google.cloud import storage 12 | from tqdm import tqdm 13 | 14 | 15 | ''' 16 | tmux new-session "python load2es.py publication --es http://myes:9200" 17 | ''' 18 | 19 | NODES = 37 20 | INDEX_NAME = 'pubmed-20' 21 | DOC_TYPE = 'publication' 22 | 23 | index_config = { 24 | 'bioentity': 25 | dict(suffix='_bioentities.json.gz', 26 | index='pubmed-20-bioentity', 27 | doc_type='bioentity', 28 | mappings=None, 29 | pub_id=True), 30 | 'taggedtext': 31 | dict(suffix='_taggedtext.json.gz', 32 | index='pubmed-20-taggedtext', 33 | doc_type='taggedtext', 34 | mappings=None, 35 | pub_id=True), 36 | 'publication': 37 | dict(suffix='_small.json.gz', 38 | index='pubmed-20', 39 | doc_type='publication', 40 | mappings='publication.json', 41 | pub_id=True 42 | ), 43 | 'concept': 44 | dict(suffix='_concepts.json.gz', 45 | index='pubmed-20-concept', 46 | doc_type='concept', 47 | mappings='concept.json', 48 | pub_id=False), 49 | 50 | } 51 | 52 | 53 | def read_remote_files(bucket, filenames, index_, doc_type, use_pub_id): 54 | for file_name in filenames: 55 | for line in read_remote_file( 56 | bucket, file_name, index_, doc_type, use_pub_id): 57 | yield line 58 | 59 | 60 | def read_remote_file(bucket, file_name, index_, doc_type, use_pub_id): 61 | counter = 0 62 | while counter <= 3: # retry 3 times 63 | counter += 1 64 | try: 65 | with NamedTemporaryFile() as cache_file: 66 | # download the file to a temporary location 67 | blob = bucket.get_blob(file_name) 68 | blob.download_to_file(cache_file, ) 69 | # flush the file to make sure it is written to disk 70 | cache_file.flush() 71 | # re-open the cache file to decompress it 72 | zf = gzip.open(cache_file.name, 'rb') 73 | 74 | reader = codecs.getreader("utf-8") 75 | new_line = [] 76 | for line in reader(zf): 77 | new_line.append(line) 78 | if line[-1] == '\n': 79 | counter += 1 80 | if len(new_line) > 1: 81 | line_to_yield = ''.join(new_line) 82 | else: 83 | line_to_yield = line 84 | new_line = [] 85 | if line_to_yield: 86 | pub_id = line_to_yield.partition('"pub_id": "')[2].partition('"')[0] 87 | if not pub_id: 88 | logging.error('no pubmedid parsed for line %s' % line) 89 | else: 90 | _id = None 91 | if use_pub_id and pub_id: 92 | _id = pub_id 93 | yield { 94 | '_index': index_, 95 | '_type': doc_type, 96 | '_id': _id, 97 | '_source': line_to_yield 98 | } 99 | else: 100 | yield { 101 | '_index': index_, 102 | '_type': doc_type, 103 | '_source': line_to_yield 104 | } 105 | break 106 | except Exception as e: 107 | logging.exception('could not get file %s: %s' % (file_name, e)) 108 | pass 109 | if counter == 
3: 110 | logging.error(' file %s skipped', file_name) 111 | 112 | 113 | def get_file_names(suffix): 114 | client = storage.Client(project='open-targets-library') 115 | bucket = client.get_bucket('medline_2020_06') 116 | 117 | for i in bucket.list_blobs(prefix='splitted/'): 118 | if i.name.endswith(suffix): 119 | yield i.name 120 | 121 | 122 | if __name__ == '__main__': 123 | 124 | parser = argparse.ArgumentParser( 125 | description='Load LINK data into Elasticsearch') 126 | parser.add_argument('indices', nargs='+', 127 | help='one or more elasticsearch indexes to load') 128 | parser.add_argument('--es', dest='es', action='append', 129 | default=[], 130 | help='elasticsearch url(s)') 131 | args = parser.parse_args() 132 | 133 | # setup the google cloud storage bucket reading stuff 134 | client = storage.Client(project='open-targets-library') 135 | bucket = client.get_bucket('medline_2020_06') 136 | 137 | # prepate elasticsearch for loading 138 | valid_indices = list(set(args.indices) & set(index_config.keys())) 139 | logging.info('loading data for indices: ' + ', '.join(valid_indices)) 140 | es = Elasticsearch( 141 | hosts=args.es, 142 | max_retry=10, 143 | retry_on_timeout=True, 144 | ) 145 | for idx in valid_indices: 146 | index_data = index_config[idx] 147 | 148 | # delete any old index 149 | tqdm.write('deleting %s %s' % ( 150 | index_data['index'], es.indices.delete( 151 | index=index_data['index'], 152 | ignore=404, 153 | timeout='300s' 154 | ) 155 | )) 156 | if index_data['mappings']: 157 | tqdm.write('creating %s %s' % ( 158 | index_data['index'], es.indices.create( 159 | index=index_data['index'], 160 | ignore=400, 161 | body=json.load(open('es-mapping/' + index_data['mappings'])), 162 | timeout='30s' 163 | ) 164 | )) 165 | else: 166 | tqdm.write('creating %s %s' % ( 167 | index_data['index'], es.indices.create( 168 | index=index_data['index'], 169 | ignore=400, 170 | timeout='30s' 171 | ) 172 | )) 173 | 174 | # wait a while for index to stabilize 175 | time.sleep(15) 176 | 177 | # prepare elasticserach for bulk loading 178 | temp_index_settings = { 179 | "index": { 180 | "refresh_interval": "-1", 181 | "number_of_replicas": 0, 182 | "translog.durability": 'async', 183 | } 184 | } 185 | es.indices.put_settings(index=index_data['index'], 186 | body=temp_index_settings) 187 | 188 | # get filenames from the bucket for this index 189 | file_names = tuple(get_file_names(suffix=index_data['suffix'])) 190 | 191 | # make a generator of all the rows in all the files 192 | loaded_rows = read_remote_files( 193 | bucket, 194 | file_names, 195 | index_data['index'], 196 | index_data['doc_type'], 197 | index_data['pub_id'] 198 | ) 199 | 200 | success, failed = 0, 0 201 | with tqdm(loaded_rows, 202 | desc='loading json for index %s' % index_data['index'], 203 | unit=' docs', 204 | unit_scale=True, 205 | total=30000000 if 'concept' not in index_data['index'] else 570000000) as p_loaded_rows: 206 | 207 | 208 | # configure how many threads to load in 209 | # this should be less than 1 per elasticsearch node CPU 210 | threads = NODES * 2 211 | counter = 0 212 | 213 | # do the actual loading now 214 | for ok, item in parallel_bulk( 215 | es, p_loaded_rows, 216 | raise_on_error=True, 217 | chunk_size=1000, 218 | thread_count=threads, 219 | request_timeout=300 220 | ): 221 | 222 | if not ok: 223 | failed += 1 224 | else: 225 | success += 1 226 | counter += 1 227 | 228 | tqdm.write("uploaded %i success, %i failed\n" % (success, failed)) 229 | 230 | # return elasticsearch to non-bulk settings 231 | # this 
will make it start to reiplicate if applicable 232 | restore_index_settings = { 233 | "index": { 234 | "refresh_interval": "1s", 235 | "number_of_replicas": 1, 236 | "translog.durability": 'request', 237 | } 238 | } 239 | es.indices.put_settings(index=index_data['index'], 240 | body=restore_index_settings) 241 | 242 | -------------------------------------------------------------------------------- /modules/AbbreviationFinder.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python 2 | # -*- coding: UTF-8 -*- 3 | 4 | '''Link abbreviations to their full names | Optimised to find the longest definition 5 | 6 | 7 | Adapted and optimised from source available here: source: http://www.cnts.ua.ac.be/~vincent/scripts/abbreviations.py 8 | source made available by: Vincent Van Asch 9 | original source version: 1.2.1 10 | original alghoritm in: 11 | 12 | A Simple Algorithm for Identifying Abbreviations Definitions in Biomedical Text 13 | A. Schwartz and M. Hearst 14 | Biocomputing, 2003, pp 451-462. 15 | 16 | ''' 17 | import logging 18 | import re 19 | 20 | from textblob import TextBlob 21 | 22 | 23 | class Candidate(unicode): 24 | def __new__(cls, start, stop, str): 25 | return unicode.__new__(cls, str) 26 | 27 | def __init__(self, start, stop, str): 28 | self._start = start 29 | self._stop = stop 30 | 31 | def __getslice__(self, i, j): 32 | start = self.start + i 33 | stop = self.start + j 34 | str = unicode.__getslice__(self, i, j) 35 | return Candidate(start, stop, str) 36 | 37 | @property 38 | def start(self): 39 | '''The start index''' 40 | return self._start 41 | 42 | @property 43 | def stop(self): 44 | '''The stop index''' 45 | return self._stop 46 | 47 | 48 | 49 | 50 | 51 | class AbbreviationsParser(object): 52 | def __init__(self, verbose = False): 53 | self.encoding = 'UTF8' 54 | self.verbose = verbose 55 | self.logger = logging.getLogger(__name__) 56 | 57 | def digest(self, textblob): 58 | if isinstance(textblob, (str, unicode)): 59 | textblob = TextBlob(textblob) 60 | return list(self._digest_iterator(textblob)) 61 | 62 | def digest_as_dict(self, textblob): 63 | digested = self.digest(textblob) 64 | d = {} 65 | for i in digested: 66 | if i['short'] not in d: 67 | d[i['short']]=i['long'] 68 | return d 69 | 70 | def _digest_iterator(self, textblob): 71 | omit = 0 72 | written = 0 73 | for i, sentence in enumerate(textblob.sentences): 74 | sentence = sentence.raw 75 | # print sentence 76 | try: 77 | for candidate in self.getcandidates(sentence): 78 | try: 79 | definition = self.getdefinition(candidate, sentence) 80 | except ValueError as e: 81 | if self.verbose: 82 | self.logger.debug(str((i, 'Omitting candidate', candidate.encode(self.encoding), 'Reason:', 83 | e.args[0].encode(self.encoding)))) 84 | omit += 1 85 | else: 86 | try: 87 | definition = self.definitionselection(definition, candidate) 88 | except IndexError: 89 | if self.verbose: 90 | self.logger.debug(str((i, 'Omitting candidate', definition.encode( 91 | self.encoding), '||', candidate.encode(self.encoding)))) 92 | omit += 1 93 | except ValueError as e: 94 | if self.verbose: 95 | self.logger.debug(str((i, 'Omitting candidate', definition.encode( 96 | self.encoding), '||', candidate.encode(self.encoding), 'Reason:', 97 | e.args[0].encode(self.encoding)))) 98 | omit += 1 99 | else: 100 | 101 | cline = '%d %d %d %s' % (i, candidate.start, candidate.stop, candidate) 102 | dline = '%d %d %d %s' % (i, definition.start, definition.stop, definition) 103 | 104 | yield 
dict(short=candidate.encode(self.encoding), 105 | long=definition.encode(self.encoding)) 106 | # print cline.encode(self.encoding) 107 | # print dline.encode(self.encoding) 108 | # print 109 | 110 | written += 1 111 | except ValueError as e: 112 | if self.verbose: 113 | self.logger.debug(str(('Reason:', e.args[0].encode(self.encoding)))) 114 | 115 | def getcandidates(self, sentence): 116 | '''Yields Candidates''' 117 | delimiters = {'(': ('(', ')'), 118 | '[': ('[', ']'), 119 | '{': ('{', '}'), 120 | '<': ('<', '>'), } 121 | for delimiter in delimiters: 122 | if delimiter in sentence: 123 | del_start, del_end = delimiters[delimiter] 124 | # Check some things first 125 | if sentence.count(del_start) != sentence.count(del_end): 126 | raise ValueError('Unbalanced parentheses: %s' % sentence) 127 | 128 | if sentence.find(del_start) > sentence.find(del_end): 129 | raise ValueError('First parentheses is right: %s' % sentence) 130 | 131 | closeindex = -1 132 | while 1: 133 | # Look for open parenthesis 134 | openindex = sentence.find(del_start, closeindex + 1) 135 | 136 | if openindex == -1: 137 | break 138 | 139 | # Look for closing parantheses 140 | closeindex = openindex + 1 141 | open = 1 142 | skip = False 143 | while open: 144 | try: 145 | char = sentence[closeindex] 146 | except IndexError: 147 | # We found an opening bracket but no associated closing bracket 148 | # Skip the opening bracket 149 | skip = True 150 | break 151 | if char == del_start: 152 | open += 1 153 | elif char == del_end: 154 | open -= 1 155 | closeindex += 1 156 | 157 | if skip: 158 | closeindex = openindex + 1 159 | continue 160 | 161 | # Output if conditions are met 162 | start = openindex + 1 163 | stop = closeindex - 1 164 | str = sentence[start:stop] 165 | 166 | # Take into account whitepsace that should be removed 167 | start = start + len(str) - len(str.lstrip()) 168 | stop = stop - len(str) + len(str.rstrip()) 169 | str = sentence[start:stop] 170 | 171 | if self.conditions(str): 172 | yield Candidate(start, stop, str) 173 | 174 | def getdefinition(self, candidate, sentence): 175 | '''Takes a candidate and a sentence and returns the definition candidate. 
176 | 177 | The definintion candidate is the set of tokens (in front of the candidate) 178 | that starts with a token starting with the first character of the candidate''' 179 | # Take the tokens in front of the candidate 180 | tokens = sentence[:candidate.start - 2].lower().split() 181 | 182 | # the char that we are looking for 183 | key = candidate[0].lower() 184 | 185 | # Count the number of tokens that start with the same character as the candidate 186 | firstchars = [t[0] for t in tokens] 187 | 188 | definitionfreq = firstchars.count(key) 189 | candidatefreq = candidate.lower().count(key) 190 | 191 | # Look for the list of tokens in front of candidate that 192 | # have a sufficient number of tokens starting with key 193 | if candidatefreq <= definitionfreq: 194 | # we should at least have a good number of starts 195 | count = 0 196 | start = 0 197 | startindex = len(firstchars) - 1 198 | while count < candidatefreq: 199 | if abs(start) > len(firstchars): 200 | raise ValueError('not found') 201 | 202 | start -= 1 203 | # Look up key in the definition 204 | try: 205 | startindex = firstchars.index(key, len(firstchars) + start) 206 | except ValueError: 207 | pass 208 | 209 | # Count the number of keys in definition 210 | count = firstchars[startindex:].count(key) 211 | 212 | # We found enough keys in the definition so return the definition as a 213 | # definition candidate 214 | start = len(' '.join(tokens[:startindex])) 215 | stop = candidate.start - 2 216 | str = sentence[start:stop] 217 | 218 | # Remove whitespace 219 | start = start + len(str) - len(str.lstrip()) 220 | stop = stop - len(str) + len(str.rstrip()) 221 | str = sentence[start:stop] 222 | 223 | return Candidate(start, stop, str) 224 | 225 | 226 | else: 227 | # print 'S', sentence 228 | # print >>sys.stderr, 'KEY', key 229 | # print >>sys.stderr, 'TOKENS', tokens 230 | # print >>sys.stderr, 'ABBREV', candidate 231 | raise ValueError('There are less keys in the tokens in front of candidate than there are in the candidate') 232 | 233 | def definitionselection(self, definition, abbrev,): 234 | '''Takes a definition candidate and an abbreviation candidate 235 | and returns True if the chars in the abbreviation occur in the definition 236 | 237 | Based on 238 | A simple algorithm for identifying abbreviation definitions in biomedical texts, Schwartz & Hearst''' 239 | 240 | def get_matches(): 241 | '''yield a list of possible dfinitions''' 242 | if len(definition) < len(abbrev): 243 | raise ValueError('Abbreviation is longer than definition') 244 | 245 | if abbrev in definition.split(): 246 | raise ValueError('Abbreviation is full word of definition') 247 | 248 | sindex = -1 249 | lindex = -1 250 | 251 | while 1: 252 | try: 253 | longchar = definition[lindex].lower() 254 | except IndexError: 255 | break 256 | 257 | shortchar = abbrev[sindex].lower() 258 | 259 | if not shortchar.isalnum(): 260 | sindex -= 1 261 | 262 | if sindex == -1 * len(abbrev): 263 | if shortchar == longchar: 264 | if lindex == -1 * len(definition) or not definition[lindex - 1].isalnum(): 265 | yield definition[lindex:len(definition)] 266 | lindex -= 1 267 | if lindex == -1 * len(definition): 268 | break 269 | 270 | else: 271 | lindex -= 1 272 | 273 | if lindex == -1 * (len(definition) + 1): 274 | raise ValueError('definition of "%s" not found in "%s"' % (abbrev, definition)) 275 | 276 | else: 277 | if shortchar == longchar: 278 | sindex -= 1 279 | lindex -= 1 280 | else: 281 | lindex -= 1 282 | 283 | definitions = list(get_matches()) 284 | if not 
definitions: 285 | raise IndexError('no matching definition found') 286 | definition = definitions[0] 287 | for i in definitions: 288 | if len(i) > len(definition): 289 | definition = i 290 | tokens = len(definition.split()) 291 | length = len(abbrev) 292 | 293 | if tokens > min([length + 5, length * 2]): 294 | raise ValueError('did not meet min(|A|+5, |A|*2) constraint') 295 | 296 | return definition 297 | 298 | def conditions(self, str): 299 | '''Based on Schwartz&Hearst 300 | 301 | 2 <= len(str) <= 10 302 | len(tokens) <= 2 303 | re.search('[A-Za-z]', str) 304 | str[0].isalnum() 305 | 306 | and extra: 307 | if it matches ([A-Za-z]\. ?){2,} 308 | it is a good candidate. 309 | 310 | ''' 311 | # import nltk 312 | # if nltk.re.match('([A-Za-z]\. ?){2,}', str.lstrip()): 313 | # return True 314 | if len(str) < 2 or len(str) > 10: 315 | return False 316 | if len(str.split()) > 2: 317 | return False 318 | if not re.search('[A-Za-z]', str): 319 | return False 320 | if not str[0].isalnum(): 321 | return False 322 | 323 | return True 324 | -------------------------------------------------------------------------------- /modules/BioentityTagger.py: -------------------------------------------------------------------------------- 1 | import ahocorasick 2 | import logging 3 | import string 4 | import sys 5 | import time 6 | import unicodedata 7 | 8 | import requests 9 | from fuzzywuzzy import fuzz 10 | from rope.base.codeanalyze import ChangeCollector 11 | 12 | from BioStopWords import DOMAIN_STOP_WORDS 13 | from modules.vocabulary import vocabulary_urls 14 | 15 | unicode_punctation_table = dict.fromkeys(i for i in xrange(sys.maxunicode) 16 | if unicodedata.category(unichr(i)).startswith('P')) 17 | 18 | 19 | class BioEntityTagger(object): 20 | separators_all = [' ', '.', ',', ';', ':', ')', ']', '(', '[', '{', '}', '/', '\\', '"', "'", '?', '!', '<', '>', 21 | '+', '-'] 22 | 23 | def __init__(self, 24 | partial_match=False, 25 | ignorecase=True, 26 | stopwords=None): 27 | ''' 28 | 29 | :param partial_match: allow for matching a non clomplete word 30 | :param ignorecase: case sensitive or not 31 | :param stopwords: stopwords to skip, defaults to a very broad list 32 | ''' 33 | self.A = ahocorasick.Automaton() 34 | self.partial_match = partial_match 35 | self.ignorecase = ignorecase 36 | if stopwords is None: 37 | stopwords = DOMAIN_STOP_WORDS 38 | idx = 0 39 | s = requests.Session() 40 | '''get the dictionaries from remote files''' 41 | for dictionary_url in vocabulary_urls: 42 | max_retry = 3 43 | retry = 0 44 | while retry < max_retry: 45 | dictionary_request = s.get(dictionary_url) 46 | if not dictionary_request.ok: 47 | time.sleep(1) 48 | retry += 1 49 | else: 50 | break 51 | if not dictionary_request.ok: 52 | logging.error('cannot download dictionary %s, skipped' % dictionary_url) 53 | continue 54 | dictionary = dictionary_request.json() 55 | category, reference_db = dictionary_url.split('/')[-1].split('.')[0].split('_')[0].split('-') 56 | '''load the elements in the Automation if they are not too short or are stopwords''' 57 | for element, element_data in dictionary.items(): 58 | ids = element_data['ids'] 59 | pref_name = element_data['pref_name'] 60 | if len(element) > 2: 61 | element_str = element.encode('utf-8') 62 | if ((len(element_str) < 5) and (element_str not in stopwords) or \ 63 | (len(element_str) >= 5) and (element_str.lower() not in stopwords)): 64 | idx += 1 65 | if self.ignorecase: 66 | element_match = element_str.lower() 67 | else: 68 | element_match = element_str 69 | 
self.add_tag(element_match, 70 | idx, 71 | category, 72 | reference_db, 73 | [i.encode('utf-8') for i in ids], 74 | element, 75 | element_match, 76 | pref_name) 77 | '''handle elements with dashes by also creating a copy without''' 78 | if '-' in element_match: 79 | element_match_without_dash = element_match.replace('-', '') 80 | if len(element_match_without_dash) > 2: 81 | self.add_tag(element_match_without_dash, 82 | idx, 83 | category, 84 | reference_db, 85 | [i.encode('utf-8') for i in ids], 86 | element, 87 | element_match_without_dash, 88 | pref_name) 89 | '''if supporting partial match''' 90 | if self.partial_match: 91 | for longest_token in element.split(): 92 | if longest_token != element and len( 93 | longest_token) > 5 and longest_token.lower() not in stopwords: 94 | self.add_tag(longest_token, 95 | idx, 96 | category + '-TOKEN', 97 | reference_db, 98 | [i.encode('utf-8') for i in ids], 99 | element, 100 | longest_token, 101 | pref_name) 102 | 103 | s.close() 104 | self.A.make_automaton() 105 | 106 | def add_tag(self, element_text, idx, category, reference_db, ids, element, match, pref_name): 107 | unique_resource_key = category + '|' + reference_db 108 | category_insert = [category] 109 | reference_db_insert = [reference_db] 110 | ids_insert = [[i.encode('utf-8') for i in ids]] 111 | previous_annotation = self.A.get(element_text, None) 112 | 113 | if previous_annotation is None: 114 | annotation = [idx, 115 | category_insert, 116 | reference_db_insert, 117 | ids_insert, 118 | element, 119 | match, 120 | pref_name] 121 | 122 | self.A.add_word(element_text, 123 | annotation) 124 | else: 125 | previous_keys = [] 126 | for j in range(len(previous_annotation[1])): 127 | previous_keys.append(previous_annotation[1][j] + '|' + previous_annotation[2][j]) 128 | if unique_resource_key not in previous_keys: 129 | previous_annotation[1].extend(category_insert) 130 | previous_annotation[2].extend(reference_db_insert) 131 | previous_annotation[3].extend(ids_insert) # TODO: might need to merge addidional ids if the 132 | # uniquekey is passed before 133 | self.A.add_word(element_text, previous_annotation) 134 | 135 | def tag(self, text): 136 | return self._tag(text, self.A, self.ignorecase) 137 | 138 | @staticmethod 139 | def _tag(text, automation, ignorecase=True): 140 | ''' 141 | finds tags in a text 142 | :param text: text to tag 143 | :param automation: automation to use 144 | :param ignorecase: deafault to True 145 | :return: 146 | ''' 147 | if isinstance(text, unicode): 148 | text_to_tag = text.encode('utf-8') 149 | else: 150 | text_to_tag = text 151 | if ignorecase: 152 | text_to_tag = text_to_tag.lower() 153 | matches = [] 154 | for i in automation.iter(text_to_tag.lower()): 155 | if len(i[1]) < 7: 156 | print i 157 | for end_index, (insert_order, category_list, reference_db_list, entity_id_list, original_value, match, 158 | pref_name) in automation.iter(text_to_tag.lower()): 159 | start_index = end_index - len(match) + 1 160 | end_index += 1 161 | 162 | if (start_index == 0 or text_to_tag[start_index - 1] in BioEntityTagger.separators_all) and \ 163 | (end_index == len(text_to_tag) or text_to_tag[end_index] in BioEntityTagger.separators_all): 164 | for j in range(len(category_list)): 165 | category = category_list[j] 166 | reference_db = reference_db_list[j] 167 | entity_id = entity_id_list[j] 168 | if isinstance(entity_id, list): 169 | entity_id = entity_id[0] 170 | if category.endswith('-TOKEN'): 171 | pre, post = original_value.split(match)[:2] 172 | potential_match = 
text_to_tag[start_index:end_index + len(post)] 173 | score = fuzz.token_sort_ratio(original_value, potential_match) 174 | if score > 90: 175 | tag = MatchedTag(match, start_index, end_index, category.replace('-TOKEN', ''), 176 | reference_db, 177 | entity_id, original_value, pref_name) 178 | matches.append(tag.__dict__) 179 | else: 180 | tag = MatchedTag(match, start_index, end_index, category, reference_db, entity_id, 181 | original_value, pref_name) 182 | matches.append(tag.__dict__) 183 | else: 184 | pass 185 | 186 | grouped_matches = BioEntityTagger.group_matches_by_category_and_reference(matches) 187 | filtered_matches = [] 188 | for group, matches_in_group in grouped_matches.items(): 189 | non_nested_matches = BioEntityTagger.remove_nested_matches(matches_in_group) 190 | filtered_matches.extend(non_nested_matches) 191 | 192 | return filtered_matches 193 | 194 | @staticmethod 195 | def group_matches_by_category_and_reference(matches): 196 | grouped_by_category_type = {} 197 | for match in matches: 198 | key = match['category'] + '|' + match['reference_db'] 199 | if key not in grouped_by_category_type: 200 | grouped_by_category_type[key] = [] 201 | grouped_by_category_type[key].append(match) 202 | 203 | return grouped_by_category_type 204 | 205 | @staticmethod 206 | def remove_nested_matches(matches): 207 | filtered_matches = [] 208 | sorted_matches = sorted(matches, key=lambda x: (x['start'], -x['end'])) 209 | for i, tag_i in enumerate(sorted_matches): 210 | keep = True 211 | for j, tag_j in enumerate(sorted_matches): 212 | if i != j: 213 | if tag_j['start'] <= tag_i['start'] <= tag_j['end'] and \ 214 | tag_j['start'] <= tag_i['end'] <= tag_j['end']: 215 | keep = False 216 | break 217 | elif tag_j['start'] > tag_i['start']: 218 | break 219 | else: 220 | pass 221 | if keep: 222 | filtered_matches.append(tag_i) 223 | return filtered_matches 224 | 225 | @staticmethod 226 | def mark_tags_in_text(text, matches): 227 | ''' 228 | produce a text with the tags written as markup 229 | :param text: text to tags 230 | :param matches: tags to encode 231 | :return: 232 | ''' 233 | text_to_tag = text 234 | tagged_abstract = '' 235 | if isinstance(text, unicode): 236 | text_to_tag = text.encode('utf-8') 237 | try: 238 | tagged_abstract = ChangeCollector(text_to_tag) 239 | for i, tag in enumerate( 240 | sorted(matches, key=lambda x: (x['start'], -x['end']))): 241 | if isinstance(tag['reference'], (list, tuple)): 242 | tag_reference = '|'.join(tag['reference']) 243 | else: 244 | tag_reference = tag['reference'] 245 | tagged_abstract.add_change(tag['start'], tag['start'], 246 | '' % ( 247 | str(i), tag['category'], tag['reference_db'], tag_reference)) 248 | tagged_abstract.add_change(tag['end'], tag['end'], '' % str(i)) 249 | tagged_abstract = '
%s
' % tagged_abstract.get_changed() 250 | except UnicodeDecodeError: 251 | logging.error('cannot generate maked text for unicode decode error') 252 | return tagged_abstract 253 | 254 | @staticmethod 255 | def get_tags_in_range(matches, start, end): 256 | filtered_tag = [] 257 | for t in matches: 258 | if start <= t['start'] <= end and \ 259 | start <= t['end'] <= end: 260 | filtered_tag.append(t) 261 | elif t['end'] > end: 262 | break 263 | 264 | return filtered_tag 265 | 266 | @staticmethod 267 | def get_tag_by_match(tags, match): 268 | matched_tags = [] 269 | for tag in tags: 270 | if tag['match'].lower() == match.lower(): 271 | matched_tags.append(match) 272 | return [] 273 | 274 | @staticmethod 275 | def extend_tags_to_alternative_forms(text, extended_forms): 276 | A = ahocorasick.Automaton() 277 | for text_to_match, payload in extended_forms.items(): 278 | A.add_word(text_to_match.lower(), 279 | [0, [payload['category']], [payload['reference_db']], [payload['reference']], 280 | payload['original_value'], 281 | text_to_match.lower(), payload['label']]) 282 | A.make_automaton() 283 | 284 | return BioEntityTagger._tag(text, A, ) 285 | 286 | 287 | class MatchedTag(object): 288 | def __init__(self, 289 | match, 290 | start, 291 | end, 292 | category, 293 | reference_db, 294 | reference, 295 | original_value, 296 | label, 297 | sentence=None 298 | ): 299 | self.match = match 300 | self.start = start 301 | self.end = end 302 | self.category = category 303 | self.reference_db = reference_db 304 | self.reference = reference 305 | self.original_value = original_value 306 | self.label = label 307 | self.sentence = sentence 308 | 309 | @staticmethod 310 | def sanitize_string(s): 311 | if isinstance(s, unicode): 312 | return s.translate(unicode_punctation_table) 313 | elif isinstance(s, str): 314 | return unicode(s.translate(string.maketrans(' ', '_'), string.punctuation)) 315 | else: 316 | return u'' 317 | 318 | # TODO: use inflection.table.ascii from SPECIALIST lexicon to enhance matching forms 319 | -------------------------------------------------------------------------------- /modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opentargets-archive/library-beam/dcf08a6b09a7b11faff1a25655c47d363e18d5d2/modules/__init__.py -------------------------------------------------------------------------------- /modules/vocabulary.py: -------------------------------------------------------------------------------- 1 | vocabulary_urls= [ 2 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/ANATOMY-MESH.json", 3 | # "https://storage.googleapis.com/opentargets-vocabularies/ANTROPOLOGY-MESH.json", 4 | # "https://storage.googleapis.com/opentargets-vocabularies/CHEMICAL-MESH.json", 5 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/DIAGNOSTICS-MESH.json", 6 | # "https://storage.googleapis.com/opentargets-vocabularies/DISCIPLINE-MESH.json", 7 | # "https://storage.googleapis.com/opentargets-vocabularies/DISEASE-EPMC.json", 8 | # "https://storage.googleapis.com/opentargets-vocabularies/DISEASE-MESH.json", 9 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/DISEASE-OPENTARGETS.json", 10 | # "https://storage.googleapis.com/opentargets-vocabularies/GENE-EPMC.json", 11 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/HEALTHCARE-MESH.json", 12 | # "https://storage.googleapis.com/opentargets-vocabularies/HUMANITIES-MESH.json", 13 | # 
"https://storage.googleapis.com/opentargets-vocabularies/INFORMATIONSCIENCE-MESH.json", 14 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/LOC-MESH.json", 15 | # "https://storage.googleapis.com/opentargets-vocabularies/NAMEDGROUP-MESH.json", 16 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/ORGANISM-MESH.json", 17 | # "https://storage.googleapis.com/opentargets-vocabularies/PATHWAY-OPENTARGETS.json", 18 | # "https://storage.googleapis.com/opentargets-vocabularies/PHENOTYPE-EPMC.json", 19 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/PROCESS-MESH.json", 20 | # "https://storage.googleapis.com/opentargets-vocabularies/PROTEINCOMPLEX-CHEMBL.json", 21 | # "https://storage.googleapis.com/opentargets-vocabularies/PROTEINCOMPLEX-COMPLEXPORTAL.json", 22 | # "https://storage.googleapis.com/opentargets-vocabularies/PROTEINCOMPLEX-CORUM.json", 23 | # "https://storage.googleapis.com/opentargets-vocabularies/PROTEINCOMPLEX-GO.json", 24 | # "https://storage.googleapis.com/opentargets-vocabularies/PSICHIATRY-MESH.json", 25 | # "https://storage.googleapis.com/opentargets-vocabularies/PUBLICATION-MESH.json", 26 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/GENE-OPENTARGETS.json", 27 | # "https://storage.googleapis.com/opentargets-vocabularies/TECHNOLOGY-MESH.json" 28 | # "https://storage.googleapis.com/opentargets-vocabularies/GENE-LEXEBI.json", 29 | # "https://storage.googleapis.com/opentargets-vocabularies/DISEASE-LEXEBI.json", 30 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/PHENOTYPE-HPO.json", 31 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/DRUG-CHEMBL.json" 32 | # "https://storage.googleapis.com/opentargets-vocabularies/DISEASEALT-MONDO.json" 33 | ] 34 | -------------------------------------------------------------------------------- /publication_alias.sh: -------------------------------------------------------------------------------- 1 | curl -XPOST 'http://esurl:9200/_aliases?pretty' -H 'Content-Type: application/json' -d ' 2 | { 3 | "actions": [ 4 | {"add": {"index": "pubmed-18", "alias": "!publication-data"}} 5 | ] 6 | } ' 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """Setup.py module for the workflow's worker utilities. 19 | 20 | All the workflow related code is gathered in a package that will be built as a 21 | source distribution, staged in the staging area for the workflow being run and 22 | then installed in the workers when they start running. 
23 | 24 | This behavior is triggered by specifying the --setup_file command line option 25 | when running the workflow for remote execution. 26 | """ 27 | 28 | import subprocess 29 | from distutils.command.build import build as _build 30 | 31 | import setuptools 32 | 33 | 34 | # This class handles the pip install mechanism. 35 | class build(_build): # pylint: disable=invalid-name 36 | """A build command class that will be invoked during package install. 37 | 38 | The package built using the current setup.py will be staged and later 39 | installed in the worker using `pip install package'. This class will be 40 | instantiated during install for this specific scenario and will trigger 41 | running the custom commands specified. 42 | """ 43 | sub_commands = _build.sub_commands + [('CustomCommands', None)] 44 | 45 | 46 | # Some custom command to run during setup. The command is not essential for this 47 | # workflow. It is used here as an example. Each command will spawn a child 48 | # process. Typically, these commands will include steps to install non-Python 49 | # packages. For instance, to install a C++-based library libjpeg62 the following 50 | # two commands will have to be added: 51 | # 52 | # ['apt-get', 'update'], 53 | # ['apt-get', '--assume-yes', install', 'libjpeg62'], 54 | # 55 | # First, note that there is no need to use the sudo command because the setup 56 | # script runs with appropriate access. 57 | # Second, if apt-get tool is used then the first command needs to be 'apt-get 58 | # update' so the tool refreshes itself and initializes links to download 59 | # repositories. Without this initial step the other apt-get install commands 60 | # will fail with package not found errors. Note also --assume-yes option which 61 | # shortcuts the interactive confirmation. 62 | # 63 | # The output of custom commands (including failures) will be logged in the 64 | # worker-startup log. 65 | CUSTOM_COMMANDS = [ 66 | ['apt-get', 'update'], 67 | ['apt-get', '--assume-yes', 'install', 'libxml2-dev', 'wget', 'unzip'], 68 | ['pip', 'install', 69 | #'https://github.com/explosion/spacy-models/releases/download/en_depent_web_md-1.2.1/en_depent_web_md-1.2.1.tar.gz', 70 | 'https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.0/en_core_web_lg-2.2.0.tar.gz', 71 | #'nltk' 72 | ], 73 | 74 | ['python', '-m', 'nltk.downloader', 'brown', 'punkt', 'wordnet', 'averaged_perceptron_tagger', 'conll2000', 75 | 'stopwords'] 76 | ] 77 | 78 | 79 | class CustomCommands(setuptools.Command): 80 | """A setuptools Command class able to run arbitrary commands.""" 81 | 82 | def initialize_options(self): 83 | pass 84 | 85 | def finalize_options(self): 86 | pass 87 | 88 | def RunCustomCommand(self, command_list): 89 | print 'Running command: %s' % command_list 90 | p = subprocess.Popen( 91 | command_list, 92 | stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 93 | # Can use communicate(input='y\n'.encode()) if the command run requires 94 | # some confirmation. 95 | stdout_data, stdout_err = p.communicate() 96 | print 'Command output: %s | Command err: %s' % (stdout_data, stdout_err) 97 | if p.returncode != 0: 98 | raise RuntimeError( 99 | 'Command %s failed: exit code: %s' % (command_list, p.returncode)) 100 | 101 | def run(self): 102 | for command in CUSTOM_COMMANDS: 103 | self.RunCustomCommand(command) 104 | 105 | 106 | # Configure the required packages and scripts to install. 
107 | # Note that the Python Dataflow containers come with numpy already installed 108 | # so this dependency will not trigger anything to be installed unless a version 109 | # restriction is specified. 110 | # Note numpy >=1.17 is python3 only 111 | # Note more-itertools >=6.0.0 is python3 only 112 | REQUIRED_PACKAGES = [ 113 | 'numpy==1.16.5', 114 | 'more-itertools==5.0.0', 115 | 'apache-beam[gcp]==2.16.0', 116 | 'spacy==2.2.2', 117 | 'python-Levenshtein==0.12.0', 118 | 'fuzzywuzzy==0.17.0', 119 | 'elasticsearch==7.0.5', 120 | 'lxml==4.4.1', 121 | 'textblob==0.15.3', 122 | 'pyahocorasick==1.4.0', 123 | 'rope==0.14.0', 124 | 'unidecode==1.1.1' 125 | ] 126 | 127 | setuptools.setup( 128 | name='opentargets-library-beam', 129 | version='0.0.2', 130 | description='ETL for opentargets library running on beam', 131 | install_requires=REQUIRED_PACKAGES, 132 | # dependency_links=['https://github.com/explosion/spacy-models/releases/download/en_core_web_md-1.2.1 133 | # /en_core_web_md-1.2.1.tar.gz#egg=en_core_web_md-1.2.1'], 134 | packages=setuptools.find_packages(), 135 | cmdclass={ 136 | # Command class instantiated and run during pip install scenarios. 137 | 'build': build, 138 | 'CustomCommands': CustomCommands, 139 | } 140 | ) 141 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opentargets-archive/library-beam/dcf08a6b09a7b11faff1a25655c47d363e18d5d2/tests/__init__.py -------------------------------------------------------------------------------- /tests/resources/common_words_as_genes.txt: -------------------------------------------------------------------------------- 1 | The NOD mouse, which spontaneously develops insulitis and overt diabetes, is a model of autoimmune type I diabetes mellitus. For the precise analysis of the roles of CD4+ and CD8+ T cells in the pathogenesis of this mouse, these subsets must be transferred into recipients that are completely free of T cells and pathological changes. We used athymic NOD nude mice, which congenitally lack mature T cells and are free of insulitis and hyperglycemia up to the age of 60 weeks, as recipients for this purpose. To the nude recipients we transferred either one of a highly purified CD4+ or CD8+ T cell subset derived from non-diabetic female NOD mice; any in vivo increase in the contaminating T cell subsets was prevented by injecting the antibody homologous to it. Most of the T cell-reconstituted recipients were treated with cyclophosphamide to promote the onset of overt diabetes. Transfer of the CD8+ T cell subset alone did not induce insulitis or hyperglycemia. In contrast, transfer of the CD4+ T cell subset alone produced insulitis, but not hyperglycemia, in all the recipients. However, the subsequent transfer of CD8+ T cells into CD4+ T cell-reconstituted recipients induced severe insulitis and hyperglycemia in almost all the recipients. In these diabetic recipients, we observed severe damage of the pancreatic islets and the infiltration of a large number of CD8+ T cells into the remaining islets; insulin-secreting beta cells were no longer detected. These results suggest that CD4+ T cells play a predominant role in the development of insulitis and that CD8+ T cells migrate into the islets and are subsequently, with the aid of CD4+ T cells, differentiated into killer cells which act against beta cells. 
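The fixture above (`tests/resources/common_words_as_genes.txt`) is an abstract whose ordinary words (e.g. "NOD") collide with gene symbols, which is exactly what the stop-word filtering in `modules/BioentityTagger.py` is meant to absorb. A minimal sketch of running the tagger over it, in the style of `tests/test_tagger.py`; note that constructing `BioEntityTagger` downloads the vocabulary JSON files listed in `modules/vocabulary.py`, so this assumes those URLs are still reachable and that the script is run from the repository root under Python 2.

```python
# -*- coding: UTF-8 -*-
# Hedged sketch (Python 2, matching the repo): tag the common-words fixture and
# print what the tagger matched. The file path and the expectation that words
# like "NOD" are filtered out are illustrative assumptions, not repo assertions.
from modules.BioentityTagger import BioEntityTagger

tagger = BioEntityTagger()  # downloads the remote vocabularies on construction

with open('tests/resources/common_words_as_genes.txt') as handle:
    abstract = handle.read()

for tag in tagger.tag(abstract.lower()):
    # each tag is a MatchedTag.__dict__ with offsets into the tagged text
    print('%s\t%s\t%s' % (tag['category'], tag['reference_db'],
                          abstract[tag['start']:tag['end']]))
```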
-------------------------------------------------------------------------------- /tests/resources/test-medlinexml/test_baseline.xml.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opentargets-archive/library-beam/dcf08a6b09a7b11faff1a25655c47d363e18d5d2/tests/resources/test-medlinexml/test_baseline.xml.gz -------------------------------------------------------------------------------- /tests/resources/test-medlinexml/test_update.xml.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opentargets-archive/library-beam/dcf08a6b09a7b11faff1a25655c47d363e18d5d2/tests/resources/test-medlinexml/test_update.xml.gz -------------------------------------------------------------------------------- /tests/resources/test-spacy/disease.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /tests/resources/test-spacy/geneProt.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /tests/resources/test_abstract_lexebi.txt: -------------------------------------------------------------------------------- 1 | Autopsy studies of Alzheimer's disease (AD) have found that neurofibrillary tangle (NFT) pathology of the medial temporal lobe (MTL) demonstrates selective topography with relatively stereotyped subregional involvement at early disease stages, prompting interest in more granular measurement of these structures with in vivo magnetic resonance imaging. We applied a novel, automated method for measurement of hippocampal subfields and extrahippocampal MTL cortical regions. The cohort included cognitively normal (CN) adults (n = 86), early mild cognitive impairment (n = 43), late MCI (n = 22), and mild AD (n = 40) patients from the Alzheimer's Disease Neuroimaging Initiative (ADNI). For pseudolongitudinal analysis of the continuum from preclinical to mild AD dementia, the groups were further divided according to amyloid status based on positron emission tomography. Specific subregions associated with the early NFT pathology of AD were more sensitive to preclinical and early prodromal AD than whole hippocampal volume while more diffuse involvement was found in later stages. In particular, BA35, the first region associated with NFT deposition, was the only region to discriminate preclinical AD from amyloid negative cognitively normal adults ("normal aging"). In general, patterns of atrophy in the pseudolongitudinal analysis largely recapitulated Braak staging of NFTs within the MTL. 2 | Aquaporin-4 (AQP4)-specific T cells are expanded in neuromyelitis optica (NMO) patients and exhibit Th17 polarization. However, their pathogenic role in CNS autoimmune inflammatory disease is unclear. 
Although multiple AQP4 T-cell epitopes have been identified in WT C57BL/6 mice, we observed that neither immunization with those determinants nor transfer of donor T cells targeting them caused CNS autoimmune disease in recipient mice. In contrast, robust proliferation was observed following immunization of AQP4-deficient (AQP4-/-) mice with AQP4 peptide (p) 135-153 or p201-220, peptides predicted to contain I-Ab-restricted T-cell epitopes but not identified in WT mice. In comparison with WT mice, AQP4-/- mice used unique T-cell receptor repertoires for recognition of these two AQP4 epitopes. Donor T cells specific for either determinant from AQP4-/-, but not WT, mice induced paralysis in recipient WT and B-cell-deficient mice. AQP4-specific Th17-polarized cells induced more severe disease than Th1-polarized cells. Clinical signs were associated with opticospinal infiltrates of T cells and monocytes. Fluorescent-labeled donor T cells were detected in CNS lesions. Visual system involvement was evident by changes in optical coherence tomography. Fine mapping of AQP4 p201-220 and p135-153 epitopes identified peptides within p201-220 but not p135-153, which induced clinical disease in 40% of WT mice by direct immunization. Our results provide a foundation to evaluate how AQP4-specific T cells contribute to AQP4-targeted CNS autoimmunity (ATCA) and suggest that pathogenic AQP4-specific T-cell responses are normally restrained by central tolerance, which may be relevant to understanding development of AQP4-reactive T cells in NMO. 3 | Leucine rich repeat kinase 2 (LRRK2) is a promising target for the treatment of Parkinson's disease; however, little is known about the expression of LRRK2 in human brain and if/how LRRK2 protein levels are altered in Parkinson's disease.We measured the protein levels of LRRK2 as well as its phosphorylation on serines 910, 935, and 973 in the postmortem brain tissue of Parkinson's disease patients and aged controls with and without Lewy bodies.LRRK2 and its phosphorylation were measured by immunoblot in brain regions differentially affected in Parkinson's disease (n = 30) as well as subjects with Lewy bodies restricted to the periphery and lower brain stem (n = 25) and matched controls without pathology (n = 25).LRRK2 levels were increased in cases with restricted Lewy bodies, with a 30% increase measured in the substantia nigra. In clinical Parkinson's disease, levels of LRRK2 negatively correlated to disease duration and were comparable with controls. LRRK2 phosphorylation, however, particularly at serine 935, was reduced with clinical Parkinson's disease with a 36% reduction measured in the substantia nigra.Our data show that LRRK2 phosphorylation is reduced with clinical PD, whereas LRRK2 expression is increased in early potential prodromal stages. These results contribute to a better understanding of the role of LRRK2 in idiopathic Parkinson's disease and may aid efforts aimed at therapeutically targeting the LRRK2 protein. © 2016 International Parkinson and Movement Disorder Society. 4 | Leucine-rich repeat kinase 2 (LRRK2) is a central protein in the pathogenesis of Parkinson's disease (PD), yet its normal function has proved stubbornly hard to elucidate. Even though it remains unclear how pathogenic mutations affect LRRK2 to cause PD, recent findings provide increasing cause for optimism. We summarise here the developing consensus over the effect of pathogenic mutations in the Ras of complex proteins and C-terminal of Roc domains on LRRK2 GTPase activity. 
This body of work has been greatly reinforced by our own study of the protective R1398H variant contained within the LRRK2 GTPase domain. Collectively, data point towards the pathogenicity of GTP-bound LRRK2 and strengthen a working model for LRRK2 GTPase function as a GTPase activated by dimerisation. Together with the identification of the protective R1398H variant as a valuable control for pathogenic mutations, we have no doubt that these triumphs for the LRRK2 field will accelerate research towards resolving LRRK2 function and towards new treatments for PD. 5 | Non-coagulating (NC) milk, defined as milk not coagulating within 40 min after rennet-addition, can have a negative influence on cheese production. Its prevalence is estimated at 18% in the Swedish Red (SR) cow population. Our study aimed at identifying genomic regions and causal variants associated with NC milk in SR cows, by doing a GWAS using 777k SNP genotypes and using imputed sequences to fine map the most promising genomic region. Phenotypes were available from 382 SR cows belonging to 21 herds in the south of Sweden, from which individual morning milk was sampled. NC milk was treated as a binary trait, receiving a score of one in case of non-coagulation within 40 min. For all 382 SR cows, 777k SNP genotypes were available as well as the combined genotypes of the genetic variants of αs1-β-κ-caseins. In addition, whole-genome sequences from the 1000 Bull Genome Consortium (Run 3) were available for 429 animals of 15 different breeds. From these sequences, 33 sequences belonged to SR and Finish Ayrshire bulls with a large impact in the SR cow population. Single-marker analyses were run in ASReml using an animal model. After fitting the casein loci, 14 associations at -Log10(P-value) > 6 identified a promising region located on BTA18. We imputed sequences to the 382 genotyped SR cows using Beagle 4 for half of BTA18, and ran a region-wide association study with imputed sequences. In a seven mega base-pairs region on BTA18, our strongest association with NC milk explained almost 34% of the genetic variation in NC milk. Since it is possible that multiple QTL are in strong LD in this region, 59 haplotypes were built, genetically differentiated by means of a phylogenetic tree, and tested in phenotype-genotype association studies. Haplotype analyses support the existence of one QTL underlying NC milk in SR cows. A candidate gene of interest is the VPS35 gene, for which one of our strongest association is an intron SNP in this gene. The VPS35 gene belongs to the mammary gene sets of pre-parturient and of lactating cows. 6 | In neuromyelitis optica (NMO), one of the underlying pathogenic mechanisms is the formation of antigen-antibody complexes which can trigger an inflammatory response by inducing the infiltration of neutrophils in lesions. Epithelial neutrophil-activating peptide 78 (ENA 78), known as Chemokine (C-X-C motif) ligand 5 (CXCL5), belongs to the ELR-CXCL family. It recruits and activates neutrophils. The aim of this study was to evaluate ENA 78, IL-1β and TNF-α plasma levels in multiple sclerosis (MS) and neuromyelitis optica (NMO) patients.ENA 78, IL-1β and TNF-α plasma levels were detected in 20 healthy controls (HC), 25 MS and 25 NMO patients using MILLIPLEX® map Human High Sensitivity Cytokine/Chemokine Panels.Plasma levels of ENA 78 were significantly higher in NMO patients than in HC (P < 0.001) and MS patients (P < 0.05). The NMO patients showed higher plasma levels of IL-1β compared with HC (P < 0.01). 
Further, increased plasma levels of TNF-α were found in the MS (P < 0.05) and NMO patients (P < 0.001). In addition, NMO patients had higher Expanded Disability Status Scale (EDSS) scores compared with MS patients (P < 0.05). EDSS scores were correlated with plasma levels of ENA 78 in NMO patients (P < 0.05). There were no significant correlations between EDSS scores and plasma levels of ENA 78 in MS patients (P > 0.05).The overproduction of pro-inflammatory cytokines such as IL-1β and TNF-α during the remission of NMO activates ENA 78, which in turn leads to neutrophil infiltration in lesions. ENA 78 plasma levels were correlated with EDSS scores in NMO patients. Elevated secretion of ENA 78 may be a critical step in neutrophil recruitment during the remission of NMO. 7 | The tumor necrosis factor like weak inducer of apoptosis (TWEAK) and its receptor, fibroblast growth factor-inducible 14 (Fn14), mediate inflammation and neuronal apoptosis in cerebral edema, ischemic stroke and multiple sclerosis. The downstream effectors and pathways linked to TWEAK-Fn14 signaling are strongly implicated in the pathology of Parkinson's disease (PD), thus indicating a putative role for TWEAK/Fn14 signaling in PD neurodegeneration. Using the 1-methyl-4-phenyl-1,2,3,6-tetrahydropyridine (MPTP) mouse model, we aimed to determine whether genetic ablation or pharmacologic mitigation of the TWEAK protein and its Fn14 receptor affected substantia nigra and striatum Parkinsonian pathology. Changes in endogenous TWEAK protein expression were also quantified in tissue from both MPTP-treated mice and PD human samples. TWEAK protein expression was transiently increased in the striatal tissue but remained unaltered in substantia nigra tissue of MPTP-treated mice. There was also no change of TWEAK protein levels in the substantia nigra or the striatum of human PD patients as compared to matched control subjects. Mitigating the effects of endogenous TWEAK protein using neutralizing antibody did affect MPTP-mediated neurotoxicity in the substantia nigra using the sub-acute model of MPTP (30mg/kg i.p. over five consecutive days). Neither TWEAK nor Fn14 genetic ablation led to attenuation of MPTP-toxicity in the acute model. These findings suggest that TWEAK signaling might be an aspect of MPTP-mediated neuropathology and be involved in the overall neurodegenerative pathology of PD. 8 | Severe damage to the blood-brain barrier (BBB) allows anti-aquaporin 4 (AQP4) antibodies to access the astrocytic endfeet in neuromyelitis optica (NMO). In the current study, we identified the pathogenic cytokines/chemokines that are responsible for the BBB malfunction induced by NMO sera.We measured the levels of 27 cytokines/chemokines in human brain microvascular endothelial cells (BMECs) after exposure to sera obtained from patients with the acute and stable phases of anti-AQP4 antibody-positive NMO spectrum disorder (NMOSD), multiple sclerosis (MS) patients and healthy controls (HC) using a multiplexed fluorescent bead-based immunoassay system.The induced protein (IP)-10 level in the cells was markedly increased following exposure to acute phase NMOSD sera. Other cytokines/chemokines including interleukin (IL)-6 and monocyte chemotactic protein (MCP)-1 were also significantly increased in the acute NMOSD group compared to both the MS and HC groups. 
The up-regulation of the IP-10 levels in the cells after exposure to the acute-phase NMOSD sera was also observed using another specified ELISA, and this effect was significantly decreased during the remission phase in the individual NMOSD patients. Furthermore, the increase in the level of IP-10 after exposure to the sera was significantly correlated with the cerebrospinal fluid/serum albumin ratio.Sera from the acute phase of NMO markedly increased the autocrine secretion of IP-10 by BMECs. The over-production of IP-10 in BMECs may play an important role in the pathogenesis of NMO and may therefore help to mediate the trafficking of T cells expressing its receptor across the BBB. 9 | Despite recent advances in delineating the pathogenic mechanisms of autoimmune disease, the puzzle that reveals the true picture of these diverse immunological disorders is yet to be solved. We know that the human leukocyte antigen (HLA) loci as well as many different genetic susceptibility loci with relatively small effect sizes predispose to various autoimmune diseases and that environmental factors are involved in triggering disease. Models for mechanisms of disease become increasingly complex as relationships between components of both the adaptive and innate immune systems are untangled at the molecular level. In this article, we pose some of the important questions about autoimmunity where the answers will advance our understanding of disease pathogenesis and improve the rational design of novel therapies. How is autoimmunity triggered, and what components of the immune response drive the clinical manifestations of disease? What determines whether a genetically predisposed individual will develop an autoimmune disease? Is restoring immune tolerance the secret to finding cures for autoimmune disease? Current research efforts seek answers to these big questions. 10 | Anti-TNF drugs have represented an epochal revolution in the treatment of rheumatoid arthritis and spondyloarthritis. In the field of axial spondyloarthritis, golimumab, a fully human monoclonal anti-TNFα administered subcutaneously every 4 weeks, has shown significant efficacy and good safety in patients with ankylosing spondylitis. More recently, it was also indicated as an effective treatment for patients suffering from non-radiographic axial spondyloarthitits. Areas covered: A systematic literature search was completed, using the largest electronic databases (Medline, Embase and Cochrane), with the aim to review all data concerning the administration of golimumab in patients suffering from axial spondyloartritis. Expert opinion: In the 16-week GO-AHEAD study, golimumab was effective in patients with non-radiographic spondyloarthritis with high levels of CRP and/or positive MRI findings, but not in subjects with both negative CRP and MRI. This finding allows for the addressing the of anti-TNF treatment more specifically. Preliminary data concerning an open-label extension of the GO-AHEAD study outlined the high retention-rate of the drug at 52 weeks. The production of antibodies against golimumab is rare and it seems to exert scarce influence on the drug performances. In conclusion, golimumab appears as a very useful and well tolerated anti-TNF agent. 
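The abstracts above are rich in parenthesised abbreviations ("Alzheimer's disease (AD)", "aquaporin-4 (AQP4)", ...), so they double as a convenient input for the Schwartz & Hearst implementation in `modules/AbbreviationFinder.py`. A minimal sketch, assuming the script runs from the repository root under Python 2 (the parser relies on the `unicode` type):

```python
# -*- coding: UTF-8 -*-
# Hedged sketch: extract short -> long abbreviation pairs from each abstract.
from modules.AbbreviationFinder import AbbreviationsParser

parser = AbbreviationsParser(verbose=False)

with open('tests/resources/test_abstract_lexebi.txt') as handle:
    for line in handle:
        pairs = parser.digest_as_dict(line.decode('utf-8'))
        # e.g. {'AD': "Alzheimer's disease", 'NFT': 'neurofibrillary tangle'} (illustrative)
        for short_form, long_form in pairs.items():
            print('%s -> %s' % (short_form, long_form))
```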
-------------------------------------------------------------------------------- /tests/test_tagger.py: -------------------------------------------------------------------------------- 1 | import unittest, json 2 | 3 | from modules.BioentityTagger import BioEntityTagger 4 | 5 | 6 | class TaggerTestCase(unittest.TestCase): 7 | 8 | def setUp(self): 9 | self.tagger = BioEntityTagger() 10 | 11 | def testTaggerNLP(self): 12 | 13 | for i, text in enumerate(file('resources/test_abstract_nlp.txt')): 14 | print i 15 | for tag in self.tagger.tag(text.lower()): 16 | print tag, text[tag['start']:tag['end']] 17 | 18 | def testTaggerLexebi(self): 19 | for i, text in enumerate(file('resources/test_abstract_lexebi.txt')): 20 | 21 | print i 22 | # for tag in tagger.tag(text.lower()): 23 | # print tag, text[tag['start']:tag['end']] 24 | old_tags = set() 25 | lexebi_tags = set() 26 | tags = self.tagger.tag(text.lower()) 27 | for tag in tags: 28 | matched_text = text[tag['start']:tag['end']] 29 | print tag, matched_text 30 | if tag['reference_db'] == 'LEXEBI': 31 | lexebi_tags.add(matched_text) 32 | else: 33 | old_tags.add(matched_text) 34 | new_tags = lexebi_tags.difference(old_tags) 35 | print 'New tags identified : {}'.format(new_tags) 36 | 37 | 38 | if __name__ == "__main__": 39 | unittest.main() -------------------------------------------------------------------------------- /tests/text_medline_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python 2 | # -*- coding: UTF-8 -*- 3 | import os 4 | import unittest 5 | from lxml import etree 6 | from tqdm import tqdm 7 | 8 | from main import parse_medline_xml 9 | from modules.BioentityTagger import BioEntityTagger 10 | from modules.NLP import init_spacy_english_language, SentenceAnalysisSpacy, DocumentAnalysisSpacy 11 | 12 | class MedlineParser(unittest.TestCase): 13 | 14 | def testParsing(self): 15 | file_name = 'resources/cancer_small.xml' 16 | tree = etree.parse(file_name) 17 | out = open('resources/pubmed_result.abstract.txt','w') 18 | for element in tqdm(tree.iter('MedlineCitation')): 19 | parsed = next(parse_medline_xml(etree.tostring(element),file_name)) 20 | if parsed['abstract']: 21 | try: 22 | out.write(parsed['abstract'].encode('utf-8').replace('\n','')+'\n') 23 | except Exception as e: 24 | print 'could not parse', e 25 | 26 | 27 | 28 | if __name__ == '__main__': 29 | unittest.main() 30 | -------------------------------------------------------------------------------- /venv_elasticsearch.txt: -------------------------------------------------------------------------------- 1 | cachetools==3.1.1 2 | certifi==2019.9.11 3 | chardet==3.0.4 4 | elasticsearch==5.5.1 5 | futures==3.3.0 6 | google-api-core==1.14.3 7 | google-auth==1.7.1 8 | google-cloud-core==1.0.3 9 | google-cloud-storage==1.23.0 10 | google-resumable-media==0.5.0 11 | googleapis-common-protos==1.6.0 12 | idna==2.8 13 | pipdeptree==0.13.2 14 | pkg-resources==0.0.0 15 | protobuf==3.11.0 16 | pyasn1==0.4.8 17 | pyasn1-modules==0.2.7 18 | pytz==2019.3 19 | requests==2.22.0 20 | rsa==4.0 21 | six==1.13.0 22 | tqdm==4.39.0 23 | urllib3==1.21.1 24 | --------------------------------------------------------------------------------
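For context on the `restore_index_settings` block from `load2es.py` near the top of this dump: a common bulk-loading pattern is to relax those same index settings before indexing and restore them afterwards. A minimal sketch with the pinned `elasticsearch` client; the host and index name are placeholders taken from `publication_alias.sh`, and the pre-load values (`refresh_interval: -1`, zero replicas) are a generic convention rather than something read from `load2es.py`.

```python
# Hedged sketch: bracket a bulk load with relaxed / restored index settings.
from elasticsearch import Elasticsearch

es = Elasticsearch('http://esurl:9200')  # placeholder host, as in publication_alias.sh
index_name = 'pubmed-18'                 # example index name from publication_alias.sh

# Relax settings while bulk indexing (assumed convention, not from the repo).
es.indices.put_settings(index=index_name,
                        body={'index': {'refresh_interval': '-1',
                                        'number_of_replicas': 0}})

# ... bulk indexing would happen here (see load2es.py) ...

# Restore the settings, mirroring restore_index_settings in load2es.py.
es.indices.put_settings(index=index_name,
                        body={'index': {'refresh_interval': '1s',
                                        'number_of_replicas': 1,
                                        'translog.durability': 'request'}})
```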