├── .gitignore ├── LICENSE ├── README.md ├── clean_source_bucket.py ├── es-mapping-index ├── README.md ├── concept.json ├── publication.json └── settings.json ├── es-mapping ├── README.md ├── concept.json └── publication.json ├── gcp-local-ssd ├── readme.md ├── run.sh └── startup.sh ├── gcp-persistent-disk ├── README.md ├── exec.sh ├── run.sh ├── startup.sh ├── steps.sh └── tmux_example │ ├── README.md │ ├── bioentity_tmux.sh │ ├── bioentity_tmux_kill.sh │ ├── concept_tmux.sh │ ├── concept_tmux_kill.sh │ ├── es_bio.sh │ ├── es_concept.sh │ ├── es_pub.sh │ ├── es_tag.sh │ ├── publication_tmux.sh │ ├── publication_tmux_kill.sh │ ├── taggedtext_tmux.sh │ └── taggedtext_tmux_kill.sh ├── load2es.py ├── main.py ├── modules ├── AbbreviationFinder.py ├── BioStopWords.py ├── BioentityTagger.py ├── NLP.py ├── __init__.py └── vocabulary.py ├── publication_alias.sh ├── setup.py ├── tests ├── __init__.py ├── resources │ ├── common_words_as_genes.txt │ ├── test-medlinexml │ │ ├── test_baseline.xml.gz │ │ └── test_update.xml.gz │ ├── test-spacy │ │ ├── disease.xml │ │ └── geneProt.xml │ ├── test_abstract_lexebi.txt │ └── test_abstract_nlp.txt ├── test_tagger.py ├── text_medline_parser.py └── text_nlp.py └── venv_elasticsearch.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2017 Biogen, GlaxoSmithKline, EMBL - European Bioinformatics Institute, Wellcome Trust Sanger Institute 2 | 3 | This software was developed as part of the Open Targets project. 
For more information please see: 4 | 5 | http://www.opentargets.org 6 | Target Validation platform 7 | 8 | Licensed under the Apache License, Version 2.0 (the "License"); 9 | you may not use this file except in compliance with the License. 10 | You may obtain a copy of the License at 11 | 12 | http://www.apache.org/licenses/LICENSE-2.0 13 | 14 | Unless required by applicable law or agreed to in writing, software 15 | distributed under the License is distributed on an "AS IS" BASIS, 16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | 18 | See the License for the specific language governing permissions and 19 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Note: This repo has been archived because LINK (Library) has been decommissioned. 2 | 3 | # Open Targets Library - NLP Pipeline 4 | 5 | ## NLP Analysis of MedLine/PubMed Running in Apache Beam 6 | 7 | This pipeline is designed to run with Apache Beam using the Dataflow runner. 8 | It has not been tested with other Beam backends, but it should work on them with minimal modifications. 9 | Please see the [Apache Beam SDK](https://beam.apache.org/documentation/sdks/python/) for more info. 10 | 11 | ## Steps to reproduce a full run 12 | Use Python 2 with pip and virtualenv. 13 | 14 | * Generate a mirror of the MEDLINE FTP site in a Google Storage bucket (any other storage provider supported by the Python Beam SDK should work), e.g. using [rclone](https://rclone.org/) 15 | 16 | - Download [pre-built rclone binaries](https://rclone.org/install/#linux-installation-from-precompiled-binary) rather than platform-packaged ones, as they tend to be more up to date 17 | - Configure rclone with the MEDLINE FTP server [ftp.ncbi.nlm.nih.gov](ftp://ftp.ncbi.nlm.nih.gov) and your target GCP project 18 | (my-gcp-project-buckets) via `rclone config`. The MEDLINE remote must use username `anonymous` and password `anonymous`. 19 | - Generate a full mirror: 20 | `rclone sync -v medline-ftp:pubmed/baseline my-gcp-project-buckets:my-medline-bucket/baseline` 21 | - Update new files: 22 | `rclone sync -v medline-ftp:pubmed/updatefiles my-gcp-project-buckets:my-medline-bucket/updatefiles` 23 | - Note: you can use the `--dry-run` argument to test 24 | * Install tooling 25 | ```sh 26 | sudo apt-get install python-dev virtualenv build-essential git libxml2-dev libxslt-dev zlib1g-dev tmux 27 | ``` 28 | * Download the pipeline 29 | ```sh 30 | git clone https://github.com/opentargets/library-beam 31 | cd library-beam 32 | ``` 33 | * Create a virtual environment to manage dependencies in 34 | ```sh 35 | virtualenv venv --python=python2 36 | source venv/bin/activate 37 | ``` 38 | * Install the pipeline into the virtual environment 39 | ```sh 40 | python setup.py install 41 | # note: this needs between 3.75 GB and 7.5 GB of RAM 42 | pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.0/en_core_web_lg-2.2.0.tar.gz 43 | ``` 44 | * Grant the permission to the compute user:
45 | ``` 46 | numberHidden -compute@developer.gserviceaccount.com Cloud Build Service Agent 47 | ``` 48 | * Update the vocabulary settings in modules/vocabulary.py 49 | 50 | * Run the pipeline 51 | ```sh 52 | python -m main \ 53 | --project open-targets-library \ 54 | --job_name medline201911 \ 55 | --runner DataflowRunner \ 56 | --temp_location gs://medline_2019_11/temp \ 57 | --setup_file ./setup.py \ 58 | --worker_machine_type n1-highmem-32 \ 59 | --input_baseline gs://medline_2019_11/baseline/pubmed19n*.xml.gz \ 60 | --input_updates gs://medline_2019_11/updatefiles/pubmed19n*.xml.gz \ 61 | --output_enriched gs://medline_2019_11/analyzed/pubmed19 \ 62 | --output_splitted gs://medline_2019_11/splitted/pubmed19 \ 63 | --max_num_workers 32 \ 64 | --region europe-west1 \ 65 | --zone europe-west1-d 66 | ``` 67 | 68 | This can be monitored via [Google Dataflow](https://console.cloud.google.com/dataflow). Note that the "wall time" displayed is not the [usual definition](https://en.wikipedia.org/wiki/Elapsed_real_time) but is accumulated per thread and worker. 69 | 70 | In total it takes approximately 4 hours. 71 | 72 | ![image](https://user-images.githubusercontent.com/148221/35000427-4e11b818-fadc-11e7-9c2f-08a68eaed37e.png) 73 | 74 | ![image](https://user-images.githubusercontent.com/148221/35000458-6108bb24-fadc-11e7-8a84-452f7b3816f6.png) 75 | 76 | ## Steps to load the JSON dumps into Elasticsearch 77 | 78 | The directory gcp contains the infrastructure scripts to generate the Elasticsearch cluster. 79 | 80 | * Create a virtual environment to manage dependencies in 81 | ```sh 82 | virtualenv venv_elasticsearch --python=python2 83 | source venv_elasticsearch/bin/activate 84 | pip install -r venv_elasticsearch.txt 85 | ``` 86 | * Run the job to load the JSONs into Elasticsearch 87 | 88 | WARNING: the loading scripts currently take a long time, particularly the concept one (24h+). It is good to use `screen`, `tmux`, or similar, so the job keeps running after a disconnect and can be recovered. 89 | 90 | ```sh 91 | python load2es.py publication bioentity taggedtext concept --es http://es:9200 92 | ``` 93 | 94 | Note: Elasticsearch must have the International Components for Unicode support plugin installed, i.e. `/usr/share/elasticsearch/bin/elasticsearch-plugin -s install analysis-icu` 95 | 96 | * Increase the Elasticsearch capacity for the adjacency matrix aggregation (used by the LINK tool) 97 | ```sh 98 | curl -XPUT 'http://myesnode1:9200/pubmed-18-concept/_settings' -H 'Content-Type: application/json' -d' 99 | { 100 | "index" : { 101 | "max_adjacency_matrix_filters" : 500 102 | } 103 | }' 104 | ``` 105 | 106 | ## Google Cloud Platform 107 | 108 | When controlling this process from a Google Cloud machine, make sure it has sufficient scopes enabled.
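As a minimal sketch (not part of this repo) of what checking and granting scopes can look like with the `gcloud` CLI, assuming a hypothetical controller VM named `my-controller` in the same zone used above:

```sh
# Hypothetical example: inspect the scopes currently attached to the VM's service account
gcloud compute instances describe my-controller --zone europe-west1-d --format='yaml(serviceAccounts)'

# Hypothetical example: create a controller VM with the broad cloud-platform scope
gcloud compute instances create my-controller --zone europe-west1-d --scopes=cloud-platform
```

Scopes can only be set at instance creation time or while the instance is stopped, so it is easiest to pick a sufficiently broad scope (e.g. `cloud-platform`) when the controller VM is created.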
109 | -------------------------------------------------------------------------------- /clean_source_bucket.py: -------------------------------------------------------------------------------- 1 | 2 | basename = 'pubmed18' 3 | 4 | def delete_all_output(): 5 | from google.cloud import storage 6 | 7 | client = storage.Client(project='open-targets') 8 | bucket = client.get_bucket('medline-json') 9 | names = list(bucket.list_blobs()) 10 | for i, blob_ref in enumerate(names): 11 | # print blob_ref.name 12 | if blob_ref.name.endswith('.json.gz') and \ 13 | (blob_ref.name.startswith('parsed/'+basename) or 14 | blob_ref.name.startswith('analyzed/'+basename) or 15 | blob_ref.name.startswith('splitted/'+basename) or 16 | blob_ref.name.startswith('test/analyzed/'+basename) or 17 | blob_ref.name.startswith('test/splitted/'+basename)or 18 | blob_ref.name.startswith('test/parsed/'+basename)) : 19 | blob = bucket.get_blob(blob_ref.name) 20 | blob.delete() 21 | print 'deleted', i, blob_ref.name, 'of', len(names) 22 | 23 | if __name__ == '__main__': 24 | delete_all_output() -------------------------------------------------------------------------------- /es-mapping-index/README.md: -------------------------------------------------------------------------------- 1 | This directory contains the settings for the new infrastructure used by gcp/#todo 2 | -------------------------------------------------------------------------------- /es-mapping-index/concept.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 24, 4 | "number_of_replicas": 0, 5 | "refresh_interval": "-1", 6 | "translog.flush_threshold_size": "1000mb", 7 | "analysis": { 8 | "filter": { 9 | "english_stop": { 10 | "type": "stop", 11 | "stopwords": [ 12 | "'ll", 13 | "'ve", 14 | "0", 15 | "1", 16 | "10", 17 | "100", 18 | "11", 19 | "12", 20 | "13", 21 | "14", 22 | "15", 23 | "16", 24 | "17", 25 | "18", 26 | "19", 27 | "2", 28 | "20", 29 | "21", 30 | "22", 31 | "23", 32 | "24", 33 | "25", 34 | "26", 35 | "27", 36 | "28", 37 | "29", 38 | "3", 39 | "30", 40 | "31", 41 | "32", 42 | "33", 43 | "34", 44 | "35", 45 | "36", 46 | "37", 47 | "38", 48 | "39", 49 | "4", 50 | "40", 51 | "41", 52 | "42", 53 | "43", 54 | "44", 55 | "45", 56 | "46", 57 | "47", 58 | "48", 59 | "49", 60 | "5", 61 | "50", 62 | "51", 63 | "52", 64 | "53", 65 | "54", 66 | "55", 67 | "56", 68 | "57", 69 | "58", 70 | "59", 71 | "6", 72 | "60", 73 | "61", 74 | "62", 75 | "63", 76 | "64", 77 | "65", 78 | "66", 79 | "67", 80 | "68", 81 | "69", 82 | "7", 83 | "70", 84 | "71", 85 | "72", 86 | "73", 87 | "74", 88 | "75", 89 | "76", 90 | "77", 91 | "78", 92 | "79", 93 | "8", 94 | "80", 95 | "81", 96 | "82", 97 | "83", 98 | "84", 99 | "85", 100 | "86", 101 | "87", 102 | "88", 103 | "89", 104 | "9", 105 | "90", 106 | "91", 107 | "92", 108 | "93", 109 | "94", 110 | "95", 111 | "96", 112 | "97", 113 | "98", 114 | "99", 115 | "a", 116 | "able", 117 | "about", 118 | "above", 119 | "abst", 120 | "accordance", 121 | "according", 122 | "accordingly", 123 | "across", 124 | "act", 125 | "actually", 126 | "added", 127 | "adj", 128 | "affected", 129 | "affecting", 130 | "affects", 131 | "after", 132 | "afterwards", 133 | "again", 134 | "against", 135 | "ah", 136 | "all", 137 | "almost", 138 | "alone", 139 | "along", 140 | "already", 141 | "also", 142 | "although", 143 | "always", 144 | "am", 145 | "among", 146 | "amongst", 147 | "an", 148 | "and", 149 | "announce", 150 | "another", 151 | "any", 152 | "anybody", 153 | 
"anyhow", 154 | "anymore", 155 | "anyone", 156 | "anything", 157 | "anyway", 158 | "anyways", 159 | "anywhere", 160 | "apparently", 161 | "approximately", 162 | "are", 163 | "aren", 164 | "arent", 165 | "arise", 166 | "around", 167 | "as", 168 | "aside", 169 | "ask", 170 | "asking", 171 | "at", 172 | "auth", 173 | "available", 174 | "away", 175 | "awfully", 176 | "b", 177 | "back", 178 | "be", 179 | "became", 180 | "because", 181 | "become", 182 | "becomes", 183 | "becoming", 184 | "been", 185 | "before", 186 | "beforehand", 187 | "begin", 188 | "beginning", 189 | "beginnings", 190 | "begins", 191 | "behind", 192 | "being", 193 | "believe", 194 | "below", 195 | "beside", 196 | "besides", 197 | "between", 198 | "beyond", 199 | "biol", 200 | "both", 201 | "brief", 202 | "briefly", 203 | "but", 204 | "by", 205 | "c", 206 | "ca", 207 | "came", 208 | "can", 209 | "can't", 210 | "cannot", 211 | "cause", 212 | "causes", 213 | "certain", 214 | "certainly", 215 | "co", 216 | "com", 217 | "come", 218 | "comes", 219 | "contain", 220 | "containing", 221 | "contains", 222 | "could", 223 | "couldnt", 224 | "d", 225 | "date", 226 | "did", 227 | "didn't", 228 | "different", 229 | "do", 230 | "does", 231 | "doesn't", 232 | "doing", 233 | "don't", 234 | "done", 235 | "down", 236 | "downwards", 237 | "due", 238 | "during", 239 | "e", 240 | "each", 241 | "ed", 242 | "edu", 243 | "effect", 244 | "eg", 245 | "eight", 246 | "eighty", 247 | "either", 248 | "else", 249 | "elsewhere", 250 | "end", 251 | "ending", 252 | "enough", 253 | "especially", 254 | "et", 255 | "et-al", 256 | "etc", 257 | "even", 258 | "ever", 259 | "every", 260 | "everybody", 261 | "everyone", 262 | "everything", 263 | "everywhere", 264 | "ex", 265 | "except", 266 | "f", 267 | "far", 268 | "few", 269 | "ff", 270 | "fifth", 271 | "first", 272 | "five", 273 | "fix", 274 | "followed", 275 | "following", 276 | "follows", 277 | "for", 278 | "former", 279 | "formerly", 280 | "forth", 281 | "found", 282 | "four", 283 | "from", 284 | "further", 285 | "furthermore", 286 | "g", 287 | "gave", 288 | "get", 289 | "gets", 290 | "getting", 291 | "give", 292 | "given", 293 | "gives", 294 | "giving", 295 | "go", 296 | "goes", 297 | "gone", 298 | "got", 299 | "gotten", 300 | "h", 301 | "had", 302 | "happens", 303 | "hardly", 304 | "has", 305 | "hasn't", 306 | "have", 307 | "haven't", 308 | "having", 309 | "he", 310 | "hed", 311 | "hence", 312 | "her", 313 | "here", 314 | "hereafter", 315 | "hereby", 316 | "herein", 317 | "heres", 318 | "hereupon", 319 | "hers", 320 | "herself", 321 | "hes", 322 | "hi", 323 | "hid", 324 | "him", 325 | "himself", 326 | "his", 327 | "hither", 328 | "home", 329 | "how", 330 | "howbeit", 331 | "however", 332 | "hundred", 333 | "i", 334 | "i'll", 335 | "i've", 336 | "id", 337 | "ie", 338 | "if", 339 | "im", 340 | "immediate", 341 | "immediately", 342 | "importance", 343 | "important", 344 | "in", 345 | "inc", 346 | "indeed", 347 | "index", 348 | "information", 349 | "instead", 350 | "into", 351 | "invention", 352 | "inward", 353 | "is", 354 | "isn't", 355 | "it", 356 | "it'll", 357 | "itd", 358 | "its", 359 | "itself", 360 | "j", 361 | "just", 362 | "k", 363 | "keep", 364 | "keeps", 365 | "kept", 366 | "kg", 367 | "km", 368 | "know", 369 | "known", 370 | "knows", 371 | "l", 372 | "largely", 373 | "last", 374 | "lately", 375 | "later", 376 | "latter", 377 | "latterly", 378 | "least", 379 | "less", 380 | "lest", 381 | "let", 382 | "lets", 383 | "like", 384 | "liked", 385 | "likely", 386 | "line", 387 | "little", 388 | "look", 389 | 
"looking", 390 | "looks", 391 | "ltd", 392 | "m", 393 | "made", 394 | "mainly", 395 | "make", 396 | "makes", 397 | "many", 398 | "may", 399 | "maybe", 400 | "me", 401 | "mean", 402 | "means", 403 | "meantime", 404 | "meanwhile", 405 | "merely", 406 | "mg", 407 | "might", 408 | "million", 409 | "miss", 410 | "ml", 411 | "more", 412 | "moreover", 413 | "most", 414 | "mostly", 415 | "mr", 416 | "mrs", 417 | "much", 418 | "mug", 419 | "must", 420 | "my", 421 | "myself", 422 | "n", 423 | "na", 424 | "name", 425 | "namely", 426 | "nay", 427 | "nd", 428 | "near", 429 | "nearly", 430 | "necessarily", 431 | "necessary", 432 | "need", 433 | "needs", 434 | "neither", 435 | "never", 436 | "nevertheless", 437 | "new", 438 | "next", 439 | "nine", 440 | "ninety", 441 | "no", 442 | "nobody", 443 | "non", 444 | "none", 445 | "nonetheless", 446 | "noone", 447 | "nor", 448 | "normally", 449 | "nos", 450 | "not", 451 | "noted", 452 | "nothing", 453 | "now", 454 | "nowhere", 455 | "o", 456 | "obtain", 457 | "obtained", 458 | "obviously", 459 | "of", 460 | "off", 461 | "often", 462 | "oh", 463 | "ok", 464 | "okay", 465 | "old", 466 | "omitted", 467 | "on", 468 | "once", 469 | "one", 470 | "ones", 471 | "only", 472 | "onto", 473 | "or", 474 | "ord", 475 | "other", 476 | "others", 477 | "otherwise", 478 | "ought", 479 | "our", 480 | "ours", 481 | "ourselves", 482 | "out", 483 | "outside", 484 | "over", 485 | "overall", 486 | "owing", 487 | "own", 488 | "p", 489 | "page", 490 | "pages", 491 | "part", 492 | "particular", 493 | "particularly", 494 | "past", 495 | "per", 496 | "perhaps", 497 | "placed", 498 | "please", 499 | "plus", 500 | "poorly", 501 | "possible", 502 | "possibly", 503 | "potentially", 504 | "pp", 505 | "predominantly", 506 | "present", 507 | "previously", 508 | "primarily", 509 | "probably", 510 | "promptly", 511 | "proud", 512 | "provides", 513 | "put", 514 | "q", 515 | "que", 516 | "quickly", 517 | "quite", 518 | "qv", 519 | "r", 520 | "ran", 521 | "rather", 522 | "rd", 523 | "re", 524 | "readily", 525 | "really", 526 | "recent", 527 | "recently", 528 | "ref", 529 | "refs", 530 | "regarding", 531 | "regardless", 532 | "regards", 533 | "related", 534 | "relatively", 535 | "research", 536 | "respectively", 537 | "resulted", 538 | "resulting", 539 | "results", 540 | "right", 541 | "run", 542 | "s", 543 | "said", 544 | "same", 545 | "saw", 546 | "say", 547 | "saying", 548 | "says", 549 | "sec", 550 | "section", 551 | "see", 552 | "seeing", 553 | "seem", 554 | "seemed", 555 | "seeming", 556 | "seems", 557 | "seen", 558 | "self", 559 | "selves", 560 | "sent", 561 | "seven", 562 | "several", 563 | "shall", 564 | "she", 565 | "she'll", 566 | "shed", 567 | "shes", 568 | "should", 569 | "shouldn't", 570 | "show", 571 | "showed", 572 | "shown", 573 | "showns", 574 | "shows", 575 | "significant", 576 | "significantly", 577 | "similar", 578 | "similarly", 579 | "since", 580 | "six", 581 | "slightly", 582 | "so", 583 | "some", 584 | "somebody", 585 | "somehow", 586 | "someone", 587 | "somethan", 588 | "something", 589 | "sometime", 590 | "sometimes", 591 | "somewhat", 592 | "somewhere", 593 | "soon", 594 | "sorry", 595 | "specifically", 596 | "specified", 597 | "specify", 598 | "specifying", 599 | "still", 600 | "stop", 601 | "strongly", 602 | "sub", 603 | "substantially", 604 | "successfully", 605 | "such", 606 | "sufficiently", 607 | "suggest", 608 | "sup", 609 | "sure", 610 | "t", 611 | "take", 612 | "taken", 613 | "taking", 614 | "tell", 615 | "tends", 616 | "th", 617 | "than", 618 | "thank", 619 | 
"thanks", 620 | "thanx", 621 | "that", 622 | "that'll", 623 | "that've", 624 | "thats", 625 | "the", 626 | "their", 627 | "theirs", 628 | "them", 629 | "themselves", 630 | "then", 631 | "thence", 632 | "there", 633 | "there'll", 634 | "there've", 635 | "thereafter", 636 | "thereby", 637 | "thered", 638 | "therefore", 639 | "therein", 640 | "thereof", 641 | "therere", 642 | "theres", 643 | "thereto", 644 | "thereupon", 645 | "these", 646 | "they", 647 | "they'll", 648 | "they've", 649 | "theyd", 650 | "theyre", 651 | "think", 652 | "this", 653 | "those", 654 | "thou", 655 | "though", 656 | "thoughh", 657 | "thousand", 658 | "throug", 659 | "through", 660 | "throughout", 661 | "thru", 662 | "thus", 663 | "til", 664 | "tip", 665 | "to", 666 | "together", 667 | "too", 668 | "took", 669 | "toward", 670 | "towards", 671 | "tried", 672 | "tries", 673 | "truly", 674 | "try", 675 | "trying", 676 | "ts", 677 | "twice", 678 | "two", 679 | "u", 680 | "un", 681 | "under", 682 | "unfortunately", 683 | "unless", 684 | "unlike", 685 | "unlikely", 686 | "until", 687 | "unto", 688 | "up", 689 | "upon", 690 | "ups", 691 | "us", 692 | "use", 693 | "used", 694 | "useful", 695 | "usefully", 696 | "usefulness", 697 | "uses", 698 | "using", 699 | "usually", 700 | "v", 701 | "value", 702 | "various", 703 | "very", 704 | "via", 705 | "viz", 706 | "vol", 707 | "vols", 708 | "vs", 709 | "w", 710 | "want", 711 | "wants", 712 | "was", 713 | "wasnt", 714 | "way", 715 | "we", 716 | "we'll", 717 | "we've", 718 | "wed", 719 | "welcome", 720 | "went", 721 | "were", 722 | "werent", 723 | "what", 724 | "what'll", 725 | "whatever", 726 | "whats", 727 | "when", 728 | "whence", 729 | "whenever", 730 | "where", 731 | "whereafter", 732 | "whereas", 733 | "whereby", 734 | "wherein", 735 | "wheres", 736 | "whereupon", 737 | "wherever", 738 | "whether", 739 | "which", 740 | "while", 741 | "whim", 742 | "whither", 743 | "who", 744 | "who'll", 745 | "whod", 746 | "whoever", 747 | "whole", 748 | "whom", 749 | "whomever", 750 | "whos", 751 | "whose", 752 | "why", 753 | "widely", 754 | "willing", 755 | "wish", 756 | "with", 757 | "within", 758 | "without", 759 | "wont", 760 | "words", 761 | "world", 762 | "would", 763 | "wouldnt", 764 | "www", 765 | "x", 766 | "y", 767 | "yes", 768 | "yet", 769 | "you", 770 | "you'll", 771 | "you've", 772 | "youd", 773 | "your", 774 | "youre", 775 | "yours", 776 | "yourself", 777 | "yourselves", 778 | "z", 779 | "zero" 780 | ] 781 | }, 782 | "english_stemmer": { 783 | "type": "stemmer", 784 | "language": "english" 785 | }, 786 | "english_light_stemmer": { 787 | "type": "stemmer", 788 | "language": "light_english" 789 | }, 790 | "english_minimal_stemmer": { 791 | "type": "stemmer", 792 | "language": "minimal_english" 793 | }, 794 | "english_possessive_stemmer": { 795 | "type": "stemmer", 796 | "language": "possessive_english" 797 | } 798 | }, 799 | "analyzer": { 800 | "english": { 801 | "tokenizer": "classic", 802 | "filter": [ 803 | "icu_normalizer", 804 | "english_possessive_stemmer", 805 | "english_stop", 806 | "english_minimal_stemmer" 807 | ] 808 | } 809 | } 810 | } 811 | }, 812 | "mappings": { 813 | "_default_": { 814 | "_all": { 815 | "enabled": true 816 | }, 817 | "dynamic_templates": [ 818 | { 819 | "string_fields": { 820 | "mapping": { 821 | "type": "keyword", 822 | "ignore_above": 256 823 | }, 824 | "match": "*", 825 | "match_mapping_type": "string" 826 | } 827 | } 828 | ] 829 | }, 830 | "concept": { 831 | "_all": { 832 | "enabled": false 833 | }, 834 | "dynamic_templates": [ 835 | { 836 | 
"string_fields": { 837 | "mapping": { 838 | "type": "keyword", 839 | "ignore_above": 256 840 | }, 841 | "match": "*", 842 | "match_mapping_type": "string" 843 | } 844 | } 845 | ], 846 | "properties": { 847 | "concept": { 848 | "type": "object", 849 | "properties": { 850 | "sentence_text": { 851 | "type": "text", 852 | "analyzer": "english", 853 | "eager_global_ordinals": true, 854 | "fielddata": true 855 | }, 856 | "verb_subtree": { 857 | "type": "text", 858 | "analyzer": "english", 859 | "eager_global_ordinals": true, 860 | "fielddata": true 861 | }, 862 | "relations": { 863 | "type": "object", 864 | "properties": { 865 | "directed": { 866 | "type": "text", 867 | "analyzer": "whitespace", 868 | "eager_global_ordinals": true, 869 | "fielddata": true 870 | }, 871 | "undirected": { 872 | "type": "text", 873 | "analyzer": "whitespace", 874 | "eager_global_ordinals": true, 875 | "fielddata": true 876 | } 877 | } 878 | } 879 | } 880 | } 881 | } 882 | } 883 | } 884 | } 885 | -------------------------------------------------------------------------------- /es-mapping-index/publication.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 24, 4 | "number_of_replicas": 0, 5 | "refresh_interval": "-1", 6 | "translog.flush_threshold_size": "1000mb", 7 | "analysis": { 8 | "filter": { 9 | "english_stop": { 10 | "type": "stop", 11 | "stopwords": [ 12 | "'ll", 13 | "'ve", 14 | "0", 15 | "1", 16 | "10", 17 | "100", 18 | "11", 19 | "12", 20 | "13", 21 | "14", 22 | "15", 23 | "16", 24 | "17", 25 | "18", 26 | "19", 27 | "2", 28 | "20", 29 | "21", 30 | "22", 31 | "23", 32 | "24", 33 | "25", 34 | "26", 35 | "27", 36 | "28", 37 | "29", 38 | "3", 39 | "30", 40 | "31", 41 | "32", 42 | "33", 43 | "34", 44 | "35", 45 | "36", 46 | "37", 47 | "38", 48 | "39", 49 | "4", 50 | "40", 51 | "41", 52 | "42", 53 | "43", 54 | "44", 55 | "45", 56 | "46", 57 | "47", 58 | "48", 59 | "49", 60 | "5", 61 | "50", 62 | "51", 63 | "52", 64 | "53", 65 | "54", 66 | "55", 67 | "56", 68 | "57", 69 | "58", 70 | "59", 71 | "6", 72 | "60", 73 | "61", 74 | "62", 75 | "63", 76 | "64", 77 | "65", 78 | "66", 79 | "67", 80 | "68", 81 | "69", 82 | "7", 83 | "70", 84 | "71", 85 | "72", 86 | "73", 87 | "74", 88 | "75", 89 | "76", 90 | "77", 91 | "78", 92 | "79", 93 | "8", 94 | "80", 95 | "81", 96 | "82", 97 | "83", 98 | "84", 99 | "85", 100 | "86", 101 | "87", 102 | "88", 103 | "89", 104 | "9", 105 | "90", 106 | "91", 107 | "92", 108 | "93", 109 | "94", 110 | "95", 111 | "96", 112 | "97", 113 | "98", 114 | "99", 115 | "a", 116 | "able", 117 | "about", 118 | "above", 119 | "abst", 120 | "accordance", 121 | "according", 122 | "accordingly", 123 | "across", 124 | "act", 125 | "actually", 126 | "added", 127 | "adj", 128 | "affected", 129 | "affecting", 130 | "affects", 131 | "after", 132 | "afterwards", 133 | "again", 134 | "against", 135 | "ah", 136 | "all", 137 | "almost", 138 | "alone", 139 | "along", 140 | "already", 141 | "also", 142 | "although", 143 | "always", 144 | "am", 145 | "among", 146 | "amongst", 147 | "an", 148 | "and", 149 | "announce", 150 | "another", 151 | "any", 152 | "anybody", 153 | "anyhow", 154 | "anymore", 155 | "anyone", 156 | "anything", 157 | "anyway", 158 | "anyways", 159 | "anywhere", 160 | "apparently", 161 | "approximately", 162 | "are", 163 | "aren", 164 | "arent", 165 | "arise", 166 | "around", 167 | "as", 168 | "aside", 169 | "ask", 170 | "asking", 171 | "at", 172 | "auth", 173 | "available", 174 | "away", 175 | "awfully", 176 | "b", 177 
| "back", 178 | "be", 179 | "became", 180 | "because", 181 | "become", 182 | "becomes", 183 | "becoming", 184 | "been", 185 | "before", 186 | "beforehand", 187 | "begin", 188 | "beginning", 189 | "beginnings", 190 | "begins", 191 | "behind", 192 | "being", 193 | "believe", 194 | "below", 195 | "beside", 196 | "besides", 197 | "between", 198 | "beyond", 199 | "biol", 200 | "both", 201 | "brief", 202 | "briefly", 203 | "but", 204 | "by", 205 | "c", 206 | "ca", 207 | "came", 208 | "can", 209 | "can't", 210 | "cannot", 211 | "cause", 212 | "causes", 213 | "certain", 214 | "certainly", 215 | "co", 216 | "com", 217 | "come", 218 | "comes", 219 | "contain", 220 | "containing", 221 | "contains", 222 | "could", 223 | "couldnt", 224 | "d", 225 | "date", 226 | "did", 227 | "didn't", 228 | "different", 229 | "do", 230 | "does", 231 | "doesn't", 232 | "doing", 233 | "don't", 234 | "done", 235 | "down", 236 | "downwards", 237 | "due", 238 | "during", 239 | "e", 240 | "each", 241 | "ed", 242 | "edu", 243 | "effect", 244 | "eg", 245 | "eight", 246 | "eighty", 247 | "either", 248 | "else", 249 | "elsewhere", 250 | "end", 251 | "ending", 252 | "enough", 253 | "especially", 254 | "et", 255 | "et-al", 256 | "etc", 257 | "even", 258 | "ever", 259 | "every", 260 | "everybody", 261 | "everyone", 262 | "everything", 263 | "everywhere", 264 | "ex", 265 | "except", 266 | "f", 267 | "far", 268 | "few", 269 | "ff", 270 | "fifth", 271 | "first", 272 | "five", 273 | "fix", 274 | "followed", 275 | "following", 276 | "follows", 277 | "for", 278 | "former", 279 | "formerly", 280 | "forth", 281 | "found", 282 | "four", 283 | "from", 284 | "further", 285 | "furthermore", 286 | "g", 287 | "gave", 288 | "get", 289 | "gets", 290 | "getting", 291 | "give", 292 | "given", 293 | "gives", 294 | "giving", 295 | "go", 296 | "goes", 297 | "gone", 298 | "got", 299 | "gotten", 300 | "h", 301 | "had", 302 | "happens", 303 | "hardly", 304 | "has", 305 | "hasn't", 306 | "have", 307 | "haven't", 308 | "having", 309 | "he", 310 | "hed", 311 | "hence", 312 | "her", 313 | "here", 314 | "hereafter", 315 | "hereby", 316 | "herein", 317 | "heres", 318 | "hereupon", 319 | "hers", 320 | "herself", 321 | "hes", 322 | "hi", 323 | "hid", 324 | "him", 325 | "himself", 326 | "his", 327 | "hither", 328 | "home", 329 | "how", 330 | "howbeit", 331 | "however", 332 | "hundred", 333 | "i", 334 | "i'll", 335 | "i've", 336 | "id", 337 | "ie", 338 | "if", 339 | "im", 340 | "immediate", 341 | "immediately", 342 | "importance", 343 | "important", 344 | "in", 345 | "inc", 346 | "indeed", 347 | "index", 348 | "information", 349 | "instead", 350 | "into", 351 | "invention", 352 | "inward", 353 | "is", 354 | "isn't", 355 | "it", 356 | "it'll", 357 | "itd", 358 | "its", 359 | "itself", 360 | "j", 361 | "just", 362 | "k", 363 | "keep", 364 | "keeps", 365 | "kept", 366 | "kg", 367 | "km", 368 | "know", 369 | "known", 370 | "knows", 371 | "l", 372 | "largely", 373 | "last", 374 | "lately", 375 | "later", 376 | "latter", 377 | "latterly", 378 | "least", 379 | "less", 380 | "lest", 381 | "let", 382 | "lets", 383 | "like", 384 | "liked", 385 | "likely", 386 | "line", 387 | "little", 388 | "look", 389 | "looking", 390 | "looks", 391 | "ltd", 392 | "m", 393 | "made", 394 | "mainly", 395 | "make", 396 | "makes", 397 | "many", 398 | "may", 399 | "maybe", 400 | "me", 401 | "mean", 402 | "means", 403 | "meantime", 404 | "meanwhile", 405 | "merely", 406 | "mg", 407 | "might", 408 | "million", 409 | "miss", 410 | "ml", 411 | "more", 412 | "moreover", 413 | "most", 414 | "mostly", 
415 | "mr", 416 | "mrs", 417 | "much", 418 | "mug", 419 | "must", 420 | "my", 421 | "myself", 422 | "n", 423 | "na", 424 | "name", 425 | "namely", 426 | "nay", 427 | "nd", 428 | "near", 429 | "nearly", 430 | "necessarily", 431 | "necessary", 432 | "need", 433 | "needs", 434 | "neither", 435 | "never", 436 | "nevertheless", 437 | "new", 438 | "next", 439 | "nine", 440 | "ninety", 441 | "no", 442 | "nobody", 443 | "non", 444 | "none", 445 | "nonetheless", 446 | "noone", 447 | "nor", 448 | "normally", 449 | "nos", 450 | "not", 451 | "noted", 452 | "nothing", 453 | "now", 454 | "nowhere", 455 | "o", 456 | "obtain", 457 | "obtained", 458 | "obviously", 459 | "of", 460 | "off", 461 | "often", 462 | "oh", 463 | "ok", 464 | "okay", 465 | "old", 466 | "omitted", 467 | "on", 468 | "once", 469 | "one", 470 | "ones", 471 | "only", 472 | "onto", 473 | "or", 474 | "ord", 475 | "other", 476 | "others", 477 | "otherwise", 478 | "ought", 479 | "our", 480 | "ours", 481 | "ourselves", 482 | "out", 483 | "outside", 484 | "over", 485 | "overall", 486 | "owing", 487 | "own", 488 | "p", 489 | "page", 490 | "pages", 491 | "part", 492 | "particular", 493 | "particularly", 494 | "past", 495 | "per", 496 | "perhaps", 497 | "placed", 498 | "please", 499 | "plus", 500 | "poorly", 501 | "possible", 502 | "possibly", 503 | "potentially", 504 | "pp", 505 | "predominantly", 506 | "present", 507 | "previously", 508 | "primarily", 509 | "probably", 510 | "promptly", 511 | "proud", 512 | "provides", 513 | "put", 514 | "q", 515 | "que", 516 | "quickly", 517 | "quite", 518 | "qv", 519 | "r", 520 | "ran", 521 | "rather", 522 | "rd", 523 | "re", 524 | "readily", 525 | "really", 526 | "recent", 527 | "recently", 528 | "ref", 529 | "refs", 530 | "regarding", 531 | "regardless", 532 | "regards", 533 | "related", 534 | "relatively", 535 | "research", 536 | "respectively", 537 | "resulted", 538 | "resulting", 539 | "results", 540 | "right", 541 | "run", 542 | "s", 543 | "said", 544 | "same", 545 | "saw", 546 | "say", 547 | "saying", 548 | "says", 549 | "sec", 550 | "section", 551 | "see", 552 | "seeing", 553 | "seem", 554 | "seemed", 555 | "seeming", 556 | "seems", 557 | "seen", 558 | "self", 559 | "selves", 560 | "sent", 561 | "seven", 562 | "several", 563 | "shall", 564 | "she", 565 | "she'll", 566 | "shed", 567 | "shes", 568 | "should", 569 | "shouldn't", 570 | "show", 571 | "showed", 572 | "shown", 573 | "showns", 574 | "shows", 575 | "significant", 576 | "significantly", 577 | "similar", 578 | "similarly", 579 | "since", 580 | "six", 581 | "slightly", 582 | "so", 583 | "some", 584 | "somebody", 585 | "somehow", 586 | "someone", 587 | "somethan", 588 | "something", 589 | "sometime", 590 | "sometimes", 591 | "somewhat", 592 | "somewhere", 593 | "soon", 594 | "sorry", 595 | "specifically", 596 | "specified", 597 | "specify", 598 | "specifying", 599 | "still", 600 | "stop", 601 | "strongly", 602 | "sub", 603 | "substantially", 604 | "successfully", 605 | "such", 606 | "sufficiently", 607 | "suggest", 608 | "sup", 609 | "sure", 610 | "t", 611 | "take", 612 | "taken", 613 | "taking", 614 | "tell", 615 | "tends", 616 | "th", 617 | "than", 618 | "thank", 619 | "thanks", 620 | "thanx", 621 | "that", 622 | "that'll", 623 | "that've", 624 | "thats", 625 | "the", 626 | "their", 627 | "theirs", 628 | "them", 629 | "themselves", 630 | "then", 631 | "thence", 632 | "there", 633 | "there'll", 634 | "there've", 635 | "thereafter", 636 | "thereby", 637 | "thered", 638 | "therefore", 639 | "therein", 640 | "thereof", 641 | "therere", 642 | 
"theres", 643 | "thereto", 644 | "thereupon", 645 | "these", 646 | "they", 647 | "they'll", 648 | "they've", 649 | "theyd", 650 | "theyre", 651 | "think", 652 | "this", 653 | "those", 654 | "thou", 655 | "though", 656 | "thoughh", 657 | "thousand", 658 | "throug", 659 | "through", 660 | "throughout", 661 | "thru", 662 | "thus", 663 | "til", 664 | "tip", 665 | "to", 666 | "together", 667 | "too", 668 | "took", 669 | "toward", 670 | "towards", 671 | "tried", 672 | "tries", 673 | "truly", 674 | "try", 675 | "trying", 676 | "ts", 677 | "twice", 678 | "two", 679 | "u", 680 | "un", 681 | "under", 682 | "unfortunately", 683 | "unless", 684 | "unlike", 685 | "unlikely", 686 | "until", 687 | "unto", 688 | "up", 689 | "upon", 690 | "ups", 691 | "us", 692 | "use", 693 | "used", 694 | "useful", 695 | "usefully", 696 | "usefulness", 697 | "uses", 698 | "using", 699 | "usually", 700 | "v", 701 | "value", 702 | "various", 703 | "very", 704 | "via", 705 | "viz", 706 | "vol", 707 | "vols", 708 | "vs", 709 | "w", 710 | "want", 711 | "wants", 712 | "was", 713 | "wasnt", 714 | "way", 715 | "we", 716 | "we'll", 717 | "we've", 718 | "wed", 719 | "welcome", 720 | "went", 721 | "were", 722 | "werent", 723 | "what", 724 | "what'll", 725 | "whatever", 726 | "whats", 727 | "when", 728 | "whence", 729 | "whenever", 730 | "where", 731 | "whereafter", 732 | "whereas", 733 | "whereby", 734 | "wherein", 735 | "wheres", 736 | "whereupon", 737 | "wherever", 738 | "whether", 739 | "which", 740 | "while", 741 | "whim", 742 | "whither", 743 | "who", 744 | "who'll", 745 | "whod", 746 | "whoever", 747 | "whole", 748 | "whom", 749 | "whomever", 750 | "whos", 751 | "whose", 752 | "why", 753 | "widely", 754 | "willing", 755 | "wish", 756 | "with", 757 | "within", 758 | "without", 759 | "wont", 760 | "words", 761 | "world", 762 | "would", 763 | "wouldnt", 764 | "www", 765 | "x", 766 | "y", 767 | "yes", 768 | "yet", 769 | "you", 770 | "you'll", 771 | "you've", 772 | "youd", 773 | "your", 774 | "youre", 775 | "yours", 776 | "yourself", 777 | "yourselves", 778 | "z", 779 | "zero" 780 | ] 781 | }, 782 | "english_stemmer": { 783 | "type": "stemmer", 784 | "language": "english" 785 | }, 786 | "english_light_stemmer": { 787 | "type": "stemmer", 788 | "language": "light_english" 789 | }, 790 | "english_minimal_stemmer": { 791 | "type": "stemmer", 792 | "language": "minimal_english" 793 | }, 794 | "english_possessive_stemmer": { 795 | "type": "stemmer", 796 | "language": "possessive_english" 797 | } 798 | }, 799 | "analyzer": { 800 | "english": { 801 | "tokenizer": "classic", 802 | "filter": [ 803 | "icu_normalizer", 804 | "english_possessive_stemmer", 805 | "english_stop", 806 | "english_minimal_stemmer" 807 | ] 808 | } 809 | } 810 | } 811 | }, 812 | "mappings": { 813 | "_default_": { 814 | "_all": { 815 | "enabled": true 816 | }, 817 | "dynamic_templates": [ 818 | { 819 | "string_fields": { 820 | "mapping": { 821 | "type": "keyword", 822 | "ignore_above": 256 823 | }, 824 | "match": "*", 825 | "match_mapping_type": "string" 826 | } 827 | } 828 | ] 829 | }, 830 | "publication": { 831 | "_all": { 832 | "enabled": true 833 | }, 834 | "dynamic_templates": [ 835 | { 836 | "string_fields": { 837 | "mapping": { 838 | "type": "keyword", 839 | "ignore_above": 256 840 | }, 841 | "match": "*", 842 | "match_mapping_type": "string" 843 | } 844 | } 845 | ], 846 | "properties": { 847 | "abstract": { 848 | "type": "text", 849 | "analyzer": "english", 850 | "eager_global_ordinals": true, 851 | "fielddata": true 852 | }, 853 | "abstract_sentences": { 854 | 
"type": "nested", 855 | "properties": { 856 | "value": { 857 | "type": "text", 858 | "analyzer": "english", 859 | "eager_global_ordinals": true, 860 | "fielddata": true 861 | } 862 | } 863 | }, 864 | "authors": { 865 | "properties": { 866 | "CollectiveName": { 867 | "type": "keyword", 868 | "ignore_above": 256 869 | }, 870 | "ForeName": { 871 | "type": "keyword", 872 | "ignore_above": 256 873 | }, 874 | "Identifier": { 875 | "type": "keyword", 876 | "ignore_above": 256 877 | }, 878 | "Initials": { 879 | "type": "keyword", 880 | "ignore_above": 256 881 | }, 882 | "LastName": { 883 | "type": "keyword", 884 | "ignore_above": 256 885 | }, 886 | "Suffix": { 887 | "type": "keyword", 888 | "ignore_above": 256 889 | }, 890 | "full_name": { 891 | "type": "keyword", 892 | "ignore_above": 256 893 | }, 894 | "short_name": { 895 | "type": "keyword", 896 | "ignore_above": 256 897 | }, 898 | "last_name": { 899 | "type": "keyword", 900 | "ignore_above": 256 901 | } 902 | } 903 | }, 904 | "chemicals": { 905 | "properties": { 906 | "name": { 907 | "type": "keyword", 908 | "ignore_above": 256 909 | }, 910 | "name_id": { 911 | "type": "keyword", 912 | "ignore_above": 256 913 | }, 914 | "registryNumber": { 915 | "type": "keyword", 916 | "ignore_above": 256 917 | } 918 | } 919 | }, 920 | "data_release": { 921 | "type": "keyword", 922 | "ignore_above": 256 923 | }, 924 | "date": { 925 | "type": "date", 926 | "format": "strict_date_optional_time||epoch_millis" 927 | }, 928 | "date_of_revision": { 929 | "type": "date", 930 | "format": "strict_date_optional_time||epoch_millis" 931 | }, 932 | "doi": { 933 | "type": "keyword", 934 | "ignore_above": 256 935 | }, 936 | "filename": { 937 | "type": "keyword", 938 | "ignore_above": 256 939 | }, 940 | "full_text": { 941 | "type": "text", 942 | "analyzer": "english", 943 | "eager_global_ordinals": true, 944 | "fielddata": true 945 | }, 946 | "journal": { 947 | "properties": { 948 | "medlineAbbreviation": { 949 | "type": "keyword", 950 | "ignore_above": 256 951 | }, 952 | "title": { 953 | "type": "keyword", 954 | "ignore_above": 256 955 | } 956 | } 957 | }, 958 | "journal_reference": { 959 | "properties": { 960 | "issue": { 961 | "type": "keyword", 962 | "ignore_above": 256 963 | }, 964 | "pgn": { 965 | "type": "keyword", 966 | "ignore_above": 256 967 | }, 968 | "volume": { 969 | "type": "keyword", 970 | "ignore_above": 256 971 | } 972 | } 973 | }, 974 | "keywords": { 975 | "type": "keyword", 976 | "ignore_above": 256 977 | }, 978 | "mesh_headings": { 979 | "properties": { 980 | "id": { 981 | "type": "keyword", 982 | "ignore_above": 256 983 | }, 984 | "label": { 985 | "type": "keyword", 986 | "ignore_above": 256 987 | } 988 | } 989 | }, 990 | "pub_date": { 991 | "type": "date", 992 | "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis" 993 | }, 994 | "pub_id": { 995 | "type": "keyword", 996 | "ignore_above": 256 997 | }, 998 | "pub_type": { 999 | "type": "keyword", 1000 | "ignore_above": 256 1001 | }, 1002 | "title": { 1003 | "type": "text", 1004 | "analyzer": "english", 1005 | "eager_global_ordinals": true, 1006 | "fielddata": true 1007 | }, 1008 | "text_mined_entities": { 1009 | "type": "object", 1010 | "properties": { 1011 | "nlp": { 1012 | "type": "object", 1013 | "properties": { 1014 | "tagged_text": { 1015 | "type": "string", 1016 | "index": "no" 1017 | }, 1018 | "embedding_text": { 1019 | "type": "object", 1020 | "properties": { 1021 | "ent_tag": { 1022 | "type": "string", 1023 | "index": "no" 1024 | }, 1025 | "plain": { 1026 | "type": "string", 1027 | "index": 
"no" 1028 | }, 1029 | "pos_tag": { 1030 | "type": "string", 1031 | "index": "no" 1032 | } 1033 | } 1034 | } 1035 | } 1036 | } 1037 | } 1038 | } 1039 | } 1040 | } 1041 | } 1042 | } 1043 | -------------------------------------------------------------------------------- /es-mapping-index/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 24, 4 | "refresh_interval": "-1", 5 | "translog.flush_threshold_size": "1000mb", 6 | "number_of_replicas": 0 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /es-mapping/README.md: -------------------------------------------------------------------------------- 1 | # Important 2 | 3 | This directory and the relative files are used by the script load2es.py 4 | -------------------------------------------------------------------------------- /es-mapping/concept.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 148, 4 | "number_of_replicas": 0, 5 | "analysis": { 6 | "filter": { 7 | "english_stop": { 8 | "type": "stop", 9 | "stopwords": [ 10 | "'ll", 11 | "'ve", 12 | "0", 13 | "1", 14 | "10", 15 | "100", 16 | "11", 17 | "12", 18 | "13", 19 | "14", 20 | "15", 21 | "16", 22 | "17", 23 | "18", 24 | "19", 25 | "2", 26 | "20", 27 | "21", 28 | "22", 29 | "23", 30 | "24", 31 | "25", 32 | "26", 33 | "27", 34 | "28", 35 | "29", 36 | "3", 37 | "30", 38 | "31", 39 | "32", 40 | "33", 41 | "34", 42 | "35", 43 | "36", 44 | "37", 45 | "38", 46 | "39", 47 | "4", 48 | "40", 49 | "41", 50 | "42", 51 | "43", 52 | "44", 53 | "45", 54 | "46", 55 | "47", 56 | "48", 57 | "49", 58 | "5", 59 | "50", 60 | "51", 61 | "52", 62 | "53", 63 | "54", 64 | "55", 65 | "56", 66 | "57", 67 | "58", 68 | "59", 69 | "6", 70 | "60", 71 | "61", 72 | "62", 73 | "63", 74 | "64", 75 | "65", 76 | "66", 77 | "67", 78 | "68", 79 | "69", 80 | "7", 81 | "70", 82 | "71", 83 | "72", 84 | "73", 85 | "74", 86 | "75", 87 | "76", 88 | "77", 89 | "78", 90 | "79", 91 | "8", 92 | "80", 93 | "81", 94 | "82", 95 | "83", 96 | "84", 97 | "85", 98 | "86", 99 | "87", 100 | "88", 101 | "89", 102 | "9", 103 | "90", 104 | "91", 105 | "92", 106 | "93", 107 | "94", 108 | "95", 109 | "96", 110 | "97", 111 | "98", 112 | "99", 113 | "a", 114 | "able", 115 | "about", 116 | "above", 117 | "abst", 118 | "accordance", 119 | "according", 120 | "accordingly", 121 | "across", 122 | "act", 123 | "actually", 124 | "added", 125 | "adj", 126 | "affected", 127 | "affecting", 128 | "affects", 129 | "after", 130 | "afterwards", 131 | "again", 132 | "against", 133 | "ah", 134 | "all", 135 | "almost", 136 | "alone", 137 | "along", 138 | "already", 139 | "also", 140 | "although", 141 | "always", 142 | "am", 143 | "among", 144 | "amongst", 145 | "an", 146 | "and", 147 | "announce", 148 | "another", 149 | "any", 150 | "anybody", 151 | "anyhow", 152 | "anymore", 153 | "anyone", 154 | "anything", 155 | "anyway", 156 | "anyways", 157 | "anywhere", 158 | "apparently", 159 | "approximately", 160 | "are", 161 | "aren", 162 | "arent", 163 | "arise", 164 | "around", 165 | "as", 166 | "aside", 167 | "ask", 168 | "asking", 169 | "at", 170 | "auth", 171 | "available", 172 | "away", 173 | "awfully", 174 | "b", 175 | "back", 176 | "be", 177 | "became", 178 | "because", 179 | "become", 180 | "becomes", 181 | "becoming", 182 | "been", 183 | "before", 184 | "beforehand", 185 | "begin", 186 | "beginning", 187 | "beginnings", 188 | "begins", 189 | 
"behind", 190 | "being", 191 | "believe", 192 | "below", 193 | "beside", 194 | "besides", 195 | "between", 196 | "beyond", 197 | "biol", 198 | "both", 199 | "brief", 200 | "briefly", 201 | "but", 202 | "by", 203 | "c", 204 | "ca", 205 | "came", 206 | "can", 207 | "can't", 208 | "cannot", 209 | "cause", 210 | "causes", 211 | "certain", 212 | "certainly", 213 | "co", 214 | "com", 215 | "come", 216 | "comes", 217 | "contain", 218 | "containing", 219 | "contains", 220 | "could", 221 | "couldnt", 222 | "d", 223 | "date", 224 | "did", 225 | "didn't", 226 | "different", 227 | "do", 228 | "does", 229 | "doesn't", 230 | "doing", 231 | "don't", 232 | "done", 233 | "down", 234 | "downwards", 235 | "due", 236 | "during", 237 | "e", 238 | "each", 239 | "ed", 240 | "edu", 241 | "effect", 242 | "eg", 243 | "eight", 244 | "eighty", 245 | "either", 246 | "else", 247 | "elsewhere", 248 | "end", 249 | "ending", 250 | "enough", 251 | "especially", 252 | "et", 253 | "et-al", 254 | "etc", 255 | "even", 256 | "ever", 257 | "every", 258 | "everybody", 259 | "everyone", 260 | "everything", 261 | "everywhere", 262 | "ex", 263 | "except", 264 | "f", 265 | "far", 266 | "few", 267 | "ff", 268 | "fifth", 269 | "first", 270 | "five", 271 | "fix", 272 | "followed", 273 | "following", 274 | "follows", 275 | "for", 276 | "former", 277 | "formerly", 278 | "forth", 279 | "found", 280 | "four", 281 | "from", 282 | "further", 283 | "furthermore", 284 | "g", 285 | "gave", 286 | "get", 287 | "gets", 288 | "getting", 289 | "give", 290 | "given", 291 | "gives", 292 | "giving", 293 | "go", 294 | "goes", 295 | "gone", 296 | "got", 297 | "gotten", 298 | "h", 299 | "had", 300 | "happens", 301 | "hardly", 302 | "has", 303 | "hasn't", 304 | "have", 305 | "haven't", 306 | "having", 307 | "he", 308 | "hed", 309 | "hence", 310 | "her", 311 | "here", 312 | "hereafter", 313 | "hereby", 314 | "herein", 315 | "heres", 316 | "hereupon", 317 | "hers", 318 | "herself", 319 | "hes", 320 | "hi", 321 | "hid", 322 | "him", 323 | "himself", 324 | "his", 325 | "hither", 326 | "home", 327 | "how", 328 | "howbeit", 329 | "however", 330 | "hundred", 331 | "i", 332 | "i'll", 333 | "i've", 334 | "id", 335 | "ie", 336 | "if", 337 | "im", 338 | "immediate", 339 | "immediately", 340 | "importance", 341 | "important", 342 | "in", 343 | "inc", 344 | "indeed", 345 | "index", 346 | "information", 347 | "instead", 348 | "into", 349 | "invention", 350 | "inward", 351 | "is", 352 | "isn't", 353 | "it", 354 | "it'll", 355 | "itd", 356 | "its", 357 | "itself", 358 | "j", 359 | "just", 360 | "k", 361 | "keep", 362 | "keeps", 363 | "kept", 364 | "kg", 365 | "km", 366 | "know", 367 | "known", 368 | "knows", 369 | "l", 370 | "largely", 371 | "last", 372 | "lately", 373 | "later", 374 | "latter", 375 | "latterly", 376 | "least", 377 | "less", 378 | "lest", 379 | "let", 380 | "lets", 381 | "like", 382 | "liked", 383 | "likely", 384 | "line", 385 | "little", 386 | "look", 387 | "looking", 388 | "looks", 389 | "ltd", 390 | "m", 391 | "made", 392 | "mainly", 393 | "make", 394 | "makes", 395 | "many", 396 | "may", 397 | "maybe", 398 | "me", 399 | "mean", 400 | "means", 401 | "meantime", 402 | "meanwhile", 403 | "merely", 404 | "mg", 405 | "might", 406 | "million", 407 | "miss", 408 | "ml", 409 | "more", 410 | "moreover", 411 | "most", 412 | "mostly", 413 | "mr", 414 | "mrs", 415 | "much", 416 | "mug", 417 | "must", 418 | "my", 419 | "myself", 420 | "n", 421 | "na", 422 | "name", 423 | "namely", 424 | "nay", 425 | "nd", 426 | "near", 427 | "nearly", 428 | "necessarily", 429 | 
"necessary", 430 | "need", 431 | "needs", 432 | "neither", 433 | "never", 434 | "nevertheless", 435 | "new", 436 | "next", 437 | "nine", 438 | "ninety", 439 | "no", 440 | "nobody", 441 | "non", 442 | "none", 443 | "nonetheless", 444 | "noone", 445 | "nor", 446 | "normally", 447 | "nos", 448 | "not", 449 | "noted", 450 | "nothing", 451 | "now", 452 | "nowhere", 453 | "o", 454 | "obtain", 455 | "obtained", 456 | "obviously", 457 | "of", 458 | "off", 459 | "often", 460 | "oh", 461 | "ok", 462 | "okay", 463 | "old", 464 | "omitted", 465 | "on", 466 | "once", 467 | "one", 468 | "ones", 469 | "only", 470 | "onto", 471 | "or", 472 | "ord", 473 | "other", 474 | "others", 475 | "otherwise", 476 | "ought", 477 | "our", 478 | "ours", 479 | "ourselves", 480 | "out", 481 | "outside", 482 | "over", 483 | "overall", 484 | "owing", 485 | "own", 486 | "p", 487 | "page", 488 | "pages", 489 | "part", 490 | "particular", 491 | "particularly", 492 | "past", 493 | "per", 494 | "perhaps", 495 | "placed", 496 | "please", 497 | "plus", 498 | "poorly", 499 | "possible", 500 | "possibly", 501 | "potentially", 502 | "pp", 503 | "predominantly", 504 | "present", 505 | "previously", 506 | "primarily", 507 | "probably", 508 | "promptly", 509 | "proud", 510 | "provides", 511 | "put", 512 | "q", 513 | "que", 514 | "quickly", 515 | "quite", 516 | "qv", 517 | "r", 518 | "ran", 519 | "rather", 520 | "rd", 521 | "re", 522 | "readily", 523 | "really", 524 | "recent", 525 | "recently", 526 | "ref", 527 | "refs", 528 | "regarding", 529 | "regardless", 530 | "regards", 531 | "related", 532 | "relatively", 533 | "research", 534 | "respectively", 535 | "resulted", 536 | "resulting", 537 | "results", 538 | "right", 539 | "run", 540 | "s", 541 | "said", 542 | "same", 543 | "saw", 544 | "say", 545 | "saying", 546 | "says", 547 | "sec", 548 | "section", 549 | "see", 550 | "seeing", 551 | "seem", 552 | "seemed", 553 | "seeming", 554 | "seems", 555 | "seen", 556 | "self", 557 | "selves", 558 | "sent", 559 | "seven", 560 | "several", 561 | "shall", 562 | "she", 563 | "she'll", 564 | "shed", 565 | "shes", 566 | "should", 567 | "shouldn't", 568 | "show", 569 | "showed", 570 | "shown", 571 | "showns", 572 | "shows", 573 | "significant", 574 | "significantly", 575 | "similar", 576 | "similarly", 577 | "since", 578 | "six", 579 | "slightly", 580 | "so", 581 | "some", 582 | "somebody", 583 | "somehow", 584 | "someone", 585 | "somethan", 586 | "something", 587 | "sometime", 588 | "sometimes", 589 | "somewhat", 590 | "somewhere", 591 | "soon", 592 | "sorry", 593 | "specifically", 594 | "specified", 595 | "specify", 596 | "specifying", 597 | "still", 598 | "stop", 599 | "strongly", 600 | "sub", 601 | "substantially", 602 | "successfully", 603 | "such", 604 | "sufficiently", 605 | "suggest", 606 | "sup", 607 | "sure", 608 | "t", 609 | "take", 610 | "taken", 611 | "taking", 612 | "tell", 613 | "tends", 614 | "th", 615 | "than", 616 | "thank", 617 | "thanks", 618 | "thanx", 619 | "that", 620 | "that'll", 621 | "that've", 622 | "thats", 623 | "the", 624 | "their", 625 | "theirs", 626 | "them", 627 | "themselves", 628 | "then", 629 | "thence", 630 | "there", 631 | "there'll", 632 | "there've", 633 | "thereafter", 634 | "thereby", 635 | "thered", 636 | "therefore", 637 | "therein", 638 | "thereof", 639 | "therere", 640 | "theres", 641 | "thereto", 642 | "thereupon", 643 | "these", 644 | "they", 645 | "they'll", 646 | "they've", 647 | "theyd", 648 | "theyre", 649 | "think", 650 | "this", 651 | "those", 652 | "thou", 653 | "though", 654 | "thoughh", 655 | 
"thousand", 656 | "throug", 657 | "through", 658 | "throughout", 659 | "thru", 660 | "thus", 661 | "til", 662 | "tip", 663 | "to", 664 | "together", 665 | "too", 666 | "took", 667 | "toward", 668 | "towards", 669 | "tried", 670 | "tries", 671 | "truly", 672 | "try", 673 | "trying", 674 | "ts", 675 | "twice", 676 | "two", 677 | "u", 678 | "un", 679 | "under", 680 | "unfortunately", 681 | "unless", 682 | "unlike", 683 | "unlikely", 684 | "until", 685 | "unto", 686 | "up", 687 | "upon", 688 | "ups", 689 | "us", 690 | "use", 691 | "used", 692 | "useful", 693 | "usefully", 694 | "usefulness", 695 | "uses", 696 | "using", 697 | "usually", 698 | "v", 699 | "value", 700 | "various", 701 | "very", 702 | "via", 703 | "viz", 704 | "vol", 705 | "vols", 706 | "vs", 707 | "w", 708 | "want", 709 | "wants", 710 | "was", 711 | "wasnt", 712 | "way", 713 | "we", 714 | "we'll", 715 | "we've", 716 | "wed", 717 | "welcome", 718 | "went", 719 | "were", 720 | "werent", 721 | "what", 722 | "what'll", 723 | "whatever", 724 | "whats", 725 | "when", 726 | "whence", 727 | "whenever", 728 | "where", 729 | "whereafter", 730 | "whereas", 731 | "whereby", 732 | "wherein", 733 | "wheres", 734 | "whereupon", 735 | "wherever", 736 | "whether", 737 | "which", 738 | "while", 739 | "whim", 740 | "whither", 741 | "who", 742 | "who'll", 743 | "whod", 744 | "whoever", 745 | "whole", 746 | "whom", 747 | "whomever", 748 | "whos", 749 | "whose", 750 | "why", 751 | "widely", 752 | "willing", 753 | "wish", 754 | "with", 755 | "within", 756 | "without", 757 | "wont", 758 | "words", 759 | "world", 760 | "would", 761 | "wouldnt", 762 | "www", 763 | "x", 764 | "y", 765 | "yes", 766 | "yet", 767 | "you", 768 | "you'll", 769 | "you've", 770 | "youd", 771 | "your", 772 | "youre", 773 | "yours", 774 | "yourself", 775 | "yourselves", 776 | "z", 777 | "zero" 778 | ] 779 | }, 780 | "english_stemmer": { 781 | "type": "stemmer", 782 | "language": "english" 783 | }, 784 | "english_light_stemmer": { 785 | "type": "stemmer", 786 | "language": "light_english" 787 | }, 788 | "english_minimal_stemmer": { 789 | "type": "stemmer", 790 | "language": "minimal_english" 791 | }, 792 | "english_possessive_stemmer": { 793 | "type": "stemmer", 794 | "language": "possessive_english" 795 | } 796 | }, 797 | "analyzer": { 798 | "english": { 799 | "tokenizer": "classic", 800 | "filter": [ 801 | "icu_normalizer", 802 | "english_possessive_stemmer", 803 | "english_stop", 804 | "english_minimal_stemmer" 805 | ] 806 | } 807 | } 808 | } 809 | }, 810 | "mappings": { 811 | "_default_": { 812 | "_all": { 813 | "enabled": true 814 | }, 815 | "dynamic_templates": [ 816 | { 817 | "string_fields": { 818 | "mapping": { 819 | "type": "keyword", 820 | "ignore_above": 256 821 | }, 822 | "match": "*", 823 | "match_mapping_type": "string" 824 | } 825 | } 826 | ] 827 | }, 828 | "concept": { 829 | "_all": { 830 | "enabled": false 831 | }, 832 | "dynamic_templates": [ 833 | { 834 | "string_fields": { 835 | "mapping": { 836 | "type": "keyword", 837 | "ignore_above": 256 838 | }, 839 | "match": "*", 840 | "match_mapping_type": "string" 841 | } 842 | } 843 | ], 844 | "properties": { 845 | "concept": { 846 | "type": "object", 847 | "properties": { 848 | "sentence_text": { 849 | "type": "text", 850 | "analyzer": "english", 851 | "eager_global_ordinals": true, 852 | "fielddata": true 853 | }, 854 | "verb_subtree": { 855 | "type": "text", 856 | "analyzer": "english", 857 | "eager_global_ordinals": true, 858 | "fielddata": true 859 | }, 860 | "relations": { 861 | "type": "object", 862 | 
"properties": { 863 | "directed": { 864 | "type": "text", 865 | "analyzer": "whitespace", 866 | "eager_global_ordinals": true, 867 | "fielddata": true 868 | }, 869 | "undirected": { 870 | "type": "text", 871 | "analyzer": "whitespace", 872 | "eager_global_ordinals": true, 873 | "fielddata": true 874 | } 875 | } 876 | } 877 | } 878 | } 879 | } 880 | } 881 | } 882 | } 883 | -------------------------------------------------------------------------------- /es-mapping/publication.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "number_of_shards": 74, 4 | "number_of_replicas": 0, 5 | "analysis": { 6 | "filter": { 7 | "english_stop": { 8 | "type": "stop", 9 | "stopwords": [ 10 | "'ll", 11 | "'ve", 12 | "0", 13 | "1", 14 | "10", 15 | "100", 16 | "11", 17 | "12", 18 | "13", 19 | "14", 20 | "15", 21 | "16", 22 | "17", 23 | "18", 24 | "19", 25 | "2", 26 | "20", 27 | "21", 28 | "22", 29 | "23", 30 | "24", 31 | "25", 32 | "26", 33 | "27", 34 | "28", 35 | "29", 36 | "3", 37 | "30", 38 | "31", 39 | "32", 40 | "33", 41 | "34", 42 | "35", 43 | "36", 44 | "37", 45 | "38", 46 | "39", 47 | "4", 48 | "40", 49 | "41", 50 | "42", 51 | "43", 52 | "44", 53 | "45", 54 | "46", 55 | "47", 56 | "48", 57 | "49", 58 | "5", 59 | "50", 60 | "51", 61 | "52", 62 | "53", 63 | "54", 64 | "55", 65 | "56", 66 | "57", 67 | "58", 68 | "59", 69 | "6", 70 | "60", 71 | "61", 72 | "62", 73 | "63", 74 | "64", 75 | "65", 76 | "66", 77 | "67", 78 | "68", 79 | "69", 80 | "7", 81 | "70", 82 | "71", 83 | "72", 84 | "73", 85 | "74", 86 | "75", 87 | "76", 88 | "77", 89 | "78", 90 | "79", 91 | "8", 92 | "80", 93 | "81", 94 | "82", 95 | "83", 96 | "84", 97 | "85", 98 | "86", 99 | "87", 100 | "88", 101 | "89", 102 | "9", 103 | "90", 104 | "91", 105 | "92", 106 | "93", 107 | "94", 108 | "95", 109 | "96", 110 | "97", 111 | "98", 112 | "99", 113 | "a", 114 | "able", 115 | "about", 116 | "above", 117 | "abst", 118 | "accordance", 119 | "according", 120 | "accordingly", 121 | "across", 122 | "act", 123 | "actually", 124 | "added", 125 | "adj", 126 | "affected", 127 | "affecting", 128 | "affects", 129 | "after", 130 | "afterwards", 131 | "again", 132 | "against", 133 | "ah", 134 | "all", 135 | "almost", 136 | "alone", 137 | "along", 138 | "already", 139 | "also", 140 | "although", 141 | "always", 142 | "am", 143 | "among", 144 | "amongst", 145 | "an", 146 | "and", 147 | "announce", 148 | "another", 149 | "any", 150 | "anybody", 151 | "anyhow", 152 | "anymore", 153 | "anyone", 154 | "anything", 155 | "anyway", 156 | "anyways", 157 | "anywhere", 158 | "apparently", 159 | "approximately", 160 | "are", 161 | "aren", 162 | "arent", 163 | "arise", 164 | "around", 165 | "as", 166 | "aside", 167 | "ask", 168 | "asking", 169 | "at", 170 | "auth", 171 | "available", 172 | "away", 173 | "awfully", 174 | "b", 175 | "back", 176 | "be", 177 | "became", 178 | "because", 179 | "become", 180 | "becomes", 181 | "becoming", 182 | "been", 183 | "before", 184 | "beforehand", 185 | "begin", 186 | "beginning", 187 | "beginnings", 188 | "begins", 189 | "behind", 190 | "being", 191 | "believe", 192 | "below", 193 | "beside", 194 | "besides", 195 | "between", 196 | "beyond", 197 | "biol", 198 | "both", 199 | "brief", 200 | "briefly", 201 | "but", 202 | "by", 203 | "c", 204 | "ca", 205 | "came", 206 | "can", 207 | "can't", 208 | "cannot", 209 | "cause", 210 | "causes", 211 | "certain", 212 | "certainly", 213 | "co", 214 | "com", 215 | "come", 216 | "comes", 217 | "contain", 218 | "containing", 219 | "contains", 
220 | "could", 221 | "couldnt", 222 | "d", 223 | "date", 224 | "did", 225 | "didn't", 226 | "different", 227 | "do", 228 | "does", 229 | "doesn't", 230 | "doing", 231 | "don't", 232 | "done", 233 | "down", 234 | "downwards", 235 | "due", 236 | "during", 237 | "e", 238 | "each", 239 | "ed", 240 | "edu", 241 | "effect", 242 | "eg", 243 | "eight", 244 | "eighty", 245 | "either", 246 | "else", 247 | "elsewhere", 248 | "end", 249 | "ending", 250 | "enough", 251 | "especially", 252 | "et", 253 | "et-al", 254 | "etc", 255 | "even", 256 | "ever", 257 | "every", 258 | "everybody", 259 | "everyone", 260 | "everything", 261 | "everywhere", 262 | "ex", 263 | "except", 264 | "f", 265 | "far", 266 | "few", 267 | "ff", 268 | "fifth", 269 | "first", 270 | "five", 271 | "fix", 272 | "followed", 273 | "following", 274 | "follows", 275 | "for", 276 | "former", 277 | "formerly", 278 | "forth", 279 | "found", 280 | "four", 281 | "from", 282 | "further", 283 | "furthermore", 284 | "g", 285 | "gave", 286 | "get", 287 | "gets", 288 | "getting", 289 | "give", 290 | "given", 291 | "gives", 292 | "giving", 293 | "go", 294 | "goes", 295 | "gone", 296 | "got", 297 | "gotten", 298 | "h", 299 | "had", 300 | "happens", 301 | "hardly", 302 | "has", 303 | "hasn't", 304 | "have", 305 | "haven't", 306 | "having", 307 | "he", 308 | "hed", 309 | "hence", 310 | "her", 311 | "here", 312 | "hereafter", 313 | "hereby", 314 | "herein", 315 | "heres", 316 | "hereupon", 317 | "hers", 318 | "herself", 319 | "hes", 320 | "hi", 321 | "hid", 322 | "him", 323 | "himself", 324 | "his", 325 | "hither", 326 | "home", 327 | "how", 328 | "howbeit", 329 | "however", 330 | "hundred", 331 | "i", 332 | "i'll", 333 | "i've", 334 | "id", 335 | "ie", 336 | "if", 337 | "im", 338 | "immediate", 339 | "immediately", 340 | "importance", 341 | "important", 342 | "in", 343 | "inc", 344 | "indeed", 345 | "index", 346 | "information", 347 | "instead", 348 | "into", 349 | "invention", 350 | "inward", 351 | "is", 352 | "isn't", 353 | "it", 354 | "it'll", 355 | "itd", 356 | "its", 357 | "itself", 358 | "j", 359 | "just", 360 | "k", 361 | "keep", 362 | "keeps", 363 | "kept", 364 | "kg", 365 | "km", 366 | "know", 367 | "known", 368 | "knows", 369 | "l", 370 | "largely", 371 | "last", 372 | "lately", 373 | "later", 374 | "latter", 375 | "latterly", 376 | "least", 377 | "less", 378 | "lest", 379 | "let", 380 | "lets", 381 | "like", 382 | "liked", 383 | "likely", 384 | "line", 385 | "little", 386 | "look", 387 | "looking", 388 | "looks", 389 | "ltd", 390 | "m", 391 | "made", 392 | "mainly", 393 | "make", 394 | "makes", 395 | "many", 396 | "may", 397 | "maybe", 398 | "me", 399 | "mean", 400 | "means", 401 | "meantime", 402 | "meanwhile", 403 | "merely", 404 | "mg", 405 | "might", 406 | "million", 407 | "miss", 408 | "ml", 409 | "more", 410 | "moreover", 411 | "most", 412 | "mostly", 413 | "mr", 414 | "mrs", 415 | "much", 416 | "mug", 417 | "must", 418 | "my", 419 | "myself", 420 | "n", 421 | "na", 422 | "name", 423 | "namely", 424 | "nay", 425 | "nd", 426 | "near", 427 | "nearly", 428 | "necessarily", 429 | "necessary", 430 | "need", 431 | "needs", 432 | "neither", 433 | "never", 434 | "nevertheless", 435 | "new", 436 | "next", 437 | "nine", 438 | "ninety", 439 | "no", 440 | "nobody", 441 | "non", 442 | "none", 443 | "nonetheless", 444 | "noone", 445 | "nor", 446 | "normally", 447 | "nos", 448 | "not", 449 | "noted", 450 | "nothing", 451 | "now", 452 | "nowhere", 453 | "o", 454 | "obtain", 455 | "obtained", 456 | "obviously", 457 | "of", 458 | "off", 459 | "often", 
460 | "oh", 461 | "ok", 462 | "okay", 463 | "old", 464 | "omitted", 465 | "on", 466 | "once", 467 | "one", 468 | "ones", 469 | "only", 470 | "onto", 471 | "or", 472 | "ord", 473 | "other", 474 | "others", 475 | "otherwise", 476 | "ought", 477 | "our", 478 | "ours", 479 | "ourselves", 480 | "out", 481 | "outside", 482 | "over", 483 | "overall", 484 | "owing", 485 | "own", 486 | "p", 487 | "page", 488 | "pages", 489 | "part", 490 | "particular", 491 | "particularly", 492 | "past", 493 | "per", 494 | "perhaps", 495 | "placed", 496 | "please", 497 | "plus", 498 | "poorly", 499 | "possible", 500 | "possibly", 501 | "potentially", 502 | "pp", 503 | "predominantly", 504 | "present", 505 | "previously", 506 | "primarily", 507 | "probably", 508 | "promptly", 509 | "proud", 510 | "provides", 511 | "put", 512 | "q", 513 | "que", 514 | "quickly", 515 | "quite", 516 | "qv", 517 | "r", 518 | "ran", 519 | "rather", 520 | "rd", 521 | "re", 522 | "readily", 523 | "really", 524 | "recent", 525 | "recently", 526 | "ref", 527 | "refs", 528 | "regarding", 529 | "regardless", 530 | "regards", 531 | "related", 532 | "relatively", 533 | "research", 534 | "respectively", 535 | "resulted", 536 | "resulting", 537 | "results", 538 | "right", 539 | "run", 540 | "s", 541 | "said", 542 | "same", 543 | "saw", 544 | "say", 545 | "saying", 546 | "says", 547 | "sec", 548 | "section", 549 | "see", 550 | "seeing", 551 | "seem", 552 | "seemed", 553 | "seeming", 554 | "seems", 555 | "seen", 556 | "self", 557 | "selves", 558 | "sent", 559 | "seven", 560 | "several", 561 | "shall", 562 | "she", 563 | "she'll", 564 | "shed", 565 | "shes", 566 | "should", 567 | "shouldn't", 568 | "show", 569 | "showed", 570 | "shown", 571 | "showns", 572 | "shows", 573 | "significant", 574 | "significantly", 575 | "similar", 576 | "similarly", 577 | "since", 578 | "six", 579 | "slightly", 580 | "so", 581 | "some", 582 | "somebody", 583 | "somehow", 584 | "someone", 585 | "somethan", 586 | "something", 587 | "sometime", 588 | "sometimes", 589 | "somewhat", 590 | "somewhere", 591 | "soon", 592 | "sorry", 593 | "specifically", 594 | "specified", 595 | "specify", 596 | "specifying", 597 | "still", 598 | "stop", 599 | "strongly", 600 | "sub", 601 | "substantially", 602 | "successfully", 603 | "such", 604 | "sufficiently", 605 | "suggest", 606 | "sup", 607 | "sure", 608 | "t", 609 | "take", 610 | "taken", 611 | "taking", 612 | "tell", 613 | "tends", 614 | "th", 615 | "than", 616 | "thank", 617 | "thanks", 618 | "thanx", 619 | "that", 620 | "that'll", 621 | "that've", 622 | "thats", 623 | "the", 624 | "their", 625 | "theirs", 626 | "them", 627 | "themselves", 628 | "then", 629 | "thence", 630 | "there", 631 | "there'll", 632 | "there've", 633 | "thereafter", 634 | "thereby", 635 | "thered", 636 | "therefore", 637 | "therein", 638 | "thereof", 639 | "therere", 640 | "theres", 641 | "thereto", 642 | "thereupon", 643 | "these", 644 | "they", 645 | "they'll", 646 | "they've", 647 | "theyd", 648 | "theyre", 649 | "think", 650 | "this", 651 | "those", 652 | "thou", 653 | "though", 654 | "thoughh", 655 | "thousand", 656 | "throug", 657 | "through", 658 | "throughout", 659 | "thru", 660 | "thus", 661 | "til", 662 | "tip", 663 | "to", 664 | "together", 665 | "too", 666 | "took", 667 | "toward", 668 | "towards", 669 | "tried", 670 | "tries", 671 | "truly", 672 | "try", 673 | "trying", 674 | "ts", 675 | "twice", 676 | "two", 677 | "u", 678 | "un", 679 | "under", 680 | "unfortunately", 681 | "unless", 682 | "unlike", 683 | "unlikely", 684 | "until", 685 | "unto", 686 
| "up", 687 | "upon", 688 | "ups", 689 | "us", 690 | "use", 691 | "used", 692 | "useful", 693 | "usefully", 694 | "usefulness", 695 | "uses", 696 | "using", 697 | "usually", 698 | "v", 699 | "value", 700 | "various", 701 | "very", 702 | "via", 703 | "viz", 704 | "vol", 705 | "vols", 706 | "vs", 707 | "w", 708 | "want", 709 | "wants", 710 | "was", 711 | "wasnt", 712 | "way", 713 | "we", 714 | "we'll", 715 | "we've", 716 | "wed", 717 | "welcome", 718 | "went", 719 | "were", 720 | "werent", 721 | "what", 722 | "what'll", 723 | "whatever", 724 | "whats", 725 | "when", 726 | "whence", 727 | "whenever", 728 | "where", 729 | "whereafter", 730 | "whereas", 731 | "whereby", 732 | "wherein", 733 | "wheres", 734 | "whereupon", 735 | "wherever", 736 | "whether", 737 | "which", 738 | "while", 739 | "whim", 740 | "whither", 741 | "who", 742 | "who'll", 743 | "whod", 744 | "whoever", 745 | "whole", 746 | "whom", 747 | "whomever", 748 | "whos", 749 | "whose", 750 | "why", 751 | "widely", 752 | "willing", 753 | "wish", 754 | "with", 755 | "within", 756 | "without", 757 | "wont", 758 | "words", 759 | "world", 760 | "would", 761 | "wouldnt", 762 | "www", 763 | "x", 764 | "y", 765 | "yes", 766 | "yet", 767 | "you", 768 | "you'll", 769 | "you've", 770 | "youd", 771 | "your", 772 | "youre", 773 | "yours", 774 | "yourself", 775 | "yourselves", 776 | "z", 777 | "zero" 778 | ] 779 | }, 780 | "english_stemmer": { 781 | "type": "stemmer", 782 | "language": "english" 783 | }, 784 | "english_light_stemmer": { 785 | "type": "stemmer", 786 | "language": "light_english" 787 | }, 788 | "english_minimal_stemmer": { 789 | "type": "stemmer", 790 | "language": "minimal_english" 791 | }, 792 | "english_possessive_stemmer": { 793 | "type": "stemmer", 794 | "language": "possessive_english" 795 | } 796 | }, 797 | "analyzer": { 798 | "english": { 799 | "tokenizer": "classic", 800 | "filter": [ 801 | "icu_normalizer", 802 | "english_possessive_stemmer", 803 | "english_stop", 804 | "english_minimal_stemmer" 805 | ] 806 | } 807 | } 808 | } 809 | }, 810 | "mappings": { 811 | "_default_": { 812 | "_all": { 813 | "enabled": true 814 | }, 815 | "dynamic_templates": [ 816 | { 817 | "string_fields": { 818 | "mapping": { 819 | "type": "keyword", 820 | "ignore_above": 256 821 | }, 822 | "match": "*", 823 | "match_mapping_type": "string" 824 | } 825 | } 826 | ] 827 | }, 828 | "publication": { 829 | "_all": { 830 | "enabled": true 831 | }, 832 | "dynamic_templates": [ 833 | { 834 | "string_fields": { 835 | "mapping": { 836 | "type": "keyword", 837 | "ignore_above": 256 838 | }, 839 | "match": "*", 840 | "match_mapping_type": "string" 841 | } 842 | } 843 | ], 844 | "properties": { 845 | "abstract": { 846 | "type": "text", 847 | "analyzer": "english", 848 | "eager_global_ordinals": true, 849 | "fielddata": true 850 | }, 851 | "abstract_sentences": { 852 | "type": "nested", 853 | "properties": { 854 | "value": { 855 | "type": "text", 856 | "analyzer": "english", 857 | "eager_global_ordinals": true, 858 | "fielddata": true 859 | } 860 | } 861 | }, 862 | "authors": { 863 | "properties": { 864 | "CollectiveName": { 865 | "type": "keyword", 866 | "ignore_above": 256 867 | }, 868 | "ForeName": { 869 | "type": "keyword", 870 | "ignore_above": 256 871 | }, 872 | "Identifier": { 873 | "type": "keyword", 874 | "ignore_above": 256 875 | }, 876 | "Initials": { 877 | "type": "keyword", 878 | "ignore_above": 256 879 | }, 880 | "LastName": { 881 | "type": "keyword", 882 | "ignore_above": 256 883 | }, 884 | "Suffix": { 885 | "type": "keyword", 886 | 
"ignore_above": 256 887 | }, 888 | "full_name": { 889 | "type": "keyword", 890 | "ignore_above": 256 891 | }, 892 | "short_name": { 893 | "type": "keyword", 894 | "ignore_above": 256 895 | }, 896 | "last_name": { 897 | "type": "keyword", 898 | "ignore_above": 256 899 | } 900 | } 901 | }, 902 | "chemicals": { 903 | "properties": { 904 | "name": { 905 | "type": "keyword", 906 | "ignore_above": 256 907 | }, 908 | "name_id": { 909 | "type": "keyword", 910 | "ignore_above": 256 911 | }, 912 | "registryNumber": { 913 | "type": "keyword", 914 | "ignore_above": 256 915 | } 916 | } 917 | }, 918 | "data_release": { 919 | "type": "keyword", 920 | "ignore_above": 256 921 | }, 922 | "date": { 923 | "type": "date", 924 | "format": "strict_date_optional_time||epoch_millis" 925 | }, 926 | "date_of_revision": { 927 | "type": "date", 928 | "format": "strict_date_optional_time||epoch_millis" 929 | }, 930 | "doi": { 931 | "type": "keyword", 932 | "ignore_above": 256 933 | }, 934 | "filename": { 935 | "type": "keyword", 936 | "ignore_above": 256 937 | }, 938 | "full_text": { 939 | "type": "text", 940 | "analyzer": "english", 941 | "eager_global_ordinals": true, 942 | "fielddata": true 943 | }, 944 | "journal": { 945 | "properties": { 946 | "medlineAbbreviation": { 947 | "type": "keyword", 948 | "ignore_above": 256 949 | }, 950 | "title": { 951 | "type": "keyword", 952 | "ignore_above": 256 953 | } 954 | } 955 | }, 956 | "journal_reference": { 957 | "properties": { 958 | "issue": { 959 | "type": "keyword", 960 | "ignore_above": 256 961 | }, 962 | "pgn": { 963 | "type": "keyword", 964 | "ignore_above": 256 965 | }, 966 | "volume": { 967 | "type": "keyword", 968 | "ignore_above": 256 969 | } 970 | } 971 | }, 972 | "keywords": { 973 | "type": "keyword", 974 | "ignore_above": 256 975 | }, 976 | "mesh_headings": { 977 | "properties": { 978 | "id": { 979 | "type": "keyword", 980 | "ignore_above": 256 981 | }, 982 | "label": { 983 | "type": "keyword", 984 | "ignore_above": 256 985 | } 986 | } 987 | }, 988 | "pub_date": { 989 | "type": "date", 990 | "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd||epoch_millis" 991 | }, 992 | "pub_id": { 993 | "type": "keyword", 994 | "ignore_above": 256 995 | }, 996 | "pub_type": { 997 | "type": "keyword", 998 | "ignore_above": 256 999 | }, 1000 | "title": { 1001 | "type": "text", 1002 | "analyzer": "english", 1003 | "eager_global_ordinals": true, 1004 | "fielddata": true 1005 | }, 1006 | "text_mined_entities": { 1007 | "type": "object", 1008 | "properties": { 1009 | "nlp": { 1010 | "type": "object", 1011 | "properties": { 1012 | "tagged_text": { 1013 | "type": "string", 1014 | "index": "no" 1015 | }, 1016 | "embedding_text": { 1017 | "type": "object", 1018 | "properties": { 1019 | "ent_tag": { 1020 | "type": "string", 1021 | "index": "no" 1022 | }, 1023 | "plain": { 1024 | "type": "string", 1025 | "index": "no" 1026 | }, 1027 | "pos_tag": { 1028 | "type": "string", 1029 | "index": "no" 1030 | } 1031 | } 1032 | } 1033 | } 1034 | } 1035 | } 1036 | } 1037 | } 1038 | } 1039 | } 1040 | } 1041 | -------------------------------------------------------------------------------- /gcp-local-ssd/readme.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | Below are a series of commands useful for doing things with elasticsearch 4 | 5 | Setup a local environment variable for convienience 6 | ```sh 7 | HOST=es-190313-102133 8 | ``` 9 | 10 | Increase the threshold for "breakers" to help prevent false triggers 11 | ```sh 12 | time curl -XPUT 
"http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cluster/settings" -H 'Content-Type: application/json' -d' 13 | { 14 | "transient" : { 15 | "indices.breaker.request.limit" : "90%", 16 | "network.breaker.inflight_requests.limit": "100%" 17 | } 18 | }' 19 | ``` 20 | 21 | Below are a series of commands useful for finding out the status of elasticsearch 22 | ```sh 23 | curl "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cat/nodes?v&s=name" 24 | curl "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cat/indices?v&s=index" 25 | curl "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cat/shards?v&s=index,shard,prirep" 26 | curl "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cat/allocation?v&s=node" 27 | curl "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cluster/health?pretty" 28 | curl "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cluster/state?pretty" 29 | ``` 30 | 31 | To set a default number of shards create a "template" for future indexes before creating any 32 | ```sh 33 | curl -XPUT "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_template/default" -H 'Content-Type: application/json' \ 34 | -d'{"template":"*","settings":{"number_of_shards":37}}' 35 | ``` 36 | 37 | To actually do the loading 38 | ``` 39 | time python load2es.py bioentity taggedtext publication concept --es "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200" 40 | ``` 41 | 42 | Required for LINK to work properly 43 | ```sh 44 | time curl -XPUT "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-19-concept/_settings" -H 'Content-Type: application/json' -d' 45 | { 46 | "index" : { 47 | "max_adjacency_matrix_filters" : 500 48 | } 49 | }' 50 | ``` -------------------------------------------------------------------------------- /gcp-local-ssd/run.sh: -------------------------------------------------------------------------------- 1 | # 2 | # This script will create a new elasticsearch cluster 3 | # It is configures as a GCP Instance Group with an Instance Template 4 | # and sets up a Load Balancer in front of it 5 | # 6 | 7 | 8 | NOW=`date +'%y%m%d-%H%M%S'` 9 | NAME=es-$NOW 10 | PROJECT=open-targets-library 11 | 12 | gcloud --project=$PROJECT \ 13 | compute instance-templates create $NAME \ 14 | --custom-cpu 2 \ 15 | --custom-memory 12 \ 16 | --local-ssd="" \ 17 | --image-project debian-cloud \ 18 | --image-family debian-9 \ 19 | --scopes=compute-rw \ 20 | --metadata-from-file startup-script=startup.sh 21 | 22 | #if trying to do containers, use this 23 | # --image-project cos-cloud \ 24 | # --image-family cos-stable \ 25 | 26 | #NOTE this is a BETA command and liable to change in future! 
27 | gcloud beta compute --project=$PROJECT \ 28 | instance-groups managed create $NAME \ 29 | --size=37 \ 30 | --template=$NAME \ 31 | --zone=europe-west1-d 32 | 33 | #create a healthcheck 34 | #used by autohealing and load balancing 35 | #check for 10s every 10s each 6 times for 1m total 36 | gcloud compute --project=$PROJECT \ 37 | health-checks create http $NAME \ 38 | --request-path="/_nodes/_local" \ 39 | --port=9200 \ 40 | --check-interval=10s \ 41 | --timeout=10s \ 42 | --unhealthy-threshold=3 \ 43 | --healthy-threshold=3 44 | 45 | #configure healthcheck for autohealing 46 | gcloud beta compute --project=$PROJECT \ 47 | instance-groups managed update $NAME \ 48 | --health-check=$NAME \ 49 | --zone=europe-west1-d 50 | 51 | #configure a load balancer 52 | #create the load balancer backend service 53 | gcloud compute --project=$PROJECT \ 54 | backend-services create $NAME \ 55 | --health-checks=$NAME \ 56 | --load-balancing-scheme=internal \ 57 | --protocol=tcp \ 58 | --region=europe-west1 59 | #add the instance group to the backend service 60 | gcloud compute --project=$PROJECT \ 61 | backend-services add-backend $NAME \ 62 | --instance-group=$NAME \ 63 | --region=europe-west1 \ 64 | --instance-group-zone=europe-west1-d 65 | 66 | #create a forwarding rule for the actual load balancing 67 | #must use a service label to get dns! 68 | gcloud compute --project=$PROJECT \ 69 | forwarding-rules create $NAME \ 70 | --service-label $NAME \ 71 | --region=europe-west1 \ 72 | --address-region=europe-west1 \ 73 | --load-balancing-scheme=internal \ 74 | --ip-protocol=TCP \ 75 | --ports=all \ 76 | --backend-service=$NAME 77 | 78 | # [SERVICE_LABEL].[FORWARDING_RULE_NAME].il4.[REGION].lb.[PROJECT_ID].internal 79 | 80 | #curl http://$NAME.$NAME.il4.europe-west1.lb.open-targets-af.internal:9200 81 | 82 | #configure firewall to allow healthchecks - manual 83 | 84 | #sudo journalctl -n 500 -f -u google-startup-scripts.service 85 | 86 | #time curl "localhost:9200/_cat/nodes?v&s=name" 87 | #time curl localhost:9200/_cluster/health?pretty 88 | #time curl localhost:9200/_cluster/state?pretty 89 | #time curl localhost:9200/_cat/master?v 90 | #time curl "localhost:9200/_cat/shards?v&s=index,shard,prirep" 91 | -------------------------------------------------------------------------------- /gcp-local-ssd/startup.sh: -------------------------------------------------------------------------------- 1 | #initial setup 2 | #------------- 3 | 4 | #update packages, and install prerequisites for elasticsearch 5 | #use the non-interactive frontend for apt so we don't get any prompts 6 | export DEBIAN_FRONTEND=noninteractive 7 | #install with apt-get and autoconfirm 8 | apt-get update 9 | apt-get -yq install \ 10 | openjdk-8-jdk-headless \ 11 | net-tools \ 12 | wget \ 13 | uuid-runtime \ 14 | python-pip \ 15 | python-dev \ 16 | python-urllib3 \ 17 | libyaml-dev \ 18 | less \ 19 | apt-transport-https 20 | 21 | #ensure pip is the latest version, more than the debian package 22 | pip install --upgrade pip 23 | 24 | 25 | #install elasticsearch 26 | #--------------------- 27 | ES_VERSION=5.6.15 28 | #download the elasticsearch package 29 | wget --quiet --no-check-certificate \ 30 | --output-document=/tmp/elasticsearch-$ES_VERSION.deb \ 31 | https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-$ES_VERSION.deb 32 | #install the elasticsearch package 33 | #use the non-interactive frontend for dpkg so we don't get any prompts 34 | export DEBIAN_FRONTEND=noninteractive 35 | dpkg -i 
/tmp/elasticsearch-$ES_VERSION.deb 36 | #post-install cleanup 37 | rm /tmp/elasticsearch-$ES_VERSION.deb 38 | 39 | #install elasticsearch google compute engine discovery plugin 40 | #note this requires scopes=compute-rw on the VM 41 | /usr/share/elasticsearch/bin/elasticsearch-plugin -s install discovery-gce 42 | 43 | #install elasticsearch google storage plugin 44 | #used to save snapshots into a google cloud bucket 45 | /usr/share/elasticsearch/bin/elasticsearch-plugin -s install repository-gcs 46 | 47 | #International Components for Unicode support plugin 48 | /usr/share/elasticsearch/bin/elasticsearch-plugin -s install analysis-icu 49 | 50 | #configure elasticsearch 51 | #----------------------- 52 | 53 | #configure elasticseach 54 | # cluster.name must be unique on network for udp broadcast 55 | # network.host allow connections on any network device, not just localhost 56 | # http.port use only 9200 nothing else 57 | # bootstrap.memory_lock disable swap 58 | # xpack.security.enabled turn off xpack extras 59 | cat > /etc/elasticsearch/elasticsearch.yml < /etc/elasticsearch/jvm.options < /etc/security/limits.conf 108 | * soft nofile 65536 109 | * hard nofile 65536 110 | * soft memlock unlimited 111 | * hard memlock unlimited 112 | EOF_C 113 | 114 | # set all sysctl configurations 115 | sysctl -p 116 | 117 | # disable swap another way 118 | swapoff -a 119 | 120 | #more kernel changes to ensure we get best performance 121 | #more disabling of swap, locking of memory, and reducing unnecessary disk IO 122 | echo "block/sda/queue/scheduler = noop" >> /etc/sysfs.conf 123 | echo noop > /sys/block/sda/queue/scheduler 124 | sed -i 's/\#LimitMEMLOCK=infinity/LimitMEMLOCK=infinity/g' /usr/lib/systemd/system/elasticsearch.service 125 | sed -i '46iLimitMEMLOCK=infinity' /usr/lib/systemd/system/elasticsearch.service 126 | systemctl daemon-reload 127 | 128 | #actually start the elasticseach server now everything is ready! 129 | service elasticsearch start 130 | -------------------------------------------------------------------------------- /gcp-persistent-disk/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | This directory contains the scripts to generate LINK infrastructure and the instructions about how to load the data into ES. 4 | 5 | 6 | ## Infrastructure 7 | 8 | The file run.sh creates the infrastructure. In this first release the number of VM is hardcoded. (3) 9 | 10 | *) 3 Vms: 8cpu and 52 GB
11 | *) Elasticsearch 5.6 and 24 shards
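
The shard count follows the sizing rule noted in steps.sh: roughly one primary shard per vCPU, i.e. 3 VMs x 8 CPUs = 24. If the VM count or machine type changes, adjust the default index template to match; the command below is the one from steps.sh, repeated here for context (it assumes HOST has been exported as described in the loading section further down):

> curl -XPUT "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_template/default" -H 'Content-Type: application/json' -d'{"template":"*","settings":{"number_of_shards":24}}'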
12 | 13 | gcloud beta compute --project=$PROJECT \ 14 | instance-groups managed create $NAME \ 15 | --size=3 \ 16 | ... 17 | 18 | The file startup.sh is used by run.sh for creating the instance template. 19 | 20 | In order to create a new version the user must change the parameter "cluster.name" (ES cluster) and if the number of VMs (run.sh) changes the parameter "minimum_master_nodes" must be number_of_total_vm/2+1. (The number of VMs should be odd) 21 | 22 | discovery: 23 | zen: 24 | hosts_provider: gce 25 | minimum_master_nodes: 2 26 | indices.store.throttle.max_bytes_per_sec: "200mb" 27 | cluster.name: library201911v7 28 | 29 | 30 | ## Load the data in ES 31 | The infrastructure scripts generates a DNS name. 32 | In google cloud these info are stored under Network services > Load balancing 33 | 34 | Eg 35 | [SERVICE_LABEL].[FORWARDING_RULE_NAME].il4.[REGION].lb.[PROJECT_ID].internal 36 | 37 | To test if the cluster is available and correct: 38 | 39 | > export HOST= _[SERVICE_LABEL]_ 40 | > curl http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200 41 | > 42 | > curl -X GET "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cat/nodes?v&s=name" 43 | > curl -X GET "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cluster/health?pretty" 44 | 45 | 46 | The script "steps.sh" contains the steps to load the data into the ES. Please use this script as detailed reference. 47 | 48 | This is a prototype of infrastructure and the aim is builing a list of commands to run. 49 | 50 | In future, we aim to create an automatic script. 51 | 52 | -------------------------------------------------------------------------------- /gcp-persistent-disk/exec.sh: -------------------------------------------------------------------------------- 1 | gcloud compute instances create es5-library-node-$1 \ 2 | --image-project debian-cloud \ 3 | --image-family debian-9 \ 4 | --machine-type n1-highmem-8 \ 5 | --zone europe-west1-d \ 6 | --metadata-from-file startup-script=startup.sh \ 7 | --boot-disk-size "1024" \ 8 | --boot-disk-type "pd-ssd" --boot-disk-device-name "es5-library-node-ssd-$1" \ 9 | --project open-targets-library \ 10 | --scopes default,storage-rw,compute-rw 11 | -------------------------------------------------------------------------------- /gcp-persistent-disk/run.sh: -------------------------------------------------------------------------------- 1 | # 2 | # This script will create a new elasticsearch cluster 3 | # It is configures as a GCP Instance Group with an Instance Template 4 | # and sets up a Load Balancer in front of it 5 | # 6 | 7 | 8 | NOW=`date +'%y%m%d-%H%M%S'` 9 | NAME=es-$NOW 10 | PROJECT=open-targets-library 11 | 12 | gcloud --project=$PROJECT \ 13 | compute instance-templates create $NAME \ 14 | --machine-type n1-highmem-8 \ 15 | --image-project debian-cloud \ 16 | --image-family debian-9 \ 17 | --boot-disk-size "1024" \ 18 | --boot-disk-type "pd-ssd" \ 19 | --scopes default,storage-rw,compute-rw \ 20 | --metadata-from-file startup-script=startup.sh 21 | 22 | #if trying to do containers, use this 23 | # --image-project cos-cloud \ 24 | # --image-family cos-stable \ 25 | 26 | #NOTE this is a BETA command and liable to change in future! 
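#(clarifying note, not part of the original script)
#if --size below changes, also update discovery.zen.minimum_master_nodes in
#startup.sh to number_of_total_vm/2+1 as described in the README
#(e.g. 3 VMs -> 2, 5 VMs -> 3), and keep the number of VMs odd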
27 | gcloud beta compute --project=$PROJECT \ 28 | instance-groups managed create $NAME \ 29 | --size=3 \ 30 | --template=$NAME \ 31 | --zone=europe-west1-d 32 | 33 | #create a healthcheck 34 | #used by autohealing and load balancing 35 | #check for 10s every 10s each 6 times for 1m total 36 | gcloud compute --project=$PROJECT \ 37 | health-checks create http $NAME \ 38 | --request-path="/_nodes/_local" \ 39 | --port=9200 \ 40 | --check-interval=10s \ 41 | --timeout=10s \ 42 | --unhealthy-threshold=3 \ 43 | --healthy-threshold=3 44 | 45 | #configure healthcheck for autohealing 46 | gcloud beta compute --project=$PROJECT \ 47 | instance-groups managed update $NAME \ 48 | --health-check=$NAME \ 49 | --zone=europe-west1-d 50 | 51 | #configure a load balancer 52 | #create the load balancer backend service 53 | gcloud compute --project=$PROJECT \ 54 | backend-services create $NAME \ 55 | --health-checks=$NAME \ 56 | --load-balancing-scheme=internal \ 57 | --protocol=tcp \ 58 | --region=europe-west1 59 | 60 | #add the instance group to the backend service 61 | gcloud compute --project=$PROJECT \ 62 | backend-services add-backend $NAME \ 63 | --instance-group=$NAME \ 64 | --region=europe-west1 \ 65 | --instance-group-zone=europe-west1-d 66 | 67 | #create a forwarding rule for the actual load balancing 68 | #must use a service label to get dns! 69 | gcloud compute --project=$PROJECT \ 70 | forwarding-rules create $NAME \ 71 | --service-label $NAME \ 72 | --region=europe-west1 \ 73 | --address-region=europe-west1 \ 74 | --load-balancing-scheme=internal \ 75 | --ip-protocol=TCP \ 76 | --ports=all \ 77 | --backend-service=$NAME 78 | 79 | # [SERVICE_LABEL].[FORWARDING_RULE_NAME].il4.[REGION].lb.[PROJECT_ID].internal 80 | 81 | #curl http://$NAME.$NAME.il4.europe-west1.lb.open-targets-af.internal:9200 82 | 83 | #configure firewall to allow healthchecks - manual 84 | 85 | #sudo journalctl -n 500 -f -u google-startup-scripts.service 86 | 87 | #time curl "localhost:9200/_cat/nodes?v&s=name" 88 | #time curl localhost:9200/_cluster/health?pretty 89 | #time curl localhost:9200/_cluster/state?pretty 90 | #time curl localhost:9200/_cat/master?v 91 | #time curl "localhost:9200/_cat/shards?v&s=index,shard,prirep" 92 | -------------------------------------------------------------------------------- /gcp-persistent-disk/startup.sh: -------------------------------------------------------------------------------- 1 | #initial setup 2 | #------------- 3 | 4 | #update packages, and install prerequisites for elasticsearch 5 | #use the non-interactive frontend for apt so we don't get any prompts 6 | export DEBIAN_FRONTEND=noninteractive 7 | #install with apt-get and autoconfirm 8 | apt-get update 9 | apt-get -yq install \ 10 | openjdk-8-jdk-headless \ 11 | net-tools \ 12 | wget \ 13 | uuid-runtime \ 14 | python-pip \ 15 | python-dev \ 16 | python-urllib3 \ 17 | libyaml-dev \ 18 | tmux \ 19 | jq \ 20 | less \ 21 | apt-transport-https 22 | 23 | #ensure pip is the latest version, more than the debian package 24 | pip install --upgrade pip 25 | 26 | 27 | #install elasticsearch 28 | #--------------------- 29 | ES_VERSION=5.6.15 30 | #download the elasticsearch package 31 | wget --quiet --no-check-certificate \ 32 | --output-document=/tmp/elasticsearch-$ES_VERSION.deb \ 33 | https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-$ES_VERSION.deb 34 | #install the elasticsearch package 35 | #use the non-interactive frontend for dpkg so we don't get any prompts 36 | export DEBIAN_FRONTEND=noninteractive 37 | dpkg -i 
/tmp/elasticsearch-$ES_VERSION.deb 38 | #post-install cleanup 39 | rm /tmp/elasticsearch-$ES_VERSION.deb 40 | 41 | #install elasticsearch google compute engine discovery plugin 42 | #note this requires scopes=compute-rw on the VM 43 | /usr/share/elasticsearch/bin/elasticsearch-plugin -s install discovery-gce 44 | 45 | #install elasticsearch google storage plugin 46 | #used to save snapshots into a google cloud bucket 47 | /usr/share/elasticsearch/bin/elasticsearch-plugin -s install repository-gcs 48 | 49 | #International Components for Unicode support plugin 50 | /usr/share/elasticsearch/bin/elasticsearch-plugin -s install analysis-icu 51 | 52 | #configure elasticsearch 53 | #----------------------- 54 | 55 | #configure elasticseach 56 | # cluster.name must be unique on network for udp broadcast 57 | # network.host allow connections on any network device, not just localhost 58 | # http.port use only 9200 nothing else 59 | # bootstrap.memory_lock disable swap 60 | # xpack.security.enabled turn off xpack extras 61 | cat > /etc/elasticsearch/elasticsearch.yml < /etc/elasticsearch/jvm.options < /etc/security/limits.conf 110 | * soft nofile 65536 111 | * hard nofile 65536 112 | * soft memlock unlimited 113 | * hard memlock unlimited 114 | EOF_C 115 | 116 | # set all sysctl configurations 117 | sysctl -p 118 | 119 | # disable swap another way 120 | swapoff -a 121 | 122 | #more kernel changes to ensure we get best performance 123 | #more disabling of swap, locking of memory, and reducing unnecessary disk IO 124 | echo "block/sda/queue/scheduler = noop" >> /etc/sysfs.conf 125 | echo noop > /sys/block/sda/queue/scheduler 126 | sed -i 's/\#LimitMEMLOCK=infinity/LimitMEMLOCK=infinity/g' /usr/lib/systemd/system/elasticsearch.service 127 | sed -i '46iLimitMEMLOCK=infinity' /usr/lib/systemd/system/elasticsearch.service 128 | systemctl daemon-reload 129 | 130 | #actually start the elasticseach server now everything is ready! 131 | service elasticsearch start 132 | -------------------------------------------------------------------------------- /gcp-persistent-disk/steps.sh: -------------------------------------------------------------------------------- 1 | # DNS name 2 | # Eg. http://es-201206-133204.es-201206-133204.il4.europe-west1.lb.open-targets-library.internal:9200 3 | # HOST=es-200617-101804 4 | # curl -X GET http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200 5 | 6 | export HOST=es-201002-123122 7 | 8 | # the number of shard is related with CPU and VMS. Eg. 
3VMsx8cpu=24 9 | curl -XPUT "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_template/default" -H 'Content-Type: application/json' \ 10 | -d'{"template":"*","settings":{"number_of_shards":24}}' 11 | 12 | mkdir loader 13 | cd loader 14 | 15 | # Settings for the different indices 16 | curl -X GET https://raw.githubusercontent.com/opentargets/library-beam/master/es-mapping-index/concept.json > concept.json 17 | curl -X GET https://raw.githubusercontent.com/opentargets/library-beam/master/es-mapping-index/publication.json > publication.json 18 | curl -X GET https://raw.githubusercontent.com/opentargets/library-beam/master/es-mapping-index/settings.json > settings.json 19 | 20 | curl -XPUT "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20-taggedtext?pretty" -H 'Content-Type: application/json' -d@"settings.json" 21 | curl -XPUT "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20-bioentity?pretty" -H 'Content-Type: application/json' -d@"settings.json" 22 | curl -XPUT "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20?pretty" -H 'Content-Type: application/json' -d@"publication.json" 23 | curl -XPUT "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20-concept?pretty" -H 'Content-Type: application/json' -d@"concept.json" 24 | 25 | #Adam suggested to add this. Change the HOST 26 | curl -XPUT 'http://es-201002-123122.es-201002-123122.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20-concept/_settings' -H 'Content-Type: application/json' -d'{"index" : {"max_adjacency_matrix_filters" : 500}}' 27 | 28 | curl -X GET http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/_cat/indices 29 | 30 | # list of files stored in Google Storage 31 | gsutil ls gs://medline_2020_10/splitted/pubmed\*_bioentities\*.json.gz > bioentities_files.txt 32 | gsutil ls gs://medline_2020_10/splitted/pubmed\*_taggedtext\*.json.gz > taggedtext_files.txt 33 | gsutil ls gs://medline_2020_10/splitted/pubmed\*_small\*.json.gz > publication_files.txt 34 | gsutil ls gs://medline_2020_10/splitted/pubmed\*_concepts\*.json.gz > concepts_files.txt 35 | 36 | 37 | # Taggedtext index // BEGIN FOR EVERY INDEX 38 | 39 | #split the file for running 10 processes 40 | wc -l taggedtext_files.txt 41 | split -l 1240 taggedtext_files.txt taggedtext_split_ 42 | 43 | # bioentities split 44 | wc -l bioentities_files.txt 45 | split -l 1240 bioentities_files.txt bio_split_ 46 | 47 | # Concept 48 | wc -l concepts_files.txt 49 | split -l 11240 concepts_files.txt conc_split_ 50 | 51 | # publication split 52 | wc -l publication_files.txt 53 | split -l 1240 publication_files.txt publ_split_ 54 | 55 | #_index_name_tmux.sh 56 | # HOST=dns_name_hardcode (todo: change YOUR_PATH and HOST.) 57 | #!/bin/bash 58 | FILES=$YOUR_PATH/loader/taggedtext_split_* 59 | tmux start-server 60 | for f in $FILES 61 | do 62 | windowName="tagg-${f: -2}" 63 | # take action on each file. 
$f store current file name 64 | #cat $f 65 | echo $windowName 66 | tmux new-session -d -s ${windowName} 67 | tmux send-keys -t ${windowName} "source ~/library-beam/venv_elastic/bin/activate" Enter 68 | tmux send-keys -t ${windowName} "export HOST=es-201002-123122" Enter 69 | tmux send-keys -t ${windowName} "export input=${f}; ./es_tag.sh" Enter 70 | done 71 | 72 | # es_tag.sh 73 | time for file in $(cat ${input}); do gsutil cat $file | gunzip | elasticsearch_loader --es-host "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200" --with-retry --bulk-size 10000 --index pubmed-20-taggedtext --type taggedtext --id-field pub_id json --json-lines - ; done 74 | 75 | #Kill the list of tmux opened 76 | #!/bin/bash 77 | FILES=$YOUR_PATH/loader/taggedtext_split_* 78 | tmux start-server 79 | for f in $FILES 80 | do 81 | windowName="tagg-${f: -2}" 82 | echo $windowName 83 | tmux kill-session -t ${windowName} 84 | done 85 | 86 | # Changed the refresh interval 87 | export HOST=dns_name_param 88 | curl -XPUT http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20-taggedtext/_settings -d '{"index":{"refresh_interval":"1s"}}' 89 | 90 | Eg, 91 | curl -XPUT http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20-bioentity/_settings -d '{"index":{"refresh_interval":"1s"}}' 92 | 93 | curl -XPUT http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20-concept/_settings -d '{"index":{"refresh_interval":"1s"}}' 94 | 95 | curl -XPUT http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200/pubmed-20/_settings -d '{"index":{"refresh_interval":"1s"}}' 96 | 97 | 98 | #### IMPORTANT 99 | The index es_concept.sh is slightly different due the id-field value 100 | 101 | time for file in $(cat ${input}); do gsutil cat $file | gunzip | elasticsearch_loader --es-host "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.int 102 | ernal:9200" --with-retry --bulk-size 10000 --index pubmed-20-concept --type concept json --json-lines - ; done 103 | 104 | There are some examples under "tmux_example" 105 | 106 | #Publication Alias. esurl : replace with the proper URL 107 | 108 | curl -XPOST 'http://esurl:9200/_aliases?pretty' -H 'Content-Type: application/json' -d ' 109 | { 110 | "actions": [ 111 | {"add": {"index": "pubmed-20", "alias": "!publication-data"}} 112 | ] 113 | } ' 114 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/README.md: -------------------------------------------------------------------------------- 1 | This directory contains the skeleton of the scripts for loading the indices into ES. 2 | 3 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/bioentity_tmux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES=YOUR_PATH/loader/bio_split_* 4 | tmux start-server 5 | for f in $FILES 6 | do 7 | windowName="bioe-${f: -2}" 8 | echo $windowName 9 | tmux new-session -d -s ${windowName} 10 | tmux send-keys -t ${windowName} "source ~/library-beam/venv_elastic/bin/activate" Enter 11 | #Add the dns_name here. Todo: improve it. 12 | tmux send-keys -t ${windowName} "export HOST=....." 
Enter 13 | tmux send-keys -t ${windowName} "export input=${f}; ./es_bio.sh" Enter 14 | done 15 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/bioentity_tmux_kill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES=_YOUR_PATH_/loader/bio_split_* 4 | tmux start-server 5 | for f in $FILES 6 | do 7 | windowName="bioe-${f: -2}" 8 | echo $windowName 9 | tmux kill-session -t ${windowName} 10 | done 11 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/concept_tmux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES=YOUR_PATH/loader/conc_split_* 4 | tmux start-server 5 | for f in $FILES 6 | do 7 | windowName="conc-${f: -2}" 8 | echo $windowName 9 | tmux new-session -d -s ${windowName} 10 | tmux send-keys -t ${windowName} "source ~/library-beam/venv_elastic/bin/activate" Enter 11 | #Add the dns_name here. Todo: improve it. 12 | tmux send-keys -t ${windowName} "export HOST=......" Enter 13 | tmux send-keys -t ${windowName} "export input=${f}; ./es_concept.sh" Enter 14 | done 15 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/concept_tmux_kill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES=YOUR_PATH/loader/conc_split_* 4 | tmux start-server 5 | for f in $FILES 6 | do 7 | windowName="conc-${f: -2}" 8 | echo $windowName 9 | tmux kill-session -t ${windowName} 10 | done 11 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/es_bio.sh: -------------------------------------------------------------------------------- 1 | time for file in $(cat ${input}); do gsutil cat $file | gunzip | elasticsearch_loader --es-host "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200" --with-retry --bulk-size 10000 --index pubmed-19-bioentity --type bioentity --id-field pub_id json --json-lines - ; done 2 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/es_concept.sh: -------------------------------------------------------------------------------- 1 | time for file in $(cat ${input}); do gsutil cat $file | gunzip | elasticsearch_loader --es-host "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200" --with-retry --bulk-size 10000 --index pubmed-19-concept --type concept json --json-lines - ; done 2 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/es_pub.sh: -------------------------------------------------------------------------------- 1 | time for file in $(cat ${input}); do gsutil cat $file | gunzip | elasticsearch_loader --es-host "http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200" --with-retry --bulk-size 10000 --index pubmed-19 --type publication --id-field pub_id json --json-lines - ; done 2 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/es_tag.sh: -------------------------------------------------------------------------------- 1 | time for file in $(cat ${input}); do gsutil cat $file | gunzip | elasticsearch_loader --es-host 
"http://$HOST.$HOST.il4.europe-west1.lb.open-targets-library.internal:9200" --with-retry --bulk-size 10000 --index pubmed-19-taggedtext --type taggedtext --id-field pub_id json --json-lines - ; done 2 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/publication_tmux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES=_YOUR_PATH_/loader/publ_split_* 4 | tmux start-server 5 | for f in $FILES 6 | do 7 | windowName="publ-${f: -2}" 8 | echo $windowName 9 | tmux new-session -d -s ${windowName} 10 | tmux send-keys -t ${windowName} "source ~/library-beam/venv_elastic/bin/activate" Enter 11 | #Add the dns_name here. Todo: improve it. 12 | tmux send-keys -t ${windowName} "export HOST=......" Enter 13 | tmux send-keys -t ${windowName} "export input=${f}; ./es_pub.sh" Enter 14 | done 15 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/publication_tmux_kill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES=_YOUR_PATH/loader/publ_split_* 4 | tmux start-server 5 | for f in $FILES 6 | do 7 | windowName="publ-${f: -2}" 8 | echo $windowName 9 | tmux kill-session -t ${windowName} 10 | done 11 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/taggedtext_tmux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES=_YOUR_PATH_/loader/taggedtext_split_* 4 | tmux start-server 5 | for f in $FILES 6 | do 7 | windowName="tagg-${f: -2}" 8 | echo $windowName 9 | tmux new-session -d -s ${windowName} 10 | tmux send-keys -t ${windowName} "source ~/library-beam/venv_elastic/bin/activate" Enter 11 | #Add the dns_name here. Todo: improve it. 12 | tmux send-keys -t ${windowName} "export HOST=......" 
Enter 13 | tmux send-keys -t ${windowName} "export input=${f}; ./es_tag.sh" Enter 14 | done 15 | -------------------------------------------------------------------------------- /gcp-persistent-disk/tmux_example/taggedtext_tmux_kill.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | FILES=_YOUR_PATH_/loader/taggedtext_split_* 4 | tmux start-server 5 | for f in $FILES 6 | do 7 | windowName="tagg-${f: -2}" 8 | echo $windowName 9 | tmux kill-session -t ${windowName} 10 | done 11 | -------------------------------------------------------------------------------- /load2es.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import codecs 3 | import gzip 4 | import json 5 | import logging 6 | import time 7 | from tempfile import NamedTemporaryFile 8 | 9 | from elasticsearch import Elasticsearch 10 | from elasticsearch.helpers import parallel_bulk 11 | from google.cloud import storage 12 | from tqdm import tqdm 13 | 14 | 15 | ''' 16 | tmux new-session "python load2es.py publication --es http://myes:9200" 17 | ''' 18 | 19 | NODES = 37 20 | INDEX_NAME = 'pubmed-20' 21 | DOC_TYPE = 'publication' 22 | 23 | index_config = { 24 | 'bioentity': 25 | dict(suffix='_bioentities.json.gz', 26 | index='pubmed-20-bioentity', 27 | doc_type='bioentity', 28 | mappings=None, 29 | pub_id=True), 30 | 'taggedtext': 31 | dict(suffix='_taggedtext.json.gz', 32 | index='pubmed-20-taggedtext', 33 | doc_type='taggedtext', 34 | mappings=None, 35 | pub_id=True), 36 | 'publication': 37 | dict(suffix='_small.json.gz', 38 | index='pubmed-20', 39 | doc_type='publication', 40 | mappings='publication.json', 41 | pub_id=True 42 | ), 43 | 'concept': 44 | dict(suffix='_concepts.json.gz', 45 | index='pubmed-20-concept', 46 | doc_type='concept', 47 | mappings='concept.json', 48 | pub_id=False), 49 | 50 | } 51 | 52 | 53 | def read_remote_files(bucket, filenames, index_, doc_type, use_pub_id): 54 | for file_name in filenames: 55 | for line in read_remote_file( 56 | bucket, file_name, index_, doc_type, use_pub_id): 57 | yield line 58 | 59 | 60 | def read_remote_file(bucket, file_name, index_, doc_type, use_pub_id): 61 | counter = 0 62 | while counter <= 3: # retry 3 times 63 | counter += 1 64 | try: 65 | with NamedTemporaryFile() as cache_file: 66 | # download the file to a temporary location 67 | blob = bucket.get_blob(file_name) 68 | blob.download_to_file(cache_file, ) 69 | # flush the file to make sure it is written to disk 70 | cache_file.flush() 71 | # re-open the cache file to decompress it 72 | zf = gzip.open(cache_file.name, 'rb') 73 | 74 | reader = codecs.getreader("utf-8") 75 | new_line = [] 76 | for line in reader(zf): 77 | new_line.append(line) 78 | if line[-1] == '\n': 79 | counter += 1 80 | if len(new_line) > 1: 81 | line_to_yield = ''.join(new_line) 82 | else: 83 | line_to_yield = line 84 | new_line = [] 85 | if line_to_yield: 86 | pub_id = line_to_yield.partition('"pub_id": "')[2].partition('"')[0] 87 | if not pub_id: 88 | logging.error('no pubmedid parsed for line %s' % line) 89 | else: 90 | _id = None 91 | if use_pub_id and pub_id: 92 | _id = pub_id 93 | yield { 94 | '_index': index_, 95 | '_type': doc_type, 96 | '_id': _id, 97 | '_source': line_to_yield 98 | } 99 | else: 100 | yield { 101 | '_index': index_, 102 | '_type': doc_type, 103 | '_source': line_to_yield 104 | } 105 | break 106 | except Exception as e: 107 | logging.exception('could not get file %s: %s' % (file_name, e)) 108 | pass 109 | if counter == 
3: 110 | logging.error(' file %s skipped', file_name) 111 | 112 | 113 | def get_file_names(suffix): 114 | client = storage.Client(project='open-targets-library') 115 | bucket = client.get_bucket('medline_2020_06') 116 | 117 | for i in bucket.list_blobs(prefix='splitted/'): 118 | if i.name.endswith(suffix): 119 | yield i.name 120 | 121 | 122 | if __name__ == '__main__': 123 | 124 | parser = argparse.ArgumentParser( 125 | description='Load LINK data into Elasticsearch') 126 | parser.add_argument('indices', nargs='+', 127 | help='one or more elasticsearch indexes to load') 128 | parser.add_argument('--es', dest='es', action='append', 129 | default=[], 130 | help='elasticsearch url(s)') 131 | args = parser.parse_args() 132 | 133 | # setup the google cloud storage bucket reading stuff 134 | client = storage.Client(project='open-targets-library') 135 | bucket = client.get_bucket('medline_2020_06') 136 | 137 | # prepate elasticsearch for loading 138 | valid_indices = list(set(args.indices) & set(index_config.keys())) 139 | logging.info('loading data for indices: ' + ', '.join(valid_indices)) 140 | es = Elasticsearch( 141 | hosts=args.es, 142 | max_retry=10, 143 | retry_on_timeout=True, 144 | ) 145 | for idx in valid_indices: 146 | index_data = index_config[idx] 147 | 148 | # delete any old index 149 | tqdm.write('deleting %s %s' % ( 150 | index_data['index'], es.indices.delete( 151 | index=index_data['index'], 152 | ignore=404, 153 | timeout='300s' 154 | ) 155 | )) 156 | if index_data['mappings']: 157 | tqdm.write('creating %s %s' % ( 158 | index_data['index'], es.indices.create( 159 | index=index_data['index'], 160 | ignore=400, 161 | body=json.load(open('es-mapping/' + index_data['mappings'])), 162 | timeout='30s' 163 | ) 164 | )) 165 | else: 166 | tqdm.write('creating %s %s' % ( 167 | index_data['index'], es.indices.create( 168 | index=index_data['index'], 169 | ignore=400, 170 | timeout='30s' 171 | ) 172 | )) 173 | 174 | # wait a while for index to stabilize 175 | time.sleep(15) 176 | 177 | # prepare elasticserach for bulk loading 178 | temp_index_settings = { 179 | "index": { 180 | "refresh_interval": "-1", 181 | "number_of_replicas": 0, 182 | "translog.durability": 'async', 183 | } 184 | } 185 | es.indices.put_settings(index=index_data['index'], 186 | body=temp_index_settings) 187 | 188 | # get filenames from the bucket for this index 189 | file_names = tuple(get_file_names(suffix=index_data['suffix'])) 190 | 191 | # make a generator of all the rows in all the files 192 | loaded_rows = read_remote_files( 193 | bucket, 194 | file_names, 195 | index_data['index'], 196 | index_data['doc_type'], 197 | index_data['pub_id'] 198 | ) 199 | 200 | success, failed = 0, 0 201 | with tqdm(loaded_rows, 202 | desc='loading json for index %s' % index_data['index'], 203 | unit=' docs', 204 | unit_scale=True, 205 | total=30000000 if 'concept' not in index_data['index'] else 570000000) as p_loaded_rows: 206 | 207 | 208 | # configure how many threads to load in 209 | # this should be less than 1 per elasticsearch node CPU 210 | threads = NODES * 2 211 | counter = 0 212 | 213 | # do the actual loading now 214 | for ok, item in parallel_bulk( 215 | es, p_loaded_rows, 216 | raise_on_error=True, 217 | chunk_size=1000, 218 | thread_count=threads, 219 | request_timeout=300 220 | ): 221 | 222 | if not ok: 223 | failed += 1 224 | else: 225 | success += 1 226 | counter += 1 227 | 228 | tqdm.write("uploaded %i success, %i failed\n" % (success, failed)) 229 | 230 | # return elasticsearch to non-bulk settings 231 | # this 
will make it start to reiplicate if applicable 232 | restore_index_settings = { 233 | "index": { 234 | "refresh_interval": "1s", 235 | "number_of_replicas": 1, 236 | "translog.durability": 'request', 237 | } 238 | } 239 | es.indices.put_settings(index=index_data['index'], 240 | body=restore_index_settings) 241 | 242 | -------------------------------------------------------------------------------- /modules/AbbreviationFinder.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python 2 | # -*- coding: UTF-8 -*- 3 | 4 | '''Link abbreviations to their full names | Optimised to find the longest definition 5 | 6 | 7 | Adapted and optimised from source available here: source: http://www.cnts.ua.ac.be/~vincent/scripts/abbreviations.py 8 | source made available by: Vincent Van Asch 9 | original source version: 1.2.1 10 | original alghoritm in: 11 | 12 | A Simple Algorithm for Identifying Abbreviations Definitions in Biomedical Text 13 | A. Schwartz and M. Hearst 14 | Biocomputing, 2003, pp 451-462. 15 | 16 | ''' 17 | import logging 18 | import re 19 | 20 | from textblob import TextBlob 21 | 22 | 23 | class Candidate(unicode): 24 | def __new__(cls, start, stop, str): 25 | return unicode.__new__(cls, str) 26 | 27 | def __init__(self, start, stop, str): 28 | self._start = start 29 | self._stop = stop 30 | 31 | def __getslice__(self, i, j): 32 | start = self.start + i 33 | stop = self.start + j 34 | str = unicode.__getslice__(self, i, j) 35 | return Candidate(start, stop, str) 36 | 37 | @property 38 | def start(self): 39 | '''The start index''' 40 | return self._start 41 | 42 | @property 43 | def stop(self): 44 | '''The stop index''' 45 | return self._stop 46 | 47 | 48 | 49 | 50 | 51 | class AbbreviationsParser(object): 52 | def __init__(self, verbose = False): 53 | self.encoding = 'UTF8' 54 | self.verbose = verbose 55 | self.logger = logging.getLogger(__name__) 56 | 57 | def digest(self, textblob): 58 | if isinstance(textblob, (str, unicode)): 59 | textblob = TextBlob(textblob) 60 | return list(self._digest_iterator(textblob)) 61 | 62 | def digest_as_dict(self, textblob): 63 | digested = self.digest(textblob) 64 | d = {} 65 | for i in digested: 66 | if i['short'] not in d: 67 | d[i['short']]=i['long'] 68 | return d 69 | 70 | def _digest_iterator(self, textblob): 71 | omit = 0 72 | written = 0 73 | for i, sentence in enumerate(textblob.sentences): 74 | sentence = sentence.raw 75 | # print sentence 76 | try: 77 | for candidate in self.getcandidates(sentence): 78 | try: 79 | definition = self.getdefinition(candidate, sentence) 80 | except ValueError as e: 81 | if self.verbose: 82 | self.logger.debug(str((i, 'Omitting candidate', candidate.encode(self.encoding), 'Reason:', 83 | e.args[0].encode(self.encoding)))) 84 | omit += 1 85 | else: 86 | try: 87 | definition = self.definitionselection(definition, candidate) 88 | except IndexError: 89 | if self.verbose: 90 | self.logger.debug(str((i, 'Omitting candidate', definition.encode( 91 | self.encoding), '||', candidate.encode(self.encoding)))) 92 | omit += 1 93 | except ValueError as e: 94 | if self.verbose: 95 | self.logger.debug(str((i, 'Omitting candidate', definition.encode( 96 | self.encoding), '||', candidate.encode(self.encoding), 'Reason:', 97 | e.args[0].encode(self.encoding)))) 98 | omit += 1 99 | else: 100 | 101 | cline = '%d %d %d %s' % (i, candidate.start, candidate.stop, candidate) 102 | dline = '%d %d %d %s' % (i, definition.start, definition.stop, definition) 103 | 104 | yield 
dict(short=candidate.encode(self.encoding), 105 | long=definition.encode(self.encoding)) 106 | # print cline.encode(self.encoding) 107 | # print dline.encode(self.encoding) 108 | # print 109 | 110 | written += 1 111 | except ValueError as e: 112 | if self.verbose: 113 | self.logger.debug(str(('Reason:', e.args[0].encode(self.encoding)))) 114 | 115 | def getcandidates(self, sentence): 116 | '''Yields Candidates''' 117 | delimiters = {'(': ('(', ')'), 118 | '[': ('[', ']'), 119 | '{': ('{', '}'), 120 | '<': ('<', '>'), } 121 | for delimiter in delimiters: 122 | if delimiter in sentence: 123 | del_start, del_end = delimiters[delimiter] 124 | # Check some things first 125 | if sentence.count(del_start) != sentence.count(del_end): 126 | raise ValueError('Unbalanced parentheses: %s' % sentence) 127 | 128 | if sentence.find(del_start) > sentence.find(del_end): 129 | raise ValueError('First parentheses is right: %s' % sentence) 130 | 131 | closeindex = -1 132 | while 1: 133 | # Look for open parenthesis 134 | openindex = sentence.find(del_start, closeindex + 1) 135 | 136 | if openindex == -1: 137 | break 138 | 139 | # Look for closing parantheses 140 | closeindex = openindex + 1 141 | open = 1 142 | skip = False 143 | while open: 144 | try: 145 | char = sentence[closeindex] 146 | except IndexError: 147 | # We found an opening bracket but no associated closing bracket 148 | # Skip the opening bracket 149 | skip = True 150 | break 151 | if char == del_start: 152 | open += 1 153 | elif char == del_end: 154 | open -= 1 155 | closeindex += 1 156 | 157 | if skip: 158 | closeindex = openindex + 1 159 | continue 160 | 161 | # Output if conditions are met 162 | start = openindex + 1 163 | stop = closeindex - 1 164 | str = sentence[start:stop] 165 | 166 | # Take into account whitepsace that should be removed 167 | start = start + len(str) - len(str.lstrip()) 168 | stop = stop - len(str) + len(str.rstrip()) 169 | str = sentence[start:stop] 170 | 171 | if self.conditions(str): 172 | yield Candidate(start, stop, str) 173 | 174 | def getdefinition(self, candidate, sentence): 175 | '''Takes a candidate and a sentence and returns the definition candidate. 
176 | 177 | The definintion candidate is the set of tokens (in front of the candidate) 178 | that starts with a token starting with the first character of the candidate''' 179 | # Take the tokens in front of the candidate 180 | tokens = sentence[:candidate.start - 2].lower().split() 181 | 182 | # the char that we are looking for 183 | key = candidate[0].lower() 184 | 185 | # Count the number of tokens that start with the same character as the candidate 186 | firstchars = [t[0] for t in tokens] 187 | 188 | definitionfreq = firstchars.count(key) 189 | candidatefreq = candidate.lower().count(key) 190 | 191 | # Look for the list of tokens in front of candidate that 192 | # have a sufficient number of tokens starting with key 193 | if candidatefreq <= definitionfreq: 194 | # we should at least have a good number of starts 195 | count = 0 196 | start = 0 197 | startindex = len(firstchars) - 1 198 | while count < candidatefreq: 199 | if abs(start) > len(firstchars): 200 | raise ValueError('not found') 201 | 202 | start -= 1 203 | # Look up key in the definition 204 | try: 205 | startindex = firstchars.index(key, len(firstchars) + start) 206 | except ValueError: 207 | pass 208 | 209 | # Count the number of keys in definition 210 | count = firstchars[startindex:].count(key) 211 | 212 | # We found enough keys in the definition so return the definition as a 213 | # definition candidate 214 | start = len(' '.join(tokens[:startindex])) 215 | stop = candidate.start - 2 216 | str = sentence[start:stop] 217 | 218 | # Remove whitespace 219 | start = start + len(str) - len(str.lstrip()) 220 | stop = stop - len(str) + len(str.rstrip()) 221 | str = sentence[start:stop] 222 | 223 | return Candidate(start, stop, str) 224 | 225 | 226 | else: 227 | # print 'S', sentence 228 | # print >>sys.stderr, 'KEY', key 229 | # print >>sys.stderr, 'TOKENS', tokens 230 | # print >>sys.stderr, 'ABBREV', candidate 231 | raise ValueError('There are less keys in the tokens in front of candidate than there are in the candidate') 232 | 233 | def definitionselection(self, definition, abbrev,): 234 | '''Takes a definition candidate and an abbreviation candidate 235 | and returns True if the chars in the abbreviation occur in the definition 236 | 237 | Based on 238 | A simple algorithm for identifying abbreviation definitions in biomedical texts, Schwartz & Hearst''' 239 | 240 | def get_matches(): 241 | '''yield a list of possible dfinitions''' 242 | if len(definition) < len(abbrev): 243 | raise ValueError('Abbreviation is longer than definition') 244 | 245 | if abbrev in definition.split(): 246 | raise ValueError('Abbreviation is full word of definition') 247 | 248 | sindex = -1 249 | lindex = -1 250 | 251 | while 1: 252 | try: 253 | longchar = definition[lindex].lower() 254 | except IndexError: 255 | break 256 | 257 | shortchar = abbrev[sindex].lower() 258 | 259 | if not shortchar.isalnum(): 260 | sindex -= 1 261 | 262 | if sindex == -1 * len(abbrev): 263 | if shortchar == longchar: 264 | if lindex == -1 * len(definition) or not definition[lindex - 1].isalnum(): 265 | yield definition[lindex:len(definition)] 266 | lindex -= 1 267 | if lindex == -1 * len(definition): 268 | break 269 | 270 | else: 271 | lindex -= 1 272 | 273 | if lindex == -1 * (len(definition) + 1): 274 | raise ValueError('definition of "%s" not found in "%s"' % (abbrev, definition)) 275 | 276 | else: 277 | if shortchar == longchar: 278 | sindex -= 1 279 | lindex -= 1 280 | else: 281 | lindex -= 1 282 | 283 | definitions = list(get_matches()) 284 | if not 
definitions: 285 | raise IndexError('no matching definition found') 286 | definition = definitions[0] 287 | for i in definitions: 288 | if len(i) > len(definition): 289 | definition = i 290 | tokens = len(definition.split()) 291 | length = len(abbrev) 292 | 293 | if tokens > min([length + 5, length * 2]): 294 | raise ValueError('did not meet min(|A|+5, |A|*2) constraint') 295 | 296 | return definition 297 | 298 | def conditions(self, str): 299 | '''Based on Schwartz&Hearst 300 | 301 | 2 <= len(str) <= 10 302 | len(tokens) <= 2 303 | re.search('[A-Za-z]', str) 304 | str[0].isalnum() 305 | 306 | and extra: 307 | if it matches ([A-Za-z]\. ?){2,} 308 | it is a good candidate. 309 | 310 | ''' 311 | # import nltk 312 | # if nltk.re.match('([A-Za-z]\. ?){2,}', str.lstrip()): 313 | # return True 314 | if len(str) < 2 or len(str) > 10: 315 | return False 316 | if len(str.split()) > 2: 317 | return False 318 | if not re.search('[A-Za-z]', str): 319 | return False 320 | if not str[0].isalnum(): 321 | return False 322 | 323 | return True 324 | -------------------------------------------------------------------------------- /modules/BioentityTagger.py: -------------------------------------------------------------------------------- 1 | import ahocorasick 2 | import logging 3 | import string 4 | import sys 5 | import time 6 | import unicodedata 7 | 8 | import requests 9 | from fuzzywuzzy import fuzz 10 | from rope.base.codeanalyze import ChangeCollector 11 | 12 | from BioStopWords import DOMAIN_STOP_WORDS 13 | from modules.vocabulary import vocabulary_urls 14 | 15 | unicode_punctation_table = dict.fromkeys(i for i in xrange(sys.maxunicode) 16 | if unicodedata.category(unichr(i)).startswith('P')) 17 | 18 | 19 | class BioEntityTagger(object): 20 | separators_all = [' ', '.', ',', ';', ':', ')', ']', '(', '[', '{', '}', '/', '\\', '"', "'", '?', '!', '<', '>', 21 | '+', '-'] 22 | 23 | def __init__(self, 24 | partial_match=False, 25 | ignorecase=True, 26 | stopwords=None): 27 | ''' 28 | 29 | :param partial_match: allow for matching a non clomplete word 30 | :param ignorecase: case sensitive or not 31 | :param stopwords: stopwords to skip, defaults to a very broad list 32 | ''' 33 | self.A = ahocorasick.Automaton() 34 | self.partial_match = partial_match 35 | self.ignorecase = ignorecase 36 | if stopwords is None: 37 | stopwords = DOMAIN_STOP_WORDS 38 | idx = 0 39 | s = requests.Session() 40 | '''get the dictionaries from remote files''' 41 | for dictionary_url in vocabulary_urls: 42 | max_retry = 3 43 | retry = 0 44 | while retry < max_retry: 45 | dictionary_request = s.get(dictionary_url) 46 | if not dictionary_request.ok: 47 | time.sleep(1) 48 | retry += 1 49 | else: 50 | break 51 | if not dictionary_request.ok: 52 | logging.error('cannot download dictionary %s, skipped' % dictionary_url) 53 | continue 54 | dictionary = dictionary_request.json() 55 | category, reference_db = dictionary_url.split('/')[-1].split('.')[0].split('_')[0].split('-') 56 | '''load the elements in the Automation if they are not too short or are stopwords''' 57 | for element, element_data in dictionary.items(): 58 | ids = element_data['ids'] 59 | pref_name = element_data['pref_name'] 60 | if len(element) > 2: 61 | element_str = element.encode('utf-8') 62 | if ((len(element_str) < 5) and (element_str not in stopwords) or \ 63 | (len(element_str) >= 5) and (element_str.lower() not in stopwords)): 64 | idx += 1 65 | if self.ignorecase: 66 | element_match = element_str.lower() 67 | else: 68 | element_match = element_str 69 | 
self.add_tag(element_match, 70 | idx, 71 | category, 72 | reference_db, 73 | [i.encode('utf-8') for i in ids], 74 | element, 75 | element_match, 76 | pref_name) 77 | '''handle elements with dashes by also creating a copy without''' 78 | if '-' in element_match: 79 | element_match_without_dash = element_match.replace('-', '') 80 | if len(element_match_without_dash) > 2: 81 | self.add_tag(element_match_without_dash, 82 | idx, 83 | category, 84 | reference_db, 85 | [i.encode('utf-8') for i in ids], 86 | element, 87 | element_match_without_dash, 88 | pref_name) 89 | '''if supporting partial match''' 90 | if self.partial_match: 91 | for longest_token in element.split(): 92 | if longest_token != element and len( 93 | longest_token) > 5 and longest_token.lower() not in stopwords: 94 | self.add_tag(longest_token, 95 | idx, 96 | category + '-TOKEN', 97 | reference_db, 98 | [i.encode('utf-8') for i in ids], 99 | element, 100 | longest_token, 101 | pref_name) 102 | 103 | s.close() 104 | self.A.make_automaton() 105 | 106 | def add_tag(self, element_text, idx, category, reference_db, ids, element, match, pref_name): 107 | unique_resource_key = category + '|' + reference_db 108 | category_insert = [category] 109 | reference_db_insert = [reference_db] 110 | ids_insert = [[i.encode('utf-8') for i in ids]] 111 | previous_annotation = self.A.get(element_text, None) 112 | 113 | if previous_annotation is None: 114 | annotation = [idx, 115 | category_insert, 116 | reference_db_insert, 117 | ids_insert, 118 | element, 119 | match, 120 | pref_name] 121 | 122 | self.A.add_word(element_text, 123 | annotation) 124 | else: 125 | previous_keys = [] 126 | for j in range(len(previous_annotation[1])): 127 | previous_keys.append(previous_annotation[1][j] + '|' + previous_annotation[2][j]) 128 | if unique_resource_key not in previous_keys: 129 | previous_annotation[1].extend(category_insert) 130 | previous_annotation[2].extend(reference_db_insert) 131 | previous_annotation[3].extend(ids_insert) # TODO: might need to merge addidional ids if the 132 | # uniquekey is passed before 133 | self.A.add_word(element_text, previous_annotation) 134 | 135 | def tag(self, text): 136 | return self._tag(text, self.A, self.ignorecase) 137 | 138 | @staticmethod 139 | def _tag(text, automation, ignorecase=True): 140 | ''' 141 | finds tags in a text 142 | :param text: text to tag 143 | :param automation: automation to use 144 | :param ignorecase: deafault to True 145 | :return: 146 | ''' 147 | if isinstance(text, unicode): 148 | text_to_tag = text.encode('utf-8') 149 | else: 150 | text_to_tag = text 151 | if ignorecase: 152 | text_to_tag = text_to_tag.lower() 153 | matches = [] 154 | for i in automation.iter(text_to_tag.lower()): 155 | if len(i[1]) < 7: 156 | print i 157 | for end_index, (insert_order, category_list, reference_db_list, entity_id_list, original_value, match, 158 | pref_name) in automation.iter(text_to_tag.lower()): 159 | start_index = end_index - len(match) + 1 160 | end_index += 1 161 | 162 | if (start_index == 0 or text_to_tag[start_index - 1] in BioEntityTagger.separators_all) and \ 163 | (end_index == len(text_to_tag) or text_to_tag[end_index] in BioEntityTagger.separators_all): 164 | for j in range(len(category_list)): 165 | category = category_list[j] 166 | reference_db = reference_db_list[j] 167 | entity_id = entity_id_list[j] 168 | if isinstance(entity_id, list): 169 | entity_id = entity_id[0] 170 | if category.endswith('-TOKEN'): 171 | pre, post = original_value.split(match)[:2] 172 | potential_match = 
text_to_tag[start_index:end_index + len(post)] 173 | score = fuzz.token_sort_ratio(original_value, potential_match) 174 | if score > 90: 175 | tag = MatchedTag(match, start_index, end_index, category.replace('-TOKEN', ''), 176 | reference_db, 177 | entity_id, original_value, pref_name) 178 | matches.append(tag.__dict__) 179 | else: 180 | tag = MatchedTag(match, start_index, end_index, category, reference_db, entity_id, 181 | original_value, pref_name) 182 | matches.append(tag.__dict__) 183 | else: 184 | pass 185 | 186 | grouped_matches = BioEntityTagger.group_matches_by_category_and_reference(matches) 187 | filtered_matches = [] 188 | for group, matches_in_group in grouped_matches.items(): 189 | non_nested_matches = BioEntityTagger.remove_nested_matches(matches_in_group) 190 | filtered_matches.extend(non_nested_matches) 191 | 192 | return filtered_matches 193 | 194 | @staticmethod 195 | def group_matches_by_category_and_reference(matches): 196 | grouped_by_category_type = {} 197 | for match in matches: 198 | key = match['category'] + '|' + match['reference_db'] 199 | if key not in grouped_by_category_type: 200 | grouped_by_category_type[key] = [] 201 | grouped_by_category_type[key].append(match) 202 | 203 | return grouped_by_category_type 204 | 205 | @staticmethod 206 | def remove_nested_matches(matches): 207 | filtered_matches = [] 208 | sorted_matches = sorted(matches, key=lambda x: (x['start'], -x['end'])) 209 | for i, tag_i in enumerate(sorted_matches): 210 | keep = True 211 | for j, tag_j in enumerate(sorted_matches): 212 | if i != j: 213 | if tag_j['start'] <= tag_i['start'] <= tag_j['end'] and \ 214 | tag_j['start'] <= tag_i['end'] <= tag_j['end']: 215 | keep = False 216 | break 217 | elif tag_j['start'] > tag_i['start']: 218 | break 219 | else: 220 | pass 221 | if keep: 222 | filtered_matches.append(tag_i) 223 | return filtered_matches 224 | 225 | @staticmethod 226 | def mark_tags_in_text(text, matches): 227 | ''' 228 | produce a text with the tags written as markup 229 | :param text: text to tags 230 | :param matches: tags to encode 231 | :return: 232 | ''' 233 | text_to_tag = text 234 | tagged_abstract = '' 235 | if isinstance(text, unicode): 236 | text_to_tag = text.encode('utf-8') 237 | try: 238 | tagged_abstract = ChangeCollector(text_to_tag) 239 | for i, tag in enumerate( 240 | sorted(matches, key=lambda x: (x['start'], -x['end']))): 241 | if isinstance(tag['reference'], (list, tuple)): 242 | tag_reference = '|'.join(tag['reference']) 243 | else: 244 | tag_reference = tag['reference'] 245 | tagged_abstract.add_change(tag['start'], tag['start'], 246 | '' % ( 247 | str(i), tag['category'], tag['reference_db'], tag_reference)) 248 | tagged_abstract.add_change(tag['end'], tag['end'], '' % str(i)) 249 | tagged_abstract = '
%s
' % tagged_abstract.get_changed() 250 | except UnicodeDecodeError: 251 | logging.error('cannot generate maked text for unicode decode error') 252 | return tagged_abstract 253 | 254 | @staticmethod 255 | def get_tags_in_range(matches, start, end): 256 | filtered_tag = [] 257 | for t in matches: 258 | if start <= t['start'] <= end and \ 259 | start <= t['end'] <= end: 260 | filtered_tag.append(t) 261 | elif t['end'] > end: 262 | break 263 | 264 | return filtered_tag 265 | 266 | @staticmethod 267 | def get_tag_by_match(tags, match): 268 | matched_tags = [] 269 | for tag in tags: 270 | if tag['match'].lower() == match.lower(): 271 | matched_tags.append(match) 272 | return [] 273 | 274 | @staticmethod 275 | def extend_tags_to_alternative_forms(text, extended_forms): 276 | A = ahocorasick.Automaton() 277 | for text_to_match, payload in extended_forms.items(): 278 | A.add_word(text_to_match.lower(), 279 | [0, [payload['category']], [payload['reference_db']], [payload['reference']], 280 | payload['original_value'], 281 | text_to_match.lower(), payload['label']]) 282 | A.make_automaton() 283 | 284 | return BioEntityTagger._tag(text, A, ) 285 | 286 | 287 | class MatchedTag(object): 288 | def __init__(self, 289 | match, 290 | start, 291 | end, 292 | category, 293 | reference_db, 294 | reference, 295 | original_value, 296 | label, 297 | sentence=None 298 | ): 299 | self.match = match 300 | self.start = start 301 | self.end = end 302 | self.category = category 303 | self.reference_db = reference_db 304 | self.reference = reference 305 | self.original_value = original_value 306 | self.label = label 307 | self.sentence = sentence 308 | 309 | @staticmethod 310 | def sanitize_string(s): 311 | if isinstance(s, unicode): 312 | return s.translate(unicode_punctation_table) 313 | elif isinstance(s, str): 314 | return unicode(s.translate(string.maketrans(' ', '_'), string.punctuation)) 315 | else: 316 | return u'' 317 | 318 | # TODO: use inflection.table.ascii from SPECIALIST lexicon to enhance matching forms 319 | -------------------------------------------------------------------------------- /modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opentargets-archive/library-beam/dcf08a6b09a7b11faff1a25655c47d363e18d5d2/modules/__init__.py -------------------------------------------------------------------------------- /modules/vocabulary.py: -------------------------------------------------------------------------------- 1 | vocabulary_urls= [ 2 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/ANATOMY-MESH.json", 3 | # "https://storage.googleapis.com/opentargets-vocabularies/ANTROPOLOGY-MESH.json", 4 | # "https://storage.googleapis.com/opentargets-vocabularies/CHEMICAL-MESH.json", 5 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/DIAGNOSTICS-MESH.json", 6 | # "https://storage.googleapis.com/opentargets-vocabularies/DISCIPLINE-MESH.json", 7 | # "https://storage.googleapis.com/opentargets-vocabularies/DISEASE-EPMC.json", 8 | # "https://storage.googleapis.com/opentargets-vocabularies/DISEASE-MESH.json", 9 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/DISEASE-OPENTARGETS.json", 10 | # "https://storage.googleapis.com/opentargets-vocabularies/GENE-EPMC.json", 11 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/HEALTHCARE-MESH.json", 12 | # "https://storage.googleapis.com/opentargets-vocabularies/HUMANITIES-MESH.json", 13 | # 
"https://storage.googleapis.com/opentargets-vocabularies/INFORMATIONSCIENCE-MESH.json", 14 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/LOC-MESH.json", 15 | # "https://storage.googleapis.com/opentargets-vocabularies/NAMEDGROUP-MESH.json", 16 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/ORGANISM-MESH.json", 17 | # "https://storage.googleapis.com/opentargets-vocabularies/PATHWAY-OPENTARGETS.json", 18 | # "https://storage.googleapis.com/opentargets-vocabularies/PHENOTYPE-EPMC.json", 19 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/PROCESS-MESH.json", 20 | # "https://storage.googleapis.com/opentargets-vocabularies/PROTEINCOMPLEX-CHEMBL.json", 21 | # "https://storage.googleapis.com/opentargets-vocabularies/PROTEINCOMPLEX-COMPLEXPORTAL.json", 22 | # "https://storage.googleapis.com/opentargets-vocabularies/PROTEINCOMPLEX-CORUM.json", 23 | # "https://storage.googleapis.com/opentargets-vocabularies/PROTEINCOMPLEX-GO.json", 24 | # "https://storage.googleapis.com/opentargets-vocabularies/PSICHIATRY-MESH.json", 25 | # "https://storage.googleapis.com/opentargets-vocabularies/PUBLICATION-MESH.json", 26 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/GENE-OPENTARGETS.json", 27 | # "https://storage.googleapis.com/opentargets-vocabularies/TECHNOLOGY-MESH.json" 28 | # "https://storage.googleapis.com/opentargets-vocabularies/GENE-LEXEBI.json", 29 | # "https://storage.googleapis.com/opentargets-vocabularies/DISEASE-LEXEBI.json", 30 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/PHENOTYPE-HPO.json", 31 | "https://storage.googleapis.com/opentargets-vocabularies_2020_09/DRUG-CHEMBL.json" 32 | # "https://storage.googleapis.com/opentargets-vocabularies/DISEASEALT-MONDO.json" 33 | ] 34 | -------------------------------------------------------------------------------- /publication_alias.sh: -------------------------------------------------------------------------------- 1 | curl -XPOST 'http://esurl:9200/_aliases?pretty' -H 'Content-Type: application/json' -d ' 2 | { 3 | "actions": [ 4 | {"add": {"index": "pubmed-18", "alias": "!publication-data"}} 5 | ] 6 | } ' 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | """Setup.py module for the workflow's worker utilities. 19 | 20 | All the workflow related code is gathered in a package that will be built as a 21 | source distribution, staged in the staging area for the workflow being run and 22 | then installed in the workers when they start running. 
23 | 24 | This behavior is triggered by specifying the --setup_file command line option 25 | when running the workflow for remote execution. 26 | """ 27 | 28 | import subprocess 29 | from distutils.command.build import build as _build 30 | 31 | import setuptools 32 | 33 | 34 | # This class handles the pip install mechanism. 35 | class build(_build): # pylint: disable=invalid-name 36 | """A build command class that will be invoked during package install. 37 | 38 | The package built using the current setup.py will be staged and later 39 | installed in the worker using `pip install package'. This class will be 40 | instantiated during install for this specific scenario and will trigger 41 | running the custom commands specified. 42 | """ 43 | sub_commands = _build.sub_commands + [('CustomCommands', None)] 44 | 45 | 46 | # Some custom command to run during setup. The command is not essential for this 47 | # workflow. It is used here as an example. Each command will spawn a child 48 | # process. Typically, these commands will include steps to install non-Python 49 | # packages. For instance, to install a C++-based library libjpeg62 the following 50 | # two commands will have to be added: 51 | # 52 | # ['apt-get', 'update'], 53 | # ['apt-get', '--assume-yes', install', 'libjpeg62'], 54 | # 55 | # First, note that there is no need to use the sudo command because the setup 56 | # script runs with appropriate access. 57 | # Second, if apt-get tool is used then the first command needs to be 'apt-get 58 | # update' so the tool refreshes itself and initializes links to download 59 | # repositories. Without this initial step the other apt-get install commands 60 | # will fail with package not found errors. Note also --assume-yes option which 61 | # shortcuts the interactive confirmation. 62 | # 63 | # The output of custom commands (including failures) will be logged in the 64 | # worker-startup log. 65 | CUSTOM_COMMANDS = [ 66 | ['apt-get', 'update'], 67 | ['apt-get', '--assume-yes', 'install', 'libxml2-dev', 'wget', 'unzip'], 68 | ['pip', 'install', 69 | #'https://github.com/explosion/spacy-models/releases/download/en_depent_web_md-1.2.1/en_depent_web_md-1.2.1.tar.gz', 70 | 'https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.0/en_core_web_lg-2.2.0.tar.gz', 71 | #'nltk' 72 | ], 73 | 74 | ['python', '-m', 'nltk.downloader', 'brown', 'punkt', 'wordnet', 'averaged_perceptron_tagger', 'conll2000', 75 | 'stopwords'] 76 | ] 77 | 78 | 79 | class CustomCommands(setuptools.Command): 80 | """A setuptools Command class able to run arbitrary commands.""" 81 | 82 | def initialize_options(self): 83 | pass 84 | 85 | def finalize_options(self): 86 | pass 87 | 88 | def RunCustomCommand(self, command_list): 89 | print 'Running command: %s' % command_list 90 | p = subprocess.Popen( 91 | command_list, 92 | stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) 93 | # Can use communicate(input='y\n'.encode()) if the command run requires 94 | # some confirmation. 95 | stdout_data, stdout_err = p.communicate() 96 | print 'Command output: %s | Command err: %s' % (stdout_data, stdout_err) 97 | if p.returncode != 0: 98 | raise RuntimeError( 99 | 'Command %s failed: exit code: %s' % (command_list, p.returncode)) 100 | 101 | def run(self): 102 | for command in CUSTOM_COMMANDS: 103 | self.RunCustomCommand(command) 104 | 105 | 106 | # Configure the required packages and scripts to install. 
107 | # Note that the Python Dataflow containers come with numpy already installed 108 | # so this dependency will not trigger anything to be installed unless a version 109 | # restriction is specified. 110 | # Note numpy >=1.17 is python3 only 111 | # Note more-itertools >=6.0.0 is python3 only 112 | REQUIRED_PACKAGES = [ 113 | 'numpy==1.16.5', 114 | 'more-itertools==5.0.0', 115 | 'apache-beam[gcp]==2.16.0', 116 | 'spacy==2.2.2', 117 | 'python-Levenshtein==0.12.0', 118 | 'fuzzywuzzy==0.17.0', 119 | 'elasticsearch==7.0.5', 120 | 'lxml==4.4.1', 121 | 'textblob==0.15.3', 122 | 'pyahocorasick==1.4.0', 123 | 'rope==0.14.0', 124 | 'unidecode==1.1.1' 125 | ] 126 | 127 | setuptools.setup( 128 | name='opentargets-library-beam', 129 | version='0.0.2', 130 | description='ETL for opentargets library running on beam', 131 | install_requires=REQUIRED_PACKAGES, 132 | # dependency_links=['https://github.com/explosion/spacy-models/releases/download/en_core_web_md-1.2.1 133 | # /en_core_web_md-1.2.1.tar.gz#egg=en_core_web_md-1.2.1'], 134 | packages=setuptools.find_packages(), 135 | cmdclass={ 136 | # Command class instantiated and run during pip install scenarios. 137 | 'build': build, 138 | 'CustomCommands': CustomCommands, 139 | } 140 | ) 141 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opentargets-archive/library-beam/dcf08a6b09a7b11faff1a25655c47d363e18d5d2/tests/__init__.py -------------------------------------------------------------------------------- /tests/resources/common_words_as_genes.txt: -------------------------------------------------------------------------------- 1 | The NOD mouse, which spontaneously develops insulitis and overt diabetes, is a model of autoimmune type I diabetes mellitus. For the precise analysis of the roles of CD4+ and CD8+ T cells in the pathogenesis of this mouse, these subsets must be transferred into recipients that are completely free of T cells and pathological changes. We used athymic NOD nude mice, which congenitally lack mature T cells and are free of insulitis and hyperglycemia up to the age of 60 weeks, as recipients for this purpose. To the nude recipients we transferred either one of a highly purified CD4+ or CD8+ T cell subset derived from non-diabetic female NOD mice; any in vivo increase in the contaminating T cell subsets was prevented by injecting the antibody homologous to it. Most of the T cell-reconstituted recipients were treated with cyclophosphamide to promote the onset of overt diabetes. Transfer of the CD8+ T cell subset alone did not induce insulitis or hyperglycemia. In contrast, transfer of the CD4+ T cell subset alone produced insulitis, but not hyperglycemia, in all the recipients. However, the subsequent transfer of CD8+ T cells into CD4+ T cell-reconstituted recipients induced severe insulitis and hyperglycemia in almost all the recipients. In these diabetic recipients, we observed severe damage of the pancreatic islets and the infiltration of a large number of CD8+ T cells into the remaining islets; insulin-secreting beta cells were no longer detected. These results suggest that CD4+ T cells play a predominant role in the development of insulitis and that CD8+ T cells migrate into the islets and are subsequently, with the aid of CD4+ T cells, differentiated into killer cells which act against beta cells. 
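The fixture above (`tests/resources/common_words_as_genes.txt`) is an abstract whose ordinary words (e.g. "NOD") collide with gene symbols, which is exactly what the stop-word filtering in `modules/BioentityTagger.py` is meant to absorb. A minimal sketch of running the tagger over it, in the style of `tests/test_tagger.py`; note that constructing `BioEntityTagger` downloads the vocabulary JSON files listed in `modules/vocabulary.py`, so this assumes those URLs are still reachable and that the script is run from the repository root under Python 2.

```python
# -*- coding: UTF-8 -*-
# Hedged sketch (Python 2, matching the repo): tag the common-words fixture and
# print what the tagger matched. The file path and the expectation that words
# like "NOD" are filtered out are illustrative assumptions, not repo assertions.
from modules.BioentityTagger import BioEntityTagger

tagger = BioEntityTagger()  # downloads the remote vocabularies on construction

with open('tests/resources/common_words_as_genes.txt') as handle:
    abstract = handle.read()

for tag in tagger.tag(abstract.lower()):
    # each tag is a MatchedTag.__dict__ with offsets into the tagged text
    print('%s\t%s\t%s' % (tag['category'], tag['reference_db'],
                          abstract[tag['start']:tag['end']]))
```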
-------------------------------------------------------------------------------- /tests/resources/test-medlinexml/test_baseline.xml.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opentargets-archive/library-beam/dcf08a6b09a7b11faff1a25655c47d363e18d5d2/tests/resources/test-medlinexml/test_baseline.xml.gz -------------------------------------------------------------------------------- /tests/resources/test-medlinexml/test_update.xml.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opentargets-archive/library-beam/dcf08a6b09a7b11faff1a25655c47d363e18d5d2/tests/resources/test-medlinexml/test_update.xml.gz -------------------------------------------------------------------------------- /tests/resources/test-spacy/disease.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /tests/resources/test-spacy/geneProt.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /tests/resources/test_abstract_lexebi.txt: -------------------------------------------------------------------------------- 1 | Autopsy studies of Alzheimer's disease (AD) have found that neurofibrillary tangle (NFT) pathology of the medial temporal lobe (MTL) demonstrates selective topography with relatively stereotyped subregional involvement at early disease stages, prompting interest in more granular measurement of these structures with in vivo magnetic resonance imaging. We applied a novel, automated method for measurement of hippocampal subfields and extrahippocampal MTL cortical regions. The cohort included cognitively normal (CN) adults (n = 86), early mild cognitive impairment (n = 43), late MCI (n = 22), and mild AD (n = 40) patients from the Alzheimer's Disease Neuroimaging Initiative (ADNI). For pseudolongitudinal analysis of the continuum from preclinical to mild AD dementia, the groups were further divided according to amyloid status based on positron emission tomography. Specific subregions associated with the early NFT pathology of AD were more sensitive to preclinical and early prodromal AD than whole hippocampal volume while more diffuse involvement was found in later stages. In particular, BA35, the first region associated with NFT deposition, was the only region to discriminate preclinical AD from amyloid negative cognitively normal adults ("normal aging"). In general, patterns of atrophy in the pseudolongitudinal analysis largely recapitulated Braak staging of NFTs within the MTL. 2 | Aquaporin-4 (AQP4)-specific T cells are expanded in neuromyelitis optica (NMO) patients and exhibit Th17 polarization. However, their pathogenic role in CNS autoimmune inflammatory disease is unclear. 
Although multiple AQP4 T-cell epitopes have been identified in WT C57BL/6 mice, we observed that neither immunization with those determinants nor transfer of donor T cells targeting them caused CNS autoimmune disease in recipient mice. In contrast, robust proliferation was observed following immunization of AQP4-deficient (AQP4-/-) mice with AQP4 peptide (p) 135-153 or p201-220, peptides predicted to contain I-Ab-restricted T-cell epitopes but not identified in WT mice. In comparison with WT mice, AQP4-/- mice used unique T-cell receptor repertoires for recognition of these two AQP4 epitopes. Donor T cells specific for either determinant from AQP4-/-, but not WT, mice induced paralysis in recipient WT and B-cell-deficient mice. AQP4-specific Th17-polarized cells induced more severe disease than Th1-polarized cells. Clinical signs were associated with opticospinal infiltrates of T cells and monocytes. Fluorescent-labeled donor T cells were detected in CNS lesions. Visual system involvement was evident by changes in optical coherence tomography. Fine mapping of AQP4 p201-220 and p135-153 epitopes identified peptides within p201-220 but not p135-153, which induced clinical disease in 40% of WT mice by direct immunization. Our results provide a foundation to evaluate how AQP4-specific T cells contribute to AQP4-targeted CNS autoimmunity (ATCA) and suggest that pathogenic AQP4-specific T-cell responses are normally restrained by central tolerance, which may be relevant to understanding development of AQP4-reactive T cells in NMO. 3 | Leucine rich repeat kinase 2 (LRRK2) is a promising target for the treatment of Parkinson's disease; however, little is known about the expression of LRRK2 in human brain and if/how LRRK2 protein levels are altered in Parkinson's disease.We measured the protein levels of LRRK2 as well as its phosphorylation on serines 910, 935, and 973 in the postmortem brain tissue of Parkinson's disease patients and aged controls with and without Lewy bodies.LRRK2 and its phosphorylation were measured by immunoblot in brain regions differentially affected in Parkinson's disease (n = 30) as well as subjects with Lewy bodies restricted to the periphery and lower brain stem (n = 25) and matched controls without pathology (n = 25).LRRK2 levels were increased in cases with restricted Lewy bodies, with a 30% increase measured in the substantia nigra. In clinical Parkinson's disease, levels of LRRK2 negatively correlated to disease duration and were comparable with controls. LRRK2 phosphorylation, however, particularly at serine 935, was reduced with clinical Parkinson's disease with a 36% reduction measured in the substantia nigra.Our data show that LRRK2 phosphorylation is reduced with clinical PD, whereas LRRK2 expression is increased in early potential prodromal stages. These results contribute to a better understanding of the role of LRRK2 in idiopathic Parkinson's disease and may aid efforts aimed at therapeutically targeting the LRRK2 protein. © 2016 International Parkinson and Movement Disorder Society. 4 | Leucine-rich repeat kinase 2 (LRRK2) is a central protein in the pathogenesis of Parkinson's disease (PD), yet its normal function has proved stubbornly hard to elucidate. Even though it remains unclear how pathogenic mutations affect LRRK2 to cause PD, recent findings provide increasing cause for optimism. We summarise here the developing consensus over the effect of pathogenic mutations in the Ras of complex proteins and C-terminal of Roc domains on LRRK2 GTPase activity. 
This body of work has been greatly reinforced by our own study of the protective R1398H variant contained within the LRRK2 GTPase domain. Collectively, data point towards the pathogenicity of GTP-bound LRRK2 and strengthen a working model for LRRK2 GTPase function as a GTPase activated by dimerisation. Together with the identification of the protective R1398H variant as a valuable control for pathogenic mutations, we have no doubt that these triumphs for the LRRK2 field will accelerate research towards resolving LRRK2 function and towards new treatments for PD. 5 | Non-coagulating (NC) milk, defined as milk not coagulating within 40 min after rennet-addition, can have a negative influence on cheese production. Its prevalence is estimated at 18% in the Swedish Red (SR) cow population. Our study aimed at identifying genomic regions and causal variants associated with NC milk in SR cows, by doing a GWAS using 777k SNP genotypes and using imputed sequences to fine map the most promising genomic region. Phenotypes were available from 382 SR cows belonging to 21 herds in the south of Sweden, from which individual morning milk was sampled. NC milk was treated as a binary trait, receiving a score of one in case of non-coagulation within 40 min. For all 382 SR cows, 777k SNP genotypes were available as well as the combined genotypes of the genetic variants of αs1-β-κ-caseins. In addition, whole-genome sequences from the 1000 Bull Genome Consortium (Run 3) were available for 429 animals of 15 different breeds. From these sequences, 33 sequences belonged to SR and Finish Ayrshire bulls with a large impact in the SR cow population. Single-marker analyses were run in ASReml using an animal model. After fitting the casein loci, 14 associations at -Log10(P-value) > 6 identified a promising region located on BTA18. We imputed sequences to the 382 genotyped SR cows using Beagle 4 for half of BTA18, and ran a region-wide association study with imputed sequences. In a seven mega base-pairs region on BTA18, our strongest association with NC milk explained almost 34% of the genetic variation in NC milk. Since it is possible that multiple QTL are in strong LD in this region, 59 haplotypes were built, genetically differentiated by means of a phylogenetic tree, and tested in phenotype-genotype association studies. Haplotype analyses support the existence of one QTL underlying NC milk in SR cows. A candidate gene of interest is the VPS35 gene, for which one of our strongest association is an intron SNP in this gene. The VPS35 gene belongs to the mammary gene sets of pre-parturient and of lactating cows. 6 | In neuromyelitis optica (NMO), one of the underlying pathogenic mechanisms is the formation of antigen-antibody complexes which can trigger an inflammatory response by inducing the infiltration of neutrophils in lesions. Epithelial neutrophil-activating peptide 78 (ENA 78), known as Chemokine (C-X-C motif) ligand 5 (CXCL5), belongs to the ELR-CXCL family. It recruits and activates neutrophils. The aim of this study was to evaluate ENA 78, IL-1β and TNF-α plasma levels in multiple sclerosis (MS) and neuromyelitis optica (NMO) patients.ENA 78, IL-1β and TNF-α plasma levels were detected in 20 healthy controls (HC), 25 MS and 25 NMO patients using MILLIPLEX® map Human High Sensitivity Cytokine/Chemokine Panels.Plasma levels of ENA 78 were significantly higher in NMO patients than in HC (P < 0.001) and MS patients (P < 0.05). The NMO patients showed higher plasma levels of IL-1β compared with HC (P < 0.01). 
Further, increased plasma levels of TNF-α were found in the MS (P < 0.05) and NMO patients (P < 0.001). In addition, NMO patients had higher Expanded Disability Status Scale (EDSS) scores compared with MS patients (P < 0.05). EDSS scores were correlated with plasma levels of ENA 78 in NMO patients (P < 0.05). There were no significant correlations between EDSS scores and plasma levels of ENA 78 in MS patients (P > 0.05).The overproduction of pro-inflammatory cytokines such as IL-1β and TNF-α during the remission of NMO activates ENA 78, which in turn leads to neutrophil infiltration in lesions. ENA 78 plasma levels were correlated with EDSS scores in NMO patients. Elevated secretion of ENA 78 may be a critical step in neutrophil recruitment during the remission of NMO. 7 | The tumor necrosis factor like weak inducer of apoptosis (TWEAK) and its receptor, fibroblast growth factor-inducible 14 (Fn14), mediate inflammation and neuronal apoptosis in cerebral edema, ischemic stroke and multiple sclerosis. The downstream effectors and pathways linked to TWEAK-Fn14 signaling are strongly implicated in the pathology of Parkinson's disease (PD), thus indicating a putative role for TWEAK/Fn14 signaling in PD neurodegeneration. Using the 1-methyl-4-phenyl-1,2,3,6-tetrahydropyridine (MPTP) mouse model, we aimed to determine whether genetic ablation or pharmacologic mitigation of the TWEAK protein and its Fn14 receptor affected substantia nigra and striatum Parkinsonian pathology. Changes in endogenous TWEAK protein expression were also quantified in tissue from both MPTP-treated mice and PD human samples. TWEAK protein expression was transiently increased in the striatal tissue but remained unaltered in substantia nigra tissue of MPTP-treated mice. There was also no change of TWEAK protein levels in the substantia nigra or the striatum of human PD patients as compared to matched control subjects. Mitigating the effects of endogenous TWEAK protein using neutralizing antibody did affect MPTP-mediated neurotoxicity in the substantia nigra using the sub-acute model of MPTP (30mg/kg i.p. over five consecutive days). Neither TWEAK nor Fn14 genetic ablation led to attenuation of MPTP-toxicity in the acute model. These findings suggest that TWEAK signaling might be an aspect of MPTP-mediated neuropathology and be involved in the overall neurodegenerative pathology of PD. 8 | Severe damage to the blood-brain barrier (BBB) allows anti-aquaporin 4 (AQP4) antibodies to access the astrocytic endfeet in neuromyelitis optica (NMO). In the current study, we identified the pathogenic cytokines/chemokines that are responsible for the BBB malfunction induced by NMO sera.We measured the levels of 27 cytokines/chemokines in human brain microvascular endothelial cells (BMECs) after exposure to sera obtained from patients with the acute and stable phases of anti-AQP4 antibody-positive NMO spectrum disorder (NMOSD), multiple sclerosis (MS) patients and healthy controls (HC) using a multiplexed fluorescent bead-based immunoassay system.The induced protein (IP)-10 level in the cells was markedly increased following exposure to acute phase NMOSD sera. Other cytokines/chemokines including interleukin (IL)-6 and monocyte chemotactic protein (MCP)-1 were also significantly increased in the acute NMOSD group compared to both the MS and HC groups. 
The up-regulation of the IP-10 levels in the cells after exposure to the acute-phase NMOSD sera was also observed using another specified ELISA, and this effect was significantly decreased during the remission phase in the individual NMOSD patients. Furthermore, the increase in the level of IP-10 after exposure to the sera was significantly correlated with the cerebrospinal fluid/serum albumin ratio.Sera from the acute phase of NMO markedly increased the autocrine secretion of IP-10 by BMECs. The over-production of IP-10 in BMECs may play an important role in the pathogenesis of NMO and may therefore help to mediate the trafficking of T cells expressing its receptor across the BBB. 9 | Despite recent advances in delineating the pathogenic mechanisms of autoimmune disease, the puzzle that reveals the true picture of these diverse immunological disorders is yet to be solved. We know that the human leukocyte antigen (HLA) loci as well as many different genetic susceptibility loci with relatively small effect sizes predispose to various autoimmune diseases and that environmental factors are involved in triggering disease. Models for mechanisms of disease become increasingly complex as relationships between components of both the adaptive and innate immune systems are untangled at the molecular level. In this article, we pose some of the important questions about autoimmunity where the answers will advance our understanding of disease pathogenesis and improve the rational design of novel therapies. How is autoimmunity triggered, and what components of the immune response drive the clinical manifestations of disease? What determines whether a genetically predisposed individual will develop an autoimmune disease? Is restoring immune tolerance the secret to finding cures for autoimmune disease? Current research efforts seek answers to these big questions. 10 | Anti-TNF drugs have represented an epochal revolution in the treatment of rheumatoid arthritis and spondyloarthritis. In the field of axial spondyloarthritis, golimumab, a fully human monoclonal anti-TNFα administered subcutaneously every 4 weeks, has shown significant efficacy and good safety in patients with ankylosing spondylitis. More recently, it was also indicated as an effective treatment for patients suffering from non-radiographic axial spondyloarthitits. Areas covered: A systematic literature search was completed, using the largest electronic databases (Medline, Embase and Cochrane), with the aim to review all data concerning the administration of golimumab in patients suffering from axial spondyloartritis. Expert opinion: In the 16-week GO-AHEAD study, golimumab was effective in patients with non-radiographic spondyloarthritis with high levels of CRP and/or positive MRI findings, but not in subjects with both negative CRP and MRI. This finding allows for the addressing the of anti-TNF treatment more specifically. Preliminary data concerning an open-label extension of the GO-AHEAD study outlined the high retention-rate of the drug at 52 weeks. The production of antibodies against golimumab is rare and it seems to exert scarce influence on the drug performances. In conclusion, golimumab appears as a very useful and well tolerated anti-TNF agent. 
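The abstracts above are rich in parenthesised abbreviations ("Alzheimer's disease (AD)", "aquaporin-4 (AQP4)", ...), so they double as a convenient input for the Schwartz & Hearst implementation in `modules/AbbreviationFinder.py`. A minimal sketch, assuming the script runs from the repository root under Python 2 (the parser relies on the `unicode` type):

```python
# -*- coding: UTF-8 -*-
# Hedged sketch: extract short -> long abbreviation pairs from each abstract.
from modules.AbbreviationFinder import AbbreviationsParser

parser = AbbreviationsParser(verbose=False)

with open('tests/resources/test_abstract_lexebi.txt') as handle:
    for line in handle:
        pairs = parser.digest_as_dict(line.decode('utf-8'))
        # e.g. {'AD': "Alzheimer's disease", 'NFT': 'neurofibrillary tangle'} (illustrative)
        for short_form, long_form in pairs.items():
            print('%s -> %s' % (short_form, long_form))
```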
-------------------------------------------------------------------------------- /tests/test_tagger.py: -------------------------------------------------------------------------------- 1 | import unittest, json 2 | 3 | from modules.BioentityTagger import BioEntityTagger 4 | 5 | 6 | class TaggerTestCase(unittest.TestCase): 7 | 8 | def setUp(self): 9 | self.tagger = BioEntityTagger() 10 | 11 | def testTaggerNLP(self): 12 | 13 | for i, text in enumerate(file('resources/test_abstract_nlp.txt')): 14 | print i 15 | for tag in self.tagger.tag(text.lower()): 16 | print tag, text[tag['start']:tag['end']] 17 | 18 | def testTaggerLexebi(self): 19 | for i, text in enumerate(file('resources/test_abstract_lexebi.txt')): 20 | 21 | print i 22 | # for tag in tagger.tag(text.lower()): 23 | # print tag, text[tag['start']:tag['end']] 24 | old_tags = set() 25 | lexebi_tags = set() 26 | tags = self.tagger.tag(text.lower()) 27 | for tag in tags: 28 | matched_text = text[tag['start']:tag['end']] 29 | print tag, matched_text 30 | if tag['reference_db'] == 'LEXEBI': 31 | lexebi_tags.add(matched_text) 32 | else: 33 | old_tags.add(matched_text) 34 | new_tags = lexebi_tags.difference(old_tags) 35 | print 'New tags identified : {}'.format(new_tags) 36 | 37 | 38 | if __name__ == "__main__": 39 | unittest.main() -------------------------------------------------------------------------------- /tests/text_medline_parser.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/bin/python 2 | # -*- coding: UTF-8 -*- 3 | import os 4 | import unittest 5 | from lxml import etree 6 | from tqdm import tqdm 7 | 8 | from main import parse_medline_xml 9 | from modules.BioentityTagger import BioEntityTagger 10 | from modules.NLP import init_spacy_english_language, SentenceAnalysisSpacy, DocumentAnalysisSpacy 11 | 12 | class MedlineParser(unittest.TestCase): 13 | 14 | def testParsing(self): 15 | file_name = 'resources/cancer_small.xml' 16 | tree = etree.parse(file_name) 17 | out = open('resources/pubmed_result.abstract.txt','w') 18 | for element in tqdm(tree.iter('MedlineCitation')): 19 | parsed = next(parse_medline_xml(etree.tostring(element),file_name)) 20 | if parsed['abstract']: 21 | try: 22 | out.write(parsed['abstract'].encode('utf-8').replace('\n','')+'\n') 23 | except Exception as e: 24 | print 'could not parse', e 25 | 26 | 27 | 28 | if __name__ == '__main__': 29 | unittest.main() 30 | -------------------------------------------------------------------------------- /venv_elasticsearch.txt: -------------------------------------------------------------------------------- 1 | cachetools==3.1.1 2 | certifi==2019.9.11 3 | chardet==3.0.4 4 | elasticsearch==5.5.1 5 | futures==3.3.0 6 | google-api-core==1.14.3 7 | google-auth==1.7.1 8 | google-cloud-core==1.0.3 9 | google-cloud-storage==1.23.0 10 | google-resumable-media==0.5.0 11 | googleapis-common-protos==1.6.0 12 | idna==2.8 13 | pipdeptree==0.13.2 14 | pkg-resources==0.0.0 15 | protobuf==3.11.0 16 | pyasn1==0.4.8 17 | pyasn1-modules==0.2.7 18 | pytz==2019.3 19 | requests==2.22.0 20 | rsa==4.0 21 | six==1.13.0 22 | tqdm==4.39.0 23 | urllib3==1.21.1 24 | --------------------------------------------------------------------------------
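For context on the `restore_index_settings` block from `load2es.py` near the top of this dump: a common bulk-loading pattern is to relax those same index settings before indexing and restore them afterwards. A minimal sketch with the pinned `elasticsearch` client; the host and index name are placeholders taken from `publication_alias.sh`, and the pre-load values (`refresh_interval: -1`, zero replicas) are a generic convention rather than something read from `load2es.py`.

```python
# Hedged sketch: bracket a bulk load with relaxed / restored index settings.
from elasticsearch import Elasticsearch

es = Elasticsearch('http://esurl:9200')  # placeholder host, as in publication_alias.sh
index_name = 'pubmed-18'                 # example index name from publication_alias.sh

# Relax settings while bulk indexing (assumed convention, not from the repo).
es.indices.put_settings(index=index_name,
                        body={'index': {'refresh_interval': '-1',
                                        'number_of_replicas': 0}})

# ... bulk indexing would happen here (see load2es.py) ...

# Restore the settings, mirroring restore_index_settings in load2es.py.
es.indices.put_settings(index=index_name,
                        body={'index': {'refresh_interval': '1s',
                                        'number_of_replicas': 1,
                                        'translog.durability': 'request'}})
```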