├── images ├── np-kg-mr.jpg ├── MR_workflow.png ├── framework-github.png └── methods-overview.png ├── resources ├── pmids │ ├── flaxseed_pmid.txt │ ├── gojiberry_pmid.txt │ ├── hawthorn_pmid.txt │ ├── ashwaganda_pmid.txt │ ├── feverfew_pmid.txt │ ├── oregano_pmid.txt │ ├── rhodiola_pmid.txt │ ├── scrubpalmetto_pmid.txt │ ├── valerian_pmid.txt │ ├── cranberry_pmid.txt │ ├── blackcohosh_pmid.txt │ ├── rosemary_pmid.txt │ ├── echinacea_pmid.txt │ ├── horsechestnut_pmid.txt │ ├── milkthistle_pmid.txt │ ├── kratom_pmid.txt │ ├── garlic_pmid.txt │ ├── ginkgo_pmid.txt │ ├── panaxginseng_pmid.txt │ ├── fenugreek_pmid.txt │ ├── soybean_pmid.txt │ ├── turmeric_pmid.txt │ ├── blackpepper_pmid.txt │ ├── licorice_pmid.txt │ ├── grapefruit_pmid.txt │ ├── goldenseal_pmid.txt │ ├── cinnamon_pmid.txt │ └── greentea_pmid.txt ├── data │ ├── dikb-evidence.zip │ ├── fda-drug-interaction-evidence.zip │ ├── CHEMICAL_TRANSPORTER.tsv │ ├── CHEMICAL_INHIBITOR.tsv │ └── CHEMICAL_MOLECULE.tsv ├── ontology_source_list.txt ├── edge_data │ ├── edge_counts.tsv │ └── README.md ├── edge_source_list.txt ├── node_data │ └── README.md ├── ontology-extensions │ ├── log-ontology-extensions-20240229.txt │ └── generate_ontology_extensions.py └── resource_info.txt ├── pheknowlator-notebooks └── README.md ├── requirements.txt ├── literature-graphs └── README.md ├── relation-extraction-scripts ├── README.md ├── NER_metamap.py ├── indraREACH_extract.py ├── pdf_to_text.py ├── semrepExtract.py ├── machineReadMain.py ├── semrep_process_error_files.py └── semrep_process_pmid.py ├── evaluation-notebooks ├── README.md ├── pathSearchMain.py └── KG_path_searches.py ├── .gitignore ├── README.md ├── util-notebooks └── create_strToOBOdict.ipynb └── LICENSE /images/np-kg-mr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanyabt/np-kg/HEAD/images/np-kg-mr.jpg -------------------------------------------------------------------------------- 
/images/MR_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanyabt/np-kg/HEAD/images/MR_workflow.png -------------------------------------------------------------------------------- /images/framework-github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanyabt/np-kg/HEAD/images/framework-github.png -------------------------------------------------------------------------------- /images/methods-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanyabt/np-kg/HEAD/images/methods-overview.png -------------------------------------------------------------------------------- /resources/pmids/flaxseed_pmid.txt: -------------------------------------------------------------------------------- 1 | 27055970 2 | 31128594 3 | 30709839 4 | 26569216 5 | 19278222 6 | 32719085 -------------------------------------------------------------------------------- /resources/data/dikb-evidence.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanyabt/np-kg/HEAD/resources/data/dikb-evidence.zip -------------------------------------------------------------------------------- /resources/data/fda-drug-interaction-evidence.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sanyabt/np-kg/HEAD/resources/data/fda-drug-interaction-evidence.zip -------------------------------------------------------------------------------- /resources/pmids/gojiberry_pmid.txt: -------------------------------------------------------------------------------- 1 | 11675844 2 | 17568570 3 | 33755822 4 | 28241438 5 | 27352447 6 | 22779192 7 | 29931910 8 | 35931199 9 | 10 | -------------------------------------------------------------------------------- 
/resources/pmids/hawthorn_pmid.txt: -------------------------------------------------------------------------------- 1 | 12817526 2 | 11807965 3 | 34182907 4 | 27878303 5 | 24952587 6 | 24736980 7 | 24371223 8 | 22814436 9 | 22776719 10 | 21049395 11 | 14682819 12 | 12817526 13 | -------------------------------------------------------------------------------- /resources/pmids/ashwaganda_pmid.txt: -------------------------------------------------------------------------------- 1 | 33460762 2 | 33395575 3 | 32563149 4 | 32201301 5 | 31062367 6 | 30987861 7 | 27241252 8 | 25938222 9 | 25893199 10 | 25684704 11 | 25549922 12 | 24105360 13 | 21853114 14 | 18367983 15 | -------------------------------------------------------------------------------- /pheknowlator-notebooks/README.md: -------------------------------------------------------------------------------- 1 | ## PheKnowLator (Phenotype Knowledge Translator) 2 | 3 | All notebooks in pheknowlator-notebooks are run using the code and instructions from the [PheKnowLator project](https://github.com/callahantiff/PheKnowLator). 
-------------------------------------------------------------------------------- /resources/pmids/feverfew_pmid.txt: -------------------------------------------------------------------------------- 1 | 11206893 2 | 15122077 3 | 33258341 4 | 31144365 5 | 30484368 6 | 26821066 7 | 26626238 8 | 26024595 9 | 24176849 10 | 23933184 11 | 23792430 12 | 23560381 13 | 20707612 14 | 19584059 15 | 19298255 16 | 17272824 17 | 14735438 18 | -------------------------------------------------------------------------------- /resources/pmids/oregano_pmid.txt: -------------------------------------------------------------------------------- 1 | 29211364 2 | 33629672 3 | 31848974 4 | 24759772 5 | 29777637 6 | 24934554 7 | 29094727 8 | 27912149 9 | 34184069 10 | 20729757 11 | 21205415 12 | 29191726 13 | 23255497 14 | 33993977 15 | 18334983 16 | 20086034 17 | 26453324 18 | 25451095 19 | 19909350 20 | 36017806 -------------------------------------------------------------------------------- /resources/pmids/rhodiola_pmid.txt: -------------------------------------------------------------------------------- 1 | 20845605 2 | 26613955 3 | 34471002 4 | 32719085 5 | 31540384 6 | 31399405 7 | 30987861 8 | 30666592 9 | 29017559 10 | 28741143 11 | 27664690 12 | 27572116 13 | 26613955 14 | 25970041 15 | 25747701 16 | 24400445 17 | 24370051 18 | 23835496 19 | 23399640 20 | 22648620 21 | 19790032 22 | 9516038 23 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython>=0.29.14 2 | ipywidgets>=7.7.0 3 | more-itertools>=8.6.0 4 | networkx>=2.4 5 | numpy>=1.18.1 6 | openpyxl>=3.0.3 7 | pandas>=1.0.5 8 | psutil>=5.6.3 9 | python-json-logger>=2.0.1 10 | ray>=1.1.0 11 | rdflib>=4.2.2 12 | reactome2py>=0.0.8 13 | requests>=2.22.0 14 | responses==0.10.12 15 | tqdm>=4.47.0 16 | urllib3>=1.25.3 -------------------------------------------------------------------------------- 
/resources/pmids/scrubpalmetto_pmid.txt: -------------------------------------------------------------------------------- 1 | 19719333 2 | 26417310 3 | 22257149 4 | 32719085 5 | 14663456 6 | 22588833 7 | 19359798 8 | 15992226 9 | 23674609 10 | 21632963 11 | 19336899 12 | 19154468 13 | 15536458 14 | 25430798 15 | 24392691 16 | 20666626 17 | 25074401 18 | 31955702 19 | 28434739 20 | 22301766 21 | 19096068 22 | 14663456 23 | 11218623 -------------------------------------------------------------------------------- /literature-graphs/README.md: -------------------------------------------------------------------------------- 1 | ## Relation Extraction Workflow for Literature-based Graph 2 | 3 | ![MR-workflow](../images/MR_workflow.png) 4 | 5 | ### Notebooks 6 | 7 | 1. process_reach_data.ipynb 8 | 2. process_semrep_data.ipynb 9 | 3. run_closure_machine_read.ipynb 10 | 4. merge_machine_read_graphs.ipynb 11 | 12 | Output is saved in ./output_graphs/ -------------------------------------------------------------------------------- /resources/pmids/valerian_pmid.txt: -------------------------------------------------------------------------------- 1 | 11876586 2 | 11762111 3 | 10402626 4 | 15900287 5 | 15328251 6 | 4615718 7 | 33932511 8 | 32719085 9 | 31502480 10 | 25430798 11 | 24392691 12 | 23674609 13 | 22588833 14 | 22062945 15 | 21632963 16 | 21184395 17 | 21170698 18 | 21049395 19 | 20666626 20 | 19371257 21 | 18602406 22 | 18331390 23 | 17910620 24 | 17484515 25 | 17214607 26 | 15367385 27 | 15070158 -------------------------------------------------------------------------------- /relation-extraction-scripts/README.md: -------------------------------------------------------------------------------- 1 | ## Relation Extraction Workflow for Literature-based Graph 2 | 3 | ![MR-workflow](../images/MR_workflow.png) 4 | 5 | ### Scripts: 6 | 1. *indraREACH_extract.py* 7 | 2. *machineReadMain.py* 8 | 3. *NER_metamap.py* 9 | 4. *pdf_to_text.py* 10 | 5. 
*reachJSONextraction.py* 11 | 6. *semrep_process_pmid.py* 12 | 7. *semrep_process_error_files.py* 13 | 8. *semrepExtract.py* -------------------------------------------------------------------------------- /resources/pmids/cranberry_pmid.txt: -------------------------------------------------------------------------------- 1 | 27335715 2 | 21364039 3 | 16413247 4 | 35230114 5 | 34620272 6 | 34579096 7 | 32719085 8 | 29703387 9 | 29521054 10 | 25430798 11 | 24997313 12 | 24387788 13 | 23958198 14 | 23843424 15 | 23673492 16 | 22673012 17 | 22588833 18 | 21632963 19 | 21139236 20 | 21049395 21 | 20717876 22 | 19694738 23 | 19114462 24 | 19082882 25 | 18516070 26 | 17392729 27 | 17322161 28 | 16513449 29 | 12569440 -------------------------------------------------------------------------------- /resources/pmids/blackcohosh_pmid.txt: -------------------------------------------------------------------------------- 1 | 16432272 2 | 16221754 3 | 15900287 4 | 32786546 5 | 32719085 6 | 29074104 7 | 25705905 8 | 25430798 9 | 24704556 10 | 23972793 11 | 23939423 12 | 23738729 13 | 22257149 14 | 21632963 15 | 21213356 16 | 20979450 17 | 20666626 18 | 20460823 19 | 20406160 20 | 20218935 21 | 19815591 22 | 19353999 23 | 18755149 24 | 18214849 25 | 17465192 26 | 16432272 27 | 16415120 28 | 16221754 29 | 15900287 30 | -------------------------------------------------------------------------------- /resources/pmids/rosemary_pmid.txt: -------------------------------------------------------------------------------- 1 | 30087294 2 | 34943193 3 | 31644427 4 | 26626245 5 | 33221424 6 | 33471600 7 | 24072817 8 | 29073181 9 | 24934554 10 | 35458697 11 | 19944162 12 | 29198326 13 | 16873909 14 | 34592323 15 | 29287192 16 | 22531045 17 | 31309269 18 | 12867494 19 | 24316531 20 | 10673984 21 | 34356653 22 | 11498267 23 | 21896322 24 | 30707903 25 | 31623608 26 | 9103309 27 | 28167530 28 | 22374940 29 | 28821847 30 | 25794239 31 | 9806165 32 | 31058853 33 | 22062434 34 | 29710819 35 | 
28931599 36 | 7554054 37 | 11712913 38 | 11405259 39 | 25038454 40 | 22217522 41 | 10432344 -------------------------------------------------------------------------------- /resources/pmids/echinacea_pmid.txt: -------------------------------------------------------------------------------- 1 | 23701184 2 | 21078942 3 | 20573086 4 | 15536458 5 | 33730506 6 | 33450678 7 | 32786546 8 | 32719085 9 | 30660822 10 | 26592089 11 | 26441065 12 | 25705905 13 | 25430798 14 | 24392691 15 | 24387348 16 | 23701184 17 | 23674609 18 | 23408271 19 | 22855269 20 | 22607644 21 | 22588833 22 | 22319006 23 | 21632963 24 | 21385154 25 | 21184395 26 | 21170698 27 | 20939821 28 | 20666626 29 | 20653355 30 | 20573086 31 | 19815591 32 | 19719333 33 | 19427870 34 | 19174505 35 | 18947363 36 | 18688789 37 | 18618481 38 | 18584812 39 | 18425719 40 | 18331390 41 | 18258644 42 | 18214849 43 | 17910620 44 | 17658211 45 | 17214607 46 | 16983620 47 | 16877262 48 | 16415120 49 | 15992226 50 | 15536458 51 | 15197212 52 | 15070158 53 | 14749695 54 | 10969720 55 | -------------------------------------------------------------------------------- /resources/ontology_source_list.txt: -------------------------------------------------------------------------------- 1 | disease, resources/ontologies/mondo_with_imports.owl 2 | napdichem, resources/ontologies/chebi_lite_merged_with_imports.owl 3 | protein, resources/ontologies/pr_with_imports.owl 4 | pathway, resources/ontologies/pw_with_imports.owl 5 | relation, resources/ontologies/ro_with_imports.owl 6 | go, resources/ontologies/go_with_imports.owl 7 | chemical, resources/ontologies/chebi_with_imports.owl 8 | phenotype, resources/ontologies/hp_with_imports.owl 9 | anatomy, resources/ontologies/uberon_with_imports.owl 10 | cell, resources/ontologies/clo_with_imports.owl 11 | genomic, resources/ontologies/so_with_imports.owl 12 | oae, resources/ontologies/oae_merged_with_imports.owl 13 | plant, resources/ontologies/po_with_imports.owl 14 | dideo, 
resources/ontologies/dideo_with_imports.owl -------------------------------------------------------------------------------- /resources/pmids/horsechestnut_pmid.txt: -------------------------------------------------------------------------------- 1 | 30972964 2 | 25632187 3 | 19371257 4 | 18331390 5 | 18071748 6 | 17910620 7 | 17644079 8 | 17220563 9 | 17214607 10 | 15321785 11 | 11588102 12 | 9355936 13 | 8739824 14 | 8737764 15 | 7777701 16 | 3527643 17 | 11298067 18 | 14964349 19 | 8569363 20 | 1621440 21 | 8975254 22 | 27141926 23 | 5753115 24 | 8554461 25 | 351993 26 | 20148408 27 | 8135874 28 | 2686181 29 | 24252494 30 | 24101860 31 | 8721797 32 | 11933130 33 | 9748710 34 | 4681124 35 | 20662111 36 | 11196340 37 | 4611437 38 | 652841 39 | 14418281 40 | 17310430 41 | 815806 42 | 8975251 43 | 16437450 44 | 23152216 45 | 21498026 46 | 8737631 47 | 23741979 48 | 8047546 49 | 7480102 50 | 9793616 51 | 12518108 52 | 11529685 53 | 21310579 54 | 3699401 55 | 25137224 56 | 20185376 57 | 5351610 58 | 25632792 59 | 8519500 60 | 18489306 61 | 7398276 62 | 19025809 63 | 20148408 64 | 10902065 65 | 1084042 66 | 12808296 67 | 818558 68 | 6814494 69 | 2751881 70 | 3699401 71 | 3266797 72 | 6387161 73 | 8737631 74 | 34785211 -------------------------------------------------------------------------------- /resources/pmids/milkthistle_pmid.txt: -------------------------------------------------------------------------------- 1 | 29857089 2 | 25028567 3 | 20014183 4 | 16432272 5 | 16221754 6 | 15666173 7 | 15536458 8 | 12885100 9 | 33706132 10 | 33587330 11 | 32719085 12 | 32344919 13 | 31615114 14 | 29706079 15 | 29254285 16 | 29241692 17 | 28791405 18 | 28457856 19 | 27557939 20 | 26595166 21 | 26438626 22 | 26070840 23 | 25705905 24 | 25623616 25 | 25028567 26 | 25008344 27 | 24730468 28 | 24392691 29 | 23801821 30 | 23674377 31 | 23673225 32 | 23674609 33 | 23394826 34 | 23260576 35 | 22855269 36 | 22588833 37 | 22257149 38 | 22139684 39 | 21632963 40 | 21385154 41 | 21328458 
42 | 21170698 43 | 21049395 44 | 20939821 45 | 20828605 46 | 20666626 47 | 20460823 48 | 19934397 49 | 19815591 50 | 19041708 51 | 18214849 52 | 17968815 53 | 17670841 54 | 17611934 55 | 17464157 56 | 17305535 57 | 16983620 58 | 16877262 59 | 16432272 60 | 16289744 61 | 16278402 62 | 16221754 63 | 15536458 64 | 15508430 65 | 15266218 66 | 15072439 67 | 12410543 68 | 12160480 69 | 11377378 70 | 11038151 71 | 10895987 72 | -------------------------------------------------------------------------------- /resources/pmids/kratom_pmid.txt: -------------------------------------------------------------------------------- 1 | 23274770 2 | 33093187 3 | 30604191 4 | 30547698 5 | 31707106 6 | 30088322 7 | 24841968 8 | 29966288 9 | 32913013 10 | 31874991 11 | 30306542 12 | 31045747 13 | 30347696 14 | 21355602 15 | 34003732 16 | 29071751 17 | 31525874 18 | 25262913 19 | 25535742 20 | 27260674 21 | 32583783 22 | 29667422 23 | 23786896 24 | 26692748 25 | 25637762 26 | 30083819 27 | 31878226 28 | 24174816 29 | 17628447 30 | 23435185 31 | 21876481 32 | 24867949 33 | 16107269 34 | 15509230 35 | 29436100 36 | 16409553 37 | 20665236 38 | 9545295 39 | 33735724 40 | 28500736 41 | 20090555 42 | 23773766 43 | 22426328 44 | 32589167 45 | 14761684 46 | 31886779 47 | 16930441 48 | 19114647 49 | 9313947 50 | 30608764 51 | 12244083 52 | 10994809 53 | 17437410 54 | 15805193 55 | 12558585 56 | 3056909 57 | 10084314 58 | 26574358 59 | 25814868 60 | 35165231 61 | 34867362 62 | 34370637 63 | 33690067 64 | 33154449 65 | 33066617 66 | 32601103 67 | 31626272 68 | 31308789 69 | 29499286 70 | 23846544 71 | 21513619 72 | 20650576 73 | 34775626 74 | 34597080 75 | -------------------------------------------------------------------------------- /resources/pmids/garlic_pmid.txt: -------------------------------------------------------------------------------- 1 | 14742762 2 | 12958776 3 | 11206893 4 | 10994537 5 | 16484565 6 | 15974642 7 | 12235448 8 | 34724652 9 | 33450678 10 | 32719085 11 | 31940976 12 | 
31092429 13 | 27725449 14 | 27517516 15 | 26423732 16 | 26264202 17 | 25705905 18 | 25430798 19 | 24788927 20 | 24392691 21 | 24387348 22 | 23674609 23 | 23394826 24 | 23272117 25 | 22855269 26 | 22668601 27 | 22610793 28 | 22588833 29 | 22292789 30 | 22257149 31 | 22024968 32 | 21919844 33 | 21838705 34 | 21632963 35 | 21459083 36 | 21388437 37 | 21256950 38 | 21138349 39 | 21104925 40 | 20933082 41 | 20930421 42 | 20666626 43 | 20610890 44 | 20345351 45 | 20226993 46 | 20140680 47 | 20091745 48 | 19815591 49 | 19719333 50 | 19336907 51 | 19174505 52 | 19170155 53 | 18570158 54 | 18516070 55 | 17691916 56 | 17520814 57 | 16983620 58 | 16899612 59 | 16877262 60 | 16823096 61 | 16484569 62 | 16484557 63 | 16302488 64 | 15974642 65 | 15898829 66 | 15649425 67 | 15266218 68 | 15203378 69 | 15197212 70 | 14729595 71 | 14672753 72 | 12891227 73 | 12635815 74 | 12378978 75 | 12235448 76 | 12160480 77 | 12125178 78 | 12127912 79 | 11810949 80 | 11764996 81 | 11740713 82 | 11530830 83 | 11466175 84 | 11408364 85 | 11358376 86 | 11355006 87 | 11238818 88 | 10383929 89 | 8611641 90 | -------------------------------------------------------------------------------- /evaluation-notebooks/README.md: -------------------------------------------------------------------------------- 1 | ## NP-KG Evaluation (case studies green tea and kratom) 2 | 3 | NP-KG has been evaluated with case studies of pharmacokinetic green tea- and kratom-drug interactions involving enzymes and transporters. This folder contains the Jupyter notebooks used for the evaluation strategies described below. 4 | 5 | ### I. Knowledge Recapturing 6 | 7 | 1. **Direct Edges:** 8 | _[KG-mechanistic-knowledge-recapture.ipynb](https://github.com/sanyabt/np-kg/blob/main/evaluation-notebooks/KG-mechanistic-knowledge-recapture.ipynb)_ finds direct edges between natural product nodes (natural products and their constituents) and interacting enzymes and transporters (if any). 9 | 10 | 2. 
**Shortest Path Searches:** 11 | _[KG-mechanistic-knowledge-shortest-paths.ipynb](https://github.com/sanyabt/np-kg/blob/main/evaluation-notebooks/KG-mechanistic-knowledge-shortest-paths.ipynb)_ finds the shortest paths between the natural product and interacting enzymes and transporters, if there are no direct edges between them. 12 | 13 | ### II. Meta-path Discovery 14 | _[KG-metapath-discovery.ipynb](https://github.com/sanyabt/np-kg/blob/main/evaluation-notebooks/KG-metapath-discovery.ipynb)_ applies direct edge and meta-path searches to find mechanistic pathways between natural product nodes and drugs, with interacting enzymes or transporters. Meta-path searches are applied for the following natural product-drug pairs: 15 | 16 | * Green tea - raloxifene 17 | * Green tea - nadolol 18 | * Kratom - midazolam 19 | * Kratom - quetiapine 20 | * Kratom - venlafaxine 21 | 22 | #### Ground Truth 23 | All searches above are evaluated based on the ground truth data from the [NaPDI Center database](https://repo.napdi.org/). 
-------------------------------------------------------------------------------- /resources/pmids/ginkgo_pmid.txt: -------------------------------------------------------------------------------- 1 | 34329714 2 | 32645647 3 | 35703551 4 | 31325787 5 | 36265676 6 | 34182907 7 | 34010745 8 | 32786546 9 | 32779484 10 | 32719085 11 | 32435840 12 | 32144952 13 | 31721714 14 | 31452210 15 | 31453781 16 | 31327236 17 | 31325605 18 | 30541279 19 | 30502764 20 | 30445863 21 | 30054600 22 | 28993980 23 | 28416372 24 | 28177134 25 | 26706698 26 | 26434836 27 | 26223121 28 | 25705905 29 | 25456428 30 | 24492725 31 | 24392691 32 | 24387348 33 | 24228138 34 | 24225402 35 | 24114901 36 | 23865865 37 | 23674609 38 | 23477707 39 | 23339036 40 | 22855269 41 | 22588833 42 | 22394605 43 | 22292790 44 | 22292789 45 | 22257149 46 | 22139684 47 | 21919844 48 | 21632963 49 | 21422673 50 | 21385154 51 | 21184395 52 | 21170698 53 | 21139040 54 | 21084040 55 | 21059362 56 | 21049395 57 | 20939821 58 | 20797388 59 | 20186406 60 | 20179667 61 | 19889883 62 | 19719333 63 | 19694739 64 | 19487249 65 | 19401473 66 | 19371257 67 | 19280523 68 | 18954552 69 | 18331390 70 | 18329806 71 | 18204840 72 | 19356072 73 | 18066130 74 | 17910620 75 | 17611934 76 | 17214607 77 | 16922812 78 | 16877262 79 | 16842996 80 | 16497339 81 | 16415120 82 | 16400219 83 | 16289744 84 | 16226778 85 | 15992226 86 | 15969930 87 | 15901353 88 | 15801937 89 | 15608563 90 | 15508430 91 | 15285849 92 | 15212966 93 | 15197212 94 | 15147983 95 | 15133536 96 | 14624188 97 | 13130383 98 | 12905864 99 | 12628485 100 | 12127912 101 | 11062691 102 | 10481350 103 | 28105513 104 | 22802250 105 | 22189672 106 | 19299322 107 | 18420532 108 | 18205997 109 | 17050802 110 | 17050793 111 | 16707409 112 | 16432273 113 | 16428919 114 | 15974642 115 | 12845387 116 | 12235448 117 | 12083489 118 | 11199954 119 | 28144968 120 | 22821965 121 | 22323244 122 | 19451798 123 | 17908535 124 | 16419414 125 | 14742762 126 | 13679551 127 | 12818420 128 | 
12813312 129 | 11953006 130 | 11876586 131 | 11206893 132 | 10994537 133 | 10836866 134 | 10531760 135 | 11140327 -------------------------------------------------------------------------------- /resources/pmids/panaxginseng_pmid.txt: -------------------------------------------------------------------------------- 1 | 21844260 2 | 18637764 3 | 18319359 4 | 17050802 5 | 15974642 6 | 15582012 7 | 15238367 8 | 12817527 9 | 12695337 10 | 12235448 11 | 11140327 12 | 25756365 13 | 23760126 14 | 22864326 15 | 15385077 16 | 12711887 17 | 12020172 18 | 11206893 19 | 10531760 20 | 8705908 21 | 3209715 22 | 3597812 23 | 34930332 24 | 34799220 25 | 34549583 26 | 34028092 27 | 33000213 28 | 32810615 29 | 32719085 30 | 32662640 31 | 32251615 32 | 32201301 33 | 32058643 34 | 32023909 35 | 31793297 36 | 31578207 37 | 31453781 38 | 31048189 39 | 30978651 40 | 30525897 41 | 30486542 42 | 30327544 43 | 30078466 44 | 29455730 45 | 29024717 46 | 28965396 47 | 28754329 48 | 28705808 49 | 28707966 50 | 28679380 51 | 28231742 52 | 28043879 53 | 27867186 54 | 27864798 55 | 27619505 56 | 27614967 57 | 27495955 58 | 27462006 59 | 26845774 60 | 26824187 61 | 26223121 62 | 25705905 63 | 25600494 64 | 25297453 65 | 25242515 66 | 25142999 67 | 24922060 68 | 24632066 69 | 24392691 70 | 24387348 71 | 24358789 72 | 24151189 73 | 23776647 74 | 23674609 75 | 23600156 76 | 23394826 77 | 23340957 78 | 23306165 79 | 23165347 80 | 23092794 81 | 22855269 82 | 22736593 83 | 22588833 84 | 22584255 85 | 22426160 86 | 22292789 87 | 22269065 88 | 22257149 89 | 21919844 90 | 21646440 91 | 21632963 92 | 21385154 93 | 21293466 94 | 21240677 95 | 21184395 96 | 21178302 97 | 20939821 98 | 20666626 99 | 20460823 100 | 20399767 101 | 20332334 102 | 20099200 103 | 19719333 104 | 19557931 105 | 19353999 106 | 19336893 107 | 19093375 108 | 19060914 109 | 19022240 110 | 18628990 111 | 18367983 112 | 17917277 113 | 17611934 114 | 16729968 115 | 16547074 116 | 16491447 117 | 16415120 118 | 15974642 119 | 15898829 120 | 
15647404 121 | 15359132 122 | 15266218 123 | 15197212 124 | 15133536 125 | 14499026 126 | 12817527 127 | 12695340 128 | 12695337 129 | 12677527 130 | 12661769 131 | 12473381 132 | 12235448 133 | 12160480 134 | 12020680 135 | 11901090 136 | 11199127 137 | 11173062 138 | 10574228 139 | 9168157 140 | -------------------------------------------------------------------------------- /resources/pmids/fenugreek_pmid.txt: -------------------------------------------------------------------------------- 1 | 11310527 2 | 9175175 3 | 31589037 4 | 28679380 5 | 26335391 6 | 31084937 7 | 24966255 8 | 22474806 9 | 23624139 10 | 18812751 11 | 31535851 12 | 30322408 13 | 30226032 14 | 27667025 15 | 26969417 16 | 26365335 17 | 25988261 18 | 24472704 19 | 24444280 20 | 23151341 21 | 22114872 22 | 17703139 23 | 17319111 24 | 16432273 25 | 16162626 26 | 15606445 27 | 15581264 28 | 15525445 29 | 14764441 30 | 12944028 31 | 12902916 32 | 12851279 33 | 12709721 34 | 12623212 35 | 12451326 36 | 12434720 37 | 12399273 38 | 12356283 39 | 12116891 40 | 11882828 41 | 11845867 42 | 11824856 43 | 11488773 44 | 11151743 45 | 10991932 46 | 10555898 47 | 10424324 48 | 10361807 49 | 10336578 50 | 10234600 51 | 9920455 52 | 9489597 53 | 9430449 54 | 26335391 55 | 7781462 56 | 7660155 57 | 7715468 58 | 7530473 59 | 7826820 60 | 12959295 61 | 8400098 62 | 1915111 63 | 2975995 64 | 6308702 65 | 10720790 66 | 11370345 67 | 5814423 68 | 11297864 69 | 26600643 70 | 3352284 71 | 7480183 72 | 24022709 73 | 27639708 74 | 7254294 75 | 27406028 76 | 23411266 77 | 13808147 78 | 23429594 79 | 28283164 80 | 20020282 81 | 19809809 82 | 28947828 83 | 28750220 84 | 27822173 85 | 14669264 86 | 20219717 87 | 18930518 88 | 167146 89 | 15350674 90 | 26436069 91 | 26887316 92 | 26887316 93 | 27496582 94 | 11868855 95 | 26778682 96 | 28259652 97 | 10902065 98 | 15738612 99 | 25057273 100 | 25339548 101 | 20979021 102 | 26104039 103 | 16472574 104 | 20600755 105 | 22178172 106 | 24224030 107 | 7649611 108 | 11532065 109 | 
25636873 110 | 24868532 111 | 19857068 112 | 18219452 113 | 3286242 114 | 28138310 115 | 28138310 116 | 12256225 117 | 19353539 118 | 28543097 119 | 28543097 120 | 27110551 121 | 15284693 122 | 25229863 123 | 10506006 124 | 22021992 125 | 26835874 126 | 23512705 127 | 24438170 128 | 9574894 129 | 27498339 130 | 10527654 131 | 16298092 132 | 9087156 133 | 8327543 134 | 27939889 135 | 27756958 136 | 26098483 137 | 11693199 138 | 26791805 139 | 17333802 140 | 6703649 141 | 2871558 142 | 26358163 143 | 26600920 144 | 14496025 145 | 10475807 146 | 2194788 147 | 28199188 148 | 18554847 149 | 21312304 150 | 26516311 151 | 12672166 152 | 18928139 153 | 6696779 154 | 11744301 155 | 18338783 156 | 15980869 157 | 22803695 158 | 9268042 159 | 17392143 160 | 11297850 161 | 12889128 -------------------------------------------------------------------------------- /resources/pmids/soybean_pmid.txt: -------------------------------------------------------------------------------- 1 | 15207228 2 | 12639172 3 | 12452752 4 | 11421567 5 | 9399621 6 | 8918638 7 | 27983686 8 | 27080067 9 | 26251920 10 | 25810408 11 | 21883496 12 | 19251923 13 | 17845252 14 | 15309425 15 | 12817527 16 | 9774365 17 | 9062860 18 | 7956996 19 | 2920504 20 | 35073946 21 | 34571971 22 | 33450678 23 | 33300795 24 | 32929449 25 | 32824087 26 | 32786546 27 | 32719085 28 | 31820026 29 | 31408695 30 | 31292876 31 | 31262454 32 | 30920571 33 | 30883634 34 | 30767231 35 | 30610857 36 | 30203535 37 | 30053282 38 | 30014295 39 | 29984664 40 | 29939292 41 | 29608108 42 | 29601521 43 | 29327292 44 | 29154189 45 | 29101532 46 | 28706438 47 | 28644496 48 | 28300141 49 | 28259659 50 | 28095752 51 | 28025072 52 | 27977298 53 | 27705794 54 | 27440109 55 | 27322965 56 | 27211268 57 | 27048380 58 | 27033456 59 | 26703673 60 | 26385990 61 | 26296158 62 | 25781341 63 | 25482884 64 | 25344067 65 | 25319728 66 | 25077358 67 | 25011215 68 | 24697705 69 | 24699854 70 | 24685904 71 | 24573253 72 | 24478031 73 | 24476214 74 | 24297371 
75 | 24295672 76 | 24200780 77 | 24029851 78 | 24018688 79 | 23871935 80 | 23370448 81 | 23344026 82 | 23144813 83 | 23083838 84 | 22842481 85 | 22654110 86 | 22588833 87 | 22581840 88 | 22542538 89 | 22146138 90 | 21666065 91 | 21591741 92 | 21049395 93 | 20607366 94 | 20526668 95 | 20460823 96 | 20155619 97 | 20045417 98 | 19945846 99 | 19910524 100 | 19886390 101 | 19883429 102 | 19875054 103 | 19458107 104 | 19356625 105 | 19190233 106 | 19123435 107 | 19022961 108 | 18632754 109 | 18585705 110 | 18570158 111 | 18543926 112 | 18497090 113 | 18430559 114 | 18204840 115 | 18189424 116 | 17999603 117 | 17827443 118 | 17343982 119 | 16490810 120 | 16415120 121 | 16363067 122 | 16289744 123 | 16132860 124 | 16051636 125 | 15997229 126 | 15753073 127 | 15554233 128 | 15351776 129 | 14596646 130 | 12908908 131 | 12860261 132 | 12851793 133 | 12817527 134 | 12809364 135 | 12756512 136 | 12435590 137 | 12270218 138 | 11532308 139 | 11435884 140 | 11134544 141 | 10868951 142 | 10837012 143 | 10759869 144 | 10681374 145 | 10652098 146 | 10460064 147 | 10397283 148 | 9804957 149 | 9725998 150 | 9623777 151 | 9622078 152 | 8701967 153 | 7585636 154 | 8392369 155 | 1325684 156 | 2647147 157 | 3223959 158 | 2455328 159 | 3304999 160 | 3972834 161 | -------------------------------------------------------------------------------- /resources/edge_data/edge_counts.tsv: -------------------------------------------------------------------------------- 1 | Edge Type Relation Example Edge Unique Edges 2 | chemical-protein RO_0002434 CHEBI_4592, PR_P07099 72564 3 | gene-disease RO_0003302 2, MONDO_0010200 13344 4 | chemical-disease RO_0002610 CHEBI_71223, MONDO_0018229 112863 5 | rna-protein RO_0002513 ENST00000499023, PR_O43660 28220 6 | variant-gene RO_0002566 rs367697256, 7476 236121 7 | chemical-molecule RO_0002436 CHEBI_35553, PR_P10635 393 8 | protein-anatomy RO_0001025 PR_P04217, UBERON_0001114 26815 9 | protein-gocc RO_0001025 PR_A0A024RBG1, GO_0005829 82524 10 | 
variant-phenotype RO_0003302 rs527236137, HP_0000556 4855 11 | disease-phenotype RO_0002200 MONDO_0010761, HP_0000510 444807 12 | protein-cell RO_0001025 PR_P04217, UBERON_0002113 75311 13 | gene-rna RO_0002511 81623, ENST00000382398 197370 14 | protein-gobp RO_0000056 PR_A0A075B6H7, GO_0002250 129426 15 | protein-protein RO_0002436 PR_P84085, PR_O15020 617499 16 | rna-anatomy RO_0001025 ENST00000359318, UBERON_0001987 474073 17 | protein-catalyst RO_0002436 PR_Q00266, CHEBI_15377 25136 18 | chemical-pathway RO_0000056 CHEBI_10033, R-HSA-1430728 29988 19 | chemical-inhibitor RO_0002449 CHEBI_28901, PR_P11509 273 20 | gene-gene RO_0002435 84220, 729540 1715 21 | transporter-chemical RO_0002020 PR_Q9NPD5, CHEBI_38545 91 22 | gene-protein RO_0002205 4836, PR_P30419 19527 23 | chemical-gobp RO_0002436 CHEBI_35299, GO_0048856 276381 24 | gobp-pathway RO_0009501 GO_0016567, R-HSA-8866654 672 25 | chemical-gocc RO_0002436 CHEBI_34568, GO_0005623 47920 26 | protein-pathway RO_0000056 PR_A0A075B6P5, R-HSA-109582 117707 27 | pathway-gomf RO_0000085 R-HSA-8876283, GO_0015370 2426 28 | chemical-adr RO_0003302 CHEBI_9648, HP_0003270 97720 29 | protein-gomf RO_0000085 PR_A0A024RBG1, GO_0003723 69816 30 | chemical-gene RO_0002434 CHEBI_28667, 348 16718 31 | rna-cell RO_0001025 ENST00000488147, CL_0000057 33152 32 | chemical-phenotype RO_0002610 CHEBI_34026, HP_0000137 81939 33 | protein-cofactor RO_0002436 PR_Q00266, CHEBI_18420 1998 34 | gene-pathway RO_0000056 1, R-HSA-109582 107009 35 | chemical-indication RO_0002606 CHEBI_157175, MONDO_0001080 6505 36 | pathway-gocc RO_0002180 R-HSA-166753, GO_0005576 16014 37 | gene-phenotype RO_0003302 1, HP_0002240 24695 38 | variant-disease RO_0003302 rs146691368, MONDO_0018477 69965 39 | chemical-substrate DIDEO_00000041 CHEBI_135737, PR_P33261 514 40 | chemical-gomf RO_0002436 CHEBI_34568, GO_0005488 28181 41 | -------------------------------------------------------------------------------- /resources/data/CHEMICAL_TRANSPORTER.tsv: 
-------------------------------------------------------------------------------- 1 | CHEBI_38545 PR_Q9NPD5 2 | CHEBI_8805 PR_P08183 3 | CHEBI_5050 PR_O15245 4 | CHEBI_70735 PR_P08183 5 | CHEBI_82960 PR_P08183 6 | CHEBI_32246 PR_P08183 7 | CHEBI_71200 PR_P08183 8 | CHEBI_63660 PR_Q8TCC7 9 | CHEBI_63632 PR_Q86VL8 10 | CHEBI_50659 PR_P08183 11 | CHEBI_5050 PR_P08183 12 | CHEBI_7772 PR_Q14242 13 | CHEBI_87681 PR_P08183 14 | CHEBI_68579 PR_Q9UNQ0 15 | CHEBI_48390 PR_P08183 16 | CHEBI_9139 PR_P08183 17 | CHEBI_49603 PR_P08183 18 | CHEBI_63632 PR_Q9UNQ0 19 | CHEBI_7772 PR_P08183 20 | CHEBI_135929 PR_P08183 21 | CHEBI_68579 PR_P08183 22 | CHEBI_9168 PR_Q9NPD5 23 | CHEBI_66876 PR_P08183 24 | CHEBI_68478 PR_P08183 25 | CHEBI_28445 PR_P08183 26 | CHEBI_63632 PR_P08183 27 | CHEBI_135967 PR_P08183 28 | CHEBI_72296 PR_P08183 29 | CHEBI_63608 PR_Q9Y6L6 30 | CHEBI_72297 PR_P08183 31 | CHEBI_9168 PR_P08183 32 | CHEBI_50122 PR_P08183 33 | CHEBI_367163 PR_P08183 34 | CHEBI_68610 PR_Q9NPD5 35 | CHEBI_40050 PR_P08183 36 | CHEBI_5050 PR_Q9NPD5 37 | CHEBI_71272 PR_P08183 38 | CHEBI_39112 PR_P08183 39 | CHEBI_7496 PR_Q14242 40 | CHEBI_68610 PR_P08183 41 | CHEBI_71940 PR_P08183 42 | CHEBI_32020 PR_Q9Y6L6 43 | CHEBI_135920 PR_P08183 44 | CHEBI_51141 PR_P08183 45 | CHEBI_38561 PR_P08183 46 | CHEBI_5050 PR_Q86VL8 47 | CHEBI_6541 PR_P08183 48 | CHEBI_5050 PR_Q96FL8 49 | CHEBI_68558 PR_P08183 50 | CHEBI_66910 PR_P08183 51 | CHEBI_52172 PR_Q9Y6L6 52 | CHEBI_9168 PR_Q9Y6L6 53 | CHEBI_8707 PR_P08183 54 | CHEBI_3558 PR_Q9Y6L6 55 | CHEBI_5050 PR_Q8TCC7 56 | CHEBI_63660 PR_Q9Y6L6 57 | CHEBI_367163 PR_Q9Y6L6 58 | CHEBI_135949 PR_P08183 59 | CHEBI_45409 PR_P08183 60 | CHEBI_63608 PR_P08183 61 | CHEBI_681850 PR_P08183 62 | CHEBI_38545 PR_Q9Y6L6 63 | CHEBI_66901 PR_P08183 64 | CHEBI_6931 PR_P08183 65 | CHEBI_72289 PR_P08183 66 | CHEBI_6375 PR_P08183 67 | CHEBI_52172 PR_Q9NPD5 68 | CHEBI_52172 PR_P08183 69 | CHEBI_8871 PR_P08183 70 | CHEBI_66910 PR_Q9NPD5 71 | CHEBI_68621 PR_P08183 72 | CHEBI_38561 
PR_Q9Y6L6 73 | CHEBI_3558 PR_Q9NPD5 74 | CHEBI_3558 PR_P08183 75 | CHEBI_135853 PR_P08183 76 | CHEBI_61049 PR_P08183 77 | CHEBI_72296 PR_Q9UNQ0 78 | CHEBI_32020 PR_P08183 79 | CHEBI_63660 PR_Q9UNQ0 80 | CHEBI_63632 PR_Q96FL8 81 | CHEBI_32020 PR_Q9NPD5 82 | CHEBI_31528 PR_P08183 83 | CHEBI_68610 PR_O15244 84 | CHEBI_66910 PR_Q9Y6L6 85 | CHEBI_68595 PR_P08183 86 | CHEBI_9434 PR_Q9NPD5 87 | CHEBI_63660 PR_Q9NPD5 88 | CHEBI_31547 PR_P08183 89 | CHEBI_82960 PR_Q4U2R8 90 | CHEBI_38561 PR_Q9NPD5 91 | CHEBI_38545 PR_P08183 92 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /resources/pmids/turmeric_pmid.txt: -------------------------------------------------------------------------------- 1 | 29857089 2 | 28667541 3 | 28104136 4 | 31231918 5 | 22725836 6 | 9619120 7 | 34620272 8 | 34600893 9 | 34065600 10 | 33800000 11 | 33733472 12 | 33677011 13 | 32961574 14 | 32719085 15 | 31628941 16 | 31533365 17 | 31450166 18 | 31059026 19 | 30778922 20 | 30551030 21 | 30334318 22 | 30276838 23 | 29241692 24 | 29242172 25 | 28904007 26 | 28734960 27 | 27908756 28 | 27039889 29 | 26794747 30 | 26672753 31 | 26609559 32 | 26511469 33 | 26393568 34 | 26102194 35 | 25891083 36 | 25893199 37 | 25632754 38 | 25594233 39 | 24608794 40 | 24510399 41 | 24229684 42 | 23881281 43 | 23848206 44 | 23829533 45 | 23132777 46 | 23064285 47 | 23023936 48 | 22822540 49 | 22725836 50 | 22483553 51 | 22300367 52 | 22181075 53 | 22179005 54 | 21527728 55 | 20977462 56 | 20521597 57 | 20484172 58 | 20403165 59 | 20148399 60 | 19967559 61 | 19886674 62 | 19879740 63 | 19715674 64 | 19653644 65 | 19275711 66 | 18619958 67 | 18385293 68 | 17433521 69 | 17270371 70 | 17178710 71 | 16928820 72 | 16819192 73 | 16458258 74 | 16397849 75 | 16081279 76 | 16021489 77 | 15649425 78 | 15476675 79 | 15280357 80 | 15090070 81 | 12841947 82 | 12235173 83 | 12105223 84 | 8510458 85 | 10780880 86 | 11105995 87 | 20821002 
88 | 7898128 89 | 1618892 90 | 15673996 91 | 12676044 92 | 1496714 93 | 11712783 94 | 9436613 95 | 8819298 96 | 7390600 97 | 9704820 98 | 2353930 99 | 12497104 100 | 17214610 101 | 16987575 102 | 2209081 103 | 2960490 104 | 6993103 105 | 7409877 106 | 9062750 107 | 18324353 108 | 10902065 109 | 726520 110 | 17117790 111 | 11815407 112 | 10762434 113 | 18798984 114 | 11363190 115 | 6759931 116 | 8844727 117 | 9782784 118 | 11795474 119 | 15810156 120 | 19803548 121 | 16327153 122 | 1943180 123 | 16286372 124 | 15863912 125 | 2435036 126 | 16112857 127 | 22982056 128 | 12151348 129 | 11231886 130 | 12022761 131 | 16504000 132 | 10889462 133 | 3623345 134 | 3526291 135 | 12616304 136 | 15713005 137 | 12368225 138 | 8520105 139 | 1579064 140 | 9378362 141 | 11485087 142 | 2345457 143 | 10559523 144 | 9733605 145 | 7757981 146 | 10102956 147 | 12495265 148 | 7423534 149 | 7342372 150 | 7845373 151 | 9651124 152 | 7621448 153 | 20471457 154 | 20345353 155 | 28145344 156 | 11857414 157 | 11448902 158 | 17569224 159 | 7553612 160 | 11745031 161 | 1291482 162 | 1394115 163 | 2515397 164 | 8720307 165 | 9120760 166 | 17044766 167 | 9563850 168 | 2699615 169 | 7651374 170 | 7432370 171 | 696348 172 | 17134862 173 | 17022948 174 | 12413724 175 | 8887459 176 | -------------------------------------------------------------------------------- /relation-extraction-scripts/NER_metamap.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Script to run NER on texts with MetaMap (installed locally) 3 | ''' 4 | from pymetamap import MetaMap 5 | import pickle 6 | import os 7 | import indra 8 | import re 9 | from indra.statements import stmts_from_json_file 10 | 11 | #create instance for metamap API, path from local file 12 | workingDir = os.getcwd() 13 | dir_out = workingDir + '/output_files/' 14 | reachDir = dir_out + 'reach_mapping_files_NER/' 15 | dir_log = workingDir + '/logs/' 16 | mm = MetaMap.get_instance('') 17 | 18 | with 
open(dir_out+'umls_dict_20221128.pickle', 'rb') as file_i: 19 | umls_dict = pickle.load(file_i) 20 | 21 | #make more sophisticated> currently takes the first MetaMap concept as highest score (same scores ignored) 22 | def extract_concepts_umls(entity, umls_count): 23 | #re.sub(r”\(|\)“, “”, text) 24 | entity = re.sub(r'\(|\)', '', entity) 25 | entity = re.sub(r'[^\x00-\x7F]+','', entity) 26 | text = [entity] 27 | #take the concept with highest score 28 | concepts,error = mm.extract_concepts(text) 29 | if concepts: 30 | concept = concepts[0] 31 | try: 32 | umls_dict[entity] = { 33 | 'cui': concept.cui, 34 | 'umls_term': concept.preferred_name, 35 | 'sem_type': concept.semtypes.strip('][').split(','), 36 | 'score': float(concept.score) 37 | } 38 | umls_count += 1 39 | except AttributeError: 40 | pass 41 | return umls_count 42 | 43 | if __name__ == '__main__': 44 | umls_count = 0 45 | reach_concepts = [] 46 | reach_files = os.listdir(reachDir) 47 | stmts = [] 48 | for file in reach_files: 49 | rpstmts = stmts_from_json_file(reachDir+file) 50 | stmts += rpstmts 51 | print(len(stmts)) 52 | for item in stmts: 53 | agents_list = item.agent_list() 54 | for agent in agents_list: 55 | if agent: 56 | if agent.db_refs: 57 | if 'TEXT' in agent.db_refs: 58 | reach_concepts.append(agent.db_refs['TEXT']) 59 | else: 60 | reach_concepts.extend(item.agent_list()) 61 | else: 62 | reach_concepts.extend(item.agent_list()) 63 | else: 64 | reach_concepts.extend(item.agent_list()) 65 | concepts = set(reach_concepts) 66 | #call function to extract concepts 67 | concepts_list = list(concepts) 68 | print(len(concepts_list)) 69 | index = 0 70 | for concept in concepts_list: 71 | if concept not in umls_dict: 72 | umls_count = extract_concepts_umls(str(concept), umls_count) 73 | index += 1 74 | if index%1000 == 0: 75 | print(index) 76 | total_count = len(concepts_list) 77 | 78 | #save dictionaries to pickle files 79 | with open(dir_out+'umls_dict_20230321.pickle', 'wb') as file_o: 80 | 
pickle.dump(umls_dict, file_o) 81 | with open(dir_log+'NER_log.txt', 'w') as file_log: 82 | file_log.write('Total concepts = '+str(total_count)) 83 | file_log.write('\nUMLS mapped concepts = '+str(umls_count)) -------------------------------------------------------------------------------- /resources/edge_source_list.txt: -------------------------------------------------------------------------------- 1 | chemical-disease, resources/edge_data/chemical-disease_CTD_chemicals_diseases.tsv 2 | chemical-gene, resources/edge_data/chemical-gene_CTD_chem_gene_ixns.tsv 3 | chemical-gobp, resources/edge_data/chemical-gobp_CTD_chem_go_enriched.tsv 4 | chemical-gocc, resources/edge_data/chemical-gocc_CTD_chem_go_enriched.tsv 5 | chemical-gomf, resources/edge_data/chemical-gomf_CTD_chem_go_enriched.tsv 6 | chemical-pathway, resources/edge_data/chemical-pathway_ChEBI2Reactome_All_Levels.txt 7 | chemical-protein, resources/edge_data/chemical-protein_CTD_chem_gene_ixns.tsv 8 | chemical-phenotype, resources/edge_data/chemical-disease_CTD_chemicals_diseases.tsv 9 | disease-phenotype, resources/edge_data/disease-phenotype_phenotype.hpoa 10 | gene-disease, resources/edge_data/gene-disease_curated_gene_disease_associations.tsv 11 | gene-gene, resources/edge_data/gene-gene_COMBINED.DEFAULT_NETWORKS.BP_COMBINING.txt 12 | gene-pathway, resources/edge_data/gene-pathway_CTD_genes_pathways.tsv 13 | gene-phenotype, resources/edge_data/gene-disease_curated_gene_disease_associations.tsv 14 | gene-protein, resources/processed_data/ENTREZ_GENE_PRO_ONTOLOGY_MAP.txt 15 | gene-rna, resources/processed_data/ENTREZ_GENE_ENSEMBL_TRANSCRIPT_MAP.txt 16 | gobp-pathway, resources/edge_data/gobp-pathway_gene_association.reactome 17 | pathway-gocc, resources/edge_data/pathway-gocc_gene_association.reactome 18 | pathway-gomf, resources/edge_data/pathway-gomf_gene_association.reactome 19 | protein-anatomy, resources/processed_data/HPA_GTEX_RNA_GENE_PROTEIN_EDGES.txt 20 | protein-catalyst, 
resources/processed_data/UNIPROT_PROTEIN_CATALYST.txt 21 | protein-cofactor, resources/processed_data/UNIPROT_PROTEIN_COFACTOR.txt 22 | protein-cell, resources/edge_data/protein-cell_HPA_GTEX_RNA_GENE_PROTEIN_EDGES.txt 23 | protein-gobp, resources/edge_data/protein-gobp_goa_human.gaf 24 | protein-gocc, resources/edge_data/protein-gocc_goa_human.gaf 25 | protein-gomf, resources/edge_data/protein-gomf_goa_human.gaf 26 | protein-pathway, resources/edge_data/protein-pathway_UniProt2Reactome_All_Levels.txt 27 | protein-protein, resources/edge_data/protein-protein_9606.protein.links.v11.0.txt 28 | rna-anatomy, resources/processed_data/HPA_GTEX_RNA_GENE_PROTEIN_EDGES.txt 29 | rna-cell, resources/processed_data/HPA_GTEX_RNA_GENE_PROTEIN_EDGES.txt 30 | rna-protein, resources/processed_data/ENSEMBL_TRANSCRIPT_PROTEIN_ONTOLOGY_MAP.txt 31 | variant-disease, resources/processed_data/CLINVAR_VARIANT_GENE_DISEASE_PHENOTYPE_EDGES.txt 32 | variant-gene, resources/processed_data/CLINVAR_VARIANT_GENE_DISEASE_PHENOTYPE_EDGES.txt 33 | variant-phenotype, resources/processed_data/CLINVAR_VARIANT_GENE_DISEASE_PHENOTYPE_EDGES.txt 34 | transporter-chemical, resources/processed_data/CHEMICAL_TRANSPORTER.tsv 35 | chemical-molecule, resources/processed_data/CHEMICAL_MOLECULE.tsv 36 | chemical-substrate, resources/processed_data/CHEMICAL_SUBSTRATE.tsv 37 | chemical-inhibitor, resources/processed_data/CHEMICAL_INHIBITOR.tsv 38 | chemical-indication, resources/processed_data/CHEMICAL_INDICATION.tsv 39 | chemical-adr, resources/processed_data/CHEMICAL_ADR.tsv -------------------------------------------------------------------------------- /resources/node_data/README.md: -------------------------------------------------------------------------------- 1 | *** 2 | ## Creating Instance Data Node Metadata 3 | *** 4 | *** 5 | 6 | **Wiki Page:** **[`Dependencies`](https://github.com/callahantiff/PheKnowLator/wiki/Dependencies#node-metadata)** 7 | **Jupyter Notebook:** 
**[`Data_Preparation.ipynb`](https://github.com/callahantiff/PheKnowLator/blob/master/notebooks/Data_Preparation.ipynb)** 8 | 9 | ___ 10 | 11 | **Purpose:** The knowledge graph can be built with or without the inclusion of node and relation metadata (i.e., 12 | labels, descriptions or definitions, and synonyms). If you'd like to create and use node metadata, please open the 13 | [`Data_Preparation.ipynb`](https://github.com/callahantiff/PheKnowLator/blob/master/notebooks/Data_Preparation.ipynb) Jupyter Notebook and run the code chunks listed under the **INSTANCE AND/OR SUBCLASS (NON-ONTOLOGY CLASS) METADATA** section. These code chunks should be run before the knowledge graph is constructed. For more details on what these data sources are and how they are created, please see the `node_data` [`README.md`](https://github.com/callahantiff/PheKnowLator/blob/master/resources/node_data/README.md). 14 | 15 | Example structure of the metadata dictionary is shown below: 16 | 17 | ```python 18 | { 19 | 'nodes': { 20 | 'http://www.ncbi.nlm.nih.gov/gene/1': { 21 | 'Label': 'A1BG', 22 | 'Description': "A1BG has locus group 'protein-coding' and is located on chromosome 19 (19q13.43).", 23 | 'Synonym': 'HYST2477alpha-1B-glycoprotein|HEL-S-163pA|ABG|A1B|GAB'} ... }, 24 | 'relations': { 25 | 'http://purl.obolibrary.org/obo/RO_0002533': { 26 | 'Label': 'sequence atomic unit', 27 | 'Description': 'Any individual unit of a collection of like units arranged in a linear order', 28 | 'Synonym': 'None'} ... } 29 | } 30 | ``` 31 | 32 |
33 | 34 | 🛑 *CONSTRAINTS* 🛑 35 | The algorithm makes the following assumptions: 36 | - If metadata is provided, only those edges whose nodes have metadata will be created; otherwise-valid edges whose nodes lack metadata will be discarded. 37 | - Metadata for all non-ontology nodes and all relations for edges added to the core set of ontologies will be saved as a dictionary in the `./resources/node_data/node_metadata_dict.pkl` file. 38 | - For each identifier we try to obtain the following metadata: `Label`, `Description`, and `Synonym`. An example of these data types is shown below for a [`gene`](https://github.com/callahantiff/PheKnowLator/wiki/v2-Data-Sources#ncbi-gene) identifier `5620`: 39 | 40 | | **Metadata Type** | **Definition** | **Metadata** | 41 | | :---: | :--- | :--- | 42 | | ID | Node identifiers for instance data sources | `5620` | 43 | | Label | The primary label or name for the node | `LANCL2` | 44 | | Description | A definition or other useful details about the node | `Lanc Like 2` is a `protein-coding` gene that is located on chromosome `7` (map_location: `7p11.2`) | 45 | | Synonym | Alternative terms used for a node | `GPR69B`, `TASP`, `lanC-like protein 2`, `G protein-coupled receptor 69B`, `LanC (bacterial lantibiotic synthetase component C)-like 2`, `LanC lantibiotic synthetase component C-like 2`, `testis-specific adriamycin sensitivity protein` | 46 | 47 |
48 | 49 | #### Metadata + PheKnowLator 50 | *** 51 | The metadata will be used to create the following edges in the knowledge graph: 52 | - **Label** ➞ node `rdfs:label` 53 | - **Description** ➞ node `obo:IAO_0000115` description 54 | - **Synonyms** ➞ node `oboInOwl:hasExactSynonym` synonym 55 | -------------------------------------------------------------------------------- /resources/edge_data/README.md: -------------------------------------------------------------------------------- 1 | *** 2 | ## Constructing Edge Lists 3 | *** 4 | *** 5 | 6 | **Wiki Page:** **[`Data Sources`](https://github.com/callahantiff/PheKnowLator/wiki/v2-Data-Sources#data-sources)** 7 | **Jupyter Notebook:** **[`Data_Preparation.ipynb`](https://github.com/callahantiff/PheKnowLator/blob/master/notebooks/Data_Preparation.ipynb)** 8 | 9 | ___ 10 | 11 | ### Purpose 12 | The first step in constructing a knowledge graph is to build edge lists. In the current build of PheKnowLator (**[`v2.0.0`](https://github.com/callahantiff/PheKnowLator/wiki/v2.0.0)**), this requires downloading and using several sources of linked open data (see Wiki page referenced above for additional information on each source). 13 | 14 |
15 | 16 | _OUTPUT:_ Running this step will write a `json` file to `resources/Master_Edge_List_Dict.json`. The structure of this file is shown below: 17 | 18 | ```python 19 | master_edges = {'chemical-disease' : 20 | {'source_labels' : ';MESH_;', 21 | 'data_type' : 'class-class', 22 | 'edge_relation' : 'RO_0002606', 23 | 'uri' : ('http://purl.obolibrary.org/obo/', 24 | 'http://purl.obolibrary.org/obo/'), 25 | 'delimiter' : '#', 26 | 'column_idx' : '1;4', 27 | 'identifier_maps' : '0:./MESH_CHEBI_MAP.txt;1:disease-dbxref-map', 28 | 'evidence_criteria': "5;!=;' ", 29 | 'filter_criteria' : 'None', 30 | 'edge_list' : [["CHEBI_81395", "DOID_12858"], 31 | ["CHEBI_81395", "DOID_0090103"], ..., 32 | ["CHEBI_81395", "DOID_0090104"]]}} 33 | ``` 34 | 35 |
36 | 37 | 🛑 *ASSUMPTIONS* 🛑 38 | **The algorithm makes the following assumptions:** 39 | - All downloaded data sources are listed, with metadata, in the [`edge_source_metadata.txt`](https://github.com/callahantiff/PheKnowLator/blob/master/resources/edge_data/edge_source_metadata.txt) document. 40 | - Any data preprocessing, including the development of identifier mapping and evidence/filtering data, has been 41 | completed prior to building the edge lists. A Jupyter Notebook containing examples of different preprocessing 42 | steps can be found [`here`](https://github.com/callahantiff/PheKnowLator/blob/master/notebooks/Data_Preparation.ipynb). 43 | 44 |
45 | 46 | *** 47 | 48 | 71 | -------------------------------------------------------------------------------- /resources/pmids/blackpepper_pmid.txt: -------------------------------------------------------------------------------- 1 | 9619120 2 | 22725836 3 | 27776366 4 | 19283724 5 | 17963429 6 | 20642555 7 | 27670974 8 | 25532954 9 | 508556 10 | 34092198 11 | 35402752 12 | 33897893 13 | 33378196 14 | 33434916 15 | 36043771 16 | 33944662 17 | 31748224 18 | 36299167 19 | 32633857 20 | 33514009 21 | 33919582 22 | 30909366 23 | 28801675 24 | 34997607 25 | 34996326 26 | 27776366 27 | 29414892 28 | 23589122 29 | 35168384 30 | 29878843 31 | 27862930 32 | 27288758 33 | 29779481 34 | 24657329 35 | 29032766 36 | 33578817 37 | 19703367 38 | 27670974 39 | 21333639 40 | 16414224 41 | 22736065 42 | 36290641 43 | 33682565 44 | 24690772 45 | 31340396 46 | 21139236 47 | 27981349 48 | 36208657 49 | 31539794 50 | 16360935 51 | 36341917 52 | 29652199 53 | 30668388 54 | 35667195 55 | 23208983 56 | 24786847 57 | 12130727 58 | 34180761 59 | 21811923 60 | 32370771 61 | 34176823 62 | 24575896 63 | 34125278 64 | 23973509 65 | 28459658 66 | 33363626 67 | 30181518 68 | 29498663 69 | 35883026 70 | 31711793 71 | 27821437 72 | 29133241 73 | 18417181 74 | 30362574 75 | 6800071 76 | 25645812 77 | 18611395 78 | 28947984 79 | 21796656 80 | 26730517 81 | 29502118 82 | 26593909 83 | 33921897 84 | 18480186 85 | 24992195 86 | 30528673 87 | 33793797 88 | 32926628 89 | 31047869 90 | 21802927 91 | 27052193 92 | 30105488 93 | 22725836 94 | 16808005 95 | 26160506 96 | 29227934 97 | 27457692 98 | 31731718 99 | 31033966 100 | 31213294 101 | 26036652 102 | 22902327 103 | 30240490 104 | 30029912 105 | 34098743 106 | 28396565 107 | 27448228 108 | 23707768 109 | 22864626 110 | 24014108 111 | 26548081 112 | 28687052 113 | 20736323 114 | 27111639 115 | 27889497 116 | 18332082 117 | 28123572 118 | 23911889 119 | 28971602 120 | 12110320 121 | 29455730 122 | 17487234 123 | 27182646 124 | 8511777 125 | 20492299 126 | 
31201588 127 | 23156991 128 | 23623790 129 | 29683271 130 | 26297981 131 | 1889377 132 | 22822540 133 | 22927137 134 | 30391161 135 | 3917507 136 | 24432371 137 | 15724447 138 | 33174133 139 | 20118549 140 | 12164868 141 | 24820273 142 | 19653312 143 | 8890298 144 | 26031900 145 | 8561796 146 | 30475855 147 | 29423050 148 | 20460777 149 | 30575945 150 | 3080587 151 | 21883938 152 | 10968285 153 | 15183854 154 | 28117414 155 | 23771746 156 | 23537660 157 | 9421252 158 | 17408604 159 | 26439699 160 | 6799642 161 | 8872360 162 | 22878261 163 | 24688365 164 | 18619980 165 | 21562741 166 | 27018328 167 | 28772002 168 | 36658705 169 | 18568022 170 | 24707867 171 | 8634279 172 | 25505573 173 | 21842308 174 | 24398010 175 | 24905252 176 | 8347144 177 | 26142526 178 | 26598901 179 | 16436284 180 | 21787607 181 | 10837555 182 | 25053555 183 | 24705672 184 | 8571358 185 | 25835611 186 | 36493999 187 | 26232071 188 | 32334067 189 | 12806184 190 | 20525733 191 | 29879347 192 | 31654381 193 | 16222444 194 | 32262777 195 | 3564048 196 | 30739353 197 | 7771106 198 | 12473382 199 | 28584158 200 | 7571719 201 | 33280383 202 | 9879661 203 | 20381363 204 | 20642555 205 | 9616184 206 | 11882914 207 | 17156774 208 | 31624335 209 | 18317798 210 | 27671914 211 | 34763313 212 | 19903834 213 | 23750076 214 | 18477507 215 | 23909733 216 | 27882569 217 | 26335572 218 | 1981722 219 | 31321577 220 | 818382 221 | 16243320 222 | 15010263 223 | 8206433 224 | 10930696 225 | 26099455 226 | 20017731 227 | 6425870 228 | 25907981 229 | 31366578 230 | 6438924 231 | 30009814 232 | 6816562 233 | 17880178 234 | 28739698 235 | 20085307 236 | 24122170 237 | 21680766 238 | 18509672 239 | 28736128 240 | 18848780 241 | 20926620 242 | 18524600 243 | 26223128 244 | 25174113 245 | 7794964 246 | 30121230 247 | 16696571 248 | 12686496 249 | 20055162 250 | 17204498 251 | 18334493 252 | 32929340 253 | 19523722 254 | 22073884 255 | 20576527 256 | 17266134 257 | 11999755 258 | 7574722 259 | 15588619 260 | 15377639 261 | 
2120856 262 | 6426176 263 | 6422936 -------------------------------------------------------------------------------- /resources/pmids/licorice_pmid.txt: -------------------------------------------------------------------------------- 1 | 8583374 2 | 22983284 3 | 31721714 4 | 31325600 5 | 32978039 6 | 31368836 7 | 25825801 8 | 24882402 9 | 30741588 10 | 32298152 11 | 30582378 12 | 24261979 13 | 32347726 14 | 31807838 15 | 28774812 16 | 30317430 17 | 30733510 18 | 30158992 19 | 31308447 20 | 28754329 21 | 15802882 22 | 31151274 23 | 24939038 24 | 24201019 25 | 28631076 26 | 21462897 27 | 26285764 28 | 33481180 29 | 29099639 30 | 30636046 31 | 32147624 32 | 16204965 33 | 25557030 34 | 31429612 35 | 22980806 36 | 30215539 37 | 29498478 38 | 32698076 39 | 26100226 40 | 19543501 41 | 26552146 42 | 22969825 43 | 28813533 44 | 30594244 45 | 24824478 46 | 29357733 47 | 26068524 48 | 21351298 49 | 23396419 50 | 33312341 51 | 24670676 52 | 23148031 53 | 32158189 54 | 30287522 55 | 34183756 56 | 33287126 57 | 16476124 58 | 20350051 59 | 25189890 60 | 30348944 61 | 25202272 62 | 28627473 63 | 18778682 64 | 22792804 65 | 19173278 66 | 29971002 67 | 29908072 68 | 30668338 69 | 21800547 70 | 26750984 71 | 28668488 72 | 29433959 73 | 30724689 74 | 25614104 75 | 25600891 76 | 31911178 77 | 21818843 78 | 22543233 79 | 23462213 80 | 33439252 81 | 25094029 82 | 24254844 83 | 19952421 84 | 27693954 85 | 28165501 86 | 29997360 87 | 12767692 88 | 9464470 89 | 29891024 90 | 28745128 91 | 29150398 92 | 28115129 93 | 16210916 94 | 17220245 95 | 26153439 96 | 31278631 97 | 27815731 98 | 22741471 99 | 23457494 100 | 27507204 101 | 19571434 102 | 23707333 103 | 10442216 104 | 31884095 105 | 20045984 106 | 9790908 107 | 22983284 108 | 22543032 109 | 24486211 110 | 23189719 111 | 17551811 112 | 26965985 113 | 16952351 114 | 9140223 115 | 28630457 116 | 31182998 117 | 31120605 118 | 23519262 119 | 23065713 120 | 27238153 121 | 21351573 122 | 26634613 123 | 25110319 124 | 21355220 125 | 21796703 
126 | 17982673 127 | 34072092 128 | 12019199 129 | 20045987 130 | 21210738 131 | 24497736 132 | 26987268 133 | 23069764 134 | 33924458 135 | 23835906 136 | 29929431 137 | 28775294 138 | 25951662 139 | 20393696 140 | 18451504 141 | 29926584 142 | 12794307 143 | 18362159 144 | 19915794 145 | 10530767 146 | 12207646 147 | 17434990 148 | 12064912 149 | 20597806 150 | 21691759 151 | 25948710 152 | 19831503 153 | 17107662 154 | 26134484 155 | 10969720 156 | 19065667 157 | 10557230 158 | 19149264 159 | 18761400 160 | 27747446 161 | 27919190 162 | 25450236 163 | 34302803 164 | 20642450 165 | 12220964 166 | 18462715 167 | 27686831 168 | 12584850 169 | 22508486 170 | 32292023 171 | 23506993 172 | 27051341 173 | 12943171 174 | 14617078 175 | 22117525 176 | 25660335 177 | 27220746 178 | 16797510 179 | 17030102 180 | 34386321 181 | 34601070 182 | 23975867 183 | 25871879 184 | 11922733 185 | 30428619 186 | 23165347 187 | 19735171 188 | 23265493 189 | 27785943 190 | 15809082 191 | 21110398 192 | 34233566 193 | 16459097 194 | 8505019 195 | 22982774 196 | 12721381 197 | 27152350 198 | 24123597 199 | 25017733 200 | 19252289 201 | 9708921 202 | 8589765 203 | 20647377 204 | 23843229 205 | 17213730 206 | 26297122 207 | 29452068 208 | 16332809 209 | 10192774 210 | 18555495 211 | 10540749 212 | 19726295 213 | 26805419 214 | 30160312 215 | 10079512 216 | 19559778 217 | 26003723 218 | 22796279 219 | 24941800 220 | 19083468 221 | 9568287 222 | 8070385 223 | 29380109 224 | 28301718 225 | 18462716 226 | 15635178 227 | 11773526 228 | 10821120 229 | 20132953 230 | 23923606 231 | 19098391 232 | 19084577 233 | 16773639 234 | 2433549 235 | 10635120 236 | 2541725 237 | 12969435 238 | 12921627 239 | 11584810 240 | 11798659 241 | 9635141 242 | 1513071 243 | 2251786 244 | 10972957 245 | 10474021 246 | 3014078 247 | 11036340 248 | 9772691 249 | 9459201 250 | 8413772 251 | 6099454 252 | 18047011 253 | 33824106 254 | 34662257 255 | 33404688 256 | 30424926 257 | 10950855 258 | 10445386 259 | 9849639 260 | 
10408227 261 | 10548450 262 | 9645391 263 | 9845924 264 | 10456689 265 | 18628584 266 | 6357769 267 | 12566654 -------------------------------------------------------------------------------- /relation-extraction-scripts/indraREACH_extract.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Script to use REACH from INDRA interface to extract statements from the full text of articles given a list of PMIDs. 3 | This script uses process_text function from the REACH API after extracting the full text instead of the INDRA pmc_client. 4 | Used by the machineReadMain script to read with REACH (not an independent script - see indraREACH_new for that version) 5 | Author: Sanya B. Taneja 6 | Date: 05-28-2021 7 | ''' 8 | import os, sys 9 | import indra.literature.pmc_client as pmc_client 10 | import indra.literature.pubmed_client as pubmed_client 11 | from indra.sources import reach 12 | from indra.ontology.bio import bio_ontology 13 | import indra.tools.assemble_corpus as ac 14 | import pickle 15 | import xml.etree.ElementTree as ET 16 | import time 17 | 18 | #True if running assembly pipeline on individual papers 19 | run_assembly = True 20 | 21 | #function to extract reach statements from server running through local server 22 | def process_with_reach(text, pmid, output_dir): 23 | 24 | outFname = output_dir + pmid + '_reach.json' 25 | file_o = open(output_dir + pmid + '_reach_statements.txt', 'w') 26 | rp = reach.process_text(text, citation=pmid, output_fname=outFname, url='http://localhost:8000/api/text') 27 | print('Saving raw REACH output to file: ' + outFname) 28 | if rp is not None: 29 | for stmt in rp.statements: 30 | file_o.write('\n'+ str(stmt)) 31 | return rp.statements 32 | else: 33 | return None 34 | 35 | def process_with_reach_nxml(pmid, output_dir_xml, output_dir): 36 | nxml_file = output_dir_xml+pmid+'.nxml' 37 | outFname = output_dir + pmid + '_reach.json' 38 | file_o = open(output_dir + pmid + 
'_reach_statements.txt', 'w') 39 | try: 40 | rp = reach.process_nxml_file(nxml_file, citation=pmid, url='http://localhost:8000/api/uploadFile', output_fname=outFname) 41 | except Exception as e: 42 | print(e) 43 | return None 44 | print('Saving raw REACH output to file: ' + outFname) 45 | if rp is not None: 46 | for stmt in rp.statements: 47 | file_o.write('\n'+ str(stmt)) 48 | return rp.statements 49 | else: 50 | return None 51 | 52 | 53 | #get xml string of article using indra.pmc_client from the PMCID 54 | def get_xml_from_pmcid(pmcid, pmid, dirOut_xml, dirOut_txt): 55 | try: 56 | xml_str = pmc_client.get_xml(pmcid) 57 | except Exception as e: 58 | print('Cannot extract full text from PMCID with PMID: ', pmid) 59 | print(e) 60 | if xml_str is None: 61 | return None 62 | fname = dirOut_xml + pmid + '.nxml' 63 | with open(fname, 'wb') as fh: 64 | fh.write(xml_str.encode('utf-8')) 65 | return xml_str 66 | 67 | #extract plaintext from the xml string of article and save in separate text file (only if save_full_text=True) 68 | def get_text_from_xml(pmid, xml_data, dirOut_xml, dirOut_txt): 69 | fname = dirOut_txt + pmid + '.txt' 70 | if xml_data is None: 71 | print('XML is empty') 72 | return None 73 | try: 74 | content_str = pmc_client.extract_text(xml_data) 75 | except Exception as e: 76 | print(e) 77 | print('XML error for pmid: ', pmid) 78 | content_str = None 79 | if content_str is None: 80 | return None 81 | else: 82 | print('Saving full text of XML file') 83 | with open(fname, 'w') as file_o: 84 | file_o.write(content_str) 85 | return content_str 86 | 87 | ''' 88 | Steps to be modified based on use case and required output and processing 89 | ''' 90 | def run_assembly_pipeline(statements): 91 | statements = ac.filter_no_hypothesis(statements) # Filter out hypothetical statements 92 | statements = ac.map_grounding(statements, gilda_mode='local') # Map grounding 93 | statements = ac.run_preassembly(statements, 94 | return_toplevel=False, 95 | 
ontology=bio_ontology, 96 | normalize_equivalences=True, # Optional: rewrite equivalent groundings to one standard 97 | normalize_opposites=True, # Optional: rewrite opposite groundings to one standard 98 | normalize_ns='OBO') # WM = world modelers, OBO = Bio_Ontology 99 | return statements 100 | 101 | def get_text_from_pmid(pmid, dirOut_xml, dirOut_txt): 102 | xml_str = None 103 | ids = '' 104 | try: 105 | ids = pmc_client.id_lookup(pmid, idtype='pmid') 106 | except Exception as e: 107 | time.sleep(5) 108 | ids = pmc_client.id_lookup(pmid, idtype='pmid') 109 | if 'pmcid' not in ids: 110 | return None 111 | pmcid = ids['pmcid'] 112 | if pmcid is not None: 113 | xml_str = get_xml_from_pmcid(pmcid, pmid, dirOut_xml, dirOut_txt) 114 | if xml_str is not None: 115 | content = get_text_from_xml(pmid, xml_str, dirOut_xml, dirOut_txt) 116 | return content 117 | if pmcid is None or xml_str is None: 118 | return None 119 | return None 120 | -------------------------------------------------------------------------------- /relation-extraction-scripts/pdf_to_text.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Script to convert PDF to plain text using pdfminer. 3 | Adapted from Linh Huang notebook - https://github.com/infoqualitylab/DDI_Evidence_Classification/blob/b515b659ac7ad0c695dee9e36d63c9ed5777b7ed/Scripts/Preprocess_fulltext_papers.ipynb. 4 | Can be run independently or with machineReadMain.py 5 | Author: Sanya B. 
def read_PDF_file(filename, filename_out):
    """Extract the text of a PDF with pdfminer and write it out as UTF-8.

    Every ``LTTextBox``/``LTTextLine`` layout object on every page contributes
    a single space followed by its text.

    Args:
        filename: Path of the PDF to read.
        filename_out: Path of the text file to create.

    Returns:
        True when the text file was written, False when anything failed
        (the error is printed rather than raised).
    """
    try:
        text_parts = []
        # The context manager closes the PDF handle even if parsing fails.
        with open(filename, 'rb') as fp:
            # Create a PDF parser object associated with the file object.
            parser = PDFParser(fp)
            # Create a PDF document object that stores the document structure.
            document = PDFDocument(parser)
            # Check if the document allows text extraction. If not, abort.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            # Resource manager, layout parameters and page aggregator.
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in PDFPage.create_pages(document):
                interpreter.process_page(page)
                layout = device.get_result()
                for lt_obj in layout:
                    if isinstance(lt_obj, (LTTextBox, LTTextLine)):
                        # Collect pieces and join once instead of repeatedly
                        # re-allocating one growing string.
                        text_parts.append(" ")
                        text_parts.append(lt_obj.get_text())

        with open(filename_out, "wb") as txt_file:
            txt_file.write("".join(text_parts).encode("utf-8"))
        return True
    except Exception as e:
        print('Unable to process PDF file')
        print(e)
        return False


def process_PDF_file(file, outputFile):
    """Keep only the body sections of an extracted article and tag headers.

    Recording starts at the first abstract/introduction/background header and
    pauses at acknowledgement/reference-style headers. Section headers inside
    the recorded region are written as ``<HEADER>`` tags (upper-cased original
    line); all other recorded lines are copied. The result is written as one
    space-joined line with ``"- "`` sequences removed.

    Args:
        file: Path of the plain-text article; must end in ``.txt`` and must
            not contain ``processed``.
        outputFile: Path of the processed text file to create.

    Returns:
        True when the processed file was written, False otherwise (the error
        is printed rather than raised).
    """
    start_pattern = ["Abstract", "ABSTRACT", "INTRODUCTION", "Introduction", "BACKGROUND", "Background"]
    stop_pattern = ["Acknowledgement", "ACKNOWLEDGEMENT", "REFERENCES", "References", "Supplementary Material", "Conflict of Interest statement"]
    alternate_start_pattern = ["METHODS", "Methods","Subjects and methods","Subjects and Methods", "MATERIALS AND METHODS","Materials and methods",
                "METHODS AND MATERIALS","PATIENTS AND METHODS", "PARTICIPANTS AND METHODS", "SUBJECTS AND METHODS", "Materials and Methods"]
    headers_last = ['Result', 'Results', 'RESULTS', 'RESULT', 'Discussion', 'DISCUSSION', 'CONCLUSION', 'Conclusion', 'CONCLUSIONS', 'Conclusions']
    try:
        if not (file.endswith(".txt") and 'processed' not in file):
            # Say why nothing is produced instead of failing later on an
            # undefined variable.
            print('Unable to process text file from PDF')
            print('Unsupported input file name: ' + str(file))
            return False

        recording = False
        output_section = []
        # read_PDF_file writes UTF-8, so read it back as UTF-8 explicitly.
        with open(file, 'r', encoding='utf-8') as article_plaintext_file:
            for line in article_plaintext_file:
                line = line.replace("\n", "")
                # Header detection ignores numbering and punctuation
                # ("1. Introduction" is compared as "Introduction").
                line_sans_num = re.sub(r'[^A-Za-z ]+', '', line).strip()
                if not recording:
                    if line_sans_num in start_pattern:
                        print(line)
                        recording = True
                        output_section.append('<'+line.strip().upper()+'>')
                elif line_sans_num in stop_pattern:
                    print(line)
                    recording = False
                elif line_sans_num in alternate_start_pattern:
                    # NOTE(review): an empty string is emitted for methods
                    # headers; this may originally have been a section tag
                    # lost from this copy of the source - confirm upstream.
                    output_section.append('')
                elif line_sans_num in start_pattern or line_sans_num in headers_last:
                    output_section.append('<'+line.strip().upper()+'>')
                else:
                    output_section.append(line.strip())

        #Save the text into file
        with open(outputFile, 'w', encoding='utf-8') as outfile:
            for line in output_section:
                line = line.strip()
                line = line.replace("\n", " ")
                line = line.replace("- ", "")
                outfile.write(line + " ")
        return True
    except Exception as e:
        print('Unable to process text file from PDF')
        print(e)
        return False
workingDir = os.getcwd()
log_dir = workingDir + '/logs/'

# Natural products to process; each name selects output_files/<name>/semrepOutput/.
np = []

extraction = True

count_dict = {
    'n_total_pmid': 0,
    'n_success': 0,
    'n_error': 0,
    'n_statements':0,
    'n_files_processed': 0,
    'n_pdf': 0
}

# Caches of esummary results keyed by PMID, so each article is fetched once.
pub_year_to_pmid_map = {}
pub_type_to_pmid_map = {}

# NOTE(review): every tag is an empty string in this copy of the file, so every
# line "matches" and source_section is always ''. The real values were probably
# angle-bracket section tags that were stripped - confirm against upstream.
section_tags = ['', '', '', '', '', '', '',
                '', '']

def get_publication_year_and_type(pmid):
    """Look up the publication date and publication types of a PMID.

    Results are cached in ``pub_year_to_pmid_map`` / ``pub_type_to_pmid_map``.
    Uncached PMIDs are fetched from the NCBI esummary endpoint after a
    5 second pause, with one retry on HTTP 429.

    Args:
        pmid: PubMed identifier as a string ('' is answered without a request).

    Returns:
        ``(pub_year, pub_type)`` where ``pub_year`` is the record's ``pubdate``
        value and ``pub_type`` is ``str()`` of its ``pubtype`` value; both are
        '' when the information could not be obtained.
    """
    pub_year = ''
    pub_type = ''
    if pmid == '':
        return pub_year, pub_type
    if pmid in pub_year_to_pmid_map and pmid in pub_type_to_pmid_map:
        # An empty cached year counts as "not fetched yet".
        if pub_year_to_pmid_map[pmid] != '':
            return pub_year_to_pmid_map[pmid], str(pub_type_to_pmid_map[pmid])

    # Pause before every request to stay within the service's rate limits.
    time.sleep(5)
    uri = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id="+pmid+"&retmode=json"
    try:
        response = requests.get(uri)
        if response.status_code == 429:
            time.sleep(5)
            response = requests.get(uri)
        if response.status_code == 200:
            # esummary may answer 200 without a usable record for this PMID;
            # fall back to blanks instead of raising KeyError.
            summary = response.json().get('result', {}).get(pmid)
            if isinstance(summary, dict):
                pub_year = summary.get('pubdate', '')
                pub_type = summary.get('pubtype', '')
                pub_year_to_pmid_map[pmid] = pub_year
                pub_type_to_pmid_map[pmid] = pub_type
    except (requests.RequestException, ValueError) as e:
        # Network failure or a non-JSON body: report and return blanks so one
        # bad lookup does not abort the whole extraction run.
        print('Unable to fetch publication details for PMID: ' + pmid)
        print(e)
    return pub_year, str(pub_type)

def semrep_extract(filepath):
    """Collect SemRep predications from every file in a directory.

    Each file is named ``<pmid>.<ext>``. Lines containing ``|relation|`` are
    predications; each is paired with the most recent preceding non-empty,
    non-relation line as its source sentence, and with the most recent section
    tag seen. Relation lines with fewer than 12 ``|``-separated fields are
    skipped.

    Args:
        filepath: Directory holding the SemRep output files (with or without a
            trailing path separator).

    Returns:
        dict of equally long column lists: index, pmid, relation, year,
        subject_cui, object_cui, subject_name, object_name, subject_type,
        object_type, sentence, source_section, pub_type.

    Side effects:
        Adds the number of collected predications to
        ``count_dict['n_statements']`` and prints each file name.
    """
    result_dict = {
        'index': [],
        'pmid': [],
        'relation': [],
        'year': [],
        'subject_cui': [],
        'object_cui': [],
        'subject_name': [],
        'object_name': [],
        'subject_type': [],
        'object_type': [],
        'sentence': [],
        'source_section': [],
        'pub_type': []
    }
    index = 0
    # Sorted so that row order and index values are reproducible.
    for file in sorted(os.listdir(filepath)):
        print(file)
        pmid = file.split('.')[0]

        # (relation line, source sentence, section) kept together so that a
        # repeated relation line keeps its own sentence.
        relations = []
        last_non_empty = ''
        section_match = ''
        with open(os.path.join(filepath, file), 'r', errors='ignore') as file_sem:
            for item in file_sem:
                tag = next((sec for sec in section_tags if sec in item), None)
                if tag is not None:
                    #assign section
                    section_match = tag
                if '|relation|' in item:
                    relations.append((item, last_non_empty, section_match))
                elif item == '\n' or item == '':
                    continue
                else:
                    last_non_empty = item

        pub_info = None
        for rel, sentence, section in relations:
            fields = rel.split('|')
            # Fields up to index 11 are read below.
            if len(fields) < 12:
                continue
            if pub_info is None:
                # One lookup per file, and only if it has a usable relation.
                pub_info = get_publication_year_and_type(pmid)
            pub_year, pub_type = pub_info
            result_dict['index'].append(index)
            result_dict['pmid'].append(pmid)
            result_dict['subject_cui'].append(fields[2])
            result_dict['object_cui'].append(fields[9])
            result_dict['subject_name'].append(fields[3])
            result_dict['object_name'].append(fields[10])
            result_dict['relation'].append(fields[8])
            result_dict['subject_type'].append(fields[4])
            result_dict['object_type'].append(fields[11])
            result_dict['year'].append(pub_year)
            result_dict['pub_type'].append(pub_type)
            result_dict['sentence'].append(sentence)
            result_dict['source_section'].append(section)
            index += 1
            count_dict['n_statements'] += 1

    return result_dict

if __name__ == '__main__':

    for item in np:
        print('Processing ', item)
        outputDir = workingDir + '/output_files/'+item+'/semrepOutput/'
        result_dict = semrep_extract(outputDir)

        semrep_result = pd.DataFrame(result_dict)
        # Rows differing only in index/pmid are treated as duplicates.
        semrep_result_unique = semrep_result.drop_duplicates(subset=['subject_cui', 'subject_name', 'subject_type',
            'relation', 'object_cui', 'object_name', 'object_type', 'year', 'sentence', 'source_section', 'pub_type'])
        semrep_result_unique.to_csv(workingDir+'/output_files/'+item+'/' +item+'_pmid_all_predicates_semrep-extract.tsv', sep='\t', index=False,
            columns=['index', 'pmid', 'subject_cui', 'subject_name', 'subject_type',
            'relation', 'object_cui', 'object_name', 'object_type', 'year', 'sentence', 'source_section', 'pub_type'])
working_dir = os.getcwd()
DIR_IN = working_dir + '/input_files/'
DIR_OUT = working_dir + '/output_files/'
DIR_LOG = working_dir + '/logs/'

# When True, timing and count statistics are appended to each log file.
logging = True
#True if running assembly pipeline on individual papers
run_assembly = True

#start_line = int(sys.argv[1])
#end_line = int(sys.argv[2])

NP = ['cranberry', 'fenugreek', 'flaxseed', 'ginger', 'ginkgo', 'goldenseal', 'guarana', 'horsechestnut',
'licorice', 'oregano', 'scrubpalmetto', 'valerian', 'ashwaganda', 'blackpepper', 'blackcohosh', 'cannabis', 'echinacea', 'feverfew', 'garlic', 'grapefruit', 'milkthistle',
'panaxginseng', 'rhodiola', 'rosemary', 'soybean', 'hawthorn']

# PMIDs already handled in this run. Shared across natural products, so a paper
# listed under several products is only processed for the first one.
pmid_list = []

if __name__ == '__main__':
    os.makedirs(DIR_LOG, exist_ok=True)
    for item in NP:
        t0 = datetime.now()
        input_dir = DIR_IN + item +'/'
        input_dir_pdf = DIR_IN + item + '/FullTextPDFs/'
        output_dir = DIR_OUT + item + '/reachOutput/'
        output_dir_pdf = DIR_OUT + item + '/PDFoutput/'
        output_dir_xml = DIR_OUT + item + '/xmlTexts/'
        # Make sure every directory written to below exists.
        for out_dir in (output_dir, output_dir_pdf, output_dir_xml):
            os.makedirs(out_dir, exist_ok=True)
        file_i = input_dir + item + '_pmid.txt'

        # The context manager guarantees the log is flushed and closed even
        # when processing one natural product fails part-way.
        with open(DIR_LOG + item + str(t0) + '_log.txt', 'w') as log_file:
            log_file.write('Log for '+ item)
            with open(file_i, 'r') as file_input:
                pmids = file_input.readlines()
            count_dict = {
                'n_pmid': 0,
                'n_pdf': 0,
                'n_output_reach': 0,
                'n_statements': 0,
                'n_error': 0
            }
            stmts = []
            for line in pmids:
                pmid = line.strip()
                # Skip blank lines (PMID files may end with empty lines) and
                # PMIDs already processed in this run.
                if not pmid or pmid in pmid_list:
                    continue
                pmid_list.append(pmid)
                count_dict['n_pmid'] += 1
                log_file.write('\n\nPMID: '+pmid)

                #to track if pdf extract required for PMID; reset for every
                #PMID so one PMC success does not affect later papers
                pdf_flag = True

                #check for full text from PMC and get text
                text = get_text_from_pmid(pmid, output_dir_xml, output_dir)
                if text is not None:
                    #full text available in PMC: try the NXML first, then the plain text
                    pmc_statements = process_with_reach_nxml(pmid, output_dir_xml, output_dir)
                    pmc_label = 'PMC'
                    if pmc_statements is None:
                        pmc_statements = process_with_reach(text, pmid, output_dir)
                        pmc_label = 'PMC-text'
                    if pmc_statements is not None:
                        pdf_flag = False
                        count_dict['n_output_reach'] += 1
                        log_file.write('\nNumber of statements ('+pmc_label+'): '+str(len(pmc_statements)))
                        stmts += pmc_statements

                if pdf_flag:
                    #no usable PMC output: check for PDF file and get text
                    pdf_in_file = input_dir_pdf+pmid+'.pdf'
                    if os.path.isfile(pdf_in_file):
                        count_dict['n_pdf'] += 1
                        pdf_out_file = output_dir_pdf+pmid+'.txt'
                        pdf_txt_file = output_dir_pdf+pmid+'_processed.txt'
                        if read_PDF_file(pdf_in_file, pdf_out_file) and process_PDF_file(pdf_out_file, pdf_txt_file):
                            # Fall back to the raw extracted text when section
                            # filtering left nothing behind.
                            if os.path.getsize(pdf_txt_file) == 0:
                                pdf_txt_file = pdf_out_file
                            with open(pdf_txt_file, 'r') as file_txt:
                                pdf_text = file_txt.read()
                            pdf_statements = process_with_reach(pdf_text, pmid, output_dir)
                            if pdf_statements is not None:
                                count_dict['n_output_reach'] += 1
                                log_file.write('\nNumber of statements (PDF): '+str(len(pdf_statements)))
                                stmts += pdf_statements
                            else:
                                log_file.write('\nREACH returned None for PMID: '+ pmid)
                        else:
                            count_dict['n_error'] += 1
                            log_file.write('\nUnable to extract from PDF: '+pmid)
                            #process abstract in this case
                    else:
                        log_file.write('\nPMC or PDF not available for PMID: '+pmid)
                        count_dict['n_error'] += 1
                        #process abstract in this case

            if run_assembly:
                stmts = run_assembly_pipeline(stmts)
                outJSONFname = output_dir + item+ '_reach_output_assembly.json'
            else:
                outJSONFname = output_dir + item +'_reach_output_no_assembly.json'
            count_dict['n_statements'] = len(stmts)
            print('Saving combined output:')
            stmts_to_json_file(stmts, outJSONFname)

            if logging:
                t1 = datetime.now()
                seconds = (t1 - t0).total_seconds()
                log_file.write('\nTotal time: '+ str(seconds)+' seconds')
                log_file.write('\nInput file: '+file_i)
                log_file.write('\nPMIDs: '+ str(len(pmids)))
                log_file.write('\nN_pmid_hits: '+str(count_dict['n_pmid']))
                log_file.write('\nN_pdf_hits: '+str(count_dict['n_pdf']))
                log_file.write('\nN_reach_hits: '+str(count_dict['n_output_reach']))
                log_file.write('\nN_errors: '+str(count_dict['n_error']))
                log_file.write('\nN_statements: '+str(count_dict['n_statements']))
parthenium', 'Trigonella foenum-graecum', 'Vaccinium macrocarpon', 'Valeriana officinalis', 'Withania somnifera', 'Zingiber officinale', 'Taraxacum officinale']) 4 | INFO:root: 5 | Total constituents for Actaea racemosa: 64 6 | INFO:root: 7 | GSRS constituents: 64 8 | INFO:root: 9 | EMA constituents: 0 10 | INFO:root: 11 | Total constituents for Aesculus hippocastanum: 16 12 | INFO:root: 13 | GSRS constituents: 6 14 | INFO:root: 15 | EMA constituents: 10 16 | INFO:root: 17 | Total constituents for Allium sativum: 24 18 | INFO:root: 19 | GSRS constituents: 5 20 | INFO:root: 21 | EMA constituents: 19 22 | INFO:root: 23 | Total constituents for Camellia sinensis: 22 24 | INFO:root: 25 | GSRS constituents: 3 26 | INFO:root: 27 | EMA constituents: 19 28 | INFO:root: 29 | Total constituents for Cannabis sativa: 67 30 | INFO:root: 31 | GSRS constituents: 67 32 | INFO:root: 33 | EMA constituents: 0 34 | INFO:root: 35 | Total constituents for Cinnamomum cassia: 12 36 | INFO:root: 37 | GSRS constituents: 12 38 | INFO:root: 39 | EMA constituents: 0 40 | INFO:root: 41 | Total constituents for Cinnamomum verum: 26 42 | INFO:root: 43 | GSRS constituents: 14 44 | INFO:root: 45 | EMA constituents: 12 46 | INFO:root: 47 | Total constituents for Citrus paradisi: 0 48 | INFO:root: 49 | GSRS constituents: 0 50 | INFO:root: 51 | EMA constituents: 0 52 | INFO:root: 53 | Total constituents for Crataegus laevigata: 16 54 | INFO:root: 55 | GSRS constituents: 0 56 | INFO:root: 57 | EMA constituents: 16 58 | INFO:root: 59 | Total constituents for Curcuma longa: 11 60 | INFO:root: 61 | GSRS constituents: 2 62 | INFO:root: 63 | EMA constituents: 9 64 | INFO:root: 65 | Total constituents for Echinacea purpurea: 31 66 | INFO:root: 67 | GSRS constituents: 0 68 | INFO:root: 69 | EMA constituents: 31 70 | INFO:root: 71 | Total constituents for Ginkgo biloba: 18 72 | INFO:root: 73 | GSRS constituents: 11 74 | INFO:root: 75 | EMA constituents: 7 76 | INFO:root: 77 | Total constituents for Glycine 
max: 25 78 | INFO:root: 79 | GSRS constituents: 1 80 | INFO:root: 81 | EMA constituents: 24 82 | INFO:root: 83 | Total constituents for Glycyrrhiza glabra: 57 84 | INFO:root: 85 | GSRS constituents: 17 86 | INFO:root: 87 | EMA constituents: 40 88 | INFO:root: 89 | Total constituents for Glycyrrhiza inflata: 6 90 | INFO:root: 91 | GSRS constituents: 6 92 | INFO:root: 93 | EMA constituents: 0 94 | INFO:root: 95 | Total constituents for Glycyrrhiza uralensis: 33 96 | INFO:root: 97 | GSRS constituents: 33 98 | INFO:root: 99 | EMA constituents: 0 100 | INFO:root: 101 | Total constituents for Hydrastis canadensis: 2 102 | INFO:root: 103 | GSRS constituents: 2 104 | INFO:root: 105 | EMA constituents: 0 106 | INFO:root: 107 | Total constituents for Linum usitatissimum: 1 108 | INFO:root: 109 | GSRS constituents: 1 110 | INFO:root: 111 | EMA constituents: 0 112 | INFO:root: 113 | Total constituents for Mitragyna speciosa: 6 114 | INFO:root: 115 | GSRS constituents: 6 116 | INFO:root: 117 | EMA constituents: 0 118 | INFO:root: 119 | Total constituents for Origanum vulgare: 1 120 | INFO:root: 121 | GSRS constituents: 1 122 | INFO:root: 123 | EMA constituents: 0 124 | INFO:root: 125 | Total constituents for Panax ginseng: 19 126 | INFO:root: 127 | GSRS constituents: 7 128 | INFO:root: 129 | EMA constituents: 12 130 | INFO:root: 131 | Total constituents for Paullinia cupana: 6 132 | INFO:root: 133 | GSRS constituents: 0 134 | INFO:root: 135 | EMA constituents: 6 136 | INFO:root: 137 | Total constituents for Piper nigrum: 5 138 | INFO:root: 139 | GSRS constituents: 5 140 | INFO:root: 141 | EMA constituents: 0 142 | INFO:root: 143 | Total constituents for Rhodiola rosea: 0 144 | INFO:root: 145 | GSRS constituents: 0 146 | INFO:root: 147 | EMA constituents: 0 148 | INFO:root: 149 | Total constituents for Rosmarinus officinalis: 34 150 | INFO:root: 151 | GSRS constituents: 4 152 | INFO:root: 153 | EMA constituents: 30 154 | INFO:root: 155 | Total constituents for Serenoa repens: 20 
156 | INFO:root: 157 | GSRS constituents: 20 158 | INFO:root: 159 | EMA constituents: 0 160 | INFO:root: 161 | Total constituents for Silybum marianum: 18 162 | INFO:root: 163 | GSRS constituents: 1 164 | INFO:root: 165 | EMA constituents: 17 166 | INFO:root: 167 | Total constituents for Tanacetum parthenium: 23 168 | INFO:root: 169 | GSRS constituents: 1 170 | INFO:root: 171 | EMA constituents: 22 172 | INFO:root: 173 | Total constituents for Trigonella foenum-graecum: 18 174 | INFO:root: 175 | GSRS constituents: 18 176 | INFO:root: 177 | EMA constituents: 0 178 | INFO:root: 179 | Total constituents for Vaccinium macrocarpon: 25 180 | INFO:root: 181 | GSRS constituents: 0 182 | INFO:root: 183 | EMA constituents: 25 184 | INFO:root: 185 | Total constituents for Valeriana officinalis: 38 186 | INFO:root: 187 | GSRS constituents: 2 188 | INFO:root: 189 | EMA constituents: 36 190 | INFO:root: 191 | Total constituents for Withania somnifera: 1 192 | INFO:root: 193 | GSRS constituents: 1 194 | INFO:root: 195 | EMA constituents: 0 196 | INFO:root: 197 | Total constituents for Zingiber officinale: 24 198 | INFO:root: 199 | GSRS constituents: 8 200 | INFO:root: 201 | EMA constituents: 16 202 | INFO:root: 203 | Total constituents for Taraxacum officinale: 53 204 | INFO:root: 205 | GSRS constituents: 0 206 | INFO:root: 207 | EMA constituents: 53 208 | INFO:root: 209 | Total constituents added to graph: 722 210 | INFO:root: 211 | Unique constituents: 613 212 | -------------------------------------------------------------------------------- /resources/pmids/grapefruit_pmid.txt: -------------------------------------------------------------------------------- 1 | 31231823 2 | 23399607 3 | 19204629 4 | 19206053 5 | 17468864 6 | 16686323 7 | 16122154 8 | 12698101 9 | 12949438 10 | 11426073 11 | 9053620 12 | 33033962 13 | 29411268 14 | 26407124 15 | 25253884 16 | 23677858 17 | 22943633 18 | 17223855 19 | 17301733 20 | 16995870 21 | 16685052 22 | 16620293 23 | 16321619 24 | 15657782 
25 | 15592332 26 | 15536460 27 | 15371986 28 | 15329598 29 | 12891222 30 | 12811362 31 | 12621384 32 | 11678783 33 | 11673746 34 | 11103749 35 | 11079273 36 | 10669910 37 | 10546917 38 | 10456490 39 | 10445384 40 | 10702889 41 | 9174684 42 | 9002010 43 | 8689808 44 | 7586927 45 | 7715468 46 | 8275614 47 | 12959295 48 | 8513655 49 | 1914375 50 | 1671113 51 | 35110472 52 | 34729703 53 | 34620272 54 | 34608881 55 | 34570813 56 | 34432364 57 | 34019943 58 | 33952821 59 | 33733472 60 | 33387374 61 | 33348239 62 | 33033962 63 | 32940733 64 | 32851583 65 | 32719085 66 | 32713008 67 | 32664320 68 | 32660818 69 | 32530447 70 | 32139488 71 | 31941150 72 | 31920193 73 | 31306750 74 | 31206401 75 | 31123035 76 | 31051051 77 | 30979536 78 | 30864454 79 | 30572203 80 | 30298208 81 | 30278432 82 | 30218442 83 | 30112986 84 | 29721568 85 | 29703387 86 | 29248449 87 | 29140707 88 | 28975417 89 | 28971609 90 | 28966261 91 | 28749005 92 | 28414144 93 | 28359180 94 | 28322941 95 | 28032362 96 | 27904048 97 | 27890698 98 | 27709656 99 | 27503364 100 | 27444380 101 | 27294349 102 | 27278683 103 | 27196064 104 | 27173987 105 | 26869410 106 | 26467069 107 | 26213156 108 | 26095990 109 | 25818985 110 | 25720525 111 | 25557052 112 | 25486333 113 | 25470100 114 | 25418056 115 | 25365914 116 | 25265455 117 | 25253884 118 | 25223231 119 | 25207548 120 | 25026202 121 | 25008344 122 | 24898424 123 | 24334957 124 | 24303901 125 | 24292052 126 | 24067745 127 | 24003188 128 | 24003187 129 | 23918667 130 | 23878024 131 | 23696038 132 | 23677858 133 | 23673492 134 | 23550055 135 | 23550054 136 | 23373097 137 | 23297175 138 | 23282066 139 | 23250807 140 | 23184849 141 | 23146034 142 | 23132664 143 | 23126367 144 | 23033114 145 | 22943633 146 | 22940371 147 | 22892338 148 | 22884524 149 | 22850760 150 | 22803802 151 | 22566187 152 | 22394605 153 | 22382318 154 | 22342630 155 | 22348413 156 | 22286159 157 | 22155270 158 | 22124880 159 | 22039822 160 | 21953762 161 | 21853290 162 | 21812328 163 | 
21804213 164 | 21692829 165 | 21682192 166 | 21593283 167 | 21553459 168 | 21500482 169 | 21446776 170 | 21346758 171 | 21254874 172 | 21235125 173 | 21189136 174 | 21142260 175 | 21139236 176 | 21084040 177 | 21039758 178 | 20974993 179 | 20717876 180 | 20664534 181 | 20512335 182 | 20503933 183 | 20463004 184 | 20406214 185 | 20187577 186 | 20160155 187 | 19789373 188 | 19779132 189 | 19728747 190 | 19479982 191 | 19445995 192 | 19433977 193 | 19357792 194 | 19224379 195 | 19206053 196 | 19204629 197 | 19172438 198 | 19053852 199 | 19048359 200 | 18762178 201 | 18703023 202 | 18666368 203 | 18663908 204 | 18563955 205 | 18451520 206 | 18445987 207 | 18433343 208 | 18382075 209 | 18400708 210 | 18380401 211 | 18363644 212 | 18292673 213 | 18172627 214 | 18084364 215 | 17995595 216 | 17971226 217 | 17918187 218 | 17766652 219 | 17691921 220 | 17542018 221 | 17438537 222 | 17400460 223 | 17322140 224 | 17301733 225 | 17269895 226 | 17253883 227 | 17223855 228 | 17215845 229 | 17112808 230 | 17112309 231 | 17101740 232 | 17064205 233 | 16995870 234 | 16942732 235 | 16842392 236 | 16814135 237 | 16686323 238 | 16685052 239 | 16669846 240 | 16669845 241 | 16604230 242 | 16513448 243 | 16513449 244 | 16507515 245 | 16454693 246 | 16415112 247 | 16416305 248 | 16390351 249 | 16380358 250 | 16338240 251 | 16321619 252 | 16302488 253 | 16299162 254 | 16260103 255 | 16236039 256 | 16198665 257 | 16196264 258 | 16189799 259 | 16175141 260 | 16161934 261 | 16158798 262 | 16119275 263 | 16048029 264 | 16048354 265 | 15964336 266 | 15903127 267 | 15901751 268 | 15767219 269 | 15735611 270 | 15710766 271 | 15684175 272 | 15673597 273 | 15663291 274 | 15657782 275 | 15640378 276 | 15536460 277 | 15531381 278 | 15504753 279 | 15485894 280 | 15449971 281 | 15371986 282 | 15355126 283 | 15329598 284 | 15269184 285 | 15258108 286 | 15229464 287 | 15206993 288 | 15179411 289 | 15155547 290 | 15118259 291 | 15072439 292 | 14981197 293 | 14769198 294 | 14742207 295 | 14718607 296 | 
14704647 297 | 14679360 298 | 14672753 299 | 14664751 300 | 14641551 301 | 14558433 302 | 14531721 303 | 14531725 304 | 12953340 305 | 12951492 306 | 12949438 307 | 12921244 308 | 12891222 309 | 12875231 310 | 12822205 311 | 12811362 312 | 12698101 313 | 12636154 314 | 12621384 315 | 12611197 316 | 12610742 317 | 12495360 318 | 12451428 319 | 12426515 320 | 12378978 321 | 12371301 322 | 12362932 323 | 12235445 324 | 12207630 325 | 12167562 326 | 12139218 327 | 12125178 328 | 12130727 329 | 12095536 330 | 11991343 331 | 11978146 332 | 11963641 333 | 11956673 334 | 11897929 335 | 11823754 336 | 11823753 337 | 11816871 338 | 11791889 339 | 11737987 340 | 11719733 341 | 11673746 342 | 11576693 343 | 11572503 344 | 11488783 345 | 11476124 346 | 11477318 347 | 11452240 348 | 11451028 349 | 11426073 350 | 11424898 351 | 11295678 352 | 11294369 353 | 11231102 354 | 11216183 355 | 11204334 356 | 11180034 357 | 11158415 358 | 11103749 359 | 11098080 360 | 11061578 361 | 11009051 362 | 10994829 363 | 10926350 364 | 10907671 365 | 10903978 366 | 10890261 367 | 10877530 368 | 10872652 369 | 10867385 370 | 10860553 371 | 10859150 372 | 10759694 373 | 10741622 374 | 10688276 375 | 10671908 376 | 10668858 377 | 10639518 378 | 10606837 379 | 10594487 380 | 10583039 381 | 10583025 382 | 10579471 383 | 10546917 384 | 10546925 385 | 10546919 386 | 10548035 387 | 10511919 388 | 10503959 389 | 10471070 390 | 10460065 391 | 10456492 392 | 10456490 393 | 10445384 394 | 10365642 395 | 10227700 396 | 10223776 397 | 10220126 398 | 10205736 399 | 10096255 400 | 9871430 401 | 9834039 402 | 9821810 403 | 9784933 404 | 9757152 405 | 9757148 406 | 9723817 407 | 9674392 408 | 9625273 409 | 9585793 410 | 9578182 411 | 9548795 412 | 9565737 413 | 9559810 414 | 9515185 415 | 9496718 416 | 9351897 417 | 9352575 418 | 9260034 419 | 9153299 420 | 9178939 421 | 9476043 422 | 8886602 423 | 8693526 424 | 8689808 425 | 8827399 426 | 8631189 427 | 8737764 428 | 8858279 429 | 7648762 430 | 7677856 431 | 
7582380 432 | 8641324 433 | 7698084 434 | 8161344 435 | 12959294 436 | 12959295 437 | 8485024 438 | -------------------------------------------------------------------------------- /resources/pmids/goldenseal_pmid.txt: -------------------------------------------------------------------------------- 1 | 29579096 2 | 29891588 3 | 20218935 4 | 26990021 5 | 29106036 6 | 21826053 7 | 22565299 8 | 31671245 9 | 32987920 10 | 29806860 11 | 29879346 12 | 27932556 13 | 14551183 14 | 26417265 15 | 31382342 16 | 30944202 17 | 31629051 18 | 31368836 19 | 31671426 20 | 31111379 21 | 29505790 22 | 31351968 23 | 31013627 24 | 33174626 25 | 27324234 26 | 32926925 27 | 26543354 28 | 30977450 29 | 30472912 30 | 30790325 31 | 32109457 32 | 30655545 33 | 30606998 34 | 22257149 35 | 31662463 36 | 30558158 37 | 30428337 38 | 17079360 39 | 15039293 40 | 32081663 41 | 29163755 42 | 17495878 43 | 17666818 44 | 31895879 45 | 32591416 46 | 28893642 47 | 29589530 48 | 25359200 49 | 14570772 50 | 33061977 51 | 29351921 52 | 30882133 53 | 18214849 54 | 21458442 55 | 30226834 56 | 32389634 57 | 32717192 58 | 28700974 59 | 25430796 60 | 30368581 61 | 27327872 62 | 2163278 63 | 29121795 64 | 30168114 65 | 30442987 66 | 28783583 67 | 28633475 68 | 31617070 69 | 31624335 70 | 29065218 71 | 31789425 72 | 25171176 73 | 29676131 74 | 28656004 75 | 26068524 76 | 31878261 77 | 25623616 78 | 23518403 79 | 28763460 80 | 28257954 81 | 29703381 82 | 12434406 83 | 21412693 84 | 29988733 85 | 22796454 86 | 21157683 87 | 19370549 88 | 24492724 89 | 21269266 90 | 29143794 91 | 21342663 92 | 25072399 93 | 29457517 94 | 22855269 95 | 29343943 96 | 7605866 97 | 20623609 98 | 25041515 99 | 12530470 100 | 27054913 101 | 17611934 102 | 20939821 103 | 26932407 104 | 26400396 105 | 20036710 106 | 29565467 107 | 26838742 108 | 17978486 109 | 28298174 110 | 30093307 111 | 26261285 112 | 24060867 113 | 21079763 114 | 27796611 115 | 23886934 116 | 22224048 117 | 29323165 118 | 28363126 119 | 28500623 120 | 25456436 121 | 
28283780 122 | 30086269 123 | 22548337 124 | 21545824 125 | 29228954 126 | 16983620 127 | 29350661 128 | 29138046 129 | 22114028 130 | 12620361 131 | 24667776 132 | 27601272 133 | 28298333 134 | 25612454 135 | 16530230 136 | 26403084 137 | 22342832 138 | 24528081 139 | 29191058 140 | 32957491 141 | 28656087 142 | 29806105 143 | 18066144 144 | 25192195 145 | 15900287 146 | 18951337 147 | 29971002 148 | 29427595 149 | 25953522 150 | 26177349 151 | 21320518 152 | 22344858 153 | 23372711 154 | 24333987 155 | 32095824 156 | 30422397 157 | 32325761 158 | 29879347 159 | 25411028 160 | 24246570 161 | 26774040 162 | 29654898 163 | 26134304 164 | 23672049 165 | 28154180 166 | 17292731 167 | 27067643 168 | 23524574 169 | 27066387 170 | 21061468 171 | 9435674 172 | 27457692 173 | 22193446 174 | 24184483 175 | 24588674 176 | 28775788 177 | 21802525 178 | 25261036 179 | 6269439 180 | 29158264 181 | 19782362 182 | 21319959 183 | 22900779 184 | 21920422 185 | 28317281 186 | 26446867 187 | 21678521 188 | 21073932 189 | 10096776 190 | 29238700 191 | 30108655 192 | 19110420 193 | 17704354 194 | 24741189 195 | 27070564 196 | 17213652 197 | 24704252 198 | 23133498 199 | 23924821 200 | 25640685 201 | 19653312 202 | 24786236 203 | 27490210 204 | 15334682 205 | 16961724 206 | 15210579 207 | 20498327 208 | 17658211 209 | 19080170 210 | 22668974 211 | 12842327 212 | 27709945 213 | 12524452 214 | 18434730 215 | 28836036 216 | 20506155 217 | 14757125 218 | 11741516 219 | 11877800 220 | 11321580 221 | 19788498 222 | 25017321 223 | 26352530 224 | 21315125 225 | 12177126 226 | 19767038 227 | 28038998 228 | 20641061 229 | 18729007 230 | 21637946 231 | 20604832 232 | 11704644 233 | 19660925 234 | 21661731 235 | 12809364 236 | 20152792 237 | 26281596 238 | 21908684 239 | 25285405 240 | 16541194 241 | 19567876 242 | 22517372 243 | 21483731 244 | 8982702 245 | 25892872 246 | 27311637 247 | 24583342 248 | 21094631 249 | 18177475 250 | 23280813 251 | 16424781 252 | 14653942 253 | 19168636 254 | 
21870106 255 | 15806115 256 | 14746925 257 | 21980456 258 | 21569619 259 | 12532460 260 | 21419197 261 | 28502629 262 | 25395088 263 | 23001793 264 | 16133554 265 | 23299247 266 | 24177287 267 | 21351569 268 | 16934942 269 | 22188769 270 | 28861978 271 | 16203150 272 | 27713083 273 | 26634613 274 | 17250743 275 | 14695443 276 | 18285556 277 | 33347343 278 | 24014108 279 | 25976224 280 | 29679725 281 | 15169625 282 | 415839 283 | 18157518 284 | 17024849 285 | 12732624 286 | 25402492 287 | 18599498 288 | 9681018 289 | 17418998 290 | 12234835 291 | 25145883 292 | 27779107 293 | 23497885 294 | 24025684 295 | 9413244 296 | 24456672 297 | 24316226 298 | 9651117 299 | 28925721 300 | 21425377 301 | 20552446 302 | 26593426 303 | 19403195 304 | 23999088 305 | 11324452 306 | 22588833 307 | 28956622 308 | 11786666 309 | 24066602 310 | 1666562 311 | 18575902 312 | 23079743 313 | 23335996 314 | 20884991 315 | 18204477 316 | 17506935 317 | 11895082 318 | 28656088 319 | 25053555 320 | 22555370 321 | 27217747 322 | 23290487 323 | 24105360 324 | 15356201 325 | 26032585 326 | 21186373 327 | 10724180 328 | 8027780 329 | 10969720 330 | 10507765 331 | 16306794 332 | 23664937 333 | 21319371 334 | 32748032 335 | 17588330 336 | 12652101 337 | 15066206 338 | 34044929 339 | 15588728 340 | 18644201 341 | 21624442 342 | 7902181 343 | 9251899 344 | 17762395 345 | 22086980 346 | 21787170 347 | 20804776 348 | 21755496 349 | 16499024 350 | 16046213 351 | 17328250 352 | 22303716 353 | 2348481 354 | 12064912 355 | 8237399 356 | 27239758 357 | 22512037 358 | 11698526 359 | 1711569 360 | 15849302 361 | 10696087 362 | 18277615 363 | 15071923 364 | 11270990 365 | 10806810 366 | 10743138 367 | 10322908 368 | 22426698 369 | 26027164 370 | 7516608 371 | 23009931 372 | 19419877 373 | 9269949 374 | 19283771 375 | 33652886 376 | 12533487 377 | 17560751 378 | 2169659 379 | 25470292 380 | 21524698 381 | 11978192 382 | 26215416 383 | 20108175 384 | 17631668 385 | 2972098 386 | 11953130 387 | 21761735 388 | 
12952418 389 | 24963621 390 | 27910870 391 | 26677714 392 | 17703779 393 | 20410605 394 | 17590462 395 | 7985516 396 | 18053337 397 | 9812715 398 | 2166769 399 | 1456049 400 | 12183269 401 | 22924000 402 | 8737443 403 | 11498866 404 | 12100754 405 | 25481375 406 | 19570618 407 | 9988103 408 | 12749897 409 | 25362106 410 | 2160393 411 | 11327013 412 | 8728473 413 | 10677479 414 | 9425319 415 | 12567772 416 | 11951120 417 | 12610223 418 | 12782200 419 | 2892299 420 | 10627611 421 | 9812742 422 | 1306830 423 | 8586394 424 | 31721320 425 | 32586251 426 | 33340158 427 | 34552326 428 | 34269665 429 | 34620272 430 | 34699754 431 | 34056218 432 | 32808837 433 | 34252480 434 | 34515131 435 | 34386321 436 | 34556670 437 | 34771389 438 | 34192596 439 | 34139914 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.12536780.svg)](https://doi.org/10.5281/zenodo.12536780) 2 | 3 | # NP-KG 4 | 5 | **Knowledge Graph Framework to Generate Hypotheses for Natural Product-Drug Interactions** 6 | 7 | NP-KG is a graph framework that creates a biomedical knowledge graph (KG) to identify and generate mechanistic hypotheses for pharmacokinetic natural product-drug interactions (NPDIs). NP-KG uses the [PheKnowLator ecosystem](https://github.com/callahantiff/PheKnowLator) to create an ontology-grounded KG. It then uses two relation extraction systems to extract triples from full texts of natural product-related scientific literature to create a literature-based graph, and integrates the nodes and edges in the ontology-grounded KG. 8 | 9 | ## NP-KG Builds 10 | 11 | **NP-KG:** Merged PheKnowLator KG and literature-based graph with 30 natural products. 12 | 13 | **Ontology-grounded KG:** PheKnowLator KG with a few [additional data sources](https://github.com/sanyabt/np-kg/wiki/v3.0.0#data-sources). 
14 | 15 | **Literature-based Graph:** Literature-based graph constructed from scientific literature with relation extraction systems (SemRep and INDRA/REACH) and closure operations. 16 | 17 | ## How to Download and Use 18 | 19 | ### Setup environment 20 | 21 | 1. Clone the repository or download all files. 22 | 2. Install all required packages. Requires Python>=3.6. 23 | 24 | ``` 25 | python -m pip install -r requirements.txt 26 | ``` 27 | 28 | ### If you want to use the pre-built KGs 29 | 30 | 1. [Download the knowledge graph and node labels files from Zenodo](https://doi.org/10.5281/zenodo.12536780) and add to local folder - resources/knowledge_graphs. NP-KG is available as TSV file with triples and NetworkX multidigraph (gpickle files). 31 | 32 | * **Merged KG:** includes merged PheKnowLator KG and literature-based graph. Download this file if you do not know which KG to use. 33 | * Filename: _NP-KG_v3.0.0.tsv_ 34 | * Filename: _NP-KG_v3.0.0.gpickle_ 35 | * **PheKnowLator KG:** includes full instance-based build of the PheKnowLator KG. See [PheKnowLator](https://github.com/callahantiff/PheKnowLator) for more details. 36 | * Filename: _PheKnowLator_v3.1.2_full_instance_inverseRelations_OWLNETS_NetworkxMultiDiGraph.gpickle_ 37 | 38 | ### Node Labels and Node Types 39 | * Download _nodeLabels_v3.0.0.tsv_ file with all node labels for the merged KG. 40 | * Download _nodeTypes_v3.0.0.tsv_ file with node types for all nodes in the merged KG. 41 | 42 | 2. See [evaluation-scripts](https://github.com/sanyabt/np-kg/tree/main/evaluation-scripts) for examples of queries and path searches. 43 | 44 | Note: The download link also contains the KGs as gpickle and ntriples files with the same nodes and edges that can be loaded for other applications. 45 | 46 | ### Loading NP-KG with GRAPE 47 | 48 | The Graph Representation Learning library [GRAPE](https://github.com/AnacletoLAB/grape) provides efficient graph embeddings. 
To load NP-KG (version 3.0.0) in GRAPE, use the _from_csv_ function and TSV files mentioned above: 49 | 50 | ``` 51 | npkg = Graph.from_csv( 52 | node_path="<path to node types TSV>", 53 | node_list_node_types_column_number=1, 54 | nodes_column_number=0, 55 | node_list_separator='\t', 56 | node_list_header=True, 57 | edge_path="<path to NP-KG edge list TSV>", 58 | edge_list_separator='\t', 59 | edge_list_header=True, 60 | edge_list_edge_types_column_number=1, 61 | sources_column_number=0, 62 | destinations_column_number=2, 63 | weights_column_number=3, 64 | directed=True, 65 | verbose=True 66 | ) 67 | ``` 68 | 69 | NP-KG (v1.0.1) can also be loaded as shown below. See [NP-KG Grape Animation tutorial](https://github.com/sanyabt/np-kg/blob/main/resources/NPKG-Grape-Animation.ipynb) for details. 70 | 71 | ```python 72 | pip install grape -U 73 | from grape.datasets.zenodo import NPKG 74 | graph = NPKG(directed=True) 75 | graph 76 | ``` 77 | 78 | ### If you are interested in constructing or extending NP-KG 79 | 80 | See [wiki](https://github.com/sanyabt/np-kg/wiki) for details of data sources, construction, use cases, and evaluation. 81 | 82 | Get In Touch 83 | ------------------------------------------------ 84 | 85 | Get in touch through GitHub issues, discussion, or [email](mailto:sbt12@pitt.edu)! 86 | 87 | 88 | Related Work 89 | ------------------------------------------------ 90 | **NP-KG Publication** 91 | 92 | Taneja SB, Callahan TJ, Paine MF, Kane-Gill SL, Kilicoglu H, Joachimiak MP, Boyce RD. Developing a Knowledge Graph Framework for Pharmacokinetic Natural Product-Drug Interactions. Journal of Biomedical Informatics. 2023. [DOI: doi.org/10.1016/j.jbi.2023.104341](https://doi.org/10.1016/j.jbi.2023.104341). 93 | 94 | **AMIA Informatics Summit poster** 95 | 96 | Taneja SB, Ndungu PW, Paine MF, Kane-Gill SL, Boyce RD. Relation Extraction from Biomedical Literature on Pharmacokinetic Natural Product-Drug Interactions. Poster presentation, AMIA Informatics Summit 2022; March 21-24, 2022. 
97 | 98 | **ISMB Conference Abstract and Related Files** 99 | 100 | Taneja SB, Callahan TJ, Brochhausen M, Paine MF, Kane-Gill SL, Boyce RD. Designing potential extensions from G-SRS to ChEBI to identify natural product-drug interactions. Intelligent Systems for Molecular Biology/European Conference on Computational Biology (ISMB/ECCB), 2021. [https://doi.org/10.5281/zenodo.5736386](https://doi.org/10.5281/zenodo.5736386) 101 | 102 | 103 | Cite this Work 104 | ------------------------------------------------ 105 | **Publication** 106 | 107 | ``` 108 | @article{taneja_developing_2023, 109 | title = {Developing a {Knowledge} {Graph} for {Pharmacokinetic} {Natural} {Product}-{Drug} {Interactions}}, 110 | volume = {140}, 111 | issn = {1532-0464}, 112 | url = {https://www.sciencedirect.com/science/article/pii/S153204642300062X}, 113 | doi = {10.1016/j.jbi.2023.104341}, 114 | language = {en}, 115 | urldate = {2023-03-23}, 116 | journal = {Journal of Biomedical Informatics}, 117 | author = {Taneja, Sanya B. and Callahan, Tiffany J. and Paine, Mary F. and Kane-Gill, Sandra L. and Kilicoglu, Halil and Joachimiak, Marcin P. and Boyce, Richard D.}, 118 | year = {2023}, 119 | } 120 | ``` 121 | 122 | **Zenodo Dataset** 123 | 124 | ``` 125 | @dataset{taneja_sanya_bathla_2024_12536780, 126 | author = {Taneja, Sanya Bathla}, 127 | title = {{NP-KG: Knowledge Graph for Natural Product-Drug 128 | Interactions}}, 129 | month = jun, 130 | year = 2024, 131 | publisher = {Zenodo}, 132 | version = {3.0.0}, 133 | doi = {10.5281/zenodo.12536780}, 134 | url = {https://doi.org/10.5281/zenodo.12536780} 135 | } 136 | ``` 137 | 138 | Funding 139 | ------------------------------------------------ 140 | This work is supported by the National Institutes of Health National Center for Complementary and Integrative Health Grant U54 AT008909. 
import sys, os
from datetime import datetime, timedelta
import pandas as pd
import subprocess
from nltk import tokenize
import requests
import time
import logging

'''
Script to handle all errors in SemRep processing - including missing text file and SemRep timeout errors
TODO:
    1. Input PMIDs that gave errors (compiled from log files)
    2. Once text available - use the updated text files with individual sentences to process with SemRep
'''

workingDir = os.getcwd()
log_dir = workingDir + '/logs/'
# Each entry is used as a sub-directory name under input_files/ and output_files/.
np = ['list of NPs']

# When True, the SemRep output directory is parsed into a TSV after processing.
extraction = True

count_dict = {
    'n_total_pmid': 0,
    'n_success': 0,
    'n_error': 0,
    'n_statements':0,
    'n_files_processed': 0
}
# Caches filled by get_publication_year_and_type so each PMID is fetched once.
pub_year_to_pmid_map = {}
pub_type_to_pmid_map = {}

# NOTE(review): semrep_extract reads `section_tags`, but this script never
# defined it (NameError on first use). semrep_process_pmid.py defines a
# nine-element list under the same name; populate this list with the same
# section markers. With an empty list every relation gets an empty section.
section_tags = []

# semrep_extract reads fields[2] .. fields[11] of a '|'-separated relation line.
MIN_RELATION_FIELDS = 12
# Upper bound, in seconds, for one E-utilities HTTP request.
REQUEST_TIMEOUT_SECONDS = 30


def process_with_semrep(infile, outfile):
    """Run the local SemRep binary on ``infile``, writing to ``outfile``.

    Returns the ``subprocess.CompletedProcess`` on success, or ``None`` when
    the call fails for any reason (non-zero exit status, the 1800 s timeout,
    missing binary, ...). Failures are logged rather than raised so a single
    bad document does not stop the batch.
    """
    try:
        result = subprocess.run(['/usr/local/bin/semrep.v1.8', '-L', '2018', '-Z', '2018AA', infile, outfile], check=True, timeout=1800)

        return result
    except Exception as e:
        logging.info('SemRep error in processing %s', str(e))
        return None


def get_publication_year_and_type(pmid):
    """Return ``(pubdate, pubtype)`` for a PMID from the NCBI esummary endpoint.

    ``pubdate`` is the raw ``pubdate`` string from the response and ``pubtype``
    is the stringified ``pubtype`` value. Both are ``''`` when ``pmid`` is
    empty or the lookup fails. Successful lookups are cached in
    ``pub_year_to_pmid_map`` / ``pub_type_to_pmid_map``.
    """
    pub_year = ''
    pub_type = ''
    if pmid == '':
        return pub_year, pub_type

    # Serve from the cache only when a non-empty year was stored earlier.
    if pmid in pub_year_to_pmid_map and pmid in pub_type_to_pmid_map:
        if pub_year_to_pmid_map[pmid] != '':
            return pub_year_to_pmid_map[pmid], str(pub_type_to_pmid_map[pmid])

    # Pause before every request to stay under the service's rate limit.
    time.sleep(5)
    uri = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id="+pmid+"&retmode=json"
    try:
        response = requests.get(uri, timeout=REQUEST_TIMEOUT_SECONDS)
        if response.status_code == 429:
            # Rate limited: wait once more and retry a single time.
            time.sleep(5)
            response = requests.get(uri, timeout=REQUEST_TIMEOUT_SECONDS)
        if response.status_code == 200:
            record = response.json().get('result', {}).get(pmid, {})
            pub_year = record.get('pubdate', '')
            pub_type = record.get('pubtype', '')
            pub_year_to_pmid_map[pmid] = pub_year
            pub_type_to_pmid_map[pmid] = pub_type
    except (requests.RequestException, ValueError) as e:
        # Network failure or a body that is not JSON: keep the blank defaults.
        logging.info('Publication lookup failed for %s: %s', pmid, str(e))
    return pub_year, str(pub_type)


def semrep_extract(filepath):
    """Collect every ``|relation|`` line from the SemRep outputs in ``filepath``.

    Each file name up to the first ``.`` is taken as the PMID. For every
    relation line the most recent non-blank, non-relation line is recorded as
    its source sentence, together with the most recently matched entry of
    ``section_tags``. Returns a dict of equal-length column lists suitable for
    ``pandas.DataFrame``. ``count_dict['n_statements']`` is increased by the
    number of relation lines found.
    """
    result_dict = {
        'index': [],
        'pmid': [],
        'relation': [],
        'year': [],
        'subject_cui': [],
        'object_cui': [],
        'subject_name': [],
        'object_name': [],
        'subject_type': [],
        'object_type': [],
        'sentence': [],
        'source_section': [],
        'pub_type': []
    }
    index = 0
    for file in os.listdir(filepath):
        full_path = os.path.join(filepath, file)
        if not os.path.isfile(full_path):
            continue
        pmid = file.split('.')[0]

        with open(full_path, 'r', errors='ignore') as file_sem:
            lines = file_sem.readlines()

        # (relation line, source sentence, section) kept together so that
        # duplicate relation lines keep their own sentence and section.
        relations = []
        last_non_empty = ''
        section_match = ''
        for item in lines:
            matched = next((sec for sec in section_tags if sec in item), None)
            if matched is not None:
                section_match = matched
            if '|relation|' in item:
                relations.append((item, last_non_empty, section_match))
            elif item == '\n' or item == '':
                continue
            else:
                last_non_empty = item

        count_dict['n_statements'] += len(relations)

        pub_info = None
        for rel, sentence, section in relations:
            fields = rel.split('|')
            if len(fields) < MIN_RELATION_FIELDS:
                # Malformed line: the column indices used below would not exist.
                continue
            if pub_info is None:
                # One lookup per file; only done when a row is actually emitted.
                pub_info = get_publication_year_and_type(pmid)
            pub_year, pub_type = pub_info
            result_dict['index'].append(index)
            result_dict['pmid'].append(pmid)
            result_dict['subject_cui'].append(fields[2])
            result_dict['object_cui'].append(fields[9])
            result_dict['subject_name'].append(fields[3])
            result_dict['object_name'].append(fields[10])
            result_dict['relation'].append(fields[8])
            result_dict['subject_type'].append(fields[4])
            result_dict['object_type'].append(fields[11])
            result_dict['year'].append(pub_year)
            result_dict['pub_type'].append(pub_type)
            result_dict['sentence'].append(sentence)
            result_dict['source_section'].append(section)
            index += 1

    return result_dict


def _configure_log_file(log_file):
    """Point the root logger at ``log_file``.

    ``logging.basicConfig`` does nothing once the root logger has handlers, so
    existing handlers are removed first; otherwise every natural product after
    the first would keep writing to the first log file.
    """
    root_logger = logging.getLogger()
    for handler in list(root_logger.handlers):
        root_logger.removeHandler(handler)
        handler.close()
    logging.basicConfig(filename=log_file, filemode='a', level=logging.INFO)


if __name__ == '__main__':
    os.makedirs(log_dir, exist_ok=True)
    for item in np:

        inputPMID_file = workingDir + '/input_files/'+item+'/'+item+'_pmid_errors.txt'
        inputDir = workingDir + '/output_files/' + item + '/Corrected/'
        outputDir = workingDir + '/output_files/'+item+'/semrepOutput/'
        os.makedirs(outputDir, exist_ok=True)

        t0=datetime.now()

        log_file = log_dir+item + '_semrep_fix_errors_log'+str(t0)+'.txt'
        _configure_log_file(log_file)
        logging.info('Log for %s. PMIDs with errors in processing from iteration 1', item)
        logging.info('\nInput file: %s', inputPMID_file)

        with open(inputPMID_file, 'r') as file_input:
            pmids = file_input.readlines()

        text_files = set(os.listdir(inputDir))

        for pmid in pmids:
            pmid = pmid.strip()
            if pmid == '':
                # PMID lists may end with blank lines; they are not documents.
                continue
            logging.info('\n\nProcessing PMID: %s', pmid)
            count_dict['n_total_pmid'] += 1
            file = str(pmid) + '_ascii.txt'
            print(file)
            if file in text_files:
                count_dict['n_files_processed'] += 1
                filename = inputDir+file

                semrep_process = process_with_semrep(filename, outputDir+file.split('_')[0]+'.txt')
                if semrep_process is not None:
                    logging.info('\nFile processed: %s', file)
                    count_dict['n_success'] += 1

                else:
                    logging.info('\nError in processing: %s',file)
                    count_dict['n_error'] += 1
            else:
                logging.info('\nText unavailable: %s',file)
                count_dict['n_error'] +=1

        if extraction:
            result_dict = semrep_extract(outputDir)

            semrep_result = pd.DataFrame(result_dict)
            semrep_result_unique = semrep_result.drop_duplicates(subset=['subject_cui', 'subject_name', 'subject_type',
                'relation', 'object_cui', 'object_name', 'object_type', 'year', 'sentence'])
            semrep_result_unique.to_csv(workingDir+'/output_files/'+item+'/' +item+'_pmid_all_predicates_semrep-errors-fixed.tsv', sep='\t', index=False,
                columns=['index', 'pmid', 'subject_cui', 'subject_name', 'subject_type',
                'relation', 'object_cui', 'object_name', 'object_type', 'year', 'sentence'])

        t1 = datetime.now()
        seconds=timedelta.total_seconds(t1-t0)
        logging.info('\nTotal time: %s seconds', str(seconds))
        logging.info('\nTotal PMIDs: %s',str(count_dict['n_total_pmid']))
        logging.info('\nN_file_hits: %s',str(count_dict['n_files_processed']))
        logging.info('\nN_semrep_hits: %s',str(count_dict['n_success']))
        logging.info('\nN_errors: %s',str(count_dict['n_error']))
        logging.info('\nN_statements: %s',str(count_dict['n_statements']))

        # NOTE(review): the original nesting of the two statements below could
        # not be recovered; they are kept per natural product. Running pkill
        # when no semrep process is left is harmless.
        os.system("pkill semrep")

        logging.info('\nTerminated subprocesses')
PR_P05177 63 | CHEBI_4031 PR_Q9Y6L6 64 | CHEBI_585948 PR_P08183 65 | CHEBI_8776 PR_P10635 66 | CHEBI_5138 PR_P33261 67 | CHEBI_37924 PR_P08684 68 | CHEBI_2663 PR_P05177 69 | PC_10044355 PR_P08684 70 | CHEBI_68540 PR_P11712 71 | CHEBI_63619 PR_P10635 72 | CHEBI_38561 PR_P11712 73 | CHEBI_63637 PR_P05177 74 | CHEBI_3380 PR_P08183 75 | CHEBI_18358 PR_P05177 76 | PC_6460146 PR_P33261 77 | CHEBI_52172 PR_P08684 78 | CHEBI_9948 PR_P10635 79 | CHEBI_68558 PR_P08183 80 | CHEBI_100241 PR_P05177 81 | CHEBI_29688 PR_P10635 82 | CHEBI_28077 PR_Q9Y6L6 83 | CHEBI_65173 PR_P10635 84 | CHEBI_2663 PR_P11712 85 | CHEBI_40279 PR_P05177 86 | CHEBI_36796 PR_P10635 87 | CHEBI_5296 PR_P10632 88 | CHEBI_681850 PR_P08183 89 | CHEBI_87681 PR_P20815 90 | CHEBI_29688 PR_P08684 91 | CHEBI_7772 PR_P33261 92 | CHEBI_28593 PR_P10635 93 | CHEBI_9948 PR_P08684 94 | CHEBI_5118 PR_P08684 95 | CHEBI_3699 PR_P08684 96 | CHEBI_9448 PR_P10635 97 | CHEBI_9588 PR_P33261 98 | CHEBI_3699 PR_P10635 99 | CHEBI_68621 PR_P08684 100 | CHEBI_5296 PR_Q9Y6L6 101 | CHEBI_83527 PR_P10635 102 | CHEBI_37924 PR_P20815 103 | CHEBI_3770 PR_P11712 104 | CHEBI_85010 PR_Q9Y6L6 105 | CHEBI_40050 PR_P08684 106 | CHEBI_8499 PR_P05177 107 | CHEBI_5118 PR_P10635 108 | CHEBI_68595 PR_P08684 109 | CHEBI_87681 PR_P10635 110 | CHEBI_42355 PR_P08183 111 | CHEBI_5138 PR_P08684 112 | CHEBI_31413 PR_P10635 113 | CHEBI_4031 PR_P08684 114 | CHEBI_681850 PR_P08684 115 | CHEBI_64355 PR_P08684 116 | CHEBI_10112 PR_P05177 117 | CHEBI_45783 PR_P08684 118 | CHEBI_3732 PR_P08183 119 | CHEBI_7496 PR_P08684 120 | CHEBI_45409 PR_P08684 121 | CHEBI_149836 PR_P11712 122 | CHEBI_2955 PR_P08684 123 | CHEBI_64310 PR_P08684 124 | CHEBI_87681 PR_P08183 125 | CHEBI_3219 PR_P10635 126 | PC_10044355 PR_P33261 127 | CHEBI_37941 PR_P20813 128 | CHEBI_63637 PR_P10635 129 | CHEBI_2663 PR_P08183 130 | CHEBI_45783 PR_P10635 131 | CHEBI_5138 PR_P11712 132 | CHEBI_49668 PR_P10635 133 | CHEBI_31859 PR_P33261 134 | CHEBI_47519 PR_P10635 135 | CHEBI_28077 PR_Q9NPD5 136 | 
CHEBI_45409 PR_P10635 137 | CHEBI_2663 PR_P08684 138 | CHEBI_499361 PR_P08684 139 | PC_10044355 PR_P11712 140 | CHEBI_2668 PR_P08684 141 | CHEBI_36791 PR_P10635 142 | CHEBI_27732 PR_P05177 143 | CHEBI_6909 PR_P11712 144 | CHEBI_49603 PR_P08684 145 | CHEBI_4031 PR_P08183 146 | CHEBI_31348 PR_P11712 147 | CHEBI_71219 PR_P08684 148 | CHEBI_87715 PR_P20813 149 | CHEBI_3441 PR_P08183 150 | CHEBI_3699 PR_P33261 151 | CHEBI_6030 PR_P08684 152 | CHEBI_9588 PR_P05177 153 | CHEBI_3732 PR_P20815 154 | CHEBI_46081 PR_P33261 155 | CHEBI_100241 PR_P08684 156 | CHEBI_7820 PR_P11712 157 | CHEBI_4031 PR_Q9NPD5 158 | CHEBI_8104 PR_P05177 159 | CHEBI_49603 PR_P08183 160 | CHEBI_9588 PR_P20813 161 | CHEBI_71219 PR_P10635 162 | CHEBI_100246 PR_P05177 163 | CHEBI_31401 PR_P08684 164 | CHEBI_4659 PR_P05177 165 | CHEBI_10112 PR_P08684 166 | CHEBI_47519 PR_P05177 167 | CHEBI_37924 PR_P10635 168 | CHEBI_5138 PR_P22310 169 | CHEBI_28177 PR_P10635 170 | SRS_U946SH95EE PR_P11712 171 | CHEBI_48923 PR_P05177 172 | CHEBI_7565 PR_P08684 173 | CHEBI_6923 PR_P08684 174 | CHEBI_47519 PR_P22309 175 | CHEBI_47519 PR_P20813 176 | CHEBI_8985 PR_P08684 177 | CHEBI_47519 PR_Q01740 178 | CHEBI_7772 PR_Q07973 179 | CHEBI_47519 PR_O60656 180 | CHEBI_6717 PR_O60656 181 | CHEBI_7476 PR_P11712 182 | CHEBI_47519 PR_P04798 183 | CHEBI_5296 PR_P04798 184 | CHEBI_8228 PR_P06133 185 | CHEBI_5296 PR_P08684 186 | CHEBI_7936 PR_P08684 187 | CHEBI_47519 PR_P00352 188 | CHEBI_47519 PR_P11712 189 | PC_71306834 PR_P10635 190 | CHEBI_4031 PR_P11712 191 | CHEBI_7476 PR_P22309 192 | CHEBI_7476 PR_O60656 193 | SRS_JQ9EK2H6BG PR_P33261 194 | CHEBI_47519 PR_P20815 195 | CHEBI_4031 PR_P33261 196 | CHEBI_6923 PR_P33261 197 | CHEBI_8426 PR_P11712 198 | CHEBI_8426 PR_P10635 199 | CHEBI_47519 PR_P10632 200 | CHEBI_47519 PR_P51589 201 | CHEBI_63613 PR_P33261 202 | CHEBI_3396 PR_P54855 203 | CHEBI_7565 PR_P10635 204 | CHEBI_7772 PR_P22310 205 | CHEBI_45409 PR_P11712 206 | CHEBI_63613 PR_P08684 207 | CHEBI_5138 PR_P10635 208 | CHEBI_3732 
PR_P33261 209 | CHEBI_7476 PR_P10632 210 | CHEBI_8871 PR_P10635 211 | CHEBI_5778 PR_P11712 212 | CHEBI_28901 PR_P20813 213 | CHEBI_135735 PR_P08684 214 | CHEBI_28077 PR_P16662 215 | CHEBI_28821 PR_P10632 216 | CHEBI_28901 PR_P33261 217 | CHEBI_45409 PR_P20815 218 | CHEBI_4031 PR_P10632 219 | CHEBI_6539 PR_P08684 220 | CHEBI_8382 PR_P08684 221 | CHEBI_47519 PR_P16662 222 | CHEBI_63918 PR_P08684 223 | CHEBI_7476 PR_P16662 224 | CHEBI_triazolam PR_P08684 225 | CHEBI_8382 PR_P11712 226 | SRS_U946SH95EE PR_P08684 227 | CHEBI_28077 PR_P11712 228 | CHEBI_4659 PR_P05181 229 | CHEBI_6076 PR_Q01740 230 | CHEBI_6076 PR_Q99518 231 | CHEBI_8426 PR_O60656 232 | CHEBI_8228 PR_O60656 233 | CHEBI_28821 PR_P10635 234 | CHEBI_8228 PR_Q9HAW9 235 | CHEBI_28821 PR_P33261 236 | CHEBI_175901 PR_P08684 237 | CHEBI_28821 PR_P11712 238 | CHEBI_8426 PR_P06133 239 | CHEBI_7772 PR_P08684 240 | CHEBI_68595 PR_P20815 241 | CHEBI_7476 PR_P05177 242 | CHEBI_47519 PR_P78329 243 | CHEBI_45409 PR_P33261 244 | CHEBI_6923 PR_P05177 245 | CHEBI_42355 PR_P33261 246 | CHEBI_28077 PR_P35503 247 | CHEBI_7772 PR_P22309 248 | CHEBI_28177 PR_P05177 249 | CHEBI_8871 PR_P08684 250 | CHEBI_39867 PR_P08684 251 | CHEBI_28901 PR_P11712 252 | CHEBI_7565 PR_P33261 253 | CHEBI_499361 PR_P05177 254 | CHEBI_28821 PR_P05177 255 | CHEBI_7476 PR_P35503 256 | CHEBI_28821 PR_P11509 257 | CHEBI_8228 PR_P16662 258 | CHEBI_28821 PR_P05181 259 | CHEBI_46081 PR_P10632 260 | CHEBI_8382 PR_P05177 261 | CHEBI_28077 PR_P10632 262 | CHEBI_5296 PR_P11712 263 | CHEBI_45735 PR_P08684 264 | CHEBI_8772 PR_P08684 265 | CHEBI_3699 PR_P11509 266 | SRS_JQ9EK2H6BG PR_P08684 267 | CHEBI_47519 PR_P22310 268 | CHEBI_28901 PR_P08684 269 | CHEBI_7772 PR_P11712 270 | CHEBI_499361 PR_P10635 271 | CHEBI_6923 PR_P10635 272 | CHEBI_3396 PR_O60656 273 | CHEBI_45409 PR_P22309 274 | -------------------------------------------------------------------------------- /resources/resource_info.txt: 
-------------------------------------------------------------------------------- 1 | chemical-disease|:;MESH_;|class-class|RO_0002610|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|1;4|0:./resources/processed_data/MESH_CHEBI_MAP.txt;1:./resources/processed_data/DISEASE_MONDO_MAP.txt|5;==;marker/mechanism|None 2 | chemical-gene|;MESH_;|class-entity|RO_0002434|http://purl.obolibrary.org/obo/|http://www.ncbi.nlm.nih.gov/gene/|t|1;4|0:./resources/processed_data/MESH_CHEBI_MAP.txt|9;affects;not in x|6;==;Homo sapiens::5;.startswith('gene'); 3 | chemical-gobp|:;MESH_;GO_|class-class|RO_0002436|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|1;5|0:./resources/processed_data/MESH_CHEBI_MAP.txt|8;<=;1.04e-47|3;==;Biological Process 4 | chemical-gocc|:;MESH_;GO_|class-class|RO_0002436|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|1;5|0:./resources/processed_data/MESH_CHEBI_MAP.txt|8;<=;1.04e-47|3;==;Cellular Component 5 | chemical-gomf|:;MESH_;GO_|class-class|RO_0002436|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|1;5|0:./resources/processed_data/MESH_CHEBI_MAP.txt|8;<=;1.04e-47|3;==;Molecular Function 6 | chemical-pathway|;CHEBI_;|class-entity|RO_0000056|http://purl.obolibrary.org/obo/|https://reactome.org/content/detail/|t|0;1|None|None|5;==;Homo sapiens 7 | chemical-phenotype|:;MESH_;|class-class|RO_0002610|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|1;4|0:./resources/processed_data/MESH_CHEBI_MAP.txt;1:./resources/processed_data/PHENOTYPE_HPO_MAP.txt|5;==;marker/mechanism|None 8 | chemical-protein|;MESH_;|class-class|RO_0002434|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|1;4|0:./resources/processed_data/MESH_CHEBI_MAP.txt;1:./resources/processed_data/ENTREZ_GENE_PRO_ONTOLOGY_MAP.txt|9;affects;not in x|6;==;Homo sapiens::5;.startswith('protein'); 9 | 
disease-phenotype|:;;HP_|class-class|RO_0002200|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|0;3|0:./resources/processed_data/DISEASE_MONDO_MAP.txt|None|None|None 10 | gene-disease|;;|entity-class|RO_0003302|http://www.ncbi.nlm.nih.gov/gene/|http://purl.obolibrary.org/obo/|t|0;4|1:./resources/processed_data/DISEASE_MONDO_MAP.txt|10;>=;1.0|None 11 | gene-gene|;;|entity-entity|RO_0002435|http://www.ncbi.nlm.nih.gov/gene/|http://www.ncbi.nlm.nih.gov/gene/|t|0;1|0:./resources/processed_data/ENSEMBL_GENE_ENTREZ_GENE_MAP.txt;1:./resources/processed_data/ENSEMBL_GENE_ENTREZ_GENE_MAP.txt|None|None 12 | gene-pathway|:;;|entity-entity|RO_0000056|http://www.ncbi.nlm.nih.gov/gene/|https://reactome.org/content/detail/|t|1;3|None|None|3;.startswith('REACT:R-HSA-'); 13 | gene-phenotype|;;|entity-class|RO_0003302|http://www.ncbi.nlm.nih.gov/gene/|http://purl.obolibrary.org/obo/|t|0;4|1:./resources/processed_data/PHENOTYPE_HPO_MAP.txt|10;>=;1.0|None 14 | gene-protein|;;|entity-class|RO_0002205|http://www.ncbi.nlm.nih.gov/gene/|http://purl.obolibrary.org/obo/|t|0;1|None|None|4;==;protein-coding 15 | gene-rna|;;|entity-entity|RO_0002511|http://www.ncbi.nlm.nih.gov/gene/|https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?t=|t|0;1|None|None|None 16 | gobp-pathway|:;GO_;|class-entity|RO_0009501|http://purl.obolibrary.org/obo/|https://reactome.org/content/detail/|t|4;5|None|None|8;==;P::12;==;taxon:9606::5;.startswith('REACTOME'); 17 | pathway-gocc|:;;GO_|entity-class|RO_0002180|https://reactome.org/content/detail/|http://purl.obolibrary.org/obo/|t|5;4|None|None|8;==;C::12;==;taxon:9606::5;.startswith('REACTOME'); 18 | pathway-gomf|:;;GO_|entity-class|RO_0000085|https://reactome.org/content/detail/|http://purl.obolibrary.org/obo/|t|5;4|None|None|8;==;F::12;==;taxon:9606::5;.startswith('REACTOME'); 19 | 
protein-anatomy|;;|class-class|RO_0001025|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|2;5|0:./resources/processed_data/UNIPROT_ACCESSION_PRO_ONTOLOGY_MAP.txt;1:./resources/processed_data/HPA_GTEx_TISSUE_CELL_MAP.txt|None|3;==;Evidence at protein level::4;==;anatomy 20 | protein-catalyst|;;|class-class|RO_0002436|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|0;1|None|None|None|None 21 | protein-cell|;;|class-class|RO_0001025|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|2;5|0:./resources/processed_data/UNIPROT_ACCESSION_PRO_ONTOLOGY_MAP.txt;1:./resources/processed_data/HPA_GTEx_TISSUE_CELL_MAP.txt|None|3;==;Evidence at protein level::4;==;cell line 22 | protein-cofactor|;;|class-class|RO_0002436|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|0;1|None|None|None|None 23 | protein-gobp|:;;GO_|class-class|RO_0000056|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|1;4|0:./resources/processed_data/UNIPROT_ACCESSION_PRO_ONTOLOGY_MAP.txt|None|8;==;P::12;==;taxon:9606 24 | protein-gocc|:;;GO_|class-class|RO_0001025|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|1;4|0:./resources/processed_data/UNIPROT_ACCESSION_PRO_ONTOLOGY_MAP.txt|None|8;==;C::12;==;taxon:9606 25 | protein-gomf|:;;GO_|class-class|RO_0000085|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|1;4|0:./resources/processed_data/UNIPROT_ACCESSION_PRO_ONTOLOGY_MAP.txt|None|8;==;F::12;==;taxon:9606 26 | protein-pathway|;;|class-entity|RO_0000056|http://purl.obolibrary.org/obo/|https://reactome.org/content/detail/|t|0;1|0:./resources/processed_data/UNIPROT_ACCESSION_PRO_ONTOLOGY_MAP.txt|None|5;==;Homo sapiens 27 | protein-protein|9606.;;|class-class|RO_0002436|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|''|0;1|0:./resources/processed_data/STRING_PRO_ONTOLOGY_MAP.txt;1:./resources/processed_data/STRING_PRO_ONTOLOGY_MAP.txt|2;>=;700|None 28 | 
rna-anatomy|;;|entity-class|RO_0001025|https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?t=|http://purl.obolibrary.org/obo/|t|1;5|0:./resources/processed_data/GENE_SYMBOL_ENSEMBL_TRANSCRIPT_MAP.txt;1:./resources/processed_data/HPA_GTEx_TISSUE_CELL_MAP.txt|None|3;==;Evidence at transcript level::4;==;anatomy 29 | rna-cell|;;|entity-class|RO_0001025|https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?t=|http://purl.obolibrary.org/obo/|t|1;5|0:./resources/processed_data/GENE_SYMBOL_ENSEMBL_TRANSCRIPT_MAP.txt;1:./resources/processed_data/HPA_GTEx_TISSUE_CELL_MAP.txt|None|3;==;Evidence at transcript level::4;==;cell line 30 | rna-protein|;;|entity-class|RO_0002513|https://uswest.ensembl.org/Homo_sapiens/Transcript/Summary?t=|http://purl.obolibrary.org/obo/|t|0;1|None|None|4;==;protein-coding 31 | variant-disease|:;rs;|entity-class|RO_0003302|https://www.ncbi.nlm.nih.gov/snp/|http://purl.obolibrary.org/obo/|t|9;12|1:./resources/processed_data/DISEASE_MONDO_MAP.txt|24;in;["criteria provided, multiple submitters, no conflicts", "reviewed by expert panel", "practice guideline"]::7;==;1|9;!=;-1::16;==;GRCh38::8-9;dedup;desc 32 | variant-gene|;rs;|entity-entity|RO_0002566|https://www.ncbi.nlm.nih.gov/snp/|http://www.ncbi.nlm.nih.gov/gene/|t|9;3|None|24;in;["criteria provided, multiple submitters, no conflicts", "reviewed by expert panel", "practice guideline"]|9;!=;-1::3;!=;-1::16;==;GRCh38::8-9;dedup;desc 33 | variant-phenotype|:;rs;|entity-class|RO_0003302|https://www.ncbi.nlm.nih.gov/snp/|http://purl.obolibrary.org/obo/|t|9;12|1:./resources/processed_data/PHENOTYPE_HPO_MAP.txt|24;in;["criteria provided, multiple submitters, no conflicts", "reviewed by expert panel", "practice guideline"]::7;==;1|9;!=;-1::16;==;GRCh38::8-9;dedup;desc 34 | chemical-inhibitor|;;|class-class|RO_0002449|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|0;1|None|None|None|None 35 | 
chemical-molecule|;;|class-class|RO_0002436|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|0;1|None|None|None|None 36 | transporter-chemical|;;|class-class|RO_0002020|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|1;0|None|None|None|None 37 | chemical-substrate|;;|class-class|DIDEO_00000041|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|0;1|None|None|None|None 38 | chemical-indication|;;|class-class|RO_0002606|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|0;1|None|None|None|None 39 | chemical-adr|;;|class-class|RO_0003302|http://purl.obolibrary.org/obo/|http://purl.obolibrary.org/obo/|t|0;1|None|None|None|None 40 | -------------------------------------------------------------------------------- /relation-extraction-scripts/semrep_process_pmid.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Script to extract predications from SemRep for full texts of articles (already processed from PDF to text) 3 | MetaMap, WordSenseDisambiguation Server, and SemRep are running locally. 
4 | ''' 5 | import sys, os 6 | from datetime import datetime, timedelta 7 | import pandas as pd 8 | import subprocess 9 | from nltk import tokenize 10 | import requests 11 | import time 12 | from pdf_to_text import read_PDF_file, process_PDF_file 13 | import logging 14 | 15 | workingDir = os.getcwd() 16 | log_dir = workingDir + '/logs/' 17 | utf_path = 'replace_utf8.jar' 18 | 19 | np = ['list of NPs'] 20 | 21 | extraction = True 22 | 23 | count_dict = { 24 | 'n_total_pmid': 0, 25 | 'n_success': 0, 26 | 'n_error': 0, 27 | 'n_statements':0, 28 | 'n_files_processed': 0, 29 | 'n_pdf': 0 30 | } 31 | 32 | start_pmid = int(sys.argv[1]) 33 | end_pmid = int(sys.argv[2]) 34 | 35 | pub_year_to_pmid_map = {} 36 | pub_type_to_pmid_map = {} 37 | 38 | section_tags = ['', '', '', '', '', '', '', 39 | '', ''] 40 | 41 | def read_and_write_file(file, filepath_in, filepath_out): 42 | filepath = filepath_in+file 43 | 44 | fileascii = filepath_out+file.split('_')[0]+'_ascii.txt' 45 | convert_to_ascii = subprocess.run(["java", "-jar", utf_path, filepath], capture_output=True, text=True) 46 | if convert_to_ascii: 47 | result = convert_to_ascii.stdout 48 | sentences = tokenize.sent_tokenize(result) 49 | fileo = open(fileascii, 'w', encoding='ascii', errors='backslashreplace') 50 | for item in sentences: 51 | if sentences.index(item) % 5 == 0: 52 | fileo.write('\n\n') 53 | if len(item) > 1000: 54 | fileo.write('\n\n') 55 | #continue 56 | fileo.write(str(item)) 57 | fileo.write('\n') 58 | fileo.close() 59 | count_dict['n_files_processed'] += 1 60 | return fileascii 61 | else: 62 | return None 63 | 64 | def process_with_semrep(infile, outfile): 65 | try: 66 | result = subprocess.run(['/usr/local/bin/semrep.v1.8', '-L', '2018', '-Z', '2018AA', infile, outfile], check=True, timeout=1800) 67 | 68 | return result 69 | except Exception as e: 70 | logging.info('SemRep error in processing %s', str(e)) 71 | return None 72 | 73 | def get_publication_year_and_type(pmid): 74 | pub_year = '' 75 | 
def get_publication_year_and_type(pmid):
    """Look up the publication date and publication type(s) of a PubMed ID.

    Results are memoised in the module-level ``pub_year_to_pmid_map`` and
    ``pub_type_to_pmid_map``; a cached entry is reused only when its date is
    non-empty. Otherwise the NCBI ESummary endpoint is queried after a
    5-second pause, with one retry after another 5 seconds on HTTP 429.

    Parameters
    ----------
    pmid : str
        PubMed identifier; an empty string short-circuits the lookup.

    Returns
    -------
    tuple of (str, str)
        ``(pubdate, str(pubtype))`` as reported by ESummary. Both are empty
        strings when ``pmid`` is empty, the request fails, or the response
        has no usable record for ``pmid``.
        NOTE(review): the first element is the raw ``pubdate`` value, not
        necessarily a bare year - confirm what downstream code expects.
    """
    pub_year = ''
    pub_type = ''
    if pmid == '':
        return pub_year, pub_type

    cached_year = pub_year_to_pmid_map.get(pmid, '')
    if cached_year != '' and pmid in pub_type_to_pmid_map:
        return cached_year, str(pub_type_to_pmid_map[pmid])

    # pause before every remote call
    time.sleep(5)
    uri = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {'db': 'pubmed', 'id': pmid, 'retmode': 'json'}
    try:
        # a timeout keeps one stalled connection from hanging the whole run
        response = requests.get(uri, params=params, timeout=60)
        if response.status_code == 429:
            time.sleep(5)
            response = requests.get(uri, params=params, timeout=60)
        if response.status_code != 200:
            return pub_year, str(pub_type)
        record = response.json().get('result', {}).get(pmid)
    except (requests.RequestException, ValueError) as e:
        # metadata is best-effort: a network or JSON failure must not abort extraction
        logging.info('Publication lookup failed for PMID %s: %s', pmid, str(e))
        return pub_year, str(pub_type)

    if isinstance(record, dict):
        # .get() tolerates records that lack these fields instead of raising KeyError
        pub_year = record.get('pubdate', '')
        pub_type = record.get('pubtype', '')
        pub_year_to_pmid_map[pmid] = pub_year
        pub_type_to_pmid_map[pmid] = pub_type
    return pub_year, str(pub_type)
def semrep_extract(filepath):
    """Collect SemRep relation lines from every output file in a directory.

    Each regular file in ``filepath`` is read line by line. A line containing
    ``|relation|`` is treated as a predication and paired with the most recent
    non-blank, non-relation line (its source sentence) and the most recently
    matched entry of the module-level ``section_tags``. The PMID is the file
    name up to its first dot.

    Parameters
    ----------
    filepath : str
        Directory holding SemRep output files (trailing separator optional).

    Returns
    -------
    dict of str -> list
        Column-oriented result with keys ``index``, ``pmid``, ``relation``,
        ``year``, ``subject_cui``, ``object_cui``, ``subject_name``,
        ``object_name``, ``subject_type``, ``object_type``, ``sentence``,
        ``source_section`` and ``pub_type``; all lists have equal length.

    Side effects
    ------------
    Adds the number of relations emitted to ``count_dict['n_statements']``
    and may trigger one publication-metadata lookup per file.
    """
    columns = ('index', 'pmid', 'relation', 'year', 'subject_cui', 'object_cui',
               'subject_name', 'object_name', 'subject_type', 'object_type',
               'sentence', 'source_section', 'pub_type')
    result_dict = {column: [] for column in columns}
    index = 0

    # sorted() makes the assigned index values reproducible between runs
    for file in sorted(os.listdir(filepath)):
        path = os.path.join(filepath, file)
        if not os.path.isfile(path):
            # sub-directories cannot be opened as text
            continue
        pmid = file.split('.')[0]
        with open(path, 'r', errors='ignore') as file_sem:
            lines = file_sem.readlines()

        # (relation line, source sentence, section) kept together so duplicate
        # relation lines keep their own sentence and section
        relations = []
        last_non_empty = ''
        section_match = ''
        for item in lines:
            tag = next((sec for sec in section_tags if sec in item), None)
            if tag is not None:
                section_match = tag
            if '|relation|' in item:
                relations.append((item, last_non_empty, section_match))
            elif item == '\n' or item == '':
                continue
            else:
                last_non_empty = item

        n_emitted = 0
        pub_year = pub_type = None
        for rel, sentence, section in relations:
            fields = rel.split('|')
            # indices up to 11 are read below; skip short lines before any
            # column is appended so the columns never become ragged
            if len(fields) < 12:
                continue
            if pub_year is None:
                # one lookup per file: the PMID is the same for every relation in it
                pub_year, pub_type = get_publication_year_and_type(pmid)
            result_dict['index'].append(index)
            result_dict['pmid'].append(pmid)
            result_dict['subject_cui'].append(fields[2])
            result_dict['object_cui'].append(fields[9])
            result_dict['subject_name'].append(fields[3])
            result_dict['object_name'].append(fields[10])
            result_dict['relation'].append(fields[8])
            result_dict['subject_type'].append(fields[4])
            result_dict['object_type'].append(fields[11])
            result_dict['year'].append(pub_year)
            result_dict['pub_type'].append(pub_type)
            result_dict['sentence'].append(sentence)
            result_dict['source_section'].append(section)
            index += 1
            n_emitted += 1

        if n_emitted:
            # count relations, not the number of keys in a bookkeeping dict
            count_dict['n_statements'] += n_emitted

    return result_dict
if __name__ == '__main__':
    # Driver: for every natural product in ``np``, make sure each requested PMID
    # has plain text (extracting it from a PDF when needed), convert it to ASCII,
    # run SemRep on it, and optionally collect all predications into one TSV.

    for item in np:
        inputPMID_file = workingDir + '/input_files/' + item + '/' + item + '_pmid.txt'
        #inputPMID_file = workingDir + '/input_files/' + item + '/' + item + '_pmid_PDF.txt'
        inputDirRaw = workingDir + '/output_files/' + item + '/PDFoutput/'
        inputDir = workingDir + '/output_files/'+item+'/semrepInput/'
        outputDir = workingDir + '/output_files/'+item+'/semrepOutput/'
        PDF_inputDir = workingDir + '/input_files/'+item+'/FullTextPDFs/'
        PDF_outputDir = inputDirRaw

        # the directories listed or written below must exist
        for directory in (log_dir, inputDirRaw, inputDir, outputDir):
            os.makedirs(directory, exist_ok=True)

        t0=datetime.now()

        log_file = log_dir+item + '_semrep_log'+str(t0)+'.txt'
        # basicConfig() does nothing once the root logger has handlers, so drop
        # the previous product's handlers to give each product its own log file
        for handler in logging.root.handlers[:]:
            logging.root.removeHandler(handler)
            handler.close()
        logging.basicConfig(filename=log_file, filemode='a', level=logging.INFO)
        logging.info('Log for %s. PMIDS: %s to %s', item, str(start_pmid), str(end_pmid))

        with open(inputPMID_file, 'r') as file_input:
            pmids = file_input.readlines()

        # never index past the end of the PMID list
        last_line = min(end_pmid, len(pmids))
        if last_line < end_pmid:
            logging.info('Requested end %s exceeds the %s PMIDs available; stopping at %s',
                         str(end_pmid), str(len(pmids)), str(last_line))

        for line_no in range(start_pmid, last_line):
            pmid = pmids[line_no].strip()
            if not pmid:
                # blank line in the PMID file
                continue
            logging.info('\n\nProcessing PMID: %s', pmid)
            count_dict['n_total_pmid'] += 1
            file = str(pmid) + '_processed.txt'
            file_alternate = str(pmid) + '.txt'

            text_files = os.listdir(inputDirRaw)
            if file not in text_files and file_alternate not in text_files:
                logging.info('\nText unavailable: %s', file_alternate)
                #extract text from PDF
                logging.info('\nChecking for PDF:')
                pdf_in_file = PDF_inputDir+pmid+'.pdf'
                if os.path.isfile(pdf_in_file):
                    logging.info('\nPDF found')
                    count_dict['n_pdf'] += 1
                    pdf_out_file = PDF_outputDir+pmid+'.txt'
                    pdf_txt_file = PDF_outputDir+pmid+'_processed.txt'
                    pdf_text_val = read_PDF_file(pdf_in_file, pdf_out_file)
                    if pdf_text_val:
                        process_PDF_file(pdf_out_file, pdf_txt_file)
                    else:
                        count_dict['n_error'] += 1
                        logging.info('\nUnable to extract from PDF: %s',pmid)
                else:
                    logging.info('\nPDF unavailable')

                #update to include new files
                text_files = os.listdir(inputDirRaw)

            if file not in text_files:
                file = file_alternate

            if file in text_files:
                ascii_files = os.listdir(inputDir)
                fileascii = file.split('_')[0]+'_ascii.txt'
                if fileascii in ascii_files:
                    # reuse an ASCII conversion from an earlier run
                    filename = inputDir+fileascii
                else:
                    filename = read_and_write_file(file, inputDirRaw, inputDir)
                if filename is not None:
                    semrep_process = process_with_semrep(filename, outputDir+file.split('_')[0]+'.txt')
                    if semrep_process is not None:
                        logging.info('\nFile processed: %s', file)
                        count_dict['n_success'] += 1
                    else:
                        logging.info('\nError in processing: %s',file)
                        count_dict['n_error'] += 1
                else:
                    logging.info('\nText unavailable: %s',file)
                    count_dict['n_error'] +=1
            else:
                logging.info('\nPDF file unavailable: %s', file)

        if extraction:
            result_dict = semrep_extract(outputDir)

            semrep_result = pd.DataFrame(result_dict)
            semrep_result_unique = semrep_result.drop_duplicates(subset=['subject_cui', 'subject_name', 'subject_type',
                'relation', 'object_cui', 'object_name', 'object_type', 'year', 'sentence', 'source_section', 'pub_type'])
            semrep_result_unique.to_csv(workingDir+'/output_files/'+item+'/' +item+'_pmid_all_predicates_semrep-'+str(start_pmid)+'-'+str(end_pmid)+'.tsv', sep='\t', index=False,
                columns=['index', 'pmid', 'subject_cui', 'subject_name', 'subject_type',
                'relation', 'object_cui', 'object_name', 'object_type', 'year', 'sentence', 'source_section', 'pub_type'])

        t1 = datetime.now()
        seconds = (t1 - t0).total_seconds()
        # NOTE(review): count_dict is never reset, so with several products these
        # totals are cumulative across products - confirm that is intended.
        logging.info('\nTotal time: %s seconds', str(seconds))
        logging.info('\nPMIDs: %s to %s', str(start_pmid), str(end_pmid-1))
        logging.info('\nTotal PMIDs: %s',str(count_dict['n_total_pmid']))
        logging.info('\nN_file_hits: %s',str(count_dict['n_files_processed']))
        logging.info('\nN_semrep_hits: %s',str(count_dict['n_success']))
        logging.info('\nN_errors: %s',str(count_dict['n_error']))
        logging.info('\nN_statements: %s',str(count_dict['n_statements']))

        # stop any SemRep processes that are still running
        subprocess.run(['pkill', 'semrep'], check=False)

        logging.info('\nTerminated subprocesses')
[], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import pickle\n", 11 | "from rdflib import Graph, URIRef, BNode, Namespace, Literal" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "Create string to OBO identifier dictionary to search in KG for all natural product nodes and constituents. Also included are some common enzymes, transporters and drugs." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "PATHIN = '../resources/'" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "667" 39 | ] 40 | }, 41 | "execution_count": 3, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "##open existing dictionary\n", 48 | "with open(PATHIN+'strToOBOdict_v2.0.0.pickle', 'rb') as filep:\n", 49 | " obodict = pickle.load(filep)\n", 50 | "len(obodict)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/html": [ 61 | "
\n", 62 | "\n", 75 | "\n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | "
constituent_nameURI
012beta-acetoxycimigenol 3-o-beta-d-xylopyranosidehttp://napdi.org/napdi_srs_imports:12beta_acet...
12'-o-acetylacteinhttp://napdi.org/napdi_srs_imports:2_o_acetyla...
22'-o-acetylcimicifugoside h1http://napdi.org/napdi_srs_imports:2_o_acetylc...
323-epi-26-deoxyacteinhttp://purl.obolibrary.org/obo/CHEBI_70243
423-o-acetylshengmanolhttp://napdi.org/napdi_srs_imports:23_o_acetyl...
\n", 111 | "
" 112 | ], 113 | "text/plain": [ 114 | " constituent_name \\\n", 115 | "0 12beta-acetoxycimigenol 3-o-beta-d-xylopyranoside \n", 116 | "1 2'-o-acetylactein \n", 117 | "2 2'-o-acetylcimicifugoside h1 \n", 118 | "3 23-epi-26-deoxyactein \n", 119 | "4 23-o-acetylshengmanol \n", 120 | "\n", 121 | " URI \n", 122 | "0 http://napdi.org/napdi_srs_imports:12beta_acet... \n", 123 | "1 http://napdi.org/napdi_srs_imports:2_o_acetyla... \n", 124 | "2 http://napdi.org/napdi_srs_imports:2_o_acetylc... \n", 125 | "3 http://purl.obolibrary.org/obo/CHEBI_70243 \n", 126 | "4 http://napdi.org/napdi_srs_imports:23_o_acetyl... " 127 | ] 128 | }, 129 | "execution_count": 5, 130 | "metadata": {}, 131 | "output_type": "execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "##read in TSV file with NP names and constituents\n", 136 | "filename = 'ontology-extensions/chebi-extensions-constituents-NP-20240229.tsv'\n", 137 | "df = pd.read_csv(PATHIN+filename, sep='\\t')\n", 138 | "df.head()" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 6, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "671" 150 | ] 151 | }, 152 | "execution_count": 6, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "obodict_new = {}\n", 159 | "for i in range(len(df.index)):\n", 160 | " name = df.at[i, 'constituent_name']\n", 161 | " uri = df.at[i, 'URI']\n", 162 | " uriref = URIRef(uri)\n", 163 | " obodict_new[name] = uriref\n", 164 | "len(obodict_new)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 7, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "rdflib.term.URIRef('http://purl.obolibrary.org/obo/CHEBI_90')" 176 | ] 177 | }, 178 | "execution_count": 7, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "obodict_new['(-)-epicatechin']" 185 | ] 186 | }, 187 | { 188 | 
"cell_type": "code", 189 | "execution_count": 8, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "data": { 194 | "text/plain": [ 195 | "rdflib.term.URIRef('http://napdi.org/napdi_srs_imports:camellia_sinensis')" 196 | ] 197 | }, 198 | "execution_count": 8, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "obodict_new['camellia sinensis']" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 9, 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/plain": [ 215 | "701" 216 | ] 217 | }, 218 | "execution_count": 9, 219 | "metadata": {}, 220 | "output_type": "execute_result" 221 | } 222 | ], 223 | "source": [ 224 | "prots = ['CYP3A4', 'CYP3A5', 'CYP1A2', 'CYP2C8', 'CYP1A1', 'CYP2E1', 'CYP2C19', 'CYP2D6', 'CYP2B6', 'CYP2C9', 'UGT', 'UGT1A1', 'UGT1A4', 'UGT1A6', 'UGT1A8', 'UGT1A10', 'UGT1A9', 'UGT2B7', 'PGP', 'SULT1A1', 'SULT1A3', 'MATE1', 'MATE2K', 'OCT1', 'OCT2', 'OATP1B1', 'OATP1A2', 'OATP1B3', 'CYP2A6', 'glycoprotein']\n", 225 | "for prot in prots:\n", 226 | " obodict_new[prot] = obodict[prot]\n", 227 | "len(obodict_new)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 10, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "##save pickle\n", 237 | "with open(PATHIN+'strToOBOdict_v3.0.0.pickle', 'wb') as filep2:\n", 238 | " pickle.dump(obodict_new, filep2)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 11, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "text/plain": [ 249 | "701" 250 | ] 251 | }, 252 | "execution_count": 11, 253 | "metadata": {}, 254 | "output_type": "execute_result" 255 | } 256 | ], 257 | "source": [ 258 | "## add more PK nodes\n", 259 | "with open(PATHIN+'strToOBOdict_v3.0.0.pickle', 'rb') as filep2:\n", 260 | " obodict = pickle.load(filep2)\n", 261 | "len(obodict)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 12, 
267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "obo_prefix = 'http://purl.obolibrary.org/obo/'\n", 271 | "obodict['ABCG2'] = [URIRef(obo_prefix+'PR_Q9UNQ0'), URIRef(obo_prefix+'PR_000001283'), URIRef(obo_prefix+'PR_000041383')]\n", 272 | "obodict['SLCO1B1'] = [URIRef(obo_prefix+'PR_Q9Y6L6'), URIRef(obo_prefix+'PR_000015223')]\n", 273 | "obodict['SLCO1B3'] = [URIRef(obo_prefix+'PR_Q9NPD5'), URIRef(obo_prefix+'PR_000015224')]\n", 274 | "obodict['SLC22A6'] = [URIRef(obo_prefix+'PR_Q4U2R8'), URIRef(obo_prefix+'PR_000014993')]\n", 275 | "obodict['SLC22A8'] = [URIRef(obo_prefix+'PR_Q8TCC7'), URIRef(obo_prefix+'PR_000014995')]\n", 276 | "obodict['SLC47A2'] = [URIRef(obo_prefix+'PR_Q86VL8'), URIRef(obo_prefix+'PR_000015153')]\n", 277 | "obodict['SLC47A1'] = [URIRef(obo_prefix+'PR_Q96FL8'), URIRef(obo_prefix+'PR_000015152')]\n", 278 | "obodict['SLC22A2'] = [URIRef(obo_prefix+'PR_O15244'), URIRef(obo_prefix+'PR_000014988')]" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 13, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "##save obodict\n", 288 | "with open(PATHIN+'strToOBOdict_v3.0.0.pickle', 'wb') as filep2:\n", 289 | " pickle.dump(obodict, filep2)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [] 298 | } 299 | ], 300 | "metadata": { 301 | "kernelspec": { 302 | "display_name": "Python 3.8.0 ('kg-embed')", 303 | "language": "python", 304 | "name": "python3" 305 | }, 306 | "language_info": { 307 | "codemirror_mode": { 308 | "name": "ipython", 309 | "version": 3 310 | }, 311 | "file_extension": ".py", 312 | "mimetype": "text/x-python", 313 | "name": "python", 314 | "nbconvert_exporter": "python", 315 | "pygments_lexer": "ipython3", 316 | "version": "3.6.13" 317 | }, 318 | "orig_nbformat": 4, 319 | "vscode": { 320 | "interpreter": { 321 | "hash": "96e4f617bfd8d41f7b09e4ea5db07380f9a5a2c21add6996766b0d6e42ab68c1" 
322 | } 323 | } 324 | }, 325 | "nbformat": 4, 326 | "nbformat_minor": 2 327 | } 328 | -------------------------------------------------------------------------------- /resources/data/CHEMICAL_MOLECULE.tsv: -------------------------------------------------------------------------------- 1 | CHEBI_8426 PR_Q9UNQ0 2 | CHEBI_35553 PR_P10635 3 | CHEBI_6402 PR_P05177 4 | CHEBI_7465 PR_P10635 5 | CHEBI_6923 PR_P33261 6 | CHEBI_64318 PR_P08684 7 | CHEBI_10100 PR_P11712 8 | CHEBI_8461 PR_P10635 9 | CHEBI_7459 PR_P10635 10 | CHEBI_8426 PR_Q96FL8 11 | CHEBI_7565 PR_P05177 12 | CHEBI_8673 PR_P11712 13 | CHEBI_3238 PR_P10635 14 | CHEBI_7582 PR_P33261 15 | CHEBI_47381 PR_O60656 16 | CHEBI_47381 PR_P19224 17 | CHEBI_45980 PR_P10635 18 | CHEBI_135590 PR_P10635 19 | CHEBI_45285 PR_P08684 20 | CHEBI_48574 PR_P11712 21 | CHEBI_6541 PR_P11712 22 | CHEBI_85078 PR_P31639 23 | CHEBI_7565 PR_P08684 24 | CHEBI_31859 PR_P05177 25 | CHEBI_39548 PR_P08684 26 | CHEBI_6919 PR_P11712 27 | CHEBI_9249 PR_P33261 28 | CHEBI_8081 PR_P10635 29 | CHEBI_37924 PR_P11712 30 | CHEBI_135737 PR_P08684 31 | CHEBI_9948 PR_Q9UNQ0 32 | CHEBI_72291 PR_P08684 33 | CHEBI_50692 PR_P08684 34 | CHEBI_64216 PR_P10635 35 | CHEBI_3219 PR_P33261 36 | CHEBI_9325 PR_P11712 37 | CHEBI_68639 PR_P19099 38 | CHEBI_2611 PR_P08684 39 | CHEBI_63619 PR_P10635 40 | CHEBI_83305 NCBITaxon_1201292 41 | CHEBI_6923 PR_P08684 42 | CHEBI_6030 PR_P08684 43 | CHEBI_77069 PR_P10635 44 | CHEBI_5818 PR_P10635 45 | CHEBI_8499 PR_P10635 46 | CHEBI_9168 PR_P33261 47 | CHEBI_8426 PR_Q9HAW7 48 | CHEBI_7582 PR_P11712 49 | CHEBI_6030 PR_P20815 50 | CHEBI_4636 PR_P10635 51 | CHEBI_47519 PR_P05108 52 | CHEBI_7550 PR_P11712 53 | CHEBI_63632 PR_P08684 54 | CHEBI_6375 PR_P33261 55 | CHEBI_31687 PR_P10635 56 | CHEBI_4659 PR_P05177 57 | CHEBI_4562 PR_P08684 58 | CHEBI_32215 PR_P33261 59 | CHEBI_46081 PR_P08684 60 | CHEBI_6690 PR_P08684 61 | CHEBI_4513 PR_P11712 62 | CHEBI_47519 PR_Q07973 63 | CHEBI_7789 PR_P10635 64 | CHEBI_4910 PR_P15538 65 | CHEBI_9448 
PR_P10635 66 | CHEBI_44241 PR_P19099 67 | CHEBI_7575 PR_P08684 68 | CHEBI_31298 PR_P10635 69 | CHEBI_10093 PR_P10635 70 | CHEBI_9325 PR_P05177 71 | CHEBI_6375 PR_P10635 72 | CHEBI_5693 PR_P33261 73 | CHEBI_45409 PR_P10635 74 | CHEBI_6343 PR_P10635 75 | CHEBI_52717 PR_P33261 76 | CHEBI_37924 PR_P22309 77 | CHEBI_3215 PR_P10635 78 | CHEBI_3023 PR_P11712 79 | CHEBI_132922 PR_P11712 80 | CHEBI_9588 PR_P11509 81 | CHEBI_3545 PR_P08684 82 | CHEBI_3738 PR_P10635 83 | CHEBI_9753 PR_P11712 84 | CHEBI_10033 PR_P11712 85 | CHEBI_4315 PR_P11712 86 | CHEBI_6723 PR_P11712 87 | CHEBI_6532 PR_P10635 88 | CHEBI_3223 PR_P08684 89 | CHEBI_28593 PR_P10635 90 | CHEBI_44915 PR_P11712 91 | CHEBI_45409 PR_P08684 92 | CHEBI_50305 PR_P11712 93 | CHEBI_5062 PR_P33261 94 | CHEBI_6715 PR_P33261 95 | CHEBI_2663 PR_P08684 96 | CHEBI_47519 PR_P08686 97 | CHEBI_3348 PR_P11712 98 | CHEBI_6538 PR_P33261 99 | CHEBI_9249 PR_P10635 100 | CHEBI_9566 PR_P10635 101 | CHEBI_8772 PR_P11712 102 | CHEBI_223316 PR_P05177 103 | CHEBI_45409 PR_P11712 104 | CHEBI_3764 PR_P08684 105 | CHEBI_9325 PR_P33261 106 | CHEBI_6923 PR_P10635 107 | CHEBI_31687 PR_P11712 108 | CHEBI_4754 PR_P05177 109 | CHEBI_47780 PR_P10635 110 | CHEBI_46081 PR_P33261 111 | CHEBI_4877 PR_P08684 112 | CHEBI_7772 PR_P33261 113 | CHEBI_9948 PR_P08684 114 | CHEBI_107736 PR_P10635 115 | CHEBI_9241 PR_P33261 116 | CHEBI_9588 PR_P20813 117 | CHEBI_135370 PR_P10635 118 | CHEBI_480999 PR_P08684 119 | CHEBI_32223 PR_P33261 120 | CHEBI_6413 PR_P19099 121 | CHEBI_7550 PR_P08684 122 | CHEBI_5132 PR_P05177 123 | CHEBI_5062 PR_P11712 124 | CHEBI_10023 PR_P11712 125 | CHEBI_37924 PR_P05177 126 | CHEBI_47381 PR_Q9HAW8 127 | CHEBI_5138 PR_P05177 128 | CHEBI_3720 PR_P08684 129 | CHEBI_585948 PR_P11712 130 | CHEBI_135737 PR_P11712 131 | CHEBI_83326 NCBITaxon_46170 132 | CHEBI_3647 PR_P10635 133 | CHEBI_3699 PR_P08183 134 | CHEBI_7575 PR_P11712 135 | CHEBI_31401 PR_P11712 136 | CHEBI_31578 PR_P33261 137 | CHEBI_3374 PR_P05177 138 | CHEBI_50305 PR_P08684 139 | 
CHEBI_3738 PR_P33261 140 | CHEBI_9671 PR_P10635 141 | CHEBI_47381 PR_Q9HAW7 142 | CHEBI_50730 PR_P10632 143 | CHEBI_9757 PR_P33261 144 | CHEBI_51164 PR_P10635 145 | CHEBI_6923 PR_P11712 146 | CHEBI_83305 NCBITaxon_1313 147 | CHEBI_7772 PR_P05177 148 | CHEBI_10100 PR_P08684 149 | CHEBI_7772 PR_P11712 150 | CHEBI_31687 PR_P33261 151 | CHEBI_4754 PR_P33261 152 | CHEBI_4031 PR_Q5ABU7 153 | CHEBI_8871 PR_P10635 154 | CHEBI_9453 PR_P51589 155 | CHEBI_18358 PR_P05177 156 | CHEBI_5693 PR_P05177 157 | CHEBI_47519 PR_P22680 158 | CHEBI_3090 PR_P33261 159 | CHEBI_77032 PR_P10635 160 | CHEBI_68639 PR_P05093 161 | CHEBI_31515 PR_P10635 162 | CHEBI_38561 PR_P11712 163 | CHEBI_8499 PR_P05177 164 | CHEBI_47519 PR_P05093 165 | CHEBI_7825 PR_P33261 166 | CHEBI_4754 PR_P11712 167 | CHEBI_4953 PR_P08183 168 | CHEBI_6919 PR_P10632 169 | CHEBI_45285 PR_P20815 170 | CHEBI_40009 PR_P08684 171 | CHEBI_6874 PR_P10635 172 | CHEBI_4877 PR_P20815 173 | CHEBI_5118 PR_P10635 174 | CHEBI_127342 PR_P10635 175 | CHEBI_83305 NCBITaxon_46170 176 | CHEBI_27953 PR_P10635 177 | CHEBI_6413 PR_P15538 178 | CHEBI_61049 PR_P08684 179 | CHEBI_9362 PR_P11712 180 | CHEBI_52010 PR_P10635 181 | CHEBI_7496 PR_P08684 182 | CHEBI_3994 PR_P10635 183 | CHEBI_8060 PR_P11712 184 | CHEBI_37941 PR_P20813 185 | CHEBI_44032 PR_P08684 186 | CHEBI_6820 PR_P10635 187 | CHEBI_47381 PR_P22309 188 | CHEBI_47519 PR_P19099 189 | CHEBI_2687 PR_P10635 190 | CHEBI_27882 PR_P08684 191 | CHEBI_9757 PR_P11712 192 | CHEBI_9652 PR_P11509 193 | CHEBI_4754 PR_P10635 194 | CHEBI_3764 PR_P33261 195 | CHEBI_7825 PR_P10635 196 | CHEBI_4910 PR_P19099 197 | CHEBI_37924 PR_P08684 198 | CHEBI_9588 PR_P33261 199 | CHEBI_101278 PR_P08684 200 | CHEBI_31624 PR_P33261 201 | CHEBI_46081 PR_P11712 202 | CHEBI_10033 PR_P08684 203 | CHEBI_7577 PR_P05177 204 | CHEBI_6762 PR_P10635 205 | CHEBI_68639 PR_P08684 206 | CHEBI_4754 PR_P08684 207 | CHEBI_47519 PR_P15538 208 | CHEBI_63617 PR_P10635 209 | CHEBI_253342 PR_P05177 210 | CHEBI_3720 PR_P10635 211 | 
CHEBI_24564 PR_P05177 212 | CHEBI_681848 PR_P10635 213 | CHEBI_8060 PR_P10635 214 | CHEBI_44241 PR_P15538 215 | CHEBI_31360 PR_P10635 216 | CHEBI_41423 PR_P10635 217 | CHEBI_6149 PR_P10635 218 | CHEBI_7862 PR_P10635 219 | CHEBI_136007 PR_P10635 220 | CHEBI_3348 PR_P08684 221 | CHEBI_47519 PR_Q16850 222 | CHEBI_4325 PR_P08684 223 | CHEBI_3125 PR_P11712 224 | CHEBI_8405 PR_P05177 225 | CHEBI_9123 PR_P33261 226 | CHEBI_3764 PR_P11712 227 | CHEBI_9566 PR_P33261 228 | CHEBI_77780 PR_P11712 229 | CHEBI_553827 PR_P10635 230 | CHEBI_7494 PR_P08684 231 | CHEBI_6920 PR_P08684 232 | CHEBI_7825 PR_P08684 233 | CHEBI_45979 PR_P05177 234 | CHEBI_63621 PR_P08684 235 | CHEBI_7936 PR_P10635 236 | CHEBI_7856 PR_P33261 237 | CHEBI_6076 PR_P08684 238 | CHEBI_9753 PR_P19224 239 | CHEBI_40050 PR_P08684 240 | CHEBI_10023 PR_P33261 241 | CHEBI_50305 PR_P33261 242 | CHEBI_9249 PR_P11712 243 | CHEBI_48273 PR_P10635 244 | CHEBI_8060 PR_P08684 245 | CHEBI_5296 PR_P11712 246 | CHEBI_7640 PR_P33261 247 | CHEBI_2675 PR_P10635 248 | CHEBI_31687 PR_P08684 249 | CHEBI_4026 PR_P08684 250 | CHEBI_6919 PR_P08684 251 | CHEBI_7825 PR_P11712 252 | CHEBI_47519 PR_P08684 253 | CHEBI_9325 PR_P10635 254 | CHEBI_9168 PR_P11712 255 | CHEBI_68639 PR_P15538 256 | CHEBI_5296 PR_P10632 257 | CHEBI_28462 PR_P10635 258 | CHEBI_7936 PR_P05177 259 | CHEBI_3753 PR_P10635 260 | CHEBI_10023 PR_P08684 261 | CHEBI_3181 PR_P08684 262 | CHEBI_5441 PR_P11712 263 | CHEBI_9720 PR_P10635 264 | CHEBI_9620 PR_P33261 265 | CHEBI_3545 PR_P20815 266 | CHEBI_5296 PR_P08183 267 | CHEBI_9325 PR_P08684 268 | CHEBI_4031 PR_P08684 269 | CHEBI_9168 PR_P08684 270 | CHEBI_2896 PR_P08684 271 | CHEBI_49575 PR_P08684 272 | CHEBI_7575 PR_P33261 273 | CHEBI_9566 PR_P05177 274 | CHEBI_3545 PR_P10635 275 | CHEBI_17698 PR_P08684 276 | CHEBI_8060 PR_P10632 277 | CHEBI_7443 PR_P05177 278 | CHEBI_50199 PR_P05177 279 | CHEBI_135737 PR_P33261 280 | CHEBI_63619 PR_P05177 281 | CHEBI_8435 PR_P05177 282 | CHEBI_44241 PR_P08684 283 | CHEBI_8459 PR_P10635 284 
| CHEBI_9588 PR_P10635 285 | CHEBI_9546 PR_P10635 286 | CHEBI_8435 PR_P10635 287 | CHEBI_8060 PR_P33261 288 | CHEBI_17688 PR_P11509 289 | CHEBI_4672 PR_P08684 290 | CHEBI_8426 PR_O15245 291 | CHEBI_63613 PR_Q14242 292 | CHEBI_8426 PR_Q8TCC7 293 | CHEBI_47519 PR_Q8TCC7 294 | CHEBI_4031 PR_Q9NPD5 295 | CHEBI_6717 PR_P08183 296 | CHEBI_47519 PR_Q9NPD5 297 | CHEBI_47519 PR_P08183 298 | CHEBI_4031 PR_Q9Y6L6 299 | CHEBI_7565 PR_P08183 300 | CHEBI_8985 PR_Q9NPD5 301 | CHEBI_47519 PR_Q9UNQ0 302 | CHEBI_6076 PR_Q9NPD5 303 | CHEBI_5118 PR_P08183 304 | CHEBI_45409 PR_P08183 305 | CHEBI_4031 PR_Q9UNQ0 306 | CHEBI_100241 PR_P08183 307 | CHEBI_8426 PR_Q86VL8 308 | CHEBI_8426 PR_Q4U2R8 309 | CHEBI_3699 PR_Q86VL8 310 | CHEBI_7772 PR_Q9UNQ0 311 | CHEBI_45409 PR_Q9UNQ0 312 | CHEBI_68647 PR_Q86VL8 313 | CHEBI_45409 PR_Q96FL8 314 | CHEBI_5138 PR_P08183 315 | CHEBI_8426 PR_O15244 316 | CHEBI_63660 PR_Q8TCC7 317 | CHEBI_68647 PR_Q96FL8 318 | CHEBI_5778 PR_Q9NPD5 319 | CHEBI_45409 PR_Q8TCC7 320 | CHEBI_5296 PR_Q8TCC7 321 | CHEBI_45409 PR_Q86VL8 322 | CHEBI_7772 PR_P08183 323 | CHEBI_47519 PR_O15245 324 | CHEBI_68595 PR_P08183 325 | CHEBI_3770 PR_Q8TCC7 326 | CHEBI_37924 PR_P08183 327 | CHEBI_4031 PR_P08183 328 | CHEBI_119573 PR_P08183 329 | CHEBI_45409 PR_O15244 330 | CHEBI_45735 PR_Q9NPD5 331 | CHEBI_3770 PR_Q9NPD5 332 | CHEBI_63918 PR_P08183 333 | CHEBI_8985 PR_Q9Y6L6 334 | CHEBI_5778 PR_P08183 335 | CHEBI_3770 PR_Q4U2R8 336 | CHEBI_39867 PR_P08183 337 | CHEBI_6076 PR_P08183 338 | CHEBI_3699 PR_O15244 339 | CHEBI_135285 PR_P08183 340 | PC_119373 PR_P08183 341 | CHEBI_47519 PR_Q9Y6L6 342 | CHEBI_42355 PR_P08183 343 | CHEBI_45409 PR_O15245 344 | CHEBI_63660 PR_Q4U2R8 345 | CHEBI_39867 PR_Q9Y6L6 346 | CHEBI_45409 PR_Q9NPD5 347 | CHEBI_3770 PR_Q9Y6L6 348 | CHEBI_68647 PR_O15244 349 | CHEBI_4031 PR_Q8TCC7 350 | CHEBI_8426 PR_P08183 351 | CHEBI_3732 PR_Q14242 352 | CHEBI_37924 PR_Q9UNQ0 353 | CHEBI_9588 PR_P08183 354 | CHEBI_63660 PR_Q9Y6L6 355 | CHEBI_9588 PR_Q9UNQ0 356 | CHEBI_45409 
PR_Q4U2R8 357 | CHEBI_45409 PR_Q9Y6L6 358 | CHEBI_6076 PR_Q9Y6L6 359 | CHEBI_3732 PR_P08183 360 | CHEBI_28077 PR_P08183 361 | PC_119373 PR_Q86VL8 362 | CHEBI_53289 PR_O15244 363 | PC_119373 PR_Q96FL8 364 | CHEBI_46081 PR_P08183 365 | CHEBI_7577 PR_Q9NPD5 366 | CHEBI_10033 PR_Q9UNQ0 367 | CHEBI_63660 PR_Q9NPD5 368 | CHEBI_7936 PR_P08183 369 | PC_119373 PR_Q9UNQ0 370 | CHEBI_68595 PR_Q9Y6L6 371 | CHEBI_8985 PR_P08183 372 | CHEBI_37924 PR_O15244 373 | CHEBI_3770 PR_Q9UNQ0 374 | CHEBI_7772 PR_Q4U2R8 375 | CHEBI_68595 PR_Q9NPD5 376 | CHEBI_8426 PR_Q9Y6L6 377 | CHEBI_45735 PR_Q9Y6L6 378 | CHEBI_3962 PR_P08183 379 | SRS_U946SH95EE PR_P08183 380 | CHEBI_8499 PR_P08183 381 | CHEBI_7772 PR_Q14242 382 | CHEBI_8985 PR_Q9UNQ0 383 | CHEBI_28077 PR_Q9UNQ0 384 | CHEBI_45735 PR_Q9UNQ0 385 | CHEBI_3699 PR_Q96FL8 386 | CHEBI_28077 PR_Q9Y6L6 387 | CHEBI_68647 PR_O15245 388 | CHEBI_6076 PR_Q9UNQ0 389 | CHEBI_28077 PR_Q9NPD5 390 | CHEBI_45735 PR_P08183 391 | CHEBI_5778 PR_Q9Y6L6 392 | CHEBI_63631 PR_P08183 393 | PC_71306834 PR_O15245 394 | CHEBI_8426 PR_Q9NPD5 395 | -------------------------------------------------------------------------------- /evaluation-notebooks/pathSearchMain.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import networkx as nx 3 | import json 4 | import urllib 5 | import traceback 6 | from itertools import islice 7 | from rdflib import Graph, URIRef, BNode, Namespace, Literal 8 | from rdflib.namespace import RDF, OWL 9 | from tqdm import tqdm 10 | 11 | import pickle 12 | import pandas as pd 13 | import numpy as np 14 | import sys 15 | import KG_path_searches 16 | 17 | combine_graph = False 18 | #change graph names and paths 19 | KG_PATH = '../resources/knowledge_graphs/' 20 | KG_NAME_MERGED = 'PheKnowLator_machine_read_merged_instance_based_OWLNETS_v1.0.gpickle' 21 | NodeLabelsFile = '../resources/nodeLabels_v1.0.pickle' 22 | DIR_OUT = '../output_files/' 23 | 24 | #set to false for no filtering 25 | 
filter_nodes = True
time_slicing = True
MAXNUMPATHS = 10

#define namespaces
obo = Namespace('http://purl.obolibrary.org/obo/')
napdi = Namespace('http://napdi.org/napdi_srs_imports:')

#read nodeLabels dictionary
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load node-label pickles that were produced by this project.
with open(NodeLabelsFile, 'rb') as filep:
    nodeLabels = pickle.load(filep)

# Short names for the KG nodes used as path-search start/end points.
node_dict = {
    'EGCG': obo.CHEBI_4806,
    'ECG': obo.CHEBI_70255,
    'EPICATECHIN': obo.CHEBI_90,
    'CATECHIN': obo.CHEBI_23053,
    'GREENTEA': napdi.camellia_sinensis_leaf,
    'KRATOM': napdi.mitragyna_speciosa,
    'MITRAGYNINE': obo.CHEBI_6956,
    'HYDROXY_MITRAGYNINE': napdi['7_hydroxy_mitragynine'],
    'CYP3A4': obo.PR_P08684,
    'CYP2D6': obo.PR_P10635,
    'CYP2C19': obo.PR_P33261,
    'UGT1A1_protein': obo.PR_P22309,
    'UGT1A8_protein': obo.PR_Q9HAW9,
    'UGT1A10_protein': obo.PR_Q9HAW8,
    'buspirone': obo.CHEBI_3223,
    'nadolol': obo.CHEBI_7444,
    'raloxifene': obo.CHEBI_8772,
    'midazolam': obo.CHEBI_6931,
    'dextromethorphan': obo.CHEBI_4470,
    'seizure': obo.HP_0001250
}

# (start node, end node) pairs searched for green tea.
greentea_tuples = [
    (node_dict['GREENTEA'], node_dict['CYP3A4']),
    (node_dict['GREENTEA'], node_dict['CYP2D6']),
    (node_dict['GREENTEA'], node_dict['UGT1A1_protein']),
    (node_dict['GREENTEA'], node_dict['UGT1A8_protein']),
    (node_dict['GREENTEA'], node_dict['UGT1A10_protein']),
    (node_dict['EGCG'], node_dict['CYP3A4']),
    (node_dict['EGCG'], node_dict['CYP2C19']),
    (node_dict['EGCG'], node_dict['CYP2D6']),
    (node_dict['EGCG'], node_dict['UGT1A1_protein']),
    (node_dict['EGCG'], node_dict['UGT1A8_protein']),
    (node_dict['EGCG'], node_dict['UGT1A10_protein']),
    (node_dict['ECG'], node_dict['CYP3A4']),
    (node_dict['ECG'], node_dict['UGT1A1_protein']),
    (node_dict['ECG'], node_dict['UGT1A8_protein']),
    (node_dict['ECG'], node_dict['UGT1A10_protein']),
    (node_dict['CATECHIN'], node_dict['UGT1A1_protein']),
    (node_dict['CATECHIN'], node_dict['UGT1A8_protein']),
    (node_dict['CATECHIN'], node_dict['UGT1A10_protein']),
    (node_dict['EPICATECHIN'], node_dict['UGT1A1_protein']),
    (node_dict['EPICATECHIN'], node_dict['UGT1A8_protein']),
    (node_dict['EPICATECHIN'], node_dict['UGT1A10_protein']),
    (node_dict['GREENTEA'], node_dict['raloxifene']),
    (node_dict['GREENTEA'], node_dict['nadolol']),
    (node_dict['GREENTEA'], node_dict['buspirone'])
]

# (start node, end node) pairs searched for kratom.
kratom_tuples = [
    (node_dict['KRATOM'], node_dict['CYP2D6']),
    (node_dict['KRATOM'], node_dict['CYP3A4']),
    (node_dict['MITRAGYNINE'], node_dict['CYP3A4']),
    (node_dict['MITRAGYNINE'], node_dict['CYP2D6']),
    (node_dict['MITRAGYNINE'], node_dict['CYP2C19']),
    (node_dict['HYDROXY_MITRAGYNINE'], node_dict['UGT1A1_protein']),
    (node_dict['HYDROXY_MITRAGYNINE'], node_dict['CYP2C19']),
    (node_dict['KRATOM'], node_dict['midazolam']),
    (node_dict['KRATOM'], node_dict['dextromethorphan']),
    (node_dict['KRATOM'], node_dict['seizure'])
]

# Nodes handed to the filtered simple-path search when filter_nodes is True.
nodes_to_filter = [obo.CHEBI_24431, obo.CHEBI_25367, obo.SO_0000704, obo.PR_000029067, obo.PR_000000001, obo.GO_0008152, obo.SO_0000673,
    URIRef('https://reactome.org/content/detail/R-HSA-1643685'), URIRef('https://reactome.org/content/detail/R-HSA-1430728')]

# Output columns. 'path_length' is written for every row, so it is declared
# here (last, where DataFrame.append used to place it) instead of being
# created implicitly.
PATH_COLUMNS = ['path_type','path_start', 'path_start_label', 'path_end', 'path_end_label', 'path_count','path_step','subject_label','predicate_label',
    'object_label','subject_uri','predicate_uri','object_uri','source_data', 'pub_year', 'pmid', 'path_length']


def _edge_provenance(attribute_dict):
    """Return (source_data, pub_year, pmid) read from one edge-attribute mapping."""
    source_data = 'machine_read' if 'source_graph' in attribute_dict else ''
    pub_year = attribute_dict['timestamp'] if 'timestamp' in attribute_dict else ''
    pmid = attribute_dict['pmid'] if 'pmid' in attribute_dict else ''
    return source_data, pub_year, pmid


def _path_row(path_type, startNd, endNd, path_count, path_step, path_length,
              labels, subject_uri, predicate_uri, object_uri, attribute_dict):
    """Build the output record for a single triple (one step) of a path."""
    source_data, pub_year, pmid = _edge_provenance(attribute_dict)
    return {
        'path_type': path_type,
        'path_start': str(startNd),
        'path_start_label': nodeLabels[str(startNd)],
        'path_end': str(endNd),
        'path_end_label': nodeLabels[str(endNd)],
        'path_count': path_count,
        'path_step': path_step,
        'path_length': path_length,
        'subject_label': labels[0],
        'predicate_label': labels[1],
        'object_label': labels[2],
        'subject_uri': str(subject_uri),
        'predicate_uri': str(predicate_uri),
        'object_uri': str(object_uri),
        'source_data': source_data,
        'pub_year': pub_year,
        'pmid': pmid
    }


def get_all_paths(nx_graph, tuples):
    """Collect the shortest path and simple paths for every (start, end) pair.

    For each pair the bidirectional shortest path is recorded (path_type
    'bi_shortest'), followed by simple paths (path_type 'simple') for every
    cutoff from the shortest length up to shortest length + 10.

    Parameters:
        nx_graph: graph object passed to the KG_path_searches helpers and
            indexed as nx_graph[subject][object][predicate] for edge attributes.
        tuples: iterable of (start_node, end_node) pairs.

    Returns:
        pandas.DataFrame with one row per path step and columns PATH_COLUMNS.
    """
    # Rows are accumulated in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
    rows = []
    for startNd, endNd in tuples:
        try:
            shortestPathLabels, shortestPathUri = KG_path_searches.get_bidirectional_shortest_path(nx_graph, startNd, endNd, nodeLabels)
            shortestLens = len(shortestPathLabels)
        except Exception:
            # Reset explicitly: otherwise the names are unbound on the first
            # pair (NameError) or still hold the previous pair's path, which
            # would be written out under this pair's start/end nodes.
            shortestPathLabels, shortestPathUri = [], []
            print('Could not find bidirectional shortest path between {} and {}'.format(startNd, endNd))
            shortestLens = 2

        if shortestLens == 0:
            print('Shortest path between {} and {} not found. Setting shortest path length=2.'.format(startNd, endNd))
            shortestLens = 2

        # Shortest-path triples are (subject, predicate, object, attributes).
        for stepCount, (labels, uris) in enumerate(zip(shortestPathLabels, shortestPathUri)):
            rows.append(_path_row('bi_shortest', startNd, endNd, 1, stepCount, len(shortestPathLabels),
                                  labels, uris[0], uris[1], uris[2], uris[3][0]))

        # The k-shortest-paths step (KG_path_searches.get_k_shortest_paths with
        # MAXNUMPATHS, recorded as path_type 'shortest') is disabled: it was
        # running longer than a few hours.
        MAXPATHLENGTH = shortestLens + 10

        pathCount = 0
        #get simple paths and add to dataframe
        for cutoff in range(shortestLens, MAXPATHLENGTH+1):
            if filter_nodes:
                path_l, path_n = KG_path_searches.get_k_simple_paths_filtered(nx_graph, startNd, endNd, MAXNUMPATHS, cutoff,
                                                                             shortestLens, nodes_to_filter)
            else:
                path_l, path_n = KG_path_searches.get_k_simple_paths(nx_graph, startNd, endNd,
                                                                    MAXNUMPATHS, cutoff, shortestLens)
            for path_labels, path_uris in zip(path_l, path_n):
                for pathStep, (labels, uris) in enumerate(zip(path_labels, path_uris)):
                    # Simple-path triples are indexed (subject, object, predicate).
                    subj, obj, pred = uris[0], uris[1], uris[2]
                    attribute_dict = nx_graph[subj][obj][pred]
                    rows.append(_path_row('simple', startNd, endNd, pathCount, pathStep, len(path_labels),
                                          labels, subj, pred, obj, attribute_dict))
                pathCount += 1
    return pd.DataFrame(rows, columns=PATH_COLUMNS)


if __name__ == '__main__':

    ##READ MERGED GRAPH (PL + REACH + SEMREP + INFERRED)
    kg_file = KG_PATH+KG_NAME_MERGED
    if hasattr(nx, 'read_gpickle'):
        nx_graph = nx.read_gpickle(kg_file)
    else:
        # networkx >= 3.0 removed read_gpickle; its documented replacement is
        # plain pickle. NOTE(review): this fallback does not transparently
        # decompress .gz/.bz2 files - confirm the KG pickle is uncompressed.
        with open(kg_file, 'rb') as kg_fp:
            nx_graph = pickle.load(kg_fp)

    df_gt = get_all_paths(nx_graph, greentea_tuples)
    df_gt = df_gt.fillna('')

    if filter_nodes:
        outfilegt = outfilegt_filtered

    df_gt.to_csv(outfilegt, sep='\t', index=False)

    df_kt = get_all_paths(nx_graph, kratom_tuples)
    df_kt = df_kt.fillna('')

    if filter_nodes:
        outfilekt = outfilekt_filtered
    df_kt.to_csv(outfilekt, sep='\t', index=False)
32662751 30 | 32630278 31 | 32530447 32 | 32496054 33 | 32325260 34 | 32290281 35 | 32272686 36 | 32228310 37 | 32222035 38 | 32060993 39 | 31963614 40 | 31882454 41 | 31786635 42 | 31755290 43 | 31704838 44 | 31669719 45 | 31669387 46 | 31647972 47 | 31540101 48 | 31504241 49 | 31480528 50 | 31470883 51 | 31467905 52 | 31443962 53 | 31404943 54 | 31375472 55 | 31356956 56 | 31323368 57 | 31288127 58 | 31271961 59 | 31239454 60 | 31163215 61 | 31140865 62 | 31133659 63 | 31071628 64 | 30865484 65 | 30758919 66 | 30730576 67 | 30706508 68 | 30689998 69 | 30659321 70 | 30638282 71 | 30590317 72 | 30544127 73 | 30540489 74 | 30468569 75 | 30459425 76 | 30422622 77 | 30352150 78 | 30345528 79 | 30274153 80 | 30272491 81 | 30172202 82 | 30154104 83 | 30108138 84 | 30048196 85 | 29945292 86 | 29889332 87 | 29721568 88 | 29719052 89 | 29693578 90 | 29665526 91 | 29652224 92 | 29542427 93 | 29502118 94 | 29435821 95 | 29426822 96 | 29382051 97 | 29363498 98 | 29358130 99 | 29353070 100 | 29342418 101 | 29087786 102 | 29082813 103 | 29074051 104 | 29073181 105 | 29032032 106 | 29031061 107 | 28966261 108 | 28951078 109 | 28867490 110 | 28856944 111 | 28803882 112 | 28782465 113 | 28778465 114 | 28689458 115 | 28671709 116 | 28590129 117 | 28552723 118 | 28468305 119 | 28396528 120 | 28339191 121 | 28314281 122 | 28109950 123 | 27974382 124 | 27874100 125 | 27830552 126 | 27815364 127 | 27779894 128 | 27735854 129 | 27664690 130 | 27586050 131 | 27443528 132 | 27399660 133 | 27347731 134 | 27312536 135 | 27258084 136 | 27241692 137 | 27185570 138 | 27152350 139 | 27141849 140 | 27137136 141 | 27128896 142 | 27109742 143 | 27063220 144 | 26982502 145 | 26956662 146 | 26932161 147 | 26883089 148 | 26851241 149 | 26829614 150 | 26812610 151 | 26750984 152 | 26669427 153 | 26516155 154 | 26423872 155 | 26319176 156 | 26293804 157 | 26256777 158 | 26222195 159 | 26135671 160 | 26107513 161 | 26068522 162 | 26053739 163 | 26051112 164 | 26007223 165 | 25978436 166 | 25857233 167 | 
25858611 168 | 25854527 169 | 25850285 170 | 25833196 171 | 25837780 172 | 25660335 173 | 25635877 174 | 25560189 175 | 25549870 176 | 25547627 177 | 25536877 178 | 25446855 179 | 25455450 180 | 25458499 181 | 25343290 182 | 25225718 183 | 25177847 184 | 25111188 185 | 25062738 186 | 25004912 187 | 25002745 188 | 24975826 189 | 24978953 190 | 24974115 191 | 24942396 192 | 24924949 193 | 24907976 194 | 24830396 195 | 24786236 196 | 24730468 197 | 24680951 198 | 24636064 199 | 24571610 200 | 24568163 201 | 24527722 202 | 24475091 203 | 24445070 204 | 24392691 205 | 24384226 206 | 24337438 207 | 24329499 208 | 24252806 209 | 24222221 210 | 24199578 211 | 24135298 212 | 24122170 213 | 24095010 214 | 24076259 215 | 24014106 216 | 23954511 217 | 23919835 218 | 23903410 219 | 23885956 220 | 23844611 221 | 23828908 222 | 23702815 223 | 23675993 224 | 23582453 225 | 23475667 226 | 23448290 227 | 23438824 228 | 23434495 229 | 23432429 230 | 23401474 231 | 23395694 232 | 23387302 233 | 23376975 234 | 23343733 235 | 23287855 236 | 23220116 237 | 23221528 238 | 23178275 239 | 23138378 240 | 23112005 241 | 23072958 242 | 23009931 243 | 22934566 244 | 22936314 245 | 22888116 246 | 22827114 247 | 22799025 248 | 22706231 249 | 22696418 250 | 22686307 251 | 22666439 252 | 22506265 253 | 22497566 254 | 22471730 255 | 22465937 256 | 22451700 257 | 22443586 258 | 22415933 259 | 22382327 260 | 22372747 261 | 22360559 262 | 22342542 263 | 22315333 264 | 22259019 265 | 22209625 266 | 22149257 267 | 22143929 268 | 22133672 269 | 21964418 270 | 21945183 271 | 21915887 272 | 21849623 273 | 21732699 274 | 21692650 275 | 21657833 276 | 21593771 277 | 21488686 278 | 21464173 279 | 21459083 280 | 21418183 281 | 21395287 282 | 21366347 283 | 21353389 284 | 21284934 285 | 21189332 286 | 21182487 287 | 21139236 288 | 21068193 289 | 21058395 290 | 20964364 291 | 20929430 292 | 20885377 293 | 20868728 294 | 20814160 295 | 20727621 296 | 20628935 297 | 20462345 298 | 20402563 299 | 20376629 300 | 
20205664 301 | 20139165 302 | 20100815 303 | 20045984 304 | 20020283 305 | 19958094 306 | 19919601 307 | 19888597 308 | 19858067 309 | 19845430 310 | 19754914 311 | 19745566 312 | 19589367 313 | 19576208 314 | 19578179 315 | 19571434 316 | 19507004 317 | 19488724 318 | 19438707 319 | 19397481 320 | 19299527 321 | 19251817 322 | 19202543 323 | 19168709 324 | 19135721 325 | 19132230 326 | 19090569 327 | 18985859 328 | 18979093 329 | 18840518 330 | 18776969 331 | 18698879 332 | 18667618 333 | 18653744 334 | 18602884 335 | 18556422 336 | 18495666 337 | 18456404 338 | 18385293 339 | 18371303 340 | 18356188 341 | 18278162 342 | 18218839 343 | 18206979 344 | 18064444 345 | 18053325 346 | 18052937 347 | 18021343 348 | 17970723 349 | 17907788 350 | 17883958 351 | 17720863 352 | 17715394 353 | 17691916 354 | 17684094 355 | 17672516 356 | 17635701 357 | 17620343 358 | 17611042 359 | 17582584 360 | 17571167 361 | 17565714 362 | 17542018 363 | 17521300 364 | 17467247 365 | 17407329 366 | 17314320 367 | 17295182 368 | 17279585 369 | 17275817 370 | 17251390 371 | 17237153 372 | 17220563 373 | 17178767 374 | 17178771 375 | 17137735 376 | 17067159 377 | 17035599 378 | 16971342 379 | 16946554 380 | 16945988 381 | 16941956 382 | 16927538 383 | 16914511 384 | 16908150 385 | 16904803 386 | 16882766 387 | 16867169 388 | 16859388 389 | 16739070 390 | 16730656 391 | 16711396 392 | 16649998 393 | 16611850 394 | 16604230 395 | 16601787 396 | 16597364 397 | 16490810 398 | 16430028 399 | 16428030 400 | 16411663 401 | 16378601 402 | 16364922 403 | 16341694 404 | 16270629 405 | 16250655 406 | 16215230 407 | 16207711 408 | 16190863 409 | 16135359 410 | 16135656 411 | 16124035 412 | 16109996 413 | 16080004 414 | 16035375 415 | 15961979 416 | 15958517 417 | 15901753 418 | 15900015 419 | 15890377 420 | 15885658 421 | 15861043 422 | 15778702 423 | 15778010 424 | 15769884 425 | 15742976 426 | 15715982 427 | 15696928 428 | 15690760 429 | 15665333 430 | 15634016 431 | 15626586 432 | 15615528 433 | 
15592323 434 | 15579482 435 | 15576237 436 | 15499170 437 | 15498516 438 | 15386357 439 | 15377158 440 | 15358790 441 | 15333516 442 | 15310247 443 | 15205384 444 | 15155554 445 | 15155557 446 | 15135306 447 | 15130782 448 | 15121764 449 | 15117964 450 | 15072549 451 | 15047194 452 | 14993813 453 | 14757175 454 | 14742145 455 | 14742142 456 | 14738594 457 | 14709907 458 | 14691574 459 | 14689466 460 | 14565771 461 | 12936703 462 | 12924926 463 | 12919726 464 | 12867277 465 | 12867494 466 | 12849728 467 | 12844136 468 | 12818727 469 | 12756210 470 | 12745870 471 | 12746108 472 | 12732562 473 | 12700282 474 | 12695357 475 | 12646172 476 | 12642475 477 | 12634980 478 | 12629583 479 | 12593759 480 | 12584152 481 | 12502361 482 | 12485952 483 | 12433797 484 | 12406643 485 | 12386133 486 | 12383041 487 | 12325023 488 | 12185557 489 | 12183659 490 | 12162851 491 | 12134949 492 | 12134948 493 | 12071336 494 | 12065433 495 | 12051676 496 | 12042667 497 | 12033517 498 | 12023523 499 | 11978200 500 | 11911841 501 | 11901092 502 | 11868802 503 | 11854151 504 | 11779172 505 | 11774365 506 | 11737866 507 | 11736863 508 | 11697031 509 | 11695850 510 | 11673862 511 | 11602509 512 | 11585060 513 | 11556126 514 | 11547423 515 | 11523064 516 | 11513592 517 | 11504148 518 | 11502732 519 | 11454729 520 | 11408364 521 | 11376561 522 | 11355862 523 | 11353744 524 | 11353760 525 | 11344532 526 | 11315104 527 | 11302937 528 | 11216669 529 | 11181487 530 | 11181500 531 | 11159812 532 | 11159803 533 | 11131030 534 | 11124225 535 | 11106261 536 | 11102743 537 | 11095583 538 | 11095593 539 | 11042091 540 | 11038146 541 | 11016631 542 | 10923861 543 | 10895987 544 | 10890030 545 | 10877007 546 | 10875683 547 | 10871056 548 | 10805063 549 | 10659953 550 | 10647906 551 | 10628907 552 | 10620357 553 | 10616189 554 | 10611136 555 | 10604876 556 | 10562412 557 | 10510277 558 | 10502501 559 | 10493259 560 | 10488716 561 | 10469623 562 | 10445384 563 | 10429839 564 | 10413191 565 | 10409393 566 | 
10383922 567 | 10376770 568 | 10350185 569 | 10233205 570 | 10219964 571 | 10215637 572 | 10207119 573 | 10027632 574 | 9929511 575 | 9929510 576 | 9918546 577 | 9893960 578 | 9892192 579 | 9831966 580 | 9820175 581 | 9806947 582 | 9755457 583 | 9695717 584 | 9682269 585 | 9565737 586 | 9492382 587 | 9512924 588 | 9443844 589 | 9443853 590 | 9454781 591 | 9454825 592 | 9394031 593 | 9390105 594 | 9364740 595 | 9364930 596 | 9355936 597 | 9352574 598 | 9353388 599 | 9328175 600 | 9321515 601 | 9316878 602 | 9316851 603 | 9276639 604 | 9296366 605 | 9264313 606 | 9185616 607 | 9178939 608 | 9152602 609 | 9143352 610 | 9152599 611 | 9103523 612 | 9103535 613 | 9106248 614 | 9105404 615 | 9476043 616 | 9010637 617 | 9195021 618 | 8956327 619 | 8937855 620 | 8863822 621 | 8798464 622 | 8886611 623 | 8806763 624 | 8886607 625 | 8824531 626 | 8886603 627 | 8663198 628 | 8819299 629 | 8781780 630 | 8723730 631 | 8627557 632 | 8627511 633 | 8866821 634 | 8613937 635 | 8573198 636 | 8845864 637 | 8739824 638 | 8737764 639 | 9010587 640 | 9010596 641 | 7585636 642 | 8591724 643 | 8591727 644 | 8590980 645 | 7574722 646 | 8654205 647 | 7587956 648 | 7483659 649 | 7587930 650 | 7742153 651 | 7773543 652 | 7720526 653 | 7581481 654 | 7756103 655 | 7945454 656 | 7945434 657 | 7835232 658 | 7839702 659 | 8043020 660 | 8043023 661 | 8200083 662 | 7918079 663 | 8132700 664 | 8267647 665 | 8250953 666 | 8373178 667 | 8220911 668 | 8330360 669 | 8461033 670 | 8097772 671 | 8452565 672 | 8255142 673 | 8352885 674 | 1381906 675 | 1423839 676 | 1551116 677 | 1576054 678 | 1740010 679 | 1371482 680 | 1944238 681 | 1913651 682 | 1898086 683 | 1889415 684 | 1888331 685 | 1765318 686 | 2268369 687 | 2117502 688 | 2334398 689 | 2322567 690 | 2726448 691 | 2905759 692 | 3190735 693 | 3395513 694 | 3756062 695 | 3978021 696 | 7428801 697 | 522518 698 | 631722 699 | 837635 700 | 19299851 701 | 18201587 702 | 17907547 703 | 16117985 704 | 12757697 705 | 12638482 706 | 11444587 707 | 11315759 708 
| 11313000 709 | 11273517 710 | 8536426 711 | 8372087 712 | 1832815 713 | 6812734 714 | 7121898 715 | 7122922 716 | 441785 717 | 34794239 718 | 34719555 719 | 34502165 720 | 34402661 721 | 34334558 722 | 34325588 723 | 34196520 724 | 34176447 725 | 33921786 726 | 33703988 727 | 33655414 728 | 32816867 729 | 32788161 730 | 32719085 731 | 32386503 732 | 32324332 733 | 9582001 734 | 19446676 735 | 3169889 736 | 18095427 737 | 19462929 738 | 19837157 739 | 1181070 740 | 6381265 741 | 12678685 742 | 14633804 743 | 6119816 744 | 8015567 745 | 16634838 746 | 2327068 747 | 2721094 748 | 17134518 749 | 4039178 750 | 19571155 751 | 19233576 752 | 6808388 753 | 17924872 754 | 19159947 755 | 20650180 756 | 7823297 757 | 6421682 758 | 6815037 759 | 3749341 760 | -------------------------------------------------------------------------------- /resources/ontology-extensions/generate_ontology_extensions.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Script to create ontology extensions for natural products (instance-based). 3 | Author: Sanya B. Taneja 4 | Created on: 2023-04-04 5 | Last run: 2024-02-29 6 | 7 | Natural product latin binomials are created as classes with cross reference to Global Substance Registration System (GSRS) identifiers. Instances of classes are connected to natural product constituents (depending on whether the 8 | constituents already exist in ChEBI or not). If constituent exists in ChEBI, an instance of the ChEBI class is created and linked to the natural product instance. Else, a new constituent class is created and linked. All classes 9 | contain external cross references to GSRS identifiers. 10 | 11 | Input is a tab-separated file with natural product latin binomials, GSRS identifiers, NCBI ID and ChEBI identifiers (if available) 12 | after constituent annotation, review, and mapping from GSRS and EMA monographs. 13 | Output is a serialized graph in XML file. 
import pandas as pd
## import RDF related
from rdflib import Graph, BNode, Namespace, URIRef, Literal
import json
import logging

DIR_IN = 'resources/'
DIR_OUT = 'graphs/'
FILE_IN = DIR_IN + 'EMA_GSRS_NP_constituents_combined_unique_20240116.tsv'
NP_DICT = DIR_IN + 'np_constituents_reference_dict_20240229.json'
OUTFILE = DIR_OUT + 'chebi-extensions-constituents-20240229.tsv'
OUT_GRAPH = DIR_OUT + "chebi-srs-extensions-instance-all-20240229.xml"
LOG_FILE = "log-ontology-extensions-20240229.txt"
logging.basicConfig(filename=LOG_FILE, filemode='a', level=logging.INFO)

with open(NP_DICT, 'r') as filein:
    np_dict = json.load(filein)

# Materialised as a list so the log line below shows the names rather than a
# dict_keys(...) view.
NP_LIST = list(np_dict.keys())
logging.info('Generating ontology extensions for {} natural products: '.format(len(NP_LIST)))
logging.info('\nNatural products in the list: {}'.format(NP_LIST))

## set up RDF graph
# identify namespaces for other ontologies to be used
LOCAL_NS = Namespace('http://napdi.org/napdi_srs_imports:')
DC_NS = Namespace('http://purl.org/dc/elements/1.1/')
RDF_NS = Namespace('http://www.w3.org/1999/02/22-rdf-syntax-ns#')
OBO_NS = Namespace('http://purl.obolibrary.org/obo/')
OWL_NS = Namespace('http://www.w3.org/2002/07/owl#')
RDFS_NS = Namespace('http://www.w3.org/2000/01/rdf-schema#')
#RO, BFO, GO CHEBI, NCBITaxon, PRO use the OBO namespace
SRS_NS = Namespace('http://gsrs.ncats.nih.gov/ginas/app/substance/')

relations = {
    "has_component" : "RO_0002180", #instance-instance
    "has_functional_parent": "chebi#has_functional_parent", #instance-instance
    "has_role": "RO_0000087", #instance-instance
    "part_of": "BFO_0000050", #instance-instance
    "in_taxon": "RO_0002162", #instance-instance
    "has_participant": "RO_0000057", #instance-instance
    "participates_in": "RO_0000056", #instance-instance
    "molecularly_decreases_activity": "RO_0002449",
    "database_cross_reference": "http://purl.obolibrary.org/obo/database_cross_reference"} #instance-instance


def initialGraph(graph):
    """Reset the graph's namespace bindings and add the ontology header triples.

    Parameters:
        graph: rdflib Graph, modified in place.
    """
    graph.namespace_manager.reset()
    for prefix, namespace in (('napdi_srs', LOCAL_NS), ('dc', DC_NS), ('obo', OBO_NS), ('rdf', RDF_NS),
                              ('owl', OWL_NS), ('rdfs', RDFS_NS), ('srs', SRS_NS)):
        graph.namespace_manager.bind(prefix, namespace)

    # Ontology - about and imports
    ontology = URIRef('http://purl.obolibrary.org/obo/napdi-srs-imports')
    graph.add((ontology, RDF_NS.type, OWL_NS.Ontology))
    graph.add((ontology, OWL_NS.imports, URIRef('http://purl.obolibrary.org/obo/iao/2017-03-24/iao.owl')))
    graph.add((ontology, OWL_NS.imports, URIRef('http://purl.obolibrary.org/obo/bfo/2014-05-03/classes-only.owl')))
    graph.add((ontology, OWL_NS.imports, URIRef('http://purl.obolibrary.org/obo/ro/core.owl')))
    graph.add((ontology, DC_NS.creator, Literal('Sanya B. Taneja', lang='en')))
    graph.add((ontology, DC_NS.contributor, Literal('Richard D. Boyce', lang='en')))
    graph.add((ontology, RDFS_NS.label, Literal('NaPDI imported entities', lang='en'))) # need to find how to specify XSD type lang=en
    graph.add((ontology, RDFS_NS.comment, Literal('This ontology contains constituents imported from the GSRS database and EMA monographs', lang='en')))


def _link_constituent(graph, NP, NP_instance, constituent_class, constituent_instance):
    """Add 'NP has_component some constituent' at class level and the matching instance-level link."""
    bn = BNode()
    graph.add((NP, RDFS_NS.subClassOf, bn))
    graph.add((bn, RDF_NS.type, OWL_NS.Restriction))
    graph.add((bn, OWL_NS.onProperty, OBO_NS.RO_0002180))
    graph.add((bn, OWL_NS.someValuesFrom, constituent_class))

    graph.add((NP_instance, OBO_NS.RO_0002180, constituent_instance))


def create_constituent_chebi(constituent_name, chebi_id, constituent_gsrs_id, graph, NP, NP_instance):
    """Add a constituent that already has a ChEBI class and link it to the natural product.

    Parameters:
        constituent_name: constituent name (not used by this function).
        chebi_id: numeric ChEBI identifier; converted through int() to drop
            the float formatting it has when read from the input table.
        constituent_gsrs_id: GSRS identifier, or NaN/None when unavailable.
        graph: rdflib Graph, modified in place.
        NP: URI of the natural product class.
        NP_instance: node for the natural product instance.

    Returns:
        (graph, constituent_class) where constituent_class is the CHEBI_* URI.
    """
    ##constituent exists in CHEBI
    ##convert chebi id from float to string
    chebi_id = str(int(chebi_id))
    constituent_class = OBO_NS['CHEBI_'+chebi_id]
    constituent_instance = BNode()

    #Constituent of NP as instance in CHEBI, subClass of chemical entity, cross reference to SRS (if exists)
    #this creates an instance of existing class CHEBI_*
    #NP has_component NP_constituent (in ChEBI)
    # (the original added this triple twice; a graph stores a triple only once)
    graph.add((constituent_class, RDFS_NS.subClassOf, OBO_NS.CHEBI_24431))
    graph.add((constituent_instance, RDF_NS.type, constituent_class))
    graph.add((constituent_instance, RDF_NS.type, OWL_NS.NamedIndividual))
    if pd.notna(constituent_gsrs_id):
        # NOTE(review): the cross reference is attached to the class here but
        # to the instance in create_constituent_no_chebi - confirm which is intended.
        graph.add((constituent_class, OBO_NS.database_cross_reference, SRS_NS[constituent_gsrs_id]))

    _link_constituent(graph, NP, NP_instance, constituent_class, constituent_instance)
    return graph, constituent_class


# Ordered (old, new) replacements used to build a local class name; '(-)' and
# '(+)' must be removed before the single-parenthesis rules run.
_NAME_REPLACEMENTS = (('.', ''), ('>', ''), ('(-)', ''), ('(+)', ''), ('\'', ''), ('/', ''), (',', '_'),
                      ('(', ''), (')', ''), ('[', ''), (']', ''), (' ', '_'), ('-', '_'))


def create_constituent_no_chebi(constituent_name, constituent_gsrs_id, graph, NP, NP_instance):
    """Create a new local class for a constituent without a ChEBI id and link it to the natural product.

    Parameters:
        constituent_name: constituent name; lower-cased and stripped of
            punctuation to form the local class name, and used as the label.
        constituent_gsrs_id: GSRS identifier, or NaN/None when unavailable.
        graph: rdflib Graph, modified in place.
        NP: URI of the natural product class.
        NP_instance: node for the natural product instance.

    Returns:
        (graph, constituent_class) where constituent_class is the new local URI.
    """
    constituent_name_new = constituent_name.lower()
    for old, new in _NAME_REPLACEMENTS:
        constituent_name_new = constituent_name_new.replace(old, new)
    constituent_class = LOCAL_NS[constituent_name_new]
    constituent_instance = BNode()

    graph.add((constituent_class, RDFS_NS.subClassOf, OBO_NS.CHEBI_24431))
    graph.add((constituent_class, RDF_NS.type, OWL_NS.Class))
    graph.add((constituent_instance, RDF_NS.type, constituent_class))
    graph.add((constituent_instance, RDF_NS.type, OWL_NS.NamedIndividual))
    graph.add((constituent_class, RDFS_NS.label, Literal(constituent_name.lower(), lang='en')))

    if pd.notna(constituent_gsrs_id):
        graph.add((constituent_instance, OBO_NS.database_cross_reference, SRS_NS[constituent_gsrs_id]))

    _link_constituent(graph, NP, NP_instance, constituent_class, constituent_instance)
    return graph, constituent_class


def create_np_extentions(np_name, graph, df_np):
    """Add one natural product, its parent substance and all its constituents to the graph.

    Parameters:
        np_name: key into np_dict identifying the natural product.
        graph: rdflib Graph, modified in place.
        df_np: DataFrame of this product's constituents with columns
            'source', 'constituent_name', 'constituent_uuid' and 'chebi_id'.

    Returns:
        (graph, dfoutnp) where dfoutnp has columns 'constituent_name' and
        'URI', one row per constituent row of df_np.
    """
    #defining NP names and IDs
    #common_name = np_dict[np_name]['common_name'].lower()
    latin_binomial = np_dict[np_name]['latin_binomial'].lower()
    parent_name = np_dict[np_name]['parent_name'].lower()
    NCBI_ID = 'NCBITaxon_'+np_dict[np_name]['NCBI_ID']
    GSRS_substance_ID = np_dict[np_name]['GSRS_substance_ID']
    GSRS_parent_ID = np_dict[np_name]['GSRS_parent_ID']

    ##reset index and get number of constituents
    df = df_np.reset_index(drop=True)
    n_gsrs_constituents = len(df.loc[df['source'] == 'GSRS'])
    n_ema_constituents = len(df.loc[df['source'] == 'EMA'])
    logging.info('\nGSRS constituents: {}'.format(n_gsrs_constituents))
    logging.info('\nEMA constituents: {}'.format(n_ema_constituents))

    #create namespaced identifiers and instances
    NP_whole = LOCAL_NS[parent_name]
    NP = LOCAL_NS[latin_binomial]
    NP_instance = BNode()
    NP_whole_instance = BNode()

    #NP subClassOf plant anatomical entity, create instance, cross reference in SRS
    graph.add((NP, RDFS_NS.subClassOf, OBO_NS.PO_0025131))
    graph.add((NP, RDF_NS.type, OWL_NS.Class))
    graph.add((NP, OBO_NS.database_cross_reference, SRS_NS[GSRS_substance_ID]))
    graph.add((NP, RDFS_NS.label, Literal(np_name.replace(' ', '_'), lang='en')))

    graph.add((NP_instance, RDF_NS.type, NP))
    graph.add((NP_instance, RDF_NS.type, OWL_NS.NamedIndividual))

    #NP whole substance subClassOf plant anatomical entity, create instance, cross reference in SRS
    graph.add((NP_whole, RDFS_NS.subClassOf, OBO_NS.PO_0025131))
    graph.add((NP_whole, RDF_NS.type, OWL_NS.Class))
    graph.add((NP_whole_instance, RDF_NS.type, NP_whole))
    graph.add((NP_whole_instance, RDF_NS.type, OWL_NS.NamedIndividual))

    graph.add((NP_whole, OBO_NS.database_cross_reference, SRS_NS[GSRS_parent_ID]))
    graph.add((NP_whole, RDFS_NS.label, Literal(parent_name, lang='en')))

    #NP in taxon organism (NCBI Taxon) - class-class relationship
    pgt1 = BNode()
    graph.add((NP, RDFS_NS.subClassOf, pgt1))
    graph.add((pgt1, RDF_NS.type, OWL_NS.Restriction))
    graph.add((pgt1, OWL_NS.onProperty, OBO_NS.RO_0002162))
    graph.add((pgt1, OWL_NS.someValuesFrom, OBO_NS[NCBI_ID]))

    #NP part of NP parent
    pgt2 = BNode()
    graph.add((NP, RDFS_NS.subClassOf, pgt2))
    graph.add((pgt2, RDF_NS.type, OWL_NS.Restriction))
    graph.add((pgt2, OWL_NS.onProperty, OBO_NS.BFO_0000050))
    graph.add((pgt2, OWL_NS.someValuesFrom, NP_whole))
    graph.add((NP_instance, OBO_NS.BFO_0000050, NP_whole_instance))

    #creating dataframe of constituent name and URIs for mapping purposes
    # Rows are collected in a list: DataFrame.append was removed in pandas 2.0.
    rows = []
    for i in range(len(df.index)):
        constituent_name = df.at[i, 'constituent_name']
        constituent_gsrs_id = df.at[i, 'constituent_uuid']
        chebi_id = df.at[i, 'chebi_id']
        if pd.notna(chebi_id):
            graph, uri = create_constituent_chebi(constituent_name, chebi_id, constituent_gsrs_id, graph, NP, NP_instance)
        else:
            graph, uri = create_constituent_no_chebi(constituent_name, constituent_gsrs_id, graph, NP, NP_instance)
        rows.append({'constituent_name': constituent_name.lower(), 'URI': str(uri)})
    dfoutnp = pd.DataFrame(rows, columns=['constituent_name', 'URI'])
    return graph, dfoutnp


if __name__ == "__main__":

    ## default settings
    graph = Graph()
    initialGraph(graph)

    df = pd.read_csv(FILE_IN, sep='\t')
    #logging.info('Total constituents: %s', len(df))

    ##add to graph here
    frames = []
    for np_name in NP_LIST:
        df_np = df.loc[df['related_latin_binomial'] == np_name]
        logging.info('\nTotal constituents for %s: %d', np_name, len(df_np))
        graph, dfoutnp = create_np_extentions(np_name, graph, df_np)
        frames.append(dfoutnp)

    # Concatenate once; pd.concat raises on an empty list, so fall back to an
    # empty frame with the expected columns.
    if frames:
        dfout = pd.concat(frames, ignore_index=True)
    else:
        dfout = pd.DataFrame(columns=['constituent_name', 'URI'])

    logging.info('\nTotal constituents added to graph: %d', len(dfout))
    dfout = dfout.drop_duplicates()
    logging.info('\nUnique constituents: %d', len(dfout))
    dfout.to_csv(OUTFILE, sep='\t', index=False)

    # Let rdflib write the file itself: serialize() returns bytes on rdflib 5
    # but str on rdflib >= 6, so the former .decode('utf-8') call fails there.
    graph.serialize(destination=OUT_GRAPH, format='xml')

    graph.close()
http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /evaluation-notebooks/KG_path_searches.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: Sanya B Taneja 3 | Date: 2021-09-29 4 | 5 | KG path search functions to generate and save paths: 6 | 1. Single source shortest path: save_k_single_source_shortest_paths(G, source, k, filepath) 7 | 2. Bidirectional shortest path: get_bidirectional_shortest_paths(G, source, target) 8 | 3. k shortest paths: get_k_shortest_paths(G, source, target, k, weight='weight'), 9 | 4. k simple paths (nodes and edges), with and without cutoff: 10 | get_k_simple_paths(G, source, target, k, cutoff) 11 | save_k_simple_paths(G, source, target, k, cutoff, filepath) 12 | 5. 
print_graph_statistics(nx_graph): number of nodes, edges, average degree and density 13 | ''' 14 | 15 | import os 16 | import os.path 17 | import networkx as nx 18 | import json 19 | import urllib 20 | import traceback 21 | from itertools import islice 22 | from rdflib import Graph, URIRef, BNode, Namespace, Literal 23 | from rdflib.namespace import RDF, OWL 24 | from tqdm import tqdm 25 | 26 | import pickle 27 | import pandas as pd 28 | import numpy as np 29 | import sys 30 | 31 | combine_graph = True 32 | #change graph names and paths 33 | KG_PATH = '../resources/knowledge_graphs/' 34 | KG_NAME = 'PheKnowLator_machine_read_merged_instance_based_OWLNETS_v1.0.gpickle' 35 | 36 | NodeLabelsFile = KG_PATH + 'nodeLabels_v1.0.pickle' 37 | DIR_OUT = '../output_files/' 38 | 39 | #read nodeLabels dictionary 40 | with open(NodeLabelsFile, 'rb') as filep: 41 | nodeLabels = pickle.load(filep) 42 | 43 | #define namespaces 44 | obo = Namespace('http://purl.obolibrary.org/obo/') 45 | napdi = Namespace('http://napdi.org/napdi_srs_imports:') 46 | 47 | node_dict = { 48 | 'EGCG': obo.CHEBI_4806, 49 | 'CATECHIN': obo.CHEBI_23053, 50 | 'GREENTEA': napdi.camellia_sinensis_leaf, 51 | 'KRATOM': napdi.mitragyna_speciosa, 52 | 'MITRAGYNINE': obo.CHEBI_6956, 53 | 'HYDROXY_MITRAGYNINE': napdi['7_hydroxy_mitragynine'] 54 | } 55 | 56 | nodes_to_filter = [obo.CHEBI_24431, obo.CHEBI_25367, obo.SO_0000704, obo.PR_000029067, obo.PR_000000001, obo.GO_0008152, obo.SO_0000673, 57 | URIRef('https://reactome.org/content/detail/R-HSA-1643685'), URIRef('https://reactome.org/content/detail/R-HSA-1430728')] 58 | 59 | def save_k_single_source_shortest_paths(G, source, k, filepath): 60 | paths = nx.single_source_shortest_path(G, source) 61 | #if returned paths are dictionary 62 | count = 0 63 | file_save = open(filepath, 'w') 64 | print('Saving path from {}'.format(str(source))) 65 | for target, node_list in paths.items(): 66 | count += 1 67 | if target != source: 68 | if str(target) not in nodeLabels: 69 | 
target_label = str(target).split('/')[-1] 70 | else: 71 | target_label = nodeLabels[str(target)] 72 | file_save.write('\n{} - {} Path:\n'.format(str(source).split('/')[-1], target_label)) 73 | path_labels = get_path_labels(G, node_list) 74 | for triples in path_labels: 75 | for item in triples: 76 | file_save.write(str(item)+' ') 77 | file_save.write('\n') 78 | if count == k: 79 | break 80 | file_save.close() 81 | 82 | ''' 83 | print bidirectional shortest path between source and target 84 | and return length of shortest path 85 | ''' 86 | def get_bidirectional_shortest_path(G, source, target, nodeLabels): 87 | print('Searching for path from {} - {}'.format(str(source), str(target))) 88 | pathx = nx.bidirectional_shortest_path(G, source, target) 89 | 90 | path_labels = get_path_labels(G, pathx, nodeLabels) 91 | path_uri = get_path_uri(G, pathx) 92 | for triples in zip(path_labels, path_uri): 93 | print(triples) 94 | return path_labels, path_uri 95 | 96 | ''' 97 | returns path labels and URIs for simple paths from source to target that have length greater than shortest path 98 | k = number of paths 99 | cutoff = maximum length of path 100 | shortestLens = length of shortest path between source and target 101 | ''' 102 | def get_k_simple_paths(G, source, target, k, cutoff, shortestLens): 103 | print('Searching for paths from {} - {}'.format(str(source), str(target))) 104 | paths = nx.all_simple_edge_paths(G, source, target, cutoff=cutoff) 105 | path_l = [] 106 | path_n = [] 107 | i = 0 108 | while i shortestLens: 116 | print('[info] Simple path length greater than shortest path length ({}) so adding to results'.format(shortestLens)) 117 | path_n.append(path) 118 | i += 1 119 | 120 | for newpath in path_n: 121 | triple_list = [] 122 | for triple in newpath: 123 | subj_lab = '' 124 | pred_lab = '' 125 | obj_lab = '' 126 | subj = str(triple[0]) 127 | pred = str(triple[2]) 128 | obj = str(triple[1]) 129 | if subj in nodeLabels: 130 | subj_lab = nodeLabels[subj] 131 | if 
obj in nodeLabels: 132 | obj_lab = nodeLabels[obj] 133 | if pred in nodeLabels: 134 | pred_lab = nodeLabels[pred] 135 | triple_labels = (subj_lab, pred_lab, obj_lab) 136 | triple_list.append(triple_labels) 137 | path_l.append(triple_list) 138 | return path_l, path_n 139 | 140 | ''' 141 | returns path labels and URIs for simple paths from source to target that have length greater than shortest path 142 | skips paths that contain nodes from the filtered list defined above - [] 143 | k = number of paths 144 | cutoff = maximum length of path 145 | shortestLens = length of shortest path between source and target 146 | There is a loop filter cutoff that decides how many paths to search without filtered nodes before moving on to the next. 147 | Currently filter_cutoff = 50 148 | 149 | ''' 150 | def get_k_simple_paths_filtered(G, source, target, k, cutoff, shortestLens, nodes_to_filter): 151 | 152 | filter_cutoff = 50 153 | print('Searching for filtered paths from {} - {}'.format(str(source), str(target))) 154 | paths = nx.all_simple_edge_paths(G, source, target, cutoff=cutoff) 155 | path_l = [] 156 | path_n = [] 157 | i = 0 158 | filter_count = 0 159 | while i shortestLens: 167 | print('[info] Simple path length greater than shortest path length ({})'.format(shortestLens)) 168 | flag = True 169 | triples = [item for sublist in path for item in sublist] 170 | for node in triples: 171 | if node in nodes_to_filter: 172 | filter_count += 1 173 | flag = False 174 | break 175 | if flag: 176 | path_n.append(path) 177 | i += 1 178 | else: 179 | print('Path contains filtered node, skipping path in results') 180 | if filter_count >= filter_cutoff: 181 | print('No path found without filtered nodes, moving to next iteration') 182 | i += 1 183 | 184 | for newpath in path_n: 185 | triple_list = [] 186 | for triple in newpath: 187 | subj_lab = '' 188 | pred_lab = '' 189 | obj_lab = '' 190 | subj = str(triple[0]) 191 | pred = str(triple[2]) 192 | obj = str(triple[1]) 193 | if subj in 
nodeLabels: 194 | subj_lab = nodeLabels[subj] 195 | if obj in nodeLabels: 196 | obj_lab = nodeLabels[obj] 197 | if pred in nodeLabels: 198 | pred_lab = nodeLabels[pred] 199 | triple_labels = (subj_lab, pred_lab, obj_lab) 200 | triple_list.append(triple_labels) 201 | path_l.append(triple_list) 202 | return path_l, path_n 203 | 204 | ''' 205 | returns list of k shortest paths from source to target 206 | ''' 207 | def get_k_shortest_paths_filtered(G, source, target, k, weight='weight'): 208 | print('Searching for shortest paths from {} - {}'.format(str(source), str(target))) 209 | shortest_paths_filtered = [] 210 | i = 0 211 | for path in paths: 212 | for node in path: 213 | if node in nodes_to_filter: 214 | continue 215 | else: 216 | shortest_paths_filtered.append(path) 217 | return shortest_paths_filtered 218 | 219 | ''' 220 | returns list of k shortest paths from source to target but removes paths with nodes in filtered list above from results 221 | ''' 222 | def get_k_shortest_paths(G, source, target, k, weight='weight'): 223 | print('Searching for shortest paths from {} - {}'.format(str(source), str(target))) 224 | paths = nx.all_shortest_paths(G, source, target, weight=weight) 225 | return list(islice(nx.all_shortest_paths(G, source, target, weight=weight), k)) 226 | 227 | def save_k_shortest_paths(G, source, target, k, filepath, weight='weight'): 228 | paths = get_k_shortest_paths(G, source, target, k, weight='weight') 229 | #print() 230 | file_save = open(filepath, 'w') 231 | source = str(source) 232 | target = str(target) 233 | source_label = source 234 | target_label = target 235 | if source in nodeLabels: 236 | source_label = nodeLabels[source] 237 | if target in nodeLabels: 238 | target_label = nodeLabels[target] 239 | file_save.write('\n{} - {} Shortest Path:\n'.format(source_label, target_label)) 240 | i = 0 241 | for node_list in paths: 242 | file_save.write('\nPATH: '+str(i)+'\n') 243 | path_labels = get_path_labels(G, node_list, nodeLabels) 244 | for 
triples in path_labels: 245 | for item in triples: 246 | file_save.write(str(item)+' ') 247 | file_save.write('\n') 248 | i += 1 249 | file_save.close() 250 | 251 | def save_k_simple_paths(G, source, target, k, cutoff, filepath): 252 | path_l, path_n = get_k_simple_paths(G, source, target, k, cutoff) 253 | source = str(source) 254 | target = str(target) 255 | #print() 256 | file_save = open(filepath, 'w') 257 | 258 | if source in nodeLabels: 259 | source_label = nodeLabels[source] 260 | if target in nodeLabels: 261 | target_label = nodeLabels[target] 262 | file_save.write('\n{} - {} Simple Path (cutoff= {} ):\n'.format(source_label, target_label, cutoff)) 263 | i = 0 264 | for path_list in path_n: 265 | file_save.write('\nPATH: '+str(i)+'\n') 266 | for triples in path_list: 267 | for item in triples: 268 | file_save.write(str(item)+' ') 269 | file_save.write('\n') 270 | i += 1 271 | file_save.close() 272 | 273 | def get_path_labels(nx_graph, path, nodeLabels): 274 | path_labels = [] 275 | if len(path) < 1: 276 | print('Path length 1, skipping') 277 | return 278 | for edge in zip(path, path[1:]): 279 | data = nx_graph.get_edge_data(*edge) 280 | pred = list(data.keys())[0] 281 | node1_lab = str(edge[0]) 282 | node2_lab = str(edge[1]) 283 | if node1_lab in nodeLabels: 284 | node1_lab = nodeLabels[node1_lab] 285 | if node2_lab in nodeLabels: 286 | node2_lab = nodeLabels[node2_lab] 287 | pred_lab = nodeLabels[str(pred)] 288 | if list(data.values())[0]: 289 | if 'source_graph' in list(data.values())[0]: 290 | source_graph = 'machine_read' 291 | else: 292 | source_graph = '' 293 | else: 294 | source_graph = '' 295 | labels = [node1_lab, pred_lab, node2_lab, source_graph] 296 | path_labels.append(labels) 297 | return path_labels 298 | 299 | def get_path_uri(nx_graph, path): 300 | path_uri = [] 301 | if len(path) < 1: 302 | print('Path length 1, skipping') 303 | return 304 | for edge in zip(path, path[1:]): 305 | data = nx_graph.get_edge_data(*edge) 306 | pred = 
list(data.keys())[0] 307 | attribute = list(data.values()) 308 | uri = [str(edge[0]), str(pred), str(edge[1]), attribute] 309 | path_uri.append(uri) 310 | return path_uri 311 | 312 | ''' 313 | prints common statistics for the graph passed to function including number of nodes, edges 314 | average degree, node density and nodes with the highest degree 315 | ''' 316 | def print_graph_statistics(nx_graph): 317 | # get the number of nodes, edges, and self-loops 318 | nodes = nx.number_of_nodes(nx_graph) 319 | edges = nx.number_of_edges(nx_graph) 320 | self_loops = nx.number_of_selfloops(nx_graph) 321 | 322 | print('There are {} nodes, {} edges, and {} self-loop(s)'.format(nodes, edges, self_loops)) 323 | # get degree information 324 | avg_degree = float(edges) / nodes 325 | 326 | print('The Average Degree is {}'.format(avg_degree)) 327 | # get 5 nodes with the highest degress 328 | n_deg = sorted([(str(x[0]), x[1]) for x in nx_graph.degree], key=lambda x: x[1], reverse=1)[:6] 329 | 330 | for x in n_deg: 331 | print('{} (degree={})'.format(x[0], x[1])) 332 | # get network density 333 | density = nx.density(nx_graph) 334 | 335 | print('The density of the graph is: {}'.format(density)) 336 | 337 | -------------------------------------------------------------------------------- /resources/pmids/greentea_pmid.txt: -------------------------------------------------------------------------------- 1 | 29703388 2 | 29466429 3 | 27747873 4 | 31721714 5 | 32008337 6 | 31100973 7 | 28531120 8 | 31582423 9 | 30999008 10 | 7483658 11 | 11470492 12 | 23268924 13 | 30660822 14 | 31997084 15 | 28118673 16 | 31405142 17 | 30687102 18 | 30280673 19 | 31685765 20 | 29659189 21 | 32570121 22 | 31029907 23 | 32564472 24 | 33054444 25 | 31694152 26 | 32146939 27 | 31358798 28 | 32085432 29 | 30022812 30 | 31356857 31 | 29421514 32 | 29275296 33 | 22532034 34 | 31948590 35 | 27052517 36 | 32709749 37 | 29368187 38 | 28247504 39 | 31237721 40 | 31613724 41 | 27637455 42 | 30741544 43 | 29570984 
44 | 25316200 45 | 32007822 46 | 33184029 47 | 31311979 48 | 31353905 49 | 31558747 50 | 31943177 51 | 31054310 52 | 15072439 53 | 32533968 54 | 26582321 55 | 30983343 56 | 28964574 57 | 24508477 58 | 30183039 59 | 31496652 60 | 30310905 61 | 25985195 62 | 29205863 63 | 31720444 64 | 29480324 65 | 30767525 66 | 25224344 67 | 31400238 68 | 19545213 69 | 31196040 70 | 30066955 71 | 26095990 72 | 26760769 73 | 30286210 74 | 28863367 75 | 28794411 76 | 30993288 77 | 31885091 78 | 33112139 79 | 26576923 80 | 30469543 81 | 29548810 82 | 31877869 83 | 27518169 84 | 21538851 85 | 33063648 86 | 31792332 87 | 29938468 88 | 30513192 89 | 28835392 90 | 29542427 91 | 29781252 92 | 30667442 93 | 30857437 94 | 21278283 95 | 29137307 96 | 22509899 97 | 29061787 98 | 29998274 99 | 30599898 100 | 22429924 101 | 30361436 102 | 29731448 103 | 30381188 104 | 30368213 105 | 25755758 106 | 20561943 107 | 26552961 108 | 28942440 109 | 26907264 110 | 25260713 111 | 20056894 112 | 26426900 113 | 21883003 114 | 18251087 115 | 16995908 116 | 21776468 117 | 15285844 118 | 15832810 119 | 28768059 120 | 30529552 121 | 33005588 122 | 23920278 123 | 29981789 124 | 25789634 125 | 20149610 126 | 28754329 127 | 29191128 128 | 17637191 129 | 29193528 130 | 29467215 131 | 20939821 132 | 29248449 133 | 23038646 134 | 17676868 135 | 24695276 136 | 21924889 137 | 23201067 138 | 20583320 139 | 25559894 140 | 15895106 141 | 33228183 142 | 20473584 143 | 17616136 144 | 15469417 145 | 20471964 146 | 28346686 147 | 28535976 148 | 17164372 149 | 21569833 150 | 33623845 151 | 28190756 152 | 21370452 153 | 22564432 154 | 23698259 155 | 16773535 156 | 24363139 157 | 28868668 158 | 17961513 159 | 25490984 160 | 20653233 161 | 27996361 162 | 26972494 163 | 29157815 164 | 29301278 165 | 27844057 166 | 19539611 167 | 21801144 168 | 21890837 169 | 20450880 170 | 19784588 171 | 17656102 172 | 25757931 173 | 23010222 174 | 17499810 175 | 16733047 176 | 32222693 177 | 29868672 178 | 15319488 179 | 23250807 180 | 30972279 
181 | 12237135 182 | 25922640 183 | 26152236 184 | 14643924 185 | 19577593 186 | 26393568 187 | 15036355 188 | 15319329 189 | 25242081 190 | 24656388 191 | 20162400 192 | 25582180 193 | 30466981 194 | 28423656 195 | 18071271 196 | 18570158 197 | 23327877 198 | 18579105 199 | 25774551 200 | 19226653 201 | 28602600 202 | 30477461 203 | 24492671 204 | 17108060 205 | 28740135 206 | 14557796 207 | 11853888 208 | 15807986 209 | 25847253 210 | 19491656 211 | 20798525 212 | 19182437 213 | 24193141 214 | 26344851 215 | 25663641 216 | 20025854 217 | 24384371 218 | 28980319 219 | 18718123 220 | 17364960 221 | 25422365 222 | 18336807 223 | 15383222 224 | 18936213 225 | 17174345 226 | 22759912 227 | 25581901 228 | 23497868 229 | 11358376 230 | 24419562 231 | 21458546 232 | 26046674 233 | 26193264 234 | 19819682 235 | 16762473 236 | 24335094 237 | 21444681 238 | 14647974 239 | 22741576 240 | 15283302 241 | 24492725 242 | 19103281 243 | 33435008 244 | 15499196 245 | 21042803 246 | 23180627 247 | 28115221 248 | 8723732 249 | 18406205 250 | 18606592 251 | 23994611 252 | 22851007 253 | 21818690 254 | 19653312 255 | 22069634 256 | 27306406 257 | 17298385 258 | 27270317 259 | 16216226 260 | 22367622 261 | 16978655 262 | 19172664 263 | 20346967 264 | 22179525 265 | 21343666 266 | 25173233 267 | 18596869 268 | 15629189 269 | 19069242 270 | 25888278 271 | 20191036 272 | 21897050 273 | 18239857 274 | 26460146 275 | 22465028 276 | 18828601 277 | 25418056 278 | 23396419 279 | 20514403 280 | 12842182 281 | 12642472 282 | 10202396 283 | 23554216 284 | 17970592 285 | 11585049 286 | 23967153 287 | 26049011 288 | 18289092 289 | 12542460 290 | 22960141 291 | 23674609 292 | 19719968 293 | 17657175 294 | 15104253 295 | 23062361 296 | 22918967 297 | 18706964 298 | 19174505 299 | 19924425 300 | 19111536 301 | 20471814 302 | 17490800 303 | 27200496 304 | 9105414 305 | 15738931 306 | 15564676 307 | 11792691 308 | 27130545 309 | 18197554 310 | 16129114 311 | 16193318 312 | 20484151 313 | 9734711 314 | 
23456781 315 | 22015496 316 | 23831998 317 | 11697467 318 | 22999765 319 | 25326286 320 | 12970085 321 | 2894963 322 | 19452231 323 | 24730468 324 | 20513373 325 | 15464723 326 | 18845128 327 | 14640574 328 | 17077187 329 | 24486085 330 | 15796208 331 | 15729621 332 | 25747701 333 | 20388359 334 | 12841947 335 | 10503899 336 | 16316927 337 | 19632288 338 | 21751816 339 | 8548865 340 | 29122665 341 | 24314868 342 | 23999162 343 | 17658211 344 | 22367292 345 | 18851785 346 | 25927922 347 | 14511674 348 | 18371987 349 | 12970388 350 | 15569775 351 | 26826723 352 | 17511059 353 | 29389596 354 | 12127645 355 | 19353999 356 | 22521609 357 | 26625122 358 | 27782875 359 | 12738191 360 | 19420696 361 | 16620709 362 | 33799251 363 | 20393003 364 | 21632963 365 | 11600046 366 | 23911347 367 | 19170155 368 | 12124307 369 | 27598258 370 | 24392691 371 | 17116366 372 | 12499265 373 | 8017087 374 | 16174516 375 | 23674377 376 | 14558788 377 | 21302034 378 | 11983278 379 | 7955108 380 | 18209568 381 | 15277314 382 | 10216249 383 | 25523370 384 | 12369894 385 | 12970578 386 | 23807728 387 | 11064004 388 | 23116965 389 | 21628958 390 | 9648229 391 | 24225153 392 | 24718416 393 | 16806954 394 | 9605416 395 | 21352821 396 | 16081670 397 | 33543617 398 | 27452659 399 | 22214982 400 | 14600287 401 | 15482646 402 | 16415120 403 | 12920504 404 | 21081808 405 | 33947307 406 | 15769120 407 | 15212478 408 | 11341376 409 | 26514453 410 | 23573138 411 | 17827674 412 | 29415993 413 | 18506841 414 | 25833196 415 | 11087528 416 | 22588833 417 | 8706244 418 | 15451406 419 | 23146761 420 | 7513796 421 | 22969825 422 | 20666626 423 | 18591783 424 | 21698451 425 | 20065503 426 | 12438004 427 | 11448643 428 | 2070485 429 | 15042683 430 | 10754211 431 | 11697539 432 | 23370448 433 | 25586184 434 | 12188632 435 | 12628514 436 | 15723092 437 | 11405259 438 | 33977918 439 | 15822860 440 | 10882397 441 | 10554885 442 | 21434871 443 | 18701626 444 | 21947138 445 | 19502780 446 | 17210444 447 | 20817945 448 
| 19863351 449 | 7821873 450 | 11754570 451 | 22371366 452 | 15748600 453 | 22776719 454 | 23375153 455 | 12711142 456 | 15796199 457 | 11325559 458 | 21049395 459 | 21430251 460 | 16141543 461 | 12763027 462 | 15736408 463 | 9725998 464 | 12083865 465 | 14512804 466 | 16177189 467 | 24262486 468 | 29152741 469 | 9522437 470 | 16887864 471 | 26454589 472 | 10521685 473 | 15041478 474 | 18528854 475 | 11482900 476 | 10713480 477 | 27035618 478 | 11454723 479 | 15831081 480 | 25813735 481 | 14679019 482 | 12670496 483 | 12659723 484 | 9508366 485 | 22956110 486 | 15517342 487 | 18404300 488 | 10620351 489 | 12440157 490 | 12168856 491 | 21110950 492 | 9649602 493 | 6683535 494 | 33539218 495 | 12652656 496 | 21787983 497 | 15142676 498 | 25881741 499 | 8132031 500 | 18563256 501 | 27375779 502 | 12470707 503 | 20581223 504 | 23652835 505 | 17015939 506 | 16098960 507 | 27781257 508 | 27487578 509 | 27374085 510 | 26190180 511 | 19822130 512 | 31146527 513 | 33387731 514 | 23252598 515 | 29866157 516 | 26716507 517 | 30059510 518 | 16309275 519 | 31434006 520 | 29491681 521 | 25503260 522 | 2500594 523 | 14714867 524 | 23057780 525 | 11361036 526 | 33727059 527 | 23847592 528 | 15083319 529 | 25680958 530 | 27298605 531 | 26886573 532 | 25875282 533 | 19931438 534 | 1464398 535 | 30807184 536 | 26264479 537 | 11345695 538 | 20108732 539 | 19684242 540 | 32682376 541 | 33071779 542 | 28121324 543 | 18678988 544 | 26417310 545 | 26030803 546 | 29535022 547 | 23885956 548 | 26194608 549 | 26865001 550 | 32458968 551 | 21075177 552 | 26251571 553 | 26686283 554 | 28301607 555 | 33111622 556 | 22818712 557 | 25098399 558 | 17016511 559 | 31874782 560 | 25402944 561 | 31018035 562 | 19815591 563 | 12925309 564 | 34089819 565 | 20218965 566 | 11295171 567 | 20006278 568 | 32067498 569 | 33198812 570 | 22536197 571 | 18534843 572 | 12600689 573 | 32788520 574 | 32937767 575 | 33878138 576 | 3583469 577 | 31935497 578 | 18424588 579 | 24478309 580 | 23261676 581 | 28283780 582 
| 27226184 583 | 31201887 584 | 18267120 585 | 21903878 586 | 23399702 587 | 17118359 588 | 30847932 589 | 22078631 590 | 29306210 591 | 28317281 592 | 11453730 593 | 29631143 594 | 17601828 595 | 27658889 596 | 16861692 597 | 26655814 598 | 21138371 599 | 21503789 600 | 7064495 601 | 29341798 602 | 31952134 603 | 31570303 604 | 9103293 605 | 31661763 606 | 30781538 607 | 16846594 608 | 21467581 609 | 27041871 610 | 3624765 611 | 1776278 612 | 19127721 613 | 10964098 614 | 32067561 615 | 22830928 616 | 24973695 617 | 27689996 618 | 19809179 619 | 26602570 620 | 12487753 621 | 24773045 622 | 32409079 623 | 23880231 624 | 27899097 625 | 12675854 626 | 29914362 627 | 10706384 628 | 31197949 629 | 18196445 630 | 24888389 631 | 20451264 632 | 24366058 633 | 2540621 634 | 23106150 635 | 29129733 636 | 29955703 637 | 23117228 638 | 12631120 639 | 32645157 640 | 29223552 641 | 23416740 642 | 24495080 643 | 25093260 644 | 26742036 645 | 17928735 646 | 20513234 647 | 30685699 648 | 31310779 649 | 20416364 650 | 29913456 651 | 25793001 652 | 10670829 653 | 29762899 654 | 23755172 655 | 24722818 656 | 23210776 657 | 23458739 658 | 17707979 659 | 32791963 660 | 27637002 661 | 17725852 662 | 21861094 663 | 15695592 664 | 22830339 665 | 34114330 666 | 19764067 667 | 25957749 668 | 16497339 669 | 29614315 670 | 32166379 671 | 23110224 672 | 20112301 673 | 22634505 674 | 14621186 675 | 18840426 676 | 28707966 677 | 17459420 678 | 23954402 679 | 27993633 680 | 30371539 681 | 32987671 682 | 33450678 683 | 26270535 684 | 29944861 685 | 32114864 686 | 29356593 687 | 10748719 688 | 25410365 689 | 32229160 690 | 27108052 691 | 28422049 692 | 30785752 693 | 32566398 694 | 30262277 695 | 18985486 696 | 29576702 697 | 31906425 698 | 33441891 699 | 11086901 700 | 32407265 701 | 33268671 702 | 32957726 703 | 33116413 704 | 32719085 705 | 34597896 706 | 34303263 707 | 32787531 708 | 33667331 709 | 33989703 710 | 34509843 711 | 34346218 712 | 32746771 713 | 34182907 714 | 34206850 715 | 
32480000 716 | 33169423 717 | 34676878 718 | 32713005 719 | 34204055 720 | 34220500 721 | 34798162 722 | 32453025 723 | 34303263 724 | 34607977 725 | 23876835 726 | 34463173 727 | 34959345 728 | 35196743 729 | 34959764 730 | 33517563 731 | 34329684 732 | 34610498 733 | 35146639 734 | 35114350 735 | 35114443 736 | 34449312 737 | 20664420 738 | 15574782 739 | 8284325 740 | 19228856 741 | 11470725 742 | 9494537 743 | 16084066 744 | 18006026 745 | 9447270 746 | 16272702 747 | 17882140 748 | 16176615 749 | 19588362 750 | 16389263 751 | 18716169 752 | 21370398 753 | 16487645 754 | 19394393 755 | 20370896 756 | 20884815 757 | 12552591 758 | 9311619 759 | 11205489 760 | 12960117 761 | 11018418 762 | 17443132 763 | 18019399 764 | 10584049 765 | 11408354 766 | 15741048 767 | 11482766 768 | 16458187 769 | 16855537 770 | 10527294 771 | 21040626 772 | 14600287 773 | 12190117 774 | 11504910 775 | 14519830 776 | 17927503 777 | 15585768 778 | 17906193 779 | 16707877 780 | 8353840 781 | 18614722 782 | 21649457 783 | 18468736 784 | 19176733 785 | 12168856 786 | 1674554 787 | 16387402 788 | 16410036 789 | 16364532 790 | 16618952 791 | 18344595 792 | 17600357 793 | 16427718 794 | 10641059 795 | 16288056 796 | 17686632 797 | 10698173 798 | 9378841 799 | 10995329 800 | 19051209 801 | 10965518 802 | 11513717 803 | 11513717 804 | 16968850 805 | 14652367 806 | 7655336 807 | 12376503 808 | 20828156 809 | 19478218 810 | 10898615 811 | 9281609 812 | 9849117 813 | 10873562 814 | 14572506 815 | 12695345 816 | 11431333 817 | 19074207 818 | 12824094 819 | 12184788 820 | 12850499 821 | 17133573 822 | 16247510 823 | 15563575 824 | 15640470 825 | 11237198 826 | 12519715 827 | 10192923 828 | 12584176 829 | 1667297 830 | 21303262 831 | 11549554 832 | 17227956 833 | 9598844 834 | 19416635 835 | 17891480 836 | 11808919 837 | 11237939 838 | 10964006 839 | 15621343 840 | 21068474 841 | 15857209 842 | 11206275 843 | 9806157 844 | 12851151 845 | 18515521 846 | 9395271 847 | 22254006 848 | 11228277 849 | 
11780763 850 | 9356529 851 | 18326618 852 | 2894963 853 | 19719133 854 | 16278361 855 | 15735368 856 | 16988119 857 | 14759162 858 | 21149622 859 | 9568793 860 | 11807163 861 | 17234229 862 | 19472429 863 | 11179857 864 | 15277285 865 | 8187052 866 | 15541906 867 | 10950844 868 | 11334265 869 | --------------------------------------------------------------------------------