├── .gitignore ├── .idea ├── .gitignore ├── docanalysis.iml ├── inspectionProfiles │ └── profiles_settings.xml ├── modules.xml └── vcs.xml ├── LICENSE ├── README.md ├── __init__.py ├── config.ini ├── dictionary ├── abb.xml ├── ack_key_phrases_manual.txt ├── ack_key_phrases_manual │ ├── ack_key_phrases.md │ ├── ack_key_phrases_manual.xml │ └── approval_number.xml ├── acknowledgment_feature_names.xml ├── chap4_wikitest_2.xml ├── cities_dictionary │ └── cities.xml ├── consent_type.txt ├── consent_type │ └── consent_type.xml ├── ethics_committee_key_phrases.txt ├── ethics_committee_key_phrases │ └── ethics_committee_key_phrases.xml ├── ethics_key_phrases.txt ├── ethics_key_phrases │ └── ethics_key_phrases.xml ├── features_ack.txt ├── features_ack │ ├── acknowledgment_feature_names.xml │ └── features_ack.xml ├── invasion_biology │ ├── invasion_hypotheses.xml │ └── invasion_hypothesis.txt ├── ipcc.xml ├── methods_key_phrases.txt ├── methods_key_phrases │ └── methods_key_phrases.xml ├── software.xml └── test_terpene.xml ├── docanalysis ├── .DS_Store ├── __init__.py ├── ami_sections.py ├── config │ ├── default_dicts.json │ └── default_sections.json ├── convert_file.py ├── docanalysis.py ├── entity_extraction.py ├── file_lib.py ├── get_html.py ├── glob_trail.py ├── gui.py ├── gui │ ├── css │ │ └── main.css │ ├── eel.js │ └── main.html └── xml_lib.py ├── docs ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── conf.py │ ├── docanalysis.rst │ ├── entity_extraction.rst │ └── index.rst ├── notebooks ├── README.md └── c_project.ipynb ├── requirements.txt ├── resources ├── approval_number_100.csv ├── demo.py ├── docanalyis_architecture_diagram.PNG ├── docanalysis_demo.ipynb ├── entities_country.csv ├── ethics_statement_corpus_1000.csv ├── fig_ent.xml ├── oil186.csv ├── oil186_20210712.csv ├── oil186_ack.csv ├── pmr_demo.py ├── software_mentions.csv ├── stem_cell_research_300.csv ├── stem_cell_research_300_2020.csv ├── stem_cell_research_300_ethics.csv ├── terpene_fig_entities.csv └── test_pmc.txt ├── setup.py ├── software_papers.ipynb └── tests ├── test_docanalysis_cli.py ├── test_docanalysis_method.py └── testing_test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # temporary results 132 | temp/ 133 | 134 | #corpus 135 | oil186/ 136 | corpus/ 137 | stem_cell_research_300/ 138 | stem_cell_research_300_2020 139 | GPE.text 140 | ORG.text 141 | 142 | #vscode 143 | .vscode/ -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/docanalysis.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 14 | 15 | 17 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | For updated tutorial, please check the [Wiki](https://github.com/petermr/docanalysis/wiki/docanalysis-Tutorial) page. 2 | ## docanalysis 3 | `docanalysis` is a Command Line Tool that ingests corpora [(CProjects)](https://github.com/petermr/tigr2ess/blob/master/getpapers/TUTORIAL.md#cproject-and-ctrees) and carries out text-analysis of documents, including 4 | - sectioning 5 | - NLP/text-mining 6 | - dictionary generation 7 | 8 | Besides the bespoke code, it uses [NLTK](https://www.nltk.org/) and other Python tools for many operations, and [spaCy](https://spacy.io/) or [scispaCy](https://allenai.github.io/scispacy/) for extraction and annotation of entities. Outputs summary data and word-dictionaries. 9 | 10 | ### Set up `venv` 11 | We recommend you create a virtual environment (`venv`) before installing `docanalysis` and that you activate the `venv` before each time you run `docanalysis`. 12 | 13 | #### Windows 14 | Creating a `venv` 15 | ``` 16 | >> mkdir docanalysis_demo 17 | >> cd docanalysis_demo 18 | >> python -m venv venv 19 | ``` 20 | 21 | Activating `venv` 22 | ``` 23 | >> venv\Scripts\activate.bat 24 | ``` 25 | 26 | #### MacOS 27 | Creating a `venv` 28 | ``` 29 | >> mkdir docanalysis_demo 30 | >> cd docanalysis_demo 31 | >> python3 -m venv venv 32 | ``` 33 | 34 | Activating `venv` 35 | ``` 36 | >> source venv/bin/activate 37 | ``` 38 | 39 | Refer the [official documentation](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) for more help. 40 | 41 | ### Install `docanalysis` 42 | You can download `docanalysis` from PYPI. 43 | ``` 44 | pip install docanalysis 45 | ``` 46 | If you are on a Mac 47 | ``` 48 | pip3 install docanalysis 49 | ``` 50 | 51 | Download python from: [https://www.python.org/downloads/](https://www.python.org/downloads/) and select the option `Add Python to Path while installing`. Make sure `pip` is installed along with python. Check out [https://pip.pypa.io/en/stable/installation/](https://pip.pypa.io/en/stable/installation/) if you have difficulties installing pip. 52 | 53 | ### Run `docanalysis` 54 | `docanalysis --help` should list the flags we support and their use. 
55 | 56 | ``` 57 | usage: docanalysis.py [-h] [--run_pygetpapers] [--make_section] [-q QUERY] [-k HITS] [--project_name PROJECT_NAME] [-d DICTIONARY] [-o OUTPUT] 58 | [--make_ami_dict MAKE_AMI_DICT] [--search_section [SEARCH_SECTION [SEARCH_SECTION ...]]] [--entities [ENTITIES [ENTITIES ...]]] 59 | [--spacy_model SPACY_MODEL] [--html HTML] [--synonyms SYNONYMS] [--make_json MAKE_JSON] [--search_html] [--extract_abb EXTRACT_ABB] 60 | [-l LOGLEVEL] [-f LOGFILE] 61 | 62 | Welcome to docanalysis version 0.1.3. -h or --help for help 63 | 64 | optional arguments: 65 | -h, --help show this help message and exit 66 | --run_pygetpapers [Command] downloads papers from EuropePMC via pygetpapers 67 | --make_section [Command] makes sections; requires a fulltext.xml in CTree directories 68 | -q QUERY, --query QUERY 69 | [pygetpapers] query string 70 | -k HITS, --hits HITS [pygetpapers] number of papers to download 71 | --project_name PROJECT_NAME 72 | CProject directory name 73 | -d DICTIONARY, --dictionary DICTIONARY 74 | [file name/url] existing ami dictionary to annotate sentences or support supervised entity extraction 75 | -o OUTPUT, --output OUTPUT 76 | outputs csv with sentences/terms 77 | --make_ami_dict MAKE_AMI_DICT 78 | [Command] title for ami-dict. Makes ami-dict of all extracted entities; works only with spacy 79 | --search_section [SEARCH_SECTION [SEARCH_SECTION ...]] 80 | [NER/dictionary search] section(s) to annotate. Choose from: ALL, ACK, AFF, AUT, CON, DIS, ETH, FIG, INT, KEY, MET, RES, TAB, TIL. Defaults to 81 | ALL 82 | --entities [ENTITIES [ENTITIES ...]] 83 | [NER] entities to extract. Default (ALL). Common entities SpaCy: GPE, LANGUAGE, ORG, PERSON (for additional ones check: ); SciSpaCy: CHEMICAL, 84 | DISEASE 85 | --spacy_model SPACY_MODEL 86 | [NER] optional. Choose between spacy or scispacy models. Defaults to spacy 87 | --html HTML outputs html with sentences/terms 88 | --synonyms SYNONYMS annotate the corpus/sections with synonyms from ami-dict 89 | --make_json MAKE_JSON 90 | outputs json with sentences/terms 91 | --search_html searches html documents (mainly IPCC) 92 | --extract_abb EXTRACT_ABB 93 | [Command] title for abb-ami-dict. Extracts abbreviations and expansions; makes ami-dict of all extracted entities 94 | -l LOGLEVEL, --loglevel LOGLEVEL 95 | provide logging level. 
Example --log warning <>, default='info' 96 | -f LOGFILE, --logfile LOGFILE 97 | saves log to specified file in output directory as well as printing to terminal 98 | ``` 99 | 100 | #### Download papers from [EPMC](https://europepmc.org/) via `pygetpapers` 101 | COMMAND 102 | ``` 103 | docanalysis --run_pygetpapers -q "terpene" -k 10 --project_name terpene_10 104 | ``` 105 | LOGS 106 | ``` 107 | INFO: making project/searching terpene for 10 hits into C:\Users\shweata\docanalysis\terpene_10 108 | INFO: Total Hits are 13935 109 | 1it [00:00, 936.44it/s] 110 | INFO: Saving XML files to C:\Users\shweata\docanalysis\terpene_10\*\fulltext.xml 111 | 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:30<00:00, 3.10s/it] 112 | ``` 113 | 114 | CPROJ 115 | ``` 116 | C:\USERS\SHWEATA\DOCANALYSIS\TERPENE_10 117 | │ eupmc_results.json 118 | │ 119 | ├───PMC8625850 120 | │ eupmc_result.json 121 | │ fulltext.xml 122 | │ 123 | ├───PMC8727598 124 | │ eupmc_result.json 125 | │ fulltext.xml 126 | │ 127 | ├───PMC8747377 128 | │ eupmc_result.json 129 | │ fulltext.xml 130 | │ 131 | ├───PMC8771452 132 | │ eupmc_result.json 133 | │ fulltext.xml 134 | │ 135 | ├───PMC8775117 136 | │ eupmc_result.json 137 | │ fulltext.xml 138 | │ 139 | ├───PMC8801761 140 | │ eupmc_result.json 141 | │ fulltext.xml 142 | │ 143 | ├───PMC8831285 144 | │ eupmc_result.json 145 | │ fulltext.xml 146 | │ 147 | ├───PMC8839294 148 | │ eupmc_result.json 149 | │ fulltext.xml 150 | │ 151 | ├───PMC8840323 152 | │ eupmc_result.json 153 | │ fulltext.xml 154 | │ 155 | └───PMC8879232 156 | eupmc_result.json 157 | fulltext.xml 158 | ``` 159 | 160 | #### Section the papers 161 | COMMAND 162 | ``` 163 | docanalysis --project_name terpene_10 --make_section 164 | ``` 165 | LOGS 166 | ``` 167 | WARNING: Making sections in /content/terpene_10/PMC9095633/fulltext.xml 168 | INFO: dict_keys: dict_keys(['abstract', 'acknowledge', 'affiliation', 'author', 'conclusion', 'discussion', 'ethics', 'fig_caption', 'front', 'introduction', 'jrnl_title', 'keyword', 'method', 'octree', 'pdfimage', 'pub_date', 'publisher', 'reference', 'results_discuss', 'search_results', 'sections', 'svg', 'table', 'title']) 169 | WARNING: loading templates.json 170 | INFO: wrote XML sections for /content/terpene_10/PMC9095633/fulltext.xml /content/terpene_10/PMC9095633/sections 171 | WARNING: Making sections in /content/terpene_10/PMC9120863/fulltext.xml 172 | INFO: wrote XML sections for /content/terpene_10/PMC9120863/fulltext.xml /content/terpene_10/PMC9120863/sections 173 | WARNING: Making sections in /content/terpene_10/PMC8982386/fulltext.xml 174 | INFO: wrote XML sections for /content/terpene_10/PMC8982386/fulltext.xml /content/terpene_10/PMC8982386/sections 175 | WARNING: Making sections in /content/terpene_10/PMC9069239/fulltext.xml 176 | INFO: wrote XML sections for /content/terpene_10/PMC9069239/fulltext.xml /content/terpene_10/PMC9069239/sections 177 | WARNING: Making sections in /content/terpene_10/PMC9165828/fulltext.xml 178 | INFO: wrote XML sections for /content/terpene_10/PMC9165828/fulltext.xml /content/terpene_10/PMC9165828/sections 179 | WARNING: Making sections in /content/terpene_10/PMC9119530/fulltext.xml 180 | INFO: wrote XML sections for /content/terpene_10/PMC9119530/fulltext.xml /content/terpene_10/PMC9119530/sections 181 | WARNING: Making sections in /content/terpene_10/PMC8982077/fulltext.xml 182 | INFO: wrote XML sections for /content/terpene_10/PMC8982077/fulltext.xml 
/content/terpene_10/PMC8982077/sections 183 | WARNING: Making sections in /content/terpene_10/PMC9067962/fulltext.xml 184 | INFO: wrote XML sections for /content/terpene_10/PMC9067962/fulltext.xml /content/terpene_10/PMC9067962/sections 185 | WARNING: Making sections in /content/terpene_10/PMC9154778/fulltext.xml 186 | INFO: wrote XML sections for /content/terpene_10/PMC9154778/fulltext.xml /content/terpene_10/PMC9154778/sections 187 | WARNING: Making sections in /content/terpene_10/PMC9164016/fulltext.xml 188 | INFO: wrote XML sections for /content/terpene_10/PMC9164016/fulltext.xml /content/terpene_10/PMC9164016/sections 189 | 47% 1056/2258 [00:01<00:01, 1003.31it/s]ERROR: cannot parse /content/terpene_10/PMC9165828/sections/1_front/1_article-meta/26_custom-meta-group/0_custom-meta/1_meta-value/0_xref.xml 190 | 67% 1516/2258 [00:01<00:00, 1047.68it/s]ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/7_xref.xml 191 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/14_email.xml 192 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/3_xref.xml 193 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/6_xref.xml 194 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/9_email.xml 195 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/10_email.xml 196 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/4_xref.xml 197 | ... 198 | 100% 2258/2258 [00:02<00:00, 949.43it/s] 199 | ``` 200 | 201 | CTREE 202 | ``` 203 | ├───PMC8625850 204 | │ └───sections 205 | │ ├───0_processing-meta 206 | │ ├───1_front 207 | │ │ ├───0_journal-meta 208 | │ │ └───1_article-meta 209 | │ ├───2_body 210 | │ │ ├───0_1._introduction 211 | │ │ ├───1_2._materials_and_methods 212 | │ │ │ ├───1_2.1._materials 213 | │ │ │ ├───2_2.2._bacterial_strains 214 | │ │ │ ├───3_2.3._preparation_and_character 215 | │ │ │ ├───4_2.4._evaluation_of_the_effect_ 216 | │ │ │ ├───5_2.5._time-kill_studies 217 | │ │ │ ├───6_2.6._propidium_iodide_uptake-e 218 | │ │ │ └───7_2.7._hemolysis_test_from_human 219 | │ │ ├───2_3._results 220 | │ │ │ ├───1_3.1._encapsulation_of_terpene_ 221 | │ │ │ ├───2_3.2._both_terpene_alcohol-load 222 | │ │ │ ├───3_3.3._farnesol_and_geraniol-loa 223 | │ │ │ └───4_3.4._farnesol_and_geraniol-loa 224 | │ │ ├───3_4._discussion 225 | │ │ ├───4_5._conclusions 226 | │ │ └───5_6._patents 227 | │ ├───3_back 228 | │ │ ├───0_ack 229 | │ │ ├───1_fn-group 230 | │ │ │ └───0_fn 231 | │ │ ├───2_app-group 232 | │ │ │ └───0_app 233 | │ │ │ └───2_supplementary-material 234 | │ │ │ └───0_media 235 | │ │ └───9_ref-list 236 | │ └───4_floats-group 237 | │ ├───4_table-wrap 238 | │ ├───5_table-wrap 239 | │ ├───6_table-wrap 240 | │ │ └───4_table-wrap-foot 241 | │ │ └───0_fn 242 | │ ├───7_table-wrap 243 | │ └───8_table-wrap 244 | ... 245 | ``` 246 | ##### Search sections using dictionary 247 | COMMAND 248 | ``` 249 | docanalysis --project_name terpene_10 --output entities.csv --make_ami_dict entities.xml 250 | ``` 251 | LOGS 252 | ``` 253 | INFO: Found 7134 sentences in the section(s). 
254 | INFO: getting terms from /content/activity.xml 255 | 100% 7134/7134 [00:02<00:00, 3172.14it/s] 256 | /usr/local/lib/python3.7/dist-packages/docanalysis/entity_extraction.py:352: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True. 257 | "[", "").str.replace("]", "") 258 | INFO: wrote output to /content/terpene_10/activity.csv 259 | ``` 260 | 261 | #### Extract entities 262 | We use `spacy` to extract Named Entites. Here's the list of Entities it supports:CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW,LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART 263 | INPUT 264 | ``` 265 | docanalysis --project_name terpene_10 --make_section --spacy_model spacy --entities ORG --output org.csv 266 | ``` 267 | LOGS 268 | ``` 269 | INFO: Found 7134 sentences in the section(s). 270 | INFO: Loading spacy 271 | 100% 7134/7134 [01:08<00:00, 104.16it/s] 272 | /usr/local/lib/python3.7/dist-packages/docanalysis/entity_extraction.py:352: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True. 273 | "[", "").str.replace("]", "") 274 | INFO: wrote output to /content/terpene_10/org.csv 275 | ``` 276 | ##### Extract information from specific section(s) 277 | You can choose to extract entities from specific sections 278 | 279 | COMMAND 280 | ``` 281 | docanalysis --project_name terpene_10 --make_section --spacy_model spacy --search_section AUT, AFF --entities ORG --output org_aut_aff.csv 282 | ``` 283 | LOG 284 | ``` 285 | INFO: Found 28 sentences in the section(s). 286 | INFO: Loading spacy 287 | 100% 28/28 [00:00<00:00, 106.66it/s] 288 | /usr/local/lib/python3.7/dist-packages/docanalysis/entity_extraction.py:352: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True. 289 | "[", "").str.replace("]", "") 290 | INFO: wrote output to /content/terpene_10/org_aut_aff.csv 291 | ``` 292 | #### Create dictionary of extracted entities 293 | COMMAND 294 | ``` 295 | docanalysis --project_name terpene_10 --make_section --spacy_model spacy --search_section AUT, AFF --entities ORG --output org_aut_aff.csvv --make_ami_dict org 296 | ``` 297 | LOG 298 | ``` 299 | INFO: Found 28 sentences in the section(s). 300 | INFO: Loading spacy 301 | 100% 28/28 [00:00<00:00, 96.56it/s] 302 | /usr/local/lib/python3.7/dist-packages/docanalysis/entity_extraction.py:352: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True. 
303 | "[", "").str.replace("]", "") 304 | INFO: wrote output to /content/terpene_10/org_aut_aff.csvv 305 | INFO: Wrote all the entities extracted to ami dict 306 | ``` 307 | 308 | Snippet of the dictionary 309 | ``` 310 | 311 | - dictionary title="/content/terpene_10/org.xml"> 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | ``` 323 | 324 | ### Extract Abbreviations 325 | 326 | ``` 327 | docanalysis --project_name corpus\ethics_10 --output dict_search_5.csv --make_json dict_search_5.json --make_ami_dict entities --extract_abb ethics_abb 328 | ``` 329 | 330 | `--extract_abb` extracts all abbreviations and make an ami-dictionary of abbreviations and its expansion. 331 | 332 | EXAMPLE DICTIONARY: 333 | ``` 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | ``` 356 | 357 | ### Search HTML 358 | If you working with HTML files (IPCC Reports, for example) and not XMLs in CProjects, you can use `--search_html` flag. 359 | 360 | ``` 361 | docanalysis --project_name corpus\ipcc_sectioned --extract_abb ethics_abb --search_html 362 | ``` 363 | 364 | Make sure that your `html` sections is in `sections` folder. Here's an example structure: 365 | 366 | ``` 367 | C:. 368 | | dict_search_2.csv 369 | | dict_search_2.json 370 | | 371 | \---chap4 372 | | chapter_4 373 | | 374 | \---sections 375 | 4.1.html 376 | 4.2.1.html 377 | 4.2.2.html 378 | 4.2.3.html 379 | 4.2.4.html 380 | 4.2.5.html 381 | 4.2.7.html 382 | 4.2.html 383 | 4.3.1.html 384 | 4.3.2.html 385 | 4.3.html 386 | 4.4.1.html 387 | 4.4.2.html 388 | 4.4.html 389 | 4.5.html 390 | executive_summary.html 391 | frequently_asked_questions.html 392 | table_of_contents.html 393 | ``` 394 | If you haven't sectioned your `html`, please use `py4ami` to section it. 395 | #### What is a dictionary 396 | Dictionary, in `ami`'s terminology, a set of terms/phrases in XML format. 397 | Dictionaries related to ethics and acknowledgments are available in [Ethics Dictionary](https://github.com/petermr/docanalysis/tree/main/ethics_dictionary) folder 398 | 399 | If you'd like to create a custom dictionary, you can find the steps, [here](https://github.com/petermr/tigr2ess/blob/master/dictionaries/TUTORIAL.md) 400 | 401 | ``` 402 | ### Python tools used 403 | - [`pygetpapers`](https://github.com/petermr/pygetpapers) - scrape open repositories to download papers of interest 404 | - [nltk](https://www.nltk.org/) - splits sentences 405 | - [spaCy](https://spacy.io/) and [SciSpaCy](https://allenai.github.io/scispacy/) 406 | - recognize Named-Entities and label them 407 | - Here's the list of NER labels [SpaCy's English model](https://spacy.io/models/en) provides: 408 | `CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART` 409 | 410 | 411 | ### Credits: 412 | - [Ayush Garg](https://github.com/ayush4921) 413 | - [Shweata N. 
Hegde](https://github.com/ShweataNHegde/) 414 | - [Daniel Mietchen](https://github.com/Daniel-Mietchen) 415 | - [Peter Murray-Rust](https://github.com/petermr) 416 | 417 | 418 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petermr/docanalysis/eb8a3c13f9491b41f252a363953a976406f964b6/__init__.py -------------------------------------------------------------------------------- /config.ini: -------------------------------------------------------------------------------- 1 | [ethics_statement] 2 | version=0.0.0.1 -------------------------------------------------------------------------------- /dictionary/abb.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /dictionary/ack_key_phrases_manual.txt: -------------------------------------------------------------------------------- 1 | conflict of interest 2 | financial support -------------------------------------------------------------------------------- /dictionary/ack_key_phrases_manual/ack_key_phrases.md: -------------------------------------------------------------------------------- 1 | The terms in the dictionary were created manually by Chaitanya. 2 | -------------------------------------------------------------------------------- /dictionary/ack_key_phrases_manual/ack_key_phrases_manual.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /dictionary/ack_key_phrases_manual/approval_number.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /dictionary/acknowledgment_feature_names.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /dictionary/chap4_wikitest_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /dictionary/consent_type.txt: 
-------------------------------------------------------------------------------- 1 | video consent 2 | informed consent 3 | written informed consent 4 | verbal consent 5 | voluntary consent 6 | competent consent -------------------------------------------------------------------------------- /dictionary/consent_type/consent_type.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /dictionary/ethics_committee_key_phrases.txt: -------------------------------------------------------------------------------- 1 | Ethics Committee of 2 | Ethical Committee of 3 | Institutional Review Board of 4 | the IRB of 5 | Institutional Animal Care and Use Committee of 6 | Animal Care and Use Committee of 7 | IACUC of -------------------------------------------------------------------------------- /dictionary/ethics_committee_key_phrases/ethics_committee_key_phrases.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /dictionary/ethics_key_phrases.txt: -------------------------------------------------------------------------------- 1 | animal study was reviewed and approved by 2 | protocol was approved by 3 | studies involving human participants were reviewed and approved by 4 | approved by 5 | informed and written consent 6 | written consent 7 | informed consent 8 | Ethics Committee 9 | legal guardian 10 | next of kin 11 | national legislation 12 | ethics committees 13 | ethics committees 14 | Ethics Committees 15 | appropriate approval 16 | written informed consent 17 | Principle of Laboratory Animal Care 18 | ethics guidelines 19 | Experimental protocols were approved by 20 | ethical clearance 21 | Ethical approval was authorized through 22 | Animal Ethics Committee 23 | Declaration of Helsinki 24 | principles of Good Clinical Practice 25 | Good Clinical Practice 26 | approved by the Institutional Review Board 27 | International Conference on Harmonisation Good Clinical Practice guidelines 28 | approved the study 29 | local ethical committees 30 | International Council for Harmonisation of Technical Requirements for Pharmaceuticals for Human Use 31 | -------------------------------------------------------------------------------- /dictionary/ethics_key_phrases/ethics_key_phrases.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /dictionary/features_ack.txt: -------------------------------------------------------------------------------- 1 | authors gratefully acknowledge 2 | authors sincerely thank 3 | authors thank 4 | special thanks 5 | for providing 6 | author would like 7 | providing constant support 8 | grateful 9 | authors acknowledge 10 | financial contribution 11 | research support 12 | financial support 13 | also thank 14 | scientific support 15 | helpful suggestions 16 | technical support 17 | support provided 18 | sincere thanks 19 | authors express 20 | authors extend 21 | kind support 22 | facilities provided 23 | providing facilities 24 | appreciated 25 | necessary facilities 26 | 
provided funding 27 | technical assistance 28 | fellowship 29 | funds 30 | contribution 31 | helpful comments 32 | reliable care 33 | valuable comments -------------------------------------------------------------------------------- /dictionary/features_ack/acknowledgment_feature_names.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /dictionary/features_ack/features_ack.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /dictionary/invasion_biology/invasion_hypotheses.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /dictionary/invasion_biology/invasion_hypothesis.txt: -------------------------------------------------------------------------------- 1 | Biotic Resistance Hypothesis 2 | Enemy Release Hypothesis 3 | Propagule Pressure Hypothesis 4 | Synthesizing Invasion Hypotheses 5 | Tens Rule 6 | novel weapons hypothesis 7 | Darwin's naturalization and limiting similarity hypotheses 8 | Phenotypic plasticity hypothesis 9 | Evolution of increased competitive ability and shifting defence hypotheses 10 | Invasional meltdown hypothesis 11 | Disturbance hypothesis 12 | -------------------------------------------------------------------------------- /dictionary/ipcc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /dictionary/methods_key_phrases.txt: -------------------------------------------------------------------------------- 1 | flowering season 2 | harvesting season 3 | blossoming season 4 | collected from 5 | collected in 6 | were collected in a blossoming period 7 | oil sample was obtained from 8 | sample obtained from 9 | flowering period 10 | harvesting period 11 | blossoming period 12 | seeds were sampled from 13 | were purchased from local markets 14 | -------------------------------------------------------------------------------- /dictionary/methods_key_phrases/methods_key_phrases.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /dictionary/software.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /dictionary/test_terpene.xml: -------------------------------------------------------------------------------- 1 | 2 | Created from SPARQL query 3 | 4 | D-limonene(+) 5 | (R)-Limonene(+) 6 | (4R)-4-isopropenyl-1-methylcyclohexene 7 | (R)-4-isopropenyl-1-methyl-1-cyclohexene 8 | (R)-(+)-limonene 9 | D-(+)-limonene 
10 | (4R)-1-methyl-4-isopropenylcyclohex-1-ene 11 | (4R)-limonene(+) 12 | (4R)-Limonene 13 | D-Limonen 14 | (+)-4-isopropenyl-1-methylcyclohexene 15 | (R)-p-mentha-1,8-diene 16 | (R)(+)-p-mentha-1,8-diene 17 | (+)-limonene 18 | (1R)-(+)-α-pinene 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /docanalysis/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petermr/docanalysis/eb8a3c13f9491b41f252a363953a976406f964b6/docanalysis/.DS_Store -------------------------------------------------------------------------------- /docanalysis/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | docanalysis module 3 | """ 4 | from docanalysis.entity_extraction import EntityExtraction 5 | from docanalysis.docanalysis import Docanalysis 6 | 7 | __author__ = "Ayush Garg", "Shweata N. Hegde" 8 | __email__ = "ayush@science.org.in", "shweata.hegde@gmail.com" -------------------------------------------------------------------------------- /docanalysis/ami_sections.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from pathlib import Path 3 | import logging 4 | from lxml import etree as LXET 5 | 6 | from docanalysis.xml_lib import XmlLib 7 | 8 | 9 | class AMIAbsSection(ABC): 10 | """ """ 11 | logger = logging.getLogger("ami_abs_section") 12 | 13 | SECTIONS = "sections" 14 | 15 | def __init__(self) -> None: 16 | pass 17 | 18 | 19 | @classmethod 20 | def make_xml_sections(cls, file, outdir: str, force: bool) -> None: 21 | """make sections 22 | 23 | :param file: 24 | :param outdir: str: 25 | :param force: bool: 26 | 27 | """ 28 | if file is None or outdir is None: 29 | return None 30 | path = Path(file) 31 | if not path.exists(): 32 | cls.logger.warning(f"file {file} does not exist") 33 | return 34 | # sections = Path(self.dirx) 35 | if force or not Path(outdir).exists(): 36 | cls.logger.warning(f"Making sections in {str(path)}") 37 | xml_libx = XmlLib() 38 | xml_libx.logger.setLevel(logging.DEBUG) 39 | xml_libx.read(file) 40 | xml_libx.make_sections(outdir) 41 | 42 | 43 | class AMIFigure(AMIAbsSection): 44 | """holds data on figure captions and hopefully later pointers to pdfimages 45 | 46 | Figures are a mess in JATS. They can be held in different places and often not linked 47 | to the bitmap. This class will include heuristics for uniting and standardising this. 48 | 49 | JATS encoding depends on the publisher. Typically: 50 | 52 | 53 | 54 | XPS core spectra comparison for aged baseline and SEB-3 electrodes. 55 |

</title> <p>The graphite and NCM622 electrodes are taken from the baseline cell after 956 cycles and 56 | the SEB-3 cell after 4021 cycles.</p> 57 | </caption> 58 | 59 | </fig>' 60 | 61 | There are sometimes 2 or more <p> children
as children of caption. 62 | 63 | 64 | """ 65 | 66 | # JATS tags 67 | LABEL = "label_xml" 68 | CAPTION = "caption" 69 | P = "p" 70 | TITLE = "title" 71 | 72 | def __init__(self): 73 | super().__init__() 74 | self.root = None 75 | self.root_str = None 76 | self.label_xml = None 77 | self.label_text = None 78 | self.caption = None 79 | self.caption_p = None 80 | self.p_text = None 81 | self.caption_title = None 82 | self.title_text = None 83 | 84 | @classmethod 85 | def create_from_jats(cls, xml_path): 86 | """ 87 | 88 | :param xml_path: 89 | 90 | """ 91 | ami_figure = AMIFigure() 92 | ami_figure.root = XmlLib.parse_xml_file_to_root(str(xml_path)) 93 | ami_figure.add_figure_structure() 94 | return ami_figure 95 | 96 | def add_figure_structure(self): 97 | """creates label, caption, title, test(p) from JATS xml""" 98 | self.root_str = LXET.tostring(self.root) 99 | self.label_xml = XmlLib.get_or_create_child(self.root, self.LABEL) 100 | self.label_text = XmlLib.get_text(self.label_xml) 101 | self.caption = XmlLib.get_or_create_child(self.root, self.CAPTION) 102 | self.caption_p = XmlLib.get_or_create_child(self.caption, self.P) 103 | self.p_text = XmlLib.get_text(self.caption_p) 104 | self.caption_title = XmlLib.get_or_create_child(self.caption, self.TITLE) 105 | self.title_text = XmlLib.get_text(self.caption_title) 106 | 107 | def get_xml_str(self): 108 | """ """ 109 | return LXET.tostring(self.root) 110 | 111 | def __str__(self): 112 | s = f" --- {self.label_xml} ----\n" \ 113 | f"[{self.title_text}] \n" \ 114 | f" {self.p_text}" 115 | return s 116 | -------------------------------------------------------------------------------- /docanalysis/config/default_dicts.json: -------------------------------------------------------------------------------- 1 | { 2 | "EO_ACTIVITY": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/activity/eo_activity.xml", 3 | "EO_COMPOUND": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/compound/eo_compound.xml", 4 | "EO_ANALYSIS": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/analysis/eo_analysis_method.xml", 5 | "EO_EXTRACTION": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/extraction/eo_extraction.xml", 6 | "EO_PLANT": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/plant/eo_plant.xml", 7 | "PLANT_GENUS": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/plant_genus/plant_genus.xml", 8 | "EO_PLANT_PART": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/plant_part/plant_part.xml", 9 | "EO_TARGET": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/target/eo_target_organism.xml", 10 | "COUNTRY": "https://raw.githubusercontent.com/petermr/dictionary/main/openVirus20210120/country/country.xml", 11 | "DISEASE": "https://raw.githubusercontent.com/petermr/dictionary/main/openVirus20210120/disease/disease.xml", 12 | "ORGANIZATION": "https://raw.githubusercontent.com/petermr/dictionary/main/openVirus20210120/organization/organization.xml", 13 | "DRUG": "https://raw.githubusercontent.com/petermr/dictionary/main/openVirus20210120/drug/drug.xml", 14 | "TEST_TRACE": "https://raw.githubusercontent.com/petermr/dictionary/main/openVirus20210120/test_trace/test_trace.xml" 15 | } 16 | -------------------------------------------------------------------------------- /docanalysis/config/default_sections.json: -------------------------------------------------------------------------------- 1 | { 2 | "ABS":["*abstract.xml"], 3 | "ACK": 
["*ack.xml"], 4 | "AFF": ["*aff.xml"], 5 | "AUT": ["*contrib-group.xml"], 6 | "CON": ["*conclusion*/*.xml"], 7 | "DIS": ["*discussion*/**/*_title.xml", "*discussion*/**/*_p.xml"], 8 | "ETH": ["*ethic*/*.xml"], 9 | "FIG": ["*fig*.xml"], 10 | "INT": ["*introduction*/*.xml", "*background*/*.xml"], 11 | "KEY": ["*kwd-group.xml"], 12 | "MET": ["*method*/*.xml", "*material*/*.xml"] , 13 | "RES": ["*result*/*/*_title.xml", "*result*/*/*_p.xml"], 14 | "TAB": ["*table*.xml"], 15 | "TIL": ["*article-meta/*title-group.xml"], 16 | "HTML": ["*.html"] 17 | 18 | } -------------------------------------------------------------------------------- /docanalysis/convert_file.py: -------------------------------------------------------------------------------- 1 | import os 2 | from chardet import detect 3 | 4 | # get file encoding type 5 | 6 | 7 | def get_encoding_type(file): 8 | """ 9 | 10 | :param file: 11 | 12 | """ 13 | with open(file, 'rb') as f: 14 | rawdata = f.read() 15 | return detect(rawdata)['encoding'] 16 | 17 | 18 | from_codec = get_encoding_type('entity_extraction.py') 19 | 20 | # add try: except block for reliability 21 | try: 22 | with open('entity_extraction.py', 'r', encoding=from_codec) as f, open('entity_extraction2.py', 'w', encoding='utf-8') as e: 23 | text = f.read() # for small files, for big use chunks 24 | e.write(text) 25 | 26 | 27 | except UnicodeDecodeError: 28 | print('Decode Error') 29 | except UnicodeEncodeError: 30 | print('Encode Error') 31 | -------------------------------------------------------------------------------- /docanalysis/docanalysis.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import sys 4 | import configargparse 5 | import coloredlogs 6 | from time import gmtime, strftime 7 | from tqdm import tqdm 8 | from functools import partialmethod 9 | from docanalysis.entity_extraction import EntityExtraction 10 | 11 | 12 | class Docanalysis: 13 | 14 | def __init__(self): 15 | """This function makes all the constants""" 16 | self.entity_extraction = EntityExtraction() 17 | self.version = "0.3.0" 18 | 19 | def handle_logger_creation(self, args): 20 | """handles the logging on cml 21 | 22 | :param args: description] 23 | :type args: type] 24 | 25 | """ 26 | coloredlogs.install() 27 | levels = { 28 | "critical": logging.CRITICAL, 29 | "error": logging.ERROR, 30 | "warn": logging.WARNING, 31 | "warning": logging.WARNING, 32 | "info": logging.INFO, 33 | "debug": logging.DEBUG, 34 | } 35 | level = levels.get(args.loglevel.lower()) 36 | 37 | if level == logging.DEBUG: 38 | tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) 39 | 40 | if args.logfile: 41 | self.handle_logfile(args, level) 42 | else: 43 | coloredlogs.install(level=level, fmt='%(levelname)s: %(message)s') 44 | 45 | def handlecli(self): 46 | """Handles the command line interface using argparse""" 47 | version = self.version 48 | 49 | default_path = strftime("%Y_%m_%d_%H_%M_%S", gmtime()) 50 | parser = configargparse.ArgParser( 51 | description=f"Welcome to docanalysis version {version}. 
-h or --help for help", 52 | add_config_file_help=False, 53 | ) 54 | parser.add_argument( 55 | "--run_pygetpapers", 56 | default=False, 57 | action="store_true", 58 | help="[Command] downloads papers from EuropePMC via pygetpapers", 59 | ) 60 | parser.add_argument( 61 | "--make_section", 62 | default=False, 63 | action="store_true", 64 | help="[Command] makes sections; requires a fulltext.xml in CTree directories", 65 | ) 66 | parser.add_argument( 67 | "-q", 68 | "--query", 69 | default=None, 70 | type=str, 71 | help="[pygetpapers] query string", 72 | ) 73 | parser.add_argument( 74 | "-k", 75 | "--hits", 76 | type=str, 77 | default=None, 78 | help="[pygetpapers] number of papers to download", 79 | ) 80 | 81 | parser.add_argument( 82 | "--project_name", 83 | type=str, 84 | help="CProject directory name", 85 | default=os.path.join(os.getcwd(), default_path), 86 | ) 87 | parser.add_argument( 88 | "-d", 89 | "--dictionary", 90 | default=[], 91 | type=str, 92 | nargs='*', 93 | help="[file name/url] existing ami dictionary to annotate sentences or support supervised entity extraction", 94 | ) 95 | parser.add_argument( 96 | "-o", 97 | "--output", 98 | default=False, 99 | help="outputs csv with sentences/terms", 100 | ) 101 | parser.add_argument( 102 | "--make_ami_dict", 103 | default=False, 104 | help="[Command] title for ami-dict. Makes ami-dict of all extracted entities; works only with spacy", 105 | ) 106 | parser.add_argument( 107 | "--search_section", 108 | default=['ALL'], 109 | action='store', 110 | dest='search_section', 111 | type=str, 112 | nargs='*', 113 | help="[NER/dictionary search] section(s) to annotate. Choose from: ALL, ACK, AFF, AUT, CON, DIS, ETH, FIG, INT, KEY, MET, RES, TAB, TIL. Defaults to ALL", 114 | ) 115 | 116 | parser.add_argument( 117 | "--entities", 118 | default=['ALL'], 119 | action='store', dest='entities', 120 | type=str, nargs='*', 121 | help="[NER] entities to extract. Default (ALL). Common entities " 122 | "SpaCy: GPE, LANGUAGE, ORG, PERSON (for additional ones check: ); " 123 | ) 124 | 125 | parser.add_argument( 126 | "--spacy_model", 127 | default=False, 128 | type=str, 129 | help="[NER] optional.", 130 | ) 131 | 132 | parser.add_argument( 133 | "--html", 134 | default=False, 135 | type=str, 136 | help="outputs html with sentences/terms", 137 | ) 138 | 139 | parser.add_argument( 140 | "--synonyms", 141 | default=False, 142 | type=str, 143 | help="annotate the corpus/sections with synonyms from ami-dict", 144 | ) 145 | parser.add_argument( 146 | "--make_json", 147 | default=False, 148 | type=str, 149 | help="outputs json with sentences/terms", 150 | ) 151 | parser.add_argument( 152 | "--search_html", 153 | default=False, 154 | action="store_true", 155 | help="searches html documents (mainly IPCC)", 156 | ) 157 | parser.add_argument( 158 | "--extract_abb", 159 | default=False, 160 | help="[Command] title for abb-ami-dict. Extracts abbreviations and expansions; makes ami-dict of all extracted entities" 161 | ) 162 | 163 | parser.add_argument( 164 | "-l", 165 | "--loglevel", 166 | default="info", 167 | help="provide logging level. 
" 168 | "Example --log warning <>, default='info'", 169 | ) 170 | 171 | parser.add_argument( 172 | "-f", 173 | "--logfile", 174 | default=False, 175 | type=str, 176 | help="saves log to specified file in output directory as well as printing to terminal", 177 | ) 178 | 179 | if len(sys.argv) == 1: 180 | parser.print_help(sys.stderr) 181 | sys.exit() 182 | args = parser.parse_args() 183 | for arg in vars(args): 184 | if vars(args)[arg] == "False": 185 | vars(args)[arg] = False 186 | self.handle_logger_creation(args) 187 | self.entity_extraction.extract_entities_from_papers(args.project_name, args.dictionary, search_sections=args.search_section, entities=args.entities, query=args.query, hits=args.hits, 188 | run_pygetpapers=args.run_pygetpapers, make_section=args.make_section, removefalse=True, 189 | csv_name=args.output, make_ami_dict=args.make_ami_dict, spacy_model=args.spacy_model, html_path=args.html, synonyms=args.synonyms, make_json=args.make_json, search_html=args.search_html, extract_abb=args.extract_abb) 190 | 191 | 192 | def main(): 193 | """Runs the CLI""" 194 | calldocanalysis = Docanalysis() 195 | calldocanalysis.handlecli() 196 | 197 | 198 | if __name__ == "__main__": 199 | main() 200 | -------------------------------------------------------------------------------- /docanalysis/entity_extraction.py: -------------------------------------------------------------------------------- 1 | from distutils.log import error 2 | import os 3 | import logging 4 | import requests 5 | from glob import glob 6 | import spacy 7 | from spacy import displacy 8 | from nltk import tokenize 9 | from spacy.matcher import PhraseMatcher 10 | import pandas as pd 11 | from bs4 import BeautifulSoup 12 | from tqdm import tqdm 13 | import xml.etree.ElementTree as ET 14 | from docanalysis.ami_sections import AMIAbsSection 15 | from pathlib import Path 16 | from pygetpapers import Pygetpapers 17 | from collections import Counter 18 | import pip 19 | import json 20 | import re 21 | from lxml import etree 22 | from pygetpapers.download_tools import DownloadTools 23 | from urllib.request import urlopen 24 | import nltk 25 | try: 26 | nltk.data.find('tokenizers/punkt') 27 | nltk.data.find('corpora/stopwords') 28 | except LookupError: 29 | nltk.download('punkt') 30 | nltk.download('stopwords') 31 | from nltk import tokenize 32 | 33 | 34 | def install(package): 35 | """ 36 | 37 | :param package: 38 | 39 | """ 40 | if hasattr(pip, 'main'): 41 | pip.main(['install', package]) 42 | else: 43 | pip._internal.main(['install', package]) 44 | 45 | 46 | try: 47 | from abbreviations import schwartz_hearst 48 | except ModuleNotFoundError: 49 | install('abbreviations') 50 | from abbreviations import schwartz_hearst 51 | 52 | 53 | #nlp_phrase = spacy.load("en_core_web_sm") 54 | 55 | CONFIG_SECTIONS = 'https://raw.githubusercontent.com/petermr/docanalysis/main/docanalysis/config/default_sections.json' 56 | CONFIG_AMI_DICT = 'https://raw.githubusercontent.com/petermr/docanalysis/main/docanalysis/config/default_dicts.json' 57 | 58 | 59 | class EntityExtraction: 60 | """EntityExtraction Class""" 61 | 62 | def __init__(self): 63 | logging.basicConfig(level=logging.INFO) 64 | self.sections = self.json_to_dict(CONFIG_SECTIONS) 65 | self.dict_of_ami_dict = self.json_to_dict(CONFIG_AMI_DICT) 66 | self.all_paragraphs = {} 67 | self.sentence_dictionary = {} 68 | self.spacy_model = 'spacy' 69 | self.nlp = None 70 | 71 | def download_spacy(self, spacy_type): 72 | """Download or load spacy 73 | 74 | :param spacy_type: "spacy 75 | :type 
spacy_type: string 76 | 77 | """ 78 | logging.info(f'Loading {spacy_type}') 79 | 80 | if spacy_type == "spacy": 81 | try: 82 | self.nlp = spacy.load('en_core_web_sm') 83 | except OSError: 84 | from spacy.cli import download 85 | download('en_core_web_sm') 86 | self.nlp = spacy.load('en_core_web_sm') 87 | 88 | def dictionary_to_html(self, html_path): 89 | """Converts dictionary to html 90 | 91 | :param html_path: path to save html 92 | :type html_path: string 93 | 94 | """ 95 | list_of_docs = [] 96 | for sentence in self.sentence_dictionary: 97 | list_of_docs.append(self.sentence_dictionary[sentence]['doc']) 98 | html = displacy.render(list_of_docs, style="ent", 99 | page=True, minify=True) 100 | logging.info(f"saving output: {html_path}") 101 | self._write_string_to_file(html, html_path) 102 | 103 | def extract_entities_from_papers(self, corpus_path, terms_xml_path, search_sections, entities, query=None, hits=30, 104 | run_pygetpapers=False, make_section=False, removefalse=True, 105 | csv_name=False, make_ami_dict=False, spacy_model=False, html_path=False, synonyms=False, make_json=False, search_html=False, extract_abb=False): 106 | """logic implementation (Q: how detailed should the description here be?) 107 | 108 | :param corpus_path: 109 | :param terms_xml_path: 110 | :param search_sections: 111 | :param entities: 112 | :param query: (Default value = None) 113 | :param hits: (Default value = 30) 114 | :param run_pygetpapers: (Default value = False) 115 | :param make_section: (Default value = False) 116 | :param removefalse: (Default value = True) 117 | :param csv_name: (Default value = False) 118 | :param make_ami_dict: (Default value = False) 119 | :param spacy_model: (Default value = False) 120 | :param html_path: (Default value = False) 121 | :param synonyms: (Default value = False) 122 | :param make_json: (Default value = False) 123 | :param search_html: (Default value = False) 124 | :param extract_abb: (Default value = False) 125 | 126 | """ 127 | 128 | self.spacy_model = spacy_model 129 | corpus_path = os.path.abspath(corpus_path) 130 | if run_pygetpapers: 131 | if not query: 132 | logging.warning( 133 | "please provide query (like 'terpene', 'essential oils') as parameter") 134 | return 135 | self.run_pygetpapers(query, hits, corpus_path) 136 | if os.path.isdir(corpus_path): 137 | if make_section: 138 | self.run_ami_section(corpus_path) 139 | else: 140 | logging.error("CProject doesn't exist") 141 | return 142 | if search_html: 143 | search_sections = ['HTML', ] 144 | if search_sections == ['ALL', ]: 145 | search_sections = self.sections.keys() 146 | if len(glob(os.path.join(corpus_path, '**', 'sections'))) > 0: 147 | self.all_paragraphs = self.get_glob_for_section( 148 | corpus_path, search_sections) 149 | else: 150 | logging.error('section papers using --make_sections before search') 151 | if spacy_model or csv_name or extract_abb or make_ami_dict: 152 | if search_html: 153 | self.make_dict_with_parsed_document(document_type='html') 154 | else: 155 | self.make_dict_with_parsed_document() 156 | if spacy_model: 157 | self.run_spacy_over_sections(self.sentence_dictionary, entities) 158 | self.remove_statements_not_having_xmldict_terms( 159 | dict_with_parsed_xml=self.sentence_dictionary, searching='entities') 160 | if terms_xml_path: 161 | for i in range(len(terms_xml_path)): 162 | compiled_terms = self.get_terms_from_ami_xml(terms_xml_path[i]) 163 | self.add_if_file_contains_terms( 164 | compiled_terms=compiled_terms, dict_with_parsed_xml=self.sentence_dictionary, 
searching=f'{i}') 165 | if removefalse: 166 | self.remove_statements_not_having_xmldict_terms( 167 | dict_with_parsed_xml=self.sentence_dictionary, searching=f'{i}') 168 | if synonyms: 169 | synonyms_list = self.get_synonyms_from_ami_xml(terms_xml_path) 170 | self.add_if_file_contains_terms( 171 | compiled_terms=synonyms_list, dict_with_parsed_xml=self.sentence_dictionary, searching='has_synonyms') 172 | if removefalse: 173 | self.remove_statements_not_having_xmldict_terms( 174 | dict_with_parsed_xml=self.sentence_dictionary) 175 | if html_path: 176 | self.dictionary_to_html( 177 | os.path.join(corpus_path, html_path)) 178 | if extract_abb: 179 | self.abbreviation_search_using_sw(self.sentence_dictionary) 180 | abb_ami_dict_path = os.path.join(corpus_path, extract_abb) 181 | self.make_ami_dict_from_abbreviation( 182 | extract_abb, self.sentence_dictionary, abb_ami_dict_path) 183 | if removefalse: 184 | self.remove_statements_not_having_xmldict_terms( 185 | dict_with_parsed_xml=self.sentence_dictionary, searching='abb') 186 | 187 | if csv_name: 188 | dict_with_parsed_xml_no_paragrph = self.remove_paragraph_form_parsed_xml_dict( 189 | self.sentence_dictionary, "paragraph") 190 | self.convert_dict_to_csv( 191 | path=os.path.join(corpus_path, f'{csv_name}'), dict_with_parsed_xml=dict_with_parsed_xml_no_paragrph) 192 | if make_json: 193 | dict_with_parsed_xml_no_paragrph = self.remove_paragraph_form_parsed_xml_dict( 194 | self.sentence_dictionary, "paragraph") 195 | self.convert_dict_to_json(path=os.path.join( 196 | corpus_path, f'{make_json}'), dict_with_parsed_xml=dict_with_parsed_xml_no_paragrph) 197 | if make_ami_dict: 198 | ami_dict_path = os.path.join(corpus_path, make_ami_dict) 199 | self.handle_ami_dict_creation( 200 | self.sentence_dictionary, make_ami_dict, ami_dict_path) 201 | 202 | return self.sentence_dictionary 203 | 204 | def run_pygetpapers(self, query, hits, output): 205 | """calls pygetpapers to query EPMC for papers; downloads specified number of papers 206 | 207 | :param query: query to pygetpapers/EPMC 208 | :type query: str 209 | :param hits: number of papers to download 210 | :type hits: int 211 | :param output: name of the folder 212 | :type output: str 213 | 214 | """ 215 | pygetpapers_call = Pygetpapers() 216 | pygetpapers_call.run_command( 217 | query=query, limit=hits, output=output, xml=True) 218 | logging.info(f"making CProject {output} with {hits} papers on {query}") 219 | 220 | def run_ami_section(self, path): 221 | """Creates sections folder for each paper (CTree); sections papers into front, body, back and floats based on JATS 222 | 223 | :param path: CProject path 224 | :type path: string 225 | 226 | """ 227 | file_list = glob(os.path.join( 228 | path, '**', 'fulltext.xml'), recursive=True) 229 | for paper in file_list: 230 | with open(paper, 'r') as xml_file: 231 | xml_string = xml_file.read() 232 | if len(xml_string) > 0: 233 | outdir = Path(Path(paper).parent, "sections") 234 | AMIAbsSection.make_xml_sections(paper, outdir, True) 235 | else: 236 | logging.warning(f"{paper} is empty") 237 | 238 | def get_glob_for_section(self, path, section_names): 239 | """globs for xml files in section folder of each CTree 240 | 241 | :param path: CProject path 242 | :type path: string 243 | :param section_names: one or more keys (section names) from CONFIG_SECTIONS 244 | :type section_names: string 245 | :returns: list of globs 246 | :rtype: list 247 | 248 | """ 249 | for section_name in section_names: 250 | if section_name in self.sections.keys(): 251 | 
self.all_paragraphs[section_name] = [] 252 | for section in self.sections[section_name]: 253 | self.all_paragraphs[section_name] += glob(os.path.join( 254 | path, '**', 'sections', '**', section), recursive=True) 255 | else: 256 | logging.error( 257 | "please make sure that you have selected only the supported sections: ACK, AFF, AUT, CON, DIS, ETH, FIG, INT, KEY, MET, RES, TAB, TIL") 258 | return self.all_paragraphs 259 | 260 | def make_dict_with_parsed_document(self, document_type="xml"): 261 | """creates dictionary with parsed xml or html 262 | 263 | :param document_type: type of file fed: xml or html. Defaults to "xml". 264 | :type document_type: str 265 | :returns: python dict containing parsed text from xml or html 266 | :rtype: dict 267 | 268 | """ 269 | 270 | self.sentence_dictionary = {} 271 | 272 | counter = 1 273 | for section in self.all_paragraphs: 274 | for section_path in tqdm(self.all_paragraphs[section]): 275 | paragraph_path = section_path 276 | if document_type == 'html': 277 | paragraph_text = self.read_text_from_html(paragraph_path) 278 | elif document_type == 'xml': 279 | paragraph_text = self.read_text_from_path(paragraph_path) 280 | sentences = tokenize.sent_tokenize(paragraph_text) 281 | for sentence in sentences: 282 | self.sentence_dictionary[counter] = {} 283 | self._make_dict_attributes( 284 | counter, section, section_path, paragraph_text, sentence) 285 | counter += 1 286 | logging.info( 287 | f"Found {len(self.sentence_dictionary)} sentences in the section(s).") 288 | return self.sentence_dictionary 289 | 290 | def _make_dict_attributes(self, counter, section, section_path, paragraph_text, sentence): 291 | """ 292 | 293 | :param counter: 294 | :param section: 295 | :param section_path: 296 | :param paragraph_text: 297 | :param sentence: 298 | 299 | """ 300 | dict_for_sentences = self.sentence_dictionary[counter] 301 | dict_for_sentences["file_path"] = section_path 302 | dict_for_sentences["paragraph"] = paragraph_text 303 | dict_for_sentences["sentence"] = sentence 304 | dict_for_sentences["section"] = section 305 | 306 | def read_text_from_path(self, paragraph_path): 307 | """uses ElementTree to read text from xml files 308 | 309 | :param paragraph_path: path to xml file 310 | :type paragraph_path: string 311 | :returns: raw text from xml 312 | :rtype: string 313 | 314 | """ 315 | try: 316 | tree = ET.parse(paragraph_path) 317 | root = tree.getroot() 318 | xmlstr = ET.tostring(root, encoding='utf8', method='xml') 319 | soup = BeautifulSoup(xmlstr, features='lxml') 320 | text = soup.get_text(separator=" ") 321 | paragraph_text = text.replace( 322 | '\n', ' ') 323 | except: 324 | paragraph_text = "empty" 325 | logging.error(f"cannot parse {paragraph_path}") 326 | return paragraph_text 327 | 328 | def read_text_from_html(self, paragraph_path): 329 | """uses beautifulsoup to read text from html files 330 | 331 | :param paragraph_path: path to html file 332 | :type paragraph_path: string 333 | :returns: raw text from html 334 | :rtype: string 335 | 336 | """ 337 | with open(paragraph_path, encoding="utf-8") as f: 338 | content = f.read() 339 | soup = BeautifulSoup(content, 'html.parser') 340 | return soup.text.replace('\n', ' ') 341 | 342 | def run_spacy_over_sections(self, dict_with_parsed_xml, entities_names): 343 | """uses spacy to extract specific Named-Entities from sentences in python dict 344 | 345 | :param dict_with_parsed_xml: main dict with sentences 346 | :type dict_with_parsed_xml: dict 347 | :param entities_names: list of kinds of Named-Entities that 
needs to be extacted 348 | :type entities_names: list 349 | 350 | """ 351 | self.download_spacy(self.spacy_model) 352 | for paragraph in tqdm(dict_with_parsed_xml): 353 | if len(dict_with_parsed_xml[paragraph]['sentence']) > 0: 354 | doc = self.nlp(dict_with_parsed_xml[paragraph]['sentence']) 355 | entities, labels, position_end, position_start, abbreviations, abbreviations_longform, abbreviation_start, abbreviation_end = self._make_required_lists() 356 | self._get_entities(entities_names, doc, entities, 357 | labels, position_end, position_start) 358 | self._add_lists_to_dict(dict_with_parsed_xml[paragraph], entities, labels, position_end, 359 | position_start, abbreviations, abbreviations_longform, abbreviation_start, abbreviation_end) 360 | 361 | def _get_entities(self, entities_names, doc, entities, labels, position_end, position_start): 362 | """ 363 | 364 | :param entities_names: 365 | :param doc: 366 | :param entities: 367 | :param labels: 368 | :param position_end: 369 | :param position_start: 370 | 371 | """ 372 | for ent in doc.ents: 373 | if (ent.label_ in entities_names) or (entities_names == ['ALL']): 374 | self._add_parsed_entities_to_lists( 375 | entities, labels, position_end, position_start, ent) 376 | 377 | def abbreviation_search_using_sw(self, dict_with_parsed_xml): 378 | """Extracts abbreviations from sentences using schwartz_hearst. Credit: Ananya Singha 379 | 380 | :param dict_with_parsed_xml: main python dictionary with sentences 381 | :type dict_with_parsed_xml: dict 382 | 383 | """ 384 | for text in dict_with_parsed_xml: 385 | dict_for_sentence = dict_with_parsed_xml[text] 386 | dict_for_sentence["abb"] = [] 387 | pairs = schwartz_hearst.extract_abbreviation_definition_pairs( 388 | doc_text=dict_for_sentence['sentence']) 389 | dict_for_sentence["abb"] = pairs 390 | self._make_list_from_dict(pairs) 391 | 392 | def make_abb_exp_list(self, result_dictionary): 393 | """make lists of abbreviations and expansions to input into xml dictionary creating method 394 | 395 | :param result_dictionary: main dictionary that contains sentences and abbreviation dict (abb and expansion) 396 | :type result_dictionary: dict 397 | :returns: all abbreviations 398 | :rtype: list 399 | 400 | """ 401 | list_of_name_lists = [] 402 | list_of_term_lists = [] 403 | for entry in result_dictionary: 404 | sentence_dictionary = result_dictionary[entry] 405 | if 'abb' in sentence_dictionary: 406 | pairs_dicts = (result_dictionary[entry]['abb']) 407 | name_list_for_every_dict, term_list_for_every_dict = self._make_list_from_dict( 408 | pairs_dicts) 409 | list_of_name_lists.append(name_list_for_every_dict) 410 | list_of_term_lists.append(term_list_for_every_dict) 411 | return self._list_of_lists_to_single_list(list_of_name_lists), self._list_of_lists_to_single_list(list_of_term_lists) 412 | 413 | def _make_list_from_dict(self, pairs): 414 | """ 415 | 416 | :param pairs: 417 | 418 | """ 419 | keys_list = [] 420 | values_list = [] 421 | keys_list.extend(pairs.keys()) 422 | values_list.extend(pairs.values()) 423 | return keys_list, values_list 424 | 425 | def _list_of_lists_to_single_list(self, list_of_lists): 426 | """ 427 | 428 | :param list_of_lists: 429 | 430 | """ 431 | return [item for sublist in list_of_lists for item in sublist] 432 | 433 | def make_ami_dict_from_abbreviation(self, title, result_dictionary, path): 434 | """create xml ami-dict containing abbreviations extracted from sentences 435 | 436 | :param title: title of xml ami-dict 437 | :type title: str 438 | :param result_dictionary: 
main dictionary with sentences and corresponding abbeviations 439 | :type result_dictionary: dict 440 | :param path: path where the xml ami-dict file would lie 441 | :type path: str 442 | 443 | """ 444 | name_list, term_list = self.make_abb_exp_list(result_dictionary) 445 | dictionary_element = etree.Element("dictionary") 446 | dictionary_element.attrib['title'] = title 447 | for name, term in tqdm(zip(name_list, term_list)): 448 | 449 | wiki_lookup_list = self.wiki_lookup(term) 450 | try: 451 | entry_element = etree.SubElement(dictionary_element, "entry") 452 | entry_element.attrib['name'] = name 453 | entry_element.attrib['term'] = term 454 | if len(wiki_lookup_list) == 0: 455 | entry_element.attrib['wikidataID'] = "" 456 | elif len(wiki_lookup_list) == 1: 457 | entry_element.attrib['wikidataID'] = ", ".join(wiki_lookup_list) 458 | else: 459 | raw_element = etree.SubElement(entry_element, 'raw') 460 | raw_element.attrib['wikidataID'] = ", ".join(wiki_lookup_list) 461 | except Exception as e: 462 | logging.error(f"Couldn't add {term} to amidict") 463 | xml_dict = self._etree_to_string(dictionary_element) 464 | self._write_string_to_file(xml_dict, f'{path}.xml') 465 | logging.info(f'wrote all abbreviations to ami dict {path}.xml') 466 | 467 | def _etree_to_string(self, dictionary_element): 468 | """ 469 | 470 | :param dictionary_element: 471 | 472 | """ 473 | xml_dict = etree.tostring( 474 | dictionary_element, pretty_print=True).decode('utf-8') 475 | return xml_dict 476 | 477 | def _get_abbreviations(self, doc, abbreviations, abbreviations_longform, abbreviation_start, abbreviation_end): 478 | """ 479 | 480 | :param doc: 481 | :param abbreviations: 482 | :param abbreviations_longform: 483 | :param abbreviation_start: 484 | :param abbreviation_end: 485 | 486 | """ 487 | for abrv in doc._.abbreviations: 488 | abbreviations.append(abrv) 489 | abbreviations_longform.append(abrv._.long_form) 490 | abbreviation_start.append(abrv.start) 491 | abbreviation_end.append(abrv.end) 492 | 493 | def add_if_file_contains_terms(self, compiled_terms, dict_with_parsed_xml, searching='has_terms'): 494 | """populate the main dictionary with term matches, its frequency and span 495 | 496 | :param compiled_terms: list of compiled ami-dict terms 497 | :type compiled_terms: list 498 | :param dict_with_parsed_xml: dictionary containing sentences 499 | :type dict_with_parsed_xml: dict 500 | :param searching: dict key name. Defaults to 'has_terms'. 
501 | :type searching: str 502 | 503 | """ 504 | for statement in tqdm(dict_with_parsed_xml): 505 | dict_for_sentence = dict_with_parsed_xml[statement] 506 | dict_for_sentence[f'{searching}'] = [] 507 | dict_for_sentence[f'{searching}_span'] = [] 508 | term_list, span_list, frequency = self.search_sentence_with_compiled_terms( 509 | compiled_terms, dict_for_sentence['sentence']) 510 | if term_list: 511 | dict_for_sentence[f'{searching}'].append(term_list) 512 | dict_for_sentence[f'weight_{searching}'] = frequency 513 | dict_for_sentence[f'{searching}_span'].append(span_list) 514 | 515 | def search_sentence_with_compiled_terms(self, compiled_terms, sentence): 516 | """search sentences using the compiled ami-dict entry 517 | 518 | :param compiled_terms: list of compiled ami-dict terms 519 | :type compiled_terms: list 520 | :param sentence: sentence to search using compiled terms 521 | :type sentence: string 522 | :returns: list of terms that was found after searching sentence 523 | :rtype: list 524 | 525 | """ 526 | # https://stackoverflow.com/questions/47681756/match-exact-phrase-within-a-string-in-python 527 | match_list = [] 528 | span_list = [] 529 | frequency = 0 530 | for compiled_term in compiled_terms: 531 | term_match = compiled_term.search(sentence, re.IGNORECASE) 532 | if term_match is not None: 533 | match_list.append(term_match.group()) 534 | span_list.append(term_match.span()) 535 | frequency = len(match_list) 536 | return match_list, span_list, frequency 537 | 538 | def get_terms_from_ami_xml(self, xml_path): 539 | """parses ami-dict (xml) and reads the entry terms; ami-dict can either be the default ones (user specifies python dict key) or customized ones (user specifies full path to it) 540 | 541 | :param xml_path: either keys from dict_of_ami_dict or full path to ami-dict 542 | :type xml_path: string 543 | :returns: list of regex compiled entry terms from ami-dict 544 | :rtype: list 545 | 546 | """ 547 | if xml_path in self.dict_of_ami_dict.keys(): 548 | logging.info(f"getting terms from {xml_path}") 549 | tree = ET.parse(urlopen(self.dict_of_ami_dict[xml_path])) 550 | root = tree.getroot() 551 | elif xml_path not in self.dict_of_ami_dict.keys(): 552 | tree = ET.parse(xml_path) 553 | root = tree.getroot() 554 | logging.info(f"getting terms from {xml_path}") 555 | else: 556 | logging.error(f'{xml_path} is not a supported dictionary. 
Choose from: EO_ACTIVITY, EO_COMPOUND, EO_EXTRACTION, EO_PLANT, EO_PLANT_PART, PLANT_GENUS,EO_TARGET, COUNTRY, DISEASE, DRUG, ORGANIZATION ') 557 | 558 | compiled_terms = self._compiled_regex(root.iter('entry')) 559 | return (set(compiled_terms)) 560 | 561 | def _compiled_regex(self, iterate_over): 562 | """ 563 | 564 | :param iterate_over: 565 | 566 | """ 567 | compiled_terms = [] 568 | for para in iterate_over: 569 | try: 570 | term = (para.attrib["term"]) 571 | except KeyError: 572 | term = para.text 573 | try: 574 | compiled_term = self._regex_compile(term) 575 | except re.error: 576 | logging.warning(f'cannot use term {term}') 577 | compiled_terms.append(compiled_term) 578 | return compiled_terms 579 | 580 | def _regex_compile(self, term): 581 | """ 582 | 583 | :param term: 584 | 585 | """ 586 | return re.compile(r'\b{}\b'.format(term)) 587 | 588 | def get_synonyms_from_ami_xml(self, xml_path): 589 | """parses ami-dict (xml) and reads the entry's synonyms; ami-dict can either be the default ones (user specifies python dict key) or customized ones (user specifies full path to it) 590 | 591 | :param xml_path: either keys from dict_of_ami_dict or full path to ami-dict 592 | :type xml_path: string 593 | :returns: list of regex compiled entry's synonyms from ami-dict 594 | :rtype: list 595 | 596 | """ 597 | if xml_path in self.dict_of_ami_dict.keys(): 598 | logging.info(f"getting synonyms from {xml_path}") 599 | tree = ET.parse(urlopen(self.dict_of_ami_dict[xml_path])) 600 | root = tree.getroot() 601 | elif xml_path not in self.dict_of_ami_dict.keys(): 602 | logging.info(f"getting synonyms from {xml_path}") 603 | tree = ET.parse(xml_path) 604 | root = tree.getroot() 605 | else: 606 | logging.error(f'{xml_path} is not a supported dictionary. Choose from: EO_ACTIVITY, EO_COMPOUND, EO_EXTRACTION, EO_PLANT, EO_PLANT_PART, PLANT_GENUS,EO_TARGET, COUNTRY, DISEASE, DRUG, ORGANIZATION ') 607 | synonyms = self._compiled_regex(root.findall("./entry/synonym")) 608 | return synonyms 609 | 610 | def _make_required_lists(self): 611 | """ """ 612 | abbreviations = [] 613 | abbreviations_longform = [] 614 | abbreviation_start = [] 615 | abbreviation_end = [] 616 | entities = [] 617 | labels = [] 618 | position_start = [] 619 | position_end = [] 620 | return entities, labels, position_end, position_start, abbreviations, abbreviations_longform, abbreviation_start, abbreviation_end 621 | 622 | def _add_lists_to_dict(self, dict_for_sentence, entities, labels, position_end, 623 | position_start, abbreviations, abbreviations_longform, abbreviation_start, abbreviation_end): 624 | """ 625 | 626 | :param dict_for_sentence: 627 | :param entities: 628 | :param labels: 629 | :param position_end: 630 | :param position_start: 631 | :param abbreviations: 632 | :param abbreviations_longform: 633 | :param abbreviation_start: 634 | :param abbreviation_end: 635 | 636 | """ 637 | 638 | dict_for_sentence['entities'] = entities 639 | dict_for_sentence['labels'] = labels 640 | dict_for_sentence['position_start'] = position_start 641 | dict_for_sentence['position_end'] = position_end 642 | dict_for_sentence['abbreviations'] = abbreviations 643 | dict_for_sentence['abbreviations_longform'] = abbreviations_longform 644 | dict_for_sentence['abbreviation_start'] = abbreviation_start 645 | dict_for_sentence['abbreviation_end'] = abbreviation_end 646 | 647 | def _add_parsed_entities_to_lists(self, entities, labels, position_end, position_start, ent=None): 648 | """ 649 | 650 | :param entities: 651 | :param labels: 652 | :param 
position_end: 653 | :param position_start: 654 | :param ent: (Default value = None) 655 | 656 | """ 657 | entities.append(ent.text) 658 | labels.append(ent.label_) 659 | position_start.append(ent.start_char) 660 | position_end.append(ent.end_char) 661 | 662 | def convert_dict_to_csv(self, path, dict_with_parsed_xml): 663 | """Turns python dictionary into CSV using pandas 664 | 665 | :param path: CSV file to write output 666 | :type path: string 667 | :param dict_with_parsed_xml: python dictionary that needs to be converted to csv 668 | :type dict_with_parsed_xml: dict 669 | 670 | """ 671 | df = pd.DataFrame(dict_with_parsed_xml) 672 | df = df.T 673 | for col in df: 674 | try: 675 | df[col] = df[col].astype(str).str.replace( 676 | "[", "").str.replace("]", "") 677 | df[col] = df[col].astype(str).str.replace( 678 | "'", "").str.replace("'", "") 679 | except: 680 | pass 681 | df.to_csv(path, encoding='utf-8', line_terminator='\r\n') 682 | logging.info(f"wrote output to {path}") 683 | 684 | def remove_paragraph_form_parsed_xml_dict(self, dict_with_parsed_xml, key_to_remove): 685 | """pops out the specified key value pairs from python dictionaries 686 | 687 | :param dict_with_parsed_xml: python dict from which a key-value pair needs to be removed 688 | :type dict_with_parsed_xml: dict 689 | :param key_to_remove: key of the pair that needs to be removed 690 | :type key_to_remove: string 691 | :returns: python dict with the specified key-value pair removed 692 | :rtype: dict 693 | 694 | """ 695 | for entry in dict_with_parsed_xml: 696 | dict_with_parsed_xml[entry].pop(key_to_remove, None) 697 | return dict_with_parsed_xml 698 | 699 | def convert_dict_to_json(self, path, dict_with_parsed_xml): 700 | """writes python dictionary to json file 701 | 702 | :param path: json file path to write to 703 | :type path: str 704 | :param dict_with_parsed_xml: main dictionary with sentences, search hits, entities, etc. 705 | :type dict_with_parsed_xml: dict 706 | 707 | """ 708 | with open(path, mode='w', encoding='utf-8') as f: 709 | json.dump(dict_with_parsed_xml, f, indent=4) 710 | logging.info(f"wrote JSON output to {path}") 711 | 712 | def remove_statements_not_having_xmldict_terms(self, dict_with_parsed_xml, searching='has_terms'): 713 | """removes key-value pairs from the main python dict that do not have any match hits 714 | 715 | :param dict_with_parsed_xml: python dictionary from which the specific key-value pairs needs to be removed 716 | :type dict_with_parsed_xml: dict 717 | :param searching: the key to the pair in the nested-dict that needs to be removed (Default value = 'has_terms') 718 | :type searching: str 719 | 720 | """ 721 | statement_to_pop = [] 722 | for statement in dict_with_parsed_xml: 723 | sentect_dict = dict_with_parsed_xml[statement] 724 | if len(sentect_dict[searching]) == 0: 725 | statement_to_pop.append(statement) 726 | 727 | for term in statement_to_pop: 728 | dict_with_parsed_xml.pop(term) 729 | 730 | def make_ami_dict_from_list(self, list_of_terms_with_count, title): 731 | """makes ami-dict from a python dictionary containing terms and frequencies. 
732 | 733 | :param list_of_terms_with_count: python dictionary containing terms and their frequency of occurence 734 | :type list_of_terms_with_count: dict 735 | :param title: title of the xml ami-dict as well as the name of the XML file 736 | :type title: string 737 | :returns: xml ami-dict 738 | :rtype: file 739 | 740 | """ 741 | dictionary_element = etree.Element("dictionary") 742 | dictionary_element.attrib['title'] = title 743 | for term in list_of_terms_with_count: 744 | try: 745 | entry_element = etree.SubElement(dictionary_element, "entry") 746 | entry_element.attrib['term'] = term[0] 747 | entry_element.attrib['count'] = str(term[1]) 748 | except Exception as e: 749 | logging.error(f"Couldn't add {term} to amidict") 750 | return self._etree_to_string(dictionary_element) 751 | 752 | def _write_string_to_file(self, string_to_put, title): 753 | """ 754 | 755 | :param string_to_put: 756 | :param title: 757 | 758 | """ 759 | with open(title, mode='w', encoding='utf-8') as f: 760 | f.write(string_to_put) 761 | 762 | def handle_ami_dict_creation(self, result_dictionary, title, path): 763 | """creates and writes ami dictionary with entities extracted and their frequency. 764 | 765 | :param result_dictionary: main python dictionary with sentences, entities, etc. 766 | :type result_dictionary: dict 767 | :param title: title of ami-dictionary (xml file) 768 | :type title: str 769 | :param path: file path 770 | :type path: str 771 | 772 | """ 773 | list_of_entities = [] 774 | for entry in result_dictionary: 775 | if 'entities' in result_dictionary[entry]: 776 | entity = result_dictionary[entry]['entities'] 777 | list_of_entities.extend(entity) 778 | dict_of_entities_with_count = Counter(list_of_entities) 779 | list_of_terms_with_count = dict_of_entities_with_count.most_common() 780 | xml_dict = self.make_ami_dict_from_list( 781 | list_of_terms_with_count, title) 782 | self._write_string_to_file(xml_dict, f'{path}.xml') 783 | logging.info(f"Wrote all the entities extracted to {path}.xml") 784 | 785 | def json_to_dict(self, json_file_link): 786 | """loads json file as python dictionary 787 | 788 | :param json_file_link: link to json file on the web 789 | :type json_file_link: str 790 | :returns: python dictionary from json 791 | :rtype: dictionary 792 | 793 | """ 794 | path = urlopen(json_file_link) 795 | json_dict = json.load(path) 796 | return (json_dict) 797 | 798 | def wiki_lookup(self, query): 799 | """Queries Wikidata API for Wikidata Item IDs for terms in ami-dict 800 | 801 | :param query: term to query wikdiata for ID 802 | :type query: string 803 | :returns: potential Wikidata Item URLs 804 | :rtype: list 805 | 806 | """ 807 | params = { 808 | "action": "wbsearchentities", 809 | "search": query, 810 | "language": "en", 811 | "format": "json" 812 | } 813 | data = requests.get( 814 | "https://www.wikidata.org/w/api.php", params=params) 815 | result = data.json() 816 | hit_list = [] 817 | for hit in result['search']: 818 | try: 819 | if "scientific article" not in hit["description"]: 820 | hit_list.append(hit["id"]) 821 | except: 822 | hit_list.append(hit["id"]) 823 | return hit_list 824 | 825 | 826 | # take out the constants 827 | # look through download_tools (pygetpapers) and see if we have overlapping functionality. 
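# --- Illustrative usage sketch (added here for clarity; not part of the original module) ---
# A minimal, hypothetical example of driving EntityExtraction directly, mirroring the
# keyword arguments that docanalysis.py passes in from the CLI. The CProject path
# "corpus/oil186", the dictionary key "EO_PLANT", the section/entity choices and the
# CSV name are assumptions for illustration, not values taken from this repository.
if __name__ == "__main__":
    extractor = EntityExtraction()                 # class defined above in this module
    extractor.extract_entities_from_papers(
        corpus_path="corpus/oil186",               # existing CProject: one fulltext.xml per CTree
        terms_xml_path=["EO_PLANT"],               # default ami-dict key, or a path/URL to a custom ami-dict
        search_sections=["MET", "RES"],            # section keys defined in CONFIG_SECTIONS
        entities=["GPE", "ORG"],                   # spaCy labels to keep when spacy_model is set
        make_section=True,                         # section the papers before searching
        spacy_model="spacy",                       # loads en_core_web_sm via download_spacy()
        csv_name="entities.csv",                   # written inside corpus_path
    )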
828 | # functionality_from_(where you are getting a data) 829 | 830 | 831 | # Future goals 832 | # make tests automated 833 | # readthedocs 834 | # tutorials 835 | # repository management 836 | -------------------------------------------------------------------------------- /docanalysis/file_lib.py: -------------------------------------------------------------------------------- 1 | """classes and methods to support path operations 2 | 3 | """ 4 | import json 5 | import copy 6 | import glob 7 | import re 8 | import os 9 | import shutil 10 | from pathlib import Path, PurePath 11 | import logging 12 | from glob import glob 13 | from braceexpand import braceexpand 14 | 15 | logging.debug("loading file_lib") 16 | 17 | py4ami = "py4ami" 18 | RESOURCES = "resources" 19 | 20 | # section keys 21 | _DESC = "_DESC" 22 | PROJ = "PROJ" 23 | TREE = "TREE" 24 | SECTS = "SECTS" 25 | SUBSECT = "SUBSECT" 26 | SUBSUB = "SUBSUB" 27 | FILE = "FILE" 28 | SUFFIX = "SUFFIX" 29 | 30 | ALLOWED_SECTS = {_DESC, PROJ, TREE, SECTS, SUBSECT, SUBSUB, FILE, SUFFIX} 31 | 32 | # wildcards 33 | STARS = "**" 34 | STAR = "*" 35 | 36 | # suffixes 37 | S_PDF = "pdf" 38 | S_PNG = "png" 39 | S_SVG = "svg" 40 | S_TXT = "txt" 41 | S_XML = "xml" 42 | 43 | # markers for processing 44 | _NULL = "_NULL" 45 | _REQD = "_REQD" 46 | 47 | # known section names 48 | SVG = "svg" 49 | PDFIMAGES = "pdfimages" 50 | RESULTS = "results" 51 | SECTIONS = "sections" 52 | 53 | # subsects 54 | IMAGE_STAR = "image*" 55 | 56 | # subsects 57 | OCTREE = "*octree" 58 | 59 | # results 60 | SEARCH = "search" 61 | WORD = "word" 62 | EMPTY = "empty" 63 | 64 | # files 65 | FULLTEXT_PAGE = "fulltext-page*" 66 | CHANNEL_STAR = "channel*" 67 | RAW = "raw" 68 | 69 | 70 | class Globber: 71 | """utilities for globbing - may be obsolete""" 72 | 73 | def __init__(self, ami_path, recurse=True, cwd=None) -> None: 74 | self.ami_path = ami_path 75 | self.recurse = recurse 76 | self.cwd = os.getcwd() if cwd is None else cwd 77 | 78 | def get_globbed_files(self) -> list: 79 | """uses the glob_string_list in ami_path to create a path list""" 80 | files = [] 81 | if self.ami_path: 82 | glob_list = self.ami_path.get_glob_string_list() 83 | for gl_str in glob_list: 84 | files += glob.glob(gl_str, recursive=self.recurse) 85 | return files 86 | 87 | 88 | class AmiPath: 89 | """holds a (keyed) scheme for generating lists of path globs 90 | The scheme has several segments which can be set to create a glob expr. 
91 | 92 | 93 | """ 94 | # keys for path scheme templates 95 | T_FIGURES = "fig_captions" 96 | T_OCTREE = "octree" 97 | T_PDFIMAGES = "pdfimages" 98 | T_RESULTS = "results" 99 | T_SECTIONS = "sections" 100 | T_SVG = "svg" 101 | 102 | logger = logging.getLogger("ami_path") 103 | # dict 104 | 105 | def __init__(self, scheme=None): 106 | self.scheme = scheme 107 | 108 | def print_scheme(self): 109 | """for debugging and enlightenment""" 110 | if self.scheme is not None: 111 | for key in self.scheme: 112 | print("key ", key, "=", self.scheme[key]) 113 | print("") 114 | 115 | @classmethod 116 | def create_ami_path_from_templates(cls, key, edit_dict=None): 117 | """creates a new AmiPath object from selected template 118 | key: to template 119 | edit_dict: dictionary with values to edit in 120 | 121 | :param key: 122 | :param edit_dict: (Default value = None) 123 | 124 | """ 125 | key = key.lower() 126 | if key is None or key not in TEMPLATES: 127 | cls.logger.error(f"cannot find key {key}") 128 | cls.logger.error("no scheme for: ", key, 129 | "expected", TEMPLATES.keys()) 130 | ami_path = AmiPath() 131 | # start with default template values 132 | ami_path.scheme = copy.deepcopy(TEMPLATES[key]) 133 | if edit_dict: 134 | ami_path.edit_scheme(edit_dict) 135 | return ami_path 136 | 137 | def edit_scheme(self, edit_dict): 138 | """edits values in self.scheme using edit_dict 139 | 140 | :param edit_dict: 141 | 142 | """ 143 | for k, v in edit_dict.items(): 144 | self.scheme[k] = v 145 | 146 | def permute_sets(self): 147 | """ """ 148 | self.scheme_list = [] 149 | self.scheme_list.append(self.scheme) 150 | # if scheme has sets, expand them 151 | change = True 152 | while change: 153 | change = self.expand_set_lists() 154 | 155 | def expand_set_lists(self): 156 | """expands the sets in a scheme 157 | note: sets are held as lists in JSON 158 | 159 | a scheme with 2 sets of size n and m is 160 | expanded to n*m schemes covering all permutations 161 | of the set values 162 | 163 | self.scheme_list contains all the schemes 164 | 165 | returns True if any sets are expanded 166 | 167 | 168 | """ 169 | change = False 170 | for scheme in self.scheme_list: 171 | for sect, value in scheme.items(): 172 | if type(value) == list: 173 | change = True 174 | # delete scheme with set, replace by copies 175 | self.scheme_list.remove(scheme) 176 | for set_value in value: 177 | scheme_copy = copy.deepcopy(scheme) 178 | self.scheme_list.append(scheme_copy) 179 | scheme_copy[sect] = set_value # poke in set value 180 | break # after each set processed 181 | 182 | return change 183 | 184 | def get_glob_string_list(self): 185 | """expand sets in AmiPath 186 | creates m*n... 
glob strings for sets with len n and m 187 | 188 | 189 | """ 190 | self.permute_sets() 191 | self.glob_string_list = [] 192 | for scheme in self.scheme_list: 193 | glob_string = AmiPath.create_glob_string(scheme) 194 | self.glob_string_list.append(glob_string) 195 | return self.glob_string_list 196 | 197 | @classmethod 198 | def create_glob_string(cls, scheme): 199 | """ 200 | 201 | :param scheme: 202 | 203 | """ 204 | globx = "" 205 | for sect, value in scheme.items(): 206 | cls.logger.debug(sect, type(value), value) 207 | if sect not in ALLOWED_SECTS: 208 | cls.logger.error(f"unknown sect: {sect}") 209 | elif _DESC == sect: 210 | pass 211 | elif _REQD == value: 212 | cls.logger.error("must set ", sect) 213 | globx += _REQD + "/" 214 | elif _NULL == value: 215 | pass 216 | elif FILE == sect: 217 | globx += AmiPath.convert_to_glob(value) 218 | elif STAR in value: 219 | globx += AmiPath.convert_to_glob(value) + "/" 220 | elif SUFFIX == sect: 221 | globx += "." + AmiPath.convert_to_glob(value) 222 | else: 223 | globx += AmiPath.convert_to_glob(value) + "/" 224 | cls.logger.debug("glob", scheme, "=>", globx) 225 | return globx 226 | 227 | @classmethod 228 | def convert_to_glob(cls, value): 229 | """ 230 | 231 | :param value: 232 | 233 | """ 234 | valuex = value 235 | if type(value) == list: 236 | # tacky. string quotes and add commas and parens 237 | valuex = "(" 238 | for v in value: 239 | valuex += v + "," 240 | valuex = valuex[:-1] + ")" 241 | return valuex 242 | 243 | def get_globbed_files(self): 244 | """ """ 245 | files = Globber(self).get_globbed_files() 246 | self.logger.debug("files", len(files)) 247 | return files 248 | 249 | 250 | class BraceGlobber: 251 | """ """ 252 | 253 | def braced_glob(self, path, recursive=False): 254 | """ 255 | 256 | :param path: 257 | :param recursive: (Default value = False) 258 | 259 | """ 260 | ll = [glob(x, recursive=recursive) for x in braceexpand(path)] 261 | return ll 262 | 263 | 264 | class FileLib: 265 | """ """ 266 | 267 | logger = logging.getLogger("file_lib") 268 | 269 | @classmethod 270 | def force_mkdir(cls, dirx): 271 | """ensure dirx exists 272 | 273 | :dirx: directory 274 | 275 | :param dirx: 276 | 277 | """ 278 | if not os.path.exists(dirx): 279 | try: 280 | os.mkdir(dirx) 281 | except Exception as e: 282 | cls.logger.error(f"cannot make dirx {dirx} , {e}") 283 | 284 | @classmethod 285 | def force_mkparent(cls, file): 286 | """ensure parent directory exists 287 | 288 | :path: whose parent directory is to be created if absent 289 | 290 | :param file: 291 | 292 | """ 293 | if file is not None: 294 | cls.force_mkdir(cls.get_parent_dir(file)) 295 | 296 | @classmethod 297 | def force_write(cls, file, data, overwrite=True): 298 | """:write path, creating dirtectory if necessary 299 | :path: path to write to 300 | :data: str data to write 301 | :overwrite: force write iuf path exists 302 | 303 | may throw exception from write 304 | 305 | :param file: 306 | :param data: 307 | :param overwrite: (Default value = True) 308 | 309 | """ 310 | if file is not None: 311 | if os.path.exists(file) and not overwrite: 312 | logging.warning(f"not overwriting existsnt path {file}") 313 | else: 314 | cls.force_mkparent(file) 315 | with open(file, "w", encoding="utf-8") as f: 316 | f.write(data) 317 | 318 | @classmethod 319 | def copy_file_or_directory(cls, dest_path, src_path, overwrite): 320 | """ 321 | 322 | :param dest_path: 323 | :param src_path: 324 | :param overwrite: 325 | 326 | """ 327 | if dest_path.exists(): 328 | if not overwrite: 329 | file_type = 
"dirx" if dest_path.is_dir() else "path" 330 | raise TypeError( 331 | str(dest_path), f"cannot overwrite existing {file_type} (str({dest_path})") 332 | 333 | else: 334 | # assume directory 335 | cls.logger.warning(f"create directory {dest_path}") 336 | dest_path.mkdir(parents=True, exist_ok=True) 337 | cls.logger.info(f"created directory {dest_path}") 338 | if src_path.is_dir(): 339 | if os.path.exists(dest_path): 340 | shutil.rmtree(dest_path) 341 | shutil.copytree(src_path, dest_path) 342 | cls.logger.info(f"copied directory {src_path} to {dest_path}") 343 | else: 344 | try: 345 | shutil.copy(src_path, dest_path) # will overwrite 346 | cls.logger.info(f"copied path {src_path} to {dest_path}") 347 | except Exception as e: 348 | cls.logger.fatal(f"Cannot copy direcctory {src_path} to {dest_path} because {e}") 349 | 350 | @staticmethod 351 | def create_absolute_name(file): 352 | """create absolute/relative name for a path relative to py4ami 353 | 354 | TODO this is messy 355 | 356 | :param file: 357 | 358 | """ 359 | absolute_file = None 360 | if file is not None: 361 | file_dir = FileLib.get_parent_dir(__file__) 362 | absolute_file = os.path.join(os.path.join(file_dir, file)) 363 | return absolute_file 364 | 365 | @classmethod 366 | def get_py4ami(cls): 367 | """gets paymi_m pathname""" 368 | return Path(__file__).parent.resolve() 369 | 370 | @classmethod 371 | def get_pyami_root(cls): 372 | """gets paymi root pathname""" 373 | return Path(__file__).parent.parent.resolve() 374 | 375 | @classmethod 376 | def get_pyami_resources(cls): 377 | """gets paymi root pathname""" 378 | return Path(cls.get_py4ami(), RESOURCES) 379 | 380 | @classmethod 381 | def get_parent_dir(cls, file): 382 | """ 383 | 384 | :param file: 385 | 386 | """ 387 | return None if file is None else PurePath(file).parent 388 | 389 | @classmethod 390 | def read_pydictionary(cls, file): 391 | """read a json path into a python dictiomary 392 | 393 | :param file: 394 | 395 | """ 396 | import ast 397 | with open(file, "r") as f: 398 | pydict = ast.literal_eval(f.read()) 399 | return pydict 400 | 401 | @classmethod 402 | def punct2underscore(cls, text): 403 | """replace all ASCII punctuation except '.' , '-', '_' by '_' 404 | 405 | for filenames 406 | 407 | :param text: 408 | 409 | """ 410 | # from py4ami.text_lib import TextUtil 411 | # this is non-trivial https://stackoverflow.com/questions/10017147/removing-a-list-of-characters-in-string 412 | 413 | non_file_punct = '\t \n{}!@#$%^&*()[]:;\'",|\\~+=/`' 414 | # [unicode(x.strip()) if x is not None else '' for x in row] 415 | 416 | #text0 = TextUtil.replace_chars(text, non_file_punct, "_") 417 | text0 = ''.join([c if c not in non_file_punct else "_" for c in text]) 418 | return text0 419 | 420 | @classmethod 421 | def get_suffix(cls, file): 422 | """get suffix 423 | INCLUDES the "." 
424 | 425 | :param file: 426 | 427 | """ 428 | _suffix = None if file is None else Path(file).suffix 429 | return _suffix 430 | 431 | 432 | # see https://realpython.com/python-pathlib/ 433 | 434 | def main(): 435 | """ """ 436 | print("started file_lib") 437 | # test_templates() 438 | 439 | print("finished file_lib") 440 | 441 | 442 | if __name__ == "__main__": 443 | print("running file_lib main") 444 | main() 445 | else: 446 | # print("running file_lib main anyway") 447 | # main() 448 | pass 449 | 450 | # examples of regex for filenames 451 | 452 | 453 | def glob_re(pattern, strings): 454 | """ 455 | 456 | :param pattern: 457 | :param strings: 458 | 459 | """ 460 | return filter(re.compile(pattern).match, strings) 461 | 462 | 463 | filenames = glob_re(r'.*(abc|123|a1b).*\.txt', os.listdir()) 464 | 465 | # Credits: Peter Murray-Rust, py4ami (https://github.com/petermr/pyami/blob/main/py4ami/file_lib.py) -------------------------------------------------------------------------------- /docanalysis/get_html.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from glob import glob 3 | import os 4 | from abbreviations import schwartz_hearst 5 | from lxml import etree 6 | import yake 7 | 8 | def read_text_from_html(paragraph_path): 9 | """ 10 | 11 | :param paragraph_path: 12 | 13 | """ 14 | with open(paragraph_path, 'r') as f: 15 | html = f.read() 16 | soup = BeautifulSoup(html, features="html.parser") 17 | 18 | # kill all script and style elements 19 | for script in soup(["script", "style"]): 20 | script.extract() # rip it out 21 | 22 | # get text 23 | text = soup.get_text() 24 | 25 | # break into lines and remove leading and trailing space on each 26 | #lines = (line.strip() for line in text.splitlines()) 27 | # break multi-headlines into a line each 28 | #chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) 29 | # drop blank lines 30 | #text_write = '\n'.join(chunk for chunk in chunks if chunk) 31 | #text = '\n'.join(chunk for chunk in chunks if chunk) 32 | return text 33 | 34 | def get_glob(corpus_path): 35 | """ 36 | 37 | :param corpus_path: 38 | 39 | """ 40 | paragraph_path = glob(os.path.join(corpus_path, '**', 'sections', '**', "*html"), recursive=True) 41 | return paragraph_path 42 | 43 | def abbreviation_search_using_sw(paragraph_text): 44 | """ 45 | 46 | :param paragraph_text: 47 | 48 | """ 49 | pairs = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=paragraph_text) 50 | keys = pairs.keys() 51 | values = pairs.values() 52 | return keys, values 53 | 54 | def make_ami_dict_from_list(title, keys, values): 55 | """ 56 | 57 | :param title: 58 | :param keys: 59 | :param values: 60 | 61 | """ 62 | dictionary_element= etree.Element("dictionary") 63 | dictionary_element.attrib['title']= title 64 | for term, expansion in zip(keys, values): 65 | entry_element=etree.SubElement(dictionary_element,"entry") 66 | entry_element.attrib['term']=term 67 | entry_element.attrib['exapansion']=expansion 68 | return etree.tostring(dictionary_element, pretty_print=True).decode('utf-8') 69 | 70 | def write_string_to_file(string_to_put,title): 71 | """ 72 | 73 | :param string_to_put: 74 | :param title: 75 | 76 | """ 77 | with open(title,mode='w', encoding='utf-8') as f: 78 | f.write(string_to_put) 79 | print(f"wrote dict to {title}") 80 | 81 | def extract_keyphrase(paragraph_text): 82 | """ 83 | 84 | :param paragraph_text: 85 | 86 | """ 87 | custom_kw_extractor = yake.KeywordExtractor(lan='en', n=5, top=10, 
features=None) 88 | keywords = custom_kw_extractor.extract_keywords(paragraph_text) 89 | keywords_list = [] 90 | for kw in keywords: 91 | keywords_list.append(kw[0]) 92 | print(keywords_list) 93 | 94 | def does_everything(corpus_path): 95 | """ 96 | 97 | :param corpus_path: 98 | 99 | """ 100 | all_text = [] 101 | all_keys = [] 102 | all_values = [] 103 | all_paragraph_paths = get_glob(corpus_path) 104 | for paragraph_path in all_paragraph_paths: 105 | paragraph_text = read_text_from_html(paragraph_path) 106 | #print(paragraph_text) 107 | all_text.append(paragraph_text) 108 | keys, values = abbreviation_search_using_sw(paragraph_text) 109 | all_keys.extend(keys) 110 | all_values.extend(values) 111 | print(len(all_keys), all_values) 112 | #all_text_string = joinStrings(all_text) 113 | #print(all_text_string) 114 | #extract_keyphrase(all_text_string) 115 | #dict_string = make_ami_dict_from_list("abb", all_keys, all_values) 116 | #return dict_string 117 | 118 | 119 | def joinStrings(stringList): 120 | """ 121 | 122 | :param stringList: 123 | 124 | """ 125 | return ''.join(string for string in stringList) 126 | 127 | path = os.path.join(os.path.expanduser('~'), "ipcc_sectioned") 128 | does_everything(path) 129 | #write_string_to_file( dict_string, "abb.xml") 130 | 131 | import json 132 | from urllib.request import urlopen 133 | 134 | #PATH = urlopen() 135 | #json_dict = json.load(PATH) 136 | #print(json_dict) 137 | 138 | -------------------------------------------------------------------------------- /docanalysis/glob_trail.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import glob 3 | from pprint import pprint 4 | 5 | # define constants 6 | ABS = ['*abstract.xml'] 7 | ACK = ['*ack.xml'] 8 | AFF = ['*aff.xml'] 9 | AUT = ['*contrib-group.xml'] 10 | CON = ['*conclusion*/*.xml'] 11 | DIS = ['*discussion*/**/*_title.xml', '*discussion*/**/*_p.xml'] # might bring unwanted sections like tables, fig. captions etc. Maybe get only title and paragraphs? 12 | ETH = ['*ethic*/*.xml'] 13 | FIG = ['*fig*.xml'] 14 | INT = ['*introduction*/*.xml', '*background*/*.xml'] 15 | KEY = ['*kwd-group.xml'] 16 | MET = ['*method*/*.xml', '*material*/*.xml'] # also gets us supplementary material. Not sure how to exclude them 17 | RES = ['*result*/*/*_title.xml', '*result*/*/*_p.xml'] # not sure if we should use recursive globbing or not. 
18 | TAB = ['*table*.xml'] 19 | TIL = ['*article-meta/*title-group.xml'] 20 | 21 | # glob 22 | path = os.getcwd() 23 | cproj = 'corpus/asp_nat_products' 24 | LIST_SEC = [TIL, KEY] 25 | for SEC in LIST_SEC: 26 | for opt in SEC: 27 | glob_list=glob(os.path.join(path, cproj, '**', 'sections', '**', f'{opt}'), recursive=True) 28 | pprint(glob_list) 29 | 30 | # Section list comes from: https://github.com/petermr/pyami/blob/main/py4ami/resources/section_templates.json -------------------------------------------------------------------------------- /docanalysis/gui.py: -------------------------------------------------------------------------------- 1 | import eel 2 | from pygetpapers import Pygetpapers 3 | import os 4 | 5 | eel.init(f'{os.path.dirname(os.path.realpath(__file__))}/gui') 6 | 7 | 8 | @eel.expose 9 | def create_corpus(path, query, number): 10 | pygetpapers_call = Pygetpapers() 11 | pygetpapers_call.run_command( 12 | query=query, limit=number, output=path, xml=True) 13 | 14 | 15 | eel.start('main.html') 16 | -------------------------------------------------------------------------------- /docanalysis/gui/css/main.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petermr/docanalysis/eb8a3c13f9491b41f252a363953a976406f964b6/docanalysis/gui/css/main.css -------------------------------------------------------------------------------- /docanalysis/gui/eel.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | function make_papers () { 4 | query = document.getElementById("query").value 5 | number = document.getElementById("number").value 6 | path =document.getElementById("path").value 7 | console.log(path) 8 | } -------------------------------------------------------------------------------- /docanalysis/gui/main.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Docanalysis GUI 10 | 11 | 12 |

13-21 | [markup stripped in extraction — recoverable form labels: "Query", "Number of papers"]
22 | 23 | 24 | -------------------------------------------------------------------------------- /docanalysis/xml_lib.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import os 3 | from lxml import etree as LXET 4 | import logging 5 | 6 | from docanalysis.file_lib import FileLib 7 | 8 | logging.debug("loading xml_lib") 9 | 10 | 11 | # make leafnodes and copy remaning content as XML 12 | TERMINAL_COPY = { 13 | "abstract", 14 | "aff", 15 | "article-id", 16 | "article-categories", 17 | "author-notes", 18 | "caption", 19 | "contrib-group", 20 | "fig", 21 | "history", 22 | "issue", 23 | "journal_id", 24 | "journal-title-group", 25 | "kwd-group", 26 | "name", 27 | "notes", 28 | "p", 29 | "permissions", 30 | "person-group", 31 | "pub-date", 32 | "publisher", 33 | "ref", 34 | "table", 35 | "title", 36 | "title-group", 37 | "volume", 38 | } 39 | 40 | 41 | TERMINALS = [ 42 | "inline-formula", 43 | ] 44 | 45 | TITLE = "title" 46 | 47 | IGNORE_CHILDREN = { 48 | "disp-formula", 49 | } 50 | 51 | HTML_TAGS = { 52 | "italic": "i", 53 | "p": "p", 54 | "sub": "sub", 55 | "sup": "sup", 56 | "tr": "tr", 57 | } 58 | 59 | H_TD = "td" 60 | H_TR = "tr" 61 | H_TH = "th" 62 | LINK = "link" 63 | UTF_8 = "UTF-8" 64 | SCRIPT = "script" 65 | STYLESHEET = "stylesheet" 66 | TEXT_CSS = "text/css" 67 | TEXT_JAVASCRIPT = "text/javascript" 68 | 69 | H_HTML = "html" 70 | H_BODY = "body" 71 | H_TBODY = "tbody" 72 | H_DIV = "div" 73 | H_TABLE = "table" 74 | H_THEAD = "thead" 75 | H_HEAD = "head" 76 | H_TITLE = "title" 77 | 78 | RESULTS = "results" 79 | 80 | SEC_TAGS = { 81 | "sec", 82 | } 83 | 84 | LINK_TAGS = { 85 | "xref", 86 | } 87 | 88 | SECTIONS = "sections" 89 | 90 | HTML_NS = "HTML_NS" 91 | MATHML_NS = "MATHML_NS" 92 | SVG_NS = "SVG_NS" 93 | XMLNS_NS = "XMLNS_NS" 94 | XML_NS = "XML_NS" 95 | XLINK_NS = "XLINK_NS" 96 | 97 | XML_LANG = "{" + XML_NS + "}" + 'lang' 98 | 99 | NS_MAP = { 100 | HTML_NS: "http://www.w3.org/1999/xhtml", 101 | MATHML_NS: "http://www.w3.org/1998/Math/MathML", 102 | SVG_NS: "http://www.w3.org/2000/svg", 103 | XLINK_NS: "http://www.w3.org/1999/xlink", 104 | XML_NS: "http://www.w3.org/XML/1998/namespace", 105 | XMLNS_NS: "http://www.w3.org/2000/xmlns/", 106 | } 107 | 108 | logger = logging.getLogger("xml_lib") 109 | logger.setLevel(logging.WARNING) 110 | 111 | 112 | class XmlLib: 113 | """ """ 114 | 115 | def __init__(self, file=None, section_dir=SECTIONS): 116 | self.max_file_len = 30 117 | self.file = file 118 | self.parent_path = None 119 | self.root = None 120 | self.logger = logging.getLogger("xmllib") 121 | self.section_dir = section_dir 122 | self.section_path = None 123 | # self.logger.setLevel(logging.INFO) 124 | 125 | def read(self, file): 126 | """reads XML file , saves file, and parses to self.root 127 | 128 | :param file: 129 | 130 | """ 131 | if file is not None: 132 | self.file = file 133 | self.parent_path = Path(file).parent.absolute() 134 | self.root = XmlLib.parse_xml_file_to_root(file) 135 | 136 | def make_sections(self, section_dir): 137 | """recursively traverse XML tree and write files for each terminal element 138 | 139 | :param section_dir: 140 | 141 | """ 142 | self.section_dir = self.make_sections_path(section_dir) 143 | # indent = 0 144 | # filename = "1" + "_" + self.root.tag 145 | # self.logger.debug(" " * indent, filename) 146 | # subdir = os.path.join(self.section_dir, filename) 147 | # FileLib.force_mkdir(subdir) 148 | 149 | self.make_descendant_tree(self.root, self.section_dir) 150 | self.logger.info( 
151 | f"wrote XML sections for {self.file} {self.section_dir}") 152 | 153 | @staticmethod 154 | def parse_xml_file_to_root(file): 155 | """read xml path and create root element 156 | 157 | :param file: 158 | 159 | """ 160 | file = str(file) # if file is Path 161 | if not os.path.exists(file): 162 | raise IOError("path does not exist", file) 163 | xmlp = LXET.XMLParser(encoding=UTF_8) 164 | element_tree = LXET.parse(file, xmlp) 165 | root = element_tree.getroot() 166 | return root 167 | 168 | @staticmethod 169 | def parse_xml_string_to_root(xml): 170 | """read xml string and parse to root element 171 | 172 | :param xml: 173 | 174 | """ 175 | from io import StringIO 176 | tree = LXET.parse(StringIO(xml), LXET.XMLParser(ns_clean=True)) 177 | return tree.getroot() 178 | 179 | def make_sections_path(self, section_dir): 180 | """ 181 | 182 | :param section_dir: 183 | 184 | """ 185 | self.section_path = os.path.join(self.parent_path, section_dir) 186 | if not os.path.exists(self.section_path): 187 | FileLib.force_mkdir(self.section_path) 188 | return self.section_path 189 | 190 | def make_descendant_tree(self, elem, outdir): 191 | """ 192 | 193 | :param elem: 194 | :param outdir: 195 | 196 | """ 197 | 198 | self.logger.setLevel(logging.INFO) 199 | if elem.tag in TERMINALS: 200 | self.logger.debug("skipped ", elem.tag) 201 | return 202 | TERMINAL = "T_" 203 | IGNORE = "I_" 204 | children = list(elem) 205 | self.logger.debug(f"children> {len(children)} .. {self.logger.level}") 206 | isect = 0 207 | for child in children: 208 | if "ProcessingInstruction" in str(type(child)): 209 | # print("PI", child) 210 | continue 211 | if "Comment" in str(type(child)): 212 | continue 213 | flag = "" 214 | child_child_count = len(list(child)) 215 | if child.tag in TERMINAL_COPY or child_child_count == 0: 216 | flag = TERMINAL 217 | elif child.tag in IGNORE_CHILDREN: 218 | flag = IGNORE 219 | 220 | title = child.tag 221 | if child.tag in SEC_TAGS: 222 | title = XmlLib.get_sec_title(child) 223 | 224 | if flag == IGNORE: 225 | title = flag + title 226 | filename = str( 227 | isect) + "_" + FileLib.punct2underscore(title).lower()[:self.max_file_len] 228 | 229 | if flag == TERMINAL: 230 | xml_string = LXET.tostring(child) 231 | filename1 = os.path.join(outdir, filename + '.xml') 232 | self.logger.setLevel(logging.INFO) 233 | self.logger.debug(f"writing dbg {filename1}") 234 | try: 235 | with open(filename1, "wb") as f: 236 | f.write(xml_string) 237 | except Exception: 238 | print(f"cannot write {filename1}") 239 | else: 240 | subdir = os.path.join(outdir, filename) 241 | # creates empty dirx, may be bad idea 242 | FileLib.force_mkdir(subdir) 243 | if flag == "": 244 | self.logger.debug(f">> {title} {child}") 245 | self.make_descendant_tree(child, subdir) 246 | isect += 1 247 | 248 | @staticmethod 249 | def get_sec_title(sec): 250 | """get title of JATS section 251 | 252 | :sec: section (normally sec element 253 | 254 | :param sec: 255 | 256 | """ 257 | title = None 258 | for elem in list(sec): 259 | if elem.tag == TITLE: 260 | title = elem.text 261 | break 262 | 263 | if title is None: 264 | # don't know where the 'xml_file' comes from... 
265 | if not hasattr(sec, "xml_file"): 266 | title = "UNKNOWN" 267 | else: 268 | title = "?_" + str(sec["xml_file"][:20]) 269 | title = FileLib.punct2underscore(title) 270 | return title 271 | 272 | @staticmethod 273 | def remove_all(elem, xpath): 274 | """ 275 | 276 | :param elem: 277 | :param xpath: 278 | 279 | """ 280 | for el in elem.xpath(xpath): 281 | el.getparent().remove(el) 282 | 283 | @staticmethod 284 | def get_or_create_child(parent, tag): 285 | """ 286 | 287 | :param parent: 288 | :param tag: 289 | 290 | """ 291 | child = None 292 | if parent is not None: 293 | child = parent.find(tag) 294 | if child is None: 295 | child = LXET.SubElement(parent, tag) 296 | return child 297 | 298 | @classmethod 299 | def get_text(cls, node): 300 | """get text children as string 301 | 302 | :param node: 303 | 304 | """ 305 | return ''.join(node.itertext()) 306 | 307 | @staticmethod 308 | def add_UTF8(html_root): 309 | """adds UTF8 declaration to root 310 | 311 | :param html_root: 312 | 313 | """ 314 | from lxml import etree as LXET 315 | root = html_root.get_or_create_child(html_root, "head") 316 | LXET.SubElement(root, "meta").attrib["charset"] = "UTF-8" 317 | 318 | # replace nodes with text 319 | @staticmethod 320 | def replace_nodes_with_text(data, xpath, replacement): 321 | """replace nodes with specific text 322 | 323 | :param data: 324 | :param xpath: 325 | :param replacement: 326 | 327 | """ 328 | print(data, xpath, replacement) 329 | tree = LXET.fromstring(data) 330 | for r in tree.xpath(xpath): 331 | print("r", r, replacement, r.tail) 332 | text = replacement 333 | if r.tail is not None: 334 | text += r.tail 335 | parent = r.getparent() 336 | if parent is not None: 337 | previous = r.getprevious() 338 | if previous is not None: 339 | previous.tail = (previous.tail or '') + text 340 | else: 341 | parent.text = (parent.text or '') + text 342 | parent.remove(r) 343 | return tree 344 | 345 | @classmethod 346 | def remove_all_tags(cls, xml_string): 347 | """remove all tags from text 348 | 349 | :xml_string: string to be flattened 350 | 351 | :param xml_string: 352 | :returns: flattened string 353 | 354 | """ 355 | tree = LXET.fromstring(xml_string.encode("utf-8")) 356 | strg = LXET.tostring(tree, encoding='utf8', 357 | method='text').decode("utf-8") 358 | return strg 359 | 360 | @classmethod 361 | def xslt_transform(cls, data, xslt_file): 362 | """ 363 | 364 | :param data: 365 | :param xslt_file: 366 | 367 | """ 368 | xslt_root = LXET.parse(xslt_file) 369 | transform = LXET.XSLT(xslt_root) 370 | print("XSLT log", transform.error_log) 371 | result_tree = transform(LXET.fromstring(data)) 372 | assert(result_tree is not None) 373 | root = result_tree.getroot() 374 | assert(root is not None) 375 | 376 | return root 377 | 378 | @classmethod 379 | def xslt_transform_tostring(cls, data, xslt_file): 380 | """ 381 | 382 | :param data: 383 | :param xslt_file: 384 | 385 | """ 386 | root = cls.xslt_transform(data, xslt_file) 387 | return LXET.tostring(root).decode("UTF-8") if root is not None else None 388 | 389 | 390 | class HtmlElement: 391 | """to provide fluent HTML builder and parser""" 392 | pass 393 | 394 | 395 | class DataTable: 396 | """ 397 | 398 | ffml 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | """ 407 | 408 | def __init__(self, title, colheads=None, rowdata=None): 409 | """create dataTables 410 | optionally add column headings (list) and rows (list of conformant lists) 411 | 412 | :param title: of data_title (required) 413 | :param colheads: 414 | :param rowdata: 415 | 416 | """ 417 | 
self.html = LXET.Element(H_HTML) 418 | self.head = None 419 | self.body = None 420 | self.create_head(title) 421 | self.create_table_thead_tbody() 422 | self.add_column_heads(colheads) 423 | self.add_rows(rowdata) 424 | # head, title and body are populated by create_head() and create_table_thead_tbody(); 425 | # do not reset them here (setting self.title = None and then touching self.title.text raises AttributeError) 426 | 427 | 428 | 429 | def create_head(self, title): 430 | """ffml 431 | 432 | 433 | 434 | 435 | 436 | :param title: 437 | 438 | """ 439 | 440 | self.head = LXET.SubElement(self.html, H_HEAD) 441 | self.title = LXET.SubElement(self.head, H_TITLE) 442 | self.title.text = title 443 | 444 | link = LXET.SubElement(self.head, LINK) 445 | link.attrib["rel"] = STYLESHEET 446 | link.attrib["type"] = TEXT_CSS 447 | link.attrib["href"] = "http://ajax.aspnetcdn.com/ajax/jquery.dataTables/1.9.4/css/jquery.dataTables.css" 448 | link.text = '.' # messy, to stop formatter using "/>" which dataTables doesn't like 449 | 450 | script = LXET.SubElement(self.head, SCRIPT) 451 | script.attrib["src"] = "http://ajax.aspnetcdn.com/ajax/jQuery/jquery-1.8.2.min.js" 452 | script.attrib["charset"] = UTF_8 453 | script.attrib["type"] = TEXT_JAVASCRIPT 454 | script.text = '.' # messy, to stop formatter using "/>" which dataTables doesn't like 455 | 456 | script = LXET.SubElement(self.head, SCRIPT) 457 | script.attrib["src"] = "http://ajax.aspnetcdn.com/ajax/jquery.dataTables/1.9.4/jquery.dataTables.min.js" 458 | script.attrib["charset"] = UTF_8 459 | script.attrib["type"] = TEXT_JAVASCRIPT 460 | script.text = "." # messy, to stop formatter using "/>" which dataTables doesn't like 461 | 462 | script = LXET.SubElement(self.head, SCRIPT) 463 | script.attrib["charset"] = UTF_8 464 | script.attrib["type"] = TEXT_JAVASCRIPT 465 | script.text = "$(function() { $(\"#results\").dataTable(); }) " 466 | 467 | def create_table_thead_tbody(self): 468 | """ 469 |
470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | """ 482 | 483 | self.body = LXET.SubElement(self.html, H_BODY) 484 | self.div = LXET.SubElement(self.body, H_DIV) 485 | self.div.attrib["class"] = "bs-example table-responsive" 486 | self.table = LXET.SubElement(self.div, H_TABLE) 487 | self.table.attrib["class"] = "table table-striped table-bordered table-hover" 488 | self.table.attrib["id"] = RESULTS 489 | self.thead = LXET.SubElement(self.table, H_THEAD) 490 | self.tbody = LXET.SubElement(self.table, H_TBODY) 491 | 492 | def add_column_heads(self, colheads): 493 | """ 494 | 495 | :param colheads: 496 | 497 | """ 498 | if colheads is not None: 499 | self.thead_tr = LXET.SubElement(self.thead, H_TR) 500 | for colhead in colheads: 501 | th = LXET.SubElement(self.thead_tr, H_TH) 502 | th.text = str(colhead) 503 | 504 | def add_rows(self, rowdata): 505 | """ 506 | 507 | :param rowdata: 508 | 509 | """ 510 | if rowdata is not None: 511 | for row in rowdata: 512 | self.add_row_old(row) 513 | 514 | def add_row_old(self, row: [str]): 515 | """creates new in 516 | creates
articles | bibliography | dic:country | word:frequencies
child elements of row containing string values 517 | 518 | :param row: list of str 519 | :param row: [str]: 520 | 521 | """ 522 | if row is not None: 523 | tr = LXET.SubElement(self.tbody, H_TR) 524 | for val in row: 525 | td = LXET.SubElement(tr, H_TD) 526 | td.text = val 527 | # print("td", td.text) 528 | 529 | def make_row(self): 530 | """:return: row element""" 531 | return LXET.SubElement(self.tbody, H_TR) 532 | 533 | def append_contained_text(self, parent, tag, text): 534 | """create element and add text child 535 | 536 | :param parent: 537 | :param tag: 538 | :param text: 539 | 540 | """ 541 | subelem = LXET.SubElement(parent, tag) 542 | subelem.text = text 543 | return subelem 544 | 545 | def write_full_data_tables(self, output_dir: str) -> None: 546 | """ 547 | 548 | :param output_dir: str: 549 | 550 | """ 551 | if not os.path.exists(output_dir): 552 | os.makedirs(output_dir) 553 | data_table_file = os.path.join(output_dir, "full_data_table.html") 554 | with open(data_table_file, "w") as f: 555 | text = bytes.decode(LXET.tostring(self.html)) 556 | f.write(text) 557 | print("WROTE", data_table_file) 558 | 559 | def __str__(self): 560 | # s = self.html.text 561 | # print("s", s) 562 | # return s 563 | # ic("ichtml", self.html) 564 | htmltext = LXET.tostring(self.html) 565 | print("SELF", htmltext) 566 | return htmltext 567 | 568 | 569 | class Web: 570 | """ """ 571 | def __init__(self): 572 | import tkinter as tk 573 | root = tk.Tk() 574 | site = "http://google.com" 575 | self.display_html(root, site) 576 | root.mainloop() 577 | 578 | @classmethod 579 | def display_html(cls, master, site): 580 | """ 581 | 582 | :param master: 583 | :param site: 584 | 585 | """ 586 | import tkinterweb 587 | frame = tkinterweb.HtmlFrame(master) 588 | frame.load_website(site) 589 | frame.pack(fill="both", expand=True) 590 | 591 | @classmethod 592 | def tkinterweb_demo(cls): 593 | """ """ 594 | from tkinterweb import Demo 595 | Demo() 596 | 597 | 598 | def main(): 599 | """ """ 600 | 601 | XmlLib().test_recurse_sections() # recursively list sections 602 | 603 | # test_data_table() 604 | # test_xml() 605 | 606 | # web = Web() 607 | # Web.tkinterweb_demo() 608 | 609 | 610 | def test_xml(): 611 | """ """ 612 | xml_string = "foo and with bar" 613 | print(XmlLib.remove_all_tags(xml_string)) 614 | 615 | 616 | def test_data_table(): 617 | """ """ 618 | import pprint 619 | data_table = DataTable("test") 620 | data_table.add_column_heads(["a", "b", "c"]) 621 | data_table.add_row_old(["a1", "b1", "c1"]) 622 | data_table.add_row_old(["a2", "b2", "c2"]) 623 | data_table.add_row_old(["a3", "b3", "c3"]) 624 | data_table.add_row_old(["a4", "b4", "c4"]) 625 | html = LXET.tostring(data_table.html).decode("UTF-8") 626 | HOME = os.path.expanduser("~") 627 | with open(os.path.join(HOME, "junk_html.html"), "w") as f: 628 | f.write(html) 629 | pprint.pprint(html) 630 | 631 | 632 | if __name__ == "__main__": 633 | print("running file_lib main") 634 | main() 635 | else: 636 | # print("running file_lib main anyway") 637 | # main() 638 | pass 639 | 640 | # Credits: Peter Murray-Rust, py4ami (https://github.com/petermr/pyami/blob/main/py4ami/file_lib.py) -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 
6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx_rtd_theme 2 | myst-parser -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | import os 9 | import sys 10 | import sphinx_rtd_theme 11 | sys.path.insert(0, os.path.abspath('..')) 12 | sys.path.append(os.path.abspath('../..')) 13 | project = 'Docanalysis' 14 | copyright = '2022, Ayush Garg, Shweata N Hegde' 15 | author = 'Ayush Garg, Shweata N Hegde' 16 | release = '0.2.4' 17 | 18 | # -- General configuration --------------------------------------------------- 19 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 20 | 21 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'myst_parser'] 22 | 23 | templates_path = ['_templates'] 24 | exclude_patterns = [] 25 | napoleon_google_docstring = True 26 | 27 | 28 | # -- Options for HTML output ------------------------------------------------- 29 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 30 | 31 | html_theme = 'sphinx_rtd_theme' 32 | html_static_path = ['_static'] 33 | -------------------------------------------------------------------------------- /docs/source/docanalysis.rst: -------------------------------------------------------------------------------- 1 | Docanalysis module 2 | ================================== 3 | 4 | .. automodule:: docanalysis.docanalysis 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/entity_extraction.rst: -------------------------------------------------------------------------------- 1 | Entity extraction module 2 | ================================== 3 | 4 | .. automodule:: docanalysis.entity_extraction 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | User Documentation 2 | ============================== 3 | 4 | .. include:: ../../README.md 5 | :parser: myst_parser.sphinx_ 6 | 7 | .. toctree:: 8 | :maxdepth: 1 9 | :hidden: 10 | :caption: User Documentation: 11 | 12 | user_documentation 13 | 14 | 15 | .. 
toctree:: 16 | :maxdepth: 7 17 | :caption: Core modules: 18 | 19 | docanalysis 20 | entity_extraction -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Notebooks 2 | 3 | A general resource for contributed code 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | abbreviations>=0.2.5 2 | beautifulsoup4>=4.10.0 3 | braceexpand>=0.1.7 4 | coloredlogs>=15.0.1 5 | ConfigArgParse>=1.5.3 6 | lxml>=4.7.1 7 | nltk>=3.6.7 8 | pandas>=1.3.4 9 | pygetpapers 10 | pytest>=6.2.5 11 | setuptools>=60.3.1 12 | spacy>=3.0.7 13 | tkinterweb>=3.10.7 14 | tqdm>=4.62.3 15 | yake>=0.4.8 16 | sphinx_rtd_theme>=1.0.0 17 | -------------------------------------------------------------------------------- /resources/approval_number_100.csv: -------------------------------------------------------------------------------- 1 | ,file_path,paragraph,sentence,entities,labels,position_start,position_end,has_terms,weight 2 | 33,C:\Users\shweata\approval_number_100\PMC7833043\sections\1_body\3_ethics_approval\1_p.xml,"The study was approved by the King Khalid University Ethics Committee (approval number: ECM#2020-183–(HAPO-06-B-001), and no identifying personal information (e.g. name, age) or other sensitive data were collected.","The study was approved by the King Khalid University Ethics Committee (approval number: ECM#2020-183–(HAPO-06-B-001), and no identifying personal information (e.g.",the King Khalid University Ethics Committee,ORG,26,69,approval number,1 3 | 63,C:\Users\shweata\approval_number_100\PMC7833043\sections\2_back\2_ethical_approval\1_p.xml,"The Ethical Committee of the Scientific Research, King Khalid University approved the study (approval number: ECM#2020-183–(HAPO-06-B-001) to use scores and absence rates, with no personal information of students disclosed.","The Ethical Committee of the Scientific Research, King Khalid University approved the study (approval number: ECM#2020-183–(HAPO-06-B-001) to use scores and absence rates, with no personal information of students disclosed.",The Ethical Committee of the Scientific Research,ORG,0,48,approval number,1 4 | 96,C:\Users\shweata\approval_number_100\PMC8023627\sections\2_back\2_ethics_approval_and_conse\1_p.xml,"The data were obtained from the Saudi Ministry of Health and World Health Organization records and the study conducted under the approval of the Regional Directorate of Primary Health according to ethical standards with the maintenance of anonymity of each patient. Thus, all the data of patients was recorded without patients details, it was not necessary to obtain the personal consent of the study participants. The study was ethically approved by the institutional review board of the Princess Nourah Bint Abdulrahman University (IRB Approval Number: 20–0217).",The study was ethically approved by the institutional review board of the Princess Nourah Bint Abdulrahman University (IRB Approval Number: 20–0217).,Nourah,GPE,83,89,approval number,1 5 | 169,C:\Users\shweata\approval_number_100\PMC8185902\sections\1_body\1_p.xml,"Following the publication of the above article, the authors have realized that, in the Declarations section on p. 
10, they presented an incorrect approval number from the Ethics Committee in question; the statement here should have read as follows: “The present study was approved by the Ethics Committee of the Affiliated Hospital of Shaoxing University (approval no. 2021003). All patients provided written informed consent.”","Following the publication of the above article, the authors have realized that, in the Declarations section on p. 10, they presented an incorrect approval number from the Ethics Committee in question; the statement here should have read as follows: “The present study was approved by the Ethics Committee of the Affiliated Hospital of Shaoxing University (approval no.","the Ethics Committee, the Ethics Committee of the Affiliated Hospital of Shaoxing University","ORG, ORG","167, 284","187, 354",approval number,1 6 | 1252,C:\Users\shweata\approval_number_100\PMC8358494\sections\1_body\1_methods\1_patients_and_study_design\1_p.xml,"The participants enrolled in this retrospective study were outpatients at Suzuki Clinic. All participants received the Diagnostic and Statistical Manual of Mental Disorders, Fifth Edition diagnosis for insomnia, and were prescribed either lemborexant or benzodiazepine hypnotics. The observation period was from July 2020 (when introduced for clinical use) to December 2020 for lemborexant and benzodiazepine hypnotics. Furthermore, there were no criteria for exclusion of research subjects in this study. This study was approved by the ethics committee of Fukui Kinen Hospital. The approval date and approval number of the ethics committee of Fukui Kinen Hospital were 21 January 2021 and 2-017, respectively. Instead of omitting the informed consent for the retrospective cohort study, information about the study was posted in the hospital, and opt-out recruitment was conducted. Insomnia was assessed using the Japanese version of Athens Insomnia Scale (AIS). 5 Efficacy outcome assessment was from the Clinical Global Impressions-Improvement (CGI-I) scale. 6","The approval date and approval number of the ethics committee of Fukui Kinen Hospital were 21 January 2021 and 2-017, respectively.",Fukui Kinen Hospital,ORG,65,85,approval number,1 7 | 2388,C:\Users\shweata\approval_number_100\PMC8431654\sections\2_body\3_4__material_and_methods\1_4_1__blood_samples_used_i\1_p.xml,"Blood samples of COVID-19 patients were taken within the first two weeks after the detection of the SARS-CoV-2 infection of patients at University Hospital of RWTH Aachen. All patient samples were taken after written and informed consent according to the guidelines and specific approval of the study by the local ethics committee (Ethic approval number EK 080/20 for the Covid-19 Aachen study named COVAS; Ethics committee of RWTH Aachen University, University Hospital Aachen, Pauwelsstrasse 30, 52074 Aachen, Germany) and collected into RWTH cBMB, the central biobank of the medical faculty of RWTH Aachen University (Ethic approval number EK 206/09). Blood samples of healthy donors were taken after written and informed consent according to the guidelines and approval of the study by the local ethics committee (EK 041/15; Ethics committee of RWTH Aachen University, University Hospital Aachen, Pauwelsstrasse 30, 52074 Aachen, Germany). These control samples were taken in the years 2018 and 2019, before the initial SARS-CoV-2 outbreak. 
All venous blood samples were anticoagulated with EDTA and cryopreserved at -80 °C until further analysis.","All patient samples were taken after written and informed consent according to the guidelines and specific approval of the study by the local ethics committee (Ethic approval number EK 080/20 for the Covid-19 Aachen study named COVAS; Ethics committee of RWTH Aachen University, University Hospital Aachen, Pauwelsstrasse 30, 52074 Aachen, Germany) and collected into RWTH cBMB, the central biobank of the medical faculty of RWTH Aachen University (Ethic approval number EK 206/09).","Ethics, RWTH Aachen University, University Hospital Aachen, Pauwelsstrasse, Aachen, Germany, RWTH Aachen University (Ethic","ORG, ORG, GPE, GPE, GPE, ORG","235, 255, 307, 332, 340, 425","241, 305, 321, 338, 347, 454",approval number,1 8 | 2766,C:\Users\shweata\approval_number_100\PMC8442262\sections\2_body\1_methods\9_ethics_approval_and_conse\1_p.xml,"Ethics approval has been approved by the Research Ethics Board of Health (REBH), Ministry of Health, Royal Government of Bhutan vide approval number Ref. No. REBH/Approval/2019/067. An informed consent has been obtained from the individual participants for the use of photographic materials while the use of data and consent to participate had been obtained from the legal guardian (Principal of the school). All methods were carried out in accordance with relevant guidelines and regulations as enshrined in Helsinki Declarations 1964.","Ethics approval has been approved by the Research Ethics Board of Health (REBH), Ministry of Health, Royal Government of Bhutan vide approval number Ref.","the Research Ethics Board of Health, REBH, Ministry of Health, Royal Government, Ref","ORG, ORG, ORG, ORG, ORG","37, 74, 81, 101, 150","72, 78, 99, 117, 153",approval number,1 9 | 3047,C:\Users\shweata\approval_number_100\PMC8449472\sections\2_body\1_methods\1_ethical_approval\1_p.xml,"This study was approved by the Institutional Review Board of Kyungpook National University Chilgok Hospital, Daegu, Korea (approval number: 2020-04-029). As this was a retrospective study, the need for obtaining informed consent from patients was waived by the Institutional Review Board.","This study was approved by the Institutional Review Board of Kyungpook National University Chilgok Hospital, Daegu, Korea (approval number: 2020-04-029).","the Institutional Review Board, Kyungpook National University Chilgok Hospital, Daegu, Korea","ORG, ORG, GPE, GPE","27, 61, 109, 116","57, 107, 114, 121",approval number,1 10 | 3145,C:\Users\shweata\approval_number_100\PMC8454101\sections\2_body\1_methods\1_patients\1_p.xml,Conjunctival samples were obtained from patients undergoing retinal detachment surgery (n = 1) or conjunctival melanoma resection (n = 1) at the Eye Center of the University of Freiburg. 
Ethics approval was granted from Ethics Committee of the Albert-Ludwigs-University Freiburg (approval number 481/19).,Ethics approval was granted from Ethics Committee of the Albert-Ludwigs-University Freiburg (approval number 481/19).,Ethics Committee,ORG,33,49,approval number,1 11 | 3351,C:\Users\shweata\approval_number_100\PMC8462059\sections\2_body\3_ethical_approval\1_p.xml,This study was approved by the institutional review board and ethics committee of the Japanese Red Cross Ise Hospital (approval number: ER2020‐26).,This study was approved by the institutional review board and ethics committee of the Japanese Red Cross Ise Hospital (approval number: ER2020‐26).,the Japanese Red Cross Ise Hospital,ORG,82,117,approval number,1 12 | 3470,C:\Users\shweata\approval_number_100\PMC8468265\sections\2_body\3_4__materials_and_methods\4_4_4__ethical_approval\1_p.xml,"The ASB study was conducted in accordance with the Declaration of Helsinki. The ASB study was approved by the research ethics committee of the Academic Medical Centre, Amsterdam, the Netherlands (approval number MEC 2011-073, date of approval 29-4-2011) and by the institutional review board of each participating hospital. The national perinatal registry in the Netherlands (PERINED) approved linkage of the ASB cohort with their database to further complete missing data on outcomes (approval number 13.64, date of approval 17-12-2013).","The ASB study was approved by the research ethics committee of the Academic Medical Centre, Amsterdam, the Netherlands (approval number MEC 2011-073, date of approval 29-4-2011) and by the institutional review board of each participating hospital.","ASB, the Academic Medical Centre, Amsterdam, Netherlands","ORG, ORG, GPE, GPE","4, 63, 92, 107","7, 90, 101, 118",approval number,1 13 | 3471,C:\Users\shweata\approval_number_100\PMC8468265\sections\2_body\3_4__materials_and_methods\4_4_4__ethical_approval\1_p.xml,"The ASB study was conducted in accordance with the Declaration of Helsinki. The ASB study was approved by the research ethics committee of the Academic Medical Centre, Amsterdam, the Netherlands (approval number MEC 2011-073, date of approval 29-4-2011) and by the institutional review board of each participating hospital. The national perinatal registry in the Netherlands (PERINED) approved linkage of the ASB cohort with their database to further complete missing data on outcomes (approval number 13.64, date of approval 17-12-2013).","The national perinatal registry in the Netherlands (PERINED) approved linkage of the ASB cohort with their database to further complete missing data on outcomes (approval number 13.64, date of approval 17-12-2013).","Netherlands, ASB","GPE, ORG","39, 85","50, 88",approval number,1 14 | 3734,C:\Users\shweata\approval_number_100\PMC8474954\sections\2_body\1_methods\8_in_vivo_metastasis_assay\1_p.xml,"To test if EMMPRIN plays an important role in osteosarcoma metastasis in vivo, the osteosarcoma cell line 143B was injected in the tail vein of BALB/c mice. The mice were sacrificed at 8 weeks post-injection. Four-week-old male BALB/c nude mice were obtained from Central Lab. Animal Inc. (Seoul, Korea) and maintained under standard conditions until the experiments were performed. The animals were maintained at the animal facility of the Seoul National University Hospital under guidelines prior to the grouping and experiments. 
A total of 15 BALB/c nude mice were randomized into 3 groups: 1 normal, 2 143 cells transfected with an ad mock shRNA vector (Control), and 3 143 cells transfected with the ad EMMPRIN shRNA vector. Experiments were approved by the Institutional Animal Care and Use Committee of Seoul National University Hospital (approval number 10–0075). One anti-EMMPRIN sequence (5-GTCGTCAGAACACATCAAC-3) or a scrambled sequence was inserted into the plasmid vector pAdEasy-1 (Addgene). They were designated as pAdEasy-1-shRNA and pAdEasy-1 scramble shRNA, respectively. Osteosarcoma cell line 143B was transfected with EMMPRIN shRNA. EMMPRIN shRNA transfected 143B cells were harvested with trypsin, and then resuspended in serum-free RPMI, and injected in the tail vein (1 × 10 5/0.2 mL) of 5 nude mice per group. Health of the animals was monitored daily, and body weights were measured weekly throughout the study period. Anesthesia was performed with isoflurane inhalation as well as ketamine (10 mg/kg) and medetomidine (0.1 mg/kg) injection. All surgical procedures were performed under sterile conditions. Secondary euthanasia method for cervical dislocation was also performed. The mice were sacrificed by CO 2 inhalation at 8 weeks post-injection. Harvested tissues were preserved in Bouin’s fixative, embedded in paraffin, sectioned (4 μm), and stained with hematoxylin and eosin (H&E). Examination of the histological sections was performed using Nikon Eclipse Ci microscope (Nikon Corp., Tokyo, Japan) by a digital camera (Nikon digital sight, DS-2Mv) and the automatic exposure and iSolution Lite software for microscopic images. The tumor lengths and widths were measured by a perpendicular tumor diameter, with the tumor volume being calculated using the following formula: width 2 × length/2 20.",Experiments were approved by the Institutional Animal Care and Use Committee of Seoul National University Hospital (approval number 10–0075).,the Institutional Animal Care and Use Committee of Seoul National University Hospital,ORG,29,114,approval number,1 15 | 3821,C:\Users\shweata\approval_number_100\PMC8475677\sections\2_body\3_methods\3_institutional_review_boar\1_p.xml,"The study was approved by the institutional review boards of Kyoto University Graduate School of Medicine (approval number: E2311), Shiga General Hospital (approval number: 20141120‐01), Tenri Hospital (approval number: 640), Kobe City Medical Center General Hospital (approval number: 14094), Hyogo Prefectural Amagasaki General Medical Center (approval number: Rinri 26‐32), National Hospital Organization Kyoto Medical Center (approval number: 14‐080), Mitsubishi Kyoto Hospital (approved 11/12/2014), Okamoto Memorial Hospital (approval number: 201503), Japanese Red Cross Otsu Hospital (approval number: 318), Hikone Municipal Hospital (approval number: 26‐17), Japanese Red Cross Osaka Hospital (approval number: 392), Shimabara Hospital (approval number: E2311), Kishiwada City Hospital (approval number: 12), Kansai Electric Power Hospital (approval number: 26‐59), Shizuoka General Hospital (approval number: Rin14‐11‐47), Kurashiki Central Hospital (approval number: 1719), Kokura Memorial Hospital (approval number: 14111202), Kitano Hospital (approval number: P14‐11‐012), and Japanese Red Cross Wakayama Medical Center (approval number: 328).","The study was approved by the institutional review boards of Kyoto University Graduate School of Medicine (approval number: E2311), Shiga General Hospital (approval number: 20141120‐01), Tenri Hospital (approval 
number: 640), Kobe City Medical Center General Hospital (approval number: 14094), Hyogo Prefectural Amagasaki General Medical Center (approval number: Rinri 26‐32), National Hospital Organization Kyoto Medical Center (approval number: 14‐080), Mitsubishi Kyoto Hospital (approved 11/12/2014), Okamoto Memorial Hospital (approval number: 201503), Japanese Red Cross Otsu Hospital (approval number: 318), Hikone Municipal Hospital (approval number: 26‐17), Japanese Red Cross Osaka Hospital (approval number: 392), Shimabara Hospital (approval number: E2311), Kishiwada City Hospital (approval number: 12), Kansai Electric Power Hospital (approval number: 26‐59), Shizuoka General Hospital (approval number: Rin14‐11‐47), Kurashiki Central Hospital (approval number: 1719), Kokura Memorial Hospital (approval number: 14111202), Kitano Hospital (approval number: P14‐11‐012), and Japanese Red Cross Wakayama Medical Center (approval number: 328).","Kyoto University Graduate School of Medicine, Shiga General Hospital, Tenri Hospital, Kobe City Medical Center General Hospital, Hyogo Prefectural Amagasaki General Medical Center, National Hospital Organization Kyoto Medical Center, Mitsubishi Kyoto Hospital, Okamoto Memorial Hospital, Japanese Red Cross Otsu Hospital, Hikone Municipal Hospital, Japanese Red Cross Osaka Hospital, Shimabara Hospital, Kishiwada City Hospital, Kansai Electric Power Hospital, Shizuoka General Hospital, Kurashiki Central Hospital, Kokura Memorial Hospital, Kitano Hospital, Japanese Red Cross","ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG","61, 132, 187, 226, 294, 377, 456, 505, 558, 615, 667, 725, 770, 817, 874, 932, 984, 1038, 1089","105, 154, 201, 267, 344, 428, 481, 530, 590, 640, 700, 743, 793, 847, 899, 958, 1008, 1053, 1107",approval number,1 16 | 4328,C:\Users\shweata\approval_number_100\PMC8482572\sections\2_body\1_methods\4_ethics_approval_and_infor\1_p.xml,The study protocol was approved by the ethical committees of Kawasaki University of Medical Welfare (Approval number: 18-102) and Chiang Mai University (Approval number: NUR-2562-06120). 
All participants provided written informed consent to participate in the study.,The study protocol was approved by the ethical committees of Kawasaki University of Medical Welfare (Approval number: 18-102) and Chiang Mai University (Approval number: NUR-2562-06120).,"Kawasaki University of Medical Welfare (Approval, Chiang Mai University","ORG, ORG","61, 130","109, 151",approval number,1 17 | -------------------------------------------------------------------------------- /resources/demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | from docanalysis import DocAnalysis 3 | from pathlib import Path 4 | 5 | doc_analysis = DocAnalysis() 6 | ETHICS_DICTIONARY_DIR = Path(os.getcwd(), "ethics_dictionary") 7 | CORPUS_DIR = Path(os.getcwd(), "corpus") 8 | 9 | 10 | def create_phrases_file(phrases_dir, phrases_file, dictionary_dir=ETHICS_DICTIONARY_DIR): 11 | global terms_xml_path 12 | terms_xml_dir = Path(dictionary_dir, phrases_dir) 13 | if not terms_xml_dir.exists(): 14 | terms_xml_dir.mkdir() 15 | terms_xml_path = Path(terms_xml_dir, phrases_file) 16 | return terms_xml_path 17 | 18 | 19 | def get_or_create_corpus_dir(subdir_name, corpus_dir=CORPUS_DIR): 20 | """get specific corpus directory, creating if necessary 21 | 22 | :param corpus_dir: directory containing corpora 23 | :param subdir_name: specific corpus to get or create 24 | :return: directoyr of specific corpus""" 25 | assert corpus_dir.exists(), "directory of corpora must exist" 26 | subdir = Path(corpus_dir, subdir_name) 27 | if not subdir.exists(): 28 | subdir.mkdir() 29 | return subdir 30 | 31 | 32 | def run_analysis(corpus_path, phrases_file, query=None, hits=30): 33 | dict_for_entities = doc_analysis.extract_entities_from_papers( 34 | corpus_path=corpus_path, 35 | terms_xml_path=terms_xml_path, 36 | query=query, 37 | hits=hits, 38 | make_project=True 39 | ) 40 | create_and_write_list_for_fields(dict_for_entities, "ORG", "org.text") 41 | create_and_write_list_for_fields(dict_for_entities, "GPE", "GPE.text") 42 | 43 | 44 | def create_and_write_list_for_fields(dict_for_entities, field, out_filename): 45 | list_with_orgs = doc_analysis.extract_particular_fields( 46 | dict_for_entities, field) 47 | with open(out_filename, 'w') as f: 48 | f.write(str(list_with_orgs)) 49 | 50 | 51 | ETHICS = "ethics" 52 | TERPENES = "terpenes" 53 | options = { 54 | ETHICS, 55 | TERPENES 56 | } 57 | 58 | if ETHICS in options: 59 | corpus_dir = get_or_create_corpus_dir("e_cancer_clinical_trial_50") 60 | phrases_file = create_phrases_file("ethics_key_phrases", "ethics_key_phrases.xml", ) 61 | run_analysis( 62 | corpus_dir, 63 | phrases_file, 64 | query="ethics" 65 | ) 66 | 67 | if TERPENES in options: 68 | run_analysis( 69 | get_or_create_corpus_dir(TERPENES), 70 | create_phrases_file("terpenes_key_phrases", "terpenes_key_phrases.xml", dictionary_dir="terpenes_dictionary"), 71 | query=TERPENES, 72 | hits = 20, 73 | ) 74 | 75 | -------------------------------------------------------------------------------- /resources/docanalyis_architecture_diagram.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petermr/docanalysis/eb8a3c13f9491b41f252a363953a976406f964b6/resources/docanalyis_architecture_diagram.PNG -------------------------------------------------------------------------------- /resources/entities_country.csv: -------------------------------------------------------------------------------- 1 | 
,file_path,paragraph,sentence,section,entities,labels,position_start,position_end,abbreviations,abbreviations_longform,abbreviation_start,abbreviation_end,has_terms,weight_terms 2 | 1,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8457177\sections\1_front\1_article-meta\7_aff.xml, 1 Institute for Organic Chemistry and BMWZ Leibniz Universität Hannover Schneiderberg 38 30167 Hannover Germany , 1 Institute for Organic Chemistry and BMWZ Leibniz Universität Hannover Schneiderberg 38 30167 Hannover Germany,AFF,Germany,GPE,122,129,,,,,Germany,1 3 | 3,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8457177\sections\1_front\1_article-meta\8_aff.xml, 2 Structure and Function of Proteins Helmholtz Centre for Infection Research Inhoffenstr. 7 38124 Braunschweig Germany ,7 38124 Braunschweig Germany,AFF,Germany,GPE,27,34,,,,,Germany,1 4 | 5,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8457177\sections\1_front\1_article-meta\9_aff.xml," 3 Institute for Biochemistry, Biotechnology and Bioinformatics Technische Universität Braunschweig Spielmannstr. 7 38106 Braunschweig Germany ",7 38106 Braunschweig Germany,AFF,Germany,GPE,27,34,,,,,Germany,1 5 | 6,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8465594\sections\1_front\1_article-meta\6_aff.xml,"1 Department of Chemistry of Natural Compounds, University of Chemistry and Technology, Technicka 5, 166 28 Prague, Czech Republic","1 Department of Chemistry of Natural Compounds, University of Chemistry and Technology, Technicka 5, 166 28 Prague, Czech Republic",AFF,"Prague, Czech Republic","GPE, GPE","108, 116","114, 130",,,,,Czech Republic,1 6 | 7,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8465594\sections\1_front\1_article-meta\7_aff.xml,"2 Institute of Bioorganic Chemistry, National Academy of Sciences of Belarus, 5/2 Academician V. F. Kuprevich Street, BY-220141 Minsk, Belarus; khripach@iboch.by","2 Institute of Bioorganic Chemistry, National Academy of Sciences of Belarus, 5/2 Academician V. F. 
Kuprevich Street, BY-220141 Minsk, Belarus; khripach@iboch.by",AFF,"BY-220141 Minsk, Belarus","GPE, GPE","118, 135","133, 142",,,,,Belarus,1 7 | 11,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8541587\sections\1_front\1_article-meta\7_aff.xml,"Institut National de Recherche pour L’agriculture, L’alimentation et L’environnement (INRAE), 13182 Aix-en-Provence, France; bastien.romero@inrae.fr","Institut National de Recherche pour L’agriculture, L’alimentation et L’environnement (INRAE), 13182 Aix-en-Provence, France; bastien.romero@inrae.fr",AFF,France,GPE,117,123,,,,,France,1 8 | 13,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8625326\sections\1_front\1_article-meta\7_aff.xml,"School of Forestry and Resource Conservation, National Taiwan University, Taipei 10617, Taiwan; mary2234@gmail.com (L.-T.M.); bad8016479@gmail.com (P.-L.L.); timmy304681@gmail.com (Y.-T.C.); jimmy81513@hotmail.com (T.-F.S.)","School of Forestry and Resource Conservation, National Taiwan University, Taipei 10617, Taiwan; mary2234@gmail.com (L.-T.M.",AFF,"Taipei, Taiwan","GPE, GPE","74, 88","80, 94",,,,,Taiwan,1 9 | 17,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8625850\sections\1_front\1_article-meta\7_aff.xml,"1 INSERM U1070 “Pharmacology of Anti-Infective Agents”, 1 rue Georges Bonnet, Pôle Biologie Santé, 86022 Poitiers, France; chantal.valcourtsainz@gmail.com (C.V.); julien.buyck@univ-poitiers.fr (J.M.B.); nicolas.gregoire@univ-poitiers.fr (N.G.); william.couet@univ-poitiers.fr (W.C.); sandrine.marchand@univ-poitiers.fr (S.M.)","1 INSERM U1070 “Pharmacology of Anti-Infective Agents”, 1 rue Georges Bonnet, Pôle Biologie Santé, 86022 Poitiers, France; chantal.valcourtsainz@gmail.com (C.V.); julien.buyck@univ-poitiers.fr (J.M.B.",AFF,"France, C.V.","GPE, GPE","115, 158","121, 162",,,,,France,1 10 | 20,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8625850\sections\1_front\1_article-meta\8_aff.xml,"2 UFR Médecine-Pharmacie Université de Poitiers, 6 rue de la Milétrie, TSA 51115, 86073 Poitiers, France","2 UFR Médecine-Pharmacie Université de Poitiers, 6 rue de la Milétrie, TSA 51115, 86073 Poitiers, France",AFF,France,GPE,98,104,,,,,France,1 11 | 21,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8625850\sections\1_front\1_article-meta\9_aff.xml,"3 Laboratoire de Toxicologie-Pharmacocinétique, CHU de Poitiers, 2 rue de la Miletrie, 86021 Poitiers, France","3 Laboratoire de Toxicologie-Pharmacocinétique, CHU de Poitiers, 2 rue de la Miletrie, 86021 Poitiers, France",AFF,France,GPE,103,109,,,,,France,1 12 | 23,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8701039\sections\1_front\1_article-meta\7_aff.xml,"1 Centrale Marseille, CNRS, iSm2 Marseille, ISM2 UMR 7313, Aix-Marseille Université, Av. 
Escadrille Normandie-Niemen, 13013 Marseille, France; julie.couillaud.13990@gmail.com (J.C.); letitia.LEYDET@univ-amu.fr (L.L.); katia.duquesne@univ-amu.fr (K.D.)","Escadrille Normandie-Niemen, 13013 Marseille, France; julie.couillaud.13990@gmail.com (J.C.); letitia.LEYDET@univ-amu.fr (L.L.",AFF,"Marseille, France, J.C.","GPE, GPE, GPE","35, 46, 89","44, 52, 93",,,,,France,1 13 | 25,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8701039\sections\1_front\1_article-meta\8_aff.xml,"2 Systems and Synthetic Biology Division, Department of Biology and Biological Engineering, Chalmers University of Technology, 41296 Gothenburg, Sweden","2 Systems and Synthetic Biology Division, Department of Biology and Biological Engineering, Chalmers University of Technology, 41296 Gothenburg, Sweden",AFF,"Gothenburg, Sweden","GPE, GPE","133, 145","143, 151",,,,,Sweden,1 14 | 30,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8733304\sections\1_front\1_article-meta\5_aff.xml,"1 Egyptian Deserts Gene Bank, North Sinai Research Station, Department of Genetic Resources, Desert Research Center , Cairo , Egypt","1 Egyptian Deserts Gene Bank, North Sinai Research Station, Department of Genetic Resources, Desert Research Center , Cairo , Egypt",AFF,"Cairo, Egypt","GPE, GPE","119, 128","124, 133",,,,,Egypt,1 15 | 33,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8733304\sections\1_front\1_article-meta\8_aff.xml,"4 Department of Biology, Faculty of Science, University of Tabuk , Tabuk , Saudi Arabia","4 Department of Biology, Faculty of Science, University of Tabuk , Tabuk , Saudi Arabia",AFF,Saudi Arabia,GPE,77,89,,,,,Saudi Arabia,1 16 | 34,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8733304\sections\1_front\1_article-meta\9_aff.xml,"5 Department of Plant Agricultural, Faculty of Agriculture Science, Al-Azhar University , Assiut , Egypt","5 Department of Plant Agricultural, Faculty of Agriculture Science, Al-Azhar University , Assiut , Egypt",AFF,Egypt,GPE,101,106,,,,,Egypt,1 17 | 35,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\10_aff.xml,"4 ICREA—Catalan Institution for Research and Advanced Studies, 08010 Barcelona, Spain; jrintjema@iciq.es","4 ICREA—Catalan Institution for Research and Advanced Studies, 08010 Barcelona, Spain; jrintjema@iciq.es",AFF,Spain,GPE,80,85,,,,,Spain,1 18 | 36,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\11_aff.xml,"5 Institute of Chemical Research of Catalonia (ICIQ), Barcelona Institute of Science and Technology, 43007 Tarragona, Spain; fbravo@iciq.es (F.B.); akleij@iciq.es (A.W.K.)","5 Institute of Chemical Research of Catalonia (ICIQ), Barcelona Institute of Science and Technology, 43007 Tarragona, Spain; fbravo@iciq.es (F.B.",AFF,Spain,GPE,118,123,,,,,Spain,1 19 | 38,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\12_aff.xml,"6 Institute for Bioengineering of Catalonia, Baldiri Reixac 10-12, 08028 Barcelona, Spain","6 Institute for Bioengineering of Catalonia, Baldiri Reixac 10-12, 08028 Barcelona, Spain",AFF,"Barcelona, Spain","GPE, GPE","73, 84","82, 89",,,,,Spain,1 20 | 39,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\7_aff.xml,"1 Departament d’Enginyeria Química, EEBE, Universitat Politècnica de Catalunya, 08019 Barcelona, Spain; reza.zeinali@upc.edu (R.Z.); lourdes.franco@upc.edu (L.F.); carlos.aleman@upc.edu (C.A.)","1 Departament d’Enginyeria Química, EEBE, Universitat Politècnica de 
Catalunya, 08019 Barcelona, Spain; reza.zeinali@upc.edu (R.Z.",AFF,Spain,GPE,97,102,,,,,Spain,1 21 | 41,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\8_aff.xml,"2 Center for Research in Nano-Engineering, CrNE, Universitat Politècnica de Catalunya, C. Eduard Maristany, 08019 Barcelona, Spain","2 Center for Research in Nano-Engineering, CrNE, Universitat Politècnica de Catalunya, C. Eduard Maristany, 08019 Barcelona, Spain",AFF,Spain,GPE,125,130,,,,,Spain,1 22 | 42,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\9_aff.xml,"3 ALBA Synchrotron Light Source, Carrer de la Llum, 2-26, Cerdanyola del Vallès, 08290 Barcelona, Spain; iyousef@cells.es","3 ALBA Synchrotron Light Source, Carrer de la Llum, 2-26, Cerdanyola del Vallès, 08290 Barcelona, Spain; iyousef@cells.es",AFF,"Barcelona, Spain","GPE, GPE","87, 98","96, 103",,,,,Spain,1 23 | 45,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8771452\sections\1_front\1_article-meta\10_aff.xml,"4 School of Pure and Applied Sciences, Karatina University , Karatina, Kenya","4 School of Pure and Applied Sciences, Karatina University , Karatina, Kenya",AFF,"Karatina, Kenya","GPE, GPE","62, 73","70, 78",,,,,Kenya,1 24 | 61,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8778794\sections\1_front\1_article-meta\7_aff.xml,"1 Instituto Botánico, Departamento de Ciencia y Tecnología Agroforestal y Genética, Universidad de Castilla-La Mancha, Campus Universitario s/n, 02071 Albacete, Spain; maria.mondejar3@alu.uclm.es (M.M.-L.); albertojose.lopez@uclm.es (A.J.L.-J.); Oussama.ahrazem@uclm.es (O.A.); MariaLourdes.gomez@uclm.es (L.G.-G.)","1 Instituto Botánico, Departamento de Ciencia y Tecnología Agroforestal y Genética, Universidad de Castilla-La Mancha, Campus Universitario s/n, 02071 Albacete, Spain; maria.mondejar3@alu.uclm.es (M.M.-L.); albertojose.lopez@uclm.es (A.J.L.-J.",AFF,"Spain, M.M.-L.","GPE, GPE","161, 199","166, 206",,,,,Spain,1 25 | 64,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8778794\sections\1_front\1_article-meta\8_aff.xml,"2 Departamento de Química Inorgánica, Orgánica y Bioquímica, Facultad de Farmacia, Universidad de Castilla-La Mancha, C/José María Sánchez Ibáñez s/n, 02008 Albacete, Spain; Joaquinc.garcia@uclm.es","2 Departamento de Química Inorgánica, Orgánica y Bioquímica, Facultad de Farmacia, Universidad de Castilla-La Mancha, C/José María Sánchez Ibáñez s/n, 02008 Albacete, Spain; Joaquinc.garcia@uclm.es",AFF,"Orgánica, Spain","GPE, GPE","38, 167","46, 172",,,,,Spain,1 26 | 65,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8778794\sections\1_front\1_article-meta\9_aff.xml,"3 Regional Center for Biomedical Research (CRIB), Universidad de Castilla-La Mancha, C/Almansa 13, 02008 Albacete, Spain","3 Regional Center for Biomedical Research (CRIB), Universidad de Castilla-La Mancha, C/Almansa 13, 02008 Albacete, Spain",AFF,Spain,GPE,115,120,,,,,Spain,1 27 | 66,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8782336\sections\1_front\1_article-meta\6_aff.xml,"1 Division of Biological Sciences, University of California, San Diego, La Jolla, California, United States of America","1 Division of Biological Sciences, University of California, San Diego, La Jolla, California, United States of America",AFF,"La Jolla, California, United States of America","GPE, GPE, GPE","74, 84, 96","82, 94, 120",,,,,United States of America,1 28 | 68,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8782336\sections\1_front\1_article-meta\9_aff.xml," SRUC: 
Scotland’s Rural College, UNITED KINGDOM "," SRUC: Scotland’s Rural College, UNITED KINGDOM",AFF,"Scotland, UNITED KINGDOM","GPE, GPE","8, 34","16, 48",,,,,United Kingdom,1 29 | -------------------------------------------------------------------------------- /resources/fig_ent.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /resources/pmr_demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | from docanalysis import DocAnalysis 3 | from pathlib import Path 4 | 5 | ethic_statement_creator = DocAnalysis() 6 | term_dir = Path(os.getcwd(), "terpenes_dictionary", "terpenes_key_phrases", ) 7 | if not term_dir.exists(): 8 | term_dir.mkdir() 9 | dict_for_entities = ethic_statement_creator.extract_entities_from_papers( 10 | corpus_path=Path(os.getcwd(), "corpus", "terpenes", ), 11 | terms_xml_path=Path(term_dir, "terpenes_key_phrases.xml"), 12 | query="terpenes", 13 | hits=10, 14 | make_project=True 15 | ) 16 | print(f"dict {dict_for_entities}") 17 | list_with_orgs = ethic_statement_creator.extract_particular_fields( 18 | dict_for_entities, 'ORG') 19 | with open('org.text', 'w') as f: 20 | f.write(str(list_with_orgs)) 21 | list_with_gpe = ethic_statement_creator.extract_particular_fields( 22 | dict_for_entities, 'GPE') 23 | with open('GPE.text', 'w') as f: 24 | f.write(str(list_with_gpe)) 25 | 26 | -------------------------------------------------------------------------------- /resources/test_pmc.txt: -------------------------------------------------------------------------------- 1 | PMC8771452, PMC8771452, PMC8771452 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | try: 5 | from setuptools import setup 6 | except ImportError: 7 | from distutils.core import setup 8 | import configparser 9 | import os 10 | 11 | with open('README.md', encoding='utf-8') as readme_file: 12 | readme = readme_file.read() 13 | 14 | requirements = ['abbreviations', 
'beautifulsoup4==4.10.0', 'braceexpand==0.1.7', 'coloredlogs==15.0.1', 'ConfigArgParse==1.5.3', 'lxml==4.7.1', 'nltk==3.6.7', 'pandas==1.3.4', 15 | 'pygetpapers', 16 | 'pytest==6.2.5', 17 | 'setuptools==60.3.1', 18 | 'spacy==3.0.7', 19 | 'tkinterweb==3.10.7', 20 | 'tqdm==4.62.3' 21 | ] 22 | 23 | setup( 24 | name='docanalysis', 25 | version="0.3.0", 26 | description='extract structured information from ethics paragraphs', 27 | long_description_content_type='text/markdown', 28 | long_description=readme, 29 | author='Ayush Garg, Shweata N. Hegde', 30 | author_email='ayush@science.org.in, shweata.hegde@gmail.com', 31 | url='https://github.com/petermr/docanalysis', 32 | packages=[ 33 | 'docanalysis', 34 | ], 35 | package_dir={'docanalysis': 36 | 'docanalysis'}, 37 | include_package_data=True, 38 | install_requires=requirements, 39 | license='Apache License', 40 | zip_safe=False, 41 | keywords='research automation', 42 | classifiers=[ 43 | 'Development Status :: 4 - Beta', 44 | 'Intended Audience :: Developers', 45 | 'License :: OSI Approved :: Apache Software License', 46 | 'Natural Language :: English', 47 | 'Programming Language :: Python :: 3.4', 48 | 'Programming Language :: Python :: 3.5', 49 | 'Programming Language :: Python :: 3.6', 50 | 'Programming Language :: Python :: 3.7', 51 | 'Programming Language :: Python :: 3.8', 52 | 'Programming Language :: Python :: 3.9', 53 | 'Programming Language :: Python :: 3.10', 54 | 55 | ], 56 | entry_points={ 57 | 'console_scripts': [ 58 | 'docanalysis=docanalysis.docanalysis:main', 59 | ], 60 | }, 61 | 62 | ) 63 | -------------------------------------------------------------------------------- /tests/test_docanalysis_cli.py: -------------------------------------------------------------------------------- 1 | # test whether... 
2 | # Cproject exists (I) 3 | # dictionary exists (I) 4 | # sections exist (I) 5 | # non-empty CSV exists (O) 6 | # dictionary is created (not sure whether entity and keyphrase dictionaries can be created at the same time) 7 | import pytest 8 | from pathlib import Path 9 | import os 10 | 11 | DOCANALYSIS_TOP = Path(__file__).parent.parent 12 | #EXISTING_CPROJECT = Path(DOCANALYSIS_TOP, 'stem_cell_research_300') 13 | PMC_TEXT_FILE = Path(DOCANALYSIS_TOP, 'resources', 'test_pmc.txt') 14 | DICT_DIRECTORY = Path(DOCANALYSIS_TOP, 'ethics_dictionary') 15 | TEST_DICT = Path(DICT_DIRECTORY, 'ethics_demo', 'ethics_demo.xml') 16 | TEMP_CPROJECT = Path(DOCANALYSIS_TOP, 'test_ethics_20') 17 | 18 | class TestDocanalysis: 19 | 20 |     def test_pygetpapers(self): 21 |         """checks whether 22 |         - the corpus directory exists or not 23 |         - the number of PMC* folders is equal to the hits specified 24 |         - fulltext.xml exists in each PMC folder or not 25 |         """ 26 |         os.system(f'docanalysis --run_pygetpapers --terms {TEST_DICT} --project_name {TEMP_CPROJECT}') 27 |         assert TEMP_CPROJECT.exists(), f"checking whether {TEMP_CPROJECT} exists" 28 |         assert len(list(TEMP_CPROJECT.glob('PMC*/'))) == 3 29 |         assert len(list(TEMP_CPROJECT.glob('PMC*/fulltext.xml'))) == 3 30 | 31 |     def test_section_exists(self): 32 |         """checks whether 33 |         - the number of PMC folders with sections is equal to the number of hits 34 |         - a sections directory exists in each PMC folder 35 |         # not sure if this is the right way of testing whether papers are sectioned 36 |         """ 37 | 38 |         os.system(f'docanalysis --project_name {TEMP_CPROJECT} --run_sectioning') 39 |         assert len(list(TEMP_CPROJECT.glob('PMC*/sections/'))) == 3 40 |         for pmc_dir in TEMP_CPROJECT.glob('PMC*/'): 41 |             sections_dir = Path(pmc_dir, 'sections') 42 |             assert sections_dir.exists(), f"{sections_dir} must exist" 43 | 44 |     def test_search_dict_exists(self): 45 |         """checks whether the dictionary directory exists or not 46 |         """ 47 |         assert TEST_DICT.exists(), f"dictionary {TEST_DICT} must exist" 48 | 49 |     def test_csv_output_creation(self): 50 |         """checks whether the csv output is created or not 51 |         """ 52 |         os.system(f'docanalysis --project_name {TEMP_CPROJECT} --dictionary {TEST_DICT} --output') 53 |         assert Path(TEMP_CPROJECT, 'entities.csv').exists(), 'checking if the output is created' 54 | 55 |     def test_dict_creation_entities(self): 56 |         os.system(f'docanalysis --project_name {TEMP_CPROJECT} --dictionary {TEST_DICT} --output --make_ami_dict entities.xml') 57 |         assert Path(TEMP_CPROJECT, 'entities.xml').exists(), 'checking if the entity dictionary is created' 58 | 59 |     def test_remove_dir(self): 60 |         import shutil 61 |         shutil.rmtree(TEMP_CPROJECT) 62 |         assert not TEMP_CPROJECT.exists(), "temporary project directory should have been removed" -------------------------------------------------------------------------------- /tests/test_docanalysis_method.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import glob 3 | import os 4 | from pathlib import Path 5 | from docanalysis import DocAnalysis 6 | 7 | DOCANALYSIS_TOP = Path(__file__).parent.parent 8 | EXISTING_CPROJECT = Path(DOCANALYSIS_TOP, 'stem_cell_research_300') 9 | 10 | class TestDocanalysisMeth(): 11 | 12 |     def test_cproject_exists(self): 13 |         assert EXISTING_CPROJECT.exists(), f"checking whether {EXISTING_CPROJECT} exists" 14 | 15 |     def test_glob_section(self): 16 |         all_paragraphs = glob.glob(os.path.join( 17 |             EXISTING_CPROJECT, '*', 'sections', '**', '[1-9]_p.xml'), recursive=True) 18 |         assert all_paragraphs is not None
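19 | 20 | # --- illustrative sketch (editor's addition, not part of the original test suite) --- 21 | # A minimal, hedged end-to-end check built on the same DocAnalysis API used in 22 | # resources/demo.py and resources/pmr_demo.py; the dictionary path below is an 23 | # assumption borrowed from tests/test_docanalysis_cli.py and may need adjusting locally. 24 | ETHICS_DEMO_DICT = Path(DOCANALYSIS_TOP, 'ethics_dictionary', 'ethics_demo', 'ethics_demo.xml') 25 | 26 | @pytest.mark.skipif(not (EXISTING_CPROJECT.exists() and ETHICS_DEMO_DICT.exists()), 27 |                     reason="local corpus and/or demo dictionary not available") 28 | def test_extract_entities_sketch(): 29 |     """illustrative only: exercises the entity-extraction API on an existing CProject""" 30 |     doc_analysis = DocAnalysis() 31 |     dict_for_entities = doc_analysis.extract_entities_from_papers( 32 |         corpus_path=EXISTING_CPROJECT, 33 |         terms_xml_path=ETHICS_DEMO_DICT, 34 |         make_project=False)  # assumed: False skips the pygetpapers download step for an existing corpus 35 |     assert dict_for_entities is not None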
-------------------------------------------------------------------------------- /tests/testing_test.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | DOCANALYSIS_TOP = Path(__file__).parent.parent 4 | print(DOCANALYSIS_TOP) 5 | #EXISTING_CPROJECT = Path(DOCANALYSIS_TOP, 'stem_cell_research_300') 6 | PMC_TEXT_FILE = Path(DOCANALYSIS_TOP, 'resources', 'test_pmc.txt') 7 | print(PMC_TEXT_FILE) --------------------------------------------------------------------------------
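
A minimal usage sketch (illustrative only, not a repository file) of how the XmlLib and DataTable classes in docanalysis/xml_lib.py compose: XmlLib splits a JATS fulltext.xml into per-element section files, and DataTable renders tabular results as a jQuery DataTables HTML page. The input path and the example row values are placeholder assumptions.

# illustrative sketch only -- paths and row values are placeholder assumptions
from docanalysis.xml_lib import XmlLib, DataTable

# 1. split a JATS fulltext.xml (e.g. one downloaded by pygetpapers) into section files
xml_lib = XmlLib()
xml_lib.read("PMC0000000/fulltext.xml")   # placeholder path to a downloaded article
xml_lib.make_sections("sections")         # writes <article_dir>/sections/... to disk

# 2. render some extracted results as an HTML data table
table = DataTable("entities",
                  colheads=["file", "entity", "label"],
                  rowdata=[["1_p.xml", "King Khalid University", "ORG"]])
table.write_full_data_tables("temp")      # writes temp/full_data_table.html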