├── .gitignore ├── .idea ├── .gitignore ├── docanalysis.iml ├── inspectionProfiles │ └── profiles_settings.xml ├── modules.xml └── vcs.xml ├── LICENSE ├── README.md ├── __init__.py ├── config.ini ├── dictionary ├── abb.xml ├── ack_key_phrases_manual.txt ├── ack_key_phrases_manual │ ├── ack_key_phrases.md │ ├── ack_key_phrases_manual.xml │ └── approval_number.xml ├── acknowledgment_feature_names.xml ├── chap4_wikitest_2.xml ├── cities_dictionary │ └── cities.xml ├── consent_type.txt ├── consent_type │ └── consent_type.xml ├── ethics_committee_key_phrases.txt ├── ethics_committee_key_phrases │ └── ethics_committee_key_phrases.xml ├── ethics_key_phrases.txt ├── ethics_key_phrases │ └── ethics_key_phrases.xml ├── features_ack.txt ├── features_ack │ ├── acknowledgment_feature_names.xml │ └── features_ack.xml ├── invasion_biology │ ├── invasion_hypotheses.xml │ └── invasion_hypothesis.txt ├── ipcc.xml ├── methods_key_phrases.txt ├── methods_key_phrases │ └── methods_key_phrases.xml ├── software.xml └── test_terpene.xml ├── docanalysis ├── .DS_Store ├── __init__.py ├── ami_sections.py ├── config │ ├── default_dicts.json │ └── default_sections.json ├── convert_file.py ├── docanalysis.py ├── entity_extraction.py ├── file_lib.py ├── get_html.py ├── glob_trail.py ├── gui.py ├── gui │ ├── css │ │ └── main.css │ ├── eel.js │ └── main.html └── xml_lib.py ├── docs ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── conf.py │ ├── docanalysis.rst │ ├── entity_extraction.rst │ └── index.rst ├── notebooks ├── README.md └── c_project.ipynb ├── requirements.txt ├── resources ├── approval_number_100.csv ├── demo.py ├── docanalyis_architecture_diagram.PNG ├── docanalysis_demo.ipynb ├── entities_country.csv ├── ethics_statement_corpus_1000.csv ├── fig_ent.xml ├── oil186.csv ├── oil186_20210712.csv ├── oil186_ack.csv ├── pmr_demo.py ├── software_mentions.csv ├── stem_cell_research_300.csv ├── stem_cell_research_300_2020.csv ├── stem_cell_research_300_ethics.csv ├── terpene_fig_entities.csv └── test_pmc.txt ├── setup.py ├── software_papers.ipynb └── tests ├── test_docanalysis_cli.py ├── test_docanalysis_method.py └── testing_test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # temporary results 132 | temp/ 133 | 134 | #corpus 135 | oil186/ 136 | corpus/ 137 | stem_cell_research_300/ 138 | stem_cell_research_300_2020 139 | GPE.text 140 | ORG.text 141 | 142 | #vscode 143 | .vscode/ -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | -------------------------------------------------------------------------------- /.idea/docanalysis.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 14 | 15 | 17 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | For updated tutorial, please check the [Wiki](https://github.com/petermr/docanalysis/wiki/docanalysis-Tutorial) page. 2 | ## docanalysis 3 | `docanalysis` is a Command Line Tool that ingests corpora [(CProjects)](https://github.com/petermr/tigr2ess/blob/master/getpapers/TUTORIAL.md#cproject-and-ctrees) and carries out text-analysis of documents, including 4 | - sectioning 5 | - NLP/text-mining 6 | - dictionary generation 7 | 8 | Besides the bespoke code, it uses [NLTK](https://www.nltk.org/) and other Python tools for many operations, and [spaCy](https://spacy.io/) or [scispaCy](https://allenai.github.io/scispacy/) for extraction and annotation of entities. Outputs summary data and word-dictionaries. 9 | 10 | ### Set up `venv` 11 | We recommend you create a virtual environment (`venv`) before installing `docanalysis` and that you activate the `venv` before each time you run `docanalysis`. 12 | 13 | #### Windows 14 | Creating a `venv` 15 | ``` 16 | >> mkdir docanalysis_demo 17 | >> cd docanalysis_demo 18 | >> python -m venv venv 19 | ``` 20 | 21 | Activating `venv` 22 | ``` 23 | >> venv\Scripts\activate.bat 24 | ``` 25 | 26 | #### MacOS 27 | Creating a `venv` 28 | ``` 29 | >> mkdir docanalysis_demo 30 | >> cd docanalysis_demo 31 | >> python3 -m venv venv 32 | ``` 33 | 34 | Activating `venv` 35 | ``` 36 | >> source venv/bin/activate 37 | ``` 38 | 39 | Refer the [official documentation](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) for more help. 40 | 41 | ### Install `docanalysis` 42 | You can download `docanalysis` from PYPI. 43 | ``` 44 | pip install docanalysis 45 | ``` 46 | If you are on a Mac 47 | ``` 48 | pip3 install docanalysis 49 | ``` 50 | 51 | Download python from: [https://www.python.org/downloads/](https://www.python.org/downloads/) and select the option `Add Python to Path while installing`. Make sure `pip` is installed along with python. Check out [https://pip.pypa.io/en/stable/installation/](https://pip.pypa.io/en/stable/installation/) if you have difficulties installing pip. 52 | 53 | ### Run `docanalysis` 54 | `docanalysis --help` should list the flags we support and their use. 
55 | 56 | ``` 57 | usage: docanalysis.py [-h] [--run_pygetpapers] [--make_section] [-q QUERY] [-k HITS] [--project_name PROJECT_NAME] [-d DICTIONARY] [-o OUTPUT] 58 | [--make_ami_dict MAKE_AMI_DICT] [--search_section [SEARCH_SECTION [SEARCH_SECTION ...]]] [--entities [ENTITIES [ENTITIES ...]]] 59 | [--spacy_model SPACY_MODEL] [--html HTML] [--synonyms SYNONYMS] [--make_json MAKE_JSON] [--search_html] [--extract_abb EXTRACT_ABB] 60 | [-l LOGLEVEL] [-f LOGFILE] 61 | 62 | Welcome to docanalysis version 0.1.3. -h or --help for help 63 | 64 | optional arguments: 65 | -h, --help show this help message and exit 66 | --run_pygetpapers [Command] downloads papers from EuropePMC via pygetpapers 67 | --make_section [Command] makes sections; requires a fulltext.xml in CTree directories 68 | -q QUERY, --query QUERY 69 | [pygetpapers] query string 70 | -k HITS, --hits HITS [pygetpapers] number of papers to download 71 | --project_name PROJECT_NAME 72 | CProject directory name 73 | -d DICTIONARY, --dictionary DICTIONARY 74 | [file name/url] existing ami dictionary to annotate sentences or support supervised entity extraction 75 | -o OUTPUT, --output OUTPUT 76 | outputs csv with sentences/terms 77 | --make_ami_dict MAKE_AMI_DICT 78 | [Command] title for ami-dict. Makes ami-dict of all extracted entities; works only with spacy 79 | --search_section [SEARCH_SECTION [SEARCH_SECTION ...]] 80 | [NER/dictionary search] section(s) to annotate. Choose from: ALL, ACK, AFF, AUT, CON, DIS, ETH, FIG, INT, KEY, MET, RES, TAB, TIL. Defaults to 81 | ALL 82 | --entities [ENTITIES [ENTITIES ...]] 83 | [NER] entities to extract. Default (ALL). Common entities SpaCy: GPE, LANGUAGE, ORG, PERSON (for additional ones check: ); SciSpaCy: CHEMICAL, 84 | DISEASE 85 | --spacy_model SPACY_MODEL 86 | [NER] optional. Choose between spacy or scispacy models. Defaults to spacy 87 | --html HTML outputs html with sentences/terms 88 | --synonyms SYNONYMS annotate the corpus/sections with synonyms from ami-dict 89 | --make_json MAKE_JSON 90 | outputs json with sentences/terms 91 | --search_html searches html documents (mainly IPCC) 92 | --extract_abb EXTRACT_ABB 93 | [Command] title for abb-ami-dict. Extracts abbreviations and expansions; makes ami-dict of all extracted entities 94 | -l LOGLEVEL, --loglevel LOGLEVEL 95 | provide logging level. 
Example --log warning <>, default='info' 96 | -f LOGFILE, --logfile LOGFILE 97 | saves log to specified file in output directory as well as printing to terminal 98 | ``` 99 | 100 | #### Download papers from [EPMC](https://europepmc.org/) via `pygetpapers` 101 | COMMAND 102 | ``` 103 | docanalysis --run_pygetpapers -q "terpene" -k 10 --project_name terpene_10 104 | ``` 105 | LOGS 106 | ``` 107 | INFO: making project/searching terpene for 10 hits into C:\Users\shweata\docanalysis\terpene_10 108 | INFO: Total Hits are 13935 109 | 1it [00:00, 936.44it/s] 110 | INFO: Saving XML files to C:\Users\shweata\docanalysis\terpene_10\*\fulltext.xml 111 | 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:30<00:00, 3.10s/it] 112 | ``` 113 | 114 | CPROJ 115 | ``` 116 | C:\USERS\SHWEATA\DOCANALYSIS\TERPENE_10 117 | │ eupmc_results.json 118 | │ 119 | ├───PMC8625850 120 | │ eupmc_result.json 121 | │ fulltext.xml 122 | │ 123 | ├───PMC8727598 124 | │ eupmc_result.json 125 | │ fulltext.xml 126 | │ 127 | ├───PMC8747377 128 | │ eupmc_result.json 129 | │ fulltext.xml 130 | │ 131 | ├───PMC8771452 132 | │ eupmc_result.json 133 | │ fulltext.xml 134 | │ 135 | ├───PMC8775117 136 | │ eupmc_result.json 137 | │ fulltext.xml 138 | │ 139 | ├───PMC8801761 140 | │ eupmc_result.json 141 | │ fulltext.xml 142 | │ 143 | ├───PMC8831285 144 | │ eupmc_result.json 145 | │ fulltext.xml 146 | │ 147 | ├───PMC8839294 148 | │ eupmc_result.json 149 | │ fulltext.xml 150 | │ 151 | ├───PMC8840323 152 | │ eupmc_result.json 153 | │ fulltext.xml 154 | │ 155 | └───PMC8879232 156 | eupmc_result.json 157 | fulltext.xml 158 | ``` 159 | 160 | #### Section the papers 161 | COMMAND 162 | ``` 163 | docanalysis --project_name terpene_10 --make_section 164 | ``` 165 | LOGS 166 | ``` 167 | WARNING: Making sections in /content/terpene_10/PMC9095633/fulltext.xml 168 | INFO: dict_keys: dict_keys(['abstract', 'acknowledge', 'affiliation', 'author', 'conclusion', 'discussion', 'ethics', 'fig_caption', 'front', 'introduction', 'jrnl_title', 'keyword', 'method', 'octree', 'pdfimage', 'pub_date', 'publisher', 'reference', 'results_discuss', 'search_results', 'sections', 'svg', 'table', 'title']) 169 | WARNING: loading templates.json 170 | INFO: wrote XML sections for /content/terpene_10/PMC9095633/fulltext.xml /content/terpene_10/PMC9095633/sections 171 | WARNING: Making sections in /content/terpene_10/PMC9120863/fulltext.xml 172 | INFO: wrote XML sections for /content/terpene_10/PMC9120863/fulltext.xml /content/terpene_10/PMC9120863/sections 173 | WARNING: Making sections in /content/terpene_10/PMC8982386/fulltext.xml 174 | INFO: wrote XML sections for /content/terpene_10/PMC8982386/fulltext.xml /content/terpene_10/PMC8982386/sections 175 | WARNING: Making sections in /content/terpene_10/PMC9069239/fulltext.xml 176 | INFO: wrote XML sections for /content/terpene_10/PMC9069239/fulltext.xml /content/terpene_10/PMC9069239/sections 177 | WARNING: Making sections in /content/terpene_10/PMC9165828/fulltext.xml 178 | INFO: wrote XML sections for /content/terpene_10/PMC9165828/fulltext.xml /content/terpene_10/PMC9165828/sections 179 | WARNING: Making sections in /content/terpene_10/PMC9119530/fulltext.xml 180 | INFO: wrote XML sections for /content/terpene_10/PMC9119530/fulltext.xml /content/terpene_10/PMC9119530/sections 181 | WARNING: Making sections in /content/terpene_10/PMC8982077/fulltext.xml 182 | INFO: wrote XML sections for /content/terpene_10/PMC8982077/fulltext.xml 
/content/terpene_10/PMC8982077/sections 183 | WARNING: Making sections in /content/terpene_10/PMC9067962/fulltext.xml 184 | INFO: wrote XML sections for /content/terpene_10/PMC9067962/fulltext.xml /content/terpene_10/PMC9067962/sections 185 | WARNING: Making sections in /content/terpene_10/PMC9154778/fulltext.xml 186 | INFO: wrote XML sections for /content/terpene_10/PMC9154778/fulltext.xml /content/terpene_10/PMC9154778/sections 187 | WARNING: Making sections in /content/terpene_10/PMC9164016/fulltext.xml 188 | INFO: wrote XML sections for /content/terpene_10/PMC9164016/fulltext.xml /content/terpene_10/PMC9164016/sections 189 | 47% 1056/2258 [00:01<00:01, 1003.31it/s]ERROR: cannot parse /content/terpene_10/PMC9165828/sections/1_front/1_article-meta/26_custom-meta-group/0_custom-meta/1_meta-value/0_xref.xml 190 | 67% 1516/2258 [00:01<00:00, 1047.68it/s]ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/7_xref.xml 191 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/14_email.xml 192 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/3_xref.xml 193 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/6_xref.xml 194 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/9_email.xml 195 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/10_email.xml 196 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/4_xref.xml 197 | ... 198 | 100% 2258/2258 [00:02<00:00, 949.43it/s] 199 | ``` 200 | 201 | CTREE 202 | ``` 203 | ├───PMC8625850 204 | │ └───sections 205 | │ ├───0_processing-meta 206 | │ ├───1_front 207 | │ │ ├───0_journal-meta 208 | │ │ └───1_article-meta 209 | │ ├───2_body 210 | │ │ ├───0_1._introduction 211 | │ │ ├───1_2._materials_and_methods 212 | │ │ │ ├───1_2.1._materials 213 | │ │ │ ├───2_2.2._bacterial_strains 214 | │ │ │ ├───3_2.3._preparation_and_character 215 | │ │ │ ├───4_2.4._evaluation_of_the_effect_ 216 | │ │ │ ├───5_2.5._time-kill_studies 217 | │ │ │ ├───6_2.6._propidium_iodide_uptake-e 218 | │ │ │ └───7_2.7._hemolysis_test_from_human 219 | │ │ ├───2_3._results 220 | │ │ │ ├───1_3.1._encapsulation_of_terpene_ 221 | │ │ │ ├───2_3.2._both_terpene_alcohol-load 222 | │ │ │ ├───3_3.3._farnesol_and_geraniol-loa 223 | │ │ │ └───4_3.4._farnesol_and_geraniol-loa 224 | │ │ ├───3_4._discussion 225 | │ │ ├───4_5._conclusions 226 | │ │ └───5_6._patents 227 | │ ├───3_back 228 | │ │ ├───0_ack 229 | │ │ ├───1_fn-group 230 | │ │ │ └───0_fn 231 | │ │ ├───2_app-group 232 | │ │ │ └───0_app 233 | │ │ │ └───2_supplementary-material 234 | │ │ │ └───0_media 235 | │ │ └───9_ref-list 236 | │ └───4_floats-group 237 | │ ├───4_table-wrap 238 | │ ├───5_table-wrap 239 | │ ├───6_table-wrap 240 | │ │ └───4_table-wrap-foot 241 | │ │ └───0_fn 242 | │ ├───7_table-wrap 243 | │ └───8_table-wrap 244 | ... 245 | ``` 246 | ##### Search sections using dictionary 247 | COMMAND 248 | ``` 249 | docanalysis --project_name terpene_10 --output entities.csv --make_ami_dict entities.xml 250 | ``` 251 | LOGS 252 | ``` 253 | INFO: Found 7134 sentences in the section(s). 
254 | INFO: getting terms from /content/activity.xml 255 | 100% 7134/7134 [00:02<00:00, 3172.14it/s] 256 | /usr/local/lib/python3.7/dist-packages/docanalysis/entity_extraction.py:352: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True. 257 | "[", "").str.replace("]", "") 258 | INFO: wrote output to /content/terpene_10/activity.csv 259 | ``` 260 | 261 | #### Extract entities 262 | We use `spacy` to extract Named Entites. Here's the list of Entities it supports:CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW,LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART 263 | INPUT 264 | ``` 265 | docanalysis --project_name terpene_10 --make_section --spacy_model spacy --entities ORG --output org.csv 266 | ``` 267 | LOGS 268 | ``` 269 | INFO: Found 7134 sentences in the section(s). 270 | INFO: Loading spacy 271 | 100% 7134/7134 [01:08<00:00, 104.16it/s] 272 | /usr/local/lib/python3.7/dist-packages/docanalysis/entity_extraction.py:352: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True. 273 | "[", "").str.replace("]", "") 274 | INFO: wrote output to /content/terpene_10/org.csv 275 | ``` 276 | ##### Extract information from specific section(s) 277 | You can choose to extract entities from specific sections 278 | 279 | COMMAND 280 | ``` 281 | docanalysis --project_name terpene_10 --make_section --spacy_model spacy --search_section AUT, AFF --entities ORG --output org_aut_aff.csv 282 | ``` 283 | LOG 284 | ``` 285 | INFO: Found 28 sentences in the section(s). 286 | INFO: Loading spacy 287 | 100% 28/28 [00:00<00:00, 106.66it/s] 288 | /usr/local/lib/python3.7/dist-packages/docanalysis/entity_extraction.py:352: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True. 289 | "[", "").str.replace("]", "") 290 | INFO: wrote output to /content/terpene_10/org_aut_aff.csv 291 | ``` 292 | #### Create dictionary of extracted entities 293 | COMMAND 294 | ``` 295 | docanalysis --project_name terpene_10 --make_section --spacy_model spacy --search_section AUT, AFF --entities ORG --output org_aut_aff.csvv --make_ami_dict org 296 | ``` 297 | LOG 298 | ``` 299 | INFO: Found 28 sentences in the section(s). 300 | INFO: Loading spacy 301 | 100% 28/28 [00:00<00:00, 96.56it/s] 302 | /usr/local/lib/python3.7/dist-packages/docanalysis/entity_extraction.py:352: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True. 
303 | "[", "").str.replace("]", "") 304 | INFO: wrote output to /content/terpene_10/org_aut_aff.csvv 305 | INFO: Wrote all the entities extracted to ami dict 306 | ``` 307 | 308 | Snippet of the dictionary 309 | ``` 310 | 311 | - dictionary title="/content/terpene_10/org.xml"> 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | ``` 323 | 324 | ### Extract Abbreviations 325 | 326 | ``` 327 | docanalysis --project_name corpus\ethics_10 --output dict_search_5.csv --make_json dict_search_5.json --make_ami_dict entities --extract_abb ethics_abb 328 | ``` 329 | 330 | `--extract_abb` extracts all abbreviations and make an ami-dictionary of abbreviations and its expansion. 331 | 332 | EXAMPLE DICTIONARY: 333 | ``` 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | 345 | 346 | 347 | 348 | 349 | 350 | 351 | 352 | 353 | 354 | 355 | ``` 356 | 357 | ### Search HTML 358 | If you working with HTML files (IPCC Reports, for example) and not XMLs in CProjects, you can use `--search_html` flag. 359 | 360 | ``` 361 | docanalysis --project_name corpus\ipcc_sectioned --extract_abb ethics_abb --search_html 362 | ``` 363 | 364 | Make sure that your `html` sections is in `sections` folder. Here's an example structure: 365 | 366 | ``` 367 | C:. 368 | | dict_search_2.csv 369 | | dict_search_2.json 370 | | 371 | \---chap4 372 | | chapter_4 373 | | 374 | \---sections 375 | 4.1.html 376 | 4.2.1.html 377 | 4.2.2.html 378 | 4.2.3.html 379 | 4.2.4.html 380 | 4.2.5.html 381 | 4.2.7.html 382 | 4.2.html 383 | 4.3.1.html 384 | 4.3.2.html 385 | 4.3.html 386 | 4.4.1.html 387 | 4.4.2.html 388 | 4.4.html 389 | 4.5.html 390 | executive_summary.html 391 | frequently_asked_questions.html 392 | table_of_contents.html 393 | ``` 394 | If you haven't sectioned your `html`, please use `py4ami` to section it. 395 | #### What is a dictionary 396 | Dictionary, in `ami`'s terminology, a set of terms/phrases in XML format. 397 | Dictionaries related to ethics and acknowledgments are available in [Ethics Dictionary](https://github.com/petermr/docanalysis/tree/main/ethics_dictionary) folder 398 | 399 | If you'd like to create a custom dictionary, you can find the steps, [here](https://github.com/petermr/tigr2ess/blob/master/dictionaries/TUTORIAL.md) 400 | 401 | ``` 402 | ### Python tools used 403 | - [`pygetpapers`](https://github.com/petermr/pygetpapers) - scrape open repositories to download papers of interest 404 | - [nltk](https://www.nltk.org/) - splits sentences 405 | - [spaCy](https://spacy.io/) and [SciSpaCy](https://allenai.github.io/scispacy/) 406 | - recognize Named-Entities and label them 407 | - Here's the list of NER labels [SpaCy's English model](https://spacy.io/models/en) provides: 408 | `CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART` 409 | 410 | 411 | ### Credits: 412 | - [Ayush Garg](https://github.com/ayush4921) 413 | - [Shweata N. 
Hegde](https://github.com/ShweataNHegde/) 414 | - [Daniel Mietchen](https://github.com/Daniel-Mietchen) 415 | - [Peter Murray-Rust](https://github.com/petermr) 416 | 417 | 418 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petermr/docanalysis/eb8a3c13f9491b41f252a363953a976406f964b6/__init__.py -------------------------------------------------------------------------------- /config.ini: -------------------------------------------------------------------------------- 1 | [ethics_statement] 2 | version=0.0.0.1 -------------------------------------------------------------------------------- /dictionary/abb.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /dictionary/ack_key_phrases_manual.txt: -------------------------------------------------------------------------------- 1 | conflict of interest 2 | financial support -------------------------------------------------------------------------------- /dictionary/ack_key_phrases_manual/ack_key_phrases.md: -------------------------------------------------------------------------------- 1 | The terms in the dictionary were created manually by Chaitanya. 2 | -------------------------------------------------------------------------------- /dictionary/ack_key_phrases_manual/ack_key_phrases_manual.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /dictionary/ack_key_phrases_manual/approval_number.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /dictionary/acknowledgment_feature_names.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /dictionary/chap4_wikitest_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /dictionary/consent_type.txt: 
-------------------------------------------------------------------------------- 1 | video consent 2 | informed consent 3 | written informed consent 4 | verbal consent 5 | voluntary consent 6 | competent consent -------------------------------------------------------------------------------- /dictionary/consent_type/consent_type.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /dictionary/ethics_committee_key_phrases.txt: -------------------------------------------------------------------------------- 1 | Ethics Committee of 2 | Ethical Committee of 3 | Institutional Review Board of 4 | the IRB of 5 | Institutional Animal Care and Use Committee of 6 | Animal Care and Use Committee of 7 | IACUC of -------------------------------------------------------------------------------- /dictionary/ethics_committee_key_phrases/ethics_committee_key_phrases.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /dictionary/ethics_key_phrases.txt: -------------------------------------------------------------------------------- 1 | animal study was reviewed and approved by 2 | protocol was approved by 3 | studies involving human participants were reviewed and approved by 4 | approved by 5 | informed and written consent 6 | written consent 7 | informed consent 8 | Ethics Committee 9 | legal guardian 10 | next of kin 11 | national legislation 12 | ethics committees 13 | ethics committees 14 | Ethics Committees 15 | appropriate approval 16 | written informed consent 17 | Principle of Laboratory Animal Care 18 | ethics guidelines 19 | Experimental protocols were approved by 20 | ethical clearance 21 | Ethical approval was authorized through 22 | Animal Ethics Committee 23 | Declaration of Helsinki 24 | principles of Good Clinical Practice 25 | Good Clinical Practice 26 | approved by the Institutional Review Board 27 | International Conference on Harmonisation Good Clinical Practice guidelines 28 | approved the study 29 | local ethical committees 30 | International Council for Harmonisation of Technical Requirements for Pharmaceuticals for Human Use 31 | -------------------------------------------------------------------------------- /dictionary/ethics_key_phrases/ethics_key_phrases.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /dictionary/features_ack.txt: -------------------------------------------------------------------------------- 1 | authors gratefully acknowledge 2 | authors sincerely thank 3 | authors thank 4 | special thanks 5 | for providing 6 | author would like 7 | providing constant support 8 | grateful 9 | authors acknowledge 10 | financial contribution 11 | research support 12 | financial support 13 | also thank 14 | scientific support 15 | helpful suggestions 16 | technical support 17 | support provided 18 | sincere thanks 19 | authors express 20 | authors extend 21 | kind support 22 | facilities provided 23 | providing facilities 24 | appreciated 25 | necessary facilities 26 | 
provided funding 27 | technical assistance 28 | fellowship 29 | funds 30 | contribution 31 | helpful comments 32 | reliable care 33 | valuable comments -------------------------------------------------------------------------------- /dictionary/features_ack/acknowledgment_feature_names.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | -------------------------------------------------------------------------------- /dictionary/features_ack/features_ack.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /dictionary/invasion_biology/invasion_hypotheses.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /dictionary/invasion_biology/invasion_hypothesis.txt: -------------------------------------------------------------------------------- 1 | Biotic Resistance Hypothesis 2 | Enemy Release Hypothesis 3 | Propagule Pressure Hypothesis 4 | Synthesizing Invasion Hypotheses 5 | Tens Rule 6 | novel weapons hypothesis 7 | Darwin's naturalization and limiting similarity hypotheses 8 | Phenotypic plasticity hypothesis 9 | Evolution of increased competitive ability and shifting defence hypotheses 10 | Invasional meltdown hypothesis 11 | Disturbance hypothesis 12 | -------------------------------------------------------------------------------- /dictionary/ipcc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /dictionary/methods_key_phrases.txt: -------------------------------------------------------------------------------- 1 | flowering season 2 | harvesting season 3 | blossoming season 4 | collected from 5 | collected in 6 | were collected in a blossoming period 7 | oil sample was obtained from 8 | sample obtained from 9 | flowering period 10 | harvesting period 11 | blossoming period 12 | seeds were sampled from 13 | were purchased from local markets 14 | -------------------------------------------------------------------------------- /dictionary/methods_key_phrases/methods_key_phrases.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /dictionary/software.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /dictionary/test_terpene.xml: -------------------------------------------------------------------------------- 1 | 2 | Created from SPARQL query 3 | 4 | D-limonene(+) 5 | (R)-Limonene(+) 6 | (4R)-4-isopropenyl-1-methylcyclohexene 7 | (R)-4-isopropenyl-1-methyl-1-cyclohexene 8 | (R)-(+)-limonene 9 | D-(+)-limonene 
10 | (4R)-1-methyl-4-isopropenylcyclohex-1-ene 11 | (4R)-limonene(+) 12 | (4R)-Limonene 13 | D-Limonen 14 | (+)-4-isopropenyl-1-methylcyclohexene 15 | (R)-p-mentha-1,8-diene 16 | (R)(+)-p-mentha-1,8-diene 17 | (+)-limonene 18 | (1R)-(+)-α-pinene 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /docanalysis/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petermr/docanalysis/eb8a3c13f9491b41f252a363953a976406f964b6/docanalysis/.DS_Store -------------------------------------------------------------------------------- /docanalysis/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | docanalysis module 3 | """ 4 | from docanalysis.entity_extraction import EntityExtraction 5 | from docanalysis.docanalysis import Docanalysis 6 | 7 | __author__ = "Ayush Garg", "Shweata N. Hegde" 8 | __email__ = "ayush@science.org.in", "shweata.hegde@gmail.com" -------------------------------------------------------------------------------- /docanalysis/ami_sections.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from pathlib import Path 3 | import logging 4 | from lxml import etree as LXET 5 | 6 | from docanalysis.xml_lib import XmlLib 7 | 8 | 9 | class AMIAbsSection(ABC): 10 | """ """ 11 | logger = logging.getLogger("ami_abs_section") 12 | 13 | SECTIONS = "sections" 14 | 15 | def __init__(self) -> None: 16 | pass 17 | 18 | 19 | @classmethod 20 | def make_xml_sections(cls, file, outdir: str, force: bool) -> None: 21 | """make sections 22 | 23 | :param file: 24 | :param outdir: str: 25 | :param force: bool: 26 | 27 | """ 28 | if file is None or outdir is None: 29 | return None 30 | path = Path(file) 31 | if not path.exists(): 32 | cls.logger.warning(f"file {file} does not exist") 33 | return 34 | # sections = Path(self.dirx) 35 | if force or not Path(outdir).exists(): 36 | cls.logger.warning(f"Making sections in {str(path)}") 37 | xml_libx = XmlLib() 38 | xml_libx.logger.setLevel(logging.DEBUG) 39 | xml_libx.read(file) 40 | xml_libx.make_sections(outdir) 41 | 42 | 43 | class AMIFigure(AMIAbsSection): 44 | """holds data on figure captions and hopefully later pointers to pdfimages 45 | 46 | Figures are a mess in JATS. They can be held in different places and often not linked 47 | to the bitmap. This class will include heuristics for uniting and standardising this. 48 | 49 | JATS encoding depends on the publisher. Typically: 50 | 52 | 53 | 54 | XPS core spectra comparison for aged baseline and SEB-3 electrodes. 55 |

</title> <p>The graphite and NCM622 electrodes are taken from the baseline cell after 956 cycles and 56 | the SEB-3 cell after 4021 cycles.</p> 57 | </caption> 58 | 59 | </fig>' 60 | 61 | There are sometimes 2 or more <p> children
as children of caption. 62 | 63 | 64 | """ 65 | 66 | # JATS tags 67 | LABEL = "label_xml" 68 | CAPTION = "caption" 69 | P = "p" 70 | TITLE = "title" 71 | 72 | def __init__(self): 73 | super().__init__() 74 | self.root = None 75 | self.root_str = None 76 | self.label_xml = None 77 | self.label_text = None 78 | self.caption = None 79 | self.caption_p = None 80 | self.p_text = None 81 | self.caption_title = None 82 | self.title_text = None 83 | 84 | @classmethod 85 | def create_from_jats(cls, xml_path): 86 | """ 87 | 88 | :param xml_path: 89 | 90 | """ 91 | ami_figure = AMIFigure() 92 | ami_figure.root = XmlLib.parse_xml_file_to_root(str(xml_path)) 93 | ami_figure.add_figure_structure() 94 | return ami_figure 95 | 96 | def add_figure_structure(self): 97 | """creates label, caption, title, test(p) from JATS xml""" 98 | self.root_str = LXET.tostring(self.root) 99 | self.label_xml = XmlLib.get_or_create_child(self.root, self.LABEL) 100 | self.label_text = XmlLib.get_text(self.label_xml) 101 | self.caption = XmlLib.get_or_create_child(self.root, self.CAPTION) 102 | self.caption_p = XmlLib.get_or_create_child(self.caption, self.P) 103 | self.p_text = XmlLib.get_text(self.caption_p) 104 | self.caption_title = XmlLib.get_or_create_child(self.caption, self.TITLE) 105 | self.title_text = XmlLib.get_text(self.caption_title) 106 | 107 | def get_xml_str(self): 108 | """ """ 109 | return LXET.tostring(self.root) 110 | 111 | def __str__(self): 112 | s = f" --- {self.label_xml} ----\n" \ 113 | f"[{self.title_text}] \n" \ 114 | f" {self.p_text}" 115 | return s 116 | -------------------------------------------------------------------------------- /docanalysis/config/default_dicts.json: -------------------------------------------------------------------------------- 1 | { 2 | "EO_ACTIVITY": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/activity/eo_activity.xml", 3 | "EO_COMPOUND": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/compound/eo_compound.xml", 4 | "EO_ANALYSIS": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/analysis/eo_analysis_method.xml", 5 | "EO_EXTRACTION": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/extraction/eo_extraction.xml", 6 | "EO_PLANT": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/plant/eo_plant.xml", 7 | "PLANT_GENUS": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/plant_genus/plant_genus.xml", 8 | "EO_PLANT_PART": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/plant_part/plant_part.xml", 9 | "EO_TARGET": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/target/eo_target_organism.xml", 10 | "COUNTRY": "https://raw.githubusercontent.com/petermr/dictionary/main/openVirus20210120/country/country.xml", 11 | "DISEASE": "https://raw.githubusercontent.com/petermr/dictionary/main/openVirus20210120/disease/disease.xml", 12 | "ORGANIZATION": "https://raw.githubusercontent.com/petermr/dictionary/main/openVirus20210120/organization/organization.xml", 13 | "DRUG": "https://raw.githubusercontent.com/petermr/dictionary/main/openVirus20210120/drug/drug.xml", 14 | "TEST_TRACE": "https://raw.githubusercontent.com/petermr/dictionary/main/openVirus20210120/test_trace/test_trace.xml" 15 | } 16 | -------------------------------------------------------------------------------- /docanalysis/config/default_sections.json: -------------------------------------------------------------------------------- 1 | { 2 | "ABS":["*abstract.xml"], 3 | "ACK": 
["*ack.xml"], 4 | "AFF": ["*aff.xml"], 5 | "AUT": ["*contrib-group.xml"], 6 | "CON": ["*conclusion*/*.xml"], 7 | "DIS": ["*discussion*/**/*_title.xml", "*discussion*/**/*_p.xml"], 8 | "ETH": ["*ethic*/*.xml"], 9 | "FIG": ["*fig*.xml"], 10 | "INT": ["*introduction*/*.xml", "*background*/*.xml"], 11 | "KEY": ["*kwd-group.xml"], 12 | "MET": ["*method*/*.xml", "*material*/*.xml"] , 13 | "RES": ["*result*/*/*_title.xml", "*result*/*/*_p.xml"], 14 | "TAB": ["*table*.xml"], 15 | "TIL": ["*article-meta/*title-group.xml"], 16 | "HTML": ["*.html"] 17 | 18 | } -------------------------------------------------------------------------------- /docanalysis/convert_file.py: -------------------------------------------------------------------------------- 1 | import os 2 | from chardet import detect 3 | 4 | # get file encoding type 5 | 6 | 7 | def get_encoding_type(file): 8 | """ 9 | 10 | :param file: 11 | 12 | """ 13 | with open(file, 'rb') as f: 14 | rawdata = f.read() 15 | return detect(rawdata)['encoding'] 16 | 17 | 18 | from_codec = get_encoding_type('entity_extraction.py') 19 | 20 | # add try: except block for reliability 21 | try: 22 | with open('entity_extraction.py', 'r', encoding=from_codec) as f, open('entity_extraction2.py', 'w', encoding='utf-8') as e: 23 | text = f.read() # for small files, for big use chunks 24 | e.write(text) 25 | 26 | 27 | except UnicodeDecodeError: 28 | print('Decode Error') 29 | except UnicodeEncodeError: 30 | print('Encode Error') 31 | -------------------------------------------------------------------------------- /docanalysis/docanalysis.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import sys 4 | import configargparse 5 | import coloredlogs 6 | from time import gmtime, strftime 7 | from tqdm import tqdm 8 | from functools import partialmethod 9 | from docanalysis.entity_extraction import EntityExtraction 10 | 11 | 12 | class Docanalysis: 13 | 14 | def __init__(self): 15 | """This function makes all the constants""" 16 | self.entity_extraction = EntityExtraction() 17 | self.version = "0.3.0" 18 | 19 | def handle_logger_creation(self, args): 20 | """handles the logging on cml 21 | 22 | :param args: description] 23 | :type args: type] 24 | 25 | """ 26 | coloredlogs.install() 27 | levels = { 28 | "critical": logging.CRITICAL, 29 | "error": logging.ERROR, 30 | "warn": logging.WARNING, 31 | "warning": logging.WARNING, 32 | "info": logging.INFO, 33 | "debug": logging.DEBUG, 34 | } 35 | level = levels.get(args.loglevel.lower()) 36 | 37 | if level == logging.DEBUG: 38 | tqdm.__init__ = partialmethod(tqdm.__init__, disable=True) 39 | 40 | if args.logfile: 41 | self.handle_logfile(args, level) 42 | else: 43 | coloredlogs.install(level=level, fmt='%(levelname)s: %(message)s') 44 | 45 | def handlecli(self): 46 | """Handles the command line interface using argparse""" 47 | version = self.version 48 | 49 | default_path = strftime("%Y_%m_%d_%H_%M_%S", gmtime()) 50 | parser = configargparse.ArgParser( 51 | description=f"Welcome to docanalysis version {version}. 
-h or --help for help", 52 | add_config_file_help=False, 53 | ) 54 | parser.add_argument( 55 | "--run_pygetpapers", 56 | default=False, 57 | action="store_true", 58 | help="[Command] downloads papers from EuropePMC via pygetpapers", 59 | ) 60 | parser.add_argument( 61 | "--make_section", 62 | default=False, 63 | action="store_true", 64 | help="[Command] makes sections; requires a fulltext.xml in CTree directories", 65 | ) 66 | parser.add_argument( 67 | "-q", 68 | "--query", 69 | default=None, 70 | type=str, 71 | help="[pygetpapers] query string", 72 | ) 73 | parser.add_argument( 74 | "-k", 75 | "--hits", 76 | type=str, 77 | default=None, 78 | help="[pygetpapers] number of papers to download", 79 | ) 80 | 81 | parser.add_argument( 82 | "--project_name", 83 | type=str, 84 | help="CProject directory name", 85 | default=os.path.join(os.getcwd(), default_path), 86 | ) 87 | parser.add_argument( 88 | "-d", 89 | "--dictionary", 90 | default=[], 91 | type=str, 92 | nargs='*', 93 | help="[file name/url] existing ami dictionary to annotate sentences or support supervised entity extraction", 94 | ) 95 | parser.add_argument( 96 | "-o", 97 | "--output", 98 | default=False, 99 | help="outputs csv with sentences/terms", 100 | ) 101 | parser.add_argument( 102 | "--make_ami_dict", 103 | default=False, 104 | help="[Command] title for ami-dict. Makes ami-dict of all extracted entities; works only with spacy", 105 | ) 106 | parser.add_argument( 107 | "--search_section", 108 | default=['ALL'], 109 | action='store', 110 | dest='search_section', 111 | type=str, 112 | nargs='*', 113 | help="[NER/dictionary search] section(s) to annotate. Choose from: ALL, ACK, AFF, AUT, CON, DIS, ETH, FIG, INT, KEY, MET, RES, TAB, TIL. Defaults to ALL", 114 | ) 115 | 116 | parser.add_argument( 117 | "--entities", 118 | default=['ALL'], 119 | action='store', dest='entities', 120 | type=str, nargs='*', 121 | help="[NER] entities to extract. Default (ALL). Common entities " 122 | "SpaCy: GPE, LANGUAGE, ORG, PERSON (for additional ones check: ); " 123 | ) 124 | 125 | parser.add_argument( 126 | "--spacy_model", 127 | default=False, 128 | type=str, 129 | help="[NER] optional.", 130 | ) 131 | 132 | parser.add_argument( 133 | "--html", 134 | default=False, 135 | type=str, 136 | help="outputs html with sentences/terms", 137 | ) 138 | 139 | parser.add_argument( 140 | "--synonyms", 141 | default=False, 142 | type=str, 143 | help="annotate the corpus/sections with synonyms from ami-dict", 144 | ) 145 | parser.add_argument( 146 | "--make_json", 147 | default=False, 148 | type=str, 149 | help="outputs json with sentences/terms", 150 | ) 151 | parser.add_argument( 152 | "--search_html", 153 | default=False, 154 | action="store_true", 155 | help="searches html documents (mainly IPCC)", 156 | ) 157 | parser.add_argument( 158 | "--extract_abb", 159 | default=False, 160 | help="[Command] title for abb-ami-dict. Extracts abbreviations and expansions; makes ami-dict of all extracted entities" 161 | ) 162 | 163 | parser.add_argument( 164 | "-l", 165 | "--loglevel", 166 | default="info", 167 | help="provide logging level. 
" 168 | "Example --log warning <>, default='info'", 169 | ) 170 | 171 | parser.add_argument( 172 | "-f", 173 | "--logfile", 174 | default=False, 175 | type=str, 176 | help="saves log to specified file in output directory as well as printing to terminal", 177 | ) 178 | 179 | if len(sys.argv) == 1: 180 | parser.print_help(sys.stderr) 181 | sys.exit() 182 | args = parser.parse_args() 183 | for arg in vars(args): 184 | if vars(args)[arg] == "False": 185 | vars(args)[arg] = False 186 | self.handle_logger_creation(args) 187 | self.entity_extraction.extract_entities_from_papers(args.project_name, args.dictionary, search_sections=args.search_section, entities=args.entities, query=args.query, hits=args.hits, 188 | run_pygetpapers=args.run_pygetpapers, make_section=args.make_section, removefalse=True, 189 | csv_name=args.output, make_ami_dict=args.make_ami_dict, spacy_model=args.spacy_model, html_path=args.html, synonyms=args.synonyms, make_json=args.make_json, search_html=args.search_html, extract_abb=args.extract_abb) 190 | 191 | 192 | def main(): 193 | """Runs the CLI""" 194 | calldocanalysis = Docanalysis() 195 | calldocanalysis.handlecli() 196 | 197 | 198 | if __name__ == "__main__": 199 | main() 200 | -------------------------------------------------------------------------------- /docanalysis/entity_extraction.py: -------------------------------------------------------------------------------- 1 | from distutils.log import error 2 | import os 3 | import logging 4 | import requests 5 | from glob import glob 6 | import spacy 7 | from spacy import displacy 8 | from nltk import tokenize 9 | from spacy.matcher import PhraseMatcher 10 | import pandas as pd 11 | from bs4 import BeautifulSoup 12 | from tqdm import tqdm 13 | import xml.etree.ElementTree as ET 14 | from docanalysis.ami_sections import AMIAbsSection 15 | from pathlib import Path 16 | from pygetpapers import Pygetpapers 17 | from collections import Counter 18 | import pip 19 | import json 20 | import re 21 | from lxml import etree 22 | from pygetpapers.download_tools import DownloadTools 23 | from urllib.request import urlopen 24 | import nltk 25 | try: 26 | nltk.data.find('tokenizers/punkt') 27 | nltk.data.find('corpora/stopwords') 28 | except LookupError: 29 | nltk.download('punkt') 30 | nltk.download('stopwords') 31 | from nltk import tokenize 32 | 33 | 34 | def install(package): 35 | """ 36 | 37 | :param package: 38 | 39 | """ 40 | if hasattr(pip, 'main'): 41 | pip.main(['install', package]) 42 | else: 43 | pip._internal.main(['install', package]) 44 | 45 | 46 | try: 47 | from abbreviations import schwartz_hearst 48 | except ModuleNotFoundError: 49 | install('abbreviations') 50 | from abbreviations import schwartz_hearst 51 | 52 | 53 | #nlp_phrase = spacy.load("en_core_web_sm") 54 | 55 | CONFIG_SECTIONS = 'https://raw.githubusercontent.com/petermr/docanalysis/main/docanalysis/config/default_sections.json' 56 | CONFIG_AMI_DICT = 'https://raw.githubusercontent.com/petermr/docanalysis/main/docanalysis/config/default_dicts.json' 57 | 58 | 59 | class EntityExtraction: 60 | """EntityExtraction Class""" 61 | 62 | def __init__(self): 63 | logging.basicConfig(level=logging.INFO) 64 | self.sections = self.json_to_dict(CONFIG_SECTIONS) 65 | self.dict_of_ami_dict = self.json_to_dict(CONFIG_AMI_DICT) 66 | self.all_paragraphs = {} 67 | self.sentence_dictionary = {} 68 | self.spacy_model = 'spacy' 69 | self.nlp = None 70 | 71 | def download_spacy(self, spacy_type): 72 | """Download or load spacy 73 | 74 | :param spacy_type: "spacy 75 | :type 
spacy_type: string 76 | 77 | """ 78 | logging.info(f'Loading {spacy_type}') 79 | 80 | if spacy_type == "spacy": 81 | try: 82 | self.nlp = spacy.load('en_core_web_sm') 83 | except OSError: 84 | from spacy.cli import download 85 | download('en_core_web_sm') 86 | self.nlp = spacy.load('en_core_web_sm') 87 | 88 | def dictionary_to_html(self, html_path): 89 | """Converts dictionary to html 90 | 91 | :param html_path: path to save html 92 | :type html_path: string 93 | 94 | """ 95 | list_of_docs = [] 96 | for sentence in self.sentence_dictionary: 97 | list_of_docs.append(self.sentence_dictionary[sentence]['doc']) 98 | html = displacy.render(list_of_docs, style="ent", 99 | page=True, minify=True) 100 | logging.info(f"saving output: {html_path}") 101 | self._write_string_to_file(html, html_path) 102 | 103 | def extract_entities_from_papers(self, corpus_path, terms_xml_path, search_sections, entities, query=None, hits=30, 104 | run_pygetpapers=False, make_section=False, removefalse=True, 105 | csv_name=False, make_ami_dict=False, spacy_model=False, html_path=False, synonyms=False, make_json=False, search_html=False, extract_abb=False): 106 | """logic implementation (Q: how detailed should the description here be?) 107 | 108 | :param corpus_path: 109 | :param terms_xml_path: 110 | :param search_sections: 111 | :param entities: 112 | :param query: (Default value = None) 113 | :param hits: (Default value = 30) 114 | :param run_pygetpapers: (Default value = False) 115 | :param make_section: (Default value = False) 116 | :param removefalse: (Default value = True) 117 | :param csv_name: (Default value = False) 118 | :param make_ami_dict: (Default value = False) 119 | :param spacy_model: (Default value = False) 120 | :param html_path: (Default value = False) 121 | :param synonyms: (Default value = False) 122 | :param make_json: (Default value = False) 123 | :param search_html: (Default value = False) 124 | :param extract_abb: (Default value = False) 125 | 126 | """ 127 | 128 | self.spacy_model = spacy_model 129 | corpus_path = os.path.abspath(corpus_path) 130 | if run_pygetpapers: 131 | if not query: 132 | logging.warning( 133 | "please provide query (like 'terpene', 'essential oils') as parameter") 134 | return 135 | self.run_pygetpapers(query, hits, corpus_path) 136 | if os.path.isdir(corpus_path): 137 | if make_section: 138 | self.run_ami_section(corpus_path) 139 | else: 140 | logging.error("CProject doesn't exist") 141 | return 142 | if search_html: 143 | search_sections = ['HTML', ] 144 | if search_sections == ['ALL', ]: 145 | search_sections = self.sections.keys() 146 | if len(glob(os.path.join(corpus_path, '**', 'sections'))) > 0: 147 | self.all_paragraphs = self.get_glob_for_section( 148 | corpus_path, search_sections) 149 | else: 150 | logging.error('section papers using --make_sections before search') 151 | if spacy_model or csv_name or extract_abb or make_ami_dict: 152 | if search_html: 153 | self.make_dict_with_parsed_document(document_type='html') 154 | else: 155 | self.make_dict_with_parsed_document() 156 | if spacy_model: 157 | self.run_spacy_over_sections(self.sentence_dictionary, entities) 158 | self.remove_statements_not_having_xmldict_terms( 159 | dict_with_parsed_xml=self.sentence_dictionary, searching='entities') 160 | if terms_xml_path: 161 | for i in range(len(terms_xml_path)): 162 | compiled_terms = self.get_terms_from_ami_xml(terms_xml_path[i]) 163 | self.add_if_file_contains_terms( 164 | compiled_terms=compiled_terms, dict_with_parsed_xml=self.sentence_dictionary, 
searching=f'{i}') 165 | if removefalse: 166 | self.remove_statements_not_having_xmldict_terms( 167 | dict_with_parsed_xml=self.sentence_dictionary, searching=f'{i}') 168 | if synonyms: 169 | synonyms_list = self.get_synonyms_from_ami_xml(terms_xml_path) 170 | self.add_if_file_contains_terms( 171 | compiled_terms=synonyms_list, dict_with_parsed_xml=self.sentence_dictionary, searching='has_synonyms') 172 | if removefalse: 173 | self.remove_statements_not_having_xmldict_terms( 174 | dict_with_parsed_xml=self.sentence_dictionary) 175 | if html_path: 176 | self.dictionary_to_html( 177 | os.path.join(corpus_path, html_path)) 178 | if extract_abb: 179 | self.abbreviation_search_using_sw(self.sentence_dictionary) 180 | abb_ami_dict_path = os.path.join(corpus_path, extract_abb) 181 | self.make_ami_dict_from_abbreviation( 182 | extract_abb, self.sentence_dictionary, abb_ami_dict_path) 183 | if removefalse: 184 | self.remove_statements_not_having_xmldict_terms( 185 | dict_with_parsed_xml=self.sentence_dictionary, searching='abb') 186 | 187 | if csv_name: 188 | dict_with_parsed_xml_no_paragrph = self.remove_paragraph_form_parsed_xml_dict( 189 | self.sentence_dictionary, "paragraph") 190 | self.convert_dict_to_csv( 191 | path=os.path.join(corpus_path, f'{csv_name}'), dict_with_parsed_xml=dict_with_parsed_xml_no_paragrph) 192 | if make_json: 193 | dict_with_parsed_xml_no_paragrph = self.remove_paragraph_form_parsed_xml_dict( 194 | self.sentence_dictionary, "paragraph") 195 | self.convert_dict_to_json(path=os.path.join( 196 | corpus_path, f'{make_json}'), dict_with_parsed_xml=dict_with_parsed_xml_no_paragrph) 197 | if make_ami_dict: 198 | ami_dict_path = os.path.join(corpus_path, make_ami_dict) 199 | self.handle_ami_dict_creation( 200 | self.sentence_dictionary, make_ami_dict, ami_dict_path) 201 | 202 | return self.sentence_dictionary 203 | 204 | def run_pygetpapers(self, query, hits, output): 205 | """calls pygetpapers to query EPMC for papers; downloads specified number of papers 206 | 207 | :param query: query to pygetpapers/EPMC 208 | :type query: str 209 | :param hits: number of papers to download 210 | :type hits: int 211 | :param output: name of the folder 212 | :type output: str 213 | 214 | """ 215 | pygetpapers_call = Pygetpapers() 216 | pygetpapers_call.run_command( 217 | query=query, limit=hits, output=output, xml=True) 218 | logging.info(f"making CProject {output} with {hits} papers on {query}") 219 | 220 | def run_ami_section(self, path): 221 | """Creates sections folder for each paper (CTree); sections papers into front, body, back and floats based on JATS 222 | 223 | :param path: CProject path 224 | :type path: string 225 | 226 | """ 227 | file_list = glob(os.path.join( 228 | path, '**', 'fulltext.xml'), recursive=True) 229 | for paper in file_list: 230 | with open(paper, 'r') as xml_file: 231 | xml_string = xml_file.read() 232 | if len(xml_string) > 0: 233 | outdir = Path(Path(paper).parent, "sections") 234 | AMIAbsSection.make_xml_sections(paper, outdir, True) 235 | else: 236 | logging.warning(f"{paper} is empty") 237 | 238 | def get_glob_for_section(self, path, section_names): 239 | """globs for xml files in section folder of each CTree 240 | 241 | :param path: CProject path 242 | :type path: string 243 | :param section_names: one or more keys (section names) from CONFIG_SECTIONS 244 | :type section_names: string 245 | :returns: list of globs 246 | :rtype: list 247 | 248 | """ 249 | for section_name in section_names: 250 | if section_name in self.sections.keys(): 251 | 
self.all_paragraphs[section_name] = [] 252 | for section in self.sections[section_name]: 253 | self.all_paragraphs[section_name] += glob(os.path.join( 254 | path, '**', 'sections', '**', section), recursive=True) 255 | else: 256 | logging.error( 257 | "please make sure that you have selected only the supported sections: ACK, AFF, AUT, CON, DIS, ETH, FIG, INT, KEY, MET, RES, TAB, TIL") 258 | return self.all_paragraphs 259 | 260 | def make_dict_with_parsed_document(self, document_type="xml"): 261 | """creates dictionary with parsed xml or html 262 | 263 | :param document_type: type of file fed: xml or html. Defaults to "xml". 264 | :type document_type: str 265 | :returns: python dict containing parsed text from xml or html 266 | :rtype: dict 267 | 268 | """ 269 | 270 | self.sentence_dictionary = {} 271 | 272 | counter = 1 273 | for section in self.all_paragraphs: 274 | for section_path in tqdm(self.all_paragraphs[section]): 275 | paragraph_path = section_path 276 | if document_type == 'html': 277 | paragraph_text = self.read_text_from_html(paragraph_path) 278 | elif document_type == 'xml': 279 | paragraph_text = self.read_text_from_path(paragraph_path) 280 | sentences = tokenize.sent_tokenize(paragraph_text) 281 | for sentence in sentences: 282 | self.sentence_dictionary[counter] = {} 283 | self._make_dict_attributes( 284 | counter, section, section_path, paragraph_text, sentence) 285 | counter += 1 286 | logging.info( 287 | f"Found {len(self.sentence_dictionary)} sentences in the section(s).") 288 | return self.sentence_dictionary 289 | 290 | def _make_dict_attributes(self, counter, section, section_path, paragraph_text, sentence): 291 | """ 292 | 293 | :param counter: 294 | :param section: 295 | :param section_path: 296 | :param paragraph_text: 297 | :param sentence: 298 | 299 | """ 300 | dict_for_sentences = self.sentence_dictionary[counter] 301 | dict_for_sentences["file_path"] = section_path 302 | dict_for_sentences["paragraph"] = paragraph_text 303 | dict_for_sentences["sentence"] = sentence 304 | dict_for_sentences["section"] = section 305 | 306 | def read_text_from_path(self, paragraph_path): 307 | """uses ElementTree to read text from xml files 308 | 309 | :param paragraph_path: path to xml file 310 | :type paragraph_path: string 311 | :returns: raw text from xml 312 | :rtype: string 313 | 314 | """ 315 | try: 316 | tree = ET.parse(paragraph_path) 317 | root = tree.getroot() 318 | xmlstr = ET.tostring(root, encoding='utf8', method='xml') 319 | soup = BeautifulSoup(xmlstr, features='lxml') 320 | text = soup.get_text(separator=" ") 321 | paragraph_text = text.replace( 322 | '\n', ' ') 323 | except: 324 | paragraph_text = "empty" 325 | logging.error(f"cannot parse {paragraph_path}") 326 | return paragraph_text 327 | 328 | def read_text_from_html(self, paragraph_path): 329 | """uses beautifulsoup to read text from html files 330 | 331 | :param paragraph_path: path to html file 332 | :type paragraph_path: string 333 | :returns: raw text from html 334 | :rtype: string 335 | 336 | """ 337 | with open(paragraph_path, encoding="utf-8") as f: 338 | content = f.read() 339 | soup = BeautifulSoup(content, 'html.parser') 340 | return soup.text.replace('\n', ' ') 341 | 342 | def run_spacy_over_sections(self, dict_with_parsed_xml, entities_names): 343 | """uses spacy to extract specific Named-Entities from sentences in python dict 344 | 345 | :param dict_with_parsed_xml: main dict with sentences 346 | :type dict_with_parsed_xml: dict 347 | :param entities_names: list of kinds of Named-Entities that 
needs to be extacted 348 | :type entities_names: list 349 | 350 | """ 351 | self.download_spacy(self.spacy_model) 352 | for paragraph in tqdm(dict_with_parsed_xml): 353 | if len(dict_with_parsed_xml[paragraph]['sentence']) > 0: 354 | doc = self.nlp(dict_with_parsed_xml[paragraph]['sentence']) 355 | entities, labels, position_end, position_start, abbreviations, abbreviations_longform, abbreviation_start, abbreviation_end = self._make_required_lists() 356 | self._get_entities(entities_names, doc, entities, 357 | labels, position_end, position_start) 358 | self._add_lists_to_dict(dict_with_parsed_xml[paragraph], entities, labels, position_end, 359 | position_start, abbreviations, abbreviations_longform, abbreviation_start, abbreviation_end) 360 | 361 | def _get_entities(self, entities_names, doc, entities, labels, position_end, position_start): 362 | """ 363 | 364 | :param entities_names: 365 | :param doc: 366 | :param entities: 367 | :param labels: 368 | :param position_end: 369 | :param position_start: 370 | 371 | """ 372 | for ent in doc.ents: 373 | if (ent.label_ in entities_names) or (entities_names == ['ALL']): 374 | self._add_parsed_entities_to_lists( 375 | entities, labels, position_end, position_start, ent) 376 | 377 | def abbreviation_search_using_sw(self, dict_with_parsed_xml): 378 | """Extracts abbreviations from sentences using schwartz_hearst. Credit: Ananya Singha 379 | 380 | :param dict_with_parsed_xml: main python dictionary with sentences 381 | :type dict_with_parsed_xml: dict 382 | 383 | """ 384 | for text in dict_with_parsed_xml: 385 | dict_for_sentence = dict_with_parsed_xml[text] 386 | dict_for_sentence["abb"] = [] 387 | pairs = schwartz_hearst.extract_abbreviation_definition_pairs( 388 | doc_text=dict_for_sentence['sentence']) 389 | dict_for_sentence["abb"] = pairs 390 | self._make_list_from_dict(pairs) 391 | 392 | def make_abb_exp_list(self, result_dictionary): 393 | """make lists of abbreviations and expansions to input into xml dictionary creating method 394 | 395 | :param result_dictionary: main dictionary that contains sentences and abbreviation dict (abb and expansion) 396 | :type result_dictionary: dict 397 | :returns: all abbreviations 398 | :rtype: list 399 | 400 | """ 401 | list_of_name_lists = [] 402 | list_of_term_lists = [] 403 | for entry in result_dictionary: 404 | sentence_dictionary = result_dictionary[entry] 405 | if 'abb' in sentence_dictionary: 406 | pairs_dicts = (result_dictionary[entry]['abb']) 407 | name_list_for_every_dict, term_list_for_every_dict = self._make_list_from_dict( 408 | pairs_dicts) 409 | list_of_name_lists.append(name_list_for_every_dict) 410 | list_of_term_lists.append(term_list_for_every_dict) 411 | return self._list_of_lists_to_single_list(list_of_name_lists), self._list_of_lists_to_single_list(list_of_term_lists) 412 | 413 | def _make_list_from_dict(self, pairs): 414 | """ 415 | 416 | :param pairs: 417 | 418 | """ 419 | keys_list = [] 420 | values_list = [] 421 | keys_list.extend(pairs.keys()) 422 | values_list.extend(pairs.values()) 423 | return keys_list, values_list 424 | 425 | def _list_of_lists_to_single_list(self, list_of_lists): 426 | """ 427 | 428 | :param list_of_lists: 429 | 430 | """ 431 | return [item for sublist in list_of_lists for item in sublist] 432 | 433 | def make_ami_dict_from_abbreviation(self, title, result_dictionary, path): 434 | """create xml ami-dict containing abbreviations extracted from sentences 435 | 436 | :param title: title of xml ami-dict 437 | :type title: str 438 | :param result_dictionary: 
main dictionary with sentences and corresponding abbeviations 439 | :type result_dictionary: dict 440 | :param path: path where the xml ami-dict file would lie 441 | :type path: str 442 | 443 | """ 444 | name_list, term_list = self.make_abb_exp_list(result_dictionary) 445 | dictionary_element = etree.Element("dictionary") 446 | dictionary_element.attrib['title'] = title 447 | for name, term in tqdm(zip(name_list, term_list)): 448 | 449 | wiki_lookup_list = self.wiki_lookup(term) 450 | try: 451 | entry_element = etree.SubElement(dictionary_element, "entry") 452 | entry_element.attrib['name'] = name 453 | entry_element.attrib['term'] = term 454 | if len(wiki_lookup_list) == 0: 455 | entry_element.attrib['wikidataID'] = "" 456 | elif len(wiki_lookup_list) == 1: 457 | entry_element.attrib['wikidataID'] = ", ".join(wiki_lookup_list) 458 | else: 459 | raw_element = etree.SubElement(entry_element, 'raw') 460 | raw_element.attrib['wikidataID'] = ", ".join(wiki_lookup_list) 461 | except Exception as e: 462 | logging.error(f"Couldn't add {term} to amidict") 463 | xml_dict = self._etree_to_string(dictionary_element) 464 | self._write_string_to_file(xml_dict, f'{path}.xml') 465 | logging.info(f'wrote all abbreviations to ami dict {path}.xml') 466 | 467 | def _etree_to_string(self, dictionary_element): 468 | """ 469 | 470 | :param dictionary_element: 471 | 472 | """ 473 | xml_dict = etree.tostring( 474 | dictionary_element, pretty_print=True).decode('utf-8') 475 | return xml_dict 476 | 477 | def _get_abbreviations(self, doc, abbreviations, abbreviations_longform, abbreviation_start, abbreviation_end): 478 | """ 479 | 480 | :param doc: 481 | :param abbreviations: 482 | :param abbreviations_longform: 483 | :param abbreviation_start: 484 | :param abbreviation_end: 485 | 486 | """ 487 | for abrv in doc._.abbreviations: 488 | abbreviations.append(abrv) 489 | abbreviations_longform.append(abrv._.long_form) 490 | abbreviation_start.append(abrv.start) 491 | abbreviation_end.append(abrv.end) 492 | 493 | def add_if_file_contains_terms(self, compiled_terms, dict_with_parsed_xml, searching='has_terms'): 494 | """populate the main dictionary with term matches, its frequency and span 495 | 496 | :param compiled_terms: list of compiled ami-dict terms 497 | :type compiled_terms: list 498 | :param dict_with_parsed_xml: dictionary containing sentences 499 | :type dict_with_parsed_xml: dict 500 | :param searching: dict key name. Defaults to 'has_terms'. 
501 | :type searching: str 502 | 503 | """ 504 | for statement in tqdm(dict_with_parsed_xml): 505 | dict_for_sentence = dict_with_parsed_xml[statement] 506 | dict_for_sentence[f'{searching}'] = [] 507 | dict_for_sentence[f'{searching}_span'] = [] 508 | term_list, span_list, frequency = self.search_sentence_with_compiled_terms( 509 | compiled_terms, dict_for_sentence['sentence']) 510 | if term_list: 511 | dict_for_sentence[f'{searching}'].append(term_list) 512 | dict_for_sentence[f'weight_{searching}'] = frequency 513 | dict_for_sentence[f'{searching}_span'].append(span_list) 514 | 515 | def search_sentence_with_compiled_terms(self, compiled_terms, sentence): 516 | """search sentences using the compiled ami-dict entry 517 | 518 | :param compiled_terms: list of compiled ami-dict terms 519 | :type compiled_terms: list 520 | :param sentence: sentence to search using compiled terms 521 | :type sentence: string 522 | :returns: list of terms that was found after searching sentence 523 | :rtype: list 524 | 525 | """ 526 | # https://stackoverflow.com/questions/47681756/match-exact-phrase-within-a-string-in-python 527 | match_list = [] 528 | span_list = [] 529 | frequency = 0 530 | for compiled_term in compiled_terms: 531 | term_match = compiled_term.search(sentence, re.IGNORECASE) 532 | if term_match is not None: 533 | match_list.append(term_match.group()) 534 | span_list.append(term_match.span()) 535 | frequency = len(match_list) 536 | return match_list, span_list, frequency 537 | 538 | def get_terms_from_ami_xml(self, xml_path): 539 | """parses ami-dict (xml) and reads the entry terms; ami-dict can either be the default ones (user specifies python dict key) or customized ones (user specifies full path to it) 540 | 541 | :param xml_path: either keys from dict_of_ami_dict or full path to ami-dict 542 | :type xml_path: string 543 | :returns: list of regex compiled entry terms from ami-dict 544 | :rtype: list 545 | 546 | """ 547 | if xml_path in self.dict_of_ami_dict.keys(): 548 | logging.info(f"getting terms from {xml_path}") 549 | tree = ET.parse(urlopen(self.dict_of_ami_dict[xml_path])) 550 | root = tree.getroot() 551 | elif xml_path not in self.dict_of_ami_dict.keys(): 552 | tree = ET.parse(xml_path) 553 | root = tree.getroot() 554 | logging.info(f"getting terms from {xml_path}") 555 | else: 556 | logging.error(f'{xml_path} is not a supported dictionary. 
Choose from: EO_ACTIVITY, EO_COMPOUND, EO_EXTRACTION, EO_PLANT, EO_PLANT_PART, PLANT_GENUS,EO_TARGET, COUNTRY, DISEASE, DRUG, ORGANIZATION ') 557 | 558 | compiled_terms = self._compiled_regex(root.iter('entry')) 559 | return (set(compiled_terms)) 560 | 561 | def _compiled_regex(self, iterate_over): 562 | """ 563 | 564 | :param iterate_over: 565 | 566 | """ 567 | compiled_terms = [] 568 | for para in iterate_over: 569 | try: 570 | term = (para.attrib["term"]) 571 | except KeyError: 572 | term = para.text 573 | try: 574 | compiled_term = self._regex_compile(term) 575 | except re.error: 576 | logging.warning(f'cannot use term {term}') 577 | compiled_terms.append(compiled_term) 578 | return compiled_terms 579 | 580 | def _regex_compile(self, term): 581 | """ 582 | 583 | :param term: 584 | 585 | """ 586 | return re.compile(r'\b{}\b'.format(term)) 587 | 588 | def get_synonyms_from_ami_xml(self, xml_path): 589 | """parses ami-dict (xml) and reads the entry's synonyms; ami-dict can either be the default ones (user specifies python dict key) or customized ones (user specifies full path to it) 590 | 591 | :param xml_path: either keys from dict_of_ami_dict or full path to ami-dict 592 | :type xml_path: string 593 | :returns: list of regex compiled entry's synonyms from ami-dict 594 | :rtype: list 595 | 596 | """ 597 | if xml_path in self.dict_of_ami_dict.keys(): 598 | logging.info(f"getting synonyms from {xml_path}") 599 | tree = ET.parse(urlopen(self.dict_of_ami_dict[xml_path])) 600 | root = tree.getroot() 601 | elif xml_path not in self.dict_of_ami_dict.keys(): 602 | logging.info(f"getting synonyms from {xml_path}") 603 | tree = ET.parse(xml_path) 604 | root = tree.getroot() 605 | else: 606 | logging.error(f'{xml_path} is not a supported dictionary. Choose from: EO_ACTIVITY, EO_COMPOUND, EO_EXTRACTION, EO_PLANT, EO_PLANT_PART, PLANT_GENUS,EO_TARGET, COUNTRY, DISEASE, DRUG, ORGANIZATION ') 607 | synonyms = self._compiled_regex(root.findall("./entry/synonym")) 608 | return synonyms 609 | 610 | def _make_required_lists(self): 611 | """ """ 612 | abbreviations = [] 613 | abbreviations_longform = [] 614 | abbreviation_start = [] 615 | abbreviation_end = [] 616 | entities = [] 617 | labels = [] 618 | position_start = [] 619 | position_end = [] 620 | return entities, labels, position_end, position_start, abbreviations, abbreviations_longform, abbreviation_start, abbreviation_end 621 | 622 | def _add_lists_to_dict(self, dict_for_sentence, entities, labels, position_end, 623 | position_start, abbreviations, abbreviations_longform, abbreviation_start, abbreviation_end): 624 | """ 625 | 626 | :param dict_for_sentence: 627 | :param entities: 628 | :param labels: 629 | :param position_end: 630 | :param position_start: 631 | :param abbreviations: 632 | :param abbreviations_longform: 633 | :param abbreviation_start: 634 | :param abbreviation_end: 635 | 636 | """ 637 | 638 | dict_for_sentence['entities'] = entities 639 | dict_for_sentence['labels'] = labels 640 | dict_for_sentence['position_start'] = position_start 641 | dict_for_sentence['position_end'] = position_end 642 | dict_for_sentence['abbreviations'] = abbreviations 643 | dict_for_sentence['abbreviations_longform'] = abbreviations_longform 644 | dict_for_sentence['abbreviation_start'] = abbreviation_start 645 | dict_for_sentence['abbreviation_end'] = abbreviation_end 646 | 647 | def _add_parsed_entities_to_lists(self, entities, labels, position_end, position_start, ent=None): 648 | """ 649 | 650 | :param entities: 651 | :param labels: 652 | :param 
position_end: 653 | :param position_start: 654 | :param ent: (Default value = None) 655 | 656 | """ 657 | entities.append(ent.text) 658 | labels.append(ent.label_) 659 | position_start.append(ent.start_char) 660 | position_end.append(ent.end_char) 661 | 662 | def convert_dict_to_csv(self, path, dict_with_parsed_xml): 663 | """Turns python dictionary into CSV using pandas 664 | 665 | :param path: CSV file to write output 666 | :type path: string 667 | :param dict_with_parsed_xml: python dictionary that needs to be converted to csv 668 | :type dict_with_parsed_xml: dict 669 | 670 | """ 671 | df = pd.DataFrame(dict_with_parsed_xml) 672 | df = df.T 673 | for col in df: 674 | try: 675 | df[col] = df[col].astype(str).str.replace( 676 | "[", "").str.replace("]", "") 677 | df[col] = df[col].astype(str).str.replace( 678 | "'", "").str.replace("'", "") 679 | except: 680 | pass 681 | df.to_csv(path, encoding='utf-8', line_terminator='\r\n') 682 | logging.info(f"wrote output to {path}") 683 | 684 | def remove_paragraph_form_parsed_xml_dict(self, dict_with_parsed_xml, key_to_remove): 685 | """pops out the specified key value pairs from python dictionaries 686 | 687 | :param dict_with_parsed_xml: python dict from which a key-value pair needs to be removed 688 | :type dict_with_parsed_xml: dict 689 | :param key_to_remove: key of the pair that needs to be removed 690 | :type key_to_remove: string 691 | :returns: python dict with the specified key-value pair removed 692 | :rtype: dict 693 | 694 | """ 695 | for entry in dict_with_parsed_xml: 696 | dict_with_parsed_xml[entry].pop(key_to_remove, None) 697 | return dict_with_parsed_xml 698 | 699 | def convert_dict_to_json(self, path, dict_with_parsed_xml): 700 | """writes python dictionary to json file 701 | 702 | :param path: json file path to write to 703 | :type path: str 704 | :param dict_with_parsed_xml: main dictionary with sentences, search hits, entities, etc. 705 | :type dict_with_parsed_xml: dict 706 | 707 | """ 708 | with open(path, mode='w', encoding='utf-8') as f: 709 | json.dump(dict_with_parsed_xml, f, indent=4) 710 | logging.info(f"wrote JSON output to {path}") 711 | 712 | def remove_statements_not_having_xmldict_terms(self, dict_with_parsed_xml, searching='has_terms'): 713 | """removes key-value pairs from the main python dict that do not have any match hits 714 | 715 | :param dict_with_parsed_xml: python dictionary from which the specific key-value pairs needs to be removed 716 | :type dict_with_parsed_xml: dict 717 | :param searching: the key to the pair in the nested-dict that needs to be removed (Default value = 'has_terms') 718 | :type searching: str 719 | 720 | """ 721 | statement_to_pop = [] 722 | for statement in dict_with_parsed_xml: 723 | sentect_dict = dict_with_parsed_xml[statement] 724 | if len(sentect_dict[searching]) == 0: 725 | statement_to_pop.append(statement) 726 | 727 | for term in statement_to_pop: 728 | dict_with_parsed_xml.pop(term) 729 | 730 | def make_ami_dict_from_list(self, list_of_terms_with_count, title): 731 | """makes ami-dict from a python dictionary containing terms and frequencies. 
732 | 733 | :param list_of_terms_with_count: python dictionary containing terms and their frequency of occurence 734 | :type list_of_terms_with_count: dict 735 | :param title: title of the xml ami-dict as well as the name of the XML file 736 | :type title: string 737 | :returns: xml ami-dict 738 | :rtype: file 739 | 740 | """ 741 | dictionary_element = etree.Element("dictionary") 742 | dictionary_element.attrib['title'] = title 743 | for term in list_of_terms_with_count: 744 | try: 745 | entry_element = etree.SubElement(dictionary_element, "entry") 746 | entry_element.attrib['term'] = term[0] 747 | entry_element.attrib['count'] = str(term[1]) 748 | except Exception as e: 749 | logging.error(f"Couldn't add {term} to amidict") 750 | return self._etree_to_string(dictionary_element) 751 | 752 | def _write_string_to_file(self, string_to_put, title): 753 | """ 754 | 755 | :param string_to_put: 756 | :param title: 757 | 758 | """ 759 | with open(title, mode='w', encoding='utf-8') as f: 760 | f.write(string_to_put) 761 | 762 | def handle_ami_dict_creation(self, result_dictionary, title, path): 763 | """creates and writes ami dictionary with entities extracted and their frequency. 764 | 765 | :param result_dictionary: main python dictionary with sentences, entities, etc. 766 | :type result_dictionary: dict 767 | :param title: title of ami-dictionary (xml file) 768 | :type title: str 769 | :param path: file path 770 | :type path: str 771 | 772 | """ 773 | list_of_entities = [] 774 | for entry in result_dictionary: 775 | if 'entities' in result_dictionary[entry]: 776 | entity = result_dictionary[entry]['entities'] 777 | list_of_entities.extend(entity) 778 | dict_of_entities_with_count = Counter(list_of_entities) 779 | list_of_terms_with_count = dict_of_entities_with_count.most_common() 780 | xml_dict = self.make_ami_dict_from_list( 781 | list_of_terms_with_count, title) 782 | self._write_string_to_file(xml_dict, f'{path}.xml') 783 | logging.info(f"Wrote all the entities extracted to {path}.xml") 784 | 785 | def json_to_dict(self, json_file_link): 786 | """loads json file as python dictionary 787 | 788 | :param json_file_link: link to json file on the web 789 | :type json_file_link: str 790 | :returns: python dictionary from json 791 | :rtype: dictionary 792 | 793 | """ 794 | path = urlopen(json_file_link) 795 | json_dict = json.load(path) 796 | return (json_dict) 797 | 798 | def wiki_lookup(self, query): 799 | """Queries Wikidata API for Wikidata Item IDs for terms in ami-dict 800 | 801 | :param query: term to query wikdiata for ID 802 | :type query: string 803 | :returns: potential Wikidata Item URLs 804 | :rtype: list 805 | 806 | """ 807 | params = { 808 | "action": "wbsearchentities", 809 | "search": query, 810 | "language": "en", 811 | "format": "json" 812 | } 813 | data = requests.get( 814 | "https://www.wikidata.org/w/api.php", params=params) 815 | result = data.json() 816 | hit_list = [] 817 | for hit in result['search']: 818 | try: 819 | if "scientific article" not in hit["description"]: 820 | hit_list.append(hit["id"]) 821 | except: 822 | hit_list.append(hit["id"]) 823 | return hit_list 824 | 825 | 826 | # take out the constants 827 | # look through download_tools (pygetpapers) and see if we have overlapping functionality. 
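# --- Illustrative usage sketch (added here for clarity; not part of the original module) ---
# A minimal, hypothetical example of driving EntityExtraction directly, mirroring the
# keyword arguments that docanalysis.py passes in from the CLI. The CProject path
# "corpus/oil186", the dictionary key "EO_PLANT", the section/entity choices and the
# CSV name are assumptions for illustration, not values taken from this repository.
if __name__ == "__main__":
    extractor = EntityExtraction()                 # class defined above in this module
    extractor.extract_entities_from_papers(
        corpus_path="corpus/oil186",               # existing CProject: one fulltext.xml per CTree
        terms_xml_path=["EO_PLANT"],               # default ami-dict key, or a path/URL to a custom ami-dict
        search_sections=["MET", "RES"],            # section keys defined in CONFIG_SECTIONS
        entities=["GPE", "ORG"],                   # spaCy labels to keep when spacy_model is set
        make_section=True,                         # section the papers before searching
        spacy_model="spacy",                       # loads en_core_web_sm via download_spacy()
        csv_name="entities.csv",                   # written inside corpus_path
    )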
828 | # functionality_from_(where you are getting a data) 829 | 830 | 831 | # Future goals 832 | # make tests automated 833 | # readthedocs 834 | # tutorials 835 | # repository management 836 | -------------------------------------------------------------------------------- /docanalysis/file_lib.py: -------------------------------------------------------------------------------- 1 | """classes and methods to support path operations 2 | 3 | """ 4 | import json 5 | import copy 6 | import glob 7 | import re 8 | import os 9 | import shutil 10 | from pathlib import Path, PurePath 11 | import logging 12 | from glob import glob 13 | from braceexpand import braceexpand 14 | 15 | logging.debug("loading file_lib") 16 | 17 | py4ami = "py4ami" 18 | RESOURCES = "resources" 19 | 20 | # section keys 21 | _DESC = "_DESC" 22 | PROJ = "PROJ" 23 | TREE = "TREE" 24 | SECTS = "SECTS" 25 | SUBSECT = "SUBSECT" 26 | SUBSUB = "SUBSUB" 27 | FILE = "FILE" 28 | SUFFIX = "SUFFIX" 29 | 30 | ALLOWED_SECTS = {_DESC, PROJ, TREE, SECTS, SUBSECT, SUBSUB, FILE, SUFFIX} 31 | 32 | # wildcards 33 | STARS = "**" 34 | STAR = "*" 35 | 36 | # suffixes 37 | S_PDF = "pdf" 38 | S_PNG = "png" 39 | S_SVG = "svg" 40 | S_TXT = "txt" 41 | S_XML = "xml" 42 | 43 | # markers for processing 44 | _NULL = "_NULL" 45 | _REQD = "_REQD" 46 | 47 | # known section names 48 | SVG = "svg" 49 | PDFIMAGES = "pdfimages" 50 | RESULTS = "results" 51 | SECTIONS = "sections" 52 | 53 | # subsects 54 | IMAGE_STAR = "image*" 55 | 56 | # subsects 57 | OCTREE = "*octree" 58 | 59 | # results 60 | SEARCH = "search" 61 | WORD = "word" 62 | EMPTY = "empty" 63 | 64 | # files 65 | FULLTEXT_PAGE = "fulltext-page*" 66 | CHANNEL_STAR = "channel*" 67 | RAW = "raw" 68 | 69 | 70 | class Globber: 71 | """utilities for globbing - may be obsolete""" 72 | 73 | def __init__(self, ami_path, recurse=True, cwd=None) -> None: 74 | self.ami_path = ami_path 75 | self.recurse = recurse 76 | self.cwd = os.getcwd() if cwd is None else cwd 77 | 78 | def get_globbed_files(self) -> list: 79 | """uses the glob_string_list in ami_path to create a path list""" 80 | files = [] 81 | if self.ami_path: 82 | glob_list = self.ami_path.get_glob_string_list() 83 | for gl_str in glob_list: 84 | files += glob.glob(gl_str, recursive=self.recurse) 85 | return files 86 | 87 | 88 | class AmiPath: 89 | """holds a (keyed) scheme for generating lists of path globs 90 | The scheme has several segments which can be set to create a glob expr. 
91 | 92 | 93 | """ 94 | # keys for path scheme templates 95 | T_FIGURES = "fig_captions" 96 | T_OCTREE = "octree" 97 | T_PDFIMAGES = "pdfimages" 98 | T_RESULTS = "results" 99 | T_SECTIONS = "sections" 100 | T_SVG = "svg" 101 | 102 | logger = logging.getLogger("ami_path") 103 | # dict 104 | 105 | def __init__(self, scheme=None): 106 | self.scheme = scheme 107 | 108 | def print_scheme(self): 109 | """for debugging and enlightenment""" 110 | if self.scheme is not None: 111 | for key in self.scheme: 112 | print("key ", key, "=", self.scheme[key]) 113 | print("") 114 | 115 | @classmethod 116 | def create_ami_path_from_templates(cls, key, edit_dict=None): 117 | """creates a new AmiPath object from selected template 118 | key: to template 119 | edit_dict: dictionary with values to edit in 120 | 121 | :param key: 122 | :param edit_dict: (Default value = None) 123 | 124 | """ 125 | key = key.lower() 126 | if key is None or key not in TEMPLATES: 127 | cls.logger.error(f"cannot find key {key}") 128 | cls.logger.error("no scheme for: ", key, 129 | "expected", TEMPLATES.keys()) 130 | ami_path = AmiPath() 131 | # start with default template values 132 | ami_path.scheme = copy.deepcopy(TEMPLATES[key]) 133 | if edit_dict: 134 | ami_path.edit_scheme(edit_dict) 135 | return ami_path 136 | 137 | def edit_scheme(self, edit_dict): 138 | """edits values in self.scheme using edit_dict 139 | 140 | :param edit_dict: 141 | 142 | """ 143 | for k, v in edit_dict.items(): 144 | self.scheme[k] = v 145 | 146 | def permute_sets(self): 147 | """ """ 148 | self.scheme_list = [] 149 | self.scheme_list.append(self.scheme) 150 | # if scheme has sets, expand them 151 | change = True 152 | while change: 153 | change = self.expand_set_lists() 154 | 155 | def expand_set_lists(self): 156 | """expands the sets in a scheme 157 | note: sets are held as lists in JSON 158 | 159 | a scheme with 2 sets of size n and m is 160 | expanded to n*m schemes covering all permutations 161 | of the set values 162 | 163 | self.scheme_list contains all the schemes 164 | 165 | returns True if any sets are expanded 166 | 167 | 168 | """ 169 | change = False 170 | for scheme in self.scheme_list: 171 | for sect, value in scheme.items(): 172 | if type(value) == list: 173 | change = True 174 | # delete scheme with set, replace by copies 175 | self.scheme_list.remove(scheme) 176 | for set_value in value: 177 | scheme_copy = copy.deepcopy(scheme) 178 | self.scheme_list.append(scheme_copy) 179 | scheme_copy[sect] = set_value # poke in set value 180 | break # after each set processed 181 | 182 | return change 183 | 184 | def get_glob_string_list(self): 185 | """expand sets in AmiPath 186 | creates m*n... 
glob strings for sets with len n and m 187 | 188 | 189 | """ 190 | self.permute_sets() 191 | self.glob_string_list = [] 192 | for scheme in self.scheme_list: 193 | glob_string = AmiPath.create_glob_string(scheme) 194 | self.glob_string_list.append(glob_string) 195 | return self.glob_string_list 196 | 197 | @classmethod 198 | def create_glob_string(cls, scheme): 199 | """ 200 | 201 | :param scheme: 202 | 203 | """ 204 | globx = "" 205 | for sect, value in scheme.items(): 206 | cls.logger.debug(sect, type(value), value) 207 | if sect not in ALLOWED_SECTS: 208 | cls.logger.error(f"unknown sect: {sect}") 209 | elif _DESC == sect: 210 | pass 211 | elif _REQD == value: 212 | cls.logger.error("must set ", sect) 213 | globx += _REQD + "/" 214 | elif _NULL == value: 215 | pass 216 | elif FILE == sect: 217 | globx += AmiPath.convert_to_glob(value) 218 | elif STAR in value: 219 | globx += AmiPath.convert_to_glob(value) + "/" 220 | elif SUFFIX == sect: 221 | globx += "." + AmiPath.convert_to_glob(value) 222 | else: 223 | globx += AmiPath.convert_to_glob(value) + "/" 224 | cls.logger.debug("glob", scheme, "=>", globx) 225 | return globx 226 | 227 | @classmethod 228 | def convert_to_glob(cls, value): 229 | """ 230 | 231 | :param value: 232 | 233 | """ 234 | valuex = value 235 | if type(value) == list: 236 | # tacky. string quotes and add commas and parens 237 | valuex = "(" 238 | for v in value: 239 | valuex += v + "," 240 | valuex = valuex[:-1] + ")" 241 | return valuex 242 | 243 | def get_globbed_files(self): 244 | """ """ 245 | files = Globber(self).get_globbed_files() 246 | self.logger.debug("files", len(files)) 247 | return files 248 | 249 | 250 | class BraceGlobber: 251 | """ """ 252 | 253 | def braced_glob(self, path, recursive=False): 254 | """ 255 | 256 | :param path: 257 | :param recursive: (Default value = False) 258 | 259 | """ 260 | ll = [glob(x, recursive=recursive) for x in braceexpand(path)] 261 | return ll 262 | 263 | 264 | class FileLib: 265 | """ """ 266 | 267 | logger = logging.getLogger("file_lib") 268 | 269 | @classmethod 270 | def force_mkdir(cls, dirx): 271 | """ensure dirx exists 272 | 273 | :dirx: directory 274 | 275 | :param dirx: 276 | 277 | """ 278 | if not os.path.exists(dirx): 279 | try: 280 | os.mkdir(dirx) 281 | except Exception as e: 282 | cls.logger.error(f"cannot make dirx {dirx} , {e}") 283 | 284 | @classmethod 285 | def force_mkparent(cls, file): 286 | """ensure parent directory exists 287 | 288 | :path: whose parent directory is to be created if absent 289 | 290 | :param file: 291 | 292 | """ 293 | if file is not None: 294 | cls.force_mkdir(cls.get_parent_dir(file)) 295 | 296 | @classmethod 297 | def force_write(cls, file, data, overwrite=True): 298 | """:write path, creating dirtectory if necessary 299 | :path: path to write to 300 | :data: str data to write 301 | :overwrite: force write iuf path exists 302 | 303 | may throw exception from write 304 | 305 | :param file: 306 | :param data: 307 | :param overwrite: (Default value = True) 308 | 309 | """ 310 | if file is not None: 311 | if os.path.exists(file) and not overwrite: 312 | logging.warning(f"not overwriting existsnt path {file}") 313 | else: 314 | cls.force_mkparent(file) 315 | with open(file, "w", encoding="utf-8") as f: 316 | f.write(data) 317 | 318 | @classmethod 319 | def copy_file_or_directory(cls, dest_path, src_path, overwrite): 320 | """ 321 | 322 | :param dest_path: 323 | :param src_path: 324 | :param overwrite: 325 | 326 | """ 327 | if dest_path.exists(): 328 | if not overwrite: 329 | file_type = 
"dirx" if dest_path.is_dir() else "path" 330 | raise TypeError( 331 | str(dest_path), f"cannot overwrite existing {file_type} (str({dest_path})") 332 | 333 | else: 334 | # assume directory 335 | cls.logger.warning(f"create directory {dest_path}") 336 | dest_path.mkdir(parents=True, exist_ok=True) 337 | cls.logger.info(f"created directory {dest_path}") 338 | if src_path.is_dir(): 339 | if os.path.exists(dest_path): 340 | shutil.rmtree(dest_path) 341 | shutil.copytree(src_path, dest_path) 342 | cls.logger.info(f"copied directory {src_path} to {dest_path}") 343 | else: 344 | try: 345 | shutil.copy(src_path, dest_path) # will overwrite 346 | cls.logger.info(f"copied path {src_path} to {dest_path}") 347 | except Exception as e: 348 | cls.logger.fatal(f"Cannot copy direcctory {src_path} to {dest_path} because {e}") 349 | 350 | @staticmethod 351 | def create_absolute_name(file): 352 | """create absolute/relative name for a path relative to py4ami 353 | 354 | TODO this is messy 355 | 356 | :param file: 357 | 358 | """ 359 | absolute_file = None 360 | if file is not None: 361 | file_dir = FileLib.get_parent_dir(__file__) 362 | absolute_file = os.path.join(os.path.join(file_dir, file)) 363 | return absolute_file 364 | 365 | @classmethod 366 | def get_py4ami(cls): 367 | """gets paymi_m pathname""" 368 | return Path(__file__).parent.resolve() 369 | 370 | @classmethod 371 | def get_pyami_root(cls): 372 | """gets paymi root pathname""" 373 | return Path(__file__).parent.parent.resolve() 374 | 375 | @classmethod 376 | def get_pyami_resources(cls): 377 | """gets paymi root pathname""" 378 | return Path(cls.get_py4ami(), RESOURCES) 379 | 380 | @classmethod 381 | def get_parent_dir(cls, file): 382 | """ 383 | 384 | :param file: 385 | 386 | """ 387 | return None if file is None else PurePath(file).parent 388 | 389 | @classmethod 390 | def read_pydictionary(cls, file): 391 | """read a json path into a python dictiomary 392 | 393 | :param file: 394 | 395 | """ 396 | import ast 397 | with open(file, "r") as f: 398 | pydict = ast.literal_eval(f.read()) 399 | return pydict 400 | 401 | @classmethod 402 | def punct2underscore(cls, text): 403 | """replace all ASCII punctuation except '.' , '-', '_' by '_' 404 | 405 | for filenames 406 | 407 | :param text: 408 | 409 | """ 410 | # from py4ami.text_lib import TextUtil 411 | # this is non-trivial https://stackoverflow.com/questions/10017147/removing-a-list-of-characters-in-string 412 | 413 | non_file_punct = '\t \n{}!@#$%^&*()[]:;\'",|\\~+=/`' 414 | # [unicode(x.strip()) if x is not None else '' for x in row] 415 | 416 | #text0 = TextUtil.replace_chars(text, non_file_punct, "_") 417 | text0 = ''.join([c if c not in non_file_punct else "_" for c in text]) 418 | return text0 419 | 420 | @classmethod 421 | def get_suffix(cls, file): 422 | """get suffix 423 | INCLUDES the "." 
424 | 425 | :param file: 426 | 427 | """ 428 | _suffix = None if file is None else Path(file).suffix 429 | return _suffix 430 | 431 | 432 | # see https://realpython.com/python-pathlib/ 433 | 434 | def main(): 435 | """ """ 436 | print("started file_lib") 437 | # test_templates() 438 | 439 | print("finished file_lib") 440 | 441 | 442 | if __name__ == "__main__": 443 | print("running file_lib main") 444 | main() 445 | else: 446 | # print("running file_lib main anyway") 447 | # main() 448 | pass 449 | 450 | # examples of regex for filenames 451 | 452 | 453 | def glob_re(pattern, strings): 454 | """ 455 | 456 | :param pattern: 457 | :param strings: 458 | 459 | """ 460 | return filter(re.compile(pattern).match, strings) 461 | 462 | 463 | filenames = glob_re(r'.*(abc|123|a1b).*\.txt', os.listdir()) 464 | 465 | # Credits: Peter Murray-Rust, py4ami (https://github.com/petermr/pyami/blob/main/py4ami/file_lib.py) -------------------------------------------------------------------------------- /docanalysis/get_html.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from glob import glob 3 | import os 4 | from abbreviations import schwartz_hearst 5 | from lxml import etree 6 | import yake 7 | 8 | def read_text_from_html(paragraph_path): 9 | """ 10 | 11 | :param paragraph_path: 12 | 13 | """ 14 | with open(paragraph_path, 'r') as f: 15 | html = f.read() 16 | soup = BeautifulSoup(html, features="html.parser") 17 | 18 | # kill all script and style elements 19 | for script in soup(["script", "style"]): 20 | script.extract() # rip it out 21 | 22 | # get text 23 | text = soup.get_text() 24 | 25 | # break into lines and remove leading and trailing space on each 26 | #lines = (line.strip() for line in text.splitlines()) 27 | # break multi-headlines into a line each 28 | #chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) 29 | # drop blank lines 30 | #text_write = '\n'.join(chunk for chunk in chunks if chunk) 31 | #text = '\n'.join(chunk for chunk in chunks if chunk) 32 | return text 33 | 34 | def get_glob(corpus_path): 35 | """ 36 | 37 | :param corpus_path: 38 | 39 | """ 40 | paragraph_path = glob(os.path.join(corpus_path, '**', 'sections', '**', "*html"), recursive=True) 41 | return paragraph_path 42 | 43 | def abbreviation_search_using_sw(paragraph_text): 44 | """ 45 | 46 | :param paragraph_text: 47 | 48 | """ 49 | pairs = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=paragraph_text) 50 | keys = pairs.keys() 51 | values = pairs.values() 52 | return keys, values 53 | 54 | def make_ami_dict_from_list(title, keys, values): 55 | """ 56 | 57 | :param title: 58 | :param keys: 59 | :param values: 60 | 61 | """ 62 | dictionary_element= etree.Element("dictionary") 63 | dictionary_element.attrib['title']= title 64 | for term, expansion in zip(keys, values): 65 | entry_element=etree.SubElement(dictionary_element,"entry") 66 | entry_element.attrib['term']=term 67 | entry_element.attrib['exapansion']=expansion 68 | return etree.tostring(dictionary_element, pretty_print=True).decode('utf-8') 69 | 70 | def write_string_to_file(string_to_put,title): 71 | """ 72 | 73 | :param string_to_put: 74 | :param title: 75 | 76 | """ 77 | with open(title,mode='w', encoding='utf-8') as f: 78 | f.write(string_to_put) 79 | print(f"wrote dict to {title}") 80 | 81 | def extract_keyphrase(paragraph_text): 82 | """ 83 | 84 | :param paragraph_text: 85 | 86 | """ 87 | custom_kw_extractor = yake.KeywordExtractor(lan='en', n=5, top=10, 
features=None) 88 | keywords = custom_kw_extractor.extract_keywords(paragraph_text) 89 | keywords_list = [] 90 | for kw in keywords: 91 | keywords_list.append(kw[0]) 92 | print(keywords_list) 93 | 94 | def does_everything(corpus_path): 95 | """ 96 | 97 | :param corpus_path: 98 | 99 | """ 100 | all_text = [] 101 | all_keys = [] 102 | all_values = [] 103 | all_paragraph_paths = get_glob(corpus_path) 104 | for paragraph_path in all_paragraph_paths: 105 | paragraph_text = read_text_from_html(paragraph_path) 106 | #print(paragraph_text) 107 | all_text.append(paragraph_text) 108 | keys, values = abbreviation_search_using_sw(paragraph_text) 109 | all_keys.extend(keys) 110 | all_values.extend(values) 111 | print(len(all_keys), all_values) 112 | #all_text_string = joinStrings(all_text) 113 | #print(all_text_string) 114 | #extract_keyphrase(all_text_string) 115 | #dict_string = make_ami_dict_from_list("abb", all_keys, all_values) 116 | #return dict_string 117 | 118 | 119 | def joinStrings(stringList): 120 | """ 121 | 122 | :param stringList: 123 | 124 | """ 125 | return ''.join(string for string in stringList) 126 | 127 | path = os.path.join(os.path.expanduser('~'), "ipcc_sectioned") 128 | does_everything(path) 129 | #write_string_to_file( dict_string, "abb.xml") 130 | 131 | import json 132 | from urllib.request import urlopen 133 | 134 | #PATH = urlopen() 135 | #json_dict = json.load(PATH) 136 | #print(json_dict) 137 | 138 | -------------------------------------------------------------------------------- /docanalysis/glob_trail.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import glob 3 | from pprint import pprint 4 | 5 | # define constants 6 | ABS = ['*abstract.xml'] 7 | ACK = ['*ack.xml'] 8 | AFF = ['*aff.xml'] 9 | AUT = ['*contrib-group.xml'] 10 | CON = ['*conclusion*/*.xml'] 11 | DIS = ['*discussion*/**/*_title.xml', '*discussion*/**/*_p.xml'] # might bring unwanted sections like tables, fig. captions etc. Maybe get only title and paragraphs? 12 | ETH = ['*ethic*/*.xml'] 13 | FIG = ['*fig*.xml'] 14 | INT = ['*introduction*/*.xml', '*background*/*.xml'] 15 | KEY = ['*kwd-group.xml'] 16 | MET = ['*method*/*.xml', '*material*/*.xml'] # also gets us supplementary material. Not sure how to exclude them 17 | RES = ['*result*/*/*_title.xml', '*result*/*/*_p.xml'] # not sure if we should use recursive globbing or not. 
18 | TAB = ['*table*.xml'] 19 | TIL = ['*article-meta/*title-group.xml'] 20 | 21 | # glob 22 | path = os.getcwd() 23 | cproj = 'corpus/asp_nat_products' 24 | LIST_SEC = [TIL, KEY] 25 | for SEC in LIST_SEC: 26 | for opt in SEC: 27 | glob_list=glob(os.path.join(path, cproj, '**', 'sections', '**', f'{opt}'), recursive=True) 28 | pprint(glob_list) 29 | 30 | # Section list comes from: https://github.com/petermr/pyami/blob/main/py4ami/resources/section_templates.json -------------------------------------------------------------------------------- /docanalysis/gui.py: -------------------------------------------------------------------------------- 1 | import eel 2 | from pygetpapers import Pygetpapers 3 | import os 4 | 5 | eel.init(f'{os.path.dirname(os.path.realpath(__file__))}/gui') 6 | 7 | 8 | @eel.expose 9 | def create_corpus(path, query, number): 10 | pygetpapers_call = Pygetpapers() 11 | pygetpapers_call.run_command( 12 | query=query, limit=number, output=path, xml=True) 13 | 14 | 15 | eel.start('main.html') 16 | -------------------------------------------------------------------------------- /docanalysis/gui/css/main.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petermr/docanalysis/eb8a3c13f9491b41f252a363953a976406f964b6/docanalysis/gui/css/main.css -------------------------------------------------------------------------------- /docanalysis/gui/eel.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | function make_papers () { 4 | query = document.getElementById("query").value 5 | number = document.getElementById("number").value 6 | path =document.getElementById("path").value 7 | console.log(path) 8 | } -------------------------------------------------------------------------------- /docanalysis/gui/main.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Docanalysis GUI 10 | 11 | 12 |

13-21 | [markup stripped in extraction — recoverable form labels: "Query", "Number of papers"]
22 | 23 | 24 | -------------------------------------------------------------------------------- /docanalysis/xml_lib.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import os 3 | from lxml import etree as LXET 4 | import logging 5 | 6 | from docanalysis.file_lib import FileLib 7 | 8 | logging.debug("loading xml_lib") 9 | 10 | 11 | # make leafnodes and copy remaning content as XML 12 | TERMINAL_COPY = { 13 | "abstract", 14 | "aff", 15 | "article-id", 16 | "article-categories", 17 | "author-notes", 18 | "caption", 19 | "contrib-group", 20 | "fig", 21 | "history", 22 | "issue", 23 | "journal_id", 24 | "journal-title-group", 25 | "kwd-group", 26 | "name", 27 | "notes", 28 | "p", 29 | "permissions", 30 | "person-group", 31 | "pub-date", 32 | "publisher", 33 | "ref", 34 | "table", 35 | "title", 36 | "title-group", 37 | "volume", 38 | } 39 | 40 | 41 | TERMINALS = [ 42 | "inline-formula", 43 | ] 44 | 45 | TITLE = "title" 46 | 47 | IGNORE_CHILDREN = { 48 | "disp-formula", 49 | } 50 | 51 | HTML_TAGS = { 52 | "italic": "i", 53 | "p": "p", 54 | "sub": "sub", 55 | "sup": "sup", 56 | "tr": "tr", 57 | } 58 | 59 | H_TD = "td" 60 | H_TR = "tr" 61 | H_TH = "th" 62 | LINK = "link" 63 | UTF_8 = "UTF-8" 64 | SCRIPT = "script" 65 | STYLESHEET = "stylesheet" 66 | TEXT_CSS = "text/css" 67 | TEXT_JAVASCRIPT = "text/javascript" 68 | 69 | H_HTML = "html" 70 | H_BODY = "body" 71 | H_TBODY = "tbody" 72 | H_DIV = "div" 73 | H_TABLE = "table" 74 | H_THEAD = "thead" 75 | H_HEAD = "head" 76 | H_TITLE = "title" 77 | 78 | RESULTS = "results" 79 | 80 | SEC_TAGS = { 81 | "sec", 82 | } 83 | 84 | LINK_TAGS = { 85 | "xref", 86 | } 87 | 88 | SECTIONS = "sections" 89 | 90 | HTML_NS = "HTML_NS" 91 | MATHML_NS = "MATHML_NS" 92 | SVG_NS = "SVG_NS" 93 | XMLNS_NS = "XMLNS_NS" 94 | XML_NS = "XML_NS" 95 | XLINK_NS = "XLINK_NS" 96 | 97 | XML_LANG = "{" + XML_NS + "}" + 'lang' 98 | 99 | NS_MAP = { 100 | HTML_NS: "http://www.w3.org/1999/xhtml", 101 | MATHML_NS: "http://www.w3.org/1998/Math/MathML", 102 | SVG_NS: "http://www.w3.org/2000/svg", 103 | XLINK_NS: "http://www.w3.org/1999/xlink", 104 | XML_NS: "http://www.w3.org/XML/1998/namespace", 105 | XMLNS_NS: "http://www.w3.org/2000/xmlns/", 106 | } 107 | 108 | logger = logging.getLogger("xml_lib") 109 | logger.setLevel(logging.WARNING) 110 | 111 | 112 | class XmlLib: 113 | """ """ 114 | 115 | def __init__(self, file=None, section_dir=SECTIONS): 116 | self.max_file_len = 30 117 | self.file = file 118 | self.parent_path = None 119 | self.root = None 120 | self.logger = logging.getLogger("xmllib") 121 | self.section_dir = section_dir 122 | self.section_path = None 123 | # self.logger.setLevel(logging.INFO) 124 | 125 | def read(self, file): 126 | """reads XML file , saves file, and parses to self.root 127 | 128 | :param file: 129 | 130 | """ 131 | if file is not None: 132 | self.file = file 133 | self.parent_path = Path(file).parent.absolute() 134 | self.root = XmlLib.parse_xml_file_to_root(file) 135 | 136 | def make_sections(self, section_dir): 137 | """recursively traverse XML tree and write files for each terminal element 138 | 139 | :param section_dir: 140 | 141 | """ 142 | self.section_dir = self.make_sections_path(section_dir) 143 | # indent = 0 144 | # filename = "1" + "_" + self.root.tag 145 | # self.logger.debug(" " * indent, filename) 146 | # subdir = os.path.join(self.section_dir, filename) 147 | # FileLib.force_mkdir(subdir) 148 | 149 | self.make_descendant_tree(self.root, self.section_dir) 150 | self.logger.info( 
151 | f"wrote XML sections for {self.file} {self.section_dir}") 152 | 153 | @staticmethod 154 | def parse_xml_file_to_root(file): 155 | """read xml path and create root element 156 | 157 | :param file: 158 | 159 | """ 160 | file = str(file) # if file is Path 161 | if not os.path.exists(file): 162 | raise IOError("path does not exist", file) 163 | xmlp = LXET.XMLParser(encoding=UTF_8) 164 | element_tree = LXET.parse(file, xmlp) 165 | root = element_tree.getroot() 166 | return root 167 | 168 | @staticmethod 169 | def parse_xml_string_to_root(xml): 170 | """read xml string and parse to root element 171 | 172 | :param xml: 173 | 174 | """ 175 | from io import StringIO 176 | tree = LXET.parse(StringIO(xml), LXET.XMLParser(ns_clean=True)) 177 | return tree.getroot() 178 | 179 | def make_sections_path(self, section_dir): 180 | """ 181 | 182 | :param section_dir: 183 | 184 | """ 185 | self.section_path = os.path.join(self.parent_path, section_dir) 186 | if not os.path.exists(self.section_path): 187 | FileLib.force_mkdir(self.section_path) 188 | return self.section_path 189 | 190 | def make_descendant_tree(self, elem, outdir): 191 | """ 192 | 193 | :param elem: 194 | :param outdir: 195 | 196 | """ 197 | 198 | self.logger.setLevel(logging.INFO) 199 | if elem.tag in TERMINALS: 200 | self.logger.debug("skipped ", elem.tag) 201 | return 202 | TERMINAL = "T_" 203 | IGNORE = "I_" 204 | children = list(elem) 205 | self.logger.debug(f"children> {len(children)} .. {self.logger.level}") 206 | isect = 0 207 | for child in children: 208 | if "ProcessingInstruction" in str(type(child)): 209 | # print("PI", child) 210 | continue 211 | if "Comment" in str(type(child)): 212 | continue 213 | flag = "" 214 | child_child_count = len(list(child)) 215 | if child.tag in TERMINAL_COPY or child_child_count == 0: 216 | flag = TERMINAL 217 | elif child.tag in IGNORE_CHILDREN: 218 | flag = IGNORE 219 | 220 | title = child.tag 221 | if child.tag in SEC_TAGS: 222 | title = XmlLib.get_sec_title(child) 223 | 224 | if flag == IGNORE: 225 | title = flag + title 226 | filename = str( 227 | isect) + "_" + FileLib.punct2underscore(title).lower()[:self.max_file_len] 228 | 229 | if flag == TERMINAL: 230 | xml_string = LXET.tostring(child) 231 | filename1 = os.path.join(outdir, filename + '.xml') 232 | self.logger.setLevel(logging.INFO) 233 | self.logger.debug(f"writing dbg {filename1}") 234 | try: 235 | with open(filename1, "wb") as f: 236 | f.write(xml_string) 237 | except Exception: 238 | print(f"cannot write {filename1}") 239 | else: 240 | subdir = os.path.join(outdir, filename) 241 | # creates empty dirx, may be bad idea 242 | FileLib.force_mkdir(subdir) 243 | if flag == "": 244 | self.logger.debug(f">> {title} {child}") 245 | self.make_descendant_tree(child, subdir) 246 | isect += 1 247 | 248 | @staticmethod 249 | def get_sec_title(sec): 250 | """get title of JATS section 251 | 252 | :sec: section (normally sec element 253 | 254 | :param sec: 255 | 256 | """ 257 | title = None 258 | for elem in list(sec): 259 | if elem.tag == TITLE: 260 | title = elem.text 261 | break 262 | 263 | if title is None: 264 | # don't know where the 'xml_file' comes from... 
265 | if not hasattr(sec, "xml_file"): 266 | title = "UNKNOWN" 267 | else: 268 | title = "?_" + str(sec["xml_file"][:20]) 269 | title = FileLib.punct2underscore(title) 270 | return title 271 | 272 | @staticmethod 273 | def remove_all(elem, xpath): 274 | """ 275 | 276 | :param elem: 277 | :param xpath: 278 | 279 | """ 280 | for el in elem.xpath(xpath): 281 | el.getparent().remove(el) 282 | 283 | @staticmethod 284 | def get_or_create_child(parent, tag): 285 | """ 286 | 287 | :param parent: 288 | :param tag: 289 | 290 | """ 291 | child = None 292 | if parent is not None: 293 | child = parent.find(tag) 294 | if child is None: 295 | child = LXET.SubElement(parent, tag) 296 | return child 297 | 298 | @classmethod 299 | def get_text(cls, node): 300 | """get text children as string 301 | 302 | :param node: 303 | 304 | """ 305 | return ''.join(node.itertext()) 306 | 307 | @staticmethod 308 | def add_UTF8(html_root): 309 | """adds UTF8 declaration to root 310 | 311 | :param html_root: 312 | 313 | """ 314 | from lxml import etree as LXET 315 | root = html_root.get_or_create_child(html_root, "head") 316 | LXET.SubElement(root, "meta").attrib["charset"] = "UTF-8" 317 | 318 | # replace nodes with text 319 | @staticmethod 320 | def replace_nodes_with_text(data, xpath, replacement): 321 | """replace nodes with specific text 322 | 323 | :param data: 324 | :param xpath: 325 | :param replacement: 326 | 327 | """ 328 | print(data, xpath, replacement) 329 | tree = LXET.fromstring(data) 330 | for r in tree.xpath(xpath): 331 | print("r", r, replacement, r.tail) 332 | text = replacement 333 | if r.tail is not None: 334 | text += r.tail 335 | parent = r.getparent() 336 | if parent is not None: 337 | previous = r.getprevious() 338 | if previous is not None: 339 | previous.tail = (previous.tail or '') + text 340 | else: 341 | parent.text = (parent.text or '') + text 342 | parent.remove(r) 343 | return tree 344 | 345 | @classmethod 346 | def remove_all_tags(cls, xml_string): 347 | """remove all tags from text 348 | 349 | :xml_string: string to be flattened 350 | 351 | :param xml_string: 352 | :returns: flattened string 353 | 354 | """ 355 | tree = LXET.fromstring(xml_string.encode("utf-8")) 356 | strg = LXET.tostring(tree, encoding='utf8', 357 | method='text').decode("utf-8") 358 | return strg 359 | 360 | @classmethod 361 | def xslt_transform(cls, data, xslt_file): 362 | """ 363 | 364 | :param data: 365 | :param xslt_file: 366 | 367 | """ 368 | xslt_root = LXET.parse(xslt_file) 369 | transform = LXET.XSLT(xslt_root) 370 | print("XSLT log", transform.error_log) 371 | result_tree = transform(LXET.fromstring(data)) 372 | assert(result_tree is not None) 373 | root = result_tree.getroot() 374 | assert(root is not None) 375 | 376 | return root 377 | 378 | @classmethod 379 | def xslt_transform_tostring(cls, data, xslt_file): 380 | """ 381 | 382 | :param data: 383 | :param xslt_file: 384 | 385 | """ 386 | root = cls.xslt_transform(data, xslt_file) 387 | return LXET.tostring(root).decode("UTF-8") if root is not None else None 388 | 389 | 390 | class HtmlElement: 391 | """to provide fluent HTML builder and parser""" 392 | pass 393 | 394 | 395 | class DataTable: 396 | """ 397 | 398 | ffml 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | """ 407 | 408 | def __init__(self, title, colheads=None, rowdata=None): 409 | """create dataTables 410 | optionally add column headings (list) and rows (list of conformant lists) 411 | 412 | :param title: of data_title (required) 413 | :param colheads: 414 | :param rowdata: 415 | 416 | """ 417 | 
self.html = LXET.Element(H_HTML) 418 | self.head = None 419 | self.body = None 420 | self.create_head(title) 421 | self.create_table_thead_tbody() 422 | self.add_column_heads(colheads) 423 | self.add_rows(rowdata) 424 | # head, title and body are populated by create_head() and create_table_thead_tbody(); 425 | # do not reset them here (setting self.title = None and then touching self.title.text raises AttributeError) 426 | 427 | 428 | 429 | def create_head(self, title): 430 | """ffml 431 | 432 | 433 | 434 | 435 | 436 | :param title: 437 | 438 | """ 439 | 440 | self.head = LXET.SubElement(self.html, H_HEAD) 441 | self.title = LXET.SubElement(self.head, H_TITLE) 442 | self.title.text = title 443 | 444 | link = LXET.SubElement(self.head, LINK) 445 | link.attrib["rel"] = STYLESHEET 446 | link.attrib["type"] = TEXT_CSS 447 | link.attrib["href"] = "http://ajax.aspnetcdn.com/ajax/jquery.dataTables/1.9.4/css/jquery.dataTables.css" 448 | link.text = '.' # messy, to stop formatter using "/>" which dataTables doesn't like 449 | 450 | script = LXET.SubElement(self.head, SCRIPT) 451 | script.attrib["src"] = "http://ajax.aspnetcdn.com/ajax/jQuery/jquery-1.8.2.min.js" 452 | script.attrib["charset"] = UTF_8 453 | script.attrib["type"] = TEXT_JAVASCRIPT 454 | script.text = '.' # messy, to stop formatter using "/>" which dataTables doesn't like 455 | 456 | script = LXET.SubElement(self.head, SCRIPT) 457 | script.attrib["src"] = "http://ajax.aspnetcdn.com/ajax/jquery.dataTables/1.9.4/jquery.dataTables.min.js" 458 | script.attrib["charset"] = UTF_8 459 | script.attrib["type"] = TEXT_JAVASCRIPT 460 | script.text = "." # messy, to stop formatter using "/>" which dataTables doesn't like 461 | 462 | script = LXET.SubElement(self.head, SCRIPT) 463 | script.attrib["charset"] = UTF_8 464 | script.attrib["type"] = TEXT_JAVASCRIPT 465 | script.text = "$(function() { $(\"#results\").dataTable(); }) " 466 | 467 | def create_table_thead_tbody(self): 468 | """ 469 |
470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | """ 482 | 483 | self.body = LXET.SubElement(self.html, H_BODY) 484 | self.div = LXET.SubElement(self.body, H_DIV) 485 | self.div.attrib["class"] = "bs-example table-responsive" 486 | self.table = LXET.SubElement(self.div, H_TABLE) 487 | self.table.attrib["class"] = "table table-striped table-bordered table-hover" 488 | self.table.attrib["id"] = RESULTS 489 | self.thead = LXET.SubElement(self.table, H_THEAD) 490 | self.tbody = LXET.SubElement(self.table, H_TBODY) 491 | 492 | def add_column_heads(self, colheads): 493 | """ 494 | 495 | :param colheads: 496 | 497 | """ 498 | if colheads is not None: 499 | self.thead_tr = LXET.SubElement(self.thead, H_TR) 500 | for colhead in colheads: 501 | th = LXET.SubElement(self.thead_tr, H_TH) 502 | th.text = str(colhead) 503 | 504 | def add_rows(self, rowdata): 505 | """ 506 | 507 | :param rowdata: 508 | 509 | """ 510 | if rowdata is not None: 511 | for row in rowdata: 512 | self.add_row_old(row) 513 | 514 | def add_row_old(self, row: [str]): 515 | """creates new in 516 | creates
articles | bibliography | dic:country | word:frequencies
child elements of row containing string values 517 | 518 | :param row: list of str 519 | :param row: [str]: 520 | 521 | """ 522 | if row is not None: 523 | tr = LXET.SubElement(self.tbody, H_TR) 524 | for val in row: 525 | td = LXET.SubElement(tr, H_TD) 526 | td.text = val 527 | # print("td", td.text) 528 | 529 | def make_row(self): 530 | """:return: row element""" 531 | return LXET.SubElement(self.tbody, H_TR) 532 | 533 | def append_contained_text(self, parent, tag, text): 534 | """create element and add text child 535 | 536 | :param parent: 537 | :param tag: 538 | :param text: 539 | 540 | """ 541 | subelem = LXET.SubElement(parent, tag) 542 | subelem.text = text 543 | return subelem 544 | 545 | def write_full_data_tables(self, output_dir: str) -> None: 546 | """ 547 | 548 | :param output_dir: str: 549 | 550 | """ 551 | if not os.path.exists(output_dir): 552 | os.makedirs(output_dir) 553 | data_table_file = os.path.join(output_dir, "full_data_table.html") 554 | with open(data_table_file, "w") as f: 555 | text = bytes.decode(LXET.tostring(self.html)) 556 | f.write(text) 557 | print("WROTE", data_table_file) 558 | 559 | def __str__(self): 560 | # s = self.html.text 561 | # print("s", s) 562 | # return s 563 | # ic("ichtml", self.html) 564 | htmltext = LXET.tostring(self.html) 565 | print("SELF", htmltext) 566 | return htmltext 567 | 568 | 569 | class Web: 570 | """ """ 571 | def __init__(self): 572 | import tkinter as tk 573 | root = tk.Tk() 574 | site = "http://google.com" 575 | self.display_html(root, site) 576 | root.mainloop() 577 | 578 | @classmethod 579 | def display_html(cls, master, site): 580 | """ 581 | 582 | :param master: 583 | :param site: 584 | 585 | """ 586 | import tkinterweb 587 | frame = tkinterweb.HtmlFrame(master) 588 | frame.load_website(site) 589 | frame.pack(fill="both", expand=True) 590 | 591 | @classmethod 592 | def tkinterweb_demo(cls): 593 | """ """ 594 | from tkinterweb import Demo 595 | Demo() 596 | 597 | 598 | def main(): 599 | """ """ 600 | 601 | XmlLib().test_recurse_sections() # recursively list sections 602 | 603 | # test_data_table() 604 | # test_xml() 605 | 606 | # web = Web() 607 | # Web.tkinterweb_demo() 608 | 609 | 610 | def test_xml(): 611 | """ """ 612 | xml_string = "foo and with bar" 613 | print(XmlLib.remove_all_tags(xml_string)) 614 | 615 | 616 | def test_data_table(): 617 | """ """ 618 | import pprint 619 | data_table = DataTable("test") 620 | data_table.add_column_heads(["a", "b", "c"]) 621 | data_table.add_row_old(["a1", "b1", "c1"]) 622 | data_table.add_row_old(["a2", "b2", "c2"]) 623 | data_table.add_row_old(["a3", "b3", "c3"]) 624 | data_table.add_row_old(["a4", "b4", "c4"]) 625 | html = LXET.tostring(data_table.html).decode("UTF-8") 626 | HOME = os.path.expanduser("~") 627 | with open(os.path.join(HOME, "junk_html.html"), "w") as f: 628 | f.write(html) 629 | pprint.pprint(html) 630 | 631 | 632 | if __name__ == "__main__": 633 | print("running file_lib main") 634 | main() 635 | else: 636 | # print("running file_lib main anyway") 637 | # main() 638 | pass 639 | 640 | # Credits: Peter Murray-Rust, py4ami (https://github.com/petermr/pyami/blob/main/py4ami/file_lib.py) -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 
6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx_rtd_theme 2 | myst-parser -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | import os 9 | import sys 10 | import sphinx_rtd_theme 11 | sys.path.insert(0, os.path.abspath('..')) 12 | sys.path.append(os.path.abspath('../..')) 13 | project = 'Docanalysis' 14 | copyright = '2022, Ayush Garg, Shweata N Hegde' 15 | author = 'Ayush Garg, Shweata N Hegde' 16 | release = '0.2.4' 17 | 18 | # -- General configuration --------------------------------------------------- 19 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 20 | 21 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'myst_parser'] 22 | 23 | templates_path = ['_templates'] 24 | exclude_patterns = [] 25 | napoleon_google_docstring = True 26 | 27 | 28 | # -- Options for HTML output ------------------------------------------------- 29 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 30 | 31 | html_theme = 'sphinx_rtd_theme' 32 | html_static_path = ['_static'] 33 | -------------------------------------------------------------------------------- /docs/source/docanalysis.rst: -------------------------------------------------------------------------------- 1 | Docanalysis module 2 | ================================== 3 | 4 | .. automodule:: docanalysis.docanalysis 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/entity_extraction.rst: -------------------------------------------------------------------------------- 1 | Entity extraction module 2 | ================================== 3 | 4 | .. automodule:: docanalysis.entity_extraction 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | User Documentation 2 | ============================== 3 | 4 | .. include:: ../../README.md 5 | :parser: myst_parser.sphinx_ 6 | 7 | .. toctree:: 8 | :maxdepth: 1 9 | :hidden: 10 | :caption: User Documentation: 11 | 12 | user_documentation 13 | 14 | 15 | .. 
toctree:: 16 | :maxdepth: 7 17 | :caption: Core modules: 18 | 19 | docanalysis 20 | entity_extraction -------------------------------------------------------------------------------- /notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Notebooks 2 | 3 | A general resource for contributed code 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | abbreviations>=0.2.5 2 | beautifulsoup4>=4.10.0 3 | braceexpand>=0.1.7 4 | coloredlogs>=15.0.1 5 | ConfigArgParse>=1.5.3 6 | lxml>=4.7.1 7 | nltk>=3.6.7 8 | pandas>=1.3.4 9 | pygetpapers 10 | pytest>=6.2.5 11 | setuptools>=60.3.1 12 | spacy>=3.0.7 13 | tkinterweb>=3.10.7 14 | tqdm>=4.62.3 15 | yake>=0.4.8 16 | sphinx_rtd_theme>=1.0.0 17 | -------------------------------------------------------------------------------- /resources/approval_number_100.csv: -------------------------------------------------------------------------------- 1 | ,file_path,paragraph,sentence,entities,labels,position_start,position_end,has_terms,weight 2 | 33,C:\Users\shweata\approval_number_100\PMC7833043\sections\1_body\3_ethics_approval\1_p.xml,"The study was approved by the King Khalid University Ethics Committee (approval number: ECM#2020-183–(HAPO-06-B-001), and no identifying personal information (e.g. name, age) or other sensitive data were collected.","The study was approved by the King Khalid University Ethics Committee (approval number: ECM#2020-183–(HAPO-06-B-001), and no identifying personal information (e.g.",the King Khalid University Ethics Committee,ORG,26,69,approval number,1 3 | 63,C:\Users\shweata\approval_number_100\PMC7833043\sections\2_back\2_ethical_approval\1_p.xml,"The Ethical Committee of the Scientific Research, King Khalid University approved the study (approval number: ECM#2020-183–(HAPO-06-B-001) to use scores and absence rates, with no personal information of students disclosed.","The Ethical Committee of the Scientific Research, King Khalid University approved the study (approval number: ECM#2020-183–(HAPO-06-B-001) to use scores and absence rates, with no personal information of students disclosed.",The Ethical Committee of the Scientific Research,ORG,0,48,approval number,1 4 | 96,C:\Users\shweata\approval_number_100\PMC8023627\sections\2_back\2_ethics_approval_and_conse\1_p.xml,"The data were obtained from the Saudi Ministry of Health and World Health Organization records and the study conducted under the approval of the Regional Directorate of Primary Health according to ethical standards with the maintenance of anonymity of each patient. Thus, all the data of patients was recorded without patients details, it was not necessary to obtain the personal consent of the study participants. The study was ethically approved by the institutional review board of the Princess Nourah Bint Abdulrahman University (IRB Approval Number: 20–0217).",The study was ethically approved by the institutional review board of the Princess Nourah Bint Abdulrahman University (IRB Approval Number: 20–0217).,Nourah,GPE,83,89,approval number,1 5 | 169,C:\Users\shweata\approval_number_100\PMC8185902\sections\1_body\1_p.xml,"Following the publication of the above article, the authors have realized that, in the Declarations section on p. 
10, they presented an incorrect approval number from the Ethics Committee in question; the statement here should have read as follows: “The present study was approved by the Ethics Committee of the Affiliated Hospital of Shaoxing University (approval no. 2021003). All patients provided written informed consent.”","Following the publication of the above article, the authors have realized that, in the Declarations section on p. 10, they presented an incorrect approval number from the Ethics Committee in question; the statement here should have read as follows: “The present study was approved by the Ethics Committee of the Affiliated Hospital of Shaoxing University (approval no.","the Ethics Committee, the Ethics Committee of the Affiliated Hospital of Shaoxing University","ORG, ORG","167, 284","187, 354",approval number,1 6 | 1252,C:\Users\shweata\approval_number_100\PMC8358494\sections\1_body\1_methods\1_patients_and_study_design\1_p.xml,"The participants enrolled in this retrospective study were outpatients at Suzuki Clinic. All participants received the Diagnostic and Statistical Manual of Mental Disorders, Fifth Edition diagnosis for insomnia, and were prescribed either lemborexant or benzodiazepine hypnotics. The observation period was from July 2020 (when introduced for clinical use) to December 2020 for lemborexant and benzodiazepine hypnotics. Furthermore, there were no criteria for exclusion of research subjects in this study. This study was approved by the ethics committee of Fukui Kinen Hospital. The approval date and approval number of the ethics committee of Fukui Kinen Hospital were 21 January 2021 and 2-017, respectively. Instead of omitting the informed consent for the retrospective cohort study, information about the study was posted in the hospital, and opt-out recruitment was conducted. Insomnia was assessed using the Japanese version of Athens Insomnia Scale (AIS). 5 Efficacy outcome assessment was from the Clinical Global Impressions-Improvement (CGI-I) scale. 6","The approval date and approval number of the ethics committee of Fukui Kinen Hospital were 21 January 2021 and 2-017, respectively.",Fukui Kinen Hospital,ORG,65,85,approval number,1 7 | 2388,C:\Users\shweata\approval_number_100\PMC8431654\sections\2_body\3_4__material_and_methods\1_4_1__blood_samples_used_i\1_p.xml,"Blood samples of COVID-19 patients were taken within the first two weeks after the detection of the SARS-CoV-2 infection of patients at University Hospital of RWTH Aachen. All patient samples were taken after written and informed consent according to the guidelines and specific approval of the study by the local ethics committee (Ethic approval number EK 080/20 for the Covid-19 Aachen study named COVAS; Ethics committee of RWTH Aachen University, University Hospital Aachen, Pauwelsstrasse 30, 52074 Aachen, Germany) and collected into RWTH cBMB, the central biobank of the medical faculty of RWTH Aachen University (Ethic approval number EK 206/09). Blood samples of healthy donors were taken after written and informed consent according to the guidelines and approval of the study by the local ethics committee (EK 041/15; Ethics committee of RWTH Aachen University, University Hospital Aachen, Pauwelsstrasse 30, 52074 Aachen, Germany). These control samples were taken in the years 2018 and 2019, before the initial SARS-CoV-2 outbreak. 
All venous blood samples were anticoagulated with EDTA and cryopreserved at -80 °C until further analysis.","All patient samples were taken after written and informed consent according to the guidelines and specific approval of the study by the local ethics committee (Ethic approval number EK 080/20 for the Covid-19 Aachen study named COVAS; Ethics committee of RWTH Aachen University, University Hospital Aachen, Pauwelsstrasse 30, 52074 Aachen, Germany) and collected into RWTH cBMB, the central biobank of the medical faculty of RWTH Aachen University (Ethic approval number EK 206/09).","Ethics, RWTH Aachen University, University Hospital Aachen, Pauwelsstrasse, Aachen, Germany, RWTH Aachen University (Ethic","ORG, ORG, GPE, GPE, GPE, ORG","235, 255, 307, 332, 340, 425","241, 305, 321, 338, 347, 454",approval number,1 8 | 2766,C:\Users\shweata\approval_number_100\PMC8442262\sections\2_body\1_methods\9_ethics_approval_and_conse\1_p.xml,"Ethics approval has been approved by the Research Ethics Board of Health (REBH), Ministry of Health, Royal Government of Bhutan vide approval number Ref. No. REBH/Approval/2019/067. An informed consent has been obtained from the individual participants for the use of photographic materials while the use of data and consent to participate had been obtained from the legal guardian (Principal of the school). All methods were carried out in accordance with relevant guidelines and regulations as enshrined in Helsinki Declarations 1964.","Ethics approval has been approved by the Research Ethics Board of Health (REBH), Ministry of Health, Royal Government of Bhutan vide approval number Ref.","the Research Ethics Board of Health, REBH, Ministry of Health, Royal Government, Ref","ORG, ORG, ORG, ORG, ORG","37, 74, 81, 101, 150","72, 78, 99, 117, 153",approval number,1 9 | 3047,C:\Users\shweata\approval_number_100\PMC8449472\sections\2_body\1_methods\1_ethical_approval\1_p.xml,"This study was approved by the Institutional Review Board of Kyungpook National University Chilgok Hospital, Daegu, Korea (approval number: 2020-04-029). As this was a retrospective study, the need for obtaining informed consent from patients was waived by the Institutional Review Board.","This study was approved by the Institutional Review Board of Kyungpook National University Chilgok Hospital, Daegu, Korea (approval number: 2020-04-029).","the Institutional Review Board, Kyungpook National University Chilgok Hospital, Daegu, Korea","ORG, ORG, GPE, GPE","27, 61, 109, 116","57, 107, 114, 121",approval number,1 10 | 3145,C:\Users\shweata\approval_number_100\PMC8454101\sections\2_body\1_methods\1_patients\1_p.xml,Conjunctival samples were obtained from patients undergoing retinal detachment surgery (n = 1) or conjunctival melanoma resection (n = 1) at the Eye Center of the University of Freiburg. 
Ethics approval was granted from Ethics Committee of the Albert-Ludwigs-University Freiburg (approval number 481/19).,Ethics approval was granted from Ethics Committee of the Albert-Ludwigs-University Freiburg (approval number 481/19).,Ethics Committee,ORG,33,49,approval number,1 11 | 3351,C:\Users\shweata\approval_number_100\PMC8462059\sections\2_body\3_ethical_approval\1_p.xml,This study was approved by the institutional review board and ethics committee of the Japanese Red Cross Ise Hospital (approval number: ER2020‐26).,This study was approved by the institutional review board and ethics committee of the Japanese Red Cross Ise Hospital (approval number: ER2020‐26).,the Japanese Red Cross Ise Hospital,ORG,82,117,approval number,1 12 | 3470,C:\Users\shweata\approval_number_100\PMC8468265\sections\2_body\3_4__materials_and_methods\4_4_4__ethical_approval\1_p.xml,"The ASB study was conducted in accordance with the Declaration of Helsinki. The ASB study was approved by the research ethics committee of the Academic Medical Centre, Amsterdam, the Netherlands (approval number MEC 2011-073, date of approval 29-4-2011) and by the institutional review board of each participating hospital. The national perinatal registry in the Netherlands (PERINED) approved linkage of the ASB cohort with their database to further complete missing data on outcomes (approval number 13.64, date of approval 17-12-2013).","The ASB study was approved by the research ethics committee of the Academic Medical Centre, Amsterdam, the Netherlands (approval number MEC 2011-073, date of approval 29-4-2011) and by the institutional review board of each participating hospital.","ASB, the Academic Medical Centre, Amsterdam, Netherlands","ORG, ORG, GPE, GPE","4, 63, 92, 107","7, 90, 101, 118",approval number,1 13 | 3471,C:\Users\shweata\approval_number_100\PMC8468265\sections\2_body\3_4__materials_and_methods\4_4_4__ethical_approval\1_p.xml,"The ASB study was conducted in accordance with the Declaration of Helsinki. The ASB study was approved by the research ethics committee of the Academic Medical Centre, Amsterdam, the Netherlands (approval number MEC 2011-073, date of approval 29-4-2011) and by the institutional review board of each participating hospital. The national perinatal registry in the Netherlands (PERINED) approved linkage of the ASB cohort with their database to further complete missing data on outcomes (approval number 13.64, date of approval 17-12-2013).","The national perinatal registry in the Netherlands (PERINED) approved linkage of the ASB cohort with their database to further complete missing data on outcomes (approval number 13.64, date of approval 17-12-2013).","Netherlands, ASB","GPE, ORG","39, 85","50, 88",approval number,1 14 | 3734,C:\Users\shweata\approval_number_100\PMC8474954\sections\2_body\1_methods\8_in_vivo_metastasis_assay\1_p.xml,"To test if EMMPRIN plays an important role in osteosarcoma metastasis in vivo, the osteosarcoma cell line 143B was injected in the tail vein of BALB/c mice. The mice were sacrificed at 8 weeks post-injection. Four-week-old male BALB/c nude mice were obtained from Central Lab. Animal Inc. (Seoul, Korea) and maintained under standard conditions until the experiments were performed. The animals were maintained at the animal facility of the Seoul National University Hospital under guidelines prior to the grouping and experiments. 
A total of 15 BALB/c nude mice were randomized into 3 groups: 1 normal, 2 143 cells transfected with an ad mock shRNA vector (Control), and 3 143 cells transfected with the ad EMMPRIN shRNA vector. Experiments were approved by the Institutional Animal Care and Use Committee of Seoul National University Hospital (approval number 10–0075). One anti-EMMPRIN sequence (5-GTCGTCAGAACACATCAAC-3) or a scrambled sequence was inserted into the plasmid vector pAdEasy-1 (Addgene). They were designated as pAdEasy-1-shRNA and pAdEasy-1 scramble shRNA, respectively. Osteosarcoma cell line 143B was transfected with EMMPRIN shRNA. EMMPRIN shRNA transfected 143B cells were harvested with trypsin, and then resuspended in serum-free RPMI, and injected in the tail vein (1 × 10 5/0.2 mL) of 5 nude mice per group. Health of the animals was monitored daily, and body weights were measured weekly throughout the study period. Anesthesia was performed with isoflurane inhalation as well as ketamine (10 mg/kg) and medetomidine (0.1 mg/kg) injection. All surgical procedures were performed under sterile conditions. Secondary euthanasia method for cervical dislocation was also performed. The mice were sacrificed by CO 2 inhalation at 8 weeks post-injection. Harvested tissues were preserved in Bouin’s fixative, embedded in paraffin, sectioned (4 μm), and stained with hematoxylin and eosin (H&E). Examination of the histological sections was performed using Nikon Eclipse Ci microscope (Nikon Corp., Tokyo, Japan) by a digital camera (Nikon digital sight, DS-2Mv) and the automatic exposure and iSolution Lite software for microscopic images. The tumor lengths and widths were measured by a perpendicular tumor diameter, with the tumor volume being calculated using the following formula: width 2 × length/2 20.",Experiments were approved by the Institutional Animal Care and Use Committee of Seoul National University Hospital (approval number 10–0075).,the Institutional Animal Care and Use Committee of Seoul National University Hospital,ORG,29,114,approval number,1 15 | 3821,C:\Users\shweata\approval_number_100\PMC8475677\sections\2_body\3_methods\3_institutional_review_boar\1_p.xml,"The study was approved by the institutional review boards of Kyoto University Graduate School of Medicine (approval number: E2311), Shiga General Hospital (approval number: 20141120‐01), Tenri Hospital (approval number: 640), Kobe City Medical Center General Hospital (approval number: 14094), Hyogo Prefectural Amagasaki General Medical Center (approval number: Rinri 26‐32), National Hospital Organization Kyoto Medical Center (approval number: 14‐080), Mitsubishi Kyoto Hospital (approved 11/12/2014), Okamoto Memorial Hospital (approval number: 201503), Japanese Red Cross Otsu Hospital (approval number: 318), Hikone Municipal Hospital (approval number: 26‐17), Japanese Red Cross Osaka Hospital (approval number: 392), Shimabara Hospital (approval number: E2311), Kishiwada City Hospital (approval number: 12), Kansai Electric Power Hospital (approval number: 26‐59), Shizuoka General Hospital (approval number: Rin14‐11‐47), Kurashiki Central Hospital (approval number: 1719), Kokura Memorial Hospital (approval number: 14111202), Kitano Hospital (approval number: P14‐11‐012), and Japanese Red Cross Wakayama Medical Center (approval number: 328).","The study was approved by the institutional review boards of Kyoto University Graduate School of Medicine (approval number: E2311), Shiga General Hospital (approval number: 20141120‐01), Tenri Hospital (approval 
number: 640), Kobe City Medical Center General Hospital (approval number: 14094), Hyogo Prefectural Amagasaki General Medical Center (approval number: Rinri 26‐32), National Hospital Organization Kyoto Medical Center (approval number: 14‐080), Mitsubishi Kyoto Hospital (approved 11/12/2014), Okamoto Memorial Hospital (approval number: 201503), Japanese Red Cross Otsu Hospital (approval number: 318), Hikone Municipal Hospital (approval number: 26‐17), Japanese Red Cross Osaka Hospital (approval number: 392), Shimabara Hospital (approval number: E2311), Kishiwada City Hospital (approval number: 12), Kansai Electric Power Hospital (approval number: 26‐59), Shizuoka General Hospital (approval number: Rin14‐11‐47), Kurashiki Central Hospital (approval number: 1719), Kokura Memorial Hospital (approval number: 14111202), Kitano Hospital (approval number: P14‐11‐012), and Japanese Red Cross Wakayama Medical Center (approval number: 328).","Kyoto University Graduate School of Medicine, Shiga General Hospital, Tenri Hospital, Kobe City Medical Center General Hospital, Hyogo Prefectural Amagasaki General Medical Center, National Hospital Organization Kyoto Medical Center, Mitsubishi Kyoto Hospital, Okamoto Memorial Hospital, Japanese Red Cross Otsu Hospital, Hikone Municipal Hospital, Japanese Red Cross Osaka Hospital, Shimabara Hospital, Kishiwada City Hospital, Kansai Electric Power Hospital, Shizuoka General Hospital, Kurashiki Central Hospital, Kokura Memorial Hospital, Kitano Hospital, Japanese Red Cross","ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG","61, 132, 187, 226, 294, 377, 456, 505, 558, 615, 667, 725, 770, 817, 874, 932, 984, 1038, 1089","105, 154, 201, 267, 344, 428, 481, 530, 590, 640, 700, 743, 793, 847, 899, 958, 1008, 1053, 1107",approval number,1 16 | 4328,C:\Users\shweata\approval_number_100\PMC8482572\sections\2_body\1_methods\4_ethics_approval_and_infor\1_p.xml,The study protocol was approved by the ethical committees of Kawasaki University of Medical Welfare (Approval number: 18-102) and Chiang Mai University (Approval number: NUR-2562-06120). 
All participants provided written informed consent to participate in the study.,The study protocol was approved by the ethical committees of Kawasaki University of Medical Welfare (Approval number: 18-102) and Chiang Mai University (Approval number: NUR-2562-06120).,"Kawasaki University of Medical Welfare (Approval, Chiang Mai University","ORG, ORG","61, 130","109, 151",approval number,1 17 | -------------------------------------------------------------------------------- /resources/demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | from docanalysis import DocAnalysis 3 | from pathlib import Path 4 | 5 | doc_analysis = DocAnalysis() 6 | ETHICS_DICTIONARY_DIR = Path(os.getcwd(), "ethics_dictionary") 7 | CORPUS_DIR = Path(os.getcwd(), "corpus") 8 | 9 | 10 | def create_phrases_file(phrases_dir, phrases_file, dictionary_dir=ETHICS_DICTIONARY_DIR): 11 | global terms_xml_path 12 | terms_xml_dir = Path(dictionary_dir, phrases_dir) 13 | if not terms_xml_dir.exists(): 14 | terms_xml_dir.mkdir() 15 | terms_xml_path = Path(terms_xml_dir, phrases_file) 16 | return terms_xml_path 17 | 18 | 19 | def get_or_create_corpus_dir(subdir_name, corpus_dir=CORPUS_DIR): 20 | """get specific corpus directory, creating if necessary 21 | 22 | :param corpus_dir: directory containing corpora 23 | :param subdir_name: specific corpus to get or create 24 | :return: directoyr of specific corpus""" 25 | assert corpus_dir.exists(), "directory of corpora must exist" 26 | subdir = Path(corpus_dir, subdir_name) 27 | if not subdir.exists(): 28 | subdir.mkdir() 29 | return subdir 30 | 31 | 32 | def run_analysis(corpus_path, phrases_file, query=None, hits=30): 33 | dict_for_entities = doc_analysis.extract_entities_from_papers( 34 | corpus_path=corpus_path, 35 | terms_xml_path=terms_xml_path, 36 | query=query, 37 | hits=hits, 38 | make_project=True 39 | ) 40 | create_and_write_list_for_fields(dict_for_entities, "ORG", "org.text") 41 | create_and_write_list_for_fields(dict_for_entities, "GPE", "GPE.text") 42 | 43 | 44 | def create_and_write_list_for_fields(dict_for_entities, field, out_filename): 45 | list_with_orgs = doc_analysis.extract_particular_fields( 46 | dict_for_entities, field) 47 | with open(out_filename, 'w') as f: 48 | f.write(str(list_with_orgs)) 49 | 50 | 51 | ETHICS = "ethics" 52 | TERPENES = "terpenes" 53 | options = { 54 | ETHICS, 55 | TERPENES 56 | } 57 | 58 | if ETHICS in options: 59 | corpus_dir = get_or_create_corpus_dir("e_cancer_clinical_trial_50") 60 | phrases_file = create_phrases_file("ethics_key_phrases", "ethics_key_phrases.xml", ) 61 | run_analysis( 62 | corpus_dir, 63 | phrases_file, 64 | query="ethics" 65 | ) 66 | 67 | if TERPENES in options: 68 | run_analysis( 69 | get_or_create_corpus_dir(TERPENES), 70 | create_phrases_file("terpenes_key_phrases", "terpenes_key_phrases.xml", dictionary_dir="terpenes_dictionary"), 71 | query=TERPENES, 72 | hits = 20, 73 | ) 74 | 75 | -------------------------------------------------------------------------------- /resources/docanalyis_architecture_diagram.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petermr/docanalysis/eb8a3c13f9491b41f252a363953a976406f964b6/resources/docanalyis_architecture_diagram.PNG -------------------------------------------------------------------------------- /resources/entities_country.csv: -------------------------------------------------------------------------------- 1 | 
,file_path,paragraph,sentence,section,entities,labels,position_start,position_end,abbreviations,abbreviations_longform,abbreviation_start,abbreviation_end,has_terms,weight_terms 2 | 1,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8457177\sections\1_front\1_article-meta\7_aff.xml, 1 Institute for Organic Chemistry and BMWZ Leibniz Universität Hannover Schneiderberg 38 30167 Hannover Germany , 1 Institute for Organic Chemistry and BMWZ Leibniz Universität Hannover Schneiderberg 38 30167 Hannover Germany,AFF,Germany,GPE,122,129,,,,,Germany,1 3 | 3,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8457177\sections\1_front\1_article-meta\8_aff.xml, 2 Structure and Function of Proteins Helmholtz Centre for Infection Research Inhoffenstr. 7 38124 Braunschweig Germany ,7 38124 Braunschweig Germany,AFF,Germany,GPE,27,34,,,,,Germany,1 4 | 5,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8457177\sections\1_front\1_article-meta\9_aff.xml," 3 Institute for Biochemistry, Biotechnology and Bioinformatics Technische Universität Braunschweig Spielmannstr. 7 38106 Braunschweig Germany ",7 38106 Braunschweig Germany,AFF,Germany,GPE,27,34,,,,,Germany,1 5 | 6,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8465594\sections\1_front\1_article-meta\6_aff.xml,"1 Department of Chemistry of Natural Compounds, University of Chemistry and Technology, Technicka 5, 166 28 Prague, Czech Republic","1 Department of Chemistry of Natural Compounds, University of Chemistry and Technology, Technicka 5, 166 28 Prague, Czech Republic",AFF,"Prague, Czech Republic","GPE, GPE","108, 116","114, 130",,,,,Czech Republic,1 6 | 7,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8465594\sections\1_front\1_article-meta\7_aff.xml,"2 Institute of Bioorganic Chemistry, National Academy of Sciences of Belarus, 5/2 Academician V. F. Kuprevich Street, BY-220141 Minsk, Belarus; khripach@iboch.by","2 Institute of Bioorganic Chemistry, National Academy of Sciences of Belarus, 5/2 Academician V. F. 
Kuprevich Street, BY-220141 Minsk, Belarus; khripach@iboch.by",AFF,"BY-220141 Minsk, Belarus","GPE, GPE","118, 135","133, 142",,,,,Belarus,1 7 | 11,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8541587\sections\1_front\1_article-meta\7_aff.xml,"Institut National de Recherche pour L’agriculture, L’alimentation et L’environnement (INRAE), 13182 Aix-en-Provence, France; bastien.romero@inrae.fr","Institut National de Recherche pour L’agriculture, L’alimentation et L’environnement (INRAE), 13182 Aix-en-Provence, France; bastien.romero@inrae.fr",AFF,France,GPE,117,123,,,,,France,1 8 | 13,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8625326\sections\1_front\1_article-meta\7_aff.xml,"School of Forestry and Resource Conservation, National Taiwan University, Taipei 10617, Taiwan; mary2234@gmail.com (L.-T.M.); bad8016479@gmail.com (P.-L.L.); timmy304681@gmail.com (Y.-T.C.); jimmy81513@hotmail.com (T.-F.S.)","School of Forestry and Resource Conservation, National Taiwan University, Taipei 10617, Taiwan; mary2234@gmail.com (L.-T.M.",AFF,"Taipei, Taiwan","GPE, GPE","74, 88","80, 94",,,,,Taiwan,1 9 | 17,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8625850\sections\1_front\1_article-meta\7_aff.xml,"1 INSERM U1070 “Pharmacology of Anti-Infective Agents”, 1 rue Georges Bonnet, Pôle Biologie Santé, 86022 Poitiers, France; chantal.valcourtsainz@gmail.com (C.V.); julien.buyck@univ-poitiers.fr (J.M.B.); nicolas.gregoire@univ-poitiers.fr (N.G.); william.couet@univ-poitiers.fr (W.C.); sandrine.marchand@univ-poitiers.fr (S.M.)","1 INSERM U1070 “Pharmacology of Anti-Infective Agents”, 1 rue Georges Bonnet, Pôle Biologie Santé, 86022 Poitiers, France; chantal.valcourtsainz@gmail.com (C.V.); julien.buyck@univ-poitiers.fr (J.M.B.",AFF,"France, C.V.","GPE, GPE","115, 158","121, 162",,,,,France,1 10 | 20,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8625850\sections\1_front\1_article-meta\8_aff.xml,"2 UFR Médecine-Pharmacie Université de Poitiers, 6 rue de la Milétrie, TSA 51115, 86073 Poitiers, France","2 UFR Médecine-Pharmacie Université de Poitiers, 6 rue de la Milétrie, TSA 51115, 86073 Poitiers, France",AFF,France,GPE,98,104,,,,,France,1 11 | 21,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8625850\sections\1_front\1_article-meta\9_aff.xml,"3 Laboratoire de Toxicologie-Pharmacocinétique, CHU de Poitiers, 2 rue de la Miletrie, 86021 Poitiers, France","3 Laboratoire de Toxicologie-Pharmacocinétique, CHU de Poitiers, 2 rue de la Miletrie, 86021 Poitiers, France",AFF,France,GPE,103,109,,,,,France,1 12 | 23,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8701039\sections\1_front\1_article-meta\7_aff.xml,"1 Centrale Marseille, CNRS, iSm2 Marseille, ISM2 UMR 7313, Aix-Marseille Université, Av. 
Escadrille Normandie-Niemen, 13013 Marseille, France; julie.couillaud.13990@gmail.com (J.C.); letitia.LEYDET@univ-amu.fr (L.L.); katia.duquesne@univ-amu.fr (K.D.)","Escadrille Normandie-Niemen, 13013 Marseille, France; julie.couillaud.13990@gmail.com (J.C.); letitia.LEYDET@univ-amu.fr (L.L.",AFF,"Marseille, France, J.C.","GPE, GPE, GPE","35, 46, 89","44, 52, 93",,,,,France,1 13 | 25,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8701039\sections\1_front\1_article-meta\8_aff.xml,"2 Systems and Synthetic Biology Division, Department of Biology and Biological Engineering, Chalmers University of Technology, 41296 Gothenburg, Sweden","2 Systems and Synthetic Biology Division, Department of Biology and Biological Engineering, Chalmers University of Technology, 41296 Gothenburg, Sweden",AFF,"Gothenburg, Sweden","GPE, GPE","133, 145","143, 151",,,,,Sweden,1 14 | 30,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8733304\sections\1_front\1_article-meta\5_aff.xml,"1 Egyptian Deserts Gene Bank, North Sinai Research Station, Department of Genetic Resources, Desert Research Center , Cairo , Egypt","1 Egyptian Deserts Gene Bank, North Sinai Research Station, Department of Genetic Resources, Desert Research Center , Cairo , Egypt",AFF,"Cairo, Egypt","GPE, GPE","119, 128","124, 133",,,,,Egypt,1 15 | 33,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8733304\sections\1_front\1_article-meta\8_aff.xml,"4 Department of Biology, Faculty of Science, University of Tabuk , Tabuk , Saudi Arabia","4 Department of Biology, Faculty of Science, University of Tabuk , Tabuk , Saudi Arabia",AFF,Saudi Arabia,GPE,77,89,,,,,Saudi Arabia,1 16 | 34,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8733304\sections\1_front\1_article-meta\9_aff.xml,"5 Department of Plant Agricultural, Faculty of Agriculture Science, Al-Azhar University , Assiut , Egypt","5 Department of Plant Agricultural, Faculty of Agriculture Science, Al-Azhar University , Assiut , Egypt",AFF,Egypt,GPE,101,106,,,,,Egypt,1 17 | 35,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\10_aff.xml,"4 ICREA—Catalan Institution for Research and Advanced Studies, 08010 Barcelona, Spain; jrintjema@iciq.es","4 ICREA—Catalan Institution for Research and Advanced Studies, 08010 Barcelona, Spain; jrintjema@iciq.es",AFF,Spain,GPE,80,85,,,,,Spain,1 18 | 36,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\11_aff.xml,"5 Institute of Chemical Research of Catalonia (ICIQ), Barcelona Institute of Science and Technology, 43007 Tarragona, Spain; fbravo@iciq.es (F.B.); akleij@iciq.es (A.W.K.)","5 Institute of Chemical Research of Catalonia (ICIQ), Barcelona Institute of Science and Technology, 43007 Tarragona, Spain; fbravo@iciq.es (F.B.",AFF,Spain,GPE,118,123,,,,,Spain,1 19 | 38,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\12_aff.xml,"6 Institute for Bioengineering of Catalonia, Baldiri Reixac 10-12, 08028 Barcelona, Spain","6 Institute for Bioengineering of Catalonia, Baldiri Reixac 10-12, 08028 Barcelona, Spain",AFF,"Barcelona, Spain","GPE, GPE","73, 84","82, 89",,,,,Spain,1 20 | 39,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\7_aff.xml,"1 Departament d’Enginyeria Química, EEBE, Universitat Politècnica de Catalunya, 08019 Barcelona, Spain; reza.zeinali@upc.edu (R.Z.); lourdes.franco@upc.edu (L.F.); carlos.aleman@upc.edu (C.A.)","1 Departament d’Enginyeria Química, EEBE, Universitat Politècnica de 
Catalunya, 08019 Barcelona, Spain; reza.zeinali@upc.edu (R.Z.",AFF,Spain,GPE,97,102,,,,,Spain,1 21 | 41,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\8_aff.xml,"2 Center for Research in Nano-Engineering, CrNE, Universitat Politècnica de Catalunya, C. Eduard Maristany, 08019 Barcelona, Spain","2 Center for Research in Nano-Engineering, CrNE, Universitat Politècnica de Catalunya, C. Eduard Maristany, 08019 Barcelona, Spain",AFF,Spain,GPE,125,130,,,,,Spain,1 22 | 42,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\9_aff.xml,"3 ALBA Synchrotron Light Source, Carrer de la Llum, 2-26, Cerdanyola del Vallès, 08290 Barcelona, Spain; iyousef@cells.es","3 ALBA Synchrotron Light Source, Carrer de la Llum, 2-26, Cerdanyola del Vallès, 08290 Barcelona, Spain; iyousef@cells.es",AFF,"Barcelona, Spain","GPE, GPE","87, 98","96, 103",,,,,Spain,1 23 | 45,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8771452\sections\1_front\1_article-meta\10_aff.xml,"4 School of Pure and Applied Sciences, Karatina University , Karatina, Kenya","4 School of Pure and Applied Sciences, Karatina University , Karatina, Kenya",AFF,"Karatina, Kenya","GPE, GPE","62, 73","70, 78",,,,,Kenya,1 24 | 61,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8778794\sections\1_front\1_article-meta\7_aff.xml,"1 Instituto Botánico, Departamento de Ciencia y Tecnología Agroforestal y Genética, Universidad de Castilla-La Mancha, Campus Universitario s/n, 02071 Albacete, Spain; maria.mondejar3@alu.uclm.es (M.M.-L.); albertojose.lopez@uclm.es (A.J.L.-J.); Oussama.ahrazem@uclm.es (O.A.); MariaLourdes.gomez@uclm.es (L.G.-G.)","1 Instituto Botánico, Departamento de Ciencia y Tecnología Agroforestal y Genética, Universidad de Castilla-La Mancha, Campus Universitario s/n, 02071 Albacete, Spain; maria.mondejar3@alu.uclm.es (M.M.-L.); albertojose.lopez@uclm.es (A.J.L.-J.",AFF,"Spain, M.M.-L.","GPE, GPE","161, 199","166, 206",,,,,Spain,1 25 | 64,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8778794\sections\1_front\1_article-meta\8_aff.xml,"2 Departamento de Química Inorgánica, Orgánica y Bioquímica, Facultad de Farmacia, Universidad de Castilla-La Mancha, C/José María Sánchez Ibáñez s/n, 02008 Albacete, Spain; Joaquinc.garcia@uclm.es","2 Departamento de Química Inorgánica, Orgánica y Bioquímica, Facultad de Farmacia, Universidad de Castilla-La Mancha, C/José María Sánchez Ibáñez s/n, 02008 Albacete, Spain; Joaquinc.garcia@uclm.es",AFF,"Orgánica, Spain","GPE, GPE","38, 167","46, 172",,,,,Spain,1 26 | 65,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8778794\sections\1_front\1_article-meta\9_aff.xml,"3 Regional Center for Biomedical Research (CRIB), Universidad de Castilla-La Mancha, C/Almansa 13, 02008 Albacete, Spain","3 Regional Center for Biomedical Research (CRIB), Universidad de Castilla-La Mancha, C/Almansa 13, 02008 Albacete, Spain",AFF,Spain,GPE,115,120,,,,,Spain,1 27 | 66,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8782336\sections\1_front\1_article-meta\6_aff.xml,"1 Division of Biological Sciences, University of California, San Diego, La Jolla, California, United States of America","1 Division of Biological Sciences, University of California, San Diego, La Jolla, California, United States of America",AFF,"La Jolla, California, United States of America","GPE, GPE, GPE","74, 84, 96","82, 94, 120",,,,,United States of America,1 28 | 68,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8782336\sections\1_front\1_article-meta\9_aff.xml," SRUC: 
Scotland’s Rural College, UNITED KINGDOM "," SRUC: Scotland’s Rural College, UNITED KINGDOM",AFF,"Scotland, UNITED KINGDOM","GPE, GPE","8, 34","16, 48",,,,,United Kingdom,1 29 | -------------------------------------------------------------------------------- /resources/fig_ent.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /resources/pmr_demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | from docanalysis import DocAnalysis 3 | from pathlib import Path 4 | 5 | ethic_statement_creator = DocAnalysis() 6 | term_dir = Path(os.getcwd(), "terpenes_dictionary", "terpenes_key_phrases", ) 7 | if not term_dir.exists(): 8 | term_dir.mkdir() 9 | dict_for_entities = ethic_statement_creator.extract_entities_from_papers( 10 | corpus_path=Path(os.getcwd(), "corpus", "terpenes", ), 11 | terms_xml_path=Path(term_dir, "terpenes_key_phrases.xml"), 12 | query="terpenes", 13 | hits=10, 14 | make_project=True 15 | ) 16 | print(f"dict {dict_for_entities}") 17 | list_with_orgs = ethic_statement_creator.extract_particular_fields( 18 | dict_for_entities, 'ORG') 19 | with open('org.text', 'w') as f: 20 | f.write(str(list_with_orgs)) 21 | list_with_gpe = ethic_statement_creator.extract_particular_fields( 22 | dict_for_entities, 'GPE') 23 | with open('GPE.text', 'w') as f: 24 | f.write(str(list_with_gpe)) 25 | 26 | -------------------------------------------------------------------------------- /resources/test_pmc.txt: -------------------------------------------------------------------------------- 1 | PMC8771452, PMC8771452, PMC8771452 -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | try: 5 | from setuptools import setup 6 | except ImportError: 7 | from distutils.core import setup 8 | import configparser 9 | import os 10 | 11 | with open('README.md', encoding='utf-8') as readme_file: 12 | readme = readme_file.read() 13 | 14 | requirements = ['abbreviations', 
'beautifulsoup4==4.10.0', 'braceexpand==0.1.7', 'coloredlogs==15.0.1', 'ConfigArgParse==1.5.3', 'lxml==4.7.1', 'nltk==3.6.7', 'pandas==1.3.4', 15 | 'pygetpapers', 16 | 'pytest==6.2.5', 17 | 'setuptools==60.3.1', 18 | 'spacy==3.0.7', 19 | 'tkinterweb==3.10.7', 20 | 'tqdm==4.62.3' 21 | ] 22 | 23 | setup( 24 | name='docanalysis', 25 | version="0.3.0", 26 | description='extract structured information from ethics paragraphs', 27 | long_description_content_type='text/markdown', 28 | long_description=readme, 29 | author='Ayush Garg, Shweata N. Hegde', 30 | author_email='ayush@science.org.in, shweata.hegde@gmail.com', 31 | url='https://github.com/petermr/docanalysis', 32 | packages=[ 33 | 'docanalysis', 34 | ], 35 | package_dir={'docanalysis': 36 | 'docanalysis'}, 37 | include_package_data=True, 38 | install_requires=requirements, 39 | license='Apache License', 40 | zip_safe=False, 41 | keywords='research automation', 42 | classifiers=[ 43 | 'Development Status :: 4 - Beta', 44 | 'Intended Audience :: Developers', 45 | 'License :: OSI Approved :: Apache Software License', 46 | 'Natural Language :: English', 47 | 'Programming Language :: Python :: 3.4', 48 | 'Programming Language :: Python :: 3.5', 49 | 'Programming Language :: Python :: 3.6', 50 | 'Programming Language :: Python :: 3.7', 51 | 'Programming Language :: Python :: 3.8', 52 | 'Programming Language :: Python :: 3.9', 53 | 'Programming Language :: Python :: 3.10', 54 | 55 | ], 56 | entry_points={ 57 | 'console_scripts': [ 58 | 'docanalysis=docanalysis.docanalysis:main', 59 | ], 60 | }, 61 | 62 | ) 63 | -------------------------------------------------------------------------------- /tests/test_docanalysis_cli.py: -------------------------------------------------------------------------------- 1 | # test whether... 
2 | # Cproject exists (I) 3 | # dictionary exists (I) 4 | # sections exist (I) 5 | # non-empty CSV exists (O) 6 | # dictionary is created (not sure whether entity and keyphrase dictionaries can be created at the same time) 7 | import pytest 8 | from pathlib import Path 9 | import os 10 | 11 | DOCANALYSIS_TOP = Path(__file__).parent.parent 12 | #EXISTING_CPROJECT = Path(DOCANALYSIS_TOP, 'stem_cell_research_300') 13 | PMC_TEXT_FILE = Path(DOCANALYSIS_TOP, 'resources', 'test_pmc.txt') 14 | DICT_DIRECTORY = Path(DOCANALYSIS_TOP, 'ethics_dictionary') 15 | TEST_DICT = Path(DICT_DIRECTORY, 'ethics_demo', 'ethics_demo.xml') 16 | TEMP_CPROJECT = Path(DOCANALYSIS_TOP, 'test_ethics_20') 17 | 18 | class TestDocanalysis: 19 | 20 |     def test_pygetpapers(self): 21 |         """checks whether 22 |         - the corpus directory exists or not 23 |         - the number of PMC* folders is equal to the hits specified 24 |         - fulltext.xml exists in each PMC folder or not 25 |         """ 26 |         os.system(f'docanalysis --run_pygetpapers --terms {TEST_DICT} --project_name {TEMP_CPROJECT}') 27 |         assert TEMP_CPROJECT.exists(), f"checking whether {TEMP_CPROJECT} exists" 28 |         assert len(list(TEMP_CPROJECT.glob('PMC*/'))) == 3 29 |         assert len(list(TEMP_CPROJECT.glob('PMC*/fulltext.xml'))) == 3 30 | 31 |     def test_section_exists(self): 32 |         """checks whether 33 |         - the number of PMC folders with sections is equal to the number of hits 34 |         - a sections directory exists in each PMC folder 35 |         # not sure if this is the right way of testing whether papers are sectioned 36 |         """ 37 | 38 |         os.system(f'docanalysis --project_name {TEMP_CPROJECT} --run_sectioning') 39 |         assert len(list(TEMP_CPROJECT.glob('PMC*/sections/'))) == 3 40 |         for pmc_dir in TEMP_CPROJECT.glob('PMC*/'): 41 |             sections_dir = Path(pmc_dir, 'sections') 42 |             assert sections_dir.exists(), f"{sections_dir} must exist" 43 | 44 |     def test_search_dict_exists(self): 45 |         """checks whether the dictionary directory exists or not 46 |         """ 47 |         assert TEST_DICT.exists(), f"dictionary {TEST_DICT} must exist" 48 | 49 |     def test_csv_output_creation(self): 50 |         """checks whether the csv output is created or not 51 |         """ 52 |         os.system(f'docanalysis --project_name {TEMP_CPROJECT} --dictionary {TEST_DICT} --output') 53 |         assert Path(TEMP_CPROJECT, 'entities.csv').exists(), 'checking if the output is created' 54 | 55 |     def test_dict_creation_entities(self): 56 |         os.system(f'docanalysis --project_name {TEMP_CPROJECT} --dictionary {TEST_DICT} --output --make_ami_dict entities.xml') 57 |         assert Path(TEMP_CPROJECT, 'entities.xml').exists(), 'checking if the entity dictionary is created' 58 | 59 |     def test_remove_dir(self): 60 |         import shutil 61 |         shutil.rmtree(TEMP_CPROJECT) 62 |         assert not TEMP_CPROJECT.exists(), "temporary project directory should have been removed" -------------------------------------------------------------------------------- /tests/test_docanalysis_method.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import glob 3 | import os 4 | from pathlib import Path 5 | from docanalysis import DocAnalysis 6 | 7 | DOCANALYSIS_TOP = Path(__file__).parent.parent 8 | EXISTING_CPROJECT = Path(DOCANALYSIS_TOP, 'stem_cell_research_300') 9 | 10 | class TestDocanalysisMeth(): 11 | 12 |     def test_cproject_exists(self): 13 |         assert EXISTING_CPROJECT.exists(), f"checking whether {EXISTING_CPROJECT} exists" 14 | 15 |     def test_glob_section(self): 16 |         all_paragraphs = glob.glob(os.path.join( 17 |             EXISTING_CPROJECT, '*', 'sections', '**', '[1-9]_p.xml'), recursive=True) 18 |         assert all_paragraphs is not None
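19 | 20 | # --- illustrative sketch (editor's addition, not part of the original test suite) --- 21 | # A minimal, hedged end-to-end check built on the same DocAnalysis API used in 22 | # resources/demo.py and resources/pmr_demo.py; the dictionary path below is an 23 | # assumption borrowed from tests/test_docanalysis_cli.py and may need adjusting locally. 24 | ETHICS_DEMO_DICT = Path(DOCANALYSIS_TOP, 'ethics_dictionary', 'ethics_demo', 'ethics_demo.xml') 25 | 26 | @pytest.mark.skipif(not (EXISTING_CPROJECT.exists() and ETHICS_DEMO_DICT.exists()), 27 |                     reason="local corpus and/or demo dictionary not available") 28 | def test_extract_entities_sketch(): 29 |     """illustrative only: exercises the entity-extraction API on an existing CProject""" 30 |     doc_analysis = DocAnalysis() 31 |     dict_for_entities = doc_analysis.extract_entities_from_papers( 32 |         corpus_path=EXISTING_CPROJECT, 33 |         terms_xml_path=ETHICS_DEMO_DICT, 34 |         make_project=False)  # assumed: False skips the pygetpapers download step for an existing corpus 35 |     assert dict_for_entities is not None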
-------------------------------------------------------------------------------- /tests/testing_test.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | DOCANALYSIS_TOP = Path(__file__).parent.parent 4 | print(DOCANALYSIS_TOP) 5 | #EXISTING_CPROJECT = Path(DOCANALYSIS_TOP, 'stem_cell_research_300') 6 | PMC_TEXT_FILE = Path(DOCANALYSIS_TOP, 'resources', 'test_pmc.txt') 7 | print(PMC_TEXT_FILE) --------------------------------------------------------------------------------
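
A minimal usage sketch (illustrative only, not a repository file) of how the XmlLib and DataTable classes in docanalysis/xml_lib.py compose: XmlLib splits a JATS fulltext.xml into per-element section files, and DataTable renders tabular results as a jQuery DataTables HTML page. The input path and the example row values are placeholder assumptions.

# illustrative sketch only -- paths and row values are placeholder assumptions
from docanalysis.xml_lib import XmlLib, DataTable

# 1. split a JATS fulltext.xml (e.g. one downloaded by pygetpapers) into section files
xml_lib = XmlLib()
xml_lib.read("PMC0000000/fulltext.xml")   # placeholder path to a downloaded article
xml_lib.make_sections("sections")         # writes <article_dir>/sections/... to disk

# 2. render some extracted results as an HTML data table
table = DataTable("entities",
                  colheads=["file", "entity", "label"],
                  rowdata=[["1_p.xml", "King Khalid University", "ORG"]])
table.write_full_data_tables("temp")      # writes temp/full_data_table.html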