├── .gitignore
├── .idea
│   ├── .gitignore
│   ├── docanalysis.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── modules.xml
│   └── vcs.xml
├── LICENSE
├── README.md
├── __init__.py
├── config.ini
├── dictionary
│   ├── abb.xml
│   ├── ack_key_phrases_manual.txt
│   ├── ack_key_phrases_manual
│   │   ├── ack_key_phrases.md
│   │   ├── ack_key_phrases_manual.xml
│   │   └── approval_number.xml
│   ├── acknowledgment_feature_names.xml
│   ├── chap4_wikitest_2.xml
│   ├── cities_dictionary
│   │   └── cities.xml
│   ├── consent_type.txt
│   ├── consent_type
│   │   └── consent_type.xml
│   ├── ethics_committee_key_phrases.txt
│   ├── ethics_committee_key_phrases
│   │   └── ethics_committee_key_phrases.xml
│   ├── ethics_key_phrases.txt
│   ├── ethics_key_phrases
│   │   └── ethics_key_phrases.xml
│   ├── features_ack.txt
│   ├── features_ack
│   │   ├── acknowledgment_feature_names.xml
│   │   └── features_ack.xml
│   ├── invasion_biology
│   │   ├── invasion_hypotheses.xml
│   │   └── invasion_hypothesis.txt
│   ├── ipcc.xml
│   ├── methods_key_phrases.txt
│   ├── methods_key_phrases
│   │   └── methods_key_phrases.xml
│   ├── software.xml
│   └── test_terpene.xml
├── docanalysis
│   ├── .DS_Store
│   ├── __init__.py
│   ├── ami_sections.py
│   ├── config
│   │   ├── default_dicts.json
│   │   └── default_sections.json
│   ├── convert_file.py
│   ├── docanalysis.py
│   ├── entity_extraction.py
│   ├── file_lib.py
│   ├── get_html.py
│   ├── glob_trail.py
│   ├── gui.py
│   ├── gui
│   │   ├── css
│   │   │   └── main.css
│   │   ├── eel.js
│   │   └── main.html
│   └── xml_lib.py
├── docs
│   ├── Makefile
│   ├── make.bat
│   ├── requirements.txt
│   └── source
│       ├── conf.py
│       ├── docanalysis.rst
│       ├── entity_extraction.rst
│       └── index.rst
├── notebooks
│   ├── README.md
│   └── c_project.ipynb
├── requirements.txt
├── resources
│   ├── approval_number_100.csv
│   ├── demo.py
│   ├── docanalyis_architecture_diagram.PNG
│   ├── docanalysis_demo.ipynb
│   ├── entities_country.csv
│   ├── ethics_statement_corpus_1000.csv
│   ├── fig_ent.xml
│   ├── oil186.csv
│   ├── oil186_20210712.csv
│   ├── oil186_ack.csv
│   ├── pmr_demo.py
│   ├── software_mentions.csv
│   ├── stem_cell_research_300.csv
│   ├── stem_cell_research_300_2020.csv
│   ├── stem_cell_research_300_ethics.csv
│   ├── terpene_fig_entities.csv
│   └── test_pmc.txt
├── setup.py
├── software_papers.ipynb
└── tests
    ├── test_docanalysis_cli.py
    ├── test_docanalysis_method.py
    └── testing_test.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # temporary results
132 | temp/
133 |
134 | #corpus
135 | oil186/
136 | corpus/
137 | stem_cell_research_300/
138 | stem_cell_research_300_2020
139 | GPE.text
140 | ORG.text
141 |
142 | #vscode
143 | .vscode/
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 |
--------------------------------------------------------------------------------
/.idea/docanalysis.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | For an updated tutorial, please check the [Wiki](https://github.com/petermr/docanalysis/wiki/docanalysis-Tutorial) page.
2 | ## docanalysis
3 | `docanalysis` is a command-line tool that ingests corpora ([CProjects](https://github.com/petermr/tigr2ess/blob/master/getpapers/TUTORIAL.md#cproject-and-ctrees)) and carries out text analysis of documents, including
4 | - sectioning
5 | - NLP/text-mining
6 | - dictionary generation
7 |
8 | Besides the bespoke code, it uses [NLTK](https://www.nltk.org/) and other Python tools for many operations, and [spaCy](https://spacy.io/) or [scispaCy](https://allenai.github.io/scispacy/) for extraction and annotation of entities. It outputs summary data and word-dictionaries.
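
A typical workflow chains the steps documented below: download a corpus, section it, then search or annotate the sections. The commands here are copied from the worked example later in this README; the query, hit count and project name are placeholders:

```
docanalysis --run_pygetpapers -q "terpene" -k 10 --project_name terpene_10
docanalysis --project_name terpene_10 --make_section
docanalysis --project_name terpene_10 --make_section --spacy_model spacy --entities ORG --output org.csv
```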
9 |
10 | ### Set up `venv`
11 | We recommend creating a virtual environment (`venv`) before installing `docanalysis`, and activating the `venv` each time before you run `docanalysis`.
12 |
13 | #### Windows
14 | Creating a `venv`
15 | ```
16 | >> mkdir docanalysis_demo
17 | >> cd docanalysis_demo
18 | >> python -m venv venv
19 | ```
20 |
21 | Activating `venv`
22 | ```
23 | >> venv\Scripts\activate.bat
24 | ```
25 |
26 | #### MacOS
27 | Creating a `venv`
28 | ```
29 | >> mkdir docanalysis_demo
30 | >> cd docanalysis_demo
31 | >> python3 -m venv venv
32 | ```
33 |
34 | Activating `venv`
35 | ```
36 | >> source venv/bin/activate
37 | ```
38 |
39 | Refer to the [official documentation](https://packaging.python.org/en/latest/guides/installing-using-pip-and-virtual-environments/) for more help.
40 |
41 | ### Install `docanalysis`
42 | You can install `docanalysis` from PyPI.
43 | ```
44 | pip install docanalysis
45 | ```
46 | If you are on a Mac
47 | ```
48 | pip3 install docanalysis
49 | ```
50 |
51 | Download Python from [https://www.python.org/downloads/](https://www.python.org/downloads/) and select the option `Add Python to PATH` during installation. Make sure `pip` is installed along with Python. Check out [https://pip.pypa.io/en/stable/installation/](https://pip.pypa.io/en/stable/installation/) if you have difficulties installing `pip`.
52 |
53 | ### Run `docanalysis`
54 | `docanalysis --help` should list the flags we support and their use.
55 |
56 | ```
57 | usage: docanalysis.py [-h] [--run_pygetpapers] [--make_section] [-q QUERY] [-k HITS] [--project_name PROJECT_NAME] [-d DICTIONARY] [-o OUTPUT]
58 | [--make_ami_dict MAKE_AMI_DICT] [--search_section [SEARCH_SECTION [SEARCH_SECTION ...]]] [--entities [ENTITIES [ENTITIES ...]]]
59 | [--spacy_model SPACY_MODEL] [--html HTML] [--synonyms SYNONYMS] [--make_json MAKE_JSON] [--search_html] [--extract_abb EXTRACT_ABB]
60 | [-l LOGLEVEL] [-f LOGFILE]
61 |
62 | Welcome to docanalysis version 0.1.3. -h or --help for help
63 |
64 | optional arguments:
65 | -h, --help show this help message and exit
66 | --run_pygetpapers [Command] downloads papers from EuropePMC via pygetpapers
67 | --make_section [Command] makes sections; requires a fulltext.xml in CTree directories
68 | -q QUERY, --query QUERY
69 | [pygetpapers] query string
70 | -k HITS, --hits HITS [pygetpapers] number of papers to download
71 | --project_name PROJECT_NAME
72 | CProject directory name
73 | -d DICTIONARY, --dictionary DICTIONARY
74 | [file name/url] existing ami dictionary to annotate sentences or support supervised entity extraction
75 | -o OUTPUT, --output OUTPUT
76 | outputs csv with sentences/terms
77 | --make_ami_dict MAKE_AMI_DICT
78 | [Command] title for ami-dict. Makes ami-dict of all extracted entities; works only with spacy
79 | --search_section [SEARCH_SECTION [SEARCH_SECTION ...]]
80 | [NER/dictionary search] section(s) to annotate. Choose from: ALL, ACK, AFF, AUT, CON, DIS, ETH, FIG, INT, KEY, MET, RES, TAB, TIL. Defaults to
81 | ALL
82 | --entities [ENTITIES [ENTITIES ...]]
83 | [NER] entities to extract. Default (ALL). Common entities SpaCy: GPE, LANGUAGE, ORG, PERSON (for additional ones check: ); SciSpaCy: CHEMICAL,
84 | DISEASE
85 | --spacy_model SPACY_MODEL
86 | [NER] optional. Choose between spacy or scispacy models. Defaults to spacy
87 | --html HTML outputs html with sentences/terms
88 | --synonyms SYNONYMS annotate the corpus/sections with synonyms from ami-dict
89 | --make_json MAKE_JSON
90 | outputs json with sentences/terms
91 | --search_html searches html documents (mainly IPCC)
92 | --extract_abb EXTRACT_ABB
93 | [Command] title for abb-ami-dict. Extracts abbreviations and expansions; makes ami-dict of all extracted entities
94 | -l LOGLEVEL, --loglevel LOGLEVEL
95 | provide logging level. Example --log warning <>, default='info'
96 | -f LOGFILE, --logfile LOGFILE
97 | saves log to specified file in output directory as well as printing to terminal
98 | ```
99 |
100 | #### Download papers from [EPMC](https://europepmc.org/) via `pygetpapers`
101 | COMMAND
102 | ```
103 | docanalysis --run_pygetpapers -q "terpene" -k 10 --project_name terpene_10
104 | ```
105 | LOGS
106 | ```
107 | INFO: making project/searching terpene for 10 hits into C:\Users\shweata\docanalysis\terpene_10
108 | INFO: Total Hits are 13935
109 | 1it [00:00, 936.44it/s]
110 | INFO: Saving XML files to C:\Users\shweata\docanalysis\terpene_10\*\fulltext.xml
111 | 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:30<00:00, 3.10s/it]
112 | ```
113 |
114 | CPROJ
115 | ```
116 | C:\USERS\SHWEATA\DOCANALYSIS\TERPENE_10
117 | │ eupmc_results.json
118 | │
119 | ├───PMC8625850
120 | │ eupmc_result.json
121 | │ fulltext.xml
122 | │
123 | ├───PMC8727598
124 | │ eupmc_result.json
125 | │ fulltext.xml
126 | │
127 | ├───PMC8747377
128 | │ eupmc_result.json
129 | │ fulltext.xml
130 | │
131 | ├───PMC8771452
132 | │ eupmc_result.json
133 | │ fulltext.xml
134 | │
135 | ├───PMC8775117
136 | │ eupmc_result.json
137 | │ fulltext.xml
138 | │
139 | ├───PMC8801761
140 | │ eupmc_result.json
141 | │ fulltext.xml
142 | │
143 | ├───PMC8831285
144 | │ eupmc_result.json
145 | │ fulltext.xml
146 | │
147 | ├───PMC8839294
148 | │ eupmc_result.json
149 | │ fulltext.xml
150 | │
151 | ├───PMC8840323
152 | │ eupmc_result.json
153 | │ fulltext.xml
154 | │
155 | └───PMC8879232
156 | eupmc_result.json
157 | fulltext.xml
158 | ```
159 |
160 | #### Section the papers
161 | COMMAND
162 | ```
163 | docanalysis --project_name terpene_10 --make_section
164 | ```
165 | LOGS
166 | ```
167 | WARNING: Making sections in /content/terpene_10/PMC9095633/fulltext.xml
168 | INFO: dict_keys: dict_keys(['abstract', 'acknowledge', 'affiliation', 'author', 'conclusion', 'discussion', 'ethics', 'fig_caption', 'front', 'introduction', 'jrnl_title', 'keyword', 'method', 'octree', 'pdfimage', 'pub_date', 'publisher', 'reference', 'results_discuss', 'search_results', 'sections', 'svg', 'table', 'title'])
169 | WARNING: loading templates.json
170 | INFO: wrote XML sections for /content/terpene_10/PMC9095633/fulltext.xml /content/terpene_10/PMC9095633/sections
171 | WARNING: Making sections in /content/terpene_10/PMC9120863/fulltext.xml
172 | INFO: wrote XML sections for /content/terpene_10/PMC9120863/fulltext.xml /content/terpene_10/PMC9120863/sections
173 | WARNING: Making sections in /content/terpene_10/PMC8982386/fulltext.xml
174 | INFO: wrote XML sections for /content/terpene_10/PMC8982386/fulltext.xml /content/terpene_10/PMC8982386/sections
175 | WARNING: Making sections in /content/terpene_10/PMC9069239/fulltext.xml
176 | INFO: wrote XML sections for /content/terpene_10/PMC9069239/fulltext.xml /content/terpene_10/PMC9069239/sections
177 | WARNING: Making sections in /content/terpene_10/PMC9165828/fulltext.xml
178 | INFO: wrote XML sections for /content/terpene_10/PMC9165828/fulltext.xml /content/terpene_10/PMC9165828/sections
179 | WARNING: Making sections in /content/terpene_10/PMC9119530/fulltext.xml
180 | INFO: wrote XML sections for /content/terpene_10/PMC9119530/fulltext.xml /content/terpene_10/PMC9119530/sections
181 | WARNING: Making sections in /content/terpene_10/PMC8982077/fulltext.xml
182 | INFO: wrote XML sections for /content/terpene_10/PMC8982077/fulltext.xml /content/terpene_10/PMC8982077/sections
183 | WARNING: Making sections in /content/terpene_10/PMC9067962/fulltext.xml
184 | INFO: wrote XML sections for /content/terpene_10/PMC9067962/fulltext.xml /content/terpene_10/PMC9067962/sections
185 | WARNING: Making sections in /content/terpene_10/PMC9154778/fulltext.xml
186 | INFO: wrote XML sections for /content/terpene_10/PMC9154778/fulltext.xml /content/terpene_10/PMC9154778/sections
187 | WARNING: Making sections in /content/terpene_10/PMC9164016/fulltext.xml
188 | INFO: wrote XML sections for /content/terpene_10/PMC9164016/fulltext.xml /content/terpene_10/PMC9164016/sections
189 | 47% 1056/2258 [00:01<00:01, 1003.31it/s]ERROR: cannot parse /content/terpene_10/PMC9165828/sections/1_front/1_article-meta/26_custom-meta-group/0_custom-meta/1_meta-value/0_xref.xml
190 | 67% 1516/2258 [00:01<00:00, 1047.68it/s]ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/7_xref.xml
191 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/14_email.xml
192 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/3_xref.xml
193 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/6_xref.xml
194 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/9_email.xml
195 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/10_email.xml
196 | ERROR: cannot parse /content/terpene_10/PMC9119530/sections/1_front/1_article-meta/24_custom-meta-group/0_custom-meta/1_meta-value/4_xref.xml
197 | ...
198 | 100% 2258/2258 [00:02<00:00, 949.43it/s]
199 | ```
200 |
201 | CTREE
202 | ```
203 | ├───PMC8625850
204 | │ └───sections
205 | │ ├───0_processing-meta
206 | │ ├───1_front
207 | │ │ ├───0_journal-meta
208 | │ │ └───1_article-meta
209 | │ ├───2_body
210 | │ │ ├───0_1._introduction
211 | │ │ ├───1_2._materials_and_methods
212 | │ │ │ ├───1_2.1._materials
213 | │ │ │ ├───2_2.2._bacterial_strains
214 | │ │ │ ├───3_2.3._preparation_and_character
215 | │ │ │ ├───4_2.4._evaluation_of_the_effect_
216 | │ │ │ ├───5_2.5._time-kill_studies
217 | │ │ │ ├───6_2.6._propidium_iodide_uptake-e
218 | │ │ │ └───7_2.7._hemolysis_test_from_human
219 | │ │ ├───2_3._results
220 | │ │ │ ├───1_3.1._encapsulation_of_terpene_
221 | │ │ │ ├───2_3.2._both_terpene_alcohol-load
222 | │ │ │ ├───3_3.3._farnesol_and_geraniol-loa
223 | │ │ │ └───4_3.4._farnesol_and_geraniol-loa
224 | │ │ ├───3_4._discussion
225 | │ │ ├───4_5._conclusions
226 | │ │ └───5_6._patents
227 | │ ├───3_back
228 | │ │ ├───0_ack
229 | │ │ ├───1_fn-group
230 | │ │ │ └───0_fn
231 | │ │ ├───2_app-group
232 | │ │ │ └───0_app
233 | │ │ │ └───2_supplementary-material
234 | │ │ │ └───0_media
235 | │ │ └───9_ref-list
236 | │ └───4_floats-group
237 | │ ├───4_table-wrap
238 | │ ├───5_table-wrap
239 | │ ├───6_table-wrap
240 | │ │ └───4_table-wrap-foot
241 | │ │ └───0_fn
242 | │ ├───7_table-wrap
243 | │ └───8_table-wrap
244 | ...
245 | ```
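
Each section is written out as a small XML file under `sections/`, so downstream steps (and your own scripts) can pick sections by path pattern. A minimal sketch in plain Python, not part of `docanalysis` itself; the corpus name matches the example above:

```
from pathlib import Path

corpus = Path("terpene_10")  # CProject directory created above
# collect, for example, every acknowledgement section across all CTrees
ack_files = sorted(corpus.glob("*/sections/**/*ack*.xml"))
print(len(ack_files), "acknowledgement section files")
```
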
246 | ##### Search sections using dictionary
247 | COMMAND
248 | ```
249 | docanalysis --project_name terpene_10 --output entities.csv --make_ami_dict entities.xml
250 | ```
251 | LOGS
252 | ```
253 | INFO: Found 7134 sentences in the section(s).
254 | INFO: getting terms from /content/activity.xml
255 | 100% 7134/7134 [00:02<00:00, 3172.14it/s]
256 | /usr/local/lib/python3.7/dist-packages/docanalysis/entity_extraction.py:352: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
257 | "[", "").str.replace("]", "")
258 | INFO: wrote output to /content/terpene_10/activity.csv
259 | ```
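
The log above shows terms being read from an ami-dictionary (`activity.xml`). To run a dictionary search against your own dictionary, pass it explicitly with `-d`/`--dictionary`; the path below is a placeholder:

```
docanalysis --project_name terpene_10 --dictionary activity.xml --output entities.csv
```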
260 |
261 | #### Extract entities
262 | We use `spacy` to extract Named Entities. Here's the list of entity labels it supports: CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART
263 | INPUT
264 | ```
265 | docanalysis --project_name terpene_10 --make_section --spacy_model spacy --entities ORG --output org.csv
266 | ```
267 | LOGS
268 | ```
269 | INFO: Found 7134 sentences in the section(s).
270 | INFO: Loading spacy
271 | 100% 7134/7134 [01:08<00:00, 104.16it/s]
272 | /usr/local/lib/python3.7/dist-packages/docanalysis/entity_extraction.py:352: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
273 | "[", "").str.replace("]", "")
274 | INFO: wrote output to /content/terpene_10/org.csv
275 | ```
276 | ##### Extract information from specific section(s)
277 | You can choose to extract entities from specific section(s).
278 |
279 | COMMAND
280 | ```
281 | docanalysis --project_name terpene_10 --make_section --spacy_model spacy --search_section AUT, AFF --entities ORG --output org_aut_aff.csv
282 | ```
283 | LOG
284 | ```
285 | INFO: Found 28 sentences in the section(s).
286 | INFO: Loading spacy
287 | 100% 28/28 [00:00<00:00, 106.66it/s]
288 | /usr/local/lib/python3.7/dist-packages/docanalysis/entity_extraction.py:352: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
289 | "[", "").str.replace("]", "")
290 | INFO: wrote output to /content/terpene_10/org_aut_aff.csv
291 | ```
292 | #### Create dictionary of extracted entities
293 | COMMAND
294 | ```
295 | docanalysis --project_name terpene_10 --make_section --spacy_model spacy --search_section AUT, AFF --entities ORG --output org_aut_aff.csvv --make_ami_dict org
296 | ```
297 | LOG
298 | ```
299 | INFO: Found 28 sentences in the section(s).
300 | INFO: Loading spacy
301 | 100% 28/28 [00:00<00:00, 96.56it/s]
302 | /usr/local/lib/python3.7/dist-packages/docanalysis/entity_extraction.py:352: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
303 | "[", "").str.replace("]", "")
304 | INFO: wrote output to /content/terpene_10/org_aut_aff.csvv
305 | INFO: Wrote all the entities extracted to ami dict
306 | ```
307 |
308 | Snippet of the dictionary
309 | ```
310 |
311 | <dictionary title="/content/terpene_10/org.xml">
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 | ```
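
An ami-dictionary generated this way has, in outline, the following shape; the entries shown are illustrative, not the actual output of the run above:

```
<dictionary title="/content/terpene_10/org.xml">
    <entry term="MDPI" name="MDPI"/>
    <entry term="Chinese Academy of Sciences" name="Chinese Academy of Sciences"/>
</dictionary>
```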
323 |
324 | ### Extract Abbreviations
325 |
326 | ```
327 | docanalysis --project_name corpus\ethics_10 --output dict_search_5.csv --make_json dict_search_5.json --make_ami_dict entities --extract_abb ethics_abb
328 | ```
329 |
330 | `--extract_abb` extracts all abbreviations and makes an ami-dictionary of abbreviations and their expansions.
331 |
332 | EXAMPLE DICTIONARY:
333 | ```
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 |
353 |
354 |
355 | ```
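
In outline, the abbreviation dictionary pairs each abbreviation with its expansion. A sketch with illustrative entries (the exact attribute names may differ in the generated file):

```
<dictionary title="ethics_abb">
    <entry term="IRB" name="Institutional Review Board"/>
    <entry term="GCP" name="Good Clinical Practice"/>
</dictionary>
```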
356 |
357 | ### Search HTML
358 | If you are working with HTML files (IPCC reports, for example) rather than XMLs in CProjects, you can use the `--search_html` flag.
359 |
360 | ```
361 | docanalysis --project_name corpus\ipcc_sectioned --extract_abb ethics_abb --search_html
362 | ```
363 |
364 | Make sure that your `html` sections are in the `sections` folder. Here's an example structure:
365 |
366 | ```
367 | C:.
368 | | dict_search_2.csv
369 | | dict_search_2.json
370 | |
371 | \---chap4
372 | | chapter_4
373 | |
374 | \---sections
375 | 4.1.html
376 | 4.2.1.html
377 | 4.2.2.html
378 | 4.2.3.html
379 | 4.2.4.html
380 | 4.2.5.html
381 | 4.2.7.html
382 | 4.2.html
383 | 4.3.1.html
384 | 4.3.2.html
385 | 4.3.html
386 | 4.4.1.html
387 | 4.4.2.html
388 | 4.4.html
389 | 4.5.html
390 | executive_summary.html
391 | frequently_asked_questions.html
392 | table_of_contents.html
393 | ```
394 | If you haven't sectioned your `html`, please use `py4ami` to section it.
395 | #### What is a dictionary
396 | A dictionary, in `ami`'s terminology, is a set of terms/phrases in XML format.
397 | Dictionaries related to ethics and acknowledgments are available in the [Ethics Dictionary](https://github.com/petermr/docanalysis/tree/main/ethics_dictionary) folder.
398 |
399 | If you'd like to create a custom dictionary, you can find the steps [here](https://github.com/petermr/tigr2ess/blob/master/dictionaries/TUTORIAL.md).
400 |
402 | ### Python tools used
403 | - [`pygetpapers`](https://github.com/petermr/pygetpapers) - scrape open repositories to download papers of interest
404 | - [nltk](https://www.nltk.org/) - splits sentences
405 | - [spaCy](https://spacy.io/) and [SciSpaCy](https://allenai.github.io/scispacy/)
406 | - recognize Named-Entities and label them
407 | - Here's the list of NER labels [SpaCy's English model](https://spacy.io/models/en) provides:
408 | `CARDINAL, DATE, EVENT, FAC, GPE, LANGUAGE, LAW, LOC, MONEY, NORP, ORDINAL, ORG, PERCENT, PERSON, PRODUCT, QUANTITY, TIME, WORK_OF_ART`
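
Under the hood this amounts to a standard spaCy NER pass over each sentence. A minimal standalone sketch (the model name is an assumption; `docanalysis` may load a different one):

```
import spacy

nlp = spacy.load("en_core_web_sm")  # assumes the model has already been downloaded
doc = nlp("The study was approved by the Ethics Committee of the University of Cambridge.")
for ent in doc.ents:
    print(ent.text, ent.label_)  # e.g. "the University of Cambridge" ORG
```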
409 |
410 |
411 | ### Credits:
412 | - [Ayush Garg](https://github.com/ayush4921)
413 | - [Shweata N. Hegde](https://github.com/ShweataNHegde/)
414 | - [Daniel Mietchen](https://github.com/Daniel-Mietchen)
415 | - [Peter Murray-Rust](https://github.com/petermr)
416 |
417 |
418 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petermr/docanalysis/eb8a3c13f9491b41f252a363953a976406f964b6/__init__.py
--------------------------------------------------------------------------------
/config.ini:
--------------------------------------------------------------------------------
1 | [ethics_statement]
2 | version=0.0.0.1
--------------------------------------------------------------------------------
/dictionary/abb.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/dictionary/ack_key_phrases_manual.txt:
--------------------------------------------------------------------------------
1 | conflict of interest
2 | financial support
--------------------------------------------------------------------------------
/dictionary/ack_key_phrases_manual/ack_key_phrases.md:
--------------------------------------------------------------------------------
1 | The terms in the dictionary were created manually by Chaitanya.
2 |
--------------------------------------------------------------------------------
/dictionary/ack_key_phrases_manual/ack_key_phrases_manual.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/dictionary/ack_key_phrases_manual/approval_number.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/dictionary/acknowledgment_feature_names.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
--------------------------------------------------------------------------------
/dictionary/chap4_wikitest_2.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
--------------------------------------------------------------------------------
/dictionary/consent_type.txt:
--------------------------------------------------------------------------------
1 | video consent
2 | informed consent
3 | written informed consent
4 | verbal consent
5 | voluntary consent
6 | competent consent
--------------------------------------------------------------------------------
/dictionary/consent_type/consent_type.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/dictionary/ethics_committee_key_phrases.txt:
--------------------------------------------------------------------------------
1 | Ethics Committee of
2 | Ethical Committee of
3 | Institutional Review Board of
4 | the IRB of
5 | Institutional Animal Care and Use Committee of
6 | Animal Care and Use Committee of
7 | IACUC of
--------------------------------------------------------------------------------
/dictionary/ethics_committee_key_phrases/ethics_committee_key_phrases.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/dictionary/ethics_key_phrases.txt:
--------------------------------------------------------------------------------
1 | animal study was reviewed and approved by
2 | protocol was approved by
3 | studies involving human participants were reviewed and approved by
4 | approved by
5 | informed and written consent
6 | written consent
7 | informed consent
8 | Ethics Committee
9 | legal guardian
10 | next of kin
11 | national legislation
12 | ethics committees
13 | ethics committees
14 | Ethics Committees
15 | appropriate approval
16 | written informed consent
17 | Principle of Laboratory Animal Care
18 | ethics guidelines
19 | Experimental protocols were approved by
20 | ethical clearance
21 | Ethical approval was authorized through
22 | Animal Ethics Committee
23 | Declaration of Helsinki
24 | principles of Good Clinical Practice
25 | Good Clinical Practice
26 | approved by the Institutional Review Board
27 | International Conference on Harmonisation Good Clinical Practice guidelines
28 | approved the study
29 | local ethical committees
30 | International Council for Harmonisation of Technical Requirements for Pharmaceuticals for Human Use
31 |
--------------------------------------------------------------------------------
/dictionary/ethics_key_phrases/ethics_key_phrases.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
--------------------------------------------------------------------------------
/dictionary/features_ack.txt:
--------------------------------------------------------------------------------
1 | authors gratefully acknowledge
2 | authors sincerely thank
3 | authors thank
4 | special thanks
5 | for providing
6 | author would like
7 | providing constant support
8 | grateful
9 | authors acknowledge
10 | financial contribution
11 | research support
12 | financial support
13 | also thank
14 | scientific support
15 | helpful suggestions
16 | technical support
17 | support provided
18 | sincere thanks
19 | authors express
20 | authors extend
21 | kind support
22 | facilities provided
23 | providing facilities
24 | appreciated
25 | necessary facilities
26 | provided funding
27 | technical assistance
28 | fellowship
29 | funds
30 | contribution
31 | helpful comments
32 | reliable care
33 | valuable comments
--------------------------------------------------------------------------------
/dictionary/features_ack/acknowledgment_feature_names.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
--------------------------------------------------------------------------------
/dictionary/features_ack/features_ack.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/dictionary/invasion_biology/invasion_hypotheses.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/dictionary/invasion_biology/invasion_hypothesis.txt:
--------------------------------------------------------------------------------
1 | Biotic Resistance Hypothesis
2 | Enemy Release Hypothesis
3 | Propagule Pressure Hypothesis
4 | Synthesizing Invasion Hypotheses
5 | Tens Rule
6 | novel weapons hypothesis
7 | Darwin's naturalization and limiting similarity hypotheses
8 | Phenotypic plasticity hypothesis
9 | Evolution of increased competitive ability and shifting defence hypotheses
10 | Invasional meltdown hypothesis
11 | Disturbance hypothesis
12 |
--------------------------------------------------------------------------------
/dictionary/ipcc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/dictionary/methods_key_phrases.txt:
--------------------------------------------------------------------------------
1 | flowering season
2 | harvesting season
3 | blossoming season
4 | collected from
5 | collected in
6 | were collected in a blossoming period
7 | oil sample was obtained from
8 | sample obtained from
9 | flowering period
10 | harvesting period
11 | blossoming period
12 | seeds were sampled from
13 | were purchased from local markets
14 |
--------------------------------------------------------------------------------
/dictionary/methods_key_phrases/methods_key_phrases.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/dictionary/software.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/dictionary/test_terpene.xml:
--------------------------------------------------------------------------------
1 |
2 | Created from SPARQL query
3 |
4 | D-limonene(+)
5 | (R)-Limonene(+)
6 | (4R)-4-isopropenyl-1-methylcyclohexene
7 | (R)-4-isopropenyl-1-methyl-1-cyclohexene
8 | (R)-(+)-limonene
9 | D-(+)-limonene
10 | (4R)-1-methyl-4-isopropenylcyclohex-1-ene
11 | (4R)-limonene(+)
12 | (4R)-Limonene
13 | D-Limonen
14 | (+)-4-isopropenyl-1-methylcyclohexene
15 | (R)-p-mentha-1,8-diene
16 | (R)(+)-p-mentha-1,8-diene
17 | (+)-limonene
18 | (1R)-(+)-α-pinene
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/docanalysis/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petermr/docanalysis/eb8a3c13f9491b41f252a363953a976406f964b6/docanalysis/.DS_Store
--------------------------------------------------------------------------------
/docanalysis/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | docanalysis module
3 | """
4 | from docanalysis.entity_extraction import EntityExtraction
5 | from docanalysis.docanalysis import Docanalysis
6 |
7 | __author__ = "Ayush Garg", "Shweata N. Hegde"
8 | __email__ = "ayush@science.org.in", "shweata.hegde@gmail.com"
--------------------------------------------------------------------------------
/docanalysis/ami_sections.py:
--------------------------------------------------------------------------------
1 | from abc import ABC
2 | from pathlib import Path
3 | import logging
4 | from lxml import etree as LXET
5 |
6 | from docanalysis.xml_lib import XmlLib
7 |
8 |
9 | class AMIAbsSection(ABC):
10 | """ """
11 | logger = logging.getLogger("ami_abs_section")
12 |
13 | SECTIONS = "sections"
14 |
15 | def __init__(self) -> None:
16 | pass
17 |
18 |
19 | @classmethod
20 | def make_xml_sections(cls, file, outdir: str, force: bool) -> None:
21 | """make sections
22 |
23 | :param file:
24 | :param outdir: str:
25 | :param force: bool:
26 |
27 | """
28 | if file is None or outdir is None:
29 | return None
30 | path = Path(file)
31 | if not path.exists():
32 | cls.logger.warning(f"file {file} does not exist")
33 | return
34 | # sections = Path(self.dirx)
35 | if force or not Path(outdir).exists():
36 | cls.logger.warning(f"Making sections in {str(path)}")
37 | xml_libx = XmlLib()
38 | xml_libx.logger.setLevel(logging.DEBUG)
39 | xml_libx.read(file)
40 | xml_libx.make_sections(outdir)
41 |
42 |
43 | class AMIFigure(AMIAbsSection):
44 | """holds data on figure captions and hopefully later pointers to pdfimages
45 |
46 | Figures are a mess in JATS. They can be held in different places and often not linked
47 | to the bitmap. This class will include heuristics for uniting and standardising this.
48 |
49 | JATS encoding depends on the publisher. Typically:
50 |
51 | <fig>
52 |   <caption>
53 |     <title>XPS core spectra comparison for aged baseline and SEB-3 electrodes.</title>
54 |     <p>The graphite and NCM622 electrodes are taken from the baseline cell after 956 cycles and
55 |     the SEB-3 cell after 4021 cycles.</p>
56 |   </caption>
57 | </fig>
58 |
59 |
60 |
61 | There are sometimes 2 or more <p> as children of caption.
62 |
63 |
64 | """
65 |
66 | # JATS tags
67 | LABEL = "label_xml"
68 | CAPTION = "caption"
69 | P = "p"
70 | TITLE = "title"
71 |
72 | def __init__(self):
73 | super().__init__()
74 | self.root = None
75 | self.root_str = None
76 | self.label_xml = None
77 | self.label_text = None
78 | self.caption = None
79 | self.caption_p = None
80 | self.p_text = None
81 | self.caption_title = None
82 | self.title_text = None
83 |
84 | @classmethod
85 | def create_from_jats(cls, xml_path):
86 | """
87 |
88 | :param xml_path:
89 |
90 | """
91 | ami_figure = AMIFigure()
92 | ami_figure.root = XmlLib.parse_xml_file_to_root(str(xml_path))
93 | ami_figure.add_figure_structure()
94 | return ami_figure
95 |
96 | def add_figure_structure(self):
97 | """creates label, caption, title, test(p) from JATS xml"""
98 | self.root_str = LXET.tostring(self.root)
99 | self.label_xml = XmlLib.get_or_create_child(self.root, self.LABEL)
100 | self.label_text = XmlLib.get_text(self.label_xml)
101 | self.caption = XmlLib.get_or_create_child(self.root, self.CAPTION)
102 | self.caption_p = XmlLib.get_or_create_child(self.caption, self.P)
103 | self.p_text = XmlLib.get_text(self.caption_p)
104 | self.caption_title = XmlLib.get_or_create_child(self.caption, self.TITLE)
105 | self.title_text = XmlLib.get_text(self.caption_title)
106 |
107 | def get_xml_str(self):
108 | """ """
109 | return LXET.tostring(self.root)
110 |
111 | def __str__(self):
112 | s = f" --- {self.label_xml} ----\n" \
113 | f"[{self.title_text}] \n" \
114 | f" {self.p_text}"
115 | return s
116 |
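
# Usage sketch (not part of the original module; paths are hypothetical examples):
if __name__ == "__main__":
    AMIAbsSection.make_xml_sections(
        file="terpene_10/PMC8625850/fulltext.xml",
        outdir="terpene_10/PMC8625850/sections",
        force=True,
    )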
--------------------------------------------------------------------------------
/docanalysis/config/default_dicts.json:
--------------------------------------------------------------------------------
1 | {
2 | "EO_ACTIVITY": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/activity/eo_activity.xml",
3 | "EO_COMPOUND": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/compound/eo_compound.xml",
4 | "EO_ANALYSIS": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/analysis/eo_analysis_method.xml",
5 | "EO_EXTRACTION": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/extraction/eo_extraction.xml",
6 | "EO_PLANT": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/plant/eo_plant.xml",
7 | "PLANT_GENUS": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/plant_genus/plant_genus.xml",
8 | "EO_PLANT_PART": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/plant_part/plant_part.xml",
9 | "EO_TARGET": "https://raw.githubusercontent.com/petermr/dictionary/main/cevopen/target/eo_target_organism.xml",
10 | "COUNTRY": "https://raw.githubusercontent.com/petermr/dictionary/main/openVirus20210120/country/country.xml",
11 | "DISEASE": "https://raw.githubusercontent.com/petermr/dictionary/main/openVirus20210120/disease/disease.xml",
12 | "ORGANIZATION": "https://raw.githubusercontent.com/petermr/dictionary/main/openVirus20210120/organization/organization.xml",
13 | "DRUG": "https://raw.githubusercontent.com/petermr/dictionary/main/openVirus20210120/drug/drug.xml",
14 | "TEST_TRACE": "https://raw.githubusercontent.com/petermr/dictionary/main/openVirus20210120/test_trace/test_trace.xml"
15 | }
16 |
--------------------------------------------------------------------------------
/docanalysis/config/default_sections.json:
--------------------------------------------------------------------------------
1 | {
2 | "ABS":["*abstract.xml"],
3 | "ACK": ["*ack.xml"],
4 | "AFF": ["*aff.xml"],
5 | "AUT": ["*contrib-group.xml"],
6 | "CON": ["*conclusion*/*.xml"],
7 | "DIS": ["*discussion*/**/*_title.xml", "*discussion*/**/*_p.xml"],
8 | "ETH": ["*ethic*/*.xml"],
9 | "FIG": ["*fig*.xml"],
10 | "INT": ["*introduction*/*.xml", "*background*/*.xml"],
11 | "KEY": ["*kwd-group.xml"],
12 | "MET": ["*method*/*.xml", "*material*/*.xml"] ,
13 | "RES": ["*result*/*/*_title.xml", "*result*/*/*_p.xml"],
14 | "TAB": ["*table*.xml"],
15 | "TIL": ["*article-meta/*title-group.xml"],
16 | "HTML": ["*.html"]
17 |
18 | }
--------------------------------------------------------------------------------
/docanalysis/convert_file.py:
--------------------------------------------------------------------------------
1 | import os
2 | from chardet import detect
3 |
4 | # get file encoding type
5 |
6 |
7 | def get_encoding_type(file):
8 | """
9 |
10 | :param file:
11 |
12 | """
13 | with open(file, 'rb') as f:
14 | rawdata = f.read()
15 | return detect(rawdata)['encoding']
16 |
17 |
18 | from_codec = get_encoding_type('entity_extraction.py')
19 |
20 | # add try: except block for reliability
21 | try:
22 | with open('entity_extraction.py', 'r', encoding=from_codec) as f, open('entity_extraction2.py', 'w', encoding='utf-8') as e:
23 | text = f.read() # for small files, for big use chunks
24 | e.write(text)
25 |
26 |
27 | except UnicodeDecodeError:
28 | print('Decode Error')
29 | except UnicodeEncodeError:
30 | print('Encode Error')
31 |
--------------------------------------------------------------------------------
/docanalysis/docanalysis.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import sys
4 | import configargparse
5 | import coloredlogs
6 | from time import gmtime, strftime
7 | from tqdm import tqdm
8 | from functools import partialmethod
9 | from docanalysis.entity_extraction import EntityExtraction
10 |
11 |
12 | class Docanalysis:
13 |
14 | def __init__(self):
15 | """This function makes all the constants"""
16 | self.entity_extraction = EntityExtraction()
17 | self.version = "0.3.0"
18 |
19 | def handle_logger_creation(self, args):
20 | """handles logging configuration from the command line
21 |
22 | :param args: parsed command-line arguments
23 | :type args: Namespace
24 |
25 | """
26 | coloredlogs.install()
27 | levels = {
28 | "critical": logging.CRITICAL,
29 | "error": logging.ERROR,
30 | "warn": logging.WARNING,
31 | "warning": logging.WARNING,
32 | "info": logging.INFO,
33 | "debug": logging.DEBUG,
34 | }
35 | level = levels.get(args.loglevel.lower())
36 |
37 | if level == logging.DEBUG:
38 | tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)
39 |
40 | if args.logfile:
41 | self.handle_logfile(args, level)
42 | else:
43 | coloredlogs.install(level=level, fmt='%(levelname)s: %(message)s')
44 |
45 | def handlecli(self):
46 | """Handles the command line interface using argparse"""
47 | version = self.version
48 |
49 | default_path = strftime("%Y_%m_%d_%H_%M_%S", gmtime())
50 | parser = configargparse.ArgParser(
51 | description=f"Welcome to docanalysis version {version}. -h or --help for help",
52 | add_config_file_help=False,
53 | )
54 | parser.add_argument(
55 | "--run_pygetpapers",
56 | default=False,
57 | action="store_true",
58 | help="[Command] downloads papers from EuropePMC via pygetpapers",
59 | )
60 | parser.add_argument(
61 | "--make_section",
62 | default=False,
63 | action="store_true",
64 | help="[Command] makes sections; requires a fulltext.xml in CTree directories",
65 | )
66 | parser.add_argument(
67 | "-q",
68 | "--query",
69 | default=None,
70 | type=str,
71 | help="[pygetpapers] query string",
72 | )
73 | parser.add_argument(
74 | "-k",
75 | "--hits",
76 | type=str,
77 | default=None,
78 | help="[pygetpapers] number of papers to download",
79 | )
80 |
81 | parser.add_argument(
82 | "--project_name",
83 | type=str,
84 | help="CProject directory name",
85 | default=os.path.join(os.getcwd(), default_path),
86 | )
87 | parser.add_argument(
88 | "-d",
89 | "--dictionary",
90 | default=[],
91 | type=str,
92 | nargs='*',
93 | help="[file name/url] existing ami dictionary to annotate sentences or support supervised entity extraction",
94 | )
95 | parser.add_argument(
96 | "-o",
97 | "--output",
98 | default=False,
99 | help="outputs csv with sentences/terms",
100 | )
101 | parser.add_argument(
102 | "--make_ami_dict",
103 | default=False,
104 | help="[Command] title for ami-dict. Makes ami-dict of all extracted entities; works only with spacy",
105 | )
106 | parser.add_argument(
107 | "--search_section",
108 | default=['ALL'],
109 | action='store',
110 | dest='search_section',
111 | type=str,
112 | nargs='*',
113 | help="[NER/dictionary search] section(s) to annotate. Choose from: ALL, ACK, AFF, AUT, CON, DIS, ETH, FIG, INT, KEY, MET, RES, TAB, TIL. Defaults to ALL",
114 | )
115 |
116 | parser.add_argument(
117 | "--entities",
118 | default=['ALL'],
119 | action='store', dest='entities',
120 | type=str, nargs='*',
121 | help="[NER] entities to extract. Default (ALL). Common entities "
122 | "SpaCy: GPE, LANGUAGE, ORG, PERSON (for additional ones check: ); "
123 | )
124 |
125 | parser.add_argument(
126 | "--spacy_model",
127 | default=False,
128 | type=str,
129 | help="[NER] optional.",
130 | )
131 |
132 | parser.add_argument(
133 | "--html",
134 | default=False,
135 | type=str,
136 | help="outputs html with sentences/terms",
137 | )
138 |
139 | parser.add_argument(
140 | "--synonyms",
141 | default=False,
142 | type=str,
143 | help="annotate the corpus/sections with synonyms from ami-dict",
144 | )
145 | parser.add_argument(
146 | "--make_json",
147 | default=False,
148 | type=str,
149 | help="outputs json with sentences/terms",
150 | )
151 | parser.add_argument(
152 | "--search_html",
153 | default=False,
154 | action="store_true",
155 | help="searches html documents (mainly IPCC)",
156 | )
157 | parser.add_argument(
158 | "--extract_abb",
159 | default=False,
160 | help="[Command] title for abb-ami-dict. Extracts abbreviations and expansions; makes ami-dict of all extracted entities"
161 | )
162 |
163 | parser.add_argument(
164 | "-l",
165 | "--loglevel",
166 | default="info",
167 | help="provide logging level. "
168 | "Example --log warning <>, default='info'",
169 | )
170 |
171 | parser.add_argument(
172 | "-f",
173 | "--logfile",
174 | default=False,
175 | type=str,
176 | help="saves log to specified file in output directory as well as printing to terminal",
177 | )
178 |
179 | if len(sys.argv) == 1:
180 | parser.print_help(sys.stderr)
181 | sys.exit()
182 | args = parser.parse_args()
183 | for arg in vars(args):
184 | if vars(args)[arg] == "False":
185 | vars(args)[arg] = False
186 | self.handle_logger_creation(args)
187 | self.entity_extraction.extract_entities_from_papers(args.project_name, args.dictionary, search_sections=args.search_section, entities=args.entities, query=args.query, hits=args.hits,
188 | run_pygetpapers=args.run_pygetpapers, make_section=args.make_section, removefalse=True,
189 | csv_name=args.output, make_ami_dict=args.make_ami_dict, spacy_model=args.spacy_model, html_path=args.html, synonyms=args.synonyms, make_json=args.make_json, search_html=args.search_html, extract_abb=args.extract_abb)
190 |
191 |
192 | def main():
193 | """Runs the CLI"""
194 | calldocanalysis = Docanalysis()
195 | calldocanalysis.handlecli()
196 |
197 |
198 | if __name__ == "__main__":
199 | main()
200 |
--------------------------------------------------------------------------------
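Note: `handle_logger_creation` above calls `self.handle_logfile(args, level)`, but that helper is not shown in this file. Below is a minimal sketch of what such a helper could look like, based only on the `--logfile` help text (write to a file inside the project directory while still printing to the terminal); the mixin name and implementation are illustrative, not the project's actual code.

```python
import logging
import os

import coloredlogs


class DocanalysisLoggingMixin:
    """Hypothetical mixin; the real Docanalysis class would own this method."""

    def handle_logfile(self, args, level):
        # write logs to <project_name>/<logfile> and keep coloured terminal output
        os.makedirs(args.project_name, exist_ok=True)
        log_path = os.path.join(args.project_name, args.logfile)
        file_handler = logging.FileHandler(log_path, encoding="utf-8")
        file_handler.setFormatter(logging.Formatter("%(levelname)s: %(message)s"))
        logging.getLogger().addHandler(file_handler)
        coloredlogs.install(level=level, fmt="%(levelname)s: %(message)s")
```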
/docanalysis/entity_extraction.py:
--------------------------------------------------------------------------------
1 | from distutils.log import error
2 | import os
3 | import logging
4 | import requests
5 | from glob import glob
6 | import spacy
7 | from spacy import displacy
8 | from nltk import tokenize
9 | from spacy.matcher import PhraseMatcher
10 | import pandas as pd
11 | from bs4 import BeautifulSoup
12 | from tqdm import tqdm
13 | import xml.etree.ElementTree as ET
14 | from docanalysis.ami_sections import AMIAbsSection
15 | from pathlib import Path
16 | from pygetpapers import Pygetpapers
17 | from collections import Counter
18 | import pip
19 | import json
20 | import re
21 | from lxml import etree
22 | from pygetpapers.download_tools import DownloadTools
23 | from urllib.request import urlopen
24 | import nltk
25 | try:
26 | nltk.data.find('tokenizers/punkt')
27 | nltk.data.find('corpora/stopwords')
28 | except LookupError:
29 | nltk.download('punkt')
30 | nltk.download('stopwords')
31 | from nltk import tokenize
32 |
33 |
34 | def install(package):
35 | """
36 |
37 | :param package:
38 |
39 | """
40 | if hasattr(pip, 'main'):
41 | pip.main(['install', package])
42 | else:
43 | pip._internal.main(['install', package])
44 |
45 |
46 | try:
47 | from abbreviations import schwartz_hearst
48 | except ModuleNotFoundError:
49 | install('abbreviations')
50 | from abbreviations import schwartz_hearst
51 |
52 |
53 | #nlp_phrase = spacy.load("en_core_web_sm")
54 |
55 | CONFIG_SECTIONS = 'https://raw.githubusercontent.com/petermr/docanalysis/main/docanalysis/config/default_sections.json'
56 | CONFIG_AMI_DICT = 'https://raw.githubusercontent.com/petermr/docanalysis/main/docanalysis/config/default_dicts.json'
57 |
58 |
59 | class EntityExtraction:
60 | """EntityExtraction Class"""
61 |
62 | def __init__(self):
63 | logging.basicConfig(level=logging.INFO)
64 | self.sections = self.json_to_dict(CONFIG_SECTIONS)
65 | self.dict_of_ami_dict = self.json_to_dict(CONFIG_AMI_DICT)
66 | self.all_paragraphs = {}
67 | self.sentence_dictionary = {}
68 | self.spacy_model = 'spacy'
69 | self.nlp = None
70 |
71 | def download_spacy(self, spacy_type):
72 | """Download or load spacy
73 |
74 |         :param spacy_type: model key; currently only "spacy" (which loads en_core_web_sm) is supported
75 |         :type spacy_type: string
76 |
77 | """
78 | logging.info(f'Loading {spacy_type}')
79 |
80 | if spacy_type == "spacy":
81 | try:
82 | self.nlp = spacy.load('en_core_web_sm')
83 | except OSError:
84 | from spacy.cli import download
85 | download('en_core_web_sm')
86 | self.nlp = spacy.load('en_core_web_sm')
87 |
88 | def dictionary_to_html(self, html_path):
89 | """Converts dictionary to html
90 |
91 | :param html_path: path to save html
92 | :type html_path: string
93 |
94 | """
95 | list_of_docs = []
96 | for sentence in self.sentence_dictionary:
97 | list_of_docs.append(self.sentence_dictionary[sentence]['doc'])
98 | html = displacy.render(list_of_docs, style="ent",
99 | page=True, minify=True)
100 | logging.info(f"saving output: {html_path}")
101 | self._write_string_to_file(html, html_path)
102 |
103 | def extract_entities_from_papers(self, corpus_path, terms_xml_path, search_sections, entities, query=None, hits=30,
104 | run_pygetpapers=False, make_section=False, removefalse=True,
105 | csv_name=False, make_ami_dict=False, spacy_model=False, html_path=False, synonyms=False, make_json=False, search_html=False, extract_abb=False):
106 | """logic implementation (Q: how detailed should the description here be?)
107 |
108 | :param corpus_path:
109 | :param terms_xml_path:
110 | :param search_sections:
111 | :param entities:
112 | :param query: (Default value = None)
113 | :param hits: (Default value = 30)
114 | :param run_pygetpapers: (Default value = False)
115 | :param make_section: (Default value = False)
116 | :param removefalse: (Default value = True)
117 | :param csv_name: (Default value = False)
118 | :param make_ami_dict: (Default value = False)
119 | :param spacy_model: (Default value = False)
120 | :param html_path: (Default value = False)
121 | :param synonyms: (Default value = False)
122 | :param make_json: (Default value = False)
123 | :param search_html: (Default value = False)
124 | :param extract_abb: (Default value = False)
125 |
126 | """
127 |
128 | self.spacy_model = spacy_model
129 | corpus_path = os.path.abspath(corpus_path)
130 | if run_pygetpapers:
131 | if not query:
132 | logging.warning(
133 | "please provide query (like 'terpene', 'essential oils') as parameter")
134 | return
135 | self.run_pygetpapers(query, hits, corpus_path)
136 | if os.path.isdir(corpus_path):
137 | if make_section:
138 | self.run_ami_section(corpus_path)
139 | else:
140 | logging.error("CProject doesn't exist")
141 | return
142 | if search_html:
143 | search_sections = ['HTML', ]
144 | if search_sections == ['ALL', ]:
145 | search_sections = self.sections.keys()
146 | if len(glob(os.path.join(corpus_path, '**', 'sections'))) > 0:
147 | self.all_paragraphs = self.get_glob_for_section(
148 | corpus_path, search_sections)
149 | else:
150 |             logging.error('no sections found; section the papers using --make_section before searching')
151 | if spacy_model or csv_name or extract_abb or make_ami_dict:
152 | if search_html:
153 | self.make_dict_with_parsed_document(document_type='html')
154 | else:
155 | self.make_dict_with_parsed_document()
156 | if spacy_model:
157 | self.run_spacy_over_sections(self.sentence_dictionary, entities)
158 | self.remove_statements_not_having_xmldict_terms(
159 | dict_with_parsed_xml=self.sentence_dictionary, searching='entities')
160 | if terms_xml_path:
161 | for i in range(len(terms_xml_path)):
162 | compiled_terms = self.get_terms_from_ami_xml(terms_xml_path[i])
163 | self.add_if_file_contains_terms(
164 | compiled_terms=compiled_terms, dict_with_parsed_xml=self.sentence_dictionary, searching=f'{i}')
165 | if removefalse:
166 | self.remove_statements_not_having_xmldict_terms(
167 | dict_with_parsed_xml=self.sentence_dictionary, searching=f'{i}')
168 |         if synonyms:
169 |             for dictionary_path in terms_xml_path:
170 |                 synonyms_list = self.get_synonyms_from_ami_xml(dictionary_path)
171 |                 self.add_if_file_contains_terms(compiled_terms=synonyms_list, dict_with_parsed_xml=self.sentence_dictionary, searching='has_synonyms')
172 |             if removefalse:
173 |                 self.remove_statements_not_having_xmldict_terms(
174 |                     dict_with_parsed_xml=self.sentence_dictionary, searching='has_synonyms')
175 | if html_path:
176 | self.dictionary_to_html(
177 | os.path.join(corpus_path, html_path))
178 | if extract_abb:
179 | self.abbreviation_search_using_sw(self.sentence_dictionary)
180 | abb_ami_dict_path = os.path.join(corpus_path, extract_abb)
181 | self.make_ami_dict_from_abbreviation(
182 | extract_abb, self.sentence_dictionary, abb_ami_dict_path)
183 | if removefalse:
184 | self.remove_statements_not_having_xmldict_terms(
185 | dict_with_parsed_xml=self.sentence_dictionary, searching='abb')
186 |
187 | if csv_name:
188 | dict_with_parsed_xml_no_paragrph = self.remove_paragraph_form_parsed_xml_dict(
189 | self.sentence_dictionary, "paragraph")
190 | self.convert_dict_to_csv(
191 | path=os.path.join(corpus_path, f'{csv_name}'), dict_with_parsed_xml=dict_with_parsed_xml_no_paragrph)
192 | if make_json:
193 | dict_with_parsed_xml_no_paragrph = self.remove_paragraph_form_parsed_xml_dict(
194 | self.sentence_dictionary, "paragraph")
195 | self.convert_dict_to_json(path=os.path.join(
196 | corpus_path, f'{make_json}'), dict_with_parsed_xml=dict_with_parsed_xml_no_paragrph)
197 | if make_ami_dict:
198 | ami_dict_path = os.path.join(corpus_path, make_ami_dict)
199 | self.handle_ami_dict_creation(
200 | self.sentence_dictionary, make_ami_dict, ami_dict_path)
201 |
202 | return self.sentence_dictionary
203 |
204 | def run_pygetpapers(self, query, hits, output):
205 | """calls pygetpapers to query EPMC for papers; downloads specified number of papers
206 |
207 | :param query: query to pygetpapers/EPMC
208 | :type query: str
209 | :param hits: number of papers to download
210 | :type hits: int
211 | :param output: name of the folder
212 | :type output: str
213 |
214 | """
215 | pygetpapers_call = Pygetpapers()
216 | pygetpapers_call.run_command(
217 | query=query, limit=hits, output=output, xml=True)
218 | logging.info(f"making CProject {output} with {hits} papers on {query}")
219 |
220 | def run_ami_section(self, path):
221 | """Creates sections folder for each paper (CTree); sections papers into front, body, back and floats based on JATS
222 |
223 | :param path: CProject path
224 | :type path: string
225 |
226 | """
227 | file_list = glob(os.path.join(
228 | path, '**', 'fulltext.xml'), recursive=True)
229 | for paper in file_list:
230 | with open(paper, 'r') as xml_file:
231 | xml_string = xml_file.read()
232 | if len(xml_string) > 0:
233 | outdir = Path(Path(paper).parent, "sections")
234 | AMIAbsSection.make_xml_sections(paper, outdir, True)
235 | else:
236 | logging.warning(f"{paper} is empty")
237 |
238 | def get_glob_for_section(self, path, section_names):
239 | """globs for xml files in section folder of each CTree
240 |
241 | :param path: CProject path
242 | :type path: string
243 | :param section_names: one or more keys (section names) from CONFIG_SECTIONS
244 | :type section_names: string
245 | :returns: list of globs
246 | :rtype: list
247 |
248 | """
249 | for section_name in section_names:
250 | if section_name in self.sections.keys():
251 | self.all_paragraphs[section_name] = []
252 | for section in self.sections[section_name]:
253 | self.all_paragraphs[section_name] += glob(os.path.join(
254 | path, '**', 'sections', '**', section), recursive=True)
255 | else:
256 | logging.error(
257 | "please make sure that you have selected only the supported sections: ACK, AFF, AUT, CON, DIS, ETH, FIG, INT, KEY, MET, RES, TAB, TIL")
258 | return self.all_paragraphs
259 |
260 | def make_dict_with_parsed_document(self, document_type="xml"):
261 | """creates dictionary with parsed xml or html
262 |
263 | :param document_type: type of file fed: xml or html. Defaults to "xml".
264 | :type document_type: str
265 | :returns: python dict containing parsed text from xml or html
266 | :rtype: dict
267 |
268 | """
269 |
270 | self.sentence_dictionary = {}
271 |
272 | counter = 1
273 | for section in self.all_paragraphs:
274 | for section_path in tqdm(self.all_paragraphs[section]):
275 | paragraph_path = section_path
276 | if document_type == 'html':
277 | paragraph_text = self.read_text_from_html(paragraph_path)
278 | elif document_type == 'xml':
279 | paragraph_text = self.read_text_from_path(paragraph_path)
280 | sentences = tokenize.sent_tokenize(paragraph_text)
281 | for sentence in sentences:
282 | self.sentence_dictionary[counter] = {}
283 | self._make_dict_attributes(
284 | counter, section, section_path, paragraph_text, sentence)
285 | counter += 1
286 | logging.info(
287 | f"Found {len(self.sentence_dictionary)} sentences in the section(s).")
288 | return self.sentence_dictionary
289 |
290 | def _make_dict_attributes(self, counter, section, section_path, paragraph_text, sentence):
291 | """
292 |
293 | :param counter:
294 | :param section:
295 | :param section_path:
296 | :param paragraph_text:
297 | :param sentence:
298 |
299 | """
300 | dict_for_sentences = self.sentence_dictionary[counter]
301 | dict_for_sentences["file_path"] = section_path
302 | dict_for_sentences["paragraph"] = paragraph_text
303 | dict_for_sentences["sentence"] = sentence
304 | dict_for_sentences["section"] = section
305 |
306 | def read_text_from_path(self, paragraph_path):
307 | """uses ElementTree to read text from xml files
308 |
309 | :param paragraph_path: path to xml file
310 | :type paragraph_path: string
311 | :returns: raw text from xml
312 | :rtype: string
313 |
314 | """
315 | try:
316 | tree = ET.parse(paragraph_path)
317 | root = tree.getroot()
318 | xmlstr = ET.tostring(root, encoding='utf8', method='xml')
319 | soup = BeautifulSoup(xmlstr, features='lxml')
320 | text = soup.get_text(separator=" ")
321 | paragraph_text = text.replace(
322 | '\n', ' ')
323 |         except Exception:
324 | paragraph_text = "empty"
325 | logging.error(f"cannot parse {paragraph_path}")
326 | return paragraph_text
327 |
328 | def read_text_from_html(self, paragraph_path):
329 | """uses beautifulsoup to read text from html files
330 |
331 | :param paragraph_path: path to html file
332 | :type paragraph_path: string
333 | :returns: raw text from html
334 | :rtype: string
335 |
336 | """
337 | with open(paragraph_path, encoding="utf-8") as f:
338 | content = f.read()
339 | soup = BeautifulSoup(content, 'html.parser')
340 | return soup.text.replace('\n', ' ')
341 |
342 | def run_spacy_over_sections(self, dict_with_parsed_xml, entities_names):
343 | """uses spacy to extract specific Named-Entities from sentences in python dict
344 |
345 | :param dict_with_parsed_xml: main dict with sentences
346 | :type dict_with_parsed_xml: dict
347 |         :param entities_names: list of kinds of Named-Entities that need to be extracted
348 | :type entities_names: list
349 |
350 | """
351 | self.download_spacy(self.spacy_model)
352 | for paragraph in tqdm(dict_with_parsed_xml):
353 | if len(dict_with_parsed_xml[paragraph]['sentence']) > 0:
354 |                 doc = dict_with_parsed_xml[paragraph]['doc'] = self.nlp(dict_with_parsed_xml[paragraph]['sentence'])  # keep the Doc so dictionary_to_html can render it with displacy
355 | entities, labels, position_end, position_start, abbreviations, abbreviations_longform, abbreviation_start, abbreviation_end = self._make_required_lists()
356 | self._get_entities(entities_names, doc, entities,
357 | labels, position_end, position_start)
358 | self._add_lists_to_dict(dict_with_parsed_xml[paragraph], entities, labels, position_end,
359 | position_start, abbreviations, abbreviations_longform, abbreviation_start, abbreviation_end)
360 |
361 | def _get_entities(self, entities_names, doc, entities, labels, position_end, position_start):
362 | """
363 |
364 | :param entities_names:
365 | :param doc:
366 | :param entities:
367 | :param labels:
368 | :param position_end:
369 | :param position_start:
370 |
371 | """
372 | for ent in doc.ents:
373 | if (ent.label_ in entities_names) or (entities_names == ['ALL']):
374 | self._add_parsed_entities_to_lists(
375 | entities, labels, position_end, position_start, ent)
376 |
377 | def abbreviation_search_using_sw(self, dict_with_parsed_xml):
378 | """Extracts abbreviations from sentences using schwartz_hearst. Credit: Ananya Singha
379 |
380 | :param dict_with_parsed_xml: main python dictionary with sentences
381 | :type dict_with_parsed_xml: dict
382 |
383 | """
384 | for text in dict_with_parsed_xml:
385 | dict_for_sentence = dict_with_parsed_xml[text]
386 | dict_for_sentence["abb"] = []
387 | pairs = schwartz_hearst.extract_abbreviation_definition_pairs(
388 | doc_text=dict_for_sentence['sentence'])
389 | dict_for_sentence["abb"] = pairs
390 | self._make_list_from_dict(pairs)
391 |
392 | def make_abb_exp_list(self, result_dictionary):
393 | """make lists of abbreviations and expansions to input into xml dictionary creating method
394 |
395 | :param result_dictionary: main dictionary that contains sentences and abbreviation dict (abb and expansion)
396 | :type result_dictionary: dict
397 | :returns: all abbreviations
398 | :rtype: list
399 |
400 | """
401 | list_of_name_lists = []
402 | list_of_term_lists = []
403 | for entry in result_dictionary:
404 | sentence_dictionary = result_dictionary[entry]
405 | if 'abb' in sentence_dictionary:
406 | pairs_dicts = (result_dictionary[entry]['abb'])
407 | name_list_for_every_dict, term_list_for_every_dict = self._make_list_from_dict(
408 | pairs_dicts)
409 | list_of_name_lists.append(name_list_for_every_dict)
410 | list_of_term_lists.append(term_list_for_every_dict)
411 | return self._list_of_lists_to_single_list(list_of_name_lists), self._list_of_lists_to_single_list(list_of_term_lists)
412 |
413 | def _make_list_from_dict(self, pairs):
414 | """
415 |
416 | :param pairs:
417 |
418 | """
419 | keys_list = []
420 | values_list = []
421 | keys_list.extend(pairs.keys())
422 | values_list.extend(pairs.values())
423 | return keys_list, values_list
424 |
425 | def _list_of_lists_to_single_list(self, list_of_lists):
426 | """
427 |
428 | :param list_of_lists:
429 |
430 | """
431 | return [item for sublist in list_of_lists for item in sublist]
432 |
433 | def make_ami_dict_from_abbreviation(self, title, result_dictionary, path):
434 | """create xml ami-dict containing abbreviations extracted from sentences
435 |
436 | :param title: title of xml ami-dict
437 | :type title: str
438 |         :param result_dictionary: main dictionary with sentences and corresponding abbreviations
439 | :type result_dictionary: dict
440 | :param path: path where the xml ami-dict file would lie
441 | :type path: str
442 |
443 | """
444 | name_list, term_list = self.make_abb_exp_list(result_dictionary)
445 | dictionary_element = etree.Element("dictionary")
446 | dictionary_element.attrib['title'] = title
447 | for name, term in tqdm(zip(name_list, term_list)):
448 |
449 | wiki_lookup_list = self.wiki_lookup(term)
450 | try:
451 | entry_element = etree.SubElement(dictionary_element, "entry")
452 | entry_element.attrib['name'] = name
453 | entry_element.attrib['term'] = term
454 | if len(wiki_lookup_list) == 0:
455 | entry_element.attrib['wikidataID'] = ""
456 | elif len(wiki_lookup_list) == 1:
457 | entry_element.attrib['wikidataID'] = ", ".join(wiki_lookup_list)
458 | else:
459 | raw_element = etree.SubElement(entry_element, 'raw')
460 | raw_element.attrib['wikidataID'] = ", ".join(wiki_lookup_list)
461 | except Exception as e:
462 | logging.error(f"Couldn't add {term} to amidict")
463 | xml_dict = self._etree_to_string(dictionary_element)
464 | self._write_string_to_file(xml_dict, f'{path}.xml')
465 | logging.info(f'wrote all abbreviations to ami dict {path}.xml')
466 |
467 | def _etree_to_string(self, dictionary_element):
468 | """
469 |
470 | :param dictionary_element:
471 |
472 | """
473 | xml_dict = etree.tostring(
474 | dictionary_element, pretty_print=True).decode('utf-8')
475 | return xml_dict
476 |
477 | def _get_abbreviations(self, doc, abbreviations, abbreviations_longform, abbreviation_start, abbreviation_end):
478 | """
479 |
480 | :param doc:
481 | :param abbreviations:
482 | :param abbreviations_longform:
483 | :param abbreviation_start:
484 | :param abbreviation_end:
485 |
486 | """
487 | for abrv in doc._.abbreviations:
488 | abbreviations.append(abrv)
489 | abbreviations_longform.append(abrv._.long_form)
490 | abbreviation_start.append(abrv.start)
491 | abbreviation_end.append(abrv.end)
492 |
493 | def add_if_file_contains_terms(self, compiled_terms, dict_with_parsed_xml, searching='has_terms'):
494 | """populate the main dictionary with term matches, its frequency and span
495 |
496 | :param compiled_terms: list of compiled ami-dict terms
497 | :type compiled_terms: list
498 | :param dict_with_parsed_xml: dictionary containing sentences
499 | :type dict_with_parsed_xml: dict
500 | :param searching: dict key name. Defaults to 'has_terms'.
501 | :type searching: str
502 |
503 | """
504 | for statement in tqdm(dict_with_parsed_xml):
505 | dict_for_sentence = dict_with_parsed_xml[statement]
506 | dict_for_sentence[f'{searching}'] = []
507 | dict_for_sentence[f'{searching}_span'] = []
508 | term_list, span_list, frequency = self.search_sentence_with_compiled_terms(
509 | compiled_terms, dict_for_sentence['sentence'])
510 | if term_list:
511 | dict_for_sentence[f'{searching}'].append(term_list)
512 | dict_for_sentence[f'weight_{searching}'] = frequency
513 | dict_for_sentence[f'{searching}_span'].append(span_list)
514 |
515 | def search_sentence_with_compiled_terms(self, compiled_terms, sentence):
516 | """search sentences using the compiled ami-dict entry
517 |
518 | :param compiled_terms: list of compiled ami-dict terms
519 | :type compiled_terms: list
520 | :param sentence: sentence to search using compiled terms
521 | :type sentence: string
522 |         :returns: matched terms, their spans, and the match frequency
523 |         :rtype: tuple(list, list, int)
524 |
525 | """
526 | # https://stackoverflow.com/questions/47681756/match-exact-phrase-within-a-string-in-python
527 | match_list = []
528 | span_list = []
529 | frequency = 0
530 | for compiled_term in compiled_terms:
531 |             term_match = compiled_term.search(sentence)  # Pattern.search's second argument is a start position, not a flag; IGNORECASE is applied at compile time in _regex_compile
532 | if term_match is not None:
533 | match_list.append(term_match.group())
534 | span_list.append(term_match.span())
535 | frequency = len(match_list)
536 | return match_list, span_list, frequency
537 |
538 | def get_terms_from_ami_xml(self, xml_path):
539 | """parses ami-dict (xml) and reads the entry terms; ami-dict can either be the default ones (user specifies python dict key) or customized ones (user specifies full path to it)
540 |
541 | :param xml_path: either keys from dict_of_ami_dict or full path to ami-dict
542 | :type xml_path: string
543 | :returns: list of regex compiled entry terms from ami-dict
544 | :rtype: list
545 |
546 | """
547 |         if xml_path in self.dict_of_ami_dict.keys():
548 |             logging.info(f"getting terms from {xml_path}")
549 |             tree = ET.parse(urlopen(self.dict_of_ami_dict[xml_path]))
550 |         elif os.path.isfile(xml_path):
551 |             logging.info(f"getting terms from {xml_path}")
552 |             tree = ET.parse(xml_path)
553 |         else:
554 |             logging.error(
555 |                 f'{xml_path} is neither a supported dictionary (EO_ACTIVITY, EO_COMPOUND, EO_EXTRACTION, EO_PLANT, EO_PLANT_PART, PLANT_GENUS, EO_TARGET, COUNTRY, DISEASE, DRUG, ORGANIZATION) nor an existing ami-dict file')
556 |             return set()
557 |         root = tree.getroot()
558 |         compiled_terms = self._compiled_regex(root.iter('entry'))
559 |         return set(compiled_terms)
560 |
561 | def _compiled_regex(self, iterate_over):
562 | """
563 |
564 | :param iterate_over:
565 |
566 | """
567 | compiled_terms = []
568 | for para in iterate_over:
569 | try:
570 | term = (para.attrib["term"])
571 | except KeyError:
572 | term = para.text
573 | try:
574 |                 compiled_terms.append(self._regex_compile(term))
575 |             except re.error:
576 |                 logging.warning(f'cannot use term {term}')
577 | 
578 | return compiled_terms
579 |
580 | def _regex_compile(self, term):
581 | """
582 |
583 | :param term:
584 |
585 | """
586 |         return re.compile(r'\b{}\b'.format(term), re.IGNORECASE)
587 |
588 | def get_synonyms_from_ami_xml(self, xml_path):
589 | """parses ami-dict (xml) and reads the entry's synonyms; ami-dict can either be the default ones (user specifies python dict key) or customized ones (user specifies full path to it)
590 |
591 | :param xml_path: either keys from dict_of_ami_dict or full path to ami-dict
592 | :type xml_path: string
593 | :returns: list of regex compiled entry's synonyms from ami-dict
594 | :rtype: list
595 |
596 | """
597 |         if xml_path in self.dict_of_ami_dict.keys():
598 |             logging.info(f"getting synonyms from {xml_path}")
599 |             tree = ET.parse(urlopen(self.dict_of_ami_dict[xml_path]))
600 |         elif os.path.isfile(xml_path):
601 |             logging.info(f"getting synonyms from {xml_path}")
602 |             tree = ET.parse(xml_path)
603 |         else:
604 |             logging.error(f'{xml_path} is neither a supported dictionary (EO_ACTIVITY, EO_COMPOUND, EO_EXTRACTION, EO_PLANT, EO_PLANT_PART, PLANT_GENUS, EO_TARGET, COUNTRY, DISEASE, DRUG, ORGANIZATION) nor an existing ami-dict file')
605 |             return []
606 |         root = tree.getroot()
607 |         synonyms = self._compiled_regex(root.findall("./entry/synonym"))
608 |         return synonyms
609 |
610 | def _make_required_lists(self):
611 | """ """
612 | abbreviations = []
613 | abbreviations_longform = []
614 | abbreviation_start = []
615 | abbreviation_end = []
616 | entities = []
617 | labels = []
618 | position_start = []
619 | position_end = []
620 | return entities, labels, position_end, position_start, abbreviations, abbreviations_longform, abbreviation_start, abbreviation_end
621 |
622 | def _add_lists_to_dict(self, dict_for_sentence, entities, labels, position_end,
623 | position_start, abbreviations, abbreviations_longform, abbreviation_start, abbreviation_end):
624 | """
625 |
626 | :param dict_for_sentence:
627 | :param entities:
628 | :param labels:
629 | :param position_end:
630 | :param position_start:
631 | :param abbreviations:
632 | :param abbreviations_longform:
633 | :param abbreviation_start:
634 | :param abbreviation_end:
635 |
636 | """
637 |
638 | dict_for_sentence['entities'] = entities
639 | dict_for_sentence['labels'] = labels
640 | dict_for_sentence['position_start'] = position_start
641 | dict_for_sentence['position_end'] = position_end
642 | dict_for_sentence['abbreviations'] = abbreviations
643 | dict_for_sentence['abbreviations_longform'] = abbreviations_longform
644 | dict_for_sentence['abbreviation_start'] = abbreviation_start
645 | dict_for_sentence['abbreviation_end'] = abbreviation_end
646 |
647 | def _add_parsed_entities_to_lists(self, entities, labels, position_end, position_start, ent=None):
648 | """
649 |
650 | :param entities:
651 | :param labels:
652 | :param position_end:
653 | :param position_start:
654 | :param ent: (Default value = None)
655 |
656 | """
657 | entities.append(ent.text)
658 | labels.append(ent.label_)
659 | position_start.append(ent.start_char)
660 | position_end.append(ent.end_char)
661 |
662 | def convert_dict_to_csv(self, path, dict_with_parsed_xml):
663 | """Turns python dictionary into CSV using pandas
664 |
665 | :param path: CSV file to write output
666 | :type path: string
667 | :param dict_with_parsed_xml: python dictionary that needs to be converted to csv
668 | :type dict_with_parsed_xml: dict
669 |
670 | """
671 | df = pd.DataFrame(dict_with_parsed_xml)
672 | df = df.T
673 | for col in df:
674 | try:
675 |                 df[col] = df[col].astype(str).str.replace(
676 |                     "[", "", regex=False).str.replace("]", "", regex=False)
677 |                 df[col] = df[col].str.replace(
678 |                     "'", "", regex=False)
679 |             except Exception:
680 | pass
681 | df.to_csv(path, encoding='utf-8', line_terminator='\r\n')
682 | logging.info(f"wrote output to {path}")
683 |
684 | def remove_paragraph_form_parsed_xml_dict(self, dict_with_parsed_xml, key_to_remove):
685 | """pops out the specified key value pairs from python dictionaries
686 |
687 | :param dict_with_parsed_xml: python dict from which a key-value pair needs to be removed
688 | :type dict_with_parsed_xml: dict
689 | :param key_to_remove: key of the pair that needs to be removed
690 | :type key_to_remove: string
691 | :returns: python dict with the specified key-value pair removed
692 | :rtype: dict
693 |
694 | """
695 | for entry in dict_with_parsed_xml:
696 | dict_with_parsed_xml[entry].pop(key_to_remove, None)
697 | return dict_with_parsed_xml
698 |
699 | def convert_dict_to_json(self, path, dict_with_parsed_xml):
700 | """writes python dictionary to json file
701 |
702 | :param path: json file path to write to
703 | :type path: str
704 | :param dict_with_parsed_xml: main dictionary with sentences, search hits, entities, etc.
705 | :type dict_with_parsed_xml: dict
706 |
707 | """
708 | with open(path, mode='w', encoding='utf-8') as f:
709 | json.dump(dict_with_parsed_xml, f, indent=4)
710 | logging.info(f"wrote JSON output to {path}")
711 |
712 | def remove_statements_not_having_xmldict_terms(self, dict_with_parsed_xml, searching='has_terms'):
713 | """removes key-value pairs from the main python dict that do not have any match hits
714 |
715 | :param dict_with_parsed_xml: python dictionary from which the specific key-value pairs needs to be removed
716 | :type dict_with_parsed_xml: dict
717 | :param searching: the key to the pair in the nested-dict that needs to be removed (Default value = 'has_terms')
718 | :type searching: str
719 |
720 | """
721 | statement_to_pop = []
722 | for statement in dict_with_parsed_xml:
723 |             sentence_dict = dict_with_parsed_xml[statement]
724 |             if len(sentence_dict[searching]) == 0:
725 | statement_to_pop.append(statement)
726 |
727 | for term in statement_to_pop:
728 | dict_with_parsed_xml.pop(term)
729 |
730 | def make_ami_dict_from_list(self, list_of_terms_with_count, title):
731 | """makes ami-dict from a python dictionary containing terms and frequencies.
732 |
733 |         :param list_of_terms_with_count: (term, count) tuples, e.g. from Counter.most_common()
734 |         :type list_of_terms_with_count: list
735 | :param title: title of the xml ami-dict as well as the name of the XML file
736 | :type title: string
737 | :returns: xml ami-dict
738 | :rtype: file
739 |
740 | """
741 | dictionary_element = etree.Element("dictionary")
742 | dictionary_element.attrib['title'] = title
743 | for term in list_of_terms_with_count:
744 | try:
745 | entry_element = etree.SubElement(dictionary_element, "entry")
746 | entry_element.attrib['term'] = term[0]
747 | entry_element.attrib['count'] = str(term[1])
748 | except Exception as e:
749 | logging.error(f"Couldn't add {term} to amidict")
750 | return self._etree_to_string(dictionary_element)
751 |
752 | def _write_string_to_file(self, string_to_put, title):
753 | """
754 |
755 | :param string_to_put:
756 | :param title:
757 |
758 | """
759 | with open(title, mode='w', encoding='utf-8') as f:
760 | f.write(string_to_put)
761 |
762 | def handle_ami_dict_creation(self, result_dictionary, title, path):
763 | """creates and writes ami dictionary with entities extracted and their frequency.
764 |
765 | :param result_dictionary: main python dictionary with sentences, entities, etc.
766 | :type result_dictionary: dict
767 | :param title: title of ami-dictionary (xml file)
768 | :type title: str
769 | :param path: file path
770 | :type path: str
771 |
772 | """
773 | list_of_entities = []
774 | for entry in result_dictionary:
775 | if 'entities' in result_dictionary[entry]:
776 | entity = result_dictionary[entry]['entities']
777 | list_of_entities.extend(entity)
778 | dict_of_entities_with_count = Counter(list_of_entities)
779 | list_of_terms_with_count = dict_of_entities_with_count.most_common()
780 | xml_dict = self.make_ami_dict_from_list(
781 | list_of_terms_with_count, title)
782 | self._write_string_to_file(xml_dict, f'{path}.xml')
783 | logging.info(f"Wrote all the entities extracted to {path}.xml")
784 |
785 | def json_to_dict(self, json_file_link):
786 | """loads json file as python dictionary
787 |
788 | :param json_file_link: link to json file on the web
789 | :type json_file_link: str
790 | :returns: python dictionary from json
791 | :rtype: dictionary
792 |
793 | """
794 | path = urlopen(json_file_link)
795 | json_dict = json.load(path)
796 | return (json_dict)
797 |
798 | def wiki_lookup(self, query):
799 | """Queries Wikidata API for Wikidata Item IDs for terms in ami-dict
800 |
801 |         :param query: term to query Wikidata for an ID
802 |         :type query: string
803 |         :returns: potential Wikidata item IDs
804 | :rtype: list
805 |
806 | """
807 | params = {
808 | "action": "wbsearchentities",
809 | "search": query,
810 | "language": "en",
811 | "format": "json"
812 | }
813 | data = requests.get(
814 | "https://www.wikidata.org/w/api.php", params=params)
815 | result = data.json()
816 | hit_list = []
817 |         for hit in result.get('search', []):
818 |             try:
819 |                 if "scientific article" not in hit["description"]:
820 |                     hit_list.append(hit["id"])
821 |             except KeyError:
822 |                 hit_list.append(hit["id"])
823 | return hit_list
824 |
825 |
826 | # take out the constants
827 | # look through download_tools (pygetpapers) and see if we have overlapping functionality.
828 | # functionality_from_(where you are getting a data)
829 |
830 |
831 | # Future goals
832 | # make tests automated
833 | # readthedocs
834 | # tutorials
835 | # repository management
836 |
--------------------------------------------------------------------------------
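For orientation, here is a minimal sketch of driving `EntityExtraction.extract_entities_from_papers` directly from Python, mirroring the keyword arguments the CLI passes above. The query, hit count and output names are illustrative only, and the call needs network access (pygetpapers, the default config JSONs and the remote ami-dictionaries).

```python
from docanalysis.entity_extraction import EntityExtraction

extractor = EntityExtraction()

# Download 5 EuropePMC papers on "essential oils", section them,
# then search the ethics-statement (ETH) sections against the built-in
# COUNTRY ami-dictionary and write the matching sentences to CSV.
results = extractor.extract_entities_from_papers(
    corpus_path="essential_oils_5",   # CProject directory (example name)
    terms_xml_path=["COUNTRY"],       # key from default_dicts.json
    search_sections=["ETH"],
    entities=["ALL"],
    query="essential oils",
    hits=5,
    run_pygetpapers=True,
    make_section=True,
    csv_name="country_hits.csv",
)
print(f"{len(results)} sentences matched")
```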
/docanalysis/file_lib.py:
--------------------------------------------------------------------------------
1 | """classes and methods to support path operations
2 |
3 | """
4 | import json
5 | import copy
6 | import glob
7 | import re
8 | import os
9 | import shutil
10 | from pathlib import Path, PurePath
11 | import logging
12 | from glob import glob
13 | from braceexpand import braceexpand
14 |
15 | logging.debug("loading file_lib")
16 |
17 | py4ami = "py4ami"
18 | RESOURCES = "resources"
19 |
20 | # section keys
21 | _DESC = "_DESC"
22 | PROJ = "PROJ"
23 | TREE = "TREE"
24 | SECTS = "SECTS"
25 | SUBSECT = "SUBSECT"
26 | SUBSUB = "SUBSUB"
27 | FILE = "FILE"
28 | SUFFIX = "SUFFIX"
29 |
30 | ALLOWED_SECTS = {_DESC, PROJ, TREE, SECTS, SUBSECT, SUBSUB, FILE, SUFFIX}
31 |
32 | # wildcards
33 | STARS = "**"
34 | STAR = "*"
35 |
36 | # suffixes
37 | S_PDF = "pdf"
38 | S_PNG = "png"
39 | S_SVG = "svg"
40 | S_TXT = "txt"
41 | S_XML = "xml"
42 |
43 | # markers for processing
44 | _NULL = "_NULL"
45 | _REQD = "_REQD"
46 |
47 | # known section names
48 | SVG = "svg"
49 | PDFIMAGES = "pdfimages"
50 | RESULTS = "results"
51 | SECTIONS = "sections"
52 |
53 | # subsects
54 | IMAGE_STAR = "image*"
55 |
56 | # subsects
57 | OCTREE = "*octree"
58 |
59 | # results
60 | SEARCH = "search"
61 | WORD = "word"
62 | EMPTY = "empty"
63 |
64 | # files
65 | FULLTEXT_PAGE = "fulltext-page*"
66 | CHANNEL_STAR = "channel*"
67 | RAW = "raw"
68 |
69 |
70 | class Globber:
71 | """utilities for globbing - may be obsolete"""
72 |
73 | def __init__(self, ami_path, recurse=True, cwd=None) -> None:
74 | self.ami_path = ami_path
75 | self.recurse = recurse
76 | self.cwd = os.getcwd() if cwd is None else cwd
77 |
78 | def get_globbed_files(self) -> list:
79 | """uses the glob_string_list in ami_path to create a path list"""
80 | files = []
81 | if self.ami_path:
82 | glob_list = self.ami_path.get_glob_string_list()
83 | for gl_str in glob_list:
84 | files += glob.glob(gl_str, recursive=self.recurse)
85 | return files
86 |
87 |
88 | class AmiPath:
89 | """holds a (keyed) scheme for generating lists of path globs
90 | The scheme has several segments which can be set to create a glob expr.
91 |
92 |
93 | """
94 | # keys for path scheme templates
95 | T_FIGURES = "fig_captions"
96 | T_OCTREE = "octree"
97 | T_PDFIMAGES = "pdfimages"
98 | T_RESULTS = "results"
99 | T_SECTIONS = "sections"
100 | T_SVG = "svg"
101 |
102 | logger = logging.getLogger("ami_path")
103 | # dict
104 |
105 | def __init__(self, scheme=None):
106 | self.scheme = scheme
107 |
108 | def print_scheme(self):
109 | """for debugging and enlightenment"""
110 | if self.scheme is not None:
111 | for key in self.scheme:
112 | print("key ", key, "=", self.scheme[key])
113 | print("")
114 |
115 | @classmethod
116 | def create_ami_path_from_templates(cls, key, edit_dict=None):
117 | """creates a new AmiPath object from selected template
118 | key: to template
119 | edit_dict: dictionary with values to edit in
120 |
121 | :param key:
122 | :param edit_dict: (Default value = None)
123 |
124 | """
125 |         key = key.lower() if key is not None else None
126 |         if key is None or key not in TEMPLATES:
127 |             cls.logger.error(f"cannot find key {key}")
128 |             cls.logger.error(
129 |                 f"no scheme for: {key}, expected one of {list(TEMPLATES.keys())}")
130 | ami_path = AmiPath()
131 | # start with default template values
132 | ami_path.scheme = copy.deepcopy(TEMPLATES[key])
133 | if edit_dict:
134 | ami_path.edit_scheme(edit_dict)
135 | return ami_path
136 |
137 | def edit_scheme(self, edit_dict):
138 | """edits values in self.scheme using edit_dict
139 |
140 | :param edit_dict:
141 |
142 | """
143 | for k, v in edit_dict.items():
144 | self.scheme[k] = v
145 |
146 | def permute_sets(self):
147 | """ """
148 | self.scheme_list = []
149 | self.scheme_list.append(self.scheme)
150 | # if scheme has sets, expand them
151 | change = True
152 | while change:
153 | change = self.expand_set_lists()
154 |
155 | def expand_set_lists(self):
156 | """expands the sets in a scheme
157 | note: sets are held as lists in JSON
158 |
159 | a scheme with 2 sets of size n and m is
160 | expanded to n*m schemes covering all permutations
161 | of the set values
162 |
163 | self.scheme_list contains all the schemes
164 |
165 | returns True if any sets are expanded
166 |
167 |
168 | """
169 | change = False
170 | for scheme in self.scheme_list:
171 | for sect, value in scheme.items():
172 | if type(value) == list:
173 | change = True
174 | # delete scheme with set, replace by copies
175 | self.scheme_list.remove(scheme)
176 | for set_value in value:
177 | scheme_copy = copy.deepcopy(scheme)
178 | self.scheme_list.append(scheme_copy)
179 | scheme_copy[sect] = set_value # poke in set value
180 | break # after each set processed
181 |
182 | return change
183 |
184 | def get_glob_string_list(self):
185 | """expand sets in AmiPath
186 | creates m*n... glob strings for sets with len n and m
187 |
188 |
189 | """
190 | self.permute_sets()
191 | self.glob_string_list = []
192 | for scheme in self.scheme_list:
193 | glob_string = AmiPath.create_glob_string(scheme)
194 | self.glob_string_list.append(glob_string)
195 | return self.glob_string_list
196 |
197 | @classmethod
198 | def create_glob_string(cls, scheme):
199 | """
200 |
201 | :param scheme:
202 |
203 | """
204 | globx = ""
205 | for sect, value in scheme.items():
206 |             cls.logger.debug(f"{sect} {type(value)} {value}")
207 | if sect not in ALLOWED_SECTS:
208 | cls.logger.error(f"unknown sect: {sect}")
209 | elif _DESC == sect:
210 | pass
211 | elif _REQD == value:
212 | cls.logger.error("must set ", sect)
213 | globx += _REQD + "/"
214 | elif _NULL == value:
215 | pass
216 | elif FILE == sect:
217 | globx += AmiPath.convert_to_glob(value)
218 | elif STAR in value:
219 | globx += AmiPath.convert_to_glob(value) + "/"
220 | elif SUFFIX == sect:
221 | globx += "." + AmiPath.convert_to_glob(value)
222 | else:
223 | globx += AmiPath.convert_to_glob(value) + "/"
224 | cls.logger.debug("glob", scheme, "=>", globx)
225 | return globx
226 |
227 | @classmethod
228 | def convert_to_glob(cls, value):
229 | """
230 |
231 | :param value:
232 |
233 | """
234 | valuex = value
235 | if type(value) == list:
236 | # tacky. string quotes and add commas and parens
237 | valuex = "("
238 | for v in value:
239 | valuex += v + ","
240 | valuex = valuex[:-1] + ")"
241 | return valuex
242 |
243 | def get_globbed_files(self):
244 | """ """
245 | files = Globber(self).get_globbed_files()
246 | self.logger.debug("files", len(files))
247 | return files
248 |
249 |
250 | class BraceGlobber:
251 | """ """
252 |
253 | def braced_glob(self, path, recursive=False):
254 | """
255 |
256 | :param path:
257 | :param recursive: (Default value = False)
258 |
259 | """
260 | ll = [glob(x, recursive=recursive) for x in braceexpand(path)]
261 | return ll
262 |
263 |
264 | class FileLib:
265 | """ """
266 |
267 | logger = logging.getLogger("file_lib")
268 |
269 | @classmethod
270 | def force_mkdir(cls, dirx):
271 | """ensure dirx exists
272 |
273 | :dirx: directory
274 |
275 | :param dirx:
276 |
277 | """
278 | if not os.path.exists(dirx):
279 | try:
280 | os.mkdir(dirx)
281 | except Exception as e:
282 | cls.logger.error(f"cannot make dirx {dirx} , {e}")
283 |
284 | @classmethod
285 | def force_mkparent(cls, file):
286 | """ensure parent directory exists
287 |
288 | :path: whose parent directory is to be created if absent
289 |
290 | :param file:
291 |
292 | """
293 | if file is not None:
294 | cls.force_mkdir(cls.get_parent_dir(file))
295 |
296 | @classmethod
297 | def force_write(cls, file, data, overwrite=True):
298 | """:write path, creating dirtectory if necessary
299 | :path: path to write to
300 | :data: str data to write
301 | :overwrite: force write iuf path exists
302 |
303 | may throw exception from write
304 |
305 | :param file:
306 | :param data:
307 | :param overwrite: (Default value = True)
308 |
309 | """
310 | if file is not None:
311 | if os.path.exists(file) and not overwrite:
312 | logging.warning(f"not overwriting existsnt path {file}")
313 | else:
314 | cls.force_mkparent(file)
315 | with open(file, "w", encoding="utf-8") as f:
316 | f.write(data)
317 |
318 | @classmethod
319 | def copy_file_or_directory(cls, dest_path, src_path, overwrite):
320 | """
321 |
322 | :param dest_path:
323 | :param src_path:
324 | :param overwrite:
325 |
326 | """
327 | if dest_path.exists():
328 | if not overwrite:
329 | file_type = "dirx" if dest_path.is_dir() else "path"
330 |                 raise TypeError(
331 |                     f"cannot overwrite existing {file_type}: {dest_path}")
332 |
333 | else:
334 | # assume directory
335 | cls.logger.warning(f"create directory {dest_path}")
336 | dest_path.mkdir(parents=True, exist_ok=True)
337 | cls.logger.info(f"created directory {dest_path}")
338 | if src_path.is_dir():
339 | if os.path.exists(dest_path):
340 | shutil.rmtree(dest_path)
341 | shutil.copytree(src_path, dest_path)
342 | cls.logger.info(f"copied directory {src_path} to {dest_path}")
343 | else:
344 | try:
345 | shutil.copy(src_path, dest_path) # will overwrite
346 | cls.logger.info(f"copied path {src_path} to {dest_path}")
347 | except Exception as e:
348 | cls.logger.fatal(f"Cannot copy direcctory {src_path} to {dest_path} because {e}")
349 |
350 | @staticmethod
351 | def create_absolute_name(file):
352 | """create absolute/relative name for a path relative to py4ami
353 |
354 | TODO this is messy
355 |
356 | :param file:
357 |
358 | """
359 | absolute_file = None
360 | if file is not None:
361 | file_dir = FileLib.get_parent_dir(__file__)
362 |             absolute_file = os.path.join(file_dir, file)
363 | return absolute_file
364 |
365 | @classmethod
366 | def get_py4ami(cls):
367 | """gets paymi_m pathname"""
368 | return Path(__file__).parent.resolve()
369 |
370 | @classmethod
371 | def get_pyami_root(cls):
372 | """gets paymi root pathname"""
373 | return Path(__file__).parent.parent.resolve()
374 |
375 | @classmethod
376 | def get_pyami_resources(cls):
377 | """gets paymi root pathname"""
378 | return Path(cls.get_py4ami(), RESOURCES)
379 |
380 | @classmethod
381 | def get_parent_dir(cls, file):
382 | """
383 |
384 | :param file:
385 |
386 | """
387 | return None if file is None else PurePath(file).parent
388 |
389 | @classmethod
390 | def read_pydictionary(cls, file):
391 | """read a json path into a python dictiomary
392 |
393 | :param file:
394 |
395 | """
396 | import ast
397 | with open(file, "r") as f:
398 | pydict = ast.literal_eval(f.read())
399 | return pydict
400 |
401 | @classmethod
402 | def punct2underscore(cls, text):
403 | """replace all ASCII punctuation except '.' , '-', '_' by '_'
404 |
405 | for filenames
406 |
407 | :param text:
408 |
409 | """
410 | # from py4ami.text_lib import TextUtil
411 | # this is non-trivial https://stackoverflow.com/questions/10017147/removing-a-list-of-characters-in-string
412 |
413 | non_file_punct = '\t \n{}!@#$%^&*()[]:;\'",|\\~+=/`'
414 | # [unicode(x.strip()) if x is not None else '' for x in row]
415 |
416 | #text0 = TextUtil.replace_chars(text, non_file_punct, "_")
417 | text0 = ''.join([c if c not in non_file_punct else "_" for c in text])
418 | return text0
419 |
420 | @classmethod
421 | def get_suffix(cls, file):
422 | """get suffix
423 | INCLUDES the "."
424 |
425 | :param file:
426 |
427 | """
428 | _suffix = None if file is None else Path(file).suffix
429 | return _suffix
430 |
431 |
432 | # see https://realpython.com/python-pathlib/
433 |
434 | def main():
435 | """ """
436 | print("started file_lib")
437 | # test_templates()
438 |
439 | print("finished file_lib")
440 |
441 |
442 | if __name__ == "__main__":
443 | print("running file_lib main")
444 | main()
445 | else:
446 | # print("running file_lib main anyway")
447 | # main()
448 | pass
449 |
450 | # examples of regex for filenames
451 |
452 |
453 | def glob_re(pattern, strings):
454 | """
455 |
456 | :param pattern:
457 | :param strings:
458 |
459 | """
460 | return filter(re.compile(pattern).match, strings)
461 |
462 |
463 | filenames = glob_re(r'.*(abc|123|a1b).*\.txt', os.listdir())
464 |
465 | # Credits: Peter Murray-Rust, py4ami (https://github.com/petermr/pyami/blob/main/py4ami/file_lib.py)
--------------------------------------------------------------------------------
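A short illustration of the `FileLib` helpers above; the paths are made up for the example and `file_lib` is assumed to be importable from the installed package.

```python
from pathlib import Path

from docanalysis.file_lib import FileLib

# punct2underscore makes a string safe to use as a file name
safe_name = FileLib.punct2underscore("results: essential oils (2022)")
out_file = Path("scratch", safe_name + ".txt")

# force_write creates the missing parent directory before writing
FileLib.force_write(str(out_file), "hello docanalysis", overwrite=True)

print(FileLib.get_suffix(out_file))      # ".txt" (suffix includes the dot)
print(FileLib.get_parent_dir(out_file))  # scratch
```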
/docanalysis/get_html.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | from glob import glob
3 | import os
4 | from abbreviations import schwartz_hearst
5 | from lxml import etree
6 | import yake
7 |
8 | def read_text_from_html(paragraph_path):
9 | """
10 |
11 | :param paragraph_path:
12 |
13 | """
14 | with open(paragraph_path, 'r') as f:
15 | html = f.read()
16 | soup = BeautifulSoup(html, features="html.parser")
17 |
18 | # kill all script and style elements
19 | for script in soup(["script", "style"]):
20 | script.extract() # rip it out
21 |
22 | # get text
23 | text = soup.get_text()
24 |
25 | # break into lines and remove leading and trailing space on each
26 | #lines = (line.strip() for line in text.splitlines())
27 | # break multi-headlines into a line each
28 | #chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
29 | # drop blank lines
30 | #text_write = '\n'.join(chunk for chunk in chunks if chunk)
31 | #text = '\n'.join(chunk for chunk in chunks if chunk)
32 | return text
33 |
34 | def get_glob(corpus_path):
35 | """
36 |
37 | :param corpus_path:
38 |
39 | """
40 | paragraph_path = glob(os.path.join(corpus_path, '**', 'sections', '**', "*html"), recursive=True)
41 | return paragraph_path
42 |
43 | def abbreviation_search_using_sw(paragraph_text):
44 | """
45 |
46 | :param paragraph_text:
47 |
48 | """
49 | pairs = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=paragraph_text)
50 | keys = pairs.keys()
51 | values = pairs.values()
52 | return keys, values
53 |
54 | def make_ami_dict_from_list(title, keys, values):
55 | """
56 |
57 | :param title:
58 | :param keys:
59 | :param values:
60 |
61 | """
62 |     dictionary_element = etree.Element("dictionary")
63 |     dictionary_element.attrib['title'] = title
64 |     for term, expansion in zip(keys, values):
65 |         entry_element = etree.SubElement(dictionary_element, "entry")
66 |         entry_element.attrib['term'] = term
67 |         entry_element.attrib['expansion'] = expansion
68 |     return etree.tostring(dictionary_element, pretty_print=True).decode('utf-8')
69 |
70 | def write_string_to_file(string_to_put,title):
71 | """
72 |
73 | :param string_to_put:
74 | :param title:
75 |
76 | """
77 | with open(title,mode='w', encoding='utf-8') as f:
78 | f.write(string_to_put)
79 | print(f"wrote dict to {title}")
80 |
81 | def extract_keyphrase(paragraph_text):
82 | """
83 |
84 | :param paragraph_text:
85 |
86 | """
87 | custom_kw_extractor = yake.KeywordExtractor(lan='en', n=5, top=10, features=None)
88 | keywords = custom_kw_extractor.extract_keywords(paragraph_text)
89 | keywords_list = []
90 | for kw in keywords:
91 | keywords_list.append(kw[0])
92 | print(keywords_list)
93 |
94 | def does_everything(corpus_path):
95 | """
96 |
97 | :param corpus_path:
98 |
99 | """
100 | all_text = []
101 | all_keys = []
102 | all_values = []
103 | all_paragraph_paths = get_glob(corpus_path)
104 | for paragraph_path in all_paragraph_paths:
105 | paragraph_text = read_text_from_html(paragraph_path)
106 | #print(paragraph_text)
107 | all_text.append(paragraph_text)
108 | keys, values = abbreviation_search_using_sw(paragraph_text)
109 | all_keys.extend(keys)
110 | all_values.extend(values)
111 | print(len(all_keys), all_values)
112 | #all_text_string = joinStrings(all_text)
113 | #print(all_text_string)
114 | #extract_keyphrase(all_text_string)
115 | #dict_string = make_ami_dict_from_list("abb", all_keys, all_values)
116 | #return dict_string
117 |
118 |
119 | def joinStrings(stringList):
120 | """
121 |
122 | :param stringList:
123 |
124 | """
125 | return ''.join(string for string in stringList)
126 |
127 | path = os.path.join(os.path.expanduser('~'), "ipcc_sectioned")
128 | does_everything(path)
129 | #write_string_to_file( dict_string, "abb.xml")
130 |
131 | import json
132 | from urllib.request import urlopen
133 |
134 | #PATH = urlopen()
135 | #json_dict = json.load(PATH)
136 | #print(json_dict)
137 |
138 |
--------------------------------------------------------------------------------
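A self-contained illustration of the abbreviation workflow the script above strings together: Schwartz-Hearst extraction followed by ami-dict creation. The example sentence is invented and the expected pairs are indicative only.

```python
from abbreviations import schwartz_hearst
from lxml import etree

text = ("The protocol was approved by the Institutional Review Board (IRB) "
        "and followed World Health Organization (WHO) guidelines.")

# Schwartz-Hearst pairs short forms with the long forms defined before the parentheses
pairs = schwartz_hearst.extract_abbreviation_definition_pairs(doc_text=text)
# expected to resemble {'IRB': 'Institutional Review Board', 'WHO': 'World Health Organization'}

dictionary = etree.Element("dictionary")
dictionary.attrib["title"] = "abb_demo"
for term, expansion in pairs.items():
    entry = etree.SubElement(dictionary, "entry")
    entry.attrib["term"] = term
    entry.attrib["expansion"] = expansion

print(etree.tostring(dictionary, pretty_print=True).decode("utf-8"))
```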
/docanalysis/glob_trail.py:
--------------------------------------------------------------------------------
1 | import os
2 | from glob import glob
3 | from pprint import pprint
4 |
5 | # define constants
6 | ABS = ['*abstract.xml']
7 | ACK = ['*ack.xml']
8 | AFF = ['*aff.xml']
9 | AUT = ['*contrib-group.xml']
10 | CON = ['*conclusion*/*.xml']
11 | DIS = ['*discussion*/**/*_title.xml', '*discussion*/**/*_p.xml'] # might bring unwanted sections like tables, fig. captions etc. Maybe get only title and paragraphs?
12 | ETH = ['*ethic*/*.xml']
13 | FIG = ['*fig*.xml']
14 | INT = ['*introduction*/*.xml', '*background*/*.xml']
15 | KEY = ['*kwd-group.xml']
16 | MET = ['*method*/*.xml', '*material*/*.xml'] # also gets us supplementary material. Not sure how to exclude them
17 | RES = ['*result*/*/*_title.xml', '*result*/*/*_p.xml'] # not sure if we should use recursive globbing or not.
18 | TAB = ['*table*.xml']
19 | TIL = ['*article-meta/*title-group.xml']
20 |
21 | # glob
22 | path = os.getcwd()
23 | cproj = 'corpus/asp_nat_products'
24 | LIST_SEC = [TIL, KEY]
25 | for SEC in LIST_SEC:
26 | for opt in SEC:
27 | glob_list=glob(os.path.join(path, cproj, '**', 'sections', '**', f'{opt}'), recursive=True)
28 | pprint(glob_list)
29 |
30 | # Section list comes from: https://github.com/petermr/pyami/blob/main/py4ami/resources/section_templates.json
--------------------------------------------------------------------------------
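The constants above have the same shape as the sections config consumed by `EntityExtraction.get_glob_for_section`: a mapping from a section key to a list of glob patterns. A small sketch of that idea, with an illustrative CProject path:

```python
import os
from glob import glob

# section key -> glob patterns, in the same spirit as the constants above
SECTIONS = {
    "ETH": ["*ethic*/*.xml"],
    "MET": ["*method*/*.xml", "*material*/*.xml"],
}


def glob_section(cproject_path, key):
    """Collect the section XML files for one section key of a sectioned CProject."""
    hits = []
    for pattern in SECTIONS[key]:
        hits += glob(os.path.join(cproject_path, "**", "sections", "**", pattern),
                     recursive=True)
    return hits


print(len(glob_section("corpus/asp_nat_products", "MET")), "method/material files")
```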
/docanalysis/gui.py:
--------------------------------------------------------------------------------
1 | import eel
2 | from pygetpapers import Pygetpapers
3 | import os
4 |
5 | eel.init(f'{os.path.dirname(os.path.realpath(__file__))}/gui')
6 |
7 |
8 | @eel.expose
9 | def create_corpus(path, query, number):
10 | pygetpapers_call = Pygetpapers()
11 | pygetpapers_call.run_command(
12 | query=query, limit=number, output=path, xml=True)
13 |
14 |
15 | eel.start('main.html')
16 |
--------------------------------------------------------------------------------
/docanalysis/gui/css/main.css:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petermr/docanalysis/eb8a3c13f9491b41f252a363953a976406f964b6/docanalysis/gui/css/main.css
--------------------------------------------------------------------------------
/docanalysis/gui/eel.js:
--------------------------------------------------------------------------------
1 |
2 |
3 | function make_papers () {
4 |     const query = document.getElementById("query").value;
5 |     const number = document.getElementById("number").value;
6 |     const path = document.getElementById("path").value;
7 |     eel.create_corpus(path, query, number);  // call the Python function exposed via @eel.expose
8 | }
--------------------------------------------------------------------------------
/docanalysis/gui/main.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 | Docanalysis GUI
10 |
11 |
12 |
--------------------------------------------------------------------------------
/docanalysis/xml_lib.py:
--------------------------------------------------------------------------------
516 |         child elements of row containing string values
517 |
518 | :param row: list of str
519 | :param row: [str]:
520 |
521 | """
522 | if row is not None:
523 | tr = LXET.SubElement(self.tbody, H_TR)
524 | for val in row:
525 | td = LXET.SubElement(tr, H_TD)
526 | td.text = val
527 | # print("td", td.text)
528 |
529 | def make_row(self):
530 | """:return: row element"""
531 | return LXET.SubElement(self.tbody, H_TR)
532 |
533 | def append_contained_text(self, parent, tag, text):
534 | """create element and add text child
535 |
536 | :param parent:
537 | :param tag:
538 | :param text:
539 |
540 | """
541 | subelem = LXET.SubElement(parent, tag)
542 | subelem.text = text
543 | return subelem
544 |
545 | def write_full_data_tables(self, output_dir: str) -> None:
546 | """
547 |
548 | :param output_dir: str:
549 |
550 | """
551 | if not os.path.exists(output_dir):
552 | os.makedirs(output_dir)
553 | data_table_file = os.path.join(output_dir, "full_data_table.html")
554 | with open(data_table_file, "w") as f:
555 | text = bytes.decode(LXET.tostring(self.html))
556 | f.write(text)
557 | print("WROTE", data_table_file)
558 |
559 | def __str__(self):
560 | # s = self.html.text
561 | # print("s", s)
562 | # return s
563 | # ic("ichtml", self.html)
564 |         htmltext = LXET.tostring(self.html).decode("UTF-8")
565 | print("SELF", htmltext)
566 | return htmltext
567 |
568 |
569 | class Web:
570 | """ """
571 | def __init__(self):
572 | import tkinter as tk
573 | root = tk.Tk()
574 | site = "http://google.com"
575 | self.display_html(root, site)
576 | root.mainloop()
577 |
578 | @classmethod
579 | def display_html(cls, master, site):
580 | """
581 |
582 | :param master:
583 | :param site:
584 |
585 | """
586 | import tkinterweb
587 | frame = tkinterweb.HtmlFrame(master)
588 | frame.load_website(site)
589 | frame.pack(fill="both", expand=True)
590 |
591 | @classmethod
592 | def tkinterweb_demo(cls):
593 | """ """
594 | from tkinterweb import Demo
595 | Demo()
596 |
597 |
598 | def main():
599 | """ """
600 |
601 | XmlLib().test_recurse_sections() # recursively list sections
602 |
603 | # test_data_table()
604 | # test_xml()
605 |
606 | # web = Web()
607 | # Web.tkinterweb_demo()
608 |
609 |
610 | def test_xml():
611 | """ """
612 | xml_string = "foo and with bar"
613 | print(XmlLib.remove_all_tags(xml_string))
614 |
615 |
616 | def test_data_table():
617 | """ """
618 | import pprint
619 | data_table = DataTable("test")
620 | data_table.add_column_heads(["a", "b", "c"])
621 | data_table.add_row_old(["a1", "b1", "c1"])
622 | data_table.add_row_old(["a2", "b2", "c2"])
623 | data_table.add_row_old(["a3", "b3", "c3"])
624 | data_table.add_row_old(["a4", "b4", "c4"])
625 | html = LXET.tostring(data_table.html).decode("UTF-8")
626 | HOME = os.path.expanduser("~")
627 | with open(os.path.join(HOME, "junk_html.html"), "w") as f:
628 | f.write(html)
629 | pprint.pprint(html)
630 |
631 |
632 | if __name__ == "__main__":
633 | print("running file_lib main")
634 | main()
635 | else:
636 | # print("running file_lib main anyway")
637 | # main()
638 | pass
639 |
640 | # Credits: Peter Murray-Rust, py4ami (https://github.com/petermr/pyami/blob/main/py4ami/file_lib.py)
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
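22 | # Example: "make html" expands to "sphinx-build -M html source build" and writes
23 | # the HTML documentation to build/html.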
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx_rtd_theme
2 | myst-parser
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # For the full list of built-in configuration values, see the documentation:
4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
5 |
6 | # -- Project information -----------------------------------------------------
7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
8 | import os
9 | import sys
10 | import sphinx_rtd_theme
11 | sys.path.insert(0, os.path.abspath('..'))
12 | sys.path.append(os.path.abspath('../..'))
13 | project = 'Docanalysis'
14 | copyright = '2022, Ayush Garg, Shweata N Hegde'
15 | author = 'Ayush Garg, Shweata N Hegde'
16 | release = '0.2.4'
17 |
18 | # -- General configuration ---------------------------------------------------
19 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
20 |
21 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon', 'myst_parser']
22 |
23 | templates_path = ['_templates']
24 | exclude_patterns = []
25 | napoleon_google_docstring = True
26 |
27 |
28 | # -- Options for HTML output -------------------------------------------------
29 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
30 |
31 | html_theme = 'sphinx_rtd_theme'
32 | html_static_path = ['_static']
33 |
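34 | # Note: the sys.path entries above make the docanalysis package importable, which
35 | # sphinx.ext.autodoc needs in order to resolve the automodule directives in
36 | # docanalysis.rst and entity_extraction.rst.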
--------------------------------------------------------------------------------
/docs/source/docanalysis.rst:
--------------------------------------------------------------------------------
1 | Docanalysis module
2 | ==================================
3 |
4 | .. automodule:: docanalysis.docanalysis
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/source/entity_extraction.rst:
--------------------------------------------------------------------------------
1 | Entity extraction module
2 | ==================================
3 |
4 | .. automodule:: docanalysis.entity_extraction
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | User Documentation
2 | ==============================
3 |
4 | .. include:: ../../README.md
5 | :parser: myst_parser.sphinx_
6 |
7 | .. toctree::
8 | :maxdepth: 1
9 | :hidden:
10 | :caption: User Documentation:
11 |
12 | user_documentation
13 |
14 |
15 | .. toctree::
16 | :maxdepth: 7
17 | :caption: Core modules:
18 |
19 | docanalysis
20 | entity_extraction
--------------------------------------------------------------------------------
/notebooks/README.md:
--------------------------------------------------------------------------------
1 | # Notebooks
2 |
3 | A general resource for contributed code
4 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | abbreviations>=0.2.5
2 | beautifulsoup4>=4.10.0
3 | braceexpand>=0.1.7
4 | coloredlogs>=15.0.1
5 | ConfigArgParse>=1.5.3
6 | lxml>=4.7.1
7 | nltk>=3.6.7
8 | pandas>=1.3.4
9 | pygetpapers
10 | pytest>=6.2.5
11 | setuptools>=60.3.1
12 | spacy>=3.0.7
13 | tkinterweb>=3.10.7
14 | tqdm>=4.62.3
15 | yake>=0.4.8
16 | sphinx_rtd_theme>=1.0.0
17 |
--------------------------------------------------------------------------------
/resources/approval_number_100.csv:
--------------------------------------------------------------------------------
1 | ,file_path,paragraph,sentence,entities,labels,position_start,position_end,has_terms,weight
2 | 33,C:\Users\shweata\approval_number_100\PMC7833043\sections\1_body\3_ethics_approval\1_p.xml,"The study was approved by the King Khalid University Ethics Committee (approval number: ECM#2020-183–(HAPO-06-B-001), and no identifying personal information (e.g. name, age) or other sensitive data were collected.","The study was approved by the King Khalid University Ethics Committee (approval number: ECM#2020-183–(HAPO-06-B-001), and no identifying personal information (e.g.",the King Khalid University Ethics Committee,ORG,26,69,approval number,1
3 | 63,C:\Users\shweata\approval_number_100\PMC7833043\sections\2_back\2_ethical_approval\1_p.xml,"The Ethical Committee of the Scientific Research, King Khalid University approved the study (approval number: ECM#2020-183–(HAPO-06-B-001) to use scores and absence rates, with no personal information of students disclosed.","The Ethical Committee of the Scientific Research, King Khalid University approved the study (approval number: ECM#2020-183–(HAPO-06-B-001) to use scores and absence rates, with no personal information of students disclosed.",The Ethical Committee of the Scientific Research,ORG,0,48,approval number,1
4 | 96,C:\Users\shweata\approval_number_100\PMC8023627\sections\2_back\2_ethics_approval_and_conse\1_p.xml,"The data were obtained from the Saudi Ministry of Health and World Health Organization records and the study conducted under the approval of the Regional Directorate of Primary Health according to ethical standards with the maintenance of anonymity of each patient. Thus, all the data of patients was recorded without patients details, it was not necessary to obtain the personal consent of the study participants. The study was ethically approved by the institutional review board of the Princess Nourah Bint Abdulrahman University (IRB Approval Number: 20–0217).",The study was ethically approved by the institutional review board of the Princess Nourah Bint Abdulrahman University (IRB Approval Number: 20–0217).,Nourah,GPE,83,89,approval number,1
5 | 169,C:\Users\shweata\approval_number_100\PMC8185902\sections\1_body\1_p.xml,"Following the publication of the above article, the authors have realized that, in the Declarations section on p. 10, they presented an incorrect approval number from the Ethics Committee in question; the statement here should have read as follows: “The present study was approved by the Ethics Committee of the Affiliated Hospital of Shaoxing University (approval no. 2021003). All patients provided written informed consent.”","Following the publication of the above article, the authors have realized that, in the Declarations section on p. 10, they presented an incorrect approval number from the Ethics Committee in question; the statement here should have read as follows: “The present study was approved by the Ethics Committee of the Affiliated Hospital of Shaoxing University (approval no.","the Ethics Committee, the Ethics Committee of the Affiliated Hospital of Shaoxing University","ORG, ORG","167, 284","187, 354",approval number,1
6 | 1252,C:\Users\shweata\approval_number_100\PMC8358494\sections\1_body\1_methods\1_patients_and_study_design\1_p.xml,"The participants enrolled in this retrospective study were outpatients at Suzuki Clinic. All participants received the Diagnostic and Statistical Manual of Mental Disorders, Fifth Edition diagnosis for insomnia, and were prescribed either lemborexant or benzodiazepine hypnotics. The observation period was from July 2020 (when introduced for clinical use) to December 2020 for lemborexant and benzodiazepine hypnotics. Furthermore, there were no criteria for exclusion of research subjects in this study. This study was approved by the ethics committee of Fukui Kinen Hospital. The approval date and approval number of the ethics committee of Fukui Kinen Hospital were 21 January 2021 and 2-017, respectively. Instead of omitting the informed consent for the retrospective cohort study, information about the study was posted in the hospital, and opt-out recruitment was conducted. Insomnia was assessed using the Japanese version of Athens Insomnia Scale (AIS). 5 Efficacy outcome assessment was from the Clinical Global Impressions-Improvement (CGI-I) scale. 6","The approval date and approval number of the ethics committee of Fukui Kinen Hospital were 21 January 2021 and 2-017, respectively.",Fukui Kinen Hospital,ORG,65,85,approval number,1
7 | 2388,C:\Users\shweata\approval_number_100\PMC8431654\sections\2_body\3_4__material_and_methods\1_4_1__blood_samples_used_i\1_p.xml,"Blood samples of COVID-19 patients were taken within the first two weeks after the detection of the SARS-CoV-2 infection of patients at University Hospital of RWTH Aachen. All patient samples were taken after written and informed consent according to the guidelines and specific approval of the study by the local ethics committee (Ethic approval number EK 080/20 for the Covid-19 Aachen study named COVAS; Ethics committee of RWTH Aachen University, University Hospital Aachen, Pauwelsstrasse 30, 52074 Aachen, Germany) and collected into RWTH cBMB, the central biobank of the medical faculty of RWTH Aachen University (Ethic approval number EK 206/09). Blood samples of healthy donors were taken after written and informed consent according to the guidelines and approval of the study by the local ethics committee (EK 041/15; Ethics committee of RWTH Aachen University, University Hospital Aachen, Pauwelsstrasse 30, 52074 Aachen, Germany). These control samples were taken in the years 2018 and 2019, before the initial SARS-CoV-2 outbreak. All venous blood samples were anticoagulated with EDTA and cryopreserved at -80 °C until further analysis.","All patient samples were taken after written and informed consent according to the guidelines and specific approval of the study by the local ethics committee (Ethic approval number EK 080/20 for the Covid-19 Aachen study named COVAS; Ethics committee of RWTH Aachen University, University Hospital Aachen, Pauwelsstrasse 30, 52074 Aachen, Germany) and collected into RWTH cBMB, the central biobank of the medical faculty of RWTH Aachen University (Ethic approval number EK 206/09).","Ethics, RWTH Aachen University, University Hospital Aachen, Pauwelsstrasse, Aachen, Germany, RWTH Aachen University (Ethic","ORG, ORG, GPE, GPE, GPE, ORG","235, 255, 307, 332, 340, 425","241, 305, 321, 338, 347, 454",approval number,1
8 | 2766,C:\Users\shweata\approval_number_100\PMC8442262\sections\2_body\1_methods\9_ethics_approval_and_conse\1_p.xml,"Ethics approval has been approved by the Research Ethics Board of Health (REBH), Ministry of Health, Royal Government of Bhutan vide approval number Ref. No. REBH/Approval/2019/067. An informed consent has been obtained from the individual participants for the use of photographic materials while the use of data and consent to participate had been obtained from the legal guardian (Principal of the school). All methods were carried out in accordance with relevant guidelines and regulations as enshrined in Helsinki Declarations 1964.","Ethics approval has been approved by the Research Ethics Board of Health (REBH), Ministry of Health, Royal Government of Bhutan vide approval number Ref.","the Research Ethics Board of Health, REBH, Ministry of Health, Royal Government, Ref","ORG, ORG, ORG, ORG, ORG","37, 74, 81, 101, 150","72, 78, 99, 117, 153",approval number,1
9 | 3047,C:\Users\shweata\approval_number_100\PMC8449472\sections\2_body\1_methods\1_ethical_approval\1_p.xml,"This study was approved by the Institutional Review Board of Kyungpook National University Chilgok Hospital, Daegu, Korea (approval number: 2020-04-029). As this was a retrospective study, the need for obtaining informed consent from patients was waived by the Institutional Review Board.","This study was approved by the Institutional Review Board of Kyungpook National University Chilgok Hospital, Daegu, Korea (approval number: 2020-04-029).","the Institutional Review Board, Kyungpook National University Chilgok Hospital, Daegu, Korea","ORG, ORG, GPE, GPE","27, 61, 109, 116","57, 107, 114, 121",approval number,1
10 | 3145,C:\Users\shweata\approval_number_100\PMC8454101\sections\2_body\1_methods\1_patients\1_p.xml,Conjunctival samples were obtained from patients undergoing retinal detachment surgery (n = 1) or conjunctival melanoma resection (n = 1) at the Eye Center of the University of Freiburg. Ethics approval was granted from Ethics Committee of the Albert-Ludwigs-University Freiburg (approval number 481/19).,Ethics approval was granted from Ethics Committee of the Albert-Ludwigs-University Freiburg (approval number 481/19).,Ethics Committee,ORG,33,49,approval number,1
11 | 3351,C:\Users\shweata\approval_number_100\PMC8462059\sections\2_body\3_ethical_approval\1_p.xml,This study was approved by the institutional review board and ethics committee of the Japanese Red Cross Ise Hospital (approval number: ER2020‐26).,This study was approved by the institutional review board and ethics committee of the Japanese Red Cross Ise Hospital (approval number: ER2020‐26).,the Japanese Red Cross Ise Hospital,ORG,82,117,approval number,1
12 | 3470,C:\Users\shweata\approval_number_100\PMC8468265\sections\2_body\3_4__materials_and_methods\4_4_4__ethical_approval\1_p.xml,"The ASB study was conducted in accordance with the Declaration of Helsinki. The ASB study was approved by the research ethics committee of the Academic Medical Centre, Amsterdam, the Netherlands (approval number MEC 2011-073, date of approval 29-4-2011) and by the institutional review board of each participating hospital. The national perinatal registry in the Netherlands (PERINED) approved linkage of the ASB cohort with their database to further complete missing data on outcomes (approval number 13.64, date of approval 17-12-2013).","The ASB study was approved by the research ethics committee of the Academic Medical Centre, Amsterdam, the Netherlands (approval number MEC 2011-073, date of approval 29-4-2011) and by the institutional review board of each participating hospital.","ASB, the Academic Medical Centre, Amsterdam, Netherlands","ORG, ORG, GPE, GPE","4, 63, 92, 107","7, 90, 101, 118",approval number,1
13 | 3471,C:\Users\shweata\approval_number_100\PMC8468265\sections\2_body\3_4__materials_and_methods\4_4_4__ethical_approval\1_p.xml,"The ASB study was conducted in accordance with the Declaration of Helsinki. The ASB study was approved by the research ethics committee of the Academic Medical Centre, Amsterdam, the Netherlands (approval number MEC 2011-073, date of approval 29-4-2011) and by the institutional review board of each participating hospital. The national perinatal registry in the Netherlands (PERINED) approved linkage of the ASB cohort with their database to further complete missing data on outcomes (approval number 13.64, date of approval 17-12-2013).","The national perinatal registry in the Netherlands (PERINED) approved linkage of the ASB cohort with their database to further complete missing data on outcomes (approval number 13.64, date of approval 17-12-2013).","Netherlands, ASB","GPE, ORG","39, 85","50, 88",approval number,1
14 | 3734,C:\Users\shweata\approval_number_100\PMC8474954\sections\2_body\1_methods\8_in_vivo_metastasis_assay\1_p.xml,"To test if EMMPRIN plays an important role in osteosarcoma metastasis in vivo, the osteosarcoma cell line 143B was injected in the tail vein of BALB/c mice. The mice were sacrificed at 8 weeks post-injection. Four-week-old male BALB/c nude mice were obtained from Central Lab. Animal Inc. (Seoul, Korea) and maintained under standard conditions until the experiments were performed. The animals were maintained at the animal facility of the Seoul National University Hospital under guidelines prior to the grouping and experiments. A total of 15 BALB/c nude mice were randomized into 3 groups: 1 normal, 2 143 cells transfected with an ad mock shRNA vector (Control), and 3 143 cells transfected with the ad EMMPRIN shRNA vector. Experiments were approved by the Institutional Animal Care and Use Committee of Seoul National University Hospital (approval number 10–0075). One anti-EMMPRIN sequence (5-GTCGTCAGAACACATCAAC-3) or a scrambled sequence was inserted into the plasmid vector pAdEasy-1 (Addgene). They were designated as pAdEasy-1-shRNA and pAdEasy-1 scramble shRNA, respectively. Osteosarcoma cell line 143B was transfected with EMMPRIN shRNA. EMMPRIN shRNA transfected 143B cells were harvested with trypsin, and then resuspended in serum-free RPMI, and injected in the tail vein (1 × 10 5/0.2 mL) of 5 nude mice per group. Health of the animals was monitored daily, and body weights were measured weekly throughout the study period. Anesthesia was performed with isoflurane inhalation as well as ketamine (10 mg/kg) and medetomidine (0.1 mg/kg) injection. All surgical procedures were performed under sterile conditions. Secondary euthanasia method for cervical dislocation was also performed. The mice were sacrificed by CO 2 inhalation at 8 weeks post-injection. Harvested tissues were preserved in Bouin’s fixative, embedded in paraffin, sectioned (4 μm), and stained with hematoxylin and eosin (H&E). Examination of the histological sections was performed using Nikon Eclipse Ci microscope (Nikon Corp., Tokyo, Japan) by a digital camera (Nikon digital sight, DS-2Mv) and the automatic exposure and iSolution Lite software for microscopic images. The tumor lengths and widths were measured by a perpendicular tumor diameter, with the tumor volume being calculated using the following formula: width 2 × length/2 20.",Experiments were approved by the Institutional Animal Care and Use Committee of Seoul National University Hospital (approval number 10–0075).,the Institutional Animal Care and Use Committee of Seoul National University Hospital,ORG,29,114,approval number,1
15 | 3821,C:\Users\shweata\approval_number_100\PMC8475677\sections\2_body\3_methods\3_institutional_review_boar\1_p.xml,"The study was approved by the institutional review boards of Kyoto University Graduate School of Medicine (approval number: E2311), Shiga General Hospital (approval number: 20141120‐01), Tenri Hospital (approval number: 640), Kobe City Medical Center General Hospital (approval number: 14094), Hyogo Prefectural Amagasaki General Medical Center (approval number: Rinri 26‐32), National Hospital Organization Kyoto Medical Center (approval number: 14‐080), Mitsubishi Kyoto Hospital (approved 11/12/2014), Okamoto Memorial Hospital (approval number: 201503), Japanese Red Cross Otsu Hospital (approval number: 318), Hikone Municipal Hospital (approval number: 26‐17), Japanese Red Cross Osaka Hospital (approval number: 392), Shimabara Hospital (approval number: E2311), Kishiwada City Hospital (approval number: 12), Kansai Electric Power Hospital (approval number: 26‐59), Shizuoka General Hospital (approval number: Rin14‐11‐47), Kurashiki Central Hospital (approval number: 1719), Kokura Memorial Hospital (approval number: 14111202), Kitano Hospital (approval number: P14‐11‐012), and Japanese Red Cross Wakayama Medical Center (approval number: 328).","The study was approved by the institutional review boards of Kyoto University Graduate School of Medicine (approval number: E2311), Shiga General Hospital (approval number: 20141120‐01), Tenri Hospital (approval number: 640), Kobe City Medical Center General Hospital (approval number: 14094), Hyogo Prefectural Amagasaki General Medical Center (approval number: Rinri 26‐32), National Hospital Organization Kyoto Medical Center (approval number: 14‐080), Mitsubishi Kyoto Hospital (approved 11/12/2014), Okamoto Memorial Hospital (approval number: 201503), Japanese Red Cross Otsu Hospital (approval number: 318), Hikone Municipal Hospital (approval number: 26‐17), Japanese Red Cross Osaka Hospital (approval number: 392), Shimabara Hospital (approval number: E2311), Kishiwada City Hospital (approval number: 12), Kansai Electric Power Hospital (approval number: 26‐59), Shizuoka General Hospital (approval number: Rin14‐11‐47), Kurashiki Central Hospital (approval number: 1719), Kokura Memorial Hospital (approval number: 14111202), Kitano Hospital (approval number: P14‐11‐012), and Japanese Red Cross Wakayama Medical Center (approval number: 328).","Kyoto University Graduate School of Medicine, Shiga General Hospital, Tenri Hospital, Kobe City Medical Center General Hospital, Hyogo Prefectural Amagasaki General Medical Center, National Hospital Organization Kyoto Medical Center, Mitsubishi Kyoto Hospital, Okamoto Memorial Hospital, Japanese Red Cross Otsu Hospital, Hikone Municipal Hospital, Japanese Red Cross Osaka Hospital, Shimabara Hospital, Kishiwada City Hospital, Kansai Electric Power Hospital, Shizuoka General Hospital, Kurashiki Central Hospital, Kokura Memorial Hospital, Kitano Hospital, Japanese Red Cross","ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG, ORG","61, 132, 187, 226, 294, 377, 456, 505, 558, 615, 667, 725, 770, 817, 874, 932, 984, 1038, 1089","105, 154, 201, 267, 344, 428, 481, 530, 590, 640, 700, 743, 793, 847, 899, 958, 1008, 1053, 1107",approval number,1
16 | 4328,C:\Users\shweata\approval_number_100\PMC8482572\sections\2_body\1_methods\4_ethics_approval_and_infor\1_p.xml,The study protocol was approved by the ethical committees of Kawasaki University of Medical Welfare (Approval number: 18-102) and Chiang Mai University (Approval number: NUR-2562-06120). All participants provided written informed consent to participate in the study.,The study protocol was approved by the ethical committees of Kawasaki University of Medical Welfare (Approval number: 18-102) and Chiang Mai University (Approval number: NUR-2562-06120).,"Kawasaki University of Medical Welfare (Approval, Chiang Mai University","ORG, ORG","61, 130","109, 151",approval number,1
17 |
--------------------------------------------------------------------------------
/resources/demo.py:
--------------------------------------------------------------------------------
1 | import os
2 | from docanalysis import DocAnalysis
3 | from pathlib import Path
4 |
5 | doc_analysis = DocAnalysis()
6 | ETHICS_DICTIONARY_DIR = Path(os.getcwd(), "ethics_dictionary")
7 | CORPUS_DIR = Path(os.getcwd(), "corpus")
8 |
9 |
10 | def create_phrases_file(phrases_dir, phrases_file, dictionary_dir=ETHICS_DICTIONARY_DIR):
11 | global terms_xml_path
12 | terms_xml_dir = Path(dictionary_dir, phrases_dir)
13 | if not terms_xml_dir.exists():
14 | terms_xml_dir.mkdir()
15 | terms_xml_path = Path(terms_xml_dir, phrases_file)
16 | return terms_xml_path
17 |
18 |
19 | def get_or_create_corpus_dir(subdir_name, corpus_dir=CORPUS_DIR):
20 | """get specific corpus directory, creating if necessary
21 |
22 | :param corpus_dir: directory containing corpora
23 | :param subdir_name: specific corpus to get or create
24 | :return: directory of specific corpus"""
25 | assert corpus_dir.exists(), "directory of corpora must exist"
26 | subdir = Path(corpus_dir, subdir_name)
27 | if not subdir.exists():
28 | subdir.mkdir()
29 | return subdir
30 |
31 |
32 | def run_analysis(corpus_path, phrases_file, query=None, hits=30):
33 | dict_for_entities = doc_analysis.extract_entities_from_papers(
34 | corpus_path=corpus_path,
35 | terms_xml_path=phrases_file,  # use the parameter rather than the module-level global
36 | query=query,
37 | hits=hits,
38 | make_project=True
39 | )
40 | create_and_write_list_for_fields(dict_for_entities, "ORG", "org.text")
41 | create_and_write_list_for_fields(dict_for_entities, "GPE", "GPE.text")
42 |
43 |
44 | def create_and_write_list_for_fields(dict_for_entities, field, out_filename):
45 | list_with_orgs = doc_analysis.extract_particular_fields(
46 | dict_for_entities, field)
47 | with open(out_filename, 'w') as f:
48 | f.write(str(list_with_orgs))
49 |
50 |
51 | ETHICS = "ethics"
52 | TERPENES = "terpenes"
53 | options = {
54 | ETHICS,
55 | TERPENES
56 | }
57 |
58 | if ETHICS in options:
59 | corpus_dir = get_or_create_corpus_dir("e_cancer_clinical_trial_50")
60 | phrases_file = create_phrases_file("ethics_key_phrases", "ethics_key_phrases.xml", )
61 | run_analysis(
62 | corpus_dir,
63 | phrases_file,
64 | query="ethics"
65 | )
66 |
67 | if TERPENES in options:
68 | run_analysis(
69 | get_or_create_corpus_dir(TERPENES),
70 | create_phrases_file("terpenes_key_phrases", "terpenes_key_phrases.xml", dictionary_dir="terpenes_dictionary"),
71 | query=TERPENES,
72 | hits = 20,
73 | )
74 |
75 |
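76 | # A minimal single-corpus sketch of the same helpers (hypothetical corpus name;
77 | # commented out so it does not run alongside the demos above):
78 | # run_analysis(
79 | #     get_or_create_corpus_dir("ethics_demo_5"),
80 | #     create_phrases_file("ethics_key_phrases", "ethics_key_phrases.xml"),
81 | #     query="ethics",
82 | #     hits=5,
83 | # )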
--------------------------------------------------------------------------------
/resources/docanalyis_architecture_diagram.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petermr/docanalysis/eb8a3c13f9491b41f252a363953a976406f964b6/resources/docanalyis_architecture_diagram.PNG
--------------------------------------------------------------------------------
/resources/entities_country.csv:
--------------------------------------------------------------------------------
1 | ,file_path,paragraph,sentence,section,entities,labels,position_start,position_end,abbreviations,abbreviations_longform,abbreviation_start,abbreviation_end,has_terms,weight_terms
2 | 1,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8457177\sections\1_front\1_article-meta\7_aff.xml, 1 Institute for Organic Chemistry and BMWZ Leibniz Universität Hannover Schneiderberg 38 30167 Hannover Germany , 1 Institute for Organic Chemistry and BMWZ Leibniz Universität Hannover Schneiderberg 38 30167 Hannover Germany,AFF,Germany,GPE,122,129,,,,,Germany,1
3 | 3,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8457177\sections\1_front\1_article-meta\8_aff.xml, 2 Structure and Function of Proteins Helmholtz Centre for Infection Research Inhoffenstr. 7 38124 Braunschweig Germany ,7 38124 Braunschweig Germany,AFF,Germany,GPE,27,34,,,,,Germany,1
4 | 5,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8457177\sections\1_front\1_article-meta\9_aff.xml," 3 Institute for Biochemistry, Biotechnology and Bioinformatics Technische Universität Braunschweig Spielmannstr. 7 38106 Braunschweig Germany ",7 38106 Braunschweig Germany,AFF,Germany,GPE,27,34,,,,,Germany,1
5 | 6,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8465594\sections\1_front\1_article-meta\6_aff.xml,"1 Department of Chemistry of Natural Compounds, University of Chemistry and Technology, Technicka 5, 166 28 Prague, Czech Republic","1 Department of Chemistry of Natural Compounds, University of Chemistry and Technology, Technicka 5, 166 28 Prague, Czech Republic",AFF,"Prague, Czech Republic","GPE, GPE","108, 116","114, 130",,,,,Czech Republic,1
6 | 7,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8465594\sections\1_front\1_article-meta\7_aff.xml,"2 Institute of Bioorganic Chemistry, National Academy of Sciences of Belarus, 5/2 Academician V. F. Kuprevich Street, BY-220141 Minsk, Belarus; khripach@iboch.by","2 Institute of Bioorganic Chemistry, National Academy of Sciences of Belarus, 5/2 Academician V. F. Kuprevich Street, BY-220141 Minsk, Belarus; khripach@iboch.by",AFF,"BY-220141 Minsk, Belarus","GPE, GPE","118, 135","133, 142",,,,,Belarus,1
7 | 11,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8541587\sections\1_front\1_article-meta\7_aff.xml,"Institut National de Recherche pour L’agriculture, L’alimentation et L’environnement (INRAE), 13182 Aix-en-Provence, France; bastien.romero@inrae.fr","Institut National de Recherche pour L’agriculture, L’alimentation et L’environnement (INRAE), 13182 Aix-en-Provence, France; bastien.romero@inrae.fr",AFF,France,GPE,117,123,,,,,France,1
8 | 13,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8625326\sections\1_front\1_article-meta\7_aff.xml,"School of Forestry and Resource Conservation, National Taiwan University, Taipei 10617, Taiwan; mary2234@gmail.com (L.-T.M.); bad8016479@gmail.com (P.-L.L.); timmy304681@gmail.com (Y.-T.C.); jimmy81513@hotmail.com (T.-F.S.)","School of Forestry and Resource Conservation, National Taiwan University, Taipei 10617, Taiwan; mary2234@gmail.com (L.-T.M.",AFF,"Taipei, Taiwan","GPE, GPE","74, 88","80, 94",,,,,Taiwan,1
9 | 17,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8625850\sections\1_front\1_article-meta\7_aff.xml,"1 INSERM U1070 “Pharmacology of Anti-Infective Agents”, 1 rue Georges Bonnet, Pôle Biologie Santé, 86022 Poitiers, France; chantal.valcourtsainz@gmail.com (C.V.); julien.buyck@univ-poitiers.fr (J.M.B.); nicolas.gregoire@univ-poitiers.fr (N.G.); william.couet@univ-poitiers.fr (W.C.); sandrine.marchand@univ-poitiers.fr (S.M.)","1 INSERM U1070 “Pharmacology of Anti-Infective Agents”, 1 rue Georges Bonnet, Pôle Biologie Santé, 86022 Poitiers, France; chantal.valcourtsainz@gmail.com (C.V.); julien.buyck@univ-poitiers.fr (J.M.B.",AFF,"France, C.V.","GPE, GPE","115, 158","121, 162",,,,,France,1
10 | 20,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8625850\sections\1_front\1_article-meta\8_aff.xml,"2 UFR Médecine-Pharmacie Université de Poitiers, 6 rue de la Milétrie, TSA 51115, 86073 Poitiers, France","2 UFR Médecine-Pharmacie Université de Poitiers, 6 rue de la Milétrie, TSA 51115, 86073 Poitiers, France",AFF,France,GPE,98,104,,,,,France,1
11 | 21,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8625850\sections\1_front\1_article-meta\9_aff.xml,"3 Laboratoire de Toxicologie-Pharmacocinétique, CHU de Poitiers, 2 rue de la Miletrie, 86021 Poitiers, France","3 Laboratoire de Toxicologie-Pharmacocinétique, CHU de Poitiers, 2 rue de la Miletrie, 86021 Poitiers, France",AFF,France,GPE,103,109,,,,,France,1
12 | 23,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8701039\sections\1_front\1_article-meta\7_aff.xml,"1 Centrale Marseille, CNRS, iSm2 Marseille, ISM2 UMR 7313, Aix-Marseille Université, Av. Escadrille Normandie-Niemen, 13013 Marseille, France; julie.couillaud.13990@gmail.com (J.C.); letitia.LEYDET@univ-amu.fr (L.L.); katia.duquesne@univ-amu.fr (K.D.)","Escadrille Normandie-Niemen, 13013 Marseille, France; julie.couillaud.13990@gmail.com (J.C.); letitia.LEYDET@univ-amu.fr (L.L.",AFF,"Marseille, France, J.C.","GPE, GPE, GPE","35, 46, 89","44, 52, 93",,,,,France,1
13 | 25,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8701039\sections\1_front\1_article-meta\8_aff.xml,"2 Systems and Synthetic Biology Division, Department of Biology and Biological Engineering, Chalmers University of Technology, 41296 Gothenburg, Sweden","2 Systems and Synthetic Biology Division, Department of Biology and Biological Engineering, Chalmers University of Technology, 41296 Gothenburg, Sweden",AFF,"Gothenburg, Sweden","GPE, GPE","133, 145","143, 151",,,,,Sweden,1
14 | 30,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8733304\sections\1_front\1_article-meta\5_aff.xml,"1 Egyptian Deserts Gene Bank, North Sinai Research Station, Department of Genetic Resources, Desert Research Center , Cairo , Egypt","1 Egyptian Deserts Gene Bank, North Sinai Research Station, Department of Genetic Resources, Desert Research Center , Cairo , Egypt",AFF,"Cairo, Egypt","GPE, GPE","119, 128","124, 133",,,,,Egypt,1
15 | 33,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8733304\sections\1_front\1_article-meta\8_aff.xml,"4 Department of Biology, Faculty of Science, University of Tabuk , Tabuk , Saudi Arabia","4 Department of Biology, Faculty of Science, University of Tabuk , Tabuk , Saudi Arabia",AFF,Saudi Arabia,GPE,77,89,,,,,Saudi Arabia,1
16 | 34,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8733304\sections\1_front\1_article-meta\9_aff.xml,"5 Department of Plant Agricultural, Faculty of Agriculture Science, Al-Azhar University , Assiut , Egypt","5 Department of Plant Agricultural, Faculty of Agriculture Science, Al-Azhar University , Assiut , Egypt",AFF,Egypt,GPE,101,106,,,,,Egypt,1
17 | 35,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\10_aff.xml,"4 ICREA—Catalan Institution for Research and Advanced Studies, 08010 Barcelona, Spain; jrintjema@iciq.es","4 ICREA—Catalan Institution for Research and Advanced Studies, 08010 Barcelona, Spain; jrintjema@iciq.es",AFF,Spain,GPE,80,85,,,,,Spain,1
18 | 36,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\11_aff.xml,"5 Institute of Chemical Research of Catalonia (ICIQ), Barcelona Institute of Science and Technology, 43007 Tarragona, Spain; fbravo@iciq.es (F.B.); akleij@iciq.es (A.W.K.)","5 Institute of Chemical Research of Catalonia (ICIQ), Barcelona Institute of Science and Technology, 43007 Tarragona, Spain; fbravo@iciq.es (F.B.",AFF,Spain,GPE,118,123,,,,,Spain,1
19 | 38,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\12_aff.xml,"6 Institute for Bioengineering of Catalonia, Baldiri Reixac 10-12, 08028 Barcelona, Spain","6 Institute for Bioengineering of Catalonia, Baldiri Reixac 10-12, 08028 Barcelona, Spain",AFF,"Barcelona, Spain","GPE, GPE","73, 84","82, 89",,,,,Spain,1
20 | 39,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\7_aff.xml,"1 Departament d’Enginyeria Química, EEBE, Universitat Politècnica de Catalunya, 08019 Barcelona, Spain; reza.zeinali@upc.edu (R.Z.); lourdes.franco@upc.edu (L.F.); carlos.aleman@upc.edu (C.A.)","1 Departament d’Enginyeria Química, EEBE, Universitat Politècnica de Catalunya, 08019 Barcelona, Spain; reza.zeinali@upc.edu (R.Z.",AFF,Spain,GPE,97,102,,,,,Spain,1
21 | 41,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\8_aff.xml,"2 Center for Research in Nano-Engineering, CrNE, Universitat Politècnica de Catalunya, C. Eduard Maristany, 08019 Barcelona, Spain","2 Center for Research in Nano-Engineering, CrNE, Universitat Politècnica de Catalunya, C. Eduard Maristany, 08019 Barcelona, Spain",AFF,Spain,GPE,125,130,,,,,Spain,1
22 | 42,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8747377\sections\1_front\1_article-meta\9_aff.xml,"3 ALBA Synchrotron Light Source, Carrer de la Llum, 2-26, Cerdanyola del Vallès, 08290 Barcelona, Spain; iyousef@cells.es","3 ALBA Synchrotron Light Source, Carrer de la Llum, 2-26, Cerdanyola del Vallès, 08290 Barcelona, Spain; iyousef@cells.es",AFF,"Barcelona, Spain","GPE, GPE","87, 98","96, 103",,,,,Spain,1
23 | 45,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8771452\sections\1_front\1_article-meta\10_aff.xml,"4 School of Pure and Applied Sciences, Karatina University , Karatina, Kenya","4 School of Pure and Applied Sciences, Karatina University , Karatina, Kenya",AFF,"Karatina, Kenya","GPE, GPE","62, 73","70, 78",,,,,Kenya,1
24 | 61,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8778794\sections\1_front\1_article-meta\7_aff.xml,"1 Instituto Botánico, Departamento de Ciencia y Tecnología Agroforestal y Genética, Universidad de Castilla-La Mancha, Campus Universitario s/n, 02071 Albacete, Spain; maria.mondejar3@alu.uclm.es (M.M.-L.); albertojose.lopez@uclm.es (A.J.L.-J.); Oussama.ahrazem@uclm.es (O.A.); MariaLourdes.gomez@uclm.es (L.G.-G.)","1 Instituto Botánico, Departamento de Ciencia y Tecnología Agroforestal y Genética, Universidad de Castilla-La Mancha, Campus Universitario s/n, 02071 Albacete, Spain; maria.mondejar3@alu.uclm.es (M.M.-L.); albertojose.lopez@uclm.es (A.J.L.-J.",AFF,"Spain, M.M.-L.","GPE, GPE","161, 199","166, 206",,,,,Spain,1
25 | 64,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8778794\sections\1_front\1_article-meta\8_aff.xml,"2 Departamento de Química Inorgánica, Orgánica y Bioquímica, Facultad de Farmacia, Universidad de Castilla-La Mancha, C/José María Sánchez Ibáñez s/n, 02008 Albacete, Spain; Joaquinc.garcia@uclm.es","2 Departamento de Química Inorgánica, Orgánica y Bioquímica, Facultad de Farmacia, Universidad de Castilla-La Mancha, C/José María Sánchez Ibáñez s/n, 02008 Albacete, Spain; Joaquinc.garcia@uclm.es",AFF,"Orgánica, Spain","GPE, GPE","38, 167","46, 172",,,,,Spain,1
26 | 65,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8778794\sections\1_front\1_article-meta\9_aff.xml,"3 Regional Center for Biomedical Research (CRIB), Universidad de Castilla-La Mancha, C/Almansa 13, 02008 Albacete, Spain","3 Regional Center for Biomedical Research (CRIB), Universidad de Castilla-La Mancha, C/Almansa 13, 02008 Albacete, Spain",AFF,Spain,GPE,115,120,,,,,Spain,1
27 | 66,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8782336\sections\1_front\1_article-meta\6_aff.xml,"1 Division of Biological Sciences, University of California, San Diego, La Jolla, California, United States of America","1 Division of Biological Sciences, University of California, San Diego, La Jolla, California, United States of America",AFF,"La Jolla, California, United States of America","GPE, GPE, GPE","74, 84, 96","82, 94, 120",,,,,United States of America,1
28 | 68,C:\Users\shweata\docanalysis\corpus\terpene_fig\PMC8782336\sections\1_front\1_article-meta\9_aff.xml," SRUC: Scotland’s Rural College, UNITED KINGDOM "," SRUC: Scotland’s Rural College, UNITED KINGDOM",AFF,"Scotland, UNITED KINGDOM","GPE, GPE","8, 34","16, 48",,,,,United Kingdom,1
29 |
--------------------------------------------------------------------------------
/resources/fig_ent.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/resources/pmr_demo.py:
--------------------------------------------------------------------------------
1 | import os
2 | from docanalysis import DocAnalysis
3 | from pathlib import Path
4 |
5 | ethic_statement_creator = DocAnalysis()
6 | term_dir = Path(os.getcwd(), "terpenes_dictionary", "terpenes_key_phrases", )
7 | if not term_dir.exists():
8 | term_dir.mkdir()
9 | dict_for_entities = ethic_statement_creator.extract_entities_from_papers(
10 | corpus_path=Path(os.getcwd(), "corpus", "terpenes", ),
11 | terms_xml_path=Path(term_dir, "terpenes_key_phrases.xml"),
12 | query="terpenes",
13 | hits=10,
14 | make_project=True
15 | )
16 | print(f"dict {dict_for_entities}")
17 | list_with_orgs = ethic_statement_creator.extract_particular_fields(
18 | dict_for_entities, 'ORG')
19 | with open('org.text', 'w') as f:
20 | f.write(str(list_with_orgs))
21 | list_with_gpe = ethic_statement_creator.extract_particular_fields(
22 | dict_for_entities, 'GPE')
23 | with open('GPE.text', 'w') as f:
24 | f.write(str(list_with_gpe))
25 |
26 |
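27 | # Single-run variant of resources/demo.py: builds a 10-hit "terpenes" corpus via
28 | # extract_entities_from_papers(), then writes the matched ORG entities to org.text
29 | # and the GPE entities to GPE.text in the working directory.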
--------------------------------------------------------------------------------
/resources/test_pmc.txt:
--------------------------------------------------------------------------------
1 | PMC8771452, PMC8771452, PMC8771452
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | try:
5 | from setuptools import setup
6 | except ImportError:
7 | from distutils.core import setup
8 | import configparser
9 | import os
10 |
11 | with open('README.md', encoding='utf-8') as readme_file:
12 | readme = readme_file.read()
13 |
14 | requirements = ['abbreviations', 'beautifulsoup4==4.10.0', 'braceexpand==0.1.7', 'coloredlogs==15.0.1', 'ConfigArgParse==1.5.3', 'lxml==4.7.1', 'nltk==3.6.7', 'pandas==1.3.4',
15 | 'pygetpapers',
16 | 'pytest==6.2.5',
17 | 'setuptools==60.3.1',
18 | 'spacy==3.0.7',
19 | 'tkinterweb==3.10.7',
20 | 'tqdm==4.62.3'
21 | ]
22 |
23 | setup(
24 | name='docanalysis',
25 | version="0.3.0",
26 | description='extract structured information from ethics paragraphs',
27 | long_description_content_type='text/markdown',
28 | long_description=readme,
29 | author='Ayush Garg, Shweata N. Hegde',
30 | author_email='ayush@science.org.in, shweata.hegde@gmail.com',
31 | url='https://github.com/petermr/docanalysis',
32 | packages=[
33 | 'docanalysis',
34 | ],
35 | package_dir={'docanalysis':
36 | 'docanalysis'},
37 | include_package_data=True,
38 | install_requires=requirements,
39 | license='Apache License',
40 | zip_safe=False,
41 | keywords='research automation',
42 | classifiers=[
43 | 'Development Status :: 4 - Beta',
44 | 'Intended Audience :: Developers',
45 | 'License :: OSI Approved :: Apache Software License',
46 | 'Natural Language :: English',
47 | 'Programming Language :: Python :: 3.4',
48 | 'Programming Language :: Python :: 3.5',
49 | 'Programming Language :: Python :: 3.6',
50 | 'Programming Language :: Python :: 3.7',
51 | 'Programming Language :: Python :: 3.8',
52 | 'Programming Language :: Python :: 3.9',
53 | 'Programming Language :: Python :: 3.10',
54 |
55 | ],
56 | entry_points={
57 | 'console_scripts': [
58 | 'docanalysis=docanalysis.docanalysis:main',
59 | ],
60 | },
61 |
62 | )
63 |
--------------------------------------------------------------------------------
/tests/test_docanalysis_cli.py:
--------------------------------------------------------------------------------
1 | # test whether...
2 | # Cproject exists (I)
3 | # dictionary exists (I)
4 | # sections exist (I)
5 | # non-empty CSV exists (O)
6 | # dictionary is created (unclear whether the two dictionaries, entities and keyphrases, can be created in the same run)
7 | import pytest
8 | from pathlib import Path
9 | import os
10 |
11 | DOCANALYSIS_TOP = Path(__file__).parent.parent
12 | #EXISTING_CPROJECT = Path(DOCANALYSIS_TOP, 'stem_cell_research_300')
13 | PMC_TEXT_FILE = Path(DOCANALYSIS_TOP, 'resources', 'test_pmc.txt')
14 | DICT_DIRECTORY = Path(DOCANALYSIS_TOP, 'ethics_dictionary')
15 | TEST_DICT = Path(DICT_DIRECTORY, 'ethics_demo', 'ethics_demo.xml')
16 | TEMP_CPROJECT = Path(DOCANALYSIS_TOP, 'test_ethics_20')
17 |
18 | class TestDocanalysis:
19 |
20 | def test_pygetpapers(self):
21 | """- checks whether
22 | - the corpus directory exists or not
23 | - the number of PMC * folders is equal to the hits specified
24 | - fulltext xml exists in each PMC folder or not
25 | """
26 | os.system(f'docanalysis --run_pygetpapers --terms {TEST_DICT} --project_name {TEMP_CPROJECT}')
27 | assert TEMP_CPROJECT.exists(), f"checking whether {TEMP_CPROJECT} exists"
28 | assert len(list(TEMP_CPROJECT.glob('PMC*/'))) == 3
29 | assert len(list(TEMP_CPROJECT.glob('PMC*/fulltext.xml'))) == 3
30 |
31 | def test_section_exists(self):
32 | """checkers whether
33 | - the number of PMC folder with sections is equal to number of hits
34 | - section exists in each PMC folder
35 | # not sure if this is the right way of testing whether papers are sectioned
36 | """
37 |
38 | os.system(f'docanalysis --project_name {TEMP_CPROJECT} --run_sectioning')
39 | assert len(list(TEMP_CPROJECT.glob('PMC*/sections/'))) == 3
40 | for PMC in TEMP_CPROJECT.glob('**/'):
41 | for section in PMC.glob('sections/'):
42 | assert section.exists()
43 |
44 | def test_search_dict_exists(self):
45 | """checks whether the dictionary directory exists or not
46 | """
47 | assert TEST_DICT.exists(), f"dictionary {TEST_DICT} must exist"
48 |
49 | def test_csv_output_creation(self):
50 | """checks whether the csv output is created or not
51 | """
52 | os.system(f'docanalysis --project_name {TEMP_CPROJECT} --dictionary {TEST_DICT} --output')
53 | assert Path(TEMP_CPROJECT, 'entities.csv').exists(), 'checking if the output is created'
54 |
55 | def test_dict_creation_entites(self):
56 | os.system(f'docanalysis --project_name {TEMP_CPROJECT} --dictionary {TEST_DICT} --output --make_ami_dict entities.xml')
57 | assert Path(TEMP_CPROJECT, 'entities.xml').exists(), 'checking if the entity dictionary is created'
58 |
59 | def test_remove_dir():
60 | import shutil
61 | shutil.rmtree(TEMP_CPROJECT)
62 | assert "Ran all the tests" == "Ran all the tests"
--------------------------------------------------------------------------------
/tests/test_docanalysis_method.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import glob
3 | import os
4 | from pathlib import Path
5 | from docanalysis import DocAnalysis  # package export, as used in resources/demo.py
6 |
7 | DOCANALYSIS_TOP = Path(__file__).parent.parent
8 | EXISTING_CPROJECT = Path(DOCANALYSIS_TOP, 'stem_cell_research_300')
9 |
10 | class TestDocanalysisMeth():
11 |
12 | def test_cproject_exists(self):
13 | assert EXISTING_CPROJECT.exists(), f"checking whether {EXISTING_CPROJECT} exists"
14 |
15 | def test_glob_section(self):
16 | all_paragraphs = glob.glob(os.path.join(
17 | EXISTING_CPROJECT, '*', 'sections', '**', '[1_9]_p.xml'), recursive=True)
18 | assert all_paragraphs is not None
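19 |
20 | # These tests expect a pre-built "stem_cell_research_300" CProject at the repo root
21 | # (EXISTING_CPROJECT above); test_cproject_exists fails if that corpus has not been downloaded.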
--------------------------------------------------------------------------------
/tests/testing_test.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | DOCANALYSIS_TOP = Path(__file__).parent.parent
4 | print(DOCANALYSIS_TOP)
5 | #EXISTING_CPROJECT = Path(DOCANALYSIS_TOP, 'stem_cell_research_300')
6 | PMC_TEXT_FILE = Path(DOCANALYSIS_TOP, 'resources', 'test_pmc.txt')
7 | print(PMC_TEXT_FILE)
--------------------------------------------------------------------------------