├── .github └── workflows │ ├── pull-request.yml │ ├── push-master.yml │ ├── release.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── poetry.lock ├── pyproject.toml ├── refextract ├── __init__.py ├── app.py ├── authors │ ├── __init__.py │ └── regexs.py ├── config.cfg ├── documents │ ├── __init__.py │ ├── pdf.py │ └── text.py └── references │ ├── __init__.py │ ├── api.py │ ├── config.py │ ├── engine.py │ ├── errors.py │ ├── find.py │ ├── kbs.py │ ├── kbs │ ├── authors.kb │ ├── books.kb │ ├── collaborations.kb │ ├── journal-titles-re.kb │ ├── journal-titles.kb │ ├── publishers.kb │ ├── report-numbers.kb │ └── special-journals.kb │ ├── pdf.py │ ├── record.py │ ├── regexs.py │ ├── tag.py │ └── text.py ├── ruff.toml ├── run-tests.sh └── tests ├── conftest.py ├── data ├── 1503.07589v1.pdf ├── 1508.05632v2.pdf ├── 1706.09498v1.pdf ├── 1707.04066v1.pdf ├── 1805.05865.pdf ├── 2110.02751.pdf ├── 2301.05883.pdf ├── 2303.03819.pdf ├── 2304.10117.pdf ├── 2406.06875.pdf ├── 2502.18907.pdf ├── 2502.21088.pdf ├── 2503.05372.pdf ├── 2503.05621.pdf ├── DIS_SHEILA_final.pdf ├── file_resolving.csv ├── packed_pdf.pdf └── wepml008.pdf ├── integration ├── cassettes │ └── test_extract_extract_references_from_url.yaml ├── conftest.py └── test_views.py ├── test_api.py ├── test_engine.py ├── test_find.py ├── test_kbs.py ├── test_pdf.py ├── test_regexs.py ├── test_tag.py └── test_text.py /.github/workflows/pull-request.yml: -------------------------------------------------------------------------------- 1 | name: Pull request master 2 | 3 | on: 4 | pull_request: 5 | branches: [master] 6 | 7 | jobs: 8 | tests: 9 | uses: ./.github/workflows/test.yml 10 | -------------------------------------------------------------------------------- /.github/workflows/push-master.yml: -------------------------------------------------------------------------------- 1 | name: Pull request master 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | 7 | jobs: 8 | tests: 9 | uses: ./.github/workflows/test.yml 10 | 11 | push_and_deploy_qa: 12 | runs-on: ubuntu-latest 13 | needs: [tests] 14 | steps: 15 | - name: Checkout Code 16 | uses: actions/checkout@v4 17 | with: 18 | ref: ${{ github.ref }} 19 | 20 | - name: Build Image 21 | id: build 22 | uses: cern-sis/gh-workflows/.github/actions/docker-build@v6 23 | with: 24 | registry: registry.cern.ch 25 | stage: refextract 26 | image: cern-sis/inspirehep/refextract 27 | cache: false 28 | username: ${{ secrets.HARBOR_USERNAME }} 29 | password: ${{ secrets.HARBOR_PASSWORD }} 30 | 31 | - name: Deploy QA 32 | uses: cern-sis/gh-workflows/.github/actions/kubernetes-project-new-images@v6.4 33 | with: 34 | event-type: update 35 | repo: cern-sis/kubernetes-inspire 36 | images: registry.cern.ch/cern-sis/inspirehep/refextract@${{ steps.build.outputs.image-digest }} 37 | token: ${{ secrets.PAT_FIRE_EVENTS_ON_CERN_SIS_KUBERNETES }} 38 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | release: 5 | types: [released] 6 | 7 | defaults: 8 | run: 9 | shell: bash 10 | 11 | jobs: 12 | push: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v4 17 | with: 18 | fetch-depth: 0 19 | 20 | - name: Push 21 | run: | 22 | git config user.name github-actions 23 | git config user.email github-actions@github.com 24 | git push --force --follow-tags origin ${{ github.ref_name }}:prod 25 | 26 | - name: Generate metadata 27 | id: meta 28 | uses: docker/metadata-action@v5 29 | with: 30 | images: | 31 | registry.cern.ch/cern-sis/inspirehep/refextract 32 | tags: "type=sha" 33 | 34 | - name: send event inspire 35 | uses: cern-sis/gh-workflows/.github/actions/kubernetes-project-new-images@v6.4 36 | with: 37 | repo: cern-sis/kubernetes-inspire 38 | event-type: release 39 | images: ${{ env.DOCKER_METADATA_OUTPUT_TAGS }} 40 | token: ${{ secrets.PAT_FIRE_EVENTS_ON_CERN_SIS_KUBERNETES }} 41 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test Python 3 2 | 3 | on: 4 | workflow_call: 5 | 6 | jobs: 7 | lint_and_test: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout Code 11 | uses: actions/checkout@v4 12 | with: 13 | ref: ${{ github.ref }} 14 | - name: Lint - Pre-commit check 15 | uses: pre-commit/action@v3.0.1 16 | - name: Prep Build 17 | uses: docker/setup-buildx-action@v3 18 | - name: Build Docker image 19 | run: docker build --target refextract-tests -t refextract . 20 | - name: Run tests 21 | run: > 22 | docker run 23 | --entrypoint poetry 24 | refextract 25 | run pytest 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract 4 | # Copyright (C) 2016, 2017, 2018 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | # Byte-compiled / optimized / DLL files 25 | __pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | 29 | # C extensions 30 | *.so 31 | 32 | # Distribution / packaging 33 | .Python 34 | build/ 35 | develop-eggs/ 36 | dist/ 37 | downloads/ 38 | eggs/ 39 | .eggs/ 40 | lib/ 41 | lib64/ 42 | parts/ 43 | sdist/ 44 | var/ 45 | wheels/ 46 | *.egg-info/ 47 | .installed.cfg 48 | *.egg 49 | MANIFEST 50 | 51 | # PyInstaller 52 | # Usually these files are written by a python script from a template 53 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 54 | *.manifest 55 | *.spec 56 | 57 | # Installer logs 58 | pip-log.txt 59 | pip-delete-this-directory.txt 60 | 61 | # Unit test / coverage reports 62 | htmlcov/ 63 | .tox/ 64 | .coverage 65 | .coverage.* 66 | .cache 67 | nosetests.xml 68 | coverage.xml 69 | *.cover 70 | .hypothesis/ 71 | .pytest_cache/ 72 | 73 | # Translations 74 | *.mo 75 | *.pot 76 | 77 | # Django stuff: 78 | *.log 79 | local_settings.py 80 | db.sqlite3 81 | 82 | # Flask stuff: 83 | instance/ 84 | .webassets-cache 85 | 86 | # Scrapy stuff: 87 | .scrapy 88 | 89 | # Sphinx documentation 90 | docs/_build/ 91 | 92 | # PyBuilder 93 | target/ 94 | 95 | # Jupyter Notebook 96 | .ipynb_checkpoints 97 | 98 | # pyenv 99 | .python-version 100 | 101 | # celery beat schedule file 102 | celerybeat-schedule 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env* 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | .idea 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | 130 | # Build artifacts 131 | AUTHORS 132 | CHANGELOG 133 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.6.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - id: fix-byte-order-marker 9 | - id: mixed-line-ending 10 | - id: name-tests-test 11 | args: [ --pytest-test-first ] 12 | exclude: '^(?!factories/)' 13 | - repo: https://github.com/astral-sh/ruff-pre-commit 14 | rev: v0.11.2 15 | hooks: 16 | - id: ruff 17 | args: [ --fix] 18 | - id: ruff-format 19 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11.6-slim-bullseye AS refextract 2 | 3 | ARG APP_HOME=/refextract 4 | WORKDIR ${APP_HOME} 5 | 6 | COPY refextract refextract/ 7 | 8 | RUN apt update && apt install poppler-utils libmagic1 -y 9 | COPY poetry.lock pyproject.toml README.md ${APP_HOME} 10 | 11 | RUN pip install --no-cache-dir poetry 12 | RUN poetry config virtualenvs.create false \ 13 | && poetry install --only main 14 | 15 | ENV PROMETHEUS_MULTIPROC_DIR='/tmp' 16 | ENTRYPOINT ["gunicorn", "-b", ":5000", "--access-logfile", "-", "--error-logfile", "-", "refextract.app:app", "--timeout", "650"] 17 | 18 | FROM refextract AS refextract-tests 19 | 20 | RUN poetry install --with dev 21 | COPY tests tests/ 22 | RUN poetry install 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 292 | 293 | 294 | Copyright (C) 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 331 | 332 | , 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract 4 | # Copyright (C) 2015, 2016 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | 25 | include LICENSE *.rst 26 | include .coveragerc run-tests.sh pytest.ini tox.ini Dockerfile 27 | include docs/*.rst docs/*.py docs/Makefile 28 | 29 | recursive-include refextract * 30 | recursive-include *.py *.css *.css_t *.conf *.html 31 | recursive-include tests *.py 32 | recursive-include tests *.pdf 33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # refextract 3 | 4 | ## About 5 | 6 | A library for extracting references used in scholarly communication. 7 | 8 | ## Getting Started 9 | 10 | Note: due to the usage of `mmap` resize functionality this library cannot be locally installed on a mac 11 | 12 | ### Docker Setup: 13 | 14 | Before the first usage, or anytime a new library/dependency is changed a new docker image must be created using: 15 | ```shell 16 | docker build --target refextract-tests -t refextract . 17 | ``` 18 | 19 | After that, spin up a `refextract` service with: 20 | ```shell 21 | docker run -it -d -p 5000:5000 -v ./tests:/refextract/tests -v ./refextract:/refextract/refextract --name refextract refextract 22 | ``` 23 | 24 | ### Running tests 25 | 26 | Exec into the container via 27 | ```shell 28 | docker exec -it refextract /bin/bash 29 | ``` 30 | Then simply run 31 | ```shell 32 | pytest . 33 | ``` 34 | 35 | ## Usage 36 | 37 | To get structured information from a publication reference: 38 | 39 | 40 | ``` python 41 | >>> from refextract import extract_journal_reference 42 | >>> reference = extract_journal_reference('J.Phys.,A39,13445') 43 | >>> print(reference) 44 | { 45 | 'extra_ibids': [], 46 | 'is_ibid': False, 47 | 'misc_txt': '', 48 | 'page': '13445', 49 | 'title': 'J. Phys.', 50 | 'type': 'JOURNAL', 51 | 'volume': 'A39', 52 | 'year': '', 53 | 54 | } 55 | ``` 56 | 57 | To extract references from a PDF: 58 | ``` python 59 | >>> from refextract import extract_references_from_file 60 | >>> references = extract_references_from_file('1503.07589.pdf') 61 | >>> print(references[0]) 62 | { 63 | 'author': ['F. Englert and R. Brout'], 64 | 'doi': ['doi:10.1103/PhysRevLett.13.321'], 65 | 'journal_page': ['321'], 66 | 'journal_reference': ['Phys. Rev. Lett. 13 (1964) 321'], 67 | 'journal_title': ['Phys. Rev. Lett.'], 68 | 'journal_volume': ['13'], 69 | 'journal_year': ['1964'], 70 | 'linemarker': ['1'], 71 | 'raw_ref': ['[1] F. Englert and R. Brout, \u201cBroken symmetry and the mass of gauge vector mesons\u201d, Phys. Rev. Lett. 13 (1964) 321, doi:10.1103/PhysRevLett.13.321.'], 72 | 'texkey': ['Englert:1964et'], 73 | 'year': ['1964'], 74 | } 75 | ``` 76 | 77 | To extract directly from a URL: 78 | ``` python 79 | >>> from refextract import extract_references_from_url 80 | >>> references = extract_references_from_url('https://arxiv.org/pdf/1503.07589.pdf') 81 | >>> print(references[0]) 82 | { 83 | 'author': ['F. Englert and R. Brout'], 84 | 'doi': ['doi:10.1103/PhysRevLett.13.321'], 85 | 'journal_page': ['321'], 86 | 'journal_reference': ['Phys. Rev. Lett. 13 (1964) 321'], 87 | 'journal_title': ['Phys. Rev. Lett.'], 88 | 'journal_volume': ['13'], 89 | 'journal_year': ['1964'], 90 | 'linemarker': ['1'], 91 | 'raw_ref': ['[1] F. Englert and R. Brout, \u201cBroken symmetry and the mass of gauge vector mesons\u201d, Phys. Rev. Lett. 13 (1964) 321, doi:10.1103/PhysRevLett.13.321.'], 92 | 'texkey': ['Englert:1964et'], 93 | 'year': ['1964'], 94 | 95 | } 96 | 97 | ``` 98 | 99 | ## Notes 100 | `refextract` depends on 101 | 102 | [pdftotext](http://linux.die.net/man/1/pdftotext). 103 | 104 | ## Acknowledgments 105 | 106 | `refextract` is based on code and ideas from the following people, who 107 | 108 | contributed to the `docextract` module in Invenio: 109 | - Alessio Deiana 110 | - Federico Poli 111 | - Gerrit Rindermann 112 | - Graham R. Armstrong 113 | - Grzegorz Szpura 114 | - Jan Aage Lavik 115 | - Javier Martin Montull 116 | - Micha Moskovic 117 | - Samuele Kaplun 118 | - Thorsten Schwander 119 | - Tibor Simko 120 | 121 | ## License 122 | GPLv2 123 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "refextract" 3 | version = "0.1.0" 4 | description = "Small library for extracting references used in scholarly communication." 5 | readme = "README.md" 6 | homepage = "https://github.com/inspirehep/refextract" 7 | license = "GPL-2.0-or-later" 8 | authors = [ 9 | "CERN " 10 | ] 11 | classifiers = [ 12 | "Development Status :: 4 - Beta", 13 | "Environment :: Console", 14 | "Intended Audience :: Developers", 15 | "Intended Audience :: Science/Research", 16 | "License :: OSI Approved :: GNU General Public License v2 (GPLv2)", 17 | "Operating System :: OS Independent", 18 | "Programming Language :: Python", 19 | "Programming Language :: Python :: 3", 20 | "Programming Language :: Python :: 3.6", 21 | "Programming Language :: Python :: 3.7", 22 | "Programming Language :: Python :: 3.8", 23 | "Topic :: Scientific/Engineering :: Information Analysis", 24 | "Topic :: Software Development :: Libraries", 25 | "Topic :: Software Development :: Libraries :: Python Modules", 26 | "Topic :: Utilities", 27 | ] 28 | 29 | 30 | [tool.poetry.dependencies] 31 | python = ">=3.11,<4" 32 | unidecode = ">=1.0.22,~=1.0" 33 | Flask = ">=2.0.3" 34 | webargs = ">=8.0,~=8.0" 35 | prometheus-flask-exporter = ">=0.23.2,~=0.23" 36 | gunicorn = "^23.0.0" 37 | python-magic = "^0.4.27" 38 | inspire-utils = "^3.0.61" 39 | requests = "^2.32.3" 40 | pypdf = "^5.4.0" 41 | 42 | 43 | [tool.poetry.group.dev.dependencies] 44 | mock = "^5.2.0" 45 | responses =">=0.25.7,~=0.25" 46 | pytest = "^8.3.3" 47 | pytest-cov = "^6.0.0" 48 | ipdb = "^0.13.9" 49 | 50 | [tool.coverage.run] 51 | include = ["refextract/*.py"] 52 | 53 | [tool.pytest.ini_options] 54 | addopts = "--cov=refextract --cov-report=term-missing:skip-covered" 55 | 56 | [build-system] 57 | requires = ["poetry-core>=1.0.0"] 58 | build-backend = "poetry.core.masonry.api" 59 | -------------------------------------------------------------------------------- /refextract/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract. 4 | # Copyright (C) 2015, 2016, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | """Refextract.""" 25 | 26 | from refextract.references.api import ( 27 | extract_journal_reference, 28 | extract_references_from_file, 29 | extract_references_from_string, 30 | extract_references_from_url, 31 | ) 32 | 33 | __all__ = ( 34 | "extract_journal_reference", 35 | "extract_references_from_file", 36 | "extract_references_from_string", 37 | "extract_references_from_url", 38 | ) 39 | -------------------------------------------------------------------------------- /refextract/app.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from flask import Flask, jsonify, make_response 4 | from prometheus_flask_exporter.multiprocess import GunicornInternalPrometheusMetrics 5 | from webargs import fields 6 | from webargs.flaskparser import FlaskParser 7 | 8 | from refextract.references.api import ( 9 | extract_journal_reference, 10 | extract_references_from_string, 11 | extract_references_from_url, 12 | ) 13 | 14 | parser = FlaskParser() 15 | 16 | LOGGER = logging.getLogger(__name__) 17 | 18 | 19 | def create_app(): 20 | app = Flask(__name__) 21 | app.config.from_pyfile("config.cfg", silent=True) 22 | 23 | @app.route("/extract_journal_info", methods=["POST"]) 24 | @parser.use_args( 25 | { 26 | "publication_infos": fields.List(fields.Dict, required=True), 27 | "journal_kb_data": fields.Dict(required=True), 28 | }, 29 | location="json", 30 | ) 31 | def extract_journal_info(args): 32 | publication_infos = args.pop("publication_infos") 33 | journal_kb_data = args.pop("journal_kb_data") 34 | extracted_publication_infos = [] 35 | journal_dict = {"journals": journal_kb_data} 36 | try: 37 | for publication_info in publication_infos: 38 | if not publication_info.get("pubinfo_freetext"): 39 | extracted_publication_infos.append({}) 40 | continue 41 | extracted_publication_info = extract_journal_reference( 42 | publication_info["pubinfo_freetext"], 43 | override_kbs_files=journal_dict, 44 | ) 45 | if not extracted_publication_info: 46 | extracted_publication_info = {} 47 | extracted_publication_infos.append(extracted_publication_info) 48 | except Exception as e: 49 | return make_response( 50 | jsonify( 51 | { 52 | "message": f"Can not extract publication info data." 53 | f" Reason: {str(e)}" 54 | } 55 | ), 56 | 500, 57 | ) 58 | return jsonify({"extracted_publication_infos": extracted_publication_infos}) 59 | 60 | @app.route("/extract_references_from_text", methods=["POST"]) 61 | @parser.use_args( 62 | { 63 | "text": fields.String(required=True), 64 | "journal_kb_data": fields.Dict(required=True), 65 | }, 66 | location="json", 67 | ) 68 | def extract_references_from_text(args): 69 | text = args.pop("text") 70 | journal_kb_data = args.pop("journal_kb_data") 71 | journal_dict = {"journals": journal_kb_data} 72 | try: 73 | extracted_references = extract_references_from_string( 74 | text, 75 | override_kbs_files=journal_dict, 76 | reference_format="{title},{volume},{page}", 77 | ) 78 | except Exception as e: 79 | return make_response( 80 | jsonify({"message": f"Can not extract references. Reason: {str(e)}"}), 81 | 500, 82 | ) 83 | return jsonify({"extracted_references": extracted_references}) 84 | 85 | @app.route("/extract_references_from_url", methods=["POST"]) 86 | @parser.use_args( 87 | { 88 | "url": fields.String(required=True), 89 | "journal_kb_data": fields.Dict(required=True), 90 | }, 91 | location="json", 92 | ) 93 | def extract_references_from_file_url(args): 94 | url = args.pop("url") 95 | journal_kb_data = args.pop("journal_kb_data") 96 | journal_dict = {"journals": journal_kb_data} 97 | try: 98 | extracted_references = extract_references_from_url( 99 | url, 100 | **{ 101 | "override_kbs_files": journal_dict, 102 | "reference_format": "{title},{volume},{page}", 103 | }, 104 | ) 105 | except Exception as e: 106 | return make_response( 107 | jsonify({"message": f"Can not extract references. Reason: {str(e)}"}), 108 | 500, 109 | ) 110 | return jsonify({"extracted_references": extracted_references}) 111 | 112 | @app.route("/extract_references_from_list", methods=["POST"]) 113 | @parser.use_args( 114 | { 115 | "raw_references": fields.List(fields.String, required=True), 116 | "journal_kb_data": fields.Dict(required=True), 117 | }, 118 | location="json", 119 | ) 120 | def extract_references_from_list(args): 121 | references = args.pop("raw_references") 122 | journal_kb_data = args.pop("journal_kb_data") 123 | journal_dict = {"journals": journal_kb_data} 124 | extracted_references = [] 125 | for reference in references: 126 | try: 127 | extracted_reference = extract_references_from_string( 128 | reference, 129 | override_kbs_files=journal_dict, 130 | reference_format="{title},{volume},{page}", 131 | ) 132 | if extracted_reference: 133 | extracted_references.append(extracted_reference[0]) 134 | else: 135 | extracted_references.append({"raw_ref": [reference]}) 136 | except Exception as e: 137 | LOGGER.error( 138 | f"Failed to extract reference: {reference}. Reason: {str(e)}" 139 | ) 140 | extracted_references.append({"raw_ref": [reference]}) 141 | return jsonify({"extracted_references": extracted_references}) 142 | 143 | return app 144 | 145 | 146 | app = create_app() 147 | 148 | if app.config.get("PROMETHEUS_ENABLE_EXPORTER_FLASK"): 149 | LOGGER.info("Starting prometheus metrics exporter") 150 | metrics = GunicornInternalPrometheusMetrics.for_app_factory() 151 | metrics.init_app(app) 152 | 153 | if __name__ == "__main__": 154 | app.run(host="0.0.0.0") 155 | -------------------------------------------------------------------------------- /refextract/authors/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract. 4 | # Copyright (C) 2015, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | -------------------------------------------------------------------------------- /refextract/config.cfg: -------------------------------------------------------------------------------- 1 | FILES_DOWNLOAD_MAX_RETRIES = 3 2 | FILES_DOWNLOAD_TIMEOUT = 60 3 | PROMETHEUS_ENABLE_EXPORTER_FLASK = False 4 | -------------------------------------------------------------------------------- /refextract/documents/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract. 4 | # Copyright (C) 2015, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | -------------------------------------------------------------------------------- /refextract/documents/pdf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract. 4 | # Copyright (C) 2013, 2015, 2016, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | """ 25 | When a document is converted to plain-text from PDF, 26 | certain characters may result in the plain-text, that are 27 | either unwanted, or broken. These characters need to be corrected 28 | or removed. Examples are, certain control characters that would 29 | be illegal in XML and must be removed; TeX ligatures (etc); broken 30 | accents such as umlauts on letters that must be corrected. 31 | This function returns a dictionary of (unwanted) characters to look 32 | for and the characters that should be used to replace them. 33 | @return: (dictionary) - { seek -> replace, } or charsacters to 34 | replace in plain-text. 35 | """ 36 | 37 | import logging 38 | import os 39 | import re 40 | import subprocess 41 | 42 | from refextract.references.config import CFG_PATH_PDFTOTEXT 43 | 44 | LOGGER = logging.getLogger(__name__) 45 | 46 | 47 | def convert_PDF_to_plaintext(fpath, keep_layout=False): 48 | """Convert PDF to txt using pdftotext 49 | 50 | Take the path to a PDF file and run pdftotext for this file, capturing 51 | the output. 52 | @param fpath: (string) path to the PDF file 53 | @return: (list) of unicode strings (contents of the PDF file translated 54 | into plaintext; each string is a line in the document.) 55 | """ 56 | if not os.path.isfile(CFG_PATH_PDFTOTEXT): 57 | raise IOError("Missing pdftotext executable") 58 | 59 | layout_option = "-layout" if keep_layout else "-raw" 60 | doclines = [] 61 | # Pattern to check for lines with a leading page-break character. 62 | # If this pattern is matched, we want to split the page-break into 63 | # its own line because we rely upon this for trying to strip headers 64 | # and footers, and for some other pattern matching. 65 | p_break_in_line = re.compile(r"^\s*\f(.+)$", re.UNICODE) 66 | # build pdftotext command: 67 | cmd_pdftotext = [ 68 | CFG_PATH_PDFTOTEXT, 69 | layout_option, 70 | "-q", 71 | "-enc", 72 | "UTF-8", 73 | fpath, 74 | "-", 75 | ] 76 | 77 | LOGGER.debug("%s", " ".join(cmd_pdftotext)) 78 | # open pipe to pdftotext: 79 | pipe_pdftotext = subprocess.Popen(cmd_pdftotext, stdout=subprocess.PIPE) 80 | # read back results: 81 | for docline in pipe_pdftotext.stdout: 82 | unicodeline = docline.decode("utf-8") 83 | # Check for a page-break in this line: 84 | m_break_in_line = p_break_in_line.match(unicodeline) 85 | if m_break_in_line is None: 86 | # There was no page-break in this line. Just add the line: 87 | doclines.append(unicodeline) 88 | else: 89 | # If there was a page-break character in the same line as some 90 | # text, split it out into its own line so that we can later 91 | # try to find headers and footers: 92 | doclines.append("\f") 93 | doclines.append(m_break_in_line.group(1)) 94 | 95 | LOGGER.debug("convert_PDF_to_plaintext found: %s lines of text", len(doclines)) 96 | 97 | return doclines 98 | -------------------------------------------------------------------------------- /refextract/documents/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract. 4 | # Copyright (C) 2013, 2015, 2016, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | """Various utilities to manipulate or clean text""" 25 | 26 | import re 27 | 28 | re_space_comma = re.compile(r"\s,", re.UNICODE) 29 | re_space_semicolon = re.compile(r"\s;", re.UNICODE) 30 | re_space_period = re.compile(r"\s\.", re.UNICODE) 31 | re_colon_space_colon = re.compile(r":\s:", re.UNICODE) 32 | re_comma_space_colon = re.compile(r",\s:", re.UNICODE) 33 | re_space_closing_square_bracket = re.compile(r"\s\]", re.UNICODE) 34 | re_opening_square_bracket_space = re.compile(r"\[\s", re.UNICODE) 35 | re_hyphens = re.compile( 36 | rb"(\\255|\u02D7|\u0335|\u0336|\u2212|\u002D|\uFE63|\uFF0D)".decode( 37 | "raw_unicode_escape" 38 | ), 39 | re.UNICODE, 40 | ) 41 | re_multiple_space = re.compile(r"\s{2,}", re.UNICODE) 42 | 43 | re_group_captured_multiple_space = re.compile(r"(\s{2,})", re.UNICODE) 44 | 45 | 46 | def get_url_repair_patterns(): 47 | """Initialise and return a list of precompiled regexp patterns that 48 | are used to try to re-assemble URLs that have been broken during 49 | a document's conversion to plain-text. 50 | @return: (list) of compiled re regexp patterns used for finding 51 | various broken URLs. 52 | """ 53 | file_types_list = [ 54 | r"h\s*t\s*m", # htm 55 | r"h\s*t\s*m\s*l", # html 56 | r"t\s*x\s*t", # txt 57 | r"p\s*h\s*p", # php 58 | r"a\s*s\s*p\s*", # asp 59 | r"j\s*s\s*p", # jsp 60 | r"p\s*y", # py (python) 61 | r"p\s*l", # pl (perl) 62 | r"x\s*m\s*l", # xml 63 | r"j\s*p\s*g", # jpg 64 | r"g\s*i\s*f", # gif 65 | r"m\s*o\s*v", # mov 66 | r"s\s*w\s*f", # swf 67 | r"p\s*d\s*f", # pdf 68 | r"p\s*s", # ps 69 | r"d\s*o\s*c", # doc 70 | r"t\s*e\s*x", # tex 71 | r"s\s*h\s*t\s*m\s*l", # shtml 72 | ] 73 | 74 | pattern_list = [ 75 | r"(h\s*t\s*t\s*p\s*\:\s*\/\s*\/)", 76 | r"(f\s*t\s*p\s*\:\s*\/\s*\/\s*)", 77 | r"((http|ftp):\/\/\s*[\w\d])", 78 | r"((http|ftp):\/\/([\w\d\s\._\-])+?\s*\/)", 79 | r"((http|ftp):\/\/([\w\d\_\.\-])+\/(([\w\d\_\s\.\-])+?\/)+)", 80 | r"((http|ftp):\/\/([\w\d\_\.\-])+\/(([\w\d\_\s\.\-])+?\/)*([\w\d\_\s\-]+\.\s?[\w\d]+))", 81 | ] 82 | pattern_list = [re.compile(p, re.I | re.UNICODE) for p in pattern_list] 83 | 84 | # some possible endings for URLs: 85 | p = r"((http|ftp):\/\/([\w\d\_\.\-])+\/(([\w\d\_\.\-])+?\/)*([\w\d\_\-]+\.%s))" 86 | for extension in file_types_list: 87 | p_url = re.compile(p % extension, re.I | re.UNICODE) 88 | pattern_list.append(p_url) 89 | 90 | # if url last thing in line, and only 10 letters max, concat them 91 | p_url = re.compile( 92 | r"((http|ftp):\/\/([\w\d\_\.\-])+\/(([\w\d\_\.\-])+?\/)*\s*?([\w\d\_\.\-]\s?){1,10}\s*)$", 93 | re.I | re.UNICODE, 94 | ) 95 | pattern_list.append(p_url) 96 | 97 | return pattern_list 98 | 99 | 100 | # a list of patterns used to try to repair broken URLs within reference lines: 101 | re_list_url_repair_patterns = get_url_repair_patterns() 102 | 103 | 104 | def join_lines(line1, line2): 105 | """Join 2 lines of text 106 | 107 | >>> join_lines('abc', 'de') 108 | 'abcde' 109 | >>> join_lines('a-', 'b') 110 | 'ab' 111 | """ 112 | if line1 == "": 113 | pass 114 | elif line1[-1] == "-": 115 | # hyphenated word at the end of the 116 | # line - don't add in a space and remove hyphen 117 | line1 = line1[:-1] 118 | elif line1[-1] != " ": 119 | # no space at the end of this 120 | # line, add in a space 121 | line1 = line1 + " " 122 | return line1 + line2 123 | 124 | 125 | def repair_broken_urls(line): 126 | """Attempt to repair broken URLs in a line of text. 127 | 128 | E.g.: remove spaces from the middle of a URL; something like that. 129 | 130 | @param line: (string) the line in which to check for broken URLs. 131 | @return: (string) the line after any broken URLs have been repaired. 132 | """ 133 | 134 | def _chop_spaces_in_url_match(m): 135 | """Suppresses spaces in a matched URL.""" 136 | return m.group(1).replace(" ", "") 137 | 138 | for ptn in re_list_url_repair_patterns: 139 | line = ptn.sub(_chop_spaces_in_url_match, line) 140 | return line 141 | 142 | 143 | def remove_and_record_multiple_spaces_in_line(line): 144 | """For a given string, locate all ocurrences of multiple spaces 145 | together in the line, record the number of spaces found at each 146 | position, and replace them with a single space. 147 | @param line: (string) the text line to be processed for multiple 148 | spaces. 149 | @return: (tuple) countaining a dictionary and a string. The 150 | dictionary contains information about the number of spaces removed 151 | at given positions in the line. For example, if 3 spaces were 152 | removed from the line at index '22', the dictionary would be set 153 | as follows: { 22 : 3 } 154 | The string that is also returned in this tuple is the line after 155 | multiple-space ocurrences have replaced with single spaces. 156 | """ 157 | removed_spaces = {} 158 | # get a collection of match objects for all instances of 159 | # multiple-spaces found in the line: 160 | multispace_matches = re_group_captured_multiple_space.finditer(line) 161 | # record the number of spaces found at each match position: 162 | for multispace in multispace_matches: 163 | removed_spaces[multispace.start()] = multispace.end() - multispace.start() - 1 164 | # now remove the multiple-spaces from the line, replacing with a 165 | # single space at each position: 166 | line = re_group_captured_multiple_space.sub(" ", line) 167 | return (removed_spaces, line) 168 | 169 | 170 | def wash_line(line): 171 | """Wash a text line of certain punctuation errors, replacing them with 172 | more correct alternatives. E.g.: the string 'Yes , I like python.' 173 | will be transformed into 'Yes, I like python.' 174 | @param line: (string) the line to be washed. 175 | @return: (string) the washed line. 176 | """ 177 | line = re_space_comma.sub(",", line) 178 | line = re_space_semicolon.sub(";", line) 179 | line = re_space_period.sub(".", line) 180 | line = re_colon_space_colon.sub(":", line) 181 | line = re_comma_space_colon.sub(":", line) 182 | line = re_space_closing_square_bracket.sub("]", line) 183 | line = re_opening_square_bracket_space.sub("[", line) 184 | line = re_hyphens.sub("-", line) 185 | line = re_multiple_space.sub(" ", line) 186 | return line 187 | 188 | 189 | def remove_page_boundary_lines(docbody): 190 | """Try to locate page breaks, headers and footers within a document body, 191 | and remove the array cells at which they are found. 192 | @param docbody: (list) of strings, each string being a line in the 193 | document's body. 194 | @return: (list) of strings. The document body, hopefully with page- 195 | breaks, headers and footers removed. Each string in the list once more 196 | represents a line in the document. 197 | """ 198 | number_head_lines = number_foot_lines = 0 199 | # Make sure document not just full of whitespace: 200 | if not document_contains_text(docbody): 201 | # document contains only whitespace - cannot safely 202 | # strip headers/footers 203 | return docbody 204 | 205 | # Get list of index posns of pagebreaks in document: 206 | page_break_posns = get_page_break_positions(docbody) 207 | 208 | # Get num lines making up each header if poss: 209 | number_head_lines = get_number_header_lines(docbody, page_break_posns) 210 | 211 | # Get num lines making up each footer if poss: 212 | number_foot_lines = get_number_footer_lines(docbody, page_break_posns) 213 | 214 | # Remove pagebreaks,headers,footers: 215 | docbody = strip_headers_footers_pagebreaks( 216 | docbody, page_break_posns, number_head_lines, number_foot_lines 217 | ) 218 | 219 | return docbody 220 | 221 | 222 | def document_contains_text(docbody): 223 | """Test whether document contains text, or is just full of worthless 224 | whitespace. 225 | @param docbody: (list) of strings - each string being a line of the 226 | document's body 227 | @return: (integer) 1 if non-whitespace found in document; 0 if only 228 | whitespace found in document. 229 | """ 230 | found_non_space = 0 231 | for line in docbody: 232 | if not line.isspace(): 233 | # found a non-whitespace character in this line 234 | found_non_space = 1 235 | break 236 | return found_non_space 237 | 238 | 239 | def get_page_break_positions(docbody): 240 | """Locate page breaks in the list of document lines and create a list 241 | positions in the document body list. 242 | @param docbody: (list) of strings - each string is a line in the 243 | document. 244 | @return: (list) of integer positions, whereby each integer represents the 245 | position (in the document body) of a page-break. 246 | """ 247 | page_break_posns = [] 248 | p_break = re.compile(r"^\s*\f\s*$", re.UNICODE) 249 | num_document_lines = len(docbody) 250 | for i in range(num_document_lines): 251 | if p_break.match(docbody[i]) is not None: 252 | page_break_posns.append(i) 253 | return page_break_posns 254 | 255 | 256 | def get_number_header_lines(docbody, page_break_posns): 257 | """Try to guess the number of header lines each page of a document has. 258 | The positions of the page breaks in the document are used to try to guess 259 | the number of header lines. 260 | @param docbody: (list) of strings - each string being a line in the 261 | document 262 | @param page_break_posns: (list) of integers - each integer is the 263 | position of a page break in the document. 264 | @return: (int) the number of lines that make up the header of each page. 265 | """ 266 | remaining_breaks = len(page_break_posns) - 1 267 | num_header_lines = empty_line = 0 268 | # pattern to search for a word in a line: 269 | p_wordSearch = re.compile(r"([A-Za-z0-9-]+)", re.UNICODE) 270 | if remaining_breaks > 2: 271 | next_head = 2 if remaining_breaks > 3 else 1 272 | keep_checking = 1 273 | while keep_checking: 274 | cur_break = 1 275 | if docbody[(page_break_posns[cur_break] + num_header_lines + 1)].isspace(): 276 | # this is a blank line 277 | empty_line = 1 278 | 279 | if (page_break_posns[cur_break] + num_header_lines + 1) == ( 280 | page_break_posns[(cur_break + 1)] 281 | ): 282 | # Have reached next page-break: document has no 283 | # body - only head/footers! 284 | keep_checking = 0 285 | 286 | grps_headLineWords = p_wordSearch.findall( 287 | docbody[(page_break_posns[cur_break] + num_header_lines + 1)] 288 | ) 289 | cur_break = cur_break + next_head 290 | while (cur_break < remaining_breaks) and keep_checking: 291 | lineno = page_break_posns[cur_break] + num_header_lines + 1 292 | if lineno >= len(docbody): 293 | keep_checking = 0 294 | break 295 | grps_thisLineWords = p_wordSearch.findall(docbody[lineno]) 296 | if empty_line: 297 | if len(grps_thisLineWords) != 0: 298 | # This line should be empty, but isn't 299 | keep_checking = 0 300 | else: 301 | if (len(grps_thisLineWords) == 0) or ( 302 | len(grps_headLineWords) != len(grps_thisLineWords) 303 | ): 304 | # Not same num 'words' as equivilent line 305 | # in 1st header: 306 | keep_checking = 0 307 | else: 308 | keep_checking = check_boundary_lines_similar( 309 | grps_headLineWords, grps_thisLineWords 310 | ) 311 | # Update cur_break for nxt line to check 312 | cur_break = cur_break + next_head 313 | if keep_checking: 314 | # Line is a header line: check next 315 | num_header_lines = num_header_lines + 1 316 | empty_line = 0 317 | return num_header_lines 318 | 319 | 320 | def get_number_footer_lines(docbody, page_break_posns): 321 | """Try to guess the number of footer lines each page of a document has. 322 | The positions of the page breaks in the document are used to try to guess 323 | the number of footer lines. 324 | @param docbody: (list) of strings - each string being a line in the 325 | document 326 | @param page_break_posns: (list) of integers - each integer is the 327 | position of a page break in the document. 328 | @return: (int) the number of lines that make up the footer of each page. 329 | """ 330 | num_breaks = len(page_break_posns) 331 | num_footer_lines = 0 332 | empty_line = 0 333 | keep_checking = 1 334 | p_wordSearch = re.compile(r"([A-Za-z0-9-]+)", re.UNICODE) 335 | if num_breaks > 2: 336 | while keep_checking: 337 | cur_break = 1 338 | if ( 339 | page_break_posns[cur_break] - num_footer_lines - 1 < 0 340 | or page_break_posns[cur_break] - num_footer_lines - 1 > len(docbody) - 1 341 | ): 342 | # Be sure that the docbody list boundary wasn't overstepped: 343 | break 344 | if docbody[(page_break_posns[cur_break] - num_footer_lines - 1)].isspace(): 345 | empty_line = 1 346 | grps_headLineWords = p_wordSearch.findall( 347 | docbody[(page_break_posns[cur_break] - num_footer_lines - 1)] 348 | ) 349 | cur_break = cur_break + 1 350 | while (cur_break < num_breaks) and keep_checking: 351 | grps_thisLineWords = p_wordSearch.findall( 352 | docbody[(page_break_posns[cur_break] - num_footer_lines - 1)] 353 | ) 354 | if empty_line: 355 | if len(grps_thisLineWords) != 0: 356 | # this line should be empty, but isn't 357 | keep_checking = 0 358 | else: 359 | if (len(grps_thisLineWords) == 0) or ( 360 | len(grps_headLineWords) != len(grps_thisLineWords) 361 | ): 362 | # Not same num 'words' as equivilent line 363 | # in 1st footer: 364 | keep_checking = 0 365 | else: 366 | keep_checking = check_boundary_lines_similar( 367 | grps_headLineWords, grps_thisLineWords 368 | ) 369 | # Update cur_break for nxt line to check 370 | cur_break = cur_break + 1 371 | if keep_checking: 372 | # Line is a footer line: check next 373 | num_footer_lines = num_footer_lines + 1 374 | empty_line = 0 375 | return num_footer_lines 376 | 377 | 378 | def strip_headers_footers_pagebreaks( 379 | docbody, page_break_posns, num_head_lines, num_foot_lines 380 | ): 381 | """Remove page-break lines, header lines, and footer lines from the 382 | document. 383 | @param docbody: (list) of strings, whereby each string in the list is a 384 | line in the document. 385 | @param page_break_posns: (list) of integers, whereby each integer 386 | represents the index in docbody at which a page-break is found. 387 | @param num_head_lines: (int) the number of header lines each page in the 388 | document has. 389 | @param num_foot_lines: (int) the number of footer lines each page in the 390 | document has. 391 | @return: (list) of strings - the document body after the headers, 392 | footers, and page-break lines have been stripped from the list. 393 | """ 394 | num_breaks = len(page_break_posns) 395 | page_lens = [] 396 | for x in range(0, num_breaks): 397 | if x < num_breaks - 1: 398 | page_lens.append(page_break_posns[x + 1] - page_break_posns[x]) 399 | page_lens.sort() 400 | if (len(page_lens) > 0) and (num_head_lines + num_foot_lines + 1 < page_lens[0]): 401 | # Safe to chop hdrs & ftrs 402 | page_break_posns.reverse() 403 | first = 1 404 | for i in range(0, len(page_break_posns)): 405 | # Unless this is the last page break, chop headers 406 | if not first: 407 | for _dummy in range(1, num_head_lines + 1): 408 | docbody[page_break_posns[i] + 1 : page_break_posns[i] + 2] = [] 409 | else: 410 | first = 0 411 | # Chop page break itself 412 | docbody[page_break_posns[i] : page_break_posns[i] + 1] = [] 413 | # Chop footers (unless this is the first page break) 414 | if i != len(page_break_posns) - 1: 415 | for _dummy in range(1, num_foot_lines + 1): 416 | docbody[ 417 | page_break_posns[i] - num_foot_lines : page_break_posns[i] 418 | - num_foot_lines 419 | + 1 420 | ] = [] 421 | return docbody 422 | 423 | 424 | def check_boundary_lines_similar(l_1, l_2): 425 | """Compare two lists to see if their elements are roughly the same. 426 | @param l_1: (list) of strings. 427 | @param l_2: (list) of strings. 428 | @return: (int) 1/0. 429 | """ 430 | num_matches = 0 431 | if not isinstance(l_1, list) or not isinstance(l_2, list) or (len(l_1) != len(l_2)): 432 | # these 'boundaries' are not similar 433 | return 0 434 | 435 | num_elements = len(l_1) 436 | for i in range(0, num_elements): 437 | if l_1[i].isdigit() and l_2[i].isdigit(): 438 | # both lines are integers 439 | num_matches += 1 440 | else: 441 | l1_str = l_1[i].lower() 442 | l2_str = l_2[i].lower() 443 | if (l1_str[0] == l2_str[0]) and ( 444 | l1_str[len(l1_str) - 1] == l2_str[len(l2_str) - 1] 445 | ): 446 | num_matches = num_matches + 1 447 | if (len(l_1) == 0) or (float(num_matches) / float(len(l_1)) < 0.9): 448 | return 0 449 | else: 450 | return 1 451 | -------------------------------------------------------------------------------- /refextract/references/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract. 4 | # Copyright (C) 2015, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | -------------------------------------------------------------------------------- /refextract/references/api.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract. 4 | # Copyright (C) 2013, 2015, 2016, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | """This is where all the public API calls are accessible to extract references. 25 | 26 | There are 4 API functions available to extract from PDF file, string or URL. In 27 | addition, there is an API call to return a parsed journal reference structure 28 | from a raw string. 29 | """ 30 | 31 | import os 32 | from tempfile import mkstemp 33 | 34 | import magic 35 | import requests 36 | from inspire_utils.dedupers import dedupe_list 37 | 38 | from refextract.references.engine import ( 39 | get_kbs, 40 | get_plaintext_document_body, 41 | parse_reference_line, 42 | parse_references, 43 | ) 44 | from refextract.references.errors import FullTextNotAvailableError 45 | from refextract.references.find import ( 46 | find_numeration_in_body, 47 | get_reference_section_beginning, 48 | ) 49 | from refextract.references.pdf import extract_texkeys_and_urls_from_pdf 50 | from refextract.references.record import update_reference_with_urls 51 | from refextract.references.text import ( 52 | extract_references_from_fulltext, 53 | rebuild_reference_lines, 54 | ) 55 | 56 | 57 | def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs): 58 | """Extract references from the pdf specified in the url. 59 | 60 | The first parameter is the URL of the file. 61 | It returns a list of parsed references. 62 | 63 | It raises FullTextNotAvailableError if the URL gives a 404, 64 | UnknownDocumentTypeError if it is not a PDF or plain text. 65 | 66 | The standard reference format is: {title} {volume} ({year}) {page}. 67 | 68 | E.g. you can change that by passing the reference_format: 69 | 70 | >>> extract_references_from_url(path, reference_format="{title},{volume},{page}") 71 | 72 | If you want to also link each reference to some other resource (like a record), 73 | you can provide a linker_callback function to be executed for every reference 74 | element found. 75 | 76 | To override KBs for journal names etc., use ``override_kbs_files``: 77 | 78 | >>> extract_references_from_url(path, 79 | override_kbs_files={'journals': 'my/path/to.kb'}) 80 | 81 | """ 82 | # Get temporary filepath to download to 83 | filename, filepath = mkstemp( 84 | suffix="_{0}".format(os.path.basename(url)), 85 | ) 86 | os.close(filename) 87 | 88 | try: 89 | req = requests.get(url=url, headers=headers, stream=True) 90 | req.raise_for_status() 91 | with open(filepath, "wb") as f: 92 | for chunk in req.iter_content(chunk_size): 93 | f.write(chunk) 94 | references = extract_references_from_file(filepath, **kwargs) 95 | except requests.exceptions.HTTPError as exc: 96 | raise FullTextNotAvailableError(f"URL not found: '{url}'") from exc 97 | finally: 98 | os.remove(filepath) 99 | return references 100 | 101 | 102 | def extract_references_from_file( 103 | path, 104 | recid=None, 105 | reference_format="{title} {volume} ({year}) {page}", 106 | linker_callback=None, 107 | override_kbs_files=None, 108 | ): 109 | """Extract references from a local pdf file. 110 | 111 | The first parameter is the path to the file. 112 | It returns a list of parsed references. 113 | It raises FullTextNotAvailableError if the file does not exist, 114 | UnknownDocumentTypeError if it is not a PDF or plain text. 115 | 116 | The standard reference format is: {title} {volume} ({year}) {page}. 117 | 118 | E.g. you can change that by passing the reference_format: 119 | 120 | >>> extract_references_from_file(path, reference_format=u"{title},{volume},{page}") 121 | 122 | If you want to also link each reference to some other resource (like a record), 123 | you can provide a linker_callback function to be executed for every reference 124 | element found. 125 | 126 | To override KBs for journal names etc., use ``override_kbs_files``: 127 | 128 | >>> extract_references_from_file(path, 129 | override_kbs_files={'journals': 'my/path/to.kb'}) 130 | 131 | """ 132 | if not os.path.isfile(path): 133 | raise FullTextNotAvailableError("File not found: '{0}'".format(path)) 134 | 135 | docbody = get_plaintext_document_body(path) 136 | reflines, dummy, dummy = extract_references_from_fulltext(docbody) 137 | if not reflines: 138 | docbody = get_plaintext_document_body(path, keep_layout=True) 139 | reflines, dummy, dummy = extract_references_from_fulltext(docbody) 140 | 141 | parsed_refs, stats = parse_references( 142 | reflines, 143 | recid=recid, 144 | reference_format=reference_format, 145 | linker_callback=linker_callback, 146 | override_kbs_files=override_kbs_files, 147 | ) 148 | 149 | if magic.from_file(path, mime=True) == "application/pdf": 150 | extracted_texkeys_urls = extract_texkeys_and_urls_from_pdf(path) 151 | if len(extracted_texkeys_urls) == len(parsed_refs): 152 | parsed_refs_updated = [] 153 | for ref, ref_texkey_urls in zip( 154 | parsed_refs, extracted_texkeys_urls, strict=False 155 | ): 156 | update_reference_with_urls(ref, ref_texkey_urls.get("urls", [])) 157 | if ref.get("url"): 158 | ref["url"] = dedupe_list(ref["url"]) 159 | parsed_refs_updated.append( 160 | dict(ref, texkey=[ref_texkey_urls["texkey"]]) 161 | ) 162 | 163 | return parsed_refs_updated 164 | return parsed_refs 165 | 166 | 167 | def extract_references_from_string( 168 | source, 169 | is_only_references=True, 170 | recid=None, 171 | reference_format="{title} {volume} ({year}) {page}", 172 | linker_callback=None, 173 | override_kbs_files=None, 174 | ): 175 | """Extract references from a raw string. 176 | 177 | The first parameter is the path to the file. 178 | It returns a tuple (references, stats). 179 | 180 | If the string does not only contain references, improve accuracy by 181 | specifing ``is_only_references=False``. 182 | 183 | The standard reference format is: {title} {volume} ({year}) {page}. 184 | 185 | E.g. you can change that by passing the reference_format: 186 | 187 | >>> extract_references_from_string(path, reference_format="{title},{volume},{page}") 188 | 189 | If you want to also link each reference to some other resource (like a record), 190 | you can provide a linker_callback function to be executed for every reference 191 | element found. 192 | 193 | To override KBs for journal names etc., use ``override_kbs_files``: 194 | 195 | >>> extract_references_from_string(path, 196 | override_kbs_files={'journals': 'my/path/to.kb'}) 197 | """ 198 | docbody = source.split("\n") 199 | if not is_only_references: 200 | reflines, dummy, dummy = extract_references_from_fulltext(docbody) 201 | else: 202 | refs_info = get_reference_section_beginning(docbody) 203 | if not refs_info: 204 | refs_info, dummy = find_numeration_in_body(docbody) 205 | refs_info["start_line"] = 0 206 | refs_info["end_line"] = (len(docbody) - 1,) 207 | 208 | reflines = rebuild_reference_lines(docbody, refs_info["marker_pattern"]) 209 | parsed_refs, stats = parse_references( 210 | reflines, 211 | recid=recid, 212 | reference_format=reference_format, 213 | linker_callback=linker_callback, 214 | override_kbs_files=override_kbs_files, 215 | ) 216 | return parsed_refs 217 | 218 | 219 | def extract_journal_reference(line, override_kbs_files=None): 220 | """Extract the journal reference from string. 221 | 222 | Extracts the journal reference from string and parses for specific 223 | journal information. 224 | """ 225 | kbs = get_kbs(custom_kbs=override_kbs_files) 226 | references, dummy_m, dummy_c, dummy_co = parse_reference_line(line, kbs) 227 | 228 | for elements in references: 229 | for el in elements: 230 | if el["type"] == "JOURNAL": 231 | return el 232 | -------------------------------------------------------------------------------- /refextract/references/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract. 4 | # Copyright (C) 2013, 2015, 2017, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | """refextract configuration.""" 25 | 26 | import os 27 | 28 | try: 29 | from shutil import which 30 | except ImportError: 31 | # CPython <3.3 32 | from distutils.spawn import find_executable as which 33 | 34 | import pkg_resources 35 | 36 | # Version number: 37 | CFG_PATH_PDFTOTEXT = os.environ.get("CFG_PATH_PDFTOTEXT", which("pdftotext")) 38 | 39 | # Module config directory 40 | CFG_KBS_DIR = pkg_resources.resource_filename("refextract.references", "kbs") 41 | 42 | CFG_REFEXTRACT_KBS = { 43 | "journals": "%s/journal-titles.kb" % CFG_KBS_DIR, 44 | "journals_re": "%s/journal-titles-re.kb" % CFG_KBS_DIR, 45 | "report-numbers": "%s/report-numbers.kb" % CFG_KBS_DIR, 46 | "authors": "%s/authors.kb" % CFG_KBS_DIR, 47 | "collaborations": "%s/collaborations.kb" % CFG_KBS_DIR, 48 | "books": "%s/books.kb" % CFG_KBS_DIR, 49 | "publishers": "%s/publishers.kb" % CFG_KBS_DIR, 50 | "special_journals": "%s/special-journals.kb" % CFG_KBS_DIR, 51 | } 52 | 53 | # Reference fields: 54 | CFG_REFEXTRACT_FIELDS = { 55 | "misc": "m", 56 | "linemarker": "o", 57 | "doi": "a", 58 | "hdl": "a", 59 | "reportnumber": "r", 60 | "journal": "s", 61 | "url": "u", 62 | "urldesc": "z", 63 | "author": "h", 64 | "title": "t", 65 | "isbn": "i", 66 | "publisher": "p", 67 | "year": "y", 68 | "collaboration": "c", 69 | "recid": "0", 70 | } 71 | 72 | # Internal tags are used by refextract to mark-up recognised citation 73 | # information. 74 | CFG_REFEXTRACT_MARKER_OPENING_REPORT_NUM = r"" 75 | CFG_REFEXTRACT_MARKER_OPENING_ARXIV = r"" 76 | CFG_REFEXTRACT_MARKER_OPENING_TITLE = r"" 77 | CFG_REFEXTRACT_MARKER_OPENING_TITLE_IBID = r"" 78 | CFG_REFEXTRACT_MARKER_OPENING_SERIES = r"" 79 | CFG_REFEXTRACT_MARKER_OPENING_VOLUME = r"" 80 | CFG_REFEXTRACT_MARKER_OPENING_YEAR = r"" 81 | CFG_REFEXTRACT_MARKER_OPENING_PAGE = r"" 82 | CFG_REFEXTRACT_MARKER_OPENING_QUOTED = r"" 83 | CFG_REFEXTRACT_MARKER_OPENING_ISBN = r"" 84 | CFG_REFEXTRACT_MARKER_OPENING_PUBLISHER = r"" 85 | CFG_REFEXTRACT_MARKER_OPENING_COLLABORATION = r"" 86 | 87 | # These are the "closing tags: 88 | CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM = r"" 89 | CFG_REFEXTRACT_MARKER_CLOSING_ARXIV = r"" 90 | CFG_REFEXTRACT_MARKER_CLOSING_TITLE = r"" 91 | CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID = r"" 92 | CFG_REFEXTRACT_MARKER_CLOSING_SERIES = r"" 93 | CFG_REFEXTRACT_MARKER_CLOSING_VOLUME = r"" 94 | CFG_REFEXTRACT_MARKER_CLOSING_YEAR = r"" 95 | CFG_REFEXTRACT_MARKER_CLOSING_PAGE = r"" 96 | CFG_REFEXTRACT_MARKER_CLOSING_QUOTED = r"" 97 | CFG_REFEXTRACT_MARKER_CLOSING_ISBN = r"" 98 | CFG_REFEXTRACT_MARKER_CLOSING_PUBLISHER = r"" 99 | CFG_REFEXTRACT_MARKER_CLOSING_COLLABORATION = r"" 100 | 101 | # Of the form '' only 102 | CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND = r"" 103 | CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL = r"" 104 | CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL = r"" 105 | 106 | # The minimum length of a reference's misc text to be deemed insignificant. 107 | # when comparing misc text with semi-colon defined sub-references. 108 | # Values higher than this value reflect meaningful misc text. 109 | # Hence, upon finding a correct semi-colon, but having current misc text 110 | # length less than this value (without other meaningful reference objects: 111 | # report numbers, titles...) then no split will occur. 112 | # (A higher value will increase splitting strictness. i.e. Fewer splits) 113 | CGF_REFEXTRACT_SEMI_COLON_MISC_TEXT_SENSITIVITY = 60 114 | 115 | # The length of misc text between two adjacent authors which is 116 | # deemed as insignificant. As such, when misc text of a length less 117 | # than this value is found, then the latter author group is dumped into misc. 118 | # (A higher value will increase splitting strictness. i.e. Fewer splits) 119 | CGF_REFEXTRACT_ADJACENT_AUTH_MISC_SEPARATION = 10 120 | 121 | # Maximum number of lines for a citation before it is considered invalid 122 | CFG_REFEXTRACT_MAX_LINES = 25 123 | -------------------------------------------------------------------------------- /refextract/references/errors.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract. 4 | # Copyright (C) 2015, 2016, 2018 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | """Custom exceptions.""" 25 | 26 | 27 | class FullTextNotAvailableError(Exception): 28 | """Raised when we cannot access the document text.""" 29 | 30 | 31 | class UnknownDocumentTypeError(Exception): 32 | """Raised when we don't know how to handle the document's MIME type.""" 33 | -------------------------------------------------------------------------------- /refextract/references/find.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract. 4 | # Copyright (C) 2013, 2015, 2016, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | """Finding the reference section from the fulltext""" 25 | 26 | import contextlib 27 | import logging 28 | import re 29 | 30 | from refextract.references.regexs import ( 31 | get_post_reference_section_keyword_patterns, 32 | get_post_reference_section_title_patterns, 33 | get_reference_line_numeration_marker_patterns, 34 | get_reference_section_title_patterns, 35 | re_num, 36 | re_reference_line_bracket_markers, 37 | re_reference_line_dot_markers, 38 | re_reference_line_number_markers, 39 | regex_match_list, 40 | ) 41 | 42 | LOGGER = logging.getLogger(__name__) 43 | 44 | 45 | def find_reference_section(docbody): 46 | """Search in document body for its reference section. 47 | 48 | More precisely, find 49 | the first line of the reference section. Effectively, the function starts 50 | at the end of a document and works backwards, line-by-line, looking for 51 | the title of a reference section. It stops when (if) it finds something 52 | that it considers to be the first line of a reference section. 53 | @param docbody: (list) of strings - the full document body. 54 | @return: (dictionary) : 55 | { 'start_line' : (integer) - index in docbody of 1st reference line, 56 | 'title_string' : (string) - title of the reference section. 57 | 'marker' : (string) - the marker of the first reference line, 58 | 'marker_pattern' : (string) - regexp string used to find the marker, 59 | 'title_marker_same_line' : (integer) - flag to indicate whether the 60 | reference section title was on the same 61 | line as the first reference line's 62 | marker or not. 1 if it was; 0 if not. 63 | } 64 | Much of this information is used by later functions to rebuild 65 | a reference section. 66 | -- OR -- 67 | (None) - when the reference section could not be found. 68 | """ 69 | ref_details = None 70 | title_patterns = get_reference_section_title_patterns() 71 | 72 | # Try to find refs section title: 73 | for title_pattern in title_patterns: 74 | # Look for title pattern in docbody 75 | for reversed_index, line in enumerate(reversed(docbody)): 76 | title_match = title_pattern.match(line) 77 | if title_match: 78 | title = title_match.group("title") 79 | index = len(docbody) - 1 - reversed_index 80 | temp_ref_details, found_title = find_numeration( 81 | docbody[index : index + 6], title 82 | ) 83 | if temp_ref_details: 84 | if ( 85 | ref_details 86 | and "title" in ref_details 87 | and ref_details["title"] 88 | and not temp_ref_details["title"] 89 | ): 90 | continue 91 | if ( 92 | ref_details 93 | and "marker" in ref_details 94 | and ref_details["marker"] 95 | and not temp_ref_details["marker"] 96 | ): 97 | continue 98 | 99 | ref_details = temp_ref_details 100 | ref_details["start_line"] = index 101 | ref_details["title_string"] = title 102 | 103 | if found_title: 104 | break 105 | 106 | if ref_details: 107 | break 108 | 109 | return ref_details 110 | 111 | 112 | def find_numeration_in_body(docbody): 113 | marker_patterns = get_reference_line_numeration_marker_patterns() 114 | ref_details = None 115 | found_title = False 116 | 117 | # No numeration unless we find one 118 | ref_details = { 119 | "title_marker_same_line": False, 120 | "marker": None, 121 | "marker_pattern": None, 122 | } 123 | 124 | for line in docbody: 125 | # Move past blank lines 126 | if line.isspace(): 127 | continue 128 | 129 | # Is this line numerated like a reference line? 130 | m_num = None 131 | mark_match = regex_match_list(line, marker_patterns) 132 | if mark_match: 133 | # Check if it's the first reference 134 | # Something like [1] or (1), etc. 135 | try: 136 | m_num = mark_match.group("marknum") 137 | if m_num != "1": 138 | continue 139 | except IndexError: 140 | pass 141 | 142 | mark = mark_match.group("mark") 143 | mk_ptn = mark_match.re.pattern 144 | ref_details = { 145 | "marker": mark, 146 | "marker_pattern": mk_ptn, 147 | "title_marker_same_line": False, 148 | } 149 | 150 | break 151 | 152 | return ref_details, found_title 153 | 154 | 155 | def find_numeration_in_title(docbody, title): 156 | ref_details = None 157 | found_title = False 158 | 159 | try: 160 | first_line = docbody[0] 161 | except IndexError: 162 | return ref_details, found_title 163 | 164 | # Need to escape to avoid problems like 'References[' 165 | title = re.escape(title) 166 | 167 | mk_with_title_ptns = get_reference_line_numeration_marker_patterns(title) 168 | mk_with_title_match = regex_match_list(first_line, mk_with_title_ptns) 169 | if mk_with_title_match: 170 | mk = mk_with_title_match.group("mark") 171 | mk_ptn = mk_with_title_match.re.pattern 172 | m_num = re_num.search(mk) 173 | if m_num and m_num.group(0) == "1": 174 | # Mark found 175 | found_title = True 176 | ref_details = { 177 | "marker": mk, 178 | "marker_pattern": mk_ptn, 179 | "title_marker_same_line": True, 180 | } 181 | else: 182 | ref_details = { 183 | "marker": mk, 184 | "marker_pattern": mk_ptn, 185 | "title_marker_same_line": True, 186 | } 187 | 188 | return ref_details, found_title 189 | 190 | 191 | def find_numeration(docbody, title): 192 | """Find numeration pattern 193 | 194 | 1st try to find numeration in the title 195 | e.g. 196 | References [4] Riotto... 197 | 198 | 2nd find the numeration alone in the line after the title 199 | e.g. 200 | References 201 | 1 202 | Riotto 203 | 204 | 3rnd find the numeration in the following line 205 | e.g. 206 | References 207 | [1] Riotto 208 | """ 209 | ref_details, found_title = find_numeration_in_title(docbody, title) 210 | if not ref_details: 211 | ref_details, found_title = find_numeration_in_body(docbody) 212 | 213 | return ref_details, found_title 214 | 215 | 216 | def find_reference_section_no_title_via_brackets(docbody): 217 | """This function would generally be used when it was not possible to locate 218 | the start of a document's reference section by means of its title. 219 | Instead, this function will look for reference lines that have numeric 220 | markers of the format [1], [2], etc. 221 | @param docbody: (list) of strings -each string is a line in the document. 222 | @return: (dictionary) : 223 | { 'start_line' : (integer) - index in docbody of 1st reference line, 224 | 'title_string' : (None) - title of the reference section 225 | (None since no title), 226 | 'marker' : (string) - the marker of the first reference line, 227 | 'marker_pattern' : (string) - the regexp string used to find the 228 | marker, 229 | 'title_marker_same_line' : (integer) 0 - to signal title not on same 230 | line as marker. 231 | } 232 | Much of this information is used by later functions to rebuild 233 | a reference section. 234 | -- OR -- 235 | (None) - when the reference section could not be found. 236 | """ 237 | marker_patterns = [re_reference_line_bracket_markers] 238 | return find_reference_section_no_title_generic(docbody, marker_patterns) 239 | 240 | 241 | def find_reference_section_no_title_via_dots(docbody): 242 | """This function would generally be used when it was not possible to locate 243 | the start of a document's reference section by means of its title. 244 | Instead, this function will look for reference lines that have numeric 245 | markers of the format 1., 2., etc. 246 | @param docbody: (list) of strings -each string is a line in the document. 247 | @return: (dictionary) : 248 | { 'start_line' : (integer) - index in docbody of 1st reference line, 249 | 'title_string' : (None) - title of the reference section 250 | (None since no title), 251 | 'marker' : (string) - the marker of the first reference line, 252 | 'marker_pattern' : (string) - the regexp string used to find the 253 | marker, 254 | 'title_marker_same_line' : (integer) 0 - to signal title not on same 255 | line as marker. 256 | } 257 | Much of this information is used by later functions to rebuild 258 | a reference section. 259 | -- OR -- 260 | (None) - when the reference section could not be found. 261 | """ 262 | marker_patterns = [re_reference_line_dot_markers] 263 | return find_reference_section_no_title_generic(docbody, marker_patterns) 264 | 265 | 266 | def find_reference_section_no_title_via_numbers(docbody): 267 | """This function would generally be used when it was not possible to locate 268 | the start of a document's reference section by means of its title. 269 | Instead, this function will look for reference lines that have numeric 270 | markers of the format 1, 2, etc. 271 | @param docbody: (list) of strings -each string is a line in the document. 272 | @return: (dictionary) : 273 | { 'start_line' : (integer) - index in docbody of 1st reference line, 274 | 'title_string' : (None) - title of the reference section 275 | (None since no title), 276 | 'marker' : (string) - the marker of the first reference line, 277 | 'marker_pattern' : (string) - the regexp string used to find the 278 | marker, 279 | 'title_marker_same_line' : (integer) 0 - to signal title not on same 280 | line as marker. 281 | } 282 | Much of this information is used by later functions to rebuild 283 | a reference section. 284 | -- OR -- 285 | (None) - when the reference section could not be found. 286 | """ 287 | marker_patterns = [re_reference_line_number_markers] 288 | return find_reference_section_no_title_generic(docbody, marker_patterns) 289 | 290 | 291 | def find_reference_section_no_title_generic(docbody, marker_patterns): 292 | """This function would generally be used when it was not possible to locate 293 | the start of a document's reference section by means of its title. 294 | Instead, this function will look for reference lines that have numeric 295 | markers of the format [1], [2], {1}, {2}, etc. 296 | @param docbody: (list) of strings -each string is a line in the document. 297 | @return: (dictionary) : 298 | { 'start_line' : (integer) - index in docbody of 1st reference line, 299 | 'title_string' : (None) - title of the reference section 300 | (None since no title), 301 | 'marker' : (string) - the marker of the first reference line, 302 | 'marker_pattern' : (string) - the regexp string used to find the 303 | marker, 304 | 'title_marker_same_line' : (integer) 0 - to signal title not on same 305 | line as marker. 306 | } 307 | Much of this information is used by later functions to rebuild 308 | a reference section. 309 | -- OR -- 310 | (None) - when the reference section could not be found. 311 | """ 312 | if not docbody: 313 | return None 314 | 315 | ref_start_line = ref_line_marker = None 316 | 317 | # try to find first reference line in the reference section: 318 | found_ref_sect = False 319 | 320 | for reversed_index, line in enumerate(reversed(docbody)): 321 | mark_match = regex_match_list(line.strip(), marker_patterns) 322 | if mark_match and mark_match.group("marknum") == "1": 323 | # Get marker recognition pattern: 324 | mark_pattern = mark_match.re.pattern 325 | 326 | # Look for [2] in next 10 lines: 327 | next_test_lines = 10 328 | 329 | index = len(docbody) - reversed_index 330 | zone_to_check = docbody[index : index + next_test_lines] 331 | if len(zone_to_check) < 5: 332 | # We found a 1 towards the end, we assume 333 | # we only have one reference 334 | found = True 335 | else: 336 | # Check for number 2 337 | found = False 338 | for line_ in zone_to_check: 339 | mark_match2 = regex_match_list(line_.strip(), marker_patterns) 340 | if mark_match2 and mark_match2.group("marknum") == "2": 341 | found = True 342 | break 343 | 344 | if found: 345 | # Found next reference line: 346 | found_ref_sect = True 347 | ref_start_line = len(docbody) - 1 - reversed_index 348 | ref_line_marker = mark_match.group("mark") 349 | ref_line_marker_pattern = mark_pattern 350 | break 351 | 352 | if found_ref_sect: 353 | ref_sectn_details = { 354 | "start_line": ref_start_line, 355 | "title_string": None, 356 | "marker": ref_line_marker.strip(), 357 | "marker_pattern": ref_line_marker_pattern, 358 | "title_marker_same_line": False, 359 | } 360 | else: 361 | # didn't manage to find the reference section 362 | ref_sectn_details = None 363 | 364 | return ref_sectn_details 365 | 366 | 367 | def find_end_of_reference_section( 368 | docbody, ref_start_line, ref_line_marker, ref_line_marker_ptn 369 | ): 370 | """Given that the start of a document's reference section has already been 371 | recognised, this function is tasked with finding the line-number in the 372 | document of the last line of the reference section. 373 | @param docbody: (list) of strings - the entire plain-text document body. 374 | @param ref_start_line: (integer) - the index in docbody of the first line 375 | of the reference section. 376 | @param ref_line_marker: (string) - the line marker of the first reference 377 | line. 378 | @param ref_line_marker_ptn: (string) - the pattern used to search for a 379 | reference line marker. 380 | @return: (integer) - index in docbody of the last reference line 381 | -- OR -- 382 | (None) - if ref_start_line was invalid. 383 | """ 384 | section_ended = False 385 | x = ref_start_line 386 | if type(x) is not int or x < 0 or x > len(docbody) or len(docbody) < 1: 387 | # The provided 'first line' of the reference section was invalid. 388 | # Either it was out of bounds in the document body, or it was not a 389 | # valid integer. 390 | # Can't safely find end of refs with this info - quit. 391 | return None 392 | # Get patterns for testing line: 393 | t_patterns = get_post_reference_section_title_patterns() 394 | kw_patterns = get_post_reference_section_keyword_patterns() 395 | 396 | if None not in (ref_line_marker, ref_line_marker_ptn): 397 | mk_patterns = [re.compile(ref_line_marker_ptn, re.I | re.UNICODE)] 398 | else: 399 | mk_patterns = get_reference_line_numeration_marker_patterns() 400 | 401 | current_reference_count = 0 402 | while x < len(docbody) and not section_ended: 403 | # save the reference count 404 | num_match = regex_match_list(docbody[x].strip(), mk_patterns) 405 | if num_match: 406 | with contextlib.suppress(ValueError, IndexError): 407 | current_reference_count = int(num_match.group("marknum")) 408 | 409 | # look for a likely section title that would follow a reference 410 | # section: 411 | end_match = regex_match_list(docbody[x].strip(), t_patterns) 412 | if not end_match: 413 | # didn't match a section title - try looking for keywords that 414 | # suggest the end of a reference section: 415 | end_match = regex_match_list(docbody[x].strip(), kw_patterns) 416 | else: 417 | # Is it really the end of the reference section? Check within the next 418 | # 5 lines for other reference numeration markers: 419 | y = x + 1 420 | line_found = False 421 | while y < x + 200 and y < len(docbody) and not line_found: 422 | num_match = regex_match_list(docbody[y].strip(), mk_patterns) 423 | if num_match and not num_match.group(0).isdigit(): 424 | try: 425 | num = int(num_match.group("marknum")) 426 | if current_reference_count + 1 == num: 427 | line_found = True 428 | except ValueError: 429 | # We have the marknum index so it is 430 | # numeric pattern for references like 431 | # [1], [2] but this match is not a number 432 | pass 433 | except IndexError: 434 | # We have a non numerical references marking 435 | # we don't check for a number continuity 436 | line_found = True 437 | y += 1 438 | if not line_found: 439 | # No ref line found-end section 440 | section_ended = True 441 | if not section_ended: 442 | # Does this & the next 5 lines simply contain numbers? If yes, it's 443 | # probably the axis scale of a graph in a fig. End refs section 444 | digit_test_str = ( 445 | docbody[x] 446 | .replace(" ", "") 447 | .replace(".", "") 448 | .replace("-", "") 449 | .replace("+", "") 450 | .replace("\u00d7", "") 451 | .replace("\u2212", "") 452 | .strip() 453 | ) 454 | if len(digit_test_str) > 10 and digit_test_str.isdigit(): 455 | # The line contains only digits and is longer than 10 chars: 456 | y = x + 1 457 | digit_lines = 4 458 | num_digit_lines = 1 459 | while y < x + digit_lines and y < len(docbody): 460 | digit_test_str = ( 461 | docbody[y] 462 | .replace(" ", "") 463 | .replace(".", "") 464 | .replace("-", "") 465 | .replace("+", "") 466 | .replace("\u00d7", "") 467 | .replace("\u2212", "") 468 | .strip() 469 | ) 470 | if len(digit_test_str) > 10 and digit_test_str.isdigit(): 471 | num_digit_lines += 1 472 | elif len(digit_test_str) == 0: 473 | # This is a blank line. Don't count it, to accommodate 474 | # documents that are double-line spaced: 475 | digit_lines += 1 476 | y = y + 1 477 | if num_digit_lines == digit_lines: 478 | section_ended = True 479 | x += 1 480 | return x - 1 481 | 482 | 483 | def get_reference_section_beginning(fulltext): 484 | sect_start = { 485 | "start_line": None, 486 | "end_line": None, 487 | "title_string": None, 488 | "marker_pattern": None, 489 | "marker": None, 490 | "how_found_start": None, 491 | } 492 | 493 | # Find start of refs section: 494 | sect_start = find_reference_section(fulltext) 495 | if sect_start is not None: 496 | sect_start["how_found_start"] = 1 497 | else: 498 | # No references found - try with no title option 499 | sect_start = find_reference_section_no_title_via_brackets(fulltext) 500 | if sect_start is not None: 501 | sect_start["how_found_start"] = 2 502 | # Try weaker set of patterns if needed 503 | if sect_start is None: 504 | # No references found - try with no title option (with weaker 505 | # patterns..) 506 | sect_start = find_reference_section_no_title_via_dots(fulltext) 507 | if sect_start is not None: 508 | sect_start["how_found_start"] = 3 509 | if sect_start is None: 510 | # No references found - try with no title option (with even 511 | # weaker patterns..) 512 | sect_start = find_reference_section_no_title_via_numbers(fulltext) 513 | if sect_start is not None: 514 | sect_start["how_found_start"] = 4 515 | 516 | if sect_start: 517 | LOGGER.debug("title %r", sect_start["title_string"]) 518 | LOGGER.debug("marker %r", sect_start["marker"]) 519 | LOGGER.debug("title_marker_same_line %s", sect_start["title_marker_same_line"]) 520 | 521 | else: 522 | LOGGER.debug("could not find references section") 523 | return sect_start 524 | -------------------------------------------------------------------------------- /refextract/references/kbs/authors.kb: -------------------------------------------------------------------------------- 1 | Du ̈hrssen---Dührssen 2 | -------------------------------------------------------------------------------- /refextract/references/kbs/collaborations.kb: -------------------------------------------------------------------------------- 1 | # This file holds text which must be recognised alongside authors, and hence included in the $h subfields. 2 | # Matches using this data do not affect how references are split. 3 | # (Just simply appends to the most recent $h subfield for the datafield, or makes a new one). 4 | # Do not append an 's' to the end. 5 | # Insert only the Upper cased version. 6 | CMS Collaboration---CMS Collaboration 7 | ATLAS Collaboration---ATLAS Collaboration 8 | ALICE Collaboration---ALICE Collaboration 9 | LEP Collaboration---LEP Collaboration 10 | CDF Collaboration---CDF Collaboration 11 | D0 Collaboration---D0 Collaboration 12 | ALEPH Collaboration---ALEPH Collaboration 13 | DELPHI Collaboration---DELPHI Collaboration 14 | L3 Collaboration---L3 Collaboration 15 | OPAL Collaboration---OPAL Collaboration 16 | CTEQ Collaboration---CTEQ Collaboration 17 | GEANT4 Collaboration---GEANT4 Collaboration 18 | LHC-B Collaboration---LHC-B Collaboration 19 | CDF II Collaboration---CDF II Collaboration 20 | RD 48 Collaboration---RD 48 Collaboration 21 | SLD Collaboration---SLD Collaboration 22 | H1 Collaboration---H1 Collaboration 23 | COMPASS Collaboration---COMPASS Collaboration 24 | HERMES Collaboration---HERMES Collaboration 25 | European Muon Collaboration---European Muon Collaboration 26 | Spin Muon Collaboration---Spin Muon Collaboration 27 | E143 Collaboration---E143 Collaboration 28 | Particle Data Group Collaboration---Particle Data Group Collaboration 29 | ATLAS Inner Detector software group Collaboration---ATLAS Inner Detector software group Collaboration 30 | DØ Collaboration---DØ Collaboration 31 | CUORE Collaboration---CUORE Collaboration 32 | Belle Collaboration---Belle Collaboration 33 | -------------------------------------------------------------------------------- /refextract/references/kbs/journal-titles-re.kb: -------------------------------------------------------------------------------- 1 | DAN---Dokl.Akad.Nauk Ser.Fiz. 2 | -------------------------------------------------------------------------------- /refextract/references/kbs/publishers.kb: -------------------------------------------------------------------------------- 1 | SHAKER---Shaker 2 | ELSEVIER---Elsevier 3 | NORTH HOLLAND---North-Holland 4 | BRANS---Brans 5 | BIRKHAEUSER---Birkhaeuser 6 | SCI PR---Sci. Pr. 7 | UNIV FORL---Univ.-Forl. 8 | LBL---LBL 9 | DE GRUYTER---de Gruyter 10 | AEG---AEG 11 | BEUTH---Beuth 12 | ELITERA---Elitera 13 | SPRINGER---Springer 14 | VEB VERL TECH---VEB Verl. Tech. 15 | CRC PR---CRC Pr. 16 | PITAGORA---Pitagora 17 | BIRKHAEUSER---Birkhaeuser 18 | GERSBACH---Gersbach 19 | VIEWEG---Vieweg 20 | HILGER---Hilger 21 | IOP---IOP 22 | MATH SCI PR---Math. Sci. Pr. 23 | UNIV PR---Univ. Pr. 24 | MIT PR---MIT Pr. 25 | UNIV PR---Univ. Pr. 26 | WILEY---Wiley 27 | RUTHERFORD LAB---Rutherford Lab. 28 | DARESBURY LAB---Daresbury Lab. 29 | TEOCHE MITTLER---Teoche-Mittler 30 | KLUWER ACADEMIC---Kluwer Academic 31 | REIDEL---Reidel 32 | ED FRONTIERES---Ed. Frontieres 33 | VDI---VDI 34 | FIZ4---FIZ4 35 | PHILIPS---Philips 36 | PRENTICE HALL---Prentice-Hall 37 | GIRARDET---Girardet 38 | UMSCHAU---Umschau 39 | ED FRONTIERES---Ed. Frontieres 40 | WORLD SCIENTIFIC---World Scientific 41 | GIOI PUBL---Gioi Publ. 42 | HUETIG---Huetig 43 | #BRAUN---Braun 44 | FACHBUCHVERL---Fachbuchverl. 45 | ACADEMIC---Academic 46 | ADDISON WESLEY---Addison-Wesley 47 | #BUTTERWORTH---Butterworth 48 | HARWOOD ACAD---Harwood Acad. 49 | HEINEMANN EDUC BOOKS---Heinemann Educ. Books 50 | IMP COLL PR---Imp. Coll. Pr. 51 | MCGRAW HILL---McGraw-Hill 52 | PERGAMON---Pergamon 53 | ROUTLEDGE & KEGAN PAUL---Routledge & Kegan Paul 54 | ROY SOC---Roy. Soc. 55 | WILEY---Wiley 56 | LANL---LANL 57 | TINNON BROWN---Tinnon-Brown 58 | BIBLIOGRAPH INST---Bibliograph. Inst. 59 | AIP---AIP 60 | EDITRICE ABITARE SEGESTA---Editrice Abitare Segesta 61 | UNIV PR---Univ. Pr. 62 | GERSBACH---Gersbach 63 | HANSER---Hanser 64 | OLDENBOURG---Oldenbourg 65 | PIPER---Piper 66 | THIEMIG---Thiemig 67 | ACADEMIC---Academic 68 | AM SOC MECH ENG---Am. Soc. Mech. Eng. 69 | AIP---AIP 70 | BENJAMIN---Benjamin 71 | DOVER---Dover 72 | GORDON AND BREACH---Gordon and Breach 73 | INTERSCIENCE---Interscience 74 | KLUWER ACADEMIC---Kluwer Academic 75 | MCGRAW HILL---McGraw-Hill 76 | NOVA SCI PUBL---Nova Sci. Publ. 77 | PERGAMON---Pergamon 78 | PLENUM---Plenum 79 | SPRINGER---Springer 80 | WILEY---Wiley 81 | CEBAF---CEBAF 82 | CLARENDON---Clarendon 83 | PERGAMON---Pergamon 84 | UNIV PR---Univ. Pr. 85 | ANNUAL REVIEWS---Annual Reviews 86 | INP---INP 87 | SIAM---SIAM 88 | VAN NOSTRAND---van Nostrand 89 | UNIV PR---Univ. Pr. 90 | ADDISON WESLEY---Addison-Wesley 91 | WORLD SCIENTIFIC---World Scientific 92 | MUELLER---Mueller 93 | ECOWIN VERL---Ecowin Verl. 94 | #FREEMAN---Freeman 95 | PITMAN---Pitman 96 | WORLD SCIENTIFIC---World Scientific 97 | NAT TECH INFORM SERV---Nat. Tech. Inform. Serv. 98 | SLAC---SLAC 99 | BERL UNION---Berl. Union 100 | TEUBNER---Teubner 101 | SWED PHYS ARCHIVE---Swed. Phys. Archive 102 | DEUTSCH---Deutsch 103 | KEK---KEK 104 | IAEA---IAEA 105 | SPRINGER---Springer 106 | IEEE---IEEE 107 | PHYSIK VERL---Physik-Verl. 108 | WILEY VCH---Wiley-VCH 109 | AIP---AIP 110 | -------------------------------------------------------------------------------- /refextract/references/kbs/report-numbers.kb: -------------------------------------------------------------------------------- 1 | *****LANL***** 2 | 3 | 4 | 5 | ACC PHYS ---acc-phys 6 | ADAP ORG ---adap-org 7 | ALG GEOM ---alg-geom 8 | AO SCI ---ao-sci 9 | AUTO FMS ---auto-fms 10 | BAYES AN ---bayes-an 11 | CD HG ---cd-hg 12 | CMP LG ---cmp-lg 13 | COMP GAS ---comp-gas 14 | DG GA ---dg-ga 15 | FUNCT AN ---funct-an 16 | GR QC ---gr-qc 17 | ARXIVHEP EX ---hep-ex 18 | ARXIVHEP PH ---hep-ph 19 | ARXIVHEP TH ---hep-th 20 | LC OM ---lc-om 21 | MTRL TH ---mtrl-th 22 | NEURO CEL ---neuro-cel 23 | NEURO DEV ---neuro-dev 24 | NEURO SCI ---neuro-sci 25 | PATT SOL ---patt-sol 26 | 27 | 28 | *****Fermilab extensions***** 29 | 30 | 31 | 32 | 33 | 34 | FERMILAB CONF ---FERMILAB-Conf 35 | FERMILAB FN ---FERMILAB-FN 36 | FERMILAB PUB ---FERMILAB-Pub 37 | FERMILAB TM ---FERMILAB-TM 38 | FERMILAB SLIDES ---FERMILAB-SLIDES 39 | FERMILAB POSTER ---FERMILAB-POSTER 40 | 41 | 42 | 43 | *****Fermilab no extensions***** 44 | 45 | 46 | 47 | 48 | 49 | FERMILAB CODE ---FERMILAB-CODE 50 | FERMILAB DESIGN ---FERMILAB-Design 51 | FERMILABDESIGN ---FERMILAB-Design 52 | FERMILAB PROPOSAL ---FERMILAB-Proposal 53 | FERMILAB THESIS ---FERMILAB-Thesis 54 | FERMILAB MASTERS---FERMILAB-Masters 55 | 56 | 57 | *****Fermilab DØ notes***** 58 | 59 | 60 | 61 | DØ NOTE---D0-Note 62 | D0 NOTE---D0-Note 63 | 64 | *****Fermilab CDF***** 65 | 66 | 67 | CDF ---CDF 68 | CDF ANAL ELECTROWEAK CDFR ---CDF-ANAL-ELECTROWEAK-CDFR 69 | CDF ANAL EXOTIC CDFR ---CDF-ANAL-EXOTIC-CDFR 70 | CDF ANAL EXOTIC PUBLIC ---CDF-ANAL-EXOTIC-PUBLIC 71 | CDF ANAL JET PUBLIC ---CDF-ANAL-JET-PUBLIC 72 | CDF ANAL TOP CDFR ---CDF-ANAL-TOP-CDFR 73 | CDF ANAL TOP PUBLIC ---CDF-ANAL-TOP-PUBLIC 74 | CDF DOC CDF CDFR ---CDF-DOC-CDF-CDFR 75 | CDF DOC CDF PUBLIC ---CDF-DOC-CDF-PUBLIC 76 | CDF DOC PLUG UPGR CDFR ---CDF-DOC-PLUG-UPGR-CDFR 77 | CDF NOTE ---CDF-NOTE 78 | CDF PHYS BOTTOM PUBLIC ---CDF-PHYS-BOTTOM-PUBLIC 79 | CDF PUB ---CDF-PUB 80 | CDF PUB BOTTOM CDFR ---CDF-PUB-BOTTOM-CDFR 81 | CDF PUB BOTTOM PUBLIC ---CDF-PUB-BOTTOM-PUBLIC 82 | CDF PUB CDF PUBLIC ---CDF-PUB-CDF-PUBLIC 83 | CDF PUB ELECTROWEAK CDFR ---CDF-PUB-ELECTROWEAK-CDFR 84 | CDF PUB ELECTROWEAK PUBLIC---CDF-PUB-ELECTROWEAK-PUBLIC 85 | CDF PUB EXOTIC CDFR ---CDF-PUB-EXOTIC-CDFR 86 | CDF PUB EXOTIC PUBLIC ---CDF-PUB-EXOTIC-PUBLIC 87 | CDF PUB HEAVYFLAVOR PUBLIC---CDF-PUB-HEAVYFLAVOR-PUBLIC 88 | CDF PUB JET CDFR ---CDF-PUB-JET-CDFR 89 | CDF PUB JET PUBLIC ---CDF-PUB-JET-PUBLIC 90 | CDF PUB MIN BIAS PUBLIC ---CDF-PUB-MIN-BIAS-PUBLIC 91 | CDF PUB PLUG UPGR PUBLIC ---CDF-PUB-PLUG-UPGR-PUBLIC 92 | CDF PUB PUBLIC ---CDF-PUB-PUBLIC 93 | CDF PUB SEC VTX PUBLIC ---CDF-PUB-SEC-VTX-PUBLIC 94 | CDF PUB SEC_VTX PUBLIC ---CDF-PUB-SEC-VTX-PUBLIC 95 | CDF PUB TOP CDFR ---CDF-PUB-TOP-CDFR 96 | CDF PUB TOP PUBLIC ---CDF-PUB-TOP-PUBLIC 97 | CDF THESIS BOTTOM PUBLIC ---CDF-THESIS-BOTTOM-PUBLIC 98 | CDF THESIS CDF PUBLIC ---CDF-THESIS-CDF-PUBLIC 99 | CDF THESIS TOP PUBLIC ---CDF-THESIS-TOP-PUBLIC 100 | CDF TOP PUBLIC ---CDF-TOP-PUBLIC 101 | 102 | 103 | *****Fermilab MicroBooNE***** 104 | 105 | 106 | MICROBOONE NOTE ---MICROBOONE-NOTE 107 | MICROBOONE PUBLIC NOTE ---MICROBOONE-NOTE 108 | 109 | *****CERN***** 110 | 111 | 112 | 113 | AB NOTE ---AB-NOTE 114 | ALEPH ---ALEPH 115 | ALICE ---ALICE 116 | ALICE INT ---ALICE-INT 117 | ALICE NOTE ---ALICE-INT 118 | ALICE PUBLIC ---ALICE-PUBLIC 119 | ATL CAL ---ATL-CAL 120 | ATL COM ---ATL-COM 121 | ATL COM SOFT ---ATL-COM-SOFT 122 | ATL COM PUB ---ATL-COM-DAQ 123 | ATL COM DAQ ---ATL-COM-DAQ 124 | ATL COM INDENT ---ATL-COM-INDENT 125 | ATL COM LUM ---ATL-COM-LUM 126 | ATL COM MUON ---ATL-COM-MUON 127 | ATL COM PHYS ---ATL-COM-PHYS 128 | ATL COMPHYS ---ATL-COM-PHYS 129 | ATLCOM PHYS ---ATL-COM-PHYS 130 | TL COM PHYS ---ATL-COM-PHYS 131 | ATLAS COM PHYS ---ATLAS-COM-PHYS 132 | ATL COM TILECAL ---ATL-COM-TILECAL 133 | ATL COM LARG ---ATL-COM-LARG 134 | ATLAS COM CONF ---ATLAS-COM-CONF 135 | ATLASCOM CONF ---ATLAS-COM-CONF 136 | ATLAS COMCONF ---ATLAS-COM-CONF 137 | ATLAS CONF ---ATLAS-CONF 138 | ATLASCONF ---ATLAS-CONF 139 | ATL DAQ ---ATL-DAQ 140 | ATL DAQ CONF ---ATL-DAQ-CONF 141 | ATL DAQ PUB ---ATL-DAQ-PUB 142 | ATL DAQ PROC ---ATL-DAQ-PROC 143 | ATL GEN ---ATL-GEN 144 | ATLAS HIGG ---ATLAS-HIGG 145 | ATL INDET ---ATL-INDET 146 | ATL INDET PUB ---ATL-INDET-PUB 147 | ATL INDET PROC ---ATL-INDET-PROC 148 | ATL LARG ---ATL-LARG 149 | ATL MUON ---ATL-MUON 150 | ATL MUON PUB ---ATL-MUON-PUB 151 | ATL PUB MUON ---ATL-PUB-MUON 152 | ATL PHYS ---ATL-PHYS 153 | ATL PHYS CONF ---ATL-PHYS-CONF 154 | ATL PHYS INT ---ATL-PHYS-INT 155 | ATL PHYSINT ---ATL-PHYS-INT 156 | ATLPHYS INT ---ATL-PHYS-INT 157 | ATL PHYS PUB ---ATL-PHYS-PUB 158 | ATL PHYSPUB ---ATL-PHYS-PUB 159 | ATLPHYS PUB ---ATL-PHYS-PUB 160 | ATLAS PHYS PUB ---ATL-PHYS-PUB 161 | ATL PHYS PROC ---ATL-PHYS-PROC 162 | ATL TECH ---ATL-TECH 163 | ATL TILECAL ---ATL-TILECAL 164 | ATL TILECAL PUB ---ATL-TILECAL-PUB 165 | ATL TILECAL PROC ---ATL-TILECAL-PROC 166 | ATL SOFT ---ATL-SOFT 167 | ATL SOFT PUB ---ATL-SOFT-PUB 168 | ATL SOFT PROC ---ATL-SOFT-PROC 169 | ATL IS EN ---ATL-IS-EN 170 | ATL IS QA ---ATL-IS-QA 171 | ATL LARG PUB ---ATL-LARG-PUB 172 | ATL COM LARG ---ATL-COM-LARG 173 | TL COM LARG ---ATL-COM-LARG 174 | ATLCOM LARG ---ATL-COM-LARG 175 | ATL MAGNET PUB ---ATL-MAGNET-PUB 176 | ATL UPGRADE PUB ---ATL-UPGRADE-PUB 177 | ATL UPGRADE PROC ---ATL-UPGRADE-PROC 178 | CERN AB ---CERN-AB 179 | CERN AB NOTE ---CERN-NOTE 180 | CERN ALEPH ---CERN-ALEPH 181 | CERN ALEPH PHYSIC ---CERN-ALEPH-PHYSIC 182 | CERN ALEPH PUB ---CERN-ALEPH-PUB 183 | CERN ALICE INT ---CERN-ALICE-INT 184 | CERN ALICE PUB ---CERN-ALICE-PUB 185 | CERN ALI ---CERN-ALI 186 | CERN AS ---CERN-AS 187 | CERN AT ---CERN-AT 188 | CERN ATL COM CAL ---CERN-ATL-COM-CAL 189 | CERN ATL COM DAQ ---CERN-ATL-COM-DAQ 190 | CERN ATL COM GEN ---CERN-ATL-COM-GEN 191 | CERN ATL COM INDET ---CERN-ATL-COM-INDET 192 | CERN ATL COM LARG ---CERN-ATL-COM-LARG 193 | CERN ATL COM MUON ---CERN-ATL-COM-MUON 194 | CERN ATL COM PHYS ---CERN-ATL-COM-PHYS 195 | CERN ATL COM TECH ---CERN-ATL-COM 196 | CERN ATL COM TILECAL ---CERN-ATL-COM 197 | CERN ATL DAQ ---CERN-ATL-DAQ 198 | CERN ATL SOFT ---CERN-ATL-SOFT 199 | CERN ATL SOFT INT ---CERN-ATL-SOFT-INT 200 | CERN ATL SOFT PUB ---CERN-ATL-SOFT-PUB 201 | CERN ATS ---CERN-ATS 202 | CERNATS ---CERN-ATS 203 | CERN ATS NOTE ---CERN-ATS-NOTE 204 | CERNATS NOTE ---CERN-ATS-NOTE 205 | CERN BE ---CERN-BE 206 | CERN BE NOTE ---CERN-BE-NOTE 207 | CERN CMS ---CERN-CMS 208 | CERN CMS CR ---CERN-CMS-CR 209 | CERN CMS DP ---CERN-CMS-DP 210 | CERN CMS NOTE ---CERN-CMS-NOTE 211 | CERN CN ---CERN-CN 212 | CERN DD ---CERN-DD 213 | CERN DELPHI ---CERN-DELPHI 214 | CERN ECP ---CERN-ECP 215 | CERN EF ---CERN-EF 216 | CERN EP ---CERN-EP 217 | CERN EST ---CERN-EST 218 | CERN ETT ---CERN-ETT 219 | CERN INTC ---CERN-INTC 220 | CERN IT ---CERN-IT 221 | CERN LCGAPP ---CERN-LCGAPP 222 | CERN LHCB ---CERN-LHCB 223 | CERN LHCB DP ---CERN-LHCB-DP 224 | CERN LHCB CONF ---CERN-LHCB-CONF 225 | CERN LHCB INT ---CERN-LHCB-INT 226 | CERN LHCB PUB ---CERN-LHCB-PUB 227 | CERN LHCC ---CERN-LHCC 228 | CERNLHCC ---CERN-LHCC 229 | CERN LHC ---CERN-LHC 230 | CERN LHC PHO ---CERN-LHC-PHO 231 | CERN LHC PROJECT REPORT---CERN-LHC-Project-Report 232 | CERN OPEN ---CERN-OPEN 233 | CERNOPEN ---CERNOPEN 234 | CERN PH EP ---CERN-PH-EP 235 | CERNPH EP ---CERN-PH-EP 236 | CERN PHEP ---CERN-PH-EP 237 | CERN PH LPCC ---CERN-PH-LPCC 238 | CERN PH TH ---CERN-PH-TH 239 | CERN PPE ---CERN-PPE 240 | CERN PROCEEDINGS ---CERN-PROCEEDINGS 241 | CERN PS ---CERN-PS 242 | CERN SL ---CERN-SL 243 | CERN SL NOTE ---CERN-SL-NOTE 244 | CERN SPSC ---CERN-SPSC 245 | CERNSPSC ---CERN-SPSC 246 | CERN ST ---CERN-ST 247 | CERN TH ---CERN-TH 248 | CERN THESIS ---CERN-THESIS 249 | CERNTHESIS ---CERN-THESIS 250 | CERN TIS ---CERN-TIS 251 | CERN ATS ---CERN-ATS 252 | CERN ATSNOTE ---CERN-ATSNOTE 253 | CERN ---CERN 254 | CLICDP NOTE ---CLICDP-NOTE 255 | CMS AN ---CMS-AN 256 | CMS CR ---CMS-CR 257 | CMS DP ---CMS-DP 258 | CMS NOTE ---CMS-NOTE 259 | CMSNOTE ---CMS-NOTE 260 | CMS EXO ---CMS-EXO 261 | CMS TS ---CMS-TS 262 | DELPHI ---DELPHI 263 | DELPHI NOTE ---DELPHI-NOTE 264 | DIRAC NOTE ---DIRAC-NOTE 265 | DN ---DIRAC-NOTE 266 | LHCB ---LHCB 267 | LHCB DP ---LHCB-DP 268 | LHCB JOURNAL ---LHCB-JOURNAL 269 | LHCB ANA ---LHCB-ANA 270 | LHCB CONF ---LHCB-CONF 271 | LHCBCONF ---LHCB-CONF 272 | LHCB INT ---LHCB-INT 273 | LHCB PUB ---LHCB-PUB 274 | LHCB PAPER ---LHCB-PAPER 275 | LHCB PROC ---LHCB-PROC 276 | LHCB TALK ---LHCB-TALK 277 | LHCHXSWG ---LHCHXSWG 278 | LHCHXSWG DRAFT INT ---LHCHXSWG-DRAFT-INT 279 | LHCHXSWG INT ---LHCHXSWG-INT 280 | SN ATLAS ---SN-ATLAS 281 | PAS SUSY ---CMS-PAS-SUS 282 | CMS PAS EXO ---CMS-PAS-EXO 283 | CMS PAS HIN ---CMS-PAS-HIN 284 | CMS PAS QCD ---CMS-PAS-QCD 285 | CMS PAS TOP ---CMS-PAS-TOP 286 | CMS PAS SUS ---CMS-PAS-SUS 287 | CMS PAS BPH ---CMS-PAS-BPH 288 | CMS PAS SMP ---CMS-PAS-SMP 289 | CMS PAS HIG ---CMS-PAS-HIG 290 | CMS PAS EWK ---CMS-PAS-EWK 291 | CMS PAS BTV ---CMS-PAS-BTV 292 | CMS PAS FWD ---CMS-PAS-FWD 293 | CMS PAS TRK ---CMS-PAS-TRK 294 | CMS PAS SMP ---CMS-PAS-SMP 295 | CMS PAS PFT ---CMS-PAS-PFT 296 | CMS PAS MUO ---CMS-PAS-MUO 297 | CMS PAS JME ---CMS-PAS-JME 298 | CMS PAS EGM ---CMS-PAS-EGM 299 | CMS PAS DIF ---CMS-PAS-DIF 300 | CMS PAS B2G ---CMS-PAS-B2G 301 | ATLTILECAL PUB ---ATLTILECAL-PUB 302 | ATLAS TECH PUB ---ATLAS-TECH-PUB 303 | TLCOM MAGNET ---TLCOM-MAGNET 304 | ATLLARG ---ATL-LARG 305 | SL NOTE ---SL-NOTE 306 | TOTEM ---TOTEM 307 | TS NOTE ---TS-NOTE 308 | 309 | *****CERN MORE***** 310 | 311 | 312 | CMS UG TP ---CMS-UG-TP 313 | 314 | 315 | *****CERN DIFFERENT FORMAT***** 316 | 317 | CERN GE ---CERN-GE 318 | 319 | *****CERN with language***** 320 | 321 | 322 | CERN BROCHURE ---CERN-BROCHURE 323 | 324 | 325 | *****LHC***** 326 | 327 | 328 | CERN CLIC NOTE ---CLIC-Note 329 | LHC PROJECT NOTE ---LHC-Project-Note 330 | CERN LHC PROJECT REPORT ---CERN-LHC-Project-Report 331 | LHC PROJECT REPORT ---CERN-LHC-Project-Report 332 | CLIC NOTE ---CLIC-Note 333 | ATLAS TDR ---ATL-TDR 334 | CMS TDR ---CMS-TDR 335 | ATC TT ID ---ATC-TT-ID 336 | ATC TT IN ---ATC-TT-IN 337 | LHCCP ---LHCCP 338 | 339 | ***LHC OTHER FORMAT***** 340 | 341 | 342 | CERN ACC ---CERN-ACC 343 | CERN ACC NOTE ---CERN-ACC-NOTE 344 | 345 | *****KEK***** 346 | 347 | 348 | 349 | 350 | KEK CP ---KEK-CP 351 | KEK INT ---KEK-Internal 352 | KEK INTERNAL ---KEK-Internal 353 | KEK PREPRINT ---KEK-Preprint 354 | KEK TH ---KEK-TH 355 | 356 | 357 | *****DESY***** 358 | 359 | 360 | 361 | DESY ---DESY 362 | DESY M ---DESY-M 363 | DESY THESIS ---DESY-THESIS 364 | DESYTHESIS ---DESY-THESIS 365 | DESY TESLA FEL ---DESY-TESLA-FEL 366 | DESY PROC ---DESY-PROC 367 | DESYPROC ---DESY-PROC 368 | TESLA FEL ---DESY-TESLA-FEL 369 | 370 | 371 | *****DESY F***** 372 | <99s9> 373 | <9s99s99> 374 | <99s99s99> 375 | 376 | DESY F ---DESY-F 377 | 378 | 379 | *****SLAC***** 380 | 381 | 382 | 383 | SLAC AP ---SLAC-AP 384 | SLAC PUB ---SLAC-PUB 385 | SLAC R ---SLAC-R 386 | SLAC TN ---SLAC-TN 387 | SLAC WP ---SLAC-WP 388 | 389 | 390 | *****Berkeley Lab***** 391 | 392 | LBNL ---LBNL 393 | 394 | 395 | 396 | *****Argonne National Laboratory***** 397 | 398 | 399 | ANL HEP TR ---ANL-HEP-TR 400 | 401 | 402 | *****Antares***** 403 | 404 | 405 | ANTARES SOFT ---ANTARES-SOFT 406 | ANTARES PHYS ---ANTARES-Phys 407 | ANTARES OPMO ---ANTARES-Opmo 408 | 409 | *****LIGO***** 410 | 411 | 412 | LIGO ---LIGO 413 | 414 | *****Pierre Auger***** 415 | 416 | 417 | GAP ---GAP 418 | 419 | *****ILC***** 420 | 421 | 422 | EUDET MEMO ---EUDET-MEMO 423 | EUDET REPORT ---EUDET-REPORT 424 | EUROTEV REPORT---EUROTEV-REPORT 425 | ILC NOTE ---ILC-NOTE 426 | ILC REPORT ---ILC-REPORT 427 | LC DET ---LC-DET 428 | LC PHSM ---LC-PHSM 429 | LC REP ---LC-REP 430 | LC REPORT ---LC-REPORT 431 | LC TOOL ---LC-TOOL 432 | LC TH ---LC-TH 433 | LCD NOTE ---LCD-NOTE 434 | 435 | *****IHEP***** 436 | 437 | 438 | 439 | IHEP AC ---IHEP-AC 440 | IHEP CEPC DR ---IHEP-CEPC-DR 441 | IHEP EP ---IHEP-EP 442 | IHEP TH ---IHEP-TH 443 | 444 | *****IPAC***** 445 | 446 | 447 | IPAC ---IPAC 448 | 449 | *****JINR***** 450 | <[EP]9?9syys9?9?9> 451 | <[EP]9?9syyyys9?9?9> 452 | 453 | JINR ---JINR 454 | 455 | *****Other institutes (standard format)***** 456 | 457 | 458 | 459 | BONN IR ---BONN-IR 460 | BONN IB ---BONN-IB 461 | DAMTP ---DAMTP 462 | ESS ---ESS 463 | EUCARD CON ---EUCARD-CON 464 | INO ---INO 465 | JAI ---JAI 466 | KFKI ---KFKI 467 | LPHE ---LPHE 468 | MPP ---MPP 469 | NIKHEF ---NIKHEF 470 | RAL TR ---RAL-TR 471 | SLS TME TA ---SLS-TME-TA 472 | -------------------------------------------------------------------------------- /refextract/references/kbs/special-journals.kb: -------------------------------------------------------------------------------- 1 | JHEP 2 | JCAP 3 | -------------------------------------------------------------------------------- /refextract/references/pdf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract. 4 | # Copyright (C) 2016, 2017, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | import logging 25 | 26 | from pypdf import PdfReader 27 | from pypdf.generic import ByteStringObject 28 | 29 | from refextract.references.regexs import re_reference_in_dest 30 | 31 | LOGGER = logging.getLogger(__name__) 32 | 33 | 34 | class IncompleteCoordinatesError(Exception): 35 | """Exception raised when a named destination does not have all required 36 | coordinates. 37 | """ 38 | 39 | pass 40 | 41 | 42 | def extract_texkeys_and_urls_from_pdf(pdf_file): 43 | """ 44 | Extract the texkeys and corresponding urls from the given PDF file 45 | 46 | This is done by looking up the named destinations in the PDF 47 | 48 | @param pdf_file: path to a PDF 49 | 50 | @return: list of dictionaries with all texkeys 51 | and corresponding urls found in the PDF 52 | """ 53 | with open(pdf_file, "rb") as pdf_stream: 54 | try: 55 | pdf = PdfReader(pdf_stream, strict=False) 56 | destinations = pdf.named_destinations 57 | urls = extract_urls(pdf) 58 | except Exception: 59 | LOGGER.debug("PDF: Internal pypdf error, no TeXkeys returned.") 60 | return [] 61 | # not all named destinations point to references 62 | refs = [] 63 | for destination in destinations.items(): 64 | destination_key = ( 65 | destination[0].decode("utf-8") 66 | if isinstance(destination[0], ByteStringObject) 67 | else destination[0] 68 | ) 69 | match = re_reference_in_dest.match(destination_key) 70 | if match: 71 | refs.append(destination) 72 | two_column_layout = False 73 | try: 74 | if _destinations_in_two_columns(pdf, refs): 75 | two_column_layout = True 76 | LOGGER.debug("PDF: Using two-column layout") 77 | 78 | def sortfunc(dest_couple): 79 | return dest_couple[1] 80 | 81 | else: 82 | LOGGER.debug("PDF: Using single-column layout") 83 | 84 | def sortfunc(dest_couple): 85 | page, _, ypos, xpos = dest_couple[1] 86 | return (page, ypos, xpos) 87 | 88 | refs = [(dest[0], _destination_position(pdf, dest[1])) for dest in refs] 89 | refs.sort(key=sortfunc) 90 | urls = [(uri["/A"]["/URI"], _uri_position(pdf, uri)) for uri in urls] 91 | urls.sort(key=sortfunc) 92 | texkey_url_list = [] 93 | for nb, ref in enumerate(refs): 94 | current_texkey_urls_dict = {} 95 | current_texkey_urls_dict["texkey"] = re_reference_in_dest.match( 96 | ref[0] 97 | ).group(1) 98 | if nb < len(refs) - 1: 99 | next_reference_data = refs[nb + 1] 100 | matched_urls_for_reference, urls = _match_urls_with_reference( 101 | urls, 102 | ref, 103 | next_reference_data, 104 | two_column_layout=two_column_layout, 105 | ) 106 | else: 107 | matched_urls_for_reference, urls = _match_urls_with_reference( 108 | urls, ref, two_column_layout=two_column_layout 109 | ) 110 | if matched_urls_for_reference: 111 | current_texkey_urls_dict["urls"] = matched_urls_for_reference 112 | texkey_url_list.append(current_texkey_urls_dict) 113 | return texkey_url_list 114 | except Exception: 115 | LOGGER.debug("PDF: Impossible to determine layout, no TeXkeys returned") 116 | return [] 117 | 118 | 119 | def _match_urls_with_reference( 120 | urls_to_match, reference, next_reference=None, two_column_layout=False 121 | ): 122 | ref_page_number, ref_column, ref_y, _ = reference[1] 123 | if next_reference: 124 | next_ref_page_number, next_ref_col, next_ref_y, _ = next_reference[1] 125 | urls_for_reference = set() 126 | for url_index, url in enumerate(urls_to_match): 127 | url_page_number, url_col, url_y, _ = url[1] 128 | is_url_under_texkey = ref_y <= url_y 129 | is_url_in_same_col = ref_column == url_col 130 | is_url_in_next_col = url_col > ref_column 131 | is_reference_on_same_page_as_url = ref_page_number == url_page_number 132 | is_reference_on_previous_page_than_url = ref_page_number + 1 == url_page_number 133 | if not next_reference: 134 | if ( 135 | ( 136 | is_reference_on_same_page_as_url 137 | and (is_url_in_same_col or is_url_in_next_col) 138 | ) 139 | or is_reference_on_previous_page_than_url 140 | ) and is_url_under_texkey: 141 | urls_for_reference.add(url[0]) 142 | continue 143 | is_url_between_texkeys = ( 144 | is_reference_on_same_page_as_url or is_reference_on_previous_page_than_url 145 | ) and (ref_y <= url_y <= next_ref_y) 146 | is_next_reference_on_the_same_page = next_ref_page_number == url_page_number 147 | is_last_reference_in_page = ( 148 | is_reference_on_same_page_as_url 149 | and (next_ref_page_number > url_page_number) 150 | and is_url_under_texkey 151 | ) 152 | is_last_reference_in_page_two_col_layout = ( 153 | is_reference_on_same_page_as_url 154 | and is_next_reference_on_the_same_page 155 | and is_url_under_texkey 156 | and (next_ref_col > url_col) 157 | and next_ref_y < url_y 158 | and ref_y <= url_y 159 | and (is_url_in_same_col or is_url_in_next_col) 160 | ) 161 | is_in_new_column = ( 162 | is_reference_on_same_page_as_url 163 | and is_next_reference_on_the_same_page 164 | and ref_y > url_y 165 | and (next_ref_col > ref_column) 166 | and (next_ref_y > url_y) 167 | ) 168 | is_url_for_other_reference_in_new_column = ( 169 | is_reference_on_same_page_as_url 170 | and (next_ref_page_number == url_page_number) 171 | and (next_ref_col == ref_column < url_col) 172 | and (next_ref_y > url_y) 173 | ) 174 | is_url_unrelated_to_references = ref_page_number > url_page_number 175 | is_url_for_next_reference = url_y >= next_ref_y 176 | if is_url_between_texkeys: 177 | if not two_column_layout or (two_column_layout and url_col == ref_column): 178 | urls_for_reference.add(url[0]) 179 | continue 180 | elif ( 181 | is_last_reference_in_page 182 | or is_last_reference_in_page_two_col_layout 183 | or is_in_new_column 184 | ): 185 | urls_for_reference.add(url[0]) 186 | continue 187 | elif is_url_unrelated_to_references: 188 | continue 189 | elif is_url_for_next_reference or is_url_for_other_reference_in_new_column: 190 | urls_to_match = urls_to_match[url_index:] 191 | break 192 | if not next_reference: 193 | urls_to_match = [] 194 | return urls_for_reference, urls_to_match 195 | 196 | 197 | def _destinations_in_two_columns(pdf, destinations, cutoff=3): 198 | """ 199 | Check if the named destinations are organized along two columns (heuristic) 200 | 201 | @param pdf: a PdfReader object 202 | @param destinations: 203 | 204 | 'cutoff' is used to tune the heuristic: if 'cutoff' destinations in the 205 | would-be second column start at the same position, return True 206 | """ 207 | # iterator for the x coordinates of refs in the would-be second column 208 | xpositions = ( 209 | _destination_position(pdf, dest)[3] 210 | for (_, dest) in destinations 211 | if _destination_position(pdf, dest)[1] == 1 212 | ) 213 | xpos_count = {} 214 | for xpos in xpositions: 215 | xpos_count[xpos] = xpos_count.get(xpos, 0) + 1 216 | if xpos_count[xpos] >= cutoff: 217 | return True 218 | return False 219 | 220 | 221 | def _destination_position(pdf, destination): 222 | """ 223 | Gives a tuple (page, column, -y, x) representing the position of the 224 | NamedDestination 225 | 226 | This representation is useful for sorting named destinations and 227 | assumes the text has at most 2 columns 228 | """ 229 | pagewidth = pdf.pages[ 230 | pdf.get_destination_page_number(destination) 231 | ].cropbox.lower_right[0] 232 | if not destination.left or not destination.top: 233 | raise IncompleteCoordinatesError(destination) 234 | # assuming max 2 columns 235 | column = (2 * destination.left) // pagewidth 236 | return ( 237 | pdf.get_destination_page_number(destination), 238 | column, 239 | -destination.top, 240 | destination.left, 241 | ) 242 | 243 | 244 | def _uri_position(pdf, uri_destination): 245 | """ 246 | Gives a tuple (page, column, -y, x) representing the position of the URI 247 | """ 248 | page_nb = uri_destination.get("page_nb") 249 | destintation_left = uri_destination["/Rect"][0] 250 | destintation_top = uri_destination["/Rect"][3] 251 | pagewidth = pdf.get_page(page_nb).cropbox.lower_right[0] 252 | column = (2 * destintation_left) // pagewidth 253 | # neccessary to exclude column from sorting 254 | return (page_nb, column, -destintation_top, destintation_left) 255 | 256 | 257 | def extract_urls(pdf): 258 | urls = [] 259 | pages = len(pdf.pages) 260 | for page_nb in range(pages): 261 | page = pdf.pages[page_nb] 262 | page_object = page.get_object() 263 | urls_for_page = _get_urls_data_from_page_object(page_object, page_nb) 264 | urls.extend(urls_for_page) 265 | return urls 266 | 267 | 268 | def _get_urls_data_from_page_object(page_object, page_nb): 269 | urls_at_page = [] 270 | annotations = page_object.get("/Annots", []) 271 | for annotation in annotations: 272 | annotation_object = annotation.get_object() 273 | if "/URI" in annotation_object["/A"]: 274 | annotation_object.update({"page_nb": page_nb}) 275 | urls_at_page.append(annotation_object) 276 | return urls_at_page 277 | -------------------------------------------------------------------------------- /refextract/references/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract. 4 | # Copyright (C) 2013, 2015, 2016, 2017, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | 25 | def format_marker(line_marker): 26 | return line_marker.strip("[](){}. ") 27 | 28 | 29 | def build_references(citations, reference_format=False): 30 | """Build list of reference dictionaries from a references list""" 31 | # Now, run the method which will take as input: 32 | # 1. A list of lists of dictionaries, where each dictionary is a piece 33 | # of citation information corresponding to a tag in the citation. 34 | # 2. The line marker for this entire citation line (mulitple citation 35 | # 'finds' inside a single citation will use the same marker value) 36 | # The resulting xml line will be a properly marked up form of the 37 | # citation. It will take into account authors to try and split up 38 | # references which should be read as two SEPARATE ones. 39 | return [ 40 | c 41 | for citation_elements in citations 42 | for elements in citation_elements["elements"] 43 | for c in build_reference_fields( 44 | elements, 45 | citation_elements["line_marker"], 46 | citation_elements["raw_ref"], 47 | reference_format, 48 | ) 49 | ] 50 | 51 | 52 | def add_subfield(field, code, value): 53 | if value: 54 | field.setdefault(code, []).append(value) 55 | 56 | 57 | def add_journal_subfield(field, element, reference_format): 58 | add_subfield(field, "journal_title", element.get("title")) 59 | add_subfield(field, "journal_volume", element.get("volume")) 60 | add_subfield(field, "journal_year", element.get("year")) 61 | add_subfield(field, "journal_page", element.get("page")) 62 | add_subfield(field, "journal_reference", reference_format.format(**element)) 63 | 64 | 65 | def create_reference_field(line_marker): 66 | field = {} 67 | if line_marker.strip("., [](){}"): 68 | add_subfield(field, "linemarker", format_marker(line_marker)) 69 | return field 70 | 71 | 72 | def build_reference_fields(citation_elements, line_marker, raw_ref, reference_format): 73 | """Create the final representation of the reference information. 74 | 75 | @param citation_elements: (list) an ordered list of dictionary elements, 76 | with each element corresponding to a found 77 | piece of information from a reference line. 78 | @param line_marker: (string) The line marker for this single reference 79 | line (e.g. [19]) 80 | @param raw_ref: (string) The raw string of this line 81 | @return reference_fields: (list) A list of one dictionary containing the 82 | reference elements 83 | """ 84 | # Begin the datafield element 85 | current_field = create_reference_field(line_marker) 86 | current_field["raw_ref"] = [raw_ref] 87 | 88 | reference_fields = [current_field] 89 | 90 | for element in citation_elements: 91 | # Before going onto checking 'what' the next element is, 92 | # handle misc text and semi-colons 93 | # Multiple misc text subfields will be compressed later 94 | # This will also be the only part of the code that deals with MISC 95 | # tag_typed elements 96 | misc_txt = element["misc_txt"] 97 | if misc_txt.strip("., [](){}"): 98 | misc_txt = misc_txt.lstrip("])} ,.").rstrip("[({ ,.") 99 | add_subfield(current_field, "misc", misc_txt) 100 | 101 | # Now handle the type dependent actions 102 | # JOURNAL 103 | if element["type"] == "JOURNAL": 104 | add_journal_subfield(current_field, element, reference_format) 105 | 106 | # REPORT NUMBER 107 | elif element["type"] == "REPORTNUMBER": 108 | add_subfield(current_field, "reportnumber", element["report_num"]) 109 | 110 | # URL 111 | elif element["type"] == "URL": 112 | if element["url_string"] == element["url_desc"]: 113 | # Build the datafield for the URL segment of the reference 114 | # line: 115 | add_subfield(current_field, "url", element["url_string"]) 116 | # Else, in the case that the url string and the description differ 117 | # in some way, include them both 118 | else: 119 | add_subfield(current_field, "url", element["url_string"]) 120 | add_subfield(current_field, "urldesc", element["url_desc"]) 121 | 122 | # DOI 123 | elif element["type"] == "DOI": 124 | add_subfield(current_field, "doi", "doi:" + element["doi_string"]) 125 | 126 | # HDL 127 | elif element["type"] == "HDL": 128 | add_subfield(current_field, "hdl", "hdl:" + element["hdl_id"]) 129 | 130 | # AUTHOR 131 | elif element["type"] == "AUTH": 132 | value = element["auth_txt"] 133 | if element["auth_type"] == "incl": 134 | value = "(%s)" % value 135 | 136 | add_subfield(current_field, "author", value) 137 | 138 | elif element["type"] == "QUOTED": 139 | add_subfield(current_field, "title", element["title"]) 140 | 141 | elif element["type"] == "ISBN": 142 | add_subfield(current_field, "isbn", element["ISBN"]) 143 | 144 | elif element["type"] == "BOOK": 145 | add_subfield(current_field, "title", element["title"]) 146 | 147 | elif element["type"] == "PUBLISHER": 148 | add_subfield(current_field, "publisher", element["publisher"]) 149 | 150 | elif element["type"] == "YEAR": 151 | add_subfield(current_field, "year", element["year"]) 152 | 153 | elif element["type"] == "COLLABORATION": 154 | add_subfield(current_field, "collaboration", element["collaboration"]) 155 | 156 | elif element["type"] == "RECID": 157 | add_subfield(current_field, "recid", str(element["recid"])) 158 | 159 | return reference_fields 160 | 161 | 162 | def update_reference_with_urls(reference, url_set): 163 | for url in url_set: 164 | add_subfield(reference, "url", url) 165 | 166 | 167 | def merge_misc(field): 168 | current_misc = None 169 | for subfield in field.subfields[:]: 170 | if subfield.code == "m": 171 | if current_misc is None: 172 | current_misc = subfield 173 | else: 174 | current_misc.value += " " + subfield.value 175 | field.subfields.remove(subfield) 176 | -------------------------------------------------------------------------------- /refextract/references/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract. 4 | # Copyright (C) 2015, 2016, 2017, 2018 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | import logging 25 | import re 26 | 27 | from inspire_utils.record import replace_undesirable_characters 28 | 29 | from refextract.documents.text import ( 30 | join_lines, 31 | re_multiple_space, 32 | remove_page_boundary_lines, 33 | repair_broken_urls, 34 | ) 35 | from refextract.references.config import CFG_REFEXTRACT_MAX_LINES 36 | from refextract.references.find import ( 37 | find_end_of_reference_section, 38 | get_reference_section_beginning, 39 | ) 40 | 41 | LOGGER = logging.getLogger(__name__) 42 | 43 | 44 | def extract_references_from_fulltext(fulltext): 45 | """Locate and extract the reference section from a fulltext document. 46 | Return the extracted reference section as a list of strings, whereby each 47 | string in the list is considered to be a single reference line. 48 | E.g. a string could be something like: 49 | '[19] Wilson, A. Unpublished (1986). 50 | @param fulltext: (list) of strings, whereby each string is a line of the 51 | document. 52 | @return: (list) of strings, where each string is an extracted reference 53 | line. 54 | """ 55 | # Try to remove pagebreaks, headers, footers 56 | fulltext = remove_page_boundary_lines(fulltext) 57 | status = 0 58 | # How ref section found flag 59 | how_found_start = 0 60 | # Find start of refs section 61 | ref_sect_start = get_reference_section_beginning(fulltext) 62 | 63 | if ref_sect_start is None: 64 | # No References 65 | refs = [] 66 | status = 4 67 | LOGGER.debug("extract_references_from_fulltext: ref_sect_start is None") 68 | else: 69 | # If a reference section was found, however weak 70 | ref_sect_end = find_end_of_reference_section( 71 | fulltext, 72 | ref_sect_start["start_line"], 73 | ref_sect_start["marker"], 74 | ref_sect_start["marker_pattern"], 75 | ) 76 | if ref_sect_end is None: 77 | # No End to refs? Not safe to extract 78 | refs = [] 79 | status = 5 80 | LOGGER.debug("extract_references_from_fulltext: no end to refs!") 81 | else: 82 | # If the end of the reference section was found.. start extraction 83 | refs = get_reference_lines( 84 | fulltext, 85 | ref_sect_start["start_line"], 86 | ref_sect_end, 87 | ref_sect_start["title_string"], 88 | ref_sect_start["marker_pattern"], 89 | ref_sect_start["title_marker_same_line"], 90 | ) 91 | 92 | return refs, status, how_found_start 93 | 94 | 95 | def get_reference_lines( 96 | docbody, 97 | ref_sect_start_line, 98 | ref_sect_end_line, 99 | ref_sect_title, 100 | ref_line_marker_ptn, 101 | title_marker_same_line, 102 | ): 103 | """After the reference section of a document has been identified, and the 104 | first and last lines of the reference section have been recorded, this 105 | function is called to take the reference lines out of the document body. 106 | The document's reference lines are returned in a list of strings whereby 107 | each string is a reference line. Before this can be done however, the 108 | reference section is passed to another function that rebuilds any broken 109 | reference lines. 110 | @param docbody: (list) of strings - the entire document body. 111 | @param ref_sect_start_line: (integer) - the index in docbody of the first 112 | reference line. 113 | @param ref_sect_end_line: (integer) - the index in docbody of the last 114 | reference line. 115 | @param ref_sect_title: (string) - the title of the reference section 116 | (e.g. "References"). 117 | @param ref_line_marker_ptn: (string) - the patern used to match the 118 | marker for each reference line (e.g., could be used to match lines 119 | with markers of the form [1], [2], etc.) 120 | @param title_marker_same_line: (integer) - a flag to indicate whether 121 | or not the reference section title was on the same line as the first 122 | reference line's marker. 123 | @return: (list) of strings. Each string is a reference line, extracted 124 | from the document. 125 | """ 126 | start_idx = ref_sect_start_line 127 | if title_marker_same_line: 128 | # Title on same line as 1st ref- take title out! 129 | title_start = docbody[start_idx].find(ref_sect_title) 130 | if title_start != -1: 131 | # Set the first line with no title 132 | docbody[start_idx] = docbody[start_idx][title_start + len(ref_sect_title) :] 133 | elif ref_sect_title is not None: 134 | # Set the start of the reference section to be after the title line 135 | start_idx += 1 136 | 137 | if ref_sect_end_line is not None: 138 | ref_lines = docbody[start_idx : ref_sect_end_line + 1] 139 | else: 140 | ref_lines = docbody[start_idx:] 141 | 142 | if ref_sect_title: 143 | ref_lines = strip_footer(ref_lines, ref_sect_title) 144 | # Now rebuild reference lines: 145 | # (Go through each raw reference line, and format them into a set 146 | # of properly ordered lines based on markers) 147 | return rebuild_reference_lines(ref_lines, ref_line_marker_ptn) 148 | 149 | 150 | def match_pagination(ref_line): 151 | """Remove footer pagination from references lines""" 152 | pattern = r"\(?\[?(\d{1,4})\]?\)?\.?\s*$" 153 | re_footer = re.compile(pattern, re.UNICODE) 154 | match = re_footer.match(ref_line) 155 | if match: 156 | return int(match.group(1)) 157 | return None 158 | 159 | 160 | def strip_footer(ref_lines, section_title): 161 | """Remove footer title from references lines""" 162 | pattern = r"\(?\[?\d{0,4}\]?\)?\.?\s*%s\s*$" % re.escape(section_title) 163 | re_footer = re.compile(pattern, re.UNICODE) 164 | return [line for line in ref_lines if not re_footer.match(line)] 165 | 166 | 167 | def rebuild_reference_lines(ref_sectn, ref_line_marker_ptn): 168 | """Given a reference section, rebuild the reference lines. After translation 169 | from PDF to text, reference lines are often broken. This is because 170 | pdftotext doesn't know what is a wrapped-line and what is a genuine new 171 | line. As a result, the following 2 reference lines: 172 | [1] See http://invenio-software.org/ for more details. 173 | [2] Example, AN: private communication (1996). 174 | ...could be broken into the following 4 lines during translation from PDF 175 | to plaintext: 176 | [1] See http://invenio-software.org/ fo 177 | r more details. 178 | [2] Example, AN: private communica 179 | tion (1996). 180 | Such a situation could lead to a citation being separated across 'lines', 181 | meaning that it wouldn't be correctly recognised. 182 | This function tries to rebuild the reference lines. It uses the pattern 183 | used to recognise a reference line's numeration marker to indicate the 184 | start of a line. If no reference line numeration was recognised, it will 185 | simply join all lines together into one large reference line. 186 | @param ref_sectn: (list) of strings. The (potentially broken) reference 187 | lines. 188 | @param ref_line_marker_ptn: (string) - the pattern used to recognise a 189 | reference line's numeration marker. 190 | @return: (list) of strings - the rebuilt reference section. Each string 191 | in the list represents a complete reference line. 192 | """ 193 | indentation_splitting = False 194 | 195 | # This should be moved the function detecting the pattern! 196 | if not ref_line_marker_ptn: 197 | if test_for_blank_lines_separating_reference_lines(ref_sectn): 198 | # Use blank lines to separate ref lines 199 | ref_line_marker_ptn = r"^\s*$" 200 | else: 201 | # No ref line dividers 202 | # We are guessing this the format: 203 | # Reference1 204 | # etc 205 | # Reference2 206 | # etc 207 | # We split when there's no identation 208 | indentation_splitting = True 209 | ref_line_marker_ptn = r"^[^\s]" 210 | 211 | LOGGER.debug("references separator %s", ref_line_marker_ptn) 212 | p_ref_line_marker = re.compile(ref_line_marker_ptn, re.I | re.UNICODE) 213 | 214 | # Start from ref 1 215 | # Append each fixed reference line to rebuilt_references 216 | # and rebuild references as we go 217 | current_ref = 0 218 | rebuilt_references = [] 219 | working_ref = [] 220 | 221 | def prepare_ref(working_ref): 222 | working_ref = working_ref[:CFG_REFEXTRACT_MAX_LINES] 223 | working_line = "" 224 | for line in working_ref: 225 | working_line = join_lines(working_line, line.strip()) 226 | return working_line.rstrip() 227 | 228 | lower_case_start = re.compile(r"[a-z]") 229 | continuing_line_markers = re.compile(r"[,&-]$") 230 | 231 | for line in ref_sectn: 232 | # Can't find a good way to distinguish between 233 | # pagination and the page number of a journal numeration that 234 | # happens to be alone in a new line 235 | # m = match_pagination(line) 236 | # if m and current_ref and current_ref != m + 1: 237 | # continue 238 | 239 | # Try to find the marker for the reference line 240 | m_ref_line_marker = p_ref_line_marker.search(line) 241 | 242 | if m_ref_line_marker: 243 | try: 244 | marknum = int(m_ref_line_marker.group("marknum")) 245 | except IndexError: 246 | marknum = None 247 | except ValueError: 248 | # If the mark is a unicode character category [Nd], 249 | # it is not always convertible to int by int() 250 | # We can't use its numerical value, but we still accept it 251 | # as numeration 252 | pass 253 | 254 | new_line_detected = False 255 | if marknum is None or current_ref + 1 == marknum: 256 | new_line_detected = True 257 | if indentation_splitting: 258 | if lower_case_start.match(line.strip()): 259 | new_line_detected = False 260 | if working_ref and continuing_line_markers.search( 261 | working_ref[-1].strip() 262 | ): 263 | new_line_detected = False 264 | 265 | if new_line_detected: 266 | # Reference line marker found! : Append this reference to the 267 | # list of fixed references and reset the working_line to 268 | # 'blank' 269 | start = m_ref_line_marker.start() 270 | if line[:start]: 271 | # If it's not a blank line to separate refs 272 | # Only append from the start of the marker 273 | # For this case: 274 | # [1] hello 275 | # hello2 [2] foo 276 | working_ref.append(line[:start]) 277 | 278 | # Append current working line to the refs list 279 | if working_ref: 280 | rebuilt_references.append(prepare_ref(working_ref)) 281 | 282 | current_ref = marknum 283 | working_ref = [] 284 | if line[start:]: 285 | working_ref.append(line[start:]) 286 | 287 | else: 288 | # Our marker does not match the counting 289 | # Either we missed one, the author missed one or 290 | # it is not a line marker 291 | # For now we assume it is not line marker 292 | working_ref.append(line) 293 | 294 | elif line: 295 | # Continuation of line 296 | working_ref.append(line) 297 | 298 | if working_ref: 299 | # Append last line 300 | rebuilt_references.append(prepare_ref(working_ref)) 301 | 302 | return rebuilt_references 303 | 304 | 305 | def wash_and_repair_reference_line(line): 306 | """Wash a reference line of undesirable characters (such as poorly-encoded 307 | letters, etc), and repair any errors (such as broken URLs) if possible. 308 | @param line: (string) the reference line to be washed/repaired. 309 | @return: (string) the washed reference line. 310 | """ 311 | # repair URLs in line: 312 | line = repair_broken_urls(line) 313 | # Replace various undesirable characters with their alternatives: 314 | line = replace_undesirable_characters(line) 315 | # Replace "," with "<title>", 316 | # common typing mistake 317 | line = re.sub(r'"([^"]+),"', r'"\g<1>",', line) 318 | line = replace_undesirable_characters(line) 319 | # Remove instances of multiple spaces from line, replacing with a 320 | # single space: 321 | line = re_multiple_space.sub(" ", line) 322 | return line 323 | 324 | 325 | def test_for_blank_lines_separating_reference_lines(ref_sect): 326 | """Test to see if reference lines are separated by blank lines so that 327 | these can be used to rebuild reference lines. 328 | @param ref_sect: (list) of strings - the reference section. 329 | @return: (int) 0 if blank lines do not separate reference lines; 1 if 330 | they do. 331 | """ 332 | num_blanks = 0 # Number of blank lines found between non-blanks 333 | num_lines = 0 # Number of reference lines separated by blanks 334 | blank_line_separators = 0 # Flag to indicate whether blanks lines separate 335 | # ref lines 336 | multi_nonblanks_found = 0 # Flag to indicate whether multiple nonblank 337 | # lines are found together (used because 338 | # if line is dbl-spaced, it isnt a blank that 339 | # separates refs & can't be relied upon) 340 | x = 0 341 | max_line = len(ref_sect) 342 | while x < max_line: 343 | if not ref_sect[x].isspace(): 344 | # not an empty line: 345 | num_lines += 1 346 | x += 1 # Move past line 347 | while x < len(ref_sect) and not ref_sect[x].isspace(): 348 | multi_nonblanks_found = 1 349 | x += 1 350 | x -= 1 351 | else: 352 | # empty line 353 | num_blanks += 1 354 | x += 1 355 | while x < len(ref_sect) and ref_sect[x].isspace(): 356 | x += 1 357 | if x == len(ref_sect): 358 | # Blanks at end doc: dont count 359 | num_blanks -= 1 360 | x -= 1 361 | x += 1 362 | # Now from the number of blank lines & the number of text lines, if 363 | # num_lines > 3, & num_blanks = num_lines, or num_blanks = num_lines - 1, 364 | # then we have blank line separators between reference lines 365 | if ( 366 | (num_lines > 3) 367 | and ((num_blanks == num_lines) or (num_blanks == num_lines - 1)) 368 | and (multi_nonblanks_found) 369 | ): 370 | blank_line_separators = 1 371 | return blank_line_separators 372 | -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- 1 | target-version = "py311" 2 | [lint.flake8-tidy-imports] 3 | ban-relative-imports = "all" 4 | 5 | [lint] 6 | select = [ 7 | # pycodestyle 8 | "E", 9 | # Pyflakes 10 | "F", 11 | # flake8-bugbear 12 | "B", 13 | # flake8-simplify 14 | "SIM", 15 | # isort 16 | "I", 17 | # flake8-tidy-imports 18 | "TID", 19 | # flake8-pytest-style 20 | "PT", 21 | ] 22 | ignore = ["B904"] 23 | 24 | [lint.pycodestyle] 25 | ignore-overlong-task-comments = true 26 | 27 | [lint.pydocstyle] 28 | convention = "google" 29 | -------------------------------------------------------------------------------- /run-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # This file is part of refextract 4 | # Copyright (C) 2015, 2018 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | set -e 25 | 26 | flake8 refextract tests 27 | py.test tests 28 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract. 4 | # Copyright (C) 2016, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | import os 25 | 26 | import pytest 27 | 28 | 29 | @pytest.fixture 30 | def pdf_files(): 31 | path_to_pdfs = os.path.join(os.path.dirname(__file__), "data") 32 | pdfs = os.listdir(path_to_pdfs) 33 | pdfs.sort() 34 | return {pdf: os.path.join(path_to_pdfs, pdf) for pdf in pdfs} 35 | -------------------------------------------------------------------------------- /tests/data/1503.07589v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/1503.07589v1.pdf -------------------------------------------------------------------------------- /tests/data/1508.05632v2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/1508.05632v2.pdf -------------------------------------------------------------------------------- /tests/data/1706.09498v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/1706.09498v1.pdf -------------------------------------------------------------------------------- /tests/data/1707.04066v1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/1707.04066v1.pdf -------------------------------------------------------------------------------- /tests/data/1805.05865.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/1805.05865.pdf -------------------------------------------------------------------------------- /tests/data/2110.02751.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2110.02751.pdf -------------------------------------------------------------------------------- /tests/data/2301.05883.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2301.05883.pdf -------------------------------------------------------------------------------- /tests/data/2303.03819.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2303.03819.pdf -------------------------------------------------------------------------------- /tests/data/2304.10117.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2304.10117.pdf -------------------------------------------------------------------------------- /tests/data/2406.06875.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2406.06875.pdf -------------------------------------------------------------------------------- /tests/data/2502.18907.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2502.18907.pdf -------------------------------------------------------------------------------- /tests/data/2502.21088.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2502.21088.pdf -------------------------------------------------------------------------------- /tests/data/2503.05372.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2503.05372.pdf -------------------------------------------------------------------------------- /tests/data/2503.05621.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2503.05621.pdf -------------------------------------------------------------------------------- /tests/data/DIS_SHEILA_final.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/DIS_SHEILA_final.pdf -------------------------------------------------------------------------------- /tests/data/file_resolving.csv: -------------------------------------------------------------------------------- 1 | 1|2|3 2 | 4|5|6 3 | -------------------------------------------------------------------------------- /tests/data/packed_pdf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/packed_pdf.pdf -------------------------------------------------------------------------------- /tests/data/wepml008.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/wepml008.pdf -------------------------------------------------------------------------------- /tests/integration/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from refextract.app import create_app 4 | 5 | 6 | @pytest.fixture(autouse=True, scope="session") 7 | def app(): 8 | app = create_app() 9 | return app 10 | 11 | 12 | @pytest.fixture 13 | def app_client(app): 14 | with app.test_client() as client: 15 | yield client 16 | 17 | 18 | @pytest.fixture(scope="session") 19 | def vcr_config(): 20 | return { 21 | "filter_query_parameters": ["access_token"], 22 | "ignore_localhost": True, 23 | "decode_compressed_response": True, 24 | "filter_headers": ("Authorization", "User-Agent"), 25 | "record_mode": "once", 26 | } 27 | -------------------------------------------------------------------------------- /tests/integration/test_views.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import mock 4 | import pytest 5 | 6 | 7 | def test_extract_journal_info(app_client): 8 | journal_kb_data = { 9 | "COMMUNICATIONS IN ASTEROSEISMOLOGY": "Commun.Asteros.", 10 | "PHYS REV": "Phys.Rev.", 11 | "PHYSICAL REVIEW": "Phys.Rev.", 12 | "PHYS REV LETT": "Phys.Rev.Lett.", 13 | "JINST": "JINST", 14 | "JOURNAL OF INSTRUMENTATION": "JINST", 15 | "SENS ACTUATORS B": "Sens.Actuators B", 16 | "SENSORS AND ACTUATORS B: CHEMICAL": "Sens.Actuators B", 17 | "PHYS SCRIPTA": "Phys.Scripta", 18 | "PHYSICA SCRIPTA": "Phys.Scripta", 19 | "BULL CALCUTTA MATH SOC": "Bull.Calcutta Math.Soc.", 20 | "BULLETIN OF THE CALCUTTA MATHEMATICAL SOCIETY": "Bull.Calcutta Math.Soc.", 21 | "QUANTUM MACHINE INTELLIGENCE": "Quantum Machine Intelligence", 22 | } 23 | publication_infos = [ 24 | {"pubinfo_freetext": "Phys. Rev. 127 (1962) 965-970"}, 25 | {"journal_title": "Phys. Rev."}, 26 | ] 27 | 28 | payload = { 29 | "journal_kb_data": journal_kb_data, 30 | "publication_infos": publication_infos, 31 | } 32 | 33 | headers = { 34 | "content-type": "application/json", 35 | } 36 | response = app_client.post( 37 | "/extract_journal_info", 38 | headers=headers, 39 | data=json.dumps(payload), 40 | ) 41 | assert response.status_code == 200 42 | assert "extracted_publication_infos" in response.json 43 | assert len(response.json["extracted_publication_infos"]) == 2 44 | 45 | 46 | @mock.patch( 47 | "refextract.app.extract_journal_reference", side_effect=KeyError("test message") 48 | ) 49 | def test_extract_journal_info_when_timeout_from_refextract( 50 | mock_extract_refs, app_client 51 | ): 52 | journal_kb_data = { 53 | "COMMUNICATIONS IN ASTEROSEISMOLOGY": "Commun.Asteros.", 54 | "PHYS REV": "Phys.Rev.", 55 | "PHYSICAL REVIEW": "Phys.Rev.", 56 | "PHYS REV LETT": "Phys.Rev.Lett.", 57 | "JINST": "JINST", 58 | "JOURNAL OF INSTRUMENTATION": "JINST", 59 | "SENS ACTUATORS B": "Sens.Actuators B", 60 | "SENSORS AND ACTUATORS B: CHEMICAL": "Sens.Actuators B", 61 | "PHYS SCRIPTA": "Phys.Scripta", 62 | "PHYSICA SCRIPTA": "Phys.Scripta", 63 | "BULL CALCUTTA MATH SOC": "Bull.Calcutta Math.Soc.", 64 | "BULLETIN OF THE CALCUTTA MATHEMATICAL SOCIETY": "Bull.Calcutta Math.Soc.", 65 | "QUANTUM MACHINE INTELLIGENCE": "Quantum Machine Intelligence", 66 | } 67 | publication_infos = [{"pubinfo_freetext": "Phys. Rev. 127 (1962) 965-970"}] 68 | 69 | payload = { 70 | "journal_kb_data": journal_kb_data, 71 | "publication_infos": publication_infos, 72 | } 73 | 74 | headers = { 75 | "content-type": "application/json", 76 | } 77 | response = app_client.post( 78 | "/extract_journal_info", 79 | headers=headers, 80 | data=json.dumps(payload), 81 | ) 82 | assert response.status_code == 500 83 | assert response.json == { 84 | "message": "Can not extract publication info data. Reason: 'test message'" 85 | } 86 | 87 | 88 | def test_extract_journal_info_for_multiple_pubinfos(app_client): 89 | journal_kb_data = { 90 | "COMMUNICATIONS IN ASTEROSEISMOLOGY": "Commun.Asteros.", 91 | "PHYS REV": "Phys.Rev.", 92 | "PHYSICAL REVIEW": "Phys.Rev.", 93 | "PHYS REV LETT": "Phys.Rev.Lett.", 94 | "JINST": "JINST", 95 | "JOURNAL OF INSTRUMENTATION": "JINST", 96 | "SENS ACTUATORS B": "Sens.Actuators B", 97 | "SENSORS AND ACTUATORS B: CHEMICAL": "Sens.Actuators B", 98 | "PHYS SCRIPTA": "Phys.Scripta", 99 | "PHYSICA SCRIPTA": "Phys.Scripta", 100 | "BULL CALCUTTA MATH SOC": "Bull.Calcutta Math.Soc.", 101 | "BULLETIN OF THE CALCUTTA MATHEMATICAL SOCIETY": "Bull.Calcutta Math.Soc.", 102 | "QUANTUM MACHINE INTELLIGENCE": "Quantum Machine Intelligence", 103 | } 104 | publication_infos = [ 105 | {"pubinfo_freetext": "Phys. Rev. 127 (1962) 965-970"}, 106 | {"pubinfo_freetext": "Phys.Rev.Lett. 127 (1962) 965-970"}, 107 | ] 108 | 109 | payload = { 110 | "journal_kb_data": journal_kb_data, 111 | "publication_infos": publication_infos, 112 | } 113 | 114 | headers = { 115 | "content-type": "application/json", 116 | } 117 | response = app_client.post( 118 | "/extract_journal_info", 119 | headers=headers, 120 | data=json.dumps(payload), 121 | ) 122 | assert response.status_code == 200 123 | assert "extracted_publication_infos" in response.json 124 | assert len(response.json["extracted_publication_infos"]) == 2 125 | 126 | 127 | def test_extract_extract_references_from_text(app_client): 128 | journal_kb_data = { 129 | "COMMUNICATIONS IN ASTEROSEISMOLOGY": "Commun.Asteros.", 130 | "PHYS REV": "Phys.Rev.", 131 | "PHYSICAL REVIEW": "Phys.Rev.", 132 | } 133 | headers = { 134 | "content-type": "application/json", 135 | } 136 | text = "Iskra Ł W et al 2017 Acta Phys. Pol. B 48 581" 137 | payload = {"journal_kb_data": journal_kb_data, "text": text} 138 | response = app_client.post( 139 | "/extract_references_from_text", 140 | headers=headers, 141 | data=json.dumps(payload), 142 | ) 143 | assert response.status_code == 200 144 | assert "extracted_references" in response.json 145 | assert len(response.json["extracted_references"]) == 1 146 | assert "author" in response.json["extracted_references"][0] 147 | assert "misc" in response.json["extracted_references"][0] 148 | assert "year" in response.json["extracted_references"][0] 149 | 150 | 151 | @mock.patch( 152 | "refextract.app.extract_references_from_string", 153 | side_effect=KeyError("test message"), 154 | ) 155 | def test_extract_references_from_text_when_timeout_from_refextract( 156 | mock_extract_refs, app_client 157 | ): 158 | journal_kb_data = { 159 | "COMMUNICATIONS IN ASTEROSEISMOLOGY": "Commun.Asteros.", 160 | "PHYS REV": "Phys.Rev.", 161 | "PHYSICAL REVIEW": "Phys.Rev.", 162 | } 163 | headers = { 164 | "content-type": "application/json", 165 | } 166 | text = "Iskra Ł W et al 2017 Acta Phys. Pol. B 48 581" 167 | payload = {"journal_kb_data": journal_kb_data, "text": text} 168 | response = app_client.post( 169 | "/extract_references_from_text", headers=headers, data=json.dumps(payload) 170 | ) 171 | assert response.status_code == 500 172 | assert response.json == { 173 | "message": "Can not extract references. Reason: 'test message'" 174 | } 175 | 176 | 177 | def test_extract_extract_references_from_list(app_client): 178 | journal_kb_data = { 179 | "COMMUNICATIONS IN ASTEROSEISMOLOGY": "Commun.Asteros.", 180 | "PHYS REV": "Phys.Rev.", 181 | "PHYSICAL REVIEW": "Phys.Rev.", 182 | } 183 | headers = { 184 | "content-type": "application/json", 185 | } 186 | raw_references = [ 187 | "Iskra Ł W et al 2017 Acta Phys. Pol. B 48 581", 188 | "Iskra Ł W et al 2017 Acta Phys. Pol. B 48 582", 189 | "Iskra Ł W et al 2017 Acta Phys. Pol. B 48 583", 190 | ] 191 | payload = {"journal_kb_data": journal_kb_data, "raw_references": raw_references} 192 | response = app_client.post( 193 | "/extract_references_from_list", 194 | headers=headers, 195 | data=json.dumps(payload), 196 | ) 197 | assert response.status_code == 200 198 | assert "extracted_references" in response.json 199 | assert len(response.json["extracted_references"]) == 3 200 | for reference in response.json["extracted_references"]: 201 | assert "author" in reference 202 | assert "misc" in reference 203 | assert "year" in reference 204 | 205 | 206 | @mock.patch( 207 | "refextract.app.extract_references_from_string", 208 | side_effect=KeyError("test message"), 209 | ) 210 | def test_extract_extract_references_from_list_when_error_from_refextract( 211 | mock_extract_refs, app_client 212 | ): 213 | journal_kb_data = { 214 | "COMMUNICATIONS IN ASTEROSEISMOLOGY": "Commun.Asteros.", 215 | "PHYS REV": "Phys.Rev.", 216 | "PHYSICAL REVIEW": "Phys.Rev.", 217 | } 218 | headers = { 219 | "content-type": "application/json", 220 | } 221 | raw_references = [ 222 | "Iskra Ł W et al 2017 Acta Phys. Pol. B 48 581", 223 | "Iskra Ł W et al 2017 Acta Phys. Pol. B 48 582", 224 | "Iskra Ł W et al 2017 Acta Phys. Pol. B 48 583", 225 | ] 226 | payload = {"journal_kb_data": journal_kb_data, "raw_references": raw_references} 227 | response = app_client.post( 228 | "/extract_references_from_list", 229 | headers=headers, 230 | data=json.dumps(payload), 231 | ) 232 | 233 | expected_response = [ 234 | {"raw_ref": ["Iskra Ł W et al 2017 Acta Phys. Pol. B 48 581"]}, 235 | {"raw_ref": ["Iskra Ł W et al 2017 Acta Phys. Pol. B 48 582"]}, 236 | {"raw_ref": ["Iskra Ł W et al 2017 Acta Phys. Pol. B 48 583"]}, 237 | ] 238 | assert response.status_code == 200 239 | assert response.json["extracted_references"] == expected_response 240 | 241 | 242 | @pytest.mark.vcr 243 | def test_extract_extract_references_from_url(app_client): 244 | journal_kb_data = { 245 | "COMMUNICATIONS IN ASTEROSEISMOLOGY": "Commun.Asteros.", 246 | "PHYS REV": "Phys.Rev.", 247 | "PHYSICAL REVIEW": "Phys.Rev.", 248 | "PHYS REV LETT": "Phys.Rev.Lett.", 249 | "JINST": "JINST", 250 | "JOURNAL OF INSTRUMENTATION": "JINST", 251 | "SENS ACTUATORS B": "Sens.Actuators B", 252 | "SENSORS AND ACTUATORS B: CHEMICAL": "Sens.Actuators B", 253 | "PHYS SCRIPTA": "Phys.Scripta", 254 | "PHYSICA SCRIPTA": "Phys.Scripta", 255 | "BULL CALCUTTA MATH SOC": "Bull.Calcutta Math.Soc.", 256 | "BULLETIN OF THE CALCUTTA MATHEMATICAL SOCIETY": "Bull.Calcutta Math.Soc.", 257 | "QUANTUM MACHINE INTELLIGENCE": "Quantum Machine Intelligence", 258 | } 259 | headers = { 260 | "content-type": "application/json", 261 | } 262 | url = "https://inspirehep.net/files/33ea6e86a7bfb4cab4734ed5c14d4529" 263 | payload = {"url": url, "journal_kb_data": journal_kb_data} 264 | response = app_client.post( 265 | "/extract_references_from_url", 266 | headers=headers, 267 | data=json.dumps(payload), 268 | ) 269 | assert response.status_code == 200 270 | assert "extracted_references" in response.json 271 | assert len(response.json["extracted_references"]) == 2 272 | -------------------------------------------------------------------------------- /tests/test_api.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract 4 | # Copyright (C) 2016, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | import pytest 25 | import responses 26 | 27 | from refextract.references.api import ( 28 | extract_journal_reference, 29 | extract_references_from_file, 30 | extract_references_from_string, 31 | extract_references_from_url, 32 | ) 33 | from refextract.references.errors import FullTextNotAvailableError 34 | 35 | 36 | @pytest.fixture 37 | def kbs_override(): 38 | return { 39 | "books": [("Griffiths, David", "Introduction to elementary particles", "2008")], 40 | "journals": [ 41 | ( 42 | "PHYSICAL REVIEW SPECIAL TOPICS ACCELERATORS AND BEAMS", 43 | "Phys.Rev.ST Accel.Beams", 44 | ), 45 | ("PHYS REV D", "Phys.Rev.;D"), 46 | ("PHYS REV", "Phys.Rev."), 47 | ("PHYS REV LETT", "Phys.Rev.Lett."), 48 | ("PHYS LETT", "Phys.Lett."), 49 | ("J PHYS", "J.Phys."), 50 | ("JOURNAL OF PHYSICS", "J.Phys."), 51 | ("J PHYS G", "J.Phys.;G"), 52 | ("PHYSICAL REVIEW", "Phys.Rev."), 53 | ("ADV THEO MATH PHYS", "Adv.Theor.Math.Phys."), 54 | ("MATH PHYS", "Math.Phys."), 55 | ("J MATH PHYS", "J.Math.Phys."), 56 | ("JHEP", "JHEP"), 57 | ( 58 | "SITZUNGSBER PREUSS AKAD WISS PHYS MATH KL", 59 | "Sitzungsber.Preuss.Akad.Wiss.Berlin (Math.Phys.)", 60 | ), 61 | ("PHYS LETT", "Phys.Lett."), 62 | ("NUCL PHYS", "Nucl.Phys."), 63 | ("NUCL PHYS", "Nucl.Phys."), 64 | ("NUCL PHYS PROC SUPPL", "Nucl.Phys.Proc.Suppl."), 65 | ("JINST", "JINST"), 66 | ("THE EUROPEAN PHYSICAL JOURNAL C PARTICLES AND FIELDS", "Eur.Phys.J.;C"), 67 | ("COMMUN MATH PHYS", "Commun.Math.Phys."), 68 | ("COMM MATH PHYS", "Commun.Math.Phys."), 69 | ("REV MOD PHYS", "Rev.Mod.Phys."), 70 | ("ANN PHYS U S", "Ann.Phys."), 71 | ("AM J PHYS", "Am.J.Phys."), 72 | ("PROC R SOC LONDON SER", "Proc.Roy.Soc.Lond."), 73 | ("CLASS QUANT GRAVITY", "Class.Quant.Grav."), 74 | ("FOUND PHYS", "Found.Phys."), 75 | ("IEEE TRANS NUCL SCI", "IEEE Trans.Nucl.Sci."), 76 | ("SCIENCE", "Science"), 77 | ("ACTA MATERIALIA", "Acta Mater."), 78 | ("REVIEWS OF MODERN PHYSICS", "Rev.Mod.Phys."), 79 | ("NUCL INSTRUM METHODS", "Nucl.Instrum.Meth."), 80 | ("Z PHYS", "Z.Phys."), 81 | ], 82 | "journals_re": [ 83 | "DAN---Dokl.Akad.Nauk Ser.Fiz.", 84 | ], 85 | "report-numbers": [ 86 | "#####CERN#####", 87 | "< yy 999>", 88 | "< yyyy 999>", 89 | "ATL CONF---ATL-CONF", 90 | "ATL PHYS INT---ATL-PHYS-INT", 91 | "ATLAS CONF---ATL-CONF", 92 | "#####LANL#####", 93 | "<s/syymm999>", 94 | "<syymm999>", 95 | "ASTRO PH---astro-ph", 96 | "HEP PH---hep-ph", 97 | "HEP TH---hep-th", 98 | "HEP EX---hep-ex", 99 | "#####LHC#####", 100 | "< yy 999>", 101 | "<syyyy 999>", 102 | "< 999>", 103 | "< 9999>", 104 | "CERN LHC PROJECT REPORT---CERN-LHC-Project-Report", 105 | "CLIC NOTE ---CERN-CLIC-Note", 106 | "CERN LHCC ---CERN-LHCC", 107 | "CERN EP ---CERN-EP", 108 | "######ATLANTIS#######", 109 | "< 9999999>", 110 | "CERN EX---CERN-EX", 111 | ], 112 | } 113 | 114 | 115 | def test_journal_extract(): 116 | r = extract_journal_reference("Science Vol. 338 no. 6108 (2012) pp. 773-775") 117 | assert r["year"] == "2012" 118 | assert r["volume"] == "338" 119 | assert r["page"] == "773-775" 120 | assert r["title"] == "Science" 121 | 122 | 123 | def test_extract_references_from_string(kbs_override): 124 | ref_lines = """[9] R. Bousso, JHEP 9906:028 (1999); hep-th/9906022.""" 125 | r = extract_references_from_string(ref_lines, override_kbs_files=kbs_override) 126 | assert len(r) == 2 127 | 128 | 129 | def test_extract_references_from_file(pdf_files): 130 | pdf = pdf_files["1503.07589v1.pdf"] 131 | r = extract_references_from_file(pdf) 132 | assert "texkey" in r[0] 133 | assert "author" in r[0] 134 | assert "url" in r[0] 135 | assert len(r) == 36 136 | with pytest.raises(FullTextNotAvailableError): 137 | extract_references_from_file(pdf + "error") 138 | 139 | 140 | def test_extract_references_from_file_dois_as_pdfs_annotations(pdf_files): 141 | """Test DOIs as PDFs annotations and texkeys as named destinations""" 142 | pdf_file_with_dois_as_pdfs_annotations = pdf_files["2503.05372.pdf"] 143 | extracted_references = extract_references_from_file( 144 | pdf_file_with_dois_as_pdfs_annotations 145 | ) 146 | first_reference = extracted_references[0] 147 | assert len(first_reference["url"]) == 2 148 | assert "https://doi.org/10.1103/PhysRevD.68.037502" in first_reference["url"] 149 | assert "texkey" in first_reference 150 | assert "Cahn:2003cw" in first_reference["texkey"] 151 | assert len(extracted_references) == 39 152 | 153 | 154 | def test_extract_references_from_file_does_not_ignore_letters_in_volume(pdf_files): 155 | """Test that letters in volume are not ignored.""" 156 | pdf = pdf_files["2503.05621.pdf"] 157 | extracted_references = extract_references_from_file(pdf) 158 | fith_reference = extracted_references[4] 159 | assert "journal_volume" in fith_reference 160 | assert fith_reference["journal_reference"][0] == "Phys. Rev. D95 (2017) 114510" 161 | assert fith_reference["journal_volume"][0] == "D95" 162 | assert len(extracted_references) == 24 163 | 164 | 165 | def test_extract_references_with_authors_after_references(pdf_files): 166 | """Test that references extracted even with authors after references.""" 167 | pdf = pdf_files["2502.21088.pdf"] 168 | extracted_references = extract_references_from_file(pdf) 169 | first_reference = extracted_references[0] 170 | last_reference = extracted_references[-1] 171 | # assert first reference is correctly extracted 172 | assert first_reference["journal_reference"][0] == "Phys. Rev. Lett. 25 (1970) 316" 173 | assert first_reference["author"][0] == "S. D. Drell and T.-M. Yan" 174 | # assert last reference correctly extracts collaboration 175 | assert last_reference["collaboration"][0] == "ATLAS Collaboration" 176 | assert len(extracted_references) == 104 177 | 178 | 179 | @pytest.mark.xfail( 180 | reason="It should not put an Author in author field as it is a collaboration. " 181 | "This happens because there are authors after the references." 182 | ) 183 | def test_collaboration_without_author_when_authors_after_references(pdf_files): 184 | """Test that references extracted even with authors after references.""" 185 | pdf = pdf_files["2502.21088.pdf"] 186 | extracted_references = extract_references_from_file(pdf) 187 | last_reference = extracted_references[-1] 188 | # assert last reference is correctly extracted 189 | assert last_reference["collaboration"][0] == "ATLAS Collaboration" 190 | assert "author" not in last_reference 191 | 192 | 193 | @pytest.mark.xfail(reason="It should extract the journal reference and urls correctly.") 194 | def test_extract_references_two_column_layout(pdf_files): 195 | """Test that references extracted even with authors after references.""" 196 | pdf = pdf_files["2502.18907.pdf"] 197 | extracted_references = extract_references_from_file(pdf) 198 | first_reference = extracted_references[0] 199 | assert ( 200 | first_reference["author"][0] 201 | == "Adamopoulos G., Robertson J., Morrison N. A., Godet C." 202 | ) 203 | assert first_reference["journal_reference"][0] == " J. Appl. Phys. 96 (2004) 6348" 204 | assert "url" in first_reference 205 | 206 | 207 | def test_extract_references_with_multiple_refs_under_same_marker(pdf_files): 208 | """Test that references extracted even with authors after references.""" 209 | pdf = pdf_files["2406.06875.pdf"] 210 | extracted_references = extract_references_from_file(pdf) 211 | first_reference = extracted_references[0] 212 | second_reference = extracted_references[1] 213 | third_reference = extracted_references[2] 214 | assert first_reference["author"][0] == "W.T. Tutte" 215 | assert second_reference["author"][0] == "W.T. Tutte" 216 | assert third_reference["author"][0] == "W.T. Tutte" 217 | assert first_reference["journal_reference"][0] == "Can. J. Math. 14 (1962) 21" 218 | assert second_reference["journal_reference"][0] == "Can. J. Math. 15 (1963) 249" 219 | assert ( 220 | third_reference["journal_reference"][0] == "Bull. Am. Math. Soc. 74 (1968) 64" 221 | ) 222 | assert first_reference["linemarker"][0] == "1" 223 | assert second_reference["linemarker"][0] == "1" 224 | assert third_reference["linemarker"][0] == "1" 225 | 226 | 227 | @responses.activate 228 | def test_extract_references_from_url(pdf_files): 229 | with open(pdf_files["1503.07589v1.pdf"], "rb") as fd: 230 | url = "http://arxiv.org/pdf/1503.07589v1.pdf" 231 | responses.add( 232 | responses.GET, url, body=fd.read(), content_type="application/pdf" 233 | ) 234 | 235 | r = extract_references_from_url(url) 236 | assert len(r) == 36 237 | assert "url" in r[0] 238 | 239 | url = "http://www.example.com" 240 | responses.add( 241 | responses.GET, 242 | url, 243 | body="File not found!", 244 | status=404, 245 | content_type="text/plain", 246 | ) 247 | with pytest.raises(FullTextNotAvailableError): 248 | extract_references_from_url(url) 249 | 250 | 251 | def test_long_registrant_dois(pdf_files): 252 | """DOIs with 5 digit registrant code""" 253 | r = extract_references_from_file(pdf_files["wepml008.pdf"]) 254 | assert len(r) == 6 255 | for ref in r[1:]: 256 | assert "doi" in ref 257 | assert ref.get("doi")[0].startswith("doi:10.18429/JACoW") 258 | 259 | 260 | def test_override_kbs_files_can_take_journals_dict(): 261 | journals = {"Journal of Testing": "J.Testing"} 262 | reference = "J. Smith, Journal of Testing 42 (2020) 1234" 263 | 264 | result = extract_references_from_string( 265 | reference, override_kbs_files={"journals": journals} 266 | ) 267 | assert result[0]["journal_title"] == ["J.Testing"] 268 | -------------------------------------------------------------------------------- /tests/test_find.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract 4 | # Copyright (C) 2016, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | from refextract.references.find import get_reference_section_beginning 25 | 26 | 27 | def test_simple(): 28 | sect = get_reference_section_beginning(["Hello", "References", "[1] Ref1"]) 29 | assert sect == { 30 | "marker": "[1]", 31 | "marker_pattern": "\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])", 32 | "start_line": 1, 33 | "title_string": "References", 34 | "title_marker_same_line": False, 35 | "how_found_start": 1, 36 | } 37 | 38 | 39 | def test_no_section(): 40 | sect = get_reference_section_beginning("") 41 | assert sect is None 42 | 43 | 44 | def test_no_title_via_brackets(): 45 | sect = get_reference_section_beginning(["Hello", "[1] Ref1[2] Ref2"]) 46 | assert sect == { 47 | "marker": "[1]", 48 | "marker_pattern": "(?P<mark>(?P<left>\\[)\\s*(?P<marknum>\\d+)\\s*(" 49 | "?P<right>\\]))", 50 | "start_line": 1, 51 | "title_string": None, 52 | "title_marker_same_line": False, 53 | "how_found_start": 2, 54 | } 55 | 56 | 57 | def test_no_title_via_dots(): 58 | sect = get_reference_section_beginning(["Hello", "1. Ref12. Ref2"]) 59 | assert sect == { 60 | "marker": "1.", 61 | "marker_pattern": "(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>\\.))", 62 | "start_line": 1, 63 | "title_string": None, 64 | "title_marker_same_line": False, 65 | "how_found_start": 3, 66 | } 67 | 68 | 69 | def test_no_title_via_numbers(): 70 | sect = get_reference_section_beginning(["Hello", "1 Ref12 Ref2"]) 71 | assert sect == { 72 | "marker": "1", 73 | "marker_pattern": "(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>))", 74 | "start_line": 1, 75 | "title_string": None, 76 | "title_marker_same_line": False, 77 | "how_found_start": 4, 78 | } 79 | 80 | 81 | def test_no_title_via_numbers2(): 82 | sect = get_reference_section_beginning( 83 | [ 84 | "Hello", 85 | "1", 86 | "Ref1", 87 | "(3)", 88 | "2", 89 | "Ref2", 90 | ] 91 | ) 92 | assert sect, { 93 | "marker": "1", 94 | "marker_pattern": "(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>))", 95 | "start_line": 1, 96 | "title_string": None, 97 | "title_marker_same_line": False, 98 | "how_found_start": 4, 99 | } 100 | -------------------------------------------------------------------------------- /tests/test_kbs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract 4 | # Copyright (C) 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | import csv 25 | 26 | from refextract.references.kbs import file_resolving, get_kbs 27 | 28 | 29 | def test_get_kbs_doesnt_override_default_if_value_is_none(): 30 | cache = get_kbs(custom_kbs={"journals": None}) 31 | assert len(cache["journals"]) == 3 32 | assert "JHEP" in cache["journals"][-1] 33 | 34 | 35 | def test_get_kbs_caches_journal_dict(): 36 | journals = {"Journal of Testing": "J.Testing"} 37 | 38 | first_cache = get_kbs(custom_kbs={"journals": journals}).copy() 39 | assert len(first_cache["journals"]) == 3 40 | assert first_cache["journals"][-1] == ["JOURNAL OF TESTING", "J TESTING"] 41 | 42 | journals = journals.copy() 43 | second_cache = get_kbs(custom_kbs={"journals": journals}) 44 | # the cache is reused, so identity of the cache elements doesn't change 45 | assert all( 46 | cached_first is cached_second 47 | for (cached_first, cached_second) in zip( 48 | first_cache["journals"], second_cache["journals"], strict=False 49 | ) 50 | ) 51 | 52 | 53 | def test_get_kbs_invalidates_cache_if_input_changes(): 54 | journals = {"Journal of Testing": "J.Testing"} 55 | first_cache = get_kbs(custom_kbs={"journals": journals}).copy() 56 | 57 | journals = journals = {"Journal of Testing": "J.Test."} 58 | second_cache = get_kbs(custom_kbs={"journals": journals}) 59 | # the cache is invalidated, so identity of the cache elements changes 60 | assert all( 61 | cached_first is not cached_second 62 | for (cached_first, cached_second) in zip( 63 | first_cache["journals"], second_cache["journals"], strict=False 64 | ) 65 | ) 66 | assert len(second_cache["journals"]) == 3 67 | assert second_cache["journals"][-1] == ["JOURNAL OF TESTING", "J TEST"] 68 | 69 | 70 | def test_file_resolving(): 71 | # Test that the file resolving works as expected 72 | with file_resolving("tests/data/file_resolving.csv") as fh: 73 | assert fh.read() == "1|2|3\n4|5|6\n" 74 | 75 | 76 | def test_file_resolving_reader(): 77 | # Test that the file resolving works as expected with a reader 78 | with file_resolving("tests/data/file_resolving.csv", reader=csv.reader) as fh: 79 | rows = list(fh) 80 | assert rows == [["1", "2", "3"], ["4", "5", "6"]] 81 | -------------------------------------------------------------------------------- /tests/test_regexs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract 4 | # Copyright (C) 2016, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | import re 25 | 26 | from refextract.references import regexs 27 | 28 | 29 | def test_word(): 30 | r = regexs._create_regex_pattern_with_optional_spaces("ABC") 31 | assert r == r"A\s*B\s*C\s*" 32 | 33 | 34 | def test_reference_section_title_pattern(): 35 | r = regexs.get_reference_section_title_patterns() 36 | assert len(r) > 2 37 | 38 | 39 | def test_get_reference_line_numeration_marker_patterns(): 40 | r = regexs.get_reference_line_numeration_marker_patterns() 41 | assert len(r) > 2 42 | 43 | 44 | def test_get_reference_line_marker_pattern(): 45 | r = regexs.get_reference_line_marker_pattern("ABC") 46 | assert r.pattern.find("ABC") != -1 47 | 48 | 49 | def test_get_post_reference_section_title_patterns(): 50 | r = regexs.get_post_reference_section_title_patterns() 51 | assert len(r) > 2 52 | 53 | 54 | def test_get_post_reference_section_keyword_patterns(): 55 | r = regexs.get_post_reference_section_keyword_patterns() 56 | assert len(r) > 2 57 | 58 | 59 | def test_regex_match_list(): 60 | s = "ABC" 61 | m = regexs.regex_match_list(s, [re.compile("C.C"), re.compile("A.C")]) 62 | assert m 63 | m = regexs.regex_match_list(s, [re.compile("C.C")]) 64 | assert m is None 65 | -------------------------------------------------------------------------------- /tests/test_tag.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract 4 | # Copyright (C) 2016, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | from refextract.references.tag import ( 25 | find_numeration, 26 | find_numeration_more, 27 | identify_ibids, 28 | tag_arxiv, 29 | ) 30 | 31 | 32 | def test_vol_page_year(): 33 | "<vol>, <page> (<year>)" 34 | ref_line = """24, 418 (1930)""" 35 | r = find_numeration(ref_line) 36 | assert r["volume"] == "24" 37 | assert r["year"] == "1930" 38 | assert r["page"] == "418" 39 | 40 | 41 | def test_vol_year_page(): 42 | "<vol>, (<year>) <page>" 43 | ref_line = """24, (1930) 418""" 44 | r = find_numeration(ref_line) 45 | assert r["volume"] == "24" 46 | assert r["year"] == "1930" 47 | assert r["page"] == "418" 48 | 49 | 50 | def test_year_title_volume_page(): 51 | "<year>, <title> <vol> <page>" 52 | ref_line = """1930 <cds.JOURNAL>J.Phys.</cds.JOURNAL> 24, 418""" 53 | r = find_numeration_more(ref_line) 54 | assert r["volume"] == "24" 55 | assert r["year"] == "1930" 56 | assert r["page"] == "418" 57 | 58 | 59 | def test_identify_ibids_empty(): 60 | r = identify_ibids("") 61 | assert r == ({}, "") 62 | 63 | 64 | def test_identify_ibids_simple(): 65 | ref_line = ( 66 | "[46] E. Schrodinger, Sitzungsber. Preuss. Akad. Wiss. Phys. Math. Kl." 67 | " 24, 418(1930); ibid, 3, 1(1931)" 68 | ) 69 | r = identify_ibids(ref_line.upper()) 70 | assert r == ( 71 | {85: "IBID"}, 72 | "[46] E. SCHRODINGER, SITZUNGSBER. PREUSS. AKAD. " 73 | "WISS. PHYS. MATH. KL. 24, 418(1930); ____, 3, " 74 | "1(1931)", 75 | ) 76 | 77 | 78 | def test_4_digits(): 79 | ref_line = """{any prefix}arXiv:1003.1111{any postfix}""" 80 | r = tag_arxiv(ref_line) 81 | assert r.strip(": ") == ( 82 | "{any prefix}<cds.ARXIV>arXiv:1003.1111</cds.ARXIV>{any postfix}" 83 | ) 84 | 85 | 86 | def test_4_digits_suffix(): 87 | ref_line = """{any prefix}arXiv:1104.2222 [physics.ins-det]{any postfix}""" 88 | r = tag_arxiv(ref_line) 89 | assert r.strip(": ") == ( 90 | "{any prefix}<cds.ARXIV>arXiv:1104.2222 [" 91 | "physics.ins-det]</cds.ARXIV>{any postfix}" 92 | ) 93 | 94 | 95 | def test_5_digits(): 96 | ref_line = """{any prefix}arXiv:1303.33333{any postfix}""" 97 | r = tag_arxiv(ref_line) 98 | assert r.strip(": ") == ( 99 | "{any prefix}<cds.ARXIV>arXiv:1303.33333</cds.ARXIV>{any postfix}" 100 | ) 101 | 102 | 103 | def test_5_digits_2012(): 104 | ref_line = """{any prefix}arXiv:1203.33333{any postfix}""" 105 | r = tag_arxiv(ref_line) 106 | assert r.strip(": ") == "{any prefix}arXiv:1203.33333{any postfix}" 107 | 108 | 109 | def test_5_digits_suffix(): 110 | ref_line = """{any prefix}arXiv:1304.44444 [physics.ins-det]{any postfix}""" 111 | r = tag_arxiv(ref_line) 112 | assert r.strip(": ") == ( 113 | "{any prefix}<cds.ARXIV>arXiv:1304.44444 [" 114 | "physics.ins-det]</cds.ARXIV>{any postfix}" 115 | ) 116 | 117 | 118 | def test_4_digits_version(): 119 | ref_line = """{any prefix}arXiv:1003.1111v9{any postfix}""" 120 | r = tag_arxiv(ref_line) 121 | assert r.strip(": ") == ( 122 | "{any prefix}<cds.ARXIV>arXiv:1003.1111</cds.ARXIV>{any postfix}" 123 | ) 124 | 125 | 126 | def test_4_digits_suffix_version(): 127 | ref_line = """{any prefix}arXiv:1104.2222v9 [physics.ins-det]{any postfix}""" 128 | r = tag_arxiv(ref_line) 129 | assert r.strip(": ") == ( 130 | "{any prefix}<cds.ARXIV>arXiv:1104.2222 [" 131 | "physics.ins-det]</cds.ARXIV>{any postfix}" 132 | ) 133 | 134 | 135 | def test_5_digits_version(): 136 | ref_line = """{any prefix}arXiv:1303.33333v9{any postfix}""" 137 | r = tag_arxiv(ref_line) 138 | assert r.strip(": ") == ( 139 | "{any prefix}<cds.ARXIV>arXiv:1303.33333</cds.ARXIV>{any postfix}" 140 | ) 141 | 142 | 143 | def test_5_digits_suffix_version(): 144 | ref_line = """{any prefix}arXiv:1304.44444v9 [physics.ins-det]{any postfix}""" 145 | r = tag_arxiv(ref_line) 146 | assert r.strip(": ") == ( 147 | "{any prefix}<cds.ARXIV>arXiv:1304.44444 [" 148 | "physics.ins-det]</cds.ARXIV>{any postfix}" 149 | ) 150 | 151 | 152 | def test_4_digits_new(): 153 | ref_line = """{any prefix}9910.1234{any postfix}""" 154 | r = tag_arxiv(ref_line) 155 | assert r.strip(": ") == ( 156 | "{any prefix}<cds.ARXIV>arXiv:9910.1234</cds.ARXIV>{any postfix}" 157 | ) 158 | 159 | 160 | def test_4_digits_suffix_new(): 161 | ref_line = """{any prefix}9910.1234 [physics.ins-det]{any postfix}""" 162 | r = tag_arxiv(ref_line) 163 | assert r.strip(": ") == ( 164 | "{any prefix}<cds.ARXIV>arXiv:9910.1234 [" 165 | "physics.ins-det]</cds.ARXIV>{any postfix}" 166 | ) 167 | 168 | 169 | def test_5_digits_new(): 170 | ref_line = """{any prefix}1310.12345{any postfix}""" 171 | r = tag_arxiv(ref_line) 172 | assert r.strip(": ") == ( 173 | "{any prefix}<cds.ARXIV>arXiv:1310.12345</cds.ARXIV>{any postfix}" 174 | ) 175 | 176 | 177 | def test_5_digits_suffix_new(): 178 | ref_line = """{any prefix}1310.12345 [physics.ins-det]{any postfix}""" 179 | r = tag_arxiv(ref_line) 180 | assert r.strip(": ") == ( 181 | "{any prefix}<cds.ARXIV>arXiv:1310.12345 [" 182 | "physics.ins-det]</cds.ARXIV>{any postfix}" 183 | ) 184 | 185 | 186 | def test_4_digits_version_new(): 187 | ref_line = """{any prefix}9910.1234v9{any postfix}""" 188 | r = tag_arxiv(ref_line) 189 | assert r.strip(": ") == ( 190 | "{any prefix}<cds.ARXIV>arXiv:9910.1234</cds.ARXIV>{any postfix}" 191 | ) 192 | 193 | 194 | def test_4_digits_suffix_version_new(): 195 | ref_line = """{any prefix}9910.1234v9 [physics.ins-det]{any postfix}""" 196 | r = tag_arxiv(ref_line) 197 | assert r.strip(": ") == ( 198 | "{any prefix}<cds.ARXIV>arXiv:9910.1234 [" 199 | "physics.ins-det]</cds.ARXIV>{any postfix}" 200 | ) 201 | 202 | 203 | def test_5_digits_version_new(): 204 | ref_line = """{any prefix}1310.12345v9{any postfix}""" 205 | r = tag_arxiv(ref_line) 206 | assert r.strip(": ") == ( 207 | "{any prefix}<cds.ARXIV>arXiv:1310.12345</cds.ARXIV>{any postfix}" 208 | ) 209 | 210 | 211 | def test_5_digits_suffix_version_new(): 212 | ref_line = """{any prefix}1310.12345v9 [physics.ins-det]{any postfix}""" 213 | r = tag_arxiv(ref_line) 214 | assert r.strip(": ") == ( 215 | "{any prefix}<cds.ARXIV>arXiv:1310.12345 " 216 | "[physics.ins-det]</cds.ARXIV>{any postfix}" 217 | ) 218 | 219 | 220 | def test_5_digits_suffix_version_new_2012(): 221 | ref_line = """{any prefix}1210.12345v9 [physics.ins-det]{any postfix}""" 222 | r = tag_arxiv(ref_line) 223 | assert r.strip(": ") == "{any prefix}1210.12345v9 [physics.ins-det]{any postfix}" 224 | -------------------------------------------------------------------------------- /tests/test_text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # This file is part of refextract 4 | # Copyright (C) 2016, 2018, 2020 CERN. 5 | # 6 | # refextract is free software; you can redistribute it and/or 7 | # modify it under the terms of the GNU General Public License as 8 | # published by the Free Software Foundation; either version 2 of the 9 | # License, or (at your option) any later version. 10 | # 11 | # refextract is distributed in the hope that it will be useful, but 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 | # General Public License for more details. 15 | # 16 | # You should have received a copy of the GNU General Public License 17 | # along with refextract; if not, write to the Free Software Foundation, Inc., 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 19 | # 20 | # In applying this license, CERN does not waive the privileges and immunities 21 | # granted to it by virtue of its status as an Intergovernmental Organization 22 | # or submit itself to any jurisdiction. 23 | 24 | from refextract import extract_references_from_file 25 | from refextract.references.text import ( 26 | rebuild_reference_lines, 27 | ) 28 | 29 | 30 | def test_simple(): 31 | marker_pattern = r"^\s*(?P<mark>\[\s*(?P<marknum>\d+)\s*\])" 32 | refs = [ 33 | "[1] hello", 34 | "hello2", 35 | "[2] foo", 36 | ] 37 | rebuilt_refs = rebuild_reference_lines(refs, marker_pattern) 38 | assert rebuilt_refs == [ 39 | "[1] hello hello2", 40 | "[2] foo", 41 | ] 42 | 43 | 44 | def test_pagination_non_removal(): 45 | marker_pattern = r"^\s*(?P<mark>\[\s*(?P<marknum>\d+)\s*\])" 46 | refs = [ 47 | "[1] hello", 48 | "hello2", 49 | "[2]", 50 | "foo", 51 | ] 52 | rebuilt_refs = rebuild_reference_lines(refs, marker_pattern) 53 | assert rebuilt_refs == [ 54 | "[1] hello hello2", 55 | "[2] foo", 56 | ] 57 | 58 | 59 | def test_2_lines_together(): 60 | marker_pattern = r"\s*(?P<mark>\[\s*(?P<marknum>\d+)\s*\])" 61 | refs = [ 62 | "[1] hello", 63 | "hello2 [2] foo", 64 | ] 65 | rebuilt_refs = rebuild_reference_lines(refs, marker_pattern) 66 | assert rebuilt_refs == [ 67 | "[1] hello hello2", 68 | "[2] foo", 69 | ] 70 | 71 | 72 | def test_get_number_header_lines_does_not_crash_on_final_empty_page(pdf_files): 73 | assert extract_references_from_file(pdf_files["1805.05865.pdf"]) 74 | --------------------------------------------------------------------------------