├── .github
    └── workflows
    │   ├── pull-request.yml
    │   ├── push-master.yml
    │   ├── release.yml
    │   └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── poetry.lock
├── pyproject.toml
├── refextract
    ├── __init__.py
    ├── app.py
    ├── authors
    │   ├── __init__.py
    │   └── regexs.py
    ├── config.cfg
    ├── documents
    │   ├── __init__.py
    │   ├── pdf.py
    │   └── text.py
    └── references
    │   ├── __init__.py
    │   ├── api.py
    │   ├── config.py
    │   ├── engine.py
    │   ├── errors.py
    │   ├── find.py
    │   ├── kbs.py
    │   ├── kbs
    │       ├── authors.kb
    │       ├── books.kb
    │       ├── collaborations.kb
    │       ├── journal-titles-re.kb
    │       ├── journal-titles.kb
    │       ├── publishers.kb
    │       ├── report-numbers.kb
    │       └── special-journals.kb
    │   ├── pdf.py
    │   ├── record.py
    │   ├── regexs.py
    │   ├── tag.py
    │   └── text.py
├── ruff.toml
├── run-tests.sh
└── tests
    ├── conftest.py
    ├── data
        ├── 1503.07589v1.pdf
        ├── 1508.05632v2.pdf
        ├── 1706.09498v1.pdf
        ├── 1707.04066v1.pdf
        ├── 1805.05865.pdf
        ├── 2110.02751.pdf
        ├── 2301.05883.pdf
        ├── 2303.03819.pdf
        ├── 2304.10117.pdf
        ├── 2406.06875.pdf
        ├── 2502.18907.pdf
        ├── 2502.21088.pdf
        ├── 2503.05372.pdf
        ├── 2503.05621.pdf
        ├── DIS_SHEILA_final.pdf
        ├── file_resolving.csv
        ├── packed_pdf.pdf
        └── wepml008.pdf
    ├── integration
        ├── cassettes
        │   └── test_extract_extract_references_from_url.yaml
        ├── conftest.py
        └── test_views.py
    ├── test_api.py
    ├── test_engine.py
    ├── test_find.py
    ├── test_kbs.py
    ├── test_pdf.py
    ├── test_regexs.py
    ├── test_tag.py
    └── test_text.py


/.github/workflows/pull-request.yml:
--------------------------------------------------------------------------------
 1 | name: Pull request master
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     branches: [master]
 6 | 
 7 | jobs:
 8 |   tests:
 9 |     uses: ./.github/workflows/test.yml
10 | 


--------------------------------------------------------------------------------
/.github/workflows/push-master.yml:
--------------------------------------------------------------------------------
 1 | name: Pull request master
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [master]
 6 | 
 7 | jobs:
 8 |   tests:
 9 |     uses: ./.github/workflows/test.yml
10 | 
11 |   push_and_deploy_qa:
12 |     runs-on: ubuntu-latest
13 |     needs: [tests]
14 |     steps:
15 |       - name: Checkout Code
16 |         uses: actions/checkout@v4
17 |         with:
18 |           ref: ${{ github.ref }}
19 | 
20 |       - name: Build Image
21 |         id: build
22 |         uses: cern-sis/gh-workflows/.github/actions/docker-build@v6
23 |         with:
24 |           registry: registry.cern.ch
25 |           stage: refextract
26 |           image: cern-sis/inspirehep/refextract
27 |           cache: false
28 |           username: ${{ secrets.HARBOR_USERNAME }}
29 |           password: ${{ secrets.HARBOR_PASSWORD }}
30 | 
31 |       - name: Deploy QA
32 |         uses: cern-sis/gh-workflows/.github/actions/kubernetes-project-new-images@v6.4
33 |         with:
34 |           event-type: update
35 |           repo: cern-sis/kubernetes-inspire
36 |           images: registry.cern.ch/cern-sis/inspirehep/refextract@${{ steps.build.outputs.image-digest }}
37 |           token: ${{ secrets.PAT_FIRE_EVENTS_ON_CERN_SIS_KUBERNETES }}
38 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Release
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [released]
 6 | 
 7 | defaults:
 8 |   run:
 9 |     shell: bash
10 | 
11 | jobs:
12 |   push:
13 |     runs-on: ubuntu-latest
14 |     steps:
15 |       - name: Checkout
16 |         uses: actions/checkout@v4
17 |         with:
18 |           fetch-depth: 0
19 | 
20 |       - name: Push
21 |         run: |
22 |           git config user.name github-actions
23 |           git config user.email github-actions@github.com
24 |           git push --force --follow-tags origin ${{ github.ref_name }}:prod
25 | 
26 |       - name: Generate metadata
27 |         id: meta
28 |         uses: docker/metadata-action@v5
29 |         with:
30 |           images: |
31 |             registry.cern.ch/cern-sis/inspirehep/refextract
32 |           tags: "type=sha"
33 | 
34 |       - name: send event inspire
35 |         uses: cern-sis/gh-workflows/.github/actions/kubernetes-project-new-images@v6.4
36 |         with:
37 |           repo: cern-sis/kubernetes-inspire
38 |           event-type: release
39 |           images: ${{ env.DOCKER_METADATA_OUTPUT_TAGS }}
40 |           token: ${{ secrets.PAT_FIRE_EVENTS_ON_CERN_SIS_KUBERNETES }}
41 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Test Python 3
 2 | 
 3 | on:
 4 |   workflow_call:
 5 | 
 6 | jobs:
 7 |   lint_and_test:
 8 |       runs-on: ubuntu-latest
 9 |       steps:
10 |         - name: Checkout Code
11 |           uses: actions/checkout@v4
12 |           with:
13 |             ref: ${{ github.ref }}
14 |         - name: Lint - Pre-commit check
15 |           uses: pre-commit/action@v3.0.1
16 |         - name: Prep Build
17 |           uses: docker/setup-buildx-action@v3
18 |         - name: Build Docker image
19 |           run: docker build --target refextract-tests -t refextract .
20 |         - name: Run tests
21 |           run: >
22 |             docker run
23 |             --entrypoint poetry
24 |             refextract
25 |             run pytest
26 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of refextract
  4 | # Copyright (C) 2016, 2017, 2018 CERN.
  5 | #
  6 | # refextract is free software; you can redistribute it and/or
  7 | # modify it under the terms of the GNU General Public License as
  8 | # published by the Free Software Foundation; either version 2 of the
  9 | # License, or (at your option) any later version.
 10 | #
 11 | # refextract is distributed in the hope that it will be useful, but
 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14 | # General Public License for more details.
 15 | #
 16 | # You should have received a copy of the GNU General Public License
 17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 19 | #
 20 | # In applying this license, CERN does not waive the privileges and immunities
 21 | # granted to it by virtue of its status as an Intergovernmental Organization
 22 | # or submit itself to any jurisdiction.
 23 | 
 24 | # Byte-compiled / optimized / DLL files
 25 | __pycache__/
 26 | *.py[cod]
 27 | *$py.class
 28 | 
 29 | # C extensions
 30 | *.so
 31 | 
 32 | # Distribution / packaging
 33 | .Python
 34 | build/
 35 | develop-eggs/
 36 | dist/
 37 | downloads/
 38 | eggs/
 39 | .eggs/
 40 | lib/
 41 | lib64/
 42 | parts/
 43 | sdist/
 44 | var/
 45 | wheels/
 46 | *.egg-info/
 47 | .installed.cfg
 48 | *.egg
 49 | MANIFEST
 50 | 
 51 | # PyInstaller
 52 | #  Usually these files are written by a python script from a template
 53 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 54 | *.manifest
 55 | *.spec
 56 | 
 57 | # Installer logs
 58 | pip-log.txt
 59 | pip-delete-this-directory.txt
 60 | 
 61 | # Unit test / coverage reports
 62 | htmlcov/
 63 | .tox/
 64 | .coverage
 65 | .coverage.*
 66 | .cache
 67 | nosetests.xml
 68 | coverage.xml
 69 | *.cover
 70 | .hypothesis/
 71 | .pytest_cache/
 72 | 
 73 | # Translations
 74 | *.mo
 75 | *.pot
 76 | 
 77 | # Django stuff:
 78 | *.log
 79 | local_settings.py
 80 | db.sqlite3
 81 | 
 82 | # Flask stuff:
 83 | instance/
 84 | .webassets-cache
 85 | 
 86 | # Scrapy stuff:
 87 | .scrapy
 88 | 
 89 | # Sphinx documentation
 90 | docs/_build/
 91 | 
 92 | # PyBuilder
 93 | target/
 94 | 
 95 | # Jupyter Notebook
 96 | .ipynb_checkpoints
 97 | 
 98 | # pyenv
 99 | .python-version
100 | 
101 | # celery beat schedule file
102 | celerybeat-schedule
103 | 
104 | # SageMath parsed files
105 | *.sage.py
106 | 
107 | # Environments
108 | .env*
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 | .idea
116 | 
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 | 
121 | # Rope project settings
122 | .ropeproject
123 | 
124 | # mkdocs documentation
125 | /site
126 | 
127 | # mypy
128 | .mypy_cache/
129 | 
130 | # Build artifacts
131 | AUTHORS
132 | CHANGELOG
133 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   - repo: https://github.com/pre-commit/pre-commit-hooks
 3 |     rev: v4.6.0
 4 |     hooks:
 5 |       - id: check-yaml
 6 |       - id: end-of-file-fixer
 7 |       - id: trailing-whitespace
 8 |       - id: fix-byte-order-marker
 9 |       - id: mixed-line-ending
10 |       - id: name-tests-test
11 |         args: [ --pytest-test-first ]
12 |         exclude: '^(?!factories/)'
13 |   - repo: https://github.com/astral-sh/ruff-pre-commit
14 |     rev: v0.11.2
15 |     hooks:
16 |       - id: ruff
17 |         args: [ --fix]
18 |       - id: ruff-format
19 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.11.6-slim-bullseye AS refextract
 2 | 
 3 | ARG APP_HOME=/refextract
 4 | WORKDIR ${APP_HOME}
 5 | 
 6 | COPY refextract refextract/
 7 | 
 8 | RUN apt update && apt install poppler-utils libmagic1 -y
 9 | COPY poetry.lock pyproject.toml README.md ${APP_HOME}
10 | 
11 | RUN pip install --no-cache-dir poetry
12 | RUN poetry config virtualenvs.create false \
13 |     && poetry install --only main
14 | 
15 | ENV PROMETHEUS_MULTIPROC_DIR='/tmp'
16 | ENTRYPOINT ["gunicorn", "-b", ":5000", "--access-logfile", "-", "--error-logfile", "-", "refextract.app:app", "--timeout", "650"]
17 | 
18 | FROM refextract AS refextract-tests
19 | 
20 | RUN poetry install --with dev
21 | COPY tests tests/
22 | RUN poetry install
23 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | GNU GENERAL PUBLIC LICENSE
  2 |    Version 2, June 1991
  3 | 
  4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
  5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  6 | Everyone is permitted to copy and distribute verbatim copies
  7 | of this license document, but changing it is not allowed.
  8 | 
  9 |         Preamble
 10 | 
 11 | The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Lesser General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 | When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 | To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 | For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 | We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 | Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 | Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 | The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 | GNU GENERAL PUBLIC LICENSE
 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 | 0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 | 1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 | 2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 | a) You must cause the modified files to carry prominent notices
 96 | stating that you changed the files and the date of any change.
 97 | 
 98 | b) You must cause any work that you distribute or publish, that in
 99 | whole or in part contains or is derived from the Program or any
100 | part thereof, to be licensed as a whole at no charge to all third
101 | parties under the terms of this License.
102 | 
103 | c) If the modified program normally reads commands interactively
104 | when run, you must cause it, when started running for such
105 | interactive use in the most ordinary way, to print or display an
106 | announcement including an appropriate copyright notice and a
107 | notice that there is no warranty (or else, saying that you provide
108 | a warranty) and that users may redistribute the program under
109 | these conditions, and telling the user how to view a copy of this
110 | License.  (Exception: if the Program itself is interactive but
111 | does not normally print such an announcement, your work based on
112 | the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 | 3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 | a) Accompany it with the complete corresponding machine-readable
139 | source code, which must be distributed under the terms of Sections
140 | 1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 | b) Accompany it with a written offer, valid for at least three
143 | years, to give any third party, for a charge no more than your
144 | cost of physically performing source distribution, a complete
145 | machine-readable copy of the corresponding source code, to be
146 | distributed under the terms of Sections 1 and 2 above on a medium
147 | customarily used for software interchange; or,
148 | 
149 | c) Accompany it with the information you received as to the offer
150 | to distribute corresponding source code.  (This alternative is
151 | allowed only for noncommercial distribution and only if you
152 | received the program in object code or executable form with such
153 | an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 | 4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 | 5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 | 6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 | 7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 | 8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 | 9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 | 10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |         NO WARRANTY
259 | 
260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |  END OF TERMS AND CONDITIONS
281 | 
282 | How to Apply These Terms to Your New Programs
283 | 
284 | If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 | To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 | <one line to give the program's name and a brief idea of what it does.>
294 | Copyright (C) <year>  <name of author>
295 | 
296 | This program is free software; you can redistribute it and/or modify
297 | it under the terms of the GNU General Public License as published by
298 | the Free Software Foundation; either version 2 of the License, or
299 | (at your option) any later version.
300 | 
301 | This program is distributed in the hope that it will be useful,
302 | but WITHOUT ANY WARRANTY; without even the implied warranty of
303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 | GNU General Public License for more details.
305 | 
306 | You should have received a copy of the GNU General Public License along
307 | with this program; if not, write to the Free Software Foundation, Inc.,
308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 | Gnomovision version 69, Copyright (C) year name of author
316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 | This is free software, and you are welcome to redistribute it
318 | under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License.  Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary.  Here is a sample; alter the names:
328 | 
329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 | <signature of Ty Coon>, 1 April 1989
333 | Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs.  If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library.  If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.
340 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of refextract
 4 | # Copyright (C) 2015, 2016 CERN.
 5 | #
 6 | # refextract is free software; you can redistribute it and/or
 7 | # modify it under the terms of the GNU General Public License as
 8 | # published by the Free Software Foundation; either version 2 of the
 9 | # License, or (at your option) any later version.
10 | #
11 | # refextract is distributed in the hope that it will be useful, but
12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 | # General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
19 | #
20 | # In applying this license, CERN does not waive the privileges and immunities
21 | # granted to it by virtue of its status as an Intergovernmental Organization
22 | # or submit itself to any jurisdiction.
23 | 
24 | 
25 | include LICENSE *.rst
26 | include .coveragerc run-tests.sh pytest.ini tox.ini Dockerfile
27 | include docs/*.rst docs/*.py docs/Makefile
28 | 
29 | recursive-include refextract *
30 | recursive-include *.py *.css *.css_t *.conf *.html
31 | recursive-include tests *.py
32 | recursive-include tests *.pdf
33 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # refextract
  3 | 
  4 | ## About
  5 | 
  6 | A library for extracting references used in scholarly communication.
  7 | 
  8 | ## Getting Started
  9 | 
 10 | Note: due to the usage of `mmap` resize functionality this library cannot be locally installed on a mac
 11 | 
 12 | ### Docker Setup:
 13 | 
 14 | Before the first usage, or anytime a new library/dependency is changed a new docker image must be created using:
 15 | ```shell
 16 | docker build --target refextract-tests -t refextract .
 17 | ```
 18 | 
 19 | After that, spin up a `refextract` service with:
 20 | ```shell
 21 | docker run -it -d -p 5000:5000 -v ./tests:/refextract/tests -v ./refextract:/refextract/refextract --name refextract refextract
 22 | ```
 23 | 
 24 | ### Running tests
 25 | 
 26 | Exec into the container via
 27 | ```shell
 28 | docker exec -it refextract /bin/bash
 29 | ```
 30 | Then simply run
 31 | ```shell
 32 | pytest .
 33 | ```
 34 | 
 35 | ## Usage
 36 | 
 37 | To get structured information from a publication reference:
 38 | 
 39 | 
 40 | ``` python
 41 | >>> from refextract import extract_journal_reference
 42 | >>> reference = extract_journal_reference('J.Phys.,A39,13445')
 43 | >>> print(reference)
 44 | {
 45 | 'extra_ibids': [],
 46 | 'is_ibid': False,
 47 | 'misc_txt': '',
 48 | 'page': '13445',
 49 | 'title': 'J. Phys.',
 50 | 'type': 'JOURNAL',
 51 | 'volume': 'A39',
 52 | 'year': '',
 53 | 
 54 | }
 55 | ```
 56 | 
 57 | To extract references from a PDF:
 58 | ``` python
 59 | >>> from refextract import extract_references_from_file
 60 | >>> references = extract_references_from_file('1503.07589.pdf')
 61 | >>> print(references[0])
 62 | {
 63 | 'author': ['F. Englert and R. Brout'],
 64 | 'doi': ['doi:10.1103/PhysRevLett.13.321'],
 65 | 'journal_page': ['321'],
 66 | 'journal_reference': ['Phys. Rev. Lett. 13 (1964) 321'],
 67 | 'journal_title': ['Phys. Rev. Lett.'],
 68 | 'journal_volume': ['13'],
 69 | 'journal_year': ['1964'],
 70 | 'linemarker': ['1'],
 71 | 'raw_ref': ['[1] F. Englert and R. Brout, \u201cBroken symmetry and the mass of gauge vector mesons\u201d, Phys. Rev. Lett. 13 (1964) 321, doi:10.1103/PhysRevLett.13.321.'],
 72 | 'texkey': ['Englert:1964et'],
 73 | 'year': ['1964'],
 74 | }
 75 | ```
 76 | 
 77 | To extract directly from a URL:
 78 | ``` python
 79 | >>> from refextract import extract_references_from_url
 80 | >>> references = extract_references_from_url('https://arxiv.org/pdf/1503.07589.pdf')
 81 | >>> print(references[0])
 82 | {
 83 | 'author': ['F. Englert and R. Brout'],
 84 | 'doi': ['doi:10.1103/PhysRevLett.13.321'],
 85 | 'journal_page': ['321'],
 86 | 'journal_reference': ['Phys. Rev. Lett. 13 (1964) 321'],
 87 | 'journal_title': ['Phys. Rev. Lett.'],
 88 | 'journal_volume': ['13'],
 89 | 'journal_year': ['1964'],
 90 | 'linemarker': ['1'],
 91 | 'raw_ref': ['[1] F. Englert and R. Brout, \u201cBroken symmetry and the mass of gauge vector mesons\u201d, Phys. Rev. Lett. 13 (1964) 321, doi:10.1103/PhysRevLett.13.321.'],
 92 | 'texkey': ['Englert:1964et'],
 93 | 'year': ['1964'],
 94 | 
 95 | }
 96 | 
 97 | ```
 98 | 
 99 | ## Notes
100 | `refextract` depends on
101 | 
102 | [pdftotext](http://linux.die.net/man/1/pdftotext).
103 | 
104 | ## Acknowledgments
105 | 
106 | `refextract` is based on code and ideas from the following people, who
107 | 
108 | contributed to the `docextract` module in Invenio:
109 | - Alessio Deiana
110 | - Federico Poli
111 | - Gerrit Rindermann
112 | - Graham R. Armstrong
113 | - Grzegorz Szpura
114 | - Jan Aage Lavik
115 | - Javier Martin Montull
116 | - Micha Moskovic
117 | - Samuele Kaplun
118 | - Thorsten Schwander
119 | - Tibor Simko
120 | 
121 | ## License
122 | GPLv2
123 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "refextract"
 3 | version = "0.1.0"
 4 | description = "Small library for extracting references used in scholarly communication."
 5 | readme = "README.md"
 6 | homepage = "https://github.com/inspirehep/refextract"
 7 | license = "GPL-2.0-or-later"
 8 | authors = [
 9 |     "CERN <admin@inspirehep.net>"
10 | ]
11 | classifiers = [
12 |     "Development Status :: 4 - Beta",
13 |     "Environment :: Console",
14 |     "Intended Audience :: Developers",
15 |     "Intended Audience :: Science/Research",
16 |     "License :: OSI Approved :: GNU General Public License v2 (GPLv2)",
17 |     "Operating System :: OS Independent",
18 |     "Programming Language :: Python",
19 |     "Programming Language :: Python :: 3",
20 |     "Programming Language :: Python :: 3.6",
21 |     "Programming Language :: Python :: 3.7",
22 |     "Programming Language :: Python :: 3.8",
23 |     "Topic :: Scientific/Engineering :: Information Analysis",
24 |     "Topic :: Software Development :: Libraries",
25 |     "Topic :: Software Development :: Libraries :: Python Modules",
26 |     "Topic :: Utilities",
27 | ]
28 | 
29 | 
30 | [tool.poetry.dependencies]
31 | python = ">=3.11,<4"
32 | unidecode = ">=1.0.22,~=1.0"
33 | Flask = ">=2.0.3"
34 | webargs = ">=8.0,~=8.0"
35 | prometheus-flask-exporter = ">=0.23.2,~=0.23"
36 | gunicorn = "^23.0.0"
37 | python-magic = "^0.4.27"
38 | inspire-utils = "^3.0.61"
39 | requests = "^2.32.3"
40 | pypdf = "^5.4.0"
41 | 
42 | 
43 | [tool.poetry.group.dev.dependencies]
44 | mock = "^5.2.0"
45 | responses =">=0.25.7,~=0.25"
46 | pytest = "^8.3.3"
47 | pytest-cov = "^6.0.0"
48 | ipdb = "^0.13.9"
49 | 
50 | [tool.coverage.run]
51 | include = ["refextract/*.py"]
52 | 
53 | [tool.pytest.ini_options]
54 | addopts = "--cov=refextract --cov-report=term-missing:skip-covered"
55 | 
56 | [build-system]
57 | requires = ["poetry-core>=1.0.0"]
58 | build-backend = "poetry.core.masonry.api"
59 | 


--------------------------------------------------------------------------------
/refextract/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of refextract.
 4 | # Copyright (C) 2015, 2016, 2018, 2020 CERN.
 5 | #
 6 | # refextract is free software; you can redistribute it and/or
 7 | # modify it under the terms of the GNU General Public License as
 8 | # published by the Free Software Foundation; either version 2 of the
 9 | # License, or (at your option) any later version.
10 | #
11 | # refextract is distributed in the hope that it will be useful, but
12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 | # General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
19 | #
20 | # In applying this license, CERN does not waive the privileges and immunities
21 | # granted to it by virtue of its status as an Intergovernmental Organization
22 | # or submit itself to any jurisdiction.
23 | 
24 | """Refextract."""
25 | 
26 | from refextract.references.api import (
27 |     extract_journal_reference,
28 |     extract_references_from_file,
29 |     extract_references_from_string,
30 |     extract_references_from_url,
31 | )
32 | 
33 | __all__ = (
34 |     "extract_journal_reference",
35 |     "extract_references_from_file",
36 |     "extract_references_from_string",
37 |     "extract_references_from_url",
38 | )
39 | 


--------------------------------------------------------------------------------
/refextract/app.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | 
  3 | from flask import Flask, jsonify, make_response
  4 | from prometheus_flask_exporter.multiprocess import GunicornInternalPrometheusMetrics
  5 | from webargs import fields
  6 | from webargs.flaskparser import FlaskParser
  7 | 
  8 | from refextract.references.api import (
  9 |     extract_journal_reference,
 10 |     extract_references_from_string,
 11 |     extract_references_from_url,
 12 | )
 13 | 
 14 | parser = FlaskParser()
 15 | 
 16 | LOGGER = logging.getLogger(__name__)
 17 | 
 18 | 
 19 | def create_app():
 20 |     app = Flask(__name__)
 21 |     app.config.from_pyfile("config.cfg", silent=True)
 22 | 
 23 |     @app.route("/extract_journal_info", methods=["POST"])
 24 |     @parser.use_args(
 25 |         {
 26 |             "publication_infos": fields.List(fields.Dict, required=True),
 27 |             "journal_kb_data": fields.Dict(required=True),
 28 |         },
 29 |         location="json",
 30 |     )
 31 |     def extract_journal_info(args):
 32 |         publication_infos = args.pop("publication_infos")
 33 |         journal_kb_data = args.pop("journal_kb_data")
 34 |         extracted_publication_infos = []
 35 |         journal_dict = {"journals": journal_kb_data}
 36 |         try:
 37 |             for publication_info in publication_infos:
 38 |                 if not publication_info.get("pubinfo_freetext"):
 39 |                     extracted_publication_infos.append({})
 40 |                     continue
 41 |                 extracted_publication_info = extract_journal_reference(
 42 |                     publication_info["pubinfo_freetext"],
 43 |                     override_kbs_files=journal_dict,
 44 |                 )
 45 |                 if not extracted_publication_info:
 46 |                     extracted_publication_info = {}
 47 |                 extracted_publication_infos.append(extracted_publication_info)
 48 |         except Exception as e:
 49 |             return make_response(
 50 |                 jsonify(
 51 |                     {
 52 |                         "message": f"Can not extract publication info data."
 53 |                         f" Reason: {str(e)}"
 54 |                     }
 55 |                 ),
 56 |                 500,
 57 |             )
 58 |         return jsonify({"extracted_publication_infos": extracted_publication_infos})
 59 | 
 60 |     @app.route("/extract_references_from_text", methods=["POST"])
 61 |     @parser.use_args(
 62 |         {
 63 |             "text": fields.String(required=True),
 64 |             "journal_kb_data": fields.Dict(required=True),
 65 |         },
 66 |         location="json",
 67 |     )
 68 |     def extract_references_from_text(args):
 69 |         text = args.pop("text")
 70 |         journal_kb_data = args.pop("journal_kb_data")
 71 |         journal_dict = {"journals": journal_kb_data}
 72 |         try:
 73 |             extracted_references = extract_references_from_string(
 74 |                 text,
 75 |                 override_kbs_files=journal_dict,
 76 |                 reference_format="{title},{volume},{page}",
 77 |             )
 78 |         except Exception as e:
 79 |             return make_response(
 80 |                 jsonify({"message": f"Can not extract references. Reason: {str(e)}"}),
 81 |                 500,
 82 |             )
 83 |         return jsonify({"extracted_references": extracted_references})
 84 | 
 85 |     @app.route("/extract_references_from_url", methods=["POST"])
 86 |     @parser.use_args(
 87 |         {
 88 |             "url": fields.String(required=True),
 89 |             "journal_kb_data": fields.Dict(required=True),
 90 |         },
 91 |         location="json",
 92 |     )
 93 |     def extract_references_from_file_url(args):
 94 |         url = args.pop("url")
 95 |         journal_kb_data = args.pop("journal_kb_data")
 96 |         journal_dict = {"journals": journal_kb_data}
 97 |         try:
 98 |             extracted_references = extract_references_from_url(
 99 |                 url,
100 |                 **{
101 |                     "override_kbs_files": journal_dict,
102 |                     "reference_format": "{title},{volume},{page}",
103 |                 },
104 |             )
105 |         except Exception as e:
106 |             return make_response(
107 |                 jsonify({"message": f"Can not extract references. Reason: {str(e)}"}),
108 |                 500,
109 |             )
110 |         return jsonify({"extracted_references": extracted_references})
111 | 
112 |     @app.route("/extract_references_from_list", methods=["POST"])
113 |     @parser.use_args(
114 |         {
115 |             "raw_references": fields.List(fields.String, required=True),
116 |             "journal_kb_data": fields.Dict(required=True),
117 |         },
118 |         location="json",
119 |     )
120 |     def extract_references_from_list(args):
121 |         references = args.pop("raw_references")
122 |         journal_kb_data = args.pop("journal_kb_data")
123 |         journal_dict = {"journals": journal_kb_data}
124 |         extracted_references = []
125 |         for reference in references:
126 |             try:
127 |                 extracted_reference = extract_references_from_string(
128 |                     reference,
129 |                     override_kbs_files=journal_dict,
130 |                     reference_format="{title},{volume},{page}",
131 |                 )
132 |                 if extracted_reference:
133 |                     extracted_references.append(extracted_reference[0])
134 |                 else:
135 |                     extracted_references.append({"raw_ref": [reference]})
136 |             except Exception as e:
137 |                 LOGGER.error(
138 |                     f"Failed to extract reference: {reference}. Reason: {str(e)}"
139 |                 )
140 |                 extracted_references.append({"raw_ref": [reference]})
141 |         return jsonify({"extracted_references": extracted_references})
142 | 
143 |     return app
144 | 
145 | 
146 | app = create_app()
147 | 
148 | if app.config.get("PROMETHEUS_ENABLE_EXPORTER_FLASK"):
149 |     LOGGER.info("Starting prometheus metrics exporter")
150 |     metrics = GunicornInternalPrometheusMetrics.for_app_factory()
151 |     metrics.init_app(app)
152 | 
153 | if __name__ == "__main__":
154 |     app.run(host="0.0.0.0")
155 | 


--------------------------------------------------------------------------------
/refextract/authors/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of refextract.
 4 | # Copyright (C) 2015, 2018, 2020 CERN.
 5 | #
 6 | # refextract is free software; you can redistribute it and/or
 7 | # modify it under the terms of the GNU General Public License as
 8 | # published by the Free Software Foundation; either version 2 of the
 9 | # License, or (at your option) any later version.
10 | #
11 | # refextract is distributed in the hope that it will be useful, but
12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 | # General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
19 | #
20 | # In applying this license, CERN does not waive the privileges and immunities
21 | # granted to it by virtue of its status as an Intergovernmental Organization
22 | # or submit itself to any jurisdiction.
23 | 


--------------------------------------------------------------------------------
/refextract/config.cfg:
--------------------------------------------------------------------------------
1 | FILES_DOWNLOAD_MAX_RETRIES = 3
2 | FILES_DOWNLOAD_TIMEOUT = 60
3 | PROMETHEUS_ENABLE_EXPORTER_FLASK = False
4 | 


--------------------------------------------------------------------------------
/refextract/documents/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of refextract.
 4 | # Copyright (C) 2015, 2018, 2020 CERN.
 5 | #
 6 | # refextract is free software; you can redistribute it and/or
 7 | # modify it under the terms of the GNU General Public License as
 8 | # published by the Free Software Foundation; either version 2 of the
 9 | # License, or (at your option) any later version.
10 | #
11 | # refextract is distributed in the hope that it will be useful, but
12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 | # General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
19 | #
20 | # In applying this license, CERN does not waive the privileges and immunities
21 | # granted to it by virtue of its status as an Intergovernmental Organization
22 | # or submit itself to any jurisdiction.
23 | 


--------------------------------------------------------------------------------
/refextract/documents/pdf.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of refextract.
 4 | # Copyright (C) 2013, 2015, 2016, 2018, 2020 CERN.
 5 | #
 6 | # refextract is free software; you can redistribute it and/or
 7 | # modify it under the terms of the GNU General Public License as
 8 | # published by the Free Software Foundation; either version 2 of the
 9 | # License, or (at your option) any later version.
10 | #
11 | # refextract is distributed in the hope that it will be useful, but
12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 | # General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
19 | #
20 | # In applying this license, CERN does not waive the privileges and immunities
21 | # granted to it by virtue of its status as an Intergovernmental Organization
22 | # or submit itself to any jurisdiction.
23 | 
24 | """
25 | When a document is converted to plain-text from PDF,
26 | certain characters may result in the plain-text, that are
27 | either unwanted, or broken. These characters need to be corrected
28 | or removed. Examples are, certain control characters that would
29 | be illegal in XML and must be removed; TeX ligatures (etc); broken
30 | accents such as umlauts on letters that must be corrected.
31 | This function returns a dictionary of (unwanted) characters to look
32 | for and the characters that should be used to replace them.
33 | @return: (dictionary) - { seek -> replace, } or charsacters to
34 | replace in plain-text.
35 | """
36 | 
37 | import logging
38 | import os
39 | import re
40 | import subprocess
41 | 
42 | from refextract.references.config import CFG_PATH_PDFTOTEXT
43 | 
44 | LOGGER = logging.getLogger(__name__)
45 | 
46 | 
47 | def convert_PDF_to_plaintext(fpath, keep_layout=False):
48 |     """Convert PDF to txt using pdftotext
49 | 
50 |     Take the path to a PDF file and run pdftotext for this file, capturing
51 |     the output.
52 |     @param fpath: (string) path to the PDF file
53 |     @return: (list) of unicode strings (contents of the PDF file translated
54 |     into plaintext; each string is a line in the document.)
55 |     """
56 |     if not os.path.isfile(CFG_PATH_PDFTOTEXT):
57 |         raise IOError("Missing pdftotext executable")
58 | 
59 |     layout_option = "-layout" if keep_layout else "-raw"
60 |     doclines = []
61 |     # Pattern to check for lines with a leading page-break character.
62 |     # If this pattern is matched, we want to split the page-break into
63 |     # its own line because we rely upon this for trying to strip headers
64 |     # and footers, and for some other pattern matching.
65 |     p_break_in_line = re.compile(r"^\s*\f(.+)$", re.UNICODE)
66 |     # build pdftotext command:
67 |     cmd_pdftotext = [
68 |         CFG_PATH_PDFTOTEXT,
69 |         layout_option,
70 |         "-q",
71 |         "-enc",
72 |         "UTF-8",
73 |         fpath,
74 |         "-",
75 |     ]
76 | 
77 |     LOGGER.debug("%s", " ".join(cmd_pdftotext))
78 |     # open pipe to pdftotext:
79 |     pipe_pdftotext = subprocess.Popen(cmd_pdftotext, stdout=subprocess.PIPE)
80 |     # read back results:
81 |     for docline in pipe_pdftotext.stdout:
82 |         unicodeline = docline.decode("utf-8")
83 |         # Check for a page-break in this line:
84 |         m_break_in_line = p_break_in_line.match(unicodeline)
85 |         if m_break_in_line is None:
86 |             # There was no page-break in this line. Just add the line:
87 |             doclines.append(unicodeline)
88 |         else:
89 |             # If there was a page-break character in the same line as some
90 |             # text, split it out into its own line so that we can later
91 |             # try to find headers and footers:
92 |             doclines.append("\f")
93 |             doclines.append(m_break_in_line.group(1))
94 | 
95 |     LOGGER.debug("convert_PDF_to_plaintext found: %s lines of text", len(doclines))
96 | 
97 |     return doclines
98 | 


--------------------------------------------------------------------------------
/refextract/documents/text.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of refextract.
  4 | # Copyright (C) 2013, 2015, 2016, 2018, 2020 CERN.
  5 | #
  6 | # refextract is free software; you can redistribute it and/or
  7 | # modify it under the terms of the GNU General Public License as
  8 | # published by the Free Software Foundation; either version 2 of the
  9 | # License, or (at your option) any later version.
 10 | #
 11 | # refextract is distributed in the hope that it will be useful, but
 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14 | # General Public License for more details.
 15 | #
 16 | # You should have received a copy of the GNU General Public License
 17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 19 | #
 20 | # In applying this license, CERN does not waive the privileges and immunities
 21 | # granted to it by virtue of its status as an Intergovernmental Organization
 22 | # or submit itself to any jurisdiction.
 23 | 
 24 | """Various utilities to manipulate or clean text"""
 25 | 
 26 | import re
 27 | 
 28 | re_space_comma = re.compile(r"\s,", re.UNICODE)
 29 | re_space_semicolon = re.compile(r"\s;", re.UNICODE)
 30 | re_space_period = re.compile(r"\s\.", re.UNICODE)
 31 | re_colon_space_colon = re.compile(r":\s:", re.UNICODE)
 32 | re_comma_space_colon = re.compile(r",\s:", re.UNICODE)
 33 | re_space_closing_square_bracket = re.compile(r"\s\]", re.UNICODE)
 34 | re_opening_square_bracket_space = re.compile(r"\[\s", re.UNICODE)
 35 | re_hyphens = re.compile(
 36 |     rb"(\\255|\u02D7|\u0335|\u0336|\u2212|\u002D|\uFE63|\uFF0D)".decode(
 37 |         "raw_unicode_escape"
 38 |     ),
 39 |     re.UNICODE,
 40 | )
 41 | re_multiple_space = re.compile(r"\s{2,}", re.UNICODE)
 42 | 
 43 | re_group_captured_multiple_space = re.compile(r"(\s{2,})", re.UNICODE)
 44 | 
 45 | 
 46 | def get_url_repair_patterns():
 47 |     """Initialise and return a list of precompiled regexp patterns that
 48 |     are used to try to re-assemble URLs that have been broken during
 49 |     a document's conversion to plain-text.
 50 |     @return: (list) of compiled re regexp patterns used for finding
 51 |      various broken URLs.
 52 |     """
 53 |     file_types_list = [
 54 |         r"h\s*t\s*m",  # htm
 55 |         r"h\s*t\s*m\s*l",  # html
 56 |         r"t\s*x\s*t",  # txt
 57 |         r"p\s*h\s*p",  # php
 58 |         r"a\s*s\s*p\s*",  # asp
 59 |         r"j\s*s\s*p",  # jsp
 60 |         r"p\s*y",  # py (python)
 61 |         r"p\s*l",  # pl (perl)
 62 |         r"x\s*m\s*l",  # xml
 63 |         r"j\s*p\s*g",  # jpg
 64 |         r"g\s*i\s*f",  # gif
 65 |         r"m\s*o\s*v",  # mov
 66 |         r"s\s*w\s*f",  # swf
 67 |         r"p\s*d\s*f",  # pdf
 68 |         r"p\s*s",  # ps
 69 |         r"d\s*o\s*c",  # doc
 70 |         r"t\s*e\s*x",  # tex
 71 |         r"s\s*h\s*t\s*m\s*l",  # shtml
 72 |     ]
 73 | 
 74 |     pattern_list = [
 75 |         r"(h\s*t\s*t\s*p\s*\:\s*\/\s*\/)",
 76 |         r"(f\s*t\s*p\s*\:\s*\/\s*\/\s*)",
 77 |         r"((http|ftp):\/\/\s*[\w\d])",
 78 |         r"((http|ftp):\/\/([\w\d\s\._\-])+?\s*\/)",
 79 |         r"((http|ftp):\/\/([\w\d\_\.\-])+\/(([\w\d\_\s\.\-])+?\/)+)",
 80 |         r"((http|ftp):\/\/([\w\d\_\.\-])+\/(([\w\d\_\s\.\-])+?\/)*([\w\d\_\s\-]+\.\s?[\w\d]+))",
 81 |     ]
 82 |     pattern_list = [re.compile(p, re.I | re.UNICODE) for p in pattern_list]
 83 | 
 84 |     # some possible endings for URLs:
 85 |     p = r"((http|ftp):\/\/([\w\d\_\.\-])+\/(([\w\d\_\.\-])+?\/)*([\w\d\_\-]+\.%s))"
 86 |     for extension in file_types_list:
 87 |         p_url = re.compile(p % extension, re.I | re.UNICODE)
 88 |         pattern_list.append(p_url)
 89 | 
 90 |     # if url last thing in line, and only 10 letters max, concat them
 91 |     p_url = re.compile(
 92 |         r"((http|ftp):\/\/([\w\d\_\.\-])+\/(([\w\d\_\.\-])+?\/)*\s*?([\w\d\_\.\-]\s?){1,10}\s*)$",
 93 |         re.I | re.UNICODE,
 94 |     )
 95 |     pattern_list.append(p_url)
 96 | 
 97 |     return pattern_list
 98 | 
 99 | 
100 | # a list of patterns used to try to repair broken URLs within reference lines:
101 | re_list_url_repair_patterns = get_url_repair_patterns()
102 | 
103 | 
104 | def join_lines(line1, line2):
105 |     """Join 2 lines of text
106 | 
107 |     >>> join_lines('abc', 'de')
108 |     'abcde'
109 |     >>> join_lines('a-', 'b')
110 |     'ab'
111 |     """
112 |     if line1 == "":
113 |         pass
114 |     elif line1[-1] == "-":
115 |         # hyphenated word at the end of the
116 |         # line - don't add in a space and remove hyphen
117 |         line1 = line1[:-1]
118 |     elif line1[-1] != " ":
119 |         # no space at the end of this
120 |         # line, add in a space
121 |         line1 = line1 + " "
122 |     return line1 + line2
123 | 
124 | 
125 | def repair_broken_urls(line):
126 |     """Attempt to repair broken URLs in a line of text.
127 | 
128 |     E.g.: remove spaces from the middle of a URL; something like that.
129 | 
130 |     @param line: (string) the line in which to check for broken URLs.
131 |     @return: (string) the line after any broken URLs have been repaired.
132 |     """
133 | 
134 |     def _chop_spaces_in_url_match(m):
135 |         """Suppresses spaces in a matched URL."""
136 |         return m.group(1).replace(" ", "")
137 | 
138 |     for ptn in re_list_url_repair_patterns:
139 |         line = ptn.sub(_chop_spaces_in_url_match, line)
140 |     return line
141 | 
142 | 
143 | def remove_and_record_multiple_spaces_in_line(line):
144 |     """For a given string, locate all ocurrences of multiple spaces
145 |     together in the line, record the number of spaces found at each
146 |     position, and replace them with a single space.
147 |     @param line: (string) the text line to be processed for multiple
148 |      spaces.
149 |     @return: (tuple) countaining a dictionary and a string. The
150 |      dictionary contains information about the number of spaces removed
151 |      at given positions in the line. For example, if 3 spaces were
152 |      removed from the line at index '22', the dictionary would be set
153 |      as follows: { 22 : 3 }
154 |      The string that is also returned in this tuple is the line after
155 |      multiple-space ocurrences have replaced with single spaces.
156 |     """
157 |     removed_spaces = {}
158 |     # get a collection of match objects for all instances of
159 |     # multiple-spaces found in the line:
160 |     multispace_matches = re_group_captured_multiple_space.finditer(line)
161 |     # record the number of spaces found at each match position:
162 |     for multispace in multispace_matches:
163 |         removed_spaces[multispace.start()] = multispace.end() - multispace.start() - 1
164 |     # now remove the multiple-spaces from the line, replacing with a
165 |     # single space at each position:
166 |     line = re_group_captured_multiple_space.sub(" ", line)
167 |     return (removed_spaces, line)
168 | 
169 | 
170 | def wash_line(line):
171 |     """Wash a text line of certain punctuation errors, replacing them with
172 |     more correct alternatives.  E.g.: the string 'Yes , I like python.'
173 |     will be transformed into 'Yes, I like python.'
174 |     @param line: (string) the line to be washed.
175 |     @return: (string) the washed line.
176 |     """
177 |     line = re_space_comma.sub(",", line)
178 |     line = re_space_semicolon.sub(";", line)
179 |     line = re_space_period.sub(".", line)
180 |     line = re_colon_space_colon.sub(":", line)
181 |     line = re_comma_space_colon.sub(":", line)
182 |     line = re_space_closing_square_bracket.sub("]", line)
183 |     line = re_opening_square_bracket_space.sub("[", line)
184 |     line = re_hyphens.sub("-", line)
185 |     line = re_multiple_space.sub(" ", line)
186 |     return line
187 | 
188 | 
189 | def remove_page_boundary_lines(docbody):
190 |     """Try to locate page breaks, headers and footers within a document body,
191 |     and remove the array cells at which they are found.
192 |     @param docbody: (list) of strings, each string being a line in the
193 |      document's body.
194 |     @return: (list) of strings. The document body, hopefully with page-
195 |      breaks, headers and footers removed. Each string in the list once more
196 |      represents a line in the document.
197 |     """
198 |     number_head_lines = number_foot_lines = 0
199 |     # Make sure document not just full of whitespace:
200 |     if not document_contains_text(docbody):
201 |         # document contains only whitespace - cannot safely
202 |         # strip headers/footers
203 |         return docbody
204 | 
205 |     # Get list of index posns of pagebreaks in document:
206 |     page_break_posns = get_page_break_positions(docbody)
207 | 
208 |     # Get num lines making up each header if poss:
209 |     number_head_lines = get_number_header_lines(docbody, page_break_posns)
210 | 
211 |     # Get num lines making up each footer if poss:
212 |     number_foot_lines = get_number_footer_lines(docbody, page_break_posns)
213 | 
214 |     # Remove pagebreaks,headers,footers:
215 |     docbody = strip_headers_footers_pagebreaks(
216 |         docbody, page_break_posns, number_head_lines, number_foot_lines
217 |     )
218 | 
219 |     return docbody
220 | 
221 | 
222 | def document_contains_text(docbody):
223 |     """Test whether document contains text, or is just full of worthless
224 |     whitespace.
225 |     @param docbody: (list) of strings - each string being a line of the
226 |      document's body
227 |     @return: (integer) 1 if non-whitespace found in document; 0 if only
228 |      whitespace found in document.
229 |     """
230 |     found_non_space = 0
231 |     for line in docbody:
232 |         if not line.isspace():
233 |             # found a non-whitespace character in this line
234 |             found_non_space = 1
235 |             break
236 |     return found_non_space
237 | 
238 | 
239 | def get_page_break_positions(docbody):
240 |     """Locate page breaks in the list of document lines and create a list
241 |     positions in the document body list.
242 |     @param docbody: (list) of strings - each string is a line in the
243 |      document.
244 |     @return: (list) of integer positions, whereby each integer represents the
245 |      position (in the document body) of a page-break.
246 |     """
247 |     page_break_posns = []
248 |     p_break = re.compile(r"^\s*\f\s*$", re.UNICODE)
249 |     num_document_lines = len(docbody)
250 |     for i in range(num_document_lines):
251 |         if p_break.match(docbody[i]) is not None:
252 |             page_break_posns.append(i)
253 |     return page_break_posns
254 | 
255 | 
256 | def get_number_header_lines(docbody, page_break_posns):
257 |     """Try to guess the number of header lines each page of a document has.
258 |     The positions of the page breaks in the document are used to try to guess
259 |     the number of header lines.
260 |     @param docbody: (list) of strings - each string being a line in the
261 |      document
262 |     @param page_break_posns: (list) of integers - each integer is the
263 |      position of a page break in the document.
264 |     @return: (int) the number of lines that make up the header of each page.
265 |     """
266 |     remaining_breaks = len(page_break_posns) - 1
267 |     num_header_lines = empty_line = 0
268 |     # pattern to search for a word in a line:
269 |     p_wordSearch = re.compile(r"([A-Za-z0-9-]+)", re.UNICODE)
270 |     if remaining_breaks > 2:
271 |         next_head = 2 if remaining_breaks > 3 else 1
272 |         keep_checking = 1
273 |         while keep_checking:
274 |             cur_break = 1
275 |             if docbody[(page_break_posns[cur_break] + num_header_lines + 1)].isspace():
276 |                 # this is a blank line
277 |                 empty_line = 1
278 | 
279 |             if (page_break_posns[cur_break] + num_header_lines + 1) == (
280 |                 page_break_posns[(cur_break + 1)]
281 |             ):
282 |                 # Have reached next page-break: document has no
283 |                 # body - only head/footers!
284 |                 keep_checking = 0
285 | 
286 |             grps_headLineWords = p_wordSearch.findall(
287 |                 docbody[(page_break_posns[cur_break] + num_header_lines + 1)]
288 |             )
289 |             cur_break = cur_break + next_head
290 |             while (cur_break < remaining_breaks) and keep_checking:
291 |                 lineno = page_break_posns[cur_break] + num_header_lines + 1
292 |                 if lineno >= len(docbody):
293 |                     keep_checking = 0
294 |                     break
295 |                 grps_thisLineWords = p_wordSearch.findall(docbody[lineno])
296 |                 if empty_line:
297 |                     if len(grps_thisLineWords) != 0:
298 |                         # This line should be empty, but isn't
299 |                         keep_checking = 0
300 |                 else:
301 |                     if (len(grps_thisLineWords) == 0) or (
302 |                         len(grps_headLineWords) != len(grps_thisLineWords)
303 |                     ):
304 |                         # Not same num 'words' as equivilent line
305 |                         # in 1st header:
306 |                         keep_checking = 0
307 |                     else:
308 |                         keep_checking = check_boundary_lines_similar(
309 |                             grps_headLineWords, grps_thisLineWords
310 |                         )
311 |                 # Update cur_break for nxt line to check
312 |                 cur_break = cur_break + next_head
313 |             if keep_checking:
314 |                 # Line is a header line: check next
315 |                 num_header_lines = num_header_lines + 1
316 |             empty_line = 0
317 |     return num_header_lines
318 | 
319 | 
320 | def get_number_footer_lines(docbody, page_break_posns):
321 |     """Try to guess the number of footer lines each page of a document has.
322 |     The positions of the page breaks in the document are used to try to guess
323 |     the number of footer lines.
324 |     @param docbody: (list) of strings - each string being a line in the
325 |      document
326 |     @param page_break_posns: (list) of integers - each integer is the
327 |      position of a page break in the document.
328 |     @return: (int) the number of lines that make up the footer of each page.
329 |     """
330 |     num_breaks = len(page_break_posns)
331 |     num_footer_lines = 0
332 |     empty_line = 0
333 |     keep_checking = 1
334 |     p_wordSearch = re.compile(r"([A-Za-z0-9-]+)", re.UNICODE)
335 |     if num_breaks > 2:
336 |         while keep_checking:
337 |             cur_break = 1
338 |             if (
339 |                 page_break_posns[cur_break] - num_footer_lines - 1 < 0
340 |                 or page_break_posns[cur_break] - num_footer_lines - 1 > len(docbody) - 1
341 |             ):
342 |                 # Be sure that the docbody list boundary wasn't overstepped:
343 |                 break
344 |             if docbody[(page_break_posns[cur_break] - num_footer_lines - 1)].isspace():
345 |                 empty_line = 1
346 |             grps_headLineWords = p_wordSearch.findall(
347 |                 docbody[(page_break_posns[cur_break] - num_footer_lines - 1)]
348 |             )
349 |             cur_break = cur_break + 1
350 |             while (cur_break < num_breaks) and keep_checking:
351 |                 grps_thisLineWords = p_wordSearch.findall(
352 |                     docbody[(page_break_posns[cur_break] - num_footer_lines - 1)]
353 |                 )
354 |                 if empty_line:
355 |                     if len(grps_thisLineWords) != 0:
356 |                         # this line should be empty, but isn't
357 |                         keep_checking = 0
358 |                 else:
359 |                     if (len(grps_thisLineWords) == 0) or (
360 |                         len(grps_headLineWords) != len(grps_thisLineWords)
361 |                     ):
362 |                         # Not same num 'words' as equivilent line
363 |                         # in 1st footer:
364 |                         keep_checking = 0
365 |                     else:
366 |                         keep_checking = check_boundary_lines_similar(
367 |                             grps_headLineWords, grps_thisLineWords
368 |                         )
369 |                 # Update cur_break for nxt line to check
370 |                 cur_break = cur_break + 1
371 |             if keep_checking:
372 |                 # Line is a footer line: check next
373 |                 num_footer_lines = num_footer_lines + 1
374 |             empty_line = 0
375 |     return num_footer_lines
376 | 
377 | 
378 | def strip_headers_footers_pagebreaks(
379 |     docbody, page_break_posns, num_head_lines, num_foot_lines
380 | ):
381 |     """Remove page-break lines, header lines, and footer lines from the
382 |     document.
383 |     @param docbody: (list) of strings, whereby each string in the list is a
384 |      line in the document.
385 |     @param page_break_posns: (list) of integers, whereby each integer
386 |      represents the index in docbody at which a page-break is found.
387 |     @param num_head_lines: (int) the number of header lines each page in the
388 |      document has.
389 |     @param num_foot_lines: (int) the number of footer lines each page in the
390 |      document has.
391 |     @return: (list) of strings - the document body after the headers,
392 |      footers, and page-break lines have been stripped from the list.
393 |     """
394 |     num_breaks = len(page_break_posns)
395 |     page_lens = []
396 |     for x in range(0, num_breaks):
397 |         if x < num_breaks - 1:
398 |             page_lens.append(page_break_posns[x + 1] - page_break_posns[x])
399 |     page_lens.sort()
400 |     if (len(page_lens) > 0) and (num_head_lines + num_foot_lines + 1 < page_lens[0]):
401 |         # Safe to chop hdrs & ftrs
402 |         page_break_posns.reverse()
403 |         first = 1
404 |         for i in range(0, len(page_break_posns)):
405 |             # Unless this is the last page break, chop headers
406 |             if not first:
407 |                 for _dummy in range(1, num_head_lines + 1):
408 |                     docbody[page_break_posns[i] + 1 : page_break_posns[i] + 2] = []
409 |             else:
410 |                 first = 0
411 |             # Chop page break itself
412 |             docbody[page_break_posns[i] : page_break_posns[i] + 1] = []
413 |             # Chop footers (unless this is the first page break)
414 |             if i != len(page_break_posns) - 1:
415 |                 for _dummy in range(1, num_foot_lines + 1):
416 |                     docbody[
417 |                         page_break_posns[i] - num_foot_lines : page_break_posns[i]
418 |                         - num_foot_lines
419 |                         + 1
420 |                     ] = []
421 |     return docbody
422 | 
423 | 
424 | def check_boundary_lines_similar(l_1, l_2):
425 |     """Compare two lists to see if their elements are roughly the same.
426 |     @param l_1: (list) of strings.
427 |     @param l_2: (list) of strings.
428 |     @return: (int) 1/0.
429 |     """
430 |     num_matches = 0
431 |     if not isinstance(l_1, list) or not isinstance(l_2, list) or (len(l_1) != len(l_2)):
432 |         # these 'boundaries' are not similar
433 |         return 0
434 | 
435 |     num_elements = len(l_1)
436 |     for i in range(0, num_elements):
437 |         if l_1[i].isdigit() and l_2[i].isdigit():
438 |             # both lines are integers
439 |             num_matches += 1
440 |         else:
441 |             l1_str = l_1[i].lower()
442 |             l2_str = l_2[i].lower()
443 |             if (l1_str[0] == l2_str[0]) and (
444 |                 l1_str[len(l1_str) - 1] == l2_str[len(l2_str) - 1]
445 |             ):
446 |                 num_matches = num_matches + 1
447 |     if (len(l_1) == 0) or (float(num_matches) / float(len(l_1)) < 0.9):
448 |         return 0
449 |     else:
450 |         return 1
451 | 


--------------------------------------------------------------------------------
/refextract/references/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of refextract.
 4 | # Copyright (C) 2015, 2018, 2020 CERN.
 5 | #
 6 | # refextract is free software; you can redistribute it and/or
 7 | # modify it under the terms of the GNU General Public License as
 8 | # published by the Free Software Foundation; either version 2 of the
 9 | # License, or (at your option) any later version.
10 | #
11 | # refextract is distributed in the hope that it will be useful, but
12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 | # General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
19 | #
20 | # In applying this license, CERN does not waive the privileges and immunities
21 | # granted to it by virtue of its status as an Intergovernmental Organization
22 | # or submit itself to any jurisdiction.
23 | 


--------------------------------------------------------------------------------
/refextract/references/api.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of refextract.
  4 | # Copyright (C) 2013, 2015, 2016, 2018, 2020 CERN.
  5 | #
  6 | # refextract is free software; you can redistribute it and/or
  7 | # modify it under the terms of the GNU General Public License as
  8 | # published by the Free Software Foundation; either version 2 of the
  9 | # License, or (at your option) any later version.
 10 | #
 11 | # refextract is distributed in the hope that it will be useful, but
 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14 | # General Public License for more details.
 15 | #
 16 | # You should have received a copy of the GNU General Public License
 17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 19 | #
 20 | # In applying this license, CERN does not waive the privileges and immunities
 21 | # granted to it by virtue of its status as an Intergovernmental Organization
 22 | # or submit itself to any jurisdiction.
 23 | 
 24 | """This is where all the public API calls are accessible to extract references.
 25 | 
 26 | There are 4 API functions available to extract from PDF file, string or URL. In
 27 | addition, there is an API call to return a parsed journal reference structure
 28 | from a raw string.
 29 | """
 30 | 
 31 | import os
 32 | from tempfile import mkstemp
 33 | 
 34 | import magic
 35 | import requests
 36 | from inspire_utils.dedupers import dedupe_list
 37 | 
 38 | from refextract.references.engine import (
 39 |     get_kbs,
 40 |     get_plaintext_document_body,
 41 |     parse_reference_line,
 42 |     parse_references,
 43 | )
 44 | from refextract.references.errors import FullTextNotAvailableError
 45 | from refextract.references.find import (
 46 |     find_numeration_in_body,
 47 |     get_reference_section_beginning,
 48 | )
 49 | from refextract.references.pdf import extract_texkeys_and_urls_from_pdf
 50 | from refextract.references.record import update_reference_with_urls
 51 | from refextract.references.text import (
 52 |     extract_references_from_fulltext,
 53 |     rebuild_reference_lines,
 54 | )
 55 | 
 56 | 
 57 | def extract_references_from_url(url, headers=None, chunk_size=1024, **kwargs):
 58 |     """Extract references from the pdf specified in the url.
 59 | 
 60 |     The first parameter is the URL of the file.
 61 |     It returns a list of parsed references.
 62 | 
 63 |     It raises FullTextNotAvailableError if the URL gives a 404,
 64 |     UnknownDocumentTypeError if it is not a PDF or plain text.
 65 | 
 66 |     The standard reference format is: {title} {volume} ({year}) {page}.
 67 | 
 68 |     E.g. you can change that by passing the reference_format:
 69 | 
 70 |     >>> extract_references_from_url(path, reference_format="{title},{volume},{page}")
 71 | 
 72 |     If you want to also link each reference to some other resource (like a record),
 73 |     you can provide a linker_callback function to be executed for every reference
 74 |     element found.
 75 | 
 76 |     To override KBs for journal names etc., use ``override_kbs_files``:
 77 | 
 78 |     >>> extract_references_from_url(path,
 79 |                                     override_kbs_files={'journals': 'my/path/to.kb'})
 80 | 
 81 |     """
 82 |     # Get temporary filepath to download to
 83 |     filename, filepath = mkstemp(
 84 |         suffix="_{0}".format(os.path.basename(url)),
 85 |     )
 86 |     os.close(filename)
 87 | 
 88 |     try:
 89 |         req = requests.get(url=url, headers=headers, stream=True)
 90 |         req.raise_for_status()
 91 |         with open(filepath, "wb") as f:
 92 |             for chunk in req.iter_content(chunk_size):
 93 |                 f.write(chunk)
 94 |         references = extract_references_from_file(filepath, **kwargs)
 95 |     except requests.exceptions.HTTPError as exc:
 96 |         raise FullTextNotAvailableError(f"URL not found: '{url}'") from exc
 97 |     finally:
 98 |         os.remove(filepath)
 99 |     return references
100 | 
101 | 
102 | def extract_references_from_file(
103 |     path,
104 |     recid=None,
105 |     reference_format="{title} {volume} ({year}) {page}",
106 |     linker_callback=None,
107 |     override_kbs_files=None,
108 | ):
109 |     """Extract references from a local pdf file.
110 | 
111 |     The first parameter is the path to the file.
112 |     It returns a list of parsed references.
113 |     It raises FullTextNotAvailableError if the file does not exist,
114 |     UnknownDocumentTypeError if it is not a PDF or plain text.
115 | 
116 |     The standard reference format is: {title} {volume} ({year}) {page}.
117 | 
118 |     E.g. you can change that by passing the reference_format:
119 | 
120 |     >>> extract_references_from_file(path, reference_format=u"{title},{volume},{page}")
121 | 
122 |     If you want to also link each reference to some other resource (like a record),
123 |     you can provide a linker_callback function to be executed for every reference
124 |     element found.
125 | 
126 |     To override KBs for journal names etc., use ``override_kbs_files``:
127 | 
128 |     >>> extract_references_from_file(path,
129 |                                      override_kbs_files={'journals': 'my/path/to.kb'})
130 | 
131 |     """
132 |     if not os.path.isfile(path):
133 |         raise FullTextNotAvailableError("File not found: '{0}'".format(path))
134 | 
135 |     docbody = get_plaintext_document_body(path)
136 |     reflines, dummy, dummy = extract_references_from_fulltext(docbody)
137 |     if not reflines:
138 |         docbody = get_plaintext_document_body(path, keep_layout=True)
139 |         reflines, dummy, dummy = extract_references_from_fulltext(docbody)
140 | 
141 |     parsed_refs, stats = parse_references(
142 |         reflines,
143 |         recid=recid,
144 |         reference_format=reference_format,
145 |         linker_callback=linker_callback,
146 |         override_kbs_files=override_kbs_files,
147 |     )
148 | 
149 |     if magic.from_file(path, mime=True) == "application/pdf":
150 |         extracted_texkeys_urls = extract_texkeys_and_urls_from_pdf(path)
151 |         if len(extracted_texkeys_urls) == len(parsed_refs):
152 |             parsed_refs_updated = []
153 |             for ref, ref_texkey_urls in zip(
154 |                 parsed_refs, extracted_texkeys_urls, strict=False
155 |             ):
156 |                 update_reference_with_urls(ref, ref_texkey_urls.get("urls", []))
157 |                 if ref.get("url"):
158 |                     ref["url"] = dedupe_list(ref["url"])
159 |                 parsed_refs_updated.append(
160 |                     dict(ref, texkey=[ref_texkey_urls["texkey"]])
161 |                 )
162 | 
163 |             return parsed_refs_updated
164 |     return parsed_refs
165 | 
166 | 
167 | def extract_references_from_string(
168 |     source,
169 |     is_only_references=True,
170 |     recid=None,
171 |     reference_format="{title} {volume} ({year}) {page}",
172 |     linker_callback=None,
173 |     override_kbs_files=None,
174 | ):
175 |     """Extract references from a raw string.
176 | 
177 |     The first parameter is the path to the file.
178 |     It returns a tuple (references, stats).
179 | 
180 |     If the string does not only contain references, improve accuracy by
181 |     specifing ``is_only_references=False``.
182 | 
183 |     The standard reference format is: {title} {volume} ({year}) {page}.
184 | 
185 |     E.g. you can change that by passing the reference_format:
186 | 
187 |     >>> extract_references_from_string(path, reference_format="{title},{volume},{page}")
188 | 
189 |     If you want to also link each reference to some other resource (like a record),
190 |     you can provide a linker_callback function to be executed for every reference
191 |     element found.
192 | 
193 |     To override KBs for journal names etc., use ``override_kbs_files``:
194 | 
195 |     >>> extract_references_from_string(path,
196 |         override_kbs_files={'journals': 'my/path/to.kb'})
197 |     """
198 |     docbody = source.split("\n")
199 |     if not is_only_references:
200 |         reflines, dummy, dummy = extract_references_from_fulltext(docbody)
201 |     else:
202 |         refs_info = get_reference_section_beginning(docbody)
203 |         if not refs_info:
204 |             refs_info, dummy = find_numeration_in_body(docbody)
205 |             refs_info["start_line"] = 0
206 |             refs_info["end_line"] = (len(docbody) - 1,)
207 | 
208 |         reflines = rebuild_reference_lines(docbody, refs_info["marker_pattern"])
209 |     parsed_refs, stats = parse_references(
210 |         reflines,
211 |         recid=recid,
212 |         reference_format=reference_format,
213 |         linker_callback=linker_callback,
214 |         override_kbs_files=override_kbs_files,
215 |     )
216 |     return parsed_refs
217 | 
218 | 
219 | def extract_journal_reference(line, override_kbs_files=None):
220 |     """Extract the journal reference from string.
221 | 
222 |     Extracts the journal reference from string and parses for specific
223 |     journal information.
224 |     """
225 |     kbs = get_kbs(custom_kbs=override_kbs_files)
226 |     references, dummy_m, dummy_c, dummy_co = parse_reference_line(line, kbs)
227 | 
228 |     for elements in references:
229 |         for el in elements:
230 |             if el["type"] == "JOURNAL":
231 |                 return el
232 | 


--------------------------------------------------------------------------------
/refextract/references/config.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of refextract.
  4 | # Copyright (C) 2013, 2015, 2017, 2018, 2020 CERN.
  5 | #
  6 | # refextract is free software; you can redistribute it and/or
  7 | # modify it under the terms of the GNU General Public License as
  8 | # published by the Free Software Foundation; either version 2 of the
  9 | # License, or (at your option) any later version.
 10 | #
 11 | # refextract is distributed in the hope that it will be useful, but
 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14 | # General Public License for more details.
 15 | #
 16 | # You should have received a copy of the GNU General Public License
 17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 19 | #
 20 | # In applying this license, CERN does not waive the privileges and immunities
 21 | # granted to it by virtue of its status as an Intergovernmental Organization
 22 | # or submit itself to any jurisdiction.
 23 | 
 24 | """refextract configuration."""
 25 | 
 26 | import os
 27 | 
 28 | try:
 29 |     from shutil import which
 30 | except ImportError:
 31 |     # CPython <3.3
 32 |     from distutils.spawn import find_executable as which
 33 | 
 34 | import pkg_resources
 35 | 
 36 | # Version number:
 37 | CFG_PATH_PDFTOTEXT = os.environ.get("CFG_PATH_PDFTOTEXT", which("pdftotext"))
 38 | 
 39 | # Module config directory
 40 | CFG_KBS_DIR = pkg_resources.resource_filename("refextract.references", "kbs")
 41 | 
 42 | CFG_REFEXTRACT_KBS = {
 43 |     "journals": "%s/journal-titles.kb" % CFG_KBS_DIR,
 44 |     "journals_re": "%s/journal-titles-re.kb" % CFG_KBS_DIR,
 45 |     "report-numbers": "%s/report-numbers.kb" % CFG_KBS_DIR,
 46 |     "authors": "%s/authors.kb" % CFG_KBS_DIR,
 47 |     "collaborations": "%s/collaborations.kb" % CFG_KBS_DIR,
 48 |     "books": "%s/books.kb" % CFG_KBS_DIR,
 49 |     "publishers": "%s/publishers.kb" % CFG_KBS_DIR,
 50 |     "special_journals": "%s/special-journals.kb" % CFG_KBS_DIR,
 51 | }
 52 | 
 53 | # Reference fields:
 54 | CFG_REFEXTRACT_FIELDS = {
 55 |     "misc": "m",
 56 |     "linemarker": "o",
 57 |     "doi": "a",
 58 |     "hdl": "a",
 59 |     "reportnumber": "r",
 60 |     "journal": "s",
 61 |     "url": "u",
 62 |     "urldesc": "z",
 63 |     "author": "h",
 64 |     "title": "t",
 65 |     "isbn": "i",
 66 |     "publisher": "p",
 67 |     "year": "y",
 68 |     "collaboration": "c",
 69 |     "recid": "0",
 70 | }
 71 | 
 72 | # Internal tags are used by refextract to mark-up recognised citation
 73 | # information.
 74 | CFG_REFEXTRACT_MARKER_OPENING_REPORT_NUM = r"<cds.REPORTNUMBER>"
 75 | CFG_REFEXTRACT_MARKER_OPENING_ARXIV = r"<cds.ARXIV>"
 76 | CFG_REFEXTRACT_MARKER_OPENING_TITLE = r"<cds.JOURNAL>"
 77 | CFG_REFEXTRACT_MARKER_OPENING_TITLE_IBID = r"<cds.JOURNALibid>"
 78 | CFG_REFEXTRACT_MARKER_OPENING_SERIES = r"<cds.SER>"
 79 | CFG_REFEXTRACT_MARKER_OPENING_VOLUME = r"<cds.VOL>"
 80 | CFG_REFEXTRACT_MARKER_OPENING_YEAR = r"<cds.YR>"
 81 | CFG_REFEXTRACT_MARKER_OPENING_PAGE = r"<cds.PG>"
 82 | CFG_REFEXTRACT_MARKER_OPENING_QUOTED = r"<cds.QUOTED>"
 83 | CFG_REFEXTRACT_MARKER_OPENING_ISBN = r"<cds.ISBN>"
 84 | CFG_REFEXTRACT_MARKER_OPENING_PUBLISHER = r"<cds.PUBLISHER>"
 85 | CFG_REFEXTRACT_MARKER_OPENING_COLLABORATION = r"<cds.COLLABORATION>"
 86 | 
 87 | # These are the "closing tags:
 88 | CFG_REFEXTRACT_MARKER_CLOSING_REPORT_NUM = r"</cds.REPORTNUMBER>"
 89 | CFG_REFEXTRACT_MARKER_CLOSING_ARXIV = r"</cds.ARXIV>"
 90 | CFG_REFEXTRACT_MARKER_CLOSING_TITLE = r"</cds.JOURNAL>"
 91 | CFG_REFEXTRACT_MARKER_CLOSING_TITLE_IBID = r"</cds.JOURNALibid>"
 92 | CFG_REFEXTRACT_MARKER_CLOSING_SERIES = r"</cds.SER>"
 93 | CFG_REFEXTRACT_MARKER_CLOSING_VOLUME = r"</cds.VOL>"
 94 | CFG_REFEXTRACT_MARKER_CLOSING_YEAR = r"</cds.YR>"
 95 | CFG_REFEXTRACT_MARKER_CLOSING_PAGE = r"</cds.PG>"
 96 | CFG_REFEXTRACT_MARKER_CLOSING_QUOTED = r"</cds.QUOTED>"
 97 | CFG_REFEXTRACT_MARKER_CLOSING_ISBN = r"</cds.ISBN>"
 98 | CFG_REFEXTRACT_MARKER_CLOSING_PUBLISHER = r"</cds.PUBLISHER>"
 99 | CFG_REFEXTRACT_MARKER_CLOSING_COLLABORATION = r"</cds.COLLABORATION>"
100 | 
101 | # Of the form '</cds.AUTHxxxx>' only
102 | CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_STND = r"</cds.AUTHstnd>"
103 | CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_ETAL = r"</cds.AUTHetal>"
104 | CFG_REFEXTRACT_MARKER_CLOSING_AUTHOR_INCL = r"</cds.AUTHincl>"
105 | 
106 | # The minimum length of a reference's misc text to be deemed insignificant.
107 | # when comparing misc text with semi-colon defined sub-references.
108 | # Values higher than this value reflect meaningful misc text.
109 | # Hence, upon finding a correct semi-colon, but having current misc text
110 | # length less than this value (without other meaningful reference objects:
111 | # report numbers, titles...) then no split will occur.
112 | # (A higher value will increase splitting strictness. i.e. Fewer splits)
113 | CGF_REFEXTRACT_SEMI_COLON_MISC_TEXT_SENSITIVITY = 60
114 | 
115 | # The length of misc text between two adjacent authors which is
116 | # deemed as insignificant. As such, when misc text of a length less
117 | # than this value is found, then the latter author group is dumped into misc.
118 | # (A higher value will increase splitting strictness. i.e. Fewer splits)
119 | CGF_REFEXTRACT_ADJACENT_AUTH_MISC_SEPARATION = 10
120 | 
121 | # Maximum number of lines for a citation before it is considered invalid
122 | CFG_REFEXTRACT_MAX_LINES = 25
123 | 


--------------------------------------------------------------------------------
/refextract/references/errors.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of refextract.
 4 | # Copyright (C) 2015, 2016, 2018 CERN.
 5 | #
 6 | # refextract is free software; you can redistribute it and/or
 7 | # modify it under the terms of the GNU General Public License as
 8 | # published by the Free Software Foundation; either version 2 of the
 9 | # License, or (at your option) any later version.
10 | #
11 | # refextract is distributed in the hope that it will be useful, but
12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 | # General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
19 | #
20 | # In applying this license, CERN does not waive the privileges and immunities
21 | # granted to it by virtue of its status as an Intergovernmental Organization
22 | # or submit itself to any jurisdiction.
23 | 
24 | """Custom exceptions."""
25 | 
26 | 
27 | class FullTextNotAvailableError(Exception):
28 |     """Raised when we cannot access the document text."""
29 | 
30 | 
31 | class UnknownDocumentTypeError(Exception):
32 |     """Raised when we don't know how to handle the document's MIME type."""
33 | 


--------------------------------------------------------------------------------
/refextract/references/find.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of refextract.
  4 | # Copyright (C) 2013, 2015, 2016, 2018, 2020 CERN.
  5 | #
  6 | # refextract is free software; you can redistribute it and/or
  7 | # modify it under the terms of the GNU General Public License as
  8 | # published by the Free Software Foundation; either version 2 of the
  9 | # License, or (at your option) any later version.
 10 | #
 11 | # refextract is distributed in the hope that it will be useful, but
 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14 | # General Public License for more details.
 15 | #
 16 | # You should have received a copy of the GNU General Public License
 17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 19 | #
 20 | # In applying this license, CERN does not waive the privileges and immunities
 21 | # granted to it by virtue of its status as an Intergovernmental Organization
 22 | # or submit itself to any jurisdiction.
 23 | 
 24 | """Finding the reference section from the fulltext"""
 25 | 
 26 | import contextlib
 27 | import logging
 28 | import re
 29 | 
 30 | from refextract.references.regexs import (
 31 |     get_post_reference_section_keyword_patterns,
 32 |     get_post_reference_section_title_patterns,
 33 |     get_reference_line_numeration_marker_patterns,
 34 |     get_reference_section_title_patterns,
 35 |     re_num,
 36 |     re_reference_line_bracket_markers,
 37 |     re_reference_line_dot_markers,
 38 |     re_reference_line_number_markers,
 39 |     regex_match_list,
 40 | )
 41 | 
 42 | LOGGER = logging.getLogger(__name__)
 43 | 
 44 | 
 45 | def find_reference_section(docbody):
 46 |     """Search in document body for its reference section.
 47 | 
 48 |     More precisely, find
 49 |     the first line of the reference section. Effectively, the function starts
 50 |     at the end of a document and works backwards, line-by-line, looking for
 51 |     the title of a reference section. It stops when (if) it finds something
 52 |     that it considers to be the first line of a reference section.
 53 |     @param docbody: (list) of strings - the full document body.
 54 |     @return: (dictionary) :
 55 |         { 'start_line' : (integer) - index in docbody of 1st reference line,
 56 |           'title_string' : (string) - title of the reference section.
 57 |           'marker' : (string) - the marker of the first reference line,
 58 |           'marker_pattern' : (string) - regexp string used to find the marker,
 59 |           'title_marker_same_line' : (integer) - flag to indicate whether the
 60 |                                         reference section title was on the same
 61 |                                         line as the first reference line's
 62 |                                         marker or not. 1 if it was; 0 if not.
 63 |         }
 64 |         Much of this information is used by later functions to rebuild
 65 |         a reference section.
 66 |          -- OR --
 67 |                 (None) - when the reference section could not be found.
 68 |     """
 69 |     ref_details = None
 70 |     title_patterns = get_reference_section_title_patterns()
 71 | 
 72 |     # Try to find refs section title:
 73 |     for title_pattern in title_patterns:
 74 |         # Look for title pattern in docbody
 75 |         for reversed_index, line in enumerate(reversed(docbody)):
 76 |             title_match = title_pattern.match(line)
 77 |             if title_match:
 78 |                 title = title_match.group("title")
 79 |                 index = len(docbody) - 1 - reversed_index
 80 |                 temp_ref_details, found_title = find_numeration(
 81 |                     docbody[index : index + 6], title
 82 |                 )
 83 |                 if temp_ref_details:
 84 |                     if (
 85 |                         ref_details
 86 |                         and "title" in ref_details
 87 |                         and ref_details["title"]
 88 |                         and not temp_ref_details["title"]
 89 |                     ):
 90 |                         continue
 91 |                     if (
 92 |                         ref_details
 93 |                         and "marker" in ref_details
 94 |                         and ref_details["marker"]
 95 |                         and not temp_ref_details["marker"]
 96 |                     ):
 97 |                         continue
 98 | 
 99 |                     ref_details = temp_ref_details
100 |                     ref_details["start_line"] = index
101 |                     ref_details["title_string"] = title
102 | 
103 |                 if found_title:
104 |                     break
105 | 
106 |         if ref_details:
107 |             break
108 | 
109 |     return ref_details
110 | 
111 | 
112 | def find_numeration_in_body(docbody):
113 |     marker_patterns = get_reference_line_numeration_marker_patterns()
114 |     ref_details = None
115 |     found_title = False
116 | 
117 |     # No numeration unless we find one
118 |     ref_details = {
119 |         "title_marker_same_line": False,
120 |         "marker": None,
121 |         "marker_pattern": None,
122 |     }
123 | 
124 |     for line in docbody:
125 |         # Move past blank lines
126 |         if line.isspace():
127 |             continue
128 | 
129 |         # Is this line numerated like a reference line?
130 |         m_num = None
131 |         mark_match = regex_match_list(line, marker_patterns)
132 |         if mark_match:
133 |             # Check if it's the first reference
134 |             # Something like [1] or (1), etc.
135 |             try:
136 |                 m_num = mark_match.group("marknum")
137 |                 if m_num != "1":
138 |                     continue
139 |             except IndexError:
140 |                 pass
141 | 
142 |             mark = mark_match.group("mark")
143 |             mk_ptn = mark_match.re.pattern
144 |             ref_details = {
145 |                 "marker": mark,
146 |                 "marker_pattern": mk_ptn,
147 |                 "title_marker_same_line": False,
148 |             }
149 | 
150 |             break
151 | 
152 |     return ref_details, found_title
153 | 
154 | 
155 | def find_numeration_in_title(docbody, title):
156 |     ref_details = None
157 |     found_title = False
158 | 
159 |     try:
160 |         first_line = docbody[0]
161 |     except IndexError:
162 |         return ref_details, found_title
163 | 
164 |     # Need to escape to avoid problems like 'References['
165 |     title = re.escape(title)
166 | 
167 |     mk_with_title_ptns = get_reference_line_numeration_marker_patterns(title)
168 |     mk_with_title_match = regex_match_list(first_line, mk_with_title_ptns)
169 |     if mk_with_title_match:
170 |         mk = mk_with_title_match.group("mark")
171 |         mk_ptn = mk_with_title_match.re.pattern
172 |         m_num = re_num.search(mk)
173 |         if m_num and m_num.group(0) == "1":
174 |             # Mark found
175 |             found_title = True
176 |             ref_details = {
177 |                 "marker": mk,
178 |                 "marker_pattern": mk_ptn,
179 |                 "title_marker_same_line": True,
180 |             }
181 |         else:
182 |             ref_details = {
183 |                 "marker": mk,
184 |                 "marker_pattern": mk_ptn,
185 |                 "title_marker_same_line": True,
186 |             }
187 | 
188 |     return ref_details, found_title
189 | 
190 | 
191 | def find_numeration(docbody, title):
192 |     """Find numeration pattern
193 | 
194 |     1st try to find numeration in the title
195 |     e.g.
196 |     References [4] Riotto...
197 | 
198 |     2nd find the numeration alone in the line after the title
199 |     e.g.
200 |     References
201 |     1
202 |     Riotto
203 | 
204 |     3rnd find the numeration in the following line
205 |     e.g.
206 |     References
207 |     [1] Riotto
208 |     """
209 |     ref_details, found_title = find_numeration_in_title(docbody, title)
210 |     if not ref_details:
211 |         ref_details, found_title = find_numeration_in_body(docbody)
212 | 
213 |     return ref_details, found_title
214 | 
215 | 
216 | def find_reference_section_no_title_via_brackets(docbody):
217 |     """This function would generally be used when it was not possible to locate
218 |     the start of a document's reference section by means of its title.
219 |     Instead, this function will look for reference lines that have numeric
220 |     markers of the format [1], [2], etc.
221 |     @param docbody: (list) of strings -each string is a line in the document.
222 |     @return: (dictionary) :
223 |       { 'start_line' : (integer) - index in docbody of 1st reference line,
224 |         'title_string' : (None) - title of the reference section
225 |                                   (None since no title),
226 |         'marker' : (string) - the marker of the first reference line,
227 |         'marker_pattern' : (string) - the regexp string used to find the
228 |                                       marker,
229 |         'title_marker_same_line' : (integer) 0 - to signal title not on same
230 |                                     line as marker.
231 |       }
232 |               Much of this information is used by later functions to rebuild
233 |               a reference section.
234 |       -- OR --
235 |              (None) - when the reference section could not be found.
236 |     """
237 |     marker_patterns = [re_reference_line_bracket_markers]
238 |     return find_reference_section_no_title_generic(docbody, marker_patterns)
239 | 
240 | 
241 | def find_reference_section_no_title_via_dots(docbody):
242 |     """This function would generally be used when it was not possible to locate
243 |     the start of a document's reference section by means of its title.
244 |     Instead, this function will look for reference lines that have numeric
245 |     markers of the format 1., 2., etc.
246 |     @param docbody: (list) of strings -each string is a line in the document.
247 |     @return: (dictionary) :
248 |       { 'start_line' : (integer) - index in docbody of 1st reference line,
249 |         'title_string' : (None) - title of the reference section
250 |                                   (None since no title),
251 |         'marker' : (string) - the marker of the first reference line,
252 |         'marker_pattern' : (string) - the regexp string used to find the
253 |                                       marker,
254 |         'title_marker_same_line' : (integer) 0 - to signal title not on same
255 |                                     line as marker.
256 |       }
257 |               Much of this information is used by later functions to rebuild
258 |               a reference section.
259 |       -- OR --
260 |              (None) - when the reference section could not be found.
261 |     """
262 |     marker_patterns = [re_reference_line_dot_markers]
263 |     return find_reference_section_no_title_generic(docbody, marker_patterns)
264 | 
265 | 
266 | def find_reference_section_no_title_via_numbers(docbody):
267 |     """This function would generally be used when it was not possible to locate
268 |     the start of a document's reference section by means of its title.
269 |     Instead, this function will look for reference lines that have numeric
270 |     markers of the format 1, 2, etc.
271 |     @param docbody: (list) of strings -each string is a line in the document.
272 |     @return: (dictionary) :
273 |       { 'start_line' : (integer) - index in docbody of 1st reference line,
274 |         'title_string' : (None) - title of the reference section
275 |                                   (None since no title),
276 |         'marker' : (string) - the marker of the first reference line,
277 |         'marker_pattern' : (string) - the regexp string used to find the
278 |                                       marker,
279 |         'title_marker_same_line' : (integer) 0 - to signal title not on same
280 |                                     line as marker.
281 |       }
282 |               Much of this information is used by later functions to rebuild
283 |               a reference section.
284 |       -- OR --
285 |              (None) - when the reference section could not be found.
286 |     """
287 |     marker_patterns = [re_reference_line_number_markers]
288 |     return find_reference_section_no_title_generic(docbody, marker_patterns)
289 | 
290 | 
291 | def find_reference_section_no_title_generic(docbody, marker_patterns):
292 |     """This function would generally be used when it was not possible to locate
293 |     the start of a document's reference section by means of its title.
294 |     Instead, this function will look for reference lines that have numeric
295 |     markers of the format [1], [2], {1}, {2}, etc.
296 |     @param docbody: (list) of strings -each string is a line in the document.
297 |     @return: (dictionary) :
298 |       { 'start_line' : (integer) - index in docbody of 1st reference line,
299 |         'title_string' : (None) - title of the reference section
300 |                                   (None since no title),
301 |         'marker' : (string) - the marker of the first reference line,
302 |         'marker_pattern' : (string) - the regexp string used to find the
303 |                                       marker,
304 |         'title_marker_same_line' : (integer) 0 - to signal title not on same
305 |                                     line as marker.
306 |       }
307 |               Much of this information is used by later functions to rebuild
308 |               a reference section.
309 |       -- OR --
310 |              (None) - when the reference section could not be found.
311 |     """
312 |     if not docbody:
313 |         return None
314 | 
315 |     ref_start_line = ref_line_marker = None
316 | 
317 |     # try to find first reference line in the reference section:
318 |     found_ref_sect = False
319 | 
320 |     for reversed_index, line in enumerate(reversed(docbody)):
321 |         mark_match = regex_match_list(line.strip(), marker_patterns)
322 |         if mark_match and mark_match.group("marknum") == "1":
323 |             # Get marker recognition pattern:
324 |             mark_pattern = mark_match.re.pattern
325 | 
326 |             # Look for [2] in next 10 lines:
327 |             next_test_lines = 10
328 | 
329 |             index = len(docbody) - reversed_index
330 |             zone_to_check = docbody[index : index + next_test_lines]
331 |             if len(zone_to_check) < 5:
332 |                 # We found a 1 towards the end, we assume
333 |                 # we only have one reference
334 |                 found = True
335 |             else:
336 |                 # Check for number 2
337 |                 found = False
338 |                 for line_ in zone_to_check:
339 |                     mark_match2 = regex_match_list(line_.strip(), marker_patterns)
340 |                     if mark_match2 and mark_match2.group("marknum") == "2":
341 |                         found = True
342 |                         break
343 | 
344 |             if found:
345 |                 # Found next reference line:
346 |                 found_ref_sect = True
347 |                 ref_start_line = len(docbody) - 1 - reversed_index
348 |                 ref_line_marker = mark_match.group("mark")
349 |                 ref_line_marker_pattern = mark_pattern
350 |                 break
351 | 
352 |     if found_ref_sect:
353 |         ref_sectn_details = {
354 |             "start_line": ref_start_line,
355 |             "title_string": None,
356 |             "marker": ref_line_marker.strip(),
357 |             "marker_pattern": ref_line_marker_pattern,
358 |             "title_marker_same_line": False,
359 |         }
360 |     else:
361 |         # didn't manage to find the reference section
362 |         ref_sectn_details = None
363 | 
364 |     return ref_sectn_details
365 | 
366 | 
367 | def find_end_of_reference_section(
368 |     docbody, ref_start_line, ref_line_marker, ref_line_marker_ptn
369 | ):
370 |     """Given that the start of a document's reference section has already been
371 |     recognised, this function is tasked with finding the line-number in the
372 |     document of the last line of the reference section.
373 |     @param docbody: (list) of strings - the entire plain-text document body.
374 |     @param ref_start_line: (integer) - the index in docbody of the first line
375 |      of the reference section.
376 |     @param ref_line_marker: (string) - the line marker of the first reference
377 |      line.
378 |     @param ref_line_marker_ptn: (string) - the pattern used to search for a
379 |      reference line marker.
380 |     @return: (integer) - index in docbody of the last reference line
381 |       -- OR --
382 |              (None) - if ref_start_line was invalid.
383 |     """
384 |     section_ended = False
385 |     x = ref_start_line
386 |     if type(x) is not int or x < 0 or x > len(docbody) or len(docbody) < 1:
387 |         # The provided 'first line' of the reference section was invalid.
388 |         # Either it was out of bounds in the document body, or it was not a
389 |         # valid integer.
390 |         # Can't safely find end of refs with this info - quit.
391 |         return None
392 |     # Get patterns for testing line:
393 |     t_patterns = get_post_reference_section_title_patterns()
394 |     kw_patterns = get_post_reference_section_keyword_patterns()
395 | 
396 |     if None not in (ref_line_marker, ref_line_marker_ptn):
397 |         mk_patterns = [re.compile(ref_line_marker_ptn, re.I | re.UNICODE)]
398 |     else:
399 |         mk_patterns = get_reference_line_numeration_marker_patterns()
400 | 
401 |     current_reference_count = 0
402 |     while x < len(docbody) and not section_ended:
403 |         # save the reference count
404 |         num_match = regex_match_list(docbody[x].strip(), mk_patterns)
405 |         if num_match:
406 |             with contextlib.suppress(ValueError, IndexError):
407 |                 current_reference_count = int(num_match.group("marknum"))
408 | 
409 |         # look for a likely section title that would follow a reference
410 |         # section:
411 |         end_match = regex_match_list(docbody[x].strip(), t_patterns)
412 |         if not end_match:
413 |             # didn't match a section title - try looking for keywords that
414 |             # suggest the end of a reference section:
415 |             end_match = regex_match_list(docbody[x].strip(), kw_patterns)
416 |         else:
417 |             # Is it really the end of the reference section? Check within the next
418 |             # 5 lines for other reference numeration markers:
419 |             y = x + 1
420 |             line_found = False
421 |             while y < x + 200 and y < len(docbody) and not line_found:
422 |                 num_match = regex_match_list(docbody[y].strip(), mk_patterns)
423 |                 if num_match and not num_match.group(0).isdigit():
424 |                     try:
425 |                         num = int(num_match.group("marknum"))
426 |                         if current_reference_count + 1 == num:
427 |                             line_found = True
428 |                     except ValueError:
429 |                         # We have the marknum index so it is
430 |                         # numeric pattern for references like
431 |                         # [1], [2] but this match is not a number
432 |                         pass
433 |                     except IndexError:
434 |                         # We have a non numerical references marking
435 |                         # we don't check for a number continuity
436 |                         line_found = True
437 |                 y += 1
438 |             if not line_found:
439 |                 # No ref line found-end section
440 |                 section_ended = True
441 |         if not section_ended:
442 |             # Does this & the next 5 lines simply contain numbers? If yes, it's
443 |             # probably the axis scale of a graph in a fig. End refs section
444 |             digit_test_str = (
445 |                 docbody[x]
446 |                 .replace(" ", "")
447 |                 .replace(".", "")
448 |                 .replace("-", "")
449 |                 .replace("+", "")
450 |                 .replace("\u00d7", "")
451 |                 .replace("\u2212", "")
452 |                 .strip()
453 |             )
454 |             if len(digit_test_str) > 10 and digit_test_str.isdigit():
455 |                 # The line contains only digits and is longer than 10 chars:
456 |                 y = x + 1
457 |                 digit_lines = 4
458 |                 num_digit_lines = 1
459 |                 while y < x + digit_lines and y < len(docbody):
460 |                     digit_test_str = (
461 |                         docbody[y]
462 |                         .replace(" ", "")
463 |                         .replace(".", "")
464 |                         .replace("-", "")
465 |                         .replace("+", "")
466 |                         .replace("\u00d7", "")
467 |                         .replace("\u2212", "")
468 |                         .strip()
469 |                     )
470 |                     if len(digit_test_str) > 10 and digit_test_str.isdigit():
471 |                         num_digit_lines += 1
472 |                     elif len(digit_test_str) == 0:
473 |                         # This is a blank line. Don't count it, to accommodate
474 |                         # documents that are double-line spaced:
475 |                         digit_lines += 1
476 |                     y = y + 1
477 |                 if num_digit_lines == digit_lines:
478 |                     section_ended = True
479 |             x += 1
480 |     return x - 1
481 | 
482 | 
483 | def get_reference_section_beginning(fulltext):
484 |     sect_start = {
485 |         "start_line": None,
486 |         "end_line": None,
487 |         "title_string": None,
488 |         "marker_pattern": None,
489 |         "marker": None,
490 |         "how_found_start": None,
491 |     }
492 | 
493 |     # Find start of refs section:
494 |     sect_start = find_reference_section(fulltext)
495 |     if sect_start is not None:
496 |         sect_start["how_found_start"] = 1
497 |     else:
498 |         # No references found - try with no title option
499 |         sect_start = find_reference_section_no_title_via_brackets(fulltext)
500 |         if sect_start is not None:
501 |             sect_start["how_found_start"] = 2
502 |         # Try weaker set of patterns if needed
503 |         if sect_start is None:
504 |             # No references found - try with no title option (with weaker
505 |             # patterns..)
506 |             sect_start = find_reference_section_no_title_via_dots(fulltext)
507 |             if sect_start is not None:
508 |                 sect_start["how_found_start"] = 3
509 |             if sect_start is None:
510 |                 # No references found - try with no title option (with even
511 |                 # weaker patterns..)
512 |                 sect_start = find_reference_section_no_title_via_numbers(fulltext)
513 |                 if sect_start is not None:
514 |                     sect_start["how_found_start"] = 4
515 | 
516 |     if sect_start:
517 |         LOGGER.debug("title %r", sect_start["title_string"])
518 |         LOGGER.debug("marker %r", sect_start["marker"])
519 |         LOGGER.debug("title_marker_same_line %s", sect_start["title_marker_same_line"])
520 | 
521 |     else:
522 |         LOGGER.debug("could not find references section")
523 |     return sect_start
524 | 


--------------------------------------------------------------------------------
/refextract/references/kbs/authors.kb:
--------------------------------------------------------------------------------
1 | Du ̈hrssen---Dührssen
2 | 


--------------------------------------------------------------------------------
/refextract/references/kbs/collaborations.kb:
--------------------------------------------------------------------------------
 1 | # This file holds text which must be recognised alongside authors, and hence included in the $h subfields.
 2 | # Matches using this data do not affect how references are split.
 3 | # (Just simply appends to the most recent $h subfield for the datafield, or makes a new one).
 4 | # Do not append an 's' to the end.
 5 | # Insert only the Upper cased version.
 6 | CMS Collaboration---CMS Collaboration
 7 | ATLAS Collaboration---ATLAS Collaboration
 8 | ALICE Collaboration---ALICE Collaboration
 9 | LEP Collaboration---LEP Collaboration
10 | CDF Collaboration---CDF Collaboration
11 | D0 Collaboration---D0 Collaboration
12 | ALEPH Collaboration---ALEPH Collaboration
13 | DELPHI Collaboration---DELPHI Collaboration
14 | L3 Collaboration---L3 Collaboration
15 | OPAL Collaboration---OPAL Collaboration
16 | CTEQ Collaboration---CTEQ Collaboration
17 | GEANT4 Collaboration---GEANT4 Collaboration
18 | LHC-B Collaboration---LHC-B Collaboration
19 | CDF II Collaboration---CDF II Collaboration
20 | RD 48 Collaboration---RD 48 Collaboration
21 | SLD Collaboration---SLD Collaboration
22 | H1 Collaboration---H1 Collaboration
23 | COMPASS Collaboration---COMPASS Collaboration
24 | HERMES Collaboration---HERMES Collaboration
25 | European Muon Collaboration---European Muon Collaboration
26 | Spin Muon Collaboration---Spin Muon Collaboration
27 | E143 Collaboration---E143 Collaboration
28 | Particle Data Group Collaboration---Particle Data Group Collaboration
29 | ATLAS Inner Detector software group Collaboration---ATLAS Inner Detector software group Collaboration
30 | DØ Collaboration---DØ Collaboration
31 | CUORE Collaboration---CUORE Collaboration
32 | Belle Collaboration---Belle Collaboration
33 | 


--------------------------------------------------------------------------------
/refextract/references/kbs/journal-titles-re.kb:
--------------------------------------------------------------------------------
1 | DAN---Dokl.Akad.Nauk Ser.Fiz.
2 | 


--------------------------------------------------------------------------------
/refextract/references/kbs/publishers.kb:
--------------------------------------------------------------------------------
  1 | SHAKER---Shaker
  2 | ELSEVIER---Elsevier
  3 | NORTH HOLLAND---North-Holland
  4 | BRANS---Brans
  5 | BIRKHAEUSER---Birkhaeuser
  6 | SCI PR---Sci. Pr.
  7 | UNIV FORL---Univ.-Forl.
  8 | LBL---LBL
  9 | DE GRUYTER---de Gruyter
 10 | AEG---AEG
 11 | BEUTH---Beuth
 12 | ELITERA---Elitera
 13 | SPRINGER---Springer
 14 | VEB VERL TECH---VEB Verl. Tech.
 15 | CRC PR---CRC Pr.
 16 | PITAGORA---Pitagora
 17 | BIRKHAEUSER---Birkhaeuser
 18 | GERSBACH---Gersbach
 19 | VIEWEG---Vieweg
 20 | HILGER---Hilger
 21 | IOP---IOP
 22 | MATH SCI PR---Math. Sci. Pr.
 23 | UNIV PR---Univ. Pr.
 24 | MIT PR---MIT Pr.
 25 | UNIV PR---Univ. Pr.
 26 | WILEY---Wiley
 27 | RUTHERFORD LAB---Rutherford Lab.
 28 | DARESBURY LAB---Daresbury Lab.
 29 | TEOCHE MITTLER---Teoche-Mittler
 30 | KLUWER ACADEMIC---Kluwer Academic
 31 | REIDEL---Reidel
 32 | ED FRONTIERES---Ed. Frontieres
 33 | VDI---VDI
 34 | FIZ4---FIZ4
 35 | PHILIPS---Philips
 36 | PRENTICE HALL---Prentice-Hall
 37 | GIRARDET---Girardet
 38 | UMSCHAU---Umschau
 39 | ED FRONTIERES---Ed. Frontieres
 40 | WORLD SCIENTIFIC---World Scientific
 41 | GIOI PUBL---Gioi Publ.
 42 | HUETIG---Huetig
 43 | #BRAUN---Braun
 44 | FACHBUCHVERL---Fachbuchverl.
 45 | ACADEMIC---Academic
 46 | ADDISON WESLEY---Addison-Wesley
 47 | #BUTTERWORTH---Butterworth
 48 | HARWOOD ACAD---Harwood Acad.
 49 | HEINEMANN EDUC BOOKS---Heinemann Educ. Books
 50 | IMP COLL PR---Imp. Coll. Pr.
 51 | MCGRAW HILL---McGraw-Hill
 52 | PERGAMON---Pergamon
 53 | ROUTLEDGE & KEGAN PAUL---Routledge & Kegan Paul
 54 | ROY SOC---Roy. Soc.
 55 | WILEY---Wiley
 56 | LANL---LANL
 57 | TINNON BROWN---Tinnon-Brown
 58 | BIBLIOGRAPH INST---Bibliograph. Inst.
 59 | AIP---AIP
 60 | EDITRICE ABITARE SEGESTA---Editrice Abitare Segesta
 61 | UNIV PR---Univ. Pr.
 62 | GERSBACH---Gersbach
 63 | HANSER---Hanser
 64 | OLDENBOURG---Oldenbourg
 65 | PIPER---Piper
 66 | THIEMIG---Thiemig
 67 | ACADEMIC---Academic
 68 | AM SOC MECH ENG---Am. Soc. Mech. Eng.
 69 | AIP---AIP
 70 | BENJAMIN---Benjamin
 71 | DOVER---Dover
 72 | GORDON AND BREACH---Gordon and Breach
 73 | INTERSCIENCE---Interscience
 74 | KLUWER ACADEMIC---Kluwer Academic
 75 | MCGRAW HILL---McGraw-Hill
 76 | NOVA SCI PUBL---Nova Sci. Publ.
 77 | PERGAMON---Pergamon
 78 | PLENUM---Plenum
 79 | SPRINGER---Springer
 80 | WILEY---Wiley
 81 | CEBAF---CEBAF
 82 | CLARENDON---Clarendon
 83 | PERGAMON---Pergamon
 84 | UNIV PR---Univ. Pr.
 85 | ANNUAL REVIEWS---Annual Reviews
 86 | INP---INP
 87 | SIAM---SIAM
 88 | VAN NOSTRAND---van Nostrand
 89 | UNIV PR---Univ. Pr.
 90 | ADDISON WESLEY---Addison-Wesley
 91 | WORLD SCIENTIFIC---World Scientific
 92 | MUELLER---Mueller
 93 | ECOWIN VERL---Ecowin Verl.
 94 | #FREEMAN---Freeman
 95 | PITMAN---Pitman
 96 | WORLD SCIENTIFIC---World Scientific
 97 | NAT TECH INFORM SERV---Nat. Tech. Inform. Serv.
 98 | SLAC---SLAC
 99 | BERL UNION---Berl. Union
100 | TEUBNER---Teubner
101 | SWED PHYS ARCHIVE---Swed. Phys. Archive
102 | DEUTSCH---Deutsch
103 | KEK---KEK
104 | IAEA---IAEA
105 | SPRINGER---Springer
106 | IEEE---IEEE
107 | PHYSIK VERL---Physik-Verl.
108 | WILEY VCH---Wiley-VCH
109 | AIP---AIP
110 | 


--------------------------------------------------------------------------------
/refextract/references/kbs/report-numbers.kb:
--------------------------------------------------------------------------------
  1 | *****LANL*****
  2 | <s/syymm999>
  3 | <syymm999>
  4 | 
  5 | ACC PHYS	---acc-phys
  6 | ADAP ORG	---adap-org
  7 | ALG GEOM	---alg-geom
  8 | AO SCI		---ao-sci
  9 | AUTO FMS	---auto-fms
 10 | BAYES AN	---bayes-an
 11 | CD HG		---cd-hg
 12 | CMP LG		---cmp-lg
 13 | COMP GAS	---comp-gas
 14 | DG GA		---dg-ga
 15 | FUNCT AN	---funct-an
 16 | GR QC		---gr-qc
 17 | ARXIVHEP EX	---hep-ex
 18 | ARXIVHEP PH	---hep-ph
 19 | ARXIVHEP TH	---hep-th
 20 | LC OM		---lc-om
 21 | MTRL TH		---mtrl-th
 22 | NEURO CEL	---neuro-cel
 23 | NEURO DEV	---neuro-dev
 24 | NEURO SCI	---neuro-sci
 25 | PATT SOL	---patt-sol
 26 | 
 27 | 
 28 | *****Fermilab extensions*****
 29 | <s9?9?9?9(?: AD?)?(?: AE)?(?: APC)?(?: CD)?(?: CMS)?(?: DI)?(?: E)?(?: ESH)?(?: LDRD)?(?: ND)?(?: PPD)?(?: TD?)?(?: WDRS)?\b>
 30 | <syys9?9?9?9(?: AD?)?(?: AE)?(?: APC)?(?: CD)?(?: CMS)?(?: DI)?(?: E)?(?: ESH)?(?: LDRD)?(?: ND)?(?: PPD)?(?: TD?)?(?: WDRS)?\b>
 31 | <syyyys9?9?9?9(?: AD?)?(?: AE)?(?: APC)?(?: CD)?(?: CMS)?(?: DI)?(?: E)?(?: ESH)?(?: LDRD)?(?: ND)?(?: PPD)?(?: TD?)?(?: WDRS)?\b>
 32 | 
 33 | 
 34 | FERMILAB CONF   ---FERMILAB-Conf
 35 | FERMILAB FN     ---FERMILAB-FN
 36 | FERMILAB PUB    ---FERMILAB-Pub
 37 | FERMILAB TM     ---FERMILAB-TM
 38 | FERMILAB SLIDES ---FERMILAB-SLIDES
 39 | FERMILAB POSTER ---FERMILAB-POSTER
 40 | 
 41 | 
 42 | 
 43 | *****Fermilab no extensions*****
 44 | <s9?9?9?9>
 45 | <syys9?9?9\b>
 46 | <syyyys9?9?9\b>
 47 | 
 48 | 
 49 | FERMILAB CODE   ---FERMILAB-CODE
 50 | FERMILAB DESIGN ---FERMILAB-Design
 51 | FERMILABDESIGN  ---FERMILAB-Design
 52 | FERMILAB PROPOSAL ---FERMILAB-Proposal
 53 | FERMILAB THESIS ---FERMILAB-Thesis
 54 | FERMILAB MASTERS---FERMILAB-Masters
 55 | 
 56 | 
 57 | *****Fermilab DØ notes*****
 58 | <s9?9?9?9>
 59 | <s9?9?9?9 CONF>
 60 | 
 61 | DØ NOTE---D0-Note
 62 | D0 NOTE---D0-Note
 63 | 
 64 | *****Fermilab CDF*****
 65 | <s9?9?9?9>
 66 | 
 67 | CDF                       ---CDF
 68 | CDF ANAL ELECTROWEAK CDFR ---CDF-ANAL-ELECTROWEAK-CDFR
 69 | CDF ANAL EXOTIC CDFR      ---CDF-ANAL-EXOTIC-CDFR
 70 | CDF ANAL EXOTIC PUBLIC    ---CDF-ANAL-EXOTIC-PUBLIC
 71 | CDF ANAL JET PUBLIC       ---CDF-ANAL-JET-PUBLIC
 72 | CDF ANAL TOP CDFR         ---CDF-ANAL-TOP-CDFR
 73 | CDF ANAL TOP PUBLIC       ---CDF-ANAL-TOP-PUBLIC
 74 | CDF DOC CDF CDFR          ---CDF-DOC-CDF-CDFR
 75 | CDF DOC CDF PUBLIC        ---CDF-DOC-CDF-PUBLIC
 76 | CDF DOC PLUG UPGR CDFR    ---CDF-DOC-PLUG-UPGR-CDFR
 77 | CDF NOTE                  ---CDF-NOTE
 78 | CDF PHYS BOTTOM PUBLIC    ---CDF-PHYS-BOTTOM-PUBLIC
 79 | CDF PUB                   ---CDF-PUB
 80 | CDF PUB BOTTOM CDFR       ---CDF-PUB-BOTTOM-CDFR
 81 | CDF PUB BOTTOM PUBLIC     ---CDF-PUB-BOTTOM-PUBLIC
 82 | CDF PUB CDF PUBLIC        ---CDF-PUB-CDF-PUBLIC
 83 | CDF PUB ELECTROWEAK CDFR  ---CDF-PUB-ELECTROWEAK-CDFR
 84 | CDF PUB ELECTROWEAK PUBLIC---CDF-PUB-ELECTROWEAK-PUBLIC
 85 | CDF PUB EXOTIC CDFR       ---CDF-PUB-EXOTIC-CDFR
 86 | CDF PUB EXOTIC PUBLIC     ---CDF-PUB-EXOTIC-PUBLIC
 87 | CDF PUB HEAVYFLAVOR PUBLIC---CDF-PUB-HEAVYFLAVOR-PUBLIC
 88 | CDF PUB JET CDFR          ---CDF-PUB-JET-CDFR
 89 | CDF PUB JET PUBLIC        ---CDF-PUB-JET-PUBLIC
 90 | CDF PUB MIN BIAS PUBLIC   ---CDF-PUB-MIN-BIAS-PUBLIC
 91 | CDF PUB PLUG UPGR PUBLIC  ---CDF-PUB-PLUG-UPGR-PUBLIC
 92 | CDF PUB PUBLIC            ---CDF-PUB-PUBLIC
 93 | CDF PUB SEC VTX PUBLIC    ---CDF-PUB-SEC-VTX-PUBLIC
 94 | CDF PUB SEC_VTX PUBLIC    ---CDF-PUB-SEC-VTX-PUBLIC
 95 | CDF PUB TOP CDFR          ---CDF-PUB-TOP-CDFR
 96 | CDF PUB TOP PUBLIC        ---CDF-PUB-TOP-PUBLIC
 97 | CDF THESIS BOTTOM PUBLIC  ---CDF-THESIS-BOTTOM-PUBLIC
 98 | CDF THESIS CDF PUBLIC     ---CDF-THESIS-CDF-PUBLIC
 99 | CDF THESIS TOP PUBLIC     ---CDF-THESIS-TOP-PUBLIC
100 | CDF TOP PUBLIC            ---CDF-TOP-PUBLIC
101 | 
102 | 
103 | *****Fermilab MicroBooNE*****
104 | <s9?9?9?9(?: PUB| TECH)>
105 | 
106 | MICROBOONE NOTE           ---MICROBOONE-NOTE
107 | MICROBOONE PUBLIC NOTE    ---MICROBOONE-NOTE
108 | 
109 | *****CERN*****
110 | <syys9?9?9>
111 | <syyyys9?9?9>
112 | 
113 | AB NOTE                ---AB-NOTE
114 | ALEPH                  ---ALEPH
115 | ALICE                  ---ALICE
116 | ALICE INT              ---ALICE-INT
117 | ALICE NOTE             ---ALICE-INT
118 | ALICE PUBLIC           ---ALICE-PUBLIC
119 | ATL CAL                ---ATL-CAL
120 | ATL COM                ---ATL-COM
121 | ATL COM SOFT           ---ATL-COM-SOFT
122 | ATL COM PUB            ---ATL-COM-DAQ
123 | ATL COM DAQ            ---ATL-COM-DAQ
124 | ATL COM INDENT         ---ATL-COM-INDENT
125 | ATL COM LUM            ---ATL-COM-LUM
126 | ATL COM MUON           ---ATL-COM-MUON
127 | ATL COM PHYS           ---ATL-COM-PHYS
128 | ATL COMPHYS            ---ATL-COM-PHYS
129 | ATLCOM PHYS            ---ATL-COM-PHYS
130 | TL COM PHYS            ---ATL-COM-PHYS
131 | ATLAS COM PHYS         ---ATLAS-COM-PHYS
132 | ATL COM TILECAL        ---ATL-COM-TILECAL
133 | ATL COM LARG           ---ATL-COM-LARG
134 | ATLAS COM CONF         ---ATLAS-COM-CONF
135 | ATLASCOM CONF          ---ATLAS-COM-CONF
136 | ATLAS COMCONF          ---ATLAS-COM-CONF
137 | ATLAS CONF             ---ATLAS-CONF
138 | ATLASCONF              ---ATLAS-CONF
139 | ATL DAQ                ---ATL-DAQ
140 | ATL DAQ CONF           ---ATL-DAQ-CONF
141 | ATL DAQ PUB            ---ATL-DAQ-PUB
142 | ATL DAQ PROC           ---ATL-DAQ-PROC
143 | ATL GEN                ---ATL-GEN
144 | ATLAS HIGG             ---ATLAS-HIGG
145 | ATL INDET              ---ATL-INDET
146 | ATL INDET PUB          ---ATL-INDET-PUB
147 | ATL INDET PROC         ---ATL-INDET-PROC
148 | ATL LARG               ---ATL-LARG
149 | ATL MUON               ---ATL-MUON
150 | ATL MUON PUB           ---ATL-MUON-PUB
151 | ATL PUB MUON           ---ATL-PUB-MUON
152 | ATL PHYS               ---ATL-PHYS
153 | ATL PHYS CONF          ---ATL-PHYS-CONF
154 | ATL PHYS INT           ---ATL-PHYS-INT
155 | ATL PHYSINT            ---ATL-PHYS-INT
156 | ATLPHYS INT            ---ATL-PHYS-INT
157 | ATL PHYS PUB           ---ATL-PHYS-PUB
158 | ATL PHYSPUB            ---ATL-PHYS-PUB
159 | ATLPHYS PUB            ---ATL-PHYS-PUB
160 | ATLAS PHYS PUB         ---ATL-PHYS-PUB
161 | ATL PHYS PROC          ---ATL-PHYS-PROC
162 | ATL TECH               ---ATL-TECH
163 | ATL TILECAL            ---ATL-TILECAL
164 | ATL TILECAL PUB        ---ATL-TILECAL-PUB
165 | ATL TILECAL PROC       ---ATL-TILECAL-PROC
166 | ATL SOFT               ---ATL-SOFT
167 | ATL SOFT PUB           ---ATL-SOFT-PUB
168 | ATL SOFT PROC          ---ATL-SOFT-PROC
169 | ATL IS EN              ---ATL-IS-EN
170 | ATL IS QA              ---ATL-IS-QA
171 | ATL LARG PUB           ---ATL-LARG-PUB
172 | ATL COM LARG           ---ATL-COM-LARG
173 | TL COM LARG            ---ATL-COM-LARG
174 | ATLCOM LARG            ---ATL-COM-LARG
175 | ATL MAGNET PUB         ---ATL-MAGNET-PUB
176 | ATL UPGRADE PUB        ---ATL-UPGRADE-PUB
177 | ATL UPGRADE PROC       ---ATL-UPGRADE-PROC
178 | CERN AB                ---CERN-AB
179 | CERN AB NOTE           ---CERN-NOTE
180 | CERN ALEPH             ---CERN-ALEPH
181 | CERN ALEPH PHYSIC      ---CERN-ALEPH-PHYSIC
182 | CERN ALEPH PUB         ---CERN-ALEPH-PUB
183 | CERN ALICE INT         ---CERN-ALICE-INT
184 | CERN ALICE PUB         ---CERN-ALICE-PUB
185 | CERN ALI               ---CERN-ALI
186 | CERN AS                ---CERN-AS
187 | CERN AT                ---CERN-AT
188 | CERN ATL COM CAL       ---CERN-ATL-COM-CAL
189 | CERN ATL COM DAQ       ---CERN-ATL-COM-DAQ
190 | CERN ATL COM GEN       ---CERN-ATL-COM-GEN
191 | CERN ATL COM INDET     ---CERN-ATL-COM-INDET
192 | CERN ATL COM LARG      ---CERN-ATL-COM-LARG
193 | CERN ATL COM MUON      ---CERN-ATL-COM-MUON
194 | CERN ATL COM PHYS      ---CERN-ATL-COM-PHYS
195 | CERN ATL COM TECH      ---CERN-ATL-COM
196 | CERN ATL COM TILECAL   ---CERN-ATL-COM
197 | CERN ATL DAQ           ---CERN-ATL-DAQ
198 | CERN ATL SOFT          ---CERN-ATL-SOFT
199 | CERN ATL SOFT INT      ---CERN-ATL-SOFT-INT
200 | CERN ATL SOFT PUB      ---CERN-ATL-SOFT-PUB
201 | CERN ATS               ---CERN-ATS
202 | CERNATS                ---CERN-ATS
203 | CERN ATS NOTE          ---CERN-ATS-NOTE
204 | CERNATS NOTE           ---CERN-ATS-NOTE
205 | CERN BE                ---CERN-BE
206 | CERN BE NOTE           ---CERN-BE-NOTE
207 | CERN CMS               ---CERN-CMS
208 | CERN CMS CR            ---CERN-CMS-CR
209 | CERN CMS DP            ---CERN-CMS-DP
210 | CERN CMS NOTE          ---CERN-CMS-NOTE
211 | CERN CN                ---CERN-CN
212 | CERN DD                ---CERN-DD
213 | CERN DELPHI            ---CERN-DELPHI
214 | CERN ECP               ---CERN-ECP
215 | CERN EF                ---CERN-EF
216 | CERN EP                ---CERN-EP
217 | CERN EST               ---CERN-EST
218 | CERN ETT               ---CERN-ETT
219 | CERN INTC              ---CERN-INTC
220 | CERN IT                ---CERN-IT
221 | CERN LCGAPP            ---CERN-LCGAPP
222 | CERN LHCB              ---CERN-LHCB
223 | CERN LHCB DP           ---CERN-LHCB-DP
224 | CERN LHCB CONF         ---CERN-LHCB-CONF
225 | CERN LHCB INT          ---CERN-LHCB-INT
226 | CERN LHCB PUB          ---CERN-LHCB-PUB
227 | CERN LHCC              ---CERN-LHCC
228 | CERNLHCC               ---CERN-LHCC
229 | CERN LHC               ---CERN-LHC
230 | CERN LHC PHO           ---CERN-LHC-PHO
231 | CERN LHC PROJECT REPORT---CERN-LHC-Project-Report
232 | CERN OPEN              ---CERN-OPEN
233 | CERNOPEN               ---CERNOPEN
234 | CERN PH EP             ---CERN-PH-EP
235 | CERNPH EP              ---CERN-PH-EP
236 | CERN PHEP              ---CERN-PH-EP
237 | CERN PH LPCC           ---CERN-PH-LPCC
238 | CERN PH TH             ---CERN-PH-TH
239 | CERN PPE               ---CERN-PPE
240 | CERN PROCEEDINGS       ---CERN-PROCEEDINGS
241 | CERN PS                ---CERN-PS
242 | CERN SL                ---CERN-SL
243 | CERN SL NOTE           ---CERN-SL-NOTE
244 | CERN SPSC              ---CERN-SPSC
245 | CERNSPSC               ---CERN-SPSC
246 | CERN ST                ---CERN-ST
247 | CERN TH                ---CERN-TH
248 | CERN THESIS            ---CERN-THESIS
249 | CERNTHESIS             ---CERN-THESIS
250 | CERN TIS               ---CERN-TIS
251 | CERN ATS               ---CERN-ATS
252 | CERN ATSNOTE           ---CERN-ATSNOTE
253 | CERN                   ---CERN
254 | CLICDP NOTE            ---CLICDP-NOTE
255 | CMS AN                 ---CMS-AN
256 | CMS CR                 ---CMS-CR
257 | CMS DP                 ---CMS-DP
258 | CMS NOTE               ---CMS-NOTE
259 | CMSNOTE                ---CMS-NOTE
260 | CMS EXO                ---CMS-EXO
261 | CMS TS                 ---CMS-TS
262 | DELPHI                 ---DELPHI
263 | DELPHI NOTE            ---DELPHI-NOTE
264 | DIRAC NOTE             ---DIRAC-NOTE
265 | DN                     ---DIRAC-NOTE
266 | LHCB                   ---LHCB
267 | LHCB DP                ---LHCB-DP
268 | LHCB JOURNAL           ---LHCB-JOURNAL
269 | LHCB ANA               ---LHCB-ANA
270 | LHCB CONF              ---LHCB-CONF
271 | LHCBCONF               ---LHCB-CONF
272 | LHCB INT               ---LHCB-INT
273 | LHCB PUB               ---LHCB-PUB
274 | LHCB PAPER             ---LHCB-PAPER
275 | LHCB PROC              ---LHCB-PROC
276 | LHCB TALK              ---LHCB-TALK
277 | LHCHXSWG               ---LHCHXSWG
278 | LHCHXSWG DRAFT INT     ---LHCHXSWG-DRAFT-INT
279 | LHCHXSWG INT           ---LHCHXSWG-INT
280 | SN ATLAS               ---SN-ATLAS
281 | PAS SUSY               ---CMS-PAS-SUS
282 | CMS PAS EXO            ---CMS-PAS-EXO
283 | CMS PAS HIN            ---CMS-PAS-HIN
284 | CMS PAS QCD            ---CMS-PAS-QCD
285 | CMS PAS TOP            ---CMS-PAS-TOP
286 | CMS PAS SUS            ---CMS-PAS-SUS
287 | CMS PAS BPH            ---CMS-PAS-BPH
288 | CMS PAS SMP            ---CMS-PAS-SMP
289 | CMS PAS HIG            ---CMS-PAS-HIG
290 | CMS PAS EWK            ---CMS-PAS-EWK
291 | CMS PAS BTV            ---CMS-PAS-BTV
292 | CMS PAS FWD            ---CMS-PAS-FWD
293 | CMS PAS TRK            ---CMS-PAS-TRK
294 | CMS PAS SMP            ---CMS-PAS-SMP
295 | CMS PAS PFT            ---CMS-PAS-PFT
296 | CMS PAS MUO            ---CMS-PAS-MUO
297 | CMS PAS JME            ---CMS-PAS-JME
298 | CMS PAS EGM            ---CMS-PAS-EGM
299 | CMS PAS DIF            ---CMS-PAS-DIF
300 | CMS PAS B2G            ---CMS-PAS-B2G
301 | ATLTILECAL PUB         ---ATLTILECAL-PUB
302 | ATLAS TECH PUB         ---ATLAS-TECH-PUB
303 | TLCOM MAGNET           ---TLCOM-MAGNET
304 | ATLLARG                ---ATL-LARG
305 | SL NOTE                ---SL-NOTE
306 | TOTEM                  ---TOTEM
307 | TS NOTE                ---TS-NOTE
308 | 
309 | *****CERN MORE*****
310 | <s9>
311 | 
312 | CMS UG TP       ---CMS-UG-TP
313 | 
314 | 
315 | *****CERN DIFFERENT FORMAT*****
316 | <s9999999>
317 | CERN GE         ---CERN-GE
318 | 
319 | *****CERN with language*****
320 | <syyyys9?9?9sEng>
321 | 
322 | CERN BROCHURE   ---CERN-BROCHURE
323 | 
324 | 
325 | *****LHC*****
326 | <s9?9?9?9>
327 | 
328 | CERN CLIC NOTE          ---CLIC-Note
329 | LHC PROJECT NOTE        ---LHC-Project-Note
330 | CERN LHC PROJECT REPORT ---CERN-LHC-Project-Report
331 | LHC PROJECT REPORT      ---CERN-LHC-Project-Report
332 | CLIC NOTE               ---CLIC-Note
333 | ATLAS TDR               ---ATL-TDR
334 | CMS TDR                 ---CMS-TDR
335 | ATC TT ID               ---ATC-TT-ID
336 | ATC TT IN               ---ATC-TT-IN
337 | LHCCP                   ---LHCCP
338 | 
339 | ***LHC OTHER FORMAT*****
340 | <syyyys9?9?9?9>
341 | 
342 | CERN ACC               ---CERN-ACC
343 | CERN ACC NOTE          ---CERN-ACC-NOTE
344 | 
345 | *****KEK*****
346 | <s9?9?9?9>
347 | <syys9?9?9>
348 | <syyyys9?9?9>
349 | 
350 | KEK CP        ---KEK-CP
351 | KEK INT       ---KEK-Internal
352 | KEK INTERNAL  ---KEK-Internal
353 | KEK PREPRINT  ---KEK-Preprint
354 | KEK TH	      ---KEK-TH
355 | 
356 | 
357 | *****DESY*****
358 | <syys9?9?9>
359 | <syyyys9?9?9>
360 | 
361 | DESY		---DESY
362 | DESY M		---DESY-M
363 | DESY THESIS     ---DESY-THESIS
364 | DESYTHESIS      ---DESY-THESIS
365 | DESY TESLA FEL  ---DESY-TESLA-FEL
366 | DESY PROC       ---DESY-PROC
367 | DESYPROC        ---DESY-PROC
368 | TESLA FEL       ---DESY-TESLA-FEL
369 | 
370 | 
371 | *****DESY F*****
372 | <99s9>
373 | <9s99s99>
374 | <99s99s99>
375 | 
376 | DESY F      ---DESY-F
377 | 
378 | 
379 | *****SLAC*****
380 | <s9?9?9?9?9>
381 | <syys9?9?9>
382 | 
383 | SLAC AP		---SLAC-AP
384 | SLAC PUB	---SLAC-PUB
385 | SLAC R		---SLAC-R
386 | SLAC TN		---SLAC-TN
387 | SLAC WP		---SLAC-WP
388 | 
389 | 
390 | *****Berkeley Lab*****
391 | <s9?9?9?9?9>
392 | LBNL         ---LBNL
393 | 
394 | 
395 | 
396 | *****Argonne National Laboratory*****
397 | <syys9?9>
398 | 
399 | ANL HEP TR   ---ANL-HEP-TR
400 | 
401 | 
402 | *****Antares*****
403 | <syyyys9?9?9>
404 | 
405 | ANTARES SOFT  ---ANTARES-SOFT
406 | ANTARES PHYS  ---ANTARES-Phys
407 | ANTARES OPMO  ---ANTARES-Opmo
408 | 
409 | *****LIGO*****
410 | <sTyy9999s00sa>
411 | 
412 | LIGO          ---LIGO
413 | 
414 | *****Pierre Auger*****
415 | <syyyys9?9?9>
416 | 
417 | GAP           ---GAP
418 | 
419 | *****ILC*****
420 | <syyyys9?9?9>
421 | 
422 | EUDET MEMO    ---EUDET-MEMO
423 | EUDET REPORT  ---EUDET-REPORT
424 | EUROTEV REPORT---EUROTEV-REPORT
425 | ILC NOTE      ---ILC-NOTE
426 | ILC REPORT    ---ILC-REPORT
427 | LC DET        ---LC-DET
428 | LC PHSM       ---LC-PHSM
429 | LC REP        ---LC-REP
430 | LC REPORT     ---LC-REPORT
431 | LC TOOL       ---LC-TOOL
432 | LC TH         ---LC-TH
433 | LCD NOTE      ---LCD-NOTE
434 | 
435 | *****IHEP*****
436 | <syys9?9?9>
437 | <syyyys9?9?9>
438 | 
439 | IHEP AC       ---IHEP-AC
440 | IHEP CEPC DR  ---IHEP-CEPC-DR
441 | IHEP EP       ---IHEP-EP
442 | IHEP TH       ---IHEP-TH
443 | 
444 | *****IPAC*****
445 | <syyyysaaaaa9?9?9>
446 | 
447 | IPAC          ---IPAC
448 | 
449 | *****JINR*****
450 | <[EP]9?9syys9?9?9>
451 | <[EP]9?9syyyys9?9?9>
452 | 
453 | JINR          ---JINR
454 | 
455 | *****Other institutes (standard format)*****
456 | <syys9?9?9?9>
457 | <syyyys9?9?9?9>
458 | 
459 | BONN IR       ---BONN-IR
460 | BONN IB       ---BONN-IB
461 | DAMTP         ---DAMTP
462 | ESS           ---ESS
463 | EUCARD CON    ---EUCARD-CON
464 | INO           ---INO
465 | JAI           ---JAI
466 | KFKI          ---KFKI
467 | LPHE          ---LPHE
468 | MPP           ---MPP
469 | NIKHEF        ---NIKHEF
470 | RAL TR        ---RAL-TR
471 | SLS TME TA    ---SLS-TME-TA
472 | 


--------------------------------------------------------------------------------
/refextract/references/kbs/special-journals.kb:
--------------------------------------------------------------------------------
1 | JHEP
2 | JCAP
3 | 


--------------------------------------------------------------------------------
/refextract/references/pdf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of refextract.
  4 | # Copyright (C) 2016, 2017, 2018, 2020 CERN.
  5 | #
  6 | # refextract is free software; you can redistribute it and/or
  7 | # modify it under the terms of the GNU General Public License as
  8 | # published by the Free Software Foundation; either version 2 of the
  9 | # License, or (at your option) any later version.
 10 | #
 11 | # refextract is distributed in the hope that it will be useful, but
 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14 | # General Public License for more details.
 15 | #
 16 | # You should have received a copy of the GNU General Public License
 17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 19 | #
 20 | # In applying this license, CERN does not waive the privileges and immunities
 21 | # granted to it by virtue of its status as an Intergovernmental Organization
 22 | # or submit itself to any jurisdiction.
 23 | 
 24 | import logging
 25 | 
 26 | from pypdf import PdfReader
 27 | from pypdf.generic import ByteStringObject
 28 | 
 29 | from refextract.references.regexs import re_reference_in_dest
 30 | 
 31 | LOGGER = logging.getLogger(__name__)
 32 | 
 33 | 
 34 | class IncompleteCoordinatesError(Exception):
 35 |     """Exception raised when a named destination does not have all required
 36 |     coordinates.
 37 |     """
 38 | 
 39 |     pass
 40 | 
 41 | 
 42 | def extract_texkeys_and_urls_from_pdf(pdf_file):
 43 |     """
 44 |     Extract the texkeys and corresponding urls from the given PDF file
 45 | 
 46 |     This is done by looking up the named destinations in the PDF
 47 | 
 48 |     @param pdf_file: path to a PDF
 49 | 
 50 |     @return: list of dictionaries with all texkeys
 51 |      and corresponding urls found in the PDF
 52 |     """
 53 |     with open(pdf_file, "rb") as pdf_stream:
 54 |         try:
 55 |             pdf = PdfReader(pdf_stream, strict=False)
 56 |             destinations = pdf.named_destinations
 57 |             urls = extract_urls(pdf)
 58 |         except Exception:
 59 |             LOGGER.debug("PDF: Internal pypdf error, no TeXkeys returned.")
 60 |             return []
 61 |         # not all named destinations point to references
 62 |         refs = []
 63 |         for destination in destinations.items():
 64 |             destination_key = (
 65 |                 destination[0].decode("utf-8")
 66 |                 if isinstance(destination[0], ByteStringObject)
 67 |                 else destination[0]
 68 |             )
 69 |             match = re_reference_in_dest.match(destination_key)
 70 |             if match:
 71 |                 refs.append(destination)
 72 |         two_column_layout = False
 73 |         try:
 74 |             if _destinations_in_two_columns(pdf, refs):
 75 |                 two_column_layout = True
 76 |                 LOGGER.debug("PDF: Using two-column layout")
 77 | 
 78 |                 def sortfunc(dest_couple):
 79 |                     return dest_couple[1]
 80 | 
 81 |             else:
 82 |                 LOGGER.debug("PDF: Using single-column layout")
 83 | 
 84 |                 def sortfunc(dest_couple):
 85 |                     page, _, ypos, xpos = dest_couple[1]
 86 |                     return (page, ypos, xpos)
 87 | 
 88 |             refs = [(dest[0], _destination_position(pdf, dest[1])) for dest in refs]
 89 |             refs.sort(key=sortfunc)
 90 |             urls = [(uri["/A"]["/URI"], _uri_position(pdf, uri)) for uri in urls]
 91 |             urls.sort(key=sortfunc)
 92 |             texkey_url_list = []
 93 |             for nb, ref in enumerate(refs):
 94 |                 current_texkey_urls_dict = {}
 95 |                 current_texkey_urls_dict["texkey"] = re_reference_in_dest.match(
 96 |                     ref[0]
 97 |                 ).group(1)
 98 |                 if nb < len(refs) - 1:
 99 |                     next_reference_data = refs[nb + 1]
100 |                     matched_urls_for_reference, urls = _match_urls_with_reference(
101 |                         urls,
102 |                         ref,
103 |                         next_reference_data,
104 |                         two_column_layout=two_column_layout,
105 |                     )
106 |                 else:
107 |                     matched_urls_for_reference, urls = _match_urls_with_reference(
108 |                         urls, ref, two_column_layout=two_column_layout
109 |                     )
110 |                 if matched_urls_for_reference:
111 |                     current_texkey_urls_dict["urls"] = matched_urls_for_reference
112 |                 texkey_url_list.append(current_texkey_urls_dict)
113 |             return texkey_url_list
114 |         except Exception:
115 |             LOGGER.debug("PDF: Impossible to determine layout, no TeXkeys returned")
116 |             return []
117 | 
118 | 
119 | def _match_urls_with_reference(
120 |     urls_to_match, reference, next_reference=None, two_column_layout=False
121 | ):
122 |     ref_page_number, ref_column, ref_y, _ = reference[1]
123 |     if next_reference:
124 |         next_ref_page_number, next_ref_col, next_ref_y, _ = next_reference[1]
125 |     urls_for_reference = set()
126 |     for url_index, url in enumerate(urls_to_match):
127 |         url_page_number, url_col, url_y, _ = url[1]
128 |         is_url_under_texkey = ref_y <= url_y
129 |         is_url_in_same_col = ref_column == url_col
130 |         is_url_in_next_col = url_col > ref_column
131 |         is_reference_on_same_page_as_url = ref_page_number == url_page_number
132 |         is_reference_on_previous_page_than_url = ref_page_number + 1 == url_page_number
133 |         if not next_reference:
134 |             if (
135 |                 (
136 |                     is_reference_on_same_page_as_url
137 |                     and (is_url_in_same_col or is_url_in_next_col)
138 |                 )
139 |                 or is_reference_on_previous_page_than_url
140 |             ) and is_url_under_texkey:
141 |                 urls_for_reference.add(url[0])
142 |             continue
143 |         is_url_between_texkeys = (
144 |             is_reference_on_same_page_as_url or is_reference_on_previous_page_than_url
145 |         ) and (ref_y <= url_y <= next_ref_y)
146 |         is_next_reference_on_the_same_page = next_ref_page_number == url_page_number
147 |         is_last_reference_in_page = (
148 |             is_reference_on_same_page_as_url
149 |             and (next_ref_page_number > url_page_number)
150 |             and is_url_under_texkey
151 |         )
152 |         is_last_reference_in_page_two_col_layout = (
153 |             is_reference_on_same_page_as_url
154 |             and is_next_reference_on_the_same_page
155 |             and is_url_under_texkey
156 |             and (next_ref_col > url_col)
157 |             and next_ref_y < url_y
158 |             and ref_y <= url_y
159 |             and (is_url_in_same_col or is_url_in_next_col)
160 |         )
161 |         is_in_new_column = (
162 |             is_reference_on_same_page_as_url
163 |             and is_next_reference_on_the_same_page
164 |             and ref_y > url_y
165 |             and (next_ref_col > ref_column)
166 |             and (next_ref_y > url_y)
167 |         )
168 |         is_url_for_other_reference_in_new_column = (
169 |             is_reference_on_same_page_as_url
170 |             and (next_ref_page_number == url_page_number)
171 |             and (next_ref_col == ref_column < url_col)
172 |             and (next_ref_y > url_y)
173 |         )
174 |         is_url_unrelated_to_references = ref_page_number > url_page_number
175 |         is_url_for_next_reference = url_y >= next_ref_y
176 |         if is_url_between_texkeys:
177 |             if not two_column_layout or (two_column_layout and url_col == ref_column):
178 |                 urls_for_reference.add(url[0])
179 |                 continue
180 |         elif (
181 |             is_last_reference_in_page
182 |             or is_last_reference_in_page_two_col_layout
183 |             or is_in_new_column
184 |         ):
185 |             urls_for_reference.add(url[0])
186 |             continue
187 |         elif is_url_unrelated_to_references:
188 |             continue
189 |         elif is_url_for_next_reference or is_url_for_other_reference_in_new_column:
190 |             urls_to_match = urls_to_match[url_index:]
191 |             break
192 |     if not next_reference:
193 |         urls_to_match = []
194 |     return urls_for_reference, urls_to_match
195 | 
196 | 
197 | def _destinations_in_two_columns(pdf, destinations, cutoff=3):
198 |     """
199 |     Check if the named destinations are organized along two columns (heuristic)
200 | 
201 |     @param pdf: a PdfReader object
202 |     @param destinations:
203 | 
204 |     'cutoff' is used to tune the heuristic: if 'cutoff' destinations in the
205 |     would-be second column start at the same position, return True
206 |     """
207 |     # iterator for the x coordinates of refs in the would-be second column
208 |     xpositions = (
209 |         _destination_position(pdf, dest)[3]
210 |         for (_, dest) in destinations
211 |         if _destination_position(pdf, dest)[1] == 1
212 |     )
213 |     xpos_count = {}
214 |     for xpos in xpositions:
215 |         xpos_count[xpos] = xpos_count.get(xpos, 0) + 1
216 |         if xpos_count[xpos] >= cutoff:
217 |             return True
218 |     return False
219 | 
220 | 
221 | def _destination_position(pdf, destination):
222 |     """
223 |     Gives a tuple (page, column, -y, x) representing the position of the
224 |     NamedDestination
225 | 
226 |     This representation is useful for sorting named destinations and
227 |     assumes the text has at most 2 columns
228 |     """
229 |     pagewidth = pdf.pages[
230 |         pdf.get_destination_page_number(destination)
231 |     ].cropbox.lower_right[0]
232 |     if not destination.left or not destination.top:
233 |         raise IncompleteCoordinatesError(destination)
234 |     # assuming max 2 columns
235 |     column = (2 * destination.left) // pagewidth
236 |     return (
237 |         pdf.get_destination_page_number(destination),
238 |         column,
239 |         -destination.top,
240 |         destination.left,
241 |     )
242 | 
243 | 
244 | def _uri_position(pdf, uri_destination):
245 |     """
246 |     Gives a tuple (page, column, -y, x) representing the position of the URI
247 |     """
248 |     page_nb = uri_destination.get("page_nb")
249 |     destintation_left = uri_destination["/Rect"][0]
250 |     destintation_top = uri_destination["/Rect"][3]
251 |     pagewidth = pdf.get_page(page_nb).cropbox.lower_right[0]
252 |     column = (2 * destintation_left) // pagewidth
253 |     # neccessary to exclude column from sorting
254 |     return (page_nb, column, -destintation_top, destintation_left)
255 | 
256 | 
257 | def extract_urls(pdf):
258 |     urls = []
259 |     pages = len(pdf.pages)
260 |     for page_nb in range(pages):
261 |         page = pdf.pages[page_nb]
262 |         page_object = page.get_object()
263 |         urls_for_page = _get_urls_data_from_page_object(page_object, page_nb)
264 |         urls.extend(urls_for_page)
265 |     return urls
266 | 
267 | 
268 | def _get_urls_data_from_page_object(page_object, page_nb):
269 |     urls_at_page = []
270 |     annotations = page_object.get("/Annots", [])
271 |     for annotation in annotations:
272 |         annotation_object = annotation.get_object()
273 |         if "/URI" in annotation_object["/A"]:
274 |             annotation_object.update({"page_nb": page_nb})
275 |             urls_at_page.append(annotation_object)
276 |     return urls_at_page
277 | 


--------------------------------------------------------------------------------
/refextract/references/record.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of refextract.
  4 | # Copyright (C) 2013, 2015, 2016, 2017, 2018, 2020 CERN.
  5 | #
  6 | # refextract is free software; you can redistribute it and/or
  7 | # modify it under the terms of the GNU General Public License as
  8 | # published by the Free Software Foundation; either version 2 of the
  9 | # License, or (at your option) any later version.
 10 | #
 11 | # refextract is distributed in the hope that it will be useful, but
 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14 | # General Public License for more details.
 15 | #
 16 | # You should have received a copy of the GNU General Public License
 17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 19 | #
 20 | # In applying this license, CERN does not waive the privileges and immunities
 21 | # granted to it by virtue of its status as an Intergovernmental Organization
 22 | # or submit itself to any jurisdiction.
 23 | 
 24 | 
 25 | def format_marker(line_marker):
 26 |     return line_marker.strip("[](){}. ")
 27 | 
 28 | 
 29 | def build_references(citations, reference_format=False):
 30 |     """Build list of reference dictionaries from a references list"""
 31 |     # Now, run the method which will take as input:
 32 |     # 1. A list of lists of dictionaries, where each dictionary is a piece
 33 |     # of citation information corresponding to a tag in the citation.
 34 |     # 2. The line marker for this entire citation line (mulitple citation
 35 |     # 'finds' inside a single citation will use the same marker value)
 36 |     # The resulting xml line will be a properly marked up form of the
 37 |     # citation. It will take into account authors to try and split up
 38 |     # references which should be read as two SEPARATE ones.
 39 |     return [
 40 |         c
 41 |         for citation_elements in citations
 42 |         for elements in citation_elements["elements"]
 43 |         for c in build_reference_fields(
 44 |             elements,
 45 |             citation_elements["line_marker"],
 46 |             citation_elements["raw_ref"],
 47 |             reference_format,
 48 |         )
 49 |     ]
 50 | 
 51 | 
 52 | def add_subfield(field, code, value):
 53 |     if value:
 54 |         field.setdefault(code, []).append(value)
 55 | 
 56 | 
 57 | def add_journal_subfield(field, element, reference_format):
 58 |     add_subfield(field, "journal_title", element.get("title"))
 59 |     add_subfield(field, "journal_volume", element.get("volume"))
 60 |     add_subfield(field, "journal_year", element.get("year"))
 61 |     add_subfield(field, "journal_page", element.get("page"))
 62 |     add_subfield(field, "journal_reference", reference_format.format(**element))
 63 | 
 64 | 
 65 | def create_reference_field(line_marker):
 66 |     field = {}
 67 |     if line_marker.strip("., [](){}"):
 68 |         add_subfield(field, "linemarker", format_marker(line_marker))
 69 |     return field
 70 | 
 71 | 
 72 | def build_reference_fields(citation_elements, line_marker, raw_ref, reference_format):
 73 |     """Create the final representation of the reference information.
 74 | 
 75 |     @param citation_elements: (list) an ordered list of dictionary elements,
 76 |                               with each element corresponding to a found
 77 |                               piece of information from a reference line.
 78 |     @param line_marker: (string) The line marker for this single reference
 79 |                         line (e.g. [19])
 80 |     @param raw_ref: (string) The raw string of this line
 81 |     @return reference_fields: (list) A list of one dictionary containing the
 82 |                       reference elements
 83 |     """
 84 |     # Begin the datafield element
 85 |     current_field = create_reference_field(line_marker)
 86 |     current_field["raw_ref"] = [raw_ref]
 87 | 
 88 |     reference_fields = [current_field]
 89 | 
 90 |     for element in citation_elements:
 91 |         # Before going onto checking 'what' the next element is,
 92 |         # handle misc text and semi-colons
 93 |         # Multiple misc text subfields will be compressed later
 94 |         # This will also be the only part of the code that deals with MISC
 95 |         # tag_typed elements
 96 |         misc_txt = element["misc_txt"]
 97 |         if misc_txt.strip("., [](){}"):
 98 |             misc_txt = misc_txt.lstrip("])} ,.").rstrip("[({ ,.")
 99 |             add_subfield(current_field, "misc", misc_txt)
100 | 
101 |         # Now handle the type dependent actions
102 |         # JOURNAL
103 |         if element["type"] == "JOURNAL":
104 |             add_journal_subfield(current_field, element, reference_format)
105 | 
106 |         # REPORT NUMBER
107 |         elif element["type"] == "REPORTNUMBER":
108 |             add_subfield(current_field, "reportnumber", element["report_num"])
109 | 
110 |         # URL
111 |         elif element["type"] == "URL":
112 |             if element["url_string"] == element["url_desc"]:
113 |                 # Build the datafield for the URL segment of the reference
114 |                 # line:
115 |                 add_subfield(current_field, "url", element["url_string"])
116 |             # Else, in the case that the url string and the description differ
117 |             # in some way, include them both
118 |             else:
119 |                 add_subfield(current_field, "url", element["url_string"])
120 |                 add_subfield(current_field, "urldesc", element["url_desc"])
121 | 
122 |         # DOI
123 |         elif element["type"] == "DOI":
124 |             add_subfield(current_field, "doi", "doi:" + element["doi_string"])
125 | 
126 |         # HDL
127 |         elif element["type"] == "HDL":
128 |             add_subfield(current_field, "hdl", "hdl:" + element["hdl_id"])
129 | 
130 |         # AUTHOR
131 |         elif element["type"] == "AUTH":
132 |             value = element["auth_txt"]
133 |             if element["auth_type"] == "incl":
134 |                 value = "(%s)" % value
135 | 
136 |             add_subfield(current_field, "author", value)
137 | 
138 |         elif element["type"] == "QUOTED":
139 |             add_subfield(current_field, "title", element["title"])
140 | 
141 |         elif element["type"] == "ISBN":
142 |             add_subfield(current_field, "isbn", element["ISBN"])
143 | 
144 |         elif element["type"] == "BOOK":
145 |             add_subfield(current_field, "title", element["title"])
146 | 
147 |         elif element["type"] == "PUBLISHER":
148 |             add_subfield(current_field, "publisher", element["publisher"])
149 | 
150 |         elif element["type"] == "YEAR":
151 |             add_subfield(current_field, "year", element["year"])
152 | 
153 |         elif element["type"] == "COLLABORATION":
154 |             add_subfield(current_field, "collaboration", element["collaboration"])
155 | 
156 |         elif element["type"] == "RECID":
157 |             add_subfield(current_field, "recid", str(element["recid"]))
158 | 
159 |     return reference_fields
160 | 
161 | 
162 | def update_reference_with_urls(reference, url_set):
163 |     for url in url_set:
164 |         add_subfield(reference, "url", url)
165 | 
166 | 
167 | def merge_misc(field):
168 |     current_misc = None
169 |     for subfield in field.subfields[:]:
170 |         if subfield.code == "m":
171 |             if current_misc is None:
172 |                 current_misc = subfield
173 |             else:
174 |                 current_misc.value += " " + subfield.value
175 |                 field.subfields.remove(subfield)
176 | 


--------------------------------------------------------------------------------
/refextract/references/text.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of refextract.
  4 | # Copyright (C) 2015, 2016, 2017, 2018 CERN.
  5 | #
  6 | # refextract is free software; you can redistribute it and/or
  7 | # modify it under the terms of the GNU General Public License as
  8 | # published by the Free Software Foundation; either version 2 of the
  9 | # License, or (at your option) any later version.
 10 | #
 11 | # refextract is distributed in the hope that it will be useful, but
 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14 | # General Public License for more details.
 15 | #
 16 | # You should have received a copy of the GNU General Public License
 17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 19 | #
 20 | # In applying this license, CERN does not waive the privileges and immunities
 21 | # granted to it by virtue of its status as an Intergovernmental Organization
 22 | # or submit itself to any jurisdiction.
 23 | 
 24 | import logging
 25 | import re
 26 | 
 27 | from inspire_utils.record import replace_undesirable_characters
 28 | 
 29 | from refextract.documents.text import (
 30 |     join_lines,
 31 |     re_multiple_space,
 32 |     remove_page_boundary_lines,
 33 |     repair_broken_urls,
 34 | )
 35 | from refextract.references.config import CFG_REFEXTRACT_MAX_LINES
 36 | from refextract.references.find import (
 37 |     find_end_of_reference_section,
 38 |     get_reference_section_beginning,
 39 | )
 40 | 
 41 | LOGGER = logging.getLogger(__name__)
 42 | 
 43 | 
 44 | def extract_references_from_fulltext(fulltext):
 45 |     """Locate and extract the reference section from a fulltext document.
 46 |     Return the extracted reference section as a list of strings, whereby each
 47 |     string in the list is considered to be a single reference line.
 48 |      E.g. a string could be something like:
 49 |      '[19] Wilson, A. Unpublished (1986).
 50 |     @param fulltext: (list) of strings, whereby each string is a line of the
 51 |      document.
 52 |     @return: (list) of strings, where each string is an extracted reference
 53 |      line.
 54 |     """
 55 |     # Try to remove pagebreaks, headers, footers
 56 |     fulltext = remove_page_boundary_lines(fulltext)
 57 |     status = 0
 58 |     # How ref section found flag
 59 |     how_found_start = 0
 60 |     # Find start of refs section
 61 |     ref_sect_start = get_reference_section_beginning(fulltext)
 62 | 
 63 |     if ref_sect_start is None:
 64 |         # No References
 65 |         refs = []
 66 |         status = 4
 67 |         LOGGER.debug("extract_references_from_fulltext: ref_sect_start is None")
 68 |     else:
 69 |         # If a reference section was found, however weak
 70 |         ref_sect_end = find_end_of_reference_section(
 71 |             fulltext,
 72 |             ref_sect_start["start_line"],
 73 |             ref_sect_start["marker"],
 74 |             ref_sect_start["marker_pattern"],
 75 |         )
 76 |         if ref_sect_end is None:
 77 |             # No End to refs? Not safe to extract
 78 |             refs = []
 79 |             status = 5
 80 |             LOGGER.debug("extract_references_from_fulltext: no end to refs!")
 81 |         else:
 82 |             # If the end of the reference section was found.. start extraction
 83 |             refs = get_reference_lines(
 84 |                 fulltext,
 85 |                 ref_sect_start["start_line"],
 86 |                 ref_sect_end,
 87 |                 ref_sect_start["title_string"],
 88 |                 ref_sect_start["marker_pattern"],
 89 |                 ref_sect_start["title_marker_same_line"],
 90 |             )
 91 | 
 92 |     return refs, status, how_found_start
 93 | 
 94 | 
 95 | def get_reference_lines(
 96 |     docbody,
 97 |     ref_sect_start_line,
 98 |     ref_sect_end_line,
 99 |     ref_sect_title,
100 |     ref_line_marker_ptn,
101 |     title_marker_same_line,
102 | ):
103 |     """After the reference section of a document has been identified, and the
104 |     first and last lines of the reference section have been recorded, this
105 |     function is called to take the reference lines out of the document body.
106 |     The document's reference lines are returned in a list of strings whereby
107 |     each string is a reference line. Before this can be done however, the
108 |     reference section is passed to another function that rebuilds any broken
109 |     reference lines.
110 |     @param docbody: (list) of strings - the entire document body.
111 |     @param ref_sect_start_line: (integer) - the index in docbody of the first
112 |      reference line.
113 |     @param ref_sect_end_line: (integer) - the index in docbody of the last
114 |      reference line.
115 |     @param ref_sect_title: (string) - the title of the reference section
116 |      (e.g. "References").
117 |     @param ref_line_marker_ptn: (string) - the patern used to match the
118 |      marker for each reference line (e.g., could be used to match lines
119 |      with markers of the form [1], [2], etc.)
120 |     @param title_marker_same_line: (integer) - a flag to indicate whether
121 |      or not the reference section title was on the same line as the first
122 |      reference line's marker.
123 |     @return: (list) of strings. Each string is a reference line, extracted
124 |      from the document.
125 |     """
126 |     start_idx = ref_sect_start_line
127 |     if title_marker_same_line:
128 |         # Title on same line as 1st ref- take title out!
129 |         title_start = docbody[start_idx].find(ref_sect_title)
130 |         if title_start != -1:
131 |             # Set the first line with no title
132 |             docbody[start_idx] = docbody[start_idx][title_start + len(ref_sect_title) :]
133 |     elif ref_sect_title is not None:
134 |         # Set the start of the reference section to be after the title line
135 |         start_idx += 1
136 | 
137 |     if ref_sect_end_line is not None:
138 |         ref_lines = docbody[start_idx : ref_sect_end_line + 1]
139 |     else:
140 |         ref_lines = docbody[start_idx:]
141 | 
142 |     if ref_sect_title:
143 |         ref_lines = strip_footer(ref_lines, ref_sect_title)
144 |     # Now rebuild reference lines:
145 |     # (Go through each raw reference line, and format them into a set
146 |     # of properly ordered lines based on markers)
147 |     return rebuild_reference_lines(ref_lines, ref_line_marker_ptn)
148 | 
149 | 
150 | def match_pagination(ref_line):
151 |     """Remove footer pagination from references lines"""
152 |     pattern = r"\(?\[?(\d{1,4})\]?\)?\.?\s*$"
153 |     re_footer = re.compile(pattern, re.UNICODE)
154 |     match = re_footer.match(ref_line)
155 |     if match:
156 |         return int(match.group(1))
157 |     return None
158 | 
159 | 
160 | def strip_footer(ref_lines, section_title):
161 |     """Remove footer title from references lines"""
162 |     pattern = r"\(?\[?\d{0,4}\]?\)?\.?\s*%s\s*$" % re.escape(section_title)
163 |     re_footer = re.compile(pattern, re.UNICODE)
164 |     return [line for line in ref_lines if not re_footer.match(line)]
165 | 
166 | 
167 | def rebuild_reference_lines(ref_sectn, ref_line_marker_ptn):
168 |     """Given a reference section, rebuild the reference lines. After translation
169 |     from PDF to text, reference lines are often broken. This is because
170 |     pdftotext doesn't know what is a wrapped-line and what is a genuine new
171 |     line. As a result, the following 2 reference lines:
172 |      [1] See http://invenio-software.org/ for more details.
173 |      [2] Example, AN: private communication (1996).
174 |     ...could be broken into the following 4 lines during translation from PDF
175 |     to plaintext:
176 |      [1] See http://invenio-software.org/ fo
177 |      r more details.
178 |      [2] Example, AN: private communica
179 |      tion (1996).
180 |     Such a situation could lead to a citation being separated across 'lines',
181 |     meaning that it wouldn't be correctly recognised.
182 |     This function tries to rebuild the reference lines. It uses the pattern
183 |     used to recognise a reference line's numeration marker to indicate the
184 |     start of a line. If no reference line numeration was recognised, it will
185 |     simply join all lines together into one large reference line.
186 |     @param ref_sectn: (list) of strings. The (potentially broken) reference
187 |      lines.
188 |     @param ref_line_marker_ptn: (string) - the pattern used to recognise a
189 |      reference line's numeration marker.
190 |     @return: (list) of strings - the rebuilt reference section. Each string
191 |      in the list represents a complete reference line.
192 |     """
193 |     indentation_splitting = False
194 | 
195 |     # This should be moved the function detecting the pattern!
196 |     if not ref_line_marker_ptn:
197 |         if test_for_blank_lines_separating_reference_lines(ref_sectn):
198 |             # Use blank lines to separate ref lines
199 |             ref_line_marker_ptn = r"^\s*$"
200 |         else:
201 |             # No ref line dividers
202 |             # We are guessing this the format:
203 |             # Reference1
204 |             #      etc
205 |             # Reference2
206 |             #      etc
207 |             # We split when there's no identation
208 |             indentation_splitting = True
209 |             ref_line_marker_ptn = r"^[^\s]"
210 | 
211 |     LOGGER.debug("references separator %s", ref_line_marker_ptn)
212 |     p_ref_line_marker = re.compile(ref_line_marker_ptn, re.I | re.UNICODE)
213 | 
214 |     # Start from ref 1
215 |     # Append each fixed reference line to rebuilt_references
216 |     # and rebuild references as we go
217 |     current_ref = 0
218 |     rebuilt_references = []
219 |     working_ref = []
220 | 
221 |     def prepare_ref(working_ref):
222 |         working_ref = working_ref[:CFG_REFEXTRACT_MAX_LINES]
223 |         working_line = ""
224 |         for line in working_ref:
225 |             working_line = join_lines(working_line, line.strip())
226 |         return working_line.rstrip()
227 | 
228 |     lower_case_start = re.compile(r"[a-z]")
229 |     continuing_line_markers = re.compile(r"[,&-]$")
230 | 
231 |     for line in ref_sectn:
232 |         # Can't find a good way to distinguish between
233 |         # pagination and the page number of a journal numeration that
234 |         # happens to be alone in a new line
235 |         # m = match_pagination(line)
236 |         # if m and current_ref and current_ref != m + 1:
237 |         #     continue
238 | 
239 |         # Try to find the marker for the reference line
240 |         m_ref_line_marker = p_ref_line_marker.search(line)
241 | 
242 |         if m_ref_line_marker:
243 |             try:
244 |                 marknum = int(m_ref_line_marker.group("marknum"))
245 |             except IndexError:
246 |                 marknum = None
247 |             except ValueError:
248 |                 # If the mark is a unicode character category [Nd],
249 |                 # it is not always convertible to int by int()
250 |                 # We can't use its numerical value, but we still accept it
251 |                 # as numeration
252 |                 pass
253 | 
254 |             new_line_detected = False
255 |             if marknum is None or current_ref + 1 == marknum:
256 |                 new_line_detected = True
257 |             if indentation_splitting:
258 |                 if lower_case_start.match(line.strip()):
259 |                     new_line_detected = False
260 |                 if working_ref and continuing_line_markers.search(
261 |                     working_ref[-1].strip()
262 |                 ):
263 |                     new_line_detected = False
264 | 
265 |             if new_line_detected:
266 |                 # Reference line marker found! : Append this reference to the
267 |                 # list of fixed references and reset the working_line to
268 |                 # 'blank'
269 |                 start = m_ref_line_marker.start()
270 |                 if line[:start]:
271 |                     # If it's not a blank line to separate refs
272 |                     # Only append from the start of the marker
273 |                     # For this case:
274 |                     # [1] hello
275 |                     # hello2 [2] foo
276 |                     working_ref.append(line[:start])
277 | 
278 |                 # Append current working line to the refs list
279 |                 if working_ref:
280 |                     rebuilt_references.append(prepare_ref(working_ref))
281 | 
282 |                 current_ref = marknum
283 |                 working_ref = []
284 |                 if line[start:]:
285 |                     working_ref.append(line[start:])
286 | 
287 |             else:
288 |                 # Our marker does not match the counting
289 |                 # Either we missed one, the author missed one or
290 |                 # it is not a line marker
291 |                 # For now we assume it is not line marker
292 |                 working_ref.append(line)
293 | 
294 |         elif line:
295 |             # Continuation of line
296 |             working_ref.append(line)
297 | 
298 |     if working_ref:
299 |         # Append last line
300 |         rebuilt_references.append(prepare_ref(working_ref))
301 | 
302 |     return rebuilt_references
303 | 
304 | 
305 | def wash_and_repair_reference_line(line):
306 |     """Wash a reference line of undesirable characters (such as poorly-encoded
307 |     letters, etc), and repair any errors (such as broken URLs) if possible.
308 |     @param line: (string) the reference line to be washed/repaired.
309 |     @return: (string) the washed reference line.
310 |     """
311 |     # repair URLs in line:
312 |     line = repair_broken_urls(line)
313 |     # Replace various undesirable characters with their alternatives:
314 |     line = replace_undesirable_characters(line)
315 |     # Replace "<title>," with "<title>",
316 |     # common typing mistake
317 |     line = re.sub(r'"([^"]+),"', r'"\g<1>",', line)
318 |     line = replace_undesirable_characters(line)
319 |     # Remove instances of multiple spaces from line, replacing with a
320 |     # single space:
321 |     line = re_multiple_space.sub(" ", line)
322 |     return line
323 | 
324 | 
325 | def test_for_blank_lines_separating_reference_lines(ref_sect):
326 |     """Test to see if reference lines are separated by blank lines so that
327 |     these can be used to rebuild reference lines.
328 |     @param ref_sect: (list) of strings - the reference section.
329 |     @return: (int) 0 if blank lines do not separate reference lines; 1 if
330 |      they do.
331 |     """
332 |     num_blanks = 0  # Number of blank lines found between non-blanks
333 |     num_lines = 0  # Number of reference lines separated by blanks
334 |     blank_line_separators = 0  # Flag to indicate whether blanks lines separate
335 |     # ref lines
336 |     multi_nonblanks_found = 0  # Flag to indicate whether multiple nonblank
337 |     # lines are found together (used because
338 |     # if line is dbl-spaced, it isnt a blank that
339 |     # separates refs & can't be relied upon)
340 |     x = 0
341 |     max_line = len(ref_sect)
342 |     while x < max_line:
343 |         if not ref_sect[x].isspace():
344 |             # not an empty line:
345 |             num_lines += 1
346 |             x += 1  # Move past line
347 |             while x < len(ref_sect) and not ref_sect[x].isspace():
348 |                 multi_nonblanks_found = 1
349 |                 x += 1
350 |             x -= 1
351 |         else:
352 |             # empty line
353 |             num_blanks += 1
354 |             x += 1
355 |             while x < len(ref_sect) and ref_sect[x].isspace():
356 |                 x += 1
357 |             if x == len(ref_sect):
358 |                 # Blanks at end doc: dont count
359 |                 num_blanks -= 1
360 |             x -= 1
361 |         x += 1
362 |     # Now from the number of blank lines & the number of text lines, if
363 |     # num_lines > 3, & num_blanks = num_lines, or num_blanks = num_lines - 1,
364 |     # then we have blank line separators between reference lines
365 |     if (
366 |         (num_lines > 3)
367 |         and ((num_blanks == num_lines) or (num_blanks == num_lines - 1))
368 |         and (multi_nonblanks_found)
369 |     ):
370 |         blank_line_separators = 1
371 |     return blank_line_separators
372 | 


--------------------------------------------------------------------------------
/ruff.toml:
--------------------------------------------------------------------------------
 1 | target-version = "py311"
 2 | [lint.flake8-tidy-imports]
 3 | ban-relative-imports = "all"
 4 | 
 5 | [lint]
 6 | select = [
 7 |     # pycodestyle
 8 |     "E",
 9 |     # Pyflakes
10 |     "F",
11 |     # flake8-bugbear
12 |     "B",
13 |     # flake8-simplify
14 |     "SIM",
15 |     # isort
16 |     "I",
17 |     # flake8-tidy-imports
18 |     "TID",
19 |     # flake8-pytest-style
20 |     "PT",
21 | ]
22 | ignore = ["B904"]
23 | 
24 | [lint.pycodestyle]
25 | ignore-overlong-task-comments = true
26 | 
27 | [lint.pydocstyle]
28 | convention = "google"
29 | 


--------------------------------------------------------------------------------
/run-tests.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | #
 3 | # This file is part of refextract
 4 | # Copyright (C) 2015, 2018 CERN.
 5 | #
 6 | # refextract is free software; you can redistribute it and/or
 7 | # modify it under the terms of the GNU General Public License as
 8 | # published by the Free Software Foundation; either version 2 of the
 9 | # License, or (at your option) any later version.
10 | #
11 | # refextract is distributed in the hope that it will be useful, but
12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 | # General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
19 | #
20 | # In applying this license, CERN does not waive the privileges and immunities
21 | # granted to it by virtue of its status as an Intergovernmental Organization
22 | # or submit itself to any jurisdiction.
23 | 
24 | set -e
25 | 
26 | flake8 refextract tests
27 | py.test tests
28 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of refextract.
 4 | # Copyright (C) 2016, 2018, 2020 CERN.
 5 | #
 6 | # refextract is free software; you can redistribute it and/or
 7 | # modify it under the terms of the GNU General Public License as
 8 | # published by the Free Software Foundation; either version 2 of the
 9 | # License, or (at your option) any later version.
10 | #
11 | # refextract is distributed in the hope that it will be useful, but
12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 | # General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
19 | #
20 | # In applying this license, CERN does not waive the privileges and immunities
21 | # granted to it by virtue of its status as an Intergovernmental Organization
22 | # or submit itself to any jurisdiction.
23 | 
24 | import os
25 | 
26 | import pytest
27 | 
28 | 
29 | @pytest.fixture
30 | def pdf_files():
31 |     path_to_pdfs = os.path.join(os.path.dirname(__file__), "data")
32 |     pdfs = os.listdir(path_to_pdfs)
33 |     pdfs.sort()
34 |     return {pdf: os.path.join(path_to_pdfs, pdf) for pdf in pdfs}
35 | 


--------------------------------------------------------------------------------
/tests/data/1503.07589v1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/1503.07589v1.pdf


--------------------------------------------------------------------------------
/tests/data/1508.05632v2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/1508.05632v2.pdf


--------------------------------------------------------------------------------
/tests/data/1706.09498v1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/1706.09498v1.pdf


--------------------------------------------------------------------------------
/tests/data/1707.04066v1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/1707.04066v1.pdf


--------------------------------------------------------------------------------
/tests/data/1805.05865.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/1805.05865.pdf


--------------------------------------------------------------------------------
/tests/data/2110.02751.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2110.02751.pdf


--------------------------------------------------------------------------------
/tests/data/2301.05883.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2301.05883.pdf


--------------------------------------------------------------------------------
/tests/data/2303.03819.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2303.03819.pdf


--------------------------------------------------------------------------------
/tests/data/2304.10117.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2304.10117.pdf


--------------------------------------------------------------------------------
/tests/data/2406.06875.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2406.06875.pdf


--------------------------------------------------------------------------------
/tests/data/2502.18907.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2502.18907.pdf


--------------------------------------------------------------------------------
/tests/data/2502.21088.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2502.21088.pdf


--------------------------------------------------------------------------------
/tests/data/2503.05372.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2503.05372.pdf


--------------------------------------------------------------------------------
/tests/data/2503.05621.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/2503.05621.pdf


--------------------------------------------------------------------------------
/tests/data/DIS_SHEILA_final.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/DIS_SHEILA_final.pdf


--------------------------------------------------------------------------------
/tests/data/file_resolving.csv:
--------------------------------------------------------------------------------
1 | 1|2|3
2 | 4|5|6
3 | 


--------------------------------------------------------------------------------
/tests/data/packed_pdf.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/packed_pdf.pdf


--------------------------------------------------------------------------------
/tests/data/wepml008.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/inspirehep/refextract/bf4eb82251eaeaeb73fcc3adf2becaac45e3a0c8/tests/data/wepml008.pdf


--------------------------------------------------------------------------------
/tests/integration/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from refextract.app import create_app
 4 | 
 5 | 
 6 | @pytest.fixture(autouse=True, scope="session")
 7 | def app():
 8 |     app = create_app()
 9 |     return app
10 | 
11 | 
12 | @pytest.fixture
13 | def app_client(app):
14 |     with app.test_client() as client:
15 |         yield client
16 | 
17 | 
18 | @pytest.fixture(scope="session")
19 | def vcr_config():
20 |     return {
21 |         "filter_query_parameters": ["access_token"],
22 |         "ignore_localhost": True,
23 |         "decode_compressed_response": True,
24 |         "filter_headers": ("Authorization", "User-Agent"),
25 |         "record_mode": "once",
26 |     }
27 | 


--------------------------------------------------------------------------------
/tests/integration/test_views.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | 
  3 | import mock
  4 | import pytest
  5 | 
  6 | 
  7 | def test_extract_journal_info(app_client):
  8 |     journal_kb_data = {
  9 |         "COMMUNICATIONS IN ASTEROSEISMOLOGY": "Commun.Asteros.",
 10 |         "PHYS REV": "Phys.Rev.",
 11 |         "PHYSICAL REVIEW": "Phys.Rev.",
 12 |         "PHYS REV LETT": "Phys.Rev.Lett.",
 13 |         "JINST": "JINST",
 14 |         "JOURNAL OF INSTRUMENTATION": "JINST",
 15 |         "SENS ACTUATORS B": "Sens.Actuators B",
 16 |         "SENSORS AND ACTUATORS B: CHEMICAL": "Sens.Actuators B",
 17 |         "PHYS SCRIPTA": "Phys.Scripta",
 18 |         "PHYSICA SCRIPTA": "Phys.Scripta",
 19 |         "BULL CALCUTTA MATH SOC": "Bull.Calcutta Math.Soc.",
 20 |         "BULLETIN OF THE CALCUTTA MATHEMATICAL SOCIETY": "Bull.Calcutta Math.Soc.",
 21 |         "QUANTUM MACHINE INTELLIGENCE": "Quantum Machine Intelligence",
 22 |     }
 23 |     publication_infos = [
 24 |         {"pubinfo_freetext": "Phys. Rev. 127 (1962) 965-970"},
 25 |         {"journal_title": "Phys. Rev."},
 26 |     ]
 27 | 
 28 |     payload = {
 29 |         "journal_kb_data": journal_kb_data,
 30 |         "publication_infos": publication_infos,
 31 |     }
 32 | 
 33 |     headers = {
 34 |         "content-type": "application/json",
 35 |     }
 36 |     response = app_client.post(
 37 |         "/extract_journal_info",
 38 |         headers=headers,
 39 |         data=json.dumps(payload),
 40 |     )
 41 |     assert response.status_code == 200
 42 |     assert "extracted_publication_infos" in response.json
 43 |     assert len(response.json["extracted_publication_infos"]) == 2
 44 | 
 45 | 
 46 | @mock.patch(
 47 |     "refextract.app.extract_journal_reference", side_effect=KeyError("test message")
 48 | )
 49 | def test_extract_journal_info_when_timeout_from_refextract(
 50 |     mock_extract_refs, app_client
 51 | ):
 52 |     journal_kb_data = {
 53 |         "COMMUNICATIONS IN ASTEROSEISMOLOGY": "Commun.Asteros.",
 54 |         "PHYS REV": "Phys.Rev.",
 55 |         "PHYSICAL REVIEW": "Phys.Rev.",
 56 |         "PHYS REV LETT": "Phys.Rev.Lett.",
 57 |         "JINST": "JINST",
 58 |         "JOURNAL OF INSTRUMENTATION": "JINST",
 59 |         "SENS ACTUATORS B": "Sens.Actuators B",
 60 |         "SENSORS AND ACTUATORS B: CHEMICAL": "Sens.Actuators B",
 61 |         "PHYS SCRIPTA": "Phys.Scripta",
 62 |         "PHYSICA SCRIPTA": "Phys.Scripta",
 63 |         "BULL CALCUTTA MATH SOC": "Bull.Calcutta Math.Soc.",
 64 |         "BULLETIN OF THE CALCUTTA MATHEMATICAL SOCIETY": "Bull.Calcutta Math.Soc.",
 65 |         "QUANTUM MACHINE INTELLIGENCE": "Quantum Machine Intelligence",
 66 |     }
 67 |     publication_infos = [{"pubinfo_freetext": "Phys. Rev. 127 (1962) 965-970"}]
 68 | 
 69 |     payload = {
 70 |         "journal_kb_data": journal_kb_data,
 71 |         "publication_infos": publication_infos,
 72 |     }
 73 | 
 74 |     headers = {
 75 |         "content-type": "application/json",
 76 |     }
 77 |     response = app_client.post(
 78 |         "/extract_journal_info",
 79 |         headers=headers,
 80 |         data=json.dumps(payload),
 81 |     )
 82 |     assert response.status_code == 500
 83 |     assert response.json == {
 84 |         "message": "Can not extract publication info data. Reason: 'test message'"
 85 |     }
 86 | 
 87 | 
 88 | def test_extract_journal_info_for_multiple_pubinfos(app_client):
 89 |     journal_kb_data = {
 90 |         "COMMUNICATIONS IN ASTEROSEISMOLOGY": "Commun.Asteros.",
 91 |         "PHYS REV": "Phys.Rev.",
 92 |         "PHYSICAL REVIEW": "Phys.Rev.",
 93 |         "PHYS REV LETT": "Phys.Rev.Lett.",
 94 |         "JINST": "JINST",
 95 |         "JOURNAL OF INSTRUMENTATION": "JINST",
 96 |         "SENS ACTUATORS B": "Sens.Actuators B",
 97 |         "SENSORS AND ACTUATORS B: CHEMICAL": "Sens.Actuators B",
 98 |         "PHYS SCRIPTA": "Phys.Scripta",
 99 |         "PHYSICA SCRIPTA": "Phys.Scripta",
100 |         "BULL CALCUTTA MATH SOC": "Bull.Calcutta Math.Soc.",
101 |         "BULLETIN OF THE CALCUTTA MATHEMATICAL SOCIETY": "Bull.Calcutta Math.Soc.",
102 |         "QUANTUM MACHINE INTELLIGENCE": "Quantum Machine Intelligence",
103 |     }
104 |     publication_infos = [
105 |         {"pubinfo_freetext": "Phys. Rev. 127 (1962) 965-970"},
106 |         {"pubinfo_freetext": "Phys.Rev.Lett. 127 (1962) 965-970"},
107 |     ]
108 | 
109 |     payload = {
110 |         "journal_kb_data": journal_kb_data,
111 |         "publication_infos": publication_infos,
112 |     }
113 | 
114 |     headers = {
115 |         "content-type": "application/json",
116 |     }
117 |     response = app_client.post(
118 |         "/extract_journal_info",
119 |         headers=headers,
120 |         data=json.dumps(payload),
121 |     )
122 |     assert response.status_code == 200
123 |     assert "extracted_publication_infos" in response.json
124 |     assert len(response.json["extracted_publication_infos"]) == 2
125 | 
126 | 
127 | def test_extract_extract_references_from_text(app_client):
128 |     journal_kb_data = {
129 |         "COMMUNICATIONS IN ASTEROSEISMOLOGY": "Commun.Asteros.",
130 |         "PHYS REV": "Phys.Rev.",
131 |         "PHYSICAL REVIEW": "Phys.Rev.",
132 |     }
133 |     headers = {
134 |         "content-type": "application/json",
135 |     }
136 |     text = "Iskra Ł W et al 2017 Acta Phys. Pol. B 48 581"
137 |     payload = {"journal_kb_data": journal_kb_data, "text": text}
138 |     response = app_client.post(
139 |         "/extract_references_from_text",
140 |         headers=headers,
141 |         data=json.dumps(payload),
142 |     )
143 |     assert response.status_code == 200
144 |     assert "extracted_references" in response.json
145 |     assert len(response.json["extracted_references"]) == 1
146 |     assert "author" in response.json["extracted_references"][0]
147 |     assert "misc" in response.json["extracted_references"][0]
148 |     assert "year" in response.json["extracted_references"][0]
149 | 
150 | 
151 | @mock.patch(
152 |     "refextract.app.extract_references_from_string",
153 |     side_effect=KeyError("test message"),
154 | )
155 | def test_extract_references_from_text_when_timeout_from_refextract(
156 |     mock_extract_refs, app_client
157 | ):
158 |     journal_kb_data = {
159 |         "COMMUNICATIONS IN ASTEROSEISMOLOGY": "Commun.Asteros.",
160 |         "PHYS REV": "Phys.Rev.",
161 |         "PHYSICAL REVIEW": "Phys.Rev.",
162 |     }
163 |     headers = {
164 |         "content-type": "application/json",
165 |     }
166 |     text = "Iskra Ł W et al 2017 Acta Phys. Pol. B 48 581"
167 |     payload = {"journal_kb_data": journal_kb_data, "text": text}
168 |     response = app_client.post(
169 |         "/extract_references_from_text", headers=headers, data=json.dumps(payload)
170 |     )
171 |     assert response.status_code == 500
172 |     assert response.json == {
173 |         "message": "Can not extract references. Reason: 'test message'"
174 |     }
175 | 
176 | 
177 | def test_extract_extract_references_from_list(app_client):
178 |     journal_kb_data = {
179 |         "COMMUNICATIONS IN ASTEROSEISMOLOGY": "Commun.Asteros.",
180 |         "PHYS REV": "Phys.Rev.",
181 |         "PHYSICAL REVIEW": "Phys.Rev.",
182 |     }
183 |     headers = {
184 |         "content-type": "application/json",
185 |     }
186 |     raw_references = [
187 |         "Iskra Ł W et al 2017 Acta Phys. Pol. B 48 581",
188 |         "Iskra Ł W et al 2017 Acta Phys. Pol. B 48 582",
189 |         "Iskra Ł W et al 2017 Acta Phys. Pol. B 48 583",
190 |     ]
191 |     payload = {"journal_kb_data": journal_kb_data, "raw_references": raw_references}
192 |     response = app_client.post(
193 |         "/extract_references_from_list",
194 |         headers=headers,
195 |         data=json.dumps(payload),
196 |     )
197 |     assert response.status_code == 200
198 |     assert "extracted_references" in response.json
199 |     assert len(response.json["extracted_references"]) == 3
200 |     for reference in response.json["extracted_references"]:
201 |         assert "author" in reference
202 |         assert "misc" in reference
203 |         assert "year" in reference
204 | 
205 | 
206 | @mock.patch(
207 |     "refextract.app.extract_references_from_string",
208 |     side_effect=KeyError("test message"),
209 | )
210 | def test_extract_extract_references_from_list_when_error_from_refextract(
211 |     mock_extract_refs, app_client
212 | ):
213 |     journal_kb_data = {
214 |         "COMMUNICATIONS IN ASTEROSEISMOLOGY": "Commun.Asteros.",
215 |         "PHYS REV": "Phys.Rev.",
216 |         "PHYSICAL REVIEW": "Phys.Rev.",
217 |     }
218 |     headers = {
219 |         "content-type": "application/json",
220 |     }
221 |     raw_references = [
222 |         "Iskra Ł W et al 2017 Acta Phys. Pol. B 48 581",
223 |         "Iskra Ł W et al 2017 Acta Phys. Pol. B 48 582",
224 |         "Iskra Ł W et al 2017 Acta Phys. Pol. B 48 583",
225 |     ]
226 |     payload = {"journal_kb_data": journal_kb_data, "raw_references": raw_references}
227 |     response = app_client.post(
228 |         "/extract_references_from_list",
229 |         headers=headers,
230 |         data=json.dumps(payload),
231 |     )
232 | 
233 |     expected_response = [
234 |         {"raw_ref": ["Iskra Ł W et al 2017 Acta Phys. Pol. B 48 581"]},
235 |         {"raw_ref": ["Iskra Ł W et al 2017 Acta Phys. Pol. B 48 582"]},
236 |         {"raw_ref": ["Iskra Ł W et al 2017 Acta Phys. Pol. B 48 583"]},
237 |     ]
238 |     assert response.status_code == 200
239 |     assert response.json["extracted_references"] == expected_response
240 | 
241 | 
242 | @pytest.mark.vcr
243 | def test_extract_extract_references_from_url(app_client):
244 |     journal_kb_data = {
245 |         "COMMUNICATIONS IN ASTEROSEISMOLOGY": "Commun.Asteros.",
246 |         "PHYS REV": "Phys.Rev.",
247 |         "PHYSICAL REVIEW": "Phys.Rev.",
248 |         "PHYS REV LETT": "Phys.Rev.Lett.",
249 |         "JINST": "JINST",
250 |         "JOURNAL OF INSTRUMENTATION": "JINST",
251 |         "SENS ACTUATORS B": "Sens.Actuators B",
252 |         "SENSORS AND ACTUATORS B: CHEMICAL": "Sens.Actuators B",
253 |         "PHYS SCRIPTA": "Phys.Scripta",
254 |         "PHYSICA SCRIPTA": "Phys.Scripta",
255 |         "BULL CALCUTTA MATH SOC": "Bull.Calcutta Math.Soc.",
256 |         "BULLETIN OF THE CALCUTTA MATHEMATICAL SOCIETY": "Bull.Calcutta Math.Soc.",
257 |         "QUANTUM MACHINE INTELLIGENCE": "Quantum Machine Intelligence",
258 |     }
259 |     headers = {
260 |         "content-type": "application/json",
261 |     }
262 |     url = "https://inspirehep.net/files/33ea6e86a7bfb4cab4734ed5c14d4529"
263 |     payload = {"url": url, "journal_kb_data": journal_kb_data}
264 |     response = app_client.post(
265 |         "/extract_references_from_url",
266 |         headers=headers,
267 |         data=json.dumps(payload),
268 |     )
269 |     assert response.status_code == 200
270 |     assert "extracted_references" in response.json
271 |     assert len(response.json["extracted_references"]) == 2
272 | 


--------------------------------------------------------------------------------
/tests/test_api.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of refextract
  4 | # Copyright (C) 2016, 2018, 2020 CERN.
  5 | #
  6 | # refextract is free software; you can redistribute it and/or
  7 | # modify it under the terms of the GNU General Public License as
  8 | # published by the Free Software Foundation; either version 2 of the
  9 | # License, or (at your option) any later version.
 10 | #
 11 | # refextract is distributed in the hope that it will be useful, but
 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14 | # General Public License for more details.
 15 | #
 16 | # You should have received a copy of the GNU General Public License
 17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 19 | #
 20 | # In applying this license, CERN does not waive the privileges and immunities
 21 | # granted to it by virtue of its status as an Intergovernmental Organization
 22 | # or submit itself to any jurisdiction.
 23 | 
 24 | import pytest
 25 | import responses
 26 | 
 27 | from refextract.references.api import (
 28 |     extract_journal_reference,
 29 |     extract_references_from_file,
 30 |     extract_references_from_string,
 31 |     extract_references_from_url,
 32 | )
 33 | from refextract.references.errors import FullTextNotAvailableError
 34 | 
 35 | 
 36 | @pytest.fixture
 37 | def kbs_override():
 38 |     return {
 39 |         "books": [("Griffiths, David", "Introduction to elementary particles", "2008")],
 40 |         "journals": [
 41 |             (
 42 |                 "PHYSICAL REVIEW SPECIAL TOPICS ACCELERATORS AND BEAMS",
 43 |                 "Phys.Rev.ST Accel.Beams",
 44 |             ),
 45 |             ("PHYS REV D", "Phys.Rev.;D"),
 46 |             ("PHYS REV", "Phys.Rev."),
 47 |             ("PHYS REV LETT", "Phys.Rev.Lett."),
 48 |             ("PHYS LETT", "Phys.Lett."),
 49 |             ("J PHYS", "J.Phys."),
 50 |             ("JOURNAL OF PHYSICS", "J.Phys."),
 51 |             ("J PHYS G", "J.Phys.;G"),
 52 |             ("PHYSICAL REVIEW", "Phys.Rev."),
 53 |             ("ADV THEO MATH PHYS", "Adv.Theor.Math.Phys."),
 54 |             ("MATH PHYS", "Math.Phys."),
 55 |             ("J MATH PHYS", "J.Math.Phys."),
 56 |             ("JHEP", "JHEP"),
 57 |             (
 58 |                 "SITZUNGSBER PREUSS AKAD WISS PHYS MATH KL",
 59 |                 "Sitzungsber.Preuss.Akad.Wiss.Berlin (Math.Phys.)",
 60 |             ),
 61 |             ("PHYS LETT", "Phys.Lett."),
 62 |             ("NUCL PHYS", "Nucl.Phys."),
 63 |             ("NUCL PHYS", "Nucl.Phys."),
 64 |             ("NUCL PHYS PROC SUPPL", "Nucl.Phys.Proc.Suppl."),
 65 |             ("JINST", "JINST"),
 66 |             ("THE EUROPEAN PHYSICAL JOURNAL C PARTICLES AND FIELDS", "Eur.Phys.J.;C"),
 67 |             ("COMMUN MATH PHYS", "Commun.Math.Phys."),
 68 |             ("COMM MATH PHYS", "Commun.Math.Phys."),
 69 |             ("REV MOD PHYS", "Rev.Mod.Phys."),
 70 |             ("ANN PHYS U S", "Ann.Phys."),
 71 |             ("AM J PHYS", "Am.J.Phys."),
 72 |             ("PROC R SOC LONDON SER", "Proc.Roy.Soc.Lond."),
 73 |             ("CLASS QUANT GRAVITY", "Class.Quant.Grav."),
 74 |             ("FOUND PHYS", "Found.Phys."),
 75 |             ("IEEE TRANS NUCL SCI", "IEEE Trans.Nucl.Sci."),
 76 |             ("SCIENCE", "Science"),
 77 |             ("ACTA MATERIALIA", "Acta Mater."),
 78 |             ("REVIEWS OF MODERN PHYSICS", "Rev.Mod.Phys."),
 79 |             ("NUCL INSTRUM METHODS", "Nucl.Instrum.Meth."),
 80 |             ("Z PHYS", "Z.Phys."),
 81 |         ],
 82 |         "journals_re": [
 83 |             "DAN---Dokl.Akad.Nauk Ser.Fiz.",
 84 |         ],
 85 |         "report-numbers": [
 86 |             "#####CERN#####",
 87 |             "< yy 999>",
 88 |             "< yyyy 999>",
 89 |             "ATL CONF---ATL-CONF",
 90 |             "ATL PHYS INT---ATL-PHYS-INT",
 91 |             "ATLAS CONF---ATL-CONF",
 92 |             "#####LANL#####",
 93 |             "<s/syymm999>",
 94 |             "<syymm999>",
 95 |             "ASTRO PH---astro-ph",
 96 |             "HEP PH---hep-ph",
 97 |             "HEP TH---hep-th",
 98 |             "HEP EX---hep-ex",
 99 |             "#####LHC#####",
100 |             "< yy 999>",
101 |             "<syyyy 999>",
102 |             "< 999>",
103 |             "< 9999>",
104 |             "CERN LHC PROJECT REPORT---CERN-LHC-Project-Report",
105 |             "CLIC NOTE              ---CERN-CLIC-Note",
106 |             "CERN LHCC              ---CERN-LHCC",
107 |             "CERN EP                ---CERN-EP",
108 |             "######ATLANTIS#######",
109 |             "< 9999999>",
110 |             "CERN EX---CERN-EX",
111 |         ],
112 |     }
113 | 
114 | 
115 | def test_journal_extract():
116 |     r = extract_journal_reference("Science Vol. 338 no. 6108 (2012) pp. 773-775")
117 |     assert r["year"] == "2012"
118 |     assert r["volume"] == "338"
119 |     assert r["page"] == "773-775"
120 |     assert r["title"] == "Science"
121 | 
122 | 
123 | def test_extract_references_from_string(kbs_override):
124 |     ref_lines = """[9] R. Bousso, JHEP 9906:028 (1999); hep-th/9906022."""
125 |     r = extract_references_from_string(ref_lines, override_kbs_files=kbs_override)
126 |     assert len(r) == 2
127 | 
128 | 
129 | def test_extract_references_from_file(pdf_files):
130 |     pdf = pdf_files["1503.07589v1.pdf"]
131 |     r = extract_references_from_file(pdf)
132 |     assert "texkey" in r[0]
133 |     assert "author" in r[0]
134 |     assert "url" in r[0]
135 |     assert len(r) == 36
136 |     with pytest.raises(FullTextNotAvailableError):
137 |         extract_references_from_file(pdf + "error")
138 | 
139 | 
140 | def test_extract_references_from_file_dois_as_pdfs_annotations(pdf_files):
141 |     """Test DOIs as PDFs annotations and texkeys as named destinations"""
142 |     pdf_file_with_dois_as_pdfs_annotations = pdf_files["2503.05372.pdf"]
143 |     extracted_references = extract_references_from_file(
144 |         pdf_file_with_dois_as_pdfs_annotations
145 |     )
146 |     first_reference = extracted_references[0]
147 |     assert len(first_reference["url"]) == 2
148 |     assert "https://doi.org/10.1103/PhysRevD.68.037502" in first_reference["url"]
149 |     assert "texkey" in first_reference
150 |     assert "Cahn:2003cw" in first_reference["texkey"]
151 |     assert len(extracted_references) == 39
152 | 
153 | 
154 | def test_extract_references_from_file_does_not_ignore_letters_in_volume(pdf_files):
155 |     """Test that letters in volume are not ignored."""
156 |     pdf = pdf_files["2503.05621.pdf"]
157 |     extracted_references = extract_references_from_file(pdf)
158 |     fith_reference = extracted_references[4]
159 |     assert "journal_volume" in fith_reference
160 |     assert fith_reference["journal_reference"][0] == "Phys. Rev. D95 (2017) 114510"
161 |     assert fith_reference["journal_volume"][0] == "D95"
162 |     assert len(extracted_references) == 24
163 | 
164 | 
165 | def test_extract_references_with_authors_after_references(pdf_files):
166 |     """Test that references extracted even with authors after references."""
167 |     pdf = pdf_files["2502.21088.pdf"]
168 |     extracted_references = extract_references_from_file(pdf)
169 |     first_reference = extracted_references[0]
170 |     last_reference = extracted_references[-1]
171 |     # assert first reference is correctly extracted
172 |     assert first_reference["journal_reference"][0] == "Phys. Rev. Lett. 25 (1970) 316"
173 |     assert first_reference["author"][0] == "S. D. Drell and T.-M. Yan"
174 |     # assert last reference correctly extracts collaboration
175 |     assert last_reference["collaboration"][0] == "ATLAS Collaboration"
176 |     assert len(extracted_references) == 104
177 | 
178 | 
179 | @pytest.mark.xfail(
180 |     reason="It should not put an Author in author field as it is a collaboration. "
181 |     "This happens because there are authors after the references."
182 | )
183 | def test_collaboration_without_author_when_authors_after_references(pdf_files):
184 |     """Test that references extracted even with authors after references."""
185 |     pdf = pdf_files["2502.21088.pdf"]
186 |     extracted_references = extract_references_from_file(pdf)
187 |     last_reference = extracted_references[-1]
188 |     # assert last reference is correctly extracted
189 |     assert last_reference["collaboration"][0] == "ATLAS Collaboration"
190 |     assert "author" not in last_reference
191 | 
192 | 
193 | @pytest.mark.xfail(reason="It should extract the journal reference and urls correctly.")
194 | def test_extract_references_two_column_layout(pdf_files):
195 |     """Test that references extracted even with authors after references."""
196 |     pdf = pdf_files["2502.18907.pdf"]
197 |     extracted_references = extract_references_from_file(pdf)
198 |     first_reference = extracted_references[0]
199 |     assert (
200 |         first_reference["author"][0]
201 |         == "Adamopoulos G., Robertson J., Morrison N. A., Godet C."
202 |     )
203 |     assert first_reference["journal_reference"][0] == " J. Appl. Phys. 96 (2004) 6348"
204 |     assert "url" in first_reference
205 | 
206 | 
207 | def test_extract_references_with_multiple_refs_under_same_marker(pdf_files):
208 |     """Test that references extracted even with authors after references."""
209 |     pdf = pdf_files["2406.06875.pdf"]
210 |     extracted_references = extract_references_from_file(pdf)
211 |     first_reference = extracted_references[0]
212 |     second_reference = extracted_references[1]
213 |     third_reference = extracted_references[2]
214 |     assert first_reference["author"][0] == "W.T. Tutte"
215 |     assert second_reference["author"][0] == "W.T. Tutte"
216 |     assert third_reference["author"][0] == "W.T. Tutte"
217 |     assert first_reference["journal_reference"][0] == "Can. J. Math. 14 (1962) 21"
218 |     assert second_reference["journal_reference"][0] == "Can. J. Math. 15 (1963) 249"
219 |     assert (
220 |         third_reference["journal_reference"][0] == "Bull. Am. Math. Soc. 74 (1968) 64"
221 |     )
222 |     assert first_reference["linemarker"][0] == "1"
223 |     assert second_reference["linemarker"][0] == "1"
224 |     assert third_reference["linemarker"][0] == "1"
225 | 
226 | 
227 | @responses.activate
228 | def test_extract_references_from_url(pdf_files):
229 |     with open(pdf_files["1503.07589v1.pdf"], "rb") as fd:
230 |         url = "http://arxiv.org/pdf/1503.07589v1.pdf"
231 |         responses.add(
232 |             responses.GET, url, body=fd.read(), content_type="application/pdf"
233 |         )
234 | 
235 |     r = extract_references_from_url(url)
236 |     assert len(r) == 36
237 |     assert "url" in r[0]
238 | 
239 |     url = "http://www.example.com"
240 |     responses.add(
241 |         responses.GET,
242 |         url,
243 |         body="File not found!",
244 |         status=404,
245 |         content_type="text/plain",
246 |     )
247 |     with pytest.raises(FullTextNotAvailableError):
248 |         extract_references_from_url(url)
249 | 
250 | 
251 | def test_long_registrant_dois(pdf_files):
252 |     """DOIs with 5 digit registrant code"""
253 |     r = extract_references_from_file(pdf_files["wepml008.pdf"])
254 |     assert len(r) == 6
255 |     for ref in r[1:]:
256 |         assert "doi" in ref
257 |         assert ref.get("doi")[0].startswith("doi:10.18429/JACoW")
258 | 
259 | 
260 | def test_override_kbs_files_can_take_journals_dict():
261 |     journals = {"Journal of Testing": "J.Testing"}
262 |     reference = "J. Smith, Journal of Testing 42 (2020) 1234"
263 | 
264 |     result = extract_references_from_string(
265 |         reference, override_kbs_files={"journals": journals}
266 |     )
267 |     assert result[0]["journal_title"] == ["J.Testing"]
268 | 


--------------------------------------------------------------------------------
/tests/test_find.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of refextract
  4 | # Copyright (C) 2016, 2018, 2020 CERN.
  5 | #
  6 | # refextract is free software; you can redistribute it and/or
  7 | # modify it under the terms of the GNU General Public License as
  8 | # published by the Free Software Foundation; either version 2 of the
  9 | # License, or (at your option) any later version.
 10 | #
 11 | # refextract is distributed in the hope that it will be useful, but
 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14 | # General Public License for more details.
 15 | #
 16 | # You should have received a copy of the GNU General Public License
 17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 19 | #
 20 | # In applying this license, CERN does not waive the privileges and immunities
 21 | # granted to it by virtue of its status as an Intergovernmental Organization
 22 | # or submit itself to any jurisdiction.
 23 | 
 24 | from refextract.references.find import get_reference_section_beginning
 25 | 
 26 | 
 27 | def test_simple():
 28 |     sect = get_reference_section_beginning(["Hello", "References", "[1] Ref1"])
 29 |     assert sect == {
 30 |         "marker": "[1]",
 31 |         "marker_pattern": "\\s*(?P<mark>\\[\\s*(?P<marknum>\\d+)\\s*\\])",
 32 |         "start_line": 1,
 33 |         "title_string": "References",
 34 |         "title_marker_same_line": False,
 35 |         "how_found_start": 1,
 36 |     }
 37 | 
 38 | 
 39 | def test_no_section():
 40 |     sect = get_reference_section_beginning("")
 41 |     assert sect is None
 42 | 
 43 | 
 44 | def test_no_title_via_brackets():
 45 |     sect = get_reference_section_beginning(["Hello", "[1] Ref1[2] Ref2"])
 46 |     assert sect == {
 47 |         "marker": "[1]",
 48 |         "marker_pattern": "(?P<mark>(?P<left>\\[)\\s*(?P<marknum>\\d+)\\s*("
 49 |         "?P<right>\\]))",
 50 |         "start_line": 1,
 51 |         "title_string": None,
 52 |         "title_marker_same_line": False,
 53 |         "how_found_start": 2,
 54 |     }
 55 | 
 56 | 
 57 | def test_no_title_via_dots():
 58 |     sect = get_reference_section_beginning(["Hello", "1. Ref12. Ref2"])
 59 |     assert sect == {
 60 |         "marker": "1.",
 61 |         "marker_pattern": "(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>\\.))",
 62 |         "start_line": 1,
 63 |         "title_string": None,
 64 |         "title_marker_same_line": False,
 65 |         "how_found_start": 3,
 66 |     }
 67 | 
 68 | 
 69 | def test_no_title_via_numbers():
 70 |     sect = get_reference_section_beginning(["Hello", "1 Ref12 Ref2"])
 71 |     assert sect == {
 72 |         "marker": "1",
 73 |         "marker_pattern": "(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>))",
 74 |         "start_line": 1,
 75 |         "title_string": None,
 76 |         "title_marker_same_line": False,
 77 |         "how_found_start": 4,
 78 |     }
 79 | 
 80 | 
 81 | def test_no_title_via_numbers2():
 82 |     sect = get_reference_section_beginning(
 83 |         [
 84 |             "Hello",
 85 |             "1",
 86 |             "Ref1",
 87 |             "(3)",
 88 |             "2",
 89 |             "Ref2",
 90 |         ]
 91 |     )
 92 |     assert sect, {
 93 |         "marker": "1",
 94 |         "marker_pattern": "(?P<mark>(?P<left>)\\s*(?P<marknum>\\d+)\\s*(?P<right>))",
 95 |         "start_line": 1,
 96 |         "title_string": None,
 97 |         "title_marker_same_line": False,
 98 |         "how_found_start": 4,
 99 |     }
100 | 


--------------------------------------------------------------------------------
/tests/test_kbs.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of refextract
 4 | # Copyright (C) 2020 CERN.
 5 | #
 6 | # refextract is free software; you can redistribute it and/or
 7 | # modify it under the terms of the GNU General Public License as
 8 | # published by the Free Software Foundation; either version 2 of the
 9 | # License, or (at your option) any later version.
10 | #
11 | # refextract is distributed in the hope that it will be useful, but
12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 | # General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
19 | #
20 | # In applying this license, CERN does not waive the privileges and immunities
21 | # granted to it by virtue of its status as an Intergovernmental Organization
22 | # or submit itself to any jurisdiction.
23 | 
24 | import csv
25 | 
26 | from refextract.references.kbs import file_resolving, get_kbs
27 | 
28 | 
29 | def test_get_kbs_doesnt_override_default_if_value_is_none():
30 |     cache = get_kbs(custom_kbs={"journals": None})
31 |     assert len(cache["journals"]) == 3
32 |     assert "JHEP" in cache["journals"][-1]
33 | 
34 | 
35 | def test_get_kbs_caches_journal_dict():
36 |     journals = {"Journal of Testing": "J.Testing"}
37 | 
38 |     first_cache = get_kbs(custom_kbs={"journals": journals}).copy()
39 |     assert len(first_cache["journals"]) == 3
40 |     assert first_cache["journals"][-1] == ["JOURNAL OF TESTING", "J TESTING"]
41 | 
42 |     journals = journals.copy()
43 |     second_cache = get_kbs(custom_kbs={"journals": journals})
44 |     # the cache is reused, so identity of the cache elements doesn't change
45 |     assert all(
46 |         cached_first is cached_second
47 |         for (cached_first, cached_second) in zip(
48 |             first_cache["journals"], second_cache["journals"], strict=False
49 |         )
50 |     )
51 | 
52 | 
53 | def test_get_kbs_invalidates_cache_if_input_changes():
54 |     journals = {"Journal of Testing": "J.Testing"}
55 |     first_cache = get_kbs(custom_kbs={"journals": journals}).copy()
56 | 
57 |     journals = journals = {"Journal of Testing": "J.Test."}
58 |     second_cache = get_kbs(custom_kbs={"journals": journals})
59 |     # the cache is invalidated, so identity of the cache elements changes
60 |     assert all(
61 |         cached_first is not cached_second
62 |         for (cached_first, cached_second) in zip(
63 |             first_cache["journals"], second_cache["journals"], strict=False
64 |         )
65 |     )
66 |     assert len(second_cache["journals"]) == 3
67 |     assert second_cache["journals"][-1] == ["JOURNAL OF TESTING", "J TEST"]
68 | 
69 | 
70 | def test_file_resolving():
71 |     # Test that the file resolving works as expected
72 |     with file_resolving("tests/data/file_resolving.csv") as fh:
73 |         assert fh.read() == "1|2|3\n4|5|6\n"
74 | 
75 | 
76 | def test_file_resolving_reader():
77 |     # Test that the file resolving works as expected with a reader
78 |     with file_resolving("tests/data/file_resolving.csv", reader=csv.reader) as fh:
79 |         rows = list(fh)
80 |         assert rows == [["1", "2", "3"], ["4", "5", "6"]]
81 | 


--------------------------------------------------------------------------------
/tests/test_regexs.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of refextract
 4 | # Copyright (C) 2016, 2018, 2020 CERN.
 5 | #
 6 | # refextract is free software; you can redistribute it and/or
 7 | # modify it under the terms of the GNU General Public License as
 8 | # published by the Free Software Foundation; either version 2 of the
 9 | # License, or (at your option) any later version.
10 | #
11 | # refextract is distributed in the hope that it will be useful, but
12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 | # General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
19 | #
20 | # In applying this license, CERN does not waive the privileges and immunities
21 | # granted to it by virtue of its status as an Intergovernmental Organization
22 | # or submit itself to any jurisdiction.
23 | 
24 | import re
25 | 
26 | from refextract.references import regexs
27 | 
28 | 
29 | def test_word():
30 |     r = regexs._create_regex_pattern_with_optional_spaces("ABC")
31 |     assert r == r"A\s*B\s*C\s*"
32 | 
33 | 
34 | def test_reference_section_title_pattern():
35 |     r = regexs.get_reference_section_title_patterns()
36 |     assert len(r) > 2
37 | 
38 | 
39 | def test_get_reference_line_numeration_marker_patterns():
40 |     r = regexs.get_reference_line_numeration_marker_patterns()
41 |     assert len(r) > 2
42 | 
43 | 
44 | def test_get_reference_line_marker_pattern():
45 |     r = regexs.get_reference_line_marker_pattern("ABC")
46 |     assert r.pattern.find("ABC") != -1
47 | 
48 | 
49 | def test_get_post_reference_section_title_patterns():
50 |     r = regexs.get_post_reference_section_title_patterns()
51 |     assert len(r) > 2
52 | 
53 | 
54 | def test_get_post_reference_section_keyword_patterns():
55 |     r = regexs.get_post_reference_section_keyword_patterns()
56 |     assert len(r) > 2
57 | 
58 | 
59 | def test_regex_match_list():
60 |     s = "ABC"
61 |     m = regexs.regex_match_list(s, [re.compile("C.C"), re.compile("A.C")])
62 |     assert m
63 |     m = regexs.regex_match_list(s, [re.compile("C.C")])
64 |     assert m is None
65 | 


--------------------------------------------------------------------------------
/tests/test_tag.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # This file is part of refextract
  4 | # Copyright (C) 2016, 2018, 2020 CERN.
  5 | #
  6 | # refextract is free software; you can redistribute it and/or
  7 | # modify it under the terms of the GNU General Public License as
  8 | # published by the Free Software Foundation; either version 2 of the
  9 | # License, or (at your option) any later version.
 10 | #
 11 | # refextract is distributed in the hope that it will be useful, but
 12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 14 | # General Public License for more details.
 15 | #
 16 | # You should have received a copy of the GNU General Public License
 17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
 18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
 19 | #
 20 | # In applying this license, CERN does not waive the privileges and immunities
 21 | # granted to it by virtue of its status as an Intergovernmental Organization
 22 | # or submit itself to any jurisdiction.
 23 | 
 24 | from refextract.references.tag import (
 25 |     find_numeration,
 26 |     find_numeration_more,
 27 |     identify_ibids,
 28 |     tag_arxiv,
 29 | )
 30 | 
 31 | 
 32 | def test_vol_page_year():
 33 |     "<vol>, <page> (<year>)"
 34 |     ref_line = """24, 418 (1930)"""
 35 |     r = find_numeration(ref_line)
 36 |     assert r["volume"] == "24"
 37 |     assert r["year"] == "1930"
 38 |     assert r["page"] == "418"
 39 | 
 40 | 
 41 | def test_vol_year_page():
 42 |     "<vol>, (<year>) <page>"
 43 |     ref_line = """24, (1930) 418"""
 44 |     r = find_numeration(ref_line)
 45 |     assert r["volume"] == "24"
 46 |     assert r["year"] == "1930"
 47 |     assert r["page"] == "418"
 48 | 
 49 | 
 50 | def test_year_title_volume_page():
 51 |     "<year>, <title> <vol> <page>"
 52 |     ref_line = """1930 <cds.JOURNAL>J.Phys.</cds.JOURNAL> 24, 418"""
 53 |     r = find_numeration_more(ref_line)
 54 |     assert r["volume"] == "24"
 55 |     assert r["year"] == "1930"
 56 |     assert r["page"] == "418"
 57 | 
 58 | 
 59 | def test_identify_ibids_empty():
 60 |     r = identify_ibids("")
 61 |     assert r == ({}, "")
 62 | 
 63 | 
 64 | def test_identify_ibids_simple():
 65 |     ref_line = (
 66 |         "[46] E. Schrodinger, Sitzungsber. Preuss. Akad. Wiss. Phys. Math. Kl."
 67 |         " 24, 418(1930); ibid, 3, 1(1931)"
 68 |     )
 69 |     r = identify_ibids(ref_line.upper())
 70 |     assert r == (
 71 |         {85: "IBID"},
 72 |         "[46] E. SCHRODINGER, SITZUNGSBER. PREUSS. AKAD. "
 73 |         "WISS. PHYS. MATH. KL. 24, 418(1930); ____, 3, "
 74 |         "1(1931)",
 75 |     )
 76 | 
 77 | 
 78 | def test_4_digits():
 79 |     ref_line = """{any prefix}arXiv:1003.1111{any postfix}"""
 80 |     r = tag_arxiv(ref_line)
 81 |     assert r.strip(": ") == (
 82 |         "{any prefix}<cds.ARXIV>arXiv:1003.1111</cds.ARXIV>{any postfix}"
 83 |     )
 84 | 
 85 | 
 86 | def test_4_digits_suffix():
 87 |     ref_line = """{any prefix}arXiv:1104.2222 [physics.ins-det]{any postfix}"""
 88 |     r = tag_arxiv(ref_line)
 89 |     assert r.strip(": ") == (
 90 |         "{any prefix}<cds.ARXIV>arXiv:1104.2222 ["
 91 |         "physics.ins-det]</cds.ARXIV>{any postfix}"
 92 |     )
 93 | 
 94 | 
 95 | def test_5_digits():
 96 |     ref_line = """{any prefix}arXiv:1303.33333{any postfix}"""
 97 |     r = tag_arxiv(ref_line)
 98 |     assert r.strip(": ") == (
 99 |         "{any prefix}<cds.ARXIV>arXiv:1303.33333</cds.ARXIV>{any postfix}"
100 |     )
101 | 
102 | 
103 | def test_5_digits_2012():
104 |     ref_line = """{any prefix}arXiv:1203.33333{any postfix}"""
105 |     r = tag_arxiv(ref_line)
106 |     assert r.strip(": ") == "{any prefix}arXiv:1203.33333{any postfix}"
107 | 
108 | 
109 | def test_5_digits_suffix():
110 |     ref_line = """{any prefix}arXiv:1304.44444 [physics.ins-det]{any postfix}"""
111 |     r = tag_arxiv(ref_line)
112 |     assert r.strip(": ") == (
113 |         "{any prefix}<cds.ARXIV>arXiv:1304.44444 ["
114 |         "physics.ins-det]</cds.ARXIV>{any postfix}"
115 |     )
116 | 
117 | 
118 | def test_4_digits_version():
119 |     ref_line = """{any prefix}arXiv:1003.1111v9{any postfix}"""
120 |     r = tag_arxiv(ref_line)
121 |     assert r.strip(": ") == (
122 |         "{any prefix}<cds.ARXIV>arXiv:1003.1111</cds.ARXIV>{any postfix}"
123 |     )
124 | 
125 | 
126 | def test_4_digits_suffix_version():
127 |     ref_line = """{any prefix}arXiv:1104.2222v9 [physics.ins-det]{any postfix}"""
128 |     r = tag_arxiv(ref_line)
129 |     assert r.strip(": ") == (
130 |         "{any prefix}<cds.ARXIV>arXiv:1104.2222 ["
131 |         "physics.ins-det]</cds.ARXIV>{any postfix}"
132 |     )
133 | 
134 | 
135 | def test_5_digits_version():
136 |     ref_line = """{any prefix}arXiv:1303.33333v9{any postfix}"""
137 |     r = tag_arxiv(ref_line)
138 |     assert r.strip(": ") == (
139 |         "{any prefix}<cds.ARXIV>arXiv:1303.33333</cds.ARXIV>{any postfix}"
140 |     )
141 | 
142 | 
143 | def test_5_digits_suffix_version():
144 |     ref_line = """{any prefix}arXiv:1304.44444v9 [physics.ins-det]{any postfix}"""
145 |     r = tag_arxiv(ref_line)
146 |     assert r.strip(": ") == (
147 |         "{any prefix}<cds.ARXIV>arXiv:1304.44444 ["
148 |         "physics.ins-det]</cds.ARXIV>{any postfix}"
149 |     )
150 | 
151 | 
152 | def test_4_digits_new():
153 |     ref_line = """{any prefix}9910.1234{any postfix}"""
154 |     r = tag_arxiv(ref_line)
155 |     assert r.strip(": ") == (
156 |         "{any prefix}<cds.ARXIV>arXiv:9910.1234</cds.ARXIV>{any postfix}"
157 |     )
158 | 
159 | 
160 | def test_4_digits_suffix_new():
161 |     ref_line = """{any prefix}9910.1234 [physics.ins-det]{any postfix}"""
162 |     r = tag_arxiv(ref_line)
163 |     assert r.strip(": ") == (
164 |         "{any prefix}<cds.ARXIV>arXiv:9910.1234 ["
165 |         "physics.ins-det]</cds.ARXIV>{any postfix}"
166 |     )
167 | 
168 | 
169 | def test_5_digits_new():
170 |     ref_line = """{any prefix}1310.12345{any postfix}"""
171 |     r = tag_arxiv(ref_line)
172 |     assert r.strip(": ") == (
173 |         "{any prefix}<cds.ARXIV>arXiv:1310.12345</cds.ARXIV>{any postfix}"
174 |     )
175 | 
176 | 
177 | def test_5_digits_suffix_new():
178 |     ref_line = """{any prefix}1310.12345 [physics.ins-det]{any postfix}"""
179 |     r = tag_arxiv(ref_line)
180 |     assert r.strip(": ") == (
181 |         "{any prefix}<cds.ARXIV>arXiv:1310.12345 ["
182 |         "physics.ins-det]</cds.ARXIV>{any postfix}"
183 |     )
184 | 
185 | 
186 | def test_4_digits_version_new():
187 |     ref_line = """{any prefix}9910.1234v9{any postfix}"""
188 |     r = tag_arxiv(ref_line)
189 |     assert r.strip(": ") == (
190 |         "{any prefix}<cds.ARXIV>arXiv:9910.1234</cds.ARXIV>{any postfix}"
191 |     )
192 | 
193 | 
194 | def test_4_digits_suffix_version_new():
195 |     ref_line = """{any prefix}9910.1234v9 [physics.ins-det]{any postfix}"""
196 |     r = tag_arxiv(ref_line)
197 |     assert r.strip(": ") == (
198 |         "{any prefix}<cds.ARXIV>arXiv:9910.1234 ["
199 |         "physics.ins-det]</cds.ARXIV>{any postfix}"
200 |     )
201 | 
202 | 
203 | def test_5_digits_version_new():
204 |     ref_line = """{any prefix}1310.12345v9{any postfix}"""
205 |     r = tag_arxiv(ref_line)
206 |     assert r.strip(": ") == (
207 |         "{any prefix}<cds.ARXIV>arXiv:1310.12345</cds.ARXIV>{any postfix}"
208 |     )
209 | 
210 | 
211 | def test_5_digits_suffix_version_new():
212 |     ref_line = """{any prefix}1310.12345v9 [physics.ins-det]{any postfix}"""
213 |     r = tag_arxiv(ref_line)
214 |     assert r.strip(": ") == (
215 |         "{any prefix}<cds.ARXIV>arXiv:1310.12345 "
216 |         "[physics.ins-det]</cds.ARXIV>{any postfix}"
217 |     )
218 | 
219 | 
220 | def test_5_digits_suffix_version_new_2012():
221 |     ref_line = """{any prefix}1210.12345v9 [physics.ins-det]{any postfix}"""
222 |     r = tag_arxiv(ref_line)
223 |     assert r.strip(": ") == "{any prefix}1210.12345v9 [physics.ins-det]{any postfix}"
224 | 


--------------------------------------------------------------------------------
/tests/test_text.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | #
 3 | # This file is part of refextract
 4 | # Copyright (C) 2016, 2018, 2020 CERN.
 5 | #
 6 | # refextract is free software; you can redistribute it and/or
 7 | # modify it under the terms of the GNU General Public License as
 8 | # published by the Free Software Foundation; either version 2 of the
 9 | # License, or (at your option) any later version.
10 | #
11 | # refextract is distributed in the hope that it will be useful, but
12 | # WITHOUT ANY WARRANTY; without even the implied warranty of
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 | # General Public License for more details.
15 | #
16 | # You should have received a copy of the GNU General Public License
17 | # along with refextract; if not, write to the Free Software Foundation, Inc.,
18 | # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
19 | #
20 | # In applying this license, CERN does not waive the privileges and immunities
21 | # granted to it by virtue of its status as an Intergovernmental Organization
22 | # or submit itself to any jurisdiction.
23 | 
24 | from refextract import extract_references_from_file
25 | from refextract.references.text import (
26 |     rebuild_reference_lines,
27 | )
28 | 
29 | 
30 | def test_simple():
31 |     marker_pattern = r"^\s*(?P<mark>\[\s*(?P<marknum>\d+)\s*\])"
32 |     refs = [
33 |         "[1] hello",
34 |         "hello2",
35 |         "[2] foo",
36 |     ]
37 |     rebuilt_refs = rebuild_reference_lines(refs, marker_pattern)
38 |     assert rebuilt_refs == [
39 |         "[1] hello hello2",
40 |         "[2] foo",
41 |     ]
42 | 
43 | 
44 | def test_pagination_non_removal():
45 |     marker_pattern = r"^\s*(?P<mark>\[\s*(?P<marknum>\d+)\s*\])"
46 |     refs = [
47 |         "[1] hello",
48 |         "hello2",
49 |         "[2]",
50 |         "foo",
51 |     ]
52 |     rebuilt_refs = rebuild_reference_lines(refs, marker_pattern)
53 |     assert rebuilt_refs == [
54 |         "[1] hello hello2",
55 |         "[2] foo",
56 |     ]
57 | 
58 | 
59 | def test_2_lines_together():
60 |     marker_pattern = r"\s*(?P<mark>\[\s*(?P<marknum>\d+)\s*\])"
61 |     refs = [
62 |         "[1] hello",
63 |         "hello2 [2] foo",
64 |     ]
65 |     rebuilt_refs = rebuild_reference_lines(refs, marker_pattern)
66 |     assert rebuilt_refs == [
67 |         "[1] hello hello2",
68 |         "[2] foo",
69 |     ]
70 | 
71 | 
72 | def test_get_number_header_lines_does_not_crash_on_final_empty_page(pdf_files):
73 |     assert extract_references_from_file(pdf_files["1805.05865.pdf"])
74 | 


--------------------------------------------------------------------------------