├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── .python-version ├── .travis.yml ├── CODE_OF_CONDUCT.md ├── HISTORY.txt ├── LICENSE ├── PKG-INFO ├── Pipfile ├── Pipfile.lock ├── README.md ├── README.rst ├── allofplos ├── .gitignore ├── __init__.py ├── allofplos_basics.ipynb ├── article.py ├── citation_utilities.py ├── corpus │ ├── __init__.py │ ├── corpus.py │ └── plos_corpus.py ├── dbtoorm.py ├── dois.txt ├── elements │ ├── __init__.py │ ├── article_elements.py │ ├── journal.py │ └── license.py ├── jupyter_nbconvert_config.py ├── makedb.py ├── plos_regex.py ├── samples │ ├── __init__.py │ └── corpus_analysis.py ├── starter.db ├── starter_corpus │ ├── __init__.py │ ├── journal.pbio.0020188.xml │ ├── journal.pbio.0020334.xml │ ├── journal.pbio.0030408.xml │ ├── journal.pbio.0040088.xml │ ├── journal.pbio.1000359.xml │ ├── journal.pbio.1001044.xml │ ├── journal.pbio.1001199.xml │ ├── journal.pbio.1001289.xml │ ├── journal.pbio.1001315.xml │ ├── journal.pbio.1001473.xml │ ├── journal.pbio.1001636.xml │ ├── journal.pcbi.0030158.xml │ ├── journal.pcbi.1000112.xml │ ├── journal.pcbi.1000204.xml │ ├── journal.pcbi.1000589.xml │ ├── journal.pcbi.1001051.xml │ ├── journal.pcbi.1001083.xml │ ├── journal.pcbi.1002484.xml │ ├── journal.pcbi.1003292.xml │ ├── journal.pcbi.1004079.xml │ ├── journal.pcbi.1004082.xml │ ├── journal.pcbi.1004089.xml │ ├── journal.pcbi.1004113.xml │ ├── journal.pcbi.1004141.xml │ ├── journal.pcbi.1004152.xml │ ├── journal.pcbi.1004156.xml │ ├── journal.pcbi.1004453.xml │ ├── journal.pcbi.1004692.xml │ ├── journal.pgen.1000052.xml │ ├── journal.pgen.1002912.xml │ ├── journal.pgen.1003316.xml │ ├── journal.pmed.0020007.xml │ ├── journal.pmed.0020124.xml │ ├── journal.pmed.0020171.xml │ ├── journal.pmed.0020402.xml │ ├── journal.pmed.0030132.xml │ ├── journal.pmed.0030205.xml │ ├── journal.pmed.0030445.xml │ ├── journal.pmed.0030520.xml │ ├── journal.pmed.0040303.xml │ ├── journal.pmed.1000097.xml │ ├── journal.pmed.1000431.xml │ 
├── journal.pmed.1001080.xml │ ├── journal.pmed.1001186.xml │ ├── journal.pmed.1001300.xml │ ├── journal.pmed.1001418.xml │ ├── journal.pmed.1001473.xml │ ├── journal.pmed.1001518.xml │ ├── journal.pmed.1001786.xml │ ├── journal.pntd.0000149.xml │ ├── journal.pntd.0001041.xml │ ├── journal.pntd.0001969.xml │ ├── journal.pntd.0002570.xml │ ├── journal.pone.0002554.xml │ ├── journal.pone.0005723.xml │ ├── journal.pone.0008519.xml │ ├── journal.pone.0008915.xml │ ├── journal.pone.0010685.xml │ ├── journal.pone.0012262.xml │ ├── journal.pone.0016329.xml │ ├── journal.pone.0016976.xml │ ├── journal.pone.0026358.xml │ ├── journal.pone.0028031.xml │ ├── journal.pone.0036880.xml │ ├── journal.pone.0040259.xml │ ├── journal.pone.0042593.xml │ ├── journal.pone.0046041.xml │ ├── journal.pone.0047391.xml │ ├── journal.pone.0050698.xml │ ├── journal.pone.0052690.xml │ ├── journal.pone.0055490.xml │ ├── journal.pone.0058242.xml │ ├── journal.pone.0066742.xml │ ├── journal.pone.0067179.xml │ ├── journal.pone.0067227.xml │ ├── journal.pone.0067380.xml │ ├── journal.pone.0068090.xml │ ├── journal.pone.0069640.xml │ ├── journal.pone.0070598.xml │ ├── journal.pone.0074790.xml │ ├── journal.pone.0078761.xml │ ├── journal.pone.0078921.xml │ ├── journal.pone.0080518.xml │ ├── journal.pone.0081648.xml │ ├── journal.pone.0087236.xml │ ├── journal.pone.0097541.xml │ ├── journal.pone.0100977.xml │ ├── journal.pone.0108198.xml │ ├── journal.pone.0111971.xml │ ├── journal.pone.0114370.xml │ ├── journal.pone.0115067.xml │ ├── journal.pone.0116201.xml │ ├── journal.pone.0116586.xml │ ├── journal.pone.0116752.xml │ ├── journal.pone.0117014.xml │ ├── journal.pone.0117688.xml │ ├── journal.pone.0117949.xml │ ├── journal.pone.0118238.xml │ ├── journal.pone.0118342.xml │ ├── journal.pone.0119074.xml │ ├── journal.pone.0119705.xml │ ├── journal.pone.0120049.xml │ ├── journal.pone.0120924.xml │ ├── journal.pone.0121226.xml │ ├── journal.pone.0126470.xml │ ├── journal.pone.0138823.xml │ ├── 
journal.pone.0146913.xml │ ├── journal.pone.0147124.xml │ ├── journal.pone.0152025.xml │ ├── journal.pone.0152459.xml │ ├── journal.pone.0153152.xml │ ├── journal.pone.0153170.xml │ ├── journal.pone.0160653.xml │ ├── journal.ppat.0020025.xml │ ├── journal.ppat.0040045.xml │ ├── journal.ppat.1000105.xml │ ├── journal.ppat.1000166.xml │ ├── journal.ppat.1002247.xml │ ├── journal.ppat.1002735.xml │ ├── journal.ppat.1002769.xml │ ├── journal.ppat.1003133.xml │ └── journal.ppat.1005207.xml ├── starter_data.py ├── transformations.py ├── update.py └── utils.py ├── contributing.rst ├── docs └── source │ └── Basic_Corpus_Tutorial.ipynb ├── pyproject.toml ├── setup.cfg └── tests ├── __init__.py ├── test_corpus.py ├── test_unittests.py └── testdata ├── journal.pbio.2001413.xml ├── journal.pbio.2002354.xml ├── journal.pbio.2002399.xml ├── journal.pone.0185809.xml └── plos.correction.3155a3e9-5fbe-435c-a07a-e9a4846ec0b6.xml /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package to PyPI when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | release-build: 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - uses: actions/setup-python@v5 26 | with: 27 | python-version: "3.x" 28 | 29 | - name: Build release distributions 30 | run: | 31 | # NOTE: put your own distribution build steps here. 
32 | python -m pip install build 33 | python -m build 34 | 35 | - name: Upload distributions 36 | uses: actions/upload-artifact@v4 37 | with: 38 | name: release-dists 39 | path: dist/ 40 | 41 | pypi-publish: 42 | runs-on: ubuntu-latest 43 | needs: 44 | - release-build 45 | permissions: 46 | # IMPORTANT: this permission is mandatory for trusted publishing 47 | id-token: write 48 | 49 | # Dedicated environments with protections for publishing are strongly recommended. 50 | # For more information, see: https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#deployment-protection-rules 51 | environment: 52 | name: pypi 53 | # OPTIONAL: uncomment and update to include your PyPI project URL in the deployment status: 54 | url: https://pypi.org/p/allofplos 55 | # 56 | # ALTERNATIVE: if your GitHub Release name is the PyPI project version string 57 | # ALTERNATIVE: exactly, uncomment the following line instead: 58 | # url: https://pypi.org/project/YOURPROJECT/${{ github.event.release.name }} 59 | 60 | steps: 61 | - name: Retrieve release distributions 62 | uses: actions/download-artifact@v4 63 | with: 64 | name: release-dists 65 | path: dist/ 66 | 67 | - name: Publish release distributions to PyPI 68 | uses: pypa/gh-action-pypi-publish@release/v1 69 | with: 70 | packages-dir: dist/ 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | allofplos_xml/ 2 | *.zip 3 | *.nxml 4 | *.json 5 | *.swp 6 | *.js 7 | !requirements.txt 8 | *.xlsx 9 | *.csv 10 | !doi_to_pmc.csv 11 | *.iml 12 | zip_info.txt 13 | !tests/testdata/journal.pbio.2001413.xml 14 | !tests/testdata/plos.correction.3155a3e9-5fbe-435c-a07a-e9a4846ec0b6.xml 15 | !tests/testdata/journal.pone.0185809.xml 16 | 17 | ## MacOS specific files 18 | .DS_Store 19 | .AppleDouble 20 | .LSOverride 21 | 22 | ## SublimeText specific files 23 | 24 | # 
Cache files 25 | *.tmlanguage.cache 26 | *.tmPreferences.cache 27 | *.stTheme.cache 28 | 29 | # Workspace files 30 | *.sublime-workspace 31 | 32 | # Package control 33 | Package Control.last-run 34 | Package Control.ca-list 35 | Package Control.ca-bundle 36 | Package Control.system-ca-bundle 37 | Package Control.cache/ 38 | Package Control.ca-certs/ 39 | Package Control.merged-ca-bundle 40 | Package Control.user-ca-bundle 41 | oscrypto-ca-bundle.crt 42 | bh_unicode_properties.cache 43 | 44 | # Sublime-github token 45 | GitHub.sublime-settings 46 | 47 | ## Python specific files 48 | # Byte-compiled / optimized / DLL files 49 | __pycache__ 50 | *.py[cod] 51 | *$py.class 52 | 53 | # C extensions 54 | *.so 55 | 56 | # Distribution / packaging 57 | .Python 58 | build/ 59 | develop-eggs/ 60 | dist/ 61 | downloads/ 62 | eggs/ 63 | .eggs/ 64 | lib/ 65 | lib64 66 | parts/ 67 | sdist/ 68 | var/ 69 | wheels/ 70 | *.egg-info/ 71 | .installed.cfg 72 | *.egg 73 | MANIFEST 74 | 75 | # PyInstaller 76 | # Usually these files are written by a python script from a template 77 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
78 | *.manifest 79 | *.spec 80 | 81 | # Installer logs 82 | pip-log.txt 83 | pip-delete-this-directory.txt 84 | 85 | # Unit test / coverage reports 86 | htmlcov/ 87 | .tox/ 88 | .coverage 89 | .coverage.* 90 | .cache 91 | nosetests.xml 92 | coverage.xml 93 | *.cover 94 | .hypothesis/ 95 | .pytest_cache/ 96 | 97 | # Translations 98 | *.mo 99 | *.pot 100 | 101 | # Django stuff 102 | *.log 103 | local_settings.py 104 | db.sqlite3 105 | 106 | # Flask stuff 107 | instance/ 108 | .webassets-cache 109 | 110 | # Scrapy stuff 111 | .scrapy 112 | 113 | # Sphinx documentation 114 | docs/_build/ 115 | 116 | # PyBuilder 117 | target/ 118 | 119 | # Jupyter Notebook 120 | .ipynb_checkpoints 121 | 122 | # celery beat schedule file 123 | celerybeat-schedule 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Environments 129 | .env 130 | .venv 131 | env/ 132 | venv 133 | ENV/ 134 | env.bak/ 135 | venv.bak/ 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mypy 141 | .mypy_cache 142 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.8.11 -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.8" 4 | before_script: 5 | - pip install pipenv 6 | - pipenv install -d 7 | script: 8 | - pipenv run python -m pytest 9 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for 
everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. 
Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at mining[at]plos[dot]org. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /HISTORY.txt: -------------------------------------------------------------------------------- 1 | 1.1.1 2 | New Features 3 | – Adds PLoS Mental Health and PLoS Complex Systems journals 4 | 5 | 0.11.1 6 | Fixes 7 | fix bugs creating journal URLs and getting related article DOIs 8 | 9 | 0.11.0 10 | New Features 11 | – Article class properties: Taxonomy, Creative Commons license, volume, issue, elocation, doi_link 12 | – Many new Corpus class utilities with tutorial Jupyter Notebook, including `len(Corpus())`,`for article in Corpus()`,`Corpus.random_article` 13 | – Deprecates allofplos.plos_corpus in favor of 
allofplos.update 14 | 15 | Fixes 16 | – Removes all reference to PLOS's internal URLs 17 | – URLs now journal-specific; can construct each type of article link (to XML, to landing page, etc) 18 | – Tests no longer require internet connection, but do require pytest 19 | – Update readme and instructions for devs 20 | – Reorganize corpus code 21 | – Fixes Article class properties: abstract, figure-count, table-count, journal 22 | – Fixes bugs in transformations & download functions 23 | 24 | 0.10.2 Fix updating uncorrected proofs to versions of record 25 | 26 | 0.10.1 Fix unicode download and print statement 27 | 28 | 0.10.0 Corpus changes 29 | - new Corpus class 30 | - customize location of corpus directory 31 | - rename seed corpus to starter corpus 32 | - two new Article class properties: revision date for VORs (revdate) and rich article title that includes HTML markup (rich_title) 33 | - initialize TravisCI testing 34 | - update tqdm progress bars to disable if not on command line (i.e. for cron jobs) 35 | 36 | 0.9.6 This version includes automatic seed data download from pip install 37 | 38 | 0.9.5 tqdm instead of progressbar2. Seed data. Generate database out of ploscorpus. Several bugfixes. 39 | 40 | 0.9.0 New Article class and new test data. 41 | 42 | 0.8.4 Moved transformation methods to new file and changes in the tests. 43 | 44 | 0.8.3 Filename structure for annotation DOI does not depend on journal name. Improved tests. 45 | 46 | 0.8.2 Adding a method to generate a CSV file with all PLOS articles. 47 | 48 | 0.8.1 Adding entry point. Thanks Chris Haumesser. 49 | 50 | 0.8.0 First public release. 
51 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Public Library of Science (PLOS) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: allofplos 3 | Version: 1.1.1 4 | Summary: Get and analyze all PLOS articles 5 | Author-email: Elizabeth Seiver , Sebastian Bassi , M Pacer 6 | Maintainer-email: Kevin Brandt 7 | License: MIT License 8 | 9 | Copyright (c) 2017 Public Library of Science (PLOS) 10 | 11 | Permission is hereby granted, free of charge, to any person obtaining a copy 12 | of this software and associated documentation files (the "Software"), to deal 13 | in the Software without restriction, including without limitation the rights 14 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 15 | copies of the Software, and to permit persons to whom the Software is 16 | furnished to do so, subject to the following conditions: 17 | 18 | The above copyright notice and this permission notice shall be included in all 19 | copies or substantial portions of the Software. 20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 24 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 25 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 26 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 27 | SOFTWARE. 
28 | 29 | Keywords: science,PLOS,publishing 30 | Classifier: Development Status :: 5 - Production/Stable 31 | Classifier: Intended Audience :: Science/Research 32 | Classifier: Topic :: Scientific/Engineering 33 | Classifier: License :: OSI Approved :: MIT License 34 | Classifier: Programming Language :: Python :: 3.8 35 | License-File: LICENSE -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | allofplos = {editable = true, path = "./"} 8 | certifi = "*" 9 | chardet = "*" 10 | charset-normalizer = "*" 11 | idna = "*" 12 | lxml = "*" 13 | peewee = "*" 14 | python-utils = "*" 15 | requests = "*" 16 | six = "*" 17 | tqdm = "*" 18 | unidecode = "*" 19 | urllib3 = "*" 20 | pqdm = "*" 21 | 22 | [dev-packages] 23 | pytest = "*" 24 | twine = "*" 25 | build = "*" 26 | 27 | [requires] 28 | python_version = "3.8" 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://api.travis-ci.org/PLOS/allofplos.svg?branch=master)](https://travis-ci.org/PLOS/allofplos) 2 | 3 | # All of Plos (allofplos) 4 | 5 | Copyright (c) 2017-2022, Public Library of Science. MIT License, see 6 | LICENSE.txt for more information. 7 | 8 | ## Why allofplos? 9 | 10 | This is for downloading/updating/maintaining a repository of all PLOS 11 | XML article files. This can be used to have a copy of the PLOS text 12 | corpus for further analysis. Use this program to download all PLOS XML 13 | article files instead of doing web scraping. 14 | 15 | ## Installation instructions 16 | 17 | This program requires Python 3.8+. 
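Before installing, you can confirm your interpreter meets the stated 3.8+ requirement. A quick stdlib check (this snippet is illustrative, not part of the package):

```python
import sys

# allofplos requires Python 3.8 or newer (per the README and Pipfile)
if sys.version_info < (3, 8):
    raise RuntimeError(f"Python 3.8+ required, found {sys.version.split()[0]}")
print("Python version OK:", sys.version.split()[0])
```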
18 | 19 | Using pip: 20 | 21 | ``` 22 | pip install allofplos 23 | ``` 24 | 25 | This should install *allofplos* and requirements. At this stage you are 26 | ready to go. 27 | 28 | If you want to manually install from source (for example for development 29 | purposes), first clone the project repository: 30 | 31 | ``` 32 | git clone git@github.com:PLOS/allofplos.git 33 | ``` 34 | 35 | Install Python dependencies inside the newly created virtual 36 | environment: 37 | 38 | ``` 39 | pipenv install 40 | ``` 41 | 42 | ## How to run the program 43 | 44 | Execute the following command. 45 | 46 | ``` 47 | python -m allofplos.update 48 | ``` 49 | 50 | or, if running from source: 51 | 52 | ``` 53 | pipenv run python -m allofplos.update 54 | ``` 55 | 56 | The first time it runs it will download a zip file larger than 7 GB 57 | (**allofplos.zip**) with all the XML files inside. **Note**: Make sure 58 | that you have enough space on your device for the zip file and for its 59 | content before running this command (at least 30 GB). After this file 60 | is downloaded, it will extract its contents into the allofplos_xml 61 | directory inside your installation of *allofplos*. 62 | 63 | If you want to see the directory on your file system where this is 64 | installed, run 65 | 66 | ``` 67 | python -c "from allofplos import get_corpus_dir; print(get_corpus_dir())" 68 | ``` 69 | 70 | If you have downloaded the corpus before, it will make an incremental 71 | update to the existing corpus. 
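The lookup behind `get_corpus_dir()` also honors a `PLOS_CORPUS` environment variable (defined in `allofplos/__init__.py`), so you can point the corpus at any directory. A minimal sketch of that resolution logic; the install path here is hypothetical:

```python
import os

# Hypothetical install location of the allofplos package
ALLOFPLOS_DIR_PATH = "/path/to/allofplos"

def get_corpus_dir():
    """Return PLOS_CORPUS (if set, with ~ expanded), else the default
    allofplos_xml directory inside the package install."""
    return os.path.expanduser(os.environ.get("PLOS_CORPUS", "")) or os.path.join(
        ALLOFPLOS_DIR_PATH, "allofplos_xml")

print(get_corpus_dir())  # default allofplos_xml directory
os.environ["PLOS_CORPUS"] = "~/my_plos_corpus"
print(get_corpus_dir())  # now the override, with ~ expanded
```

This mirrors the resolution in `allofplos/__init__.py`: set `PLOS_CORPUS` before running `python -m allofplos.update` to keep the corpus outside the package directory.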
The script: 72 | 73 | - checks for and then downloads to a temporary folder individual new 74 | articles that have been published 75 | 76 | - of those new articles, checks whether they are corrections (and 77 | whether the linked corrected article has been updated) 78 | 79 | - checks whether there are VORs (Versions of Record) for uncorrected 80 | proofs in the main articles directory and downloads those 81 | 82 | - checks whether the newly downloaded articles are uncorrected 83 | proofs or not; after all of these checks, it moves the new articles 84 | into the main articles folder. 85 | 86 | Here’s what the print statements might look like on a typical run: 87 | 88 | ``` 89 | 147 new articles to download. 90 | 147 new articles downloaded. 91 | 3 amended articles found. 92 | 0 amended articles downloaded with new xml. 93 | Creating new text list of uncorrected proofs from scratch. 94 | No new VOR articles indexed in Solr. 95 | 17 VOR articles directly downloaded. 96 | 17 uncorrected proofs updated to version of record. 44 uncorrected proofs remaining in uncorrected proof list. 97 | 9 uncorrected proofs found. 53 total in list. 98 | Corpus started with 219792 articles. 99 | Moving new and updated files... 100 | 164 files moved. Corpus now has 219939 articles. 101 | ``` 102 | 103 | ## How to run the tests 104 | 105 | To run the tests, you will need to install *allofplos* with its testing 106 | dependencies. These testing dependencies include `pytest`, which we will 107 | use to run the tests. 108 | 109 | ``` 110 | pipenv run python -m pytest 111 | ``` 112 | 113 | ## Community guidelines 114 | 115 | If you wish to contribute to this project, please open a ticket in the 116 | GitHub repo at <https://github.com/PLOS/allofplos/issues>. For support 117 | requests write to <mining@plos.org> 118 | 119 | ## Citing This Library 120 | 121 | *allofplos* is published in the proceedings of SciPy 2018. DOI 122 | [10.25080/Majora-4af1f417-009](https://doi.org/10.25080/Majora-4af1f417-009) 123 | refers to all versions of allofplos. 
124 | 125 | If you want to cite allofplos using Bibtex: 126 | 127 | @InProceedings{ elizabeth_seiver-proc-scipy-2018, 128 | author = { Elizabeth Seiver and M Pacer and Sebastian Bassi }, 129 | title = { Text and data mining scientific articles with allofplos }, 130 | booktitle = { Proceedings of the 17th Python in Science Conference }, 131 | pages = { 61 - 64 }, 132 | year = { 2018 }, 133 | editor = { Fatih Akici and David Lippa and Dillon Niederhut and M Pacer }, 134 | doi = { 10.25080/Majora-4af1f417-009 } 135 | } 136 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | .. image:: https://api.travis-ci.org/PLOS/allofplos.svg?branch=master 2 | :target: https://travis-ci.org/PLOS/allofplos 3 | :alt: Build Status 4 | 5 | All of Plos (allofplos) 6 | ======================= 7 | 8 | Copyright (c) 2017-2022, Public Library of Science. MIT License, see 9 | LICENSE.txt for more information. 10 | 11 | Why allofplos? 12 | -------------- 13 | 14 | This is for downloading/updating/maintaining a repository of all PLOS 15 | XML article files. This can be used to have a copy of the PLOS text 16 | corpus for further analysis. Use this program to download all PLOS XML 17 | article files instead of doing web scraping. 18 | 19 | Installation instructions 20 | ------------------------- 21 | 22 | This program requires Python 3.8+. 23 | 24 | Using pip: 25 | 26 | ``$ pip install allofplos`` 27 | 28 | This should install *allofplos* and requirements. At this stage you are ready to go. 
29 | 30 | If you want to manually install from source (for example for development purposes), first clone the project repository: 31 | 32 | ``$ git clone git@github.com:PLOS/allofplos.git`` 33 | 34 | Install Python dependencies inside the newly created virtual environment: 35 | 36 | ``$ pipenv install`` 37 | 38 | How to run the program 39 | ---------------------- 40 | 41 | Execute the following command. 42 | 43 | ``$ python -m allofplos.update`` 44 | 45 | or, if running from source: 46 | 47 | ``$ pipenv run python -m allofplos.update`` 48 | 49 | The first time it runs it will download a >7 GB zip file 50 | (**allofplos.zip**) with all the XML files inside. 51 | **Note**: Make sure that you have enough space on your device for the 52 | zip file and for its content before running this command (at least 30 GB). 53 | After this file is downloaded, it will extract its contents into the 54 | allofplos\_xml directory inside your installation of *allofplos*. 55 | 56 | If you want to see the directory on your file system where this is installed, run 57 | 58 | ``python -c "from allofplos import get_corpus_dir; print(get_corpus_dir())"`` 59 | 60 | If you have downloaded the corpus before, it will make an incremental 61 | update to the existing corpus. The script: 62 | 63 | - checks for and then downloads to a temporary folder individual new 64 | articles that have been published 65 | - of those new articles, checks whether they are corrections (and 66 | whether the linked corrected article has been updated) 67 | - checks whether there are VORs (Versions of Record) for uncorrected 68 | proofs in the main articles directory and downloads those 69 | - checks whether the newly downloaded articles are uncorrected proofs 70 | or not; after all of these checks, it moves the new articles into 71 | the main articles folder. 72 | 73 | Here’s what the print statements might look like on a typical run: 74 | 75 | :: 76 | 77 | 147 new articles to download. 78 | 147 new articles downloaded. 
79 | 3 amended articles found. 80 | 0 amended articles downloaded with new xml. 81 | Creating new text list of uncorrected proofs from scratch. 82 | No new VOR articles indexed in Solr. 83 | 17 VOR articles directly downloaded. 84 | 17 uncorrected proofs updated to version of record. 44 uncorrected proofs remaining in uncorrected proof list. 85 | 9 uncorrected proofs found. 53 total in list. 86 | Corpus started with 219792 articles. 87 | Moving new and updated files... 88 | 164 files moved. Corpus now has 219939 articles. 89 | 90 | How to run the tests 91 | -------------------- 92 | 93 | To run the tests, you will need to install *allofplos* with its testing 94 | dependencies. These testing dependencies include ``pytest``, which we will use 95 | to run the tests. 96 | 97 | ``$ pipenv run python -m pytest`` 98 | 99 | It should return something like this: 100 | 101 | .. code:: 102 | 103 | collected 20 items 104 | 105 | allofplos/tests/test_corpus.py ............ [ 60%] 106 | allofplos/tests/test_unittests.py ........ [100%] 107 | 108 | ==================== 20 passed in 0.36 seconds ========================= 109 | 110 | 111 | Community guidelines 112 | -------------------- 113 | 114 | If you wish to contribute to this project, please open a ticket in the 115 | GitHub repo at https://github.com/PLOS/allofplos/issues. For support 116 | requests write to mining@plos.org 117 | 118 | Citing This Library 119 | ------------------- 120 | 121 | *allofplos* is published in the proceedings of SciPy 2018. 122 | DOI `10.25080/Majora-4af1f417-009 <https://doi.org/10.25080/Majora-4af1f417-009>`_ refers to all versions of allofplos. 
123 | 124 | If you want to cite allofplos using Bibtex: 125 | 126 | :: 127 | 128 | @InProceedings{ elizabeth_seiver-proc-scipy-2018, 129 | author = { Elizabeth Seiver and M Pacer and Sebastian Bassi }, 130 | title = { Text and data mining scientific articles with allofplos }, 131 | booktitle = { Proceedings of the 17th Python in Science Conference }, 132 | pages = { 61 - 64 }, 133 | year = { 2018 }, 134 | editor = { Fatih Akici and David Lippa and Dillon Niederhut and M Pacer }, 135 | doi = { 10.25080/Majora-4af1f417-009 } 136 | } 137 | -------------------------------------------------------------------------------- /allofplos/.gitignore: -------------------------------------------------------------------------------- 1 | allofplos_xml/ 2 | *.zip 3 | *.log 4 | */.ipynb_checkpoints/* 5 | .ipynb_checkpoints/* 6 | *.nxml 7 | *.txt 8 | !dois.txt 9 | *.json 10 | *.swp 11 | *.pyc 12 | *.js 13 | *.xlsx 14 | *.csv 15 | *.iml 16 | *.gz 17 | -------------------------------------------------------------------------------- /allofplos/__init__.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import os 3 | 4 | # path to the root of allofplos (the package) 5 | ALLOFPLOS_DIR_PATH = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | 8 | # Starter pack of PLOS articles 9 | starterdir = os.path.join(ALLOFPLOS_DIR_PATH, 'starter_corpus') 10 | 11 | # Temporary folder for downloading and processing new articles 12 | newarticledir = tempfile.mkdtemp() 13 | 14 | # List of uncorrected proof articles to check for updates 15 | (_, uncorrected_proofs_text_list) = tempfile.mkstemp() 16 | 17 | def get_corpus_dir(): 18 | """If you want to set the corpus directory, assign the desired path to 19 | ``os.environ['PLOS_CORPUS']``. 
20 | """ 21 | import os 22 | 23 | return os.path.expanduser(os.environ.get("PLOS_CORPUS", "")) or os.path.join( 24 | ALLOFPLOS_DIR_PATH, "allofplos_xml" 25 | ) 26 | 27 | del os 28 | 29 | 30 | 31 | # NB: any packages that you want to expose at the top level, you will need to 32 | # import after creating global variables that they may rely upon 33 | # (e.g., corpusdir) 34 | 35 | from .article import Article 36 | from .corpus import Corpus 37 | -------------------------------------------------------------------------------- /allofplos/allofplos_basics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Examples of basic allofplos functions" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "code_folding": [], 19 | "collapsed": true, 20 | "slideshow": { 21 | "slide_type": "skip" 22 | } 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "import datetime\n", 27 | "from allofplos.plos_regex import (validate_doi, show_invalid_dois, find_valid_dois)\n", 28 | "from allofplos.samples.corpus_analysis import (get_random_list_of_dois, get_all_local_dois,\n", 29 | " get_all_plos_dois)\n", 30 | "from allofplos.corpus.plos_corpus import (get_uncorrected_proofs, get_all_solr_dois)\n", 31 | "from allofplos import Article" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "slideshow": { 38 | "slide_type": "slide" 39 | } 40 | }, 41 | "source": [ 42 | "## Get example DOIs: get_random_list_of_dois()" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "slideshow": { 50 | "slide_type": "slide" 51 | } 52 | }, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "Three ways to represent an article\n", 59 | "Article as DOI: 
10.1371/journal.pone.0088313\n", 60 | "Article as local file: allofplos_xml/journal.pone.0088313.xml\n", 61 | "Article as url: https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.0088313&type=manuscript\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "example_dois = get_random_list_of_dois(count=10)\n", 67 | "example_doi = example_dois[0]\n", 68 | "article = Article(example_doi)\n", 69 | "example_file = article.filepath\n", 70 | "example_url = article.url\n", 71 | "print(\"Three ways to represent an article\\nArticle as DOI: {}\\nArticle as local file: {}\\nArticle as url: {}\" \\\n", 72 | "       .format(example_doi, example_file, example_url))" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "collapsed": true, 80 | "slideshow": { 81 | "slide_type": "slide" 82 | } 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "example_corrections_dois = ['10.1371/journal.pone.0166537',\n", 87 | "                            '10.1371/journal.ppat.1005301',\n", 88 | "                            '10.1371/journal.pone.0100397']" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "code_folding": [], 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "example_retractions_dois = ['10.1371/journal.pone.0180272',\n", 101 | "                            '10.1371/journal.pone.0155388',\n", 102 | "                            '10.1371/journal.pone.0102411']" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": { 109 | "collapsed": true 110 | }, 111 | "outputs": [], 112 | "source": [ 113 | "example_vor_doi = '10.1371/journal.ppat.1006307'\n", 114 | "example_uncorrected_proofs = get_uncorrected_proofs()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "## Validate PLOS DOI format: validate_doi(string), show_invalid_dois(list)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": { 128 | "collapsed": true, 129 |
"scrolled": true 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "validate_doi('10.1371/journal.pbio.2000797')" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "validate_doi('10.1371/journal.pone.12345678') # too many trailing digits" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "doi_list = ['10.1371/journal.pbio.2000797', '10.1371/journal.pone.12345678', '10.1371/journal.pmed.1234567']\n", 156 | "show_invalid_dois(doi_list)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "## Check if a DOI resolves correctly: article.check_if_doi_resolves()" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "outputs": [], 173 | "source": [ 174 | "article = Article('10.1371/journal.pbio.2000797') # working DOI\n", 175 | "article.check_if_doi_resolves()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": { 182 | "collapsed": true 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "article = Article('10.1371/annotation/b8b66a84-4919-4a3e-ba3e-bb11f3853755') # working DOI\n", 187 | "article.check_if_doi_resolves()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "collapsed": true 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "article = Article('10.1371/journal.pone.1111111') # valid DOI structure, but article doesn't exist\n", 199 | "article.check_if_doi_resolves()" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "## Check if uncorrected proof: article.proof" 207 | ] 208 | }, 209 | { 210 | 
"cell_type": "code", 211 | "execution_count": null, 212 | "metadata": { 213 | "collapsed": true 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "article = Article(next(iter(example_uncorrected_proofs)))\n", 218 | "article.proof" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": { 225 | "collapsed": true 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "article = Article(example_vor_doi)\n", 230 | "article.proof" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "## Find PLOS DOIs in a string: find_valid_dois(string)" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "collapsed": true 245 | }, 246 | "outputs": [], 247 | "source": [ 248 | "find_valid_dois(\"ever seen 10.1371/journal.pbio.2000797, it's great! or maybe 10.1371/journal.pone.1234567?\")" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "## Get article pubdate: article.pubdate" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": { 262 | "collapsed": true 263 | }, 264 | "outputs": [], 265 | "source": [ 266 | "# returns a datetime object\n", 267 | "article = Article(example_doi)\n", 268 | "article.pubdate" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": { 275 | "collapsed": true, 276 | "scrolled": true 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "# datetime object can be transformed into any string format\n", 281 | "article = Article(example_doi)\n", 282 | "dates = article.get_dates(string_=True, string_format='%Y-%b-%d')\n", 283 | "print(dates['epub'])" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "## Check (JATS) article type of article file: article.type_" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | 
"execution_count": null, 296 | "metadata": {}, 297 | "outputs": [ 298 | { 299 | "data": { 300 | "text/plain": [ 301 | "[{'affiliations': ['Department of Psychology, Elmhurst College, Elmhurst, Illinois, United States of America'],\n", 302 | " 'author_roles': {'author_notes': ['Conceived and designed the experiments',\n", 303 | " 'Performed the experiments',\n", 304 | " 'Analyzed the data',\n", 305 | " 'Contributed reagents/materials/analysis tools',\n", 306 | " 'Wrote the paper']},\n", 307 | " 'author_type': 'corresponding',\n", 308 | " 'contrib_initials': 'KSM',\n", 309 | " 'contrib_type': 'author',\n", 310 | " 'editor_type': None,\n", 311 | " 'email': ['katherine.moore@elmhurst.edu'],\n", 312 | " 'footnotes': [],\n", 313 | " 'given_names': 'Katherine Sledge',\n", 314 | " 'group_name': None,\n", 315 | " 'ids': [],\n", 316 | " 'rid_dict': {'aff': ['aff1'], 'corresp': ['cor1']},\n", 317 | " 'surname': 'Moore'},\n", 318 | " {'affiliations': ['Department of Psychology, University of Michigan, Ann Arbor, Michigan, United States of America'],\n", 319 | " 'author_roles': {'author_notes': ['Conceived and designed the experiments',\n", 320 | " 'Wrote the paper']},\n", 321 | " 'author_type': 'contributing',\n", 322 | " 'contrib_initials': 'DHW',\n", 323 | " 'contrib_type': 'author',\n", 324 | " 'editor_type': None,\n", 325 | " 'email': None,\n", 326 | " 'footnotes': [],\n", 327 | " 'given_names': 'Daniel H.',\n", 328 | " 'group_name': None,\n", 329 | " 'ids': [],\n", 330 | " 'rid_dict': {'aff': ['aff2']},\n", 331 | " 'surname': 'Weissman'}]" 332 | ] 333 | }, 334 | "execution_count": 5, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "article = Article(example_doi)\n", 341 | "article.authors" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "metadata": { 348 | "collapsed": true 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "article = Article(example_corrections_dois[0])\n", 
353 | "article.type_" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "collapsed": true 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "article = Article(example_retractions_dois[0])\n", 365 | "article.type_" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "## Get related DOIs: article.related_dois\n", 373 | "For corrections and retractions, get the DOI(s) of the PLOS articles being retracted or corrected." 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "collapsed": true 381 | }, 382 | "outputs": [], 383 | "source": [ 384 | "article = Article(example_corrections_dois[0])\n", 385 | "article.related_dois" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": { 392 | "collapsed": true 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "article = Article(example_retractions_dois[0])\n", 397 | "article.related_dois" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "# Working with many articles at once" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "## Get list of every article DOI indexed on the PLOS search API, Solr: get_all_solr_dois()" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": { 418 | "collapsed": true 419 | }, 420 | "outputs": [], 421 | "source": [ 422 | "solr_dois = get_all_solr_dois()\n", 423 | "print(len(solr_dois), \"articles indexed on Solr.\")" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": {}, 429 | "source": [ 430 | "## Get list of every PLOS article you have downloaded: get_all_local_dois()" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "metadata": { 437 | "collapsed": true 438 | }, 439 | 
"outputs": [], 440 | "source": [ 441 | "all_articles = get_all_local_dois()\n", 442 | "print(len(all_articles), \"articles on local computer.\")" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": {}, 448 | "source": [ 449 | "## Combine local and solr articles: get_all_plos_dois()" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": { 456 | "collapsed": true 457 | }, 458 | "outputs": [], 459 | "source": [ 460 | "plos_articles = get_all_plos_dois()" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "metadata": { 467 | "collapsed": true 468 | }, 469 | "outputs": [], 470 | "source": [ 471 | "from allofplos.corpus.plos_corpus import download_updated_xml  # import needed for this cell\n", "download_updated_xml('allofplos_xml/journal.pcbi.0030158.xml')" 472 | ] 473 | } 474 | ], 475 | "metadata": { 476 | "celltoolbar": "Slideshow", 477 | "kernelspec": { 478 | "display_name": "Python 3", 479 | "language": "python", 480 | "name": "python3" 481 | }, 482 | "language_info": { 483 | "codemirror_mode": { 484 | "name": "ipython", 485 | "version": 3 486 | }, 487 | "file_extension": ".py", 488 | "mimetype": "text/x-python", 489 | "name": "python", 490 | "nbconvert_exporter": "python", 491 | "pygments_lexer": "ipython3", 492 | "version": "3.6.6" 493 | }, 494 | "toc": { 495 | "colors": { 496 | "hover_highlight": "#DAA520", 497 | "navigate_num": "#000000", 498 | "navigate_text": "#333333", 499 | "running_highlight": "#FF0000", 500 | "selected_highlight": "#FFD700", 501 | "sidebar_border": "#EEEEEE", 502 | "wrapper_background": "#FFFFFF" 503 | }, 504 | "moveMenuLeft": true, 505 | "nav_menu": {}, 506 | "navigate_menu": true, 507 | "number_sections": true, 508 | "sideBar": true, 509 | "skip_h1_title": false, 510 | "threshold": 4, 511 | "toc_cell": false, 512 | "toc_position": {}, 513 | "toc_section_display": "block", 514 | "toc_window_display": false, 515 | "widenNotebook": false 516 | } 517 | }, 518 | "nbformat": 4, 519 | "nbformat_minor": 2 520 | } 521 |
-------------------------------------------------------------------------------- /allofplos/corpus/__init__.py: -------------------------------------------------------------------------------- 1 | from .corpus import Corpus 2 | 3 | from .plos_corpus import * 4 | -------------------------------------------------------------------------------- /allofplos/corpus/corpus.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from random import Random 4 | from collections import OrderedDict 5 | from itertools import islice 6 | 7 | from .. import get_corpus_dir, Article 8 | from ..transformations import filename_to_doi, doi_to_path 9 | 10 | 11 | class Corpus: 12 | """A collection of PLOS articles.""" 13 | 14 | def __init__(self, directory=None, extension='.xml', seed=None): 15 | """Creation of an article corpus class.""" 16 | if directory is None: 17 | directory = get_corpus_dir() 18 | self.directory = directory 19 | self.extension = extension 20 | self.random = Random(seed) 21 | 22 | def __repr__(self): 23 | """Value of a corpus object when you call it directly on the command line. 24 | 25 | Shows the directory location of the corpus 26 | :returns: directory 27 | :rtype: {str} 28 | """ 29 | out = "Corpus location: {0}\nNumber of files: {1}".format(self.directory, len(self.files)) 30 | return out 31 | 32 | def __len__(self): 33 | return len(self.dois) 34 | 35 | def __iter__(self): 36 | return (article for article in self.random_article_generator) 37 | 38 | def __getitem__(self, key): 39 | 40 | if isinstance(key, int): 41 | return Article(self.dois[key], directory=self.directory) 42 | elif isinstance(key, slice): 43 | return (Article(doi, directory=self.directory) 44 | for doi in self.dois[key]) 45 | elif key not in self.dois: 46 | path = doi_to_path(key, directory=self.directory) 47 | raise IndexError(("You attempted to get {doi} from " 48 | "the corpus at \n{directory}. \n" 49 | "This would point to: {path}.
\n" 50 | "Is that the file that was intended?" 51 | ).format(doi=key, 52 | directory=self.directory, 53 | path=path 54 | ) 55 | ) 56 | else: 57 | return Article(key, directory=self.directory) 58 | 59 | def __contains__(self, value): 60 | is_in = False 61 | if isinstance(value, Article): 62 | is_in = value.doi in self.dois and value.directory == self.directory 63 | elif isinstance(value, str): 64 | doi_in = value in self.dois 65 | file_in = value in self.files 66 | filepath_in = value in self.filepaths 67 | is_in = doi_in or file_in or filepath_in 68 | return is_in 69 | 70 | @property 71 | def iter_file_doi(self): 72 | """Generator that returns filename, doi tuples for every file in the corpus. 73 | 74 | Used to generate both DOI and file generators for the corpus. 75 | """ 76 | return ((file_, filename_to_doi(file_)) 77 | for file_ in sorted(os.listdir(self.directory)) 78 | if file_.endswith(self.extension) and 'DS_Store' not in file_) 79 | 80 | @property 81 | def file_doi(self): 82 | """An ordered dict that maps every corpus file to its accompanying DOI.""" 83 | return OrderedDict(self.iter_file_doi) 84 | 85 | @property 86 | def iter_files(self): 87 | """Generator of article XML filenames in the corpus directory.""" 88 | 89 | return (x[0] for x in self.iter_file_doi) 90 | 91 | @property 92 | def iter_dois(self): 93 | """Generator of DOIs of the articles in the corpus directory. 94 | 95 | Use for looping through all corpus articles with the Article class. 
96 | """ 97 | 98 | return (x[1] for x in self.iter_file_doi) 99 | 100 | @property 101 | def iter_filepaths(self): 102 | """Generator of article XML files in corpus directory, including the full path.""" 103 | return (os.path.join(self.directory, fname) for fname in self.iter_files) 104 | 105 | @property 106 | def files(self): 107 | """List of article XML files in the corpus directory.""" 108 | 109 | return list(self.iter_files) 110 | 111 | @property 112 | def dois(self): 113 | """List of DOIs of the articles in the corpus directory.""" 114 | 115 | return list(self.iter_dois) 116 | 117 | @property 118 | def filepaths(self): 119 | """List of article XML files in corpus directory, including the full path.""" 120 | return list(self.iter_filepaths) 121 | 122 | @property 123 | def article_generator(self): 124 | """iterator of articles""" 125 | return (Article(doi, directory=self.directory) 126 | for doi in self.iter_dois) 127 | 128 | @property 129 | def random_article_generator(self): 130 | """iterator over random articles""" 131 | return (Article(doi, directory=self.directory) 132 | for doi in self.iter_random_dois) 133 | 134 | @property 135 | def random_article(self): 136 | return next(self.random_article_generator) 137 | 138 | def random_sample(self, count): 139 | """ 140 | Creates a generator for random articles. 141 | 142 | Length of generator specified in `count` parameter. 143 | 144 | :param count: specify how many articles are to be returned 145 | :return: a generator of random articles for analysis 146 | """ 147 | 148 | return islice(self.random_article_generator, count) 149 | 150 | @property 151 | def iter_random_dois(self): 152 | return (doi for doi in self.random.sample(self.dois, len(self))) 153 | 154 | @property 155 | def random_doi(self): 156 | return next(self.iter_random_dois) 157 | 158 | def random_dois(self, count): 159 | """ 160 | Gets a list of random DOIs found in the corpus. 161 | 162 | Length of list specified in `count` parameter. 
163 | :param count: specify how many DOIs are to be returned 164 | :return: a list of random DOIs for analysis 165 | """ 166 | 167 | return list(islice(self.iter_random_dois, count)) 168 | -------------------------------------------------------------------------------- /allofplos/dbtoorm.py: -------------------------------------------------------------------------------- 1 | """ This is an example of how to convert the SQLite DB into a class model to 2 | make queries without using SQL syntax. This example uses peewee as the ORM. 3 | It should be used as a template; the end user can edit it to fit their 4 | needs. 5 | """ 6 | 7 | from peewee import * 8 | 9 | # Start of ORM classes creation. 10 | 11 | database = SqliteDatabase('starter.db', **{}) 12 | 13 | class UnknownField(object): 14 | def __init__(self, *_, **__): pass 15 | 16 | class BaseModel(Model): 17 | class Meta: 18 | database = database 19 | 20 | class Affiliations(BaseModel): 21 | affiliations = CharField(unique=True) 22 | 23 | class Meta: 24 | db_table = 'affiliations' 25 | 26 | class Articletype(BaseModel): 27 | article_type = CharField(unique=True) 28 | 29 | class Meta: 30 | db_table = 'articletype' 31 | 32 | class Jatstype(BaseModel): 33 | jats_type = CharField(unique=True) 34 | 35 | class Meta: 36 | db_table = 'jatstype' 37 | 38 | class Journal(BaseModel): 39 | journal = CharField(unique=True) 40 | 41 | class Meta: 42 | db_table = 'journal' 43 | 44 | class Plosarticle(BaseModel): 45 | doi = TextField(db_column='DOI', unique=True) 46 | jats_type = ForeignKeyField(column_name='JATS_type_id', model=Jatstype, field='id') 47 | abstract = TextField() 48 | created_date = DateTimeField() 49 | journal = ForeignKeyField(column_name='journal_id', model=Journal, field='id') 50 | plostype = ForeignKeyField(column_name='plostype_id', model=Articletype, field='id') 51 | title = TextField() 52 | word_count = IntegerField() 53 | 54 | class Meta: 55 | db_table = 'plosarticle' 56 | 57 | class Country(BaseModel): 58 |
country = CharField(unique=True) 59 | 60 | class Meta: 61 | db_table = 'country' 62 | 63 | class Correspondingauthor(BaseModel): 64 | affiliation = ForeignKeyField(column_name='affiliation_id', model=Affiliations, field='id') 65 | corr_author_email = CharField(unique=True) 66 | country = ForeignKeyField(column_name='country_id', model=Country, field='id') 67 | given_name = TextField(null=True) 68 | group_name = TextField(null=True) 69 | surname = TextField(null=True) 70 | tld = TextField(null=True) 71 | 72 | class Meta: 73 | db_table = 'correspondingauthor' 74 | 75 | class Coauthorplosarticle(BaseModel): 76 | article = ForeignKeyField(column_name='article_id', model=Plosarticle, field='id') 77 | corr_author = ForeignKeyField(column_name='corr_author_id', model=Correspondingauthor, field='id') 78 | 79 | class Meta: 80 | db_table = 'coauthorplosarticle' 81 | 82 | # End of ORM classes creation. 83 | 84 | # Query the starter database to retrieve all papers published in the journal 85 | # PLOS Computational Biology, since 2008 with a corresponding author from 86 | # France.
87 | query = (Plosarticle 88 | .select() 89 | .join(Coauthorplosarticle) 90 | .join(Correspondingauthor) 91 | .join(Country) 92 | .join(Journal, on=(Plosarticle.journal == Journal.id)) 93 | .where(Country.country == 'France') 94 | .where(Plosarticle.created_date > '2008-1-1') 95 | .where(Journal.journal == 'PLOS Computational Biology') 96 | ) 97 | 98 | # Get how many papers are returned 99 | print("Papers: {}".format(query.count())) 100 | 101 | # Get the DOIs of all the papers found with the query 102 | print("DOIs:") 103 | for papers in query: 104 | print(papers.doi) 105 | -------------------------------------------------------------------------------- /allofplos/dois.txt: -------------------------------------------------------------------------------- 1 | 10.1371/journal.pone.0005723 2 | 10.1371/journal.pmed.1000431 3 | 10.1371/journal.pmed.0030132 4 | 10.1371/journal.pmed.1000431 5 | 10.1371/journal.pmed.0020402 6 | 10.1371/journal.pmed.0040303 7 | 10.1371/journal.pone.0160653 8 | 10.1371/journal.pone.0016329 9 | 10.1371/journal.pmed.1001418 10 | 10.1371/journal.pmed.1001080 11 | 10.1371/journal.pone.0046041 12 | 10.1371/journal.pone.0016976 13 | 10.1371/journal.pbio.1001199 14 | 10.1371/journal.pbio.0030408 15 | 10.1371/journal.pmed.0020007 16 | 10.1371/journal.pone.0047391 17 | 10.1371/journal.pone.0028031 18 | 10.1371/journal.pbio.1001315 19 | 10.1371/journal.ppat.1002247 20 | 10.1371/journal.pone.0058242 21 | 10.1371/journal.pmed.0030520 22 | 10.1371/journal.pone.0012262 23 | 10.1371/journal.pmed.0030205 24 | 10.1371/journal.pone.0116586 25 | 10.1371/journal.pone.0138823 26 | 10.1371/journal.pone.0026358 27 | 10.1371/journal.ppat.1002735 28 | 10.1371/journal.pntd.0001969 29 | 10.1371/journal.pone.0040259 30 | 10.1371/journal.pone.0036880 31 | 10.1371/journal.pone.0042593 32 | 10.1371/journal.ppat.0040045 33 | 10.1371/journal.pbio.0040088 34 | 10.1371/journal.pmed.1000097 35 | 10.1371/journal.pcbi.1000112 36 | 10.1371/journal.pcbi.1000589 37 | 
10.1371/journal.pone.0055490 38 | 10.1371/journal.pmed.0020124 39 | 10.1371/journal.pgen.1000052 40 | 10.1371/journal.ppat.1000105 41 | 10.1371/journal.pbio.1001473 42 | 10.1371/journal.pmed.0030445 43 | 10.1371/journal.pgen.1002912 44 | 10.1371/journal.pone.0005723 45 | 10.1371/journal.pmed.1001786 46 | 10.1371/journal.pone.0005723 47 | 10.1371/journal.pone.0002554 48 | 10.1371/journal.ppat.0020025 49 | 10.1371/journal.ppat.1005207 50 | 10.1371/journal.pbio.1000359 51 | 10.1371/journal.pmed.0030445 52 | 10.1371/journal.pone.0108198 53 | 10.1371/journal.pone.0097541 54 | 10.1371/journal.pone.0005723 55 | 10.1371/journal.pone.0008519 56 | 10.1371/journal.ppat.1000166 57 | 10.1371/journal.pone.0153170 58 | 10.1371/journal.pone.0152459 59 | 10.1371/journal.pone.0152025 60 | 10.1371/journal.pcbi.1000204 61 | 10.1371/journal.pbio.1001289 62 | 10.1371/journal.pntd.0000149 63 | 10.1371/journal.pone.0153152 64 | 10.1371/journal.pcbi.1000204 65 | 10.1371/journal.pone.0126470 66 | 10.1371/journal.pmed.1001186 67 | 10.1371/journal.pmed.1001300 68 | 10.1371/journal.pone.0008915 69 | 10.1371/journal.pcbi.1001083 70 | 10.1371/journal.pcbi.1002484 71 | 10.1371/journal.pcbi.1001051 72 | 10.1371/journal.pone.0068090 73 | 10.1371/journal.pone.0074790 74 | 10.1371/journal.pone.0078921 75 | 10.1371/journal.pone.0080518 76 | 10.1371/journal.pone.0070598 77 | 10.1371/journal.pone.0078761 78 | 10.1371/journal.pone.0116752 79 | 10.1371/journal.pone.0117949 80 | 10.1371/journal.pcbi.1004141 81 | 10.1371/journal.pcbi.1004082 82 | 10.1371/journal.pone.0118238 83 | 10.1371/journal.pone.0121226 84 | 10.1371/journal.pone.0118342 85 | 10.1371/journal.pone.0117688 86 | 10.1371/journal.pone.0114370 87 | 10.1371/journal.pone.0117014 88 | 10.1371/journal.pone.0119074 89 | 10.1371/journal.pcbi.1004141 90 | 10.1371/journal.pcbi.1004113 91 | 10.1371/journal.pcbi.1004152 92 | 10.1371/journal.pone.0115067 93 | 10.1371/journal.pone.0111971 94 | 10.1371/journal.pone.0119705 95 | 
10.1371/journal.pone.0116201 96 | 10.1371/journal.pcbi.1004079 97 | 10.1371/journal.pcbi.1004089 98 | 10.1371/journal.pcbi.1004156 99 | 10.1371/journal.pmed.1001473 100 | 10.1371/journal.pone.0067380 101 | 10.1371/journal.pone.0069640 102 | 10.1371/journal.pone.0087236 103 | 10.1371/journal.pbio.1001044 104 | 10.1371/journal.pone.0120924 105 | 10.1371/journal.pone.0067179 106 | 10.1371/journal.pone.0067227 107 | 10.1371/journal.pone.0066742 108 | 10.1371/journal.pbio.0020188 109 | 10.1371/journal.pcbi.0030158 110 | 10.1371/journal.pone.0010685 111 | 10.1371/journal.pone.0050698 112 | 10.1371/journal.pbio.1001636 113 | 10.1371/journal.pmed.1001518 114 | 10.1371/journal.pcbi.1003292 115 | 10.1371/journal.pgen.1003316 116 | 10.1371/journal.ppat.1002769 117 | 10.1371/journal.pntd.0002570 118 | 10.1371/journal.pone.0081648 119 | 10.1371/journal.ppat.1003133 120 | 10.1371/journal.pntd.0001041 121 | 10.1371/journal.pone.0120924 122 | 10.1371/journal.pmed.0040303 123 | 10.1371/journal.pmed.0020171 124 | 10.1371/journal.pone.0052690 125 | 10.1371/journal.pbio.0020334 126 | 10.1371/journal.pone.0100977 127 | 10.1371/journal.pcbi.1004692 128 | 10.1371/journal.pone.0147124 129 | 10.1371/journal.pone.0146913 130 | 10.1371/journal.pone.0120049 131 | 10.1371/journal.pcbi.1004453 132 | -------------------------------------------------------------------------------- /allofplos/elements/__init__.py: -------------------------------------------------------------------------------- 1 | from .article_elements import (parse_article_date, get_contrib_info, 2 | match_contribs_to_dicts) 3 | from .journal import Journal 4 | from .license import License 5 | -------------------------------------------------------------------------------- /allofplos/elements/article_elements.py: -------------------------------------------------------------------------------- 1 | """These functions are for parsing individual elements of the article XML tree 2 | 3 | Eventually these functions will probably need 
to be a class. 4 | """ 5 | import datetime 6 | import difflib 7 | import re 8 | import string 9 | 10 | import lxml.etree as et 11 | import unidecode 12 | 13 | 14 | def parse_article_date(date_element, date_format='%d %m %Y'): 15 | """ 16 | For an article date element, convert XML to a datetime object. 17 | :param date_element: An article XML element that contains a date 18 | :param date_format: string format used to convert to datetime object 19 | :return: datetime object 20 | """ 21 | day = '' 22 | month = '' 23 | year = '' 24 | for item in date_element.getchildren(): 25 | if item.tag == 'day': 26 | day = item.text 27 | if item.tag == 'month': 28 | month = item.text 29 | if item.tag == 'year': 30 | year = item.text 31 | if day: 32 | date = (day, month, year) 33 | string_date = ' '.join(date) 34 | date = datetime.datetime.strptime(string_date, date_format) 35 | elif month: 36 | # try both numerical & word versions of month 37 | date = (month, year) 38 | string_date = ' '.join(date) 39 | try: 40 | date = datetime.datetime.strptime(string_date, '%m %Y') 41 | except ValueError: 42 | date = datetime.datetime.strptime(string_date, '%B %Y') 43 | elif year: 44 | date = year 45 | date = datetime.datetime.strptime(date, '%Y') 46 | else: 47 | print('date error') 48 | date = '' 49 | return date 50 | 51 | 52 | def get_rid_dict(contrib_element): 53 | """ For an individual contributor, get the list of their associated rids. 54 | More about rids: https://jats.nlm.nih.gov/archiving/tag-library/1.1/attribute/rid.html 55 | Used in get_contrib_info(). 
56 | :param contrib_element: An article XML element with the tag 57 | :return: dictionary matching each type of rid to its value for that contributor 58 | """ 59 | rid_dict = {} 60 | contrib_elements = contrib_element.getchildren() 61 | # get list of ref-types 62 | rid_type_list = [el.attrib.get('ref-type', 'fn') for el in contrib_elements if el.tag == 'xref'] 63 | 64 | # make dict of ref-types to the actual ref numbers (rids) 65 | for rid_type in set(rid_type_list): 66 | rid_list = [el.attrib.get('rid', None) for el in contrib_elements if el.tag == 'xref' and el.attrib.get('ref-type', 'fn') == rid_type] 67 | rid_dict[rid_type] = rid_list 68 | 69 | return rid_dict 70 | 71 | 72 | def get_author_type(contrib_element): 73 | """Get the type of author for a single contributor from their accompanying element. 74 | Authors can be 'corresponding' or 'contributing'. Depending on the paper, some elements have a 75 | top-level "corresp" attribute that equals yes; otherwise, corresponding status can be inferred 76 | from the existence of the attribute ref-type="corresp". 77 | :param contrib_element: An article XML element with the tag 78 | :return: author type (corresponding, contributing, None) 79 | """ 80 | answer_dict = { 81 | "yes": "corresponding", 82 | "no": "contributing" 83 | } 84 | 85 | author_type = None 86 | if contrib_element.get('contrib-type', None) == 'author': 87 | corr = contrib_element.get('corresp', None) 88 | if corr: 89 | author_type = answer_dict.get(corr, None) 90 | else: 91 | temp = get_rid_dict(contrib_element).get('corresp', None) 92 | if temp: 93 | author_type = answer_dict.get("yes", None) 94 | else: 95 | author_type = answer_dict.get("no", None) 96 | 97 | return author_type 98 | 99 | 100 | def get_contrib_name(contrib_element): 101 | """Get the name for a single contributor from their accompanying element.
102 | Also constructs their initials for later matching to article-level dictionaries about 103 | contributors, including get_aff_dict() and get_fn_dict(). 104 | Can also handle 'collab' aka group authors with a group name but no surname or given names. 105 | :param contrib_element: An article XML element with the tag 106 | :return: dictionary of a single contributor's given names, surname, initials, and group name 107 | """ 108 | given_names = '' 109 | surname = '' 110 | 111 | contrib_name_element = contrib_element.find("name") 112 | if contrib_name_element is not None: 113 | for name_element in contrib_name_element.getchildren(): 114 | if name_element.tag == 'surname': 115 | # for some reason, name_element.text doesn't work for this element 116 | surname = (et.tostring(name_element, 117 | encoding='unicode', 118 | method='text').rstrip(' ').rstrip('\t').rstrip('\n') 119 | or "") 120 | elif name_element.tag == 'given-names': 121 | given_names = name_element.text 122 | if given_names == '': 123 | print("given names element.text didn't work") 124 | given_names = (et.tostring(name_element, 125 | encoding='unicode', 126 | method='text').rstrip(' ').rstrip('\t').rstrip('\n') 127 | or "") 128 | else: 129 | pass 130 | if given_names or surname: 131 | # construct initials if either given or surname is present 132 | try: 133 | contrib_initials = ''.join([part[0].upper() for part in re.split('[-| |,|\.]+', given_names) if part]) + \ 134 | ''.join([part[0] for part in re.split('[-| |,|\.]+', surname) if part[0] in string.ascii_uppercase]) 135 | except IndexError: 136 | contrib_initials = '' 137 | contrib_name = dict(contrib_initials=contrib_initials, 138 | given_names=given_names, 139 | surname=surname) 140 | else: 141 | # if no element found, assume it's a collaboration 142 | contrib_collab_element = contrib_element.find("collab") 143 | group_name = et.tostring(contrib_collab_element, encoding='unicode') 144 | group_name = re.sub('<[^>]*>', '', group_name).rstrip('\n') 145 
| if not group_name: 146 | print("Error constructing contrib_name group element") 147 | group_name = '' 148 | contrib_name = dict(group_name=group_name) 149 | 150 | return contrib_name 151 | 152 | 153 | def get_contrib_ids(contrib_element): 154 | """Get the ids for a single contributor from their accompanying element. 155 | This will mostly get ORCID IDs, and indicate whether they are authenticated. 156 | For more information on ORCIDs, see https://orcid.org/ 157 | :param contrib_element: An article XML element with the tag 158 | :return: list of dictionaries of ids for that contributor 159 | """ 160 | id_list = [] 161 | for item in contrib_element.getchildren(): 162 | if item.tag == 'contrib-id': 163 | contrib_id_type = item.attrib.get('contrib-id-type', None) 164 | contrib_id = item.text 165 | contrib_authenticated = item.attrib.get('authenticated', None) 166 | id_dict = dict(id_type=contrib_id_type, 167 | id=contrib_id, 168 | authenticated=contrib_authenticated 169 | ) 170 | id_list.append(id_dict) 171 | 172 | return id_list 173 | 174 | 175 | def get_credit_taxonomy(contrib_element): 176 | """Get the contributor roles from the CREDiT taxonomy element when it is present. 177 | Right now, this is equivalent to author roles.
178 | For more information about this data structure, see http://dictionary.casrai.org/Contributor_Roles 179 | :param contrib_element: An article XML element with the tag <contrib> 180 | :return: dictionary of contributor roles for an individual contributor 181 | 182 | """ 183 | credit_dict = {} 184 | for item in contrib_element.getchildren(): 185 | if item.tag == 'role': 186 | content_type = item.attrib.get('content-type', None) 187 | if content_type == 'http://credit.casrai.org/': 188 | content_type = 'CASRAI CREDiT taxonomy' 189 | role = item.text 190 | if not credit_dict.get(content_type, None): 191 | credit_dict[content_type] = [role] 192 | else: 193 | credit_dict[content_type].append(role) 194 | return credit_dict 195 | 196 | 197 | def match_contrib_initials_to_dict(contrib_dict, special_dict, matched_keys, contrib_key): 198 | """For an individual contributor, match their initials to a dictionary. 199 | This is used for both matching contributors to email addresses as well as credit roles, 200 | where the keys for all dictionaries are contributor initials. In contrib_dict, these initials are 201 | constructed from the contributor name in get_contrib_name(). For the special dicts, initials are 202 | provided in the raw XML. 203 | See match_contribs_to_dicts() for how this matching process is iterated across contributors.
204 | :param contrib_dict: information about individual contributor, including their name and constructed initials 205 | :param special_dict: usually either get_aff_dict() or get_credit_dict() 206 | :param matched_keys: list of keys in special_dict already matched that will be excluded 207 | :param contrib_key: The item in the contrib dictionary where the matched special_dict will be stored 208 | :return: updated contrib_dict that includes the newly matched special_dict 209 | """ 210 | contributor_initials = contrib_dict.get('contrib_initials') 211 | # special_dict keys (initials) are assumed to be uppercase 212 | special_dict = {k.upper(): v 213 | for k, v in special_dict.items() 214 | if k not in matched_keys} 215 | if contrib_dict.get('group_name', None) is None: 216 | try: 217 | contrib_dict[contrib_key] = special_dict[contributor_initials.upper()] 218 | except KeyError: 219 | # Sometimes middle initials are included or excluded, so restrict both initial sets to 220 | # first and last initial only. 221 | try: 222 | contributor_abbrev_initials = ''.join([contributor_initials[0], contributor_initials[-1]]) 223 | for dict_initials, dict_value in special_dict.items(): 224 | if contributor_abbrev_initials == ''.join([dict_initials[0], dict_initials[-1]]).upper(): 225 | contrib_dict[contrib_key] = dict_value 226 | break 227 | except (IndexError, KeyError) as e: 228 | pass 229 | 230 | return contrib_dict 231 | 232 | 233 | def get_contrib_info(contrib_element): 234 | """Get a dictionary of information for a single contributor from their accompanying element. 235 | Don't call this function directly. Instead, use as a part of get_contributors_info() 236 | This includes all contributor information that can be directly accessed from element contents. 237 | However, other contributor information is stored in article-level dictionaries that need to be matched 238 | for each contributor using the rid_dict created here. 
239 | :param contrib_element: An article XML element with the tag <contrib> 240 | :return: dictionary of contributor name, ids/ORCID, rid_dict, author_roles 241 | """ 242 | # get their name 243 | contrib_dict = get_contrib_name(contrib_element) 244 | 245 | # get contrib type 246 | try: 247 | contrib_dict['contrib_type'] = contrib_element.attrib['contrib-type'] 248 | except KeyError: 249 | # invalid contributor field; shouldn't count as contributor 250 | return None 251 | 252 | # get author type 253 | if contrib_dict.get('contrib_type') == 'author': 254 | contrib_dict['author_type'] = get_author_type(contrib_element) 255 | elif contrib_dict.get('contrib_type') == 'editor': 256 | for item in contrib_element.getchildren(): 257 | if item.tag == 'role' and item.text.lower() != 'editor': 258 | print('new editor type: {}'.format(item.text)) 259 | contrib_dict['editor_type'] = item.text 260 | 261 | # get ORCID, if available 262 | contrib_dict['ids'] = get_contrib_ids(contrib_element) 263 | 264 | # get dictionary of contributor's footnote types to footnote ids 265 | contrib_dict['rid_dict'] = get_rid_dict(contrib_element) 266 | 267 | # get dictionary of CRediT taxonomy, if available 268 | contrib_dict['author_roles'] = get_credit_taxonomy(contrib_element) 269 | 270 | return contrib_dict 271 | 272 | 273 | def match_author_names_to_emails(corr_author_list, email_dict): 274 | """Finds the best match of author names to potential matching email addresses. 275 | Don't call directly, but use as a part of match_contribs_to_dicts(). 276 | Sometimes, the initials-matching process in match_contrib_initials_to_dict() fails. When there are 277 | at least two unmatched corresponding authors and email addresses, this function 278 | figures out the name/email matching with the highest matching contiguous character count and matches them. 279 | This is a 'hail Mary' that thankfully also has a high rate of accuracy.
280 | :param corr_author_list: list of contrib_dicts for corresponding authors with no email field 281 | :param email_dict: dict of unmatched author email addresses, keyed by author initials 282 | :return: list of updated contrib_dicts for each author, now including an email field 283 | """ 284 | overall_matching_dict = {} 285 | match_values = [] 286 | # Step 1: for each author and email combination, compute longest common string 287 | for corr_author in corr_author_list: 288 | # make single string of author full name 289 | seq_1 = unidecode.unidecode(''.join([corr_author.get('given_names'), corr_author.get('surname')]).lower()) 290 | matching_dict = {} 291 | for email_initials, email_address in email_dict.items(): 292 | # make string of email address that doesn't include domain 293 | seq_2 = unidecode.unidecode(email_address[0].lower().split('@')[0]) 294 | matcher = difflib.SequenceMatcher(a=seq_1, b=seq_2) 295 | 296 | # construct dictionary with name, email, and matching string length for each pair 297 | match = matcher.find_longest_match(0, len(matcher.a), 0, len(matcher.b)) 298 | matching_dict[tuple(email_address)] = match[-1] 299 | # add length of match to list of all match lengths 300 | match_values.append(match[-1]) 301 | overall_matching_dict[(corr_author.get('given_names'), corr_author.get('surname'))] = matching_dict 302 | # Step 2: for the author and email combination(s) with the longest common string, match them 303 | # Iterate through match_values in descending order until all are matched 304 | newly_matched_emails = [] 305 | newly_matched_authors = [] 306 | count = 0 307 | while len(newly_matched_emails) < len(overall_matching_dict) and count < 20: 308 | for k1, v1 in overall_matching_dict.items(): 309 | for k2, v2 in v1.items(): 310 | if v2 == max(match_values): 311 | for corr_author in corr_author_list: 312 | if k1 == (corr_author.get('given_names'), corr_author.get('surname')) \ 313 | and k2 not in newly_matched_emails and k1 not in newly_matched_authors: 314 | 
corr_author['email'] = list(k2) 315 | # keep track of matched email & author so they're not matched again 316 | newly_matched_authors.append(k1) 317 | newly_matched_emails.append(k2) 318 | match_values.remove(v2) 319 | count += 1 320 | # Step 3: match the remaining author and email if there's only one remaining (most common) 321 | # Might not be necessary with the while loop 322 | still_unmatched_authors = [author for author in corr_author_list if not author.get('email')] 323 | still_unmatched_emails = {k: v for k, v in email_dict.items() if tuple(v) not in newly_matched_emails} 324 | if len(still_unmatched_authors) == len(still_unmatched_emails) <= 1: 325 | if len(still_unmatched_authors) == len(still_unmatched_emails) == 1: 326 | # only one remaining. it gets matched 327 | still_unmatched_authors[0]['email'] = list(still_unmatched_emails.values())[0] 328 | else: 329 | # we were successful at matching all emails (likely, two pairs had the same match values) 330 | pass 331 | else: 332 | # something's gone wrong. 
the candidate list of emails doesn't match the number of authors 333 | # the corresponding authors printed below will have their ['email'] field unfilled 334 | print('not calculating right', still_unmatched_authors, still_unmatched_emails) 335 | 336 | return corr_author_list 337 | 338 | 339 | def match_contribs_to_dicts(contrib_list, special_dict, contrib_key): 340 | """Match each contributor in contrib_list to an entry in special_dict (e.g. emails or credit roles), keyed by initials. 341 | :param contrib_list: list of contributors 342 | :param special_dict: usually either get_aff_dict() or get_credit_dict() 343 | :param contrib_key: The item in the contrib dictionary where the matched special_dict will be stored 344 | """ 345 | matching_error = False 346 | matched_keys = [] 347 | for contrib_dict in contrib_list: 348 | contrib_dict = match_contrib_initials_to_dict(contrib_dict, 349 | special_dict, 350 | matched_keys, 351 | contrib_key) 352 | if contrib_dict.get(contrib_key, None): 353 | for k, v in special_dict.items(): 354 | if v == contrib_dict.get(contrib_key): 355 | matched_keys.append(k) 356 | if len(special_dict) == len(matched_keys): 357 | # all special_dicts and contributors are matched 358 | pass 359 | else: 360 | unmatched_special_dict = {k: v for k, v in special_dict.items() 361 | if k not in matched_keys} 362 | contrib_dict_missing_special_list = [contrib_dict for contrib_dict in contrib_list 363 | if not contrib_dict.get(contrib_key, None)] 364 | 365 | # if one contributor and one special_dict are unmatched, match them 366 | if len(unmatched_special_dict) == len(contrib_dict_missing_special_list) == 1: 367 | contrib_dict_missing_special_list[0][contrib_key] = list(unmatched_special_dict.values())[0] 368 | 369 | elif len(unmatched_special_dict) != len(contrib_dict_missing_special_list): 370 | # these numbers should always be the same 371 | matching_error = True 372 | 373 | else: 374 | if contrib_key == 'email': 375 | # match remaining contributor names to emails by string matching 376 | contrib_dicts = match_author_names_to_emails(contrib_dict_missing_special_list,
unmatched_special_dict) 377 | if len([contrib for contrib in contrib_dicts if contrib_key not in contrib.keys()]) == 0: 378 | # finally every contributor and special_dict is matched 379 | pass 380 | else: 381 | # even after applying every strategy, there were unmatched contributors 382 | matching_error = True 383 | return contrib_list, matching_error 384 | -------------------------------------------------------------------------------- /allofplos/elements/journal.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | journal_map = OrderedDict([ 4 | ('pone', 'PLOS ONE'), 5 | ('pcbi', 'PLOS Computational Biology'), 6 | ('pntd', 'PLOS Neglected Tropical Diseases'), 7 | ('pgen', 'PLOS Genetics'), 8 | ('ppat', 'PLOS Pathogens'), 9 | ('pbio', 'PLOS Biology'), 10 | ('pmed', 'PLOS Medicine'), 11 | ('pctr', 'PLOS Clinical Trials'), 12 | ('pstr', 'PLOS Sustainability and Transformation'), 13 | ('pclm', 'PLOS Climate'), 14 | ('pwat', 'PLOS Water'), 15 | ('pgph', 'PLOS Global Public Health'), 16 | ('pdig', 'PLOS Digital Health'), 17 | ('pmen', 'PLOS Mental Health'), 18 | ('pcsy', 'PLOS Complex Systems'), 19 | ('annotation', 'PLOS ONE'), 20 | ]) 21 | 22 | nlm_ta_journal = {'plos negl trop dis': 'PLOS Neglected Tropical Diseases', 23 | 'plos pathog': 'PLOS Pathogens', 24 | 'plos genet': 'PLOS Genetics', 25 | 'plos biol': 'PLOS Biology', 26 | 'plos med': 'PLOS Medicine', 27 | 'plos medicin': 'PLOS Medicine', 28 | 'plos one': 'PLOS ONE', 29 | 'plos comput biol': 'PLOS Computational Biology', 30 | 'plos sustain transform': 'PLOS Sustainability and Transformation', 31 | 'plos clim': 'PLOS Climate', 32 | 'plos water': 'PLOS Water', 33 | 'plos glob public health': 'PLOS Global Public Health', 34 | 'plos digit health': 'PLOS Digital Health', 35 | 'plos complex syst': 'PLOS Complex Systems', 36 | 'plos ment health': 'PLOS Mental Health', 37 | } 38 | 39 | 40 | class Journal(): 41 | """For parsing the journal 
name element of articles, as well as converting DOIs to journal names.""" 42 | 43 | def __init__(self, journal_meta_element): 44 | """Initialize an instance of the journal class.""" 45 | self.element = journal_meta_element 46 | 47 | @staticmethod 48 | def doi_to_journal(doi): 49 | """For a given doi, get the PLOS journal that the article is published in. 50 | 51 | For the subset of DOIs with 'annotation' in the name, assumes PLOS ONE. 52 | :return: string of journal name 53 | """ 54 | 55 | return next(value for key, value in journal_map.items() if key in doi) 56 | 57 | def __str__(self): 58 | """Provides str(Journal()) style access to Journal().parse_plos_journal. 59 | """ 60 | return self.parse_plos_journal() 61 | 62 | def parse_plos_journal(self, caps_fixed=True): 63 | """For an individual PLOS article, get the journal it was published in from the article XML. 64 | 65 | Relies on article XML metadata. For DOI to journal conversion, see `doi_to_journal()`. 66 | :param caps_fixed: whether to render 'PLOS' in the journal name correctly, or as-is ('PLoS') 67 | :return: PLOS journal name at specified xpath location 68 | """ 69 | journal = '' 70 | # location for newer journal articles 71 | journal_path_1 = self.element.xpath('./journal-title-group/journal-title') 72 | if len(journal_path_1): 73 | assert len(journal_path_1) == 1 74 | journal = journal_path_1[0].text 75 | else: 76 | # location for older journal articles 77 | journal_path_2 = self.element.xpath('./journal-title') 78 | if len(journal_path_2): 79 | assert len(journal_path_2) == 1 80 | journal = journal_path_2[0].text 81 | 82 | else: 83 | # location for oldest journal articles 84 | nlm_ta_id = [j for j in self.element.getchildren() if j.attrib.get('journal-id-type', None) == 'nlm-ta'] 85 | assert len(nlm_ta_id) == 1 86 | journal = nlm_ta_id[0].text 87 | journal = nlm_ta_journal.get(journal.lower(), journal) 88 | 89 | if caps_fixed: 90 | journal = journal.split() 91 | if journal[0].lower() == 'plos': 92 | 
journal[0] = "PLOS" 93 | journal = (' ').join(journal) 94 | 95 | assert journal in journal_map.values(), '{}: journal field not a valid PLOS journal'.format(journal) 96 | return journal 97 | -------------------------------------------------------------------------------- /allofplos/elements/license.py: -------------------------------------------------------------------------------- 1 | import lxml.etree as et 2 | import re 3 | 4 | # Creative Commons links 5 | xlink_href = '{http://www.w3.org/1999/xlink}href' 6 | cc_by_4_link = 'https://creativecommons.org/licenses/by/4.0/' 7 | cc_by_3_link = 'https://creativecommons.org/licenses/by/3.0/' 8 | cc0_link = 'https://creativecommons.org/publicdomain/zero/1.0/' 9 | cc_by_3_igo_link = 'https://creativecommons.org/licenses/by/3.0/igo/' 10 | crown_link = 'http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/' 11 | cc_dict = {'CC-BY 4.0': cc_by_4_link, 12 | 'CC-BY 3.0': cc_by_3_link, 13 | 'CC0': cc0_link, 14 | 'CC-BY 3.0 IGO': cc_by_3_igo_link, 15 | 'Crown Copyright': crown_link, 16 | } 17 | 18 | 19 | class License(): 20 | """For parsing the license element of articles.""" 21 | 22 | def __init__(self, permissions_element, doi): 23 | """Initialize an instance of the license class.""" 24 | self.element = permissions_element 25 | self.doi = doi 26 | 27 | def __iter__(self): 28 | """Provides the ability to cast License as a dictionary using 29 | dict(License(…)). 30 | 31 | Returns a generator of (key, value) tuples, which when passed into 32 | dict(), will create the appropriate dictionary. 33 | """ 34 | return ((key, value) for key, value in self.license.items()) 35 | 36 | @property 37 | def license(self): 38 | """Dictionary of CC license information from the article license field. 
39 | """ 40 | lic = '' 41 | cc_link = '' 42 | copy_year = '' 43 | copy_holder = '' 44 | permissions = self.element 45 | if permissions.xpath('./copyright-year'): 46 | copy_year = int(permissions.xpath('./copyright-year')[0].text.strip()) 47 | if permissions.xpath('./copyright-holder'): 48 | try: 49 | copy_holder = ', '.join([x.text.strip() for x in permissions.xpath('./copyright-holder')]) 50 | except AttributeError: 51 | print('error getting copyright holder for {}'.format(self.doi)) 52 | 53 | license = permissions.xpath('./license')[0] 54 | if license.attrib.get(xlink_href): 55 | cc_link = license.attrib[xlink_href] 56 | elif license.xpath('.//ext-link'): 57 | link = license.xpath('.//ext-link')[0] 58 | cc_link = link.attrib[xlink_href] 59 | if cc_link: 60 | if cc_link == cc_by_4_link or any(x in cc_link for x in ["Attribution", "4.0"]): 61 | lic = 'CC-BY 4.0' 62 | elif cc_link == cc_by_3_igo_link or 'by/3.0/igo' in cc_link: 63 | lic = 'CC-BY 3.0 IGO' 64 | elif cc_link == cc_by_3_link or 'by/3.0' in cc_link: 65 | lic = 'CC-BY 3.0' 66 | elif cc_link == cc0_link or 'zero/1.0/' in cc_link: 67 | lic = 'CC0' 68 | elif cc_link == 'http://www.nationalarchives.gov.uk/doc/open-government-licence/open-government-licence.htm' \ 69 | or 'open-government-licence' in cc_link: 70 | lic = "Crown Copyright" 71 | elif cc_link == 'http://www.plos.org/oa/': 72 | lic = 'CC-BY 3.0 IGO' 73 | else: 74 | print('not 4.0', self.doi, link.attrib[xlink_href]) 75 | lic = '' 76 | else: 77 | lic = self.parse_license(license) 78 | lic_dict = {'license': lic, 79 | 'license_link': cc_dict.get(lic, ''), 80 | 'copyright_holder': copy_holder, 81 | 'copyright_year': copy_year} 82 | return lic_dict 83 | 84 | def parse_license(self, license): 85 | """For license elements without external links, figure out the appropriate copyright. 
86 | 87 | :param license: an article XML element with the tag <license> 88 | :return: license name 89 | """ 90 | license_text = ' '.join(re.split(r'\+|\n|\t| ', et.tostring(license, method='text', encoding='unicode'))) 91 | license_text = ''.join(line.lstrip(' \t') for line in license_text.splitlines(True)) 92 | license_text = license_text.replace('\n', ' ').replace('\r', '') 93 | if any(x in license_text.lower() for x in ["commons attribution license", "creative commons attrib"]): 94 | lic = 'CC-BY 4.0' 95 | if any(char.isdigit() for char in license_text): 96 | digits = [char for char in license_text if char.isdigit()] 97 | # Flag numbers in case it specifies a CC version number 98 | print("Number found in CC license string for {}".format(self.doi), digits) 99 | elif "commons public domain" in license_text.lower() or any(x in license_text for x in ['CC0', 'CCO public', "public domain"]): 100 | lic = 'CC0' 101 | elif "creative commons" in license_text.lower(): 102 | print(self.doi, 'unknown CC', license_text) 103 | lic = '' 104 | else: 105 | if 'Public Library of Science Open-Access License' in license_text: 106 | lic = 'CC-BY 4.0' 107 | elif "crown copyright" in license_text.lower() or \ 108 | any(x in license_text for x in ['Open Government Licen', 'Public Sector Information Regulations']): 109 | lic = 'Crown Copyright' 110 | elif "WHO" in license_text: 111 | lic = 'CC-BY 3.0 IGO' 112 | else: 113 | lic = 'CC-BY 4.0' 114 | return lic 115 | -------------------------------------------------------------------------------- /allofplos/jupyter_nbconvert_config.py: -------------------------------------------------------------------------------- 1 | c.ExecutePreprocessor.kernel_name = "py3" -------------------------------------------------------------------------------- /allofplos/makedb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Make a SQLite DB out of article
XML files 6 | """ 7 | 8 | import argparse 9 | import datetime 10 | import os 11 | from itertools import islice 12 | 13 | from tqdm import tqdm 14 | import sqlite3 15 | 16 | from peewee import Model, CharField, ForeignKeyField, TextField, \ 17 | DateTimeField, BooleanField, IntegerField, IntegrityError 18 | from playhouse.sqlite_ext import SqliteExtDatabase 19 | 20 | from allofplos.corpus import Corpus 21 | from allofplos.transformations import filename_to_doi, convert_country 22 | from allofplos import starterdir 23 | from allofplos.article import Article 24 | 25 | journal_title_dict = { 26 | 'PLOS ONE': 'PLOS ONE', 27 | 'PLOS GENETICS': 'PLOS Genetics', 28 | 'PLOS GENET': 'PLOS Genetics', 29 | 'PLOS NEGLECTED TROPICAL DISEASES': 'PLOS Neglected Tropical Diseases', 30 | 'PLOS NEGL TROP DIS': 'PLOS Neglected Tropical Diseases', 31 | 'PLOS BIOLOGY': 'PLOS Biology', 32 | 'PLOS BIOL': 'PLOS Biology', 33 | 'PLOS CLINICAL TRIALS': 'PLOS Clinical Trials', 34 | 'PLOS MEDICINE': 'PLOS Medicine', 35 | 'PLOS MEDICIN': 'PLOS Medicine', 36 | 'PLOS MED': 'PLOS Medicine', 37 | 'PLOS PATHOG': 'PLOS Pathogens', 38 | 'PLOS PATHOGENS': 'PLOS Pathogens', 39 | 'PLOS COMPUTATIONAL BIOLOGY': 'PLOS Computational Biology', 40 | 'PLOS COMPUT BIOL': 'PLOS Computational Biology', 42 | } 43 | 44 | 45 | parser = argparse.ArgumentParser() 46 | parser.add_argument('--db', action='store', help= 47 | 'Name the db', default='ploscorpus.db') 48 | parser.add_argument('--random', action='store', type=int, help= 49 | 'Number of articles in a random subset.
' 50 | 'By default will use all articles') 51 | parser.add_argument('--starterdb', action='store_true', help= 52 | 'Make the starter database', dest='starter') 53 | args = parser.parse_args() 54 | 55 | # TODO: Put a warning that the DB will be deleted 56 | if os.path.isfile(args.db): 57 | os.remove(args.db) 58 | 59 | if args.starter: 60 | if os.path.isfile('starter.db'): 61 | os.remove('starter.db') 62 | db = SqliteExtDatabase('starter.db') 63 | else: 64 | db = SqliteExtDatabase(args.db) 65 | 66 | class BaseModel(Model): 67 | class Meta: 68 | database = db 69 | 70 | class Journal(BaseModel): 71 | """ 72 | Journal table stores journals values. It is used in the PLOSArticle table. 73 | Fields: 74 | journal: Journal name. Based on the values of journal_title_dict 75 | """ 76 | journal = CharField(unique=True) 77 | 78 | class ArticleType(BaseModel): 79 | """ 80 | ArticleType table stores article types used in PLOSArticle table. 81 | Fields: 82 | article_type: Article type such as Research Article, Retraction, 83 | Essay, Perspective and others. 84 | """ 85 | article_type = CharField(unique=True) 86 | 87 | class Country(BaseModel): 88 | """ 89 | Country table stores countries that are related with the corresponding 90 | author. 91 | Fields: 92 | country: Country. 93 | """ 94 | country = CharField(unique=True) 95 | 96 | class Affiliations(BaseModel): 97 | """ 98 | Affiliations table stores affiliations that are related with the 99 | corresponding author. 100 | Fields: 101 | affiliations: Author affiliations such as "Department of Public Health, 102 | University of Helsinki, Finland". 103 | """ 104 | affiliations = CharField(unique=True) 105 | 106 | class Subjects(BaseModel): 107 | """ 108 | Subjects table stores subjects that are related to PLOSArticle table by 109 | using a link table (since an article may have multiple subjects) 110 | Fields: 111 | subjects: Subjects like "Molecular Biology", "Cell processes" and so on. 
112 | """ 113 | subjects = CharField(unique=True) 114 | 115 | class CorrespondingAuthor(BaseModel): 116 | """ 117 | CorrespondingAuthor table stores the information of the corresponding 118 | author. 119 | Fields: 120 | corr_author_email: e-mail of the corresponding author 121 | tld: Top Level Domain, last part of the email, such as .ar, .es, .com. 122 | given_name: Given name of the author. 123 | surname: Surname of the author. 124 | group_name: Name of the group if a group is the author. If not, it is null. 125 | affiliation: Author affiliation, this is a linked field. 126 | country: Country of the author, this is a linked field. 127 | """ 128 | corr_author_email = CharField(unique=True) 129 | tld = TextField(null=True) 130 | given_name = TextField(null=True) 131 | surname = TextField(null=True) 132 | group_name = TextField(null=True) 133 | affiliation = ForeignKeyField(Affiliations, related_name='aff') 134 | country = ForeignKeyField(Country, related_name='aff') 135 | 136 | class JATSType(BaseModel): 137 | """ 138 | JATSType table stores the article type using the JATS (Journal Article 139 | Tag Suite) standard (https://jats.nlm.nih.gov/). 140 | Fields: 141 | jats_type: JATS type, such as "research-article", "discussion", "editorial". 142 | """ 143 | jats_type = CharField(unique=True) 144 | 145 | class PLOSArticle(BaseModel): 146 | """ 147 | PLOSArticle table is the main table of the database. It stores the articles. 148 | Fields: 149 | DOI: DOI for the article. For example: 10.1371/journal.pcbi.0030199 150 | abstract: Abstract for the article. 151 | title: Title for the article. 152 | plostype = Article type (internal PLOS classification), this is a linked 153 | field. 154 | journal = Journal name, this is a linked field. 155 | created_date = Article creation date. For example: 2006-04-11 00:00:00 156 | word_count = Amount of words in the article. 157 | JATS_type = Article type (JATS classification), this is a linked field. 
158 | """ 159 | DOI = TextField(unique=True) 160 | abstract = TextField() 161 | title = TextField() 162 | plostype = ForeignKeyField(ArticleType, related_name='arttype') 163 | journal = ForeignKeyField(Journal, related_name='journals') 164 | created_date = DateTimeField(default=datetime.datetime.now) 165 | word_count = IntegerField() 166 | JATS_type = ForeignKeyField(JATSType, related_name='jats') 167 | 168 | class SubjectsPLOSArticle(BaseModel): 169 | """ 170 | SubjectsPLOSArticle is a link table to relate subject with articles 171 | Fields: 172 | subject: This field is linked to Subjects table. 173 | article: This field is linked to PLOSArticle table. 174 | """ 175 | subject = ForeignKeyField(Subjects) 176 | article = ForeignKeyField(PLOSArticle) 177 | 178 | class CoAuthorPLOSArticle(BaseModel): 179 | """ 180 | CoAuthorPLOSArticle is a link table to relate a co-author with a PLOS 181 | article. 182 | Fields: 183 | corr_author: This field is linked to CorrespondingAuthor table. 184 | article: This field is linked to PLOSArticle table. 
185 | """ 186 | corr_author = ForeignKeyField(CorrespondingAuthor) 187 | article = ForeignKeyField(PLOSArticle) 188 | 189 | db.connect() 190 | db.create_tables([Journal, PLOSArticle, ArticleType, CoAuthorPLOSArticle, 191 | CorrespondingAuthor, JATSType, Affiliations, Country, 192 | SubjectsPLOSArticle, Subjects]) 193 | 194 | 195 | corpus_dir = starterdir if args.starter else None 196 | all_files = Corpus(corpus_dir) 197 | num_files = len(all_files) if args.random is None else args.random 198 | 199 | for article in tqdm(islice(all_files, args.random), total=num_files): 200 | journal_name = journal_title_dict[article.journal.upper()] 201 | with db.atomic() as atomic: 202 | try: 203 | journal = Journal.create(journal = journal_name) 204 | except IntegrityError: 205 | db.rollback() 206 | journal = Journal.get(Journal.journal == journal_name) 207 | with db.atomic() as atomic: 208 | try: 209 | article_type = ArticleType.create(article_type = article.plostype) 210 | except IntegrityError: 211 | db.rollback() 212 | article_type = ArticleType.get(ArticleType.article_type == article.plostype) 213 | with db.atomic() as atomic: 214 | try: 215 | j_type = JATSType.create(jats_type = article.type_) 216 | except IntegrityError: 217 | db.rollback() 218 | j_type = JATSType.get(JATSType.jats_type == article.type_) 219 | p_art = PLOSArticle.create( 220 | DOI=article.doi, 221 | journal = journal, 222 | abstract=article.abstract.replace('\n', '').replace('\t', ''), 223 | title = article.title.replace('\n', '').replace('\t', ''), 224 | plostype = article_type, 225 | created_date = article.pubdate, 226 | word_count=article.word_count, 227 | JATS_type = j_type) 228 | # Get subject information 229 | taxonomy_set = set() 230 | taxonomy = article.taxonomy 231 | for values in taxonomy.values(): 232 | for value in values: 233 | for taxon in value: 234 | taxonomy_set.add(taxon) 235 | for taxon in taxonomy_set: 236 | with db.atomic() as atomic: 237 | try: 238 | subject = Subjects.create(subjects 
= taxon) 239 | except (sqlite3.IntegrityError, IntegrityError): 240 | db.rollback() 241 | subject = Subjects.get(Subjects.subjects == taxon) 242 | SubjectsPLOSArticle.create( 243 | subject = subject, 244 | article = p_art 245 | ) 246 | if article.authors: 247 | iterable_authors = article.authors 248 | else: 249 | iterable_authors = [] 250 | for auths in iterable_authors: 251 | if auths['email']: 252 | with db.atomic() as atomic: 253 | if auths['affiliations']: 254 | author_aff = auths['affiliations'][0] 255 | else: 256 | author_aff = 'N/A' 257 | try: 258 | aff = Affiliations.create(affiliations = author_aff) 259 | except (sqlite3.IntegrityError, IntegrityError): 260 | db.rollback() 261 | aff = Affiliations.get(Affiliations.affiliations == 262 | author_aff) 263 | with db.atomic() as atomic: 264 | try: 265 | if auths['affiliations'][0] == '': 266 | country_from_aff = 'N/A' 267 | else: 268 | country_from_aff = auths['affiliations'][0].\ 269 | split(',')[-1].strip() 270 | except IndexError: 271 | country_from_aff = 'N/A' 272 | country_from_aff = convert_country(country_from_aff) 273 | try: 274 | country = Country.create(country = country_from_aff) 275 | except IntegrityError: 276 | db.rollback() 277 | country = Country.get(Country.country == country_from_aff) 278 | 279 | try: 280 | co_author = CorrespondingAuthor.create( 281 | corr_author_email = auths['email'][0], 282 | tld = auths['email'][0].split('.')[-1], 283 | given_name = auths['given_names'], 284 | surname = auths['surname'], 285 | group_name = auths['group_name'], 286 | affiliation = aff, 287 | country = country 288 | ) 289 | except IntegrityError: 290 | co_author = CorrespondingAuthor.\ 291 | get(CorrespondingAuthor.corr_author_email == auths['email'][0]) 292 | coauthplosart = CoAuthorPLOSArticle.create( 293 | corr_author = co_author, 294 | article = p_art 295 | ) 296 | -------------------------------------------------------------------------------- /allofplos/plos_regex.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | The following RegEx pertains to the 7 main PLOS journals and the defunct PLOS Clinical Trials, as well as PLOS Currents. 3 | """ 4 | 5 | import re 6 | import os 7 | 8 | from . import get_corpus_dir 9 | 10 | regex_match_prefix = r"^10\.1371/" 11 | regex_body_match = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7}$)" 12 | r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}$))") 13 | regex_body_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" 14 | r"|(annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))") 15 | regex_body_currents = (r"((currents\.[a-zA-Z]{2,9}\.[a-zA-Z0-9]{32}$)" 16 | r"|(currents\.RRN[\d]{4}$)" 17 | r"|([a-zA-Z0-9]{13}$)" 18 | r"|([a-zA-Z0-9]{32}$))") 19 | regex_file_search = (r"((journal\.p[a-zA-Z]{3}\.[\d]{7})" 20 | r"|(plos\.correction\.[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}))") 21 | full_doi_regex_match = re.compile(regex_match_prefix+regex_body_match) 22 | full_doi_regex_search = re.compile(r"10\.1371/journal\.p[a-zA-Z]{3}\.[\d]{7}" 23 | r"|10\.1371/annotation/[a-zA-Z0-9]{8}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{4}-[a-zA-Z0-9]{12}") 24 | currents_doi_regex = re.compile(regex_match_prefix+regex_body_currents) 25 | file_regex_match = re.compile(regex_file_search+r"\.xml") 26 | BASE_URL = 'https://journals.plos.org/plosone/article/file?id=' 27 | URL_SUFFIX = '&type=manuscript' 28 | external_url_regex_match = re.compile(re.escape(BASE_URL) + 29 | re.escape("10.1371/") + 30 | regex_body_search + 31 | re.escape(URL_SUFFIX)) 32 | 33 | 34 | def validate_doi(doi): 35 | """ 36 | For an individual string, tests whether the full string is in a valid PLOS DOI format or not 37 | Example: '10.1371/journal.pbio.2000777' is True, but '10.1371/journal.pbio.2000777 ' is False 38 | :return: True if string is in valid PLOS DOI format; False if not 39 | """ 40 | return
bool(full_doi_regex_match.search(doi)) 41 | 42 | 43 | def validate_filename(filename): 44 | """ 45 | For an individual string, tests whether the full string is a valid article filename. This can take two forms. 46 | 47 | TODO: Officially document these two forms and give them names. Also, explain the example below. 48 | 49 | Example: 'allofplos_xml/journal.pbio.2000777.xml' is True, but 'allofplos_xml/journal.pbio.20007779.xml' is False 50 | :filename: A string with a file name 51 | :return: True if string is in a valid PLOS corpus article format; False if not 52 | """ 53 | if file_regex_match.search(filename): 54 | return True 55 | else: 56 | return False 57 | 58 | 59 | def validate_url(url): 60 | """ 61 | For an individual string, tests whether the full string is in a valid article URL format or not 62 | Example: 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pcbi.0020147&type=manuscript' is True, 63 | but 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pcbi.0020147' is False 64 | :return: True if string is in a valid PLOS article url; False if not 65 | """ 66 | return bool(external_url_regex_match.search(url)) 67 | 68 | 69 | def find_valid_dois(doi): 70 | """ 71 | For an individual string, searches for any valid PLOS DOIs within it and returns them 72 | :return: list of valid PLOS DOIs contained within string 73 | """ 74 | return full_doi_regex_search.findall(doi) 75 | 76 | 77 | def show_invalid_dois(doi_list): 78 | """ 79 | Checks to see whether a list of PLOS DOIs follow the correct format. Used mainly to determine 80 | if linked DOI fields in other articles (such as retractions and corrections) are correct. 81 | :return: list of DOI candidates that don't match PLOS's pattern 82 | """ 83 | return list(filter(lambda x: not validate_doi(x), doi_list)) 84 | 85 | 86 | def currents_doi_filter(doi_list): 87 | """ 88 | Checks to see whether a list of PLOS Currents DOIs follow the correct format. 
Used mainly to determine 89 | if linked DOI fields in PMC articles are correct. 90 | :return: list of DOI candidates that don't match Currents' pattern 91 | """ 92 | return list(filter(lambda x: not bool(currents_doi_regex.search(x)), doi_list)) 93 | -------------------------------------------------------------------------------- /allofplos/samples/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /allofplos/starter.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PLOS/allofplos/07fdeed848b2816a7fbf7da6418ef6e2076e1960/allofplos/starter.db -------------------------------------------------------------------------------- /allofplos/starter_corpus/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /allofplos/starter_corpus/journal.pbio.0020188.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 |
5 | 6 | plospbioPLoS BiolplosbiolPLoS Biology1545-78851544-9173 7 | Public Library of Science 8 | San Francisco, USA 9 | 10 | 10.1371/journal.pbio.0020188 11 | 12 | Correspondence and Other Communications 13 | 14 | 15 | Cell Biology 16 | Science Policy 17 | 18 | Taking the Stem Cell Debate to the PublicCorrespondence 19 | 20 | 21 | Zon 22 | Leonard I 23 | 24 | 25 | * 26 | 27 | 28 | 29 | 30 | Zoloth 31 | Laurie 32 | 33 | 34 | 35 | 36 | Kadereit 37 | Suzanne 38 | 39 | 40 | 41 | 42 |

Howard Hughes Medical Institute, Chevy Chase, Maryland, United States of America

43 |
44 | 45 |

Children's Hospital, Harvard Medical School, Cambridge, Massachusetts, United States of America

46 |
47 | 48 |

International Society for Stem Cell Research, Northbrook, Illinois, United States of America

49 |
50 | 51 |

Center for Genetic Medicine, Northwestern University, Evanston, Illinois, United States of America

52 |
53 | *To whom correspondence should be addressed. E-mail: zon@enders.tch.harvard.edu 54 |
55 | 6 56 | 2004 57 | 58 | 15 59 | 6 60 | 2004 61 | 26e1882004Zon et alThis is an open-access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. 62 | Reason as Our Guide 63 | 64 | Beyond Therapy 65 | 66 | Ethics As Our Guide 67 | 68 | Scientists and Bioethics Councils 69 | 70 | A Voice for Research, a Voice for Patients 71 | 72 | Ethereal Ethics 73 | 74 |

In response to the Blackburn and Rowley essay on the President's Council on Bioethics, several thought-provoking opinions on ethical challenges in biomedical research are expressed by prominent stakeholders.

75 |
76 |
77 | 78 | 79 | 80 | <p>In their essay in the April 2004 issue of <italic>PLoS Biology</italic>, Elizabeth <xref ref-type="bibr" rid="pbio-0020188-Blackburn1">Blackburn and Janet Rowley (2004)</xref>, two distinguished cellular biologists and members of the President's Council on Bioethics, strongly question the scientific foundation of two reports from the Council (<xref ref-type="bibr" rid="pbio-0020188-PCB1">President's Council on Bioethics 2003</xref>, <xref ref-type="bibr" rid="pbio-0020188-PCB2">2004</xref>). The Council on Bioethics was formed by executive order “to advise the President on bioethical issues that may emerge as a consequence of advances in biomedical science and technology.” An open discussion between ethicists and scientists is critical to the advisory system. The recent administrative dismissal of Dr. Blackburn from the Council is very alarming. By stacking the deck with conservative opinions, and not accurately discussing the scientific issues, the Bioethics Council has become irrelevant to the scientific community and presents a jaundiced view to the public.</p> 81 | <p>Stem cell research and its applications have the potential to revolutionize human health care. Recent polls show support for embryonic stem cell research, even with conservative voters. The public, as the major benefactor of biomedical research and the target population of beneficial clinical advances, has the right to a fact-based discussion of the science regarding stem cells. It is therefore time that the debate on stem cell research, with its risks and benefits, be taken to the public. A debate on stem cell research restricted to the President's Council on Bioethics is a disservice to the public.</p> 82 | <p>Nearly three decades ago, the advent of recombinant DNA technology and in vitro fertilization (IVF) techniques, raised similar concerns regarding research. 
Contrary to apprehensive expectations, recombinant DNA technology has boosted enormous advances in the health care and pharmaceutical industry. IVF evolved to be a widely accepted, safe medical procedure, with over one million healthy babies born by IVF and related treatments. Similarly, once stem cells are successfully used in the clinic, most of today's political and ethical issues will evaporate.</p> 83 | <p>The International Society for Stem Cell Research (ISSCR), a society whose membership encompasses the bulk of the stem cell research brain trust, holds the position that research on both adult and embryonic stem cells will guarantee the fastest progress in scientific discovery and clinical advances. The ISSCR also strongly opposes reproductive cloning and supports the National Academy of Science's proposal to develop voluntary guidelines to encourage responsible practices in human embryonic stem cell research.</p> 84 | <p>One of the original recommendations of the President's Council on Bioethics was a four-year moratorium on stem cell research. The purpose of this moratorium was theoretically to open a large, national discourse on the topic of stem cell research, a debate intended to bring all sides into thoughtful reflection on the issue. To that end, the ISSCR has repeatedly and consistently offered an open forum for all sides in the debate at our conferences, and has carefully offered invitations to join our society and to speak at our annual meeting to members of the President's Council, including colleagues whose opposition to stem cell research has been clear. None have accepted. Dr. Kass, in particular, has received several direct appeals but has turned down every such opportunity to make his case to the researchers who arguably are his discourse partners, from whom he could learn much, and whom he should be actively engaged in teaching. 
It is tragic that voices of dissent and debate are stilled, for it is this very quality of open debate that is at the heart of both the scientific method and an ethically directed American democracy—surely a goal that we all share.</p> 85 | </sec> 86 | </body> 87 | <back> 88 | <ref-list> 89 | <title>References 90 | 91 | 92 | 93 | 94 | 95 | Blackburn 96 | E 97 | 98 | 99 | Rowley 100 | J 101 | 102 | 103 | Reason as our guide. 104 | PLoS Biol 105 | 2004 106 | 2 107 | e116 108 | doi: 10.1371/journal.pbio.0020116 109 | 110 | 111 | 112 | 113 | 114 | President's Council on Bioethics 115 | Beyond therapy: Biotechnology and the pursuit of happiness. 116 | 2003 117 | Available at http://bioethics.gov/reports/beyondtherapy/index.html via the Internet. Accessed 19 April 2004 118 | 119 | 120 | 121 | 122 | 123 | President's Council on Bioethics 124 | Monitoring stem cell research. 125 | 2004 126 | Available at http://bioethics.gov/reports/stemcell/index.html via the Internet. Accessed 24 March 2004 127 | 128 | 129 | 130 | 131 |
-------------------------------------------------------------------------------- /allofplos/starter_corpus/journal.pbio.0030408.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 |
5 | 6 | plospbioPLoS BiolplosbiolPLoS Biology1544-91731545-7885 7 | Public Library of Science 8 | San Francisco, USA 9 | 10 | 10.1371/journal.pbio.0030408 11 | 12 | Synopsis 13 | 14 | 15 | Neuroscience 16 | 17 | 18 | Homo (human) 19 | 20 | Stimulating the Brain Makes the Fingers More SensitiveSynopsis 21 | 11 22 | 2005 23 | 24 | 18 25 | 10 26 | 2005 27 | 311e4082005Public Library of ScienceThis is an open-access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited. 28 | Improvement of Tactile Discrimination Performance and Enlargement of Cortical Somatosensory Maps after 5 Hz rTMS 29 | 30 | 31 | 32 | 33 | 34 | <p>Repetitive transcranial magnetic stimulation (rTMS) has more than a whiff of Buck Rogers to it: a magnetic wand passes over the surface of the skull, triggering changes in the brain underneath. But it's not science fiction, and in the past decade, rTMS has emerged as an intriguing technique for exploring brain function, and a promising, though still unproven, form of therapy. In this issue, Hubert Dinse, Martin Tegenthoff, and their colleagues show that a short course of rTMS can increase finger sensitivity for up to two hours after treatment ends, and that this change corresponds to an increase in the size of the brain map representing the finger.</p> 35 | <p>rTMS is applied with an electromagnetic coil in the shape of a figure-eight, placed on the scalp directly over the targeted portion of the brain. Short bursts of a strong magnetic pulse stimulate electrical currents within. Sensory input from each region of the body is represented on the surface of the brain, and the location of any region—in this case, the right index finger—can be mapped to allow precise targeting of the rTMS. 
The authors adjusted the strength of the magnetic field to just below that which triggered a sensory response in the finger, and then applied intermittent pulses of stimulation over the course of about ten minutes.</p> 36 | <p>They tested the sensitivity of the index finger by determining how far apart two simultaneously applied pinpricks needed to be for the subject to distinguish them as separate stimuli. rTMS increased this two-point discrimination by about 15% immediately after stimulation, an effect that gradually diminished but still remained significant over the course of the next two hours. The effect was fairly specific for the right index finger: there was no effect on the left index finger, which is represented in the opposite hemisphere, and only a small effect on the right ring finger, which is represented several millimeters away from the index finger in the same hemisphere. When stimulation was applied over the area representing the lower leg, the index finger did not become more sensitive. <xref ref-type="fig" rid="pbio-0030408-g001"/></p> 37 | <fig id="pbio-0030408-g001" position="float"> 38 | <object-id pub-id-type="doi">10.1371/journal.pbio.0030408.g001</object-id> 39 | <caption> 40 | <title>Transcranial magnetic stimulation alters sensory perception and activity in sensory cortical areas 41 | 42 | 43 | 44 |

The authors used functional magnetic resonance imaging (fMRI) to see how the brain changed in response to the stimulation. They found that the region representing the index finger got larger, and that the degree of increase in any one subject corresponded to the degree of increased sensitivity in that same subject. As the sensory effect faded, so too did the fMRI changes. Thus, the cortex itself undergoes changes as a result of rTMS.

45 |

Practice affects the brain and the brain affects practice—it now appears possible to directly intervene in this brain–behavior loop to improve short-term tactile performance. Other recent work by the same authors shows that rTMS can also improve visual discrimination, suggesting a potential for affecting changes throughout the brain. These results are unlikely to be of immediate benefit to those who rely on exquisite sensitivity in their fingers, whether surgeons or safecrackers, as the equipment needed for rTMS is cumbersome and the duration of the effect relatively short. However, a related technique, transcranial direct current stimulation, employs much more portable equipment, suggesting it may find a role in neurorehabilitation. Further study of both techniques will be needed to determine the future of this futuristic technology. —Richard Robinson

46 |
47 | 48 |
-------------------------------------------------------------------------------- /allofplos/starter_corpus/journal.pbio.1001044.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 |
5 | 6 | plosPLoS BiolplosbiolPLoS Biology1544-91731545-7885 7 | Public Library of Science 8 | San Francisco, USA 9 | 10 | PBIOLOGY-D-11-0028210.1371/journal.pbio.1001044 11 | 12 | Book Review/Science in the Media 13 | 14 | 15 | Biology 16 | 17 | 18 | Cancer: The Whole Story 19 | 20 | 21 | Frank 22 | Steven A. 23 | 24 | 25 | 26 | * 27 | 28 | 29 | Department of Ecology and Evolutionary Biology, University of California Irvine, Irvine, California, United States of America 30 | * E-mail: safrank@uci.edu 31 | 32 |

The author has declared that no competing interests exist.

33 |
34 | 4 35 | 2011 36 | 37 | 12 38 | 4 39 | 2011 40 | 94e1001044 Mukherjee 2010 The Emperor of All Maladies: A Biography of Cancer New York Scribner 592 978-1439107959 (hardcover) US$30.00 2011Steven A. FrankThis is an open-access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.The author received no specific funding for this work. 41 | 42 |
43 |
44 | 45 | 46 | 47 | <fig id="pbio-1001044-g001" position="float"> 48 | <object-id pub-id-type="doi">10.1371/journal.pbio.1001044.g001</object-id> 49 | <caption> 50 | <title>Mukherjee S (2010) The Emperor of All Maladies: A Biography of Cancer. New York: Scribner. 592 p. ISBN 978-1439107959 (hardcover). US$30.00. 51 | 52 | 53 | 54 |

For every patient and doctor, and for every scientist peering at a flask of deranged cells, this book connects the moment to the multiple voices that have played off each other since the first person squeezed a painful lump and wondered what to do.

55 |

Reading Siddhartha Mukherjee's The Emperor of all Maladies: The Biography of Cancer, the full accomplishment of this book slowly dawned on me. The story begins with a real patient with fulminant leukemia and inevitable terror, and a young doctor not sure of the course. The protocols of recent times are applied. But where did those treatments come from? The author, working in the Dana-Farber Cancer Institute and tracing back the history, comes to a hero of the past, Sidney Farber.

56 |

In the 1940s, what did leukemia seem like to Farber and to his patients? In fact, Farber, originally a pathologist, did not see patients. For the childhood leukemias that fascinated Farber, the children came to the hospital, were diagnosed, and over months died horrible deaths that devastated their families. There was no treatment. But Farber thought the right chemical combinations could be found to control and ultimately beat the disease. It was a heroic goal, but heroes often start as pariahs. And perhaps, in this case, the oncologists who shunned Farber had a point.

57 |

The theory of chemotherapy was simple. Poison the patient with chemicals that kill cells, and hope that cancer cells die faster than other cells. Hope was indeed a big part of the early studies. The chemical agents were potent poisons that worked very well, but their specificity for cancer cells as opposed to normal cells was not always so great. To knock the cancer back took a lot of poison, which was awful for the patient. Often, the only chance of knocking out the cancer required poisoning the patient right to, and too often past, the threshold of death. If the cancer was knocked out, the child had a brief reprieve. Soon enough the disease came roaring back, more aggressive and untreatable than before.

58 |

Oncologists wanted nothing to do with Farber, and did not want him “experimenting” on children in their hospital. The treatments were horrible, often more horrible than the disease itself. The supposed miracle cures created false hope, and then failed terribly. Farber was determined. He was more than determined. He had to find a way to treat and cure cancer. As with so many of the great characters in cancer's story, no was not an answer. It was an obstacle to be overcome, just like the disease itself.

59 |

Farber developed the chemicals, getting others to help. He came out of the lab, got some beds in the deepest, coldest, most isolated part of the hospital. At least, thought the other oncologists, don't let anyone see what he is doing. Farber recruited nurses and doctors, found patients whose families realized there was no other hope, guessed at some dosages, started injecting, and ran the ward. It was far from a clinical trial in the modern sense, but it was a real trial. Suffering and death were the norm, but that was already the baseline from which they started.

60 |

The author evokes the people, the failure, the eventual halting progress. Once Farber's voice has been introduced, the story moves off to develop other voices. But Farber continues to echo in the background. We may find ourselves in the modern Dana-Farber hospital walking by his old office, or hear Farber's spirit resonate with the personalities and the approaches of the great cancer surgeons who also tried to cure by first trying to destroy. How far toward death should the treatment go? How much horror in the often temporary cure justifies the journey through hell to get there? If it takes a great personal ego to smash through the obstacles of professional resistance to develop radical chemotherapy or radical surgery, can those giant egos learn and change as they are inevitably found to be partly right and partly wrong?

61 |

Patients, doctors, treatments. Heroes, dubious behavior, sometimes by the same people. This is already a rich story, beautifully told. The author has that very rare master's touch, evoking fully yet with the fewest of strokes. As readers, the experienced doctor, the bench scientist, and the patient will all move from sketch to realized story in different ways. There is detail and depth, but little to hinder.

62 |

With the patient-doctor-treatment counterpoint well established, the author adds new voices. Treatments through the 1960s progressed, clinical trial procedures were established, broad cooperative research programs emerged. But the success of treatments was confined to a few types of rare cancers. Overall, the total cancer burden changed little. Meanwhile, more was being learned about where the cancer burden came from. A lot came from cigarettes. We get the story of the epidemiological research, with new heroes.

63 |

Resistance always comes from somewhere. This time, it's the tobacco industry. You know the story in broad outline. The details resonate with what was being learned about the causes of cancer, with early detection through mammograms and the Pap smear, and with the complexities and controversies over the efficacy of screening. Competing interests arise and economics plays a role. There is increasing activism of the public in shaping research and health-related policies.

64 |

Until the 1970s, so little was understood about cancer and about how treatments worked, that it was all a black box. Presented with a disease, one poisoned or cut deeply and hoped the patient survived and the cancer died. By the 1970s, we learned to measure better and run proper trials, to cut a bit more or a bit less, to use different combinations of poisons. It was all empirical, in that little was really known about how different cancers differ and why individuals respond differently to the same treatments. And why, for many cancers, did death soon occur at nearly the same rate as before treatments existed?

65 |

Then we found some of the genes that mutated in cancers. We learned the biochemical actions of different potential treatments. Could we learn to match the specific changes in certain tumors to particular drugs designed to treat the specific malfunctions? Briefly, the answer is that we did in a few cases, we are still learning, and many people think great progress is ahead.

66 |

By this point in the story, you are well versed in the pace of history. At any time, always slow progress up to that point, new promise imagined ahead. But, as the author develops the story of recent research, you also feel the accelerating pace of change on top of that slow march through time. It was only 15 years ago that we first began to get any real genetic understanding, and those first clues were scattered and unclear. It was only a couple of years ago that we began to measure the actual genetic changes in tumors. And we know that genetic changes are only a part of the story. We have hints about the other factors, and just now can start to measure those factors such as epigenetic changes in DNA markings and histones, signaling changes between different cell types, and so on. The author brings us all the way to this point, keeping Farber and other early players alive through the narrative.

67 |

This book is about giving the full sense of time and pace and people. The narrative evokes detail rather than instructs. A reader expert in any area will see what is left out, what is made to sound simple when the reality is complex. But the whole story also has a reality, and there have been so very few authors who can tell us the whole story of major areas of medicine or science.

68 |

To tell the whole story, the author often focuses on individuals as heroes. The device works beautifully. Somehow, with a cast of Tolstoyian proportions, one can keep track of the individuals, and continue to hear their voices even as they come and go. I could not imagine another way to accomplish telling such a broad story, because we remember well-drawn characters long after we have forgotten about some particular technical achievement in a field far from our own. Yet, from the perspective of understanding the history of each era in a deeper and more nuanced way, it is probably good to keep a certain skepticism in mind.

69 |

In the subjects that I know well in cancer research, I think of the false tendency to exaggerate the role of a few individuals in ways that distort both the actual contributions of individuals and the actual way in which scientific understanding was achieved. The Nobel Prize winner Christian de Duve, when asked how he wanted to be remembered, answered:

70 | 71 |

I have no such ambition. In the history of science, my contributions are minor and would have been made by someone else had I not stumbled on them first. They already appear in textbooks without mention of my name. I am no Galileo, Newton, Darwin, Einstein or Watson and Crick. But I have had fun and have been rewarded beyond my deserts. So be it [1].

72 |
73 |

Nonetheless, a narrative following from one great person to the next is often a good way to tell the whole story:

74 | 75 |

More attention to the History of Science is needed, as much by scientists as by historians, and especially by biologists, and this should mean a deliberate attempt to understand the thoughts of the great masters of the past, to see in what circumstances or intellectual milieu their ideas were formed, where they took the wrong turning or stopped short on the right track. A sense of the continuity and the progressive and cumulative character of an advancing science is the best prophylactic I can suggest against the manic-depressive alternations of the cult of vogue and boost, which threatens to smother the scientific efforts, gigantic as they are, of at least one great nation [2].

76 |
77 |

Modern science naturally focuses itself almost entirely in the present and near future. But good treatment, research, and policy require a sense of the historical continuity and the progressive and cumulative character of advancing science—the whole story. To learn the whole story of cancer, read Siddhartha Mukherjee's masterful book.

78 | 79 | 80 | About the Author 81 |

Steven A. Frank is Professor of Evolutionary Biology at the University of California, Irvine. He develops mathematical and computational models to study problems in evolutionary genetics, infectious disease, and cancer. Professor Frank has published three books: Foundations of Social Evolution (1998), Immunology and Evolution of Infectious Disease (2002), and Dynamics of Cancer: Incidence, Inheritance, and Evolution (2007). Further information about Professor Frank's research can be found on his Web site at http://stevefrank.org.

82 |
83 |
84 |
85 | 86 | 87 | 88 | References 89 | 90 | 91 | de DuveC 2010 The joy of discovery. Nature 467 S5 doi:10.1038/467S5a 92 | 93 | 94 | 95 | FisherR. A 1959 Natural selection from the genetical standpoint. Aust J Sci 22 16 17 96 | 97 | 98 | 99 | 100 |
-------------------------------------------------------------------------------- /allofplos/starter_corpus/journal.pcbi.0030158.xml: -------------------------------------------------------------------------------- 1 | 2 |
plospcbiplcbPLoS Comput BiolploscompPLoS Computational Biology1553-734X1553-7358Public Library of ScienceSan Francisco, USA10.1371/journal.pcbi.003015807-PLCB-CN-0337R1plcb-03-07-23RetractionComputational BiologyEvolutionary BiologyBiology and life sciencesEvolutionary biologyEvolutionary systematicsPhylogeneticsPhylogenetic analysisBiology and life sciencesTaxonomyEvolutionary systematicsPhylogeneticsPhylogenetic analysisComputer and information sciencesData managementTaxonomyEvolutionary systematicsPhylogeneticsPhylogenetic analysisBiology and life sciencesAnatomyHeadEyesMedicine and health sciencesAnatomyHeadEyesBiology and life sciencesAnatomyOcular systemEyesMedicine and health sciencesAnatomyOcular systemEyesSocial sciencesSociologyEducationSchoolsUniversitiesRetraction: Measures of Clade Confidence Do Not Correlate with Accuracy of Phylogenetic TreesN/AHallBarry GSalipanteStephen J72007207200737e1582007Hall and SalipanteThis is an open-access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited. 3 | Measures of Clade Confidence Do Not Correlate with Accuracy of Phylogenetic Trees 4 | Citation:Hall BG, Salipante SJ (2007) Retraction: Measures of Clade Confidence Do Not Correlate with Accuracy of Phylogenetic Trees. PLoS Comput Biol 3(7): e158. doi:10.1371/journal.pcbi.0030158<p>In <italic>PLoS Computational Biology,</italic> volume 3, issue 3, doi:<ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.1371/journal.pcbi.0030051" xlink:type="simple">10.1371/journal.pcbi.0030051</ext-link>:</p><p>As a result of a bug in the Perl script used to compare estimated trees with true trees, the clade confidence measures were sometimes associated with the incorrect clades. The error was detected by the sharp eye of Professor Sarah P. Otto of the University of British Columbia. 
She noticed a discrepancy between the example tree in Figure 1B and the results reported for the gene <italic>nuoK</italic> in Table 1, and requested that she be sent all ten <italic>nuoK</italic> Bayesian trees. She painstakingly did a manual comparison of those trees with the true trees, concluded that for that dataset there was a strong correlation between clade confidence and the probability of a clade being true, and suggested the possibility of a bug in the Perl script. Dr. Otto put in considerable effort, and we want to acknowledge the generosity of that effort.</p><p>The major conclusion of our paper, as given in its title, is therefore invalid, and the paper must be retracted. It is important to stress that the responsibility for the necessity of retracting our paper is entirely mine (Barry Hall), and that my coauthor Stephen J. Salipante bears none of the responsibility. I wrote the Perl script and failed to check its accuracy sufficiently.</p><p>We have now corrected the script and reanalyzed the trees in Tables 1–6. The results show that there are strong correlations between clade confidence and the probability that a clade is valid for Bayesian posterior probabilities and for Maximum Likelihood bootstrap percentages and weaker correlations for Maximum Likelihood aLRT values. 
We have prepared a new paper describing this reanalysis and the results achieved and have submitted it for publication.</p></sec></body></article> -------------------------------------------------------------------------------- /allofplos/starter_corpus/journal.pmed.0020402.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <!DOCTYPE article 3 | PUBLIC "-//NLM//DTD Journal Publishing DTD v3.0 20080202//EN" "http://dtd.nlm.nih.gov/publishing/3.0/journalpublishing3.dtd"> 4 | <article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="article-commentary" dtd-version="3.0" xml:lang="EN"> 5 | <front> 6 | <journal-meta><journal-id journal-id-type="publisher-id">plos</journal-id><journal-id journal-id-type="publisher">pmed</journal-id><journal-id journal-id-type="nlm-ta">PLoS Med</journal-id><journal-id journal-id-type="pmc">plosmed</journal-id><!--===== Grouping journal title elements =====--><journal-title-group><journal-title>PLoS Medicine</journal-title></journal-title-group><issn pub-type="ppub">1549-1277</issn><issn pub-type="epub">1549-1676</issn><publisher> 7 | <publisher-name>Public Library of Science</publisher-name> 8 | <publisher-loc>San Francisco, USA</publisher-loc> 9 | </publisher></journal-meta> 10 | <article-meta><article-id pub-id-type="doi">10.1371/journal.pmed.0020402</article-id><article-categories> 11 | <subj-group subj-group-type="heading"> 12 | <subject>Synopsis</subject> 13 | </subj-group> 14 | <subj-group subj-group-type="Discipline"> 15 | <subject>Genetics and Genomics</subject> 16 | <subject>Ophthalmology</subject> 17 | </subj-group> 18 | <subj-group subj-group-type="System Taxonomy"> 19 | <subject>Gene therapy</subject> 20 | <subject>Ophthalmology</subject> 21 | </subj-group> 22 | </article-categories><title-group><article-title>Tackling Inherited Blindness</article-title><alt-title 
alt-title-type="running-head">Synopsis</alt-title></title-group><pub-date pub-type="ppub"> 23 | <month>11</month> 24 | <year>2005</year> 25 | </pub-date><pub-date pub-type="epub"> 26 | <day>1</day> 27 | <month>11</month> 28 | <year>2005</year> 29 | </pub-date><volume>2</volume><issue>11</issue><elocation-id>e402</elocation-id><!--===== Grouping copyright info into permissions =====--><permissions><copyright-year>2005</copyright-year><copyright-holder>Public Library of Science</copyright-holder><license><license-p>This is an open-access article distributed under the terms of the Creative Commons Public Domain Declaration, which stipulates that, once placed in the public domain, this work may be freely reproduced, distributed, transmitted, modified, built upon, or otherwise used by anyone for any lawful purpose.</license-p></license></permissions><related-article page="e333" related-article-type="companion" vol="2" xlink:href="info:doi/10.1371/journal.pmed.0020333" xlink:title="research article" xlink:type="simple"> 30 | <article-title>Pharmacological and rAAV Gene Therapy Rescue of Visual Functions in a Blind Mouse Model of Leber Congenital Amaurosis</article-title> 31 | </related-article></article-meta> 32 | </front> 33 | <body> 34 | <sec id="s1"> 35 | <title/> 36 | <p>Imagine the eye is like a camera. The shutter, like the iris of the eye, opens and closes to let in the right amount of light. The lens helps focus light on the film. And the film is like the retina. Regardless of the quality of the camera, if the film is faulty, the developed pictures may be distorted or blurred. In this way, untreatable degenerative diseases of the retina, which affect millions of people worldwide, lead to varying degrees of irreversible blindness. These degenerative eye disorders include retinitis pigmentosa, which affects 1.5 million people, and age-related macular degeneration, which is a leading cause of blindness in North America. 
The list of inherited retinal dystrophies (degenerations) is long and includes Best disease, choroideremia, cone–rod dystrophy, congenital stationary night blindness, and Leber congenital amaurosis (LCA).</p> 37 | <p>LCA is a collection of diseases all characterized by severe loss of vision at birth from retinal dysfunction. It is a leading cause of congenital blindness. Currently, there is no treatment for LCA; however, it is known that LCA can be caused by mutations in the gene encoding RPE65, a key protein involved in the production and recycling of the chromophore 11-<italic>cis</italic>-retinal (11-<italic>cis</italic>-RAL) in the eye. 11-<italic>cis</italic>-RAL is an integral part of rhodopsin and cone visual pigments, pigments essential for our vision. About 15% of patients with LCA have mutations in <italic>RPE65</italic>. Humans with this form of LCA and Rpe65-deficient mice models both have severely impaired rod and cone function.</p> 38 | <p>Armed with this knowledge, scientists are honing in on various therapeutic strategies for genetic eye diseases. These strategies include somatic gene therapy, infusion of protective proteins, and embryonic cell transplantation. The hope is that such interventions will converge and lead to treatments that slow down or prevent the blindness characteristic of many degenerative eye diseases. <xref ref-type="fig" rid="pmed-0020402-g001"/></p> 39 | <fig id="pmed-0020402-g001" position="float"> 40 | <object-id pub-id-type="doi">10.1371/journal.pmed.0020402.g001</object-id> 41 | <caption> 42 | <title>Pharmacological and rAAV Gene Therapy Rescue of the Retinoid Cycle 43 | 44 | 45 | 46 |

There have been several attempts to restore vision in patients with LCA using interventions such as calcium channel blockers and intraocular injection of neurotrophic factors. In most cases, the effects of these treatments lasted less than a month; hence, repeated administrations were required. Another approach is to bypass the biochemical block in mice without functional Rpe65 using synthetic cis-retinoids administered orally; such treatments have induced dramatic improvement in photoreceptor physiology.

47 |

Also, somatic gene therapy has been very successful in many animal models of retinal degeneration. In this issue of PLoS Medicine, Krzysztof Palczewski and colleagues attempted to combine two approaches to restore visual function with intraocular gene therapy and oral pharmacologic treatment with novel retinoid compounds in lecithin retinol acyl transferase (LRAT)–deficient mice. LRAT is a key enzyme involved in storage of vitamin A in the form of retinyl esters in structures known as retinosomes. In mice without LRAT, no 11-cis-RAL chromophore is produced, and visual function is severely impaired. Lrat mutations have been detected in a subset of patients with LCA.

48 |

The team found that gene therapy using intraocular injection of recombinant adeno-associated virus carrying the Lrat gene successfully restored electroretinographic and pupillary light responses in Lrat−/− mice. Production of 11-cis-RAL was also restored. Pharmacological intervention with orally administered pro-drugs 9-cis-retinyl acetate and 9-cis-retinyl succinate also caused long-lasting restoration of retinal function in Lrat-deficient mice. Combining interventions produced markedly increased levels of visual pigment, and 1,000-fold improvements in pupillary light response and electroretinogram sensitivity. Direct comparison of each treatment was difficult, but both therapies provide efficient recovery of higher order visual responses. One advantage of oral retinoid treatment was its ease of administration compared with the subretinal injections required for viral vectors. Another factor was that the orally administered compounds were not stored in the liver for long, and were quickly oxidized and secreted. Pharmacological treatment could also be given multiple times; several low-dose treatments show cumulative effects. The main disadvantage of oral treatment was the potential for long-term systemic toxicity compared with vector targeting of LRAT to the RPE, which needs to be examined in future studies.

49 |

Interestingly, the researchers observed that chromophore supplementation and somatic gene therapy were optimally effective in combination, particularly when chromophore supplementation was continued at low doses for longer periods of time. The authors suggest that the combined approach might be more suitable for treating a wider age range of patients. Although much more preclinical testing is required, it is likely that pharmacologic and somatic gene therapeutic approaches could be used together if such testing proves safe and successful in human trials. The authors speculate that treatment of patients with oral retinoids could begin in infancy to avoid amblyopia while also avoiding the difficulties associated with surgery in very young patients. For older patients, a long-lasting drug-free treatment might be achieved by surgical introduction of viral vectors.

50 |
51 | 52 |
-------------------------------------------------------------------------------- /allofplos/starter_corpus/journal.pmed.0030205.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 |
5 | 6 | plospmedPLoS MedplosmedPLoS Medicine1549-12771549-1676 7 | Public Library of Science 8 | San Francisco, USA 9 | 10 | 10.1371/journal.pmed.0030205 11 | 12 | Correspondence 13 | 14 | 15 | Hematology 16 | 17 | 18 | Drugs and adverse drug reactions 19 | Hematology (including Blood Transfusion) 20 | 21 | Mischievous Odds RatiosCorrespondence 22 | 23 | 24 | Steinsmith 25 | William 26 | 27 | 28 | 1 29 | 30 | 31 | * 32 | 33 | 34 | 35 | 36 | 37 | Institute of Thermodynamic Biology and Medicine, Williamsburg, Virginia, United States of America 38 | 39 | 40 | E-mail: 41 | bbhaywood@aol.com 42 | 43 | 44 |

45 | The author has declared that no competing interests exist. 46 |

47 |
48 | 4 49 | 2006 50 | 51 | 25 52 | 4 53 | 2006 54 | 34e2052006William SteinsmithThis is an open-access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited. 55 | A 56 | C1173T Dimorphism in the 57 | VKORC1 Gene Determines Coumarin Sensitivity and Bleeding Risk 58 | 59 | 60 | Authors' Reply 61 | 62 | The author received no specific funding for this article. 63 |
64 |
65 | 66 | 67 | 68 | <p>Pieter Reitsma and colleagues have explored—in a population of patients anticoagulated with coumarin congeners—the connection between the presence of mutant alleles of a single gene and the risk of haemorrhage [<xref ref-type="bibr" rid="pmed-0030205-b1">1</xref>]. 69 | </p> 70 | <p>Using as their denominator the odds for bleeding in a patient without mutant alleles, and using as their numerator the odds for patients with each of the two mutant alleles, the authors propose the resulting odds ratios as surrogates for the relative risk of haemorrhage.</p> 71 | <p>It should be noted, however, that the conflation of an odds ratio with a relative risk is not generally justified [<xref ref-type="bibr" rid="pmed-0030205-b2">2</xref>, 72 | <xref ref-type="bibr" rid="pmed-0030205-b3">3</xref>]. The relative risk is the ratio of two probabilities (p2/p1), whereas the corresponding odds ratio is [p2/(1-p2)]/[p1/(1-p1)]. Equating these two ratios requires that p1 = p2, i.e., that the risk ratio be unity. 
73 | </p> 74 | <p>In Reitsma and colleagues' paper, none of the eight odds ratios presented in Table 2 turn out identical with the corresponding calculated risk ratio, and the most discordant pair of values diverge by a factor of about 1.4, i.e., the odds ratio of 2.6 corresponding to a relative risk of 1.9.</p> 75 | <p>Mischievous conflation of odds ratios with probability ratios is widespread in the literature dealing with laboratory testing, with the odds ratio (confusingly termed the “likelihood ratio”) typically presented as surrogate for the corresponding ratio of probabilities.</p> 76 | <p>The power of a positive laboratory test to enhance the likelihood of disease presence in a given patient (properly termed the “positive probability-based likelihood ratio”) is the ratio of two probabilities: the probability that the patient who tested positive is truly diseased (termed the “positive predictive value”) divided by the probability of disease in the pre-test population (termed the “disease prevalence”).</p> 77 | <p>Expressed explicitly in terms of the subcategories of the test population, the positive predictive value is the ratio represented by (True Positives)/(True Positives + False Positives), and the prevalence is the ratio represented by (True Positives + False Negatives)/(True Positives + False Negatives + True Negatives + False Positives).</p> 78 | <p>The calculus is easily adapted to compute the probability-based likelihood ratio for the absence of disease in a given patient. In this case, the post-negative-test probability of disease absence (termed the “negative predictive value”) is the ratio represented by (True Negatives)/(True Negatives + False Negatives), and the pre-test probability is one minus the disease prevalence. 
The negative probability-based likelihood ratio is, then, the ratio represented by the post-test probability divided by the pre-test probability.</p> 79 | <p>A more descriptive term for the probability-based likelihood ratio would be the “probability magnifying power,” since it leads to the expanded probability of the presence (or absence) of disease yielded by a positive (or negative) test result.</p> 80 | </sec> 81 | </body> 82 | <back> 83 | <ref-list> 84 | <title>References 85 | 86 | 87 | 88 | 89 | 90 | Reitsma 91 | PH 92 | 93 | 94 | van der Heijden 95 | JF 96 | 97 | 98 | Groot 99 | AP 100 | 101 | 102 | Rosendaal 103 | FR 104 | 105 | 106 | Büller 107 | HR 108 | 109 | 110 | A 111 | C1173T dimorphism in the 112 | VKORC1 gene determines coumarin sensitivity and bleeding risk. 113 | PLoS Med 114 | 2005 115 | 2 116 | e312 117 | doi: 10.1371/journal.pmed.0020312 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | Van den Ende 126 | J 127 | 128 | 129 | Moreira 130 | J 131 | 132 | 133 | Basinga 134 | P 135 | 136 | 137 | Bisoffi 138 | Z 139 | 140 | 141 | The trouble with likelihood ratios. 142 | Lancet 143 | 2005 144 | 366 145 | 548 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | Wessler 154 | AM 155 | 156 | 157 | Bailey 158 | KR 159 | 160 | 161 | A critique on contemporary reporting of likelihood ratios in test power analysis. 162 | Mayo Clin Proc 163 | 2004 164 | 79 165 | 1317 166 | 1318 167 | 168 | 169 | 170 | 171 | 172 |
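To make the letter's arithmetic concrete, here is a short Python sketch using hypothetical counts (these are NOT the values from Reitsma and colleagues' Table 2). It shows how an odds ratio exaggerates the corresponding relative risk whenever the two group probabilities differ, and how the letter's "positive probability-based likelihood ratio" is computed from confusion-matrix counts:

```python
# Hypothetical bleeding probabilities without (p1) and with (p2) a mutant allele.
p1 = 15 / 100   # reference group
p2 = 30 / 100   # mutant-allele group

relative_risk = p2 / p1                          # ratio of probabilities
odds_ratio = (p2 / (1 - p2)) / (p1 / (1 - p1))   # ratio of odds

# The two measures coincide only when p1 == p2 (both ratios then equal 1);
# otherwise the odds ratio overstates the relative risk.
print(relative_risk)  # 2.0
print(odds_ratio)     # ~2.43

# The letter's "positive probability-based likelihood ratio":
# post-positive-test probability (PPV) divided by pre-test probability (prevalence).
TP, FP, FN, TN = 80, 20, 10, 890                 # hypothetical test counts
ppv = TP / (TP + FP)                             # positive predictive value
prevalence = (TP + FN) / (TP + FP + FN + TN)     # pre-test disease probability
positive_plr = ppv / prevalence                  # "probability magnifying power"
print(positive_plr)  # ~8.9
```

With these illustrative numbers the odds ratio (about 2.43) overstates the relative risk (2.0) by a factor of roughly 1.2, the same kind of divergence the letter reports for Table 2.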
-------------------------------------------------------------------------------- /allofplos/starter_corpus/journal.pone.0097541.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 |
5 | 6 | 7 | PLoS ONE 8 | plos 9 | plosone 10 | PLoS ONE 11 | 1932-6203 12 | 13 | Public Library of Science 14 | San Francisco, USA 15 | 16 | 17 | PONE-D-14-18167 18 | 10.1371/journal.pone.0097541 19 | Correction 20 | 21 | Correction: Pollen and Phytolith Evidence for Rice Cultivation and Vegetation Change during the Mid-Late Holocene at the Jiangli Site, Suzhou, East China 22 | 23 | 24 | The PLOS ONE Staff 25 | 26 | 2014 27 | 652014 28 | 9 29 | 5 30 | e97541 31 | 2014 32 | The PLOS ONE StaffThis is an open-access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.Pollen and Phytolith Evidence for Rice Cultivation and Vegetation Change during the Mid-Late Holocene at the Jiangli Site, Suzhou, East China 33 | 34 | 35 | Notice of Republication 36 |

This article was republished on April 21, 2014, to correct errors in number symbols that were introduced during the typesetting process. Please download this article again to view the correct version. The originally published, uncorrected article and the republished, corrected article are provided here for reference.

37 |
38 | Supporting Information 39 | 40 |

Originally published, uncorrected article.

41 |

(PDF)

42 |
43 |

Republished corrected article.

44 |

(PDF)

45 |
46 | 47 | Reference 48 | 49 | QiuZ, JiangH, DingJ, HuY, ShangX (2014) Pollen and Phytolith Evidence for Rice Cultivation and Vegetation Change during the Mid-Late Holocene at the Jiangli Site, Suzhou, East China. PLoS ONE 9(1): e86816 doi:10.1371/journal.pone.0086816 50 | 51 | 52 |
-------------------------------------------------------------------------------- /allofplos/starter_corpus/journal.pone.0108198.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 |
5 | 6 | 7 | PLoS ONE 8 | plos 9 | plosone 10 | PLoS ONE 11 | 1932-6203 12 | 13 | Public Library of Science 14 | San Francisco, USA 15 | 16 | 17 | PONE-D-14-38753 18 | 10.1371/journal.pone.0108198 19 | Correction 20 | 21 | Correction: Macrophage Control of Phagocytosed Mycobacteria Is Increased by Factors Secreted by Alveolar Epithelial Cells through Nitric Oxide Independent Mechanisms 22 | 23 | 24 | The PLOS ONE Staff 25 | 26 | 2014 27 | 992014 28 | 9 29 | 9 30 | e108198 31 | 2014 32 | The PLOS ONE StaffThis is an open-access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited. Macrophage Control of Phagocytosed Mycobacteria Is Increased by Factors Secreted by Alveolar Epithelial Cells through Nitric Oxide Independent Mechanisms 33 | 34 | 35 | Notice of Republication 36 |

This article was republished on August 25, 2014, to correct Figure 4, which was erroneously resized during the typesetting process. The publisher apologizes for the errors. Please view this article again to download the correct version. The originally published, uncorrected article and the republished, corrected article are provided here for reference.

37 |
38 | Supporting Information 39 | 40 |

Originally published, uncorrected article.

41 |

(PDF)

42 |
43 |

Republished, corrected article.

44 |

(PDF)

45 |
46 | 47 | Reference 48 | 49 | PetursdottirDH, ChuquimiaOD, FreidlR, FernándezC (2014) Macrophage Control of Phagocytosed Mycobacteria Is Increased by Factors Secreted by Alveolar Epithelial Cells through Nitric Oxide Independent Mechanisms. PLoS ONE 9(8): e103411 doi:10.1371/journal.pone.0103411 50 | 51 | 52 |
-------------------------------------------------------------------------------- /allofplos/starter_corpus/journal.ppat.1005207.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 |
5 | 6 | 7 | PLoS Pathog 8 | plos 9 | plospath 10 | 11 | PLOS Pathogens 12 | 13 | 1553-7366 14 | 1553-7374 15 | 16 | Public Library of Science 17 | San Francisco, CA USA 18 | 19 | 20 | 21 | 10.1371/journal.ppat.1005207 22 | PPATHOGENS-D-15-02190 23 | 24 | 25 | Retraction 26 | 27 | 28 | 29 | Retraction: Extreme Resistance as a Host Counter-counter Defense against Viral Suppression of RNA Silencing 30 | 31 | 32 | 33 | 34 | Sansregret 35 | Raphaël 36 | 37 | 38 | 39 | 40 | Dufour 41 | Vanessa 42 | 43 | 44 | 45 | 46 | Langlois 47 | Mathieu 48 | 49 | 50 | 51 | 52 | Daayf 53 | Fouad 54 | 55 | 56 | 57 | 58 | Dunoyer 59 | Patrice 60 | 61 | 62 | 63 | 64 | Voinnet 65 | Olivier 66 | 67 | 68 | 69 | 70 | Bouarab 71 | Kamal 72 | 73 | 74 | 75 | 76 | 22 77 | 9 78 | 2015 79 | 80 | 81 | 9 82 | 2015 83 | 84 | 11 85 | 9 86 | e1005207 87 | 88 | 2015 89 | Sansregret et al 90 | 91 | This is an open access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited 92 | 93 | 94 | 95 | 96 | Extreme Resistance as a Host Counter-counter Defense against Viral Suppression of RNA Silencing 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 |

At the request of the authors, PLOS Pathogens is retracting this publication following an investigation into concerns about the origin and assembly of Figure 6 and a mounting mistake in Figure 1B.

107 |

The Northern blot depicted in Figure 6 contains several band duplications affecting the panels labelled 'IP@HA' and 'total RNA'. The figure was provided by Patrice Dunoyer during revision and was extracted from the Master thesis of a former student working under his supervision, without the prior consultation or consent of this student. The other authors of Sansregret et al. were not informed about the origin of this figure and, regrettably, its erroneous content escaped their attention both at the final revision and proofreading stages.

108 |

During inspection of the original blots, we realised that the loading control of Figure 1B was not the correct one; we have found the cognate one and the loadings are comparable. Importantly, the cross-reacting band visible on Figure 1B also provides an internal loading control. This mistake was made by co-authors Raphael Sansregret and Kamal Bouarab.

109 |

Further analysis of the raw material underlying the results depicted in Figure 6 was found to support the original conclusions drawn from it. The other conclusions of the published article also remain valid. However, given the nature and extent of data manipulation in Figure 6, the authors have collectively decided to retract the study.

110 |

All authors concur with this statement and apologise for not having detected these errors. Kamal Bouarab and Olivier Voinnet, as the corresponding authors, take full responsibility for the publication of this erroneous paper and regret deeply the inconvenience caused.

111 | 112 | 113 | 114 | Reference 115 | Sansregret R, Dufour V, Langlois M, Daayf F, Dunoyer P, Voinnet O, et al. (2013) Extreme Resistance as a Host Counter-counter Defense against Viral Suppression of RNA Silencing. PLoS Pathog 9(6): e1003435. doi: 10.1371/journal.ppat.1003435 23785291 116 | 117 | 118 |
-------------------------------------------------------------------------------- /allofplos/starter_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Make starter data. This script is for the creation of the starter data directory out 5 | of the data in the plos_corpus directory. Should be used only for initial starter 6 | data generation. It is a maintenance script not intended to be used as a 7 | regular tool. 8 | """ 9 | 10 | import os 11 | from shutil import copyfile 12 | 13 | from . import get_corpus_dir 14 | from .transformations import doi_to_path 15 | 16 | starter_directory = 'starter_corpus' 17 | 18 | try: 19 | os.mkdir(starter_directory) 20 | except FileExistsError: 21 | pass 22 | 23 | starter_dois = [] 24 | for doi in open('dois.txt'): 25 | starter_dois.append(doi.replace('\n','')) 26 | 27 | for doi in starter_dois: 28 | # Copy file from get_corpus_dir() 29 | article_path = doi_to_path(doi, get_corpus_dir()) 30 | file_name = os.path.basename(article_path) 31 | copyfile(article_path, os.path.join(starter_directory,file_name)) 32 | -------------------------------------------------------------------------------- /allofplos/transformations.py: -------------------------------------------------------------------------------- 1 | """ Includes all global variables 2 | """ 3 | from collections import OrderedDict 4 | import os 5 | 6 | from . 
import get_corpus_dir 7 | 8 | from .plos_regex import validate_filename, validate_doi 9 | from .elements import Journal 10 | 11 | # URL bases for PLOS's Solr instances, that index PLOS articles 12 | BASE_URL_API = 'https://api.plos.org/search' 13 | 14 | BASE_URL_DOI = 'https://doi.org/' 15 | URL_SUFFIX = '&type=manuscript' 16 | INT_URL_SUFFIX = '.XML' 17 | PREFIX = '10.1371/' 18 | SUFFIX_LOWER = '.xml' 19 | annotation = 'annotation' 20 | correction = 'correction' 21 | ANNOTATION_URL = 'https://journals.plos.org/plosone/article/file?id=10.1371/annotation/' 22 | ANNOTATION_DOI = '10.1371/annotation' 23 | BASE_URL_ARTICLE_LANDING_PAGE = 'https://journals.plos.org/plos{}/article?id={}' 24 | BASE_URL_LANDING_PAGE = 'https://journals.plos.org/{}/' 25 | LANDING_PAGE_SUFFIX = '{}?id={}' 26 | doi_url = 'https://doi.org/' 27 | 28 | plos_page_dict = {'article': 'article', 29 | 'asset': 'article/asset', 30 | 'articleFigsAndTables': 'article/assets/figsAndTables', 31 | 'articleAuthors': 'article/authors', 32 | 'citationDownloadPage': 'article/citation', 33 | 'downloadBibtexCitation': 'article/citation/bibtex', 34 | 'downloadRisCitation': 'article/citation/ris', 35 | 'figuresPage': 'article/figures', 36 | 'assetFile': 'article/file', 37 | 'assetXMLFile': 'article/file', 38 | 'articleMetrics': 'article/metrics', 39 | 'articleRelated': 'article/related'} 40 | 41 | 42 | def _get_base_page(journal): 43 | """Make the base of a PLOS URL journal-specific. 44 | 45 | Defaults to PLOS ONE. 46 | 47 | Use in conjunction with `get_page()` in the Article class. 
48 | """ 49 | journal_map = {'PLOS ONE': 'plosone', 50 | 'PLOS Computational Biology': 'ploscompbiol', 51 | 'PLOS Neglected Tropical Diseases': 'plosntds', 52 | 'PLOS Genetics': 'plosgenetics', 53 | 'PLOS Pathogens': 'plospathogens', 54 | 'PLOS Biology': 'plosbiology', 55 | 'PLOS Medicine': 'plosmedicine', 56 | 'PLOS Clinical Trials': 'plosclinicaltrials', 57 | 'PLOS Sustainability and Transformation': 'sustainabilitytransformation', 58 | 'PLOS Climate': 'climate', 59 | 'PLOS Water': 'water', 60 | 'PLOS Global Public Health': 'globalpublichealth', 61 | 'PLOS Digital Health': 'digitalhealth', 62 | 'PLOS Mental Health': 'mentalhealth', 63 | 'PLOS Complex Systems': 'complexsystems', 64 | } 65 | try: 66 | url = BASE_URL_LANDING_PAGE.format(journal_map[journal]) 67 | except KeyError: 68 | print('URL error for {}'.format(journal)) 69 | url = BASE_URL_LANDING_PAGE.format('one') 70 | 71 | return url 72 | 73 | 74 | def filename_to_url(filename): 75 | """ 76 | Transform filename into a downloadable URL where its XML resides 77 | Includes transform for the 'annotation' DOIs 78 | Example: 79 | filename_to_url('allofplos_xml/journal.pone.1000001.xml') = \ 80 | 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.1000001' 81 | 82 | :param filename: string representing a filename 83 | :return: online location of a PLOS article's XML 84 | """ 85 | if correction in filename: 86 | article = 'annotation/' + (filename.split('.', 4)[2]) 87 | else: 88 | article = os.path.splitext((os.path.basename(filename)))[0] 89 | doi = PREFIX + article 90 | return doi_to_url(doi) 91 | 92 | 93 | def filename_to_doi(filename): 94 | """ 95 | Transform filename into the article's DOI. 96 | Includes transform for the 'annotation' DOIs. 
97 | Uses regex to make sure it's a file and not a DOI 98 | Example: 99 | filename_to_doi('journal.pone.1000001.xml') = '10.1371/journal.pone.1000001' 100 | 101 | :param filename: relative path to local XML file in the get_corpus_dir() directory 102 | :return: full unique identifier for a PLOS article 103 | """ 104 | filename = os.path.basename(filename) 105 | if not validate_filename(filename): 106 | raise Exception("Invalid format for PLOS filename: {}".format(filename)) 107 | elif correction in filename: 108 | article = 'annotation/' + filename.split('.', 4)[2] 109 | doi = PREFIX + article 110 | else: 111 | doi = PREFIX + os.path.splitext(filename)[0] 112 | return doi 113 | 114 | 115 | def url_to_path(url, directory=None): 116 | """ 117 | For a given PLOS URL to an XML file, return the relative path to the local XML file 118 | Example: 119 | url_to_path('https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.1000001') = \ 120 | 'allofplos_xml/journal.pone.1000001.xml' 121 | :param url: online location of a PLOS article's XML 122 | :param directory: defaults to get_corpus_dir(), containing article files 123 | :return: relative path to local XML file in the directory 124 | """ 125 | if directory is None: 126 | directory = get_corpus_dir() 127 | annot_prefix = 'plos.correction.' 128 | if url.startswith(ANNOTATION_URL): 129 | # NOTE: REDO THIS! 130 | file_ = os.path.join(directory, 131 | annot_prefix + 132 | url[url.index(ANNOTATION_DOI + '/')+len(ANNOTATION_DOI + '/'):]. 133 | replace(URL_SUFFIX, ''). 134 | replace(INT_URL_SUFFIX, '') + '.xml') 135 | else: 136 | file_ = os.path.join(directory, 137 | url[url.index(PREFIX)+len(PREFIX):]. 138 | replace(URL_SUFFIX, ''). 
139 | replace(INT_URL_SUFFIX, '') + '.xml') 140 | return file_ 141 | 142 | 143 | def url_to_doi(url): 144 | """ 145 | For a given PLOS URL to an XML file, transform it to the article's DOI 146 | Example: 147 | url_to_path('https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.1000001') = \ 148 | '10.1371/journal.pone.1000001' 149 | :param url: online location of a PLOS article's XML 150 | :return: full unique identifier for a PLOS article 151 | """ 152 | return url[url.index(PREFIX):].rstrip(URL_SUFFIX).rstrip(INT_URL_SUFFIX) 153 | 154 | 155 | def doi_to_url(doi): 156 | """ 157 | For a given PLOS DOI, return the PLOS URL to that article's XML file 158 | Example: 159 | doi_to_url('10.1371/journal.pone.1000001') = \ 160 | 'https://journals.plos.org/plosone/article/file?id=10.1371/journal.pone.1000001' 161 | :param doi: full unique identifier for a PLOS article 162 | :return: online location of a PLOS article's XML 163 | """ 164 | if validate_doi(doi) is False: 165 | raise Exception("Invalid format for PLOS DOI: {}".format(doi)) 166 | journal = Journal.doi_to_journal(doi) 167 | base_page = _get_base_page(journal) 168 | return ''.join([base_page, 'article/file?id=', doi, URL_SUFFIX]) 169 | 170 | 171 | def doi_to_path(doi, directory=None): 172 | """ 173 | For a given PLOS DOI, return the relative path to that local article 174 | For DOIs that contain the word 'annotation', searches online version of the article xml to extract 175 | the journal name, which goes into the filename. 
Will print DOI if it can't find the journal name 176 | Uses regex to make sure it's a DOI and not a file 177 | Example: 178 | doi_to_path('10.1371/journal.pone.1000001') = 'allofplos_xml/journal.pone.1000001.xml' 179 | :param doi: full unique identifier for a PLOS article 180 | :param directory: defaults to get_corpus_dir(), containing article files 181 | :return: relative path to local XML file 182 | """ 183 | if directory is None: 184 | directory = get_corpus_dir() 185 | if not validate_doi(doi): 186 | raise Exception("Invalid format for PLOS DOI: {}".format(doi)) 187 | elif doi.startswith(ANNOTATION_DOI): 188 | article_file = os.path.join(directory, "plos.correction." + doi.split('/')[-1] + SUFFIX_LOWER) 189 | else: 190 | article_file = os.path.join(directory, doi.lstrip(PREFIX) + SUFFIX_LOWER) 191 | return article_file 192 | 193 | 194 | def convert_country(country): 195 | """ 196 | For a given country, transform it using one of these rules 197 | :param country: string with the country name 198 | :return: string with the normalized country name 199 | """ 200 | if (country and 'China' in country) or \ 201 | country == 'Chin' or country == 'CHINA': 202 | country = 'China' 203 | elif country and 'Brazil' in country or \ 204 | country == 'Brasil' or \ 205 | country == 'ITA - Instituto Tecnologico de Aeronautica (': 206 | country = 'Brazil' 207 | elif country and 'Argentina' in country: 208 | country = 'Argentina' 209 | elif country == 'Czechia': 210 | country = 'Czech Republic' 211 | elif 'Norwegian' in country: 212 | country = 'Norway' 213 | elif country and 'United Kingdom' in country: 214 | country = 'United Kingdom' 215 | elif country and 'Hong Kong' in country: 216 | country = 'Hong Kong' 217 | elif country == 'Cameroun': 218 | country = 'Cameroon' 219 | elif (country and 'Chile' in country) or country == 'CHILE': 220 | country = 'Chile' 221 | elif (country and 'United States of America' in \ 222 | country) or country == 'United States' or country \ 223 | == 
'USA' or 'Florida' in country or \ 224 | 'California' in country or\ 225 | country == 'National Reference Centre for' or \ 226 | country == 'United State of America' or \ 227 | country == 'U.S.A.' or \ 228 | country == 'Virginia': 229 | country = 'United States of America' 230 | elif country=='Republic of Panamá' or country=='Panamá' or 'Panama' in country: 231 | country = 'Panama' 232 | elif 'Canada' in country: 233 | country = 'Canada' 234 | elif 'Colombia' in country or country == 'Universidad Aut': 235 | country = 'Colombia' 236 | elif 'Spain' in country or country=='España': 237 | country = 'Spain' 238 | elif 'Iran' in country: 239 | country = 'Iran' 240 | elif 'Saudi Arabia' in country: 241 | country = 'Saudi Arabia' 242 | elif 'Italy' in country: 243 | country = 'Italy' 244 | elif 'Japan' in country: 245 | country = 'Japan' 246 | elif 'Germany' in country: 247 | country = 'Germany' 248 | elif 'Luxembourg' in country: 249 | country = 'Luxembourg' 250 | elif ('France' in country) or country == 'Marseille': 251 | country = 'France' 252 | elif country == 'ROC' or country == 'R. O. 
C': 253 | country = 'Taiwan' 254 | elif country == 'Brasil': 255 | country = 'Brazil' 256 | elif country == 'México' or 'Mexico' in country or \ 257 | country == 'Centro de Investigación': 258 | country = 'Mexico' 259 | elif 'Slowakia' in country: 260 | country = 'Slovakia' 261 | elif country == 'Korea' or 'Republic of Korea' in country: 262 | country = 'South Korea' 263 | elif country == 'United Kindgom': 264 | country = 'United Kingdom' 265 | elif country and 'Netherlands' in country: 266 | country = 'Netherlands' 267 | elif country == 'Commonwealth of Australia' or 'Australia' in country: 268 | country = 'Australia' 269 | elif 'Singapore' in country: 270 | country = 'Singapore' 271 | elif country and (country[0].isdigit() or country[0] == '+'): 272 | country = 'N/A' 273 | return country 274 | -------------------------------------------------------------------------------- /allofplos/update.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from . import get_corpus_dir, newarticledir, uncorrected_proofs_text_list 4 | from .corpus.plos_corpus import (create_local_plos_corpus, get_dois_needed_list, download_check_and_move, 5 | MIN_FILES_FOR_VALID_CORPUS) 6 | 7 | 8 | def main(): 9 | """ 10 | Entry point for the program. 
This is used when the program is run as a 11 | standalone script. 12 | :return: None 13 | """ 14 | directory = get_corpus_dir() 15 | 16 | # Step 0: Initialize first copy of repository 17 | try: 18 | corpus_files = [name for name in os.listdir(directory) if os.path.isfile( 19 | os.path.join(directory, name))] 20 | except FileNotFoundError: 21 | corpus_files = [] 22 | if len(corpus_files) < MIN_FILES_FOR_VALID_CORPUS: 23 | print('Not enough articles in {}, re-downloading zip file'.format(directory)) 24 | # TODO: check if zip file is in top-level directory before downloading 25 | create_local_plos_corpus() 26 | 27 | # Step 1: Query Solr via URL and construct DOI list 28 | # Filtered by article type & scheduled for the last 14 days. 29 | # Returns specific URL query & the number of search results. 30 | # Parses the returned dictionary of article DOIs, removing common leading numbers, as a list. 31 | # Compares to list of existing articles in the PLOS corpus folder to create list of DOIs to download. 
32 | print("Checking for new articles...") 33 | dois_needed_list = get_dois_needed_list() 34 | 35 | # Step 2: Download new articles 36 | # For every doi in dois_needed_list, grab the accompanying XML from journal pages 37 | # If no new articles, skip the remaining steps 38 | # Check if articles are uncorrected proofs 39 | # Check if amended articles linked to new amendment articles are updated 40 | # Merge new XML into folder 41 | # If you need to bulk download, start here: 42 | # https://drive.google.com/open?id=0B_JDnoghFeEKLTlJT09IckMwOFk 43 | download_check_and_move(dois_needed_list, 44 | uncorrected_proofs_text_list, 45 | tempdir=newarticledir, 46 | destination=get_corpus_dir() 47 | ) 48 | return None 49 | 50 | 51 | if __name__ == "__main__": 52 | main() 53 | -------------------------------------------------------------------------------- /allofplos/utils.py: -------------------------------------------------------------------------------- 1 | import textwrap 2 | 3 | def dedent(text): 4 | """Equivalent of textwrap.dedent that ignores unindented first line. 5 | This means it will still dedent strings like: 6 | '''foo 7 | is a bar 8 | ''' 9 | For use in wrap_paragraphs. 
10 | 11 | Taken from https://github.com/ipython/ipython_genutils/text.py 12 | """ 13 | 14 | if text.startswith('\n'): 15 | # text starts with blank line, don't ignore the first line 16 | return textwrap.dedent(text) 17 | 18 | # split first line 19 | splits = text.split('\n', 1) 20 | if len(splits) == 1: 21 | # only one line 22 | return textwrap.dedent(text) 23 | 24 | first, rest = splits 25 | # dedent everything but the first line 26 | rest = textwrap.dedent(rest) 27 | return '\n'.join([first, rest]) 28 | -------------------------------------------------------------------------------- /contributing.rst: -------------------------------------------------------------------------------- 1 | How to install for development 2 | ------------------------------ 3 | 4 | Clone the repository with:: 5 | 6 | git clone https://github.com/PLOS/allofplos.git 7 | 8 | Change to the directory where the code is downloaded:: 9 | 10 | cd allofplos 11 | 12 | Provided that you are in the allofplos directory, install it with:: 13 | 14 | pip install -U -e . 15 | 16 | How to run the tests 17 | -------------------- 18 | 19 | ``allofplos`` uses ``pytest`` to run its tests. Run ``pip install -e .[test]`` 20 | from the top level of the directory; this will install pytest as well as any 21 | other testing dependencies. 22 | 23 | Once you have ``pytest`` installed, from inside the allofplos directory, run: 24 | 25 | ``(allofplos)$ pytest`` 26 | 27 | It should return something like: 28 | 29 | .. code:: 30 | 31 | collected 20 items 32 | 33 | allofplos/tests/test_corpus.py ............ [ 60%] 34 | allofplos/tests/test_unittests.py ........ 
[100%] 35 | 36 | ==================== 20 passed in 0.36 seconds ========================= 37 | 38 | Things to check before doing a release 39 | -------------------------------------- 40 | 41 | * Run the tests 42 | * Check that the version number is updated in the setup.py file 43 | * Check that HISTORY.txt has the latest change listed at the beginning 44 | 45 | Making a release 46 | ---------------- 47 | Remove untracked files:: 48 | 49 | git clean -xfdi 50 | 51 | Delete previous packages from dist directory:: 52 | 53 | rm dist/*.* 54 | 55 | Run bdist_wheel:: 56 | 57 | python setup.py bdist_wheel --universal 58 | 59 | Upload with twine:: 60 | 61 | twine upload dist/* 62 | 63 | (you will need pypi credentials to do the upload) 64 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "allofplos" 3 | version = "1.1.1" 4 | description = "Get and analyze all PLOS articles" 5 | authors = [ 6 | {name="Elizabeth Seiver", email="eseiver@plos.org"}, 7 | {name="Sebastian Bassi", email="sebastian.bassi@globant.com"}, 8 | {name="M Pacer", email="mpacer@berkeley.edu"} 9 | ] 10 | maintainers = [ 11 | {name="Erik Hetzner", email="ehetzner@plos.org"}, 12 | {name="Kevin Brandt", email="kbrandt@plos.org"} 13 | ] 14 | readme = "README.rst" 15 | license = {file="LICENSE"} 16 | classifiers = [ 17 | "Development Status :: 5 - Production/Stable", 18 | "Intended Audience :: Science/Research", 19 | "Topic :: Scientific/Engineering", 20 | "License :: OSI Approved :: MIT License", 21 | "Programming Language :: Python :: 3.8", 22 | ] 23 | keywords = ["science", "PLOS", "publishing", "PLoS"] 24 | dependencies = [ 25 | "certifi", 26 | "chardet", 27 | "charset-normalizer", 28 | "idna", 29 | "lxml", 30 | "peewee", 31 | "pqdm", 32 | "python-utils", 33 | "requests", 34 | "six", 35 | "tqdm", 36 | "unidecode", 37 | "urllib3" 38 | ] 39 | 
-------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | norecursedirs = .git allofplos_xml 3 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | TESTDIR = os.path.dirname(os.path.abspath(__file__)) 4 | test_data_dirname = "testdata" 5 | TESTDATADIR = os.path.join(TESTDIR, test_data_dirname) 6 | -------------------------------------------------------------------------------- /tests/test_corpus.py: -------------------------------------------------------------------------------- 1 | from . import TESTDATADIR 2 | from allofplos import Corpus, starterdir 3 | from allofplos.article import Article 4 | from allofplos.corpus import listdir_nohidden 5 | 6 | import random 7 | import pytest 8 | import os 9 | 10 | @pytest.fixture 11 | def corpus(): 12 | return Corpus(TESTDATADIR, seed=1000) 13 | 14 | @pytest.fixture 15 | def yes_article(): 16 | return Article('10.1371/journal.pbio.2002354', directory=TESTDATADIR) 17 | 18 | @pytest.fixture 19 | def no_article(): 20 | return Article('10.1371/journal.pmed.0030132', directory=starterdir) 21 | 22 | def test_corpus_instantiate(corpus): 23 | assert isinstance(corpus, Corpus) 24 | 25 | def test_corpus_len(corpus): 26 | assert len(corpus) == 5 27 | 28 | def test_corpus_iter_(corpus): 29 | article_dois = {article.doi for article in corpus} 30 | assert article_dois == { 31 | '10.1371/annotation/3155a3e9-5fbe-435c-a07a-e9a4846ec0b6', 32 | '10.1371/journal.pbio.2002399', 33 | '10.1371/journal.pbio.2002354', 34 | '10.1371/journal.pone.0185809', 35 | '10.1371/journal.pbio.2001413', 36 | } 37 | 38 | def test_corpus_contains_article(corpus, no_article, yes_article): 39 | assert yes_article in corpus 40 | assert no_article not in corpus 41 | 42 | def 
test_corpus_contains_doi(corpus, no_article, yes_article): 43 | assert yes_article.doi in corpus 44 | assert no_article.doi not in corpus 45 | 46 | def test_corpus_contains_filepath(corpus, no_article, yes_article): 47 | ## check for filepath, which is currently called filename on Article 48 | assert yes_article.filepath in corpus 49 | assert no_article.filepath not in corpus 50 | 51 | def test_corpus_contains_file(corpus, no_article, yes_article): 52 | ## check for filename, which is currently unavailable on Article 53 | assert os.path.basename(yes_article.filepath) in corpus 54 | assert os.path.basename(no_article.filepath) not in corpus 55 | 56 | def test_corpus_random_article(corpus): 57 | article = corpus.random_article 58 | assert article.doi == "10.1371/journal.pone.0185809" 59 | 60 | def test_corpus_indexing(corpus): 61 | assert corpus["10.1371/journal.pbio.2001413"] == corpus[0] 62 | assert next(corpus[:1]).doi == "10.1371/journal.pbio.2001413" 63 | assert next(corpus[1:]).doi != "10.1371/journal.pbio.2001413" 64 | 65 | def test_iter_file_doi(corpus): 66 | expected = { 67 | 'journal.pbio.2001413.xml': '10.1371/journal.pbio.2001413', 68 | 'journal.pbio.2002354.xml': '10.1371/journal.pbio.2002354', 69 | 'journal.pbio.2002399.xml': '10.1371/journal.pbio.2002399', 70 | 'journal.pone.0185809.xml': '10.1371/journal.pone.0185809', 71 | 'plos.correction.3155a3e9-5fbe-435c-a07a-e9a4846ec0b6.xml': 72 | '10.1371/annotation/3155a3e9-5fbe-435c-a07a-e9a4846ec0b6', 73 | } 74 | assert expected == {f:doi for f, doi in corpus.iter_file_doi} 75 | 76 | 77 | def test_filepaths(corpus): 78 | assert set(corpus.filepaths) == set(listdir_nohidden(TESTDATADIR)) 79 | 80 | def test_files(corpus): 81 | annote_file = 'plos.correction.3155a3e9-5fbe-435c-a07a-e9a4846ec0b6.xml' 82 | assert annote_file in corpus.files 83 | assert 'journal.pcbi.0030158.xml' not in corpus.files 84 | -------------------------------------------------------------------------------- 
/tests/testdata/plos.correction.3155a3e9-5fbe-435c-a07a-e9a4846ec0b6.xml: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | 5 | PLoS ONE 6 | PLoS ONE 7 | plos 8 | plosone 9 | 10 | PLoS ONE 11 | 12 | 1932-6203 13 | 14 | Public Library of Science 15 | San Francisco, USA 16 | 17 | 18 | 19 | 3388108 20 | 22792133 21 | 10.1371/annotation/3155a3e9-5fbe-435c-a07a-e9a4846ec0b6 22 | 23 | 24 | Retraction 25 | 26 | 27 | 28 | Retraction: De Novo Transcriptomic Analysis of an Oleaginous Microalga: Pathway Description and Gene Discovery for Production of Next-Generation Biofuels 29 | Retraction 30 | 31 | 32 | 33 | 34 | Wan 35 | LingLin 36 | 37 | 38 | 39 | 40 | Han 41 | Juan 42 | 43 | 44 | 45 | 46 | Sang 47 | Min 48 | 49 | 50 | 51 | 52 | Li 53 | AiFen 54 | 55 | 56 | 57 | 58 | Wu 59 | Hong 60 | 61 | 62 | 63 | 64 | Yin 65 | ShunJi 66 | 67 | 68 | 69 | 70 | Zhang 71 | ChengWu 72 | 73 | 74 | 75 | 76 | 2012 77 | 78 | 79 | 29 80 | 6 81 | 2012 82 | 83 | 7 84 | 6 85 | 10.1371/annotation/3155a3e9-5fbe-435c-a07a-e9a4846ec0b6 86 | 87 | This is an open-access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited. 88 | 2012 89 | 90 | 91 | De Novo Transcriptomic Analysis of an Oleaginous Microalga: Pathway Description and Gene Discovery for Production of Next-Generation Biofuels 92 | 93 |

No competing interests declared.

94 |
95 |

It has been brought to the attention of the PLoS ONE Editors that a substantial part of the text in this article was appropriated from text in previous publications, including the articles below: 96 | 97 | Transcriptome sequencing and annotation of the microalgae Dunaliella tertiolecta: pathway description and gene discovery for production of next-generation biofuels. 98 | BMC Genomics. 2011 Mar 14;12:148. 99 | 100 | Deep sequencing of the Camellia sinensis transcriptome revealed candidate genes for major metabolic pathways of tea-specific compounds. 101 | BMC Genomics. 2011 Feb 28;12:131. 102 | 103 | An efficient approach to finding Siraitia grosvenorii triterpene biosynthetic genes by RNA-seq and digital gene expression analysis. 104 | BMC Genomics. 2011 Jul 5;12:343. 105 | 106 | Examination of triacylglycerol biosynthetic pathways via de novo transcriptomic and proteomic analyses in an unsequenced microalga. 107 | PLoS ONE. 2011;6(10):e25851. 108 | 109 | PLoS ONE therefore retracts this article due to the identified case of plagiarism.

110 | 111 | 112 | 113 | 114 |
--------------------------------------------------------------------------------