├── .github └── workflows │ └── test.yml ├── .gitignore ├── LICENSE ├── README.md ├── ace ├── .vscode │ └── settings.json ├── __init__.py ├── config.py ├── database.py ├── datatable.py ├── evaluate.py ├── export.py ├── extract.py ├── ingest.py ├── label.py ├── scrape.py ├── sources.py ├── sources │ ├── Frontiers.json │ ├── HighWire.json │ ├── JournalOfCognitiveNeuroscience.json │ ├── OUP.json │ ├── OldSpringer.json │ ├── PMC.json │ ├── Plos.json │ ├── Sage.json │ ├── ScienceDirect.json │ ├── Springer.json │ └── Wiley.json ├── tableparser.py ├── tests │ ├── __init__.py │ ├── cassettes │ │ └── test_ace │ │ │ ├── test_brain_research_source.yaml │ │ │ ├── test_cerebral_cortex_source.yaml │ │ │ ├── test_database_processing_stream.yaml │ │ │ ├── test_frontiers_source.yaml │ │ │ ├── test_journal_scraping.yaml │ │ │ ├── test_neuropsychologia_source.yaml │ │ │ ├── test_plos_source.yaml │ │ │ ├── test_pmc_source.yaml │ │ │ ├── test_science_direct_source.yaml │ │ │ └── test_springer_source.yaml │ ├── data │ │ ├── brain.html │ │ ├── cerebral_cortex.html │ │ ├── cognition.html │ │ ├── frontiers.html │ │ ├── jcogneuro.html │ │ ├── plosone.html │ │ ├── pmc.html │ │ ├── springer.html │ │ └── wiley.html │ ├── different_data │ │ ├── 14715131.html │ │ ├── 15028641.html │ │ ├── 15342430.html │ │ └── 18242723.html │ ├── test_ace.py │ └── weird_data │ │ ├── 11532885.html │ │ ├── 12417470.html │ │ ├── 15716157.html │ │ ├── 18439804.html │ │ ├── 18760263.html │ │ ├── 20159144.html │ │ ├── 22695256.html │ │ ├── 23813017.html │ │ ├── 26021218.html │ │ ├── 26696806.html │ │ ├── 28432782.html │ │ ├── 29366950.html │ │ ├── 36196770.html │ │ └── 38990127.html ├── utils.py └── version.py ├── example_tables.txt ├── examples ├── create_db_and_add_articles.py └── fetch_articles_from_pubmed.py ├── requirements.dev.txt ├── requirements.txt └── setup.py /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Install and Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | concurrency: 12 | group: testing-${{ github.ref }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | test: 17 | runs-on: ubuntu-latest 18 | 19 | steps: 20 | - name: Checkout code 21 | uses: actions/checkout@v2 22 | 23 | - name: Set up Python 24 | uses: actions/setup-python@v4 25 | with: 26 | python-version: '3.8' 27 | 28 | - name: Install dependencies 29 | run: | 30 | pip install -r requirements.txt 31 | pip install -r requirements.dev.txt 32 | pip install -e . 
33 | 34 | - name: Test with pytest 35 | run: pytest 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.DS_Store 3 | *~ 4 | build/ 5 | dist 6 | dist/* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any person obtaining a copy 2 | of this software and associated documentation files (the "Software"), to deal 3 | in the Software without restriction, including without limitation the rights 4 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 5 | copies of the Software, and to permit persons to whom the Software is 6 | furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in 9 | all copies or substantial portions of the Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 17 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # What is ACE? 3 | 4 | ACE stands for Automated Coordinate Extraction. It's a Python package for automated extraction of functional MRI activation data from the tables of published neuroimaging articles. ACE is actually ACE2; a long, long time ago in a faraway land there was a clunkier Ruby version of ACE that did more or less the same thing much more poorly. Thankfully, Ruby ACE has now been disappeared from the internets forever, leaving us with the slightly better thought out package you see here. 5 | 6 | ## Installation 7 | 8 | Install the package from source: 9 | 10 | > python setup.py install 11 | 12 | Make sure you have all the dependencies installed (see requirements.txt). 13 | 14 | That's all! 15 | 16 | ## Usage 17 | 18 | For now, take a look at the tests to get a sense of how things work. A quickstart guide will fill this space in the near future. -------------------------------------------------------------------------------- /ace/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "tests" 4 | ], 5 | "python.testing.unittestEnabled": false, 6 | "python.testing.pytestEnabled": true 7 | } -------------------------------------------------------------------------------- /ace/__init__.py: -------------------------------------------------------------------------------- 1 | # emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- 2 | # ex: set sts=4 ts=4 sw=4 et: 3 | """ACE -- Automated Coordinate Extraction. 
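Example (a minimal sketch of package-level logging control; the level name shown is
illustrative, and the same setting can also be supplied via the ACE_LOGLEVEL
environment variable):

    import ace
    ace.set_logging_level('info')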
4 | """ 5 | __all__ = ["config", "ingest", "database", "datatable", "set_logging_level", "scrape", "sources", "tableparser", "tests", "__version__"] 6 | 7 | import logging 8 | import sys 9 | import os 10 | 11 | from .version import __version__ 12 | 13 | def set_logging_level(level=None): 14 | """Set package-wide logging level 15 | 16 | Args 17 | level : Logging level constant from logging module (warning, error, info, etc.) 18 | """ 19 | if level is None: 20 | level = os.environ.get('ACE_LOGLEVEL', 'warn') 21 | logger.setLevel(getattr(logging, level.upper())) 22 | return logger.getEffectiveLevel() 23 | 24 | def _setup_logger(logger): 25 | # Basic logging setup 26 | console = logging.StreamHandler(sys.stdout) 27 | console.setFormatter(logging.Formatter("%(levelname)-6s %(module)-7s %(message)s")) 28 | logger.addHandler(console) 29 | set_logging_level() 30 | 31 | # Set up logger 32 | logger = logging.getLogger("ace") 33 | _setup_logger(logger) -------------------------------------------------------------------------------- /ace/config.py: -------------------------------------------------------------------------------- 1 | ''' GLOBAL SETTINGS ''' 2 | 3 | # When True, all Exceptions will be suppressed. When False, Exception 4 | # messages will be printed out. 5 | SILENT_ERRORS = False 6 | 7 | 8 | ''' DATABASE SETTINGS ''' 9 | # Adapter to use--either 'mysql' or 'sqlite' 10 | SQL_ADAPTER = 'mysql' 11 | 12 | # SQLite path (when using sqlite adapter) 13 | SQLITE_URI = 'sqlite:///ace.db' 14 | 15 | # MySQL configuration 16 | MYSQL_USER = 'ace' 17 | MYSQL_PASSWORD = 'CHANGEME' 18 | MYSQL_DB = 'ace_test' 19 | 20 | # When True, any processed articles will be saved to DB, whether or not they 21 | # contain any extracted activations. When False, only articles from which 22 | # at least one activation was extracted will be saved. Note that if this is set 23 | # to False, processing will be much slower, since every article not already in 24 | # the DB will be parsed, even if it contains no activations and has been 25 | # previously processed. 26 | SAVE_ARTICLES_WITHOUT_ACTIVATIONS = True 27 | 28 | # By default, ACE will ignore any articles that already exist in the DB 29 | # when processing new HTML files. If OVERWRITE is set to True, ACE will 30 | # always overwrite existing records. This is useful when the extraction 31 | # code has improved substantially and you want to re-extract all data, 32 | # but should otherwise be left off for the sake of efficiency. 33 | OVERWRITE_EXISTING_ROWS = False 34 | 35 | 36 | ''' SOURCE PROCESSING SETTINGS ''' 37 | 38 | # If True, will exercise greater care when parsing (e.g., when estimating 39 | # number of columns in table, will check every row in the table and take the 40 | # max instead of just checking the first row). This is generally desirable, 41 | # but will result in slower processing. 42 | CAREFUL_PARSING = True 43 | 44 | # Sometimes tables have rows that can't be processed--usually because of malformed 45 | # HTML or XML (e.g., failure to close a tag). Such problems will always be 46 | # logged, but if IGNORE_BAD_ROWS is True, the row will be skipped and execution 47 | # will continue gracefully. When False, any errors will be re-raised, 48 | # terminating execution. 49 | IGNORE_BAD_ROWS = True 50 | 51 | # Whether or not to ignore tables that appear to be missing a label for at 52 | # least one column. 
This doesn't happen much, and in practice most tables with 53 | # missing labels appear to genuinely have empty columns that are ignored 54 | # anyway, so this should be left off unless problems arise. 55 | EXCLUDE_TABLES_WITH_MISSING_LABELS = False 56 | 57 | 58 | 59 | 60 | ''' SCRAPING/PARSING SETTINGS ''' 61 | USER_AGENTS = [ 62 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36', 63 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36', 64 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36', 65 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36', 66 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36', 67 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36', 68 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36', 69 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36', 70 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36', 71 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36', 72 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36', 73 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36', 74 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36', 75 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36', 76 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36', 77 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36', 78 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36', 79 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36', 80 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36', 81 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36', 82 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36', 83 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36', 84 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.128 Safari/537.36', 85 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36', 86 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.60 Safari/537.36' 87 | ] 88 | -------------------------------------------------------------------------------- /ace/database.py: 
-------------------------------------------------------------------------------- 1 | # Database stuff and models 2 | 3 | from sqlalchemy import (TypeDecorator, Table, Column, Integer, Float, String, Boolean, 4 | ForeignKey, DateTime, Text) 5 | from sqlalchemy.orm import relationship, backref, sessionmaker 6 | from sqlalchemy import create_engine 7 | from sqlalchemy.ext.declarative import declarative_base 8 | from sqlalchemy.ext.associationproxy import association_proxy 9 | from sqlalchemy.dialects.mysql import MEDIUMTEXT 10 | from sqlalchemy.sql import exists 11 | from datetime import datetime 12 | import simplejson as json 13 | import logging 14 | import sys 15 | from os import path 16 | import datetime 17 | 18 | from . import config 19 | from . import extract 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | Base = declarative_base() 24 | 25 | # Backend-dependent column for full text 26 | LongText = Text().with_variant(MEDIUMTEXT, 'mysql') 27 | 28 | # Handles all Database loading/saving stuff 29 | class Database: 30 | 31 | def __init__(self, adapter=None, db_name=None, user=None, password=None): 32 | ''' Connect to DB and initialize instance. ''' 33 | 34 | # Default to settings in config file if none passed 35 | if adapter is None: adapter = config.SQL_ADAPTER 36 | 37 | # Generate DB URI 38 | if adapter == 'sqlite': 39 | db_uri = config.SQLITE_URI if db_name is None else db_name 40 | elif adapter == 'mysql': 41 | db_name = config.MYSQL_DB if db_name is None else db_name 42 | if user is None: user = config.MYSQL_USER 43 | if password is None: password = config.MYSQL_PASSWORD 44 | db_uri = 'mysql://%s:%s@localhost/%s' % (user, password, db_name) 45 | else: 46 | raise ValueError("Value of SQL_ADAPTER in settings must be either 'sqlite' or 'mysql'") 47 | 48 | engine = create_engine(db_uri, echo=False, connect_args={'timeout': 15}) 49 | 50 | if adapter == 'mysql': engine.execute("SET sql_mode=''") 51 | 52 | Session = sessionmaker(bind=engine) 53 | Base.metadata.create_all(engine) 54 | self.session = Session() 55 | 56 | def add(self, record): 57 | ''' Add a record to the DB. ''' 58 | self.session.add(record) 59 | 60 | def save(self): 61 | ''' Commit all stored records to file. ''' 62 | self.session.commit() 63 | # except Exception as err: 64 | # print(err) 65 | 66 | def delete_article(self, pmid): 67 | article = self.session.query(Article).filter_by(id=pmid).first() 68 | self.session.delete(article) 69 | self.session.commit() 70 | 71 | def print_stats(self): 72 | ''' Summarize the current state of the DB. ''' 73 | n_articles = self.session.query(Article).count() 74 | n_articles_with_coordinates = self.session.query(Article).join(Table).filter(Table.n_activations>0).distinct('article_id').count() 75 | n_tables = self.session.query(Table).count() 76 | n_activations = self.session.query(Activation).count() 77 | n_links = self.session.query(NeurovaultLink).count() 78 | n_articles_with_links = self.session.query(NeurovaultLink).distinct('article_id').count() 79 | print(f"The database currently contains: {n_articles} articles.\n" 80 | f"{n_articles_with_coordinates} have coordinates, and {n_articles_with_links} have NeuroVault links.\n" 81 | f"Total of {n_tables} tables, {n_activations} activations and {n_links} NeuroVault links.") 82 | 83 | def article_exists(self, pmid): 84 | ''' Check if an article already exists in the database. 
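Example (sketch, assuming `db` is a Database instance and `article` an Article
that may or may not already be stored):

    if not db.article_exists(pmid):
        db.add(article)
        db.save()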
''' 85 | return self.session.query(exists().where(Article.id==pmid)).scalar() 86 | 87 | @property 88 | def articles(self): 89 | return self.session.query(Article).all() 90 | 91 | # Create a JSONString column type for convenience 92 | class JsonString(TypeDecorator): 93 | impl = Text 94 | 95 | def process_result_value(self, value, dialect): 96 | if value is None: 97 | return None 98 | else: 99 | return json.loads(value) 100 | 101 | def process_bind_param(self, value, dialect): 102 | if value is None: 103 | return None 104 | else: 105 | return json.dumps(value) 106 | 107 | 108 | class Article(Base): 109 | 110 | __tablename__ = 'articles' 111 | 112 | id = Column(Integer, primary_key=True) 113 | title = Column(String(200)) 114 | text = Column(LongText) 115 | journal = Column(String(200)) 116 | space = Column(String(20)) 117 | publisher = Column(String(200)) 118 | doi = Column(String(200)) 119 | year = Column(Integer) 120 | authors = Column(Text) 121 | abstract = Column(Text) 122 | citation = Column(Text) 123 | pubmed_metadata = Column(JsonString) 124 | created_at = Column(DateTime, default=datetime.datetime.utcnow) 125 | updated_at = Column(DateTime, default=datetime.datetime.utcnow, 126 | onupdate=datetime.datetime.utcnow) 127 | 128 | tables = relationship('Table', cascade="all, delete-orphan", 129 | backref='article') 130 | 131 | neurovault_links = relationship('NeurovaultLink', cascade="all, delete-orphan", 132 | backref='article') 133 | 134 | features = association_proxy('tags', 'feature') 135 | 136 | def __init__(self, text, pmid=None, doi=None, metadata=None): 137 | self.id = int(pmid) 138 | self.text = text 139 | self.space = extract.guess_space(text) 140 | self.doi = doi 141 | self.pubmed_metadata = metadata 142 | self.update_from_metadata() 143 | 144 | def update_from_metadata(self): 145 | if self.pubmed_metadata is not None: 146 | pmd = self.pubmed_metadata 147 | self.title = pmd['title'] 148 | self.journal = pmd['journal'] 149 | self.pubmed_metadata = pmd 150 | self.year = pmd['year'] 151 | self.authors = pmd['authors'] 152 | self.abstract = pmd['abstract'] 153 | self.citation = pmd['citation'] 154 | self.doi = pmd['doi'] 155 | 156 | 157 | class Table(Base): 158 | 159 | __tablename__ = 'tables' 160 | 161 | id = Column(Integer, primary_key=True) 162 | article_id = Column(Integer, ForeignKey('articles.id')) 163 | activations = relationship('Activation', cascade="all, delete-orphan", 164 | backref='table') 165 | position = Column(Integer) # The serial position of occurrence 166 | number = Column(String(10)) # The stated table ID (e.g., 1, 2b) 167 | label = Column(String(200)) # The full label (e.g., Table 1, Table 2b) 168 | caption = Column(Text) 169 | notes = Column(Text) 170 | n_activations = Column(Integer) 171 | n_columns = Column(Integer) 172 | 173 | def finalize(self): 174 | ''' Any cleanup and updating operations we need to do before saving. ''' 175 | 176 | # # Remove duplicate activations--most commonly produced by problems with 177 | # # the grouping code. 
178 | # act_defs = set() 179 | # to_keep = [] 180 | # for a in self.activations: 181 | # definition = json.dumps([a.x, a.y, a.z, a.groups]) 182 | # if definition not in act_defs: 183 | # act_defs.add(definition) 184 | # to_keep.append(a) 185 | # self.activations = to_keep 186 | 187 | self.n_activations = len(self.activations) 188 | 189 | 190 | class Activation(Base): 191 | 192 | __tablename__ = 'activations' 193 | 194 | id = Column(Integer, primary_key=True) 195 | 196 | article_id = Column(Integer, ForeignKey('articles.id')) 197 | table_id = Column(Integer, ForeignKey('tables.id')) 198 | columns = Column(JsonString) 199 | groups = Column(JsonString) 200 | problems = Column(JsonString) 201 | x = Column(Float) 202 | y = Column(Float) 203 | z = Column(Float) 204 | number = Column(Integer) 205 | region = Column(String(100)) 206 | hemisphere = Column(String(100)) 207 | ba = Column(String(100)) 208 | size = Column(String(100)) 209 | statistic = Column(String(100)) 210 | p_value = Column(String(100)) 211 | 212 | missing_source = Column(Boolean, default=False) 213 | 214 | def __init__(self): 215 | self.problems = [] 216 | self.columns = {} 217 | 218 | def set_coords(self, x, y, z): 219 | new_xyz = [] 220 | for c in [x, y, z]: 221 | if c == '' or c is None: 222 | c = None 223 | else: 224 | c = c.replace(' ', '').replace('--', '-').rstrip('.') 225 | c = float(c) 226 | new_xyz.append(c) 227 | 228 | self.x, self.y, self.z = new_xyz 229 | 230 | def add_col(self, key, val): 231 | self.columns[key] = val 232 | 233 | # Validates Peak. Considers peak invalid if: 234 | # * At least one of X, Y, Z is nil or missing 235 | # * Any |coordinate| > 100 236 | # * Two or more columns are zeroes (most of the time this 237 | # will indicate a problem, but occasionally a real coordinate) 238 | # Depending on config, either excludes peak, or allows it through 239 | # but flags potential problems for later inspection. 240 | def validate(self): 241 | 242 | for c in [self.x, self.y, self.z]: 243 | if c == '' or c is None: 244 | logger.debug('Missing x, y, or z coordinate information: [%s, %s, %s]' % tuple( 245 | [str(e) for e in [self.x, self.y, self.z]])) 246 | return False 247 | try: 248 | if abs(c) >= 100: 249 | logger.debug( 250 | 'Invalid coordinates: at least one dimension (x,y,z) >= 100.') 251 | return False 252 | except: 253 | print(c) 254 | print(sys.exc_info()[0]) 255 | raise 256 | 257 | sorted_xyz = sorted([abs(self.x), abs(self.y), abs(self.z)]) 258 | if sorted_xyz[0] == 0 and sorted_xyz[1] == 0: 259 | logger.debug( 260 | "At least two dimensions have value == 0; coordinate is probably not real.") 261 | return False 262 | 263 | return True 264 | 265 | class NeurovaultLink(Base): 266 | 267 | __tablename__ = 'Neurovaultlinks' 268 | 269 | id = Column(Integer, primary_key=True, autoincrement=True) 270 | neurovault_id = Column(Integer) 271 | url = Column(String(100)) 272 | type = Column(String(100)) 273 | 274 | article_id = Column(Integer, ForeignKey('articles.id')) 275 | -------------------------------------------------------------------------------- /ace/datatable.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logger = logging.getLogger(__name__) 3 | 4 | 5 | class DataTable: 6 | 7 | ''' Simple class to represent the contents of an HTML table. 8 | Basically just a grid with array accessor methods and 9 | some extra validation. 
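Example (a brief sketch of the intended interface; the cell values are arbitrary):

    dt = DataTable(2, 3)          # 2 rows x 3 columns, initialized to None
    dt.add_val('L IFG', cols=2)   # fill the next open cell, spanning two columns
    dt[1, 0] = '42'               # grid-style item assignment
    rows = dt.to_list()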
''' 10 | 11 | def __init__(self, n_rows, n_cols): 12 | self.data = [[None] * n_cols for n in range(n_rows)] 13 | # self.n_rows = n_rows 14 | self.n_cols = n_cols 15 | 16 | def __getitem__(self, inds): 17 | if isinstance(inds, int): 18 | inds = [inds] 19 | row = self.data[inds[0]] 20 | return row[inds[1]] if len(inds) > 1 else row 21 | 22 | def __setitem__(self, inds, v): 23 | self.data[inds[0]][inds[1]] = v 24 | 25 | def to_list(self): 26 | return self.data 27 | 28 | @property 29 | def n_rows(self): 30 | return len(self.data) 31 | 32 | def add_val(self, val, rows=1, cols=1): 33 | ''' Find next open position and add values to grid ''' 34 | 35 | # Flatten list and find next open position 36 | flat = [item for l in self.data for item in l] 37 | flat_set = set(flat) 38 | 39 | if not None in flat_set: 40 | open_pos = self.n_rows * self.n_cols 41 | for i in range(rows): 42 | self.data.append([None] * self.n_cols) 43 | 44 | else: 45 | # This indexing operation consumes a lot of CPU time for large tables; need to refactor! 46 | open_pos = flat.index(None) 47 | ri = open_pos / self.n_cols 48 | if (ri + rows) > self.n_rows: 49 | for i in range(round((ri + rows)) - self.n_rows): 50 | self.data.append([None] * self.n_cols) 51 | 52 | ri = open_pos // self.n_cols 53 | ci = open_pos % self.n_cols 54 | 55 | if cols + ci > self.n_cols: 56 | cols = self.n_cols - ci 57 | 58 | for r in range(rows): 59 | for c in range(cols): 60 | if cols > 1: 61 | content = '@@%s@%d' % ( 62 | val, cols) if c == 0 else '@@%s' % val 63 | else: 64 | content = val 65 | self[ri + r, ci + c] = content 66 | -------------------------------------------------------------------------------- /ace/evaluate.py: -------------------------------------------------------------------------------- 1 | """ Tools for evaluating the quality of extracted coordinates. """ 2 | 3 | import matplotlib.pyplot as plt 4 | import pandas as pd 5 | import numpy as np 6 | 7 | def plot_xyz_histogram(database, bins=50): 8 | ''' Takes a database file as input and plots histograms for X/Y/Z coords. ''' 9 | data = pd.read_csv(database,sep='\t') 10 | data[['x','y','z']].hist(bins=bins) 11 | plt.show() 12 | 13 | 14 | def proportion_integer_values(database): 15 | ''' Reports the proportion of integer values in X/Y/Z columns of database file. 16 | This should generally be close to 0--typically around 0.02 or so if everything 17 | is working properly. 
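Note: as implemented below, the returned value is 1 minus the proportion of
integer-valued coordinates, i.e. the fraction of non-integer values in each of
the x/y/z columns, which is the quantity expected to stay near 0.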
''' 18 | data = pd.read_csv(database,sep='\t') 19 | return 1 - data[['x','y','z']].apply(lambda x: np.mean(x == x.round())) -------------------------------------------------------------------------------- /ace/export.py: -------------------------------------------------------------------------------- 1 | from .database import Article 2 | from sqlalchemy import func, or_ 3 | import logging 4 | import csv 5 | from pathlib import Path 6 | import datetime 7 | import json 8 | from tqdm import tqdm 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | def export_database(db, foldername, skip_empty=True): 13 | # Create folder if it doesn't exist 14 | foldername = Path(foldername) 15 | foldername.mkdir(parents=True, exist_ok=True) 16 | 17 | article_columns = ['pmid', 'doi', 'authors', 'title', 'journal', 'publication_year', 'coordinate_space'] 18 | art_results = [] 19 | 20 | coordinate_columns = ['pmid', 'table_id', 'table_label', 'table_caption', 'table_number', 21 | 'x', 'y', 'z', 'p_value', 'region', 'size', 'statistic', 'groups'] 22 | coordinates = [] 23 | 24 | text_columns = ['pmid', 'title' ,'abstract', 'body'] 25 | texts = [] 26 | 27 | nv_colls_col = ['pmid','collection_id'] 28 | nv_colls = [] 29 | 30 | nv_images_col = ['pmid','image_id'] 31 | nv_images = [] 32 | 33 | print("Exporting database to %s" % foldername) 34 | 35 | articles = db.session.query(Article) 36 | if skip_empty: 37 | articles = articles.filter(or_(Article.tables.any(), Article.neurovault_links.any())) 38 | 39 | for art in tqdm(articles): 40 | art_results.append([art.id, art.doi, art.authors, art.title, art.journal, art.year, art.space]) 41 | texts.append([art.id, art.title, art.abstract, art.text]) 42 | 43 | for t in art.tables: 44 | for p in t.activations: 45 | if t.number is None: t.number = '' 46 | if isinstance(p.groups, str): 47 | p.groups = [p.groups] 48 | elif p.groups is None: 49 | p.groups = [] 50 | groups = '///'.join(p.groups) 51 | 52 | coordinates.append([art.id, t.id, t.label, t.caption, t.number, 53 | p.x, p.y, p.z, p.p_value, p.region, p.size, p.statistic, groups]) 54 | 55 | for nv in art.neurovault_links: 56 | if nv.type == 'collection': 57 | nv_colls.append([art.id, nv.neurovault_id]) 58 | elif nv.type == 'image': 59 | nv_images.append([art.id, nv.neurovault_id]) 60 | 61 | # Save articles as tab separated file 62 | with (foldername / 'metadata.csv').open('w', newline='') as f: 63 | writer = csv.writer(f) 64 | writer.writerow(article_columns) 65 | writer.writerows(art_results) 66 | 67 | # Save coordinates as tab separated file 68 | with (foldername / 'coordinates.csv').open('w', newline='') as f: 69 | writer = csv.writer(f) 70 | writer.writerow(coordinate_columns) 71 | writer.writerows(coordinates) 72 | 73 | # Save texts as tab separated file 74 | with (foldername / 'text.csv').open('w', newline='') as f: 75 | writer = csv.writer(f) 76 | writer.writerow(text_columns) 77 | writer.writerows(texts) 78 | 79 | # Save NV links 80 | with (foldername / 'neurovault_collections.csv').open('w', newline='') as f: 81 | writer = csv.writer(f) 82 | writer.writerow(nv_colls_col) 83 | writer.writerows(nv_colls) 84 | 85 | with (foldername / 'neurovault_images.csv').open('w', newline='') as f: 86 | writer = csv.writer(f) 87 | writer.writerow(nv_images_col) 88 | writer.writerows(nv_images) 89 | 90 | # Save json file with time of export 91 | export_md = { 92 | "exported": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 93 | "n_articles": len(art_results), 94 | "n_activations": len(coordinates), 95 | "n_nv_collections": 
len(nv_colls), 96 | "n_nv_images": len(nv_images) 97 | 98 | } 99 | 100 | with (foldername / 'export.json').open('w') as f: 101 | json.dump(export_md, f) -------------------------------------------------------------------------------- /ace/extract.py: -------------------------------------------------------------------------------- 1 | # Miscellaneous methods for extracting information from text/html 2 | 3 | import bs4 as BeautifulSoup 4 | import re 5 | 6 | 7 | def guess_space(text): 8 | ''' Take article text as input and return a guess about the image space. ''' 9 | 10 | targets = ['mni', 'talairach', 'afni', 'flirt', 11 | '711-2', 'spm', 'brainvoyager', 'fsl'] 12 | n_targ = len(targets) 13 | text = text.lower() 14 | res = [0] * n_targ 15 | matches = [] 16 | for i in range(n_targ): 17 | res[i] = len(re.findall( 18 | r'\b(.{30,40}\b%s.{30,40})\b' % targets[i], text)) 19 | 20 | # Sum up diagnostic strings... 21 | mni = res[5] + res[7] 22 | t88 = res[2] + res[6] 23 | software = mni + t88 24 | 25 | # Assign label 26 | # 1. If only one of MNI or T88 is implied, classify as that 27 | if (mni and not t88) or (not software and res[0] and not res[1]): 28 | label = 'MNI' 29 | elif (t88 and not mni) or (not software and res[1] and not res[0]): 30 | label = 'TAL' 31 | else: 32 | label = 'UNKNOWN' 33 | 34 | return label 35 | -------------------------------------------------------------------------------- /ace/ingest.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | import logging 3 | from . import sources, config 4 | from .scrape import _validate_scrape 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | # The actual function that takes articles and adds them to the database 9 | # imports sources; sources is a module that contains the classes for each 10 | # source of articles. 11 | 12 | def add_articles(db, files, commit=True, table_dir=None, limit=None, 13 | pmid_filenames=False, metadata_dir=None, force_ingest=True, **kwargs): 14 | ''' Process articles and add their data to the DB. 15 | Args: 16 | files: The path to the article(s) to process. Can be a single 17 | filename (string), a list of filenames, or a path to pass 18 | to glob (e.g., "article_ls dir/NIMG*html") 19 | commit: Whether or not to save records to DB file after adding them. 20 | table_dir: Directory to store downloaded tables in (if None, tables 21 | will not be saved.) 22 | limit: Optional integer indicating max number of articles to add 23 | (selected randomly from all available). When None, will add all 24 | available articles. 25 | pmid_filenames: When True, assume that the file basename is a PMID. 26 | This saves us from having to retrieve metadata from PubMed When 27 | checking if a file is already in the DB, and greatly speeds up 28 | batch processing when overwrite is off. 29 | metadata_dir: Location to read/write PubMed metadata for articles. 30 | When None (default), retrieves new metadata each time. If a 31 | path is provided, will check there first before querying PubMed, 32 | and will save the result of the query if it doesn't already 33 | exist. 34 | force_ingest: Ingest even if no source is identified. 35 | kwargs: Additional keyword arguments to pass to parse_article. 
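Example (a minimal sketch, assuming a local SQLite database and a directory of
scraped HTML files named by PMID; all paths are illustrative):

    from ace.database import Database
    from ace.ingest import add_articles

    db = Database(adapter='sqlite', db_name='sqlite:///ace.db')
    missing = add_articles(db, '/path/to/html/*.html', pmid_filenames=True,
                           metadata_dir='/path/to/metadata')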
36 | ''' 37 | 38 | manager = sources.SourceManager(db, table_dir) 39 | 40 | if isinstance(files, str): 41 | from glob import glob 42 | files = glob(files) 43 | if limit is not None: 44 | from random import shuffle 45 | shuffle(files) 46 | files = files[:limit] 47 | 48 | missing_sources = [] 49 | for i, f in enumerate(files): 50 | logger.info("Processing article %s..." % f) 51 | html = open(f).read() 52 | 53 | if not _validate_scrape(html): 54 | logger.warning("Invalid HTML for %s" % f) 55 | continue 56 | 57 | source = manager.identify_source(html) 58 | if source is None: 59 | logger.warning("Could not identify source for %s" % f) 60 | missing_sources.append(f) 61 | if not force_ingest: 62 | continue 63 | else: 64 | source = sources.DefaultSource(db) 65 | 66 | pmid = path.splitext(path.basename(f))[0] if pmid_filenames else None 67 | article = source.parse_article(html, pmid, metadata_dir=metadata_dir, **kwargs) 68 | if article and (config.SAVE_ARTICLES_WITHOUT_ACTIVATIONS or article.tables): 69 | db.add(article) 70 | if commit and (i % 100 == 0 or i == len(files) - 1): 71 | db.save() 72 | db.save() 73 | 74 | return missing_sources 75 | -------------------------------------------------------------------------------- /ace/label.py: -------------------------------------------------------------------------------- 1 | # from nltk import * 2 | import re 3 | from collections import Counter 4 | from .database import Article 5 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 6 | import pandas as pd 7 | 8 | 9 | def extract_ngram_features(db, tfidf=True, save=None, vocabulary=None, require_activations=True, **kwargs): 10 | ''' Takes text from an article as input and returns a matrix of document --> 11 | ngram weights. At the moment, only extracts terms from abstracts. 12 | Args: 13 | db: A database instance 14 | tfidf: If True, uses a tf-idf tokenizer; otherwise uses raw counts 15 | save: an optional path to save a CSV to; if None, returns the resulting data 16 | vocabulary: an optional list of ngrams to restrict extraction to 17 | require_activations: When True, only articles containing at least one fMRI activation 18 | table will be included. When False, use all articles in DB. 19 | kwargs: Optional keywords passed onto the scikit-learn vectorizer. Common args are 20 | ngram_range, min_df, max_df, stop_words, and vocabulary. 
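Example (illustrative; assumes an existing ACE database whose articles have
abstracts, and passes common scikit-learn vectorizer options via kwargs):

    from ace.database import Database
    from ace.label import extract_ngram_features

    db = Database(adapter='sqlite', db_name='sqlite:///ace.db')
    features = extract_ngram_features(db, tfidf=True, ngram_range=(1, 2),
                                      min_df=5, save='ngram_features.txt')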
21 | ''' 22 | 23 | # Extract article texts--for now, uses abstracts 24 | articles = db.session.query(Article.id, Article.abstract) 25 | if require_activations: 26 | articles = articles.filter(Article.tables.any()) 27 | pmids, corpus = list(zip(*articles.all())) 28 | 29 | # Instantiate vectorizer--either simple counts, or tf-idf 30 | vectorizer = TfidfVectorizer if tfidf else CountVectorizer 31 | vectorizer = vectorizer(vocabulary=vocabulary, **kwargs) 32 | 33 | # Transform texts 34 | weights = vectorizer.fit_transform(corpus).toarray() 35 | names = vectorizer.get_feature_names() 36 | 37 | data = pd.DataFrame(weights, columns=names, index=pmids) 38 | 39 | if save is not None: 40 | data.to_csv(save, sep='\t', index_label='pmid', encoding='utf-8') 41 | else: 42 | return data 43 | 44 | -------------------------------------------------------------------------------- /ace/scrape.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # use unicode everywhere 3 | import re 4 | import sys 5 | from pathlib import Path 6 | from collections import Mapping 7 | import requests 8 | from time import sleep 9 | import logging 10 | import os 11 | import random 12 | import xmltodict 13 | from seleniumbase import Driver 14 | from selenium.webdriver.support.ui import WebDriverWait 15 | from selenium.webdriver.support import expected_conditions as EC 16 | from selenium.webdriver.common.by import By 17 | from selenium.common.exceptions import TimeoutException 18 | from tqdm import tqdm 19 | 20 | from ace.utils import PubMedAPI 21 | from ace.config import USER_AGENTS 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | def get_url(url, n_retries=5, timeout=10.0, verbose=False): 27 | headers = {'User-Agent': random.choice(USER_AGENTS)} 28 | 29 | def exponential_backoff(retries): 30 | return 2 ** retries 31 | 32 | retries = 0 33 | while retries < n_retries: 34 | 35 | try: 36 | r = requests.get(url, headers=headers, timeout=timeout) 37 | return r.text 38 | except requests.exceptions.RequestException as e: 39 | logger.warning(f"Request failed: {e}") 40 | sleep_time = exponential_backoff(retries) 41 | logger.info(f"Retrying in {sleep_time} seconds...") 42 | sleep(sleep_time) 43 | retries += 1 44 | logger.error("Exceeded maximum number of retries.") 45 | return None 46 | 47 | def _convert_pmid_to_pmc(pmids): 48 | url_template = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids=" 49 | logger.info("Converting PMIDs to PMCIDs...") 50 | 51 | # Chunk the PMIDs into groups of 200 52 | pmids = [str(p) for p in pmids] 53 | pmid_chunks = [pmids[i:i + 200] for i in range(0, len(pmids), 200)] 54 | 55 | pmc_ids = [] 56 | for chunk in tqdm(pmid_chunks): 57 | pmid_str = ','.join(chunk) 58 | url = url_template + pmid_str 59 | response = get_url(url) 60 | # Respionse 61 | pmc_ids += re.findall(r'', response) 62 | 63 | logger.info(f"Found {len(pmc_ids)} PMCIDs from {len(pmids)} PMIDs.") 64 | 65 | pmids_found = set([p[1] for p in pmc_ids]) 66 | missing_pmids = [(None, p) for p in pmids if p not in pmids_found] 67 | 68 | pmc_ids = pmc_ids + missing_pmids 69 | 70 | return pmc_ids 71 | 72 | 73 | def get_pmid_from_doi(doi, api_key=None): 74 | ''' Query PubMed for the PMID of a paper based on its doi. We need this 75 | for some Sources that don't contain the PMID anywhere in the artice HTML. 
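Example (sketch; `doi` is any article DOI string):

    pmid = get_pmid_from_doi(doi)   # first matching PMID, or None if no match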
76 | ''' 77 | query = f"{doi}[aid]" 78 | data = PubMedAPI(api_key=api_key).esearch(query=query) 79 | if data: 80 | data = data[0] 81 | else: 82 | data = None 83 | return data 84 | 85 | 86 | def get_pubmed_metadata(pmid, parse=True, store=None, save=True, api_key=None): 87 | ''' Get PubMed metadata for article. 88 | Args: 89 | pmid: The article's PubMed ID 90 | parse: if True, parses the text and returns a dictionary. if False, returns raw text. 91 | store: optional string path to PubMed metadata files. If passed, first checks the passed 92 | folder for the corresponding ID, and only queries PubMed if not found. 93 | save: if store is passed, save is True, and the file does not already exist, 94 | will save the result of the new PubMed query to the store. 95 | ''' 96 | if store is not None: 97 | md_file = os.path.join(store, pmid) 98 | 99 | if store is not None and os.path.exists(md_file): 100 | logger.info("Retrieving metadata from file %s..." % os.path.join(store, pmid)) 101 | with open(md_file, 'rb') as f: 102 | xml = f.read() 103 | 104 | else: 105 | logger.info("Retrieving metadata for PubMed article %s..." % str(pmid)) 106 | xml = PubMedAPI(api_key=api_key).efetch(input_id=pmid, retmode='xml', rettype='medline', db='pubmed') 107 | if store is not None and save and xml is not None: 108 | if not os.path.exists(store): 109 | os.makedirs(store) 110 | with open(md_file, 'wb') as f: 111 | f.write(xml) 112 | 113 | return parse_PMID_xml(xml) if (parse and xml is not None) else xml 114 | 115 | 116 | def parse_PMID_xml(xml): 117 | ''' Take XML-format PubMed metadata and convert it to a dictionary 118 | with standardized field names. ''' 119 | 120 | di = xmltodict.parse(xml).get('PubmedArticleSet') 121 | if not di: 122 | return None 123 | 124 | di = di['PubmedArticle'] 125 | article = di['MedlineCitation']['Article'] 126 | 127 | if 'ArticleDate' in article: 128 | date = article['ArticleDate'] 129 | elif 'Journal' in article: 130 | date = article['Journal']['JournalIssue']['PubDate'] 131 | else: 132 | date = None 133 | 134 | if date: 135 | year = date.get('Year', None) 136 | else: 137 | year = None 138 | 139 | doi = None 140 | doi_source = article.get('ELocationID', None) 141 | if doi_source is not None and isinstance(doi_source, list): 142 | doi_source = [d for d in doi_source if d['@EIdType'] == 'doi'][0] 143 | 144 | if doi_source is not None and doi_source['@EIdType'] == 'doi': 145 | doi = doi_source['#text'] 146 | 147 | authors = article.get('AuthorList', None) 148 | 149 | if authors: 150 | authors = authors['Author'] 151 | 152 | try: 153 | _get_author = lambda a: a['LastName'] + ', ' + a['ForeName'] 154 | if isinstance(authors, list): 155 | authors = [_get_author(a) for a in authors if 'ForeName' in a] 156 | else: 157 | authors = [_get_author(authors)] 158 | authors = ';'.join(authors) 159 | except: 160 | authors = None 161 | 162 | if 'MeshHeadingList' in di['MedlineCitation']: 163 | mesh = di['MedlineCitation']['MeshHeadingList']['MeshHeading'] 164 | else: 165 | mesh = [] 166 | 167 | abstract = article.get('Abstract', '') 168 | if abstract != '': 169 | abstract = abstract.get('AbstractText', '') 170 | 171 | cit = di['PubmedData']['ArticleIdList']['ArticleId'] 172 | if isinstance(cit, list): 173 | cit = cit[1] 174 | 175 | metadata = { 176 | 'authors': authors, 177 | 'citation': cit['#text'], 178 | 'comment': abstract, 179 | 'doi': doi, 180 | 'keywords': '', 181 | 'mesh': mesh, 182 | 'pmid': di['MedlineCitation']['PMID'], 183 | 'title': article['ArticleTitle'], 184 | 'abstract': abstract, 185 
| 'journal': article['Journal']['Title'], 186 | 'year': year 187 | } 188 | 189 | # Clean up nested Dicts 190 | for k, v in metadata.items(): 191 | if isinstance(v, list): 192 | to_join = [] 193 | for a in v: 194 | if 'DescriptorName' in a: 195 | a = a['DescriptorName'] 196 | a = a['#text'] 197 | 198 | to_join.append(a) 199 | v = ' | '.join(to_join) 200 | elif isinstance(v, Mapping): 201 | v = v.get('#text', '') 202 | metadata[k] = v 203 | 204 | return metadata 205 | 206 | def _validate_scrape(html): 207 | """ Checks to see if scraping was successful. 208 | For example, checks to see if Cloudfare interfered """ 209 | 210 | patterns = ['Checking if you are a human', 211 | 'Please turn JavaScript on and reload the page', 212 | 'Checking if the site connection is secure', 213 | 'Enable JavaScript and cookies to continue', 214 | 'There was a problem providing the content you requested', 215 | 'Redirecting', 216 | 'Page not available - PMC', 217 | 'Your request cannot be processed at this time. Please try again later', 218 | '403 Forbidden', 219 | 'Page not found — ScienceDirect', 220 | 'This site can’t be reached', 221 | 'used Cloudflare to restrict access', 222 | '502 Bad Gateway', 223 | ] 224 | 225 | for pattern in patterns: 226 | if pattern in html: 227 | return False 228 | 229 | return True 230 | 231 | ''' Class for journal Scraping. The above free-floating methods should 232 | probably be refactored into this class eventually. ''' 233 | class Scraper: 234 | 235 | def __init__(self, store, api_key=None): 236 | self.store = Path(store) 237 | self._client = PubMedAPI(api_key=api_key) 238 | 239 | 240 | def search_pubmed(self, journal, search, retmax=10000, savelist=None,): 241 | journal = journal.replace(' ', '+') 242 | search = '+%s' % search 243 | query = f"({journal}[Journal]+journal+article[pt]{search})" 244 | logger.info("Query: %s" % query) 245 | 246 | doc = self._client.esearch(query, retmax=retmax) 247 | 248 | if savelist is not None: 249 | outf = open(savelist, 'w') 250 | outf.write(doc) 251 | outf.close() 252 | return doc 253 | 254 | 255 | def get_html(self, url, journal, mode='browser'): 256 | 257 | ''' Get HTML of full-text article. Uses either browser automation (if mode == 'browser') 258 | or just gets the URL directly. ''' 259 | 260 | if mode == 'browser': 261 | driver = Driver( 262 | uc=True, 263 | headless2=True, 264 | agent=random.choice(USER_AGENTS), 265 | ) 266 | for attempt in range(15): 267 | try: 268 | driver.set_page_load_timeout(10) 269 | driver.get(url) 270 | url = driver.current_url 271 | except: 272 | driver.quit() 273 | logger.info(f"Timeout exception #{attempt}. Retrying...") 274 | sleep(5) 275 | continue 276 | else: 277 | break 278 | else: 279 | logger.info("Timeout exception. Giving up.") 280 | return None 281 | for attempt in range(10): 282 | try: 283 | html = driver.page_source 284 | except: 285 | logger.info(f"Source Page #{attempt}. 
Retrying...") 286 | driver.quit() 287 | driver = Driver( 288 | uc=True, 289 | headless2=True, 290 | agent=random.choice(USER_AGENTS), 291 | ) 292 | driver.get(url) 293 | sleep(2) 294 | else: 295 | break 296 | 297 | new_url = self.check_for_substitute_url(url, html, journal) 298 | 299 | if url != new_url: 300 | driver = Driver( 301 | uc=True, 302 | headless2=True, 303 | agent=random.choice(USER_AGENTS), 304 | ) 305 | driver.get(new_url) 306 | if journal.lower() in ['human brain mapping', 307 | 'european journal of neuroscience', 308 | 'brain and behavior','epilepsia']: 309 | sleep(0.5 + random.random() * 1) 310 | try: 311 | WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, 'relatedArticles'))) 312 | except TimeoutException: 313 | print("Loading Wiley page took too much time!") 314 | 315 | # Sometimes we get annoying alerts (e.g., Flash animation 316 | # timeouts), so we dismiss them if present. 317 | try: 318 | alert = driver.switch_to_alert() 319 | alert.dismiss() 320 | except: 321 | pass 322 | 323 | logger.info(journal.lower()) 324 | timeout = 5 325 | for attempt in range(10): 326 | try: 327 | html = driver.page_source 328 | except: 329 | logger.info(f"Source Page #{attempt}. Retrying...") 330 | driver.quit() 331 | driver = Driver( 332 | uc=True, 333 | headless2=True, 334 | agent=random.choice(USER_AGENTS), 335 | ) 336 | driver.get(url) 337 | sleep(2) 338 | else: 339 | break 340 | if journal.lower() in ['journal of neuroscience', 'j neurosci']: 341 | ## Find links with class data-table-url, and click on them 342 | ## to load the table data. 343 | table_links = driver.find_elements(By.CLASS_NAME, 'table-expand-inline') 344 | 345 | if len(table_links): 346 | for link in table_links: 347 | WebDriverWait(driver, 20).until(EC.element_to_be_clickable(( 348 | By.CLASS_NAME, 'table-expand-inline'))) 349 | driver.execute_script("arguments[0].scrollIntoView();", link) 350 | link.click() 351 | sleep(0.5 + random.random() * 1) 352 | 353 | # If title has ScienceDirect in in title 354 | elif ' - ScienceDirect' in html: 355 | try: 356 | element_present = EC.presence_of_element_located((By.ID, 'abstracts')) 357 | WebDriverWait(driver, timeout).until(element_present) 358 | except TimeoutException: 359 | pass 360 | elif 'Wiley Online Library' in html: 361 | try: 362 | element_present = EC.presence_of_element_located((By.ID, 'article__content')) 363 | WebDriverWait(driver, timeout).until(element_present) 364 | except TimeoutException: 365 | pass 366 | 367 | ## Uncomment this next line to scroll to end. Doesn't seem to actually help. 368 | # driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 369 | ## Uncomment next line and insert ID to search for specific element. 370 | # driver.find_element_by_id('relatedArticles').send_keys('\t') 371 | # This next line helps minimize the number of blank articles saved from ScienceDirect, 372 | # which loads content via Ajax requests only after the page is done loading. There is 373 | # probably a better way to do this... 374 | 375 | driver.quit() 376 | return html 377 | 378 | elif mode == 'requests': 379 | headers = {'User-Agent': random.choice(USER_AGENTS)} 380 | r = requests.get(url, headers=headers) 381 | # For some journals, we can do better than the returned HTML, so get the final URL and 382 | # substitute a better one. 
383 | url = self.check_for_substitute_url(r.url, r.text, journal) 384 | if url != r.url: 385 | r = requests.get(url, headers=headers) 386 | # XML content is usually misidentified as ISO-8859-1, so we need to manually set utf-8. 387 | # Unfortunately this can break other documents. Need to eventually change this to inspect the 388 | # encoding attribute of the document header. 389 | r.encoding = 'utf-8' 390 | return r.text 391 | 392 | 393 | def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_pmc_source=True): 394 | base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi" 395 | "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" 396 | 397 | if prefer_pmc_source: 398 | try: 399 | response = self._client.elink(pmid, retmode='json', return_content=False) 400 | response.raise_for_status() # Raise an HTTPError for bad responses 401 | json_content = response.json() 402 | 403 | providers = {obj['provider']['nameabbr']: obj["url"]["value"] for obj in json_content['linksets'][0]['idurllist'][0]['objurls']} 404 | pmc_url = providers.get('PMC') 405 | 406 | if pmc_url: 407 | return self.get_html(pmc_url, journal, mode='requests') 408 | elif prefer_pmc_source == "only": 409 | logger.info("\tNo PMC source found! Skipping...") 410 | return 411 | except requests.RequestException as e: 412 | logger.error(f"Request failed: {e}") 413 | except KeyError as e: 414 | logger.error(f"Key error: {e} - JSON content: {json_content}") 415 | else: 416 | query = f"{base_url}?dbfrom=pubmed&id={pmid}&cmd=prlinks&retmode={retmode}" 417 | logger.info(query) 418 | return self.get_html(query, journal, mode=mode) 419 | 420 | if prefer_pmc_source == "only": 421 | logger.info("\tNo PMC source found!! Skipping...") 422 | return 423 | 424 | # Fallback if no PMC link found 425 | query = f"{base_url}?dbfrom=pubmed&id={pmid}&cmd=prlinks&retmode={retmode}" 426 | return self.get_html(query, journal, mode=mode) 427 | 428 | 429 | def check_for_substitute_url(self, url, html, journal): 430 | ''' For some journals/publishers, we can get a better document version by modifying the 431 | URL passed from PubMed. E.g., we can get XML with embedded tables from PLoS ONE instead of 432 | the standard HTML, which displays tables as images. For some journals (e.g., Frontiers), 433 | it's easier to get the URL by searching the source, so pass the html in as well. 
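Example (illustrative; assumes `scraper` is a Scraper instance and reflects the
PLoS ONE rule below, which rewrites an article URL to its XML asset form):

    better_url = scraper.check_for_substitute_url(url, html, 'PLoS ONE')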
''' 434 | 435 | j = journal.lower() 436 | try: 437 | if j == 'plos one': 438 | doi_part = re.search('article\?id\=(.*)', url).group(1) 439 | return 'http://journals.plos.org/plosone/article/asset?id=%s.XML' % doi_part 440 | elif j in ['human brain mapping', 'european journal of neuroscience', 441 | 'brain and behavior', 'epilepsia', 'journal of neuroimaging']: 442 | return url.replace('abstract', 'full').split(';')[0] 443 | elif j == 'journal of cognitive neuroscience': 444 | return url.replace('doi/abs', 'doi/full') 445 | elif j.startswith('frontiers in'): 446 | return re.sub('(full|abstract)\/*$', 'xml\/nlm', url) 447 | elif 'sciencedirect' in url: 448 | return url + '?np=y' 449 | elif 'springer.com' in url: 450 | return url + '/fulltext.html' 451 | else: 452 | return url 453 | except Exception as err: 454 | return url 455 | 456 | 457 | def is_pmc_open_acess(self, pmcid): 458 | oa_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=" 459 | 460 | response = get_url(oa_url + pmcid) 461 | 462 | return 'idIsNotOpenAccess' not in response 463 | 464 | def process_article(self, id, journal, delay=None, mode='browser', overwrite=False, prefer_pmc_source=True): 465 | 466 | logger.info("Processing %s..." % id) 467 | journal_path = (self.store / 'html' / journal) 468 | journal_path.mkdir(parents=True, exist_ok=True) 469 | filename = journal_path / f"{id}.html" 470 | 471 | if not overwrite and os.path.isfile(filename): 472 | logger.info("\tAlready exists! Skipping...") 473 | 474 | return None, None 475 | 476 | # Save the HTML 477 | doc = self.get_html_by_pmid(id, journal, mode=mode, prefer_pmc_source=prefer_pmc_source) 478 | valid = None 479 | if doc: 480 | valid = _validate_scrape(doc) 481 | if valid: 482 | with filename.open('w') as f: 483 | f.write(doc) 484 | if not valid: 485 | logger.info("\tScrape failed! Skipping...") 486 | 487 | # Insert random delay until next request. 488 | if delay is not None: 489 | sleep_time = random.random() * float(delay*2) 490 | sleep(sleep_time) 491 | 492 | return filename, valid 493 | 494 | def retrieve_articles(self, journal=None, pmids=None, dois=None, delay=None, mode='browser', search=None, 495 | limit=None, overwrite=False, min_pmid=None, max_pmid=None, shuffle=False, 496 | index_pmids=False, skip_pubmed_central=True, metadata_store=None, invalid_article_log_file=None, prefer_pmc_source=True): 497 | 498 | ''' Try to retrieve all PubMed articles for a single journal that don't 499 | already exist in the storage directory. 500 | Args: 501 | journal: The name of the journal (as it appears in PubMed). 502 | pmids: A list of PMIDs to retrieve. 503 | dois: A list of DOIs to retrieve. 504 | delay: Mean delay between requests. 505 | mode: When 'browser', use selenium to load articles in Chrome. When 506 | 'requests', attempts to fetch the HTML directly via requests module. 507 | search: An optional search string to append to the PubMed query. 508 | Primarily useful for journals that are not specific to neuroimaging. 509 | limit: Optional max number of articles to fetch. Note that only new articles 510 | are counted against this limit; e.g., if limit = 100 and 2,000 articles 511 | are found in PubMed, retrieval will continue until 100 new articles 512 | have been added. 513 | overwrite: When True, all articles returned from PubMed query will be 514 | fetched, irrespective of whether or not they already exist on disk. 515 | min_pmid: When a PMID is provided, only articles with PMIDs greater than 516 | this will be processed. 
Primarily useful for excluding older articles 517 | that aren't available in full-text HTML format. 518 | max_pmid: When a PMID is provided, only articles with PMIDs less than 519 | this will be processed. 520 | shuffle: When True, articles are retrieved in random order. 521 | index_pmids: When True, will create a list of pmids already in the output. 522 | When used in combination with overwrite=False, this will not download a pmid 523 | even though it's in another directory. 524 | skip_pubmed_central: When True, skips articles that are available from 525 | PubMed Central. This will also write a file with the skipped pmcids 526 | to use with pubget. 527 | metadata_store: Optional path to a directory to store/reference PubMed metadata. 528 | invalid_article_log_file: Optional path to a file to log files where scraping failed. 529 | prefer_pmc_source: Optional 530 | When True, preferentially retrieve articles from PubMed Central, using requests instead of browser 531 | (regardless of mode). This is useful for journals that have full-text articles available on PMC, 532 | but are not open-access. If set to "only", will only retrieve articles from PMC, and 533 | skip articles it cannot retrieve from PMC. 534 | ''' 535 | articles_found = 0 536 | if journal is None and dois is None and pmids is None: 537 | raise ValueError("Either journal, pmids, or dois must be provided.") 538 | 539 | if journal is not None: 540 | logger.info("Getting PMIDs for articles from %s..." % journal) 541 | pmids = self.search_pubmed(journal, search) 542 | 543 | if dois is not None: 544 | logger.info("Retrieving articles from %s..." % ', '.join(dois)) 545 | pmids = [get_pmid_from_doi(doi) for doi in dois] 546 | 547 | # Remove None values and log missing DOIs 548 | pmids = [pmid for pmid in pmids if pmid is not None] 549 | missing_dois = [doi for doi, pmid in zip(dois, pmids) if pmid is None] 550 | if len(missing_dois) > 0: 551 | logger.info("Missing DOIs: %s" % ', '.join(missing_dois)) 552 | 553 | if shuffle: 554 | random.shuffle(pmids) 555 | 556 | logger.info("Found %d records.\n" % len(pmids)) 557 | 558 | # If journal is provided, check for existing articles 559 | if journal is not None: 560 | logger.info("Retrieving articles from %s..." 
% journal) 561 | journal_path = (self.store / 'html' / journal) 562 | if journal_path.exists(): 563 | existing = journal_path.glob('*.html') 564 | existing = [int(f.stem) for f in existing] 565 | n_existing = len(existing) 566 | pmids = [pmid for pmid in pmids if int(pmid) not in existing] 567 | logger.info(f"Found {n_existing} existing articles.") 568 | 569 | # filter out all pmids, not just based on folder 570 | if index_pmids: 571 | existing_pmids = [f.stem for f in (self.store / 'html').rglob('*.html')] 572 | pmids = [pmid for pmid in pmids if pmid not in existing_pmids] 573 | 574 | # Filter out articles that are outside the PMID range 575 | pmids = [ 576 | pmid 577 | for pmid in pmids 578 | if (min_pmid is None or int(pmid) >= min_pmid) and (max_pmid is None or int(pmid) <= max_pmid) 579 | ] 580 | 581 | logger.info(f"Retrieving {len(pmids)} articles...") 582 | 583 | if skip_pubmed_central: 584 | all_ids = _convert_pmid_to_pmc(pmids) 585 | else: 586 | all_ids = [(None, pmid) for pmid in pmids] 587 | 588 | invalid_articles = [] 589 | 590 | if journal is None: 591 | all_iter = [] 592 | for pmcid, pmid in all_ids: 593 | metadata = get_pubmed_metadata(pmid, store=metadata_store) 594 | if not metadata or 'journal' not in metadata: 595 | all_iter.append((pmcid, pmid, "UNKNOWN")) 596 | continue 597 | all_iter.append((pmcid, pmid, metadata['journal'])) 598 | else: 599 | all_iter = [(pmcid, pmid, journal) for pmcid, pmid in all_ids] 600 | 601 | for pmcid, pmid, journal in all_iter: 602 | 603 | if limit is not None and articles_found >= limit: break 604 | 605 | if skip_pubmed_central and pmcid and self.is_pmc_open_acess(pmcid): 606 | logger.info(f"\tPubMed Central OpenAccess entry found! Skipping {pmid}...") 607 | with open('openaccess_pmcids.txt', 'a') as f: 608 | f.write(f"{pmcid}\n") 609 | continue 610 | 611 | filename, valid = self.process_article(pmid, journal, delay, mode, overwrite, prefer_pmc_source) 612 | 613 | if not valid: 614 | invalid_articles.append(filename) 615 | if invalid_article_log_file is not None: 616 | with open(invalid_article_log_file, 'a') as f: 617 | f.write(f"{pmid}\n") 618 | else: 619 | articles_found += 1 620 | 621 | return invalid_articles 622 | -------------------------------------------------------------------------------- /ace/sources.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # use unicode everywhere 3 | from bs4 import BeautifulSoup 4 | import re 5 | import os 6 | import json 7 | import abc 8 | import importlib 9 | from glob import glob 10 | from ace import datatable 11 | from ace import tableparser 12 | from ace import scrape 13 | from ace import config 14 | from ace import database 15 | import logging 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class SourceManager: 21 | 22 | ''' Loads all the available Source subclasses from this module and the 23 | associated directory of JSON config files and uses them to determine which parser 24 | to call when a new HTML file is passed. ''' 25 | 26 | def __init__(self, database, table_dir=None): 27 | ''' SourceManager constructor. 28 | Args: 29 | database: A Database instance to use with all Sources. 30 | table_dir: An optional directory name to save any downloaded tables to. 31 | When table_dir is None, nothing will be saved (requiring new scraping 32 | each time the article is processed). 
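Example (a minimal sketch, assuming `db` is an ace.database.Database instance
and `html` holds the full text of a scraped article):

    manager = SourceManager(db, table_dir='tables')
    source = manager.identify_source(html)  # a Source subclass instance, or None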
33 | ''' 34 | module = importlib.import_module('ace.sources') 35 | self.sources = {} 36 | source_dir = os.path.join(os.path.dirname(__file__), 'sources') 37 | for config_file in glob('%s/*json' % source_dir): 38 | class_name = config_file.split('/')[-1].split('.')[0] 39 | cls = getattr(module, class_name + 'Source')(database, config=config_file, table_dir=table_dir) 40 | self.sources[class_name] = cls 41 | 42 | def identify_source(self, html): 43 | ''' Identify the source of the article and return the corresponding Source object. ''' 44 | for source in list(self.sources.values()): 45 | for patt in source.identifiers: 46 | if re.search(patt, html): 47 | logger.debug('Matched article to Source: %s' % source.__class__.__name__) 48 | return source 49 | 50 | 51 | # A single source of articles--i.e., a publisher or journal 52 | class Source(metaclass=abc.ABCMeta): 53 | # need to include the \\u2009 which is the thin space to which the table is being invalidated due to those characters 54 | # -\\u2009int 55 | ENTITIES = { 56 | ' ': ' ', 57 | '−': '-', 58 | # 'κ': 'kappa', 59 | '\xa0': ' ', # Unicode non-breaking space 60 | # '\x3e': ' ', 61 | '\u2212': '-', # Various unicode dashes 62 | '\u2012': '-', 63 | '\u2013': '-', 64 | '\u2014': '-', 65 | '\u2015': '-', 66 | '\u8211': '-', 67 | '\u0150': '-', 68 | '\u0177': '', 69 | '\u0160': '', 70 | '\u0145': "'", 71 | '\u0146': "'", 72 | '\u2009': "", # Various whitespaces within tables 73 | '\u2007': "", 74 | 75 | } 76 | 77 | def __init__(self, database, config=None, table_dir=None): 78 | self.database = database 79 | self.table_dir = table_dir 80 | self.entities = {} 81 | 82 | if config is not None: 83 | config = json.load(open(config, 'rb')) 84 | valid_keys = ['name', 'identifiers', 'entities', 'delay'] 85 | 86 | for k, v in list(config.items()): 87 | if k in valid_keys: 88 | setattr(self, k, v) 89 | 90 | # Append any source-specific entities found in the config file to 91 | # the standard list 92 | if self.entities is None: 93 | self.entities = Source.ENTITIES 94 | else: 95 | self.entities.update(Source.ENTITIES) 96 | 97 | @abc.abstractmethod 98 | def parse_article(self, html, pmid=None, metadata_dir=None): 99 | ''' Takes HTML article as input and returns an Article. PMID Can also be 100 | passed, which prevents having to scrape it from the article and/or look it 101 | up in PubMed. ''' 102 | 103 | # Skip rest of processing if this record already exists 104 | if pmid is not None and self.database.article_exists(pmid) and not config.OVERWRITE_EXISTING_ROWS: 105 | return False 106 | 107 | html = self.decode_html_entities(html) 108 | soup = BeautifulSoup(html) 109 | if pmid is None: 110 | pmid = self.extract_pmid(soup) 111 | 112 | # did our best to find PMID, but failed 113 | if not pmid: 114 | return False 115 | 116 | metadata = scrape.get_pubmed_metadata(pmid, store=metadata_dir, save=True) 117 | 118 | # Remove all scripts and styles 119 | for script in soup(["script", "style"]): 120 | script.extract() 121 | # Get text 122 | text = soup.get_text() 123 | if self.database.article_exists(pmid): 124 | if config.OVERWRITE_EXISTING_ROWS: 125 | self.database.delete_article(pmid) 126 | else: 127 | return False 128 | 129 | self.article = database.Article(text, pmid=pmid, metadata=metadata) 130 | self.extract_neurovault(soup) 131 | return soup 132 | 133 | def extract_neurovault(self, soup): 134 | ''' Look through all links, and use regex to identify NeuroVault links. 
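Both image links (e.g., identifiers.org/neurovault.image:1234 or neurovault.org/images/1234; the IDs shown are illustrative) and collection links (e.g., neurovault.org/collections/ABCD) are recognized; each match is stored on the article as a NeurovaultLink with its type ('image' or 'collection'), NeuroVault ID, and URL.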
''' 135 | image_regexes = ['identifiers.org/neurovault.image:(\d*)', 136 | 'neurovault.org/images/(\d*)'] 137 | 138 | image_regexes = re.compile( '|'.join( image_regexes) ) 139 | 140 | collection_regexes = ['identifiers.org/neurovault.collection:(\w*)', 141 | 'neurovault.org/collections/(\w*)'] 142 | 143 | collection_regexes = re.compile( '|'.join( collection_regexes) ) 144 | 145 | 146 | nv_links = [] 147 | for link in soup.find_all('a'): 148 | if link.has_attr('href'): 149 | href = link['href'] 150 | 151 | img_m = image_regexes.search(href) 152 | col_m = collection_regexes.search(href) 153 | if not (img_m or col_m): 154 | continue 155 | 156 | if img_m: 157 | type = 'image' 158 | val = img_m.groups()[0] or img_m.groups()[1] 159 | elif col_m: 160 | type = 'collection' 161 | val = col_m.groups()[0] or col_m.groups()[1] 162 | 163 | nv_links.append( 164 | database.NeurovaultLink( 165 | type=type, 166 | neurovault_id=val, 167 | url=href 168 | ) 169 | ) 170 | 171 | self.article.neurovault_links = nv_links 172 | 173 | def extract_text(self, soup): 174 | ''' Extract text from the article. 175 | Publisher specific extraction of body text should be done in a subclass. 176 | ''' 177 | 178 | text = soup.get_text() 179 | 180 | # Remove any remaining HTML tags 181 | text = re.sub(r'<[^>]+>', '', text) 182 | 183 | # Remove any remaining unicode characters 184 | text = re.sub(r'\\u[0-9]+', '', text) 185 | 186 | # Remove any remaining entities 187 | text = self.decode_html_entities(text) 188 | 189 | # Remove any remaining whitespace 190 | text = re.sub(r'\s+', ' ', text) 191 | 192 | self.article.text = text 193 | 194 | def parse_table(self, table): 195 | ''' Takes HTML for a single table and returns a Table. ''' 196 | # Formatting issues sometimes prevent table extraction, so just return 197 | if table is None: 198 | return False 199 | 200 | logger.debug("\t\tFound a table...") 201 | 202 | # change
<br>
to \n 203 | for br in table.find_all("br"): 204 | br.replace_with("\n") 205 | 206 | # Count columns. Check either just one row, or all of them. 207 | def n_cols_in_row(row): 208 | return sum([ 209 | int(td['colspan']) 210 | if td.has_attr('colspan') and td['colspan'] != "NaN" else 1 211 | for td in row.find_all(['th', 'td']) 212 | ]) 213 | 214 | search_table = table.find("tbody") 215 | if search_table is None: 216 | search_table = table 217 | 218 | all_trs = search_table.find_all('tr') 219 | if all_trs is None or len(all_trs) == 0: 220 | return False 221 | 222 | if config.CAREFUL_PARSING: 223 | n_cols = max([n_cols_in_row( 224 | row) for row in all_trs]) 225 | else: 226 | n_cols = n_cols_in_row(search_table.find('tr')) 227 | 228 | # Initialize grid and populate 229 | data = datatable.DataTable(0, n_cols) 230 | rows = table.find_all('tr') 231 | for (j, r) in enumerate(rows): 232 | try: 233 | cols = r.find_all(['td', 'th']) 234 | cols_found_in_row = 0 235 | n_cells = len(cols) 236 | # Assign number of rows and columns this cell fills. We use these rules: 237 | # * If a rowspan/colspan is explicitly provided, use it 238 | # * If not, initially assume span == 1 for both rows and columns. 239 | for (i, c) in enumerate(cols): 240 | r_num = ( 241 | int(c['rowspan']) 242 | if c.has_attr('rowspan') and c['rowspan'] != "NaN" else 1 243 | ) 244 | c_num = ( 245 | int(c['colspan']) 246 | if c.has_attr('colspan') and c['colspan'] != "NaN" else 1 247 | ) 248 | cols_found_in_row += c_num 249 | # * Check to make sure that we don't have unaccounted-for columns in the 250 | # row after including the current cell. If we do, adjust the colspan 251 | # to take up all of the remaining columns. This is necessary because 252 | # some tables have malformed HTML, and BeautifulSoup can also 253 | # cause problems in its efforts to fix bad tables. The most common 254 | # problem is deletion or omission of enough tags to fill all 255 | # columns, hence our adjustment. Note that in some cases the order of 256 | # filling is not sequential--e.g., when a previous row has cells with 257 | # rowspan > 1. So we have to check if there are None values left over 258 | # in the DataTable's current row after we finish filling 259 | # it. 260 | if i + 1 == n_cells and cols_found_in_row < n_cols and (len(data.data) == j+1) and data[j].count(None) > c_num: 261 | c_num += n_cols - cols_found_in_row 262 | data.add_val(c.get_text(), r_num, c_num) 263 | except Exception as err: 264 | if not config.SILENT_ERRORS: 265 | logger.error(str(err)) 266 | if not config.IGNORE_BAD_ROWS: 267 | raise 268 | 269 | if data.data[data.n_rows- 1].count(None) == data.n_cols: 270 | data.data.pop() 271 | logger.debug("\t\tTrying to parse table...") 272 | return tableparser.parse_table(data) 273 | 274 | def extract_doi(self, soup): 275 | ''' Every Source subclass must be able to extract its doi. ''' 276 | return 277 | 278 | def extract_pmid(self, soup): 279 | ''' Every Source subclass must be able to extract its PMID. ''' 280 | return 281 | 282 | def decode_html_entities(self, html): 283 | ''' Re-encode HTML entities as innocuous little Unicode characters. 
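Replacements are drawn from self.entities, i.e. any source-specific mappings from the JSON config merged with the standard Source.ENTITIES, so thin spaces and Unicode minus signs are normalized before parsing (e.g., an illustrative coordinate cell '−\u200935' becomes '-35').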
''' 284 | # Any entities BeautifulSoup passes through thatwe don't like, e.g., 285 | #  /x0a 286 | if self.entities: 287 | patterns = re.compile('(' + '|'.join(re.escape( 288 | k) for k in list(self.entities.keys())) + ')') 289 | replacements = lambda m: self.entities[m.group(0)] 290 | return patterns.sub(replacements, html) 291 | else: 292 | return html 293 | 294 | def _download_table(self, url): 295 | ''' For Sources that have tables in separate files, a helper for 296 | downloading and extracting the table data. Also saves to file if desired. 297 | ''' 298 | 299 | delay = self.delay if hasattr(self, 'delay') else 0 300 | 301 | if self.table_dir is not None: 302 | filename = '%s/%s' % (self.table_dir, url.replace('/', '_')) 303 | if os.path.exists(filename): 304 | table_html = open(filename).read() 305 | else: 306 | table_html = scrape.get_url(url) 307 | open(filename, 'w').write(table_html.encode('utf-8')) 308 | else: 309 | table_html = scrape.get_url(url) 310 | 311 | if table_html: 312 | table_html = self.decode_html_entities(table_html) 313 | return BeautifulSoup(table_html) 314 | 315 | return None 316 | 317 | 318 | class DefaultSource(Source): 319 | def parse_article(self, html, pmid=None, **kwargs): 320 | soup = super(DefaultSource, self).parse_article(html, pmid, **kwargs) 321 | if not soup: 322 | return False 323 | 324 | self.article.missing_source = True 325 | return self.article 326 | 327 | 328 | class HighWireSource(Source): 329 | 330 | def parse_article(self, html, pmid=None, **kwargs): 331 | soup = super(HighWireSource, self).parse_article(html, pmid, **kwargs) 332 | if not soup: 333 | return False 334 | 335 | # To download tables, we need the content URL and the number of tables 336 | content_url = soup.find('meta', { 337 | 'name': 'citation_public_url'})['content'] 338 | 339 | n_tables = len(soup.find_all('span', class_='table-label')) 340 | 341 | # Now download each table and parse it 342 | tables = [] 343 | logger.info(f"Found {n_tables} tables.") 344 | for i in range(n_tables): 345 | t_num = i + 1 346 | url = '%s/T%d.expansion.html' % (content_url, t_num) 347 | table_soup = self._download_table(url) 348 | if not table_soup: 349 | continue 350 | tc = table_soup.find(class_='table-expansion') 351 | if tc: 352 | t = tc.find('table', {'id': 'table-%d' % (t_num)}) 353 | t = self.parse_table(t) 354 | if t: 355 | t.position = t_num 356 | t.label = tc.find(class_='table-label').text 357 | t.number = t.label.split(' ')[-1].strip() 358 | try: 359 | t.caption = tc.find(class_='table-caption').get_text() 360 | except: 361 | pass 362 | try: 363 | t.notes = tc.find(class_='table-footnotes').get_text() 364 | except: 365 | pass 366 | tables.append(t) 367 | 368 | self.article.tables = tables 369 | return self.article 370 | 371 | def parse_table(self, table): 372 | return super(HighWireSource, self).parse_table(table) 373 | 374 | def extract_doi(self, soup): 375 | try: 376 | return soup.find('meta', {'name': 'citation_doi'})['content'] 377 | except: 378 | return '' 379 | 380 | def extract_pmid(self, soup): 381 | return soup.find('meta', {'name': 'citation_pmid'})['content'] 382 | 383 | def extract_text(self, soup): 384 | # If div has class "main-content-wrapper" or "article" or "fulltext-view" 385 | # extract all text from it 386 | 387 | # Assuming you have a BeautifulSoup object called soup 388 | div = soup.find_all("div", class_="article") 389 | if div: 390 | div = div[0] 391 | div_classes = ["ref-list", "abstract", "copyright-statement", "fn-group", "history-list", "license"] 392 | for 
class_ in div_classes: 393 | for tag in div.find_all(class_=class_): 394 | tag.extract() 395 | soup = div 396 | 397 | return super(HighWireSource, self).extract_text(soup) 398 | 399 | 400 | class OUPSource(Source): 401 | 402 | def parse_article(self, html, pmid=None, **kwargs): 403 | soup = super(OUPSource, self).parse_article(html, pmid, **kwargs) 404 | if not soup: 405 | return False 406 | 407 | # Extract tables 408 | tables = [] 409 | 410 | # Exclude modal tables to prevent duplicates 411 | all_tables = set(soup.select('div.table-full-width-wrap')) 412 | modal_tables = set(soup.select('div.table-full-width-wrap.table-modal')) 413 | table_containers = all_tables - modal_tables 414 | logger.info(f"Found {len(table_containers)} tables.") 415 | for (i, tc) in enumerate(table_containers): 416 | table_html = tc.find('table') 417 | t = self.parse_table(table_html) 418 | if t: 419 | t.position = i + 1 420 | try: 421 | t.number = tc.find('span', class_='label').text.split(' ')[-1].strip() 422 | t.label = tc.find('span', class_='label').text.strip() 423 | except: 424 | pass 425 | try: 426 | t.caption = tc.find('span', class_='caption').get_text() 427 | except: 428 | pass 429 | try: 430 | t.notes = tc.find('span', class_='fn').get_text() 431 | except: 432 | pass 433 | tables.append(t) 434 | 435 | self.article.tables = tables 436 | return self.article 437 | 438 | def parse_table(self, table): 439 | return super(OUPSource, self).parse_table(table) 440 | 441 | def extract_doi(self, soup): 442 | try: 443 | return soup.find('meta', {'name': 'citation_doi'})['content'] 444 | except: 445 | return '' 446 | 447 | def extract_pmid(self, soup): 448 | pmid = soup.find('meta', {'name': 'citation_pmid'}) 449 | if pmid: 450 | return pmid['content'] 451 | else: 452 | return None 453 | 454 | def extract_text(self, soup): 455 | # If div has class "main-content-wrapper" or "article" or "fulltext-view" 456 | # extract all text from it 457 | 458 | # Assuming you have a BeautifulSoup object called soup 459 | div = soup.find_all("div", class_="article-body") 460 | if div: 461 | div = div[0] 462 | div_classes = ["ref-list", "abstract", "copyright-statement", "fn-group", "history-list", "license"] 463 | for class_ in div_classes: 464 | for tag in div.find_all(class_=class_): 465 | tag.extract() 466 | soup = div 467 | 468 | return super(OUPSource, self).extract_text(soup) 469 | 470 | 471 | class ScienceDirectSource(Source): 472 | 473 | def parse_article(self, html, pmid=None, **kwargs): 474 | soup = super(ScienceDirectSource, self).parse_article(html, pmid, **kwargs) 475 | if not soup: 476 | return False 477 | 478 | # Extract tables 479 | tables = [] 480 | table_containers = soup.find_all('div', {'class': 'tables'}) 481 | if len(table_containers) == 0: 482 | # try old method 483 | table_containers = soup.find_all('dl', {'class': 'table'}) 484 | 485 | logger.info(f"Found {len(table_containers)} tables.") 486 | for (i, tc) in enumerate(table_containers): 487 | table_html = tc.find('table') 488 | t = self.parse_table(table_html) 489 | if t: 490 | t.position = i + 1 491 | try: 492 | t.number = tc.find('span', class_='label').text.split(' ')[-1].strip() or tc['data-label'].split(' ')[-1].strip() 493 | t.label = tc.find('span', class_='label').text.strip() 494 | except: 495 | pass 496 | try: 497 | t.caption = tc.find('p').contents[-1].strip() 498 | except: 499 | pass 500 | try: 501 | t.notes = tc.find(class_='tblFootnote').get_text() 502 | except: 503 | pass 504 | tables.append(t) 505 | 506 | self.article.tables = tables 507 | 
return self.article 508 | 509 | def parse_table(self, table): 510 | return super(ScienceDirectSource, self).parse_table(table) 511 | 512 | def extract_doi(self, soup): 513 | try: 514 | return list(soup.find('div', {'id': 'article-identifier-links'}).children)[0]['href'].replace('https://doi.org/', '') 515 | except: 516 | return '' 517 | 518 | def extract_pmid(self, soup): 519 | return scrape.get_pmid_from_doi(self.extract_doi(soup)) 520 | 521 | 522 | class PlosSource(Source): 523 | 524 | def parse_article(self, html, pmid=None, **kwargs): 525 | soup = super(PlosSource, self).parse_article(html, pmid, **kwargs) # Do some preprocessing 526 | if not soup: 527 | return False 528 | 529 | # Extract tables 530 | tables = [] 531 | table_containers = soup.find_all('table-wrap') 532 | logger.info(f"Found {len(table_containers)} tables.") 533 | for (i, tc) in enumerate(table_containers): 534 | table_html = tc.find('table') 535 | t = self.parse_table(table_html) 536 | if t: 537 | t.position = i + 1 538 | t.label = tc.find('label').text 539 | t.number = t.label.split(' ')[-1].strip() 540 | try: 541 | t.caption = tc.find('title').get_text() 542 | except: 543 | pass 544 | try: 545 | t.notes = tc.find('table-wrap-foot').get_text() 546 | except: 547 | pass 548 | tables.append(t) 549 | 550 | self.article.tables = tables 551 | return self.article 552 | 553 | def parse_table(self, table): 554 | return super(PlosSource, self).parse_table(table) 555 | 556 | def extract_doi(self, soup): 557 | try: 558 | return soup.find('article-id', {'pub-id-type': 'doi'}).text 559 | except: 560 | return '' 561 | 562 | def extract_pmid(self, soup): 563 | return scrape.get_pmid_from_doi(self.extract_doi(soup)) 564 | 565 | 566 | class FrontiersSource(Source): 567 | 568 | def parse_article(self, html, pmid=None, **kwargs): 569 | 570 | soup = super(FrontiersSource, self).parse_article(html, pmid, **kwargs) 571 | if not soup: 572 | return False 573 | 574 | # Extract tables 575 | tables = [] 576 | table_containers = soup.findAll( 577 | 'table-wrap', {'id': re.compile('^T\d+$')}) 578 | logger.info(f"Found {len(table_containers)} tables.") 579 | for (i, tc) in enumerate(table_containers): 580 | table_html = tc.find('table') 581 | t = self.parse_table(table_html) 582 | # If Table instance is returned, add other properties 583 | if t: 584 | t.position = i + 1 585 | t.number = tc['id'][1::].strip() 586 | t.label = tc.find('label').get_text() 587 | try: 588 | t.caption = tc.find('caption').get_text() 589 | except: 590 | pass 591 | try: 592 | t.notes = tc.find('table-wrap-foot').get_text() 593 | except: 594 | pass 595 | tables.append(t) 596 | 597 | self.article.tables = tables 598 | return self.article 599 | 600 | def parse_table(self, table): 601 | return super(FrontiersSource, self).parse_table(table) 602 | 603 | def extract_doi(self, soup): 604 | try: 605 | return soup.find('article-id', {'pub-id-type': 'doi'}).text 606 | except: 607 | return '' 608 | 609 | def extract_pmid(self, soup): 610 | return scrape.get_pmid_from_doi(self.extract_doi(soup)) 611 | 612 | 613 | class JournalOfCognitiveNeuroscienceSource(Source): 614 | 615 | def parse_article(self, html, pmid=None, **kwargs): 616 | soup = super( 617 | JournalOfCognitiveNeuroscienceSource, self).parse_article(html, pmid, **kwargs) 618 | if not soup: 619 | return False 620 | 621 | # To download tables, we need the DOI and the number of tables 622 | doi = self.article.doi or self.extract_doi(soup) 623 | tables = [] 624 | 625 | # Now download each table and parse it 626 | table_containers 
= soup.find_all('div', {'class': 'table-wrap'}) 627 | logger.info(f"Found {len(table_containers)} tables.") 628 | for i, tc in enumerate(table_containers): 629 | table_html = tc.find('table', {'role': 'table'}) 630 | if not table_html: 631 | continue 632 | 633 | t = self.parse_table(table_html) 634 | 635 | if t: 636 | t.position = i + 1 637 | s = re.search('T(\d+).+$', tc['content-id']) 638 | if s: 639 | t.number = s.group(1) 640 | caption = tc.find('div', class_='caption') 641 | if caption: 642 | t.label = caption.get_text() 643 | t.caption = caption.get_text() 644 | try: 645 | t.notes = tc.find('div', class_="fn").p.get_text() 646 | except: 647 | pass 648 | tables.append(t) 649 | 650 | self.article.tables = tables 651 | return self.article 652 | 653 | def parse_table(self, table): 654 | return super(JournalOfCognitiveNeuroscienceSource, self).parse_table(table) 655 | 656 | def extract_doi(self, soup): 657 | try: 658 | return soup.find('meta', {'name': 'dc.Identifier', 'scheme': 'doi'})['content'] 659 | except: 660 | return '' 661 | 662 | def extract_pmid(self, soup): 663 | return scrape.get_pmid_from_doi(self.extract_doi(soup)) 664 | 665 | 666 | class WileySource(Source): 667 | 668 | def parse_article(self, html, pmid=None, **kwargs): 669 | 670 | soup = super(WileySource, self).parse_article(html, pmid, **kwargs) # Do some preprocessing 671 | if not soup: 672 | return False 673 | 674 | # Extract tables 675 | tables = [] 676 | table_containers = soup.findAll('div', { 677 | 'class': re.compile('article-table-content|table'), 'id': re.compile('^(.*?)\-tbl\-\d+$|^t(bl)*\d+$')}) 678 | logger.info(f"Found {len(table_containers)} tables.") 679 | for (i, tc) in enumerate(table_containers): 680 | table_html = tc.find('table') 681 | footer = None 682 | try: 683 | # Remove footer, which appears inside table 684 | footer = table_html.tfoot.extract() 685 | except: 686 | pass 687 | t = self.parse_table(table_html) 688 | # If Table instance is returned, add other properties 689 | if t: 690 | t.position = i + 1 691 | # t.number = tc['id'][3::].strip() 692 | t.number = re.search('t[bl0\-]*(\d+)$', tc['id']).group(1) 693 | try: 694 | t.label = tc.find('span', class_='label').get_text() 695 | except: 696 | pass 697 | try: 698 | t.caption = tc.find('caption').get_text() 699 | except AttributeError: 700 | caption = tc.find('div', {'header': 'article-table-caption'}) 701 | t.caption = caption.get_text() if caption else None 702 | try: 703 | t.notes = footer.get_text() if footer else None 704 | except AttributeError: 705 | notes = tc.find('div', {'class': 'article-section__table-footnotes'}) 706 | t.notes = notes.get_text() if caption else None 707 | pass 708 | tables.append(t) 709 | 710 | self.article.tables = tables 711 | return self.article 712 | 713 | def parse_table(self, table): 714 | return super(WileySource, self).parse_table(table) 715 | 716 | def extract_doi(self, soup): 717 | try: 718 | return soup.find('meta', {'name': 'citation_doi'})['content'] 719 | except: 720 | return '' 721 | 722 | def extract_pmid(self, soup): 723 | return scrape.get_pmid_from_doi(self.extract_doi(soup)) 724 | 725 | # Note: the SageSource is largely useless and untested because Sage renders tables 726 | # as images. 
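# Coordinates embedded in table images cannot be recovered from the HTML alone, so this source will generally yield no table data.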
727 | 728 | 729 | class SageSource(Source): 730 | 731 | def parse_article(self, html, pmid=None, **kwargs): 732 | 733 | soup = super(SageSource, self).parse_article(html, pmid, **kwargs) 734 | if not soup: 735 | return False 736 | 737 | # To download tables, we need the content URL and the number of tables 738 | content_url = soup.find('meta', { 739 | 'name': 'citation_public_url'})['content'] 740 | 741 | n_tables = len(soup.find_all('span', class_='table-label')) 742 | logger.info(f"Found {n_tables} tables.") 743 | # Now download each table and parse it 744 | tables = [] 745 | for i in range(n_tables): 746 | t_num = i + 1 747 | url = '%s/T%d.expansion.html' % (content_url, t_num) 748 | table_soup = self._download_table(url) 749 | if not table_soup: 750 | continue 751 | tc = table_soup.find(class_='table-expansion') 752 | if tc: 753 | t = tc.find('table', {'id': 'table-%d' % (t_num)}) 754 | t = self.parse_table(t) 755 | if t: 756 | t.position = t_num 757 | t.label = tc.find(class_='table-label').text 758 | t.number = t.label.split(' ')[-1].strip() 759 | try: 760 | t.caption = tc.find(class_='table-caption').get_text() 761 | except: 762 | pass 763 | try: 764 | t.notes = tc.find(class_='table-footnotes').get_text() 765 | except: 766 | pass 767 | tables.append(t) 768 | 769 | self.article.tables = tables 770 | return self.article 771 | 772 | def parse_table(self, table): 773 | return super(SageSource, self).parse_table(table) 774 | 775 | def extract_doi(self, soup): 776 | try: 777 | return soup.find('meta', {'name': 'citation_doi'})['content'] 778 | except: 779 | return '' 780 | 781 | def extract_pmid(self, soup): 782 | return soup.find('meta', {'name': 'citation_pmid'})['content'] 783 | 784 | 785 | class OldSpringerSource(Source): 786 | 787 | def parse_article(self, html, pmid=None, **kwargs): 788 | 789 | soup = super(OldSpringerSource, self).parse_article(html, pmid, **kwargs) 790 | if not soup: 791 | return False 792 | 793 | # Extract tables 794 | tables = [] 795 | table_containers = soup.findAll( 796 | 'figure', {'id': re.compile('^Tab\d+$')}) 797 | for (i, tc) in enumerate(table_containers): 798 | table_html = tc.find('table') 799 | t = self.parse_table(table_html) 800 | # If Table instance is returned, add other properties 801 | if t: 802 | t.position = i + 1 803 | t.number = tc['id'][3::].strip() 804 | t.label = tc.find('span', class_='CaptionNumber').get_text() 805 | try: 806 | t.caption = tc.find(class_='CaptionContent').p.get_text() 807 | except: 808 | pass 809 | try: 810 | t.notes = tc.find(class_='TableFooter').p.get_text() 811 | except: 812 | pass 813 | tables.append(t) 814 | 815 | self.article.tables = tables 816 | return self.article 817 | 818 | def parse_table(self, table): 819 | return super(OldSpringerSource, self).parse_table(table) 820 | 821 | def extract_doi(self, soup): 822 | content = soup.find('p', class_='ArticleDOI').get_text() 823 | return content.split(' ')[1] 824 | 825 | def extract_pmid(self, soup): 826 | return scrape.get_pmid_from_doi(self.extract_doi(soup)) 827 | 828 | 829 | class SpringerSource(Source): 830 | 831 | def parse_article(self, html, pmid=None, **kwargs): 832 | 833 | soup = super(SpringerSource, self).parse_article(html, pmid, **kwargs) 834 | if not soup: 835 | return False 836 | 837 | # Extract table; going to take the approach of opening and parsing the table via links 838 | # To download tables, we need the content URL and the number of tables 839 | content_url = soup.find('meta', {'name': 'citation_fulltext_html_url'})['content'] 840 | 841 | 
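# Assumption based on current SpringerLink markup: each table is accompanied by a 'Full size table' link, so counting those spans gives the number of tables to download below.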
n_tables = len(soup.find_all('span', string='Full size table')) 842 | logger.info(f"Found {n_tables} tables.") 843 | # Now download each table and parse it 844 | tables = [] 845 | for i in range(n_tables): 846 | t_num = i + 1 847 | url = '%s/tables/%d' % (content_url, t_num) 848 | table_soup = self._download_table(url) 849 | if not table_soup: 850 | continue 851 | tc = table_soup.find(class_='data last-table') 852 | t = self.parse_table(tc) 853 | if t: 854 | t.position = t_num 855 | 856 | # id_name is the id HTML element that cotains the title, label and table number that needs to be parse 857 | # temp_title sets it up to where the title can be parsed and then categorized 858 | id_name = f"table-{t_num}-title" 859 | temp_title = table_soup.find('h1', attrs={'id': id_name}).get_text().split() 860 | 861 | # grabbing the first two elements for the label and then making them a string object 862 | t.label = " ".join(temp_title[:2]) 863 | t.number = str(temp_title[1]) 864 | try: 865 | # grabbing the rest of the element for the caption/title of the table and then making them a string object 866 | t.caption = " ".join(temp_title[2:]) 867 | except: 868 | pass 869 | try: 870 | t.notes = table_soup.find(class_='c-article-table-footer').get_text() 871 | except: 872 | pass 873 | tables.append(t) 874 | 875 | self.article.tables = tables 876 | return self.article 877 | 878 | def parse_table(self, table): 879 | return super(SpringerSource, self).parse_table(table) 880 | 881 | def extract_doi(self, soup): 882 | try: 883 | return soup.find('meta', attrs={'name': "citation_doi"})['content'] 884 | except: 885 | return '' 886 | 887 | def extract_pmid(self, soup): 888 | return scrape.get_pmid_from_doi(self.extract_doi(soup)) 889 | 890 | 891 | class PMCSource(Source): 892 | def parse_article(self, html, pmid=None, **kwargs): 893 | soup = super(PMCSource, self).parse_article(html, pmid, **kwargs) 894 | if not soup: 895 | return False 896 | 897 | tables = [] 898 | table_containers = soup.findAll('div', {'class': 'table-wrap'}) 899 | logger.info(f"Found {len(table_containers)} tables.") 900 | for (i, tc) in enumerate(table_containers): 901 | sub_tables = tc.findAll('div', {'class': 'xtable'}) 902 | for st in sub_tables: 903 | t = self.parse_table(st) 904 | if t: 905 | t.position = i + 1 906 | t.label = tc.find('h3').text if tc.find('h3') else None 907 | t.number = t.label.split(' ')[-1].strip() if t.label else None 908 | try: 909 | t.caption = tc.find({"div": {"class": "caption"}}).text 910 | except: 911 | pass 912 | try: 913 | t.notes = tc.find('div', class_='tblwrap-foot').text 914 | except: 915 | pass 916 | tables.append(t) 917 | 918 | self.article.tables = tables 919 | return self.article 920 | 921 | def extract_pmid(self, soup): 922 | return soup.find('meta', {'name': 'citation_pmid'})['content'] 923 | 924 | def extract_doi(self, soup): 925 | return soup.find('meta', {'name': 'citation_doi'})['content'] 926 | -------------------------------------------------------------------------------- /ace/sources/Frontiers.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Frontiers", 3 | "identifiers": [ 4 | "10.3389" 5 | ], 6 | "entities": { 7 | "−": "-", 8 | " ": " " 9 | } 10 | } -------------------------------------------------------------------------------- /ace/sources/HighWire.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "HighWire", 3 | "identifiers": [ 4 | "highwire-journal", 5 | 
"http://schema.highwire.org/Linking", 6 | "highwire-journal-article" 7 | ], 8 | "entities": { 9 | }, 10 | "delay": 10 11 | } 12 | -------------------------------------------------------------------------------- /ace/sources/JournalOfCognitiveNeuroscience.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "JournalOfCognitiveNeuroscience", 3 | "identifiers": [ 4 | "property=\"og:site_name\" content=\"MIT Press\"", 5 | "MIT Press Journals - Journal of Cognitive Neuroscience - Full Text" 6 | ], 7 | "entities": { 8 | "\u2002": " " 9 | }, 10 | "delay": 10 11 | } 12 | -------------------------------------------------------------------------------- /ace/sources/OUP.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "OUP", 3 | "identifiers": [ 4 | "OUP Academic" 5 | ], 6 | "entities": { 7 | }, 8 | "delay": 10 9 | } -------------------------------------------------------------------------------- /ace/sources/OldSpringer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "OldSpringer", 3 | "identifiers": [ 4 | "- Springer" 5 | ], 6 | "entities": { 7 | 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /ace/sources/PMC.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "PMC", 3 | "identifiers": [ 4 | "", 5 | "" 6 | ], 7 | "entities": { 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /ace/sources/Plos.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Plos", 3 | "identifiers": [ 4 | "Public Library of Science" 5 | ], 6 | "entities": { 7 | } 8 | } -------------------------------------------------------------------------------- /ace/sources/Sage.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Sage", 3 | "identifiers": [ 4 | "" 5 | ], 6 | "entities": { 7 | 8 | } 9 | } -------------------------------------------------------------------------------- /ace/sources/ScienceDirect.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ScienceDirect", 3 | "identifiers": [ 4 | "title=\"ScienceDirect -The world's leading full-text scientific database\"", 5 | "- ScienceDirect" 6 | ], 7 | "entities": { 8 | 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /ace/sources/Springer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Springer", 3 | "identifiers": [ 4 | "", 5 | "meta property=\"og:site_name\" content=\"SpringerLink\"" 6 | ], 7 | "entities": { 8 | 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /ace/sources/Wiley.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Wiley", 3 | "identifiers": [ 4 | "Wiley Online Library" 5 | ], 6 | "entities": { 7 | 8 | } 9 | } -------------------------------------------------------------------------------- /ace/tableparser.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # use unicode everywhere 3 | 4 | # import database 5 | import regex # Note: we're using features in the new regex module, not re! 6 | import logging 7 | from . 
import config 8 | from .database import Activation, Table 9 | from collections import Counter, defaultdict 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def identify_standard_columns(labels): 16 | ''' Takes a set of column labels and returns an equal-length list with names 17 | of any standard columns detected. Unknown columns are assigned None. 18 | E.g., passing in ['p value', 'brain region', 'unknown_col'] would return 19 | ['p_value', 'region', None]. 20 | ''' 21 | standardized = [None] * len(labels) 22 | found_coords = False 23 | for i, lab in enumerate(labels): 24 | if regex.search('(^\s*ba$)|brodmann', lab): 25 | s = 'ba' 26 | elif regex.search('region|anatom|location|area', lab): 27 | s = 'region' 28 | elif regex.search('sphere|(^\s*h$)|^\s*hem|^\s*side', lab): 29 | s = 'hemisphere' 30 | elif regex.search('(^k$)|(mm.*?3)|volume|voxels|size|extent', lab): 31 | s = 'size' 32 | elif regex.match('\s*[xy]\s*$', lab): 33 | found_coords = True 34 | s = lab 35 | elif regex.match('\s*z\s*$', lab): 36 | # For z, we need to distinguish z plane from z-score. 37 | # Use simple heuristics: 38 | # * If no 'x' column exists, this must be a z-score 39 | # * If the preceding label was anything but 'y', must be a z-score 40 | # * Otherwise it's a z coordinate 41 | # Note: this could theoretically break if someone has non-contiguous 42 | # x-y-z columns, but this seems unlikely. If it does happen, 43 | # an alternative approach would be to check if the case of the 'z' column 44 | # matches the case of the 'x' column and make determination that 45 | # way. 46 | s = 'statistic' if not found_coords or labels[i - 1] != 'y' else 'z' 47 | elif regex.search('rdinate', lab): 48 | continue 49 | elif lab == 't' or regex.search('^(max.*(z|t).*|.*(z|t).*(score|value|max))$', lab): 50 | s = 'statistic' 51 | elif regex.search('p[\-\s]+.*val', lab): 52 | s = 'p_value' 53 | else: 54 | s = None 55 | standardized[i] = s 56 | return standardized 57 | 58 | 59 | def identify_repeating_groups(labels): 60 | ''' Identify groups: any sets of columns where names repeat. 61 | Repeating groups must be contiguous; i.e., [x, y, z, w, x, y, z, f] 62 | will not match, but [w, f, x, y, z, x, y, z] will. 63 | 64 | Note that this will only handle one level of repetition; i.e., 65 | hierarchical groupings will be ignored. E.g., in a 2 x 2 x 3 66 | nesting of columns like hemisphere --> condition --> x/y/z, 67 | only the 4 sets of repeating x/y/z columns will be detected. 68 | 69 | Returns a list of strings made up of the index of the first column 70 | in the group and the number of columns. E.g., '1/3' indicates the 71 | group starts at the second column and contains 3 columns. These 72 | keys can be used to directly look up names stored in a 73 | multicolumn_label dictionary. 
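For example (illustrative), labels ['region', 'x', 'y', 'z', 'x', 'y', 'z'] contain two repeating x/y/z groups and would yield ['1/3', '4/3'], i.e. three-column groups starting at the second and fifth columns.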
74 | ''' 75 | # OLD ALGORITHM: MUCH SIMPLER AND FASTER BUT DOESN'T WORK PROPERLY 76 | # FOR NON-CONTIGUOUS COLUMN GROUPS 77 | # target = '###'.join(unicode(x) for x in labels) 78 | # pattern = regex.compile(r'(.+?###.+?)(###\1)+') 79 | # matches = pattern.finditer(target) 80 | # groups = [] 81 | # for m in matches: 82 | # sp = m.span() 83 | # n_cols_in_group = len(m.group(1).split('###')) 84 | # start = len(target[0:sp[0]].split('###'))-1 85 | # n_matches = len(m.group(0).split('###')) 86 | # for i in range(n_matches/n_cols_in_group): 87 | # groups.append('%d/%d' % ((i*n_cols_in_group)+start, n_cols_in_group)) 88 | # return list(set(groups)) 89 | 90 | groups = [] 91 | n_labels = len(labels) 92 | label_counts = Counter(labels) 93 | rep_labels = set([k for k, v in list(label_counts.items()) if v > 1]) 94 | # Track multi-label sequences. Key/value = sequence/onset 95 | label_seqs = defaultdict(list) 96 | 97 | # Loop over labels and identify any sequences made up entirely of labels with 98 | # 2 or more occurrences in the list and without the starting label repeating. 99 | for i, lab in enumerate(labels): 100 | if lab not in rep_labels: 101 | continue 102 | current_seq = [lab] 103 | for j in range(i+1, n_labels): 104 | lab_j = labels[j] 105 | if lab_j not in rep_labels or lab_j == lab: 106 | break 107 | current_seq.append(lab_j) 108 | if len(current_seq) > 1: 109 | label_seqs['###'.join(current_seq)].append(i) 110 | 111 | # Keep only sequences that occur two or more times 112 | label_seqs = { k: v for k, v in list(label_seqs.items()) if len(v) > 1} 113 | 114 | # Invert what's left into a list where the sequence occurs at its start pos 115 | seq_starts = [None] * n_labels 116 | for k, v in list(label_seqs.items()): 117 | for start in v: 118 | seq_starts[start] = k.split('###') 119 | 120 | # Create boolean array to track whether each element has already been used 121 | labels_used = [False] * n_labels 122 | 123 | # Loop through labels and add a group if we find a sequence that starts at 124 | # the current position and spans at least one currently unused cell. 125 | # This is necessary to account for cases where one sequence isn't always 126 | # part of the same supersequence, e.g., the y/z in x/y/z could also be a 127 | # part of a/y/z or b/y/z. 128 | for i, lab in enumerate(labels): 129 | if seq_starts[i] is not None: 130 | seq_size = len(seq_starts[i]) 131 | if not all(labels_used[i:(i+seq_size)]): 132 | labels_used[i:(i+seq_size)] = [True] * seq_size 133 | 134 | # We need to make sure the group contains x/y/z information, 135 | # otherwise we'll end up duplicating a lot of activations. 136 | # This is not a very good place to put this check; eventually 137 | # we need to refactor much of this class. 138 | groups.append('%d/%d' % (i, seq_size)) 139 | 140 | return groups 141 | 142 | 143 | 144 | def create_activation(data, labels, standard_cols, group_labels=[]): 145 | 146 | activation = Activation() 147 | 148 | for i, col in enumerate(data): 149 | 150 | # Replace unicode minus signs with hyphens 151 | replace = ['֊', '‐', '‑', '⁃', '﹣', '-', '‒', '–', '—', '﹘', '−', '-'] 152 | for c in replace: 153 | if c in col: 154 | col = col.replace(c, '-') 155 | col = col.replace(c + c, '-') 156 | 157 | # Cast to integer or float if appropriate 158 | # if regex.match('[-\d]+$', col): 159 | # col = int(col) 160 | # elif regex.match('[-\d\.]+$', col): 161 | # col = float(col) 162 | 163 | # Set standard attributes if applicable and do validation where appropriate. 
164 | # Generally, validation will not prevent a bad value from making it into the 165 | # activation object, but it will flag any potential issues using the "problem" column. 166 | if standard_cols[i] is not None: 167 | 168 | sc = standard_cols[i] 169 | 170 | # Validate XYZ columns: Should only be integers (and possible trailing decimals). 171 | # If they're not, keep only leading numbers. The exception is that ScienceDirect 172 | # journals often follow the minus sign with a space (e.g., - 35), which we strip. 173 | if regex.match('[xyz]$', sc): 174 | m = regex.match('([-])\s?(\d+\.*\d*)$', col) 175 | if m: 176 | col = "%s%s" % (m.group(1), m.group(2)) 177 | if not regex.match('([-]*\d+)\.*\d*$', col): 178 | logging.debug("Value %s in %s column is not valid" % (col, sc)) 179 | activation.problems.append("Value in %s column is not valid" % sc) 180 | return activation 181 | col = (float(col)) 182 | 183 | elif sc == 'region': 184 | if not regex.search('[a-zA-Z]', col): 185 | logging.debug("Value in region column is not a string") 186 | activation.problems.append("Value in region column is not a string") 187 | 188 | setattr(activation, sc, col) 189 | 190 | # Always include all columns in record 191 | activation.add_col(labels[i], col) 192 | 193 | # Handle columns with multiple coordinates (e.g., 45;12;-12). 194 | # Assume that any series of 3 numbers in a non-standard column 195 | # reflects coordinates. Will fail if there are leading numbers!!! 196 | # Also need to remove space between minus sign and numbers; some ScienceDirect 197 | # journals leave a gap. 198 | if not i in standard_cols: 199 | cs = '([-]?\d{1,3}\.?\d{0,2})' 200 | clean_col = regex.sub(r'(? for a new group, an entire is used 10 | http://www.sciencedirect.com/science/article/pii/S1053811911007609 11 | 12 | # Doesn't detect x/y/z in Table 1 correctly 13 | http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0068494 14 | -------------------------------------------------------------------------------- /examples/create_db_and_add_articles.py: -------------------------------------------------------------------------------- 1 | # In this example we create a new DB file and process a bunch of 2 | # articles. Note that due to copyright restrictions, articles can't 3 | # be included in this package, so you'll need to replace PATH_TO_FILES 4 | # with something that works. 5 | 6 | from ace import database 7 | from ace.ingest import add_articles 8 | 9 | # Uncomment the next line to seem more information 10 | # ace.set_logging_level('info') 11 | 12 | # Change this to a valid path to a set of html files. 13 | PATH_TO_FILES = "/home/zorro/neurosynth_scrape/articles/html/Neuroimage/*" 14 | 15 | db = database.Database(adapter='sqlite', db_name='sqlite:///example_db.db') 16 | add_articles(db, PATH_TO_FILES, pmid_filenames=True) 17 | db.print_stats() 18 | -------------------------------------------------------------------------------- /examples/fetch_articles_from_pubmed.py: -------------------------------------------------------------------------------- 1 | """ Query PubMed for results from several journals, and save to file. 2 | The resulting directory can then be passed to the Database instance for 3 | extraction, as in the create_db_and_add_articles example. 4 | NOTE: selenium must be installed and working properly for this to work. 5 | Code has only been tested with the Chrome driver. 
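Each entry in the journals dict below maps a journal name to keyword arguments that are passed straight through to Scraper.retrieve_journal_articles() (e.g., delay, mode, search, min_pmid).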
""" 6 | 7 | from ace.scrape import Scraper 8 | import ace 9 | import os 10 | 11 | 12 | journals = { 13 | 'Neuroimage': { 14 | 'delay': 20, # Mean delay between article downloads--prevents the banhammer 15 | 'mode': 'browser', # ScienceDirect journals require selenium to work properly 16 | 'search': 'fmri', # Only retrieve articles with this string in abstract 17 | 'min_pmid': 34447833, # Start from this PMID--can run incrementally 18 | } 19 | } 20 | 21 | # Verbose output 22 | ace.set_logging_level('debug') 23 | 24 | # Create temporary output dir 25 | output_dir = '/tmp/articles' 26 | if not os.path.exists(output_dir): 27 | os.makedirs(output_dir) 28 | 29 | # Initialize Scraper 30 | scraper = Scraper('/tmp/articles') 31 | 32 | # Loop through journals and 33 | for j, settings in list(journals.items()): 34 | scraper.retrieve_journal_articles(j, skip_pubmed_central=True, **settings) 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /requirements.dev.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-recording 3 | vcrpy 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | regex 3 | requests 4 | simplejson 5 | sqlalchemy 6 | selenium 7 | seleniumbase 8 | tqdm 9 | xmltodict 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | # Borrowing a trick from nibabel 5 | if len(set(('test', 'easy_install', 'develop')).intersection(sys.argv)) > 0: 6 | import setuptools 7 | 8 | from distutils.core import setup 9 | 10 | extra_setuptools_args = {} 11 | if 'setuptools' in sys.modules: 12 | extra_setuptools_args = dict( 13 | tests_require=['nose'], 14 | test_suite='nose.collector', 15 | extras_require=dict( 16 | test='nose>=0.10.1') 17 | ) 18 | 19 | # fetch version from within ACE module 20 | with open(os.path.join('ace', 'version.py')) as f: 21 | exec(f.read()) 22 | 23 | setup(name="ace", 24 | version=__version__, 25 | description="Automated Coordinate Extraction", 26 | maintainer='Tal Yarkoni', 27 | maintainer_email='tyarkoni@gmail.com', 28 | url='http://github.com/neurosynth/ace', 29 | packages=["ace", 30 | "ace.tests"], 31 | package_data={'ace': ['sources/*'], 32 | 'ace.tests': ['data/*'] 33 | }, 34 | **extra_setuptools_args 35 | ) 36 | --------------------------------------------------------------------------------