├── .github
│   └── workflows
│       └── test.yml
├── .gitignore
├── LICENSE
├── README.md
├── ace
│   ├── .vscode
│   │   └── settings.json
│   ├── __init__.py
│   ├── config.py
│   ├── database.py
│   ├── datatable.py
│   ├── evaluate.py
│   ├── export.py
│   ├── extract.py
│   ├── ingest.py
│   ├── label.py
│   ├── scrape.py
│   ├── sources.py
│   ├── sources
│   │   ├── Frontiers.json
│   │   ├── HighWire.json
│   │   ├── JournalOfCognitiveNeuroscience.json
│   │   ├── OUP.json
│   │   ├── OldSpringer.json
│   │   ├── PMC.json
│   │   ├── Plos.json
│   │   ├── Sage.json
│   │   ├── ScienceDirect.json
│   │   ├── Springer.json
│   │   └── Wiley.json
│   ├── tableparser.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── cassettes
│   │   │   └── test_ace
│   │   │       ├── test_brain_research_source.yaml
│   │   │       ├── test_cerebral_cortex_source.yaml
│   │   │       ├── test_database_processing_stream.yaml
│   │   │       ├── test_frontiers_source.yaml
│   │   │       ├── test_journal_scraping.yaml
│   │   │       ├── test_neuropsychologia_source.yaml
│   │   │       ├── test_plos_source.yaml
│   │   │       ├── test_pmc_source.yaml
│   │   │       ├── test_science_direct_source.yaml
│   │   │       └── test_springer_source.yaml
│   │   ├── data
│   │   │   ├── brain.html
│   │   │   ├── cerebral_cortex.html
│   │   │   ├── cognition.html
│   │   │   ├── frontiers.html
│   │   │   ├── jcogneuro.html
│   │   │   ├── plosone.html
│   │   │   ├── pmc.html
│   │   │   ├── springer.html
│   │   │   └── wiley.html
│   │   ├── different_data
│   │   │   ├── 14715131.html
│   │   │   ├── 15028641.html
│   │   │   ├── 15342430.html
│   │   │   └── 18242723.html
│   │   ├── test_ace.py
│   │   └── weird_data
│   │       ├── 11532885.html
│   │       ├── 12417470.html
│   │       ├── 15716157.html
│   │       ├── 18439804.html
│   │       ├── 18760263.html
│   │       ├── 20159144.html
│   │       ├── 22695256.html
│   │       ├── 23813017.html
│   │       ├── 26021218.html
│   │       ├── 26696806.html
│   │       ├── 28432782.html
│   │       ├── 29366950.html
│   │       ├── 36196770.html
│   │       └── 38990127.html
│   ├── utils.py
│   └── version.py
├── example_tables.txt
├── examples
│   ├── create_db_and_add_articles.py
│   └── fetch_articles_from_pubmed.py
├── requirements.dev.txt
├── requirements.txt
└── setup.py
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Install and Test
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 | pull_request:
8 | branches:
9 | - master
10 |
11 | concurrency:
12 | group: testing-${{ github.ref }}
13 | cancel-in-progress: true
14 |
15 | jobs:
16 | test:
17 | runs-on: ubuntu-latest
18 |
19 | steps:
20 | - name: Checkout code
21 | uses: actions/checkout@v2
22 |
23 | - name: Set up Python
24 | uses: actions/setup-python@v4
25 | with:
26 | python-version: '3.8'
27 |
28 | - name: Install dependencies
29 | run: |
30 | pip install -r requirements.txt
31 | pip install -r requirements.dev.txt
32 | pip install -e .
33 |
34 | - name: Test with pytest
35 | run: pytest
36 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.DS_Store
3 | *~
4 | build/
5 | dist
6 | dist/*
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Permission is hereby granted, free of charge, to any person obtaining a copy
2 | of this software and associated documentation files (the "Software"), to deal
3 | in the Software without restriction, including without limitation the rights
4 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
5 | copies of the Software, and to permit persons to whom the Software is
6 | furnished to do so, subject to the following conditions:
7 |
8 | The above copyright notice and this permission notice shall be included in
9 | all copies or substantial portions of the Software.
10 |
11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
17 | THE SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # What is ACE?
3 |
4 | ACE stands for Automated Coordinate Extraction. It's a Python package for automated extraction of functional MRI activation data from the tables of published neuroimaging articles. ACE is actually ACE2; a long, long time ago in a faraway land there was a clunkier Ruby version of ACE that did more or less the same thing much more poorly. Thankfully, Ruby ACE has now been disappeared from the internets forever, leaving us with the slightly better thought out package you see here.
5 |
6 | ## Installation
7 |
8 | Install the package from source:
9 |
10 | > python setup.py install
11 |
12 | Make sure you have all the dependencies installed (see requirements.txt).
13 |
14 | That's all!
15 |
16 | ## Usage
17 |
18 | For now, take a look at the tests to get a sense of how things work. A quickstart guide will fill this space in the near future.
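19 | 
20 | In the meantime, here's a minimal sketch of a typical workflow (paths are placeholders; see the `examples/` directory for fuller scripts):
21 | 
22 | ```python
23 | from ace import database, ingest, export
24 | 
25 | # Create (or connect to) a local SQLite database
26 | db = database.Database(adapter='sqlite', db_name='sqlite:///ace.db')
27 | 
28 | # Ingest previously downloaded article HTML files
29 | ingest.add_articles(db, '/path/to/articles/*.html', pmid_filenames=True)
30 | 
31 | # Summarize and export the extracted data
32 | db.print_stats()
33 | export.export_database(db, '/path/to/output')
34 | ```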
--------------------------------------------------------------------------------
/ace/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.testing.pytestArgs": [
3 | "tests"
4 | ],
5 | "python.testing.unittestEnabled": false,
6 | "python.testing.pytestEnabled": true
7 | }
--------------------------------------------------------------------------------
/ace/__init__.py:
--------------------------------------------------------------------------------
1 | # emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*-
2 | # ex: set sts=4 ts=4 sw=4 et:
3 | """ACE -- Automated Coordinate Extraction.
4 | """
5 | __all__ = ["config", "ingest", "database", "datatable", "set_logging_level", "scrape", "sources", "tableparser", "tests", "__version__"]
6 |
7 | import logging
8 | import sys
9 | import os
10 |
11 | from .version import __version__
12 |
13 | def set_logging_level(level=None):
14 | """Set package-wide logging level
15 |
16 | Args:
17 | level : Name of a logging level from the logging module ('warning', 'error', 'info', etc.), passed as a string.
18 | """
19 | if level is None:
20 | level = os.environ.get('ACE_LOGLEVEL', 'warn')
21 | logger.setLevel(getattr(logging, level.upper()))
22 | return logger.getEffectiveLevel()
23 |
24 | def _setup_logger(logger):
25 | # Basic logging setup
26 | console = logging.StreamHandler(sys.stdout)
27 | console.setFormatter(logging.Formatter("%(levelname)-6s %(module)-7s %(message)s"))
28 | logger.addHandler(console)
29 | set_logging_level()
30 |
31 | # Set up logger
32 | logger = logging.getLogger("ace")
33 | _setup_logger(logger)
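34 | 
35 | # Illustrative usage sketch: the level can be set via the ACE_LOGLEVEL environment
36 | # variable, or explicitly, e.g.:
37 | #
38 | #   import ace
39 | #   ace.set_logging_level('info')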
--------------------------------------------------------------------------------
/ace/config.py:
--------------------------------------------------------------------------------
1 | ''' GLOBAL SETTINGS '''
2 |
3 | # When True, all Exceptions will be suppressed. When False, Exception
4 | # messages will be printed out.
5 | SILENT_ERRORS = False
6 |
7 |
8 | ''' DATABASE SETTINGS '''
9 | # Adapter to use--either 'mysql' or 'sqlite'
10 | SQL_ADAPTER = 'mysql'
11 |
12 | # SQLite path (when using sqlite adapter)
13 | SQLITE_URI = 'sqlite:///ace.db'
14 |
15 | # MySQL configuration
16 | MYSQL_USER = 'ace'
17 | MYSQL_PASSWORD = 'CHANGEME'
18 | MYSQL_DB = 'ace_test'
19 |
20 | # When True, any processed articles will be saved to DB, whether or not they
21 | # contain any extracted activations. When False, only articles from which
22 | # at least one activation was extracted will be saved. Note that if this is set
23 | # to False, processing will be much slower, since every article not already in
24 | # the DB will be parsed, even if it contains no activations and has been
25 | # previously processed.
26 | SAVE_ARTICLES_WITHOUT_ACTIVATIONS = True
27 |
28 | # By default, ACE will ignore any articles that already exist in the DB
29 | # when processing new HTML files. If OVERWRITE is set to True, ACE will
30 | # always overwrite existing records. This is useful when the extraction
31 | # code has improved substantially and you want to re-extract all data,
32 | # but should otherwise be left off for the sake of efficiency.
33 | OVERWRITE_EXISTING_ROWS = False
34 |
35 |
36 | ''' SOURCE PROCESSING SETTINGS '''
37 |
38 | # If True, will exercise greater care when parsing (e.g., when estimating
39 | # number of columns in table, will check every row in the table and take the
40 | # max instead of just checking the first row). This is generally desirable,
41 | # but will result in slower processing.
42 | CAREFUL_PARSING = True
43 |
44 | # Sometimes tables have rows that can't be processed--usually because of malformed
45 | # HTML or XML (e.g., failure to close a <tr> tag). Such problems will always be
46 | # logged, but if IGNORE_BAD_ROWS is True, the row will be skipped and execution
47 | # will continue gracefully. When False, any errors will be re-raised,
48 | # terminating execution.
49 | IGNORE_BAD_ROWS = True
50 |
51 | # Whether or not to ignore tables that appear to be missing a label for at
52 | # least one column. This doesn't happen much, and in practice most tables with
53 | # missing labels appear to genuinely have empty columns that are ignored
54 | # anyway, so this should be left off unless problems arise.
55 | EXCLUDE_TABLES_WITH_MISSING_LABELS = False
56 |
57 |
58 |
59 |
60 | ''' SCRAPING/PARSING SETTINGS '''
61 | USER_AGENTS = [
62 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',
63 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',
64 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36',
65 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',
66 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36',
67 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',
68 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',
69 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36',
70 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36',
71 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36',
72 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36',
73 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36',
74 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36',
75 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36',
76 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36',
77 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',
78 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36',
79 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36',
80 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36',
81 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36',
82 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36',
83 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36',
84 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.128 Safari/537.36',
85 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36',
86 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.60 Safari/537.36'
87 | ]
88 |
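89 | # Illustrative usage sketch: these module-level settings can be overridden at
90 | # runtime before any processing begins, e.g.:
91 | #
92 | #   from ace import config
93 | #   config.SQL_ADAPTER = 'sqlite'
94 | #   config.SAVE_ARTICLES_WITHOUT_ACTIVATIONS = False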
--------------------------------------------------------------------------------
/ace/database.py:
--------------------------------------------------------------------------------
1 | # Database stuff and models
2 |
3 | from sqlalchemy import (TypeDecorator, Table, Column, Integer, Float, String, Boolean,
4 | ForeignKey, DateTime, Text)
5 | from sqlalchemy.orm import relationship, backref, sessionmaker
6 | from sqlalchemy import create_engine
7 | from sqlalchemy.ext.declarative import declarative_base
8 | from sqlalchemy.ext.associationproxy import association_proxy
9 | from sqlalchemy.dialects.mysql import MEDIUMTEXT
10 | from sqlalchemy.sql import exists
11 | from datetime import datetime
12 | import simplejson as json
13 | import logging
14 | import sys
15 | from os import path
16 | import datetime
17 |
18 | from . import config
19 | from . import extract
20 |
21 | logger = logging.getLogger(__name__)
22 |
23 | Base = declarative_base()
24 |
25 | # Backend-dependent column for full text
26 | LongText = Text().with_variant(MEDIUMTEXT, 'mysql')
27 |
28 | # Handles all Database loading/saving stuff
29 | class Database:
30 |
31 | def __init__(self, adapter=None, db_name=None, user=None, password=None):
32 | ''' Connect to DB and initialize instance. '''
33 |
34 | # Default to settings in config file if none passed
35 | if adapter is None: adapter = config.SQL_ADAPTER
36 |
37 | # Generate DB URI
38 | if adapter == 'sqlite':
39 | db_uri = config.SQLITE_URI if db_name is None else db_name
40 | elif adapter == 'mysql':
41 | db_name = config.MYSQL_DB if db_name is None else db_name
42 | if user is None: user = config.MYSQL_USER
43 | if password is None: password = config.MYSQL_PASSWORD
44 | db_uri = 'mysql://%s:%s@localhost/%s' % (user, password, db_name)
45 | else:
46 | raise ValueError("Value of SQL_ADAPTER in settings must be either 'sqlite' or 'mysql'")
47 |
48 | engine = create_engine(db_uri, echo=False, connect_args={'timeout': 15})
49 |
50 | if adapter == 'mysql': engine.execute("SET sql_mode=''")
51 |
52 | Session = sessionmaker(bind=engine)
53 | Base.metadata.create_all(engine)
54 | self.session = Session()
55 |
56 | def add(self, record):
57 | ''' Add a record to the DB. '''
58 | self.session.add(record)
59 |
60 | def save(self):
61 | ''' Commit all stored records to file. '''
62 | self.session.commit()
63 | # except Exception as err:
64 | # print(err)
65 |
66 | def delete_article(self, pmid):
67 | article = self.session.query(Article).filter_by(id=pmid).first()
68 | self.session.delete(article)
69 | self.session.commit()
70 |
71 | def print_stats(self):
72 | ''' Summarize the current state of the DB. '''
73 | n_articles = self.session.query(Article).count()
74 | n_articles_with_coordinates = self.session.query(Article).join(Table).filter(Table.n_activations>0).distinct('article_id').count()
75 | n_tables = self.session.query(Table).count()
76 | n_activations = self.session.query(Activation).count()
77 | n_links = self.session.query(NeurovaultLink).count()
78 | n_articles_with_links = self.session.query(NeurovaultLink).distinct('article_id').count()
79 | print(f"The database currently contains: {n_articles} articles.\n"
80 | f"{n_articles_with_coordinates} have coordinates, and {n_articles_with_links} have NeuroVault links.\n"
81 | f"Total of {n_tables} tables, {n_activations} activations and {n_links} NeuroVault links.")
82 |
83 | def article_exists(self, pmid):
84 | ''' Check if an article already exists in the database. '''
85 | return self.session.query(exists().where(Article.id==pmid)).scalar()
86 |
87 | @property
88 | def articles(self):
89 | return self.session.query(Article).all()
90 |
91 | # Create a JSONString column type for convenience
92 | class JsonString(TypeDecorator):
93 | impl = Text
94 |
95 | def process_result_value(self, value, dialect):
96 | if value is None:
97 | return None
98 | else:
99 | return json.loads(value)
100 |
101 | def process_bind_param(self, value, dialect):
102 | if value is None:
103 | return None
104 | else:
105 | return json.dumps(value)
106 |
107 |
108 | class Article(Base):
109 |
110 | __tablename__ = 'articles'
111 |
112 | id = Column(Integer, primary_key=True)
113 | title = Column(String(200))
114 | text = Column(LongText)
115 | journal = Column(String(200))
116 | space = Column(String(20))
117 | publisher = Column(String(200))
118 | doi = Column(String(200))
119 | year = Column(Integer)
120 | authors = Column(Text)
121 | abstract = Column(Text)
122 | citation = Column(Text)
123 | pubmed_metadata = Column(JsonString)
124 | created_at = Column(DateTime, default=datetime.datetime.utcnow)
125 | updated_at = Column(DateTime, default=datetime.datetime.utcnow,
126 | onupdate=datetime.datetime.utcnow)
127 |
128 | tables = relationship('Table', cascade="all, delete-orphan",
129 | backref='article')
130 |
131 | neurovault_links = relationship('NeurovaultLink', cascade="all, delete-orphan",
132 | backref='article')
133 |
134 | features = association_proxy('tags', 'feature')
135 |
136 | def __init__(self, text, pmid=None, doi=None, metadata=None):
137 | self.id = int(pmid)
138 | self.text = text
139 | self.space = extract.guess_space(text)
140 | self.doi = doi
141 | self.pubmed_metadata = metadata
142 | self.update_from_metadata()
143 |
144 | def update_from_metadata(self):
145 | if self.pubmed_metadata is not None:
146 | pmd = self.pubmed_metadata
147 | self.title = pmd['title']
148 | self.journal = pmd['journal']
149 | self.pubmed_metadata = pmd
150 | self.year = pmd['year']
151 | self.authors = pmd['authors']
152 | self.abstract = pmd['abstract']
153 | self.citation = pmd['citation']
154 | self.doi = pmd['doi']
155 |
156 |
157 | class Table(Base):
158 |
159 | __tablename__ = 'tables'
160 |
161 | id = Column(Integer, primary_key=True)
162 | article_id = Column(Integer, ForeignKey('articles.id'))
163 | activations = relationship('Activation', cascade="all, delete-orphan",
164 | backref='table')
165 | position = Column(Integer) # The serial position of occurrence
166 | number = Column(String(10)) # The stated table ID (e.g., 1, 2b)
167 | label = Column(String(200)) # The full label (e.g., Table 1, Table 2b)
168 | caption = Column(Text)
169 | notes = Column(Text)
170 | n_activations = Column(Integer)
171 | n_columns = Column(Integer)
172 |
173 | def finalize(self):
174 | ''' Any cleanup and updating operations we need to do before saving. '''
175 |
176 | # # Remove duplicate activations--most commonly produced by problems with
177 | # # the grouping code.
178 | # act_defs = set()
179 | # to_keep = []
180 | # for a in self.activations:
181 | # definition = json.dumps([a.x, a.y, a.z, a.groups])
182 | # if definition not in act_defs:
183 | # act_defs.add(definition)
184 | # to_keep.append(a)
185 | # self.activations = to_keep
186 |
187 | self.n_activations = len(self.activations)
188 |
189 |
190 | class Activation(Base):
191 |
192 | __tablename__ = 'activations'
193 |
194 | id = Column(Integer, primary_key=True)
195 |
196 | article_id = Column(Integer, ForeignKey('articles.id'))
197 | table_id = Column(Integer, ForeignKey('tables.id'))
198 | columns = Column(JsonString)
199 | groups = Column(JsonString)
200 | problems = Column(JsonString)
201 | x = Column(Float)
202 | y = Column(Float)
203 | z = Column(Float)
204 | number = Column(Integer)
205 | region = Column(String(100))
206 | hemisphere = Column(String(100))
207 | ba = Column(String(100))
208 | size = Column(String(100))
209 | statistic = Column(String(100))
210 | p_value = Column(String(100))
211 |
212 | missing_source = Column(Boolean, default=False)
213 |
214 | def __init__(self):
215 | self.problems = []
216 | self.columns = {}
217 |
218 | def set_coords(self, x, y, z):
219 | new_xyz = []
220 | for c in [x, y, z]:
221 | if c == '' or c is None:
222 | c = None
223 | else:
224 | c = c.replace(' ', '').replace('--', '-').rstrip('.')
225 | c = float(c)
226 | new_xyz.append(c)
227 |
228 | self.x, self.y, self.z = new_xyz
229 |
230 | def add_col(self, key, val):
231 | self.columns[key] = val
232 |
233 | # Validates Peak. Considers peak invalid if:
235 | # * At least one of X, Y, Z is None or missing
235 | # * Any |coordinate| > 100
236 | # * Two or more columns are zeroes (most of the time this
237 | # will indicate a problem, but occasionally a real coordinate)
238 | # Depending on config, either excludes peak, or allows it through
239 | # but flags potential problems for later inspection.
240 | def validate(self):
241 |
242 | for c in [self.x, self.y, self.z]:
243 | if c == '' or c is None:
244 | logger.debug('Missing x, y, or z coordinate information: [%s, %s, %s]' % tuple(
245 | [str(e) for e in [self.x, self.y, self.z]]))
246 | return False
247 | try:
248 | if abs(c) >= 100:
249 | logger.debug(
250 | 'Invalid coordinates: at least one dimension (x,y,z) >= 100.')
251 | return False
252 | except:
253 | print(c)
254 | print(sys.exc_info()[0])
255 | raise
256 |
257 | sorted_xyz = sorted([abs(self.x), abs(self.y), abs(self.z)])
258 | if sorted_xyz[0] == 0 and sorted_xyz[1] == 0:
259 | logger.debug(
260 | "At least two dimensions have value == 0; coordinate is probably not real.")
261 | return False
262 |
263 | return True
264 |
265 | class NeurovaultLink(Base):
266 |
267 | __tablename__ = 'Neurovaultlinks'
268 |
269 | id = Column(Integer, primary_key=True, autoincrement=True)
270 | neurovault_id = Column(Integer)
271 | url = Column(String(100))
272 | type = Column(String(100))
273 |
274 | article_id = Column(Integer, ForeignKey('articles.id'))
275 |
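276 | 
277 | # Illustrative usage sketch (the DB URI is a placeholder):
278 | #
279 | #   db = Database(adapter='sqlite', db_name='sqlite:///ace.db')
280 | #   if not db.article_exists(12345678):
281 | #       ...  # parse an article via ace.sources, then db.add(article) and db.save()
282 | #   db.print_stats()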
--------------------------------------------------------------------------------
/ace/datatable.py:
--------------------------------------------------------------------------------
1 | import logging
2 | logger = logging.getLogger(__name__)
3 |
4 |
5 | class DataTable:
6 |
7 | ''' Simple class to represent the contents of an HTML table.
8 | Basically just a grid with array accessor methods and
9 | some extra validation. '''
10 |
11 | def __init__(self, n_rows, n_cols):
12 | self.data = [[None] * n_cols for n in range(n_rows)]
13 | # self.n_rows = n_rows
14 | self.n_cols = n_cols
15 |
16 | def __getitem__(self, inds):
17 | if isinstance(inds, int):
18 | inds = [inds]
19 | row = self.data[inds[0]]
20 | return row[inds[1]] if len(inds) > 1 else row
21 |
22 | def __setitem__(self, inds, v):
23 | self.data[inds[0]][inds[1]] = v
24 |
25 | def to_list(self):
26 | return self.data
27 |
28 | @property
29 | def n_rows(self):
30 | return len(self.data)
31 |
32 | def add_val(self, val, rows=1, cols=1):
33 | ''' Find next open position and add values to grid '''
34 |
35 | # Flatten list and find next open position
36 | flat = [item for l in self.data for item in l]
37 | flat_set = set(flat)
38 |
39 | if not None in flat_set:
40 | open_pos = self.n_rows * self.n_cols
41 | for i in range(rows):
42 | self.data.append([None] * self.n_cols)
43 |
44 | else:
45 | # This indexing operation consumes a lot of CPU time for large tables; need to refactor!
46 | open_pos = flat.index(None)
47 | ri = open_pos / self.n_cols
48 | if (ri + rows) > self.n_rows:
49 | for i in range(round((ri + rows)) - self.n_rows):
50 | self.data.append([None] * self.n_cols)
51 |
52 | ri = open_pos // self.n_cols
53 | ci = open_pos % self.n_cols
54 |
55 | if cols + ci > self.n_cols:
56 | cols = self.n_cols - ci
57 |
58 | for r in range(rows):
59 | for c in range(cols):
60 | if cols > 1:
61 | content = '@@%s@%d' % (
62 | val, cols) if c == 0 else '@@%s' % val
63 | else:
64 | content = val
65 | self[ri + r, ci + c] = content
66 |
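67 | # Illustrative usage sketch:
68 | #
69 | #   dt = DataTable(2, 3)
70 | #   dt.add_val('region')        # fills the next open cell, i.e. (0, 0)
71 | #   dt.add_val('x', cols=2)     # spans two columns, marked with the '@@' prefix
72 | #   dt[1, 0] = 'L IFG'
73 | #   print(dt.to_list())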
--------------------------------------------------------------------------------
/ace/evaluate.py:
--------------------------------------------------------------------------------
1 | """ Tools for evaluating the quality of extracted coordinates. """
2 |
3 | import matplotlib.pyplot as plt
4 | import pandas as pd
5 | import numpy as np
6 |
7 | def plot_xyz_histogram(database, bins=50):
8 | ''' Takes a database file as input and plots histograms for X/Y/Z coords. '''
9 | data = pd.read_csv(database,sep='\t')
10 | data[['x','y','z']].hist(bins=bins)
11 | plt.show()
12 |
13 |
14 | def proportion_integer_values(database):
15 | ''' Reports the proportion of non-integer values in the X/Y/Z columns of a database file.
16 | This should generally be close to 0--typically around 0.02 or so if everything
17 | is working properly. '''
18 | data = pd.read_csv(database,sep='\t')
19 | return 1 - data[['x','y','z']].apply(lambda x: np.mean(x == x.round()))
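20 | 
21 | # Illustrative usage sketch (the file name is a placeholder for a tab-separated
22 | # export with x/y/z columns):
23 | #
24 | #   plot_xyz_histogram('coordinates.txt')
25 | #   print(proportion_integer_values('coordinates.txt'))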
--------------------------------------------------------------------------------
/ace/export.py:
--------------------------------------------------------------------------------
1 | from .database import Article
2 | from sqlalchemy import func, or_
3 | import logging
4 | import csv
5 | from pathlib import Path
6 | import datetime
7 | import json
8 | from tqdm import tqdm
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 | def export_database(db, foldername, skip_empty=True):
13 | # Create folder if it doesn't exist
14 | foldername = Path(foldername)
15 | foldername.mkdir(parents=True, exist_ok=True)
16 |
17 | article_columns = ['pmid', 'doi', 'authors', 'title', 'journal', 'publication_year', 'coordinate_space']
18 | art_results = []
19 |
20 | coordinate_columns = ['pmid', 'table_id', 'table_label', 'table_caption', 'table_number',
21 | 'x', 'y', 'z', 'p_value', 'region', 'size', 'statistic', 'groups']
22 | coordinates = []
23 |
24 | text_columns = ['pmid', 'title' ,'abstract', 'body']
25 | texts = []
26 |
27 | nv_colls_col = ['pmid','collection_id']
28 | nv_colls = []
29 |
30 | nv_images_col = ['pmid','image_id']
31 | nv_images = []
32 |
33 | print("Exporting database to %s" % foldername)
34 |
35 | articles = db.session.query(Article)
36 | if skip_empty:
37 | articles = articles.filter(or_(Article.tables.any(), Article.neurovault_links.any()))
38 |
39 | for art in tqdm(articles):
40 | art_results.append([art.id, art.doi, art.authors, art.title, art.journal, art.year, art.space])
41 | texts.append([art.id, art.title, art.abstract, art.text])
42 |
43 | for t in art.tables:
44 | for p in t.activations:
45 | if t.number is None: t.number = ''
46 | if isinstance(p.groups, str):
47 | p.groups = [p.groups]
48 | elif p.groups is None:
49 | p.groups = []
50 | groups = '///'.join(p.groups)
51 |
52 | coordinates.append([art.id, t.id, t.label, t.caption, t.number,
53 | p.x, p.y, p.z, p.p_value, p.region, p.size, p.statistic, groups])
54 |
55 | for nv in art.neurovault_links:
56 | if nv.type == 'collection':
57 | nv_colls.append([art.id, nv.neurovault_id])
58 | elif nv.type == 'image':
59 | nv_images.append([art.id, nv.neurovault_id])
60 |
61 | # Save article metadata as CSV
62 | with (foldername / 'metadata.csv').open('w', newline='') as f:
63 | writer = csv.writer(f)
64 | writer.writerow(article_columns)
65 | writer.writerows(art_results)
66 |
67 | # Save coordinates as CSV
68 | with (foldername / 'coordinates.csv').open('w', newline='') as f:
69 | writer = csv.writer(f)
70 | writer.writerow(coordinate_columns)
71 | writer.writerows(coordinates)
72 |
73 | # Save texts as CSV
74 | with (foldername / 'text.csv').open('w', newline='') as f:
75 | writer = csv.writer(f)
76 | writer.writerow(text_columns)
77 | writer.writerows(texts)
78 |
79 | # Save NV links
80 | with (foldername / 'neurovault_collections.csv').open('w', newline='') as f:
81 | writer = csv.writer(f)
82 | writer.writerow(nv_colls_col)
83 | writer.writerows(nv_colls)
84 |
85 | with (foldername / 'neurovault_images.csv').open('w', newline='') as f:
86 | writer = csv.writer(f)
87 | writer.writerow(nv_images_col)
88 | writer.writerows(nv_images)
89 |
90 | # Save json file with time of export
91 | export_md = {
92 | "exported": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
93 | "n_articles": len(art_results),
94 | "n_activations": len(coordinates),
95 | "n_nv_collections": len(nv_colls),
96 | "n_nv_images": len(nv_images)
97 |
98 | }
99 |
100 | with (foldername / 'export.json').open('w') as f:
101 | json.dump(export_md, f)
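102 | 
103 | # Illustrative usage sketch (folder name and DB URI are placeholders):
104 | #
105 | #   from ace.database import Database
106 | #   db = Database(adapter='sqlite', db_name='sqlite:///ace.db')
107 | #   export_database(db, 'ace_export')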
--------------------------------------------------------------------------------
/ace/extract.py:
--------------------------------------------------------------------------------
1 | # Miscellaneous methods for extracting information from text/html
2 |
3 | import bs4 as BeautifulSoup
4 | import re
5 |
6 |
7 | def guess_space(text):
8 | ''' Take article text as input and return a guess about the image space. '''
9 |
10 | targets = ['mni', 'talairach', 'afni', 'flirt',
11 | '711-2', 'spm', 'brainvoyager', 'fsl']
12 | n_targ = len(targets)
13 | text = text.lower()
14 | res = [0] * n_targ
15 | matches = []
16 | for i in range(n_targ):
17 | res[i] = len(re.findall(
18 | r'\b(.{30,40}\b%s.{30,40})\b' % targets[i], text))
19 |
20 | # Sum up diagnostic strings...
21 | mni = res[5] + res[7]
22 | t88 = res[2] + res[6]
23 | software = mni + t88
24 |
25 | # Assign label
26 | # 1. If only one of MNI or T88 is implied, classify as that
27 | if (mni and not t88) or (not software and res[0] and not res[1]):
28 | label = 'MNI'
29 | elif (t88 and not mni) or (not software and res[1] and not res[0]):
30 | label = 'TAL'
31 | else:
32 | label = 'UNKNOWN'
33 |
34 | return label
35 |
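36 | # Illustrative usage sketch (the file path is a placeholder):
37 | #
38 | #   text = open('article.html').read()
39 | #   print(guess_space(text))   # -> 'MNI', 'TAL', or 'UNKNOWN'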
--------------------------------------------------------------------------------
/ace/ingest.py:
--------------------------------------------------------------------------------
1 | from os import path
2 | import logging
3 | from . import sources, config
4 | from .scrape import _validate_scrape
5 |
6 | logger = logging.getLogger(__name__)
7 |
8 | # The actual function that takes articles and adds them to the database
9 | # imports sources; sources is a module that contains the classes for each
10 | # source of articles.
11 |
12 | def add_articles(db, files, commit=True, table_dir=None, limit=None,
13 | pmid_filenames=False, metadata_dir=None, force_ingest=True, **kwargs):
14 | ''' Process articles and add their data to the DB.
15 | Args:
16 | files: The path to the article(s) to process. Can be a single
17 | filename (string), a list of filenames, or a path to pass
18 | to glob (e.g., "article_dir/NIMG*html")
19 | commit: Whether or not to save records to DB file after adding them.
20 | table_dir: Directory to store downloaded tables in (if None, tables
21 | will not be saved.)
22 | limit: Optional integer indicating max number of articles to add
23 | (selected randomly from all available). When None, will add all
24 | available articles.
25 | pmid_filenames: When True, assume that the file basename is a PMID.
26 | This saves us from having to retrieve metadata from PubMed When
27 | checking if a file is already in the DB, and greatly speeds up
28 | batch processing when overwrite is off.
29 | metadata_dir: Location to read/write PubMed metadata for articles.
30 | When None (default), retrieves new metadata each time. If a
31 | path is provided, will check there first before querying PubMed,
32 | and will save the result of the query if it doesn't already
33 | exist.
34 | force_ingest: Ingest even if no source is identified.
35 | kwargs: Additional keyword arguments to pass to parse_article.
36 | '''
37 |
38 | manager = sources.SourceManager(db, table_dir)
39 |
40 | if isinstance(files, str):
41 | from glob import glob
42 | files = glob(files)
43 | if limit is not None:
44 | from random import shuffle
45 | shuffle(files)
46 | files = files[:limit]
47 |
48 | missing_sources = []
49 | for i, f in enumerate(files):
50 | logger.info("Processing article %s..." % f)
51 | html = open(f).read()
52 |
53 | if not _validate_scrape(html):
54 | logger.warning("Invalid HTML for %s" % f)
55 | continue
56 |
57 | source = manager.identify_source(html)
58 | if source is None:
59 | logger.warning("Could not identify source for %s" % f)
60 | missing_sources.append(f)
61 | if not force_ingest:
62 | continue
63 | else:
64 | source = sources.DefaultSource(db)
65 |
66 | pmid = path.splitext(path.basename(f))[0] if pmid_filenames else None
67 | article = source.parse_article(html, pmid, metadata_dir=metadata_dir, **kwargs)
68 | if article and (config.SAVE_ARTICLES_WITHOUT_ACTIVATIONS or article.tables):
69 | db.add(article)
70 | if commit and (i % 100 == 0 or i == len(files) - 1):
71 | db.save()
72 | db.save()
73 |
74 | return missing_sources
75 |
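76 | # Illustrative usage sketch (paths and DB URI are placeholders):
77 | #
78 | #   from ace.database import Database
79 | #   db = Database(adapter='sqlite', db_name='sqlite:///ace.db')
80 | #   missing = add_articles(db, 'html/SomeJournal/*.html', pmid_filenames=True,
81 | #                          metadata_dir='metadata/')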
--------------------------------------------------------------------------------
/ace/label.py:
--------------------------------------------------------------------------------
1 | # from nltk import *
2 | import re
3 | from collections import Counter
4 | from .database import Article
5 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
6 | import pandas as pd
7 |
8 |
9 | def extract_ngram_features(db, tfidf=True, save=None, vocabulary=None, require_activations=True, **kwargs):
10 | ''' Takes text from an article as input and returns a matrix of document -->
11 | ngram weights. At the moment, only extracts terms from abstracts.
12 | Args:
13 | db: A database instance
14 | tfidf: If True, uses a tf-idf tokenizer; otherwise uses raw counts
15 | save: an optional path to save a CSV to; if None, returns the resulting data
16 | vocabulary: an optional list of ngrams to restrict extraction to
17 | require_activations: When True, only articles containing at least one fMRI activation
18 | table will be included. When False, use all articles in DB.
19 | kwargs: Optional keywords passed onto the scikit-learn vectorizer. Common args are
20 | ngram_range, min_df, max_df, stop_words, and vocabulary.
21 | '''
22 |
23 | # Extract article texts--for now, uses abstracts
24 | articles = db.session.query(Article.id, Article.abstract)
25 | if require_activations:
26 | articles = articles.filter(Article.tables.any())
27 | pmids, corpus = list(zip(*articles.all()))
28 |
29 | # Instantiate vectorizer--either simple counts, or tf-idf
30 | vectorizer = TfidfVectorizer if tfidf else CountVectorizer
31 | vectorizer = vectorizer(vocabulary=vocabulary, **kwargs)
32 |
33 | # Transform texts
34 | weights = vectorizer.fit_transform(corpus).toarray()
35 | names = vectorizer.get_feature_names()
36 |
37 | data = pd.DataFrame(weights, columns=names, index=pmids)
38 |
39 | if save is not None:
40 | data.to_csv(save, sep='\t', index_label='pmid', encoding='utf-8')
41 | else:
42 | return data
43 |
44 |
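45 | # Illustrative usage sketch (the output path and DB URI are placeholders):
46 | #
47 | #   from ace.database import Database
48 | #   db = Database(adapter='sqlite', db_name='sqlite:///ace.db')
49 | #   extract_ngram_features(db, tfidf=True, ngram_range=(1, 2), min_df=5,
50 | #                          save='ngram_features.txt')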
--------------------------------------------------------------------------------
/ace/scrape.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # use unicode everywhere
3 | import re
4 | import sys
5 | from pathlib import Path
6 | from collections.abc import Mapping
7 | import requests
8 | from time import sleep
9 | import logging
10 | import os
11 | import random
12 | import xmltodict
13 | from seleniumbase import Driver
14 | from selenium.webdriver.support.ui import WebDriverWait
15 | from selenium.webdriver.support import expected_conditions as EC
16 | from selenium.webdriver.common.by import By
17 | from selenium.common.exceptions import TimeoutException
18 | from tqdm import tqdm
19 |
20 | from ace.utils import PubMedAPI
21 | from ace.config import USER_AGENTS
22 |
23 | logger = logging.getLogger(__name__)
24 |
25 |
26 | def get_url(url, n_retries=5, timeout=10.0, verbose=False):
27 | headers = {'User-Agent': random.choice(USER_AGENTS)}
28 |
29 | def exponential_backoff(retries):
30 | return 2 ** retries
31 |
32 | retries = 0
33 | while retries < n_retries:
34 |
35 | try:
36 | r = requests.get(url, headers=headers, timeout=timeout)
37 | return r.text
38 | except requests.exceptions.RequestException as e:
39 | logger.warning(f"Request failed: {e}")
40 | sleep_time = exponential_backoff(retries)
41 | logger.info(f"Retrying in {sleep_time} seconds...")
42 | sleep(sleep_time)
43 | retries += 1
44 | logger.error("Exceeded maximum number of retries.")
45 | return None
46 |
47 | def _convert_pmid_to_pmc(pmids):
48 | url_template = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids="
49 | logger.info("Converting PMIDs to PMCIDs...")
50 |
51 | # Chunk the PMIDs into groups of 200
52 | pmids = [str(p) for p in pmids]
53 | pmid_chunks = [pmids[i:i + 200] for i in range(0, len(pmids), 200)]
54 |
55 | pmc_ids = []
56 | for chunk in tqdm(pmid_chunks):
57 | pmid_str = ','.join(chunk)
58 | url = url_template + pmid_str
59 | response = get_url(url)
60 | # Capture (pmcid, pmid) attribute pairs from each record in the XML response
61 | pmc_ids += re.findall(r'pmcid="(PMC\d+)" pmid="(\d+)"', response)
62 |
63 | logger.info(f"Found {len(pmc_ids)} PMCIDs from {len(pmids)} PMIDs.")
64 |
65 | pmids_found = set([p[1] for p in pmc_ids])
66 | missing_pmids = [(None, p) for p in pmids if p not in pmids_found]
67 |
68 | pmc_ids = pmc_ids + missing_pmids
69 |
70 | return pmc_ids
71 |
72 |
73 | def get_pmid_from_doi(doi, api_key=None):
74 | ''' Query PubMed for the PMID of a paper based on its doi. We need this
75 | for some Sources that don't contain the PMID anywhere in the article HTML.
76 | '''
77 | query = f"{doi}[aid]"
78 | data = PubMedAPI(api_key=api_key).esearch(query=query)
79 | if data:
80 | data = data[0]
81 | else:
82 | data = None
83 | return data
84 |
85 |
86 | def get_pubmed_metadata(pmid, parse=True, store=None, save=True, api_key=None):
87 | ''' Get PubMed metadata for article.
88 | Args:
89 | pmid: The article's PubMed ID
90 | parse: if True, parses the text and returns a dictionary. if False, returns raw text.
91 | store: optional string path to PubMed metadata files. If passed, first checks the passed
92 | folder for the corresponding ID, and only queries PubMed if not found.
93 | save: if store is passed, save is True, and the file does not already exist,
94 | will save the result of the new PubMed query to the store.
95 | '''
96 | if store is not None:
97 | md_file = os.path.join(store, pmid)
98 |
99 | if store is not None and os.path.exists(md_file):
100 | logger.info("Retrieving metadata from file %s..." % os.path.join(store, pmid))
101 | with open(md_file, 'rb') as f:
102 | xml = f.read()
103 |
104 | else:
105 | logger.info("Retrieving metadata for PubMed article %s..." % str(pmid))
106 | xml = PubMedAPI(api_key=api_key).efetch(input_id=pmid, retmode='xml', rettype='medline', db='pubmed')
107 | if store is not None and save and xml is not None:
108 | if not os.path.exists(store):
109 | os.makedirs(store)
110 | with open(md_file, 'wb') as f:
111 | f.write(xml)
112 |
113 | return parse_PMID_xml(xml) if (parse and xml is not None) else xml
114 |
115 |
116 | def parse_PMID_xml(xml):
117 | ''' Take XML-format PubMed metadata and convert it to a dictionary
118 | with standardized field names. '''
119 |
120 | di = xmltodict.parse(xml).get('PubmedArticleSet')
121 | if not di:
122 | return None
123 |
124 | di = di['PubmedArticle']
125 | article = di['MedlineCitation']['Article']
126 |
127 | if 'ArticleDate' in article:
128 | date = article['ArticleDate']
129 | elif 'Journal' in article:
130 | date = article['Journal']['JournalIssue']['PubDate']
131 | else:
132 | date = None
133 |
134 | if date:
135 | year = date.get('Year', None)
136 | else:
137 | year = None
138 |
139 | doi = None
140 | doi_source = article.get('ELocationID', None)
141 | if doi_source is not None and isinstance(doi_source, list):
142 | doi_source = [d for d in doi_source if d['@EIdType'] == 'doi'][0]
143 |
144 | if doi_source is not None and doi_source['@EIdType'] == 'doi':
145 | doi = doi_source['#text']
146 |
147 | authors = article.get('AuthorList', None)
148 |
149 | if authors:
150 | authors = authors['Author']
151 |
152 | try:
153 | _get_author = lambda a: a['LastName'] + ', ' + a['ForeName']
154 | if isinstance(authors, list):
155 | authors = [_get_author(a) for a in authors if 'ForeName' in a]
156 | else:
157 | authors = [_get_author(authors)]
158 | authors = ';'.join(authors)
159 | except:
160 | authors = None
161 |
162 | if 'MeshHeadingList' in di['MedlineCitation']:
163 | mesh = di['MedlineCitation']['MeshHeadingList']['MeshHeading']
164 | else:
165 | mesh = []
166 |
167 | abstract = article.get('Abstract', '')
168 | if abstract != '':
169 | abstract = abstract.get('AbstractText', '')
170 |
171 | cit = di['PubmedData']['ArticleIdList']['ArticleId']
172 | if isinstance(cit, list):
173 | cit = cit[1]
174 |
175 | metadata = {
176 | 'authors': authors,
177 | 'citation': cit['#text'],
178 | 'comment': abstract,
179 | 'doi': doi,
180 | 'keywords': '',
181 | 'mesh': mesh,
182 | 'pmid': di['MedlineCitation']['PMID'],
183 | 'title': article['ArticleTitle'],
184 | 'abstract': abstract,
185 | 'journal': article['Journal']['Title'],
186 | 'year': year
187 | }
188 |
189 | # Clean up nested Dicts
190 | for k, v in metadata.items():
191 | if isinstance(v, list):
192 | to_join = []
193 | for a in v:
194 | if 'DescriptorName' in a:
195 | a = a['DescriptorName']
196 | a = a['#text']
197 |
198 | to_join.append(a)
199 | v = ' | '.join(to_join)
200 | elif isinstance(v, Mapping):
201 | v = v.get('#text', '')
202 | metadata[k] = v
203 |
204 | return metadata
205 |
206 | def _validate_scrape(html):
207 | """ Checks to see if scraping was successful.
208 | For example, checks to see if Cloudfare interfered """
209 |
210 | patterns = ['Checking if you are a human',
211 | 'Please turn JavaScript on and reload the page',
212 | 'Checking if the site connection is secure',
213 | 'Enable JavaScript and cookies to continue',
214 | 'There was a problem providing the content you requested',
215 | 'Redirecting',
216 | 'Page not available - PMC',
217 | 'Your request cannot be processed at this time. Please try again later',
218 | '403 Forbidden',
219 | 'Page not found — ScienceDirect',
220 | 'This site can’t be reached',
221 | 'used Cloudflare to restrict access',
222 | '502 Bad Gateway',
223 | ]
224 |
225 | for pattern in patterns:
226 | if pattern in html:
227 | return False
228 |
229 | return True
230 |
231 | ''' Class for journal Scraping. The above free-floating methods should
232 | probably be refactored into this class eventually. '''
233 | class Scraper:
234 |
235 | def __init__(self, store, api_key=None):
236 | self.store = Path(store)
237 | self._client = PubMedAPI(api_key=api_key)
238 |
239 |
240 | def search_pubmed(self, journal, search, retmax=10000, savelist=None,):
241 | journal = journal.replace(' ', '+')
242 | search = '+%s' % search if search else ''
243 | query = f"({journal}[Journal]+journal+article[pt]{search})"
244 | logger.info("Query: %s" % query)
245 |
246 | doc = self._client.esearch(query, retmax=retmax)
247 |
248 | if savelist is not None:
249 | outf = open(savelist, 'w')
250 | outf.write(doc)
251 | outf.close()
252 | return doc
253 |
254 |
255 | def get_html(self, url, journal, mode='browser'):
256 |
257 | ''' Get HTML of full-text article. Uses either browser automation (if mode == 'browser')
258 | or just gets the URL directly. '''
259 |
260 | if mode == 'browser':
261 | driver = Driver(
262 | uc=True,
263 | headless2=True,
264 | agent=random.choice(USER_AGENTS),
265 | )
266 | for attempt in range(15):
267 | try:
268 | driver.set_page_load_timeout(10)
269 | driver.get(url)
270 | url = driver.current_url
271 | except:
272 | driver.quit()
273 | logger.info(f"Timeout exception #{attempt}. Retrying...")
274 | sleep(5)
275 | continue
276 | else:
277 | break
278 | else:
279 | logger.info("Timeout exception. Giving up.")
280 | return None
281 | for attempt in range(10):
282 | try:
283 | html = driver.page_source
284 | except:
285 | logger.info(f"Source Page #{attempt}. Retrying...")
286 | driver.quit()
287 | driver = Driver(
288 | uc=True,
289 | headless2=True,
290 | agent=random.choice(USER_AGENTS),
291 | )
292 | driver.get(url)
293 | sleep(2)
294 | else:
295 | break
296 |
297 | new_url = self.check_for_substitute_url(url, html, journal)
298 |
299 | if url != new_url:
300 | driver = Driver(
301 | uc=True,
302 | headless2=True,
303 | agent=random.choice(USER_AGENTS),
304 | )
305 | driver.get(new_url)
306 | if journal.lower() in ['human brain mapping',
307 | 'european journal of neuroscience',
308 | 'brain and behavior','epilepsia']:
309 | sleep(0.5 + random.random() * 1)
310 | try:
311 | WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, 'relatedArticles')))
312 | except TimeoutException:
313 | print("Loading Wiley page took too much time!")
314 |
315 | # Sometimes we get annoying alerts (e.g., Flash animation
316 | # timeouts), so we dismiss them if present.
317 | try:
318 | alert = driver.switch_to_alert()
319 | alert.dismiss()
320 | except:
321 | pass
322 |
323 | logger.info(journal.lower())
324 | timeout = 5
325 | for attempt in range(10):
326 | try:
327 | html = driver.page_source
328 | except:
329 | logger.info(f"Source Page #{attempt}. Retrying...")
330 | driver.quit()
331 | driver = Driver(
332 | uc=True,
333 | headless2=True,
334 | agent=random.choice(USER_AGENTS),
335 | )
336 | driver.get(url)
337 | sleep(2)
338 | else:
339 | break
340 | if journal.lower() in ['journal of neuroscience', 'j neurosci']:
341 | ## Find links with class data-table-url, and click on them
342 | ## to load the table data.
343 | table_links = driver.find_elements(By.CLASS_NAME, 'table-expand-inline')
344 |
345 | if len(table_links):
346 | for link in table_links:
347 | WebDriverWait(driver, 20).until(EC.element_to_be_clickable((
348 | By.CLASS_NAME, 'table-expand-inline')))
349 | driver.execute_script("arguments[0].scrollIntoView();", link)
350 | link.click()
351 | sleep(0.5 + random.random() * 1)
352 |
353 | # If the page title contains ScienceDirect
354 | elif ' - ScienceDirect' in html:
355 | try:
356 | element_present = EC.presence_of_element_located((By.ID, 'abstracts'))
357 | WebDriverWait(driver, timeout).until(element_present)
358 | except TimeoutException:
359 | pass
360 | elif 'Wiley Online Library' in html:
361 | try:
362 | element_present = EC.presence_of_element_located((By.ID, 'article__content'))
363 | WebDriverWait(driver, timeout).until(element_present)
364 | except TimeoutException:
365 | pass
366 |
367 | ## Uncomment this next line to scroll to end. Doesn't seem to actually help.
368 | # driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
369 | ## Uncomment next line and insert ID to search for specific element.
370 | # driver.find_element_by_id('relatedArticles').send_keys('\t')
371 | # This next line helps minimize the number of blank articles saved from ScienceDirect,
372 | # which loads content via Ajax requests only after the page is done loading. There is
373 | # probably a better way to do this...
374 |
375 | driver.quit()
376 | return html
377 |
378 | elif mode == 'requests':
379 | headers = {'User-Agent': random.choice(USER_AGENTS)}
380 | r = requests.get(url, headers=headers)
381 | # For some journals, we can do better than the returned HTML, so get the final URL and
382 | # substitute a better one.
383 | url = self.check_for_substitute_url(r.url, r.text, journal)
384 | if url != r.url:
385 | r = requests.get(url, headers=headers)
386 | # XML content is usually misidentified as ISO-8859-1, so we need to manually set utf-8.
387 | # Unfortunately this can break other documents. Need to eventually change this to inspect the
388 | # encoding attribute of the document header.
389 | r.encoding = 'utf-8'
390 | return r.text
391 |
392 |
393 | def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_pmc_source=True):
394 | base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
395 | 
396 |
397 | if prefer_pmc_source:
398 | try:
399 | response = self._client.elink(pmid, retmode='json', return_content=False)
400 | response.raise_for_status() # Raise an HTTPError for bad responses
401 | json_content = response.json()
402 |
403 | providers = {obj['provider']['nameabbr']: obj["url"]["value"] for obj in json_content['linksets'][0]['idurllist'][0]['objurls']}
404 | pmc_url = providers.get('PMC')
405 |
406 | if pmc_url:
407 | return self.get_html(pmc_url, journal, mode='requests')
408 | elif prefer_pmc_source == "only":
409 | logger.info("\tNo PMC source found! Skipping...")
410 | return
411 | except requests.RequestException as e:
412 | logger.error(f"Request failed: {e}")
413 | except KeyError as e:
414 | logger.error(f"Key error: {e} - JSON content: {json_content}")
415 | else:
416 | query = f"{base_url}?dbfrom=pubmed&id={pmid}&cmd=prlinks&retmode={retmode}"
417 | logger.info(query)
418 | return self.get_html(query, journal, mode=mode)
419 |
420 | if prefer_pmc_source == "only":
421 | logger.info("\tNo PMC source found!! Skipping...")
422 | return
423 |
424 | # Fallback if no PMC link found
425 | query = f"{base_url}?dbfrom=pubmed&id={pmid}&cmd=prlinks&retmode={retmode}"
426 | return self.get_html(query, journal, mode=mode)
427 |
428 |
429 | def check_for_substitute_url(self, url, html, journal):
430 | ''' For some journals/publishers, we can get a better document version by modifying the
431 | URL passed from PubMed. E.g., we can get XML with embedded tables from PLoS ONE instead of
432 | the standard HTML, which displays tables as images. For some journals (e.g., Frontiers),
433 | it's easier to get the URL by searching the source, so pass the html in as well. '''
434 |
435 | j = journal.lower()
436 | try:
437 | if j == 'plos one':
438 | doi_part = re.search('article\?id\=(.*)', url).group(1)
439 | return 'http://journals.plos.org/plosone/article/asset?id=%s.XML' % doi_part
440 | elif j in ['human brain mapping', 'european journal of neuroscience',
441 | 'brain and behavior', 'epilepsia', 'journal of neuroimaging']:
442 | return url.replace('abstract', 'full').split(';')[0]
443 | elif j == 'journal of cognitive neuroscience':
444 | return url.replace('doi/abs', 'doi/full')
445 | elif j.startswith('frontiers in'):
446 | return re.sub('(full|abstract)\/*$', 'xml\/nlm', url)
447 | elif 'sciencedirect' in url:
448 | return url + '?np=y'
449 | elif 'springer.com' in url:
450 | return url + '/fulltext.html'
451 | else:
452 | return url
453 | except Exception as err:
454 | return url
455 |
456 |
457 | def is_pmc_open_acess(self, pmcid):
458 | oa_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id="
459 |
460 | response = get_url(oa_url + pmcid)
461 |
462 | return 'idIsNotOpenAccess' not in response
463 |
464 | def process_article(self, id, journal, delay=None, mode='browser', overwrite=False, prefer_pmc_source=True):
465 |
466 | logger.info("Processing %s..." % id)
467 | journal_path = (self.store / 'html' / journal)
468 | journal_path.mkdir(parents=True, exist_ok=True)
469 | filename = journal_path / f"{id}.html"
470 |
471 | if not overwrite and os.path.isfile(filename):
472 | logger.info("\tAlready exists! Skipping...")
473 |
474 | return None, None
475 |
476 | # Save the HTML
477 | doc = self.get_html_by_pmid(id, journal, mode=mode, prefer_pmc_source=prefer_pmc_source)
478 | valid = None
479 | if doc:
480 | valid = _validate_scrape(doc)
481 | if valid:
482 | with filename.open('w') as f:
483 | f.write(doc)
484 | if not valid:
485 | logger.info("\tScrape failed! Skipping...")
486 |
487 | # Insert random delay until next request.
488 | if delay is not None:
489 | sleep_time = random.random() * float(delay*2)
490 | sleep(sleep_time)
491 |
492 | return filename, valid
493 |
494 | def retrieve_articles(self, journal=None, pmids=None, dois=None, delay=None, mode='browser', search=None,
495 | limit=None, overwrite=False, min_pmid=None, max_pmid=None, shuffle=False,
496 | index_pmids=False, skip_pubmed_central=True, metadata_store=None, invalid_article_log_file=None, prefer_pmc_source=True):
497 |
498 | ''' Try to retrieve all PubMed articles for a single journal that don't
499 | already exist in the storage directory.
500 | Args:
501 | journal: The name of the journal (as it appears in PubMed).
502 | pmids: A list of PMIDs to retrieve.
503 | dois: A list of DOIs to retrieve.
504 | delay: Mean delay between requests.
505 | mode: When 'browser', use selenium to load articles in Chrome. When
506 | 'requests', attempts to fetch the HTML directly via requests module.
507 | search: An optional search string to append to the PubMed query.
508 | Primarily useful for journals that are not specific to neuroimaging.
509 | limit: Optional max number of articles to fetch. Note that only new articles
510 | are counted against this limit; e.g., if limit = 100 and 2,000 articles
511 | are found in PubMed, retrieval will continue until 100 new articles
512 | have been added.
513 | overwrite: When True, all articles returned from PubMed query will be
514 | fetched, irrespective of whether or not they already exist on disk.
515 | min_pmid: When a PMID is provided, only articles with PMIDs greater than
516 | this will be processed. Primarily useful for excluding older articles
517 | that aren't available in full-text HTML format.
518 | max_pmid: When a PMID is provided, only articles with PMIDs less than
519 | this will be processed.
520 | shuffle: When True, articles are retrieved in random order.
521 | index_pmids: When True, will create a list of pmids already in the output.
522 | When used in combination with overwrite=False, this will not download a pmid
523 | even though it's in another directory.
524 | skip_pubmed_central: When True, skips articles that are available from
525 | PubMed Central. This will also write a file with the skipped pmcids
526 | to use with pubget.
527 | metadata_store: Optional path to a directory to store/reference PubMed metadata.
528 | invalid_article_log_file: Optional path to a file to log files where scraping failed.
529 | prefer_pmc_source: Optional
530 | When True, preferentially retrieve articles from PubMed Central, using requests instead of browser
531 | (regardless of mode). This is useful for journals that have full-text articles available on PMC,
532 | but are not open-access. If set to "only", will only retrieve articles from PMC, and
533 | skip articles it cannot retrieve from PMC.
534 | '''
535 | articles_found = 0
536 | if journal is None and dois is None and pmids is None:
537 | raise ValueError("Either journal, pmids, or dois must be provided.")
538 |
539 | if journal is not None:
540 | logger.info("Getting PMIDs for articles from %s..." % journal)
541 | pmids = self.search_pubmed(journal, search)
542 |
543 | if dois is not None:
544 | logger.info("Retrieving articles from %s..." % ', '.join(dois))
545 | pmids = [get_pmid_from_doi(doi) for doi in dois]
546 |
547 | # Log missing DOIs, then remove None values
548 | missing_dois = [doi for doi, pmid in zip(dois, pmids) if pmid is None]
549 | pmids = [pmid for pmid in pmids if pmid is not None]
550 | if len(missing_dois) > 0:
551 | logger.info("Missing DOIs: %s" % ', '.join(missing_dois))
552 |
553 | if shuffle:
554 | random.shuffle(pmids)
555 |
556 | logger.info("Found %d records.\n" % len(pmids))
557 |
558 | # If journal is provided, check for existing articles
559 | if journal is not None:
560 | logger.info("Retrieving articles from %s..." % journal)
561 | journal_path = (self.store / 'html' / journal)
562 | if journal_path.exists():
563 | existing = journal_path.glob('*.html')
564 | existing = [int(f.stem) for f in existing]
565 | n_existing = len(existing)
566 | pmids = [pmid for pmid in pmids if int(pmid) not in existing]
567 | logger.info(f"Found {n_existing} existing articles.")
568 |
569 | # filter out all pmids, not just based on folder
570 | if index_pmids:
571 | existing_pmids = [f.stem for f in (self.store / 'html').rglob('*.html')]
572 | pmids = [pmid for pmid in pmids if pmid not in existing_pmids]
573 |
574 | # Filter out articles that are outside the PMID range
575 | pmids = [
576 | pmid
577 | for pmid in pmids
578 | if (min_pmid is None or int(pmid) >= min_pmid) and (max_pmid is None or int(pmid) <= max_pmid)
579 | ]
580 |
581 | logger.info(f"Retrieving {len(pmids)} articles...")
582 |
583 | if skip_pubmed_central:
584 | all_ids = _convert_pmid_to_pmc(pmids)
585 | else:
586 | all_ids = [(None, pmid) for pmid in pmids]
587 |
588 | invalid_articles = []
589 |
590 | if journal is None:
591 | all_iter = []
592 | for pmcid, pmid in all_ids:
593 | metadata = get_pubmed_metadata(pmid, store=metadata_store)
594 | if not metadata or 'journal' not in metadata:
595 | all_iter.append((pmcid, pmid, "UNKNOWN"))
596 | continue
597 | all_iter.append((pmcid, pmid, metadata['journal']))
598 | else:
599 | all_iter = [(pmcid, pmid, journal) for pmcid, pmid in all_ids]
600 |
601 | for pmcid, pmid, journal in all_iter:
602 |
603 | if limit is not None and articles_found >= limit: break
604 |
605 | if skip_pubmed_central and pmcid and self.is_pmc_open_acess(pmcid):
606 | logger.info(f"\tPubMed Central OpenAccess entry found! Skipping {pmid}...")
607 | with open('openaccess_pmcids.txt', 'a') as f:
608 | f.write(f"{pmcid}\n")
609 | continue
610 |
611 | filename, valid = self.process_article(pmid, journal, delay, mode, overwrite, prefer_pmc_source)
612 |
613 | if not valid:
614 | invalid_articles.append(filename)
615 | if invalid_article_log_file is not None:
616 | with open(invalid_article_log_file, 'a') as f:
617 | f.write(f"{pmid}\n")
618 | else:
619 | articles_found += 1
620 |
621 | return invalid_articles
622 |
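623 |
624 | if __name__ == '__main__':
625 |     # Minimal usage sketch, not part of the library proper. It assumes the
626 |     # docstring above belongs to Scraper.retrieve_journal_articles (the method
627 |     # driven by examples/fetch_articles_from_pubmed.py), that '/tmp/articles'
628 |     # is writable, and that selenium/Chrome is configured for 'browser' mode.
629 |     # Here articles are requested by DOI rather than by journal search.
630 |     scraper = Scraper('/tmp/articles')
631 |     scraper.retrieve_journal_articles(
632 |         dois=['10.1371/journal.pone.0068494'],
633 |         mode='browser',
634 |         prefer_pmc_source=True,
635 |         invalid_article_log_file='/tmp/articles/failed_pmids.txt',
636 |     )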
--------------------------------------------------------------------------------
/ace/sources.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # use unicode everywhere
3 | from bs4 import BeautifulSoup
4 | import re
5 | import os
6 | import json
7 | import abc
8 | import importlib
9 | from glob import glob
10 | from ace import datatable
11 | from ace import tableparser
12 | from ace import scrape
13 | from ace import config
14 | from ace import database
15 | import logging
16 |
17 | logger = logging.getLogger(__name__)
18 |
19 |
20 | class SourceManager:
21 |
22 | ''' Loads all the available Source subclasses from this module and the
23 | associated directory of JSON config files and uses them to determine which parser
24 | to call when a new HTML file is passed. '''
25 |
26 | def __init__(self, database, table_dir=None):
27 | ''' SourceManager constructor.
28 | Args:
29 | database: A Database instance to use with all Sources.
30 | table_dir: An optional directory name to save any downloaded tables to.
31 | When table_dir is None, nothing will be saved (requiring new scraping
32 | each time the article is processed).
33 | '''
34 | module = importlib.import_module('ace.sources')
35 | self.sources = {}
36 | source_dir = os.path.join(os.path.dirname(__file__), 'sources')
37 | for config_file in glob('%s/*json' % source_dir):
38 | class_name = config_file.split('/')[-1].split('.')[0]
39 | cls = getattr(module, class_name + 'Source')(database, config=config_file, table_dir=table_dir)
40 | self.sources[class_name] = cls
41 |
42 | def identify_source(self, html):
43 | ''' Identify the source of the article and return the corresponding Source object. '''
44 | for source in list(self.sources.values()):
45 | for patt in source.identifiers:
46 | if re.search(patt, html):
47 | logger.debug('Matched article to Source: %s' % source.__class__.__name__)
48 | return source
49 |
50 |
51 | # A single source of articles--i.e., a publisher or journal
52 | class Source(metaclass=abc.ABCMeta):
53 | # Map problematic characters (e.g., the thin space \u2009, which otherwise
54 | # invalidates table parsing) to innocuous equivalents before parsing.
55 | ENTITIES = {
56 | ' ': ' ',
57 | '−': '-',
58 | # 'κ': 'kappa',
59 | '\xa0': ' ', # Unicode non-breaking space
60 | # '\x3e': ' ',
61 | '\u2212': '-', # Various unicode dashes
62 | '\u2012': '-',
63 | '\u2013': '-',
64 | '\u2014': '-',
65 | '\u2015': '-',
66 | '\u8211': '-',
67 | '\u0150': '-',
68 | '\u0177': '',
69 | '\u0160': '',
70 | '\u0145': "'",
71 | '\u0146': "'",
72 | '\u2009': "", # Various whitespaces within tables
73 | '\u2007': "",
74 |
75 | }
76 |
77 | def __init__(self, database, config=None, table_dir=None):
78 | self.database = database
79 | self.table_dir = table_dir
80 | self.entities = {}
81 |
82 | if config is not None:
83 | config = json.load(open(config, 'rb'))
84 | valid_keys = ['name', 'identifiers', 'entities', 'delay']
85 |
86 | for k, v in list(config.items()):
87 | if k in valid_keys:
88 | setattr(self, k, v)
89 |
90 | # Append any source-specific entities found in the config file to
91 | # the standard list
92 | if self.entities is None:
93 | self.entities = Source.ENTITIES
94 | else:
95 | self.entities.update(Source.ENTITIES)
96 |
97 | @abc.abstractmethod
98 | def parse_article(self, html, pmid=None, metadata_dir=None):
99 | ''' Takes HTML article as input and returns an Article. PMID Can also be
100 | passed, which prevents having to scrape it from the article and/or look it
101 | up in PubMed. '''
102 |
103 | # Skip rest of processing if this record already exists
104 | if pmid is not None and self.database.article_exists(pmid) and not config.OVERWRITE_EXISTING_ROWS:
105 | return False
106 |
107 | html = self.decode_html_entities(html)
108 | soup = BeautifulSoup(html)
109 | if pmid is None:
110 | pmid = self.extract_pmid(soup)
111 |
112 | # did our best to find PMID, but failed
113 | if not pmid:
114 | return False
115 |
116 | metadata = scrape.get_pubmed_metadata(pmid, store=metadata_dir, save=True)
117 |
118 | # Remove all scripts and styles
119 | for script in soup(["script", "style"]):
120 | script.extract()
121 | # Get text
122 | text = soup.get_text()
123 | if self.database.article_exists(pmid):
124 | if config.OVERWRITE_EXISTING_ROWS:
125 | self.database.delete_article(pmid)
126 | else:
127 | return False
128 |
129 | self.article = database.Article(text, pmid=pmid, metadata=metadata)
130 | self.extract_neurovault(soup)
131 | return soup
132 |
133 | def extract_neurovault(self, soup):
134 | ''' Look through all links, and use regex to identify NeuroVault links. '''
135 | image_regexes = ['identifiers.org/neurovault.image:(\d*)',
136 | 'neurovault.org/images/(\d*)']
137 |
138 | image_regexes = re.compile( '|'.join( image_regexes) )
139 |
140 | collection_regexes = ['identifiers.org/neurovault.collection:(\w*)',
141 | 'neurovault.org/collections/(\w*)']
142 |
143 | collection_regexes = re.compile( '|'.join( collection_regexes) )
144 |
145 |
146 | nv_links = []
147 | for link in soup.find_all('a'):
148 | if link.has_attr('href'):
149 | href = link['href']
150 |
151 | img_m = image_regexes.search(href)
152 | col_m = collection_regexes.search(href)
153 | if not (img_m or col_m):
154 | continue
155 |
156 | if img_m:
157 | type = 'image'
158 | val = img_m.groups()[0] or img_m.groups()[1]
159 | elif col_m:
160 | type = 'collection'
161 | val = col_m.groups()[0] or col_m.groups()[1]
162 |
163 | nv_links.append(
164 | database.NeurovaultLink(
165 | type=type,
166 | neurovault_id=val,
167 | url=href
168 | )
169 | )
170 |
171 | self.article.neurovault_links = nv_links
172 |
173 | def extract_text(self, soup):
174 | ''' Extract text from the article.
175 | Publisher specific extraction of body text should be done in a subclass.
176 | '''
177 |
178 | text = soup.get_text()
179 |
180 | # Remove any remaining HTML tags
181 | text = re.sub(r'<[^>]+>', '', text)
182 |
183 | # Remove any literal unicode escape sequences (e.g., '\u2009') left in the text
184 | text = re.sub(r'\\u[0-9a-fA-F]{4}', '', text)
185 |
186 | # Remove any remaining entities
187 | text = self.decode_html_entities(text)
188 |
189 | # Remove any remaining whitespace
190 | text = re.sub(r'\s+', ' ', text)
191 |
192 | self.article.text = text
193 |
194 | def parse_table(self, table):
195 | ''' Takes HTML for a single table and returns a Table. '''
196 | # Formatting issues sometimes prevent table extraction, so just return
197 | if table is None:
198 | return False
199 |
200 | logger.debug("\t\tFound a table...")
201 |
202 | # change to \n
203 | for br in table.find_all("br"):
204 | br.replace_with("\n")
205 |
206 | # Count columns. Check either just one row, or all of them.
207 | def n_cols_in_row(row):
208 | return sum([
209 | int(td['colspan'])
210 | if td.has_attr('colspan') and td['colspan'] != "NaN" else 1
211 | for td in row.find_all(['th', 'td'])
212 | ])
213 |
214 | search_table = table.find("tbody")
215 | if search_table is None:
216 | search_table = table
217 |
218 | all_trs = search_table.find_all('tr')
219 | if all_trs is None or len(all_trs) == 0:
220 | return False
221 |
222 | if config.CAREFUL_PARSING:
223 | n_cols = max([n_cols_in_row(
224 | row) for row in all_trs])
225 | else:
226 | n_cols = n_cols_in_row(search_table.find('tr'))
227 |
228 | # Initialize grid and populate
229 | data = datatable.DataTable(0, n_cols)
230 | rows = table.find_all('tr')
231 | for (j, r) in enumerate(rows):
232 | try:
233 | cols = r.find_all(['td', 'th'])
234 | cols_found_in_row = 0
235 | n_cells = len(cols)
236 | # Assign number of rows and columns this cell fills. We use these rules:
237 | # * If a rowspan/colspan is explicitly provided, use it
238 | # * If not, initially assume span == 1 for both rows and columns.
239 | for (i, c) in enumerate(cols):
240 | r_num = (
241 | int(c['rowspan'])
242 | if c.has_attr('rowspan') and c['rowspan'] != "NaN" else 1
243 | )
244 | c_num = (
245 | int(c['colspan'])
246 | if c.has_attr('colspan') and c['colspan'] != "NaN" else 1
247 | )
248 | cols_found_in_row += c_num
249 | # * Check to make sure that we don't have unaccounted-for columns in the
250 | # row after including the current cell. If we do, adjust the colspan
251 | # to take up all of the remaining columns. This is necessary because
252 | # some tables have malformed HTML, and BeautifulSoup can also
253 | # cause problems in its efforts to fix bad tables. The most common
254 | # problem is deletion or omission of enough tags to fill all
255 | # columns, hence our adjustment. Note that in some cases the order of
256 | # filling is not sequential--e.g., when a previous row has cells with
257 | # rowspan > 1. So we have to check if there are None values left over
258 | # in the DataTable's current row after we finish filling
259 | # it.
260 | if i + 1 == n_cells and cols_found_in_row < n_cols and (len(data.data) == j+1) and data[j].count(None) > c_num:
261 | c_num += n_cols - cols_found_in_row
262 | data.add_val(c.get_text(), r_num, c_num)
263 | except Exception as err:
264 | if not config.SILENT_ERRORS:
265 | logger.error(str(err))
266 | if not config.IGNORE_BAD_ROWS:
267 | raise
268 |
269 | if data.data[data.n_rows - 1].count(None) == data.n_cols:
270 | data.data.pop()
271 | logger.debug("\t\tTrying to parse table...")
272 | return tableparser.parse_table(data)
273 |
274 | def extract_doi(self, soup):
275 | ''' Every Source subclass must be able to extract its doi. '''
276 | return
277 |
278 | def extract_pmid(self, soup):
279 | ''' Every Source subclass must be able to extract its PMID. '''
280 | return
281 |
282 | def decode_html_entities(self, html):
283 | ''' Re-encode HTML entities as innocuous little Unicode characters. '''
284 | # Any entities BeautifulSoup passes through that we don't like, e.g.,
285 | # &#10; / \x0a
286 | if self.entities:
287 | patterns = re.compile('(' + '|'.join(re.escape(
288 | k) for k in list(self.entities.keys())) + ')')
289 | replacements = lambda m: self.entities[m.group(0)]
290 | return patterns.sub(replacements, html)
291 | else:
292 | return html
293 |
294 | def _download_table(self, url):
295 | ''' For Sources that have tables in separate files, a helper for
296 | downloading and extracting the table data. Also saves to file if desired.
297 | '''
298 |
299 | delay = self.delay if hasattr(self, 'delay') else 0
300 |
301 | if self.table_dir is not None:
302 | filename = '%s/%s' % (self.table_dir, url.replace('/', '_'))
303 | if os.path.exists(filename):
304 | table_html = open(filename).read()
305 | else:
306 | table_html = scrape.get_url(url)
307 | open(filename, 'w', encoding='utf-8').write(table_html)
308 | else:
309 | table_html = scrape.get_url(url)
310 |
311 | if table_html:
312 | table_html = self.decode_html_entities(table_html)
313 | return BeautifulSoup(table_html)
314 |
315 | return None
316 |
317 |
318 | class DefaultSource(Source):
319 | def parse_article(self, html, pmid=None, **kwargs):
320 | soup = super(DefaultSource, self).parse_article(html, pmid, **kwargs)
321 | if not soup:
322 | return False
323 |
324 | self.article.missing_source = True
325 | return self.article
326 |
327 |
328 | class HighWireSource(Source):
329 |
330 | def parse_article(self, html, pmid=None, **kwargs):
331 | soup = super(HighWireSource, self).parse_article(html, pmid, **kwargs)
332 | if not soup:
333 | return False
334 |
335 | # To download tables, we need the content URL and the number of tables
336 | content_url = soup.find('meta', {
337 | 'name': 'citation_public_url'})['content']
338 |
339 | n_tables = len(soup.find_all('span', class_='table-label'))
340 |
341 | # Now download each table and parse it
342 | tables = []
343 | logger.info(f"Found {n_tables} tables.")
344 | for i in range(n_tables):
345 | t_num = i + 1
346 | url = '%s/T%d.expansion.html' % (content_url, t_num)
347 | table_soup = self._download_table(url)
348 | if not table_soup:
349 | continue
350 | tc = table_soup.find(class_='table-expansion')
351 | if tc:
352 | t = tc.find('table', {'id': 'table-%d' % (t_num)})
353 | t = self.parse_table(t)
354 | if t:
355 | t.position = t_num
356 | t.label = tc.find(class_='table-label').text
357 | t.number = t.label.split(' ')[-1].strip()
358 | try:
359 | t.caption = tc.find(class_='table-caption').get_text()
360 | except:
361 | pass
362 | try:
363 | t.notes = tc.find(class_='table-footnotes').get_text()
364 | except:
365 | pass
366 | tables.append(t)
367 |
368 | self.article.tables = tables
369 | return self.article
370 |
371 | def parse_table(self, table):
372 | return super(HighWireSource, self).parse_table(table)
373 |
374 | def extract_doi(self, soup):
375 | try:
376 | return soup.find('meta', {'name': 'citation_doi'})['content']
377 | except:
378 | return ''
379 |
380 | def extract_pmid(self, soup):
381 | return soup.find('meta', {'name': 'citation_pmid'})['content']
382 |
383 | def extract_text(self, soup):
384 | # If the article body lives in a div with class "article", restrict
385 | # extraction to that div, stripping reference lists, license text,
386 | # and other boilerplate sections first.
387 |
388 | div = soup.find_all("div", class_="article")
389 | if div:
390 | div = div[0]
391 | div_classes = ["ref-list", "abstract", "copyright-statement", "fn-group", "history-list", "license"]
392 | for class_ in div_classes:
393 | for tag in div.find_all(class_=class_):
394 | tag.extract()
395 | soup = div
396 |
397 | return super(HighWireSource, self).extract_text(soup)
398 |
399 |
400 | class OUPSource(Source):
401 |
402 | def parse_article(self, html, pmid=None, **kwargs):
403 | soup = super(OUPSource, self).parse_article(html, pmid, **kwargs)
404 | if not soup:
405 | return False
406 |
407 | # Extract tables
408 | tables = []
409 |
410 | # Exclude modal tables to prevent duplicates
411 | all_tables = set(soup.select('div.table-full-width-wrap'))
412 | modal_tables = set(soup.select('div.table-full-width-wrap.table-modal'))
413 | table_containers = all_tables - modal_tables
414 | logger.info(f"Found {len(table_containers)} tables.")
415 | for (i, tc) in enumerate(table_containers):
416 | table_html = tc.find('table')
417 | t = self.parse_table(table_html)
418 | if t:
419 | t.position = i + 1
420 | try:
421 | t.number = tc.find('span', class_='label').text.split(' ')[-1].strip()
422 | t.label = tc.find('span', class_='label').text.strip()
423 | except:
424 | pass
425 | try:
426 | t.caption = tc.find('span', class_='caption').get_text()
427 | except:
428 | pass
429 | try:
430 | t.notes = tc.find('span', class_='fn').get_text()
431 | except:
432 | pass
433 | tables.append(t)
434 |
435 | self.article.tables = tables
436 | return self.article
437 |
438 | def parse_table(self, table):
439 | return super(OUPSource, self).parse_table(table)
440 |
441 | def extract_doi(self, soup):
442 | try:
443 | return soup.find('meta', {'name': 'citation_doi'})['content']
444 | except:
445 | return ''
446 |
447 | def extract_pmid(self, soup):
448 | pmid = soup.find('meta', {'name': 'citation_pmid'})
449 | if pmid:
450 | return pmid['content']
451 | else:
452 | return None
453 |
454 | def extract_text(self, soup):
455 | # If the article body lives in a div with class "article-body", restrict
456 | # extraction to that div, stripping reference lists, license text,
457 | # and other boilerplate sections first.
458 |
459 | div = soup.find_all("div", class_="article-body")
460 | if div:
461 | div = div[0]
462 | div_classes = ["ref-list", "abstract", "copyright-statement", "fn-group", "history-list", "license"]
463 | for class_ in div_classes:
464 | for tag in div.find_all(class_=class_):
465 | tag.extract()
466 | soup = div
467 |
468 | return super(OUPSource, self).extract_text(soup)
469 |
470 |
471 | class ScienceDirectSource(Source):
472 |
473 | def parse_article(self, html, pmid=None, **kwargs):
474 | soup = super(ScienceDirectSource, self).parse_article(html, pmid, **kwargs)
475 | if not soup:
476 | return False
477 |
478 | # Extract tables
479 | tables = []
480 | table_containers = soup.find_all('div', {'class': 'tables'})
481 | if len(table_containers) == 0:
482 | # try old method
483 | table_containers = soup.find_all('dl', {'class': 'table'})
484 |
485 | logger.info(f"Found {len(table_containers)} tables.")
486 | for (i, tc) in enumerate(table_containers):
487 | table_html = tc.find('table')
488 | t = self.parse_table(table_html)
489 | if t:
490 | t.position = i + 1
491 | try:
492 | t.number = tc.find('span', class_='label').text.split(' ')[-1].strip() or tc['data-label'].split(' ')[-1].strip()
493 | t.label = tc.find('span', class_='label').text.strip()
494 | except:
495 | pass
496 | try:
497 | t.caption = tc.find('p').contents[-1].strip()
498 | except:
499 | pass
500 | try:
501 | t.notes = tc.find(class_='tblFootnote').get_text()
502 | except:
503 | pass
504 | tables.append(t)
505 |
506 | self.article.tables = tables
507 | return self.article
508 |
509 | def parse_table(self, table):
510 | return super(ScienceDirectSource, self).parse_table(table)
511 |
512 | def extract_doi(self, soup):
513 | try:
514 | return list(soup.find('div', {'id': 'article-identifier-links'}).children)[0]['href'].replace('https://doi.org/', '')
515 | except:
516 | return ''
517 |
518 | def extract_pmid(self, soup):
519 | return scrape.get_pmid_from_doi(self.extract_doi(soup))
520 |
521 |
522 | class PlosSource(Source):
523 |
524 | def parse_article(self, html, pmid=None, **kwargs):
525 | soup = super(PlosSource, self).parse_article(html, pmid, **kwargs) # Do some preprocessing
526 | if not soup:
527 | return False
528 |
529 | # Extract tables
530 | tables = []
531 | table_containers = soup.find_all('table-wrap')
532 | logger.info(f"Found {len(table_containers)} tables.")
533 | for (i, tc) in enumerate(table_containers):
534 | table_html = tc.find('table')
535 | t = self.parse_table(table_html)
536 | if t:
537 | t.position = i + 1
538 | t.label = tc.find('label').text
539 | t.number = t.label.split(' ')[-1].strip()
540 | try:
541 | t.caption = tc.find('title').get_text()
542 | except:
543 | pass
544 | try:
545 | t.notes = tc.find('table-wrap-foot').get_text()
546 | except:
547 | pass
548 | tables.append(t)
549 |
550 | self.article.tables = tables
551 | return self.article
552 |
553 | def parse_table(self, table):
554 | return super(PlosSource, self).parse_table(table)
555 |
556 | def extract_doi(self, soup):
557 | try:
558 | return soup.find('article-id', {'pub-id-type': 'doi'}).text
559 | except:
560 | return ''
561 |
562 | def extract_pmid(self, soup):
563 | return scrape.get_pmid_from_doi(self.extract_doi(soup))
564 |
565 |
566 | class FrontiersSource(Source):
567 |
568 | def parse_article(self, html, pmid=None, **kwargs):
569 |
570 | soup = super(FrontiersSource, self).parse_article(html, pmid, **kwargs)
571 | if not soup:
572 | return False
573 |
574 | # Extract tables
575 | tables = []
576 | table_containers = soup.findAll(
577 | 'table-wrap', {'id': re.compile('^T\d+$')})
578 | logger.info(f"Found {len(table_containers)} tables.")
579 | for (i, tc) in enumerate(table_containers):
580 | table_html = tc.find('table')
581 | t = self.parse_table(table_html)
582 | # If Table instance is returned, add other properties
583 | if t:
584 | t.position = i + 1
585 | t.number = tc['id'][1::].strip()
586 | t.label = tc.find('label').get_text()
587 | try:
588 | t.caption = tc.find('caption').get_text()
589 | except:
590 | pass
591 | try:
592 | t.notes = tc.find('table-wrap-foot').get_text()
593 | except:
594 | pass
595 | tables.append(t)
596 |
597 | self.article.tables = tables
598 | return self.article
599 |
600 | def parse_table(self, table):
601 | return super(FrontiersSource, self).parse_table(table)
602 |
603 | def extract_doi(self, soup):
604 | try:
605 | return soup.find('article-id', {'pub-id-type': 'doi'}).text
606 | except:
607 | return ''
608 |
609 | def extract_pmid(self, soup):
610 | return scrape.get_pmid_from_doi(self.extract_doi(soup))
611 |
612 |
613 | class JournalOfCognitiveNeuroscienceSource(Source):
614 |
615 | def parse_article(self, html, pmid=None, **kwargs):
616 | soup = super(
617 | JournalOfCognitiveNeuroscienceSource, self).parse_article(html, pmid, **kwargs)
618 | if not soup:
619 | return False
620 |
621 | # To download tables, we need the DOI and the number of tables
622 | doi = self.article.doi or self.extract_doi(soup)
623 | tables = []
624 |
625 | # Now download each table and parse it
626 | table_containers = soup.find_all('div', {'class': 'table-wrap'})
627 | logger.info(f"Found {len(table_containers)} tables.")
628 | for i, tc in enumerate(table_containers):
629 | table_html = tc.find('table', {'role': 'table'})
630 | if not table_html:
631 | continue
632 |
633 | t = self.parse_table(table_html)
634 |
635 | if t:
636 | t.position = i + 1
637 | s = re.search('T(\d+).+$', tc['content-id'])
638 | if s:
639 | t.number = s.group(1)
640 | caption = tc.find('div', class_='caption')
641 | if caption:
642 | t.label = caption.get_text()
643 | t.caption = caption.get_text()
644 | try:
645 | t.notes = tc.find('div', class_="fn").p.get_text()
646 | except:
647 | pass
648 | tables.append(t)
649 |
650 | self.article.tables = tables
651 | return self.article
652 |
653 | def parse_table(self, table):
654 | return super(JournalOfCognitiveNeuroscienceSource, self).parse_table(table)
655 |
656 | def extract_doi(self, soup):
657 | try:
658 | return soup.find('meta', {'name': 'dc.Identifier', 'scheme': 'doi'})['content']
659 | except:
660 | return ''
661 |
662 | def extract_pmid(self, soup):
663 | return scrape.get_pmid_from_doi(self.extract_doi(soup))
664 |
665 |
666 | class WileySource(Source):
667 |
668 | def parse_article(self, html, pmid=None, **kwargs):
669 |
670 | soup = super(WileySource, self).parse_article(html, pmid, **kwargs) # Do some preprocessing
671 | if not soup:
672 | return False
673 |
674 | # Extract tables
675 | tables = []
676 | table_containers = soup.findAll('div', {
677 | 'class': re.compile('article-table-content|table'), 'id': re.compile('^(.*?)\-tbl\-\d+$|^t(bl)*\d+$')})
678 | logger.info(f"Found {len(table_containers)} tables.")
679 | for (i, tc) in enumerate(table_containers):
680 | table_html = tc.find('table')
681 | footer = None
682 | try:
683 | # Remove footer, which appears inside table
684 | footer = table_html.tfoot.extract()
685 | except:
686 | pass
687 | t = self.parse_table(table_html)
688 | # If Table instance is returned, add other properties
689 | if t:
690 | t.position = i + 1
691 | # t.number = tc['id'][3::].strip()
692 | t.number = re.search('t[bl0\-]*(\d+)$', tc['id']).group(1)
693 | try:
694 | t.label = tc.find('span', class_='label').get_text()
695 | except:
696 | pass
697 | try:
698 | t.caption = tc.find('caption').get_text()
699 | except AttributeError:
700 | caption = tc.find('div', {'header': 'article-table-caption'})
701 | t.caption = caption.get_text() if caption else None
702 | try:
703 | t.notes = footer.get_text() if footer else None
704 | except AttributeError:
705 | notes = tc.find('div', {'class': 'article-section__table-footnotes'})
706 | t.notes = notes.get_text() if notes else None
707 | pass
708 | tables.append(t)
709 |
710 | self.article.tables = tables
711 | return self.article
712 |
713 | def parse_table(self, table):
714 | return super(WileySource, self).parse_table(table)
715 |
716 | def extract_doi(self, soup):
717 | try:
718 | return soup.find('meta', {'name': 'citation_doi'})['content']
719 | except:
720 | return ''
721 |
722 | def extract_pmid(self, soup):
723 | return scrape.get_pmid_from_doi(self.extract_doi(soup))
724 |
725 | # Note: the SageSource is largely useless and untested because Sage renders tables
726 | # as images.
727 |
728 |
729 | class SageSource(Source):
730 |
731 | def parse_article(self, html, pmid=None, **kwargs):
732 |
733 | soup = super(SageSource, self).parse_article(html, pmid, **kwargs)
734 | if not soup:
735 | return False
736 |
737 | # To download tables, we need the content URL and the number of tables
738 | content_url = soup.find('meta', {
739 | 'name': 'citation_public_url'})['content']
740 |
741 | n_tables = len(soup.find_all('span', class_='table-label'))
742 | logger.info(f"Found {n_tables} tables.")
743 | # Now download each table and parse it
744 | tables = []
745 | for i in range(n_tables):
746 | t_num = i + 1
747 | url = '%s/T%d.expansion.html' % (content_url, t_num)
748 | table_soup = self._download_table(url)
749 | if not table_soup:
750 | continue
751 | tc = table_soup.find(class_='table-expansion')
752 | if tc:
753 | t = tc.find('table', {'id': 'table-%d' % (t_num)})
754 | t = self.parse_table(t)
755 | if t:
756 | t.position = t_num
757 | t.label = tc.find(class_='table-label').text
758 | t.number = t.label.split(' ')[-1].strip()
759 | try:
760 | t.caption = tc.find(class_='table-caption').get_text()
761 | except:
762 | pass
763 | try:
764 | t.notes = tc.find(class_='table-footnotes').get_text()
765 | except:
766 | pass
767 | tables.append(t)
768 |
769 | self.article.tables = tables
770 | return self.article
771 |
772 | def parse_table(self, table):
773 | return super(SageSource, self).parse_table(table)
774 |
775 | def extract_doi(self, soup):
776 | try:
777 | return soup.find('meta', {'name': 'citation_doi'})['content']
778 | except:
779 | return ''
780 |
781 | def extract_pmid(self, soup):
782 | return soup.find('meta', {'name': 'citation_pmid'})['content']
783 |
784 |
785 | class OldSpringerSource(Source):
786 |
787 | def parse_article(self, html, pmid=None, **kwargs):
788 |
789 | soup = super(OldSpringerSource, self).parse_article(html, pmid, **kwargs)
790 | if not soup:
791 | return False
792 |
793 | # Extract tables
794 | tables = []
795 | table_containers = soup.findAll(
796 | 'figure', {'id': re.compile('^Tab\d+$')})
797 | for (i, tc) in enumerate(table_containers):
798 | table_html = tc.find('table')
799 | t = self.parse_table(table_html)
800 | # If Table instance is returned, add other properties
801 | if t:
802 | t.position = i + 1
803 | t.number = tc['id'][3::].strip()
804 | t.label = tc.find('span', class_='CaptionNumber').get_text()
805 | try:
806 | t.caption = tc.find(class_='CaptionContent').p.get_text()
807 | except:
808 | pass
809 | try:
810 | t.notes = tc.find(class_='TableFooter').p.get_text()
811 | except:
812 | pass
813 | tables.append(t)
814 |
815 | self.article.tables = tables
816 | return self.article
817 |
818 | def parse_table(self, table):
819 | return super(OldSpringerSource, self).parse_table(table)
820 |
821 | def extract_doi(self, soup):
822 | content = soup.find('p', class_='ArticleDOI').get_text()
823 | return content.split(' ')[1]
824 |
825 | def extract_pmid(self, soup):
826 | return scrape.get_pmid_from_doi(self.extract_doi(soup))
827 |
828 |
829 | class SpringerSource(Source):
830 |
831 | def parse_article(self, html, pmid=None, **kwargs):
832 |
833 | soup = super(SpringerSource, self).parse_article(html, pmid, **kwargs)
834 | if not soup:
835 | return False
836 |
837 | # Extract tables by downloading and parsing each one via its own link
838 | # To download tables, we need the content URL and the number of tables
839 | content_url = soup.find('meta', {'name': 'citation_fulltext_html_url'})['content']
840 |
841 | n_tables = len(soup.find_all('span', string='Full size table'))
842 | logger.info(f"Found {n_tables} tables.")
843 | # Now download each table and parse it
844 | tables = []
845 | for i in range(n_tables):
846 | t_num = i + 1
847 | url = '%s/tables/%d' % (content_url, t_num)
848 | table_soup = self._download_table(url)
849 | if not table_soup:
850 | continue
851 | tc = table_soup.find(class_='data last-table')
852 | t = self.parse_table(tc)
853 | if t:
854 | t.position = t_num
855 |
856 | # id_name is the id of the HTML element that contains the title, label, and table number to be parsed
857 | # temp_title splits that title text so its parts can be categorized below
858 | id_name = f"table-{t_num}-title"
859 | temp_title = table_soup.find('h1', attrs={'id': id_name}).get_text().split()
860 |
861 | # the first two tokens form the label (e.g., 'Table 2')
862 | t.label = " ".join(temp_title[:2])
863 | t.number = str(temp_title[1])
864 | try:
865 | # the remaining tokens form the caption/title of the table
866 | t.caption = " ".join(temp_title[2:])
867 | except:
868 | pass
869 | try:
870 | t.notes = table_soup.find(class_='c-article-table-footer').get_text()
871 | except:
872 | pass
873 | tables.append(t)
874 |
875 | self.article.tables = tables
876 | return self.article
877 |
878 | def parse_table(self, table):
879 | return super(SpringerSource, self).parse_table(table)
880 |
881 | def extract_doi(self, soup):
882 | try:
883 | return soup.find('meta', attrs={'name': "citation_doi"})['content']
884 | except:
885 | return ''
886 |
887 | def extract_pmid(self, soup):
888 | return scrape.get_pmid_from_doi(self.extract_doi(soup))
889 |
890 |
891 | class PMCSource(Source):
892 | def parse_article(self, html, pmid=None, **kwargs):
893 | soup = super(PMCSource, self).parse_article(html, pmid, **kwargs)
894 | if not soup:
895 | return False
896 |
897 | tables = []
898 | table_containers = soup.findAll('div', {'class': 'table-wrap'})
899 | logger.info(f"Found {len(table_containers)} tables.")
900 | for (i, tc) in enumerate(table_containers):
901 | sub_tables = tc.findAll('div', {'class': 'xtable'})
902 | for st in sub_tables:
903 | t = self.parse_table(st)
904 | if t:
905 | t.position = i + 1
906 | t.label = tc.find('h3').text if tc.find('h3') else None
907 | t.number = t.label.split(' ')[-1].strip() if t.label else None
908 | try:
909 | t.caption = tc.find('div', class_='caption').text
910 | except:
911 | pass
912 | try:
913 | t.notes = tc.find('div', class_='tblwrap-foot').text
914 | except:
915 | pass
916 | tables.append(t)
917 |
918 | self.article.tables = tables
919 | return self.article
920 |
921 | def extract_pmid(self, soup):
922 | return soup.find('meta', {'name': 'citation_pmid'})['content']
923 |
924 | def extract_doi(self, soup):
925 | return soup.find('meta', {'name': 'citation_doi'})['content']
926 |
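927 |
928 | if __name__ == '__main__':
929 |     # Minimal usage sketch, not part of the module proper: identify the
930 |     # publisher of a locally saved HTML article and parse it. The SQLite
931 |     # database name and 'article.html' are placeholders, following the
932 |     # pattern in examples/create_db_and_add_articles.py.
933 |     db = database.Database(adapter='sqlite', db_name='sqlite:///example_db.db')
934 |     manager = SourceManager(db)
935 |     html = open('article.html', encoding='utf-8').read()
936 |     source = manager.identify_source(html)
937 |     if source is not None:
938 |         article = source.parse_article(html)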
--------------------------------------------------------------------------------
/ace/sources/Frontiers.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Frontiers",
3 | "identifiers": [
4 | "10.3389"
5 | ],
6 | "entities": {
7 | "−": "-",
8 | " ": " "
9 | }
10 | }
--------------------------------------------------------------------------------
/ace/sources/HighWire.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "HighWire",
3 | "identifiers": [
4 | "highwire-journal",
5 | "http://schema.highwire.org/Linking",
6 | "highwire-journal-article"
7 | ],
8 | "entities": {
9 | },
10 | "delay": 10
11 | }
12 |
--------------------------------------------------------------------------------
/ace/sources/JournalOfCognitiveNeuroscience.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "JournalOfCognitiveNeuroscience",
3 | "identifiers": [
4 | "property=\"og:site_name\" content=\"MIT Press\"",
5 | "MIT Press Journals - Journal of Cognitive Neuroscience - Full Text"
6 | ],
7 | "entities": {
8 | "\u2002": " "
9 | },
10 | "delay": 10
11 | }
12 |
--------------------------------------------------------------------------------
/ace/sources/OUP.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "OUP",
3 | "identifiers": [
4 | "OUP Academic"
5 | ],
6 | "entities": {
7 | },
8 | "delay": 10
9 | }
--------------------------------------------------------------------------------
/ace/sources/OldSpringer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "OldSpringer",
3 | "identifiers": [
4 | "- Springer"
5 | ],
6 | "entities": {
7 |
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/ace/sources/PMC.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "PMC",
3 | "identifiers": [
4 | "",
5 | ""
6 | ],
7 | "entities": {
8 | }
9 | }
10 |
--------------------------------------------------------------------------------
/ace/sources/Plos.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Plos",
3 | "identifiers": [
4 | "Public Library of Science"
5 | ],
6 | "entities": {
7 | }
8 | }
--------------------------------------------------------------------------------
/ace/sources/Sage.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Sage",
3 | "identifiers": [
4 | ""
5 | ],
6 | "entities": {
7 |
8 | }
9 | }
--------------------------------------------------------------------------------
/ace/sources/ScienceDirect.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "ScienceDirect",
3 | "identifiers": [
4 | "title=\"ScienceDirect -The world's leading full-text scientific database\"",
5 | "- ScienceDirect"
6 | ],
7 | "entities": {
8 |
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/ace/sources/Springer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Springer",
3 | "identifiers": [
4 | "",
5 | "meta property=\"og:site_name\" content=\"SpringerLink\""
6 | ],
7 | "entities": {
8 |
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/ace/sources/Wiley.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Wiley",
3 | "identifiers": [
4 | "Wiley Online Library"
5 | ],
6 | "entities": {
7 |
8 | }
9 | }
--------------------------------------------------------------------------------
/ace/tableparser.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | # use unicode everywhere
3 |
4 | # import database
5 | import regex # Note: we're using features in the new regex module, not re!
6 | import logging
7 | from . import config
8 | from .database import Activation, Table
9 | from collections import Counter, defaultdict
10 |
11 |
12 | logger = logging.getLogger(__name__)
13 |
14 |
15 | def identify_standard_columns(labels):
16 | ''' Takes a set of column labels and returns an equal-length list with names
17 | of any standard columns detected. Unknown columns are assigned None.
18 | E.g., passing in ['p value', 'brain region', 'unknown_col'] would return
19 | ['p_value', 'region', None].
20 | '''
21 | standardized = [None] * len(labels)
22 | found_coords = False
23 | for i, lab in enumerate(labels):
24 | if regex.search('(^\s*ba$)|brodmann', lab):
25 | s = 'ba'
26 | elif regex.search('region|anatom|location|area', lab):
27 | s = 'region'
28 | elif regex.search('sphere|(^\s*h$)|^\s*hem|^\s*side', lab):
29 | s = 'hemisphere'
30 | elif regex.search('(^k$)|(mm.*?3)|volume|voxels|size|extent', lab):
31 | s = 'size'
32 | elif regex.match('\s*[xy]\s*$', lab):
33 | found_coords = True
34 | s = lab
35 | elif regex.match('\s*z\s*$', lab):
36 | # For z, we need to distinguish z plane from z-score.
37 | # Use simple heuristics:
38 | # * If no 'x' column exists, this must be a z-score
39 | # * If the preceding label was anything but 'y', must be a z-score
40 | # * Otherwise it's a z coordinate
41 | # Note: this could theoretically break if someone has non-contiguous
42 | # x-y-z columns, but this seems unlikely. If it does happen,
43 | # an alternative approach would be to check if the case of the 'z' column
44 | # matches the case of the 'x' column and make determination that
45 | # way.
46 | s = 'statistic' if not found_coords or labels[i - 1] != 'y' else 'z'
47 | elif regex.search('rdinate', lab):
48 | continue
49 | elif lab == 't' or regex.search('^(max.*(z|t).*|.*(z|t).*(score|value|max))$', lab):
50 | s = 'statistic'
51 | elif regex.search('p[\-\s]+.*val', lab):
52 | s = 'p_value'
53 | else:
54 | s = None
55 | standardized[i] = s
56 | return standardized
57 |
58 |
59 | def identify_repeating_groups(labels):
60 | ''' Identify groups: any sets of columns where names repeat.
61 | Repeating groups must be contiguous; i.e., [x, y, z, w, x, y, z, f]
62 | will not match, but [w, f, x, y, z, x, y, z] will.
63 |
64 | Note that this will only handle one level of repetition; i.e.,
65 | hierarchical groupings will be ignored. E.g., in a 2 x 2 x 3
66 | nesting of columns like hemisphere --> condition --> x/y/z,
67 | only the 4 sets of repeating x/y/z columns will be detected.
68 |
69 | Returns a list of strings made up of the index of the first column
70 | in the group and the number of columns. E.g., '1/3' indicates the
71 | group starts at the second column and contains 3 columns. These
72 | keys can be used to directly look up names stored in a
73 | multicolumn_label dictionary.
74 | '''
75 | # OLD ALGORITHM: MUCH SIMPLER AND FASTER BUT DOESN'T WORK PROPERLY
76 | # FOR NON-CONTIGUOUS COLUMN GROUPS
77 | # target = '###'.join(unicode(x) for x in labels)
78 | # pattern = regex.compile(r'(.+?###.+?)(###\1)+')
79 | # matches = pattern.finditer(target)
80 | # groups = []
81 | # for m in matches:
82 | # sp = m.span()
83 | # n_cols_in_group = len(m.group(1).split('###'))
84 | # start = len(target[0:sp[0]].split('###'))-1
85 | # n_matches = len(m.group(0).split('###'))
86 | # for i in range(n_matches/n_cols_in_group):
87 | # groups.append('%d/%d' % ((i*n_cols_in_group)+start, n_cols_in_group))
88 | # return list(set(groups))
89 |
90 | groups = []
91 | n_labels = len(labels)
92 | label_counts = Counter(labels)
93 | rep_labels = set([k for k, v in list(label_counts.items()) if v > 1])
94 | # Track multi-label sequences. Key/value = sequence/onset
95 | label_seqs = defaultdict(list)
96 |
97 | # Loop over labels and identify any sequences made up entirely of labels with
98 | # 2 or more occurrences in the list and without the starting label repeating.
99 | for i, lab in enumerate(labels):
100 | if lab not in rep_labels:
101 | continue
102 | current_seq = [lab]
103 | for j in range(i+1, n_labels):
104 | lab_j = labels[j]
105 | if lab_j not in rep_labels or lab_j == lab:
106 | break
107 | current_seq.append(lab_j)
108 | if len(current_seq) > 1:
109 | label_seqs['###'.join(current_seq)].append(i)
110 |
111 | # Keep only sequences that occur two or more times
112 | label_seqs = { k: v for k, v in list(label_seqs.items()) if len(v) > 1}
113 |
114 | # Invert what's left into a list where the sequence occurs at its start pos
115 | seq_starts = [None] * n_labels
116 | for k, v in list(label_seqs.items()):
117 | for start in v:
118 | seq_starts[start] = k.split('###')
119 |
120 | # Create boolean array to track whether each element has already been used
121 | labels_used = [False] * n_labels
122 |
123 | # Loop through labels and add a group if we find a sequence that starts at
124 | # the current position and spans at least one currently unused cell.
125 | # This is necessary to account for cases where one sequence isn't always
126 | # part of the same supersequence, e.g., the y/z in x/y/z could also be a
127 | # part of a/y/z or b/y/z.
128 | for i, lab in enumerate(labels):
129 | if seq_starts[i] is not None:
130 | seq_size = len(seq_starts[i])
131 | if not all(labels_used[i:(i+seq_size)]):
132 | labels_used[i:(i+seq_size)] = [True] * seq_size
133 |
134 | # We need to make sure the group contains x/y/z information,
135 | # otherwise we'll end up duplicating a lot of activations.
136 | # This is not a very good place to put this check; eventually
137 | # we need to refactor much of this class.
138 | groups.append('%d/%d' % (i, seq_size))
139 |
140 | return groups
141 |
142 |
143 |
144 | def create_activation(data, labels, standard_cols, group_labels=[]):
145 |
146 | activation = Activation()
147 |
148 | for i, col in enumerate(data):
149 |
150 | # Replace unicode minus signs with hyphens
151 | replace = ['֊', '‐', '‑', '⁃', '﹣', '-', '‒', '–', '—', '﹘', '−', '-']
152 | for c in replace:
153 | if c in col:
154 | col = col.replace(c, '-')
155 | col = col.replace(c + c, '-')
156 |
157 | # Cast to integer or float if appropriate
158 | # if regex.match('[-\d]+$', col):
159 | # col = int(col)
160 | # elif regex.match('[-\d\.]+$', col):
161 | # col = float(col)
162 |
163 | # Set standard attributes if applicable and do validation where appropriate.
164 | # Generally, validation will not prevent a bad value from making it into the
165 | # activation object, but it will flag any potential issues using the "problem" column.
166 | if standard_cols[i] is not None:
167 |
168 | sc = standard_cols[i]
169 |
170 | # Validate XYZ columns: Should only be integers (and possible trailing decimals).
171 | # If they're not, keep only leading numbers. The exception is that ScienceDirect
172 | # journals often follow the minus sign with a space (e.g., - 35), which we strip.
173 | if regex.match('[xyz]$', sc):
174 | m = regex.match('([-])\s?(\d+\.*\d*)$', col)
175 | if m:
176 | col = "%s%s" % (m.group(1), m.group(2))
177 | if not regex.match('([-]*\d+)\.*\d*$', col):
178 | logger.debug("Value %s in %s column is not valid" % (col, sc))
179 | activation.problems.append("Value in %s column is not valid" % sc)
180 | return activation
181 | col = (float(col))
182 |
183 | elif sc == 'region':
184 | if not regex.search('[a-zA-Z]', col):
185 | logger.debug("Value in region column is not a string")
186 | activation.problems.append("Value in region column is not a string")
187 |
188 | setattr(activation, sc, col)
189 |
190 | # Always include all columns in record
191 | activation.add_col(labels[i], col)
192 |
193 | # Handle columns with multiple coordinates (e.g., 45;12;-12).
194 | # Assume that any series of 3 numbers in a non-standard column
195 | # reflects coordinates. Will fail if there are leading numbers!!!
196 | # Also need to remove space between minus sign and numbers; some ScienceDirect
197 | # journals leave a gap.
198 | if standard_cols[i] is None:
199 | cs = '([-]?\d{1,3}\.?\d{0,2})'
200 | clean_col = regex.sub(r'(?<=-)\s+', '', col)
--------------------------------------------------------------------------------
/example_tables.txt:
--------------------------------------------------------------------------------
9 | # Instead of a <thead> for a new group, an entire <tr> is used
10 | http://www.sciencedirect.com/science/article/pii/S1053811911007609
11 |
12 | # Doesn't detect x/y/z in Table 1 correctly
13 | http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0068494
14 |
--------------------------------------------------------------------------------
/examples/create_db_and_add_articles.py:
--------------------------------------------------------------------------------
1 | # In this example we create a new DB file and process a bunch of
2 | # articles. Note that due to copyright restrictions, articles can't
3 | # be included in this package, so you'll need to replace PATH_TO_FILES
4 | # with something that works.
5 |
6 | from ace import database
7 | from ace.ingest import add_articles
8 |
9 | # Uncomment the next line to see more information
10 | # ace.set_logging_level('info')
11 |
12 | # Change this to a valid path to a set of html files.
13 | PATH_TO_FILES = "/home/zorro/neurosynth_scrape/articles/html/Neuroimage/*"
14 |
15 | db = database.Database(adapter='sqlite', db_name='sqlite:///example_db.db')
16 | add_articles(db, PATH_TO_FILES, pmid_filenames=True)
17 | db.print_stats()
18 |
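19 |
20 | # Optional follow-up (sketch): db.article_exists() is the same check the
21 | # Source classes use before re-parsing an article, so it can be used to
22 | # confirm that a given PMID was ingested. The PMID below is a placeholder.
23 | # if db.article_exists('123456'):
24 | #     print('Article 123456 was ingested.')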
--------------------------------------------------------------------------------
/examples/fetch_articles_from_pubmed.py:
--------------------------------------------------------------------------------
1 | """ Query PubMed for results from several journals, and save to file.
2 | The resulting directory can then be passed to the Database instance for
3 | extraction, as in the create_db_and_add_articles example.
4 | NOTE: selenium must be installed and working properly for this to work.
5 | Code has only been tested with the Chrome driver. """
6 |
7 | from ace.scrape import Scraper
8 | import ace
9 | import os
10 |
11 |
12 | journals = {
13 | 'Neuroimage': {
14 | 'delay': 20, # Mean delay between article downloads--prevents the banhammer
15 | 'mode': 'browser', # ScienceDirect journals require selenium to work properly
16 | 'search': 'fmri', # Only retrieve articles with this string in abstract
17 | 'min_pmid': 34447833, # Start from this PMID--can run incrementally
18 | }
19 | }
20 |
21 | # Verbose output
22 | ace.set_logging_level('debug')
23 |
24 | # Create temporary output dir
25 | output_dir = '/tmp/articles'
26 | if not os.path.exists(output_dir):
27 | os.makedirs(output_dir)
28 |
29 | # Initialize Scraper
30 | scraper = Scraper('/tmp/articles')
31 |
32 | # Loop through journals and retrieve their articles
33 | for j, settings in list(journals.items()):
34 | scraper.retrieve_journal_articles(j, skip_pubmed_central=True, **settings)
35 |
36 |
37 |
38 |
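39 | # The scraped files land under output_dir/html/<journal>/<pmid>.html, so the
40 | # directory can then be ingested as in examples/create_db_and_add_articles.py.
41 | # A sketch (assumes the scrape above has completed):
42 | # from ace import database
43 | # from ace.ingest import add_articles
44 | # db = database.Database(adapter='sqlite', db_name='sqlite:///example_db.db')
45 | # add_articles(db, output_dir + '/html/Neuroimage/*', pmid_filenames=True)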
--------------------------------------------------------------------------------
/requirements.dev.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | pytest-recording
3 | vcrpy
4 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4
2 | regex
3 | requests
4 | simplejson
5 | sqlalchemy
6 | selenium
7 | seleniumbase
8 | tqdm
9 | xmltodict
10 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | # Borrowing a trick from nibabel
5 | if len(set(('test', 'easy_install', 'develop')).intersection(sys.argv)) > 0:
6 | import setuptools
7 |
8 | from distutils.core import setup
9 |
10 | extra_setuptools_args = {}
11 | if 'setuptools' in sys.modules:
12 | extra_setuptools_args = dict(
13 | tests_require=['nose'],
14 | test_suite='nose.collector',
15 | extras_require=dict(
16 | test='nose>=0.10.1')
17 | )
18 |
19 | # fetch version from within ACE module
20 | with open(os.path.join('ace', 'version.py')) as f:
21 | exec(f.read())
22 |
23 | setup(name="ace",
24 | version=__version__,
25 | description="Automated Coordinate Extraction",
26 | maintainer='Tal Yarkoni',
27 | maintainer_email='tyarkoni@gmail.com',
28 | url='http://github.com/neurosynth/ace',
29 | packages=["ace",
30 | "ace.tests"],
31 | package_data={'ace': ['sources/*'],
32 | 'ace.tests': ['data/*']
33 | },
34 | **extra_setuptools_args
35 | )
36 |
--------------------------------------------------------------------------------
| |