├── .github └── workflows │ └── test.yml ├── .gitignore ├── LICENSE ├── README.md ├── ace ├── .vscode │ └── settings.json ├── __init__.py ├── config.py ├── database.py ├── datatable.py ├── evaluate.py ├── export.py ├── extract.py ├── ingest.py ├── label.py ├── scrape.py ├── sources.py ├── sources │ ├── Frontiers.json │ ├── HighWire.json │ ├── JournalOfCognitiveNeuroscience.json │ ├── OUP.json │ ├── OldSpringer.json │ ├── PMC.json │ ├── Plos.json │ ├── Sage.json │ ├── ScienceDirect.json │ ├── Springer.json │ └── Wiley.json ├── tableparser.py ├── tests │ ├── __init__.py │ ├── cassettes │ │ └── test_ace │ │ │ ├── test_brain_research_source.yaml │ │ │ ├── test_cerebral_cortex_source.yaml │ │ │ ├── test_database_processing_stream.yaml │ │ │ ├── test_frontiers_source.yaml │ │ │ ├── test_journal_scraping.yaml │ │ │ ├── test_neuropsychologia_source.yaml │ │ │ ├── test_plos_source.yaml │ │ │ ├── test_pmc_source.yaml │ │ │ ├── test_science_direct_source.yaml │ │ │ └── test_springer_source.yaml │ ├── data │ │ ├── brain.html │ │ ├── cerebral_cortex.html │ │ ├── cognition.html │ │ ├── frontiers.html │ │ ├── jcogneuro.html │ │ ├── plosone.html │ │ ├── pmc.html │ │ ├── springer.html │ │ └── wiley.html │ ├── different_data │ │ ├── 14715131.html │ │ ├── 15028641.html │ │ ├── 15342430.html │ │ └── 18242723.html │ ├── test_ace.py │ └── weird_data │ │ ├── 11532885.html │ │ ├── 12417470.html │ │ ├── 15716157.html │ │ ├── 18439804.html │ │ ├── 18760263.html │ │ ├── 20159144.html │ │ ├── 22695256.html │ │ ├── 23813017.html │ │ ├── 26021218.html │ │ ├── 26696806.html │ │ ├── 28432782.html │ │ ├── 29366950.html │ │ ├── 36196770.html │ │ └── 38990127.html ├── utils.py └── version.py ├── example_tables.txt ├── examples ├── create_db_and_add_articles.py └── fetch_articles_from_pubmed.py ├── requirements.dev.txt ├── requirements.txt └── setup.py /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Install and Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | concurrency: 12 | group: testing-${{ github.ref }} 13 | cancel-in-progress: true 14 | 15 | jobs: 16 | test: 17 | runs-on: ubuntu-latest 18 | 19 | steps: 20 | - name: Checkout code 21 | uses: actions/checkout@v2 22 | 23 | - name: Set up Python 24 | uses: actions/setup-python@v4 25 | with: 26 | python-version: '3.8' 27 | 28 | - name: Install dependencies 29 | run: | 30 | pip install -r requirements.txt 31 | pip install -r requirements.dev.txt 32 | pip install -e . 
33 | 34 | - name: Test with pytest 35 | run: pytest 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.DS_Store 3 | *~ 4 | build/ 5 | dist 6 | dist/* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any person obtaining a copy 2 | of this software and associated documentation files (the "Software"), to deal 3 | in the Software without restriction, including without limitation the rights 4 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 5 | copies of the Software, and to permit persons to whom the Software is 6 | furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in 9 | all copies or substantial portions of the Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 17 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # What is ACE? 3 | 4 | ACE stands for Automated Coordinate Extraction. It's a Python package for automated extraction of functional MRI activation data from the tables of published neuroimaging articles. ACE is actually ACE2; a long, long time ago in a faraway land there was a clunkier Ruby version of ACE that did more or less the same thing much more poorly. Thankfully, Ruby ACE has now been disappeared from the internets forever, leaving us with the slightly better thought out package you see here. 5 | 6 | ## Installation 7 | 8 | Install the package from source: 9 | 10 | > python setup.py install 11 | 12 | Make sure you have all the dependencies installed (see requirements.txt). 13 | 14 | That's all! 15 | 16 | ## Usage 17 | 18 | For now, take a look at the tests to get a sense of how things work. A quickstart guide will fill this space in the near future. -------------------------------------------------------------------------------- /ace/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "tests" 4 | ], 5 | "python.testing.unittestEnabled": false, 6 | "python.testing.pytestEnabled": true 7 | } -------------------------------------------------------------------------------- /ace/__init__.py: -------------------------------------------------------------------------------- 1 | # emacs: -*- mode: python-mode; py-indent-offset: 4; tab-width: 4; indent-tabs-mode: nil -*- 2 | # ex: set sts=4 ts=4 sw=4 et: 3 | """ACE -- Automated Coordinate Extraction. 
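Example (a minimal sketch of package-level logging control; the level name shown is
illustrative, and the same setting can also be supplied via the ACE_LOGLEVEL
environment variable):

    import ace
    ace.set_logging_level('info')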
4 | """ 5 | __all__ = ["config", "ingest", "database", "datatable", "set_logging_level", "scrape", "sources", "tableparser", "tests", "__version__"] 6 | 7 | import logging 8 | import sys 9 | import os 10 | 11 | from .version import __version__ 12 | 13 | def set_logging_level(level=None): 14 | """Set package-wide logging level 15 | 16 | Args 17 | level : Logging level constant from logging module (warning, error, info, etc.) 18 | """ 19 | if level is None: 20 | level = os.environ.get('ACE_LOGLEVEL', 'warn') 21 | logger.setLevel(getattr(logging, level.upper())) 22 | return logger.getEffectiveLevel() 23 | 24 | def _setup_logger(logger): 25 | # Basic logging setup 26 | console = logging.StreamHandler(sys.stdout) 27 | console.setFormatter(logging.Formatter("%(levelname)-6s %(module)-7s %(message)s")) 28 | logger.addHandler(console) 29 | set_logging_level() 30 | 31 | # Set up logger 32 | logger = logging.getLogger("ace") 33 | _setup_logger(logger) -------------------------------------------------------------------------------- /ace/config.py: -------------------------------------------------------------------------------- 1 | ''' GLOBAL SETTINGS ''' 2 | 3 | # When True, all Exceptions will be suppressed. When False, Exception 4 | # messages will be printed out. 5 | SILENT_ERRORS = False 6 | 7 | 8 | ''' DATABASE SETTINGS ''' 9 | # Adapter to use--either 'mysql' or 'sqlite' 10 | SQL_ADAPTER = 'mysql' 11 | 12 | # SQLite path (when using sqlite adapter) 13 | SQLITE_URI = 'sqlite:///ace.db' 14 | 15 | # MySQL configuration 16 | MYSQL_USER = 'ace' 17 | MYSQL_PASSWORD = 'CHANGEME' 18 | MYSQL_DB = 'ace_test' 19 | 20 | # When True, any processed articles will be saved to DB, whether or not they 21 | # contain any extracted activations. When False, only articles from which 22 | # at least one activation was extracted will be saved. Note that if this is set 23 | # to False, processing will be much slower, since every article not already in 24 | # the DB will be parsed, even if it contains no activations and has been 25 | # previously processed. 26 | SAVE_ARTICLES_WITHOUT_ACTIVATIONS = True 27 | 28 | # By default, ACE will ignore any articles that already exist in the DB 29 | # when processing new HTML files. If OVERWRITE is set to True, ACE will 30 | # always overwrite existing records. This is useful when the extraction 31 | # code has improved substantially and you want to re-extract all data, 32 | # but should otherwise be left off for the sake of efficiency. 33 | OVERWRITE_EXISTING_ROWS = False 34 | 35 | 36 | ''' SOURCE PROCESSING SETTINGS ''' 37 | 38 | # If True, will exercise greater care when parsing (e.g., when estimating 39 | # number of columns in table, will check every row in the table and take the 40 | # max instead of just checking the first row). This is generally desirable, 41 | # but will result in slower processing. 42 | CAREFUL_PARSING = True 43 | 44 | # Sometimes tables have rows that can't be processed--usually because of malformed 45 | # HTML or XML (e.g., failure to close a tag). Such problems will always be 46 | # logged, but if IGNORE_BAD_ROWS is True, the row will be skipped and execution 47 | # will continue gracefully. When False, any errors will be re-raised, 48 | # terminating execution. 49 | IGNORE_BAD_ROWS = True 50 | 51 | # Whether or not to ignore tables that appear to be missing a label for at 52 | # least one column. 
This doesn't happen much, and in practice most tables with 53 | # missing labels appear to genuinely have empty columns that are ignored 54 | # anyway, so this should be left off unless problems arise. 55 | EXCLUDE_TABLES_WITH_MISSING_LABELS = False 56 | 57 | 58 | 59 | 60 | ''' SCRAPING/PARSING SETTINGS ''' 61 | USER_AGENTS = [ 62 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36', 63 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36', 64 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.155 Safari/537.36', 65 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36', 66 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36', 67 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36', 68 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36', 69 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36', 70 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.201 Safari/537.36', 71 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.78 Safari/537.36', 72 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36', 73 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.118 Safari/537.36', 74 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.91 Safari/537.36', 75 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36', 76 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36', 77 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36', 78 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.60 Safari/537.36', 79 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.122 Safari/537.36', 80 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36', 81 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.86 Safari/537.36', 82 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36', 83 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.6312.58 Safari/537.36', 84 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.128 Safari/537.36', 85 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.6367.207 Safari/537.36', 86 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.60 Safari/537.36' 87 | ] 88 | -------------------------------------------------------------------------------- /ace/database.py: 
-------------------------------------------------------------------------------- 1 | # Database stuff and models 2 | 3 | from sqlalchemy import (TypeDecorator, Table, Column, Integer, Float, String, Boolean, 4 | ForeignKey, DateTime, Text) 5 | from sqlalchemy.orm import relationship, backref, sessionmaker 6 | from sqlalchemy import create_engine 7 | from sqlalchemy.ext.declarative import declarative_base 8 | from sqlalchemy.ext.associationproxy import association_proxy 9 | from sqlalchemy.dialects.mysql import MEDIUMTEXT 10 | from sqlalchemy.sql import exists 11 | from datetime import datetime 12 | import simplejson as json 13 | import logging 14 | import sys 15 | from os import path 16 | import datetime 17 | 18 | from . import config 19 | from . import extract 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | Base = declarative_base() 24 | 25 | # Backend-dependent column for full text 26 | LongText = Text().with_variant(MEDIUMTEXT, 'mysql') 27 | 28 | # Handles all Database loading/saving stuff 29 | class Database: 30 | 31 | def __init__(self, adapter=None, db_name=None, user=None, password=None): 32 | ''' Connect to DB and initialize instance. ''' 33 | 34 | # Default to settings in config file if none passed 35 | if adapter is None: adapter = config.SQL_ADAPTER 36 | 37 | # Generate DB URI 38 | if adapter == 'sqlite': 39 | db_uri = config.SQLITE_URI if db_name is None else db_name 40 | elif adapter == 'mysql': 41 | db_name = config.MYSQL_DB if db_name is None else db_name 42 | if user is None: user = config.MYSQL_USER 43 | if password is None: password = config.MYSQL_PASSWORD 44 | db_uri = 'mysql://%s:%s@localhost/%s' % (user, password, db_name) 45 | else: 46 | raise ValueError("Value of SQL_ADAPTER in settings must be either 'sqlite' or 'mysql'") 47 | 48 | engine = create_engine(db_uri, echo=False, connect_args={'timeout': 15}) 49 | 50 | if adapter == 'mysql': engine.execute("SET sql_mode=''") 51 | 52 | Session = sessionmaker(bind=engine) 53 | Base.metadata.create_all(engine) 54 | self.session = Session() 55 | 56 | def add(self, record): 57 | ''' Add a record to the DB. ''' 58 | self.session.add(record) 59 | 60 | def save(self): 61 | ''' Commit all stored records to file. ''' 62 | self.session.commit() 63 | # except Exception as err: 64 | # print(err) 65 | 66 | def delete_article(self, pmid): 67 | article = self.session.query(Article).filter_by(id=pmid).first() 68 | self.session.delete(article) 69 | self.session.commit() 70 | 71 | def print_stats(self): 72 | ''' Summarize the current state of the DB. ''' 73 | n_articles = self.session.query(Article).count() 74 | n_articles_with_coordinates = self.session.query(Article).join(Table).filter(Table.n_activations>0).distinct('article_id').count() 75 | n_tables = self.session.query(Table).count() 76 | n_activations = self.session.query(Activation).count() 77 | n_links = self.session.query(NeurovaultLink).count() 78 | n_articles_with_links = self.session.query(NeurovaultLink).distinct('article_id').count() 79 | print(f"The database currently contains: {n_articles} articles.\n" 80 | f"{n_articles_with_coordinates} have coordinates, and {n_articles_with_links} have NeuroVault links.\n" 81 | f"Total of {n_tables} tables, {n_activations} activations and {n_links} NeuroVault links.") 82 | 83 | def article_exists(self, pmid): 84 | ''' Check if an article already exists in the database. 
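Example (sketch, assuming `db` is a Database instance and `article` an Article
that may or may not already be stored):

    if not db.article_exists(pmid):
        db.add(article)
        db.save()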
''' 85 | return self.session.query(exists().where(Article.id==pmid)).scalar() 86 | 87 | @property 88 | def articles(self): 89 | return self.session.query(Article).all() 90 | 91 | # Create a JSONString column type for convenience 92 | class JsonString(TypeDecorator): 93 | impl = Text 94 | 95 | def process_result_value(self, value, dialect): 96 | if value is None: 97 | return None 98 | else: 99 | return json.loads(value) 100 | 101 | def process_bind_param(self, value, dialect): 102 | if value is None: 103 | return None 104 | else: 105 | return json.dumps(value) 106 | 107 | 108 | class Article(Base): 109 | 110 | __tablename__ = 'articles' 111 | 112 | id = Column(Integer, primary_key=True) 113 | title = Column(String(200)) 114 | text = Column(LongText) 115 | journal = Column(String(200)) 116 | space = Column(String(20)) 117 | publisher = Column(String(200)) 118 | doi = Column(String(200)) 119 | year = Column(Integer) 120 | authors = Column(Text) 121 | abstract = Column(Text) 122 | citation = Column(Text) 123 | pubmed_metadata = Column(JsonString) 124 | created_at = Column(DateTime, default=datetime.datetime.utcnow) 125 | updated_at = Column(DateTime, default=datetime.datetime.utcnow, 126 | onupdate=datetime.datetime.utcnow) 127 | 128 | tables = relationship('Table', cascade="all, delete-orphan", 129 | backref='article') 130 | 131 | neurovault_links = relationship('NeurovaultLink', cascade="all, delete-orphan", 132 | backref='article') 133 | 134 | features = association_proxy('tags', 'feature') 135 | 136 | def __init__(self, text, pmid=None, doi=None, metadata=None): 137 | self.id = int(pmid) 138 | self.text = text 139 | self.space = extract.guess_space(text) 140 | self.doi = doi 141 | self.pubmed_metadata = metadata 142 | self.update_from_metadata() 143 | 144 | def update_from_metadata(self): 145 | if self.pubmed_metadata is not None: 146 | pmd = self.pubmed_metadata 147 | self.title = pmd['title'] 148 | self.journal = pmd['journal'] 149 | self.pubmed_metadata = pmd 150 | self.year = pmd['year'] 151 | self.authors = pmd['authors'] 152 | self.abstract = pmd['abstract'] 153 | self.citation = pmd['citation'] 154 | self.doi = pmd['doi'] 155 | 156 | 157 | class Table(Base): 158 | 159 | __tablename__ = 'tables' 160 | 161 | id = Column(Integer, primary_key=True) 162 | article_id = Column(Integer, ForeignKey('articles.id')) 163 | activations = relationship('Activation', cascade="all, delete-orphan", 164 | backref='table') 165 | position = Column(Integer) # The serial position of occurrence 166 | number = Column(String(10)) # The stated table ID (e.g., 1, 2b) 167 | label = Column(String(200)) # The full label (e.g., Table 1, Table 2b) 168 | caption = Column(Text) 169 | notes = Column(Text) 170 | n_activations = Column(Integer) 171 | n_columns = Column(Integer) 172 | 173 | def finalize(self): 174 | ''' Any cleanup and updating operations we need to do before saving. ''' 175 | 176 | # # Remove duplicate activations--most commonly produced by problems with 177 | # # the grouping code. 
178 | # act_defs = set() 179 | # to_keep = [] 180 | # for a in self.activations: 181 | # definition = json.dumps([a.x, a.y, a.z, a.groups]) 182 | # if definition not in act_defs: 183 | # act_defs.add(definition) 184 | # to_keep.append(a) 185 | # self.activations = to_keep 186 | 187 | self.n_activations = len(self.activations) 188 | 189 | 190 | class Activation(Base): 191 | 192 | __tablename__ = 'activations' 193 | 194 | id = Column(Integer, primary_key=True) 195 | 196 | article_id = Column(Integer, ForeignKey('articles.id')) 197 | table_id = Column(Integer, ForeignKey('tables.id')) 198 | columns = Column(JsonString) 199 | groups = Column(JsonString) 200 | problems = Column(JsonString) 201 | x = Column(Float) 202 | y = Column(Float) 203 | z = Column(Float) 204 | number = Column(Integer) 205 | region = Column(String(100)) 206 | hemisphere = Column(String(100)) 207 | ba = Column(String(100)) 208 | size = Column(String(100)) 209 | statistic = Column(String(100)) 210 | p_value = Column(String(100)) 211 | 212 | missing_source = Column(Boolean, default=False) 213 | 214 | def __init__(self): 215 | self.problems = [] 216 | self.columns = {} 217 | 218 | def set_coords(self, x, y, z): 219 | new_xyz = [] 220 | for c in [x, y, z]: 221 | if c == '' or c is None: 222 | c = None 223 | else: 224 | c = c.replace(' ', '').replace('--', '-').rstrip('.') 225 | c = float(c) 226 | new_xyz.append(c) 227 | 228 | self.x, self.y, self.z = new_xyz 229 | 230 | def add_col(self, key, val): 231 | self.columns[key] = val 232 | 233 | # Validates Peak. Considers peak invalid if: 234 | # * At least one of X, Y, Z is nil or missing 235 | # * Any |coordinate| > 100 236 | # * Two or more columns are zeroes (most of the time this 237 | # will indicate a problem, but occasionally a real coordinate) 238 | # Depending on config, either excludes peak, or allows it through 239 | # but flags potential problems for later inspection. 240 | def validate(self): 241 | 242 | for c in [self.x, self.y, self.z]: 243 | if c == '' or c is None: 244 | logger.debug('Missing x, y, or z coordinate information: [%s, %s, %s]' % tuple( 245 | [str(e) for e in [self.x, self.y, self.z]])) 246 | return False 247 | try: 248 | if abs(c) >= 100: 249 | logger.debug( 250 | 'Invalid coordinates: at least one dimension (x,y,z) >= 100.') 251 | return False 252 | except: 253 | print(c) 254 | print(sys.exc_info()[0]) 255 | raise 256 | 257 | sorted_xyz = sorted([abs(self.x), abs(self.y), abs(self.z)]) 258 | if sorted_xyz[0] == 0 and sorted_xyz[1] == 0: 259 | logger.debug( 260 | "At least two dimensions have value == 0; coordinate is probably not real.") 261 | return False 262 | 263 | return True 264 | 265 | class NeurovaultLink(Base): 266 | 267 | __tablename__ = 'Neurovaultlinks' 268 | 269 | id = Column(Integer, primary_key=True, autoincrement=True) 270 | neurovault_id = Column(Integer) 271 | url = Column(String(100)) 272 | type = Column(String(100)) 273 | 274 | article_id = Column(Integer, ForeignKey('articles.id')) 275 | -------------------------------------------------------------------------------- /ace/datatable.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logger = logging.getLogger(__name__) 3 | 4 | 5 | class DataTable: 6 | 7 | ''' Simple class to represent the contents of an HTML table. 8 | Basically just a grid with array accessor methods and 9 | some extra validation. 
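Example (a brief sketch of the intended interface; the cell values are arbitrary):

    dt = DataTable(2, 3)          # 2 rows x 3 columns, initialized to None
    dt.add_val('L IFG', cols=2)   # fill the next open cell, spanning two columns
    dt[1, 0] = '42'               # grid-style item assignment
    rows = dt.to_list()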
''' 10 | 11 | def __init__(self, n_rows, n_cols): 12 | self.data = [[None] * n_cols for n in range(n_rows)] 13 | # self.n_rows = n_rows 14 | self.n_cols = n_cols 15 | 16 | def __getitem__(self, inds): 17 | if isinstance(inds, int): 18 | inds = [inds] 19 | row = self.data[inds[0]] 20 | return row[inds[1]] if len(inds) > 1 else row 21 | 22 | def __setitem__(self, inds, v): 23 | self.data[inds[0]][inds[1]] = v 24 | 25 | def to_list(self): 26 | return self.data 27 | 28 | @property 29 | def n_rows(self): 30 | return len(self.data) 31 | 32 | def add_val(self, val, rows=1, cols=1): 33 | ''' Find next open position and add values to grid ''' 34 | 35 | # Flatten list and find next open position 36 | flat = [item for l in self.data for item in l] 37 | flat_set = set(flat) 38 | 39 | if not None in flat_set: 40 | open_pos = self.n_rows * self.n_cols 41 | for i in range(rows): 42 | self.data.append([None] * self.n_cols) 43 | 44 | else: 45 | # This indexing operation consumes a lot of CPU time for large tables; need to refactor! 46 | open_pos = flat.index(None) 47 | ri = open_pos / self.n_cols 48 | if (ri + rows) > self.n_rows: 49 | for i in range(round((ri + rows)) - self.n_rows): 50 | self.data.append([None] * self.n_cols) 51 | 52 | ri = open_pos // self.n_cols 53 | ci = open_pos % self.n_cols 54 | 55 | if cols + ci > self.n_cols: 56 | cols = self.n_cols - ci 57 | 58 | for r in range(rows): 59 | for c in range(cols): 60 | if cols > 1: 61 | content = '@@%s@%d' % ( 62 | val, cols) if c == 0 else '@@%s' % val 63 | else: 64 | content = val 65 | self[ri + r, ci + c] = content 66 | -------------------------------------------------------------------------------- /ace/evaluate.py: -------------------------------------------------------------------------------- 1 | """ Tools for evaluating the quality of extracted coordinates. """ 2 | 3 | import matplotlib.pyplot as plt 4 | import pandas as pd 5 | import numpy as np 6 | 7 | def plot_xyz_histogram(database, bins=50): 8 | ''' Takes a database file as input and plots histograms for X/Y/Z coords. ''' 9 | data = pd.read_csv(database,sep='\t') 10 | data[['x','y','z']].hist(bins=bins) 11 | plt.show() 12 | 13 | 14 | def proportion_integer_values(database): 15 | ''' Reports the proportion of integer values in X/Y/Z columns of database file. 16 | This should generally be close to 0--typically around 0.02 or so if everything 17 | is working properly. 
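Note: as implemented below, the returned value is 1 minus the proportion of
integer-valued coordinates, i.e. the fraction of non-integer values in each of
the x/y/z columns, which is the quantity expected to stay near 0.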
''' 18 | data = pd.read_csv(database,sep='\t') 19 | return 1 - data[['x','y','z']].apply(lambda x: np.mean(x == x.round())) -------------------------------------------------------------------------------- /ace/export.py: -------------------------------------------------------------------------------- 1 | from .database import Article 2 | from sqlalchemy import func, or_ 3 | import logging 4 | import csv 5 | from pathlib import Path 6 | import datetime 7 | import json 8 | from tqdm import tqdm 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | def export_database(db, foldername, skip_empty=True): 13 | # Create folder if it doesn't exist 14 | foldername = Path(foldername) 15 | foldername.mkdir(parents=True, exist_ok=True) 16 | 17 | article_columns = ['pmid', 'doi', 'authors', 'title', 'journal', 'publication_year', 'coordinate_space'] 18 | art_results = [] 19 | 20 | coordinate_columns = ['pmid', 'table_id', 'table_label', 'table_caption', 'table_number', 21 | 'x', 'y', 'z', 'p_value', 'region', 'size', 'statistic', 'groups'] 22 | coordinates = [] 23 | 24 | text_columns = ['pmid', 'title' ,'abstract', 'body'] 25 | texts = [] 26 | 27 | nv_colls_col = ['pmid','collection_id'] 28 | nv_colls = [] 29 | 30 | nv_images_col = ['pmid','image_id'] 31 | nv_images = [] 32 | 33 | print("Exporting database to %s" % foldername) 34 | 35 | articles = db.session.query(Article) 36 | if skip_empty: 37 | articles = articles.filter(or_(Article.tables.any(), Article.neurovault_links.any())) 38 | 39 | for art in tqdm(articles): 40 | art_results.append([art.id, art.doi, art.authors, art.title, art.journal, art.year, art.space]) 41 | texts.append([art.id, art.title, art.abstract, art.text]) 42 | 43 | for t in art.tables: 44 | for p in t.activations: 45 | if t.number is None: t.number = '' 46 | if isinstance(p.groups, str): 47 | p.groups = [p.groups] 48 | elif p.groups is None: 49 | p.groups = [] 50 | groups = '///'.join(p.groups) 51 | 52 | coordinates.append([art.id, t.id, t.label, t.caption, t.number, 53 | p.x, p.y, p.z, p.p_value, p.region, p.size, p.statistic, groups]) 54 | 55 | for nv in art.neurovault_links: 56 | if nv.type == 'collection': 57 | nv_colls.append([art.id, nv.neurovault_id]) 58 | elif nv.type == 'image': 59 | nv_images.append([art.id, nv.neurovault_id]) 60 | 61 | # Save articles as tab separated file 62 | with (foldername / 'metadata.csv').open('w', newline='') as f: 63 | writer = csv.writer(f) 64 | writer.writerow(article_columns) 65 | writer.writerows(art_results) 66 | 67 | # Save coordinates as tab separated file 68 | with (foldername / 'coordinates.csv').open('w', newline='') as f: 69 | writer = csv.writer(f) 70 | writer.writerow(coordinate_columns) 71 | writer.writerows(coordinates) 72 | 73 | # Save texts as tab separated file 74 | with (foldername / 'text.csv').open('w', newline='') as f: 75 | writer = csv.writer(f) 76 | writer.writerow(text_columns) 77 | writer.writerows(texts) 78 | 79 | # Save NV links 80 | with (foldername / 'neurovault_collections.csv').open('w', newline='') as f: 81 | writer = csv.writer(f) 82 | writer.writerow(nv_colls_col) 83 | writer.writerows(nv_colls) 84 | 85 | with (foldername / 'neurovault_images.csv').open('w', newline='') as f: 86 | writer = csv.writer(f) 87 | writer.writerow(nv_images_col) 88 | writer.writerows(nv_images) 89 | 90 | # Save json file with time of export 91 | export_md = { 92 | "exported": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 93 | "n_articles": len(art_results), 94 | "n_activations": len(coordinates), 95 | "n_nv_collections": 
len(nv_colls), 96 | "n_nv_images": len(nv_images) 97 | 98 | } 99 | 100 | with (foldername / 'export.json').open('w') as f: 101 | json.dump(export_md, f) -------------------------------------------------------------------------------- /ace/extract.py: -------------------------------------------------------------------------------- 1 | # Miscellaneous methods for extracting information from text/html 2 | 3 | import bs4 as BeautifulSoup 4 | import re 5 | 6 | 7 | def guess_space(text): 8 | ''' Take article text as input and return a guess about the image space. ''' 9 | 10 | targets = ['mni', 'talairach', 'afni', 'flirt', 11 | '711-2', 'spm', 'brainvoyager', 'fsl'] 12 | n_targ = len(targets) 13 | text = text.lower() 14 | res = [0] * n_targ 15 | matches = [] 16 | for i in range(n_targ): 17 | res[i] = len(re.findall( 18 | r'\b(.{30,40}\b%s.{30,40})\b' % targets[i], text)) 19 | 20 | # Sum up diagnostic strings... 21 | mni = res[5] + res[7] 22 | t88 = res[2] + res[6] 23 | software = mni + t88 24 | 25 | # Assign label 26 | # 1. If only one of MNI or T88 is implied, classify as that 27 | if (mni and not t88) or (not software and res[0] and not res[1]): 28 | label = 'MNI' 29 | elif (t88 and not mni) or (not software and res[1] and not res[0]): 30 | label = 'TAL' 31 | else: 32 | label = 'UNKNOWN' 33 | 34 | return label 35 | -------------------------------------------------------------------------------- /ace/ingest.py: -------------------------------------------------------------------------------- 1 | from os import path 2 | import logging 3 | from . import sources, config 4 | from .scrape import _validate_scrape 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | # The actual function that takes articles and adds them to the database 9 | # imports sources; sources is a module that contains the classes for each 10 | # source of articles. 11 | 12 | def add_articles(db, files, commit=True, table_dir=None, limit=None, 13 | pmid_filenames=False, metadata_dir=None, force_ingest=True, **kwargs): 14 | ''' Process articles and add their data to the DB. 15 | Args: 16 | files: The path to the article(s) to process. Can be a single 17 | filename (string), a list of filenames, or a path to pass 18 | to glob (e.g., "article_ls dir/NIMG*html") 19 | commit: Whether or not to save records to DB file after adding them. 20 | table_dir: Directory to store downloaded tables in (if None, tables 21 | will not be saved.) 22 | limit: Optional integer indicating max number of articles to add 23 | (selected randomly from all available). When None, will add all 24 | available articles. 25 | pmid_filenames: When True, assume that the file basename is a PMID. 26 | This saves us from having to retrieve metadata from PubMed When 27 | checking if a file is already in the DB, and greatly speeds up 28 | batch processing when overwrite is off. 29 | metadata_dir: Location to read/write PubMed metadata for articles. 30 | When None (default), retrieves new metadata each time. If a 31 | path is provided, will check there first before querying PubMed, 32 | and will save the result of the query if it doesn't already 33 | exist. 34 | force_ingest: Ingest even if no source is identified. 35 | kwargs: Additional keyword arguments to pass to parse_article. 
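Example (a minimal sketch, assuming a local SQLite database and a directory of
scraped HTML files named by PMID; all paths are illustrative):

    from ace.database import Database
    from ace.ingest import add_articles

    db = Database(adapter='sqlite', db_name='sqlite:///ace.db')
    missing = add_articles(db, '/path/to/html/*.html', pmid_filenames=True,
                           metadata_dir='/path/to/metadata')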
36 | ''' 37 | 38 | manager = sources.SourceManager(db, table_dir) 39 | 40 | if isinstance(files, str): 41 | from glob import glob 42 | files = glob(files) 43 | if limit is not None: 44 | from random import shuffle 45 | shuffle(files) 46 | files = files[:limit] 47 | 48 | missing_sources = [] 49 | for i, f in enumerate(files): 50 | logger.info("Processing article %s..." % f) 51 | html = open(f).read() 52 | 53 | if not _validate_scrape(html): 54 | logger.warning("Invalid HTML for %s" % f) 55 | continue 56 | 57 | source = manager.identify_source(html) 58 | if source is None: 59 | logger.warning("Could not identify source for %s" % f) 60 | missing_sources.append(f) 61 | if not force_ingest: 62 | continue 63 | else: 64 | source = sources.DefaultSource(db) 65 | 66 | pmid = path.splitext(path.basename(f))[0] if pmid_filenames else None 67 | article = source.parse_article(html, pmid, metadata_dir=metadata_dir, **kwargs) 68 | if article and (config.SAVE_ARTICLES_WITHOUT_ACTIVATIONS or article.tables): 69 | db.add(article) 70 | if commit and (i % 100 == 0 or i == len(files) - 1): 71 | db.save() 72 | db.save() 73 | 74 | return missing_sources 75 | -------------------------------------------------------------------------------- /ace/label.py: -------------------------------------------------------------------------------- 1 | # from nltk import * 2 | import re 3 | from collections import Counter 4 | from .database import Article 5 | from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 6 | import pandas as pd 7 | 8 | 9 | def extract_ngram_features(db, tfidf=True, save=None, vocabulary=None, require_activations=True, **kwargs): 10 | ''' Takes text from an article as input and returns a matrix of document --> 11 | ngram weights. At the moment, only extracts terms from abstracts. 12 | Args: 13 | db: A database instance 14 | tfidf: If True, uses a tf-idf tokenizer; otherwise uses raw counts 15 | save: an optional path to save a CSV to; if None, returns the resulting data 16 | vocabulary: an optional list of ngrams to restrict extraction to 17 | require_activations: When True, only articles containing at least one fMRI activation 18 | table will be included. When False, use all articles in DB. 19 | kwargs: Optional keywords passed onto the scikit-learn vectorizer. Common args are 20 | ngram_range, min_df, max_df, stop_words, and vocabulary. 
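Example (illustrative; assumes an existing ACE database whose articles have
abstracts, and passes common scikit-learn vectorizer options via kwargs):

    from ace.database import Database
    from ace.label import extract_ngram_features

    db = Database(adapter='sqlite', db_name='sqlite:///ace.db')
    features = extract_ngram_features(db, tfidf=True, ngram_range=(1, 2),
                                      min_df=5, save='ngram_features.txt')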
21 | ''' 22 | 23 | # Extract article texts--for now, uses abstracts 24 | articles = db.session.query(Article.id, Article.abstract) 25 | if require_activations: 26 | articles = articles.filter(Article.tables.any()) 27 | pmids, corpus = list(zip(*articles.all())) 28 | 29 | # Instantiate vectorizer--either simple counts, or tf-idf 30 | vectorizer = TfidfVectorizer if tfidf else CountVectorizer 31 | vectorizer = vectorizer(vocabulary=vocabulary, **kwargs) 32 | 33 | # Transform texts 34 | weights = vectorizer.fit_transform(corpus).toarray() 35 | names = vectorizer.get_feature_names() 36 | 37 | data = pd.DataFrame(weights, columns=names, index=pmids) 38 | 39 | if save is not None: 40 | data.to_csv(save, sep='\t', index_label='pmid', encoding='utf-8') 41 | else: 42 | return data 43 | 44 | -------------------------------------------------------------------------------- /ace/scrape.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # use unicode everywhere 3 | import re 4 | import sys 5 | from pathlib import Path 6 | from collections import Mapping 7 | import requests 8 | from time import sleep 9 | import logging 10 | import os 11 | import random 12 | import xmltodict 13 | from seleniumbase import Driver 14 | from selenium.webdriver.support.ui import WebDriverWait 15 | from selenium.webdriver.support import expected_conditions as EC 16 | from selenium.webdriver.common.by import By 17 | from selenium.common.exceptions import TimeoutException 18 | from tqdm import tqdm 19 | 20 | from ace.utils import PubMedAPI 21 | from ace.config import USER_AGENTS 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | def get_url(url, n_retries=5, timeout=10.0, verbose=False): 27 | headers = {'User-Agent': random.choice(USER_AGENTS)} 28 | 29 | def exponential_backoff(retries): 30 | return 2 ** retries 31 | 32 | retries = 0 33 | while retries < n_retries: 34 | 35 | try: 36 | r = requests.get(url, headers=headers, timeout=timeout) 37 | return r.text 38 | except requests.exceptions.RequestException as e: 39 | logger.warning(f"Request failed: {e}") 40 | sleep_time = exponential_backoff(retries) 41 | logger.info(f"Retrying in {sleep_time} seconds...") 42 | sleep(sleep_time) 43 | retries += 1 44 | logger.error("Exceeded maximum number of retries.") 45 | return None 46 | 47 | def _convert_pmid_to_pmc(pmids): 48 | url_template = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/?ids=" 49 | logger.info("Converting PMIDs to PMCIDs...") 50 | 51 | # Chunk the PMIDs into groups of 200 52 | pmids = [str(p) for p in pmids] 53 | pmid_chunks = [pmids[i:i + 200] for i in range(0, len(pmids), 200)] 54 | 55 | pmc_ids = [] 56 | for chunk in tqdm(pmid_chunks): 57 | pmid_str = ','.join(chunk) 58 | url = url_template + pmid_str 59 | response = get_url(url) 60 | # Respionse 61 | pmc_ids += re.findall(r'', response) 62 | 63 | logger.info(f"Found {len(pmc_ids)} PMCIDs from {len(pmids)} PMIDs.") 64 | 65 | pmids_found = set([p[1] for p in pmc_ids]) 66 | missing_pmids = [(None, p) for p in pmids if p not in pmids_found] 67 | 68 | pmc_ids = pmc_ids + missing_pmids 69 | 70 | return pmc_ids 71 | 72 | 73 | def get_pmid_from_doi(doi, api_key=None): 74 | ''' Query PubMed for the PMID of a paper based on its doi. We need this 75 | for some Sources that don't contain the PMID anywhere in the artice HTML. 
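Example (sketch; `doi` is any article DOI string):

    pmid = get_pmid_from_doi(doi)   # first matching PMID, or None if no match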
76 | ''' 77 | query = f"{doi}[aid]" 78 | data = PubMedAPI(api_key=api_key).esearch(query=query) 79 | if data: 80 | data = data[0] 81 | else: 82 | data = None 83 | return data 84 | 85 | 86 | def get_pubmed_metadata(pmid, parse=True, store=None, save=True, api_key=None): 87 | ''' Get PubMed metadata for article. 88 | Args: 89 | pmid: The article's PubMed ID 90 | parse: if True, parses the text and returns a dictionary. if False, returns raw text. 91 | store: optional string path to PubMed metadata files. If passed, first checks the passed 92 | folder for the corresponding ID, and only queries PubMed if not found. 93 | save: if store is passed, save is True, and the file does not already exist, 94 | will save the result of the new PubMed query to the store. 95 | ''' 96 | if store is not None: 97 | md_file = os.path.join(store, pmid) 98 | 99 | if store is not None and os.path.exists(md_file): 100 | logger.info("Retrieving metadata from file %s..." % os.path.join(store, pmid)) 101 | with open(md_file, 'rb') as f: 102 | xml = f.read() 103 | 104 | else: 105 | logger.info("Retrieving metadata for PubMed article %s..." % str(pmid)) 106 | xml = PubMedAPI(api_key=api_key).efetch(input_id=pmid, retmode='xml', rettype='medline', db='pubmed') 107 | if store is not None and save and xml is not None: 108 | if not os.path.exists(store): 109 | os.makedirs(store) 110 | with open(md_file, 'wb') as f: 111 | f.write(xml) 112 | 113 | return parse_PMID_xml(xml) if (parse and xml is not None) else xml 114 | 115 | 116 | def parse_PMID_xml(xml): 117 | ''' Take XML-format PubMed metadata and convert it to a dictionary 118 | with standardized field names. ''' 119 | 120 | di = xmltodict.parse(xml).get('PubmedArticleSet') 121 | if not di: 122 | return None 123 | 124 | di = di['PubmedArticle'] 125 | article = di['MedlineCitation']['Article'] 126 | 127 | if 'ArticleDate' in article: 128 | date = article['ArticleDate'] 129 | elif 'Journal' in article: 130 | date = article['Journal']['JournalIssue']['PubDate'] 131 | else: 132 | date = None 133 | 134 | if date: 135 | year = date.get('Year', None) 136 | else: 137 | year = None 138 | 139 | doi = None 140 | doi_source = article.get('ELocationID', None) 141 | if doi_source is not None and isinstance(doi_source, list): 142 | doi_source = [d for d in doi_source if d['@EIdType'] == 'doi'][0] 143 | 144 | if doi_source is not None and doi_source['@EIdType'] == 'doi': 145 | doi = doi_source['#text'] 146 | 147 | authors = article.get('AuthorList', None) 148 | 149 | if authors: 150 | authors = authors['Author'] 151 | 152 | try: 153 | _get_author = lambda a: a['LastName'] + ', ' + a['ForeName'] 154 | if isinstance(authors, list): 155 | authors = [_get_author(a) for a in authors if 'ForeName' in a] 156 | else: 157 | authors = [_get_author(authors)] 158 | authors = ';'.join(authors) 159 | except: 160 | authors = None 161 | 162 | if 'MeshHeadingList' in di['MedlineCitation']: 163 | mesh = di['MedlineCitation']['MeshHeadingList']['MeshHeading'] 164 | else: 165 | mesh = [] 166 | 167 | abstract = article.get('Abstract', '') 168 | if abstract != '': 169 | abstract = abstract.get('AbstractText', '') 170 | 171 | cit = di['PubmedData']['ArticleIdList']['ArticleId'] 172 | if isinstance(cit, list): 173 | cit = cit[1] 174 | 175 | metadata = { 176 | 'authors': authors, 177 | 'citation': cit['#text'], 178 | 'comment': abstract, 179 | 'doi': doi, 180 | 'keywords': '', 181 | 'mesh': mesh, 182 | 'pmid': di['MedlineCitation']['PMID'], 183 | 'title': article['ArticleTitle'], 184 | 'abstract': abstract, 185 
| 'journal': article['Journal']['Title'], 186 | 'year': year 187 | } 188 | 189 | # Clean up nested Dicts 190 | for k, v in metadata.items(): 191 | if isinstance(v, list): 192 | to_join = [] 193 | for a in v: 194 | if 'DescriptorName' in a: 195 | a = a['DescriptorName'] 196 | a = a['#text'] 197 | 198 | to_join.append(a) 199 | v = ' | '.join(to_join) 200 | elif isinstance(v, Mapping): 201 | v = v.get('#text', '') 202 | metadata[k] = v 203 | 204 | return metadata 205 | 206 | def _validate_scrape(html): 207 | """ Checks to see if scraping was successful. 208 | For example, checks to see if Cloudfare interfered """ 209 | 210 | patterns = ['Checking if you are a human', 211 | 'Please turn JavaScript on and reload the page', 212 | 'Checking if the site connection is secure', 213 | 'Enable JavaScript and cookies to continue', 214 | 'There was a problem providing the content you requested', 215 | 'Redirecting', 216 | 'Page not available - PMC', 217 | 'Your request cannot be processed at this time. Please try again later', 218 | '403 Forbidden', 219 | 'Page not found — ScienceDirect', 220 | 'This site can’t be reached', 221 | 'used Cloudflare to restrict access', 222 | '502 Bad Gateway', 223 | ] 224 | 225 | for pattern in patterns: 226 | if pattern in html: 227 | return False 228 | 229 | return True 230 | 231 | ''' Class for journal Scraping. The above free-floating methods should 232 | probably be refactored into this class eventually. ''' 233 | class Scraper: 234 | 235 | def __init__(self, store, api_key=None): 236 | self.store = Path(store) 237 | self._client = PubMedAPI(api_key=api_key) 238 | 239 | 240 | def search_pubmed(self, journal, search, retmax=10000, savelist=None,): 241 | journal = journal.replace(' ', '+') 242 | search = '+%s' % search 243 | query = f"({journal}[Journal]+journal+article[pt]{search})" 244 | logger.info("Query: %s" % query) 245 | 246 | doc = self._client.esearch(query, retmax=retmax) 247 | 248 | if savelist is not None: 249 | outf = open(savelist, 'w') 250 | outf.write(doc) 251 | outf.close() 252 | return doc 253 | 254 | 255 | def get_html(self, url, journal, mode='browser'): 256 | 257 | ''' Get HTML of full-text article. Uses either browser automation (if mode == 'browser') 258 | or just gets the URL directly. ''' 259 | 260 | if mode == 'browser': 261 | driver = Driver( 262 | uc=True, 263 | headless2=True, 264 | agent=random.choice(USER_AGENTS), 265 | ) 266 | for attempt in range(15): 267 | try: 268 | driver.set_page_load_timeout(10) 269 | driver.get(url) 270 | url = driver.current_url 271 | except: 272 | driver.quit() 273 | logger.info(f"Timeout exception #{attempt}. Retrying...") 274 | sleep(5) 275 | continue 276 | else: 277 | break 278 | else: 279 | logger.info("Timeout exception. Giving up.") 280 | return None 281 | for attempt in range(10): 282 | try: 283 | html = driver.page_source 284 | except: 285 | logger.info(f"Source Page #{attempt}. 
Retrying...") 286 | driver.quit() 287 | driver = Driver( 288 | uc=True, 289 | headless2=True, 290 | agent=random.choice(USER_AGENTS), 291 | ) 292 | driver.get(url) 293 | sleep(2) 294 | else: 295 | break 296 | 297 | new_url = self.check_for_substitute_url(url, html, journal) 298 | 299 | if url != new_url: 300 | driver = Driver( 301 | uc=True, 302 | headless2=True, 303 | agent=random.choice(USER_AGENTS), 304 | ) 305 | driver.get(new_url) 306 | if journal.lower() in ['human brain mapping', 307 | 'european journal of neuroscience', 308 | 'brain and behavior','epilepsia']: 309 | sleep(0.5 + random.random() * 1) 310 | try: 311 | WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.ID, 'relatedArticles'))) 312 | except TimeoutException: 313 | print("Loading Wiley page took too much time!") 314 | 315 | # Sometimes we get annoying alerts (e.g., Flash animation 316 | # timeouts), so we dismiss them if present. 317 | try: 318 | alert = driver.switch_to_alert() 319 | alert.dismiss() 320 | except: 321 | pass 322 | 323 | logger.info(journal.lower()) 324 | timeout = 5 325 | for attempt in range(10): 326 | try: 327 | html = driver.page_source 328 | except: 329 | logger.info(f"Source Page #{attempt}. Retrying...") 330 | driver.quit() 331 | driver = Driver( 332 | uc=True, 333 | headless2=True, 334 | agent=random.choice(USER_AGENTS), 335 | ) 336 | driver.get(url) 337 | sleep(2) 338 | else: 339 | break 340 | if journal.lower() in ['journal of neuroscience', 'j neurosci']: 341 | ## Find links with class data-table-url, and click on them 342 | ## to load the table data. 343 | table_links = driver.find_elements(By.CLASS_NAME, 'table-expand-inline') 344 | 345 | if len(table_links): 346 | for link in table_links: 347 | WebDriverWait(driver, 20).until(EC.element_to_be_clickable(( 348 | By.CLASS_NAME, 'table-expand-inline'))) 349 | driver.execute_script("arguments[0].scrollIntoView();", link) 350 | link.click() 351 | sleep(0.5 + random.random() * 1) 352 | 353 | # If title has ScienceDirect in in title 354 | elif ' - ScienceDirect' in html: 355 | try: 356 | element_present = EC.presence_of_element_located((By.ID, 'abstracts')) 357 | WebDriverWait(driver, timeout).until(element_present) 358 | except TimeoutException: 359 | pass 360 | elif 'Wiley Online Library' in html: 361 | try: 362 | element_present = EC.presence_of_element_located((By.ID, 'article__content')) 363 | WebDriverWait(driver, timeout).until(element_present) 364 | except TimeoutException: 365 | pass 366 | 367 | ## Uncomment this next line to scroll to end. Doesn't seem to actually help. 368 | # driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 369 | ## Uncomment next line and insert ID to search for specific element. 370 | # driver.find_element_by_id('relatedArticles').send_keys('\t') 371 | # This next line helps minimize the number of blank articles saved from ScienceDirect, 372 | # which loads content via Ajax requests only after the page is done loading. There is 373 | # probably a better way to do this... 374 | 375 | driver.quit() 376 | return html 377 | 378 | elif mode == 'requests': 379 | headers = {'User-Agent': random.choice(USER_AGENTS)} 380 | r = requests.get(url, headers=headers) 381 | # For some journals, we can do better than the returned HTML, so get the final URL and 382 | # substitute a better one. 
383 | url = self.check_for_substitute_url(r.url, r.text, journal) 384 | if url != r.url: 385 | r = requests.get(url, headers=headers) 386 | # XML content is usually misidentified as ISO-8859-1, so we need to manually set utf-8. 387 | # Unfortunately this can break other documents. Need to eventually change this to inspect the 388 | # encoding attribute of the document header. 389 | r.encoding = 'utf-8' 390 | return r.text 391 | 392 | 393 | def get_html_by_pmid(self, pmid, journal, mode='browser', retmode='ref', prefer_pmc_source=True): 394 | base_url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi" 395 | "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" 396 | 397 | if prefer_pmc_source: 398 | try: 399 | response = self._client.elink(pmid, retmode='json', return_content=False) 400 | response.raise_for_status() # Raise an HTTPError for bad responses 401 | json_content = response.json() 402 | 403 | providers = {obj['provider']['nameabbr']: obj["url"]["value"] for obj in json_content['linksets'][0]['idurllist'][0]['objurls']} 404 | pmc_url = providers.get('PMC') 405 | 406 | if pmc_url: 407 | return self.get_html(pmc_url, journal, mode='requests') 408 | elif prefer_pmc_source == "only": 409 | logger.info("\tNo PMC source found! Skipping...") 410 | return 411 | except requests.RequestException as e: 412 | logger.error(f"Request failed: {e}") 413 | except KeyError as e: 414 | logger.error(f"Key error: {e} - JSON content: {json_content}") 415 | else: 416 | query = f"{base_url}?dbfrom=pubmed&id={pmid}&cmd=prlinks&retmode={retmode}" 417 | logger.info(query) 418 | return self.get_html(query, journal, mode=mode) 419 | 420 | if prefer_pmc_source == "only": 421 | logger.info("\tNo PMC source found!! Skipping...") 422 | return 423 | 424 | # Fallback if no PMC link found 425 | query = f"{base_url}?dbfrom=pubmed&id={pmid}&cmd=prlinks&retmode={retmode}" 426 | return self.get_html(query, journal, mode=mode) 427 | 428 | 429 | def check_for_substitute_url(self, url, html, journal): 430 | ''' For some journals/publishers, we can get a better document version by modifying the 431 | URL passed from PubMed. E.g., we can get XML with embedded tables from PLoS ONE instead of 432 | the standard HTML, which displays tables as images. For some journals (e.g., Frontiers), 433 | it's easier to get the URL by searching the source, so pass the html in as well. 
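Example (illustrative; assumes `scraper` is a Scraper instance and reflects the
PLoS ONE rule below, which rewrites an article URL to its XML asset form):

    better_url = scraper.check_for_substitute_url(url, html, 'PLoS ONE')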
''' 434 | 435 | j = journal.lower() 436 | try: 437 | if j == 'plos one': 438 | doi_part = re.search('article\?id\=(.*)', url).group(1) 439 | return 'http://journals.plos.org/plosone/article/asset?id=%s.XML' % doi_part 440 | elif j in ['human brain mapping', 'european journal of neuroscience', 441 | 'brain and behavior', 'epilepsia', 'journal of neuroimaging']: 442 | return url.replace('abstract', 'full').split(';')[0] 443 | elif j == 'journal of cognitive neuroscience': 444 | return url.replace('doi/abs', 'doi/full') 445 | elif j.startswith('frontiers in'): 446 | return re.sub('(full|abstract)\/*$', 'xml\/nlm', url) 447 | elif 'sciencedirect' in url: 448 | return url + '?np=y' 449 | elif 'springer.com' in url: 450 | return url + '/fulltext.html' 451 | else: 452 | return url 453 | except Exception as err: 454 | return url 455 | 456 | 457 | def is_pmc_open_acess(self, pmcid): 458 | oa_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=" 459 | 460 | response = get_url(oa_url + pmcid) 461 | 462 | return 'idIsNotOpenAccess' not in response 463 | 464 | def process_article(self, id, journal, delay=None, mode='browser', overwrite=False, prefer_pmc_source=True): 465 | 466 | logger.info("Processing %s..." % id) 467 | journal_path = (self.store / 'html' / journal) 468 | journal_path.mkdir(parents=True, exist_ok=True) 469 | filename = journal_path / f"{id}.html" 470 | 471 | if not overwrite and os.path.isfile(filename): 472 | logger.info("\tAlready exists! Skipping...") 473 | 474 | return None, None 475 | 476 | # Save the HTML 477 | doc = self.get_html_by_pmid(id, journal, mode=mode, prefer_pmc_source=prefer_pmc_source) 478 | valid = None 479 | if doc: 480 | valid = _validate_scrape(doc) 481 | if valid: 482 | with filename.open('w') as f: 483 | f.write(doc) 484 | if not valid: 485 | logger.info("\tScrape failed! Skipping...") 486 | 487 | # Insert random delay until next request. 488 | if delay is not None: 489 | sleep_time = random.random() * float(delay*2) 490 | sleep(sleep_time) 491 | 492 | return filename, valid 493 | 494 | def retrieve_articles(self, journal=None, pmids=None, dois=None, delay=None, mode='browser', search=None, 495 | limit=None, overwrite=False, min_pmid=None, max_pmid=None, shuffle=False, 496 | index_pmids=False, skip_pubmed_central=True, metadata_store=None, invalid_article_log_file=None, prefer_pmc_source=True): 497 | 498 | ''' Try to retrieve all PubMed articles for a single journal that don't 499 | already exist in the storage directory. 500 | Args: 501 | journal: The name of the journal (as it appears in PubMed). 502 | pmids: A list of PMIDs to retrieve. 503 | dois: A list of DOIs to retrieve. 504 | delay: Mean delay between requests. 505 | mode: When 'browser', use selenium to load articles in Chrome. When 506 | 'requests', attempts to fetch the HTML directly via requests module. 507 | search: An optional search string to append to the PubMed query. 508 | Primarily useful for journals that are not specific to neuroimaging. 509 | limit: Optional max number of articles to fetch. Note that only new articles 510 | are counted against this limit; e.g., if limit = 100 and 2,000 articles 511 | are found in PubMed, retrieval will continue until 100 new articles 512 | have been added. 513 | overwrite: When True, all articles returned from PubMed query will be 514 | fetched, irrespective of whether or not they already exist on disk. 515 | min_pmid: When a PMID is provided, only articles with PMIDs greater than 516 | this will be processed. 
Primarily useful for excluding older articles 517 | that aren't available in full-text HTML format. 518 | max_pmid: When a PMID is provided, only articles with PMIDs less than 519 | this will be processed. 520 | shuffle: When True, articles are retrieved in random order. 521 | index_pmids: When True, will create a list of pmids already in the output. 522 | When used in combination with overwrite=False, this will not download a pmid 523 | even though it's in another directory. 524 | skip_pubmed_central: When True, skips articles that are available from 525 | PubMed Central. This will also write a file with the skipped pmcids 526 | to use with pubget. 527 | metadata_store: Optional path to a directory to store/reference PubMed metadata. 528 | invalid_article_log_file: Optional path to a file to log files where scraping failed. 529 | prefer_pmc_source: Optional 530 | When True, preferentially retrieve articles from PubMed Central, using requests instead of browser 531 | (regardless of mode). This is useful for journals that have full-text articles available on PMC, 532 | but are not open-access. If set to "only", will only retrieve articles from PMC, and 533 | skip articles it cannot retrieve from PMC. 534 | ''' 535 | articles_found = 0 536 | if journal is None and dois is None and pmids is None: 537 | raise ValueError("Either journal, pmids, or dois must be provided.") 538 | 539 | if journal is not None: 540 | logger.info("Getting PMIDs for articles from %s..." % journal) 541 | pmids = self.search_pubmed(journal, search) 542 | 543 | if dois is not None: 544 | logger.info("Retrieving articles from %s..." % ', '.join(dois)) 545 | pmids = [get_pmid_from_doi(doi) for doi in dois] 546 | 547 | # Remove None values and log missing DOIs 548 | pmids = [pmid for pmid in pmids if pmid is not None] 549 | missing_dois = [doi for doi, pmid in zip(dois, pmids) if pmid is None] 550 | if len(missing_dois) > 0: 551 | logger.info("Missing DOIs: %s" % ', '.join(missing_dois)) 552 | 553 | if shuffle: 554 | random.shuffle(pmids) 555 | 556 | logger.info("Found %d records.\n" % len(pmids)) 557 | 558 | # If journal is provided, check for existing articles 559 | if journal is not None: 560 | logger.info("Retrieving articles from %s..." 
% journal) 561 | journal_path = (self.store / 'html' / journal) 562 | if journal_path.exists(): 563 | existing = journal_path.glob('*.html') 564 | existing = [int(f.stem) for f in existing] 565 | n_existing = len(existing) 566 | pmids = [pmid for pmid in pmids if int(pmid) not in existing] 567 | logger.info(f"Found {n_existing} existing articles.") 568 | 569 | # filter out all pmids, not just based on folder 570 | if index_pmids: 571 | existing_pmids = [f.stem for f in (self.store / 'html').rglob('*.html')] 572 | pmids = [pmid for pmid in pmids if pmid not in existing_pmids] 573 | 574 | # Filter out articles that are outside the PMID range 575 | pmids = [ 576 | pmid 577 | for pmid in pmids 578 | if (min_pmid is None or int(pmid) >= min_pmid) and (max_pmid is None or int(pmid) <= max_pmid) 579 | ] 580 | 581 | logger.info(f"Retrieving {len(pmids)} articles...") 582 | 583 | if skip_pubmed_central: 584 | all_ids = _convert_pmid_to_pmc(pmids) 585 | else: 586 | all_ids = [(None, pmid) for pmid in pmids] 587 | 588 | invalid_articles = [] 589 | 590 | if journal is None: 591 | all_iter = [] 592 | for pmcid, pmid in all_ids: 593 | metadata = get_pubmed_metadata(pmid, store=metadata_store) 594 | if not metadata or 'journal' not in metadata: 595 | all_iter.append((pmcid, pmid, "UNKNOWN")) 596 | continue 597 | all_iter.append((pmcid, pmid, metadata['journal'])) 598 | else: 599 | all_iter = [(pmcid, pmid, journal) for pmcid, pmid in all_ids] 600 | 601 | for pmcid, pmid, journal in all_iter: 602 | 603 | if limit is not None and articles_found >= limit: break 604 | 605 | if skip_pubmed_central and pmcid and self.is_pmc_open_acess(pmcid): 606 | logger.info(f"\tPubMed Central OpenAccess entry found! Skipping {pmid}...") 607 | with open('openaccess_pmcids.txt', 'a') as f: 608 | f.write(f"{pmcid}\n") 609 | continue 610 | 611 | filename, valid = self.process_article(pmid, journal, delay, mode, overwrite, prefer_pmc_source) 612 | 613 | if not valid: 614 | invalid_articles.append(filename) 615 | if invalid_article_log_file is not None: 616 | with open(invalid_article_log_file, 'a') as f: 617 | f.write(f"{pmid}\n") 618 | else: 619 | articles_found += 1 620 | 621 | return invalid_articles 622 | -------------------------------------------------------------------------------- /ace/sources.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # use unicode everywhere 3 | from bs4 import BeautifulSoup 4 | import re 5 | import os 6 | import json 7 | import abc 8 | import importlib 9 | from glob import glob 10 | from ace import datatable 11 | from ace import tableparser 12 | from ace import scrape 13 | from ace import config 14 | from ace import database 15 | import logging 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | class SourceManager: 21 | 22 | ''' Loads all the available Source subclasses from this module and the 23 | associated directory of JSON config files and uses them to determine which parser 24 | to call when a new HTML file is passed. ''' 25 | 26 | def __init__(self, database, table_dir=None): 27 | ''' SourceManager constructor. 28 | Args: 29 | database: A Database instance to use with all Sources. 30 | table_dir: An optional directory name to save any downloaded tables to. 31 | When table_dir is None, nothing will be saved (requiring new scraping 32 | each time the article is processed). 
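Example (a minimal sketch, assuming `db` is an ace.database.Database instance
and `html` holds the full text of a scraped article):

    manager = SourceManager(db, table_dir='tables')
    source = manager.identify_source(html)  # a Source subclass instance, or None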
33 | ''' 34 | module = importlib.import_module('ace.sources') 35 | self.sources = {} 36 | source_dir = os.path.join(os.path.dirname(__file__), 'sources') 37 | for config_file in glob('%s/*json' % source_dir): 38 | class_name = config_file.split('/')[-1].split('.')[0] 39 | cls = getattr(module, class_name + 'Source')(database, config=config_file, table_dir=table_dir) 40 | self.sources[class_name] = cls 41 | 42 | def identify_source(self, html): 43 | ''' Identify the source of the article and return the corresponding Source object. ''' 44 | for source in list(self.sources.values()): 45 | for patt in source.identifiers: 46 | if re.search(patt, html): 47 | logger.debug('Matched article to Source: %s' % source.__class__.__name__) 48 | return source 49 | 50 | 51 | # A single source of articles--i.e., a publisher or journal 52 | class Source(metaclass=abc.ABCMeta): 53 | # need to include the \\u2009 which is the thin space to which the table is being invalidated due to those characters 54 | # -\\u2009int 55 | ENTITIES = { 56 | ' ': ' ', 57 | '−': '-', 58 | # 'κ': 'kappa', 59 | '\xa0': ' ', # Unicode non-breaking space 60 | # '\x3e': ' ', 61 | '\u2212': '-', # Various unicode dashes 62 | '\u2012': '-', 63 | '\u2013': '-', 64 | '\u2014': '-', 65 | '\u2015': '-', 66 | '\u8211': '-', 67 | '\u0150': '-', 68 | '\u0177': '', 69 | '\u0160': '', 70 | '\u0145': "'", 71 | '\u0146': "'", 72 | '\u2009': "", # Various whitespaces within tables 73 | '\u2007': "", 74 | 75 | } 76 | 77 | def __init__(self, database, config=None, table_dir=None): 78 | self.database = database 79 | self.table_dir = table_dir 80 | self.entities = {} 81 | 82 | if config is not None: 83 | config = json.load(open(config, 'rb')) 84 | valid_keys = ['name', 'identifiers', 'entities', 'delay'] 85 | 86 | for k, v in list(config.items()): 87 | if k in valid_keys: 88 | setattr(self, k, v) 89 | 90 | # Append any source-specific entities found in the config file to 91 | # the standard list 92 | if self.entities is None: 93 | self.entities = Source.ENTITIES 94 | else: 95 | self.entities.update(Source.ENTITIES) 96 | 97 | @abc.abstractmethod 98 | def parse_article(self, html, pmid=None, metadata_dir=None): 99 | ''' Takes HTML article as input and returns an Article. PMID Can also be 100 | passed, which prevents having to scrape it from the article and/or look it 101 | up in PubMed. ''' 102 | 103 | # Skip rest of processing if this record already exists 104 | if pmid is not None and self.database.article_exists(pmid) and not config.OVERWRITE_EXISTING_ROWS: 105 | return False 106 | 107 | html = self.decode_html_entities(html) 108 | soup = BeautifulSoup(html) 109 | if pmid is None: 110 | pmid = self.extract_pmid(soup) 111 | 112 | # did our best to find PMID, but failed 113 | if not pmid: 114 | return False 115 | 116 | metadata = scrape.get_pubmed_metadata(pmid, store=metadata_dir, save=True) 117 | 118 | # Remove all scripts and styles 119 | for script in soup(["script", "style"]): 120 | script.extract() 121 | # Get text 122 | text = soup.get_text() 123 | if self.database.article_exists(pmid): 124 | if config.OVERWRITE_EXISTING_ROWS: 125 | self.database.delete_article(pmid) 126 | else: 127 | return False 128 | 129 | self.article = database.Article(text, pmid=pmid, metadata=metadata) 130 | self.extract_neurovault(soup) 131 | return soup 132 | 133 | def extract_neurovault(self, soup): 134 | ''' Look through all links, and use regex to identify NeuroVault links. 
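Both image links (e.g., identifiers.org/neurovault.image:1234 or neurovault.org/images/1234; the IDs shown are illustrative) and collection links (e.g., neurovault.org/collections/ABCD) are recognized; each match is stored on the article as a NeurovaultLink with its type ('image' or 'collection'), NeuroVault ID, and URL.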
''' 135 | image_regexes = ['identifiers.org/neurovault.image:(\d*)', 136 | 'neurovault.org/images/(\d*)'] 137 | 138 | image_regexes = re.compile( '|'.join( image_regexes) ) 139 | 140 | collection_regexes = ['identifiers.org/neurovault.collection:(\w*)', 141 | 'neurovault.org/collections/(\w*)'] 142 | 143 | collection_regexes = re.compile( '|'.join( collection_regexes) ) 144 | 145 | 146 | nv_links = [] 147 | for link in soup.find_all('a'): 148 | if link.has_attr('href'): 149 | href = link['href'] 150 | 151 | img_m = image_regexes.search(href) 152 | col_m = collection_regexes.search(href) 153 | if not (img_m or col_m): 154 | continue 155 | 156 | if img_m: 157 | type = 'image' 158 | val = img_m.groups()[0] or img_m.groups()[1] 159 | elif col_m: 160 | type = 'collection' 161 | val = col_m.groups()[0] or col_m.groups()[1] 162 | 163 | nv_links.append( 164 | database.NeurovaultLink( 165 | type=type, 166 | neurovault_id=val, 167 | url=href 168 | ) 169 | ) 170 | 171 | self.article.neurovault_links = nv_links 172 | 173 | def extract_text(self, soup): 174 | ''' Extract text from the article. 175 | Publisher specific extraction of body text should be done in a subclass. 176 | ''' 177 | 178 | text = soup.get_text() 179 | 180 | # Remove any remaining HTML tags 181 | text = re.sub(r'<[^>]+>', '', text) 182 | 183 | # Remove any remaining unicode characters 184 | text = re.sub(r'\\u[0-9]+', '', text) 185 | 186 | # Remove any remaining entities 187 | text = self.decode_html_entities(text) 188 | 189 | # Remove any remaining whitespace 190 | text = re.sub(r'\s+', ' ', text) 191 | 192 | self.article.text = text 193 | 194 | def parse_table(self, table): 195 | ''' Takes HTML for a single table and returns a Table. ''' 196 | # Formatting issues sometimes prevent table extraction, so just return 197 | if table is None: 198 | return False 199 | 200 | logger.debug("\t\tFound a table...") 201 | 202 | # change
<br>
to \n 203 | for br in table.find_all("br"): 204 | br.replace_with("\n") 205 | 206 | # Count columns. Check either just one row, or all of them. 207 | def n_cols_in_row(row): 208 | return sum([ 209 | int(td['colspan']) 210 | if td.has_attr('colspan') and td['colspan'] != "NaN" else 1 211 | for td in row.find_all(['th', 'td']) 212 | ]) 213 | 214 | search_table = table.find("tbody") 215 | if search_table is None: 216 | search_table = table 217 | 218 | all_trs = search_table.find_all('tr') 219 | if all_trs is None or len(all_trs) == 0: 220 | return False 221 | 222 | if config.CAREFUL_PARSING: 223 | n_cols = max([n_cols_in_row( 224 | row) for row in all_trs]) 225 | else: 226 | n_cols = n_cols_in_row(search_table.find('tr')) 227 | 228 | # Initialize grid and populate 229 | data = datatable.DataTable(0, n_cols) 230 | rows = table.find_all('tr') 231 | for (j, r) in enumerate(rows): 232 | try: 233 | cols = r.find_all(['td', 'th']) 234 | cols_found_in_row = 0 235 | n_cells = len(cols) 236 | # Assign number of rows and columns this cell fills. We use these rules: 237 | # * If a rowspan/colspan is explicitly provided, use it 238 | # * If not, initially assume span == 1 for both rows and columns. 239 | for (i, c) in enumerate(cols): 240 | r_num = ( 241 | int(c['rowspan']) 242 | if c.has_attr('rowspan') and c['rowspan'] != "NaN" else 1 243 | ) 244 | c_num = ( 245 | int(c['colspan']) 246 | if c.has_attr('colspan') and c['colspan'] != "NaN" else 1 247 | ) 248 | cols_found_in_row += c_num 249 | # * Check to make sure that we don't have unaccounted-for columns in the 250 | # row after including the current cell. If we do, adjust the colspan 251 | # to take up all of the remaining columns. This is necessary because 252 | # some tables have malformed HTML, and BeautifulSoup can also 253 | # cause problems in its efforts to fix bad tables. The most common 254 | # problem is deletion or omission of enough tags to fill all 255 | # columns, hence our adjustment. Note that in some cases the order of 256 | # filling is not sequential--e.g., when a previous row has cells with 257 | # rowspan > 1. So we have to check if there are None values left over 258 | # in the DataTable's current row after we finish filling 259 | # it. 260 | if i + 1 == n_cells and cols_found_in_row < n_cols and (len(data.data) == j+1) and data[j].count(None) > c_num: 261 | c_num += n_cols - cols_found_in_row 262 | data.add_val(c.get_text(), r_num, c_num) 263 | except Exception as err: 264 | if not config.SILENT_ERRORS: 265 | logger.error(str(err)) 266 | if not config.IGNORE_BAD_ROWS: 267 | raise 268 | 269 | if data.data[data.n_rows- 1].count(None) == data.n_cols: 270 | data.data.pop() 271 | logger.debug("\t\tTrying to parse table...") 272 | return tableparser.parse_table(data) 273 | 274 | def extract_doi(self, soup): 275 | ''' Every Source subclass must be able to extract its doi. ''' 276 | return 277 | 278 | def extract_pmid(self, soup): 279 | ''' Every Source subclass must be able to extract its PMID. ''' 280 | return 281 | 282 | def decode_html_entities(self, html): 283 | ''' Re-encode HTML entities as innocuous little Unicode characters. 
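Replacements are drawn from self.entities, i.e. any source-specific mappings from the JSON config merged with the standard Source.ENTITIES, so thin spaces and Unicode minus signs are normalized before parsing (e.g., an illustrative coordinate cell '−\u200935' becomes '-35').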
''' 284 | # Any entities BeautifulSoup passes through thatwe don't like, e.g., 285 | #  /x0a 286 | if self.entities: 287 | patterns = re.compile('(' + '|'.join(re.escape( 288 | k) for k in list(self.entities.keys())) + ')') 289 | replacements = lambda m: self.entities[m.group(0)] 290 | return patterns.sub(replacements, html) 291 | else: 292 | return html 293 | 294 | def _download_table(self, url): 295 | ''' For Sources that have tables in separate files, a helper for 296 | downloading and extracting the table data. Also saves to file if desired. 297 | ''' 298 | 299 | delay = self.delay if hasattr(self, 'delay') else 0 300 | 301 | if self.table_dir is not None: 302 | filename = '%s/%s' % (self.table_dir, url.replace('/', '_')) 303 | if os.path.exists(filename): 304 | table_html = open(filename).read() 305 | else: 306 | table_html = scrape.get_url(url) 307 | open(filename, 'w').write(table_html.encode('utf-8')) 308 | else: 309 | table_html = scrape.get_url(url) 310 | 311 | if table_html: 312 | table_html = self.decode_html_entities(table_html) 313 | return BeautifulSoup(table_html) 314 | 315 | return None 316 | 317 | 318 | class DefaultSource(Source): 319 | def parse_article(self, html, pmid=None, **kwargs): 320 | soup = super(DefaultSource, self).parse_article(html, pmid, **kwargs) 321 | if not soup: 322 | return False 323 | 324 | self.article.missing_source = True 325 | return self.article 326 | 327 | 328 | class HighWireSource(Source): 329 | 330 | def parse_article(self, html, pmid=None, **kwargs): 331 | soup = super(HighWireSource, self).parse_article(html, pmid, **kwargs) 332 | if not soup: 333 | return False 334 | 335 | # To download tables, we need the content URL and the number of tables 336 | content_url = soup.find('meta', { 337 | 'name': 'citation_public_url'})['content'] 338 | 339 | n_tables = len(soup.find_all('span', class_='table-label')) 340 | 341 | # Now download each table and parse it 342 | tables = [] 343 | logger.info(f"Found {n_tables} tables.") 344 | for i in range(n_tables): 345 | t_num = i + 1 346 | url = '%s/T%d.expansion.html' % (content_url, t_num) 347 | table_soup = self._download_table(url) 348 | if not table_soup: 349 | continue 350 | tc = table_soup.find(class_='table-expansion') 351 | if tc: 352 | t = tc.find('table', {'id': 'table-%d' % (t_num)}) 353 | t = self.parse_table(t) 354 | if t: 355 | t.position = t_num 356 | t.label = tc.find(class_='table-label').text 357 | t.number = t.label.split(' ')[-1].strip() 358 | try: 359 | t.caption = tc.find(class_='table-caption').get_text() 360 | except: 361 | pass 362 | try: 363 | t.notes = tc.find(class_='table-footnotes').get_text() 364 | except: 365 | pass 366 | tables.append(t) 367 | 368 | self.article.tables = tables 369 | return self.article 370 | 371 | def parse_table(self, table): 372 | return super(HighWireSource, self).parse_table(table) 373 | 374 | def extract_doi(self, soup): 375 | try: 376 | return soup.find('meta', {'name': 'citation_doi'})['content'] 377 | except: 378 | return '' 379 | 380 | def extract_pmid(self, soup): 381 | return soup.find('meta', {'name': 'citation_pmid'})['content'] 382 | 383 | def extract_text(self, soup): 384 | # If div has class "main-content-wrapper" or "article" or "fulltext-view" 385 | # extract all text from it 386 | 387 | # Assuming you have a BeautifulSoup object called soup 388 | div = soup.find_all("div", class_="article") 389 | if div: 390 | div = div[0] 391 | div_classes = ["ref-list", "abstract", "copyright-statement", "fn-group", "history-list", "license"] 392 | for 
class_ in div_classes: 393 | for tag in div.find_all(class_=class_): 394 | tag.extract() 395 | soup = div 396 | 397 | return super(HighWireSource, self).extract_text(soup) 398 | 399 | 400 | class OUPSource(Source): 401 | 402 | def parse_article(self, html, pmid=None, **kwargs): 403 | soup = super(OUPSource, self).parse_article(html, pmid, **kwargs) 404 | if not soup: 405 | return False 406 | 407 | # Extract tables 408 | tables = [] 409 | 410 | # Exclude modal tables to prevent duplicates 411 | all_tables = set(soup.select('div.table-full-width-wrap')) 412 | modal_tables = set(soup.select('div.table-full-width-wrap.table-modal')) 413 | table_containers = all_tables - modal_tables 414 | logger.info(f"Found {len(table_containers)} tables.") 415 | for (i, tc) in enumerate(table_containers): 416 | table_html = tc.find('table') 417 | t = self.parse_table(table_html) 418 | if t: 419 | t.position = i + 1 420 | try: 421 | t.number = tc.find('span', class_='label').text.split(' ')[-1].strip() 422 | t.label = tc.find('span', class_='label').text.strip() 423 | except: 424 | pass 425 | try: 426 | t.caption = tc.find('span', class_='caption').get_text() 427 | except: 428 | pass 429 | try: 430 | t.notes = tc.find('span', class_='fn').get_text() 431 | except: 432 | pass 433 | tables.append(t) 434 | 435 | self.article.tables = tables 436 | return self.article 437 | 438 | def parse_table(self, table): 439 | return super(OUPSource, self).parse_table(table) 440 | 441 | def extract_doi(self, soup): 442 | try: 443 | return soup.find('meta', {'name': 'citation_doi'})['content'] 444 | except: 445 | return '' 446 | 447 | def extract_pmid(self, soup): 448 | pmid = soup.find('meta', {'name': 'citation_pmid'}) 449 | if pmid: 450 | return pmid['content'] 451 | else: 452 | return None 453 | 454 | def extract_text(self, soup): 455 | # If div has class "main-content-wrapper" or "article" or "fulltext-view" 456 | # extract all text from it 457 | 458 | # Assuming you have a BeautifulSoup object called soup 459 | div = soup.find_all("div", class_="article-body") 460 | if div: 461 | div = div[0] 462 | div_classes = ["ref-list", "abstract", "copyright-statement", "fn-group", "history-list", "license"] 463 | for class_ in div_classes: 464 | for tag in div.find_all(class_=class_): 465 | tag.extract() 466 | soup = div 467 | 468 | return super(OUPSource, self).extract_text(soup) 469 | 470 | 471 | class ScienceDirectSource(Source): 472 | 473 | def parse_article(self, html, pmid=None, **kwargs): 474 | soup = super(ScienceDirectSource, self).parse_article(html, pmid, **kwargs) 475 | if not soup: 476 | return False 477 | 478 | # Extract tables 479 | tables = [] 480 | table_containers = soup.find_all('div', {'class': 'tables'}) 481 | if len(table_containers) == 0: 482 | # try old method 483 | table_containers = soup.find_all('dl', {'class': 'table'}) 484 | 485 | logger.info(f"Found {len(table_containers)} tables.") 486 | for (i, tc) in enumerate(table_containers): 487 | table_html = tc.find('table') 488 | t = self.parse_table(table_html) 489 | if t: 490 | t.position = i + 1 491 | try: 492 | t.number = tc.find('span', class_='label').text.split(' ')[-1].strip() or tc['data-label'].split(' ')[-1].strip() 493 | t.label = tc.find('span', class_='label').text.strip() 494 | except: 495 | pass 496 | try: 497 | t.caption = tc.find('p').contents[-1].strip() 498 | except: 499 | pass 500 | try: 501 | t.notes = tc.find(class_='tblFootnote').get_text() 502 | except: 503 | pass 504 | tables.append(t) 505 | 506 | self.article.tables = tables 507 | 
return self.article 508 | 509 | def parse_table(self, table): 510 | return super(ScienceDirectSource, self).parse_table(table) 511 | 512 | def extract_doi(self, soup): 513 | try: 514 | return list(soup.find('div', {'id': 'article-identifier-links'}).children)[0]['href'].replace('https://doi.org/', '') 515 | except: 516 | return '' 517 | 518 | def extract_pmid(self, soup): 519 | return scrape.get_pmid_from_doi(self.extract_doi(soup)) 520 | 521 | 522 | class PlosSource(Source): 523 | 524 | def parse_article(self, html, pmid=None, **kwargs): 525 | soup = super(PlosSource, self).parse_article(html, pmid, **kwargs) # Do some preprocessing 526 | if not soup: 527 | return False 528 | 529 | # Extract tables 530 | tables = [] 531 | table_containers = soup.find_all('table-wrap') 532 | logger.info(f"Found {len(table_containers)} tables.") 533 | for (i, tc) in enumerate(table_containers): 534 | table_html = tc.find('table') 535 | t = self.parse_table(table_html) 536 | if t: 537 | t.position = i + 1 538 | t.label = tc.find('label').text 539 | t.number = t.label.split(' ')[-1].strip() 540 | try: 541 | t.caption = tc.find('title').get_text() 542 | except: 543 | pass 544 | try: 545 | t.notes = tc.find('table-wrap-foot').get_text() 546 | except: 547 | pass 548 | tables.append(t) 549 | 550 | self.article.tables = tables 551 | return self.article 552 | 553 | def parse_table(self, table): 554 | return super(PlosSource, self).parse_table(table) 555 | 556 | def extract_doi(self, soup): 557 | try: 558 | return soup.find('article-id', {'pub-id-type': 'doi'}).text 559 | except: 560 | return '' 561 | 562 | def extract_pmid(self, soup): 563 | return scrape.get_pmid_from_doi(self.extract_doi(soup)) 564 | 565 | 566 | class FrontiersSource(Source): 567 | 568 | def parse_article(self, html, pmid=None, **kwargs): 569 | 570 | soup = super(FrontiersSource, self).parse_article(html, pmid, **kwargs) 571 | if not soup: 572 | return False 573 | 574 | # Extract tables 575 | tables = [] 576 | table_containers = soup.findAll( 577 | 'table-wrap', {'id': re.compile('^T\d+$')}) 578 | logger.info(f"Found {len(table_containers)} tables.") 579 | for (i, tc) in enumerate(table_containers): 580 | table_html = tc.find('table') 581 | t = self.parse_table(table_html) 582 | # If Table instance is returned, add other properties 583 | if t: 584 | t.position = i + 1 585 | t.number = tc['id'][1::].strip() 586 | t.label = tc.find('label').get_text() 587 | try: 588 | t.caption = tc.find('caption').get_text() 589 | except: 590 | pass 591 | try: 592 | t.notes = tc.find('table-wrap-foot').get_text() 593 | except: 594 | pass 595 | tables.append(t) 596 | 597 | self.article.tables = tables 598 | return self.article 599 | 600 | def parse_table(self, table): 601 | return super(FrontiersSource, self).parse_table(table) 602 | 603 | def extract_doi(self, soup): 604 | try: 605 | return soup.find('article-id', {'pub-id-type': 'doi'}).text 606 | except: 607 | return '' 608 | 609 | def extract_pmid(self, soup): 610 | return scrape.get_pmid_from_doi(self.extract_doi(soup)) 611 | 612 | 613 | class JournalOfCognitiveNeuroscienceSource(Source): 614 | 615 | def parse_article(self, html, pmid=None, **kwargs): 616 | soup = super( 617 | JournalOfCognitiveNeuroscienceSource, self).parse_article(html, pmid, **kwargs) 618 | if not soup: 619 | return False 620 | 621 | # To download tables, we need the DOI and the number of tables 622 | doi = self.article.doi or self.extract_doi(soup) 623 | tables = [] 624 | 625 | # Now download each table and parse it 626 | table_containers 
= soup.find_all('div', {'class': 'table-wrap'}) 627 | logger.info(f"Found {len(table_containers)} tables.") 628 | for i, tc in enumerate(table_containers): 629 | table_html = tc.find('table', {'role': 'table'}) 630 | if not table_html: 631 | continue 632 | 633 | t = self.parse_table(table_html) 634 | 635 | if t: 636 | t.position = i + 1 637 | s = re.search('T(\d+).+$', tc['content-id']) 638 | if s: 639 | t.number = s.group(1) 640 | caption = tc.find('div', class_='caption') 641 | if caption: 642 | t.label = caption.get_text() 643 | t.caption = caption.get_text() 644 | try: 645 | t.notes = tc.find('div', class_="fn").p.get_text() 646 | except: 647 | pass 648 | tables.append(t) 649 | 650 | self.article.tables = tables 651 | return self.article 652 | 653 | def parse_table(self, table): 654 | return super(JournalOfCognitiveNeuroscienceSource, self).parse_table(table) 655 | 656 | def extract_doi(self, soup): 657 | try: 658 | return soup.find('meta', {'name': 'dc.Identifier', 'scheme': 'doi'})['content'] 659 | except: 660 | return '' 661 | 662 | def extract_pmid(self, soup): 663 | return scrape.get_pmid_from_doi(self.extract_doi(soup)) 664 | 665 | 666 | class WileySource(Source): 667 | 668 | def parse_article(self, html, pmid=None, **kwargs): 669 | 670 | soup = super(WileySource, self).parse_article(html, pmid, **kwargs) # Do some preprocessing 671 | if not soup: 672 | return False 673 | 674 | # Extract tables 675 | tables = [] 676 | table_containers = soup.findAll('div', { 677 | 'class': re.compile('article-table-content|table'), 'id': re.compile('^(.*?)\-tbl\-\d+$|^t(bl)*\d+$')}) 678 | logger.info(f"Found {len(table_containers)} tables.") 679 | for (i, tc) in enumerate(table_containers): 680 | table_html = tc.find('table') 681 | footer = None 682 | try: 683 | # Remove footer, which appears inside table 684 | footer = table_html.tfoot.extract() 685 | except: 686 | pass 687 | t = self.parse_table(table_html) 688 | # If Table instance is returned, add other properties 689 | if t: 690 | t.position = i + 1 691 | # t.number = tc['id'][3::].strip() 692 | t.number = re.search('t[bl0\-]*(\d+)$', tc['id']).group(1) 693 | try: 694 | t.label = tc.find('span', class_='label').get_text() 695 | except: 696 | pass 697 | try: 698 | t.caption = tc.find('caption').get_text() 699 | except AttributeError: 700 | caption = tc.find('div', {'header': 'article-table-caption'}) 701 | t.caption = caption.get_text() if caption else None 702 | try: 703 | t.notes = footer.get_text() if footer else None 704 | except AttributeError: 705 | notes = tc.find('div', {'class': 'article-section__table-footnotes'}) 706 | t.notes = notes.get_text() if caption else None 707 | pass 708 | tables.append(t) 709 | 710 | self.article.tables = tables 711 | return self.article 712 | 713 | def parse_table(self, table): 714 | return super(WileySource, self).parse_table(table) 715 | 716 | def extract_doi(self, soup): 717 | try: 718 | return soup.find('meta', {'name': 'citation_doi'})['content'] 719 | except: 720 | return '' 721 | 722 | def extract_pmid(self, soup): 723 | return scrape.get_pmid_from_doi(self.extract_doi(soup)) 724 | 725 | # Note: the SageSource is largely useless and untested because Sage renders tables 726 | # as images. 
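# Coordinates embedded in table images cannot be recovered from the HTML alone, so this source will generally yield no table data.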
727 | 728 | 729 | class SageSource(Source): 730 | 731 | def parse_article(self, html, pmid=None, **kwargs): 732 | 733 | soup = super(SageSource, self).parse_article(html, pmid, **kwargs) 734 | if not soup: 735 | return False 736 | 737 | # To download tables, we need the content URL and the number of tables 738 | content_url = soup.find('meta', { 739 | 'name': 'citation_public_url'})['content'] 740 | 741 | n_tables = len(soup.find_all('span', class_='table-label')) 742 | logger.info(f"Found {n_tables} tables.") 743 | # Now download each table and parse it 744 | tables = [] 745 | for i in range(n_tables): 746 | t_num = i + 1 747 | url = '%s/T%d.expansion.html' % (content_url, t_num) 748 | table_soup = self._download_table(url) 749 | if not table_soup: 750 | continue 751 | tc = table_soup.find(class_='table-expansion') 752 | if tc: 753 | t = tc.find('table', {'id': 'table-%d' % (t_num)}) 754 | t = self.parse_table(t) 755 | if t: 756 | t.position = t_num 757 | t.label = tc.find(class_='table-label').text 758 | t.number = t.label.split(' ')[-1].strip() 759 | try: 760 | t.caption = tc.find(class_='table-caption').get_text() 761 | except: 762 | pass 763 | try: 764 | t.notes = tc.find(class_='table-footnotes').get_text() 765 | except: 766 | pass 767 | tables.append(t) 768 | 769 | self.article.tables = tables 770 | return self.article 771 | 772 | def parse_table(self, table): 773 | return super(SageSource, self).parse_table(table) 774 | 775 | def extract_doi(self, soup): 776 | try: 777 | return soup.find('meta', {'name': 'citation_doi'})['content'] 778 | except: 779 | return '' 780 | 781 | def extract_pmid(self, soup): 782 | return soup.find('meta', {'name': 'citation_pmid'})['content'] 783 | 784 | 785 | class OldSpringerSource(Source): 786 | 787 | def parse_article(self, html, pmid=None, **kwargs): 788 | 789 | soup = super(OldSpringerSource, self).parse_article(html, pmid, **kwargs) 790 | if not soup: 791 | return False 792 | 793 | # Extract tables 794 | tables = [] 795 | table_containers = soup.findAll( 796 | 'figure', {'id': re.compile('^Tab\d+$')}) 797 | for (i, tc) in enumerate(table_containers): 798 | table_html = tc.find('table') 799 | t = self.parse_table(table_html) 800 | # If Table instance is returned, add other properties 801 | if t: 802 | t.position = i + 1 803 | t.number = tc['id'][3::].strip() 804 | t.label = tc.find('span', class_='CaptionNumber').get_text() 805 | try: 806 | t.caption = tc.find(class_='CaptionContent').p.get_text() 807 | except: 808 | pass 809 | try: 810 | t.notes = tc.find(class_='TableFooter').p.get_text() 811 | except: 812 | pass 813 | tables.append(t) 814 | 815 | self.article.tables = tables 816 | return self.article 817 | 818 | def parse_table(self, table): 819 | return super(OldSpringerSource, self).parse_table(table) 820 | 821 | def extract_doi(self, soup): 822 | content = soup.find('p', class_='ArticleDOI').get_text() 823 | return content.split(' ')[1] 824 | 825 | def extract_pmid(self, soup): 826 | return scrape.get_pmid_from_doi(self.extract_doi(soup)) 827 | 828 | 829 | class SpringerSource(Source): 830 | 831 | def parse_article(self, html, pmid=None, **kwargs): 832 | 833 | soup = super(SpringerSource, self).parse_article(html, pmid, **kwargs) 834 | if not soup: 835 | return False 836 | 837 | # Extract table; going to take the approach of opening and parsing the table via links 838 | # To download tables, we need the content URL and the number of tables 839 | content_url = soup.find('meta', {'name': 'citation_fulltext_html_url'})['content'] 840 | 841 | 
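# Assumption based on current SpringerLink markup: each table is accompanied by a 'Full size table' link, so counting those spans gives the number of tables to download below.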
n_tables = len(soup.find_all('span', string='Full size table')) 842 | logger.info(f"Found {n_tables} tables.") 843 | # Now download each table and parse it 844 | tables = [] 845 | for i in range(n_tables): 846 | t_num = i + 1 847 | url = '%s/tables/%d' % (content_url, t_num) 848 | table_soup = self._download_table(url) 849 | if not table_soup: 850 | continue 851 | tc = table_soup.find(class_='data last-table') 852 | t = self.parse_table(tc) 853 | if t: 854 | t.position = t_num 855 | 856 | # id_name is the id HTML element that cotains the title, label and table number that needs to be parse 857 | # temp_title sets it up to where the title can be parsed and then categorized 858 | id_name = f"table-{t_num}-title" 859 | temp_title = table_soup.find('h1', attrs={'id': id_name}).get_text().split() 860 | 861 | # grabbing the first two elements for the label and then making them a string object 862 | t.label = " ".join(temp_title[:2]) 863 | t.number = str(temp_title[1]) 864 | try: 865 | # grabbing the rest of the element for the caption/title of the table and then making them a string object 866 | t.caption = " ".join(temp_title[2:]) 867 | except: 868 | pass 869 | try: 870 | t.notes = table_soup.find(class_='c-article-table-footer').get_text() 871 | except: 872 | pass 873 | tables.append(t) 874 | 875 | self.article.tables = tables 876 | return self.article 877 | 878 | def parse_table(self, table): 879 | return super(SpringerSource, self).parse_table(table) 880 | 881 | def extract_doi(self, soup): 882 | try: 883 | return soup.find('meta', attrs={'name': "citation_doi"})['content'] 884 | except: 885 | return '' 886 | 887 | def extract_pmid(self, soup): 888 | return scrape.get_pmid_from_doi(self.extract_doi(soup)) 889 | 890 | 891 | class PMCSource(Source): 892 | def parse_article(self, html, pmid=None, **kwargs): 893 | soup = super(PMCSource, self).parse_article(html, pmid, **kwargs) 894 | if not soup: 895 | return False 896 | 897 | tables = [] 898 | table_containers = soup.findAll('div', {'class': 'table-wrap'}) 899 | logger.info(f"Found {len(table_containers)} tables.") 900 | for (i, tc) in enumerate(table_containers): 901 | sub_tables = tc.findAll('div', {'class': 'xtable'}) 902 | for st in sub_tables: 903 | t = self.parse_table(st) 904 | if t: 905 | t.position = i + 1 906 | t.label = tc.find('h3').text if tc.find('h3') else None 907 | t.number = t.label.split(' ')[-1].strip() if t.label else None 908 | try: 909 | t.caption = tc.find({"div": {"class": "caption"}}).text 910 | except: 911 | pass 912 | try: 913 | t.notes = tc.find('div', class_='tblwrap-foot').text 914 | except: 915 | pass 916 | tables.append(t) 917 | 918 | self.article.tables = tables 919 | return self.article 920 | 921 | def extract_pmid(self, soup): 922 | return soup.find('meta', {'name': 'citation_pmid'})['content'] 923 | 924 | def extract_doi(self, soup): 925 | return soup.find('meta', {'name': 'citation_doi'})['content'] 926 | -------------------------------------------------------------------------------- /ace/sources/Frontiers.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Frontiers", 3 | "identifiers": [ 4 | "10.3389" 5 | ], 6 | "entities": { 7 | "−": "-", 8 | " ": " " 9 | } 10 | } -------------------------------------------------------------------------------- /ace/sources/HighWire.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "HighWire", 3 | "identifiers": [ 4 | "highwire-journal", 5 | 
"http://schema.highwire.org/Linking", 6 | "highwire-journal-article" 7 | ], 8 | "entities": { 9 | }, 10 | "delay": 10 11 | } 12 | -------------------------------------------------------------------------------- /ace/sources/JournalOfCognitiveNeuroscience.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "JournalOfCognitiveNeuroscience", 3 | "identifiers": [ 4 | "property=\"og:site_name\" content=\"MIT Press\"", 5 | "MIT Press Journals - Journal of Cognitive Neuroscience - Full Text" 6 | ], 7 | "entities": { 8 | "\u2002": " " 9 | }, 10 | "delay": 10 11 | } 12 | -------------------------------------------------------------------------------- /ace/sources/OUP.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "OUP", 3 | "identifiers": [ 4 | "OUP Academic" 5 | ], 6 | "entities": { 7 | }, 8 | "delay": 10 9 | } -------------------------------------------------------------------------------- /ace/sources/OldSpringer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "OldSpringer", 3 | "identifiers": [ 4 | "- Springer" 5 | ], 6 | "entities": { 7 | 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /ace/sources/PMC.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "PMC", 3 | "identifiers": [ 4 | "", 5 | "" 6 | ], 7 | "entities": { 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /ace/sources/Plos.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Plos", 3 | "identifiers": [ 4 | "Public Library of Science" 5 | ], 6 | "entities": { 7 | } 8 | } -------------------------------------------------------------------------------- /ace/sources/Sage.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Sage", 3 | "identifiers": [ 4 | "" 5 | ], 6 | "entities": { 7 | 8 | } 9 | } -------------------------------------------------------------------------------- /ace/sources/ScienceDirect.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ScienceDirect", 3 | "identifiers": [ 4 | "title=\"ScienceDirect -The world's leading full-text scientific database\"", 5 | "- ScienceDirect" 6 | ], 7 | "entities": { 8 | 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /ace/sources/Springer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Springer", 3 | "identifiers": [ 4 | "", 5 | "meta property=\"og:site_name\" content=\"SpringerLink\"" 6 | ], 7 | "entities": { 8 | 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /ace/sources/Wiley.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Wiley", 3 | "identifiers": [ 4 | "Wiley Online Library" 5 | ], 6 | "entities": { 7 | 8 | } 9 | } -------------------------------------------------------------------------------- /ace/tableparser.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | # use unicode everywhere 3 | 4 | # import database 5 | import regex # Note: we're using features in the new regex module, not re! 6 | import logging 7 | from . 
import config 8 | from .database import Activation, Table 9 | from collections import Counter, defaultdict 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def identify_standard_columns(labels): 16 | ''' Takes a set of column labels and returns an equal-length list with names 17 | of any standard columns detected. Unknown columns are assigned None. 18 | E.g., passing in ['p value', 'brain region', 'unknown_col'] would return 19 | ['p_value', 'region', None]. 20 | ''' 21 | standardized = [None] * len(labels) 22 | found_coords = False 23 | for i, lab in enumerate(labels): 24 | if regex.search('(^\s*ba$)|brodmann', lab): 25 | s = 'ba' 26 | elif regex.search('region|anatom|location|area', lab): 27 | s = 'region' 28 | elif regex.search('sphere|(^\s*h$)|^\s*hem|^\s*side', lab): 29 | s = 'hemisphere' 30 | elif regex.search('(^k$)|(mm.*?3)|volume|voxels|size|extent', lab): 31 | s = 'size' 32 | elif regex.match('\s*[xy]\s*$', lab): 33 | found_coords = True 34 | s = lab 35 | elif regex.match('\s*z\s*$', lab): 36 | # For z, we need to distinguish z plane from z-score. 37 | # Use simple heuristics: 38 | # * If no 'x' column exists, this must be a z-score 39 | # * If the preceding label was anything but 'y', must be a z-score 40 | # * Otherwise it's a z coordinate 41 | # Note: this could theoretically break if someone has non-contiguous 42 | # x-y-z columns, but this seems unlikely. If it does happen, 43 | # an alternative approach would be to check if the case of the 'z' column 44 | # matches the case of the 'x' column and make determination that 45 | # way. 46 | s = 'statistic' if not found_coords or labels[i - 1] != 'y' else 'z' 47 | elif regex.search('rdinate', lab): 48 | continue 49 | elif lab == 't' or regex.search('^(max.*(z|t).*|.*(z|t).*(score|value|max))$', lab): 50 | s = 'statistic' 51 | elif regex.search('p[\-\s]+.*val', lab): 52 | s = 'p_value' 53 | else: 54 | s = None 55 | standardized[i] = s 56 | return standardized 57 | 58 | 59 | def identify_repeating_groups(labels): 60 | ''' Identify groups: any sets of columns where names repeat. 61 | Repeating groups must be contiguous; i.e., [x, y, z, w, x, y, z, f] 62 | will not match, but [w, f, x, y, z, x, y, z] will. 63 | 64 | Note that this will only handle one level of repetition; i.e., 65 | hierarchical groupings will be ignored. E.g., in a 2 x 2 x 3 66 | nesting of columns like hemisphere --> condition --> x/y/z, 67 | only the 4 sets of repeating x/y/z columns will be detected. 68 | 69 | Returns a list of strings made up of the index of the first column 70 | in the group and the number of columns. E.g., '1/3' indicates the 71 | group starts at the second column and contains 3 columns. These 72 | keys can be used to directly look up names stored in a 73 | multicolumn_label dictionary. 
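For example (illustrative), labels ['region', 'x', 'y', 'z', 'x', 'y', 'z'] contain two repeating x/y/z groups and would yield ['1/3', '4/3'], i.e. three-column groups starting at the second and fifth columns.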
74 | ''' 75 | # OLD ALGORITHM: MUCH SIMPLER AND FASTER BUT DOESN'T WORK PROPERLY 76 | # FOR NON-CONTIGUOUS COLUMN GROUPS 77 | # target = '###'.join(unicode(x) for x in labels) 78 | # pattern = regex.compile(r'(.+?###.+?)(###\1)+') 79 | # matches = pattern.finditer(target) 80 | # groups = [] 81 | # for m in matches: 82 | # sp = m.span() 83 | # n_cols_in_group = len(m.group(1).split('###')) 84 | # start = len(target[0:sp[0]].split('###'))-1 85 | # n_matches = len(m.group(0).split('###')) 86 | # for i in range(n_matches/n_cols_in_group): 87 | # groups.append('%d/%d' % ((i*n_cols_in_group)+start, n_cols_in_group)) 88 | # return list(set(groups)) 89 | 90 | groups = [] 91 | n_labels = len(labels) 92 | label_counts = Counter(labels) 93 | rep_labels = set([k for k, v in list(label_counts.items()) if v > 1]) 94 | # Track multi-label sequences. Key/value = sequence/onset 95 | label_seqs = defaultdict(list) 96 | 97 | # Loop over labels and identify any sequences made up entirely of labels with 98 | # 2 or more occurrences in the list and without the starting label repeating. 99 | for i, lab in enumerate(labels): 100 | if lab not in rep_labels: 101 | continue 102 | current_seq = [lab] 103 | for j in range(i+1, n_labels): 104 | lab_j = labels[j] 105 | if lab_j not in rep_labels or lab_j == lab: 106 | break 107 | current_seq.append(lab_j) 108 | if len(current_seq) > 1: 109 | label_seqs['###'.join(current_seq)].append(i) 110 | 111 | # Keep only sequences that occur two or more times 112 | label_seqs = { k: v for k, v in list(label_seqs.items()) if len(v) > 1} 113 | 114 | # Invert what's left into a list where the sequence occurs at its start pos 115 | seq_starts = [None] * n_labels 116 | for k, v in list(label_seqs.items()): 117 | for start in v: 118 | seq_starts[start] = k.split('###') 119 | 120 | # Create boolean array to track whether each element has already been used 121 | labels_used = [False] * n_labels 122 | 123 | # Loop through labels and add a group if we find a sequence that starts at 124 | # the current position and spans at least one currently unused cell. 125 | # This is necessary to account for cases where one sequence isn't always 126 | # part of the same supersequence, e.g., the y/z in x/y/z could also be a 127 | # part of a/y/z or b/y/z. 128 | for i, lab in enumerate(labels): 129 | if seq_starts[i] is not None: 130 | seq_size = len(seq_starts[i]) 131 | if not all(labels_used[i:(i+seq_size)]): 132 | labels_used[i:(i+seq_size)] = [True] * seq_size 133 | 134 | # We need to make sure the group contains x/y/z information, 135 | # otherwise we'll end up duplicating a lot of activations. 136 | # This is not a very good place to put this check; eventually 137 | # we need to refactor much of this class. 138 | groups.append('%d/%d' % (i, seq_size)) 139 | 140 | return groups 141 | 142 | 143 | 144 | def create_activation(data, labels, standard_cols, group_labels=[]): 145 | 146 | activation = Activation() 147 | 148 | for i, col in enumerate(data): 149 | 150 | # Replace unicode minus signs with hyphens 151 | replace = ['֊', '‐', '‑', '⁃', '﹣', '-', '‒', '–', '—', '﹘', '−', '-'] 152 | for c in replace: 153 | if c in col: 154 | col = col.replace(c, '-') 155 | col = col.replace(c + c, '-') 156 | 157 | # Cast to integer or float if appropriate 158 | # if regex.match('[-\d]+$', col): 159 | # col = int(col) 160 | # elif regex.match('[-\d\.]+$', col): 161 | # col = float(col) 162 | 163 | # Set standard attributes if applicable and do validation where appropriate. 
164 | # Generally, validation will not prevent a bad value from making it into the 165 | # activation object, but it will flag any potential issues using the "problem" column. 166 | if standard_cols[i] is not None: 167 | 168 | sc = standard_cols[i] 169 | 170 | # Validate XYZ columns: Should only be integers (and possible trailing decimals). 171 | # If they're not, keep only leading numbers. The exception is that ScienceDirect 172 | # journals often follow the minus sign with a space (e.g., - 35), which we strip. 173 | if regex.match('[xyz]$', sc): 174 | m = regex.match('([-])\s?(\d+\.*\d*)$', col) 175 | if m: 176 | col = "%s%s" % (m.group(1), m.group(2)) 177 | if not regex.match('([-]*\d+)\.*\d*$', col): 178 | logging.debug("Value %s in %s column is not valid" % (col, sc)) 179 | activation.problems.append("Value in %s column is not valid" % sc) 180 | return activation 181 | col = (float(col)) 182 | 183 | elif sc == 'region': 184 | if not regex.search('[a-zA-Z]', col): 185 | logging.debug("Value in region column is not a string") 186 | activation.problems.append("Value in region column is not a string") 187 | 188 | setattr(activation, sc, col) 189 | 190 | # Always include all columns in record 191 | activation.add_col(labels[i], col) 192 | 193 | # Handle columns with multiple coordinates (e.g., 45;12;-12). 194 | # Assume that any series of 3 numbers in a non-standard column 195 | # reflects coordinates. Will fail if there are leading numbers!!! 196 | # Also need to remove space between minus sign and numbers; some ScienceDirect 197 | # journals leave a gap. 198 | if not i in standard_cols: 199 | cs = '([-]?\d{1,3}\.?\d{0,2})' 200 | clean_col = regex.sub(r'(? for a new group, an entire is used 10 | http://www.sciencedirect.com/science/article/pii/S1053811911007609 11 | 12 | # Doesn't detect x/y/z in Table 1 correctly 13 | http://www.plosone.org/article/info%3Adoi%2F10.1371%2Fjournal.pone.0068494 14 | -------------------------------------------------------------------------------- /examples/create_db_and_add_articles.py: -------------------------------------------------------------------------------- 1 | # In this example we create a new DB file and process a bunch of 2 | # articles. Note that due to copyright restrictions, articles can't 3 | # be included in this package, so you'll need to replace PATH_TO_FILES 4 | # with something that works. 5 | 6 | from ace import database 7 | from ace.ingest import add_articles 8 | 9 | # Uncomment the next line to seem more information 10 | # ace.set_logging_level('info') 11 | 12 | # Change this to a valid path to a set of html files. 13 | PATH_TO_FILES = "/home/zorro/neurosynth_scrape/articles/html/Neuroimage/*" 14 | 15 | db = database.Database(adapter='sqlite', db_name='sqlite:///example_db.db') 16 | add_articles(db, PATH_TO_FILES, pmid_filenames=True) 17 | db.print_stats() 18 | -------------------------------------------------------------------------------- /examples/fetch_articles_from_pubmed.py: -------------------------------------------------------------------------------- 1 | """ Query PubMed for results from several journals, and save to file. 2 | The resulting directory can then be passed to the Database instance for 3 | extraction, as in the create_db_and_add_articles example. 4 | NOTE: selenium must be installed and working properly for this to work. 5 | Code has only been tested with the Chrome driver. 
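Each entry in the journals dict below maps a journal name to keyword arguments that are passed straight through to Scraper.retrieve_journal_articles() (e.g., delay, mode, search, min_pmid).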
""" 6 | 7 | from ace.scrape import Scraper 8 | import ace 9 | import os 10 | 11 | 12 | journals = { 13 | 'Neuroimage': { 14 | 'delay': 20, # Mean delay between article downloads--prevents the banhammer 15 | 'mode': 'browser', # ScienceDirect journals require selenium to work properly 16 | 'search': 'fmri', # Only retrieve articles with this string in abstract 17 | 'min_pmid': 34447833, # Start from this PMID--can run incrementally 18 | } 19 | } 20 | 21 | # Verbose output 22 | ace.set_logging_level('debug') 23 | 24 | # Create temporary output dir 25 | output_dir = '/tmp/articles' 26 | if not os.path.exists(output_dir): 27 | os.makedirs(output_dir) 28 | 29 | # Initialize Scraper 30 | scraper = Scraper('/tmp/articles') 31 | 32 | # Loop through journals and 33 | for j, settings in list(journals.items()): 34 | scraper.retrieve_journal_articles(j, skip_pubmed_central=True, **settings) 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /requirements.dev.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-recording 3 | vcrpy 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | regex 3 | requests 4 | simplejson 5 | sqlalchemy 6 | selenium 7 | seleniumbase 8 | tqdm 9 | xmltodict 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | # Borrowing a trick from nibabel 5 | if len(set(('test', 'easy_install', 'develop')).intersection(sys.argv)) > 0: 6 | import setuptools 7 | 8 | from distutils.core import setup 9 | 10 | extra_setuptools_args = {} 11 | if 'setuptools' in sys.modules: 12 | extra_setuptools_args = dict( 13 | tests_require=['nose'], 14 | test_suite='nose.collector', 15 | extras_require=dict( 16 | test='nose>=0.10.1') 17 | ) 18 | 19 | # fetch version from within ACE module 20 | with open(os.path.join('ace', 'version.py')) as f: 21 | exec(f.read()) 22 | 23 | setup(name="ace", 24 | version=__version__, 25 | description="Automated Coordinate Extraction", 26 | maintainer='Tal Yarkoni', 27 | maintainer_email='tyarkoni@gmail.com', 28 | url='http://github.com/neurosynth/ace', 29 | packages=["ace", 30 | "ace.tests"], 31 | package_data={'ace': ['sources/*'], 32 | 'ace.tests': ['data/*'] 33 | }, 34 | **extra_setuptools_args 35 | ) 36 | --------------------------------------------------------------------------------