├── tests ├── __init__.py ├── collectors │ ├── gsk │ │ ├── __init__.py │ │ └── test_parser.py │ ├── hra │ │ ├── __init__.py │ │ └── test_collector.py │ ├── nct │ │ ├── __init__.py │ │ └── test_parser.py │ ├── euctr │ │ ├── __init__.py │ │ └── test_parser.py │ ├── pubmed │ │ ├── __init__.py │ │ ├── test_spider.py │ │ └── test_parser.py │ ├── takeda │ │ ├── __init__.py │ │ └── test_parser.py │ └── base │ │ └── test_fields.py ├── test_isrctn.py ├── test_gsk.py ├── test_euctr.py ├── conftest.py └── cassettes │ ├── nct.test_parser.TestNctParser.test_parser_parse_text.json │ ├── nct.test_parser.TestNctParser.test_parser_parse_dict.json │ └── nct.test_parser.TestNctParser.test_parser_parse_list.json ├── collectors ├── __init__.py ├── actrn │ ├── __init__.py │ ├── collector.py │ ├── spider.py │ └── record.py ├── euctr │ ├── __init__.py │ ├── collector.py │ └── spider.py ├── fda_dap │ ├── __init__.py │ ├── collector.py │ └── record.py ├── fdadl │ ├── __init__.py │ ├── record.py │ └── collector.py ├── gsk │ ├── __init__.py │ ├── collector.py │ ├── spider.py │ └── record.py ├── hra │ ├── __init__.py │ ├── record.py │ ├── collector.py │ └── parser.py ├── icdcm │ ├── __init__.py │ ├── record.py │ └── collector.py ├── icdpcs │ ├── __init__.py │ ├── record.py │ └── collector.py ├── ictrp │ ├── __init__.py │ ├── collector.py │ ├── spider.py │ └── record.py ├── isrctn │ ├── __init__.py │ ├── collector.py │ ├── spider.py │ └── record.py ├── jprn │ ├── __init__.py │ ├── collector.py │ ├── spider.py │ ├── parser.py │ └── record.py ├── nct │ ├── __init__.py │ ├── collector.py │ └── record.py ├── pfizer │ ├── __init__.py │ ├── collector.py │ ├── record.py │ ├── spider.py │ └── parser.py ├── pubmed │ ├── __init__.py │ ├── collector.py │ ├── record.py │ └── spider.py ├── takeda │ ├── __init__.py │ ├── collector.py │ ├── spider.py │ ├── record.py │ └── parser.py ├── cochrane_reviews │ ├── __init__.py │ ├── record.py │ ├── collector.py │ └── parser.py └── base │ ├── __init__.py │ ├── cli.py │ ├── pipelines.py │ ├── helpers.py │ ├── config.py │ ├── fields.py │ └── record.py ├── migrations ├── __init__.py ├── versions │ ├── 20160510091510_fda_rename_table_to_fdadl.py │ ├── 20160901171321_add_results_url_to_gsk.py │ ├── 20161102144050_add_trial_results_url_to_euctr.py │ ├── 20161206150412_add_exempt_results_to_nct.py │ ├── 20170215125221_add_registry_ids_to_pubmed.py │ ├── 20160819163953_fdadl_add_fda_application_number.py │ ├── 20160610145922_pubmed_add_mesh.py │ ├── 20170214191843_pubmed_rename_identifiers_list_to_article_ids.py │ ├── 20160303155834_pfizer_takeda_add_pk.py │ ├── 20160406115944_ictrp_simplify_primary_key.py │ ├── 20160311153848_add_data_prefix_to_tables.py │ ├── 20160323090938_remove_data_prefix_from_tables.py │ ├── 20160408164205_create_meta_id_indexes.py │ ├── 20160525192212_euctr_fix_column_names.py │ ├── 20160525134303_takeda_fix_column_names.py │ ├── 20160525133746_isrctn_fix_column_names.py │ ├── 20160311151047_update_meta_identifier.py │ ├── 20160831125422_add_drug_name_active_ingredients_and_company_to_fda_dap.py │ ├── 20160224180815_trials_create_table.py │ ├── 20160323145124_trials_remove_table.py │ ├── 20160220212552_nct_fix_boolean_columns.py │ ├── 20160510000353_fda_create_table.py │ ├── 20160525132926_gsk_fix_column_names.py │ ├── 20160509115712_icdcm_create_table.py │ ├── 20160509133714_icdpcs_create_table.py │ ├── 20170123144318_default_for_meta_created_and_meta_updated.py │ ├── 20161007222818_create_cochrane_reviews_table.py │ ├── 20160725130032_fda_dap_create_table.py │ ├── 
20160226134759_pfizer_create_table.py │ ├── 20160525130300_actrn_fix_column_names.py │ ├── 20170123151655_add_trigger_for_meta_updated.py │ ├── 20160428204857_pubmed_create_table.py │ ├── 20160301131954_ictrp_create_table.py │ ├── 20160229142254_takeda_create_table.py │ ├── 20160525105409_euctr_fix_column_names.py │ ├── 20160603215242_hra_create_table.py │ ├── 20160220164104_nct_create_table.py │ └── 20160220175816_isrctn_create_table.py ├── script.py.mako ├── config.py └── env.py ├── pytest.ini ├── scrapy.cfg ├── .dockerignore ├── requirements.in ├── requirements.dev.txt ├── pylama.ini ├── Dockerfile ├── tox.ini ├── Makefile ├── .travis.yml ├── CONTRIBUTING.md ├── docs ├── warehouse.md ├── collectors │ ├── actrn.md │ ├── takeda.md │ ├── pfizer.md │ ├── pubmed.md │ ├── isrctn.md │ ├── ictrp.md │ ├── nct.md │ ├── gsk.md │ ├── jprn.md │ └── euctr.md └── overview.md ├── .env.example ├── .gitignore ├── LICENSE.md ├── alembic.ini ├── README.md ├── docker-compose.yml └── requirements.txt /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /collectors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/collectors/gsk/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/collectors/hra/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/collectors/nct/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/collectors/euctr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/collectors/pubmed/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/collectors/takeda/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths=tests 3 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | default = collectors.base.config 3 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !collectors/ 3 | !migrations/ 4 | !alembic.ini 5 | !Makefile 6 | !requirements.txt 7 | !scrapy.cfg 8 | -------------------------------------------------------------------------------- /requirements.in: 
-------------------------------------------------------------------------------- 1 | scrapy 2 | dataset==0.7.1 3 | alembic 4 | psycopg2 5 | xmltodict 6 | sqlalchemy 7 | python-dotenv 8 | requests==2.12.2 9 | ijson 10 | pytz 11 | python-dateutil 12 | raven 13 | -------------------------------------------------------------------------------- /requirements.dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | pyyaml==3.10 # for docker-cloud 3 | docker-cloud 4 | pylama 5 | tox 6 | mock 7 | pytest 8 | pytest-cov 9 | betamax==0.8 10 | coverage 11 | coveralls 12 | ipython 13 | -------------------------------------------------------------------------------- /pylama.ini: -------------------------------------------------------------------------------- 1 | [pylama] 2 | linters = pyflakes,mccabe,pep8 3 | ignore = E105,E128,E731 4 | 5 | [pylama:mccabe] 6 | complexity = 48 7 | 8 | [pylama:pep8] 9 | max_line_length = 160 10 | 11 | [pylama:*/__init__.py] 12 | ignore = W0611 13 | -------------------------------------------------------------------------------- /collectors/actrn/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/euctr/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/fda_dap/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/fdadl/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/gsk/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/hra/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import 
unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/icdcm/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/icdpcs/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/ictrp/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/isrctn/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/jprn/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/nct/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/pfizer/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/pubmed/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | 
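Each `collectors/<name>/__init__.py` in this listing repeats the same pattern: it re-exports the package's `collect` entry point. A minimal sketch of the dynamic lookup this uniformity enables, mirroring `collectors/base/cli.py` shown later (`run_collector` and its arguments are illustrative names, not part of the codebase):

```python
import importlib


def run_collector(name, conf, conn, *args):
    # Resolves e.g. collectors.pubmed and relies on each package
    # re-exporting its collector's `collect` function.
    collect = importlib.import_module('collectors.%s' % name).collect
    collect(conf, conn, *args)
```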
-------------------------------------------------------------------------------- /collectors/takeda/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/cochrane_reviews/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:2.7 2 | WORKDIR /service 3 | COPY requirements.txt requirements.txt 4 | RUN pip install --upgrade -r requirements.txt 5 | COPY collectors collectors 6 | COPY migrations migrations 7 | COPY alembic.ini alembic.ini 8 | COPY Makefile Makefile 9 | COPY scrapy.cfg scrapy.cfg 10 | -------------------------------------------------------------------------------- /collectors/base/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from . import fields 8 | from . import helpers 9 | from .record import Record 10 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | py27 4 | lint 5 | skipsdist = True 6 | 7 | [testenv] 8 | deps = 9 | -r{toxinidir}/requirements.txt 10 | pytest 11 | mock 12 | betamax==0.8 13 | setenv = 14 | PYTHON_ENV = testing 15 | passenv = 16 | TEST_WAREHOUSE_URL 17 | commands = 18 | py.test {posargs} 19 | 20 | [testenv:lint] 21 | deps = 22 | pylama 23 | commands = 24 | pylama {toxinidir}/collectors {toxinidir}/migrations 25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all build install list migrate start test up 2 | 3 | all: list 4 | 5 | build: 6 | docker build -t opentrials/collectors . 
7 | 8 | list: 9 | @grep '^\.PHONY' Makefile | cut -d' ' -f2- | tr ' ' '\n' 10 | 11 | migrate: 12 | alembic upgrade head 13 | 14 | start: 15 | python -m collectors.base.cli $(filter-out $@,$(MAKECMDGOALS)) 16 | 17 | test: 18 | tox 19 | 20 | up: 21 | docker-compose up 22 | 23 | %: 24 | @: 25 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: 2 | false 3 | 4 | language: 5 | python 6 | 7 | python: 8 | - 2.7 9 | 10 | services: 11 | - postgresql 12 | 13 | addons: 14 | postgresql: '9.4' 15 | 16 | env: 17 | global: 18 | - TEST_WAREHOUSE_URL=postgres://postgres@localhost:5432/opentrials_warehouse_test 19 | 20 | install: 21 | - pip install tox 22 | - psql -c 'create database opentrials_warehouse_test;' -U postgres 23 | 24 | script: 25 | - make test 26 | -------------------------------------------------------------------------------- /collectors/fda_dap/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from .spider import Spider 9 | 10 | 11 | # Module API 12 | 13 | def collect(conf, conn): 14 | process = CrawlerProcess(conf['SCRAPY_SETTINGS']) 15 | process.crawl(Spider, conn=conn) 16 | process.start() 17 | -------------------------------------------------------------------------------- /collectors/pfizer/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from .spider import Spider 9 | 10 | 11 | # Module API 12 | 13 | def collect(conf, conn): 14 | process = CrawlerProcess(conf['SCRAPY_SETTINGS']) 15 | process.crawl(Spider, conn=conn) 16 | process.start() 17 | -------------------------------------------------------------------------------- /collectors/takeda/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from .spider import Spider 9 | 10 | 11 | # Module API 12 | 13 | def collect(conf, conn): 14 | process = CrawlerProcess(conf['SCRAPY_SETTINGS']) 15 | process.crawl(Spider, conn=conn) 16 | process.start() 17 | -------------------------------------------------------------------------------- /tests/test_isrctn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from importlib import import_module 8 | from collectors.isrctn.spider import _make_start_urls 9 | 10 | 11 | # Tests 12 | 13 | def test_make_start_urls(): 14 | result = _make_start_urls('prefix', '2016-01-01', '2016-01-15') 15 | print(result) 16 | assert result 17 | 
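The `collector.py` modules that follow share one shape: build a Scrapy `CrawlerProcess` from `conf['SCRAPY_SETTINGS']`, hand the warehouse connection to the spider, and block until the crawl finishes. A hedged sketch of driving one directly — it assumes `collectors/base/config.py` (not shown in this listing) defines `SCRAPY_SETTINGS` and `WAREHOUSE_URL`; the equivalent CLI call would be `make start actrn 2016-01-01 2016-02-01`:

```python
import dataset

from collectors.base import config, helpers
from collectors import actrn

# The same conf/conn wiring that collectors/base/cli.py performs.
conf = helpers.get_variables(config, str.isupper)
conn = {'warehouse': dataset.connect(config.WAREHOUSE_URL)}

# date_from/date_to are optional; omitting them implies a full scan.
actrn.collect(conf, conn, '2016-01-01', '2016-02-01')
```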
-------------------------------------------------------------------------------- /collectors/actrn/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from .spider import Spider 9 | 10 | 11 | # Module API 12 | 13 | def collect(conf, conn, date_from=None, date_to=None): 14 | process = CrawlerProcess(conf['SCRAPY_SETTINGS']) 15 | process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to) 16 | process.start() 17 | -------------------------------------------------------------------------------- /collectors/euctr/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from .spider import Spider 9 | 10 | 11 | # Module API 12 | 13 | def collect(conf, conn, date_from=None, date_to=None): 14 | process = CrawlerProcess(conf['SCRAPY_SETTINGS']) 15 | process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to) 16 | process.start() 17 | -------------------------------------------------------------------------------- /collectors/gsk/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from .spider import Spider 9 | 10 | 11 | # Module API 12 | 13 | def collect(conf, conn, date_from=None, date_to=None): 14 | process = CrawlerProcess(conf['SCRAPY_SETTINGS']) 15 | process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to) 16 | process.start() 17 | -------------------------------------------------------------------------------- /collectors/jprn/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from .spider import Spider 9 | 10 | 11 | # Module API 12 | 13 | def collect(conf, conn, page_from=None, page_to=None): 14 | process = CrawlerProcess(conf['SCRAPY_SETTINGS']) 15 | process.crawl(Spider, conn=conn, page_from=page_from, page_to=page_to) 16 | process.start() 17 | -------------------------------------------------------------------------------- /tests/test_gsk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from importlib import import_module 8 | from collectors.gsk.spider import _make_start_urls 9 | 10 | 11 | # Tests 12 | 13 | def test_make_start_urls(): 14 | result = _make_start_urls( 15 | 'http://www.gsk-clinicalstudyregister.com/search', 16 | '2015-01-01', '2015-01-31') 17 | print(result) 18 | assert 
result
19 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 | 
3 | The project follows the [Open Knowledge International coding standards](https://github.com/okfn/coding-standards).
4 | 
5 | ## Getting Started
6 | 
7 | ```
8 | virtualenv .python -p python2
9 | source .python/bin/activate
10 | make install
11 | cp .env.example .env
12 | editor .env # set your values
13 | set -a; source .env
14 | ```
15 | 
16 | ## Testing
17 | 
18 | To run tests:
19 | 
20 | ```
21 | $ make test
22 | ```
23 | 
24 | ## Running
25 | 
26 | To run a collector:
27 | 
28 | ```
29 | $ make start <collector>
30 | ```
31 | 
--------------------------------------------------------------------------------
/collectors/isrctn/collector.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from scrapy.crawler import CrawlerProcess
8 | from .spider import Spider
9 | 
10 | 
11 | # Module API
12 | 
13 | def collect(conf, conn, date_from=None, date_to=None):
14 |     process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
15 |     process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
16 |     process.start()
17 | 
--------------------------------------------------------------------------------
/collectors/pubmed/collector.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from scrapy.crawler import CrawlerProcess
8 | from .spider import Spider
9 | 
10 | 
11 | # Module API
12 | 
13 | def collect(conf, conn, date_from=None, date_to=None):
14 |     process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
15 |     process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
16 |     process.start()
17 | 
--------------------------------------------------------------------------------
/collectors/ictrp/collector.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from scrapy.crawler import CrawlerProcess
8 | from .spider import Spider
9 | 
10 | 
11 | # Module API
12 | 
13 | def collect(conf, conn):
14 |     process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
15 |     process.crawl(Spider, conn=conn,
16 |                   http_user=conf['ICTRP_USER'],
17 |                   http_pass=conf['ICTRP_PASS'])
18 |     process.start()
19 | 
--------------------------------------------------------------------------------
/docs/warehouse.md:
--------------------------------------------------------------------------------
1 | # Warehouse
2 | 
3 | This document describes the OpenTrials `warehouse` database.
4 | 
5 | ### Basics
6 | 
7 | This database stores records collected from different sources.
8 | It is a denormalized data store.
9 | 
10 | ### Tables
11 | 
12 | Each table corresponds to a source, and its schema follows the structure of the data at its origin.
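For example, a record module for a hypothetical source `foo` would declare its table roughly as below — a sketch following the `collectors/*/record.py` modules elsewhere in this repository; `foo` and its fields are invented for illustration:

```python
from .. import base
from ..base.fields import Text, Date


# Lives at collectors/foo/record.py, so the relative imports resolve.
class Record(base.Record):

    table = 'foo'  # one warehouse table per source

    foo_id = Text(primary_key=True)  # the source's own identifier
    public_title = Text()
    last_updated = Date('%Y-%m-%d')  # cast from the source's date format
```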
13 | To see the schema of each table, please check the collector-specific docs
14 | [here](https://github.com/opentrials/collectors/tree/master/docs/collectors).
15 | 
16 | ### Technology
17 | 
18 | Database engine: `postgresql-9.4+`.
--------------------------------------------------------------------------------
/tests/test_euctr.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from importlib import import_module
8 | from collectors.euctr.spider import _make_start_urls
9 | 
10 | 
11 | # Tests
12 | 
13 | def test_make_start_urls():
14 |     result = _make_start_urls(
15 |         'https://www.clinicaltrialsregister.eu/ctr-search/search',
16 |         '2015-01-01', '2015-01-02')
17 |     print(result)
18 |     assert result
19 | 
--------------------------------------------------------------------------------
/collectors/cochrane_reviews/record.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from .. import base
8 | from ..base.fields import Text, Json
9 | 
10 | 
11 | class Record(base.Record):
12 |     table = 'cochrane_reviews'
13 | 
14 |     # Fields
15 | 
16 |     id = Text(primary_key=True)
17 |     study_id = Text()
18 |     file_name = Text()
19 |     study_type = Text()
20 |     doi_id = Text()
21 |     robs = Json()
22 |     refs = Json()
23 | 
--------------------------------------------------------------------------------
/collectors/icdcm/record.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from .. import base
8 | from ..base.fields import Text, Date, Array
9 | 
10 | 
11 | # Module API
12 | 
13 | class Record(base.Record):
14 | 
15 |     # Config
16 | 
17 |     table = 'icdcm'
18 | 
19 |     # General
20 | 
21 |     name = Text(primary_key=True)
22 |     desc = Text()
23 |     terms = Array()
24 |     version = Text()
25 |     last_updated = Date('%Y-%m-%d')
26 | 
--------------------------------------------------------------------------------
/migrations/versions/20160510091510_fda_rename_table_to_fdadl.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from alembic import op
8 | 
9 | 
10 | # revision identifiers, used by Alembic.
11 | revision = 'f38e14eac095' 12 | down_revision = u'9f367826f849' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade(): 18 | op.rename_table('fda', 'fdadl') 19 | 20 | 21 | def downgrade(): 22 | op.rename_table('fdadl', 'fda') 23 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Export: set -a; source .env 2 | # Dockerhost: ip route | awk '/docker0/ { print $NF }' 3 | PYTHON_ENV=development 4 | WAREHOUSE_URL=postgres://:@:5432/ 5 | TEST_WAREHOUSE_URL=postgres://:@:5432/ 6 | # LOGGING_URL='.papertrailapp.com:' # optional 7 | # DOWNLOAD_DELAY=1 # optional 8 | # ICTRP_USER='' # optional 9 | # ICTRP_PASS='' # optional 10 | # HRA_ENV='' # optional 11 | # HRA_URL='' # optional 12 | # HRA_USER='' # optional 13 | # HRA_PASS='' # optional 14 | # SENTRY_DSN='' # optional -------------------------------------------------------------------------------- /collectors/fdadl/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. import base 8 | from ..base.fields import Text, Date 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'fdadl' 18 | 19 | # General 20 | 21 | product_ndc = Text(primary_key=True) 22 | fda_application_number = Text() 23 | product_type = Text() 24 | generic_name = Text() 25 | brand_name = Text() 26 | last_updated = Date('%Y-%m-%d') 27 | -------------------------------------------------------------------------------- /migrations/script.py.mako: -------------------------------------------------------------------------------- 1 | ${"# -*- coding: utf-8 -*-"} 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | ${imports if imports else ""} 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = ${repr(up_revision)} 12 | down_revision = ${repr(down_revision)} 13 | branch_labels = ${repr(branch_labels)} 14 | depends_on = ${repr(depends_on)} 15 | 16 | 17 | def upgrade(): 18 | ${upgrades if upgrades else "pass"} 19 | 20 | 21 | def downgrade(): 22 | ${downgrades if downgrades else "pass"} 23 | -------------------------------------------------------------------------------- /collectors/icdpcs/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. 
import base 8 | from ..base.fields import Text, Date, Boolean 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'icdpcs' 18 | 19 | # General 20 | 21 | code = Text(primary_key=True) 22 | is_header = Boolean('0') 23 | short_description = Text() 24 | long_description = Text() 25 | version = Text() 26 | last_updated = Date('%Y-%m-%d') 27 | -------------------------------------------------------------------------------- /migrations/versions/20160901171321_add_results_url_to_gsk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = 'bf807df84277' 13 | down_revision = u'2d52470f8e49' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.add_column('gsk', sa.Column('results_url', sa.Text)) 20 | 21 | 22 | def downgrade(): 23 | op.drop_column('gsk', 'results_url') 24 | -------------------------------------------------------------------------------- /migrations/versions/20161102144050_add_trial_results_url_to_euctr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = 'f35805a0a00f' 13 | down_revision = u'84910d455f31' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.add_column('euctr', sa.Column('trial_results_url', sa.Text)) 20 | 21 | 22 | def downgrade(): 23 | op.drop_column('euctr', 'trial_results_url') 24 | -------------------------------------------------------------------------------- /migrations/versions/20161206150412_add_exempt_results_to_nct.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = '0087dc1eb534' 13 | down_revision = u'f35805a0a00f' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.add_column('nct', sa.Column('results_exemption_date', sa.Date)) 20 | 21 | 22 | def downgrade(): 23 | op.drop_column('nct', 'results_exemption_date') 24 | -------------------------------------------------------------------------------- /migrations/versions/20170215125221_add_registry_ids_to_pubmed.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic. 
12 | revision = 'fd0bb12971d2' 13 | down_revision = u'3dbb46f23ed7' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.add_column('pubmed', sa.Column('registry_ids', sa.dialects.postgresql.JSONB)) 20 | 21 | 22 | def downgrade(): 23 | op.drop_column('pubmed', 'registry_ids') 24 | -------------------------------------------------------------------------------- /migrations/versions/20160819163953_fdadl_add_fda_application_number.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = 'bc7470719f51' 13 | down_revision = u'23c55ccc0649' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.add_column('fdadl', sa.Column('fda_application_number', sa.Text)) 20 | 21 | 22 | def downgrade(): 23 | op.drop_column('fdadl', 'fda_application_number') 24 | -------------------------------------------------------------------------------- /migrations/versions/20160610145922_pubmed_add_mesh.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | from sqlalchemy.dialects.postgresql import JSONB 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '3a3b663824f1' 14 | down_revision = u'c4c0db99bb1c' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.add_column('pubmed', sa.Column('mesh_headings', JSONB)) 21 | 22 | 23 | def downgrade(): 24 | op.drop_column('pubmed', 'mesh_headings') 25 | -------------------------------------------------------------------------------- /migrations/versions/20170214191843_pubmed_rename_identifiers_list_to_article_ids.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic. 
11 | revision = '3dbb46f23ed7' 12 | down_revision = u'b32475938a2d' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade(): 18 | op.alter_column('pubmed', 'identifiers_list', new_column_name='article_ids') 19 | 20 | 21 | def downgrade(): 22 | op.alter_column('pubmed', 'article_ids', new_column_name='identifiers_list') 23 | -------------------------------------------------------------------------------- /tests/collectors/pubmed/test_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from collectors.pubmed.spider import _make_start_urls 8 | 9 | 10 | class TestPubmedSpider(object): 11 | def test_make_start_urls(self, betamax_session): 12 | result = _make_start_urls( 13 | 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi/', 14 | 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi/?db=pubmed&id={pmid}&retmode=xml', 15 | '2016-01-01', '2016-01-01', 16 | session=betamax_session) 17 | assert result 18 | -------------------------------------------------------------------------------- /migrations/versions/20160303155834_pfizer_takeda_add_pk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = 'b0f8a397edad' 12 | down_revision = u'7518ba857fea' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade(): 18 | op.create_primary_key('pfizer_pkey', 'pfizer', ['nct_id']) 19 | op.create_primary_key('takeda_pkey', 'takeda', ['takeda_trial_id']) 20 | 21 | 22 | def downgrade(): 23 | op.drop_constraint('pfizer_pkey', 'pfizer') 24 | op.drop_constraint('takeda_pkey', 'takeda') 25 | -------------------------------------------------------------------------------- /migrations/versions/20160406115944_ictrp_simplify_primary_key.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = '00d329f5f40a' 12 | down_revision = u'58d2189bc678' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade(): 18 | op.drop_constraint('ictrp_pkey', 'ictrp') 19 | op.create_primary_key('ictrp_pkey', 'ictrp', ['main_id']) 20 | 21 | 22 | def downgrade(): 23 | op.drop_constraint('ictrp_pkey', 'ictrp') 24 | op.create_primary_key('ictrp_pkey', 'ictrp', ['register', 'main_id']) 25 | -------------------------------------------------------------------------------- /migrations/versions/20160311153848_add_data_prefix_to_tables.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic. 
11 | revision = 'ec1ab5776710'
12 | down_revision = u'46d169ce43d2'
13 | branch_labels = None
14 | depends_on = None
15 | tables = ['actrn', 'euctr', 'gsk', 'ictrp', 'isrctn', 'jprn', 'nct', 'pfizer', 'takeda']
16 | 
17 | 
18 | def upgrade():
19 |     for table in tables:
20 |         op.rename_table(table, 'data_'+table)
21 | 
22 | 
23 | def downgrade():
24 |     for table in tables:
25 |         op.rename_table('data_'+table, table)
26 | 
--------------------------------------------------------------------------------
/migrations/versions/20160323090938_remove_data_prefix_from_tables.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from alembic import op
8 | 
9 | 
10 | # revision identifiers, used by Alembic.
11 | revision = '89c87deb5a02'
12 | down_revision = u'ec1ab5776710'
13 | branch_labels = None
14 | depends_on = None
15 | tables = ['actrn', 'euctr', 'gsk', 'ictrp', 'isrctn', 'jprn', 'nct', 'pfizer', 'takeda']
16 | 
17 | 
18 | def upgrade():
19 |     for table in tables:
20 |         op.rename_table('data_'+table, table)
21 | 
22 | 
23 | def downgrade():
24 |     for table in tables:
25 |         op.rename_table(table, 'data_'+table)
26 | 
--------------------------------------------------------------------------------
/collectors/fda_dap/record.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from .. import base
8 | from ..base.fields import Text, Date, Integer, Json
9 | 
10 | 
11 | # Module API
12 | 
13 | class Record(base.Record):
14 | 
15 |     # Config
16 | 
17 |     table = 'fda_dap'
18 | 
19 |     # General
20 | 
21 |     id = Text(primary_key=True)
22 |     drug_name = Text()
23 |     active_ingredients = Text()
24 |     company = Text()
25 |     fda_application_num = Text()
26 |     supplement_number = Integer()
27 |     action_date = Date('%m/%d/%Y')
28 |     approval_type = Text()
29 |     notes = Text()
30 |     documents = Json()
31 | 
--------------------------------------------------------------------------------
/migrations/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | import os
8 | import logging
9 | from logging.handlers import SysLogHandler
10 | from dotenv import load_dotenv
11 | load_dotenv('.env')
12 | 
13 | 
14 | # Storage
15 | 
16 | WAREHOUSE_URL = os.environ['WAREHOUSE_URL']
17 | 
18 | # Logging
19 | 
20 | # LOGGING_URL is optional (see .env.example), so don't fail when it's unset
21 | LOGGING_URL = os.environ.get('LOGGING_URL')
22 | logging.basicConfig(level=logging.DEBUG)
23 | if LOGGING_URL:
24 |     root_logger = logging.getLogger()
25 |     host, port = LOGGING_URL.split(':')
26 |     syslog_handler = SysLogHandler(address=(host, int(port)))
27 |     syslog_handler.setLevel(logging.INFO)
28 |     root_logger.addHandler(syslog_handler)
--------------------------------------------------------------------------------
/migrations/versions/20160408164205_create_meta_id_indexes.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from alembic import op
8 | 
9 | 
10 | # revision identifiers, used by Alembic.
11 | revision = '014fd3f703aa'
12 | down_revision = u'00d329f5f40a'
13 | branch_labels = None
14 | depends_on = None
15 | tables = ['actrn', 'euctr', 'gsk', 'ictrp', 'isrctn', 'jprn', 'nct', 'pfizer', 'takeda']
16 | 
17 | 
18 | def upgrade():
19 |     for table in tables:
20 |         op.create_unique_constraint('%s_meta_id_unique' % table, table, ['meta_id'])
21 | 
22 | 
23 | def downgrade():
24 |     for table in tables:
25 |         op.drop_constraint('%s_meta_id_unique' % table, table)
26 | 
--------------------------------------------------------------------------------
/collectors/base/cli.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | import sys
8 | import dataset
9 | import logging
10 | import importlib
11 | from . import config
12 | from . import helpers
13 | logger = logging.getLogger(__name__)
14 | 
15 | 
16 | # Module API
17 | 
18 | def cli(argv):
19 |     # Prepare conf dict
20 |     conf = helpers.get_variables(config, str.isupper)
21 | 
22 |     # Prepare conn dict
23 |     conn = {
24 |         'warehouse': dataset.connect(config.WAREHOUSE_URL),
25 |     }
26 | 
27 |     # Get and call collector
28 |     collect = importlib.import_module('collectors.%s' % argv[1]).collect
29 |     collect(conf, conn, *argv[2:])
30 | 
31 | 
32 | if __name__ == '__main__':
33 |     cli(sys.argv)
34 | 
--------------------------------------------------------------------------------
/collectors/pfizer/record.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from .. import base
8 | from ..base.fields import Text, Date, Boolean
9 | 
10 | 
11 | # Module API
12 | 
13 | class Record(base.Record):
14 | 
15 |     # Config
16 | 
17 |     table = 'pfizer'
18 | 
19 |     # General
20 | 
21 |     nct_id = Text(primary_key=True)
22 |     title = Text()
23 | 
24 |     # Description
25 | 
26 |     study_type = Text()
27 |     organization_id = Text()
28 |     status = Text()
29 |     study_start_date = Date('%B, %Y')
30 |     study_end_date = Date('%B, %Y')
31 | 
32 |     # Eligibility
33 | 
34 |     eligibility_criteria = Text()
35 |     gender = Text()
36 |     age_range = Text()
37 |     healthy_volunteers_allowed = Boolean('Accepts Healthy Volunteers')
38 | 
--------------------------------------------------------------------------------
/collectors/base/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | import dataset
8 | from . import config
9 | from . import helpers
10 | 
11 | 
12 | # Module API
13 | 
14 | class Warehouse(object):
15 | 
16 |     # Public
17 | 
18 |     def open_spider(self, spider):
19 |         if spider.conf and spider.conn:
20 |             self.__conf = spider.conf
21 |             self.__conn = spider.conn
22 |         else:
23 |             # For runs triggered by scrapy CLI utility
24 |             self.__conf = helpers.get_variables(config, str.isupper)
25 |             self.__conn = {'warehouse': dataset.connect(config.WAREHOUSE_URL)}
26 | 
27 |     def process_item(self, record, spider):
28 |         record.write(self.__conf, self.__conn)
29 |         return record
30 | 
--------------------------------------------------------------------------------
/migrations/versions/20160525192212_euctr_fix_column_names.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from alembic import op
8 | 
9 | 
10 | # revision identifiers, used by Alembic.
11 | revision = '6d709931cc58'
12 | down_revision = u'c83c754cc04e'
13 | branch_labels = None
14 | depends_on = None
15 | 
16 | MAPPING = {
17 |     'ethics_committee_opinion_reason_s_for_unfavourable_opinion': 'ethics_committee_opinion_reasons_for_unfavourable_opinion',
18 | }
19 | 
20 | 
21 | def upgrade():
22 |     for key, value in MAPPING.items():
23 |         op.alter_column('euctr', column_name=value, new_column_name=key)
24 | 
25 | 
26 | def downgrade():
27 |     for key, value in MAPPING.items():
28 |         op.alter_column('euctr', column_name=key, new_column_name=value)
29 | 
--------------------------------------------------------------------------------
/migrations/versions/20160525134303_takeda_fix_column_names.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from alembic import op
8 | 
9 | 
10 | # revision identifiers, used by Alembic.
11 | revision = 'c83c754cc04e'
12 | down_revision = u'59e2335b3d41'
13 | branch_labels = None
14 | depends_on = None
15 | 
16 | MAPPING = {
17 |     'enrollment_number_of_participants': 'enrollmentnumber_of_participants',
18 |     'trial_arms_groups_or_cohorts': 'trial_armsgroups_or_cohorts',
19 | }
20 | 
21 | 
22 | def upgrade():
23 |     for key, value in MAPPING.items():
24 |         op.alter_column('takeda', column_name=value, new_column_name=key)
25 | 
26 | 
27 | def downgrade():
28 |     for key, value in MAPPING.items():
29 |         op.alter_column('takeda', column_name=key, new_column_name=value)
30 | 
--------------------------------------------------------------------------------
/migrations/versions/20160525133746_isrctn_fix_column_names.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from alembic import op
8 | 
9 | 
10 | # revision identifiers, used by Alembic.
11 | revision = '59e2335b3d41' 12 | down_revision = u'f736bb9d2499' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | MAPPING = { 17 | 'prospective_retrospective': 'prospectiveretrospective', 18 | 'protocol_serial_number': 'protocolserial_number', 19 | 'clinicaltrials_gov_number': 'clinicaltrialsgov_number', 20 | } 21 | 22 | 23 | def upgrade(): 24 | for key, value in MAPPING.items(): 25 | op.alter_column('isrctn', column_name=value, new_column_name=key) 26 | 27 | 28 | def downgrade(): 29 | for key, value in MAPPING.items(): 30 | op.alter_column('isrctn', column_name=key, new_column_name=value) 31 | -------------------------------------------------------------------------------- /tests/collectors/base/test_fields.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import datetime 9 | import pytest 10 | import collectors.base.fields as fields 11 | 12 | 13 | class TestFields(object): 14 | def test_date_accepts_single_format(self): 15 | date = fields.Date('%Y-%m') 16 | 17 | assert date.parse('2017-01') == datetime.date(2017, 1, 1) 18 | 19 | def test_date_accepts_multiple_formats(self): 20 | date = fields.Date(['%Y-%m', '%Y-%m-%d']) 21 | 22 | assert date.parse('2017-01') == datetime.date(2017, 1, 1) 23 | assert date.parse('2017-01-01') == datetime.date(2017, 1, 1) 24 | 25 | def test_date_raises_if_date_is_in_wrong_format(self): 26 | date = fields.Date('%Y-%m') 27 | with pytest.raises(ValueError): 28 | date.parse('2017-01-01') 29 | -------------------------------------------------------------------------------- /migrations/versions/20160311151047_update_meta_identifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic. 
11 | revision = '46d169ce43d2' 12 | down_revision = u'b0f8a397edad' 13 | branch_labels = None 14 | depends_on = None 15 | tables = ['actrn', 'euctr', 'gsk', 'ictrp', 'isrctn', 'jprn', 'nct', 'pfizer', 'takeda'] 16 | 17 | 18 | def upgrade(): 19 | for table in tables: 20 | op.alter_column(table, 'meta_uuid', new_column_name='meta_id') 21 | op.execute('ALTER TABLE %s ALTER COLUMN meta_id TYPE uuid USING meta_id::uuid' % table) 22 | 23 | 24 | def downgrade(): 25 | for table in tables: 26 | op.execute('ALTER TABLE %s ALTER COLUMN meta_id TYPE text USING meta_id::text' % table) 27 | op.alter_column(table, 'meta_id', new_column_name='meta_uuid') 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # pyenv 60 | .python-version 61 | 62 | # dotenv 63 | .env 64 | -------------------------------------------------------------------------------- /migrations/versions/20160831125422_add_drug_name_active_ingredients_and_company_to_fda_dap.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = '2d52470f8e49' 13 | down_revision = u'bc7470719f51' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | with op.batch_alter_table('fda_dap') as batch_op: 20 | batch_op.add_column(sa.Column('drug_name', sa.Text)) 21 | batch_op.add_column(sa.Column('active_ingredients', sa.Text)) 22 | batch_op.add_column(sa.Column('company', sa.Text)) 23 | 24 | 25 | def downgrade(): 26 | with op.batch_alter_table('fda_dap') as batch_op: 27 | batch_op.drop_column('drug_name') 28 | batch_op.drop_column('active_ingredients') 29 | batch_op.drop_column('company') 30 | -------------------------------------------------------------------------------- /docs/collectors/actrn.md: -------------------------------------------------------------------------------- 1 | # ACTRN 2 | 3 | http://www.anzctr.org.au/ 4 | 5 | The ANZCTR is an online registry of clinical trials being 6 | undertaken in Australia, New Zealand and elsewhere. 7 | 8 | ## Source Data Model 9 | 10 | Data could be accessed thru the web interface. 
11 | Example - https://www.anzctr.org.au/Trial/Registration/TrialReview.aspx?id=369698&isReview=true.
12 | Data is moved to the warehouse as-is, with additional type casting. 13 | See the next section for more details. 14 |
15 | For more information - http://www.anzctr.org.au/docs/ANZCTR%20Data%20field%20explanation.pdf 16 | 17 | ## Warehouse Data Model 18 |
19 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/actrn/record.py) 20 | for the full data model. 21 |
22 | ## Primary Identifiers 23 | 24 | Trial identifier: `trial_id` 25 | 26 | ## Data Update Strategy 27 |
28 | The web interface and source data model don't have anything like an 29 | `updated` field, so a full scan is needed to stay up to date. 30 |
31 | ## License Terms 32 | 33 | http://www.anzctr.org.au/Support/Terms.aspx 34 | --------------------------------------------------------------------------------
/docs/collectors/takeda.md: -------------------------------------------------------------------------------- 1 | # Takeda 2 | 3 | http://www.takedaclinicaltrials.com/ 4 |
5 | This website is designed to advance Takeda's commitment to the health of patients and the science of medicine by providing greater access to information on Takeda's clinical trials while safeguarding patients' confidentiality. 6 | 7 | ## Source Data Model 8 |
9 | Data can be accessed through the web interface. 10 | Example - http://www.takedaclinicaltrials.com/browse/summary/01-00-TL-OPI-501#overview.
11 | Data is moved to the warehouse as-is, with additional type casting. 12 | See the next section for more details. 13 | 14 | ## Warehouse Data Model 15 |
16 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/takeda/record.py) 17 | for the full data model. 18 |
19 | ## Primary Identifiers 20 | 21 | Trial identifier: `takeda_trial_id` 22 | 23 | ## Data Update Strategy 24 |
25 | The web interface and source data model don't have anything like an 26 | `updated` field, so a full scan is needed to stay up to date. 27 |
28 | ## License Terms 29 | 30 | http://www.takedaclinicaltrials.com/legal/terms 31 | --------------------------------------------------------------------------------
/migrations/versions/20160224180815_trials_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import ARRAY, UUID 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic.
13 | revision = 'c2ae4513dd2b' 14 | down_revision = u'9833dacb0b30' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.create_table('trials', 21 | sa.Column('uuid', UUID, primary_key=True), 22 | sa.Column('updated', sa.DateTime(timezone=True), nullable=False), 23 | sa.Column('records', ARRAY(sa.Text), nullable=False, unique=True), 24 | sa.Column('nct_id', sa.Text, unique=True), 25 | sa.Column('euctr_id', sa.Text, unique=True), 26 | sa.Column('isrctn_id', sa.Text, unique=True), 27 | sa.Column('scientific_title', sa.Text, unique=True), 28 | ) 29 | 30 | 31 | def downgrade(): 32 | op.drop_table('trials') 33 | -------------------------------------------------------------------------------- /migrations/versions/20160323145124_trials_remove_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import ARRAY, UUID 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '58d2189bc678' 14 | down_revision = u'89c87deb5a02' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.drop_table('trials') 21 | 22 | 23 | def downgrade(): 24 | op.create_table('trials', 25 | sa.Column('uuid', UUID, primary_key=True), 26 | sa.Column('updated', sa.DateTime(timezone=True), nullable=False), 27 | sa.Column('records', ARRAY(sa.Text), nullable=False, unique=True), 28 | sa.Column('nct_id', sa.Text, unique=True), 29 | sa.Column('euctr_id', sa.Text, unique=True), 30 | sa.Column('isrctn_id', sa.Text, unique=True), 31 | sa.Column('scientific_title', sa.Text, unique=True), 32 | ) 33 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 OKFN 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 
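# For example (illustrative, not from this file): a revision slugged 'create_foo_table'
# generated on 2016-05-10 09:15:10 would be written by the file_template below as
# migrations/versions/20160510091510_create_foo_table.py.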
2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = migrations 6 | 7 | # template used to generate migration files
8 | file_template = %%(year)d%%(month).2d%%(day).2d%%(hour).2d%%(minute).2d%%(second).2d_%%(slug)s 9 | 10 | # max length of characters to apply to the
11 | # "slug" field 12 | #truncate_slug_length = 40 13 | 14 | # set to 'true' to run the environment during
15 | # the 'revision' command, regardless of autogenerate 16 | # revision_environment = false 17 | 18 | # set to 'true' to allow .pyc and .pyo files without
19 | # a source .py file to be detected as revisions in the 20 | # versions/ directory 21 | # sourceless = false 22 |
23 | # version location specification; this defaults 24 | # to migrations/versions. When using multiple version
25 | # directories, initial revisions must be specified with --version-path 26 | # version_locations = %(here)s/bar %(here)s/bat migrations/versions 27 |
28 | # the output encoding used when revision files 29 | # are written from script.py.mako 30 | # output_encoding = utf-8
31 | sqlalchemy.url = driver://user:pass@localhost/dbname 32 | --------------------------------------------------------------------------------
/docs/collectors/pfizer.md: -------------------------------------------------------------------------------- 1 | # Pfizer 2 |
3 | http://www.pfizer.com/research/clinical_trials 4 |
5 | Pfizer works to discover and develop innovative, safe, and effective ways to prevent or treat some of the world’s most challenging diseases. We are committed to the safety of patients who take part in our trials, and uphold the highest ethical standards in all of our research initiatives. 6 | 7 | ## Source Data Model 8 |
9 | Data can be accessed through the web interface. 10 | Example - http://www.pfizer.com/research/clinical_trials/find_a_trial/NCT00795938.
11 | Data is moved to the warehouse as-is, with additional type casting. 12 | See the next section for more details. 13 | 14 | ## Warehouse Data Model 15 |
16 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/pfizer/record.py) 17 | for the full data model. 18 |
19 | ## Primary Identifiers 20 | 21 | Trial identifier: `nct_id` 22 | 23 | ## Data Update Strategy 24 |
25 | The web interface and source data model don't have anything like an 26 | `updated` field, so a full scan is needed to stay up to date. 27 |
28 | ## License Terms 29 | 30 | http://www.pfizer.com/general/terms 31 | --------------------------------------------------------------------------------
/docs/collectors/pubmed.md: -------------------------------------------------------------------------------- 1 | # Pubmed 2 | 3 | http://www.ncbi.nlm.nih.gov/pubmed 4 |
5 | PubMed comprises more than 26 million citations for biomedical literature from MEDLINE, life science journals, and online books. Citations may include links to full-text content from PubMed Central and publisher web sites. 6 | 7 | ## Source Data Model 8 |
9 | Data can be accessed via [E-Utilities](http://www.ncbi.nlm.nih.gov/books/NBK25497/).
10 | Data model of publication - https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html.
11 | Data is moved to the warehouse as-is, with additional type casting. 12 | See the next section for more details. 13 | 14 | ## Warehouse Data Model 15 |
16 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/pubmed/record.py) 17 | for the full data model.
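To make the E-Utilities access above concrete, here is a minimal sketch (not part of the collector; the endpoint and parameters are standard E-Utilities, while the search term is illustrative) of finding recently modified records, matching the update strategy described below:

```python
import requests

# Illustrative sketch: ask E-Utilities for PubMed records modified
# in the last 2 days (see the Data Update Strategy section below).
ESEARCH = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
params = {
    'db': 'pubmed',
    'term': 'clinical trial[pt]',  # made-up example term
    'datetype': 'mdat',            # filter on modification date
    'reldate': 2,                  # last 2 days
    'retmax': 100,
    'retmode': 'json',
}
pmids = requests.get(ESEARCH, params=params).json()['esearchresult']['idlist']
```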
18 | 19 | ## Primary Identifiers 20 | 21 | Trial identifier: `pmid` 22 | 23 | ## Data Update Strategy 24 |
25 | The most recently modified data can be searched. 26 | After the initial scrape, we should search the last 2 days' modifications
27 | to stay up to date. 28 | 29 | ## License Terms 30 | 31 | http://www.ncbi.nlm.nih.gov/home/about/policies.shtml 32 | --------------------------------------------------------------------------------
/migrations/versions/20160220212552_nct_fix_boolean_columns.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = '9833dacb0b30' 12 | down_revision = u'820db6031f39'
13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade():
18 | op.execute('ALTER TABLE nct ALTER COLUMN is_fda_regulated TYPE boolean USING is_fda_regulated::boolean')
19 | op.execute('ALTER TABLE nct ALTER COLUMN is_section_801 TYPE boolean USING is_section_801::boolean')
20 | op.execute('ALTER TABLE nct ALTER COLUMN has_expanded_access TYPE boolean USING has_expanded_access::boolean') 21 | 22 | 23 | def downgrade():
24 | op.execute('ALTER TABLE nct ALTER COLUMN is_fda_regulated TYPE text USING is_fda_regulated::text')
25 | op.execute('ALTER TABLE nct ALTER COLUMN is_section_801 TYPE text USING is_section_801::text')
26 | op.execute('ALTER TABLE nct ALTER COLUMN has_expanded_access TYPE text USING has_expanded_access::text') 27 | --------------------------------------------------------------------------------
/migrations/versions/20160510000353_fda_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | import sqlalchemy as sa 8 | from alembic import op 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = '9f367826f849'
13 | down_revision = u'6a990542e4b4' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.create_table('fda', 20 | 21 | # Meta 22 |
23 | sa.Column('meta_id', sa.Text, unique=True), 24 | sa.Column('meta_source', sa.Text), 25 | sa.Column('meta_created', sa.DateTime(timezone=True)),
26 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 27 | 28 | # General 29 |
30 | sa.Column('product_ndc', sa.Text, primary_key=True), 31 | sa.Column('product_type', sa.Text), 32 | sa.Column('generic_name', sa.Text),
33 | sa.Column('brand_name', sa.Text), 34 | sa.Column('last_updated', sa.Date), 35 | 36 | ) 37 | 38 | 39 | def downgrade(): 40 | op.drop_table('fda') 41 | --------------------------------------------------------------------------------
/migrations/versions/20160525132926_gsk_fix_column_names.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic.
11 | revision = 'f736bb9d2499' 12 | down_revision = u'e77e7eaf0a34' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | MAPPING = {
17 | 'clinicaltrials_gov_identifier': 'clinicaltrialsgov_identifier', 18 | 'ind_ide_protocol': 'indide_protocol', 19 | 'ind_ide_grantor': 'indide_grantor',
20 | 'ind_ide_number': 'indide_number', 21 | 'ind_ide_serial_number': 'indide_serial_number',
22 | 'responsible_party_name_official_title': 'responsible_party_nameofficial_title', 23 | 'trade_name_product_name': 'trade_name__product_name', 24 | } 25 | 26 |
27 | def upgrade(): 28 | for key, value in MAPPING.items(): 29 | op.alter_column('gsk', column_name=value, new_column_name=key) 30 | 31 |
32 | def downgrade(): 33 | for key, value in MAPPING.items(): 34 | op.alter_column('gsk', column_name=key, new_column_name=value) 35 | --------------------------------------------------------------------------------
/docs/collectors/isrctn.md: -------------------------------------------------------------------------------- 1 | # ISRCTN 2 | 3 | http://www.isrctn.com/ 4 |
5 | The ISRCTN registry is a primary clinical trial registry recognised by WHO and ICMJE that accepts all clinical research studies (whether proposed, ongoing or completed), providing content validation and curation and the unique identification number necessary for publication. All study records in the database are freely accessible and searchable. 6 | 7 | ## Source Data Model 8 |
9 | Data can be accessed through the web interface. 10 | Example - http://www.isrctn.com/ISRCTN13619480.
11 | Data is moved to the warehouse as-is, with additional type casting. 12 | See the next section for more details. 13 | 14 | ## Warehouse Data Model 15 |
16 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/isrctn/record.py) 17 | for the full data model. 18 |
19 | ## Primary Identifiers 20 | 21 | Trial identifier: `isrctn_id` 22 | 23 | ## Data Update Strategy 24 |
25 | Trials can be searched with a `last_edited` filter. 26 | After the initial scrape, we should search the last 2 days' updates
27 | to stay up to date (the `recent` stack). 28 | 29 | ## License Terms 30 | 31 | http://www.isrctn.com/page/terms 32 | --------------------------------------------------------------------------------
/migrations/versions/20160509115712_icdcm_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import ARRAY 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic.
13 | revision = 'be9dfe290c44' 14 | down_revision = u'b720671a8c0f' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.create_table('icdcm', 21 | 22 | # Meta 23 | 24 | sa.Column('meta_id', sa.Text, unique=True), 25 | sa.Column('meta_source', sa.Text), 26 | sa.Column('meta_created', sa.DateTime(timezone=True)), 27 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 28 | 29 | # General 30 | 31 | sa.Column('name', sa.Text, primary_key=True), 32 | sa.Column('desc', sa.Text), 33 | sa.Column('terms', ARRAY(sa.Text)), 34 | sa.Column('version', sa.Text), 35 | sa.Column('last_updated', sa.Date), 36 | 37 | ) 38 | 39 | 40 | def downgrade(): 41 | op.drop_table('icdcm') 42 | -------------------------------------------------------------------------------- /migrations/versions/20160509133714_icdpcs_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from alembic import op 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = '6a990542e4b4' 13 | down_revision = u'be9dfe290c44' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.create_table('icdpcs', 20 | 21 | # Meta 22 | 23 | sa.Column('meta_id', sa.Text, unique=True), 24 | sa.Column('meta_source', sa.Text), 25 | sa.Column('meta_created', sa.DateTime(timezone=True)), 26 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 27 | 28 | # General 29 | 30 | sa.Column('code', sa.Text, primary_key=True), 31 | sa.Column('is_header', sa.Boolean), 32 | sa.Column('short_description', sa.Text), 33 | sa.Column('long_description', sa.Text), 34 | sa.Column('version', sa.Text), 35 | sa.Column('last_updated', sa.Date), 36 | 37 | ) 38 | 39 | 40 | def downgrade(): 41 | op.drop_table('icdpcs') 42 | -------------------------------------------------------------------------------- /collectors/pfizer/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.spiders import Rule 8 | from scrapy.spiders import CrawlSpider 9 | from scrapy.linkextractors import LinkExtractor 10 | from .parser import parse_record 11 | 12 | 13 | # Module API 14 | 15 | class Spider(CrawlSpider): 16 | 17 | # Public 18 | 19 | name = 'pfizer' 20 | allowed_domains = ['pfizer.com'] 21 | 22 | def __init__(self, conf=None, conn=None): 23 | 24 | # Save conf/conn 25 | self.conf = conf 26 | self.conn = conn 27 | 28 | # Make urls 29 | self.start_urls = [ 30 | 'http://www.pfizer.com/research/clinical_trials/find_a_trial?recr=0', 31 | ] 32 | 33 | # Make rules 34 | self.rules = [ 35 | Rule(LinkExtractor( 36 | allow=r'find_a_trial/NCT\d+', 37 | ), callback=parse_record), 38 | Rule(LinkExtractor( 39 | allow=r'page=\d+', 40 | )), 41 | ] 42 | 43 | # Inherit parent 44 | super(Spider, self).__init__() 45 | -------------------------------------------------------------------------------- /collectors/takeda/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from 
__future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.spiders import Rule 8 | from scrapy.spiders import CrawlSpider
9 | from scrapy.linkextractors import LinkExtractor 10 | from .parser import parse_record 11 | 12 | 13 | # Module API 14 | 15 | class Spider(CrawlSpider): 16 |
17 | # Public 18 | 19 | name = 'takeda' 20 | allowed_domains = ['takedaclinicaltrials.com'] 21 | 22 | def __init__(self, conf=None, conn=None): 23 |
24 | # Save conf/conn 25 | self.conf = conf 26 | self.conn = conn 27 | 28 | # Make urls 29 | self.start_urls = [
30 | 'http://www.takedaclinicaltrials.com/browse/?protocol_id=', 31 | ] 32 | 33 | # Make rules 34 | self.rules = [ 35 | Rule(LinkExtractor(
36 | allow=r'browse/summary/', 37 | ), callback=parse_record), 38 | Rule(LinkExtractor( 39 | allow=r'browse', 40 | )), 41 | ] 42 |
43 | # Inherit parent 44 | super(Spider, self).__init__() 45 | --------------------------------------------------------------------------------
/docs/collectors/ictrp.md: -------------------------------------------------------------------------------- 1 | # ICTRP 2 | 3 | http://apps.who.int/trialsearch/Default.aspx 4 |
5 | The Clinical Trials Search Portal provides access to a central database containing the trial registration data sets provided by the registries listed on the right. It also provides links to the full original records. 6 | 7 | ## Source Data Model 8 |
9 | Data can be accessed through the web interface (HTTP basic auth is required for crawling).
10 | Example - http://apps.who.int/trialsearch/Trial3.aspx?trialid=NCT00399620.
11 | Data is moved to the warehouse as-is, with additional type casting. 12 | See the next section for more details. 13 | 14 | ## Warehouse Data Model 15 |
16 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/ictrp/record.py) 17 | for the full data model. 18 |
19 | ## Primary Identifiers 20 | 21 | Trial identifier: `main_id` 22 | 23 | ## Data Update Strategy 24 |
25 | The web interface and source data model don't have anything like an 26 | `updated` field, so a full scan is needed to stay up to date. 27 |
28 | Proposed solution: add a crawling algorithm based on the `main_id` intervals shown on 29 | the index page. 30 |
31 | ## License Terms 32 | 33 | http://www.who.int/about/copyright/en/ 34 | --------------------------------------------------------------------------------
/migrations/versions/20170123144318_default_for_meta_created_and_meta_updated.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic.
12 | revision = '542425c4e70b' 13 | down_revision = u'0087dc1eb534' 14 | branch_labels = None 15 | depends_on = None 16 |
17 | updatable_tables = ['actrn', 'cochrane_reviews', 'euctr', 'fda_dap', 'fdadl', 'gsk',
18 | 'hra', 'icdcm', 'icdpcs', 'ictrp', 'isrctn', 'jprn', 'nct', 'pfizer', 'pubmed', 'takeda'] 19 | 20 | 21 | def upgrade():
22 | for table in updatable_tables: 23 | op.alter_column(table, 'meta_created', nullable=False,
24 | server_default=sa.func.current_timestamp()) 25 | op.alter_column(table, 'meta_updated', nullable=False,
26 | server_default=sa.func.current_timestamp()) 27 | 28 | 29 | def downgrade(): 30 | for table in updatable_tables:
31 | op.alter_column(table, 'meta_created', nullable=True, server_default=None)
32 | op.alter_column(table, 'meta_updated', nullable=True, server_default=None) 33 | --------------------------------------------------------------------------------
/migrations/versions/20161007222818_create_cochrane_reviews_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import UUID, JSONB 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic.
13 | revision = '84910d455f31' 14 | down_revision = u'bf807df84277' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade():
20 | op.create_table('cochrane_reviews', 21 | sa.Column('meta_id', sa.Text),
22 | sa.Column('meta_created', sa.DateTime(timezone=True), server_default=sa.text('now()')),
23 | sa.Column('meta_updated', sa.DateTime(timezone=True), server_default=sa.text('now()')), 24 | sa.Column('meta_source', sa.Text), 25 |
26 | sa.Column('id', UUID, primary_key=True), 27 | sa.Column('study_type', sa.Text), 28 | sa.Column('file_name', sa.Text), 29 | sa.Column('robs', JSONB),
30 | sa.Column('study_id', sa.Text), 31 | sa.Column('refs', JSONB), 32 | sa.Column('doi_id', sa.Text), 33 | ) 34 | 35 | 36 | def downgrade():
37 | op.drop_table('cochrane_reviews') 38 | --------------------------------------------------------------------------------
/collectors/pubmed/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | from ..
import base 8 | from ..base.fields import Text, Date, Json, Array 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 |
17 | table = 'pubmed' 18 | 19 | # Medline 20 | 21 | pmid = Text(primary_key=True) 22 | date_created = Date('%Y-%m-%d') 23 | date_completed = Date('%Y-%m-%d')
24 | date_revised = Date('%Y-%m-%d') 25 | country = Text() 26 | medline_ta = Text() 27 | nlm_unique_id = Text() 28 | issn_linking = Text() 29 | mesh_headings = Json() 30 |
31 | # Journal 32 | 33 | journal_issn = Text() 34 | journal_title = Text() 35 | journal_iso = Text() 36 | 37 | # Article 38 | 39 | article_title = Text()
40 | article_abstract = Text() 41 | article_authors = Array() 42 | article_language = Text() 43 | article_publication_type_list = Array()
44 | article_vernacular_title = Text() 45 | article_date = Date('%Y-%m-%d') 46 | 47 | # Pubmed 48 | 49 | publication_status = Text() 50 | article_ids = Json()
51 | registry_ids = Json() 52 | --------------------------------------------------------------------------------
/docs/collectors/nct.md: -------------------------------------------------------------------------------- 1 | # NCT 2 | 3 | https://clinicaltrials.gov/ 4 |
5 | ClinicalTrials.gov is a registry and results database of publicly and privately supported clinical studies of human participants conducted around the world. 6 | 7 | ## Source Data Model 8 |
9 | Analysis of NCT data model: 10 | - copy the text from `https://www.clinicaltrials.gov/ct2/html/images/info/public.xsd`
11 | - paste the text into `http://xmlgrid.net/` and click `Submit` 12 | - now you can explore the whole data model, data types, etc. 13 |
14 | > Only around 10% of studies have a `clinical_results` section - https://www.clinicaltrials.gov/ct2/help/how-find/find-study-results 15 | 16 | --- 17 |
18 | ![](https://cloud.githubusercontent.com/assets/557395/10075868/d77548fe-62e0-11e5-84e0-c81ec6badcfe.png) 19 | 20 | ## Warehouse Data Model 21 |
22 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/nct/record.py) 23 | for the full data model. 24 |
25 | ## Primary Identifiers 26 | 27 | Trial identifier: `nct_id` 28 | 29 | ## Data Update Strategy 30 |
31 | Trials can be searched with a `lastchanges_date` filter. 32 | After the initial scrape, we should search the last 2 days' updates
33 | to stay up to date (the `recent` stack). 34 | 35 | ## License Terms 36 | 37 | https://clinicaltrials.gov/ct2/about-site/terms-conditions 38 | --------------------------------------------------------------------------------
/migrations/versions/20160725130032_fda_dap_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import JSONB 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic.
13 | revision = '23c55ccc0649' 14 | down_revision = u'3a3b663824f1' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade():
20 | op.create_table('fda_dap', 21 | 22 | # Meta 23 | 24 | sa.Column('meta_id', sa.Text, unique=True), 25 | sa.Column('meta_source', sa.Text),
26 | sa.Column('meta_created', sa.DateTime(timezone=True)), 27 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 28 | 29 | # General 30 |
31 | sa.Column('id', sa.Text, unique=True), 32 | sa.Column('documents', JSONB), 33 | sa.Column('approval_type', sa.Text),
34 | sa.Column('supplement_number', sa.Integer), 35 | sa.Column('action_date', sa.Date), 36 | sa.Column('fda_application_num', sa.Text),
37 | sa.Column('notes', sa.Text), 38 | 39 | ) 40 | 41 | 42 | def downgrade(): 43 | op.drop_table('fda_dap') 44 | --------------------------------------------------------------------------------
/docs/collectors/gsk.md: -------------------------------------------------------------------------------- 1 | # GSK 2 | 3 | http://www.gsk-clinicalstudyregister.com/ 4 |
5 | The GlaxoSmithKline (GSK) Clinical Study Register provides an easily accessible repository of data from GSK-Sponsored Clinical Studies, supplementing communication in journals, at scientific meetings, in letters to healthcare professionals, and in approved prescribing information. It is important to emphasise that approved prescribing information must continue to guide appropriate use of GSK medicines. This information may vary from country to country. 6 | 7 | ## Source Data Model 8 |
9 | Data can be accessed through the web interface. 10 | Example - http://www.gsk-clinicalstudyregister.com/study/100901.
11 | Data is moved to the warehouse as-is, with additional type casting. 12 | See the next section for more details. 13 | 14 | ## Warehouse Data Model 15 |
16 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/gsk/record.py) 17 | for the full data model. 18 |
19 | ## Primary Identifiers 20 | 21 | Trial identifier: `study_id` 22 | 23 | ## Data Update Strategy 24 |
25 | Trials can be searched with a `last_updated` filter. 26 | After the initial scrape, we should search the last 2 days' updates
27 | to stay up to date (the `recent` stack).
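As a rough sketch of that `recent` strategy (the `last_updated_from`/`last_updated_to` parameters mirror the ones the collector's spider builds; everything else here is illustrative):

```python
from datetime import date, timedelta
from urllib import urlencode  # Python 2, as in this codebase

# Illustrative: build a search URL covering the last 2 days of updates.
def make_recent_search_url(days=2):
    query = urlencode([
        ('last_updated_from', str(date.today() - timedelta(days=days))),
        ('last_updated_to', str(date.today())),
    ])
    return 'http://www.gsk-clinicalstudyregister.com/search?' + query
```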
28 | 29 | ## License Terms 30 | 31 | http://www.gsk.com/en-gb/terms-of-use/ 32 | --------------------------------------------------------------------------------
/tests/collectors/gsk/test_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | from collectors.gsk.parser import parse_record 8 | 9 | 10 | class TestGskParser(object): 11 | def test_results_url_contains_absolute_url(self, get_url):
12 | url = 'http://www.gsk-clinicalstudyregister.com/study/100006' 13 | response = get_url(url) 14 | 15 | record = parse_record(response) 16 |
17 | assert record['results_url'].startswith('http') 18 | 19 | def test_results_url_is_none_for_trials_without_results(self, get_url):
20 | url = 'http://www.gsk-clinicalstudyregister.com/study/106847' 21 | response = get_url(url) 22 | 23 | record = parse_record(response) 24 |
25 | assert record.get('results_url') is None 26 | 27 | def test_handles_all_date_formats(self, get_url):
28 | url = 'https://www.gsk-clinicalstudyregister.com/study/100006' 29 | 30 | response = get_url(url) 31 | 32 | record = parse_record(response) 33 |
34 | assert record.get('last_updated') is not None 35 | assert record.get('record_verification_date') is not None
36 | assert record.get('study_start_date') is not None 37 | --------------------------------------------------------------------------------
/docs/collectors/jprn.md: -------------------------------------------------------------------------------- 1 | # JPRN 2 | 3 | http://www.umin.ac.jp/ctr/ 4 |
5 | UMIN was established in 1989 as a cooperative organization for national medical schools in Japan, sponsored by the Ministry of Education, Culture, Science, Sports and Technology (MEXT), Japan. Most of its services are now made available to other health care researchers via the Internet. 6 | 7 | ## Source Data Model 8 |
9 | Data can be accessed through the web interface. 10 | Example - https://upload.umin.ac.jp/cgi-open-bin/ctr/ctr.cgi?function=brows&action=brows&type=summary&recptno=R000023978&language=E.
11 | Data is moved to the warehouse as-is, with additional type casting. 12 | See the next section for more details. 13 | 14 | ## Warehouse Data Model 15 |
16 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/jprn/record.py) 17 | for the full data model. 18 |
19 | ## Primary Identifiers 20 | 21 | Trial identifier: `unique_trial_number` 22 | 23 | ## Data Update Strategy 24 |
25 | Trials have a `date_and_time_of_last_update` field. 26 | Newly created and updated trials can be found by searching
27 | with descending last-updated ordering (the default). 28 | After the initial scrape, we should use the last 2 pages of search results
29 | to stay up to date (the `recent` stack). 30 | 31 | ## License Terms 32 | 33 | http://www.umin.ac.jp/ctr/UMIN-CTR_e_FAQ.htm 34 | --------------------------------------------------------------------------------
/docs/collectors/euctr.md: -------------------------------------------------------------------------------- 1 | # Euctr 2 | 3 | https://www.clinicaltrialsregister.eu/ 4 |
5 | The EU Clinical Trials Register contains information on interventional clinical trials on medicines conducted in the European Union (EU), or the European Economic Area (EEA) which started after 1 May 2004.
6 | 7 | ## Source Data Model 8 | 9 | Data can be accessed through the web interface.
10 | Example - https://www.clinicaltrialsregister.eu/ctr-search/trial/2004-000534-36/SK.
11 | Data is moved to the warehouse as-is, with additional type casting. 12 | See the next section for more details. 13 |
14 | Additional information - https://eudract.ema.europa.eu/ 15 | 16 | ## Warehouse Data Model 17 |
18 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/euctr/record.py) 19 | for the full data model. 20 |
21 | ## Primary Identifiers 22 | 23 | Trial identifier: `eudract_number_with_country` 24 | 25 | ## Data Update Strategy 26 |
27 | The web interface and source data model don't have anything like an 28 | `updated` field, so a full scan is needed to stay up to date. 29 |
30 | Proposed solution: use the [feed](https://www.clinicaltrialsregister.eu/ctr-search/rest/feed/bydates?query=&dateFrom=2000-01-01&dateTo=2015-01-02) of items created/updated in the last 7 days that match the filter parameters. 31 |
32 | ## License Terms 33 | 34 | https://www.clinicaltrialsregister.eu/disclaimer.html 35 | --------------------------------------------------------------------------------
/collectors/ictrp/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | from scrapy.spiders import Rule 8 | from scrapy.spiders import CrawlSpider 9 | from scrapy.linkextractors import LinkExtractor 10 | from .parser import parse_record 11 | 12 |
13 | # Module API 14 | 15 | class Spider(CrawlSpider): 16 | 17 | # Public 18 | 19 | name = 'ictrp' 20 | allowed_domains = ['who.int'] 21 |
22 | def __init__(self, conf=None, conn=None, http_user=None, http_pass=None): 23 | 24 | # Save conf/conn 25 | self.conf = conf 26 | self.conn = conn 27 |
28 | # Save credentials 29 | self.http_user = http_user 30 | self.http_pass = http_pass 31 | 32 | # Make urls 33 | self.start_urls = [
34 | 'http://apps.who.int/trialsearch/crawl/crawl0.aspx', 35 | ] 36 | 37 | # Make rules 38 | self.rules = [ 39 | Rule(LinkExtractor(
40 | allow=r'trialsearch/Trial\d+\.aspx\?trialid=.+', 41 | ), callback=parse_record), 42 | Rule(LinkExtractor(
43 | allow=r'trialsearch/crawl/crawl\d+\.aspx', 44 | )), 45 | ] 46 | 47 | # Inherit parent 48 | super(Spider, self).__init__() 49 | --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # collectors 2 |
3 | [![Gitter](https://img.shields.io/gitter/room/opentrials/chat.svg)](https://gitter.im/opentrials/chat)
4 | [![Travis](https://img.shields.io/travis/opentrials/collectors/master.svg)](https://travis-ci.org/opentrials/collectors)
5 | [![Issues](https://img.shields.io/badge/issue-tracker-orange.svg)](https://github.com/opentrials/opentrials/issues)
6 | [![Docs](https://img.shields.io/badge/docs-latest-blue.svg)](http://docs.opentrials.net/en/latest/developers/) 7 |
8 | The OpenTrials data collectors + `warehouse` database schema definition.
9 | 10 | ## Documentation 11 | 12 | - [Overview](docs/overview.md) 13 | - [Warehouse](docs/warehouse.md) 14 | - [Collectors](docs/collectors/) 15 | - [ACTRN](docs/collectors/actrn.md) 16 | - [EUCTR](docs/collectors/euctr.md) 17 | - [GSK](docs/collectors/gsk.md) 18 | - [ICTRP](docs/collectors/ictrp.md) 19 | - [ISRCTN](docs/collectors/isrctn.md) 20 | - [JPRN](docs/collectors/jprn.md) 21 | - [NCT](docs/collectors/nct.md) 22 | - [Pfizer](docs/collectors/pfizer.md) 23 | - [Takeda](docs/collectors/takeda.md) 24 | - [Pubmed](docs/collectors/pubmed.md) 25 | 26 | ## Contributing 27 | 28 | Please read the contribution guideline: 29 | 30 | - [How to Contribute](CONTRIBUTING.md) 31 | - [How to Write a Collector](docs/collector-guide.md) 32 | - [How to Write a Collector (using Scrapy)](docs/collector-scrapy-guide.md) 33 | 34 | Thanks! 35 | -------------------------------------------------------------------------------- /collectors/ictrp/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. import base 8 | from ..base.fields import Text, Date, Integer, Json, Array 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'ictrp' 18 | 19 | # Main 20 | 21 | main_id = Text(primary_key=True) 22 | register = Text() 23 | last_refreshed_on = Date('%d %B %Y') 24 | date_of_registration = Text() # non regular format 25 | primary_sponsor = Text() 26 | public_title = Text() 27 | scientific_title = Text() 28 | date_of_first_enrollment = Text() # non regular format 29 | target_sample_size = Integer() 30 | recruitment_status = Text() 31 | url = Text() 32 | study_type = Text() 33 | study_design = Text() 34 | study_phase = Text() 35 | 36 | # Additional 37 | 38 | countries_of_recruitment = Array() 39 | contacts = Json() 40 | key_inclusion_exclusion_criteria = Text() # not presented on the site 41 | health_conditions_or_problems_studied = Array() 42 | interventions = Array() 43 | primary_outcomes = Array() 44 | secondary_outcomes = Array() 45 | secondary_ids = Array() 46 | sources_of_monetary_support = Array() 47 | secondary_sponsors = Array() 48 | -------------------------------------------------------------------------------- /collectors/takeda/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. 
import base 8 | from ..base.fields import Text, Date, Integer, Array 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'takeda' 18 | 19 | # General 20 | 21 | takeda_trial_id = Text(primary_key=True) 22 | official_title = Text() 23 | trial_phase = Text() 24 | condition = Text() 25 | compound = Array() 26 | recruitment_status = Text() 27 | 28 | # Description 29 | 30 | nct_number = Text() 31 | trial_type = Text() 32 | other_trial_ids = Text() 33 | acronym = Text() 34 | brief_summary = Text() 35 | detailed_description = Text() 36 | trial_design = Text() 37 | primary_outcome_measures = Text() 38 | secondary_outcome_measures = Text() 39 | trial_arms_groups_or_cohorts = Text() 40 | 41 | # Recruitment 42 | 43 | gender = Text() 44 | ages = Text() 45 | enrollment_number_of_participants = Integer() 46 | locations = Array() 47 | responsible_party = Text() 48 | trial_sponsor = Text() 49 | start_date = Date('%B %Y') 50 | completion_date = Date('%B %Y') 51 | eligibility_criteria = Text() 52 | 53 | # Results 54 | 55 | download_the_clinical_trial_summary = Text() 56 | other_available_languages = Text() 57 | -------------------------------------------------------------------------------- /migrations/versions/20160226134759_pfizer_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from alembic import op 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = 'a8d6e250d481' 13 | down_revision = u'c2ae4513dd2b' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.create_table('pfizer', 20 | 21 | # Meta 22 | 23 | sa.Column('meta_uuid', sa.Text), 24 | sa.Column('meta_source', sa.Text), 25 | sa.Column('meta_created', sa.DateTime(timezone=True)), 26 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 27 | 28 | # General 29 | 30 | sa.Column('title', sa.Text), 31 | 32 | # Description 33 | 34 | sa.Column('study_type', sa.Text), 35 | sa.Column('organization_id', sa.Text), 36 | sa.Column('nct_id', sa.Text), 37 | sa.Column('status', sa.Text), 38 | sa.Column('study_start_date', sa.Date), 39 | sa.Column('study_end_date', sa.Date), 40 | 41 | # Eligibility 42 | 43 | sa.Column('eligibility_criteria', sa.Text), 44 | sa.Column('gender', sa.Text), 45 | sa.Column('age_range', sa.Text), 46 | sa.Column('healthy_volunteers_allowed', sa.Boolean), 47 | 48 | ) 49 | 50 | 51 | def downgrade(): 52 | op.drop_table('pfizer') 53 | -------------------------------------------------------------------------------- /collectors/cochrane_reviews/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import logging 8 | import requests 9 | import zipfile 10 | import io 11 | from .. 
import base 12 | from .parser import parse_record 13 | logger = logging.getLogger(__name__) 14 | 15 |
16 | def collect(conf, conn, date_from=None, date_to=None): 17 | file_count = 0 18 | base.helpers.start(conf, 'cochrane', {}) 19 |
20 | content = requests.get(conf['COCHRANE_ARCHIVE_URL']).content 21 | with zipfile.ZipFile(io.BytesIO(content)) as archive:
22 | for filename in archive.namelist(): 23 | base.config.SENTRY.extra_context({ 24 | 'filename': filename, 25 | }) 26 |
27 | with archive.open(filename, 'rU') as review_file: 28 | db_records = parse_record(conf['COCHRANE_ARCHIVE_URL'], review_file)
29 | for rec in db_records: 30 | query = {'file_name': rec['file_name'], 'study_id': rec['study_id']}
31 | if rec.table in conn['warehouse'].tables: 32 | existing = conn['warehouse'][rec.table].find_one(**query) 33 | if existing:
34 | rec['id'] = existing['id'] 35 | rec.write(conf, conn) 36 | file_count += 1 37 |
38 | base.helpers.stop(conf, 'cochrane', {'collected': file_count}) 39 | --------------------------------------------------------------------------------
/tests/collectors/euctr/test_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | from collectors.euctr.parser import parse_record 8 | 9 | 10 | class TestEuctrParser(object):
11 | def test_trial_results_url_returns_absolute_results_url(self, get_url):
12 | url = 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2011-005852-33/3rd' 13 | response = get_url(url) 14 | 15 | record = parse_record(response) 16 |
17 | assert record.get('trial_results_url') == 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2011-005852-33/results'
18 | assert record.get('trial_results') == 'View results' 19 | 20 | def test_trial_results_url_is_none_if_there_are_no_results(self, get_url):
21 | url = 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2009-016529-32/EE' 22 | response = get_url(url) 23 | 24 | record = parse_record(response) 25 |
26 | assert record.get('trial_results_url') is None 27 | assert record.get('trial_results') is None 28 |
29 | def test_trial_results_url_is_none_if_results_not_hyperlink(self, get_url):
30 | url = 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2005-002909-23/PT' 31 | response = get_url(url) 32 | 33 | record = parse_record(response) 34 |
35 | assert record.get('trial_results_url') is None 36 | assert record.get('trial_results') == 'Removed from public view' 37 | --------------------------------------------------------------------------------
/collectors/base/helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | import re 8 | import logging 9 | import datetime 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | # Module API 14 | 15 | def slugify(value):
16 | """Slugify string value. 17 | """ 18 | value = re.sub(r'[\W_]+', '_', value) 19 | value = value.strip('_') 20 | value = value.lower()
21 | value = value[:63] # Postgres limitation is 63 22 | return value 23 | 24 | 25 | def parse_date(value, format): 26 | """Parse a string date.
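For example (illustrative): parse_date('2016-01-31', '%Y-%m-%d') == datetime.date(2016, 1, 31).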
27 | """ 28 | return datetime.datetime.strptime(value, format).date() 29 | 30 | 31 | def parse_datetime(value, format): 32 | """Parse sting datetime. 33 | """ 34 | return datetime.datetime.strptime(value, format) 35 | 36 | 37 | def get_variables(object, filter=None): 38 | """Exract variables from object to dict using name filter. 39 | """ 40 | variables = {} 41 | for name, value in vars(object).items(): 42 | if filter is not None: 43 | if not filter(name): 44 | continue 45 | variables[name] = value 46 | return variables 47 | 48 | 49 | def start(conf, name, message): 50 | """Log collector start. 51 | """ 52 | logger.info('Collector %s has been started(%s)', name, message) 53 | 54 | 55 | def stop(conf, name, message): 56 | """Log collector stop. 57 | """ 58 | logger.info('Collector %s has stopped (%s)', name, message) 59 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | actrn: 2 | extends: 3 | service: actrn 4 | file: docker-cloud.yml 5 | restart: 'no' 6 | 7 | euctr: 8 | extends: 9 | service: euctr 10 | file: docker-cloud.yml 11 | restart: 'no' 12 | 13 | fdadl: 14 | extends: 15 | service: fdadl 16 | file: docker-cloud.yml 17 | restart: 'no' 18 | 19 | fdadap: 20 | extends: 21 | service: fdadap 22 | file: docker-cloud.yml 23 | restart: 'no' 24 | 25 | gsk: 26 | extends: 27 | service: gsk 28 | file: docker-cloud.yml 29 | restart: 'no' 30 | 31 | icdcm: 32 | extends: 33 | service: icdcm 34 | file: docker-cloud.yml 35 | restart: 'no' 36 | 37 | icdpcs: 38 | extends: 39 | service: icdpcs 40 | file: docker-cloud.yml 41 | restart: 'no' 42 | 43 | ictrp: 44 | extends: 45 | service: ictrp 46 | file: docker-cloud.yml 47 | restart: 'no' 48 | 49 | isrctn: 50 | extends: 51 | service: isrctn 52 | file: docker-cloud.yml 53 | restart: 'no' 54 | 55 | jprn: 56 | extends: 57 | service: jprn 58 | file: docker-cloud.yml 59 | restart: 'no' 60 | 61 | nct: 62 | extends: 63 | service: nct 64 | file: docker-cloud.yml 65 | restart: 'no' 66 | 67 | pfizer: 68 | extends: 69 | service: pfizer 70 | file: docker-cloud.yml 71 | restart: 'no' 72 | 73 | pubmed: 74 | extends: 75 | service: pubmed 76 | file: docker-cloud.yml 77 | restart: 'no' 78 | 79 | takeda: 80 | extends: 81 | service: takeda 82 | file: docker-cloud.yml 83 | restart: 'no' 84 | 85 | cochrane-reviews: 86 | extends: 87 | service: cochrane-reviews 88 | file: docker-cloud.yml 89 | restart: 'no' 90 | -------------------------------------------------------------------------------- /migrations/env.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import sys 9 | import sqlalchemy as sa 10 | from alembic import context 11 | 12 | 13 | def run_migrations_offline(): 14 | """Run migrations in 'offline' mode. 15 | 16 | This configures the context with just a URL 17 | and not an Engine, though an Engine is acceptable 18 | here as well. By skipping the Engine creation 19 | we don't even need a DBAPI to be available. 20 | 21 | Calls to context.execute() here emit the given string to the 22 | script output. 
23 | 24 | """ 25 | url = context.config.get_main_option("sqlalchemy.url") 26 | context.configure(url=url, target_metadata=None, literal_binds=True) 27 | with context.begin_transaction(): 28 | context.run_migrations() 29 | 30 | 31 | def run_migrations_online(): 32 | """Run migrations in 'online' mode. 33 | 34 | In this scenario we need to create an Engine 35 | and associate a connection with the context. 36 | 37 | """ 38 | sys.path.append(os.path.dirname(__file__)) 39 | import config 40 | connectable = sa.create_engine(config.WAREHOUSE_URL) 41 | with connectable.connect() as connection: 42 | context.configure(connection=connection, target_metadata=None) 43 | with context.begin_transaction(): 44 | context.run_migrations() 45 | 46 | 47 | # Run migrations 48 | if context.is_offline_mode(): 49 | run_migrations_offline() 50 | else: 51 | run_migrations_online() 52 | -------------------------------------------------------------------------------- /migrations/versions/20160525130300_actrn_fix_column_names.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = 'e77e7eaf0a34' 12 | down_revision = u'11f80cc2fafb' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | MAPPING = { 17 | 'type_of_endpoint_s': 'type_of_endpoints', 18 | 'who_is_are_masked_blinded': 'who_is__are_masked__blinded', 19 | 'masking_blinding': 'masking__blinding', 20 | 'description_of_intervention_s_exposure': 'description_of_interventions__exposure', 21 | 'comparator_control_treatment': 'comparator__control_treatment', 22 | 'recruitment_state_s': 'recruitment_states', 23 | 'procedure_for_enrolling_a_subject_and_allocating_the_treatment_': 'procedure_for_enrolling_a_subject_and_allocating_the', 24 | 'methods_used_to_generate_the_sequence_in_which_subjects_will_be': 'methods_used_to_generate_the_sequence_in_which', 25 | 'statistical_methods_analysis': 'statistical_methods__analysis', 26 | 'trial_related_presentations_publications': 'trial_related_presentations__publications', 27 | 'target_follow_up_duration': 'target_followup_duration', 28 | 'target_follow_up_type': 'target_followup_type', 29 | } 30 | 31 | 32 | def upgrade(): 33 | for key, value in MAPPING.items(): 34 | op.alter_column('actrn', column_name=value, new_column_name=key) 35 | 36 | 37 | def downgrade(): 38 | for key, value in MAPPING.items(): 39 | op.alter_column('actrn', column_name=key, new_column_name=value) 40 | -------------------------------------------------------------------------------- /tests/collectors/nct/test_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | import io 9 | from collectors.nct.parser import parse_record 10 | 11 | 12 | class TestNctParser(object): 13 | @pytest.fixture 14 | def get_record(self, get_url): 15 | def _get_record(nct_id): 16 | url = 'https://clinicaltrials.gov/show/{nct_id}?displayxml=true'.format(nct_id=nct_id) 17 | response = io.BytesIO(get_url(url).body) 18 | return parse_record(response) 19 | 20 | return _get_record 21 | 22 | def 
test_parser_parse_text(self, get_record): 23 | record = get_record('NCT02931214') 24 | assert record['url'] == 'https://clinicaltrials.gov/show/NCT02931214' 25 | 26 | def test_parser_parse_list(self, get_record): 27 | primary_outcomes = [ 28 | { 29 | 'measure': 'Treatment related adverse events', 30 | 'time_frame': '15 days', 31 | 'description': 'Treatment related adverse events as a measure of safety and tolerability of GMI-1359', 32 | 'safety_issue': 'Yes' 33 | } 34 | ] 35 | record = get_record('NCT02931214') 36 | assert record['primary_outcomes'] == primary_outcomes 37 | 38 | def test_parser_parse_dict(self, get_record): 39 | contact = { 40 | 'phone': '402-476-2811', 41 | 'last_name': 'Laura Sterling, MD' 42 | } 43 | record = get_record('NCT02931214') 44 | assert record['overall_contact'] == contact 45 | -------------------------------------------------------------------------------- /collectors/icdpcs/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import logging 9 | import zipfile 10 | import requests 11 | from .record import Record 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | # Module API 16 | 17 | def collect(conf, conn): 18 | """Collect ICD-XX-PCS procedures. 19 | """ 20 | 21 | # For more information see: 22 | # https://www.cms.gov/Medicare/Coding/ICD10/2016-ICD-10-PCS-and-GEMs.html 23 | URL = 'https://www.cms.gov/Medicare/Coding/ICD10/Downloads/2016-PCS-Long-Abbrev-Titles.zip' 24 | FILE = 'icd10pcs_order_2016.txt' 25 | VERSION = 'ICD-10-PCS' 26 | LAST_UPDATED = '2015-10-01' 27 | 28 | # Prepare file 29 | zip = requests.get(URL).content 30 | file = zipfile.ZipFile(io.BytesIO(zip)).open(FILE) 31 | 32 | count = 0 33 | for line in file: 34 | # Prepare data 35 | # Format is described in instruction 36 | # stored in zip archive we download 37 | data = { 38 | 'code': line[6:6+7].strip(), 39 | 'is_header': line[14:14+1].strip(), 40 | 'short_description': line[16:16+60].strip(), 41 | 'long_description': line[77:].strip(), 42 | 'version': VERSION, 43 | 'last_updated': LAST_UPDATED, 44 | } 45 | 46 | # Create record 47 | record = Record.create(URL, data) 48 | 49 | # Write record 50 | record.write(conf, conn) 51 | 52 | # Log info 53 | count += 1 54 | if not count % 100: 55 | logger.info('Collected %s "%s" interventions', count, record.table) 56 | -------------------------------------------------------------------------------- /collectors/gsk/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from urllib import urlencode 8 | from collections import OrderedDict 9 | from datetime import date, timedelta 10 | from scrapy.spiders import Rule 11 | from scrapy.spiders import CrawlSpider 12 | from scrapy.linkextractors import LinkExtractor 13 | from .parser import parse_record 14 | 15 | 16 | # Module API 17 | 18 | class Spider(CrawlSpider): 19 | 20 | # Public 21 | 22 | name = 'gsk' 23 | allowed_domains = ['gsk-clinicalstudyregister.com'] 24 | 25 | def __init__(self, conf=None, conn=None, date_from=None, date_to=None): 26 | 27 | # Save conf/conn 28 | self.conf = conf 29 | self.conn = conn 30 | 31 | 
# Make start urls 32 | self.start_urls = _make_start_urls( 33 | prefix='http://www.gsk-clinicalstudyregister.com/search', 34 | date_from=date_from, date_to=date_to) 35 | 36 | # Make rules 37 | self.rules = [ 38 | Rule(LinkExtractor( 39 | allow=r'study\/\d+' 40 | ), callback=parse_record), 41 | ] 42 | 43 | # Inherit parent 44 | super(Spider, self).__init__() 45 | 46 | 47 | # Internal 48 | 49 | def _make_start_urls(prefix, date_from=None, date_to=None): 50 | """ Return start_urls. 51 | """ 52 | if date_from is None: 53 | date_from = str(date.today() - timedelta(days=1)) 54 | if date_to is None: 55 | date_to = str(date.today()) 56 | query = OrderedDict() 57 | query['last_updated_from'] = date_from 58 | query['last_updated_to'] = date_to 59 | return [prefix + '?' + urlencode(query)] 60 | -------------------------------------------------------------------------------- /migrations/versions/20170123151655_add_trigger_for_meta_updated.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = 'b32475938a2d' 13 | down_revision = u'542425c4e70b' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | updatable_tables = ['actrn', 'cochrane_reviews', 'euctr', 'fda_dap', 'fdadl', 'gsk', 18 | 'hra', 'icdcm', 'icdpcs', 'ictrp', 'isrctn', 'jprn', 'nct', 'pfizer', 'pubmed', 'takeda'] 19 | 20 | 21 | def upgrade(): 22 | conn = op.get_bind() 23 | func = sa.DDL("""CREATE FUNCTION set_meta_updated() 24 | RETURNS TRIGGER 25 | LANGUAGE plpgsql 26 | AS $$ 27 | BEGIN 28 | NEW.meta_updated := now(); 29 | RETURN NEW; 30 | END; 31 | $$;""") 32 | conn.execute(func) 33 | 34 | for table in updatable_tables: 35 | trigger_params = {'trigger': ('%s_set_meta_updated' % table), 'table': table} 36 | trigger = ("""CREATE TRIGGER %(trigger)s 37 | BEFORE UPDATE ON %(table)s 38 | FOR EACH ROW EXECUTE PROCEDURE set_meta_updated();""" % trigger_params) 39 | conn.execute(trigger) 40 | 41 | 42 | def downgrade(): 43 | conn = op.get_bind() 44 | for table in updatable_tables: 45 | trigger_params = {'trigger': ('%s_set_meta_updated' % table), 'table': table} 46 | trigger = ('DROP TRIGGER %(trigger)s ON %(table)s;' % trigger_params) 47 | conn.execute(trigger) 48 | 49 | conn.execute(sa.DDL('DROP FUNCTION set_meta_updated();')) 50 | -------------------------------------------------------------------------------- /tests/collectors/hra/test_collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import mock 8 | import pytest 9 | import datetime 10 | import requests 11 | from collections import defaultdict 12 | from collectors.hra.collector import collect, _make_request_url 13 | 14 | 15 | class TestHRACollector(object): 16 | def test_make_request_url(self): 17 | date_from = datetime.date(2015, 1, 1) 18 | date_to = datetime.date(2015, 12, 31) 19 | actual = _make_request_url('prefix', date_from, date_to) 20 | expect = 'prefix?datePublishedFrom=2015-01-01&datePublishedTo=2015-12-31' 21 | assert actual == expect 22 | 23 | @mock.patch('requests.Session.get') 24 | def 
test_collect_skips_deferred_records(self, session_get_mock, conn, conf, deferred_item_stub): 25 | response_mock = mock.Mock() 26 | response_mock.json.return_value = [ 27 | deferred_item_stub 28 | ] 29 | session_get_mock.return_value = response_mock 30 | collect(conf, conn, '2015-01-01', '2015-01-01') 31 | 32 | hra_id = ('HRA%s' % deferred_item_stub['ApplicationID']) 33 | assert conn['warehouse']['hra'].find_one(hra_id=hra_id) is None 34 | 35 | 36 | @pytest.fixture 37 | def deferred_item_stub(): 38 | deferred_item = defaultdict(lambda: None) 39 | attributes = { 40 | 'ApplicationID': '323854', 41 | 'PublicationDate': 'Publication of this data is currently deferred.', 42 | 'UpdatedDate': '2017-01-05T14:01:03.41', 43 | 'Decision': 'Publication of this data is currently deferred.', 44 | 'DecisionDate': 'Publication of this data is currently deferred.', 45 | } 46 | deferred_item.update(attributes) 47 | return deferred_item 48 | -------------------------------------------------------------------------------- /collectors/icdcm/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import logging 9 | import zipfile 10 | import requests 11 | from scrapy.http import TextResponse 12 | from .record import Record 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | # Module API 17 | 18 | def collect(conf, conn): 19 | """Collect ICD-XX-CM conditions. 20 | """ 21 | 22 | # For more information see: 23 | # https://www.cms.gov/Medicare/Coding/ICD10/2016-ICD-10-CM-and-GEMs.html 24 | URL = 'https://www.cms.gov/Medicare/Coding/ICD10/Downloads/2016-CM-Code-Tables-and-Index.zip' 25 | FILE = 'Tabular.xml' 26 | VERSION = 'ICD-10-CM' 27 | LAST_UPDATED = '2015-10-01' 28 | 29 | # Prepare xml 30 | zip = requests.get(URL).content 31 | xml = zipfile.ZipFile(io.BytesIO(zip)).open(FILE).read() 32 | res = TextResponse(url=URL, body=xml, encoding='utf-8') 33 | 34 | count = 0 35 | for diag in res.xpath('//diag'): 36 | # We only need leaf nodes (entries without child diagnoses) 37 | children = diag.xpath('./diag') 38 | if children: 39 | continue 40 | 41 | # Get data 42 | data = { 43 | 'name': diag.xpath('./name/text()').extract_first(), 44 | 'desc': diag.xpath('./desc/text()').extract_first(), 45 | 'terms': diag.xpath('.//note/text()').extract(), 46 | 'version': VERSION, 47 | 'last_updated': LAST_UPDATED, 48 | } 49 | 50 | # Create record 51 | record = Record.create(URL, data) 52 | 53 | # Write record 54 | record.write(conf, conn) 55 | 56 | # Log info 57 | count += 1 58 | if not count % 100: 59 | logger.info('Collected %s "%s" conditions', count, record.table) 60 | -------------------------------------------------------------------------------- /collectors/hra/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from ..
import base 8 | from ..base.fields import Text, Date, Datetime 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'hra' 18 | 19 | # General 20 | 21 | hra_id = Text(primary_key=True) 22 | publication_date = Date('%Y-%m-%dT%H:%M:%S') 23 | updated_date = Date('%Y-%m-%dT%H:%M:%S.%f') 24 | comittee_name = Text() 25 | comittee_ref_number = Text() 26 | iras_proj_id = Text() 27 | contact_name = Text() 28 | contact_email = Text() 29 | application_title = Text() 30 | study_type_id = Text() 31 | study_type = Text() 32 | sponsor_org = Text() 33 | research_programme = Text() 34 | data_coll_arrangements = Text() 35 | establishment_org = Text() 36 | establishment_org_address_1 = Text() 37 | establishment_org_address_2 = Text() 38 | establishment_org_address_3 = Text() 39 | establishment_org_post_code = Text() 40 | decision = Text() 41 | decision_date = Datetime('%Y-%m-%d %H:%M:%S') 42 | human_tissue_license = Text() 43 | rtb_title = Text() 44 | research_database_title = Text() 45 | application_full_title = Text() 46 | isrctn_id = Text() 47 | nct_id = Text() 48 | additional_ref_numbers = Text() 49 | duration_of_study_in_uk = Text() 50 | research_summary = Text() 51 | euctr_id = Text() 52 | social_value = Text() 53 | recuitment_arrangements = Text() 54 | risk_and_benefit = Text() 55 | participants_protection_and_care = Text() 56 | informed_consent = Text() 57 | applicant_and_staff_suitability = Text() 58 | independent_review = Text() 59 | supporting_info_suitability = Text() 60 | other_comments = Text() 61 | research_summary_suitability = Text() 62 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import pytest 9 | import betamax 10 | import dataset 11 | from scrapy.http import Request, HtmlResponse 12 | from collectors.base import config, helpers 13 | 14 | 15 | with betamax.Betamax.configure() as cfg: 16 | cfg.cassette_library_dir = 'tests/cassettes/' 17 | 18 | record_mode = 'none' if os.environ.get('CI') else 'once' 19 | cfg.default_cassette_options['record_mode'] = record_mode 20 | cfg.default_cassette_options['match_requests_on'] = [ 21 | 'uri', 22 | 'method', 23 | 'headers', 24 | 'body', 25 | ] 26 | 27 | 28 | # Fixtures 29 | 30 | @pytest.fixture 31 | def conf(): 32 | return helpers.get_variables(config, str.isupper) 33 | 34 | 35 | @pytest.fixture 36 | def conn(): 37 | warehouse = dataset.connect(config.WAREHOUSE_URL) 38 | for table in warehouse.tables: 39 | warehouse[table].delete() 40 | return {'warehouse': warehouse} 41 | 42 | 43 | @pytest.fixture 44 | def get_url(betamax_session): 45 | def _get_url(url, request_kwargs={}): 46 | '''Returns a scrapy.http.HtmlResponse with the contents of the received 47 | url. 48 | 49 | Note that the session is kept intact among multiple calls to this 50 | method (i.e. cookies are passed over). 51 | 52 | We also don't verify SSL certificates, because Takeda's certificate is 53 | invalid. If they become valid, we can resume verifying the 54 | certificates.
55 | ''' 56 | response = betamax_session.get(url, verify=False) 57 | scrapy_response = HtmlResponse( 58 | url=str(response.url), 59 | body=response.content, 60 | ) 61 | scrapy_response.request = Request(url, **request_kwargs) 62 | 63 | return scrapy_response 64 | return _get_url 65 | -------------------------------------------------------------------------------- /collectors/isrctn/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from urllib import urlencode 8 | from collections import OrderedDict 9 | from datetime import date, timedelta 10 | from scrapy.spiders import Rule 11 | from scrapy.spiders import CrawlSpider 12 | from scrapy.linkextractors import LinkExtractor 13 | from .parser import parse_record 14 | 15 | 16 | # Module API 17 | 18 | class Spider(CrawlSpider): 19 | 20 | # Public 21 | 22 | name = 'isrctn' 23 | allowed_domains = ['isrctn.com'] 24 | 25 | def __init__(self, conf=None, conn=None, date_from=None, date_to=None): 26 | 27 | # Save conf/conn 28 | self.conf = conf 29 | self.conn = conn 30 | 31 | # Make start urls 32 | self.start_urls = _make_start_urls( 33 | prefix='http://www.isrctn.com/search', 34 | date_from=date_from, date_to=date_to) 35 | 36 | # Make rules 37 | self.rules = [ 38 | Rule(LinkExtractor( 39 | allow=r'ISRCTN\d+', 40 | ), callback=parse_record), 41 | Rule(LinkExtractor( 42 | allow=r'page=\d+', 43 | )), 44 | ] 45 | 46 | # Inherit parent 47 | super(Spider, self).__init__() 48 | 49 | 50 | # Internal 51 | 52 | def _make_start_urls(prefix, date_from=None, date_to=None): 53 | """ Return start_urls. 54 | """ 55 | if date_from is None: 56 | date_from = str(date.today() - timedelta(days=1)) 57 | if date_to is None: 58 | date_to = str(date.today()) 59 | query = OrderedDict() 60 | query['q'] = '' 61 | gtle = 'GT lastEdited:%sT00:00:00.000Z' % date_from 62 | lele = 'LE lastEdited:%sT00:00:00.000Z' % date_to 63 | query['filters'] = ','.join([gtle, lele]) 64 | query['page'] = '1' 65 | query['pageSize'] = '100' 66 | query['searchType'] = 'advanced-search' 67 | return [prefix + '?' + urlencode(query)] 68 | -------------------------------------------------------------------------------- /migrations/versions/20160428204857_pubmed_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import ARRAY, JSONB 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = 'b720671a8c0f' 14 | down_revision = u'014fd3f703aa' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.create_table('pubmed', 21 | 22 | # Meta 23 | 24 | sa.Column('meta_id', sa.Text, unique=True), 25 | sa.Column('meta_source', sa.Text), 26 | sa.Column('meta_created', sa.DateTime(timezone=True)), 27 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 28 | 29 | # Medline 30 | 31 | sa.Column('pmid', sa.Text, primary_key=True), 32 | sa.Column('date_created', sa.Date), 33 | sa.Column('date_completed', sa.Date), 34 | sa.Column('date_revised', sa.Date), 35 | sa.Column('country', sa.Text), 36 | sa.Column('medline_ta', sa.Text), 37 | sa.Column('nlm_unique_id', sa.Text), 38 | sa.Column('issn_linking', sa.Text), 39 | 40 | # Journal 41 | 42 | sa.Column('journal_issn', sa.Text), 43 | sa.Column('journal_title', sa.Text), 44 | sa.Column('journal_iso', sa.Text), 45 | 46 | # Article 47 | 48 | sa.Column('article_title', sa.Text), 49 | sa.Column('article_abstract', sa.Text), 50 | sa.Column('article_authors', ARRAY(sa.Text)), 51 | sa.Column('article_language', sa.Text), 52 | sa.Column('article_publication_type_list', ARRAY(sa.Text)), 53 | sa.Column('article_vernacular_title', sa.Text), 54 | sa.Column('article_date', sa.Date), 55 | 56 | # Pubmed 57 | 58 | sa.Column('publication_status', sa.Text), 59 | sa.Column('identifiers_list', JSONB()), 60 | 61 | ) 62 | 63 | 64 | def downgrade(): 65 | op.drop_table('pubmed') 66 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile --output-file requirements.txt requirements.in 6 | # 7 | alembic==0.8.10 8 | appdirs==1.4.0 # via setuptools 9 | attrs==16.3.0 # via service-identity 10 | cffi==1.9.1 # via cryptography 11 | chardet==2.3.0 # via normality 12 | click==6.7 # via python-dotenv 13 | constantly==15.1.0 # via twisted 14 | contextlib2==0.5.4 # via raven 15 | cryptography==1.7.2 # via pyopenssl 16 | cssselect==1.0.1 # via parsel, scrapy 17 | dataset==0.7.1 18 | enum34==1.1.6 # via cryptography 19 | idna==2.2 # via cryptography 20 | ijson==2.3 21 | incremental==16.10.1 # via twisted 22 | ipaddress==1.0.18 # via cryptography 23 | lxml==3.7.2 # via parsel, scrapy 24 | mako==1.0.6 # via alembic 25 | markupsafe==0.23 # via mako 26 | normality==0.3.9 # via dataset 27 | packaging==16.8 # via setuptools 28 | parsel==1.1.0 # via scrapy 29 | psycopg2==2.6.2 30 | pyasn1-modules==0.0.8 # via service-identity 31 | pyasn1==0.1.9 # via cryptography, pyasn1-modules, service-identity 32 | pycparser==2.17 # via cffi 33 | pydispatcher==2.0.5 # via scrapy 34 | pyopenssl==16.2.0 # via scrapy, service-identity 35 | pyparsing==2.1.10 # via packaging 36 | python-dateutil==2.6.0 37 | python-dotenv==0.6.2 38 | python-editor==1.0.3 # via alembic 39 | pytz==2016.10 40 | pyyaml==3.12 # via dataset 41 | queuelib==1.4.2 # via scrapy 42 | raven==5.32.0 43 | requests==2.12.2 44 | scrapy==1.3.0 45 | service-identity==16.0.0 # via scrapy 46 | six==1.10.0 # via cryptography, dataset, normality, packaging, parsel, pyopenssl, python-dateutil, scrapy, setuptools, w3lib 47 | sqlalchemy==1.1.5 48 | twisted==16.6.0 # via scrapy 49 | w3lib==1.16.0 # via parsel, scrapy 50 | xmltodict==0.10.2 51 | zope.interface==4.3.3 # via twisted 52 | 53 | # The following packages are considered to be unsafe in a requirements file: 54 | # 
setuptools # via cryptography, zope.interface 55 | -------------------------------------------------------------------------------- /collectors/actrn/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from urllib import urlencode 8 | from collections import OrderedDict 9 | from datetime import datetime, date, timedelta 10 | from scrapy.spiders import Rule 11 | from scrapy.spiders import CrawlSpider 12 | from scrapy.linkextractors import LinkExtractor 13 | from .parser import parse_record 14 | 15 | 16 | # Module API 17 | 18 | class Spider(CrawlSpider): 19 | 20 | # Public 21 | 22 | name = 'actrn' 23 | allowed_domains = ['anzctr.org.au'] 24 | 25 | def __init__(self, conf=None, conn=None, date_from=None, date_to=None): 26 | 27 | # Save conf/conn 28 | self.conf = conf 29 | self.conn = conn 30 | 31 | # Make start urls 32 | self.start_urls = _make_start_urls( 33 | prefix='http://www.anzctr.org.au/TrialSearch.aspx', 34 | date_from=date_from, date_to=date_to) 35 | 36 | # Make rules 37 | self.rules = [ 38 | Rule(LinkExtractor( 39 | allow=r'Trial/Registration/TrialReview.aspx', 40 | process_value=lambda value: value.replace('http', 'https', 1), 41 | ), callback=parse_record), 42 | Rule(LinkExtractor( 43 | allow=r'page=\d+', 44 | )), 45 | ] 46 | 47 | # Inherit parent 48 | super(Spider, self).__init__() 49 | 50 | 51 | # Internal 52 | 53 | def _make_start_urls(prefix, date_from=None, date_to=None): 54 | """ Return start_urls. 55 | """ 56 | if date_from is None: 57 | date_from = str(date.today() - timedelta(days=1)) 58 | if date_to is None: 59 | date_to = str(date.today()) 60 | query = OrderedDict() 61 | date_from = datetime.strptime(date_from, '%Y-%m-%d').strftime('%d/%m/%Y') 62 | date_to = datetime.strptime(date_to, '%Y-%m-%d').strftime('%d/%m/%Y') 63 | query['searchTxt'] = '' 64 | query['dateOfRegistrationFrom'] = date_from 65 | query['dateOfRegistrationTo'] = date_to 66 | query['registry'] = 'ANZCTR' 67 | query['isBasic'] = 'False' 68 | return [prefix + '?' 
+ urlencode(query)] 69 | -------------------------------------------------------------------------------- /collectors/pfizer/parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .record import Record 8 | 9 | 10 | # Module API 11 | 12 | def parse_record(res): 13 | 14 | # Init data 15 | data = {} 16 | 17 | # Description 18 | 19 | key = 'study_type' 20 | path = '.field-name-field-study-type .field-item::text' 21 | value = res.css(path).extract_first() 22 | data[key] = value 23 | 24 | key = 'organization_id' 25 | path = '.field-name-field-organization-id .field-item::text' 26 | value = res.css(path).extract_first() 27 | data[key] = value 28 | 29 | key = 'nct_id' 30 | path = '.field-name-field-clinical-trial-id .field-item::text' 31 | value = res.css(path).extract_first() 32 | data[key] = value 33 | 34 | key = 'status' 35 | path = '//label[text() = "Status"]/../text()' 36 | value = ''.join(res.xpath(path).extract()).strip() 37 | data[key] = value 38 | 39 | key = 'study_start_date' 40 | path = '.field-name-field-study-start-date .field-item span::text' 41 | value = res.css(path).extract_first() 42 | data[key] = value 43 | 44 | key = 'study_end_date' 45 | path = '.field-name-field-study-end-date .field-item span::text' 46 | value = res.css(path).extract_first() 47 | data[key] = value 48 | 49 | # Eligibility 50 | 51 | key = 'eligibility_criteria' 52 | path = '.field-name-field-criteria .field-item *::text' 53 | value = ''.join(res.css(path).extract()) 54 | data[key] = value 55 | 56 | key = 'gender' 57 | path = '.field-name-field-gender .field-item::text' 58 | value = res.css(path).extract_first() 59 | data[key] = value 60 | 61 | key = 'age_range' 62 | path = '//label[text() = "Age Range:"]/../text()' 63 | value = ''.join(res.xpath(path).extract()).strip() 64 | data[key] = value 65 | 66 | key = 'healthy_volunteers_allowed' 67 | path = '.field-name-field-healthy-volunteers-allowed .field-item::text' 68 | value = res.css(path).extract_first() 69 | data[key] = value 70 | 71 | # Create record 72 | record = Record.create(res.url, data) 73 | 74 | return record 75 | -------------------------------------------------------------------------------- /collectors/takeda/parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | try: 8 | import urlparse 9 | except ImportError: 10 | import urllib.parse as urlparse 11 | from .. 
import base 12 | from .record import Record 13 | 14 | 15 | # Module API 16 | 17 | def parse_record(res): 18 | 19 | # Init data 20 | data = {} 21 | 22 | # Parse rawdata 23 | gpath = 'h1' 24 | kpath = 'p.eyebrowbold' 25 | vpath = 'p.eyebrowbold+*' 26 | rawdata = _parse_data(res, gpath, kpath, vpath) 27 | for group, key, value in rawdata: 28 | 29 | # General 30 | 31 | if key == 'compound': 32 | value = value.split(',') 33 | 34 | # Recruitment 35 | 36 | if key == 'locations': 37 | value = value.split(',') 38 | 39 | # Collect plain values 40 | data[key] = value 41 | 42 | # Extract results URL 43 | selector = '#results div a::attr(href)' 44 | value = res.css(selector).extract_first() 45 | if value: 46 | url = urlparse.urljoin(res.url, value) 47 | data['download_the_clinical_trial_summary'] = url 48 | else: 49 | try: 50 | del data['download_the_clinical_trial_summary'] 51 | except KeyError: 52 | pass 53 | 54 | # Create record 55 | record = Record.create(res.url, data) 56 | 57 | return record 58 | 59 | 60 | # Internal 61 | 62 | def _parse_data(sel, gpath, kpath, vpath): 63 | data = [] 64 | group = None 65 | name = None 66 | value = None 67 | for sel in sel.css('%s, %s, %s' % (gpath, kpath, vpath)): 68 | text = _parse_text(sel) 69 | if sel.css(gpath): 70 | group = text 71 | elif sel.css(kpath): 72 | name = base.helpers.slugify(text) 73 | else: 74 | value = text 75 | if name and value: 76 | data.append((group, name, value)) 77 | name = None 78 | value = None 79 | return data 80 | 81 | 82 | def _parse_text(sel): 83 | text = '' 84 | texts = sel.xpath('.//text()').extract() 85 | if texts: 86 | text = ' '.join(texts).strip() 87 | return text 88 | -------------------------------------------------------------------------------- /migrations/versions/20160301131954_ictrp_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import ARRAY, JSONB 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = '7518ba857fea' 14 | down_revision = u'393d51424903' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.create_table('ictrp', 21 | 22 | # Meta 23 | 24 | sa.Column('meta_uuid', sa.Text), 25 | sa.Column('meta_source', sa.Text), 26 | sa.Column('meta_created', sa.DateTime(timezone=True)), 27 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 28 | 29 | # Main 30 | 31 | sa.Column('register', sa.Text, primary_key=True), 32 | sa.Column('last_refreshed_on', sa.Date), 33 | sa.Column('main_id', sa.Text, primary_key=True), 34 | sa.Column('date_of_registration', sa.Text), 35 | sa.Column('primary_sponsor', sa.Text), 36 | sa.Column('public_title', sa.Text), 37 | sa.Column('scientific_title', sa.Text), 38 | sa.Column('date_of_first_enrollment', sa.Text), 39 | sa.Column('target_sample_size', sa.Integer), 40 | sa.Column('recruitment_status', sa.Text), 41 | sa.Column('url', sa.Text), 42 | sa.Column('study_type', sa.Text), 43 | sa.Column('study_design', sa.Text), 44 | sa.Column('study_phase', sa.Text), 45 | 46 | # Additional 47 | 48 | sa.Column('countries_of_recruitment', ARRAY(sa.Text)), 49 | sa.Column('contacts', JSONB), 50 | sa.Column('key_inclusion_exclusion_criteria', sa.Text), 51 | sa.Column('health_conditions_or_problems_studied', ARRAY(sa.Text)), 52 | sa.Column('interventions', ARRAY(sa.Text)), 53 | sa.Column('primary_outcomes', ARRAY(sa.Text)), 54 | sa.Column('secondary_outcomes', ARRAY(sa.Text)), 55 | sa.Column('secondary_ids', ARRAY(sa.Text)), 56 | sa.Column('sources_of_monetary_support', ARRAY(sa.Text)), 57 | sa.Column('secondary_sponsors', ARRAY(sa.Text)), 58 | 59 | ) 60 | 61 | 62 | def downgrade(): 63 | op.drop_table('ictrp') 64 | -------------------------------------------------------------------------------- /tests/collectors/takeda/test_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from collectors.takeda.parser import parse_record 8 | 9 | 10 | class TestTakedaParser(object): 11 | def test_download_the_clinical_trial_summary_contains_absolute_url(self, get_url): 12 | url = 'https://www.takedaclinicaltrials.com/browse/summary/TAK-648_101' 13 | response = get_url(url) 14 | 15 | record = parse_record(response) 16 | 17 | assert record['download_the_clinical_trial_summary'] == 'https://www.takedaclinicaltrials.com/files2/TAK-648-101-RDS-2016-02-10.pdf' 18 | 19 | def test_download_the_clinical_trial_summary_prefers_english_pdf_when_available(self, get_url): 20 | url = 'https://www.takedaclinicaltrials.com/browse/summary/073-011' 21 | response = get_url(url) 22 | 23 | record = parse_record(response) 24 | 25 | assert record.get('download_the_clinical_trial_summary') == 'https://www.takedaclinicaltrials.com/files2/073-011-RDS-2015-03-27.pdf' 26 | 27 | def test_download_the_clinical_trial_summary_gets_japanese_pdf_if_no_english_available(self, get_url): 28 | url = 'https://www.takedaclinicaltrials.com/browse/summary/AG-1749/CCT-352' 29 | response = get_url(url) 30 | 31 | record = parse_record(response) 32 | 33 | assert record.get('download_the_clinical_trial_summary') == 'https://www.takedaclinicaltrials.com/files2/AG-1749-CCT-352-RDS-2010-10-17_JP.pdf' 34 | 35 | def test_download_the_clinical_trial_summary_is_none_for_trials_without_results(self, get_url): 36 | url = 
'https://www.takedaclinicaltrials.com/browse/summary/NaltrexBuprop-4004' 37 | response = get_url(url) 38 | 39 | record = parse_record(response) 40 | 41 | assert record.get('download_the_clinical_trial_summary') is None 42 | 43 | def test_download_the_clinical_trial_summary_is_none_for_trials_with_results_unavailable_on_takeda(self, get_url): 44 | url = 'https://www.takedaclinicaltrials.com/browse/summary/ATS%20K023' 45 | response = get_url(url) 46 | 47 | record = parse_record(response) 48 | 49 | assert record.get('download_the_clinical_trial_summary') is None 50 | -------------------------------------------------------------------------------- /migrations/versions/20160229142254_takeda_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import ARRAY 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '393d51424903' 14 | down_revision = u'a8d6e250d481' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.create_table('takeda', 21 | 22 | # Meta 23 | 24 | sa.Column('meta_uuid', sa.Text), 25 | sa.Column('meta_source', sa.Text), 26 | sa.Column('meta_created', sa.DateTime(timezone=True)), 27 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 28 | 29 | # General 30 | 31 | sa.Column('official_title', sa.Text), 32 | sa.Column('takeda_trial_id', sa.Text), 33 | sa.Column('trial_phase', sa.Text), 34 | sa.Column('condition', sa.Text), 35 | sa.Column('compound', ARRAY(sa.Text)), 36 | sa.Column('recruitment_status', sa.Text), 37 | 38 | # Description 39 | 40 | sa.Column('nct_number', sa.Text), 41 | sa.Column('trial_type', sa.Text), 42 | sa.Column('other_trial_ids', sa.Text), 43 | sa.Column('acronym', sa.Text), 44 | sa.Column('brief_summary', sa.Text), 45 | sa.Column('detailed_description', sa.Text), 46 | sa.Column('trial_design', sa.Text), 47 | sa.Column('primary_outcome_measures', sa.Text), 48 | sa.Column('secondary_outcome_measures', sa.Text), 49 | sa.Column('trial_armsgroups_or_cohorts', sa.Text), 50 | 51 | # Recruitment 52 | 53 | sa.Column('gender', sa.Text), 54 | sa.Column('ages', sa.Text), 55 | sa.Column('enrollmentnumber_of_participants', sa.Integer), 56 | sa.Column('locations', ARRAY(sa.Text)), 57 | sa.Column('responsible_party', sa.Text), 58 | sa.Column('trial_sponsor', sa.Text), 59 | sa.Column('start_date', sa.Date), 60 | sa.Column('completion_date', sa.Date), 61 | sa.Column('eligibility_criteria', sa.Text), 62 | 63 | # Results 64 | 65 | sa.Column('download_the_clinical_trial_summary', sa.Text), 66 | sa.Column('other_available_languages', sa.Text), 67 | 68 | ) 69 | 70 | 71 | def downgrade(): 72 | op.drop_table('takeda') 73 | -------------------------------------------------------------------------------- /collectors/euctr/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from urllib import urlencode 8 | from functools import partial 9 | from collections import OrderedDict 10 | from datetime import date, timedelta 11 | from scrapy.spiders import Rule 12 | from 
scrapy.spiders import CrawlSpider 13 | from scrapy.linkextractors import LinkExtractor 14 | from .parser import parse_record 15 | 16 | 17 | # Module API 18 | 19 | class Spider(CrawlSpider): 20 | 21 | # Public 22 | 23 | name = 'euctr' 24 | allowed_domains = ['clinicaltrialsregister.eu'] 25 | 26 | def __init__(self, conf=None, conn=None, date_from=None, date_to=None): 27 | 28 | # Save conf/conn 29 | self.conf = conf 30 | self.conn = conn 31 | 32 | # Make start urls 33 | self.start_urls = _make_start_urls( 34 | prefix='https://www.clinicaltrialsregister.eu/ctr-search/search', 35 | date_from=date_from, date_to=date_to) 36 | 37 | # Make rules 38 | self.rules = [ 39 | Rule( 40 | LinkExtractor( 41 | allow=r'ctr-search/trial/[\d-]+/[\w]+', 42 | deny=r'results$' 43 | ), 44 | callback=parse_record 45 | ), 46 | Rule( 47 | LinkExtractor( 48 | allow=r'page=\d+', 49 | restrict_css='[accesskey=n]' 50 | ), 51 | process_links=partial(_process_links, self.start_urls) 52 | ), 53 | ] 54 | 55 | # Inherit parent 56 | super(Spider, self).__init__() 57 | 58 | 59 | # Internal 60 | 61 | def _make_start_urls(prefix, date_from=None, date_to=None): 62 | """ Return start_urls. 63 | """ 64 | if date_from is None: 65 | date_from = str(date.today() - timedelta(days=1)) 66 | if date_to is None: 67 | date_to = str(date.today()) 68 | query = OrderedDict() 69 | query['query'] = '' 70 | query['dateFrom'] = date_from 71 | query['dateTo'] = date_to 72 | return [prefix + '?' + urlencode(query)] 73 | 74 | 75 | def _process_links(start_urls, links): 76 | result = [] 77 | for link in links: 78 | link.url = '&page='.join([start_urls[0], link.url.split('=')[-1]]) 79 | result.append(link) 80 | return result 81 | -------------------------------------------------------------------------------- /collectors/jprn/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from functools import partial 8 | from urllib import urlencode 9 | from collections import OrderedDict 10 | from scrapy.spiders import Rule 11 | from scrapy.spiders import CrawlSpider 12 | from scrapy.linkextractors import LinkExtractor 13 | from six.moves.urllib.parse import urlparse, parse_qs 14 | from .parser import parse_record 15 | 16 | 17 | # Module API 18 | 19 | class Spider(CrawlSpider): 20 | 21 | # Public 22 | 23 | name = 'jprn' 24 | allowed_domains = ['upload.umin.ac.jp'] 25 | 26 | def __init__(self, conf=None, conn=None, page_from=None, page_to=None): 27 | 28 | # Save conf/conn 29 | self.conf = conf 30 | self.conn = conn 31 | 32 | # Default values 33 | if page_from is None: 34 | page_from = '1' 35 | if page_to is None: 36 | page_to = '1' 37 | 38 | # Make start urls 39 | self.start_urls = _make_start_urls( 40 | prefix='https://upload.umin.ac.jp/cgi-open-bin/ctr_e/index.cgi', 41 | page_from=page_from) 42 | 43 | # Make rules 44 | self.rules = [ 45 | Rule(LinkExtractor( 46 | allow=r'cgi-open-bin/ctr_e/ctr_view.cgi', 47 | ), callback=parse_record), 48 | Rule(LinkExtractor( 49 | allow=r'page=\d+', 50 | process_value=partial(_process_url, page_from, page_to), 51 | )), 52 | ] 53 | 54 | # Inherit parent 55 | super(Spider, self).__init__() 56 | 57 | 58 | # Internal 59 | 60 | def _make_start_urls(prefix, page_from=None): 61 | """ Return start_urls. 
62 | """ 63 | if page_from is None: 64 | page_from = '1' 65 | query = OrderedDict() 66 | query['page'] = page_from 67 | query['sort'] = '05' 68 | return [prefix + '?' + urlencode(query)] 69 | 70 | 71 | def _process_url(page_from, page_to, url): 72 | 73 | # Get url page 74 | query = urlparse(url).query 75 | query = parse_qs(query) 76 | page = query.get('page') 77 | 78 | # Preserve if match 79 | if page: 80 | page_from = int(page_from) 81 | page_to = int(page_to) 82 | page = int(page[0]) 83 | if page >= page_from and page <= page_to: 84 | return url 85 | 86 | return None 87 | -------------------------------------------------------------------------------- /collectors/jprn/parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from collections import OrderedDict 8 | from .. import base 9 | from .record import Record 10 | 11 | 12 | # Module API 13 | 14 | def parse_record(res): 15 | fields_to_remove = [ 16 | 'item', 17 | ] 18 | 19 | # Parse rawdata 20 | data = {} 21 | 22 | # Get meta 23 | subdata = _parse_table(res, key_index=0, value_index=2) 24 | data.update(subdata) 25 | 26 | # Process rawdata 27 | rawdata = _parse_table(res, key_index=0, value_index=1) 28 | prefix = '' 29 | for key, value in rawdata.items(): 30 | 31 | # Interventions 32 | 33 | newkey = 'interventions' 34 | oldkey = 'interventionscontrol' 35 | data.setdefault(newkey, []) 36 | if key.startswith(oldkey): 37 | data[newkey].append(value) 38 | continue 39 | 40 | # Research contact person 41 | 42 | if key == 'name_of_lead_principal_investigator': 43 | prefix = 'research_' 44 | 45 | # Public contact 46 | 47 | if key == 'name_of_contact_person': 48 | prefix = 'public_' 49 | 50 | # Sponsor 51 | 52 | if key == 'name_of_primary_sponsor': 53 | prefix = '' 54 | 55 | # Collect plain values 56 | key = prefix + key 57 | data[key] = value 58 | 59 | # Remove data 60 | for key in fields_to_remove: 61 | if key in data: 62 | del data[key] 63 | 64 | identifier = data.get('unique_id_issued_by_umin') 65 | data['unique_trial_number'] = data.get('unique_trial_number', identifier) 66 | 67 | # Create record 68 | record = Record.create(res.url, data) 69 | 70 | return record 71 | 72 | 73 | # Internal 74 | 75 | def _parse_table(res, key_index, value_index): 76 | """parse data from tabular structure. 77 | """ 78 | data = OrderedDict() 79 | for sel in res.xpath('//tr'): 80 | columns = sel.xpath('td') 81 | if len(columns) == value_index+1: 82 | key = ''.join(columns[key_index].xpath('.//text()').extract()) 83 | key = base.helpers.slugify(key.strip()) 84 | value = ''.join(columns[value_index].xpath('.//text()').extract()) 85 | value = value.strip() 86 | if key and value: 87 | data[key] = value 88 | return data 89 | -------------------------------------------------------------------------------- /collectors/nct/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import zipfile 8 | import logging 9 | import requests 10 | import tempfile 11 | import contextlib 12 | from .parser import parse_record 13 | from .. 
import base 14 | logger = logging.getLogger(__name__) 15 | 16 | # Module API 17 | 18 | 19 | def collect(conf, conn, nct_xml_dump_url): 20 | ''' 21 | Downloads and parses data from NCT's XML dump. For example, if you want the 22 | data from 2017-01-01 until 2017-02-01, the XML dump can be downloaded from: 23 | 24 | https://clinicaltrials.gov/search?resultsxml=True&rcv_s=01/01/2017&rcv_e=01/02/2017 25 | ''' 26 | base.helpers.start(conf, 'nct', {'url': nct_xml_dump_url}) 27 | 28 | with tempfile.TemporaryFile() as fp: 29 | _download_to_file(nct_xml_dump_url, fp) 30 | file_count = 0 31 | for identifier, record_fp in _iter_nct_dump_files(fp): 32 | base.config.SENTRY.extra_context({ 33 | 'url': nct_xml_dump_url, 34 | 'identifier': identifier, 35 | }) 36 | rec = parse_record(record_fp) 37 | query = {'nct_id': rec['nct_id']} 38 | if rec.table in conn['warehouse'].tables: 39 | existing = conn['warehouse'][rec.table].find_one(**query) 40 | if existing: 41 | rec['nct_id'] = existing['nct_id'] 42 | rec.write(conf, conn) 43 | file_count += 1 44 | logger.info('Collected %s NCT records', file_count) 45 | 46 | base.helpers.stop(conf, 'nct', { 47 | 'url': nct_xml_dump_url, 48 | 'collected': file_count, 49 | }) 50 | 51 | 52 | def _download_to_file(url, fp): 53 | CHUNK_SIZE = 1024 * 1024  # 1 MB 54 | bytes_to_mb = lambda value: value / 1048576.0 55 | with contextlib.closing(requests.get(url, stream=True)) as response: 56 | completed_bytes = 0 57 | chunk_count = 0 58 | for block in response.iter_content(CHUNK_SIZE): 59 | fp.write(block) 60 | completed_bytes += len(block) 61 | chunk_count += 1 62 | if chunk_count % 1000 == 0: 63 | logger.debug('Downloaded %.2f MB', bytes_to_mb(completed_bytes)) 64 | fp.seek(0) 65 | 66 | 67 | def _iter_nct_dump_files(fp): 68 | with zipfile.ZipFile(fp) as archive: 69 | for filename in archive.namelist(): 70 | identifier = filename.split('.')[0] 71 | with archive.open(filename, 'rU') as rec_file: 72 | yield identifier, rec_file 73 | -------------------------------------------------------------------------------- /tests/collectors/pubmed/test_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import datetime 8 | import pytest 9 | from collectors.pubmed.parser import parse_record 10 | 11 | 12 | class TestPubmedParser(object): 13 | def test_bug_abstracttext_without_text(self, get_url): 14 | url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi/?db=pubmed&id=22078490&retmode=xml' 15 | response = get_url(url) 16 | 17 | record = parse_record(response) 18 | 19 | assert record['article_abstract'] is not None 20 | 21 | def test_bug_article_with_multiple_languages_pick_first_one(self, get_url): 22 | url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi/?db=pubmed&id=19082263&retmode=xml' 23 | response = get_url(url) 24 | 25 | record = parse_record(response) 26 | 27 | assert record['article_language'].lower() == 'eng' 28 | 29 | @pytest.mark.skip(reason='need to find an article without medline journal country') 30 | def test_bug_article_without_medline_journal_country(self, get_url): 31 | url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi/?db=pubmed&id=10838360&retmode=xml' 32 | response = get_url(url) 33 | 34 | record = parse_record(response) 35 | 36 | assert record.get('country') is None 37 | 38 | def
test_bug_article_without_vernacular_title(self, get_url): 39 | url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi/?db=pubmed&id=27305424&retmode=xml' 40 | response = get_url(url) 41 | 42 | record = parse_record(response) 43 | 44 | assert record.get('article_vernacular_title') is None 45 | 46 | def test_article_date_correctly_parsed(self, get_url): 47 | url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi/?db=pubmed&id=24165173&retmode=xml' 48 | response = get_url(url) 49 | 50 | record = parse_record(response) 51 | 52 | assert record.get('article_date') == datetime.date(2013, 10, 28) 53 | 54 | def test_multiple_ids_same_registry_collected(self, get_url): 55 | url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi/?db=pubmed&id=22327499&retmode=xml' 56 | response = get_url(url) 57 | record = parse_record(response) 58 | 59 | registry_ids = [reg_id for reg_entry in record.get('registry_ids', []) 60 | for reg_id in reg_entry.values()] 61 | 62 | nct_ids = [reg_id for reg_id in registry_ids if 'NCT' in reg_id] 63 | 64 | assert len(nct_ids) == 2 65 | -------------------------------------------------------------------------------- /collectors/nct/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. import base 8 | from ..base.fields import Text, Date, Integer, Json, Array, Boolean 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'nct' 18 | _DATE_FORMATS = [ 19 | '%B %Y', 20 | '%B %d, %Y', 21 | ] 22 | 23 | # General 24 | 25 | nct_id = Text(primary_key=True) 26 | download_date = Text() 27 | link_text = Text() 28 | url = Text() 29 | org_study_id = Text() 30 | secondary_ids = Array() 31 | nct_aliases = Array() 32 | brief_title = Text() 33 | acronym = Text() 34 | official_title = Text() 35 | sponsors = Json() 36 | source = Text() 37 | oversight_info = Json() 38 | brief_summary = Text() 39 | detailed_description = Text() 40 | overall_status = Text() 41 | why_stopped = Text() 42 | start_date = Date(_DATE_FORMATS) 43 | completion_date_actual = Date(_DATE_FORMATS) 44 | completion_date_anticipated = Date(_DATE_FORMATS) 45 | primary_completion_date_actual = Date(_DATE_FORMATS) 46 | primary_completion_date_anticipated = Date(_DATE_FORMATS) 47 | phase = Text() 48 | study_type = Text() 49 | study_design = Text() 50 | target_duration = Text() 51 | primary_outcomes = Json() 52 | secondary_outcomes = Json() 53 | other_outcomes = Json() 54 | number_of_arms = Integer() 55 | number_of_groups = Integer() 56 | enrollment_actual = Integer() 57 | enrollment_anticipated = Integer() 58 | conditions = Array() 59 | arm_groups = Json() 60 | interventions = Json() 61 | biospec_retention = Text() 62 | biospec_desrc = Text() 63 | eligibility = Json() 64 | overall_officials = Json() 65 | overall_contact = Json() 66 | overall_contact_backup = Json() 67 | locations = Json() 68 | location_countries = Array() 69 | removed_countries = Array() 70 | links = Json() 71 | references = Json() 72 | results_references = Json() 73 | verification_date = Date(_DATE_FORMATS) 74 | lastchanged_date = Date(_DATE_FORMATS) 75 | firstreceived_date = Date(_DATE_FORMATS) 76 | firstreceived_results_date = Date(_DATE_FORMATS) 77 | responsible_party = Json() 78 | keywords = Array() 79 | is_fda_regulated = Boolean('Yes') 80 | 
is_section_801 = Boolean('Yes') 81 | has_expanded_access = Boolean('Yes') 82 | condition_browse = Json() 83 | intervention_browse = Json() 84 | clinical_results = Json() 85 | results_exemption_date = Date(_DATE_FORMATS) 86 | -------------------------------------------------------------------------------- /migrations/versions/20160525105409_euctr_fix_column_names.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = '11f80cc2fafb' 12 | down_revision = u'f38e14eac095' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | MAPPING = { 17 | 'date_on_which_this_record_was_first_entered_in_the_eudract_data': 'date_on_which_this_record_was_first_entered', 18 | 'name_or_abbreviated_title_of_the_trial_where_available': 'name_or_abbreviated_title_of_the_trial_where', 19 | 'sponsor_s_protocol_code_number': 'sponsors_protocol_code_number', 20 | 'subject_plans_for_treatment_or_care_after_the_subject_has_ended': 'subject_plans_for_treatment_or_care_after_the_subject', 21 | 'title_of_the_trial_for_lay_people_in_easily_understood_i_e_non_': 'title_of_the_trial_for_lay_people_in', 22 | 'trial_definition_of_the_end_of_the_trial_and_justification_wher': 'trial_definition_of_the_end_of_the_trial_and', 23 | 'trial_full_title_date_and_version_of_each_sub_study_and_their_r': 'trial_full_title_date_and_version_of_each_substudy', 24 | 'trial_if_e_8_6_1_or_e_8_6_2_are_yes_specify_the_regions_in_whic': 'trial_if_e861_or_e862_are_yes_specify_the', 25 | 'trial_medical_condition_s_being_investigated': 'trial_medical_conditions_being_investigated', 26 | 'trial_other_medicinal_product_s': 'trial_other_medicinal_products', 27 | 'trial_primary_end_point_s': 'trial_primary_end_points', 28 | 'trial_secondary_end_point_s': 'trial_secondary_end_points', 29 | 'trial_specify_the_countries_outside_of_the_eea_in_which_trial_s': 'trial_specify_the_countries_outside_of_the_eea_in', 30 | 'trial_the_trial_involves_multiple_sites_in_the_member_state_con': 'trial_the_trial_involves_multiple_sites_in_the_member', 31 | 'trial_the_trial_involves_single_site_in_the_member_state_concer': 'trial_the_trial_involves_single_site_in_the_member', 32 | 'trial_timepoint_s_of_evaluation_of_this_end_point': 'trial_timepoints_of_evaluation_of_this_end_point', 33 | 'trial_trial_being_conducted_both_within_and_outside_the_eea': 'trial_trial_being_conducted_both_within_and_outside_the', 34 | 'trial_trial_contains_a_sub_study': 'trial_trial_contains_a_substudy', 35 | 'us_nct_clinicaltrials_gov_registry_number': 'us_nct_clinicaltrialsgov_registry_number', 36 | } 37 | 38 | 39 | def upgrade(): 40 | for key, value in MAPPING.items(): 41 | op.alter_column('euctr', column_name=value, new_column_name=key) 42 | 43 | 44 | def downgrade(): 45 | for key, value in MAPPING.items(): 46 | op.alter_column('euctr', column_name=key, new_column_name=value) 47 | -------------------------------------------------------------------------------- /collectors/fdadl/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 
import io 8 | import ijson 9 | import shutil 10 | import logging 11 | import zipfile 12 | import tempfile 13 | import requests 14 | from .. import base 15 | from .record import Record 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | # Module API 20 | 21 | def collect(conf, conn): 22 | """Collect FDA Drug Labels. 23 | """ 24 | 25 | # For more information see: 26 | # https://open.fda.gov/api/reference/ 27 | URL = 'http://download.open.fda.gov/drug/label/{file}.zip' 28 | FILES = [ 29 | 'drug-label-0001-of-0005.json', 30 | 'drug-label-0002-of-0005.json', 31 | 'drug-label-0003-of-0005.json', 32 | 'drug-label-0004-of-0005.json', 33 | 'drug-label-0005-of-0005.json', 34 | ] 35 | 36 | # Create temp directory 37 | dirpath = tempfile.mkdtemp() 38 | 39 | success = 0 40 | for file in FILES: 41 | 42 | # Download json 43 | url = URL.format(file=file) 44 | arch = zipfile.ZipFile(io.BytesIO(requests.get(url).content)) 45 | path = arch.extract(file, dirpath) 46 | file = io.open(path, encoding='utf-8') 47 | 48 | # Get last updated 49 | last_updated = list(ijson.items(file, 'meta.last_updated'))[0] 50 | 51 | # Get items iterator 52 | file.seek(0) 53 | items = ijson.items(file, 'results.item') 54 | 55 | for item in items: 56 | meta = item['openfda'] 57 | 58 | base.config.SENTRY.extra_context({ 59 | 'url': url, 60 | 'item': meta, 61 | }) 62 | 63 | # Skip if no NDC code 64 | if 'product_ndc' not in meta: 65 | continue 66 | 67 | # Get data 68 | data = { 69 | 'product_ndc': meta['product_ndc'][0], 70 | 'product_type': meta['product_type'][0], 71 | 'generic_name': meta['generic_name'][0], 72 | 'brand_name': meta['brand_name'][0], 73 | 'last_updated': last_updated, 74 | } 75 | if meta.get('application_number'): 76 | data['fda_application_number'] = meta['application_number'][0] 77 | 78 | # Create record 79 | record = Record.create(url, data) 80 | 81 | # Write record 82 | record.write(conf, conn) 83 | 84 | # Log info 85 | success += 1 86 | if not success % 100: 87 | logger.info('Collected %s "%s" interventions', 88 | success, record.table) 89 | 90 | # Remove temp directory 91 | shutil.rmtree(dirpath) 92 | -------------------------------------------------------------------------------- /collectors/isrctn/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. 
import base 8 | from ..base.fields import Text, Date, Json 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'isrctn' 18 | 19 | # General 20 | 21 | isrctn_id = Text(primary_key=True) 22 | doi_isrctn_id = Text() 23 | title = Text() 24 | condition_category = Text() 25 | date_applied = Date('%d/%m/%Y') 26 | date_assigned = Date('%d/%m/%Y') 27 | last_edited = Date('%d/%m/%Y') 28 | prospective_retrospective = Text() 29 | overall_trial_status = Text() 30 | recruitment_status = Text() 31 | plain_english_summary = Text() 32 | trial_website = Text() 33 | 34 | # Contact information 35 | 36 | contacts = Json() 37 | 38 | # Additional identifiers 39 | 40 | eudract_number = Text() 41 | clinicaltrials_gov_number = Text() 42 | protocol_serial_number = Text() 43 | 44 | # Study information 45 | 46 | scientific_title = Text() 47 | acronym = Text() 48 | study_hypothesis = Text() 49 | ethics_approval = Text() 50 | study_design = Text() 51 | primary_study_design = Text() 52 | secondary_study_design = Text() 53 | trial_setting = Text() 54 | trial_type = Text() 55 | patient_information_sheet = Text() 56 | condition = Text() 57 | intervention = Text() 58 | intervention_type = Text() 59 | phase = Text() 60 | drug_names = Text() 61 | primary_outcome_measures = Text() 62 | secondary_outcome_measures = Text() 63 | overall_trial_start_date = Date('%d/%m/%Y') 64 | overall_trial_end_date = Date('%d/%m/%Y') 65 | reason_abandoned = Text() 66 | 67 | # Eligibility 68 | 69 | participant_inclusion_criteria = Text() 70 | participant_type = Text() 71 | age_group = Text() 72 | gender = Text() 73 | target_number_of_participants = Text() 74 | participant_exclusion_criteria = Text() 75 | recruitment_start_date = Date('%d/%m/%Y') 76 | recruitment_end_date = Date('%d/%m/%Y') 77 | 78 | # Locations 79 | 80 | countries_of_recruitment = Text() 81 | trial_participating_centre = Text() 82 | 83 | # Sponsor information 84 | 85 | sponsors = Json() 86 | 87 | # Funders 88 | 89 | funders = Json() 90 | 91 | # Results and publications 92 | 93 | publication_and_dissemination_plan = Text() 94 | intention_to_publish_date = Date('%d/%m/%Y') 95 | participant_level_data = Text() 96 | results_basic_reporting = Text() 97 | publication_summary = Text() 98 | publication_citations = Text() 99 | 100 | # Additional files 101 | 102 | # ... 103 | 104 | # Editorial notes 105 | 106 | # ... 107 | -------------------------------------------------------------------------------- /collectors/hra/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import time 8 | import logging 9 | import requests 10 | import datetime 11 | from urllib import urlencode 12 | from collections import OrderedDict 13 | from ..
import base 14 | from .parser import parse_record 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | # Module API 19 | 20 | def collect(conf, conn, date_from=None, date_to=None): 21 | 22 | # Start collector 23 | date_from = _get_date_from(conn, date_from) 24 | date_to = _get_date_to(conn, date_to) 25 | base.helpers.start(conf, 'hra', {'date_from': date_from, 'date_to': date_to}) 26 | 27 | # Get parameters 28 | URL = conf['HRA_URL'] 29 | USER = conf['HRA_USER'] 30 | PASS = conf['HRA_PASS'] 31 | 32 | count = 0 33 | chunk_days = 100 34 | session = requests.Session() 35 | loop_date_from = date_from 36 | while True: 37 | if loop_date_from > date_to: 38 | break 39 | loop_date_to = min(loop_date_from + datetime.timedelta(days=chunk_days), date_to) 40 | url = _make_request_url(URL, loop_date_from, loop_date_to) 41 | response = session.get(url, auth=(USER, PASS)) 42 | response.raise_for_status() 43 | base.config.SENTRY.extra_context({ 44 | 'url': response.url, 45 | }) 46 | for item in response.json(): 47 | record = parse_record(response.url, item) 48 | if not record: 49 | continue 50 | record.write(conf, conn) 51 | count += 1 52 | if not count % 100: 53 | logger.info('Collected %s "hra" records', count) 54 | loop_date_from = loop_date_to + datetime.timedelta(days=1) 55 | time.sleep(1) 56 | 57 | # Stop collector 58 | base.helpers.stop(conf, 'hra', {'collected': count}) 59 | 60 | 61 | # Internal 62 | 63 | def _get_date_from(conn, date_from): 64 | if date_from is not None: 65 | return datetime.datetime.strptime(date_from, '%Y-%m-%d').date() 66 | date_from = datetime.date(2008, 1, 1) 67 | if 'hra' in conn['warehouse'].tables: 68 | rows = conn['warehouse'].query(""" 69 | SELECT least(max(publication_date), max(updated_date)) as latest 70 | FROM hra 71 | """) 72 | latest = list(rows)[0]['latest'] 73 | if latest: 74 | date_from = latest 75 | return date_from 76 | 77 | 78 | def _get_date_to(conn, date_to): 79 | if date_to is not None: 80 | return datetime.datetime.strptime(date_to, '%Y-%m-%d').date() 81 | return datetime.date.today() 82 | 83 | 84 | def _make_request_url(prefix, date_from, date_to): 85 | query = OrderedDict() 86 | query['datePublishedFrom'] = date_from.strftime('%Y-%m-%d') 87 | query['datePublishedTo'] = date_to.strftime('%Y-%m-%d') 88 | url = '%s?%s' % (prefix, urlencode(query)) 89 | return url 90 | -------------------------------------------------------------------------------- /migrations/versions/20160603215242_hra_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic.
12 | revision = 'c4c0db99bb1c' 13 | down_revision = u'6d709931cc58' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.create_table('hra', 20 | 21 | # Meta 22 | 23 | sa.Column('meta_id', sa.Text, unique=True), 24 | sa.Column('meta_source', sa.Text), 25 | sa.Column('meta_created', sa.DateTime(timezone=True)), 26 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 27 | 28 | # General 29 | 30 | sa.Column('hra_id', sa.Text), 31 | sa.Column('publication_date', sa.Date), 32 | sa.Column('updated_date', sa.Date), 33 | sa.Column('comittee_name', sa.Text), 34 | sa.Column('comittee_ref_number', sa.Text), 35 | sa.Column('iras_proj_id', sa.Text), 36 | sa.Column('contact_name', sa.Text), 37 | sa.Column('contact_email', sa.Text), 38 | sa.Column('application_title', sa.Text), 39 | sa.Column('study_type_id', sa.Text), 40 | sa.Column('study_type', sa.Text), 41 | sa.Column('sponsor_org', sa.Text), 42 | sa.Column('research_programme', sa.Text), 43 | sa.Column('data_coll_arrangements', sa.Text), 44 | sa.Column('establishment_org', sa.Text), 45 | sa.Column('establishment_org_address_1', sa.Text), 46 | sa.Column('establishment_org_address_2', sa.Text), 47 | sa.Column('establishment_org_address_3', sa.Text), 48 | sa.Column('establishment_org_post_code', sa.Text), 49 | sa.Column('decision', sa.Text), 50 | sa.Column('decision_date', sa.DateTime(timezone=True)), 51 | sa.Column('human_tissue_license', sa.Text), 52 | sa.Column('rtb_title', sa.Text), 53 | sa.Column('research_database_title', sa.Text), 54 | sa.Column('application_full_title', sa.Text), 55 | sa.Column('isrctn_id', sa.Text), 56 | sa.Column('nct_id', sa.Text), 57 | sa.Column('additional_ref_numbers', sa.Text), 58 | sa.Column('duration_of_study_in_uk', sa.Text), 59 | sa.Column('research_summary', sa.Text), 60 | sa.Column('euctr_id', sa.Text), 61 | sa.Column('social_value', sa.Text), 62 | sa.Column('recuitment_arrangements', sa.Text), 63 | sa.Column('risk_and_benefit', sa.Text), 64 | sa.Column('participants_protection_and_care', sa.Text), 65 | sa.Column('informed_consent', sa.Text), 66 | sa.Column('applicant_and_staff_suitability', sa.Text), 67 | sa.Column('independent_review', sa.Text), 68 | sa.Column('supporting_info_suitability', sa.Text), 69 | sa.Column('other_comments', sa.Text), 70 | sa.Column('research_summary_suitability', sa.Text), 71 | 72 | ) 73 | 74 | 75 | def downgrade(): 76 | op.drop_table('hra') 77 | -------------------------------------------------------------------------------- /collectors/base/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import logging 9 | import logging.config 10 | import raven 11 | from dotenv import load_dotenv 12 | load_dotenv('.env') 13 | 14 | 15 | # Environment 16 | 17 | ENV = os.environ.get('PYTHON_ENV', 'development') 18 | if os.environ.get('CI'): 19 | ENV = 'testing' 20 | 21 | if ENV == 'testing': 22 | WAREHOUSE_URL = os.environ['TEST_WAREHOUSE_URL'] 23 | else: 24 | WAREHOUSE_URL = os.environ['WAREHOUSE_URL'] 25 | 26 | # Scrapy 27 | 28 | SCRAPY_SETTINGS = { 29 | 'SPIDER_MODULES': [ 30 | 'collectors.actrn.spider', 31 | 'collectors.euctr.spider', 32 | 'collectors.gsk.spider', 33 | 'collectors.ictrp.spider', 34 | 'collectors.isrctn.spider', 35 | 'collectors.jprn.spider', 36 | 'collectors.pfizer.spider', 37 | 
'collectors.pubmed.spider', 38 | 'collectors.takeda.spider', 39 | ], 40 | 'DOWNLOAD_DELAY': float(os.getenv('DOWNLOAD_DELAY', 1)), 41 | 'AUTOTHROTTLE_ENABLED': True, 42 | 'ITEM_PIPELINES': { 43 | 'collectors.base.pipelines.Warehouse': 100, 44 | }, 45 | } 46 | 47 | 48 | # Logging 49 | 50 | def setup_syslog_handler(): 51 | if os.environ.get('LOGGING_URL', None): 52 | host, port = os.environ['LOGGING_URL'].split(':') 53 | handler = logging.handlers.SysLogHandler(address=(host, int(port))) 54 | else: 55 | handler = logging.handlers.SysLogHandler() 56 | return handler 57 | 58 | 59 | SENTRY_DSN = os.environ.get('SENTRY_DSN') 60 | SENTRY = raven.Client(SENTRY_DSN) 61 | 62 | LOGGING_CONFIG = { 63 | 'version': 1, 64 | 'disable_existing_loggers': False, 65 | 'formatters': { 66 | 'default': { 67 | 'format': '%(levelname)s %(name)s: %(message)s', 68 | }, 69 | }, 70 | 'handlers': { 71 | 'default_handler': { 72 | 'class': 'logging.StreamHandler', 73 | 'stream': 'ext://sys.stdout', 74 | 'level': 'DEBUG', 75 | 'formatter': 'default' 76 | }, 77 | 'syslog_handler': { 78 | '()': setup_syslog_handler, 79 | 'level': 'INFO', 80 | 'formatter': 'default', 81 | }, 82 | 'sentry': { 83 | 'level': 'ERROR', 84 | 'class': 'raven.handlers.logging.SentryHandler', 85 | 'dsn': SENTRY_DSN, 86 | }, 87 | }, 88 | 'root': { 89 | 'handlers': ['default_handler', 'syslog_handler'], 90 | 'level': os.environ.get('LOGGING_LEVEL', 'DEBUG').upper(), 91 | }, 92 | } 93 | 94 | logging.config.dictConfig(LOGGING_CONFIG) 95 | 96 | # ICTRP 97 | 98 | ICTRP_USER = os.environ.get('ICTRP_USER', None) 99 | ICTRP_PASS = os.environ.get('ICTRP_PASS', None) 100 | 101 | # HRA 102 | 103 | HRA_ENV = os.environ.get('HRA_ENV', None) 104 | HRA_URL = os.environ.get('HRA_URL', None) 105 | HRA_USER = os.environ.get('HRA_USER', None) 106 | HRA_PASS = os.environ.get('HRA_PASS', None) 107 | 108 | # Cochrane Reviews 109 | 110 | COCHRANE_ARCHIVE_URL = os.environ.get('COCHRANE_ARCHIVE_URL') 111 | -------------------------------------------------------------------------------- /collectors/hra/parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import logging 8 | from .record import Record 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def parse_record(url, item): 13 | 14 | # Init data 15 | data = {} 16 | 17 | # Map data 18 | data['hra_id'] = 'HRA%s' % item['ApplicationID'] 19 | data['publication_date'] = item['PublicationDate'] 20 | data['updated_date'] = item['UpdatedDate'] 21 | data['comittee_name'] = item['CommitteeName'] 22 | data['comittee_ref_number'] = item['CommitteeReferenceNumber'] 23 | data['iras_proj_id'] = item['IrasProjectID'] 24 | data['contact_name'] = item['ContactName'] 25 | data['contact_email'] = item['ContactEmail'] 26 | data['application_title'] = item['ApplicationTitle'] 27 | data['study_type_id'] = item['StudyTypeID'] 28 | data['study_type'] = item['StudyType'] 29 | data['sponsor_org'] = item['SponsorOrganisation'] 30 | data['research_programme'] = item['ResearchProgramme'] 31 | data['data_coll_arrangements'] = item['DataCollectionArrangements'] 32 | data['establishment_org'] = item['EstablishmentOrganisation'] 33 | data['establishment_org_address_1'] = item['EstablishmentOrganisationAddress1'] 34 | data['establishment_org_address_2'] = item['EstablishmentOrganisationAddress2'] 35 | 
data['establishment_org_address_3'] = item['EstablishmentOrganisationAddress3']
36 | data['establishment_org_post_code'] = item['EstablishmentOrganisationPostcode']
37 | data['decision'] = item['Decision']
38 | data['decision_date'] = item['DecisionDate']
39 | data['human_tissue_license'] = item['HumanTissueAuthorityStorageLicence']
40 | data['rtb_title'] = item['RTBTitle']
41 | data['research_database_title'] = item['ResearchDatabaseTitle']
42 | data['application_full_title'] = item['ApplicationFullTitle']
43 | data['isrctn_id'] = item['ISRCTN']
44 | data['nct_id'] = item['NCT']
45 | data['additional_ref_numbers'] = item['AdditionalReferenceNumbers']
46 | data['duration_of_study_in_uk'] = item['DurationOfStudyInUK']
47 | data['research_summary'] = item['ResearchSummary']
48 | data['euctr_id'] = item['EudraCT']
49 | data['social_value'] = item['SocialValue']
50 | data['recuitment_arrangements'] = item['RecruitmentArrangements']
51 | data['risk_and_benefit'] = item['RiskAndBenefit']
52 | data['participants_protection_and_care'] = item['ParticipantsProtectionAndCare']
53 | data['informed_consent'] = item['InformedConsent']
54 | data['applicant_and_staff_suitability'] = item['ApplicantAndStaffSuitability']
55 | data['independent_review'] = item['IndependentReview']
56 | data['supporting_info_suitability'] = item['SupportingInfoSuitability']
57 | data['other_comments'] = item['OtherComments']
58 | data['research_summary_suitability'] = item['ResearchSummarySuitability']
59 |
60 | # Ignore deferred records
61 | date_fields = ['publication_date', 'decision_date', 'updated_date']
62 | if any('deferred' in data[date_field] for date_field in date_fields):
63 | return None
64 |
65 | # Create record
66 | record = Record.create(url, data)
67 |
68 | return record
69 |
--------------------------------------------------------------------------------
/docs/overview.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | This system is responsible for managing the schema of the OpenTrials `warehouse` database and for
4 | collecting the data that populates it.
5 |
6 | ## Stack
7 |
8 | Collectors are fully compatible with Python 2.7.
9 |
10 | We use PostgreSQL for our database and [Alembic](http://alembic.zzzcomputing.com/en/latest/) for migrations.
11 |
12 | Collectors are deployed and run in production with [DockerCloud](https://github.com/respect31/docker-cloud-example).
13 |
14 | ## Collectors
15 |
16 | The system's collectors are independent Python modules that share the following signature:
17 |
18 | ```python
19 | def collect(conf, conn, *args):
20 | pass
21 | ```
22 |
23 | The arguments are:
24 | - `conf` - config dict
25 | - `conn` - connections dict
26 | - `args` - collector arguments
27 |
28 | To run a collector from the command line:
29 | ```
30 | $ make start <collector> [<arguments>]
31 | ```
32 |
33 | This command triggers a `collectors.<collector>.collect(conf, conn, *args)` call.
34 |
35 | *NOTE*: Most collectors need `date_from` and `date_to` arguments that define the
36 | time range from which we want to extract resources. For example:
37 |
38 | ```
39 | $ make start nct 2013-11-30 2013-12-01
40 | ```
41 |
42 | To check whether that is the case, see the `collect` function of the collector you are interested in.
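A minimal collector is therefore just a module exposing this entry point. The following sketch is purely illustrative; the `example` collector and its logging are placeholders, not a module from this repository:

```python
# collectors/example/__init__.py (hypothetical sketch of the collector contract)
import logging

logger = logging.getLogger(__name__)


def collect(conf, conn, date_from=None, date_to=None):
    # conf is the config dict and conn the connections dict,
    # both supplied by the `make start` entry point
    logger.info('Collecting example data: %s - %s', date_from, date_to)
    # ...fetch source data for the date range and write records here...
```

43 |
44 | ### Scraping Collectors
45 |
46 | Many collectors are scrapers. Scraping is based on the
47 | [Scrapy](https://scrapy.readthedocs.io/en/latest/intro/overview.html) framework. Here is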
Here is 48 | an example of how to use Scrapy in the `collect` function: 49 | 50 | ```python 51 | from scrapy.crawler import CrawlerProcess 52 | from .spider import Spider 53 | 54 | def collect(conf, conn, ): 55 | process = CrawlerProcess(conf) 56 | process.crawl(Spider, conn=conn, ) 57 | process.start() 58 | ``` 59 | 60 | For more details check the tutorial [How to Write a Collector using Scrapy](https://github.com/opentrials/collectors/blob/master/docs/collector-scrapy-guide.md) 61 | 62 | ### Working with the database 63 | 64 | The folder `collectors/base` contains multiple reusable components and 65 | helpers including the [base class for a database record](https://github.com/opentrials/collectors/blob/master/collectors/base/record.py) 66 | and the [base class for a record's field](https://github.com/opentrials/collectors/blob/master/collectors/base/fields.py). 67 | Each collector that has a corresponding table in the `warehouse` database has to 68 | define the schema for that table in a class that inherits from the base class for record. 69 | 70 | For example the following class defines the schema for table `colors`. This table has 71 | 2 fields of type `Text`, one of which is a primary key: 72 | 73 | ```python 74 | class ColorRecord(base.Record): 75 | table = 'colors' 76 | 77 | # Fields 78 | 79 | id = Text(primary_key=True) 80 | color = Text() 81 | ``` 82 | 83 | To see how this connects to the other parts of the collector check the [How to Write a Collector](https://github.com/opentrials/collectors/blob/master/docs/collector-guide.md) tutorial. 84 | #### Altering the database schema 85 | 86 | 1. Define the table/field in the collector's record class as explained above. 87 | 2. Create a migration for it (more details in [Alembic docs](http://alembic.zzzcomputing.com/en/latest/tutorial.html#create-a-migration-script)). 88 | -------------------------------------------------------------------------------- /collectors/actrn/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. 
import base 8 | from ..base.fields import Text, Date, Boolean, Integer, Json, Array 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'actrn' 18 | 19 | # General 20 | 21 | trial_id = Text(primary_key=True) 22 | ethics_application_status = Text() 23 | date_submitted = Date('%d/%m/%Y') 24 | date_registered = Date('%d/%m/%Y') 25 | type_of_registration = Text() 26 | 27 | # Titles & IDs 28 | 29 | public_title = Text() 30 | scientific_title = Text() 31 | secondary_ids = Array() 32 | universal_trial_number_utn = Text() 33 | trial_acronym = Text() 34 | 35 | # Health condition 36 | 37 | health_conditions_or_problems_studied = Text() 38 | condition_category = Text() 39 | condition_code = Text() 40 | 41 | # Intervention/exposure 42 | 43 | study_type = Text() 44 | patient_registry = Boolean('Yes') 45 | target_follow_up_duration = Integer() 46 | target_follow_up_type = Text() 47 | description_of_intervention_s_exposure = Text() 48 | intervention_codes = Array() 49 | comparator_control_treatment = Text() 50 | control_group = Text() 51 | 52 | # Outcomes 53 | 54 | primary_outcomes = Json() 55 | secondary_outcomes = Json() 56 | 57 | # Eligibility 58 | 59 | key_inclusion_criteria = Text() 60 | minimum_age = Text() 61 | maximum_age = Text() 62 | gender = Text() 63 | can_healthy_volunteers_participate = Boolean('Yes') 64 | key_exclusion_criteria = Text() 65 | 66 | # Study design 67 | 68 | purpose_of_the_study = Text() 69 | allocation_to_intervention = Text() 70 | procedure_for_enrolling_a_subject_and_allocating_the_treatment_ = Text() 71 | methods_used_to_generate_the_sequence_in_which_subjects_will_be = Text() 72 | masking_blinding = Text() 73 | who_is_are_masked_blinded = Text() 74 | intervention_assignment = Text() 75 | other_design_features = Text() 76 | phase = Text() 77 | type_of_endpoint_s = Text() 78 | purpose = Text() 79 | duration = Text() 80 | selection = Text() 81 | timing = Text() 82 | statistical_methods_analysis = Text() 83 | 84 | # Recruitment 85 | 86 | anticipated_date_of_first_participant_enrolment = Date('%d/%m/%Y') 87 | actual_date_of_first_participant_enrolment = Date('%d/%m/%Y') 88 | anticipated_date_last_participant_enrolled = Date('%d/%m/%Y') 89 | actual_date_last_participant_enrolled = Date('%d/%m/%Y') 90 | target_sample_size = Integer() 91 | actual_sample_size = Integer() 92 | recruitment_status = Text() 93 | recruitment_state_s = Text() 94 | 95 | # Funding & Sponsors 96 | 97 | primary_sponsor = Json() 98 | sponsors = Json() 99 | 100 | # Ethics approval 101 | 102 | ethics_application_status = Text() 103 | ethics_applications = Json() 104 | 105 | # Summary 106 | 107 | brief_summary = Text() 108 | trial_website = Text() 109 | trial_related_presentations_publications = Text() 110 | public_notes = Text() 111 | attachments = Array() 112 | 113 | # Contacts 114 | 115 | principal_investigator = Json() 116 | public_queries = Json() 117 | scientific_queries = Json() 118 | -------------------------------------------------------------------------------- /collectors/cochrane_reviews/parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import uuid 8 | try: 9 | from lxml import etree 10 | except ImportError: 11 | import xml.etree.ElementTree as etree 12 | from .record import Record 13 | 14 | 15 | def 
parse_record(url, review_file): 16 | tree = etree.parse(review_file) 17 | study_robs = [] 18 | studies = [] 19 | 20 | # Get risk of bias 21 | 22 | root = tree.getroot() 23 | doi_id = root.attrib.get('DOI', '') 24 | quality_item_data_entries = tree.findall('//QUALITY_ITEM_DATA_ENTRY') 25 | for quality_item_data_entry in quality_item_data_entries: 26 | study_rob = { 27 | 'study_id': quality_item_data_entry.attrib['STUDY_ID'], 28 | 'modified': quality_item_data_entry.attrib.get('MODIFIED', ''), 29 | 'result': quality_item_data_entry.attrib['RESULT'], 30 | 'group_id': quality_item_data_entry.attrib.get('GROUP_ID', ''), 31 | 'group_name': '', 32 | 'result_description': quality_item_data_entry.findtext('DESCRIPTION/P', ''), 33 | } 34 | quality_item = quality_item_data_entry.getparent().getparent() 35 | study_rob['rob_id'] = quality_item.attrib['ID'] 36 | study_rob['rob_name'] = quality_item.findtext('NAME') 37 | study_rob['rob_description'] = quality_item.findtext('DESCRIPTION/P', '') 38 | for group in quality_item.iter('QUALITY_ITEM_DATA_ENTRY_GROUP'): 39 | group_id = group.attrib.get('ID') 40 | if group_id == study_rob['group_id']: 41 | study_rob['group_name'] = group.findtext('NAME') 42 | study_robs.append(study_rob) 43 | 44 | # Get references 45 | 46 | included_studies = tree.find('//INCLUDED_STUDIES') 47 | for study in included_studies.iter('STUDY'): 48 | study_info = { 49 | 'id': uuid.uuid1().hex, 50 | 'doi_id': doi_id, 51 | 'file_name': review_file.name, 52 | 'study_id': study.attrib['ID'], 53 | 'study_type': study.attrib['DATA_SOURCE'], 54 | 'refs': [], 55 | } 56 | corresponding_robs = [rob for rob in study_robs 57 | if rob['study_id'] == study_info['study_id']] 58 | study_info['robs'] = corresponding_robs 59 | for reference in study.iter('REFERENCE'): 60 | ref = { 61 | 'type': reference.attrib['TYPE'], 62 | 'authors': reference.findtext('AU', ''), 63 | 'title': reference.findtext('TI', ''), 64 | 'source': reference.findtext('SO', ''), 65 | 'year': reference.findtext('YR', ''), 66 | 'vl': reference.findtext('VL', ''), 67 | 'no': reference.findtext('NO', ''), 68 | 'pg': reference.findtext('PG', ''), 69 | 'country': reference.findtext('CY', ''), 70 | 'identifiers': [], 71 | } 72 | for identifier in reference.iter('IDENTIFIER'): 73 | ident = {key.lower(): value for key, value in identifier.items() 74 | if key not in ['MODIFIED', 'MODIFIED_BY']} 75 | ref['identifiers'].append(ident) 76 | study_info['refs'].append(ref) 77 | 78 | # Create record 79 | 80 | record = Record.create(url, study_info) 81 | studies.append(record) 82 | 83 | return studies 84 | -------------------------------------------------------------------------------- /collectors/pubmed/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import logging 8 | import requests 9 | from urllib import urlencode 10 | from datetime import datetime, date, timedelta 11 | from collections import OrderedDict 12 | from scrapy.spiders import CrawlSpider 13 | from .parser import parse_record 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | # Module API 18 | 19 | class Spider(CrawlSpider): 20 | 21 | # Public 22 | 23 | name = 'pubmed' 24 | allowed_domains = ['eutils.ncbi.nlm.nih.gov'] 25 | 26 | def __init__(self, conf=None, conn=None, date_from=None, date_to=None): 27 | 28 | # Save conf/conn 29 | 
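# (conf/conn are supplied by the collector's collect() entry point; see docs/overview.md)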
self.conf = conf 30 | self.conn = conn 31 | 32 | # Make start urls 33 | self.start_urls = _make_start_urls( 34 | prefix='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi/', 35 | template='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi/?db=pubmed&id={pmid}&retmode=xml', 36 | date_from=date_from, date_to=date_to) 37 | 38 | # Set parser 39 | self.parse = parse_record 40 | 41 | # Inherit parent 42 | super(Spider, self).__init__() 43 | 44 | 45 | # Internal 46 | 47 | def _make_start_urls(prefix, template, date_from=None, date_to=None, session=None): 48 | """ Return start_urls. 49 | """ 50 | 51 | # Init urls and session 52 | urls = set() 53 | if not session: 54 | session = requests.Session() 55 | adapter_opts = {'max_retries': requests.packages.urllib3.util.Retry(total=5, status_forcelist=[503])} 56 | session.mount('https://', requests.adapters.HTTPAdapter(**adapter_opts)) 57 | 58 | # Prepare dates 59 | if date_from is None: 60 | date_from = str(date.today() - timedelta(days=1)) 61 | if date_to is None: 62 | date_to = str(date.today()) 63 | date_from = datetime.strptime(date_from, '%Y-%m-%d').strftime('%Y/%m/%d') 64 | date_to = datetime.strptime(date_to, '%Y-%m-%d').strftime('%Y/%m/%d') 65 | 66 | # Prepare query 67 | query = OrderedDict() 68 | query['db'] = 'pubmed' 69 | query['retmode'] = 'json' 70 | query['mindate'] = date_from 71 | query['maxdate'] = date_to 72 | 73 | # Terms to search 74 | query['term'] = """(randomized controlled trial[Publication Type] OR 75 | (randomized[Title/Abstract] 76 | AND controlled[Title/Abstract] 77 | AND trial[Title/Abstract] 78 | )) 79 | """ 80 | 81 | # For both publication/modifiction 82 | for date_type in ['pdat', 'mdat']: 83 | retstart = 0 84 | retmax = 50000 85 | while True: 86 | query['datetype'] = date_type 87 | query['retstart'] = retstart 88 | query['retmax'] = retmax 89 | url = '%s?%s' % (prefix, urlencode(query)) 90 | response = session.get(url) 91 | pmids = response.json()['esearchresult']['idlist'] 92 | if not pmids: 93 | break 94 | for pmid in pmids: 95 | urls.add(template.format(pmid=pmid)) 96 | retstart += retmax 97 | 98 | # Log urls count 99 | logger.info('Populated Pubmed start urls: %s', len(urls)) 100 | 101 | return list(urls) 102 | -------------------------------------------------------------------------------- /collectors/base/fields.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from scrapy import Field 9 | from six import add_metaclass 10 | from abc import ABCMeta, abstractmethod 11 | from sqlalchemy.dialects.postgresql import ARRAY, JSONB 12 | from . 
import helpers 13 | 14 | 15 | @add_metaclass(ABCMeta) 16 | class Base(Field): 17 | 18 | # Public 19 | 20 | def __init__(self, primary_key=False): 21 | self.__primary_key = primary_key 22 | 23 | def __repr__(self): 24 | return type(self).__name__ 25 | 26 | @property 27 | @abstractmethod 28 | def column_type(self): 29 | pass # pragma: no cover 30 | 31 | @property 32 | def primary_key(self): 33 | return self.__primary_key 34 | 35 | def parse(self, value): 36 | return value 37 | 38 | 39 | class Text(Base): 40 | 41 | # Public 42 | 43 | column_type = sa.Text 44 | 45 | 46 | class Integer(Base): 47 | 48 | # Public 49 | 50 | column_type = sa.Integer 51 | 52 | def parse(self, value): 53 | return int(value) 54 | 55 | 56 | class Boolean(Base): 57 | 58 | # Public 59 | 60 | column_type = sa.Boolean 61 | 62 | def __init__(self, true_value=None, **params): 63 | super(Boolean, self).__init__(**params) 64 | self.__true_value = true_value 65 | 66 | def parse(self, value): 67 | if self.__true_value is not None: 68 | value = (value.lower() == self.__true_value.lower()) 69 | return value 70 | 71 | 72 | class Date(Base): 73 | 74 | # Public 75 | 76 | column_type = sa.Date 77 | 78 | def __init__(self, formats, **params): 79 | super(Date, self).__init__(**params) 80 | if not isinstance(formats, (list, tuple)): 81 | formats = [formats] 82 | self.__formats = formats 83 | 84 | def parse(self, value): 85 | for i, fmt in enumerate(self.__formats): 86 | try: 87 | return helpers.parse_date(value, format=fmt) 88 | except ValueError: 89 | pass 90 | msg = "time data '{value}' doesn't match any of the formats: {formats}" 91 | raise ValueError(msg.format(value=value, formats=self.__formats)) 92 | 93 | 94 | class Datetime(Base): 95 | 96 | # Public 97 | 98 | column_type = sa.DateTime(timezone=True) 99 | 100 | def __init__(self, format=None, **params): 101 | super(Datetime, self).__init__(**params) 102 | self.__format = format 103 | 104 | def parse(self, value): 105 | if self.__format is not None: 106 | value = helpers.parse_datetime(value, format=self.__format) 107 | return value 108 | 109 | 110 | class Json(Base): 111 | 112 | # Public 113 | 114 | column_type = JSONB 115 | 116 | 117 | class Array(Base): 118 | 119 | # Public 120 | 121 | def __init__(self, field=None, **params): 122 | super(Array, self).__init__(**params) 123 | if field is None: 124 | field = Text() 125 | self.__field = field 126 | self.__column_type = ARRAY(field.column_type) 127 | 128 | @property 129 | def column_type(self): 130 | return self.__column_type 131 | 132 | def parse(self, value): 133 | result = [] 134 | for item in value: 135 | result.append(self.__field.parse(item)) 136 | return result 137 | -------------------------------------------------------------------------------- /migrations/versions/20160220164104_nct_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import ARRAY, JSONB 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic. 
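# (down_revision is None below: this was the first migration in the chain)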
13 | revision = '999c8f33bc04' 14 | down_revision = None 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.create_table('nct', 21 | 22 | # Meta 23 | 24 | sa.Column('meta_uuid', sa.Text), 25 | sa.Column('meta_source', sa.Text), 26 | sa.Column('meta_created', sa.DateTime(timezone=True)), 27 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 28 | 29 | # General 30 | 31 | sa.Column('download_date', sa.Text), 32 | sa.Column('link_text', sa.Text), 33 | sa.Column('url', sa.Text), 34 | sa.Column('org_study_id', sa.Text), 35 | sa.Column('nct_id', sa.Text, primary_key=True), 36 | sa.Column('secondary_ids', ARRAY(sa.Text)), 37 | sa.Column('nct_aliases', ARRAY(sa.Text)), 38 | sa.Column('brief_title', sa.Text), 39 | sa.Column('acronym', sa.Text), 40 | sa.Column('official_title', sa.Text), 41 | sa.Column('sponsors', JSONB), 42 | sa.Column('source', sa.Text), 43 | sa.Column('oversight_info', JSONB), 44 | sa.Column('brief_summary', sa.Text), 45 | sa.Column('detailed_description', sa.Text), 46 | sa.Column('overall_status', sa.Text), 47 | sa.Column('why_stopped', sa.Text), 48 | sa.Column('start_date', sa.Date), 49 | sa.Column('completion_date_actual', sa.Date), 50 | sa.Column('completion_date_anticipated', sa.Date), 51 | sa.Column('primary_completion_date_actual', sa.Date), 52 | sa.Column('primary_completion_date_anticipated', sa.Date), 53 | sa.Column('phase', sa.Text), 54 | sa.Column('study_type', sa.Text), 55 | sa.Column('study_design', sa.Text), 56 | sa.Column('target_duration', sa.Text), 57 | sa.Column('primary_outcomes', JSONB), 58 | sa.Column('secondary_outcomes', JSONB), 59 | sa.Column('other_outcomes', JSONB), 60 | sa.Column('number_of_arms', sa.Integer), 61 | sa.Column('number_of_groups', sa.Integer), 62 | sa.Column('enrollment_actual', sa.Integer), 63 | sa.Column('enrollment_anticipated', sa.Integer), 64 | sa.Column('conditions', ARRAY(sa.Text)), 65 | sa.Column('arm_groups', JSONB), 66 | sa.Column('interventions', JSONB), 67 | sa.Column('biospec_retention', sa.Text), 68 | sa.Column('biospec_desrc', sa.Text), 69 | sa.Column('eligibility', JSONB), 70 | sa.Column('overall_officials', JSONB), 71 | sa.Column('overall_contact', JSONB), 72 | sa.Column('overall_contact_backup', JSONB), 73 | sa.Column('locations', JSONB), 74 | sa.Column('location_countries', ARRAY(sa.Text)), 75 | sa.Column('removed_countries', ARRAY(sa.Text)), 76 | sa.Column('links', JSONB), 77 | sa.Column('references', JSONB), 78 | sa.Column('results_references', JSONB), 79 | sa.Column('verification_date', sa.Date), 80 | sa.Column('lastchanged_date', sa.Date), 81 | sa.Column('firstreceived_date', sa.Date), 82 | sa.Column('firstreceived_results_date', sa.Date), 83 | sa.Column('responsible_party', JSONB), 84 | sa.Column('keywords', ARRAY(sa.Text)), 85 | sa.Column('is_fda_regulated', sa.Text), 86 | sa.Column('is_section_801', sa.Text), 87 | sa.Column('has_expanded_access', sa.Text), 88 | sa.Column('condition_browse', JSONB), 89 | sa.Column('intervention_browse', JSONB), 90 | sa.Column('clinical_results', JSONB), 91 | 92 | ) 93 | 94 | 95 | def downgrade(): 96 | op.drop_table('nct') 97 | -------------------------------------------------------------------------------- /collectors/gsk/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. 
import base 8 | from ..base.fields import Text, Date, Boolean, Integer, Json, Array 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | _FULL_DATE_FORMATS = [ 15 | '%B %d, %Y', 16 | '%b %d, %Y' 17 | ] 18 | 19 | # Config 20 | 21 | table = 'gsk' 22 | 23 | # General 24 | 25 | study_id = Text(primary_key=True) 26 | study_title = Text() 27 | patient_level_data = Text() 28 | clinicaltrials_gov_identifier = Text() 29 | sponsor = Text() 30 | collaborators = Text() 31 | study_recruitment_status = Text() 32 | generic_name = Text() 33 | trade_name = Text() 34 | study_indication = Text() 35 | 36 | # Protocol summary 37 | 38 | first_received = Date(_FULL_DATE_FORMATS) 39 | last_updated = Date(_FULL_DATE_FORMATS) 40 | title = Text() 41 | phase = Text() 42 | acronym = Text() 43 | secondary_ids = Array() 44 | fda_regulated_intervention = Boolean('yes') 45 | section_801_clinical_trial = Boolean('yes') 46 | delayed_posting = Boolean('yes') 47 | ind_ide_protocol = Text() 48 | ind_ide_grantor = Text() 49 | ind_ide_number = Text() 50 | ind_ide_serial_number = Text() 51 | has_expanded_access = Boolean('yes') 52 | study_type = Text() 53 | oversight_authority = Array() 54 | sponsor = Text() 55 | collaborators = Array() 56 | brief_summary = Text() 57 | detailed_description = Text() 58 | record_verification_date = Date(_FULL_DATE_FORMATS) 59 | status = Text() 60 | why_study_stopped = Text() 61 | study_start_date = Date('%B %Y') 62 | study_completion_date = Date('%B %Y') 63 | study_completion_date_type = Text() 64 | primary_completion_date = Date('%B %Y') 65 | primary_completion_date_type = Text() 66 | primary_purpose = Text() 67 | study_design = Text() 68 | time_perspective = Text() 69 | biospecimen_retention = Text() 70 | biospecimen_description = Text() 71 | allocation = Text() 72 | masking = Text() 73 | masked_subject = Boolean('yes') 74 | masked_caregiver = Boolean('yes') 75 | masked_investigator = Boolean('yes') 76 | masked_assessor = Boolean('yes') 77 | study_design_assignment = Text() 78 | study_classification_endpoint = Text() 79 | primary_outcomes = Json() 80 | secondary_outcomes = Json() 81 | arms = Json() 82 | interventions = Json() 83 | conditions = Array() 84 | keywords = Array() 85 | study_population = Text() 86 | sampling_method = Text() 87 | eligibility_criteria = Text() 88 | gender = Text() 89 | minimum_age = Text() 90 | maximum_age = Text() 91 | enrollment = Integer() 92 | enrollment_type = Text() 93 | healthy_volunteers = Boolean('yes') 94 | central_contact = Text() 95 | central_contact_phone = Text() 96 | central_contact_email = Text() 97 | overall_study_official = Text() 98 | overall_study_official_affiliation = Text() 99 | overall_study_official_role = Text() 100 | responsible_party_name_official_title = Text() 101 | responsible_party_organization = Text() 102 | 103 | # Locations 104 | 105 | contact_name = Text() 106 | contact_phone = Text() 107 | contact_email = Text() 108 | 109 | # Result summary 110 | 111 | protocol_id = Text() 112 | clinical_study_id = Text() 113 | official_study_title = Text() 114 | phase = Text() 115 | study_indication_or_diseases = Text() 116 | generic_name = Text() 117 | trade_name = Text() 118 | trade_name_product_name = Text() 119 | study_indications = Text() 120 | results_url = Text() 121 | 122 | # Publication 123 | 124 | citation = Text() 125 | publication_type = Text() 126 | -------------------------------------------------------------------------------- /collectors/base/record.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import uuid 8 | import scrapy 9 | import logging 10 | from abc import abstractmethod 11 | from . import config 12 | from . import fields 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | # Module API 17 | 18 | class Record(scrapy.Item): 19 | 20 | # Public 21 | 22 | def __repr__(self): 23 | template = '<%s: %s>' 24 | text = template % (self.table.upper(), self.get(self.__primary_key)) 25 | return text 26 | 27 | @property 28 | @abstractmethod 29 | def table(self): 30 | """Source name. 31 | """ 32 | pass # pragma: no cover 33 | 34 | @classmethod 35 | def create(cls, source, data): 36 | 37 | # Init dict 38 | self = cls() 39 | 40 | # Get primary_key 41 | self.__primary_key = None 42 | for key, field in self.fields.items(): 43 | if field.primary_key: 44 | self.__primary_key = key 45 | break 46 | if self.__primary_key is None: 47 | raise TypeError('Record %s requires primary key' % cls) 48 | if not isinstance(self.fields[self.__primary_key], fields.Text): 49 | raise TypeError('Record %s requires text primary key' % cls) 50 | 51 | # Get column types 52 | self.__column_types = {} 53 | for key, field in self.fields.items(): 54 | self.__column_types[key] = field.column_type 55 | 56 | # Add metadata 57 | ident = uuid.uuid1().hex 58 | self.fields['meta_id'] = fields.Text() 59 | self.fields['meta_source'] = fields.Text() 60 | self.fields['meta_created'] = fields.Datetime() 61 | self.fields['meta_updated'] = fields.Datetime() 62 | self['meta_id'] = ident 63 | self['meta_source'] = source 64 | 65 | # Add data 66 | undefined = [] 67 | for key, value in data.items(): 68 | field = self.fields.get(key) 69 | if field is None: 70 | undefined.append(key) 71 | continue 72 | if value is None: 73 | continue 74 | try: 75 | value = field.parse(value) 76 | except Exception: 77 | config.SENTRY.captureException() 78 | continue 79 | self[key] = value 80 | for key in undefined: 81 | logger.warning('Undefined field: %s - %s' % (self, key)) 82 | 83 | return self 84 | 85 | def write(self, conf, conn): 86 | """Write record to warehouse. 
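Rows are matched on the record's primary key and upserted, so an existing row is updated in place rather than duplicated.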
87 | 88 | Args: 89 | conf (dict): config dictionary 90 | conn (dict): connections dictionary 91 | 92 | """ 93 | config.SENTRY.extra_context({ 94 | 'record_table': self.table, 95 | 'record_id': self.__primary_key, 96 | }) 97 | 98 | if self.table not in conn['warehouse'].tables: 99 | if conf['ENV'] in ['development', 'testing']: 100 | table = conn['warehouse'].create_table( 101 | self.table, 102 | primary_id=self.__primary_key, 103 | primary_type='String') 104 | table = conn['warehouse'][self.table] 105 | action = 'created' 106 | if table.find_one(**{self.__primary_key: self[self.__primary_key]}): 107 | action = 'updated' 108 | del self['meta_id'] 109 | 110 | ensure_fields = False 111 | if conf['ENV'] in ['development', 'testing']: 112 | ensure_fields = True 113 | table.upsert( 114 | self, [self.__primary_key], 115 | ensure=ensure_fields, types=self.__column_types) 116 | 117 | logger.debug('Record - %s: %s - %s fields', action, self, len(self)) 118 | -------------------------------------------------------------------------------- /migrations/versions/20160220175816_isrctn_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import JSONB 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '296d1e273220' 14 | down_revision = u'3433d4d2a0d1' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.create_table('isrctn', 21 | 22 | # Meta 23 | 24 | sa.Column('meta_uuid', sa.Text), 25 | sa.Column('meta_source', sa.Text), 26 | sa.Column('meta_created', sa.DateTime(timezone=True)), 27 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 28 | 29 | # General 30 | 31 | sa.Column('isrctn_id', sa.Text, primary_key=True), 32 | sa.Column('doi_isrctn_id', sa.Text), 33 | sa.Column('title', sa.Text), 34 | sa.Column('condition_category', sa.Text), 35 | sa.Column('date_applied', sa.Date), 36 | sa.Column('date_assigned', sa.Date), 37 | sa.Column('last_edited', sa.Date), 38 | sa.Column('prospectiveretrospective', sa.Text), 39 | sa.Column('overall_trial_status', sa.Text), 40 | sa.Column('recruitment_status', sa.Text), 41 | sa.Column('plain_english_summary', sa.Text), 42 | sa.Column('trial_website', sa.Text), 43 | 44 | # Contant information 45 | 46 | sa.Column('contacts', JSONB), 47 | 48 | # Additional identifiers 49 | 50 | sa.Column('eudract_number', sa.Text), 51 | sa.Column('clinicaltrialsgov_number', sa.Text), 52 | sa.Column('protocolserial_number', sa.Text), 53 | 54 | # Study information 55 | 56 | sa.Column('scientific_title', sa.Text), 57 | sa.Column('acronym', sa.Text), 58 | sa.Column('study_hypothesis', sa.Text), 59 | sa.Column('ethics_approval', sa.Text), 60 | sa.Column('study_design', sa.Text), 61 | sa.Column('primary_study_design', sa.Text), 62 | sa.Column('secondary_study_design', sa.Text), 63 | sa.Column('trial_setting', sa.Text), 64 | sa.Column('trial_type', sa.Text), 65 | sa.Column('patient_information_sheet', sa.Text), 66 | sa.Column('condition', sa.Text), 67 | sa.Column('intervention', sa.Text), 68 | sa.Column('intervention_type', sa.Text), 69 | sa.Column('phase', sa.Text), 70 | sa.Column('drug_names', sa.Text), 71 | sa.Column('primary_outcome_measures', sa.Text), 72 | sa.Column('secondary_outcome_measures', 
sa.Text), 73 | sa.Column('overall_trial_start_date', sa.Date), 74 | sa.Column('overall_trial_end_date', sa.Date), 75 | sa.Column('reason_abandoned', sa.Text), 76 | 77 | # Eligability 78 | 79 | sa.Column('participant_inclusion_criteria', sa.Text), 80 | sa.Column('participant_type', sa.Text), 81 | sa.Column('age_group', sa.Text), 82 | sa.Column('gender', sa.Text), 83 | sa.Column('target_number_of_participants', sa.Text), 84 | sa.Column('participant_exclusion_criteria', sa.Text), 85 | sa.Column('recruitment_start_date', sa.Date), 86 | sa.Column('recruitment_end_date', sa.Date), 87 | 88 | # Locations 89 | 90 | sa.Column('countries_of_recruitment', sa.Text), 91 | sa.Column('trial_participating_centre', sa.Text), 92 | 93 | # Sponsor information 94 | 95 | sa.Column('sponsors', JSONB), 96 | 97 | # Funders 98 | 99 | sa.Column('funders', JSONB), 100 | 101 | # Results and publications 102 | 103 | sa.Column('publication_and_dissemination_plan', sa.Text), 104 | sa.Column('intention_to_publish_date', sa.Date), 105 | sa.Column('participant_level_data', sa.Text), 106 | sa.Column('results_basic_reporting', sa.Text), 107 | sa.Column('publication_summary', sa.Text), 108 | sa.Column('publication_citations', sa.Text), 109 | 110 | ) 111 | 112 | 113 | def downgrade(): 114 | op.drop_table('isrctn') 115 | -------------------------------------------------------------------------------- /collectors/jprn/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. import base 8 | from ..base.fields import Text, Date, Boolean, Integer, Array, Datetime 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'jprn' 18 | 19 | # General 20 | 21 | unique_trial_number = Text(primary_key=True) 22 | recruitment_status = Text() 23 | title_of_the_study = Text() 24 | date_of_formal_registrationdate_of_icmje_and_who = Date('%Y/%m/%d') 25 | date_and_time_of_last_update = Datetime('%Y/%m/%d %H:%M:%S') 26 | 27 | # Basic information 28 | 29 | official_scientific_title_of_the_study = Text() 30 | title_of_the_study_brief_title = Text() 31 | region = Text() 32 | 33 | # Condition 34 | 35 | condition = Text() 36 | classification_by_specialty = Text() 37 | classification_by_malignancy = Text() 38 | genomic_information = Boolean('YES') 39 | 40 | # Objectives 41 | 42 | narrative_objectives1 = Text() 43 | basic_objectives2 = Text() 44 | basic_objectives_others = Text() 45 | trial_characteristics_1 = Text() 46 | trial_characteristics_2 = Text() 47 | developmental_phase = Text() 48 | 49 | # Assessment 50 | 51 | primary_outcomes = Text() 52 | key_secondary_outcomes = Text() 53 | 54 | # Base 55 | 56 | study_type = Text() 57 | 58 | # Study design 59 | 60 | basic_design = Text() 61 | randomization = Text() 62 | randomization_unit = Text() 63 | blinding = Text() 64 | control = Text() 65 | stratification = Text() 66 | dynamic_allocation = Text() 67 | institution_consideration = Text() 68 | blocking = Text() 69 | concealment = Text() 70 | 71 | # Intervention 72 | 73 | no_of_arms = Integer() 74 | purpose_of_intervention = Text() 75 | type_of_intervention = Text() 76 | interventions = Array() 77 | 78 | # Eligibility 79 | 80 | agelower_limit = Text() 81 | ageupper_limit = Text() 82 | gender = Text() 83 | key_inclusion_criteria = Text() 84 | key_exclusion_criteria = Text() 85 
| target_sample_size = Integer() 86 | 87 | # Research contact person 88 | 89 | research_name_of_lead_principal_investigator = Text() 90 | research_organization = Text() 91 | research_division_name = Text() 92 | research_address = Text() 93 | research_tel = Text() 94 | research_homepage_url = Text() 95 | research_email = Text() 96 | 97 | # Public contact 98 | 99 | public_name_of_contact_person = Text() 100 | public_organization = Text() 101 | public_division_name = Text() 102 | public_address = Text() 103 | public_tel = Text() 104 | public_homepage_url = Text() 105 | public_email = Text() 106 | 107 | # Sponsor 108 | 109 | name_of_primary_sponsor = Text() 110 | 111 | # Funding source 112 | 113 | source_of_funding = Text() 114 | category_of_org = Text() 115 | nation_of_funding = Text() 116 | 117 | # Other related organizations 118 | 119 | cosponsor = Text() 120 | name_of_secondary_funers = Text() 121 | 122 | # Secondary study IDs 123 | 124 | secondary_study_ids = Boolean('YES') 125 | secondary_study_id_1 = Text() 126 | org_issuing_secondary_study_id_1 = Text() 127 | secondary_study_id_2 = Text() 128 | org_issuing_secondary_study_id_2 = Text() 129 | ind_to_mhlw = Text() 130 | 131 | # Institutions 132 | 133 | institutions = Text() 134 | 135 | # Progress 136 | 137 | recruitment_status = Text() 138 | date_of_protocol_fixation = Date('%Y/%m/%d') 139 | anticipated_trial_start_date = Date('%Y/%m/%d') 140 | last_followup_date = Date('%Y/%m/%d') 141 | date_of_closure_to_data_entry = Date('%Y/%m/%d') 142 | date_trial_data_considered_complete = Date('%Y/%m/%d') 143 | date_analysis_concluded = Date('%Y/%m/%d') 144 | 145 | # Related information 146 | 147 | url_releasing_protocol = Text() 148 | publication_of_results = Text() 149 | url_releasing_results = Text() 150 | results = Text() 151 | other_related_information = Text() 152 | 153 | # Others 154 | 155 | date_of_registration = Date('%Y/%m/%d') 156 | date_of_last_update = Datetime('%Y/%m/%d %H:%M:%S') 157 | urljapanese = Text() 158 | urlenglish = Text() 159 | -------------------------------------------------------------------------------- /tests/cassettes/nct.test_parser.TestNctParser.test_parser_parse_text.json: -------------------------------------------------------------------------------- 1 | {"http_interactions": [{"request": {"body": {"string": "", "encoding": "utf-8"}, "headers": {"Connection": ["keep-alive"], "Accept-Encoding": ["gzip, deflate"], "Accept": ["*/*"], "User-Agent": ["python-requests/2.12.2"]}, "method": "GET", "uri": "https://clinicaltrials.gov/show/NCT02931214?displayxml=true"}, "response": {"body": {"base64_string": 
"H4sIAAAAAAAAA9xZW2/cNhZ+L9D/wPVTAnhGHt/iGIoKx0laL+I0iJ1i+yRwJM4M15KoJSnH01+/3yEpjaSZ2GmLLYo1/DCiDs/9rviHh7Jg90IbqapXe7PpwR4TVaZyWS1f7X2+fTc52/sh+f67OCtkJTNepMY2+RonjMX/mEzY7UoaRjgyVS2ULg2zivGK/ev6PbvJVqLkjNtzAmdsZW1tzqOoxWW15IWZLtV9lNnDaGXLIpIlXwoTSSCL6mZeyGz6YHI2mXiSWvynkVrk6UrwXOjEI45z9aUqFM/TnFuRXAb8tx1+VmuVCWNEziwxDDDOVMU+qHtRzoVmRwf77PBgdhpHQ1QBPxDepVY82OQ9fpGIdiVY1mgtKst20NMiUzqfxtHmZkDV6CJ5RBFmpb5EHy5vDw5fHs0OZ8dxRBec7NEu4WOZp6SsFr3SS2+iVObJj9dXk9nRycvJ7GAWR4NXAbzKLD0NCIYzT7OPPp5rKRaplbYQyceCZ2KuJpeqsloVBVR7A6cpBHujjGA3RIgU9faeFw1UyW74Qtg1fCNnH1dclzxTd7ISVmaGqQVreWWyYj8J6GS1Zr+ooqmsgHfGUZ+2Y0YtFjKD3sLZBWEF4at99gk0VCl/E/k+mIETiclraBpP20zvt1xfmExU5PZf458s7mXYZ7eqEJrPZSHpaSzO/kDIfF3x8hEhL/KmsOymmf9bZBaCjuRysppaVUZp0/kjHCANh+EMp4icKlsnPxbrTF3L0qv2CtGsa6UhQh5HAWR0Jc0KbkxyVeWNsXrdgoXjQDPaQTTOoEQ+J+w7GLkU0BLyyp8ju0UijgbqiI1qdCYeFTuAeL9x2W65soO44Y1dKQ1zJp8racmZLa6ac/ZOqdzZ841uljBWiZAFt9x6wbprAQ98MM3LLPmg4qj97ZneQTcElGnKkusOBWWLeaGyu05ft3C9uoE4cEx4kUthLpKZdPlWtD5qNjFW74gx412dd65+9UtLIgfuLRddBRflzkVNcNFpa5kho22I9qWJc2G5RJiluTCZljVp7TE5IQ/+OdO9EM59CM99CNchhLNeCO+QywnkldRix428yci23CL5h7cMdywqAMSlCB9Yf9omrS+yKNicKJi2imjVLFeqsXSrpeARkvaNoJTOIB68x/CyLqBdetFoSZTDSYuXfFw41lBDHR8bS7bIRwYNvJRUgHpm+6pxvmoJFxC8oMrObWOSTyLTjbRQo/fZ3isfbpZr66vjRbNE5IbC2Tt3cJkiGYmMO2R2XYtXexcVeJc1heVe8s8GusDtFxTlA2iPotaSXCn9BlTXXGergOsr1wJOqhKJrxUoiv4xCEa1kXAjJcEl7qFa3OUFyda96oFCk3JZJRcF1Ozywfmg9Lyt8lpJ1x/AVhJZPQAFp3IlZp/1abFrlYvinH3kpHZRoCoRCTLyPrvmBrZfnoeaxlxNY89C5SBE98JYuaRE+RxVyWuBffSJ45zdasEtoWrlCfwPVA2Phtq6xqcU3DRaJN1ddDYFdzGUUz4TyD04RdlqIdvgRiJOF5oD1ewE/dYaIL2zAOW9PJXGNCL5VQBmcNJ2dz2PfYoRhAXSR2DG5bxNSrS9qt2PGQqOUUxEu/QRG0E55BE1jTuBx9RyfMaQP/TTeqFS8pRaLkv+wJ7Vgt9RgjTousEpZTZfqJ7vEnG3NH9jIS8+X7JnHA7Amgr9b9uD34v/D/FunQ0JEdX0v9aWbZP61znsm6NjlF6Uv0DQJ6ZvlqxqaG5L1SKFACY5xNgyPHFQoqIWwaWLHTXj8CCONhBt1UIH4ciH/pxKU3vkIIA9XaL6113r2B6k6FNFkWzSyvjN1gVXUd4+1OgWiAUqNaOXO1Tn5hM88yI0odtKG/L4NMthLvr7cCx7RbHF2D/zpKglx4i6db7rQkXeujHN9rs/ZM0hov8p752Nvpn1R626i3NRyKX09bEb8DDcUC+7md2223ZG01bR0AqJXQb48++/27xmbDbt5l0/TJQcHQx63YWgXxiZZy8npwdsLbh2YwhGQfZMerRI8NMhusMpuxY5rU6KdTelfJF2xSrF2q0KXlF/4xovJAB4nBAVTQdaGLBgRjiPpuydZ4boZytZ5HNwQxdqZUlNaOhL6ncFCKH8oG834qFxhGTFMwtGUZ/m6MEqgD93vfxRnwT+SowtK8PQXlCjr2hMIQrKoWvcb14xnmWitpy6vLnUECtMO0iWmDfzIePHG8YdeysOPjirBDpBYqnWYllxDNYMI42l4YcKp6sz1B91anFTY+BrxHXgkgSSnanDxOR6yZEuT6YYpFCiMRNUudmA+T1cjnRv2uvSrQ3Rx0FEQ4maHv0GBRqhASnwNmLJKQcKdE1+MD7hAwWrME8NOHr78LR/hj6a5k+f2YAV8hZQo7dwxmueSRoM8213/EkatN2ureQVoIuiwnQGjjg68yCqquGY4MGN8IJ9vNpnJS0E/O60oQ4Vx8E7ATWSeKNFsFW7eZ1cJffFCc6ppel2k+2wzuZrepa4gfHM176N8UYUdpkSYfEBFgH6Nq5Yt45ZM1o8iHEkHXdXEJoanTrNbM6gbhyGPxqq+Ns+89q9zyGM4xFi9gPY3y6UMc7akODk1E0W42gaIT6d0t6P+qgOc7j/4luuv5jSODZUHa+USwGdStwGt0V7ePY1vB7jeDh3W65hno2XguIneQ06cRQe2t4NNMumTJEik9lL9iulTPRtvdMWkD90R8itLWDvtF1a+Qya3nf71uTCZSCzcxW7A9xXlHH96FYL7U6z219yY33pes8bzTEKC01xjiH3TRxt3gZwZD7UP6RiskExGHTjyL1sKyDoFJL7DnOzfeydbnZxW3x13FKyRTr/g8zWK1WhWT44nBy/OJ0cns3cloHOhrT7VOJ2g9AiWSDT9MowNbxEYiNTjyLJnecUVL2KHGd0+z1UpgqAZ31cfolj0aiLuebmjrvljRV9gN9knZyenRygsaaffcTIVFaP9qTUJfvjjZMPmYqjkUzxjn1Tb8/ke/GeHX63JZ60hd8uD2wte671J+h+s8NGWwSBr+8LnWekXsFS9NTzpCE2yEbXYzhht5Hy27GfM6voU5jf5m2/99xA1mzFq6XIh7dm3Qe0LRB3cSHRDGiRCXQk4fwGCcZ/fTtt7+6AcrfhR7Tyl6j5KdWxzoe2Xvgm+sZ/IKBPZjvfe93sxhrfifUXpfN2AOx9mmnfDMC29wWPgm0m7QGYNOki5ylatcattfw2bOu0hcVU7CxzdjBz4/boyIHR5wfxUKPwQpfUUSIW288S43O/A4S10fmQ3rs6ZFauBU5lnfu5vvccNmXjW+678KfQxbhmFNUSnT0qODpN6lw2i+7uQ8YTf+77bxxtfYX+LwAAAP//AwCIX4I/wB4AAA==", "encoding": "UTF-8"}, "headers": {"Strict-Transport-Security": ["max-age=31536000; includeSubDomains; preload"], "Content-Security-Policy": 
["upgrade-insecure-requests"], "Content-Encoding": ["gzip"], "Transfer-Encoding": ["chunked"], "Set-Cookie": ["Psid=_ihzm6CLPg4PUiokOyz3FQ7V9g4H5KCnxg0tORcBF608SgzqxBCRa8Wzj; Expires=Thu, 30-Nov-2017 21:06:26 GMT; Path=/; Secure", "CTOpts=Qihzm6CLPg4PUiokOyUgzw-R98L5xR4t-RoR; Expires=Wed, 30-Nov-2016 21:26:26 GMT; Path=/; Secure"], "Expires": ["Wed, 30 Nov 2016 21:06:27 GMT"], "Vary": ["Accept-Encoding"], "Keep-Alive": ["timeout=5, max=30"], "Connection": ["Keep-Alive"], "Date": ["Wed, 30 Nov 2016 21:06:26 GMT"], "Referrer-Policy": ["origin-when-cross-origin"], "Content-Type": ["text/xml;charset=UTF-8"]}, "status": {"message": "OK", "code": 200}, "url": "https://clinicaltrials.gov/show/NCT02931214?displayxml=true"}, "recorded_at": "2016-11-30T21:06:26"}], "recorded_with": "betamax/0.8.0"} -------------------------------------------------------------------------------- /tests/cassettes/nct.test_parser.TestNctParser.test_parser_parse_dict.json: -------------------------------------------------------------------------------- 1 | {"http_interactions": [{"request": {"body": {"string": "", "encoding": "utf-8"}, "headers": {"Connection": ["keep-alive"], "Accept-Encoding": ["gzip, deflate"], "Accept": ["*/*"], "User-Agent": ["python-requests/2.12.2"]}, "method": "GET", "uri": "https://clinicaltrials.gov/show/NCT02931214?displayxml=true"}, "response": {"body": {"base64_string": "H4sIAAAAAAAAA9xZW2/cNhZ+L9D/wPVTAnhGHt/iGIoKx0laL+I0iJ1i+yRwJM4M15KoJSnH01+/3yEpjaSZ2GmLLYo1/DCiDs/9rviHh7Jg90IbqapXe7PpwR4TVaZyWS1f7X2+fTc52/sh+f67OCtkJTNepMY2+RonjMX/mEzY7UoaRjgyVS2ULg2zivGK/ev6PbvJVqLkjNtzAmdsZW1tzqOoxWW15IWZLtV9lNnDaGXLIpIlXwoTSSCL6mZeyGz6YHI2mXiSWvynkVrk6UrwXOjEI45z9aUqFM/TnFuRXAb8tx1+VmuVCWNEziwxDDDOVMU+qHtRzoVmRwf77PBgdhpHQ1QBPxDepVY82OQ9fpGIdiVY1mgtKst20NMiUzqfxtHmZkDV6CJ5RBFmpb5EHy5vDw5fHs0OZ8dxRBec7NEu4WOZp6SsFr3SS2+iVObJj9dXk9nRycvJ7GAWR4NXAbzKLD0NCIYzT7OPPp5rKRaplbYQyceCZ2KuJpeqsloVBVR7A6cpBHujjGA3RIgU9faeFw1UyW74Qtg1fCNnH1dclzxTd7ISVmaGqQVreWWyYj8J6GS1Zr+ooqmsgHfGUZ+2Y0YtFjKD3sLZBWEF4at99gk0VCl/E/k+mIETiclraBpP20zvt1xfmExU5PZf458s7mXYZ7eqEJrPZSHpaSzO/kDIfF3x8hEhL/KmsOymmf9bZBaCjuRysppaVUZp0/kjHCANh+EMp4icKlsnPxbrTF3L0qv2CtGsa6UhQh5HAWR0Jc0KbkxyVeWNsXrdgoXjQDPaQTTOoEQ+J+w7GLkU0BLyyp8ju0UijgbqiI1qdCYeFTuAeL9x2W65soO44Y1dKQ1zJp8racmZLa6ac/ZOqdzZ841uljBWiZAFt9x6wbprAQ98MM3LLPmg4qj97ZneQTcElGnKkusOBWWLeaGyu05ft3C9uoE4cEx4kUthLpKZdPlWtD5qNjFW74gx412dd65+9UtLIgfuLRddBRflzkVNcNFpa5kho22I9qWJc2G5RJiluTCZljVp7TE5IQ/+OdO9EM59CM99CNchhLNeCO+QywnkldRix428yci23CL5h7cMdywqAMSlCB9Yf9omrS+yKNicKJi2imjVLFeqsXSrpeARkvaNoJTOIB68x/CyLqBdetFoSZTDSYuXfFw41lBDHR8bS7bIRwYNvJRUgHpm+6pxvmoJFxC8oMrObWOSTyLTjbRQo/fZ3isfbpZr66vjRbNE5IbC2Tt3cJkiGYmMO2R2XYtXexcVeJc1heVe8s8GusDtFxTlA2iPotaSXCn9BlTXXGergOsr1wJOqhKJrxUoiv4xCEa1kXAjJcEl7qFa3OUFyda96oFCk3JZJRcF1Ozywfmg9Lyt8lpJ1x/AVhJZPQAFp3IlZp/1abFrlYvinH3kpHZRoCoRCTLyPrvmBrZfnoeaxlxNY89C5SBE98JYuaRE+RxVyWuBffSJ45zdasEtoWrlCfwPVA2Phtq6xqcU3DRaJN1ddDYFdzGUUz4TyD04RdlqIdvgRiJOF5oD1ewE/dYaIL2zAOW9PJXGNCL5VQBmcNJ2dz2PfYoRhAXSR2DG5bxNSrS9qt2PGQqOUUxEu/QRG0E55BE1jTuBx9RyfMaQP/TTeqFS8pRaLkv+wJ7Vgt9RgjTousEpZTZfqJ7vEnG3NH9jIS8+X7JnHA7Amgr9b9uD34v/D/FunQ0JEdX0v9aWbZP61znsm6NjlF6Uv0DQJ6ZvlqxqaG5L1SKFACY5xNgyPHFQoqIWwaWLHTXj8CCONhBt1UIH4ciH/pxKU3vkIIA9XaL6113r2B6k6FNFkWzSyvjN1gVXUd4+1OgWiAUqNaOXO1Tn5hM88yI0odtKG/L4NMthLvr7cCx7RbHF2D/zpKglx4i6db7rQkXeujHN9rs/ZM0hov8p752Nvpn1R626i3NRyKX09bEb8DDcUC+7md2223ZG01bR0AqJXQb48++/27xmbDbt5l0/TJQcHQx63YWgXxiZZy8npwdsLbh2YwhGQfZMerRI8NMhusMpuxY5rU6KdTelfJF2xSrF2q0KXlF/4xovJAB4nBAVTQdaGLBgRjiPpuydZ4boZytZ5HNwQxdqZUlNaOhL6ncFCKH8oG834qFxhGTFMwtGUZ/m6MEqgD
93vfxRnwT+SowtK8PQXlCjr2hMIQrKoWvcb14xnmWitpy6vLnUECtMO0iWmDfzIePHG8YdeysOPjirBDpBYqnWYllxDNYMI42l4YcKp6sz1B91anFTY+BrxHXgkgSSnanDxOR6yZEuT6YYpFCiMRNUudmA+T1cjnRv2uvSrQ3Rx0FEQ4maHv0GBRqhASnwNmLJKQcKdE1+MD7hAwWrME8NOHr78LR/hj6a5k+f2YAV8hZQo7dwxmueSRoM8213/EkatN2ureQVoIuiwnQGjjg68yCqquGY4MGN8IJ9vNpnJS0E/O60oQ4Vx8E7ATWSeKNFsFW7eZ1cJffFCc6ppel2k+2wzuZrepa4gfHM176N8UYUdpkSYfEBFgH6Nq5Yt45ZM1o8iHEkHXdXEJoanTrNbM6gbhyGPxqq+Ns+89q9zyGM4xFi9gPY3y6UMc7akODk1E0W42gaIT6d0t6P+qgOc7j/4luuv5jSODZUHa+USwGdStwGt0V7ePY1vB7jeDh3W65hno2XguIneQ06cRQe2t4NNMumTJEik9lL9iulTPRtvdMWkD90R8itLWDvtF1a+Qya3nf71uTCZSCzcxW7A9xXlHH96FYL7U6z219yY33pes8bzTEKC01xjiH3TRxt3gZwZD7UP6RiskExGHTjyL1sKyDoFJL7DnOzfeydbnZxW3x13FKyRTr/g8zWK1WhWT44nBy/OJ0cns3cloHOhrT7VOJ2g9AiWSDT9MowNbxEYiNTjyLJnecUVL2KHGd0+z1UpgqAZ31cfolj0aiLuebmjrvljRV9gN9knZyenRygsaaffcTIVFaP9qTUJfvjjZMPmYqjkUzxjn1Tb8/ke/GeHX63JZ60hd8uD2wte671J+h+s8NGWwSBr+8LnWekXsFS9NTzpCE2yEbXYzhht5Hy27GfM6voU5jf5m2/99xA1mzFq6XIh7dm3Qe0LRB3cSHRDGiRCXQk4fwGCcZ/fTtt7+6AcrfhR7Tyl6j5KdWxzoe2Xvgm+sZ/IKBPZjvfe93sxhrfifUXpfN2AOx9mmnfDMC29wWPgm0m7QGYNOki5ylatcattfw2bOu0hcVU7CxzdjBz4/boyIHR5wfxUKPwQpfUUSIW288S43O/A4S10fmQ3rs6ZFauBU5lnfu5vvccNmXjW+678KfQxbhmFNUSnT0qODpN6lw2i+7uQ8YTf+77bxxtfYX+LwAAAP//AwCIX4I/wB4AAA==", "encoding": "UTF-8"}, "headers": {"Strict-Transport-Security": ["max-age=31536000; includeSubDomains; preload"], "Content-Security-Policy": ["upgrade-insecure-requests"], "Content-Encoding": ["gzip"], "Transfer-Encoding": ["chunked"], "Set-Cookie": ["Psid=fihzm6CLPg4PUiCR-yz3FQ7V9K4BagC5agCRORcBF608FgzqaR48agHgyPt; Expires=Thu, 30-Nov-2017 21:06:27 GMT; Path=/; Secure", "CTOpts=Qihzm6CLPg4PUiCR-yUgzw-R98LyNR43YicR; Expires=Wed, 30-Nov-2016 21:26:27 GMT; Path=/; Secure"], "Expires": ["Wed, 30 Nov 2016 21:06:28 GMT"], "Vary": ["Accept-Encoding"], "Keep-Alive": ["timeout=5, max=30"], "Connection": ["Keep-Alive"], "Date": ["Wed, 30 Nov 2016 21:06:27 GMT"], "Referrer-Policy": ["origin-when-cross-origin"], "Content-Type": ["text/xml;charset=UTF-8"]}, "status": {"message": "OK", "code": 200}, "url": "https://clinicaltrials.gov/show/NCT02931214?displayxml=true"}, "recorded_at": "2016-11-30T21:06:27"}], "recorded_with": "betamax/0.8.0"} -------------------------------------------------------------------------------- /tests/cassettes/nct.test_parser.TestNctParser.test_parser_parse_list.json: -------------------------------------------------------------------------------- 1 | {"http_interactions": [{"request": {"body": {"string": "", "encoding": "utf-8"}, "headers": {"Connection": ["keep-alive"], "Accept-Encoding": ["gzip, deflate"], "Accept": ["*/*"], "User-Agent": ["python-requests/2.12.2"]}, "method": "GET", "uri": "https://clinicaltrials.gov/show/NCT02931214?displayxml=true"}, "response": {"body": {"base64_string": 
"H4sIAAAAAAAAA9xZW2/cNhZ+L9D/wPVTAnhGHt/iGIoKx0laL+I0iJ1i+yRwJM4M15KoJSnH01+/3yEpjaSZ2GmLLYo1/DCiDs/9rviHh7Jg90IbqapXe7PpwR4TVaZyWS1f7X2+fTc52/sh+f67OCtkJTNepMY2+RonjMX/mEzY7UoaRjgyVS2ULg2zivGK/ev6PbvJVqLkjNtzAmdsZW1tzqOoxWW15IWZLtV9lNnDaGXLIpIlXwoTSSCL6mZeyGz6YHI2mXiSWvynkVrk6UrwXOjEI45z9aUqFM/TnFuRXAb8tx1+VmuVCWNEziwxDDDOVMU+qHtRzoVmRwf77PBgdhpHQ1QBPxDepVY82OQ9fpGIdiVY1mgtKst20NMiUzqfxtHmZkDV6CJ5RBFmpb5EHy5vDw5fHs0OZ8dxRBec7NEu4WOZp6SsFr3SS2+iVObJj9dXk9nRycvJ7GAWR4NXAbzKLD0NCIYzT7OPPp5rKRaplbYQyceCZ2KuJpeqsloVBVR7A6cpBHujjGA3RIgU9faeFw1UyW74Qtg1fCNnH1dclzxTd7ISVmaGqQVreWWyYj8J6GS1Zr+ooqmsgHfGUZ+2Y0YtFjKD3sLZBWEF4at99gk0VCl/E/k+mIETiclraBpP20zvt1xfmExU5PZf458s7mXYZ7eqEJrPZSHpaSzO/kDIfF3x8hEhL/KmsOymmf9bZBaCjuRysppaVUZp0/kjHCANh+EMp4icKlsnPxbrTF3L0qv2CtGsa6UhQh5HAWR0Jc0KbkxyVeWNsXrdgoXjQDPaQTTOoEQ+J+w7GLkU0BLyyp8ju0UijgbqiI1qdCYeFTuAeL9x2W65soO44Y1dKQ1zJp8racmZLa6ac/ZOqdzZ841uljBWiZAFt9x6wbprAQ98MM3LLPmg4qj97ZneQTcElGnKkusOBWWLeaGyu05ft3C9uoE4cEx4kUthLpKZdPlWtD5qNjFW74gx412dd65+9UtLIgfuLRddBRflzkVNcNFpa5kho22I9qWJc2G5RJiluTCZljVp7TE5IQ/+OdO9EM59CM99CNchhLNeCO+QywnkldRix428yci23CL5h7cMdywqAMSlCB9Yf9omrS+yKNicKJi2imjVLFeqsXSrpeARkvaNoJTOIB68x/CyLqBdetFoSZTDSYuXfFw41lBDHR8bS7bIRwYNvJRUgHpm+6pxvmoJFxC8oMrObWOSTyLTjbRQo/fZ3isfbpZr66vjRbNE5IbC2Tt3cJkiGYmMO2R2XYtXexcVeJc1heVe8s8GusDtFxTlA2iPotaSXCn9BlTXXGergOsr1wJOqhKJrxUoiv4xCEa1kXAjJcEl7qFa3OUFyda96oFCk3JZJRcF1Ozywfmg9Lyt8lpJ1x/AVhJZPQAFp3IlZp/1abFrlYvinH3kpHZRoCoRCTLyPrvmBrZfnoeaxlxNY89C5SBE98JYuaRE+RxVyWuBffSJ45zdasEtoWrlCfwPVA2Phtq6xqcU3DRaJN1ddDYFdzGUUz4TyD04RdlqIdvgRiJOF5oD1ewE/dYaIL2zAOW9PJXGNCL5VQBmcNJ2dz2PfYoRhAXSR2DG5bxNSrS9qt2PGQqOUUxEu/QRG0E55BE1jTuBx9RyfMaQP/TTeqFS8pRaLkv+wJ7Vgt9RgjTousEpZTZfqJ7vEnG3NH9jIS8+X7JnHA7Amgr9b9uD34v/D/FunQ0JEdX0v9aWbZP61znsm6NjlF6Uv0DQJ6ZvlqxqaG5L1SKFACY5xNgyPHFQoqIWwaWLHTXj8CCONhBt1UIH4ciH/pxKU3vkIIA9XaL6113r2B6k6FNFkWzSyvjN1gVXUd4+1OgWiAUqNaOXO1Tn5hM88yI0odtKG/L4NMthLvr7cCx7RbHF2D/zpKglx4i6db7rQkXeujHN9rs/ZM0hov8p752Nvpn1R626i3NRyKX09bEb8DDcUC+7md2223ZG01bR0AqJXQb48++/27xmbDbt5l0/TJQcHQx63YWgXxiZZy8npwdsLbh2YwhGQfZMerRI8NMhusMpuxY5rU6KdTelfJF2xSrF2q0KXlF/4xovJAB4nBAVTQdaGLBgRjiPpuydZ4boZytZ5HNwQxdqZUlNaOhL6ncFCKH8oG834qFxhGTFMwtGUZ/m6MEqgD93vfxRnwT+SowtK8PQXlCjr2hMIQrKoWvcb14xnmWitpy6vLnUECtMO0iWmDfzIePHG8YdeysOPjirBDpBYqnWYllxDNYMI42l4YcKp6sz1B91anFTY+BrxHXgkgSSnanDxOR6yZEuT6YYpFCiMRNUudmA+T1cjnRv2uvSrQ3Rx0FEQ4maHv0GBRqhASnwNmLJKQcKdE1+MD7hAwWrME8NOHr78LR/hj6a5k+f2YAV8hZQo7dwxmueSRoM8213/EkatN2ureQVoIuiwnQGjjg68yCqquGY4MGN8IJ9vNpnJS0E/O60oQ4Vx8E7ATWSeKNFsFW7eZ1cJffFCc6ppel2k+2wzuZrepa4gfHM176N8UYUdpkSYfEBFgH6Nq5Yt45ZM1o8iHEkHXdXEJoanTrNbM6gbhyGPxqq+Ns+89q9zyGM4xFi9gPY3y6UMc7akODk1E0W42gaIT6d0t6P+qgOc7j/4luuv5jSODZUHa+USwGdStwGt0V7ePY1vB7jeDh3W65hno2XguIneQ06cRQe2t4NNMumTJEik9lL9iulTPRtvdMWkD90R8itLWDvtF1a+Qya3nf71uTCZSCzcxW7A9xXlHH96FYL7U6z219yY33pes8bzTEKC01xjiH3TRxt3gZwZD7UP6RiskExGHTjyL1sKyDoFJL7DnOzfeydbnZxW3x13FKyRTr/g8zWK1WhWT44nBy/OJ0cns3cloHOhrT7VOJ2g9AiWSDT9MowNbxEYiNTjyLJnecUVL2KHGd0+z1UpgqAZ31cfolj0aiLuebmjrvljRV9gN9knZyenRygsaaffcTIVFaP9qTUJfvjjZMPmYqjkUzxjn1Tb8/ke/GeHX63JZ60hd8uD2wte671J+h+s8NGWwSBr+8LnWekXsFS9NTzpCE2yEbXYzhht5Hy27GfM6voU5jf5m2/99xA1mzFq6XIh7dm3Qe0LRB3cSHRDGiRCXQk4fwGCcZ/fTtt7+6AcrfhR7Tyl6j5KdWxzoe2Xvgm+sZ/IKBPZjvfe93sxhrfifUXpfN2AOx9mmnfDMC29wWPgm0m7QGYNOki5ylatcattfw2bOu0hcVU7CxzdjBz4/boyIHR5wfxUKPwQpfUUSIW288S43O/A4S10fmQ3rs6ZFauBU5lnfu5vvccNmXjW+678KfQxbhmFNUSnT0qODpN6lw2i+7uQ8YTf+77bxxtfYX+LwAAAP//AwCIX4I/wB4AAA==", "encoding": "UTF-8"}, "headers": {"Strict-Transport-Security": ["max-age=31536000; includeSubDomains; preload"], "Content-Security-Policy": 
["upgrade-insecure-requests"], "Content-Encoding": ["gzip"], "Transfer-Encoding": ["chunked"], "Set-Cookie": ["Psid=fihzm6CLPg4PUiC3pyz3FQ7V9K4BagC5agCRORcBF608SgzqaR48aRLgyPt; Expires=Thu, 30-Nov-2017 21:06:26 GMT; Path=/; Secure", "CTOpts=Qihzm6CLPg4PUiC3pyUgzw-R98LyNR43Yicj; Expires=Wed, 30-Nov-2016 21:26:26 GMT; Path=/; Secure"], "Expires": ["Wed, 30 Nov 2016 21:06:27 GMT"], "Vary": ["Accept-Encoding"], "Keep-Alive": ["timeout=5, max=30"], "Connection": ["Keep-Alive"], "Date": ["Wed, 30 Nov 2016 21:06:26 GMT"], "Referrer-Policy": ["origin-when-cross-origin"], "Content-Type": ["text/xml;charset=UTF-8"]}, "status": {"message": "OK", "code": 200}, "url": "https://clinicaltrials.gov/show/NCT02931214?displayxml=true"}, "recorded_at": "2016-11-30T21:06:26"}], "recorded_with": "betamax/0.8.0"} --------------------------------------------------------------------------------