├── tests ├── __init__.py ├── collectors │ ├── gsk │ │ ├── __init__.py │ │ └── test_parser.py │ ├── hra │ │ ├── __init__.py │ │ └── test_collector.py │ ├── nct │ │ ├── __init__.py │ │ └── test_parser.py │ ├── euctr │ │ ├── __init__.py │ │ └── test_parser.py │ ├── pubmed │ │ ├── __init__.py │ │ ├── test_spider.py │ │ └── test_parser.py │ ├── takeda │ │ ├── __init__.py │ │ └── test_parser.py │ └── base │ │ └── test_fields.py ├── test_isrctn.py ├── test_gsk.py ├── test_euctr.py ├── conftest.py └── cassettes │ ├── nct.test_parser.TestNctParser.test_parser_parse_text.json │ ├── nct.test_parser.TestNctParser.test_parser_parse_dict.json │ └── nct.test_parser.TestNctParser.test_parser_parse_list.json ├── collectors ├── __init__.py ├── actrn │ ├── __init__.py │ ├── collector.py │ ├── spider.py │ └── record.py ├── euctr │ ├── __init__.py │ ├── collector.py │ └── spider.py ├── fda_dap │ ├── __init__.py │ ├── collector.py │ └── record.py ├── fdadl │ ├── __init__.py │ ├── record.py │ └── collector.py ├── gsk │ ├── __init__.py │ ├── collector.py │ ├── spider.py │ └── record.py ├── hra │ ├── __init__.py │ ├── record.py │ ├── collector.py │ └── parser.py ├── icdcm │ ├── __init__.py │ ├── record.py │ └── collector.py ├── icdpcs │ ├── __init__.py │ ├── record.py │ └── collector.py ├── ictrp │ ├── __init__.py │ ├── collector.py │ ├── spider.py │ └── record.py ├── isrctn │ ├── __init__.py │ ├── collector.py │ ├── spider.py │ └── record.py ├── jprn │ ├── __init__.py │ ├── collector.py │ ├── spider.py │ ├── parser.py │ └── record.py ├── nct │ ├── __init__.py │ ├── collector.py │ └── record.py ├── pfizer │ ├── __init__.py │ ├── collector.py │ ├── record.py │ ├── spider.py │ └── parser.py ├── pubmed │ ├── __init__.py │ ├── collector.py │ ├── record.py │ └── spider.py ├── takeda │ ├── __init__.py │ ├── collector.py │ ├── spider.py │ ├── record.py │ └── parser.py ├── cochrane_reviews │ ├── __init__.py │ ├── record.py │ ├── collector.py │ └── parser.py └── base │ ├── __init__.py │ ├── cli.py │ ├── pipelines.py │ ├── helpers.py │ ├── config.py │ ├── fields.py │ └── record.py ├── migrations ├── __init__.py ├── versions │ ├── 20160510091510_fda_rename_table_to_fdadl.py │ ├── 20160901171321_add_results_url_to_gsk.py │ ├── 20161102144050_add_trial_results_url_to_euctr.py │ ├── 20161206150412_add_exempt_results_to_nct.py │ ├── 20170215125221_add_registry_ids_to_pubmed.py │ ├── 20160819163953_fdadl_add_fda_application_number.py │ ├── 20160610145922_pubmed_add_mesh.py │ ├── 20170214191843_pubmed_rename_identifiers_list_to_article_ids.py │ ├── 20160303155834_pfizer_takeda_add_pk.py │ ├── 20160406115944_ictrp_simplify_primary_key.py │ ├── 20160311153848_add_data_prefix_to_tables.py │ ├── 20160323090938_remove_data_prefix_from_tables.py │ ├── 20160408164205_create_meta_id_indexes.py │ ├── 20160525192212_euctr_fix_column_names.py │ ├── 20160525134303_takeda_fix_column_names.py │ ├── 20160525133746_isrctn_fix_column_names.py │ ├── 20160311151047_update_meta_identifier.py │ ├── 20160831125422_add_drug_name_active_ingredients_and_company_to_fda_dap.py │ ├── 20160224180815_trials_create_table.py │ ├── 20160323145124_trials_remove_table.py │ ├── 20160220212552_nct_fix_boolean_columns.py │ ├── 20160510000353_fda_create_table.py │ ├── 20160525132926_gsk_fix_column_names.py │ ├── 20160509115712_icdcm_create_table.py │ ├── 20160509133714_icdpcs_create_table.py │ ├── 20170123144318_default_for_meta_created_and_meta_updated.py │ ├── 20161007222818_create_cochrane_reviews_table.py │ ├── 20160725130032_fda_dap_create_table.py │ ├── 
20160226134759_pfizer_create_table.py │ ├── 20160525130300_actrn_fix_column_names.py │ ├── 20170123151655_add_trigger_for_meta_updated.py │ ├── 20160428204857_pubmed_create_table.py │ ├── 20160301131954_ictrp_create_table.py │ ├── 20160229142254_takeda_create_table.py │ ├── 20160525105409_euctr_fix_column_names.py │ ├── 20160603215242_hra_create_table.py │ ├── 20160220164104_nct_create_table.py │ └── 20160220175816_isrctn_create_table.py ├── script.py.mako ├── config.py └── env.py ├── pytest.ini ├── scrapy.cfg ├── .dockerignore ├── requirements.in ├── requirements.dev.txt ├── pylama.ini ├── Dockerfile ├── tox.ini ├── Makefile ├── .travis.yml ├── CONTRIBUTING.md ├── docs ├── warehouse.md ├── collectors │ ├── actrn.md │ ├── takeda.md │ ├── pfizer.md │ ├── pubmed.md │ ├── isrctn.md │ ├── ictrp.md │ ├── nct.md │ ├── gsk.md │ ├── jprn.md │ └── euctr.md └── overview.md ├── .env.example ├── .gitignore ├── LICENSE.md ├── alembic.ini ├── README.md ├── docker-compose.yml └── requirements.txt /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /collectors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/collectors/gsk/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/collectors/hra/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/collectors/nct/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/collectors/euctr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/collectors/pubmed/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/collectors/takeda/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths=tests 3 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | default = collectors.base.config 3 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !collectors/ 3 | !migrations/ 4 | !alembic.ini 5 | !Makefile 6 | !requirements.txt 7 | !scrapy.cfg 8 | -------------------------------------------------------------------------------- /requirements.in: 
-------------------------------------------------------------------------------- 1 | scrapy 2 | dataset==0.7.1 3 | alembic 4 | psycopg2 5 | xmltodict 6 | sqlalchemy 7 | python-dotenv 8 | requests==2.12.2 9 | ijson 10 | pytz 11 | python-dateutil 12 | raven 13 | -------------------------------------------------------------------------------- /requirements.dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | pyyaml==3.10 # for docker-cloud 3 | docker-cloud 4 | pylama 5 | tox 6 | mock 7 | pytest 8 | pytest-cov 9 | betamax==0.8 10 | coverage 11 | coveralls 12 | ipython 13 | -------------------------------------------------------------------------------- /pylama.ini: -------------------------------------------------------------------------------- 1 | [pylama] 2 | linters = pyflakes,mccabe,pep8 3 | ignore = E105,E128,E731 4 | 5 | [pylama:mccabe] 6 | complexity = 48 7 | 8 | [pylama:pep8] 9 | max_line_length = 160 10 | 11 | [pylama:*/__init__.py] 12 | ignore = W0611 13 | -------------------------------------------------------------------------------- /collectors/actrn/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/euctr/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/fda_dap/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/fdadl/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/gsk/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/hra/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import 
unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/icdcm/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/icdpcs/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/ictrp/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/isrctn/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/jprn/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/nct/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/pfizer/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/pubmed/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | 
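Each `collectors/<name>/__init__.py` in this listing repeats the same pattern: it re-exports the package's `collect` entry point. A minimal sketch of the dynamic lookup this uniformity enables, mirroring `collectors/base/cli.py` shown later (`run_collector` and its arguments are illustrative names, not part of the codebase):

```python
import importlib


def run_collector(name, conf, conn, *args):
    # Resolves e.g. collectors.pubmed and relies on each package
    # re-exporting its collector's `collect` function.
    collect = importlib.import_module('collectors.%s' % name).collect
    collect(conf, conn, *args)
```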
-------------------------------------------------------------------------------- /collectors/takeda/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /collectors/cochrane_reviews/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .collector import collect 8 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:2.7 2 | WORKDIR /service 3 | COPY requirements.txt requirements.txt 4 | RUN pip install --upgrade -r requirements.txt 5 | COPY collectors collectors 6 | COPY migrations migrations 7 | COPY alembic.ini alembic.ini 8 | COPY Makefile Makefile 9 | COPY scrapy.cfg scrapy.cfg 10 | -------------------------------------------------------------------------------- /collectors/base/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from . import fields 8 | from . import helpers 9 | from .record import Record 10 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | py27 4 | lint 5 | skipsdist = True 6 | 7 | [testenv] 8 | deps = 9 | -r{toxinidir}/requirements.txt 10 | pytest 11 | mock 12 | betamax==0.8 13 | setenv = 14 | PYTHON_ENV = testing 15 | passenv = 16 | TEST_WAREHOUSE_URL 17 | commands = 18 | py.test {posargs} 19 | 20 | [testenv:lint] 21 | deps = 22 | pylama 23 | commands = 24 | pylama {toxinidir}/collectors {toxinidir}/migrations 25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all build install list migrate start test up 2 | 3 | all: list 4 | 5 | build: 6 | docker build -t opentrials/collectors . 
7 | 8 | list: 9 | @grep '^\.PHONY' Makefile | cut -d' ' -f2- | tr ' ' '\n' 10 | 11 | migrate: 12 | alembic upgrade head 13 | 14 | start: 15 | python -m collectors.base.cli $(filter-out $@,$(MAKECMDGOALS)) 16 | 17 | test: 18 | tox 19 | 20 | up: 21 | docker-compose up 22 | 23 | %: 24 | @: 25 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: 2 | false 3 | 4 | language: 5 | python 6 | 7 | python: 8 | - 2.7 9 | 10 | services: 11 | - postgresql 12 | 13 | addons: 14 | postgresql: '9.4' 15 | 16 | env: 17 | global: 18 | - TEST_WAREHOUSE_URL=postgres://postgres@localhost:5432/opentrials_warehouse_test 19 | 20 | install: 21 | - pip install tox 22 | - psql -c 'create database opentrials_warehouse_test;' -U postgres 23 | 24 | script: 25 | - make test 26 | -------------------------------------------------------------------------------- /collectors/fda_dap/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from .spider import Spider 9 | 10 | 11 | # Module API 12 | 13 | def collect(conf, conn): 14 | process = CrawlerProcess(conf['SCRAPY_SETTINGS']) 15 | process.crawl(Spider, conn=conn) 16 | process.start() 17 | -------------------------------------------------------------------------------- /collectors/pfizer/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from .spider import Spider 9 | 10 | 11 | # Module API 12 | 13 | def collect(conf, conn): 14 | process = CrawlerProcess(conf['SCRAPY_SETTINGS']) 15 | process.crawl(Spider, conn=conn) 16 | process.start() 17 | -------------------------------------------------------------------------------- /collectors/takeda/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from .spider import Spider 9 | 10 | 11 | # Module API 12 | 13 | def collect(conf, conn): 14 | process = CrawlerProcess(conf['SCRAPY_SETTINGS']) 15 | process.crawl(Spider, conn=conn) 16 | process.start() 17 | -------------------------------------------------------------------------------- /tests/test_isrctn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from importlib import import_module 8 | from collectors.isrctn.spider import _make_start_urls 9 | 10 | 11 | # Tests 12 | 13 | def test_make_start_urls(): 14 | result = _make_start_urls('prefix', '2016-01-01', '2016-01-15') 15 | print(result) 16 | assert result 17 | 
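The `collector.py` modules that follow share one shape: build a Scrapy `CrawlerProcess` from `conf['SCRAPY_SETTINGS']`, hand the warehouse connection to the spider, and block until the crawl finishes. A hedged sketch of driving one directly — it assumes `collectors/base/config.py` (not shown in this listing) defines `SCRAPY_SETTINGS` and `WAREHOUSE_URL`; the equivalent CLI call would be `make start actrn 2016-01-01 2016-02-01`:

```python
import dataset

from collectors.base import config, helpers
from collectors import actrn

# The same conf/conn wiring that collectors/base/cli.py performs.
conf = helpers.get_variables(config, str.isupper)
conn = {'warehouse': dataset.connect(config.WAREHOUSE_URL)}

# date_from/date_to are optional; omitting them implies a full scan.
actrn.collect(conf, conn, '2016-01-01', '2016-02-01')
```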
-------------------------------------------------------------------------------- /collectors/actrn/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from .spider import Spider 9 | 10 | 11 | # Module API 12 | 13 | def collect(conf, conn, date_from=None, date_to=None): 14 | process = CrawlerProcess(conf['SCRAPY_SETTINGS']) 15 | process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to) 16 | process.start() 17 | -------------------------------------------------------------------------------- /collectors/euctr/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from .spider import Spider 9 | 10 | 11 | # Module API 12 | 13 | def collect(conf, conn, date_from=None, date_to=None): 14 | process = CrawlerProcess(conf['SCRAPY_SETTINGS']) 15 | process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to) 16 | process.start() 17 | -------------------------------------------------------------------------------- /collectors/gsk/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from .spider import Spider 9 | 10 | 11 | # Module API 12 | 13 | def collect(conf, conn, date_from=None, date_to=None): 14 | process = CrawlerProcess(conf['SCRAPY_SETTINGS']) 15 | process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to) 16 | process.start() 17 | -------------------------------------------------------------------------------- /collectors/jprn/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.crawler import CrawlerProcess 8 | from .spider import Spider 9 | 10 | 11 | # Module API 12 | 13 | def collect(conf, conn, page_from=None, page_to=None): 14 | process = CrawlerProcess(conf['SCRAPY_SETTINGS']) 15 | process.crawl(Spider, conn=conn, page_from=page_from, page_to=page_to) 16 | process.start() 17 | -------------------------------------------------------------------------------- /tests/test_gsk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from importlib import import_module 8 | from collectors.gsk.spider import _make_start_urls 9 | 10 | 11 | # Tests 12 | 13 | def test_make_start_urls(): 14 | result = _make_start_urls( 15 | 'http://www.gsk-clinicalstudyregister.com/search', 16 | '2015-01-01', '2015-01-31') 17 | print(result) 18 | assert 
result
19 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 | 
3 | The project follows the [Open Knowledge International coding standards](https://github.com/okfn/coding-standards).
4 | 
5 | ## Getting Started
6 | 
7 | ```
8 | virtualenv .python -p python2
9 | source .python/bin/activate
10 | make install
11 | cp .env.example .env
12 | editor .env # set your values
13 | set -a; source .env
14 | ```
15 | 
16 | ## Testing
17 | 
18 | To run tests:
19 | 
20 | ```
21 | $ make test
22 | ```
23 | 
24 | ## Running
25 | 
26 | To run a collector:
27 | 
28 | ```
29 | $ make start <collector>
30 | ```
31 | 
--------------------------------------------------------------------------------
/collectors/isrctn/collector.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from scrapy.crawler import CrawlerProcess
8 | from .spider import Spider
9 | 
10 | 
11 | # Module API
12 | 
13 | def collect(conf, conn, date_from=None, date_to=None):
14 |     process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
15 |     process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
16 |     process.start()
17 | 
--------------------------------------------------------------------------------
/collectors/pubmed/collector.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from scrapy.crawler import CrawlerProcess
8 | from .spider import Spider
9 | 
10 | 
11 | # Module API
12 | 
13 | def collect(conf, conn, date_from=None, date_to=None):
14 |     process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
15 |     process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
16 |     process.start()
17 | 
--------------------------------------------------------------------------------
/collectors/ictrp/collector.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from scrapy.crawler import CrawlerProcess
8 | from .spider import Spider
9 | 
10 | 
11 | # Module API
12 | 
13 | def collect(conf, conn):
14 |     process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
15 |     process.crawl(Spider, conn=conn,
16 |                   http_user=conf['ICTRP_USER'],
17 |                   http_pass=conf['ICTRP_PASS'])
18 |     process.start()
19 | 
--------------------------------------------------------------------------------
/docs/warehouse.md:
--------------------------------------------------------------------------------
1 | # Warehouse
2 | 
3 | This document describes the OpenTrials `warehouse` database.
4 | 
5 | ### Basics
6 | 
7 | This database stores records collected from different sources.
8 | It is a denormalized data store.
9 | 
10 | ### Tables
11 | 
12 | Each table corresponds to a source, and its schema follows the structure of the data at its origin.
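For example, a record module for a hypothetical source `foo` would declare its table roughly as below — a sketch following the `collectors/*/record.py` modules elsewhere in this repository; `foo` and its fields are invented for illustration:

```python
from .. import base
from ..base.fields import Text, Date


# Lives at collectors/foo/record.py, so the relative imports resolve.
class Record(base.Record):

    table = 'foo'  # one warehouse table per source

    foo_id = Text(primary_key=True)  # the source's own identifier
    public_title = Text()
    last_updated = Date('%Y-%m-%d')  # cast from the source's date format
```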
13 | To see the schema of each table, please check the collector-specific docs
14 | [here](https://github.com/opentrials/collectors/tree/master/docs/collectors).
15 | 
16 | ### Technology
17 | 
18 | Database engine: `postgresql-9.4+`.
--------------------------------------------------------------------------------
/tests/test_euctr.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from importlib import import_module
8 | from collectors.euctr.spider import _make_start_urls
9 | 
10 | 
11 | # Tests
12 | 
13 | def test_make_start_urls():
14 |     result = _make_start_urls(
15 |         'https://www.clinicaltrialsregister.eu/ctr-search/search',
16 |         '2015-01-01', '2015-01-02')
17 |     print(result)
18 |     assert result
19 | 
--------------------------------------------------------------------------------
/collectors/cochrane_reviews/record.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from .. import base
8 | from ..base.fields import Text, Json
9 | 
10 | 
11 | class Record(base.Record):
12 |     table = 'cochrane_reviews'
13 | 
14 |     # Fields
15 | 
16 |     id = Text(primary_key=True)
17 |     study_id = Text()
18 |     file_name = Text()
19 |     study_type = Text()
20 |     doi_id = Text()
21 |     robs = Json()
22 |     refs = Json()
23 | 
--------------------------------------------------------------------------------
/collectors/icdcm/record.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from .. import base
8 | from ..base.fields import Text, Date, Array
9 | 
10 | 
11 | # Module API
12 | 
13 | class Record(base.Record):
14 | 
15 |     # Config
16 | 
17 |     table = 'icdcm'
18 | 
19 |     # General
20 | 
21 |     name = Text(primary_key=True)
22 |     desc = Text()
23 |     terms = Array()
24 |     version = Text()
25 |     last_updated = Date('%Y-%m-%d')
26 | 
--------------------------------------------------------------------------------
/migrations/versions/20160510091510_fda_rename_table_to_fdadl.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from alembic import op
8 | 
9 | 
10 | # revision identifiers, used by Alembic.
11 | revision = 'f38e14eac095' 12 | down_revision = u'9f367826f849' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade(): 18 | op.rename_table('fda', 'fdadl') 19 | 20 | 21 | def downgrade(): 22 | op.rename_table('fdadl', 'fda') 23 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Export: set -a; source .env 2 | # Dockerhost: ip route | awk '/docker0/ { print $NF }' 3 | PYTHON_ENV=development 4 | WAREHOUSE_URL=postgres://:@:5432/ 5 | TEST_WAREHOUSE_URL=postgres://:@:5432/ 6 | # LOGGING_URL='.papertrailapp.com:' # optional 7 | # DOWNLOAD_DELAY=1 # optional 8 | # ICTRP_USER='' # optional 9 | # ICTRP_PASS='' # optional 10 | # HRA_ENV='' # optional 11 | # HRA_URL='' # optional 12 | # HRA_USER='' # optional 13 | # HRA_PASS='' # optional 14 | # SENTRY_DSN='' # optional -------------------------------------------------------------------------------- /collectors/fdadl/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. import base 8 | from ..base.fields import Text, Date 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'fdadl' 18 | 19 | # General 20 | 21 | product_ndc = Text(primary_key=True) 22 | fda_application_number = Text() 23 | product_type = Text() 24 | generic_name = Text() 25 | brand_name = Text() 26 | last_updated = Date('%Y-%m-%d') 27 | -------------------------------------------------------------------------------- /migrations/script.py.mako: -------------------------------------------------------------------------------- 1 | ${"# -*- coding: utf-8 -*-"} 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | ${imports if imports else ""} 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = ${repr(up_revision)} 12 | down_revision = ${repr(down_revision)} 13 | branch_labels = ${repr(branch_labels)} 14 | depends_on = ${repr(depends_on)} 15 | 16 | 17 | def upgrade(): 18 | ${upgrades if upgrades else "pass"} 19 | 20 | 21 | def downgrade(): 22 | ${downgrades if downgrades else "pass"} 23 | -------------------------------------------------------------------------------- /collectors/icdpcs/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. 
import base 8 | from ..base.fields import Text, Date, Boolean 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'icdpcs' 18 | 19 | # General 20 | 21 | code = Text(primary_key=True) 22 | is_header = Boolean('0') 23 | short_description = Text() 24 | long_description = Text() 25 | version = Text() 26 | last_updated = Date('%Y-%m-%d') 27 | -------------------------------------------------------------------------------- /migrations/versions/20160901171321_add_results_url_to_gsk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = 'bf807df84277' 13 | down_revision = u'2d52470f8e49' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.add_column('gsk', sa.Column('results_url', sa.Text)) 20 | 21 | 22 | def downgrade(): 23 | op.drop_column('gsk', 'results_url') 24 | -------------------------------------------------------------------------------- /migrations/versions/20161102144050_add_trial_results_url_to_euctr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = 'f35805a0a00f' 13 | down_revision = u'84910d455f31' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.add_column('euctr', sa.Column('trial_results_url', sa.Text)) 20 | 21 | 22 | def downgrade(): 23 | op.drop_column('euctr', 'trial_results_url') 24 | -------------------------------------------------------------------------------- /migrations/versions/20161206150412_add_exempt_results_to_nct.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = '0087dc1eb534' 13 | down_revision = u'f35805a0a00f' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.add_column('nct', sa.Column('results_exemption_date', sa.Date)) 20 | 21 | 22 | def downgrade(): 23 | op.drop_column('nct', 'results_exemption_date') 24 | -------------------------------------------------------------------------------- /migrations/versions/20170215125221_add_registry_ids_to_pubmed.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic. 
12 | revision = 'fd0bb12971d2' 13 | down_revision = u'3dbb46f23ed7' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.add_column('pubmed', sa.Column('registry_ids', sa.dialects.postgresql.JSONB)) 20 | 21 | 22 | def downgrade(): 23 | op.drop_column('pubmed', 'registry_ids') 24 | -------------------------------------------------------------------------------- /migrations/versions/20160819163953_fdadl_add_fda_application_number.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = 'bc7470719f51' 13 | down_revision = u'23c55ccc0649' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.add_column('fdadl', sa.Column('fda_application_number', sa.Text)) 20 | 21 | 22 | def downgrade(): 23 | op.drop_column('fdadl', 'fda_application_number') 24 | -------------------------------------------------------------------------------- /migrations/versions/20160610145922_pubmed_add_mesh.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | from sqlalchemy.dialects.postgresql import JSONB 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '3a3b663824f1' 14 | down_revision = u'c4c0db99bb1c' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.add_column('pubmed', sa.Column('mesh_headings', JSONB)) 21 | 22 | 23 | def downgrade(): 24 | op.drop_column('pubmed', 'mesh_headings') 25 | -------------------------------------------------------------------------------- /migrations/versions/20170214191843_pubmed_rename_identifiers_list_to_article_ids.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic. 
11 | revision = '3dbb46f23ed7' 12 | down_revision = u'b32475938a2d' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade(): 18 | op.alter_column('pubmed', 'identifiers_list', new_column_name='article_ids') 19 | 20 | 21 | def downgrade(): 22 | op.alter_column('pubmed', 'article_ids', new_column_name='identifiers_list') 23 | -------------------------------------------------------------------------------- /tests/collectors/pubmed/test_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from collectors.pubmed.spider import _make_start_urls 8 | 9 | 10 | class TestPubmedSpider(object): 11 | def test_make_start_urls(self, betamax_session): 12 | result = _make_start_urls( 13 | 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi/', 14 | 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi/?db=pubmed&id={pmid}&retmode=xml', 15 | '2016-01-01', '2016-01-01', 16 | session=betamax_session) 17 | assert result 18 | -------------------------------------------------------------------------------- /migrations/versions/20160303155834_pfizer_takeda_add_pk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = 'b0f8a397edad' 12 | down_revision = u'7518ba857fea' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade(): 18 | op.create_primary_key('pfizer_pkey', 'pfizer', ['nct_id']) 19 | op.create_primary_key('takeda_pkey', 'takeda', ['takeda_trial_id']) 20 | 21 | 22 | def downgrade(): 23 | op.drop_constraint('pfizer_pkey', 'pfizer') 24 | op.drop_constraint('takeda_pkey', 'takeda') 25 | -------------------------------------------------------------------------------- /migrations/versions/20160406115944_ictrp_simplify_primary_key.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = '00d329f5f40a' 12 | down_revision = u'58d2189bc678' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade(): 18 | op.drop_constraint('ictrp_pkey', 'ictrp') 19 | op.create_primary_key('ictrp_pkey', 'ictrp', ['main_id']) 20 | 21 | 22 | def downgrade(): 23 | op.drop_constraint('ictrp_pkey', 'ictrp') 24 | op.create_primary_key('ictrp_pkey', 'ictrp', ['register', 'main_id']) 25 | -------------------------------------------------------------------------------- /migrations/versions/20160311153848_add_data_prefix_to_tables.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic. 
11 | revision = 'ec1ab5776710'
12 | down_revision = u'46d169ce43d2'
13 | branch_labels = None
14 | depends_on = None
15 | tables = ['actrn', 'euctr', 'gsk', 'ictrp', 'isrctn', 'jprn', 'nct', 'pfizer', 'takeda']
16 | 
17 | 
18 | def upgrade():
19 |     for table in tables:
20 |         op.rename_table(table, 'data_'+table)
21 | 
22 | 
23 | def downgrade():
24 |     for table in tables:
25 |         op.rename_table('data_'+table, table)
26 | 
--------------------------------------------------------------------------------
/migrations/versions/20160323090938_remove_data_prefix_from_tables.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from alembic import op
8 | 
9 | 
10 | # revision identifiers, used by Alembic.
11 | revision = '89c87deb5a02'
12 | down_revision = u'ec1ab5776710'
13 | branch_labels = None
14 | depends_on = None
15 | tables = ['actrn', 'euctr', 'gsk', 'ictrp', 'isrctn', 'jprn', 'nct', 'pfizer', 'takeda']
16 | 
17 | 
18 | def upgrade():
19 |     for table in tables:
20 |         op.rename_table('data_'+table, table)
21 | 
22 | 
23 | def downgrade():
24 |     for table in tables:
25 |         op.rename_table(table, 'data_'+table)
26 | 
--------------------------------------------------------------------------------
/collectors/fda_dap/record.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from .. import base
8 | from ..base.fields import Text, Date, Integer, Json
9 | 
10 | 
11 | # Module API
12 | 
13 | class Record(base.Record):
14 | 
15 |     # Config
16 | 
17 |     table = 'fda_dap'
18 | 
19 |     # General
20 | 
21 |     id = Text(primary_key=True)
22 |     drug_name = Text()
23 |     active_ingredients = Text()
24 |     company = Text()
25 |     fda_application_num = Text()
26 |     supplement_number = Integer()
27 |     action_date = Date('%m/%d/%Y')
28 |     approval_type = Text()
29 |     notes = Text()
30 |     documents = Json()
31 | 
--------------------------------------------------------------------------------
/migrations/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | import os
8 | import logging
9 | from logging.handlers import SysLogHandler
10 | from dotenv import load_dotenv
11 | load_dotenv('.env')
12 | 
13 | 
14 | # Storage
15 | 
16 | WAREHOUSE_URL = os.environ['WAREHOUSE_URL']
17 | 
18 | # Logging
19 | 
20 | # LOGGING_URL is optional (see .env.example), so don't fail when it's unset
21 | LOGGING_URL = os.environ.get('LOGGING_URL')
22 | logging.basicConfig(level=logging.DEBUG)
23 | if LOGGING_URL:
24 |     root_logger = logging.getLogger()
25 |     host, port = LOGGING_URL.split(':')
26 |     syslog_handler = SysLogHandler(address=(host, int(port)))
27 |     syslog_handler.setLevel(logging.INFO)
28 |     root_logger.addHandler(syslog_handler)
--------------------------------------------------------------------------------
/migrations/versions/20160408164205_create_meta_id_indexes.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from alembic import op
8 | 
9 | 
10 | # revision identifiers, used by Alembic.
11 | revision = '014fd3f703aa'
12 | down_revision = u'00d329f5f40a'
13 | branch_labels = None
14 | depends_on = None
15 | tables = ['actrn', 'euctr', 'gsk', 'ictrp', 'isrctn', 'jprn', 'nct', 'pfizer', 'takeda']
16 | 
17 | 
18 | def upgrade():
19 |     for table in tables:
20 |         op.create_unique_constraint('%s_meta_id_unique' % table, table, ['meta_id'])
21 | 
22 | 
23 | def downgrade():
24 |     for table in tables:
25 |         op.drop_constraint('%s_meta_id_unique' % table, table)
26 | 
--------------------------------------------------------------------------------
/collectors/base/cli.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | import sys
8 | import dataset
9 | import logging
10 | import importlib
11 | from . import config
12 | from . import helpers
13 | logger = logging.getLogger(__name__)
14 | 
15 | 
16 | # Module API
17 | 
18 | def cli(argv):
19 |     # Prepare conf dict
20 |     conf = helpers.get_variables(config, str.isupper)
21 | 
22 |     # Prepare conn dict
23 |     conn = {
24 |         'warehouse': dataset.connect(config.WAREHOUSE_URL),
25 |     }
26 | 
27 |     # Get and call collector
28 |     collect = importlib.import_module('collectors.%s' % argv[1]).collect
29 |     collect(conf, conn, *argv[2:])
30 | 
31 | 
32 | if __name__ == '__main__':
33 |     cli(sys.argv)
34 | 
--------------------------------------------------------------------------------
/collectors/pfizer/record.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from .. import base
8 | from ..base.fields import Text, Date, Boolean
9 | 
10 | 
11 | # Module API
12 | 
13 | class Record(base.Record):
14 | 
15 |     # Config
16 | 
17 |     table = 'pfizer'
18 | 
19 |     # General
20 | 
21 |     nct_id = Text(primary_key=True)
22 |     title = Text()
23 | 
24 |     # Description
25 | 
26 |     study_type = Text()
27 |     organization_id = Text()
28 |     status = Text()
29 |     study_start_date = Date('%B, %Y')
30 |     study_end_date = Date('%B, %Y')
31 | 
32 |     # Eligibility
33 | 
34 |     eligibility_criteria = Text()
35 |     gender = Text()
36 |     age_range = Text()
37 |     healthy_volunteers_allowed = Boolean('Accepts Healthy Volunteers')
38 | 
--------------------------------------------------------------------------------
/collectors/base/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | import dataset
8 | from . import config
9 | from . import helpers
10 | 
11 | 
12 | # Module API
13 | 
14 | class Warehouse(object):
15 | 
16 |     # Public
17 | 
18 |     def open_spider(self, spider):
19 |         if spider.conf and spider.conn:
20 |             self.__conf = spider.conf
21 |             self.__conn = spider.conn
22 |         else:
23 |             # For runs triggered by scrapy CLI utility
24 |             self.__conf = helpers.get_variables(config, str.isupper)
25 |             self.__conn = {'warehouse': dataset.connect(config.WAREHOUSE_URL)}
26 | 
27 |     def process_item(self, record, spider):
28 |         record.write(self.__conf, self.__conn)
29 |         return record
30 | 
--------------------------------------------------------------------------------
/migrations/versions/20160525192212_euctr_fix_column_names.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from alembic import op
8 | 
9 | 
10 | # revision identifiers, used by Alembic.
11 | revision = '6d709931cc58'
12 | down_revision = u'c83c754cc04e'
13 | branch_labels = None
14 | depends_on = None
15 | 
16 | MAPPING = {
17 |     'ethics_committee_opinion_reason_s_for_unfavourable_opinion': 'ethics_committee_opinion_reasons_for_unfavourable_opinion',
18 | }
19 | 
20 | 
21 | def upgrade():
22 |     for key, value in MAPPING.items():
23 |         op.alter_column('euctr', column_name=value, new_column_name=key)
24 | 
25 | 
26 | def downgrade():
27 |     for key, value in MAPPING.items():
28 |         op.alter_column('euctr', column_name=key, new_column_name=value)
29 | 
--------------------------------------------------------------------------------
/migrations/versions/20160525134303_takeda_fix_column_names.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from alembic import op
8 | 
9 | 
10 | # revision identifiers, used by Alembic.
11 | revision = 'c83c754cc04e'
12 | down_revision = u'59e2335b3d41'
13 | branch_labels = None
14 | depends_on = None
15 | 
16 | MAPPING = {
17 |     'enrollment_number_of_participants': 'enrollmentnumber_of_participants',
18 |     'trial_arms_groups_or_cohorts': 'trial_armsgroups_or_cohorts',
19 | }
20 | 
21 | 
22 | def upgrade():
23 |     for key, value in MAPPING.items():
24 |         op.alter_column('takeda', column_name=value, new_column_name=key)
25 | 
26 | 
27 | def downgrade():
28 |     for key, value in MAPPING.items():
29 |         op.alter_column('takeda', column_name=key, new_column_name=value)
30 | 
--------------------------------------------------------------------------------
/migrations/versions/20160525133746_isrctn_fix_column_names.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import
3 | from __future__ import division
4 | from __future__ import print_function
5 | from __future__ import unicode_literals
6 | 
7 | from alembic import op
8 | 
9 | 
10 | # revision identifiers, used by Alembic.
11 | revision = '59e2335b3d41' 12 | down_revision = u'f736bb9d2499' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | MAPPING = { 17 | 'prospective_retrospective': 'prospectiveretrospective', 18 | 'protocol_serial_number': 'protocolserial_number', 19 | 'clinicaltrials_gov_number': 'clinicaltrialsgov_number', 20 | } 21 | 22 | 23 | def upgrade(): 24 | for key, value in MAPPING.items(): 25 | op.alter_column('isrctn', column_name=value, new_column_name=key) 26 | 27 | 28 | def downgrade(): 29 | for key, value in MAPPING.items(): 30 | op.alter_column('isrctn', column_name=key, new_column_name=value) 31 | -------------------------------------------------------------------------------- /tests/collectors/base/test_fields.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import datetime 9 | import pytest 10 | import collectors.base.fields as fields 11 | 12 | 13 | class TestFields(object): 14 | def test_date_accepts_single_format(self): 15 | date = fields.Date('%Y-%m') 16 | 17 | assert date.parse('2017-01') == datetime.date(2017, 1, 1) 18 | 19 | def test_date_accepts_multiple_formats(self): 20 | date = fields.Date(['%Y-%m', '%Y-%m-%d']) 21 | 22 | assert date.parse('2017-01') == datetime.date(2017, 1, 1) 23 | assert date.parse('2017-01-01') == datetime.date(2017, 1, 1) 24 | 25 | def test_date_raises_if_date_is_in_wrong_format(self): 26 | date = fields.Date('%Y-%m') 27 | with pytest.raises(ValueError): 28 | date.parse('2017-01-01') 29 | -------------------------------------------------------------------------------- /migrations/versions/20160311151047_update_meta_identifier.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic. 
11 | revision = '46d169ce43d2' 12 | down_revision = u'b0f8a397edad' 13 | branch_labels = None 14 | depends_on = None 15 | tables = ['actrn', 'euctr', 'gsk', 'ictrp', 'isrctn', 'jprn', 'nct', 'pfizer', 'takeda'] 16 | 17 | 18 | def upgrade(): 19 | for table in tables: 20 | op.alter_column(table, 'meta_uuid', new_column_name='meta_id') 21 | op.execute('ALTER TABLE %s ALTER COLUMN meta_id TYPE uuid USING meta_id::uuid' % table) 22 | 23 | 24 | def downgrade(): 25 | for table in tables: 26 | op.execute('ALTER TABLE %s ALTER COLUMN meta_id TYPE text USING meta_id::text' % table) 27 | op.alter_column(table, 'meta_id', new_column_name='meta_uuid') 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # pyenv 60 | .python-version 61 | 62 | # dotenv 63 | .env 64 | -------------------------------------------------------------------------------- /migrations/versions/20160831125422_add_drug_name_active_ingredients_and_company_to_fda_dap.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = '2d52470f8e49' 13 | down_revision = u'bc7470719f51' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | with op.batch_alter_table('fda_dap') as batch_op: 20 | batch_op.add_column(sa.Column('drug_name', sa.Text)) 21 | batch_op.add_column(sa.Column('active_ingredients', sa.Text)) 22 | batch_op.add_column(sa.Column('company', sa.Text)) 23 | 24 | 25 | def downgrade(): 26 | with op.batch_alter_table('fda_dap') as batch_op: 27 | batch_op.drop_column('drug_name') 28 | batch_op.drop_column('active_ingredients') 29 | batch_op.drop_column('company') 30 | -------------------------------------------------------------------------------- /docs/collectors/actrn.md: -------------------------------------------------------------------------------- 1 | # ACTRN 2 | 3 | http://www.anzctr.org.au/ 4 | 5 | The ANZCTR is an online registry of clinical trials being 6 | undertaken in Australia, New Zealand and elsewhere. 7 | 8 | ## Source Data Model 9 | 10 | Data could be accessed thru the web interface. 
11 | Example - https://www.anzctr.org.au/Trial/Registration/TrialReview.aspx?id=369698&isReview=true.
12 | Data is moved to the warehouse as-is, with additional type casting. 13 | See the next section for more details. 14 |
15 | For more information - http://www.anzctr.org.au/docs/ANZCTR%20Data%20field%20explanation.pdf 16 | 17 | ## Warehouse Data Model 18 |
19 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/actrn/record.py) 20 | for the full data model. 21 |
22 | ## Primary Identifiers 23 | 24 | Trial identifier: `trial_id` 25 | 26 | ## Data Update Strategy 27 |
28 | The web interface and source data model don't have anything like an 29 | `updated` field, so a full scan is needed to stay up to date. 30 |
31 | ## License Terms 32 | 33 | http://www.anzctr.org.au/Support/Terms.aspx 34 | --------------------------------------------------------------------------------
/docs/collectors/takeda.md: -------------------------------------------------------------------------------- 1 | # Takeda 2 | 3 | http://www.takedaclinicaltrials.com/ 4 |
5 | This website is designed to advance Takeda's commitment to the health of patients and the science of medicine by providing greater access to information on Takeda's clinical trials while safeguarding patients' confidentiality. 6 | 7 | ## Source Data Model 8 |
9 | Data can be accessed through the web interface. 10 | Example - http://www.takedaclinicaltrials.com/browse/summary/01-00-TL-OPI-501#overview.
11 | Data is moved to the warehouse as-is, with additional type casting. 12 | See the next section for more details. 13 | 14 | ## Warehouse Data Model 15 |
16 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/takeda/record.py) 17 | for the full data model. 18 |
19 | ## Primary Identifiers 20 | 21 | Trial identifier: `takeda_trial_id` 22 | 23 | ## Data Update Strategy 24 |
25 | The web interface and source data model don't have anything like an 26 | `updated` field, so a full scan is needed to stay up to date. 27 |
28 | ## License Terms 29 | 30 | http://www.takedaclinicaltrials.com/legal/terms 31 | --------------------------------------------------------------------------------
/migrations/versions/20160224180815_trials_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import ARRAY, UUID 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic.
13 | revision = 'c2ae4513dd2b' 14 | down_revision = u'9833dacb0b30' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.create_table('trials', 21 | sa.Column('uuid', UUID, primary_key=True), 22 | sa.Column('updated', sa.DateTime(timezone=True), nullable=False), 23 | sa.Column('records', ARRAY(sa.Text), nullable=False, unique=True), 24 | sa.Column('nct_id', sa.Text, unique=True), 25 | sa.Column('euctr_id', sa.Text, unique=True), 26 | sa.Column('isrctn_id', sa.Text, unique=True), 27 | sa.Column('scientific_title', sa.Text, unique=True), 28 | ) 29 | 30 | 31 | def downgrade(): 32 | op.drop_table('trials') 33 | -------------------------------------------------------------------------------- /migrations/versions/20160323145124_trials_remove_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import ARRAY, UUID 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '58d2189bc678' 14 | down_revision = u'89c87deb5a02' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.drop_table('trials') 21 | 22 | 23 | def downgrade(): 24 | op.create_table('trials', 25 | sa.Column('uuid', UUID, primary_key=True), 26 | sa.Column('updated', sa.DateTime(timezone=True), nullable=False), 27 | sa.Column('records', ARRAY(sa.Text), nullable=False, unique=True), 28 | sa.Column('nct_id', sa.Text, unique=True), 29 | sa.Column('euctr_id', sa.Text, unique=True), 30 | sa.Column('isrctn_id', sa.Text, unique=True), 31 | sa.Column('scientific_title', sa.Text, unique=True), 32 | ) 33 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 OKFN 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 
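# For example (illustrative, not from this file): a revision slugged 'create_foo_table'
# generated on 2016-05-10 09:15:10 would be written by the file_template below as
# migrations/versions/20160510091510_create_foo_table.py.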
2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = migrations 6 | 7 | # template used to generate migration files
8 | file_template = %%(year)d%%(month).2d%%(day).2d%%(hour).2d%%(minute).2d%%(second).2d_%%(slug)s 9 | 10 | # max length of characters to apply to the
11 | # "slug" field 12 | #truncate_slug_length = 40 13 | 14 | # set to 'true' to run the environment during
15 | # the 'revision' command, regardless of autogenerate 16 | # revision_environment = false 17 | 18 | # set to 'true' to allow .pyc and .pyo files without
19 | # a source .py file to be detected as revisions in the 20 | # versions/ directory 21 | # sourceless = false 22 |
23 | # version location specification; this defaults 24 | # to migrations/versions. When using multiple version
25 | # directories, initial revisions must be specified with --version-path 26 | # version_locations = %(here)s/bar %(here)s/bat migrations/versions 27 |
28 | # the output encoding used when revision files 29 | # are written from script.py.mako 30 | # output_encoding = utf-8
31 | sqlalchemy.url = driver://user:pass@localhost/dbname 32 | --------------------------------------------------------------------------------
/docs/collectors/pfizer.md: -------------------------------------------------------------------------------- 1 | # Pfizer 2 |
3 | http://www.pfizer.com/research/clinical_trials 4 |
5 | Pfizer works to discover and develop innovative, safe, and effective ways to prevent or treat some of the world’s most challenging diseases. We are committed to the safety of patients who take part in our trials, and uphold the highest ethical standards in all of our research initiatives. 6 | 7 | ## Source Data Model 8 |
9 | Data can be accessed through the web interface. 10 | Example - http://www.pfizer.com/research/clinical_trials/find_a_trial/NCT00795938.
11 | Data is moved to the warehouse as-is, with additional type casting. 12 | See the next section for more details. 13 | 14 | ## Warehouse Data Model 15 |
16 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/pfizer/record.py) 17 | for the full data model. 18 |
19 | ## Primary Identifiers 20 | 21 | Trial identifier: `nct_id` 22 | 23 | ## Data Update Strategy 24 |
25 | The web interface and source data model don't have anything like an 26 | `updated` field, so a full scan is needed to stay up to date. 27 |
28 | ## License Terms 29 | 30 | http://www.pfizer.com/general/terms 31 | --------------------------------------------------------------------------------
/docs/collectors/pubmed.md: -------------------------------------------------------------------------------- 1 | # Pubmed 2 | 3 | http://www.ncbi.nlm.nih.gov/pubmed 4 |
5 | PubMed comprises more than 26 million citations for biomedical literature from MEDLINE, life science journals, and online books. Citations may include links to full-text content from PubMed Central and publisher web sites. 6 | 7 | ## Source Data Model 8 |
9 | Data can be accessed via [E-Utilities](http://www.ncbi.nlm.nih.gov/books/NBK25497/).
10 | Data model of publication - https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html.
11 | Data is moved to the warehouse as-is, with additional type casting. 12 | See the next section for more details. 13 | 14 | ## Warehouse Data Model 15 |
16 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/pubmed/record.py) 17 | for the full data model.
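To make the E-Utilities access above concrete, here is a minimal sketch (not part of the collector; the endpoint and parameters are standard E-Utilities, while the search term is illustrative) of finding recently modified records, matching the update strategy described below:

```python
import requests

# Illustrative sketch: ask E-Utilities for PubMed records modified
# in the last 2 days (see the Data Update Strategy section below).
ESEARCH = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
params = {
    'db': 'pubmed',
    'term': 'clinical trial[pt]',  # made-up example term
    'datetype': 'mdat',            # filter on modification date
    'reldate': 2,                  # last 2 days
    'retmax': 100,
    'retmode': 'json',
}
pmids = requests.get(ESEARCH, params=params).json()['esearchresult']['idlist']
```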
18 | 19 | ## Primary Identifiers 20 | 21 | Trial identifier: `pmid` 22 | 23 | ## Data Update Strategy 24 |
25 | The most recently modified data can be searched. 26 | After the initial scrape, we should search the last 2 days' modifications
27 | to stay up to date. 28 | 29 | ## License Terms 30 | 31 | http://www.ncbi.nlm.nih.gov/home/about/policies.shtml 32 | --------------------------------------------------------------------------------
/migrations/versions/20160220212552_nct_fix_boolean_columns.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = '9833dacb0b30' 12 | down_revision = u'820db6031f39'
13 | branch_labels = None 14 | depends_on = None 15 | 16 | 17 | def upgrade():
18 | op.execute('ALTER TABLE nct ALTER COLUMN is_fda_regulated TYPE boolean USING is_fda_regulated::boolean')
19 | op.execute('ALTER TABLE nct ALTER COLUMN is_section_801 TYPE boolean USING is_section_801::boolean')
20 | op.execute('ALTER TABLE nct ALTER COLUMN has_expanded_access TYPE boolean USING has_expanded_access::boolean') 21 | 22 | 23 | def downgrade():
24 | op.execute('ALTER TABLE nct ALTER COLUMN is_fda_regulated TYPE text USING is_fda_regulated::text')
25 | op.execute('ALTER TABLE nct ALTER COLUMN is_section_801 TYPE text USING is_section_801::text')
26 | op.execute('ALTER TABLE nct ALTER COLUMN has_expanded_access TYPE text USING has_expanded_access::text') 27 | --------------------------------------------------------------------------------
/migrations/versions/20160510000353_fda_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | import sqlalchemy as sa 8 | from alembic import op 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = '9f367826f849'
13 | down_revision = u'6a990542e4b4' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.create_table('fda', 20 | 21 | # Meta 22 |
23 | sa.Column('meta_id', sa.Text, unique=True), 24 | sa.Column('meta_source', sa.Text), 25 | sa.Column('meta_created', sa.DateTime(timezone=True)),
26 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 27 | 28 | # General 29 |
30 | sa.Column('product_ndc', sa.Text, primary_key=True), 31 | sa.Column('product_type', sa.Text), 32 | sa.Column('generic_name', sa.Text),
33 | sa.Column('brand_name', sa.Text), 34 | sa.Column('last_updated', sa.Date), 35 | 36 | ) 37 | 38 | 39 | def downgrade(): 40 | op.drop_table('fda') 41 | --------------------------------------------------------------------------------
/migrations/versions/20160525132926_gsk_fix_column_names.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic.
11 | revision = 'f736bb9d2499' 12 | down_revision = u'e77e7eaf0a34' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | MAPPING = {
17 | 'clinicaltrials_gov_identifier': 'clinicaltrialsgov_identifier', 18 | 'ind_ide_protocol': 'indide_protocol', 19 | 'ind_ide_grantor': 'indide_grantor',
20 | 'ind_ide_number': 'indide_number', 21 | 'ind_ide_serial_number': 'indide_serial_number',
22 | 'responsible_party_name_official_title': 'responsible_party_nameofficial_title', 23 | 'trade_name_product_name': 'trade_name__product_name', 24 | } 25 | 26 |
27 | def upgrade(): 28 | for key, value in MAPPING.items(): 29 | op.alter_column('gsk', column_name=value, new_column_name=key) 30 | 31 |
32 | def downgrade(): 33 | for key, value in MAPPING.items(): 34 | op.alter_column('gsk', column_name=key, new_column_name=value) 35 | --------------------------------------------------------------------------------
/docs/collectors/isrctn.md: -------------------------------------------------------------------------------- 1 | # ISRCTN 2 | 3 | http://www.isrctn.com/ 4 |
5 | The ISRCTN registry is a primary clinical trial registry recognised by WHO and ICMJE that accepts all clinical research studies (whether proposed, ongoing or completed), providing content validation and curation and the unique identification number necessary for publication. All study records in the database are freely accessible and searchable. 6 | 7 | ## Source Data Model 8 |
9 | Data can be accessed through the web interface. 10 | Example - http://www.isrctn.com/ISRCTN13619480.
11 | Data is moved to the warehouse as-is, with additional type casting. 12 | See the next section for more details. 13 | 14 | ## Warehouse Data Model 15 |
16 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/isrctn/record.py) 17 | for the full data model. 18 |
19 | ## Primary Identifiers 20 | 21 | Trial identifier: `isrctn_id` 22 | 23 | ## Data Update Strategy 24 |
25 | Trials can be searched with a `last_edited` filter. 26 | After the initial scrape, we should search the last 2 days' updates
27 | to stay up to date (the `recent` stack). 28 | 29 | ## License Terms 30 | 31 | http://www.isrctn.com/page/terms 32 | --------------------------------------------------------------------------------
/migrations/versions/20160509115712_icdcm_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import ARRAY 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic.
13 | revision = 'be9dfe290c44' 14 | down_revision = u'b720671a8c0f' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.create_table('icdcm', 21 | 22 | # Meta 23 | 24 | sa.Column('meta_id', sa.Text, unique=True), 25 | sa.Column('meta_source', sa.Text), 26 | sa.Column('meta_created', sa.DateTime(timezone=True)), 27 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 28 | 29 | # General 30 | 31 | sa.Column('name', sa.Text, primary_key=True), 32 | sa.Column('desc', sa.Text), 33 | sa.Column('terms', ARRAY(sa.Text)), 34 | sa.Column('version', sa.Text), 35 | sa.Column('last_updated', sa.Date), 36 | 37 | ) 38 | 39 | 40 | def downgrade(): 41 | op.drop_table('icdcm') 42 | -------------------------------------------------------------------------------- /migrations/versions/20160509133714_icdpcs_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from alembic import op 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = '6a990542e4b4' 13 | down_revision = u'be9dfe290c44' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.create_table('icdpcs', 20 | 21 | # Meta 22 | 23 | sa.Column('meta_id', sa.Text, unique=True), 24 | sa.Column('meta_source', sa.Text), 25 | sa.Column('meta_created', sa.DateTime(timezone=True)), 26 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 27 | 28 | # General 29 | 30 | sa.Column('code', sa.Text, primary_key=True), 31 | sa.Column('is_header', sa.Boolean), 32 | sa.Column('short_description', sa.Text), 33 | sa.Column('long_description', sa.Text), 34 | sa.Column('version', sa.Text), 35 | sa.Column('last_updated', sa.Date), 36 | 37 | ) 38 | 39 | 40 | def downgrade(): 41 | op.drop_table('icdpcs') 42 | -------------------------------------------------------------------------------- /collectors/pfizer/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.spiders import Rule 8 | from scrapy.spiders import CrawlSpider 9 | from scrapy.linkextractors import LinkExtractor 10 | from .parser import parse_record 11 | 12 | 13 | # Module API 14 | 15 | class Spider(CrawlSpider): 16 | 17 | # Public 18 | 19 | name = 'pfizer' 20 | allowed_domains = ['pfizer.com'] 21 | 22 | def __init__(self, conf=None, conn=None): 23 | 24 | # Save conf/conn 25 | self.conf = conf 26 | self.conn = conn 27 | 28 | # Make urls 29 | self.start_urls = [ 30 | 'http://www.pfizer.com/research/clinical_trials/find_a_trial?recr=0', 31 | ] 32 | 33 | # Make rules 34 | self.rules = [ 35 | Rule(LinkExtractor( 36 | allow=r'find_a_trial/NCT\d+', 37 | ), callback=parse_record), 38 | Rule(LinkExtractor( 39 | allow=r'page=\d+', 40 | )), 41 | ] 42 | 43 | # Inherit parent 44 | super(Spider, self).__init__() 45 | -------------------------------------------------------------------------------- /collectors/takeda/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from 
__future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from scrapy.spiders import Rule 8 | from scrapy.spiders import CrawlSpider
9 | from scrapy.linkextractors import LinkExtractor 10 | from .parser import parse_record 11 | 12 | 13 | # Module API 14 | 15 | class Spider(CrawlSpider): 16 |
17 | # Public 18 | 19 | name = 'takeda' 20 | allowed_domains = ['takedaclinicaltrials.com'] 21 | 22 | def __init__(self, conf=None, conn=None): 23 |
24 | # Save conf/conn 25 | self.conf = conf 26 | self.conn = conn 27 | 28 | # Make urls 29 | self.start_urls = [
30 | 'http://www.takedaclinicaltrials.com/browse/?protocol_id=', 31 | ] 32 | 33 | # Make rules 34 | self.rules = [ 35 | Rule(LinkExtractor(
36 | allow=r'browse/summary/', 37 | ), callback=parse_record), 38 | Rule(LinkExtractor( 39 | allow=r'browse', 40 | )), 41 | ] 42 |
43 | # Inherit parent 44 | super(Spider, self).__init__() 45 | --------------------------------------------------------------------------------
/docs/collectors/ictrp.md: -------------------------------------------------------------------------------- 1 | # ICTRP 2 | 3 | http://apps.who.int/trialsearch/Default.aspx 4 |
5 | The Clinical Trials Search Portal provides access to a central database containing the trial registration data sets provided by the registries listed on the right. It also provides links to the full original records. 6 | 7 | ## Source Data Model 8 |
9 | Data can be accessed through the web interface (HTTP basic auth is required for crawling).
10 | Example - http://apps.who.int/trialsearch/Trial3.aspx?trialid=NCT00399620.
11 | Data is moved to the warehouse as-is, with additional type casting. 12 | See the next section for more details. 13 | 14 | ## Warehouse Data Model 15 |
16 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/ictrp/record.py) 17 | for the full data model. 18 |
19 | ## Primary Identifiers 20 | 21 | Trial identifier: `main_id` 22 | 23 | ## Data Update Strategy 24 |
25 | The web interface and source data model don't have anything like an 26 | `updated` field, so a full scan is needed to stay up to date. 27 |
28 | Proposed solution: add a crawling algorithm based on the `main_id` intervals shown on 29 | the index page. 30 |
31 | ## License Terms 32 | 33 | http://www.who.int/about/copyright/en/ 34 | --------------------------------------------------------------------------------
/migrations/versions/20170123144318_default_for_meta_created_and_meta_updated.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic.
12 | revision = '542425c4e70b' 13 | down_revision = u'0087dc1eb534' 14 | branch_labels = None 15 | depends_on = None 16 |
17 | updatable_tables = ['actrn', 'cochrane_reviews', 'euctr', 'fda_dap', 'fdadl', 'gsk',
18 | 'hra', 'icdcm', 'icdpcs', 'ictrp', 'isrctn', 'jprn', 'nct', 'pfizer', 'pubmed', 'takeda'] 19 | 20 | 21 | def upgrade():
22 | for table in updatable_tables: 23 | op.alter_column(table, 'meta_created', nullable=False,
24 | server_default=sa.func.current_timestamp()) 25 | op.alter_column(table, 'meta_updated', nullable=False,
26 | server_default=sa.func.current_timestamp()) 27 | 28 | 29 | def downgrade(): 30 | for table in updatable_tables:
31 | op.alter_column(table, 'meta_created', nullable=True, server_default=None)
32 | op.alter_column(table, 'meta_updated', nullable=True, server_default=None) 33 | --------------------------------------------------------------------------------
/migrations/versions/20161007222818_create_cochrane_reviews_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import UUID, JSONB 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic.
13 | revision = '84910d455f31' 14 | down_revision = u'bf807df84277' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade():
20 | op.create_table('cochrane_reviews', 21 | sa.Column('meta_id', sa.Text),
22 | sa.Column('meta_created', sa.DateTime(timezone=True), server_default=sa.text('now()')),
23 | sa.Column('meta_updated', sa.DateTime(timezone=True), server_default=sa.text('now()')), 24 | sa.Column('meta_source', sa.Text), 25 |
26 | sa.Column('id', UUID, primary_key=True), 27 | sa.Column('study_type', sa.Text), 28 | sa.Column('file_name', sa.Text), 29 | sa.Column('robs', JSONB),
30 | sa.Column('study_id', sa.Text), 31 | sa.Column('refs', JSONB), 32 | sa.Column('doi_id', sa.Text), 33 | ) 34 | 35 | 36 | def downgrade():
37 | op.drop_table('cochrane_reviews') 38 | --------------------------------------------------------------------------------
/collectors/pubmed/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | from ..
import base 8 | from ..base.fields import Text, Date, Json, Array 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 |
17 | table = 'pubmed' 18 | 19 | # Medline 20 | 21 | pmid = Text(primary_key=True) 22 | date_created = Date('%Y-%m-%d') 23 | date_completed = Date('%Y-%m-%d')
24 | date_revised = Date('%Y-%m-%d') 25 | country = Text() 26 | medline_ta = Text() 27 | nlm_unique_id = Text() 28 | issn_linking = Text() 29 | mesh_headings = Json() 30 |
31 | # Journal 32 | 33 | journal_issn = Text() 34 | journal_title = Text() 35 | journal_iso = Text() 36 | 37 | # Article 38 | 39 | article_title = Text()
40 | article_abstract = Text() 41 | article_authors = Array() 42 | article_language = Text() 43 | article_publication_type_list = Array()
44 | article_vernacular_title = Text() 45 | article_date = Date('%Y-%m-%d') 46 | 47 | # Pubmed 48 | 49 | publication_status = Text() 50 | article_ids = Json()
51 | registry_ids = Json() 52 | --------------------------------------------------------------------------------
/docs/collectors/nct.md: -------------------------------------------------------------------------------- 1 | # NCT 2 | 3 | https://clinicaltrials.gov/ 4 |
5 | ClinicalTrials.gov is a registry and results database of publicly and privately supported clinical studies of human participants conducted around the world. 6 | 7 | ## Source Data Model 8 |
9 | Analysis of NCT data model: 10 | - copy the text from `https://www.clinicaltrials.gov/ct2/html/images/info/public.xsd`
11 | - paste the text into `http://xmlgrid.net/` and click `Submit` 12 | - now you can explore the whole data model, data types, etc. 13 |
14 | > Only around 10% of studies have a `clinical_results` section - https://www.clinicaltrials.gov/ct2/help/how-find/find-study-results 15 | 16 | --- 17 |
18 | ![](https://cloud.githubusercontent.com/assets/557395/10075868/d77548fe-62e0-11e5-84e0-c81ec6badcfe.png) 19 | 20 | ## Warehouse Data Model 21 |
22 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/nct/record.py) 23 | for the full data model. 24 |
25 | ## Primary Identifiers 26 | 27 | Trial identifier: `nct_id` 28 | 29 | ## Data Update Strategy 30 |
31 | Trials can be searched with a `lastchanges_date` filter. 32 | After the initial scrape, we should search the last 2 days' updates
33 | to stay up to date (the `recent` stack). 34 | 35 | ## License Terms 36 | 37 | https://clinicaltrials.gov/ct2/about-site/terms-conditions 38 | --------------------------------------------------------------------------------
/migrations/versions/20160725130032_fda_dap_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import JSONB 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic.
13 | revision = '23c55ccc0649' 14 | down_revision = u'3a3b663824f1' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade():
20 | op.create_table('fda_dap', 21 | 22 | # Meta 23 | 24 | sa.Column('meta_id', sa.Text, unique=True), 25 | sa.Column('meta_source', sa.Text),
26 | sa.Column('meta_created', sa.DateTime(timezone=True)), 27 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 28 | 29 | # General 30 |
31 | sa.Column('id', sa.Text, unique=True), 32 | sa.Column('documents', JSONB), 33 | sa.Column('approval_type', sa.Text),
34 | sa.Column('supplement_number', sa.Integer), 35 | sa.Column('action_date', sa.Date), 36 | sa.Column('fda_application_num', sa.Text),
37 | sa.Column('notes', sa.Text), 38 | 39 | ) 40 | 41 | 42 | def downgrade(): 43 | op.drop_table('fda_dap') 44 | --------------------------------------------------------------------------------
/docs/collectors/gsk.md: -------------------------------------------------------------------------------- 1 | # GSK 2 | 3 | http://www.gsk-clinicalstudyregister.com/ 4 |
5 | The GlaxoSmithKline (GSK) Clinical Study Register provides an easily accessible repository of data from GSK-Sponsored Clinical Studies, supplementing communication in journals, at scientific meetings, in letters to healthcare professionals, and in approved prescribing information. It is important to emphasise that approved prescribing information must continue to guide appropriate use of GSK medicines. This information may vary from country to country. 6 | 7 | ## Source Data Model 8 |
9 | Data can be accessed through the web interface. 10 | Example - http://www.gsk-clinicalstudyregister.com/study/100901.
11 | Data is moved to the warehouse as-is, with additional type casting. 12 | See the next section for more details. 13 | 14 | ## Warehouse Data Model 15 |
16 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/gsk/record.py) 17 | for the full data model. 18 |
19 | ## Primary Identifiers 20 | 21 | Trial identifier: `study_id` 22 | 23 | ## Data Update Strategy 24 |
25 | Trials can be searched with a `last_updated` filter. 26 | After the initial scrape, we should search the last 2 days' updates
27 | to stay up to date (the `recent` stack).
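As a rough sketch of that `recent` strategy (the `last_updated_from`/`last_updated_to` parameters mirror the ones the collector's spider builds; everything else here is illustrative):

```python
from datetime import date, timedelta
from urllib import urlencode  # Python 2, as in this codebase

# Illustrative: build a search URL covering the last 2 days of updates.
def make_recent_search_url(days=2):
    query = urlencode([
        ('last_updated_from', str(date.today() - timedelta(days=days))),
        ('last_updated_to', str(date.today())),
    ])
    return 'http://www.gsk-clinicalstudyregister.com/search?' + query
```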
28 | 29 | ## License Terms 30 | 31 | http://www.gsk.com/en-gb/terms-of-use/ 32 | --------------------------------------------------------------------------------
/tests/collectors/gsk/test_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | from collectors.gsk.parser import parse_record 8 | 9 | 10 | class TestGskParser(object): 11 | def test_results_url_contains_absolute_url(self, get_url):
12 | url = 'http://www.gsk-clinicalstudyregister.com/study/100006' 13 | response = get_url(url) 14 | 15 | record = parse_record(response) 16 |
17 | assert record['results_url'].startswith('http') 18 | 19 | def test_results_url_is_none_for_trials_without_results(self, get_url):
20 | url = 'http://www.gsk-clinicalstudyregister.com/study/106847' 21 | response = get_url(url) 22 | 23 | record = parse_record(response) 24 |
25 | assert record.get('results_url') is None 26 | 27 | def test_handles_all_date_formats(self, get_url):
28 | url = 'https://www.gsk-clinicalstudyregister.com/study/100006' 29 | 30 | response = get_url(url) 31 | 32 | record = parse_record(response) 33 |
34 | assert record.get('last_updated') is not None 35 | assert record.get('record_verification_date') is not None
36 | assert record.get('study_start_date') is not None 37 | --------------------------------------------------------------------------------
/docs/collectors/jprn.md: -------------------------------------------------------------------------------- 1 | # JPRN 2 | 3 | http://www.umin.ac.jp/ctr/ 4 |
5 | UMIN was established in 1989 as a cooperative organization for national medical schools in Japan, sponsored by the Ministry of Education, Culture, Science, Sports and Technology (MEXT), Japan. Most of its services are now made available to other health care researchers via the Internet. 6 | 7 | ## Source Data Model 8 |
9 | Data can be accessed through the web interface. 10 | Example - https://upload.umin.ac.jp/cgi-open-bin/ctr/ctr.cgi?function=brows&action=brows&type=summary&recptno=R000023978&language=E.
11 | Data is moved to the warehouse as-is, with additional type casting. 12 | See the next section for more details. 13 | 14 | ## Warehouse Data Model 15 |
16 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/jprn/record.py) 17 | for the full data model. 18 |
19 | ## Primary Identifiers 20 | 21 | Trial identifier: `unique_trial_number` 22 | 23 | ## Data Update Strategy 24 |
25 | Trials have a `date_and_time_of_last_update` field. 26 | Newly created and updated trials can be found by searching
27 | with descending last-updated ordering (the default). 28 | After the initial scrape, we should use the last 2 pages of search results
29 | to stay up to date (the `recent` stack). 30 | 31 | ## License Terms 32 | 33 | http://www.umin.ac.jp/ctr/UMIN-CTR_e_FAQ.htm 34 | --------------------------------------------------------------------------------
/docs/collectors/euctr.md: -------------------------------------------------------------------------------- 1 | # Euctr 2 | 3 | https://www.clinicaltrialsregister.eu/ 4 |
5 | The EU Clinical Trials Register contains information on interventional clinical trials on medicines conducted in the European Union (EU), or the European Economic Area (EEA) which started after 1 May 2004.
6 | 7 | ## Source Data Model 8 | 9 | Data can be accessed through the web interface.
10 | Example - https://www.clinicaltrialsregister.eu/ctr-search/trial/2004-000534-36/SK.
11 | Data is moved to the warehouse as-is, with additional type casting. 12 | See the next section for more details. 13 |
14 | Additional information - https://eudract.ema.europa.eu/ 15 | 16 | ## Warehouse Data Model 17 |
18 | [See table definition](https://github.com/opentrials/collectors/blob/master/collectors/euctr/record.py) 19 | for the full data model. 20 |
21 | ## Primary Identifiers 22 | 23 | Trial identifier: `eudract_number_with_country` 24 | 25 | ## Data Update Strategy 26 |
27 | The web interface and source data model don't have anything like an 28 | `updated` field, so a full scan is needed to stay up to date. 29 |
30 | Proposed solution: use the [feed](https://www.clinicaltrialsregister.eu/ctr-search/rest/feed/bydates?query=&dateFrom=2000-01-01&dateTo=2015-01-02) of items created/updated in the last 7 days that match the filter parameters. 31 |
32 | ## License Terms 33 | 34 | https://www.clinicaltrialsregister.eu/disclaimer.html 35 | --------------------------------------------------------------------------------
/collectors/ictrp/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | from scrapy.spiders import Rule 8 | from scrapy.spiders import CrawlSpider 9 | from scrapy.linkextractors import LinkExtractor 10 | from .parser import parse_record 11 | 12 |
13 | # Module API 14 | 15 | class Spider(CrawlSpider): 16 | 17 | # Public 18 | 19 | name = 'ictrp' 20 | allowed_domains = ['who.int'] 21 |
22 | def __init__(self, conf=None, conn=None, http_user=None, http_pass=None): 23 | 24 | # Save conf/conn 25 | self.conf = conf 26 | self.conn = conn 27 |
28 | # Save credentials 29 | self.http_user = http_user 30 | self.http_pass = http_pass 31 | 32 | # Make urls 33 | self.start_urls = [
34 | 'http://apps.who.int/trialsearch/crawl/crawl0.aspx', 35 | ] 36 | 37 | # Make rules 38 | self.rules = [ 39 | Rule(LinkExtractor(
40 | allow=r'trialsearch/Trial\d+\.aspx\?trialid=.+', 41 | ), callback=parse_record), 42 | Rule(LinkExtractor(
43 | allow=r'trialsearch/crawl/crawl\d+\.aspx', 44 | )), 45 | ] 46 | 47 | # Inherit parent 48 | super(Spider, self).__init__() 49 | --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # collectors 2 |
3 | [![Gitter](https://img.shields.io/gitter/room/opentrials/chat.svg)](https://gitter.im/opentrials/chat)
4 | [![Travis](https://img.shields.io/travis/opentrials/collectors/master.svg)](https://travis-ci.org/opentrials/collectors)
5 | [![Issues](https://img.shields.io/badge/issue-tracker-orange.svg)](https://github.com/opentrials/opentrials/issues)
6 | [![Docs](https://img.shields.io/badge/docs-latest-blue.svg)](http://docs.opentrials.net/en/latest/developers/) 7 |
8 | The OpenTrials data collectors + `warehouse` database schema definition.
9 | 10 | ## Documentation 11 | 12 | - [Overview](docs/overview.md) 13 | - [Warehouse](docs/warehouse.md) 14 | - [Collectors](docs/collectors/) 15 | - [ACTRN](docs/collectors/actrn.md) 16 | - [EUCTR](docs/collectors/euctr.md) 17 | - [GSK](docs/collectors/gsk.md) 18 | - [ICTRP](docs/collectors/ictrp.md) 19 | - [ISRCTN](docs/collectors/isrctn.md) 20 | - [JPRN](docs/collectors/jprn.md) 21 | - [NCT](docs/collectors/nct.md) 22 | - [Pfizer](docs/collectors/pfizer.md) 23 | - [Takeda](docs/collectors/takeda.md) 24 | - [Pubmed](docs/collectors/pubmed.md) 25 | 26 | ## Contributing 27 | 28 | Please read the contribution guideline: 29 | 30 | - [How to Contribute](CONTRIBUTING.md) 31 | - [How to Write a Collector](docs/collector-guide.md) 32 | - [How to Write a Collector (using Scrapy)](docs/collector-scrapy-guide.md) 33 | 34 | Thanks! 35 | -------------------------------------------------------------------------------- /collectors/ictrp/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. import base 8 | from ..base.fields import Text, Date, Integer, Json, Array 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'ictrp' 18 | 19 | # Main 20 | 21 | main_id = Text(primary_key=True) 22 | register = Text() 23 | last_refreshed_on = Date('%d %B %Y') 24 | date_of_registration = Text() # non regular format 25 | primary_sponsor = Text() 26 | public_title = Text() 27 | scientific_title = Text() 28 | date_of_first_enrollment = Text() # non regular format 29 | target_sample_size = Integer() 30 | recruitment_status = Text() 31 | url = Text() 32 | study_type = Text() 33 | study_design = Text() 34 | study_phase = Text() 35 | 36 | # Additional 37 | 38 | countries_of_recruitment = Array() 39 | contacts = Json() 40 | key_inclusion_exclusion_criteria = Text() # not presented on the site 41 | health_conditions_or_problems_studied = Array() 42 | interventions = Array() 43 | primary_outcomes = Array() 44 | secondary_outcomes = Array() 45 | secondary_ids = Array() 46 | sources_of_monetary_support = Array() 47 | secondary_sponsors = Array() 48 | -------------------------------------------------------------------------------- /collectors/takeda/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. 
import base 8 | from ..base.fields import Text, Date, Integer, Array 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'takeda' 18 | 19 | # General 20 | 21 | takeda_trial_id = Text(primary_key=True) 22 | official_title = Text() 23 | trial_phase = Text() 24 | condition = Text() 25 | compound = Array() 26 | recruitment_status = Text() 27 | 28 | # Description 29 | 30 | nct_number = Text() 31 | trial_type = Text() 32 | other_trial_ids = Text() 33 | acronym = Text() 34 | brief_summary = Text() 35 | detailed_description = Text() 36 | trial_design = Text() 37 | primary_outcome_measures = Text() 38 | secondary_outcome_measures = Text() 39 | trial_arms_groups_or_cohorts = Text() 40 | 41 | # Recruitment 42 | 43 | gender = Text() 44 | ages = Text() 45 | enrollment_number_of_participants = Integer() 46 | locations = Array() 47 | responsible_party = Text() 48 | trial_sponsor = Text() 49 | start_date = Date('%B %Y') 50 | completion_date = Date('%B %Y') 51 | eligibility_criteria = Text() 52 | 53 | # Results 54 | 55 | download_the_clinical_trial_summary = Text() 56 | other_available_languages = Text() 57 | -------------------------------------------------------------------------------- /migrations/versions/20160226134759_pfizer_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from alembic import op 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = 'a8d6e250d481' 13 | down_revision = u'c2ae4513dd2b' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.create_table('pfizer', 20 | 21 | # Meta 22 | 23 | sa.Column('meta_uuid', sa.Text), 24 | sa.Column('meta_source', sa.Text), 25 | sa.Column('meta_created', sa.DateTime(timezone=True)), 26 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 27 | 28 | # General 29 | 30 | sa.Column('title', sa.Text), 31 | 32 | # Description 33 | 34 | sa.Column('study_type', sa.Text), 35 | sa.Column('organization_id', sa.Text), 36 | sa.Column('nct_id', sa.Text), 37 | sa.Column('status', sa.Text), 38 | sa.Column('study_start_date', sa.Date), 39 | sa.Column('study_end_date', sa.Date), 40 | 41 | # Eligibility 42 | 43 | sa.Column('eligibility_criteria', sa.Text), 44 | sa.Column('gender', sa.Text), 45 | sa.Column('age_range', sa.Text), 46 | sa.Column('healthy_volunteers_allowed', sa.Boolean), 47 | 48 | ) 49 | 50 | 51 | def downgrade(): 52 | op.drop_table('pfizer') 53 | -------------------------------------------------------------------------------- /collectors/cochrane_reviews/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import logging 8 | import requests 9 | import zipfile 10 | import io 11 | from .. 
import base 12 | from .parser import parse_record 13 | logger = logging.getLogger(__name__) 14 | 15 |
16 | def collect(conf, conn, date_from=None, date_to=None): 17 | file_count = 0 18 | base.helpers.start(conf, 'cochrane', {}) 19 |
20 | content = requests.get(conf['COCHRANE_ARCHIVE_URL']).content 21 | with zipfile.ZipFile(io.BytesIO(content)) as archive:
22 | for filename in archive.namelist(): 23 | base.config.SENTRY.extra_context({ 24 | 'filename': filename, 25 | }) 26 |
27 | with archive.open(filename, 'rU') as review_file: 28 | db_records = parse_record(conf['COCHRANE_ARCHIVE_URL'], review_file)
29 | for rec in db_records: 30 | query = {'file_name': rec['file_name'], 'study_id': rec['study_id']}
31 | if rec.table in conn['warehouse'].tables: 32 | existing = conn['warehouse'][rec.table].find_one(**query) 33 | if existing:
34 | rec['id'] = existing['id'] 35 | rec.write(conf, conn) 36 | file_count += 1 37 |
38 | base.helpers.stop(conf, 'cochrane', {'collected': file_count}) 39 | --------------------------------------------------------------------------------
/tests/collectors/euctr/test_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | from collectors.euctr.parser import parse_record 8 | 9 | 10 | class TestEuctrParser(object):
11 | def test_trial_results_url_returns_absolute_results_url(self, get_url):
12 | url = 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2011-005852-33/3rd' 13 | response = get_url(url) 14 | 15 | record = parse_record(response) 16 |
17 | assert record.get('trial_results_url') == 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2011-005852-33/results'
18 | assert record.get('trial_results') == 'View results' 19 | 20 | def test_trial_results_url_is_none_if_there_are_no_results(self, get_url):
21 | url = 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2009-016529-32/EE' 22 | response = get_url(url) 23 | 24 | record = parse_record(response) 25 |
26 | assert record.get('trial_results_url') is None 27 | assert record.get('trial_results') is None 28 |
29 | def test_trial_results_url_is_none_if_results_not_hyperlink(self, get_url):
30 | url = 'https://www.clinicaltrialsregister.eu/ctr-search/trial/2005-002909-23/PT' 31 | response = get_url(url) 32 | 33 | record = parse_record(response) 34 |
35 | assert record.get('trial_results_url') is None 36 | assert record.get('trial_results') == 'Removed from public view' 37 | --------------------------------------------------------------------------------
/collectors/base/helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-
2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 |
7 | import re 8 | import logging 9 | import datetime 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | # Module API 14 | 15 | def slugify(value):
16 | """Slugify string value. 17 | """ 18 | value = re.sub(r'[\W_]+', '_', value) 19 | value = value.strip('_') 20 | value = value.lower()
21 | value = value[:63] # Postgres limitation is 63 22 | return value 23 | 24 | 25 | def parse_date(value, format): 26 | """Parse a string date.
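For example (illustrative): parse_date('2016-01-31', '%Y-%m-%d') == datetime.date(2016, 1, 31).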
27 | """ 28 | return datetime.datetime.strptime(value, format).date() 29 | 30 | 31 | def parse_datetime(value, format): 32 | """Parse sting datetime. 33 | """ 34 | return datetime.datetime.strptime(value, format) 35 | 36 | 37 | def get_variables(object, filter=None): 38 | """Exract variables from object to dict using name filter. 39 | """ 40 | variables = {} 41 | for name, value in vars(object).items(): 42 | if filter is not None: 43 | if not filter(name): 44 | continue 45 | variables[name] = value 46 | return variables 47 | 48 | 49 | def start(conf, name, message): 50 | """Log collector start. 51 | """ 52 | logger.info('Collector %s has been started(%s)', name, message) 53 | 54 | 55 | def stop(conf, name, message): 56 | """Log collector stop. 57 | """ 58 | logger.info('Collector %s has stopped (%s)', name, message) 59 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | actrn: 2 | extends: 3 | service: actrn 4 | file: docker-cloud.yml 5 | restart: 'no' 6 | 7 | euctr: 8 | extends: 9 | service: euctr 10 | file: docker-cloud.yml 11 | restart: 'no' 12 | 13 | fdadl: 14 | extends: 15 | service: fdadl 16 | file: docker-cloud.yml 17 | restart: 'no' 18 | 19 | fdadap: 20 | extends: 21 | service: fdadap 22 | file: docker-cloud.yml 23 | restart: 'no' 24 | 25 | gsk: 26 | extends: 27 | service: gsk 28 | file: docker-cloud.yml 29 | restart: 'no' 30 | 31 | icdcm: 32 | extends: 33 | service: icdcm 34 | file: docker-cloud.yml 35 | restart: 'no' 36 | 37 | icdpcs: 38 | extends: 39 | service: icdpcs 40 | file: docker-cloud.yml 41 | restart: 'no' 42 | 43 | ictrp: 44 | extends: 45 | service: ictrp 46 | file: docker-cloud.yml 47 | restart: 'no' 48 | 49 | isrctn: 50 | extends: 51 | service: isrctn 52 | file: docker-cloud.yml 53 | restart: 'no' 54 | 55 | jprn: 56 | extends: 57 | service: jprn 58 | file: docker-cloud.yml 59 | restart: 'no' 60 | 61 | nct: 62 | extends: 63 | service: nct 64 | file: docker-cloud.yml 65 | restart: 'no' 66 | 67 | pfizer: 68 | extends: 69 | service: pfizer 70 | file: docker-cloud.yml 71 | restart: 'no' 72 | 73 | pubmed: 74 | extends: 75 | service: pubmed 76 | file: docker-cloud.yml 77 | restart: 'no' 78 | 79 | takeda: 80 | extends: 81 | service: takeda 82 | file: docker-cloud.yml 83 | restart: 'no' 84 | 85 | cochrane-reviews: 86 | extends: 87 | service: cochrane-reviews 88 | file: docker-cloud.yml 89 | restart: 'no' 90 | -------------------------------------------------------------------------------- /migrations/env.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import sys 9 | import sqlalchemy as sa 10 | from alembic import context 11 | 12 | 13 | def run_migrations_offline(): 14 | """Run migrations in 'offline' mode. 15 | 16 | This configures the context with just a URL 17 | and not an Engine, though an Engine is acceptable 18 | here as well. By skipping the Engine creation 19 | we don't even need a DBAPI to be available. 20 | 21 | Calls to context.execute() here emit the given string to the 22 | script output. 
23 | 24 | """ 25 | url = context.config.get_main_option("sqlalchemy.url") 26 | context.configure(url=url, target_metadata=None, literal_binds=True) 27 | with context.begin_transaction(): 28 | context.run_migrations() 29 | 30 | 31 | def run_migrations_online(): 32 | """Run migrations in 'online' mode. 33 | 34 | In this scenario we need to create an Engine 35 | and associate a connection with the context. 36 | 37 | """ 38 | sys.path.append(os.path.dirname(__file__)) 39 | import config 40 | connectable = sa.create_engine(config.WAREHOUSE_URL) 41 | with connectable.connect() as connection: 42 | context.configure(connection=connection, target_metadata=None) 43 | with context.begin_transaction(): 44 | context.run_migrations() 45 | 46 | 47 | # Run migrations 48 | if context.is_offline_mode(): 49 | run_migrations_offline() 50 | else: 51 | run_migrations_online() 52 | -------------------------------------------------------------------------------- /migrations/versions/20160525130300_actrn_fix_column_names.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = 'e77e7eaf0a34' 12 | down_revision = u'11f80cc2fafb' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | MAPPING = { 17 | 'type_of_endpoint_s': 'type_of_endpoints', 18 | 'who_is_are_masked_blinded': 'who_is__are_masked__blinded', 19 | 'masking_blinding': 'masking__blinding', 20 | 'description_of_intervention_s_exposure': 'description_of_interventions__exposure', 21 | 'comparator_control_treatment': 'comparator__control_treatment', 22 | 'recruitment_state_s': 'recruitment_states', 23 | 'procedure_for_enrolling_a_subject_and_allocating_the_treatment_': 'procedure_for_enrolling_a_subject_and_allocating_the', 24 | 'methods_used_to_generate_the_sequence_in_which_subjects_will_be': 'methods_used_to_generate_the_sequence_in_which', 25 | 'statistical_methods_analysis': 'statistical_methods__analysis', 26 | 'trial_related_presentations_publications': 'trial_related_presentations__publications', 27 | 'target_follow_up_duration': 'target_followup_duration', 28 | 'target_follow_up_type': 'target_followup_type', 29 | } 30 | 31 | 32 | def upgrade(): 33 | for key, value in MAPPING.items(): 34 | op.alter_column('actrn', column_name=value, new_column_name=key) 35 | 36 | 37 | def downgrade(): 38 | for key, value in MAPPING.items(): 39 | op.alter_column('actrn', column_name=key, new_column_name=value) 40 | -------------------------------------------------------------------------------- /tests/collectors/nct/test_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import pytest 8 | import io 9 | from collectors.nct.parser import parse_record 10 | 11 | 12 | class TestNctParser(object): 13 | @pytest.fixture 14 | def get_record(self, get_url): 15 | def _get_record(nct_id): 16 | url = 'https://clinicaltrials.gov/show/{nct_id}?displayxml=true'.format(nct_id=nct_id) 17 | response = io.BytesIO(get_url(url).body) 18 | return parse_record(response) 19 | 20 | return _get_record 21 | 22 | def 
test_parser_parse_text(self, get_record): 23 | record = get_record('NCT02931214') 24 | assert record['url'] == 'https://clinicaltrials.gov/show/NCT02931214' 25 | 26 | def test_parser_parse_list(self, get_record): 27 | primary_outcomes = [ 28 | { 29 | 'measure': 'Treatment related adverse events', 30 | 'time_frame': '15 days', 31 | 'description': 'Treatment related adverse events as a measure of safety and tolerability of GMI-1359', 32 | 'safety_issue': 'Yes' 33 | } 34 | ] 35 | record = get_record('NCT02931214') 36 | assert record['primary_outcomes'] == primary_outcomes 37 | 38 | def test_parser_parse_dict(self, get_record): 39 | contact = { 40 | 'phone': '402-476-2811', 41 | 'last_name': 'Laura Sterling, MD' 42 | } 43 | record = get_record('NCT02931214') 44 | assert record['overall_contact'] == contact 45 | -------------------------------------------------------------------------------- /collectors/icdpcs/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import logging 9 | import zipfile 10 | import requests 11 | from .record import Record 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | # Module API 16 | 17 | def collect(conf, conn): 18 | """Collect ICD-XX-PCS procedures. 19 | """ 20 | 21 | # For more information see: 22 | # https://www.cms.gov/Medicare/Coding/ICD10/2016-ICD-10-PCS-and-GEMs.html 23 | URL = 'https://www.cms.gov/Medicare/Coding/ICD10/Downloads/2016-PCS-Long-Abbrev-Titles.zip' 24 | FILE = 'icd10pcs_order_2016.txt' 25 | VERSION = 'ICD-10-PCS' 26 | LAST_UPDATED = '2015-10-01' 27 | 28 | # Prepare file 29 | zip = requests.get(URL).content 30 | file = zipfile.ZipFile(io.BytesIO(zip)).open(FILE) 31 | 32 | count = 0 33 | for line in file: 34 | # Prepare data 35 | # Format is described in instruction 36 | # stored in zip archive we download 37 | data = { 38 | 'code': line[6:6+7].strip(), 39 | 'is_header': line[14:14+1].strip(), 40 | 'short_description': line[16:16+60].strip(), 41 | 'long_description': line[77:].strip(), 42 | 'version': VERSION, 43 | 'last_updated': LAST_UPDATED, 44 | } 45 | 46 | # Create record 47 | record = Record.create(URL, data) 48 | 49 | # Write record 50 | record.write(conf, conn) 51 | 52 | # Log info 53 | count += 1 54 | if not count % 100: 55 | logger.info('Collected %s "%s" interventions', count, record.table) 56 | -------------------------------------------------------------------------------- /collectors/gsk/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from urllib import urlencode 8 | from collections import OrderedDict 9 | from datetime import date, timedelta 10 | from scrapy.spiders import Rule 11 | from scrapy.spiders import CrawlSpider 12 | from scrapy.linkextractors import LinkExtractor 13 | from .parser import parse_record 14 | 15 | 16 | # Module API 17 | 18 | class Spider(CrawlSpider): 19 | 20 | # Public 21 | 22 | name = 'gsk' 23 | allowed_domains = ['gsk-clinicalstudyregister.com'] 24 | 25 | def __init__(self, conf=None, conn=None, date_from=None, date_to=None): 26 | 27 | # Save conf/conn 28 | self.conf = conf 29 | self.conn = conn 30 | 31 | 
# Make start urls 32 | self.start_urls = _make_start_urls( 33 | prefix='http://www.gsk-clinicalstudyregister.com/search', 34 | date_from=date_from, date_to=date_to) 35 | 36 | # Make rules 37 | self.rules = [ 38 | Rule(LinkExtractor( 39 | allow=r'study\/\d+' 40 | ), callback=parse_record), 41 | ] 42 | 43 | # Inherit parent 44 | super(Spider, self).__init__() 45 | 46 | 47 | # Internal 48 | 49 | def _make_start_urls(prefix, date_from=None, date_to=None): 50 | """ Return start_urls. 51 | """ 52 | if date_from is None: 53 | date_from = str(date.today() - timedelta(days=1)) 54 | if date_to is None: 55 | date_to = str(date.today()) 56 | query = OrderedDict() 57 | query['last_updated_from'] = date_from 58 | query['last_updated_to'] = date_to 59 | return [prefix + '?' + urlencode(query)] 60 | -------------------------------------------------------------------------------- /migrations/versions/20170123151655_add_trigger_for_meta_updated.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic. 12 | revision = 'b32475938a2d' 13 | down_revision = u'542425c4e70b' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | updatable_tables = ['actrn', 'cochrane_reviews', 'euctr', 'fda_dap', 'fdadl', 'gsk', 18 | 'hra', 'icdcm', 'icdpcs', 'ictrp', 'isrctn', 'jprn', 'nct', 'pfizer', 'pubmed', 'takeda'] 19 | 20 | 21 | def upgrade(): 22 | conn = op.get_bind() 23 | func = sa.DDL("""CREATE FUNCTION set_meta_updated() 24 | RETURNS TRIGGER 25 | LANGUAGE plpgsql 26 | AS $$ 27 | BEGIN 28 | NEW.meta_updated := now(); 29 | RETURN NEW; 30 | END; 31 | $$;""") 32 | conn.execute(func) 33 | 34 | for table in updatable_tables: 35 | trigger_params = {'trigger': ('%s_set_meta_updated' % table), 'table': table} 36 | trigger = ("""CREATE TRIGGER %(trigger)s 37 | BEFORE UPDATE ON %(table)s 38 | FOR EACH ROW EXECUTE PROCEDURE set_meta_updated();""" % trigger_params) 39 | conn.execute(trigger) 40 | 41 | 42 | def downgrade(): 43 | conn = op.get_bind() 44 | for table in updatable_tables: 45 | trigger_params = {'trigger': ('%s_set_meta_updated' % table), 'table': table} 46 | trigger = ('DROP TRIGGER %(trigger)s ON %(table)s;' % trigger_params) 47 | conn.execute(trigger) 48 | 49 | conn.execute(sa.DDL('DROP FUNCTION set_meta_updated();')) 50 | -------------------------------------------------------------------------------- /tests/collectors/hra/test_collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import mock 8 | import pytest 9 | import datetime 10 | import requests 11 | from collections import defaultdict 12 | from collectors.hra.collector import collect, _make_request_url 13 | 14 | 15 | class TestHRACollector(object): 16 | def test_make_request_url(self): 17 | date_from = datetime.date(2015, 1, 1) 18 | date_to = datetime.date(2015, 12, 31) 19 | actual = _make_request_url('prefix', date_from, date_to) 20 | expect = 'prefix?datePublishedFrom=2015-01-01&datePublishedTo=2015-12-31' 21 | assert actual == expect 22 | 23 | @mock.patch('requests.Session.get') 24 | def 
test_collect_skips_deferred_records(self, session_get_mock, conn, conf, deferred_item_stub): 25 | response_mock = mock.Mock() 26 | response_mock.json.return_value = [ 27 | deferred_item_stub 28 | ] 29 | session_get_mock.return_value = response_mock 30 | collect(conf, conn, '2015-01-01', '2015-01-01') 31 | 32 | hra_id = ('HRA%s' % deferred_item_stub['ApplicationID']) 33 | assert conn['warehouse']['hra'].find_one(hra_id=hra_id) is None 34 | 35 | 36 | @pytest.fixture 37 | def deferred_item_stub(): 38 | deferred_item = defaultdict(lambda: None) 39 | attributes = { 40 | 'ApplicationID': '323854', 41 | 'PublicationDate': 'Publication of this data is currently deferred.', 42 | 'UpdatedDate': '2017-01-05T14:01:03.41', 43 | 'Decision': 'Publication of this data is currently deferred.', 44 | 'DecisionDate': 'Publication of this data is currently deferred.', 45 | } 46 | deferred_item.update(attributes) 47 | return deferred_item 48 | -------------------------------------------------------------------------------- /collectors/icdcm/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import io 8 | import logging 9 | import zipfile 10 | import requests 11 | from scrapy.http import TextResponse 12 | from .record import Record 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | # Module API 17 | 18 | def collect(conf, conn): 19 | """Collect ICD-XX-CM conditions. 20 | """ 21 | 22 | # For more information see: 23 | # https://www.cms.gov/Medicare/Coding/ICD10/2016-ICD-10-CM-and-GEMs.html 24 | URL = 'https://www.cms.gov/Medicare/Coding/ICD10/Downloads/2016-CM-Code-Tables-and-Index.zip' 25 | FILE = 'Tabular.xml' 26 | VERSION = 'ICD-10-CM' 27 | LAST_UPDATED = '2015-10-01' 28 | 29 | # Prepare xml 30 | zip = requests.get(URL).content 31 | xml = zipfile.ZipFile(io.BytesIO(zip)).open(FILE).read() 32 | res = TextResponse(url=URL, body=xml, encoding='utf-8') 33 | 34 | count = 0 35 | for diag in res.xpath('//diag'): 36 | # We only need leaf nodes (entries without child diagnoses) 37 | children = diag.xpath('./diag') 38 | if children: 39 | continue 40 | 41 | # Get data 42 | data = { 43 | 'name': diag.xpath('./name/text()').extract_first(), 44 | 'desc': diag.xpath('./desc/text()').extract_first(), 45 | 'terms': diag.xpath('.//note/text()').extract(), 46 | 'version': VERSION, 47 | 'last_updated': LAST_UPDATED, 48 | } 49 | 50 | # Create record 51 | record = Record.create(URL, data) 52 | 53 | # Write record 54 | record.write(conf, conn) 55 | 56 | # Log info 57 | count += 1 58 | if not count % 100: 59 | logger.info('Collected %s "%s" conditions', count, record.table) 60 | -------------------------------------------------------------------------------- /collectors/hra/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from ..
import base 8 | from ..base.fields import Text, Date, Datetime 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'hra' 18 | 19 | # General 20 | 21 | hra_id = Text(primary_key=True) 22 | publication_date = Date('%Y-%m-%dT%H:%M:%S') 23 | updated_date = Date('%Y-%m-%dT%H:%M:%S.%f') 24 | comittee_name = Text() 25 | comittee_ref_number = Text() 26 | iras_proj_id = Text() 27 | contact_name = Text() 28 | contact_email = Text() 29 | application_title = Text() 30 | study_type_id = Text() 31 | study_type = Text() 32 | sponsor_org = Text() 33 | research_programme = Text() 34 | data_coll_arrangements = Text() 35 | establishment_org = Text() 36 | establishment_org_address_1 = Text() 37 | establishment_org_address_2 = Text() 38 | establishment_org_address_3 = Text() 39 | establishment_org_post_code = Text() 40 | decision = Text() 41 | decision_date = Datetime('%Y-%m-%d %H:%M:%S') 42 | human_tissue_license = Text() 43 | rtb_title = Text() 44 | research_database_title = Text() 45 | application_full_title = Text() 46 | isrctn_id = Text() 47 | nct_id = Text() 48 | additional_ref_numbers = Text() 49 | duration_of_study_in_uk = Text() 50 | research_summary = Text() 51 | euctr_id = Text() 52 | social_value = Text() 53 | recuitment_arrangements = Text() 54 | risk_and_benefit = Text() 55 | participants_protection_and_care = Text() 56 | informed_consent = Text() 57 | applicant_and_staff_suitability = Text() 58 | independent_review = Text() 59 | supporting_info_suitability = Text() 60 | other_comments = Text() 61 | research_summary_suitability = Text() 62 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import pytest 9 | import betamax 10 | import dataset 11 | from scrapy.http import Request, HtmlResponse 12 | from collectors.base import config, helpers 13 | 14 | 15 | with betamax.Betamax.configure() as cfg: 16 | cfg.cassette_library_dir = 'tests/cassettes/' 17 | 18 | record_mode = 'none' if os.environ.get('CI') else 'once' 19 | cfg.default_cassette_options['record_mode'] = record_mode 20 | cfg.default_cassette_options['match_requests_on'] = [ 21 | 'uri', 22 | 'method', 23 | 'headers', 24 | 'body', 25 | ] 26 | 27 | 28 | # Fixtures 29 | 30 | @pytest.fixture 31 | def conf(): 32 | return helpers.get_variables(config, str.isupper) 33 | 34 | 35 | @pytest.fixture 36 | def conn(): 37 | warehouse = dataset.connect(config.WAREHOUSE_URL) 38 | for table in warehouse.tables: 39 | warehouse[table].delete() 40 | return {'warehouse': warehouse} 41 | 42 | 43 | @pytest.fixture 44 | def get_url(betamax_session): 45 | def _get_url(url, request_kwargs={}): 46 | '''Returns a scrapy.http.HtmlResponse with the contents of the received 47 | url. 48 | 49 | Note that the session is kept intact among multiple calls to this 50 | method (i.e. cookies are passed over). 51 | 52 | We also don't verify SSL certificates, because Takeda's certificate is 53 | invalid. If they become valid, we can resume verifying the 54 | certificates.
55 | ''' 56 | response = betamax_session.get(url, verify=False) 57 | scrapy_response = HtmlResponse( 58 | url=str(response.url), 59 | body=response.content, 60 | ) 61 | scrapy_response.request = Request(url, **request_kwargs) 62 | 63 | return scrapy_response 64 | return _get_url 65 | -------------------------------------------------------------------------------- /collectors/isrctn/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from urllib import urlencode 8 | from collections import OrderedDict 9 | from datetime import date, timedelta 10 | from scrapy.spiders import Rule 11 | from scrapy.spiders import CrawlSpider 12 | from scrapy.linkextractors import LinkExtractor 13 | from .parser import parse_record 14 | 15 | 16 | # Module API 17 | 18 | class Spider(CrawlSpider): 19 | 20 | # Public 21 | 22 | name = 'isrctn' 23 | allowed_domains = ['isrctn.com'] 24 | 25 | def __init__(self, conf=None, conn=None, date_from=None, date_to=None): 26 | 27 | # Save conf/conn 28 | self.conf = conf 29 | self.conn = conn 30 | 31 | # Make start urls 32 | self.start_urls = _make_start_urls( 33 | prefix='http://www.isrctn.com/search', 34 | date_from=date_from, date_to=date_to) 35 | 36 | # Make rules 37 | self.rules = [ 38 | Rule(LinkExtractor( 39 | allow=r'ISRCTN\d+', 40 | ), callback=parse_record), 41 | Rule(LinkExtractor( 42 | allow=r'page=\d+', 43 | )), 44 | ] 45 | 46 | # Inherit parent 47 | super(Spider, self).__init__() 48 | 49 | 50 | # Internal 51 | 52 | def _make_start_urls(prefix, date_from=None, date_to=None): 53 | """ Return start_urls. 54 | """ 55 | if date_from is None: 56 | date_from = str(date.today() - timedelta(days=1)) 57 | if date_to is None: 58 | date_to = str(date.today()) 59 | query = OrderedDict() 60 | query['q'] = '' 61 | gtle = 'GT lastEdited:%sT00:00:00.000Z' % date_from 62 | lele = 'LE lastEdited:%sT00:00:00.000Z' % date_to 63 | query['filters'] = ','.join([gtle, lele]) 64 | query['page'] = '1' 65 | query['pageSize'] = '100' 66 | query['searchType'] = 'advanced-search' 67 | return [prefix + '?' + urlencode(query)] 68 | -------------------------------------------------------------------------------- /migrations/versions/20160428204857_pubmed_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import ARRAY, JSONB 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = 'b720671a8c0f' 14 | down_revision = u'014fd3f703aa' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.create_table('pubmed', 21 | 22 | # Meta 23 | 24 | sa.Column('meta_id', sa.Text, unique=True), 25 | sa.Column('meta_source', sa.Text), 26 | sa.Column('meta_created', sa.DateTime(timezone=True)), 27 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 28 | 29 | # Medline 30 | 31 | sa.Column('pmid', sa.Text, primary_key=True), 32 | sa.Column('date_created', sa.Date), 33 | sa.Column('date_completed', sa.Date), 34 | sa.Column('date_revised', sa.Date), 35 | sa.Column('country', sa.Text), 36 | sa.Column('medline_ta', sa.Text), 37 | sa.Column('nlm_unique_id', sa.Text), 38 | sa.Column('issn_linking', sa.Text), 39 | 40 | # Journal 41 | 42 | sa.Column('journal_issn', sa.Text), 43 | sa.Column('journal_title', sa.Text), 44 | sa.Column('journal_iso', sa.Text), 45 | 46 | # Article 47 | 48 | sa.Column('article_title', sa.Text), 49 | sa.Column('article_abstract', sa.Text), 50 | sa.Column('article_authors', ARRAY(sa.Text)), 51 | sa.Column('article_language', sa.Text), 52 | sa.Column('article_publication_type_list', ARRAY(sa.Text)), 53 | sa.Column('article_vernacular_title', sa.Text), 54 | sa.Column('article_date', sa.Date), 55 | 56 | # Pubmed 57 | 58 | sa.Column('publication_status', sa.Text), 59 | sa.Column('identifiers_list', JSONB()), 60 | 61 | ) 62 | 63 | 64 | def downgrade(): 65 | op.drop_table('pubmed') 66 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile --output-file requirements.txt requirements.in 6 | # 7 | alembic==0.8.10 8 | appdirs==1.4.0 # via setuptools 9 | attrs==16.3.0 # via service-identity 10 | cffi==1.9.1 # via cryptography 11 | chardet==2.3.0 # via normality 12 | click==6.7 # via python-dotenv 13 | constantly==15.1.0 # via twisted 14 | contextlib2==0.5.4 # via raven 15 | cryptography==1.7.2 # via pyopenssl 16 | cssselect==1.0.1 # via parsel, scrapy 17 | dataset==0.7.1 18 | enum34==1.1.6 # via cryptography 19 | idna==2.2 # via cryptography 20 | ijson==2.3 21 | incremental==16.10.1 # via twisted 22 | ipaddress==1.0.18 # via cryptography 23 | lxml==3.7.2 # via parsel, scrapy 24 | mako==1.0.6 # via alembic 25 | markupsafe==0.23 # via mako 26 | normality==0.3.9 # via dataset 27 | packaging==16.8 # via setuptools 28 | parsel==1.1.0 # via scrapy 29 | psycopg2==2.6.2 30 | pyasn1-modules==0.0.8 # via service-identity 31 | pyasn1==0.1.9 # via cryptography, pyasn1-modules, service-identity 32 | pycparser==2.17 # via cffi 33 | pydispatcher==2.0.5 # via scrapy 34 | pyopenssl==16.2.0 # via scrapy, service-identity 35 | pyparsing==2.1.10 # via packaging 36 | python-dateutil==2.6.0 37 | python-dotenv==0.6.2 38 | python-editor==1.0.3 # via alembic 39 | pytz==2016.10 40 | pyyaml==3.12 # via dataset 41 | queuelib==1.4.2 # via scrapy 42 | raven==5.32.0 43 | requests==2.12.2 44 | scrapy==1.3.0 45 | service-identity==16.0.0 # via scrapy 46 | six==1.10.0 # via cryptography, dataset, normality, packaging, parsel, pyopenssl, python-dateutil, scrapy, setuptools, w3lib 47 | sqlalchemy==1.1.5 48 | twisted==16.6.0 # via scrapy 49 | w3lib==1.16.0 # via parsel, scrapy 50 | xmltodict==0.10.2 51 | zope.interface==4.3.3 # via twisted 52 | 53 | # The following packages are considered to be unsafe in a requirements file: 54 | # 
setuptools # via cryptography, zope.interface 55 | -------------------------------------------------------------------------------- /collectors/actrn/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from urllib import urlencode 8 | from collections import OrderedDict 9 | from datetime import datetime, date, timedelta 10 | from scrapy.spiders import Rule 11 | from scrapy.spiders import CrawlSpider 12 | from scrapy.linkextractors import LinkExtractor 13 | from .parser import parse_record 14 | 15 | 16 | # Module API 17 | 18 | class Spider(CrawlSpider): 19 | 20 | # Public 21 | 22 | name = 'actrn' 23 | allowed_domains = ['anzctr.org.au'] 24 | 25 | def __init__(self, conf=None, conn=None, date_from=None, date_to=None): 26 | 27 | # Save conf/conn 28 | self.conf = conf 29 | self.conn = conn 30 | 31 | # Make start urls 32 | self.start_urls = _make_start_urls( 33 | prefix='http://www.anzctr.org.au/TrialSearch.aspx', 34 | date_from=date_from, date_to=date_to) 35 | 36 | # Make rules 37 | self.rules = [ 38 | Rule(LinkExtractor( 39 | allow=r'Trial/Registration/TrialReview.aspx', 40 | process_value=lambda value: value.replace('http', 'https', 1), 41 | ), callback=parse_record), 42 | Rule(LinkExtractor( 43 | allow=r'page=\d+', 44 | )), 45 | ] 46 | 47 | # Inherit parent 48 | super(Spider, self).__init__() 49 | 50 | 51 | # Internal 52 | 53 | def _make_start_urls(prefix, date_from=None, date_to=None): 54 | """ Return start_urls. 55 | """ 56 | if date_from is None: 57 | date_from = str(date.today() - timedelta(days=1)) 58 | if date_to is None: 59 | date_to = str(date.today()) 60 | query = OrderedDict() 61 | date_from = datetime.strptime(date_from, '%Y-%m-%d').strftime('%d/%m/%Y') 62 | date_to = datetime.strptime(date_to, '%Y-%m-%d').strftime('%d/%m/%Y') 63 | query['searchTxt'] = '' 64 | query['dateOfRegistrationFrom'] = date_from 65 | query['dateOfRegistrationTo'] = date_to 66 | query['registry'] = 'ANZCTR' 67 | query['isBasic'] = 'False' 68 | return [prefix + '?' 
+ urlencode(query)] 69 | -------------------------------------------------------------------------------- /collectors/pfizer/parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .record import Record 8 | 9 | 10 | # Module API 11 | 12 | def parse_record(res): 13 | 14 | # Init data 15 | data = {} 16 | 17 | # Description 18 | 19 | key = 'study_type' 20 | path = '.field-name-field-study-type .field-item::text' 21 | value = res.css(path).extract_first() 22 | data[key] = value 23 | 24 | key = 'organization_id' 25 | path = '.field-name-field-organization-id .field-item::text' 26 | value = res.css(path).extract_first() 27 | data[key] = value 28 | 29 | key = 'nct_id' 30 | path = '.field-name-field-clinical-trial-id .field-item::text' 31 | value = res.css(path).extract_first() 32 | data[key] = value 33 | 34 | key = 'status' 35 | path = '//label[text() = "Status"]/../text()' 36 | value = ''.join(res.xpath(path).extract()).strip() 37 | data[key] = value 38 | 39 | key = 'study_start_date' 40 | path = '.field-name-field-study-start-date .field-item span::text' 41 | value = res.css(path).extract_first() 42 | data[key] = value 43 | 44 | key = 'study_end_date' 45 | path = '.field-name-field-study-end-date .field-item span::text' 46 | value = res.css(path).extract_first() 47 | data[key] = value 48 | 49 | # Eligibility 50 | 51 | key = 'eligibility_criteria' 52 | path = '.field-name-field-criteria .field-item *::text' 53 | value = ''.join(res.css(path).extract()) 54 | data[key] = value 55 | 56 | key = 'gender' 57 | path = '.field-name-field-gender .field-item::text' 58 | value = res.css(path).extract_first() 59 | data[key] = value 60 | 61 | key = 'age_range' 62 | path = '//label[text() = "Age Range:"]/../text()' 63 | value = ''.join(res.xpath(path).extract()).strip() 64 | data[key] = value 65 | 66 | key = 'healthy_volunteers_allowed' 67 | path = '.field-name-field-healthy-volunteers-allowed .field-item::text' 68 | value = res.css(path).extract_first() 69 | data[key] = value 70 | 71 | # Create record 72 | record = Record.create(res.url, data) 73 | 74 | return record 75 | -------------------------------------------------------------------------------- /collectors/takeda/parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | try: 8 | import urlparse 9 | except ImportError: 10 | import urllib.parse as urlparse 11 | from .. 
import base 12 | from .record import Record 13 | 14 | 15 | # Module API 16 | 17 | def parse_record(res): 18 | 19 | # Init data 20 | data = {} 21 | 22 | # Parse rawdata 23 | gpath = 'h1' 24 | kpath = 'p.eyebrowbold' 25 | vpath = 'p.eyebrowbold+*' 26 | rawdata = _parse_data(res, gpath, kpath, vpath) 27 | for group, key, value in rawdata: 28 | 29 | # General 30 | 31 | if key == 'compound': 32 | value = value.split(',') 33 | 34 | # Recruitment 35 | 36 | if key == 'locations': 37 | value = value.split(',') 38 | 39 | # Collect plain values 40 | data[key] = value 41 | 42 | # Extract results URL 43 | selector = '#results div a::attr(href)' 44 | value = res.css(selector).extract_first() 45 | if value: 46 | url = urlparse.urljoin(res.url, value) 47 | data['download_the_clinical_trial_summary'] = url 48 | else: 49 | try: 50 | del data['download_the_clinical_trial_summary'] 51 | except KeyError: 52 | pass 53 | 54 | # Create record 55 | record = Record.create(res.url, data) 56 | 57 | return record 58 | 59 | 60 | # Internal 61 | 62 | def _parse_data(sel, gpath, kpath, vpath): 63 | data = [] 64 | group = None 65 | name = None 66 | value = None 67 | for sel in sel.css('%s, %s, %s' % (gpath, kpath, vpath)): 68 | text = _parse_text(sel) 69 | if sel.css(gpath): 70 | group = text 71 | elif sel.css(kpath): 72 | name = base.helpers.slugify(text) 73 | else: 74 | value = text 75 | if name and value: 76 | data.append((group, name, value)) 77 | name = None 78 | value = None 79 | return data 80 | 81 | 82 | def _parse_text(sel): 83 | text = '' 84 | texts = sel.xpath('.//text()').extract() 85 | if texts: 86 | text = ' '.join(texts).strip() 87 | return text 88 | -------------------------------------------------------------------------------- /migrations/versions/20160301131954_ictrp_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import ARRAY, JSONB 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic. 
13 | revision = '7518ba857fea' 14 | down_revision = u'393d51424903' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.create_table('ictrp', 21 | 22 | # Meta 23 | 24 | sa.Column('meta_uuid', sa.Text), 25 | sa.Column('meta_source', sa.Text), 26 | sa.Column('meta_created', sa.DateTime(timezone=True)), 27 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 28 | 29 | # Main 30 | 31 | sa.Column('register', sa.Text, primary_key=True), 32 | sa.Column('last_refreshed_on', sa.Date), 33 | sa.Column('main_id', sa.Text, primary_key=True), 34 | sa.Column('date_of_registration', sa.Text), 35 | sa.Column('primary_sponsor', sa.Text), 36 | sa.Column('public_title', sa.Text), 37 | sa.Column('scientific_title', sa.Text), 38 | sa.Column('date_of_first_enrollment', sa.Text), 39 | sa.Column('target_sample_size', sa.Integer), 40 | sa.Column('recruitment_status', sa.Text), 41 | sa.Column('url', sa.Text), 42 | sa.Column('study_type', sa.Text), 43 | sa.Column('study_design', sa.Text), 44 | sa.Column('study_phase', sa.Text), 45 | 46 | # Additional 47 | 48 | sa.Column('countries_of_recruitment', ARRAY(sa.Text)), 49 | sa.Column('contacts', JSONB), 50 | sa.Column('key_inclusion_exclusion_criteria', sa.Text), 51 | sa.Column('health_conditions_or_problems_studied', ARRAY(sa.Text)), 52 | sa.Column('interventions', ARRAY(sa.Text)), 53 | sa.Column('primary_outcomes', ARRAY(sa.Text)), 54 | sa.Column('secondary_outcomes', ARRAY(sa.Text)), 55 | sa.Column('secondary_ids', ARRAY(sa.Text)), 56 | sa.Column('sources_of_monetary_support', ARRAY(sa.Text)), 57 | sa.Column('secondary_sponsors', ARRAY(sa.Text)), 58 | 59 | ) 60 | 61 | 62 | def downgrade(): 63 | op.drop_table('ictrp') 64 | -------------------------------------------------------------------------------- /tests/collectors/takeda/test_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from collectors.takeda.parser import parse_record 8 | 9 | 10 | class TestTakedaParser(object): 11 | def test_download_the_clinical_trial_summary_contains_absolute_url(self, get_url): 12 | url = 'https://www.takedaclinicaltrials.com/browse/summary/TAK-648_101' 13 | response = get_url(url) 14 | 15 | record = parse_record(response) 16 | 17 | assert record['download_the_clinical_trial_summary'] == 'https://www.takedaclinicaltrials.com/files2/TAK-648-101-RDS-2016-02-10.pdf' 18 | 19 | def test_download_the_clinical_trial_summary_prefers_english_pdf_when_available(self, get_url): 20 | url = 'https://www.takedaclinicaltrials.com/browse/summary/073-011' 21 | response = get_url(url) 22 | 23 | record = parse_record(response) 24 | 25 | assert record.get('download_the_clinical_trial_summary') == 'https://www.takedaclinicaltrials.com/files2/073-011-RDS-2015-03-27.pdf' 26 | 27 | def test_download_the_clinical_trial_summary_gets_japanese_pdf_if_no_english_available(self, get_url): 28 | url = 'https://www.takedaclinicaltrials.com/browse/summary/AG-1749/CCT-352' 29 | response = get_url(url) 30 | 31 | record = parse_record(response) 32 | 33 | assert record.get('download_the_clinical_trial_summary') == 'https://www.takedaclinicaltrials.com/files2/AG-1749-CCT-352-RDS-2010-10-17_JP.pdf' 34 | 35 | def test_download_the_clinical_trial_summary_is_none_for_trials_without_results(self, get_url): 36 | url = 
'https://www.takedaclinicaltrials.com/browse/summary/NaltrexBuprop-4004' 37 | response = get_url(url) 38 | 39 | record = parse_record(response) 40 | 41 | assert record.get('download_the_clinical_trial_summary') is None 42 | 43 | def test_download_the_clinical_trial_summary_is_none_for_trials_with_results_unavailable_on_takeda(self, get_url): 44 | url = 'https://www.takedaclinicaltrials.com/browse/summary/ATS%20K023' 45 | response = get_url(url) 46 | 47 | record = parse_record(response) 48 | 49 | assert record.get('download_the_clinical_trial_summary') is None 50 | -------------------------------------------------------------------------------- /migrations/versions/20160229142254_takeda_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import ARRAY 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '393d51424903' 14 | down_revision = u'a8d6e250d481' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.create_table('takeda', 21 | 22 | # Meta 23 | 24 | sa.Column('meta_uuid', sa.Text), 25 | sa.Column('meta_source', sa.Text), 26 | sa.Column('meta_created', sa.DateTime(timezone=True)), 27 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 28 | 29 | # General 30 | 31 | sa.Column('official_title', sa.Text), 32 | sa.Column('takeda_trial_id', sa.Text), 33 | sa.Column('trial_phase', sa.Text), 34 | sa.Column('condition', sa.Text), 35 | sa.Column('compound', ARRAY(sa.Text)), 36 | sa.Column('recruitment_status', sa.Text), 37 | 38 | # Description 39 | 40 | sa.Column('nct_number', sa.Text), 41 | sa.Column('trial_type', sa.Text), 42 | sa.Column('other_trial_ids', sa.Text), 43 | sa.Column('acronym', sa.Text), 44 | sa.Column('brief_summary', sa.Text), 45 | sa.Column('detailed_description', sa.Text), 46 | sa.Column('trial_design', sa.Text), 47 | sa.Column('primary_outcome_measures', sa.Text), 48 | sa.Column('secondary_outcome_measures', sa.Text), 49 | sa.Column('trial_armsgroups_or_cohorts', sa.Text), 50 | 51 | # Recruitment 52 | 53 | sa.Column('gender', sa.Text), 54 | sa.Column('ages', sa.Text), 55 | sa.Column('enrollmentnumber_of_participants', sa.Integer), 56 | sa.Column('locations', ARRAY(sa.Text)), 57 | sa.Column('responsible_party', sa.Text), 58 | sa.Column('trial_sponsor', sa.Text), 59 | sa.Column('start_date', sa.Date), 60 | sa.Column('completion_date', sa.Date), 61 | sa.Column('eligibility_criteria', sa.Text), 62 | 63 | # Results 64 | 65 | sa.Column('download_the_clinical_trial_summary', sa.Text), 66 | sa.Column('other_available_languages', sa.Text), 67 | 68 | ) 69 | 70 | 71 | def downgrade(): 72 | op.drop_table('takeda') 73 | -------------------------------------------------------------------------------- /collectors/euctr/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from urllib import urlencode 8 | from functools import partial 9 | from collections import OrderedDict 10 | from datetime import date, timedelta 11 | from scrapy.spiders import Rule 12 | from 
scrapy.spiders import CrawlSpider 13 | from scrapy.linkextractors import LinkExtractor 14 | from .parser import parse_record 15 | 16 | 17 | # Module API 18 | 19 | class Spider(CrawlSpider): 20 | 21 | # Public 22 | 23 | name = 'euctr' 24 | allowed_domains = ['clinicaltrialsregister.eu'] 25 | 26 | def __init__(self, conf=None, conn=None, date_from=None, date_to=None): 27 | 28 | # Save conf/conn 29 | self.conf = conf 30 | self.conn = conn 31 | 32 | # Make start urls 33 | self.start_urls = _make_start_urls( 34 | prefix='https://www.clinicaltrialsregister.eu/ctr-search/search', 35 | date_from=date_from, date_to=date_to) 36 | 37 | # Make rules 38 | self.rules = [ 39 | Rule( 40 | LinkExtractor( 41 | allow=r'ctr-search/trial/[\d-]+/[\w]+', 42 | deny=r'results$' 43 | ), 44 | callback=parse_record 45 | ), 46 | Rule( 47 | LinkExtractor( 48 | allow=r'page=\d+', 49 | restrict_css='[accesskey=n]' 50 | ), 51 | process_links=partial(_process_links, self.start_urls) 52 | ), 53 | ] 54 | 55 | # Inherit parent 56 | super(Spider, self).__init__() 57 | 58 | 59 | # Internal 60 | 61 | def _make_start_urls(prefix, date_from=None, date_to=None): 62 | """ Return start_urls. 63 | """ 64 | if date_from is None: 65 | date_from = str(date.today() - timedelta(days=1)) 66 | if date_to is None: 67 | date_to = str(date.today()) 68 | query = OrderedDict() 69 | query['query'] = '' 70 | query['dateFrom'] = date_from 71 | query['dateTo'] = date_to 72 | return [prefix + '?' + urlencode(query)] 73 | 74 | 75 | def _process_links(start_urls, links): 76 | result = [] 77 | for link in links: 78 | link.url = '&page='.join([start_urls[0], link.url.split('=')[-1]]) 79 | result.append(link) 80 | return result 81 | -------------------------------------------------------------------------------- /collectors/jprn/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from functools import partial 8 | from urllib import urlencode 9 | from collections import OrderedDict 10 | from scrapy.spiders import Rule 11 | from scrapy.spiders import CrawlSpider 12 | from scrapy.linkextractors import LinkExtractor 13 | from six.moves.urllib.parse import urlparse, parse_qs 14 | from .parser import parse_record 15 | 16 | 17 | # Module API 18 | 19 | class Spider(CrawlSpider): 20 | 21 | # Public 22 | 23 | name = 'jprn' 24 | allowed_domains = ['upload.umin.ac.jp'] 25 | 26 | def __init__(self, conf=None, conn=None, page_from=None, page_to=None): 27 | 28 | # Save conf/conn 29 | self.conf = conf 30 | self.conn = conn 31 | 32 | # Default values 33 | if page_from is None: 34 | page_from = '1' 35 | if page_to is None: 36 | page_to = '1' 37 | 38 | # Make start urls 39 | self.start_urls = _make_start_urls( 40 | prefix='https://upload.umin.ac.jp/cgi-open-bin/ctr_e/index.cgi', 41 | page_from=page_from) 42 | 43 | # Make rules 44 | self.rules = [ 45 | Rule(LinkExtractor( 46 | allow=r'cgi-open-bin/ctr_e/ctr_view.cgi', 47 | ), callback=parse_record), 48 | Rule(LinkExtractor( 49 | allow=r'page=\d+', 50 | process_value=partial(_process_url, page_from, page_to), 51 | )), 52 | ] 53 | 54 | # Inherit parent 55 | super(Spider, self).__init__() 56 | 57 | 58 | # Internal 59 | 60 | def _make_start_urls(prefix, page_from=None): 61 | """ Return start_urls. 
62 | """ 63 | if page_from is None: 64 | page_from = '1' 65 | query = OrderedDict() 66 | query['page'] = page_from 67 | query['sort'] = '05' 68 | return [prefix + '?' + urlencode(query)] 69 | 70 | 71 | def _process_url(page_from, page_to, url): 72 | 73 | # Get url page 74 | query = urlparse(url).query 75 | query = parse_qs(query) 76 | page = query.get('page') 77 | 78 | # Preserve if match 79 | if page: 80 | page_from = int(page_from) 81 | page_to = int(page_to) 82 | page = int(page[0]) 83 | if page >= page_from and page <= page_to: 84 | return url 85 | 86 | return None 87 | -------------------------------------------------------------------------------- /collectors/jprn/parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from collections import OrderedDict 8 | from .. import base 9 | from .record import Record 10 | 11 | 12 | # Module API 13 | 14 | def parse_record(res): 15 | fields_to_remove = [ 16 | 'item', 17 | ] 18 | 19 | # Parse rawdata 20 | data = {} 21 | 22 | # Get meta 23 | subdata = _parse_table(res, key_index=0, value_index=2) 24 | data.update(subdata) 25 | 26 | # Process rawdata 27 | rawdata = _parse_table(res, key_index=0, value_index=1) 28 | prefix = '' 29 | for key, value in rawdata.items(): 30 | 31 | # Interventions 32 | 33 | newkey = 'interventions' 34 | oldkey = 'interventionscontrol' 35 | data.setdefault(newkey, []) 36 | if key.startswith(oldkey): 37 | data[newkey].append(value) 38 | continue 39 | 40 | # Research contact person 41 | 42 | if key == 'name_of_lead_principal_investigator': 43 | prefix = 'research_' 44 | 45 | # Public contact 46 | 47 | if key == 'name_of_contact_person': 48 | prefix = 'public_' 49 | 50 | # Sponsor 51 | 52 | if key == 'name_of_primary_sponsor': 53 | prefix = '' 54 | 55 | # Collect plain values 56 | key = prefix + key 57 | data[key] = value 58 | 59 | # Remove data 60 | for key in fields_to_remove: 61 | if key in data: 62 | del data[key] 63 | 64 | identifier = data.get('unique_id_issued_by_umin') 65 | data['unique_trial_number'] = data.get('unique_trial_number', identifier) 66 | 67 | # Create record 68 | record = Record.create(res.url, data) 69 | 70 | return record 71 | 72 | 73 | # Internal 74 | 75 | def _parse_table(res, key_index, value_index): 76 | """parse data from tabular structure. 77 | """ 78 | data = OrderedDict() 79 | for sel in res.xpath('//tr'): 80 | columns = sel.xpath('td') 81 | if len(columns) == value_index+1: 82 | key = ''.join(columns[key_index].xpath('.//text()').extract()) 83 | key = base.helpers.slugify(key.strip()) 84 | value = ''.join(columns[value_index].xpath('.//text()').extract()) 85 | value = value.strip() 86 | if key and value: 87 | data[key] = value 88 | return data 89 | -------------------------------------------------------------------------------- /collectors/nct/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import zipfile 8 | import logging 9 | import requests 10 | import tempfile 11 | import contextlib 12 | from .parser import parse_record 13 | from .. 
import base 14 | logger = logging.getLogger(__name__) 15 | 16 | # Module API 17 | 18 | 19 | def collect(conf, conn, nct_xml_dump_url): 20 | ''' 21 | Downloads and parses data from NCT's XML dump. For example, if you want the 22 | data from 2017-01-01 until 2017-02-01, the XML dump can be downloaded from: 23 | 24 | https://clinicaltrials.gov/search?resultsxml=True&rcv_s=01/01/2017&rcv_e=01/02/2017 25 | ''' 26 | base.helpers.start(conf, 'nct', {'url': nct_xml_dump_url}) 27 | 28 | with tempfile.TemporaryFile() as fp: 29 | _download_to_file(nct_xml_dump_url, fp) 30 | file_count = 0 31 | for identifier, record_fp in _iter_nct_dump_files(fp): 32 | base.config.SENTRY.extra_context({ 33 | 'url': nct_xml_dump_url, 34 | 'identifier': identifier, 35 | }) 36 | rec = parse_record(record_fp) 37 | query = {'nct_id': rec['nct_id']} 38 | if rec.table in conn['warehouse'].tables: 39 | existing = conn['warehouse'][rec.table].find_one(**query) 40 | if existing: 41 | rec['nct_id'] = existing['nct_id'] 42 | rec.write(conf, conn) 43 | file_count += 1 44 | logger.info('Collected %s NCT records', file_count) 45 | 46 | base.helpers.stop(conf, 'nct', { 47 | 'url': nct_xml_dump_url, 48 | 'collected': file_count, 49 | }) 50 | 51 | 52 | def _download_to_file(url, fp): 53 | CHUNK_SIZE = 1024 * 1024  # 1 MB 54 | bytes_to_mb = lambda value: value / 1048576.0 55 | with contextlib.closing(requests.get(url, stream=True)) as response: 56 | completed_bytes = 0 57 | chunk_count = 0 58 | for block in response.iter_content(CHUNK_SIZE): 59 | fp.write(block) 60 | completed_bytes += len(block) 61 | chunk_count += 1 62 | if chunk_count % 1000 == 0: 63 | logger.debug('Downloaded %.2f MB', bytes_to_mb(completed_bytes)) 64 | fp.seek(0) 65 | 66 | 67 | def _iter_nct_dump_files(fp): 68 | with zipfile.ZipFile(fp) as archive: 69 | for filename in archive.namelist(): 70 | identifier = filename.split('.')[0] 71 | with archive.open(filename, 'rU') as rec_file: 72 | yield identifier, rec_file 73 | -------------------------------------------------------------------------------- /tests/collectors/pubmed/test_parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import datetime 8 | import pytest 9 | from collectors.pubmed.parser import parse_record 10 | 11 | 12 | class TestPubmedParser(object): 13 | def test_bug_abstracttext_without_text(self, get_url): 14 | url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi/?db=pubmed&id=22078490&retmode=xml' 15 | response = get_url(url) 16 | 17 | record = parse_record(response) 18 | 19 | assert record['article_abstract'] is not None 20 | 21 | def test_bug_article_with_multiple_languages_pick_first_one(self, get_url): 22 | url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi/?db=pubmed&id=19082263&retmode=xml' 23 | response = get_url(url) 24 | 25 | record = parse_record(response) 26 | 27 | assert record['article_language'].lower() == 'eng' 28 | 29 | @pytest.mark.skip(reason='need to find an article without medline journal country') 30 | def test_bug_article_without_medline_journal_country(self, get_url): 31 | url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi/?db=pubmed&id=10838360&retmode=xml' 32 | response = get_url(url) 33 | 34 | record = parse_record(response) 35 | 36 | assert record.get('country') is None 37 | 38 | def
test_bug_article_without_vernacular_title(self, get_url): 39 | url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi/?db=pubmed&id=27305424&retmode=xml' 40 | response = get_url(url) 41 | 42 | record = parse_record(response) 43 | 44 | assert record.get('article_vernacular_title') is None 45 | 46 | def test_article_date_correctly_parsed(self, get_url): 47 | url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi/?db=pubmed&id=24165173&retmode=xml' 48 | response = get_url(url) 49 | 50 | record = parse_record(response) 51 | 52 | assert record.get('article_date') == datetime.date(2013, 10, 28) 53 | 54 | def test_multiple_ids_same_registry_collected(self, get_url): 55 | url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi/?db=pubmed&id=22327499&retmode=xml' 56 | response = get_url(url) 57 | record = parse_record(response) 58 | 59 | registry_ids = [reg_id for reg_entry in record.get('registry_ids', []) 60 | for reg_id in reg_entry.values()] 61 | 62 | nct_ids = [reg_id for reg_id in registry_ids if 'NCT' in reg_id] 63 | 64 | assert len(nct_ids) == 2 65 | -------------------------------------------------------------------------------- /collectors/nct/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. import base 8 | from ..base.fields import Text, Date, Integer, Json, Array, Boolean 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'nct' 18 | _DATE_FORMATS = [ 19 | '%B %Y', 20 | '%B %d, %Y', 21 | ] 22 | 23 | # General 24 | 25 | nct_id = Text(primary_key=True) 26 | download_date = Text() 27 | link_text = Text() 28 | url = Text() 29 | org_study_id = Text() 30 | secondary_ids = Array() 31 | nct_aliases = Array() 32 | brief_title = Text() 33 | acronym = Text() 34 | official_title = Text() 35 | sponsors = Json() 36 | source = Text() 37 | oversight_info = Json() 38 | brief_summary = Text() 39 | detailed_description = Text() 40 | overall_status = Text() 41 | why_stopped = Text() 42 | start_date = Date(_DATE_FORMATS) 43 | completion_date_actual = Date(_DATE_FORMATS) 44 | completion_date_anticipated = Date(_DATE_FORMATS) 45 | primary_completion_date_actual = Date(_DATE_FORMATS) 46 | primary_completion_date_anticipated = Date(_DATE_FORMATS) 47 | phase = Text() 48 | study_type = Text() 49 | study_design = Text() 50 | target_duration = Text() 51 | primary_outcomes = Json() 52 | secondary_outcomes = Json() 53 | other_outcomes = Json() 54 | number_of_arms = Integer() 55 | number_of_groups = Integer() 56 | enrollment_actual = Integer() 57 | enrollment_anticipated = Integer() 58 | conditions = Array() 59 | arm_groups = Json() 60 | interventions = Json() 61 | biospec_retention = Text() 62 | biospec_desrc = Text() 63 | eligibility = Json() 64 | overall_officials = Json() 65 | overall_contact = Json() 66 | overall_contact_backup = Json() 67 | locations = Json() 68 | location_countries = Array() 69 | removed_countries = Array() 70 | links = Json() 71 | references = Json() 72 | results_references = Json() 73 | verification_date = Date(_DATE_FORMATS) 74 | lastchanged_date = Date(_DATE_FORMATS) 75 | firstreceived_date = Date(_DATE_FORMATS) 76 | firstreceived_results_date = Date(_DATE_FORMATS) 77 | responsible_party = Json() 78 | keywords = Array() 79 | is_fda_regulated = Boolean('Yes') 80 | 
is_section_801 = Boolean('Yes') 81 | has_expanded_access = Boolean('Yes') 82 | condition_browse = Json() 83 | intervention_browse = Json() 84 | clinical_results = Json() 85 | results_exemption_date = Date(_DATE_FORMATS) 86 | -------------------------------------------------------------------------------- /migrations/versions/20160525105409_euctr_fix_column_names.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | 9 | 10 | # revision identifiers, used by Alembic. 11 | revision = '11f80cc2fafb' 12 | down_revision = u'f38e14eac095' 13 | branch_labels = None 14 | depends_on = None 15 | 16 | MAPPING = { 17 | 'date_on_which_this_record_was_first_entered_in_the_eudract_data': 'date_on_which_this_record_was_first_entered', 18 | 'name_or_abbreviated_title_of_the_trial_where_available': 'name_or_abbreviated_title_of_the_trial_where', 19 | 'sponsor_s_protocol_code_number': 'sponsors_protocol_code_number', 20 | 'subject_plans_for_treatment_or_care_after_the_subject_has_ended': 'subject_plans_for_treatment_or_care_after_the_subject', 21 | 'title_of_the_trial_for_lay_people_in_easily_understood_i_e_non_': 'title_of_the_trial_for_lay_people_in', 22 | 'trial_definition_of_the_end_of_the_trial_and_justification_wher': 'trial_definition_of_the_end_of_the_trial_and', 23 | 'trial_full_title_date_and_version_of_each_sub_study_and_their_r': 'trial_full_title_date_and_version_of_each_substudy', 24 | 'trial_if_e_8_6_1_or_e_8_6_2_are_yes_specify_the_regions_in_whic': 'trial_if_e861_or_e862_are_yes_specify_the', 25 | 'trial_medical_condition_s_being_investigated': 'trial_medical_conditions_being_investigated', 26 | 'trial_other_medicinal_product_s': 'trial_other_medicinal_products', 27 | 'trial_primary_end_point_s': 'trial_primary_end_points', 28 | 'trial_secondary_end_point_s': 'trial_secondary_end_points', 29 | 'trial_specify_the_countries_outside_of_the_eea_in_which_trial_s': 'trial_specify_the_countries_outside_of_the_eea_in', 30 | 'trial_the_trial_involves_multiple_sites_in_the_member_state_con': 'trial_the_trial_involves_multiple_sites_in_the_member', 31 | 'trial_the_trial_involves_single_site_in_the_member_state_concer': 'trial_the_trial_involves_single_site_in_the_member', 32 | 'trial_timepoint_s_of_evaluation_of_this_end_point': 'trial_timepoints_of_evaluation_of_this_end_point', 33 | 'trial_trial_being_conducted_both_within_and_outside_the_eea': 'trial_trial_being_conducted_both_within_and_outside_the', 34 | 'trial_trial_contains_a_sub_study': 'trial_trial_contains_a_substudy', 35 | 'us_nct_clinicaltrials_gov_registry_number': 'us_nct_clinicaltrialsgov_registry_number', 36 | } 37 | 38 | 39 | def upgrade(): 40 | for key, value in MAPPING.items(): 41 | op.alter_column('euctr', column_name=value, new_column_name=key) 42 | 43 | 44 | def downgrade(): 45 | for key, value in MAPPING.items(): 46 | op.alter_column('euctr', column_name=key, new_column_name=value) 47 | -------------------------------------------------------------------------------- /collectors/fdadl/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | 
import io 8 | import ijson 9 | import shutil 10 | import logging 11 | import zipfile 12 | import tempfile 13 | import requests 14 | from .. import base 15 | from .record import Record 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | # Module API 20 | 21 | def collect(conf, conn): 22 | """Collect FDA Drug Labels. 23 | """ 24 | 25 | # For more information see: 26 | # https://open.fda.gov/api/reference/ 27 | URL = 'http://download.open.fda.gov/drug/label/{file}.zip' 28 | FILES = [ 29 | 'drug-label-0001-of-0005.json', 30 | 'drug-label-0002-of-0005.json', 31 | 'drug-label-0003-of-0005.json', 32 | 'drug-label-0004-of-0005.json', 33 | 'drug-label-0005-of-0005.json', 34 | ] 35 | 36 | # Create temp directory 37 | dirpath = tempfile.mkdtemp() 38 | 39 | success = 0 40 | for file in FILES: 41 | 42 | # Download json 43 | url = URL.format(file=file) 44 | arch = zipfile.ZipFile(io.BytesIO(requests.get(url).content)) 45 | path = arch.extract(file, dirpath) 46 | file = io.open(path, encoding='utf-8') 47 | 48 | # Get last updated 49 | last_updated = list(ijson.items(file, 'meta.last_updated'))[0] 50 | 51 | # Get items iterator 52 | file.seek(0) 53 | items = ijson.items(file, 'results.item') 54 | 55 | for item in items: 56 | meta = item['openfda'] 57 | 58 | base.config.SENTRY.extra_context({ 59 | 'url': url, 60 | 'item': meta, 61 | }) 62 | 63 | # Skip if no NDC code 64 | if 'product_ndc' not in meta: 65 | continue 66 | 67 | # Get data 68 | data = { 69 | 'product_ndc': meta['product_ndc'][0], 70 | 'product_type': meta['product_type'][0], 71 | 'generic_name': meta['generic_name'][0], 72 | 'brand_name': meta['brand_name'][0], 73 | 'last_updated': last_updated, 74 | } 75 | if meta.get('application_number'): 76 | data['fda_application_number'] = meta['application_number'][0] 77 | 78 | # Create record 79 | record = Record.create(url, data) 80 | 81 | # Write record 82 | record.write(conf, conn) 83 | 84 | # Log info 85 | success += 1 86 | if not success % 100: 87 | logger.info('Collected %s "%s" interventions', 88 | success, record.table) 89 | 90 | # Remove temp directory 91 | shutil.rmtree(dirpath) 92 | -------------------------------------------------------------------------------- /collectors/isrctn/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. 
import base 8 | from ..base.fields import Text, Date, Json 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'isrctn' 18 | 19 | # General 20 | 21 | isrctn_id = Text(primary_key=True) 22 | doi_isrctn_id = Text() 23 | title = Text() 24 | condition_category = Text() 25 | date_applied = Date('%d/%m/%Y') 26 | date_assigned = Date('%d/%m/%Y') 27 | last_edited = Date('%d/%m/%Y') 28 | prospective_retrospective = Text() 29 | overall_trial_status = Text() 30 | recruitment_status = Text() 31 | plain_english_summary = Text() 32 | trial_website = Text() 33 | 34 | # Contact information 35 | 36 | contacts = Json() 37 | 38 | # Additional identifiers 39 | 40 | eudract_number = Text() 41 | clinicaltrials_gov_number = Text() 42 | protocol_serial_number = Text() 43 | 44 | # Study information 45 | 46 | scientific_title = Text() 47 | acronym = Text() 48 | study_hypothesis = Text() 49 | ethics_approval = Text() 50 | study_design = Text() 51 | primary_study_design = Text() 52 | secondary_study_design = Text() 53 | trial_setting = Text() 54 | trial_type = Text() 55 | patient_information_sheet = Text() 56 | condition = Text() 57 | intervention = Text() 58 | intervention_type = Text() 59 | phase = Text() 60 | drug_names = Text() 61 | primary_outcome_measures = Text() 62 | secondary_outcome_measures = Text() 63 | overall_trial_start_date = Date('%d/%m/%Y') 64 | overall_trial_end_date = Date('%d/%m/%Y') 65 | reason_abandoned = Text() 66 | 67 | # Eligibility 68 | 69 | participant_inclusion_criteria = Text() 70 | participant_type = Text() 71 | age_group = Text() 72 | gender = Text() 73 | target_number_of_participants = Text() 74 | participant_exclusion_criteria = Text() 75 | recruitment_start_date = Date('%d/%m/%Y') 76 | recruitment_end_date = Date('%d/%m/%Y') 77 | 78 | # Locations 79 | 80 | countries_of_recruitment = Text() 81 | trial_participating_centre = Text() 82 | 83 | # Sponsor information 84 | 85 | sponsors = Json() 86 | 87 | # Funders 88 | 89 | funders = Json() 90 | 91 | # Results and publications 92 | 93 | publication_and_dissemination_plan = Text() 94 | intention_to_publish_date = Date('%d/%m/%Y') 95 | participant_level_data = Text() 96 | results_basic_reporting = Text() 97 | publication_summary = Text() 98 | publication_citations = Text() 99 | 100 | # Additional files 101 | 102 | # ... 103 | 104 | # Editorial notes 105 | 106 | # ... 107 | -------------------------------------------------------------------------------- /collectors/hra/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import time 8 | import logging 9 | import requests 10 | import datetime 11 | from urllib import urlencode 12 | from collections import OrderedDict 13 | from ..
import base 14 | from .parser import parse_record 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | # Module API 19 | 20 | def collect(conf, conn, date_from=None, date_to=None): 21 | 22 | # Start collector 23 | date_from = _get_date_from(conn, date_from) 24 | date_to = _get_date_to(conn, date_to) 25 | base.helpers.start(conf, 'hra', {'date_from': date_from, 'date_to': date_to}) 26 | 27 | # Get parameters 28 | URL = conf['HRA_URL'] 29 | USER = conf['HRA_USER'] 30 | PASS = conf['HRA_PASS'] 31 | 32 | count = 0 33 | chunk_days = 100 34 | session = requests.Session() 35 | loop_date_from = date_from 36 | while True: 37 | if loop_date_from > date_to: 38 | break 39 | loop_date_to = min(loop_date_from + datetime.timedelta(days=chunk_days), date_to) 40 | url = _make_request_url(URL, loop_date_from, loop_date_to) 41 | response = session.get(url, auth=(USER, PASS)) 42 | response.raise_for_status() 43 | base.config.SENTRY.extra_context({ 44 | 'url': response.url, 45 | }) 46 | for item in response.json(): 47 | record = parse_record(response.url, item) 48 | if not record: 49 | continue 50 | record.write(conf, conn) 51 | count += 1 52 | if not count % 100: 53 | logger.info('Collected %s "hra" records', count) 54 | loop_date_from = loop_date_to + datetime.timedelta(days=1) 55 | time.sleep(1) 56 | 57 | # Stop collector 58 | base.helpers.stop(conf, 'hra', {'collected': count}) 59 | 60 | 61 | # Internal 62 | 63 | def _get_date_from(conn, date_from): 64 | if date_from is not None: 65 | return datetime.datetime.strptime(date_from, '%Y-%m-%d').date() 66 | date_from = datetime.date(2008, 1, 1) 67 | if 'hra' in conn['warehouse'].tables: 68 | rows = conn['warehouse'].query(""" 69 | SELECT least(max(publication_date), max(updated_date)) as latest 70 | FROM hra 71 | """) 72 | latest = list(rows)[0]['latest'] 73 | if latest: 74 | date_from = latest 75 | return date_from 76 | 77 | 78 | def _get_date_to(conn, date_to): 79 | if date_to is not None: 80 | return datetime.datetime.strptime(date_to, '%Y-%m-%d').date() 81 | return datetime.date.today() 82 | 83 | 84 | def _make_request_url(prefix, date_from, date_to): 85 | query = OrderedDict() 86 | query['datePublishedFrom'] = date_from.strftime('%Y-%m-%d') 87 | query['datePublishedTo'] = date_to.strftime('%Y-%m-%d') 88 | url = '%s?%s' % (prefix, urlencode(query)) 89 | return url 90 | -------------------------------------------------------------------------------- /migrations/versions/20160603215242_hra_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from alembic import op 8 | import sqlalchemy as sa 9 | 10 | 11 | # revision identifiers, used by Alembic.
12 | revision = 'c4c0db99bb1c' 13 | down_revision = u'6d709931cc58' 14 | branch_labels = None 15 | depends_on = None 16 | 17 | 18 | def upgrade(): 19 | op.create_table('hra', 20 | 21 | # Meta 22 | 23 | sa.Column('meta_id', sa.Text, unique=True), 24 | sa.Column('meta_source', sa.Text), 25 | sa.Column('meta_created', sa.DateTime(timezone=True)), 26 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 27 | 28 | # General 29 | 30 | sa.Column('hra_id', sa.Text), 31 | sa.Column('publication_date', sa.Date), 32 | sa.Column('updated_date', sa.Date), 33 | sa.Column('comittee_name', sa.Text), 34 | sa.Column('comittee_ref_number', sa.Text), 35 | sa.Column('iras_proj_id', sa.Text), 36 | sa.Column('contact_name', sa.Text), 37 | sa.Column('contact_email', sa.Text), 38 | sa.Column('application_title', sa.Text), 39 | sa.Column('study_type_id', sa.Text), 40 | sa.Column('study_type', sa.Text), 41 | sa.Column('sponsor_org', sa.Text), 42 | sa.Column('research_programme', sa.Text), 43 | sa.Column('data_coll_arrangements', sa.Text), 44 | sa.Column('establishment_org', sa.Text), 45 | sa.Column('establishment_org_address_1', sa.Text), 46 | sa.Column('establishment_org_address_2', sa.Text), 47 | sa.Column('establishment_org_address_3', sa.Text), 48 | sa.Column('establishment_org_post_code', sa.Text), 49 | sa.Column('decision', sa.Text), 50 | sa.Column('decision_date', sa.DateTime(timezone=True)), 51 | sa.Column('human_tissue_license', sa.Text), 52 | sa.Column('rtb_title', sa.Text), 53 | sa.Column('research_database_title', sa.Text), 54 | sa.Column('application_full_title', sa.Text), 55 | sa.Column('isrctn_id', sa.Text), 56 | sa.Column('nct_id', sa.Text), 57 | sa.Column('additional_ref_numbers', sa.Text), 58 | sa.Column('duration_of_study_in_uk', sa.Text), 59 | sa.Column('research_summary', sa.Text), 60 | sa.Column('euctr_id', sa.Text), 61 | sa.Column('social_value', sa.Text), 62 | sa.Column('recuitment_arrangements', sa.Text), 63 | sa.Column('risk_and_benefit', sa.Text), 64 | sa.Column('participants_protection_and_care', sa.Text), 65 | sa.Column('informed_consent', sa.Text), 66 | sa.Column('applicant_and_staff_suitability', sa.Text), 67 | sa.Column('independent_review', sa.Text), 68 | sa.Column('supporting_info_suitability', sa.Text), 69 | sa.Column('other_comments', sa.Text), 70 | sa.Column('research_summary_suitability', sa.Text), 71 | 72 | ) 73 | 74 | 75 | def downgrade(): 76 | op.drop_table('hra') 77 | -------------------------------------------------------------------------------- /collectors/base/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import os 8 | import logging 9 | import logging.config 10 | import raven 11 | from dotenv import load_dotenv 12 | load_dotenv('.env') 13 | 14 | 15 | # Environment 16 | 17 | ENV = os.environ.get('PYTHON_ENV', 'development') 18 | if os.environ.get('CI'): 19 | ENV = 'testing' 20 | 21 | if ENV == 'testing': 22 | WAREHOUSE_URL = os.environ['TEST_WAREHOUSE_URL'] 23 | else: 24 | WAREHOUSE_URL = os.environ['WAREHOUSE_URL'] 25 | 26 | # Scrapy 27 | 28 | SCRAPY_SETTINGS = { 29 | 'SPIDER_MODULES': [ 30 | 'collectors.actrn.spider', 31 | 'collectors.euctr.spider', 32 | 'collectors.gsk.spider', 33 | 'collectors.ictrp.spider', 34 | 'collectors.isrctn.spider', 35 | 'collectors.jprn.spider', 36 | 'collectors.pfizer.spider', 37 | 
'collectors.pubmed.spider', 38 | 'collectors.takeda.spider', 39 | ], 40 | 'DOWNLOAD_DELAY': float(os.getenv('DOWNLOAD_DELAY', 1)), 41 | 'AUTOTHROTTLE_ENABLED': True, 42 | 'ITEM_PIPELINES': { 43 | 'collectors.base.pipelines.Warehouse': 100, 44 | }, 45 | } 46 | 47 | 48 | # Logging 49 | 50 | def setup_syslog_handler(): 51 | if os.environ.get('LOGGING_URL', None): 52 | host, port = os.environ['LOGGING_URL'].split(':') 53 | handler = logging.handlers.SysLogHandler(address=(host, int(port))) 54 | else: 55 | handler = logging.handlers.SysLogHandler() 56 | return handler 57 | 58 | 59 | SENTRY_DSN = os.environ.get('SENTRY_DSN') 60 | SENTRY = raven.Client(SENTRY_DSN) 61 | 62 | LOGGING_CONFIG = { 63 | 'version': 1, 64 | 'disable_existing_loggers': False, 65 | 'formatters': { 66 | 'default': { 67 | 'format': '%(levelname)s %(name)s: %(message)s', 68 | }, 69 | }, 70 | 'handlers': { 71 | 'default_handler': { 72 | 'class': 'logging.StreamHandler', 73 | 'stream': 'ext://sys.stdout', 74 | 'level': 'DEBUG', 75 | 'formatter': 'default' 76 | }, 77 | 'syslog_handler': { 78 | '()': setup_syslog_handler, 79 | 'level': 'INFO', 80 | 'formatter': 'default', 81 | }, 82 | 'sentry': { 83 | 'level': 'ERROR', 84 | 'class': 'raven.handlers.logging.SentryHandler', 85 | 'dsn': SENTRY_DSN, 86 | }, 87 | }, 88 | 'root': { 89 | 'handlers': ['default_handler', 'syslog_handler'], 90 | 'level': os.environ.get('LOGGING_LEVEL', 'DEBUG').upper(), 91 | }, 92 | } 93 | 94 | logging.config.dictConfig(LOGGING_CONFIG) 95 | 96 | # ICTRP 97 | 98 | ICTRP_USER = os.environ.get('ICTRP_USER', None) 99 | ICTRP_PASS = os.environ.get('ICTRP_PASS', None) 100 | 101 | # HRA 102 | 103 | HRA_ENV = os.environ.get('HRA_ENV', None) 104 | HRA_URL = os.environ.get('HRA_URL', None) 105 | HRA_USER = os.environ.get('HRA_USER', None) 106 | HRA_PASS = os.environ.get('HRA_PASS', None) 107 | 108 | # Cochrane Reviews 109 | 110 | COCHRANE_ARCHIVE_URL = os.environ.get('COCHRANE_ARCHIVE_URL') 111 | -------------------------------------------------------------------------------- /collectors/hra/parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import logging 8 | from .record import Record 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def parse_record(url, item): 13 | 14 | # Init data 15 | data = {} 16 | 17 | # Map data 18 | data['hra_id'] = 'HRA%s' % item['ApplicationID'] 19 | data['publication_date'] = item['PublicationDate'] 20 | data['updated_date'] = item['UpdatedDate'] 21 | data['comittee_name'] = item['CommitteeName'] 22 | data['comittee_ref_number'] = item['CommitteeReferenceNumber'] 23 | data['iras_proj_id'] = item['IrasProjectID'] 24 | data['contact_name'] = item['ContactName'] 25 | data['contact_email'] = item['ContactEmail'] 26 | data['application_title'] = item['ApplicationTitle'] 27 | data['study_type_id'] = item['StudyTypeID'] 28 | data['study_type'] = item['StudyType'] 29 | data['sponsor_org'] = item['SponsorOrganisation'] 30 | data['research_programme'] = item['ResearchProgramme'] 31 | data['data_coll_arrangements'] = item['DataCollectionArrangements'] 32 | data['establishment_org'] = item['EstablishmentOrganisation'] 33 | data['establishment_org_address_1'] = item['EstablishmentOrganisationAddress1'] 34 | data['establishment_org_address_2'] = item['EstablishmentOrganisationAddress2'] 35 | 
data['establishment_org_address_3'] = item['EstablishmentOrganisationAddress3']
36 | data['establishment_org_post_code'] = item['EstablishmentOrganisationPostcode']
37 | data['decision'] = item['Decision']
38 | data['decision_date'] = item['DecisionDate']
39 | data['human_tissue_license'] = item['HumanTissueAuthorityStorageLicence']
40 | data['rtb_title'] = item['RTBTitle']
41 | data['research_database_title'] = item['ResearchDatabaseTitle']
42 | data['application_full_title'] = item['ApplicationFullTitle']
43 | data['isrctn_id'] = item['ISRCTN']
44 | data['nct_id'] = item['NCT']
45 | data['additional_ref_numbers'] = item['AdditionalReferenceNumbers']
46 | data['duration_of_study_in_uk'] = item['DurationOfStudyInUK']
47 | data['research_summary'] = item['ResearchSummary']
48 | data['euctr_id'] = item['EudraCT']
49 | data['social_value'] = item['SocialValue']
50 | data['recuitment_arrangements'] = item['RecruitmentArrangements']
51 | data['risk_and_benefit'] = item['RiskAndBenefit']
52 | data['participants_protection_and_care'] = item['ParticipantsProtectionAndCare']
53 | data['informed_consent'] = item['InformedConsent']
54 | data['applicant_and_staff_suitability'] = item['ApplicantAndStaffSuitability']
55 | data['independent_review'] = item['IndependentReview']
56 | data['supporting_info_suitability'] = item['SupportingInfoSuitability']
57 | data['other_comments'] = item['OtherComments']
58 | data['research_summary_suitability'] = item['ResearchSummarySuitability']
59 |
60 | # Ignore deferred records
61 | date_fields = ['publication_date', 'decision_date', 'updated_date']
62 | if any('deferred' in data[date_field] for date_field in date_fields):
63 | return None
64 |
65 | # Create record
66 | record = Record.create(url, data)
67 |
68 | return record
69 |
--------------------------------------------------------------------------------
/docs/overview.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | This system is responsible for managing the schema of the OpenTrials `warehouse` database and for
4 | collecting the data that populates it.
5 |
6 | ## Stack
7 |
8 | Collectors are fully compatible with Python 2.7.
9 |
10 | We use PostgreSQL for our database and [Alembic](http://alembic.zzzcomputing.com/en/latest/) for migrations.
11 |
12 | Collectors are deployed and run in production with [DockerCloud](https://github.com/respect31/docker-cloud-example).
13 |
14 | ## Collectors
15 |
16 | The system's collectors are independent Python modules that share the following signature:
17 |
18 | ```python
19 | def collect(conf, conn, *args):
20 | pass
21 | ```
22 |
23 | The arguments are:
24 | - `conf` - config dict
25 | - `conn` - connections dict
26 | - `args` - collector arguments
27 |
28 | To run a collector from the command line:
29 | ```
30 | $ make start <collector> [<arguments>]
31 | ```
32 |
33 | This command triggers a `collectors.<collector>.collect(conf, conn, *args)` call.
34 |
35 | *NOTE*: Most collectors need `date_from` and `date_to` arguments that define the
36 | time range from which we want to extract resources. For example:
37 |
38 | ```
39 | $ make start nct 2013-11-30 2013-12-01
40 | ```
41 |
42 | To check whether that is the case, see the `collect` function of the collector you are interested in.
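A minimal collector is therefore just a module exposing this entry point. The following sketch is purely illustrative; the `example` collector and its logging are placeholders, not a module from this repository:

```python
# collectors/example/__init__.py (hypothetical sketch of the collector contract)
import logging

logger = logging.getLogger(__name__)


def collect(conf, conn, date_from=None, date_to=None):
    # conf is the config dict and conn the connections dict,
    # both supplied by the `make start` entry point
    logger.info('Collecting example data: %s - %s', date_from, date_to)
    # ...fetch source data for the date range and write records here...
```

43 |
44 | ### Scraping Collectors
45 |
46 | Many collectors are scrapers. Scraping is based on the
47 | [Scrapy](https://scrapy.readthedocs.io/en/latest/intro/overview.html) framework. Here is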
Here is 48 | an example of how to use Scrapy in the `collect` function: 49 | 50 | ```python 51 | from scrapy.crawler import CrawlerProcess 52 | from .spider import Spider 53 | 54 | def collect(conf, conn, ): 55 | process = CrawlerProcess(conf) 56 | process.crawl(Spider, conn=conn, ) 57 | process.start() 58 | ``` 59 | 60 | For more details check the tutorial [How to Write a Collector using Scrapy](https://github.com/opentrials/collectors/blob/master/docs/collector-scrapy-guide.md) 61 | 62 | ### Working with the database 63 | 64 | The folder `collectors/base` contains multiple reusable components and 65 | helpers including the [base class for a database record](https://github.com/opentrials/collectors/blob/master/collectors/base/record.py) 66 | and the [base class for a record's field](https://github.com/opentrials/collectors/blob/master/collectors/base/fields.py). 67 | Each collector that has a corresponding table in the `warehouse` database has to 68 | define the schema for that table in a class that inherits from the base class for record. 69 | 70 | For example the following class defines the schema for table `colors`. This table has 71 | 2 fields of type `Text`, one of which is a primary key: 72 | 73 | ```python 74 | class ColorRecord(base.Record): 75 | table = 'colors' 76 | 77 | # Fields 78 | 79 | id = Text(primary_key=True) 80 | color = Text() 81 | ``` 82 | 83 | To see how this connects to the other parts of the collector check the [How to Write a Collector](https://github.com/opentrials/collectors/blob/master/docs/collector-guide.md) tutorial. 84 | #### Altering the database schema 85 | 86 | 1. Define the table/field in the collector's record class as explained above. 87 | 2. Create a migration for it (more details in [Alembic docs](http://alembic.zzzcomputing.com/en/latest/tutorial.html#create-a-migration-script)). 88 | -------------------------------------------------------------------------------- /collectors/actrn/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. 
import base 8 | from ..base.fields import Text, Date, Boolean, Integer, Json, Array 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'actrn' 18 | 19 | # General 20 | 21 | trial_id = Text(primary_key=True) 22 | ethics_application_status = Text() 23 | date_submitted = Date('%d/%m/%Y') 24 | date_registered = Date('%d/%m/%Y') 25 | type_of_registration = Text() 26 | 27 | # Titles & IDs 28 | 29 | public_title = Text() 30 | scientific_title = Text() 31 | secondary_ids = Array() 32 | universal_trial_number_utn = Text() 33 | trial_acronym = Text() 34 | 35 | # Health condition 36 | 37 | health_conditions_or_problems_studied = Text() 38 | condition_category = Text() 39 | condition_code = Text() 40 | 41 | # Intervention/exposure 42 | 43 | study_type = Text() 44 | patient_registry = Boolean('Yes') 45 | target_follow_up_duration = Integer() 46 | target_follow_up_type = Text() 47 | description_of_intervention_s_exposure = Text() 48 | intervention_codes = Array() 49 | comparator_control_treatment = Text() 50 | control_group = Text() 51 | 52 | # Outcomes 53 | 54 | primary_outcomes = Json() 55 | secondary_outcomes = Json() 56 | 57 | # Eligibility 58 | 59 | key_inclusion_criteria = Text() 60 | minimum_age = Text() 61 | maximum_age = Text() 62 | gender = Text() 63 | can_healthy_volunteers_participate = Boolean('Yes') 64 | key_exclusion_criteria = Text() 65 | 66 | # Study design 67 | 68 | purpose_of_the_study = Text() 69 | allocation_to_intervention = Text() 70 | procedure_for_enrolling_a_subject_and_allocating_the_treatment_ = Text() 71 | methods_used_to_generate_the_sequence_in_which_subjects_will_be = Text() 72 | masking_blinding = Text() 73 | who_is_are_masked_blinded = Text() 74 | intervention_assignment = Text() 75 | other_design_features = Text() 76 | phase = Text() 77 | type_of_endpoint_s = Text() 78 | purpose = Text() 79 | duration = Text() 80 | selection = Text() 81 | timing = Text() 82 | statistical_methods_analysis = Text() 83 | 84 | # Recruitment 85 | 86 | anticipated_date_of_first_participant_enrolment = Date('%d/%m/%Y') 87 | actual_date_of_first_participant_enrolment = Date('%d/%m/%Y') 88 | anticipated_date_last_participant_enrolled = Date('%d/%m/%Y') 89 | actual_date_last_participant_enrolled = Date('%d/%m/%Y') 90 | target_sample_size = Integer() 91 | actual_sample_size = Integer() 92 | recruitment_status = Text() 93 | recruitment_state_s = Text() 94 | 95 | # Funding & Sponsors 96 | 97 | primary_sponsor = Json() 98 | sponsors = Json() 99 | 100 | # Ethics approval 101 | 102 | ethics_application_status = Text() 103 | ethics_applications = Json() 104 | 105 | # Summary 106 | 107 | brief_summary = Text() 108 | trial_website = Text() 109 | trial_related_presentations_publications = Text() 110 | public_notes = Text() 111 | attachments = Array() 112 | 113 | # Contacts 114 | 115 | principal_investigator = Json() 116 | public_queries = Json() 117 | scientific_queries = Json() 118 | -------------------------------------------------------------------------------- /collectors/cochrane_reviews/parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import uuid 8 | try: 9 | from lxml import etree 10 | except ImportError: 11 | import xml.etree.ElementTree as etree 12 | from .record import Record 13 | 14 | 15 | def 
parse_record(url, review_file): 16 | tree = etree.parse(review_file) 17 | study_robs = [] 18 | studies = [] 19 | 20 | # Get risk of bias 21 | 22 | root = tree.getroot() 23 | doi_id = root.attrib.get('DOI', '') 24 | quality_item_data_entries = tree.findall('//QUALITY_ITEM_DATA_ENTRY') 25 | for quality_item_data_entry in quality_item_data_entries: 26 | study_rob = { 27 | 'study_id': quality_item_data_entry.attrib['STUDY_ID'], 28 | 'modified': quality_item_data_entry.attrib.get('MODIFIED', ''), 29 | 'result': quality_item_data_entry.attrib['RESULT'], 30 | 'group_id': quality_item_data_entry.attrib.get('GROUP_ID', ''), 31 | 'group_name': '', 32 | 'result_description': quality_item_data_entry.findtext('DESCRIPTION/P', ''), 33 | } 34 | quality_item = quality_item_data_entry.getparent().getparent() 35 | study_rob['rob_id'] = quality_item.attrib['ID'] 36 | study_rob['rob_name'] = quality_item.findtext('NAME') 37 | study_rob['rob_description'] = quality_item.findtext('DESCRIPTION/P', '') 38 | for group in quality_item.iter('QUALITY_ITEM_DATA_ENTRY_GROUP'): 39 | group_id = group.attrib.get('ID') 40 | if group_id == study_rob['group_id']: 41 | study_rob['group_name'] = group.findtext('NAME') 42 | study_robs.append(study_rob) 43 | 44 | # Get references 45 | 46 | included_studies = tree.find('//INCLUDED_STUDIES') 47 | for study in included_studies.iter('STUDY'): 48 | study_info = { 49 | 'id': uuid.uuid1().hex, 50 | 'doi_id': doi_id, 51 | 'file_name': review_file.name, 52 | 'study_id': study.attrib['ID'], 53 | 'study_type': study.attrib['DATA_SOURCE'], 54 | 'refs': [], 55 | } 56 | corresponding_robs = [rob for rob in study_robs 57 | if rob['study_id'] == study_info['study_id']] 58 | study_info['robs'] = corresponding_robs 59 | for reference in study.iter('REFERENCE'): 60 | ref = { 61 | 'type': reference.attrib['TYPE'], 62 | 'authors': reference.findtext('AU', ''), 63 | 'title': reference.findtext('TI', ''), 64 | 'source': reference.findtext('SO', ''), 65 | 'year': reference.findtext('YR', ''), 66 | 'vl': reference.findtext('VL', ''), 67 | 'no': reference.findtext('NO', ''), 68 | 'pg': reference.findtext('PG', ''), 69 | 'country': reference.findtext('CY', ''), 70 | 'identifiers': [], 71 | } 72 | for identifier in reference.iter('IDENTIFIER'): 73 | ident = {key.lower(): value for key, value in identifier.items() 74 | if key not in ['MODIFIED', 'MODIFIED_BY']} 75 | ref['identifiers'].append(ident) 76 | study_info['refs'].append(ref) 77 | 78 | # Create record 79 | 80 | record = Record.create(url, study_info) 81 | studies.append(record) 82 | 83 | return studies 84 | -------------------------------------------------------------------------------- /collectors/pubmed/spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import logging 8 | import requests 9 | from urllib import urlencode 10 | from datetime import datetime, date, timedelta 11 | from collections import OrderedDict 12 | from scrapy.spiders import CrawlSpider 13 | from .parser import parse_record 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | # Module API 18 | 19 | class Spider(CrawlSpider): 20 | 21 | # Public 22 | 23 | name = 'pubmed' 24 | allowed_domains = ['eutils.ncbi.nlm.nih.gov'] 25 | 26 | def __init__(self, conf=None, conn=None, date_from=None, date_to=None): 27 | 28 | # Save conf/conn 29 | 
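# (conf/conn are supplied by the collector's collect() entry point; see docs/overview.md)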
self.conf = conf 30 | self.conn = conn 31 | 32 | # Make start urls 33 | self.start_urls = _make_start_urls( 34 | prefix='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi/', 35 | template='https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi/?db=pubmed&id={pmid}&retmode=xml', 36 | date_from=date_from, date_to=date_to) 37 | 38 | # Set parser 39 | self.parse = parse_record 40 | 41 | # Inherit parent 42 | super(Spider, self).__init__() 43 | 44 | 45 | # Internal 46 | 47 | def _make_start_urls(prefix, template, date_from=None, date_to=None, session=None): 48 | """ Return start_urls. 49 | """ 50 | 51 | # Init urls and session 52 | urls = set() 53 | if not session: 54 | session = requests.Session() 55 | adapter_opts = {'max_retries': requests.packages.urllib3.util.Retry(total=5, status_forcelist=[503])} 56 | session.mount('https://', requests.adapters.HTTPAdapter(**adapter_opts)) 57 | 58 | # Prepare dates 59 | if date_from is None: 60 | date_from = str(date.today() - timedelta(days=1)) 61 | if date_to is None: 62 | date_to = str(date.today()) 63 | date_from = datetime.strptime(date_from, '%Y-%m-%d').strftime('%Y/%m/%d') 64 | date_to = datetime.strptime(date_to, '%Y-%m-%d').strftime('%Y/%m/%d') 65 | 66 | # Prepare query 67 | query = OrderedDict() 68 | query['db'] = 'pubmed' 69 | query['retmode'] = 'json' 70 | query['mindate'] = date_from 71 | query['maxdate'] = date_to 72 | 73 | # Terms to search 74 | query['term'] = """(randomized controlled trial[Publication Type] OR 75 | (randomized[Title/Abstract] 76 | AND controlled[Title/Abstract] 77 | AND trial[Title/Abstract] 78 | )) 79 | """ 80 | 81 | # For both publication/modifiction 82 | for date_type in ['pdat', 'mdat']: 83 | retstart = 0 84 | retmax = 50000 85 | while True: 86 | query['datetype'] = date_type 87 | query['retstart'] = retstart 88 | query['retmax'] = retmax 89 | url = '%s?%s' % (prefix, urlencode(query)) 90 | response = session.get(url) 91 | pmids = response.json()['esearchresult']['idlist'] 92 | if not pmids: 93 | break 94 | for pmid in pmids: 95 | urls.add(template.format(pmid=pmid)) 96 | retstart += retmax 97 | 98 | # Log urls count 99 | logger.info('Populated Pubmed start urls: %s', len(urls)) 100 | 101 | return list(urls) 102 | -------------------------------------------------------------------------------- /collectors/base/fields.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from scrapy import Field 9 | from six import add_metaclass 10 | from abc import ABCMeta, abstractmethod 11 | from sqlalchemy.dialects.postgresql import ARRAY, JSONB 12 | from . 
import helpers 13 | 14 | 15 | @add_metaclass(ABCMeta) 16 | class Base(Field): 17 | 18 | # Public 19 | 20 | def __init__(self, primary_key=False): 21 | self.__primary_key = primary_key 22 | 23 | def __repr__(self): 24 | return type(self).__name__ 25 | 26 | @property 27 | @abstractmethod 28 | def column_type(self): 29 | pass # pragma: no cover 30 | 31 | @property 32 | def primary_key(self): 33 | return self.__primary_key 34 | 35 | def parse(self, value): 36 | return value 37 | 38 | 39 | class Text(Base): 40 | 41 | # Public 42 | 43 | column_type = sa.Text 44 | 45 | 46 | class Integer(Base): 47 | 48 | # Public 49 | 50 | column_type = sa.Integer 51 | 52 | def parse(self, value): 53 | return int(value) 54 | 55 | 56 | class Boolean(Base): 57 | 58 | # Public 59 | 60 | column_type = sa.Boolean 61 | 62 | def __init__(self, true_value=None, **params): 63 | super(Boolean, self).__init__(**params) 64 | self.__true_value = true_value 65 | 66 | def parse(self, value): 67 | if self.__true_value is not None: 68 | value = (value.lower() == self.__true_value.lower()) 69 | return value 70 | 71 | 72 | class Date(Base): 73 | 74 | # Public 75 | 76 | column_type = sa.Date 77 | 78 | def __init__(self, formats, **params): 79 | super(Date, self).__init__(**params) 80 | if not isinstance(formats, (list, tuple)): 81 | formats = [formats] 82 | self.__formats = formats 83 | 84 | def parse(self, value): 85 | for i, fmt in enumerate(self.__formats): 86 | try: 87 | return helpers.parse_date(value, format=fmt) 88 | except ValueError: 89 | pass 90 | msg = "time data '{value}' doesn't match any of the formats: {formats}" 91 | raise ValueError(msg.format(value=value, formats=self.__formats)) 92 | 93 | 94 | class Datetime(Base): 95 | 96 | # Public 97 | 98 | column_type = sa.DateTime(timezone=True) 99 | 100 | def __init__(self, format=None, **params): 101 | super(Datetime, self).__init__(**params) 102 | self.__format = format 103 | 104 | def parse(self, value): 105 | if self.__format is not None: 106 | value = helpers.parse_datetime(value, format=self.__format) 107 | return value 108 | 109 | 110 | class Json(Base): 111 | 112 | # Public 113 | 114 | column_type = JSONB 115 | 116 | 117 | class Array(Base): 118 | 119 | # Public 120 | 121 | def __init__(self, field=None, **params): 122 | super(Array, self).__init__(**params) 123 | if field is None: 124 | field = Text() 125 | self.__field = field 126 | self.__column_type = ARRAY(field.column_type) 127 | 128 | @property 129 | def column_type(self): 130 | return self.__column_type 131 | 132 | def parse(self, value): 133 | result = [] 134 | for item in value: 135 | result.append(self.__field.parse(item)) 136 | return result 137 | -------------------------------------------------------------------------------- /migrations/versions/20160220164104_nct_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import ARRAY, JSONB 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic. 
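# (down_revision is None below: this was the first migration in the chain)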
13 | revision = '999c8f33bc04' 14 | down_revision = None 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.create_table('nct', 21 | 22 | # Meta 23 | 24 | sa.Column('meta_uuid', sa.Text), 25 | sa.Column('meta_source', sa.Text), 26 | sa.Column('meta_created', sa.DateTime(timezone=True)), 27 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 28 | 29 | # General 30 | 31 | sa.Column('download_date', sa.Text), 32 | sa.Column('link_text', sa.Text), 33 | sa.Column('url', sa.Text), 34 | sa.Column('org_study_id', sa.Text), 35 | sa.Column('nct_id', sa.Text, primary_key=True), 36 | sa.Column('secondary_ids', ARRAY(sa.Text)), 37 | sa.Column('nct_aliases', ARRAY(sa.Text)), 38 | sa.Column('brief_title', sa.Text), 39 | sa.Column('acronym', sa.Text), 40 | sa.Column('official_title', sa.Text), 41 | sa.Column('sponsors', JSONB), 42 | sa.Column('source', sa.Text), 43 | sa.Column('oversight_info', JSONB), 44 | sa.Column('brief_summary', sa.Text), 45 | sa.Column('detailed_description', sa.Text), 46 | sa.Column('overall_status', sa.Text), 47 | sa.Column('why_stopped', sa.Text), 48 | sa.Column('start_date', sa.Date), 49 | sa.Column('completion_date_actual', sa.Date), 50 | sa.Column('completion_date_anticipated', sa.Date), 51 | sa.Column('primary_completion_date_actual', sa.Date), 52 | sa.Column('primary_completion_date_anticipated', sa.Date), 53 | sa.Column('phase', sa.Text), 54 | sa.Column('study_type', sa.Text), 55 | sa.Column('study_design', sa.Text), 56 | sa.Column('target_duration', sa.Text), 57 | sa.Column('primary_outcomes', JSONB), 58 | sa.Column('secondary_outcomes', JSONB), 59 | sa.Column('other_outcomes', JSONB), 60 | sa.Column('number_of_arms', sa.Integer), 61 | sa.Column('number_of_groups', sa.Integer), 62 | sa.Column('enrollment_actual', sa.Integer), 63 | sa.Column('enrollment_anticipated', sa.Integer), 64 | sa.Column('conditions', ARRAY(sa.Text)), 65 | sa.Column('arm_groups', JSONB), 66 | sa.Column('interventions', JSONB), 67 | sa.Column('biospec_retention', sa.Text), 68 | sa.Column('biospec_desrc', sa.Text), 69 | sa.Column('eligibility', JSONB), 70 | sa.Column('overall_officials', JSONB), 71 | sa.Column('overall_contact', JSONB), 72 | sa.Column('overall_contact_backup', JSONB), 73 | sa.Column('locations', JSONB), 74 | sa.Column('location_countries', ARRAY(sa.Text)), 75 | sa.Column('removed_countries', ARRAY(sa.Text)), 76 | sa.Column('links', JSONB), 77 | sa.Column('references', JSONB), 78 | sa.Column('results_references', JSONB), 79 | sa.Column('verification_date', sa.Date), 80 | sa.Column('lastchanged_date', sa.Date), 81 | sa.Column('firstreceived_date', sa.Date), 82 | sa.Column('firstreceived_results_date', sa.Date), 83 | sa.Column('responsible_party', JSONB), 84 | sa.Column('keywords', ARRAY(sa.Text)), 85 | sa.Column('is_fda_regulated', sa.Text), 86 | sa.Column('is_section_801', sa.Text), 87 | sa.Column('has_expanded_access', sa.Text), 88 | sa.Column('condition_browse', JSONB), 89 | sa.Column('intervention_browse', JSONB), 90 | sa.Column('clinical_results', JSONB), 91 | 92 | ) 93 | 94 | 95 | def downgrade(): 96 | op.drop_table('nct') 97 | -------------------------------------------------------------------------------- /collectors/gsk/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. 
import base 8 | from ..base.fields import Text, Date, Boolean, Integer, Json, Array 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | _FULL_DATE_FORMATS = [ 15 | '%B %d, %Y', 16 | '%b %d, %Y' 17 | ] 18 | 19 | # Config 20 | 21 | table = 'gsk' 22 | 23 | # General 24 | 25 | study_id = Text(primary_key=True) 26 | study_title = Text() 27 | patient_level_data = Text() 28 | clinicaltrials_gov_identifier = Text() 29 | sponsor = Text() 30 | collaborators = Text() 31 | study_recruitment_status = Text() 32 | generic_name = Text() 33 | trade_name = Text() 34 | study_indication = Text() 35 | 36 | # Protocol summary 37 | 38 | first_received = Date(_FULL_DATE_FORMATS) 39 | last_updated = Date(_FULL_DATE_FORMATS) 40 | title = Text() 41 | phase = Text() 42 | acronym = Text() 43 | secondary_ids = Array() 44 | fda_regulated_intervention = Boolean('yes') 45 | section_801_clinical_trial = Boolean('yes') 46 | delayed_posting = Boolean('yes') 47 | ind_ide_protocol = Text() 48 | ind_ide_grantor = Text() 49 | ind_ide_number = Text() 50 | ind_ide_serial_number = Text() 51 | has_expanded_access = Boolean('yes') 52 | study_type = Text() 53 | oversight_authority = Array() 54 | sponsor = Text() 55 | collaborators = Array() 56 | brief_summary = Text() 57 | detailed_description = Text() 58 | record_verification_date = Date(_FULL_DATE_FORMATS) 59 | status = Text() 60 | why_study_stopped = Text() 61 | study_start_date = Date('%B %Y') 62 | study_completion_date = Date('%B %Y') 63 | study_completion_date_type = Text() 64 | primary_completion_date = Date('%B %Y') 65 | primary_completion_date_type = Text() 66 | primary_purpose = Text() 67 | study_design = Text() 68 | time_perspective = Text() 69 | biospecimen_retention = Text() 70 | biospecimen_description = Text() 71 | allocation = Text() 72 | masking = Text() 73 | masked_subject = Boolean('yes') 74 | masked_caregiver = Boolean('yes') 75 | masked_investigator = Boolean('yes') 76 | masked_assessor = Boolean('yes') 77 | study_design_assignment = Text() 78 | study_classification_endpoint = Text() 79 | primary_outcomes = Json() 80 | secondary_outcomes = Json() 81 | arms = Json() 82 | interventions = Json() 83 | conditions = Array() 84 | keywords = Array() 85 | study_population = Text() 86 | sampling_method = Text() 87 | eligibility_criteria = Text() 88 | gender = Text() 89 | minimum_age = Text() 90 | maximum_age = Text() 91 | enrollment = Integer() 92 | enrollment_type = Text() 93 | healthy_volunteers = Boolean('yes') 94 | central_contact = Text() 95 | central_contact_phone = Text() 96 | central_contact_email = Text() 97 | overall_study_official = Text() 98 | overall_study_official_affiliation = Text() 99 | overall_study_official_role = Text() 100 | responsible_party_name_official_title = Text() 101 | responsible_party_organization = Text() 102 | 103 | # Locations 104 | 105 | contact_name = Text() 106 | contact_phone = Text() 107 | contact_email = Text() 108 | 109 | # Result summary 110 | 111 | protocol_id = Text() 112 | clinical_study_id = Text() 113 | official_study_title = Text() 114 | phase = Text() 115 | study_indication_or_diseases = Text() 116 | generic_name = Text() 117 | trade_name = Text() 118 | trade_name_product_name = Text() 119 | study_indications = Text() 120 | results_url = Text() 121 | 122 | # Publication 123 | 124 | citation = Text() 125 | publication_type = Text() 126 | -------------------------------------------------------------------------------- /collectors/base/record.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import uuid 8 | import scrapy 9 | import logging 10 | from abc import abstractmethod 11 | from . import config 12 | from . import fields 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | # Module API 17 | 18 | class Record(scrapy.Item): 19 | 20 | # Public 21 | 22 | def __repr__(self): 23 | template = '<%s: %s>' 24 | text = template % (self.table.upper(), self.get(self.__primary_key)) 25 | return text 26 | 27 | @property 28 | @abstractmethod 29 | def table(self): 30 | """Source name. 31 | """ 32 | pass # pragma: no cover 33 | 34 | @classmethod 35 | def create(cls, source, data): 36 | 37 | # Init dict 38 | self = cls() 39 | 40 | # Get primary_key 41 | self.__primary_key = None 42 | for key, field in self.fields.items(): 43 | if field.primary_key: 44 | self.__primary_key = key 45 | break 46 | if self.__primary_key is None: 47 | raise TypeError('Record %s requires primary key' % cls) 48 | if not isinstance(self.fields[self.__primary_key], fields.Text): 49 | raise TypeError('Record %s requires text primary key' % cls) 50 | 51 | # Get column types 52 | self.__column_types = {} 53 | for key, field in self.fields.items(): 54 | self.__column_types[key] = field.column_type 55 | 56 | # Add metadata 57 | ident = uuid.uuid1().hex 58 | self.fields['meta_id'] = fields.Text() 59 | self.fields['meta_source'] = fields.Text() 60 | self.fields['meta_created'] = fields.Datetime() 61 | self.fields['meta_updated'] = fields.Datetime() 62 | self['meta_id'] = ident 63 | self['meta_source'] = source 64 | 65 | # Add data 66 | undefined = [] 67 | for key, value in data.items(): 68 | field = self.fields.get(key) 69 | if field is None: 70 | undefined.append(key) 71 | continue 72 | if value is None: 73 | continue 74 | try: 75 | value = field.parse(value) 76 | except Exception: 77 | config.SENTRY.captureException() 78 | continue 79 | self[key] = value 80 | for key in undefined: 81 | logger.warning('Undefined field: %s - %s' % (self, key)) 82 | 83 | return self 84 | 85 | def write(self, conf, conn): 86 | """Write record to warehouse. 
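Rows are matched on the record's primary key and upserted, so an existing row is updated in place rather than duplicated.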
87 | 88 | Args: 89 | conf (dict): config dictionary 90 | conn (dict): connections dictionary 91 | 92 | """ 93 | config.SENTRY.extra_context({ 94 | 'record_table': self.table, 95 | 'record_id': self.__primary_key, 96 | }) 97 | 98 | if self.table not in conn['warehouse'].tables: 99 | if conf['ENV'] in ['development', 'testing']: 100 | table = conn['warehouse'].create_table( 101 | self.table, 102 | primary_id=self.__primary_key, 103 | primary_type='String') 104 | table = conn['warehouse'][self.table] 105 | action = 'created' 106 | if table.find_one(**{self.__primary_key: self[self.__primary_key]}): 107 | action = 'updated' 108 | del self['meta_id'] 109 | 110 | ensure_fields = False 111 | if conf['ENV'] in ['development', 'testing']: 112 | ensure_fields = True 113 | table.upsert( 114 | self, [self.__primary_key], 115 | ensure=ensure_fields, types=self.__column_types) 116 | 117 | logger.debug('Record - %s: %s - %s fields', action, self, len(self)) 118 | -------------------------------------------------------------------------------- /migrations/versions/20160220175816_isrctn_create_table.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | import sqlalchemy as sa 8 | from sqlalchemy.dialects.postgresql import JSONB 9 | from alembic import op 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '296d1e273220' 14 | down_revision = u'3433d4d2a0d1' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | op.create_table('isrctn', 21 | 22 | # Meta 23 | 24 | sa.Column('meta_uuid', sa.Text), 25 | sa.Column('meta_source', sa.Text), 26 | sa.Column('meta_created', sa.DateTime(timezone=True)), 27 | sa.Column('meta_updated', sa.DateTime(timezone=True)), 28 | 29 | # General 30 | 31 | sa.Column('isrctn_id', sa.Text, primary_key=True), 32 | sa.Column('doi_isrctn_id', sa.Text), 33 | sa.Column('title', sa.Text), 34 | sa.Column('condition_category', sa.Text), 35 | sa.Column('date_applied', sa.Date), 36 | sa.Column('date_assigned', sa.Date), 37 | sa.Column('last_edited', sa.Date), 38 | sa.Column('prospectiveretrospective', sa.Text), 39 | sa.Column('overall_trial_status', sa.Text), 40 | sa.Column('recruitment_status', sa.Text), 41 | sa.Column('plain_english_summary', sa.Text), 42 | sa.Column('trial_website', sa.Text), 43 | 44 | # Contant information 45 | 46 | sa.Column('contacts', JSONB), 47 | 48 | # Additional identifiers 49 | 50 | sa.Column('eudract_number', sa.Text), 51 | sa.Column('clinicaltrialsgov_number', sa.Text), 52 | sa.Column('protocolserial_number', sa.Text), 53 | 54 | # Study information 55 | 56 | sa.Column('scientific_title', sa.Text), 57 | sa.Column('acronym', sa.Text), 58 | sa.Column('study_hypothesis', sa.Text), 59 | sa.Column('ethics_approval', sa.Text), 60 | sa.Column('study_design', sa.Text), 61 | sa.Column('primary_study_design', sa.Text), 62 | sa.Column('secondary_study_design', sa.Text), 63 | sa.Column('trial_setting', sa.Text), 64 | sa.Column('trial_type', sa.Text), 65 | sa.Column('patient_information_sheet', sa.Text), 66 | sa.Column('condition', sa.Text), 67 | sa.Column('intervention', sa.Text), 68 | sa.Column('intervention_type', sa.Text), 69 | sa.Column('phase', sa.Text), 70 | sa.Column('drug_names', sa.Text), 71 | sa.Column('primary_outcome_measures', sa.Text), 72 | sa.Column('secondary_outcome_measures', 
sa.Text), 73 | sa.Column('overall_trial_start_date', sa.Date), 74 | sa.Column('overall_trial_end_date', sa.Date), 75 | sa.Column('reason_abandoned', sa.Text), 76 | 77 | # Eligability 78 | 79 | sa.Column('participant_inclusion_criteria', sa.Text), 80 | sa.Column('participant_type', sa.Text), 81 | sa.Column('age_group', sa.Text), 82 | sa.Column('gender', sa.Text), 83 | sa.Column('target_number_of_participants', sa.Text), 84 | sa.Column('participant_exclusion_criteria', sa.Text), 85 | sa.Column('recruitment_start_date', sa.Date), 86 | sa.Column('recruitment_end_date', sa.Date), 87 | 88 | # Locations 89 | 90 | sa.Column('countries_of_recruitment', sa.Text), 91 | sa.Column('trial_participating_centre', sa.Text), 92 | 93 | # Sponsor information 94 | 95 | sa.Column('sponsors', JSONB), 96 | 97 | # Funders 98 | 99 | sa.Column('funders', JSONB), 100 | 101 | # Results and publications 102 | 103 | sa.Column('publication_and_dissemination_plan', sa.Text), 104 | sa.Column('intention_to_publish_date', sa.Date), 105 | sa.Column('participant_level_data', sa.Text), 106 | sa.Column('results_basic_reporting', sa.Text), 107 | sa.Column('publication_summary', sa.Text), 108 | sa.Column('publication_citations', sa.Text), 109 | 110 | ) 111 | 112 | 113 | def downgrade(): 114 | op.drop_table('isrctn') 115 | -------------------------------------------------------------------------------- /collectors/jprn/record.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | from __future__ import unicode_literals 6 | 7 | from .. import base 8 | from ..base.fields import Text, Date, Boolean, Integer, Array, Datetime 9 | 10 | 11 | # Module API 12 | 13 | class Record(base.Record): 14 | 15 | # Config 16 | 17 | table = 'jprn' 18 | 19 | # General 20 | 21 | unique_trial_number = Text(primary_key=True) 22 | recruitment_status = Text() 23 | title_of_the_study = Text() 24 | date_of_formal_registrationdate_of_icmje_and_who = Date('%Y/%m/%d') 25 | date_and_time_of_last_update = Datetime('%Y/%m/%d %H:%M:%S') 26 | 27 | # Basic information 28 | 29 | official_scientific_title_of_the_study = Text() 30 | title_of_the_study_brief_title = Text() 31 | region = Text() 32 | 33 | # Condition 34 | 35 | condition = Text() 36 | classification_by_specialty = Text() 37 | classification_by_malignancy = Text() 38 | genomic_information = Boolean('YES') 39 | 40 | # Objectives 41 | 42 | narrative_objectives1 = Text() 43 | basic_objectives2 = Text() 44 | basic_objectives_others = Text() 45 | trial_characteristics_1 = Text() 46 | trial_characteristics_2 = Text() 47 | developmental_phase = Text() 48 | 49 | # Assessment 50 | 51 | primary_outcomes = Text() 52 | key_secondary_outcomes = Text() 53 | 54 | # Base 55 | 56 | study_type = Text() 57 | 58 | # Study design 59 | 60 | basic_design = Text() 61 | randomization = Text() 62 | randomization_unit = Text() 63 | blinding = Text() 64 | control = Text() 65 | stratification = Text() 66 | dynamic_allocation = Text() 67 | institution_consideration = Text() 68 | blocking = Text() 69 | concealment = Text() 70 | 71 | # Intervention 72 | 73 | no_of_arms = Integer() 74 | purpose_of_intervention = Text() 75 | type_of_intervention = Text() 76 | interventions = Array() 77 | 78 | # Eligibility 79 | 80 | agelower_limit = Text() 81 | ageupper_limit = Text() 82 | gender = Text() 83 | key_inclusion_criteria = Text() 84 | key_exclusion_criteria = Text() 85 
| target_sample_size = Integer() 86 | 87 | # Research contact person 88 | 89 | research_name_of_lead_principal_investigator = Text() 90 | research_organization = Text() 91 | research_division_name = Text() 92 | research_address = Text() 93 | research_tel = Text() 94 | research_homepage_url = Text() 95 | research_email = Text() 96 | 97 | # Public contact 98 | 99 | public_name_of_contact_person = Text() 100 | public_organization = Text() 101 | public_division_name = Text() 102 | public_address = Text() 103 | public_tel = Text() 104 | public_homepage_url = Text() 105 | public_email = Text() 106 | 107 | # Sponsor 108 | 109 | name_of_primary_sponsor = Text() 110 | 111 | # Funding source 112 | 113 | source_of_funding = Text() 114 | category_of_org = Text() 115 | nation_of_funding = Text() 116 | 117 | # Other related organizations 118 | 119 | cosponsor = Text() 120 | name_of_secondary_funers = Text() 121 | 122 | # Secondary study IDs 123 | 124 | secondary_study_ids = Boolean('YES') 125 | secondary_study_id_1 = Text() 126 | org_issuing_secondary_study_id_1 = Text() 127 | secondary_study_id_2 = Text() 128 | org_issuing_secondary_study_id_2 = Text() 129 | ind_to_mhlw = Text() 130 | 131 | # Institutions 132 | 133 | institutions = Text() 134 | 135 | # Progress 136 | 137 | recruitment_status = Text() 138 | date_of_protocol_fixation = Date('%Y/%m/%d') 139 | anticipated_trial_start_date = Date('%Y/%m/%d') 140 | last_followup_date = Date('%Y/%m/%d') 141 | date_of_closure_to_data_entry = Date('%Y/%m/%d') 142 | date_trial_data_considered_complete = Date('%Y/%m/%d') 143 | date_analysis_concluded = Date('%Y/%m/%d') 144 | 145 | # Related information 146 | 147 | url_releasing_protocol = Text() 148 | publication_of_results = Text() 149 | url_releasing_results = Text() 150 | results = Text() 151 | other_related_information = Text() 152 | 153 | # Others 154 | 155 | date_of_registration = Date('%Y/%m/%d') 156 | date_of_last_update = Datetime('%Y/%m/%d %H:%M:%S') 157 | urljapanese = Text() 158 | urlenglish = Text() 159 | -------------------------------------------------------------------------------- /tests/cassettes/nct.test_parser.TestNctParser.test_parser_parse_text.json: -------------------------------------------------------------------------------- 1 | {"http_interactions": [{"request": {"body": {"string": "", "encoding": "utf-8"}, "headers": {"Connection": ["keep-alive"], "Accept-Encoding": ["gzip, deflate"], "Accept": ["*/*"], "User-Agent": ["python-requests/2.12.2"]}, "method": "GET", "uri": "https://clinicaltrials.gov/show/NCT02931214?displayxml=true"}, "response": {"body": {"base64_string": 
"H4sIAAAAAAAAA9xZW2/cNhZ+L9D/wPVTAnhGHt/iGIoKx0laL+I0iJ1i+yRwJM4M15KoJSnH01+/3yEpjaSZ2GmLLYo1/DCiDs/9rviHh7Jg90IbqapXe7PpwR4TVaZyWS1f7X2+fTc52/sh+f67OCtkJTNepMY2+RonjMX/mEzY7UoaRjgyVS2ULg2zivGK/ev6PbvJVqLkjNtzAmdsZW1tzqOoxWW15IWZLtV9lNnDaGXLIpIlXwoTSSCL6mZeyGz6YHI2mXiSWvynkVrk6UrwXOjEI45z9aUqFM/TnFuRXAb8tx1+VmuVCWNEziwxDDDOVMU+qHtRzoVmRwf77PBgdhpHQ1QBPxDepVY82OQ9fpGIdiVY1mgtKst20NMiUzqfxtHmZkDV6CJ5RBFmpb5EHy5vDw5fHs0OZ8dxRBec7NEu4WOZp6SsFr3SS2+iVObJj9dXk9nRycvJ7GAWR4NXAbzKLD0NCIYzT7OPPp5rKRaplbYQyceCZ2KuJpeqsloVBVR7A6cpBHujjGA3RIgU9faeFw1UyW74Qtg1fCNnH1dclzxTd7ISVmaGqQVreWWyYj8J6GS1Zr+ooqmsgHfGUZ+2Y0YtFjKD3sLZBWEF4at99gk0VCl/E/k+mIETiclraBpP20zvt1xfmExU5PZf458s7mXYZ7eqEJrPZSHpaSzO/kDIfF3x8hEhL/KmsOymmf9bZBaCjuRysppaVUZp0/kjHCANh+EMp4icKlsnPxbrTF3L0qv2CtGsa6UhQh5HAWR0Jc0KbkxyVeWNsXrdgoXjQDPaQTTOoEQ+J+w7GLkU0BLyyp8ju0UijgbqiI1qdCYeFTuAeL9x2W65soO44Y1dKQ1zJp8racmZLa6ac/ZOqdzZ841uljBWiZAFt9x6wbprAQ98MM3LLPmg4qj97ZneQTcElGnKkusOBWWLeaGyu05ft3C9uoE4cEx4kUthLpKZdPlWtD5qNjFW74gx412dd65+9UtLIgfuLRddBRflzkVNcNFpa5kho22I9qWJc2G5RJiluTCZljVp7TE5IQ/+OdO9EM59CM99CNchhLNeCO+QywnkldRix428yci23CL5h7cMdywqAMSlCB9Yf9omrS+yKNicKJi2imjVLFeqsXSrpeARkvaNoJTOIB68x/CyLqBdetFoSZTDSYuXfFw41lBDHR8bS7bIRwYNvJRUgHpm+6pxvmoJFxC8oMrObWOSTyLTjbRQo/fZ3isfbpZr66vjRbNE5IbC2Tt3cJkiGYmMO2R2XYtXexcVeJc1heVe8s8GusDtFxTlA2iPotaSXCn9BlTXXGergOsr1wJOqhKJrxUoiv4xCEa1kXAjJcEl7qFa3OUFyda96oFCk3JZJRcF1Ozywfmg9Lyt8lpJ1x/AVhJZPQAFp3IlZp/1abFrlYvinH3kpHZRoCoRCTLyPrvmBrZfnoeaxlxNY89C5SBE98JYuaRE+RxVyWuBffSJ45zdasEtoWrlCfwPVA2Phtq6xqcU3DRaJN1ddDYFdzGUUz4TyD04RdlqIdvgRiJOF5oD1ewE/dYaIL2zAOW9PJXGNCL5VQBmcNJ2dz2PfYoRhAXSR2DG5bxNSrS9qt2PGQqOUUxEu/QRG0E55BE1jTuBx9RyfMaQP/TTeqFS8pRaLkv+wJ7Vgt9RgjTousEpZTZfqJ7vEnG3NH9jIS8+X7JnHA7Amgr9b9uD34v/D/FunQ0JEdX0v9aWbZP61znsm6NjlF6Uv0DQJ6ZvlqxqaG5L1SKFACY5xNgyPHFQoqIWwaWLHTXj8CCONhBt1UIH4ciH/pxKU3vkIIA9XaL6113r2B6k6FNFkWzSyvjN1gVXUd4+1OgWiAUqNaOXO1Tn5hM88yI0odtKG/L4NMthLvr7cCx7RbHF2D/zpKglx4i6db7rQkXeujHN9rs/ZM0hov8p752Nvpn1R626i3NRyKX09bEb8DDcUC+7md2223ZG01bR0AqJXQb48++/27xmbDbt5l0/TJQcHQx63YWgXxiZZy8npwdsLbh2YwhGQfZMerRI8NMhusMpuxY5rU6KdTelfJF2xSrF2q0KXlF/4xovJAB4nBAVTQdaGLBgRjiPpuydZ4boZytZ5HNwQxdqZUlNaOhL6ncFCKH8oG834qFxhGTFMwtGUZ/m6MEqgD93vfxRnwT+SowtK8PQXlCjr2hMIQrKoWvcb14xnmWitpy6vLnUECtMO0iWmDfzIePHG8YdeysOPjirBDpBYqnWYllxDNYMI42l4YcKp6sz1B91anFTY+BrxHXgkgSSnanDxOR6yZEuT6YYpFCiMRNUudmA+T1cjnRv2uvSrQ3Rx0FEQ4maHv0GBRqhASnwNmLJKQcKdE1+MD7hAwWrME8NOHr78LR/hj6a5k+f2YAV8hZQo7dwxmueSRoM8213/EkatN2ureQVoIuiwnQGjjg68yCqquGY4MGN8IJ9vNpnJS0E/O60oQ4Vx8E7ATWSeKNFsFW7eZ1cJffFCc6ppel2k+2wzuZrepa4gfHM176N8UYUdpkSYfEBFgH6Nq5Yt45ZM1o8iHEkHXdXEJoanTrNbM6gbhyGPxqq+Ns+89q9zyGM4xFi9gPY3y6UMc7akODk1E0W42gaIT6d0t6P+qgOc7j/4luuv5jSODZUHa+USwGdStwGt0V7ePY1vB7jeDh3W65hno2XguIneQ06cRQe2t4NNMumTJEik9lL9iulTPRtvdMWkD90R8itLWDvtF1a+Qya3nf71uTCZSCzcxW7A9xXlHH96FYL7U6z219yY33pes8bzTEKC01xjiH3TRxt3gZwZD7UP6RiskExGHTjyL1sKyDoFJL7DnOzfeydbnZxW3x13FKyRTr/g8zWK1WhWT44nBy/OJ0cns3cloHOhrT7VOJ2g9AiWSDT9MowNbxEYiNTjyLJnecUVL2KHGd0+z1UpgqAZ31cfolj0aiLuebmjrvljRV9gN9knZyenRygsaaffcTIVFaP9qTUJfvjjZMPmYqjkUzxjn1Tb8/ke/GeHX63JZ60hd8uD2wte671J+h+s8NGWwSBr+8LnWekXsFS9NTzpCE2yEbXYzhht5Hy27GfM6voU5jf5m2/99xA1mzFq6XIh7dm3Qe0LRB3cSHRDGiRCXQk4fwGCcZ/fTtt7+6AcrfhR7Tyl6j5KdWxzoe2Xvgm+sZ/IKBPZjvfe93sxhrfifUXpfN2AOx9mmnfDMC29wWPgm0m7QGYNOki5ylatcattfw2bOu0hcVU7CxzdjBz4/boyIHR5wfxUKPwQpfUUSIW288S43O/A4S10fmQ3rs6ZFauBU5lnfu5vvccNmXjW+678KfQxbhmFNUSnT0qODpN6lw2i+7uQ8YTf+77bxxtfYX+LwAAAP//AwCIX4I/wB4AAA==", "encoding": "UTF-8"}, "headers": {"Strict-Transport-Security": ["max-age=31536000; includeSubDomains; preload"], "Content-Security-Policy": 
["upgrade-insecure-requests"], "Content-Encoding": ["gzip"], "Transfer-Encoding": ["chunked"], "Set-Cookie": ["Psid=_ihzm6CLPg4PUiokOyz3FQ7V9g4H5KCnxg0tORcBF608SgzqxBCRa8Wzj; Expires=Thu, 30-Nov-2017 21:06:26 GMT; Path=/; Secure", "CTOpts=Qihzm6CLPg4PUiokOyUgzw-R98L5xR4t-RoR; Expires=Wed, 30-Nov-2016 21:26:26 GMT; Path=/; Secure"], "Expires": ["Wed, 30 Nov 2016 21:06:27 GMT"], "Vary": ["Accept-Encoding"], "Keep-Alive": ["timeout=5, max=30"], "Connection": ["Keep-Alive"], "Date": ["Wed, 30 Nov 2016 21:06:26 GMT"], "Referrer-Policy": ["origin-when-cross-origin"], "Content-Type": ["text/xml;charset=UTF-8"]}, "status": {"message": "OK", "code": 200}, "url": "https://clinicaltrials.gov/show/NCT02931214?displayxml=true"}, "recorded_at": "2016-11-30T21:06:26"}], "recorded_with": "betamax/0.8.0"} -------------------------------------------------------------------------------- /tests/cassettes/nct.test_parser.TestNctParser.test_parser_parse_dict.json: -------------------------------------------------------------------------------- 1 | {"http_interactions": [{"request": {"body": {"string": "", "encoding": "utf-8"}, "headers": {"Connection": ["keep-alive"], "Accept-Encoding": ["gzip, deflate"], "Accept": ["*/*"], "User-Agent": ["python-requests/2.12.2"]}, "method": "GET", "uri": "https://clinicaltrials.gov/show/NCT02931214?displayxml=true"}, "response": {"body": {"base64_string": "H4sIAAAAAAAAA9xZW2/cNhZ+L9D/wPVTAnhGHt/iGIoKx0laL+I0iJ1i+yRwJM4M15KoJSnH01+/3yEpjaSZ2GmLLYo1/DCiDs/9rviHh7Jg90IbqapXe7PpwR4TVaZyWS1f7X2+fTc52/sh+f67OCtkJTNepMY2+RonjMX/mEzY7UoaRjgyVS2ULg2zivGK/ev6PbvJVqLkjNtzAmdsZW1tzqOoxWW15IWZLtV9lNnDaGXLIpIlXwoTSSCL6mZeyGz6YHI2mXiSWvynkVrk6UrwXOjEI45z9aUqFM/TnFuRXAb8tx1+VmuVCWNEziwxDDDOVMU+qHtRzoVmRwf77PBgdhpHQ1QBPxDepVY82OQ9fpGIdiVY1mgtKst20NMiUzqfxtHmZkDV6CJ5RBFmpb5EHy5vDw5fHs0OZ8dxRBec7NEu4WOZp6SsFr3SS2+iVObJj9dXk9nRycvJ7GAWR4NXAbzKLD0NCIYzT7OPPp5rKRaplbYQyceCZ2KuJpeqsloVBVR7A6cpBHujjGA3RIgU9faeFw1UyW74Qtg1fCNnH1dclzxTd7ISVmaGqQVreWWyYj8J6GS1Zr+ooqmsgHfGUZ+2Y0YtFjKD3sLZBWEF4at99gk0VCl/E/k+mIETiclraBpP20zvt1xfmExU5PZf458s7mXYZ7eqEJrPZSHpaSzO/kDIfF3x8hEhL/KmsOymmf9bZBaCjuRysppaVUZp0/kjHCANh+EMp4icKlsnPxbrTF3L0qv2CtGsa6UhQh5HAWR0Jc0KbkxyVeWNsXrdgoXjQDPaQTTOoEQ+J+w7GLkU0BLyyp8ju0UijgbqiI1qdCYeFTuAeL9x2W65soO44Y1dKQ1zJp8racmZLa6ac/ZOqdzZ841uljBWiZAFt9x6wbprAQ98MM3LLPmg4qj97ZneQTcElGnKkusOBWWLeaGyu05ft3C9uoE4cEx4kUthLpKZdPlWtD5qNjFW74gx412dd65+9UtLIgfuLRddBRflzkVNcNFpa5kho22I9qWJc2G5RJiluTCZljVp7TE5IQ/+OdO9EM59CM99CNchhLNeCO+QywnkldRix428yci23CL5h7cMdywqAMSlCB9Yf9omrS+yKNicKJi2imjVLFeqsXSrpeARkvaNoJTOIB68x/CyLqBdetFoSZTDSYuXfFw41lBDHR8bS7bIRwYNvJRUgHpm+6pxvmoJFxC8oMrObWOSTyLTjbRQo/fZ3isfbpZr66vjRbNE5IbC2Tt3cJkiGYmMO2R2XYtXexcVeJc1heVe8s8GusDtFxTlA2iPotaSXCn9BlTXXGergOsr1wJOqhKJrxUoiv4xCEa1kXAjJcEl7qFa3OUFyda96oFCk3JZJRcF1Ozywfmg9Lyt8lpJ1x/AVhJZPQAFp3IlZp/1abFrlYvinH3kpHZRoCoRCTLyPrvmBrZfnoeaxlxNY89C5SBE98JYuaRE+RxVyWuBffSJ45zdasEtoWrlCfwPVA2Phtq6xqcU3DRaJN1ddDYFdzGUUz4TyD04RdlqIdvgRiJOF5oD1ewE/dYaIL2zAOW9PJXGNCL5VQBmcNJ2dz2PfYoRhAXSR2DG5bxNSrS9qt2PGQqOUUxEu/QRG0E55BE1jTuBx9RyfMaQP/TTeqFS8pRaLkv+wJ7Vgt9RgjTousEpZTZfqJ7vEnG3NH9jIS8+X7JnHA7Amgr9b9uD34v/D/FunQ0JEdX0v9aWbZP61znsm6NjlF6Uv0DQJ6ZvlqxqaG5L1SKFACY5xNgyPHFQoqIWwaWLHTXj8CCONhBt1UIH4ciH/pxKU3vkIIA9XaL6113r2B6k6FNFkWzSyvjN1gVXUd4+1OgWiAUqNaOXO1Tn5hM88yI0odtKG/L4NMthLvr7cCx7RbHF2D/zpKglx4i6db7rQkXeujHN9rs/ZM0hov8p752Nvpn1R626i3NRyKX09bEb8DDcUC+7md2223ZG01bR0AqJXQb48++/27xmbDbt5l0/TJQcHQx63YWgXxiZZy8npwdsLbh2YwhGQfZMerRI8NMhusMpuxY5rU6KdTelfJF2xSrF2q0KXlF/4xovJAB4nBAVTQdaGLBgRjiPpuydZ4boZytZ5HNwQxdqZUlNaOhL6ncFCKH8oG834qFxhGTFMwtGUZ/m6MEqgD
93vfxRnwT+SowtK8PQXlCjr2hMIQrKoWvcb14xnmWitpy6vLnUECtMO0iWmDfzIePHG8YdeysOPjirBDpBYqnWYllxDNYMI42l4YcKp6sz1B91anFTY+BrxHXgkgSSnanDxOR6yZEuT6YYpFCiMRNUudmA+T1cjnRv2uvSrQ3Rx0FEQ4maHv0GBRqhASnwNmLJKQcKdE1+MD7hAwWrME8NOHr78LR/hj6a5k+f2YAV8hZQo7dwxmueSRoM8213/EkatN2ureQVoIuiwnQGjjg68yCqquGY4MGN8IJ9vNpnJS0E/O60oQ4Vx8E7ATWSeKNFsFW7eZ1cJffFCc6ppel2k+2wzuZrepa4gfHM176N8UYUdpkSYfEBFgH6Nq5Yt45ZM1o8iHEkHXdXEJoanTrNbM6gbhyGPxqq+Ns+89q9zyGM4xFi9gPY3y6UMc7akODk1E0W42gaIT6d0t6P+qgOc7j/4luuv5jSODZUHa+USwGdStwGt0V7ePY1vB7jeDh3W65hno2XguIneQ06cRQe2t4NNMumTJEik9lL9iulTPRtvdMWkD90R8itLWDvtF1a+Qya3nf71uTCZSCzcxW7A9xXlHH96FYL7U6z219yY33pes8bzTEKC01xjiH3TRxt3gZwZD7UP6RiskExGHTjyL1sKyDoFJL7DnOzfeydbnZxW3x13FKyRTr/g8zWK1WhWT44nBy/OJ0cns3cloHOhrT7VOJ2g9AiWSDT9MowNbxEYiNTjyLJnecUVL2KHGd0+z1UpgqAZ31cfolj0aiLuebmjrvljRV9gN9knZyenRygsaaffcTIVFaP9qTUJfvjjZMPmYqjkUzxjn1Tb8/ke/GeHX63JZ60hd8uD2wte671J+h+s8NGWwSBr+8LnWekXsFS9NTzpCE2yEbXYzhht5Hy27GfM6voU5jf5m2/99xA1mzFq6XIh7dm3Qe0LRB3cSHRDGiRCXQk4fwGCcZ/fTtt7+6AcrfhR7Tyl6j5KdWxzoe2Xvgm+sZ/IKBPZjvfe93sxhrfifUXpfN2AOx9mmnfDMC29wWPgm0m7QGYNOki5ylatcattfw2bOu0hcVU7CxzdjBz4/boyIHR5wfxUKPwQpfUUSIW288S43O/A4S10fmQ3rs6ZFauBU5lnfu5vvccNmXjW+678KfQxbhmFNUSnT0qODpN6lw2i+7uQ8YTf+77bxxtfYX+LwAAAP//AwCIX4I/wB4AAA==", "encoding": "UTF-8"}, "headers": {"Strict-Transport-Security": ["max-age=31536000; includeSubDomains; preload"], "Content-Security-Policy": ["upgrade-insecure-requests"], "Content-Encoding": ["gzip"], "Transfer-Encoding": ["chunked"], "Set-Cookie": ["Psid=fihzm6CLPg4PUiCR-yz3FQ7V9K4BagC5agCRORcBF608FgzqaR48agHgyPt; Expires=Thu, 30-Nov-2017 21:06:27 GMT; Path=/; Secure", "CTOpts=Qihzm6CLPg4PUiCR-yUgzw-R98LyNR43YicR; Expires=Wed, 30-Nov-2016 21:26:27 GMT; Path=/; Secure"], "Expires": ["Wed, 30 Nov 2016 21:06:28 GMT"], "Vary": ["Accept-Encoding"], "Keep-Alive": ["timeout=5, max=30"], "Connection": ["Keep-Alive"], "Date": ["Wed, 30 Nov 2016 21:06:27 GMT"], "Referrer-Policy": ["origin-when-cross-origin"], "Content-Type": ["text/xml;charset=UTF-8"]}, "status": {"message": "OK", "code": 200}, "url": "https://clinicaltrials.gov/show/NCT02931214?displayxml=true"}, "recorded_at": "2016-11-30T21:06:27"}], "recorded_with": "betamax/0.8.0"} -------------------------------------------------------------------------------- /tests/cassettes/nct.test_parser.TestNctParser.test_parser_parse_list.json: -------------------------------------------------------------------------------- 1 | {"http_interactions": [{"request": {"body": {"string": "", "encoding": "utf-8"}, "headers": {"Connection": ["keep-alive"], "Accept-Encoding": ["gzip, deflate"], "Accept": ["*/*"], "User-Agent": ["python-requests/2.12.2"]}, "method": "GET", "uri": "https://clinicaltrials.gov/show/NCT02931214?displayxml=true"}, "response": {"body": {"base64_string": 
"H4sIAAAAAAAAA9xZW2/cNhZ+L9D/wPVTAnhGHt/iGIoKx0laL+I0iJ1i+yRwJM4M15KoJSnH01+/3yEpjaSZ2GmLLYo1/DCiDs/9rviHh7Jg90IbqapXe7PpwR4TVaZyWS1f7X2+fTc52/sh+f67OCtkJTNepMY2+RonjMX/mEzY7UoaRjgyVS2ULg2zivGK/ev6PbvJVqLkjNtzAmdsZW1tzqOoxWW15IWZLtV9lNnDaGXLIpIlXwoTSSCL6mZeyGz6YHI2mXiSWvynkVrk6UrwXOjEI45z9aUqFM/TnFuRXAb8tx1+VmuVCWNEziwxDDDOVMU+qHtRzoVmRwf77PBgdhpHQ1QBPxDepVY82OQ9fpGIdiVY1mgtKst20NMiUzqfxtHmZkDV6CJ5RBFmpb5EHy5vDw5fHs0OZ8dxRBec7NEu4WOZp6SsFr3SS2+iVObJj9dXk9nRycvJ7GAWR4NXAbzKLD0NCIYzT7OPPp5rKRaplbYQyceCZ2KuJpeqsloVBVR7A6cpBHujjGA3RIgU9faeFw1UyW74Qtg1fCNnH1dclzxTd7ISVmaGqQVreWWyYj8J6GS1Zr+ooqmsgHfGUZ+2Y0YtFjKD3sLZBWEF4at99gk0VCl/E/k+mIETiclraBpP20zvt1xfmExU5PZf458s7mXYZ7eqEJrPZSHpaSzO/kDIfF3x8hEhL/KmsOymmf9bZBaCjuRysppaVUZp0/kjHCANh+EMp4icKlsnPxbrTF3L0qv2CtGsa6UhQh5HAWR0Jc0KbkxyVeWNsXrdgoXjQDPaQTTOoEQ+J+w7GLkU0BLyyp8ju0UijgbqiI1qdCYeFTuAeL9x2W65soO44Y1dKQ1zJp8racmZLa6ac/ZOqdzZ841uljBWiZAFt9x6wbprAQ98MM3LLPmg4qj97ZneQTcElGnKkusOBWWLeaGyu05ft3C9uoE4cEx4kUthLpKZdPlWtD5qNjFW74gx412dd65+9UtLIgfuLRddBRflzkVNcNFpa5kho22I9qWJc2G5RJiluTCZljVp7TE5IQ/+OdO9EM59CM99CNchhLNeCO+QywnkldRix428yci23CL5h7cMdywqAMSlCB9Yf9omrS+yKNicKJi2imjVLFeqsXSrpeARkvaNoJTOIB68x/CyLqBdetFoSZTDSYuXfFw41lBDHR8bS7bIRwYNvJRUgHpm+6pxvmoJFxC8oMrObWOSTyLTjbRQo/fZ3isfbpZr66vjRbNE5IbC2Tt3cJkiGYmMO2R2XYtXexcVeJc1heVe8s8GusDtFxTlA2iPotaSXCn9BlTXXGergOsr1wJOqhKJrxUoiv4xCEa1kXAjJcEl7qFa3OUFyda96oFCk3JZJRcF1Ozywfmg9Lyt8lpJ1x/AVhJZPQAFp3IlZp/1abFrlYvinH3kpHZRoCoRCTLyPrvmBrZfnoeaxlxNY89C5SBE98JYuaRE+RxVyWuBffSJ45zdasEtoWrlCfwPVA2Phtq6xqcU3DRaJN1ddDYFdzGUUz4TyD04RdlqIdvgRiJOF5oD1ewE/dYaIL2zAOW9PJXGNCL5VQBmcNJ2dz2PfYoRhAXSR2DG5bxNSrS9qt2PGQqOUUxEu/QRG0E55BE1jTuBx9RyfMaQP/TTeqFS8pRaLkv+wJ7Vgt9RgjTousEpZTZfqJ7vEnG3NH9jIS8+X7JnHA7Amgr9b9uD34v/D/FunQ0JEdX0v9aWbZP61znsm6NjlF6Uv0DQJ6ZvlqxqaG5L1SKFACY5xNgyPHFQoqIWwaWLHTXj8CCONhBt1UIH4ciH/pxKU3vkIIA9XaL6113r2B6k6FNFkWzSyvjN1gVXUd4+1OgWiAUqNaOXO1Tn5hM88yI0odtKG/L4NMthLvr7cCx7RbHF2D/zpKglx4i6db7rQkXeujHN9rs/ZM0hov8p752Nvpn1R626i3NRyKX09bEb8DDcUC+7md2223ZG01bR0AqJXQb48++/27xmbDbt5l0/TJQcHQx63YWgXxiZZy8npwdsLbh2YwhGQfZMerRI8NMhusMpuxY5rU6KdTelfJF2xSrF2q0KXlF/4xovJAB4nBAVTQdaGLBgRjiPpuydZ4boZytZ5HNwQxdqZUlNaOhL6ncFCKH8oG834qFxhGTFMwtGUZ/m6MEqgD93vfxRnwT+SowtK8PQXlCjr2hMIQrKoWvcb14xnmWitpy6vLnUECtMO0iWmDfzIePHG8YdeysOPjirBDpBYqnWYllxDNYMI42l4YcKp6sz1B91anFTY+BrxHXgkgSSnanDxOR6yZEuT6YYpFCiMRNUudmA+T1cjnRv2uvSrQ3Rx0FEQ4maHv0GBRqhASnwNmLJKQcKdE1+MD7hAwWrME8NOHr78LR/hj6a5k+f2YAV8hZQo7dwxmueSRoM8213/EkatN2ureQVoIuiwnQGjjg68yCqquGY4MGN8IJ9vNpnJS0E/O60oQ4Vx8E7ATWSeKNFsFW7eZ1cJffFCc6ppel2k+2wzuZrepa4gfHM176N8UYUdpkSYfEBFgH6Nq5Yt45ZM1o8iHEkHXdXEJoanTrNbM6gbhyGPxqq+Ns+89q9zyGM4xFi9gPY3y6UMc7akODk1E0W42gaIT6d0t6P+qgOc7j/4luuv5jSODZUHa+USwGdStwGt0V7ePY1vB7jeDh3W65hno2XguIneQ06cRQe2t4NNMumTJEik9lL9iulTPRtvdMWkD90R8itLWDvtF1a+Qya3nf71uTCZSCzcxW7A9xXlHH96FYL7U6z219yY33pes8bzTEKC01xjiH3TRxt3gZwZD7UP6RiskExGHTjyL1sKyDoFJL7DnOzfeydbnZxW3x13FKyRTr/g8zWK1WhWT44nBy/OJ0cns3cloHOhrT7VOJ2g9AiWSDT9MowNbxEYiNTjyLJnecUVL2KHGd0+z1UpgqAZ31cfolj0aiLuebmjrvljRV9gN9knZyenRygsaaffcTIVFaP9qTUJfvjjZMPmYqjkUzxjn1Tb8/ke/GeHX63JZ60hd8uD2wte671J+h+s8NGWwSBr+8LnWekXsFS9NTzpCE2yEbXYzhht5Hy27GfM6voU5jf5m2/99xA1mzFq6XIh7dm3Qe0LRB3cSHRDGiRCXQk4fwGCcZ/fTtt7+6AcrfhR7Tyl6j5KdWxzoe2Xvgm+sZ/IKBPZjvfe93sxhrfifUXpfN2AOx9mmnfDMC29wWPgm0m7QGYNOki5ylatcattfw2bOu0hcVU7CxzdjBz4/boyIHR5wfxUKPwQpfUUSIW288S43O/A4S10fmQ3rs6ZFauBU5lnfu5vvccNmXjW+678KfQxbhmFNUSnT0qODpN6lw2i+7uQ8YTf+77bxxtfYX+LwAAAP//AwCIX4I/wB4AAA==", "encoding": "UTF-8"}, "headers": {"Strict-Transport-Security": ["max-age=31536000; includeSubDomains; preload"], "Content-Security-Policy": 
["upgrade-insecure-requests"], "Content-Encoding": ["gzip"], "Transfer-Encoding": ["chunked"], "Set-Cookie": ["Psid=fihzm6CLPg4PUiC3pyz3FQ7V9K4BagC5agCRORcBF608SgzqaR48aRLgyPt; Expires=Thu, 30-Nov-2017 21:06:26 GMT; Path=/; Secure", "CTOpts=Qihzm6CLPg4PUiC3pyUgzw-R98LyNR43Yicj; Expires=Wed, 30-Nov-2016 21:26:26 GMT; Path=/; Secure"], "Expires": ["Wed, 30 Nov 2016 21:06:27 GMT"], "Vary": ["Accept-Encoding"], "Keep-Alive": ["timeout=5, max=30"], "Connection": ["Keep-Alive"], "Date": ["Wed, 30 Nov 2016 21:06:26 GMT"], "Referrer-Policy": ["origin-when-cross-origin"], "Content-Type": ["text/xml;charset=UTF-8"]}, "status": {"message": "OK", "code": 200}, "url": "https://clinicaltrials.gov/show/NCT02931214?displayxml=true"}, "recorded_at": "2016-11-30T21:06:26"}], "recorded_with": "betamax/0.8.0"} --------------------------------------------------------------------------------