├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── enhancement_proposal.md │ └── question.md └── pull_request_template.md ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── __attic__ ├── docs │ ├── README.md │ ├── design │ │ ├── 00_Header.jpg │ │ ├── 1.1_Colour_.jpg │ │ ├── 1.2_Typography.jpg │ │ ├── 1.3_Header.jpg │ │ ├── 1.4_Footer.jpg │ │ ├── 1.5_Breadcrumbs_2_levels_.jpg │ │ ├── 1.5_Breadcrumbs_3_levels_.jpg │ │ ├── 1.6_CTA_Links_.jpg │ │ ├── 1.7_Search.jpg │ │ ├── 1.8_Tables.jpg │ │ ├── design-system.md │ │ ├── mockups.md │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556632095254_00_Header2x.jpg │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556633077597_1.2_Typography.jpg │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556639329062_1.1_Colour+palletex2.jpg │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641211609_1.7_Search.jpg │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641251273_1.5_Breadcrumbs_3+levels.jpg │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641260288_1.5_Breadcrumbs_2+levels.jpg │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641267412_1.4_Footer.jpg │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641277300_1.3_Header.jpg │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556642898755_1.6_Buttons.jpg │ │ └── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556805892800_1.8_Tables.jpg │ ├── problems.md │ └── s3-layout.md └── pipeline │ └── reach-evaluator │ ├── Dockerfile │ ├── evaluator_task.py │ └── requirements.txt ├── argo ├── 00-namespace.yaml ├── README.md ├── argo.yaml ├── elasticsearch.yaml ├── postgres.yaml ├── psqlinit.yaml ├── reach-msf.yaml ├── reach-populate-pg.yaml └── secrets │ ├── minikube │ 
└── argo │ │ └── aws │ │ └── .gitkeep │ └── sync_secrets.py ├── base ├── Dockerfile ├── elastic │ ├── __init__.py │ ├── common.py │ ├── count.py │ ├── epmc_metadata.py │ ├── fulltext_docs.py │ ├── fuzzy_matched_citations.py │ ├── import_refs_from_s3.py │ └── import_sections_from_s3.py ├── hooks │ ├── s3hook.py │ └── sentry.py ├── requirements.txt ├── safe_import.py └── tests │ ├── common.py │ ├── mock_sites │ ├── gov │ │ ├── 1.html │ │ ├── 2.html │ │ └── 3.html │ ├── msf │ │ └── 1.html │ ├── nice │ │ ├── 1.html │ │ └── 2.html │ ├── parliament │ │ ├── 1.html │ │ └── 2.html │ ├── unicef │ │ ├── 1.html │ │ └── 2.html │ └── who │ │ ├── 1.html │ │ └── 2.html │ ├── pdfs │ ├── test_pdf.pdf │ ├── test_pdf_multipage.pdf │ └── test_pdf_page_number.pdf │ └── xml │ └── test_xml.xml ├── buildspec.yml ├── docker-compose.yaml ├── docs ├── antora.yml └── modules │ └── ROOT │ ├── nav.adoc │ └── pages │ └── index.adoc ├── export_wellcome_env.py ├── pipeline ├── reach-es-extractor │ ├── Dockerfile │ ├── Dockerfile.test │ ├── extract_refs_task.py │ ├── refparse │ │ ├── README.md │ │ ├── __init__.py │ │ ├── algo_evaluation │ │ │ ├── compare_found_sections.py │ │ │ ├── data_evaluate │ │ │ │ └── .gitkeep │ │ │ ├── evaluate_find_section.py │ │ │ ├── evaluate_match_references.py │ │ │ ├── evaluate_parse.py │ │ │ ├── evaluate_settings.py │ │ │ ├── evaluate_split_section.py │ │ │ ├── evaluation.md │ │ │ ├── exploratory │ │ │ │ ├── investigate_match_thresholds.py │ │ │ │ ├── negative_cosines_hist_2019-07-01-1211.png │ │ │ │ ├── negative_cosines_len_scatter_2019-07-01-1211.png │ │ │ │ ├── thresholds_F1Score_negative_heatmap_2019-07-01-1211.png │ │ │ │ ├── thresholds_Precision_negative_heatmap_2019-07-01-1211.png │ │ │ │ ├── thresholds_Recall_negative_heatmap_2019-07-01-1211.png │ │ │ │ └── title_lengths_2019-07-01-1211.png │ │ │ └── results │ │ │ │ └── .gitkeep │ │ ├── evaluate_algo.py │ │ ├── merge_results.py │ │ ├── parse_latest.py │ │ ├── reference_parser_models │ │ │ └── 
reference_parser_pipeline.pkl │ │ ├── refparse.py │ │ ├── settings.py │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── test_config_multitask.ini │ │ │ ├── test_exact_match.py │ │ │ ├── test_fuzzy_match.py │ │ │ └── test_split_parse.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── exact_match.py │ │ │ ├── file_manager.py │ │ │ ├── fuzzy_match.py │ │ │ ├── parse.py │ │ │ ├── s3.py │ │ │ └── serialiser.py │ └── requirements.txt ├── reach-es-indexer │ ├── Dockerfile │ ├── index_task.py │ └── requirements.txt ├── reach-fuzzy-matcher │ ├── Dockerfile │ ├── fuzzymatcher_task.py │ └── requirements.txt ├── reach-parser │ ├── Dockerfile │ ├── Dockerfile.test │ ├── __init__.py │ ├── normalizer │ │ ├── __init__.py │ │ └── title_normalizer.py │ ├── parser_task.py │ ├── pdf_parser │ │ ├── __init__.py │ │ ├── main.py │ │ ├── objects │ │ │ ├── PdfObjects.py │ │ │ └── __init__.py │ │ ├── pdf_parse.py │ │ ├── resources │ │ │ ├── keywords.txt │ │ │ └── section_keywords.txt │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── test_pdf_objects.py │ │ │ └── test_pdf_parser_tools.py │ │ └── tools │ │ │ ├── __init__.py │ │ │ ├── dbTools.py │ │ │ └── extraction.py │ └── requirements.txt └── reach-scraper │ ├── Dockerfile │ ├── Dockerfile.test │ ├── README.md │ ├── __init__.py │ ├── docker-compose.yaml │ ├── pg_exists.py │ ├── pg_isready.py │ ├── requirements.txt │ ├── scrapy.cfg │ ├── spider_task.py │ └── wsf_scraping │ ├── __init__.py │ ├── contracts.py │ ├── feed_storage.py │ ├── filter.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ ├── spiders │ ├── __init__.py │ ├── acme_spider.py │ ├── base_spider.py │ ├── gov_spider.py │ ├── msf_spider.py │ ├── nice_spider.py │ ├── parliament_spider.py │ ├── unicef_spider.py │ └── who_iris_spider.py │ └── tests │ ├── __init__.py │ ├── test_gov_spider.py │ ├── test_msf_spider.py │ ├── test_nice_spider.py │ ├── test_parliament_spider.py │ ├── test_scraper_spiders.py │ ├── test_unicef_spider.py │ └── test_who_spider.py ├── 
requirements.txt ├── test_target ├── README.md ├── inner_page.html ├── page.html └── target_server.py └── web ├── .babelrc ├── .dockerignore ├── .eslintrc.json ├── Dockerfile ├── Makefile ├── bin └── update_vendor.sh ├── config ├── docker.config.toml └── local.config.toml ├── package-lock.json ├── package.json ├── requirements.txt ├── setup.py └── web ├── __init__.py ├── api.py ├── config.py ├── db.py ├── docs ├── .gitignore ├── Makefile ├── README.md ├── build │ ├── doctrees │ │ ├── api.doctree │ │ ├── environment.pickle │ │ ├── index.doctree │ │ └── intro.doctree │ └── html │ │ ├── .buildinfo │ │ ├── _static │ │ ├── basic.css │ │ ├── css │ │ │ ├── badge_only.css │ │ │ └── theme.css │ │ ├── doctools.js │ │ ├── documentation_options.js │ │ ├── file.png │ │ ├── fonts │ │ │ ├── Inconsolata-Bold.ttf │ │ │ ├── Inconsolata-Regular.ttf │ │ │ ├── Inconsolata.ttf │ │ │ ├── Lato-Bold.ttf │ │ │ ├── Lato-Regular.ttf │ │ │ ├── Lato │ │ │ │ ├── lato-bold.eot │ │ │ │ ├── lato-bold.ttf │ │ │ │ ├── lato-bold.woff │ │ │ │ ├── lato-bold.woff2 │ │ │ │ ├── lato-bolditalic.eot │ │ │ │ ├── lato-bolditalic.ttf │ │ │ │ ├── lato-bolditalic.woff │ │ │ │ ├── lato-bolditalic.woff2 │ │ │ │ ├── lato-italic.eot │ │ │ │ ├── lato-italic.ttf │ │ │ │ ├── lato-italic.woff │ │ │ │ ├── lato-italic.woff2 │ │ │ │ ├── lato-regular.eot │ │ │ │ ├── lato-regular.ttf │ │ │ │ ├── lato-regular.woff │ │ │ │ └── lato-regular.woff2 │ │ │ ├── RobotoSlab-Bold.ttf │ │ │ ├── RobotoSlab-Regular.ttf │ │ │ ├── RobotoSlab │ │ │ │ ├── roboto-slab-v7-bold.eot │ │ │ │ ├── roboto-slab-v7-bold.ttf │ │ │ │ ├── roboto-slab-v7-bold.woff │ │ │ │ ├── roboto-slab-v7-bold.woff2 │ │ │ │ ├── roboto-slab-v7-regular.eot │ │ │ │ ├── roboto-slab-v7-regular.ttf │ │ │ │ ├── roboto-slab-v7-regular.woff │ │ │ │ └── roboto-slab-v7-regular.woff2 │ │ │ ├── fontawesome-webfont.eot │ │ │ ├── fontawesome-webfont.svg │ │ │ ├── fontawesome-webfont.ttf │ │ │ ├── fontawesome-webfont.woff │ │ │ └── fontawesome-webfont.woff2 │ │ ├── jquery-3.5.1.js │ │ 
├── jquery.js │ │ ├── js │ │ │ ├── modernizr.min.js │ │ │ └── theme.js │ │ ├── language_data.js │ │ ├── minus.png │ │ ├── plus.png │ │ ├── pygments.css │ │ ├── searchtools.js │ │ ├── underscore-1.3.1.js │ │ └── underscore.js │ │ ├── api.html │ │ ├── genindex.html │ │ ├── index.html │ │ ├── intro.html │ │ ├── objects.inv │ │ ├── search.html │ │ └── searchindex.js ├── make.bat ├── requirements.txt └── source │ ├── api.md │ ├── conf.py │ ├── index.rst │ └── intro.md ├── src ├── css │ ├── about.less │ ├── contact.less │ ├── footer.less │ ├── header.less │ ├── home.less │ ├── icons.less │ ├── results.less │ ├── search.less │ ├── style.less │ ├── variables.less │ └── wellcome-bold-webfont.woff2 ├── favicon │ ├── android-icon-144x144.png │ ├── android-icon-192x192.png │ ├── android-icon-36x36.png │ ├── android-icon-48x48.png │ ├── android-icon-72x72.png │ ├── android-icon-96x96.png │ ├── apple-icon-114x114.png │ ├── apple-icon-120x120.png │ ├── apple-icon-144x144.png │ ├── apple-icon-152x152.png │ ├── apple-icon-180x180.png │ ├── apple-icon-57x57.png │ ├── apple-icon-60x60.png │ ├── apple-icon-72x72.png │ ├── apple-icon-76x76.png │ ├── apple-icon-precomposed.png │ ├── apple-icon.png │ ├── browserconfig.xml │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ ├── favicon-96x96.png │ ├── favicon.ico │ ├── manifest.json │ ├── ms-icon-144x144.png │ ├── ms-icon-150x150.png │ ├── ms-icon-310x310.png │ └── ms-icon-70x70.png ├── images │ ├── Icon_ New-window.svg │ ├── Icon_About_Accuracy_100px.svg │ ├── Icon_About_Open-source_100px.svg │ ├── Icon_About_Transparent_100px.svg │ ├── Icon_Arrow_down.svg │ ├── Icon_Chevron_Double.svg │ ├── Icon_Chevron_Down.svg │ ├── Icon_Download_16px.svg │ ├── Icon_How_Download_160px.svg │ ├── Icon_How_Extract_160px.svg │ ├── Icon_How_Match_160px.svg │ ├── Icon_Info.svg │ ├── Icon_Menu_16px.svg │ ├── Icon_Policy_24px.svg │ ├── Icon_Research_24px.svg │ ├── Icon_Scroll-arow.svg │ ├── Icon_Search_16px.svg │ ├── Icon_Sort-by_16px.svg │ ├── 
Icon_new_window.svg │ ├── Illustration_Glass.svg │ ├── Illustration_Papers.svg │ ├── Image_Product-shot.png │ ├── Shape_01.svg │ ├── Shape_02.svg │ ├── Wellcome_logo.svg │ ├── reach_alpha_branding.svg │ ├── reach_site_view.png │ ├── wave.svg │ ├── wellcome-logo.svg │ └── white-wave.svg ├── js │ ├── app.js │ ├── citationsTable.js │ ├── clearSearch.js │ ├── home.js │ ├── policyTable.js │ ├── resultsCommon.js │ ├── templates │ │ └── no_results.js │ └── v.contact.js ├── vendor │ └── spectre-0.5.8 │ │ ├── spectre-exp.css │ │ ├── spectre-exp.min.css │ │ ├── spectre-icons.css │ │ ├── spectre-icons.min.css │ │ ├── spectre.css │ │ └── spectre.min.css └── w-avatar-pitch-1.svg ├── templates ├── about.html ├── base.html ├── contact.html ├── how-it-works.html ├── index.html ├── privacy.html ├── results │ ├── citations.html │ └── policy-docs.html └── search │ ├── citations.html │ └── policy-docs.html ├── tests ├── test_search_api.py └── test_template.py ├── utils.py ├── views ├── __init__.py ├── api │ ├── __init__.py │ ├── api_search_citations.py │ ├── api_search_policies.py │ └── utils.py ├── apidocs.py ├── contact.py ├── opt_search.py ├── robotstxt.py ├── search │ ├── __init__.py │ ├── citations.py │ ├── export_citations.py │ ├── export_policies.py │ └── policies.py ├── search_exports.py └── template.py └── wsgi.py /.dockerignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | **/__pycache__/* 3 | .git 4 | .gitignore 5 | .idea 6 | .pytest_cache 7 | 8 | Dockerfile 9 | 10 | __attic__ 11 | 12 | argo 13 | 14 | web/build/web/static/* 15 | 16 | reach/refparse/algo_evaluation/data_evaluate/* 17 | 18 | **/env 19 | **/venv 20 | **/docs 21 | pull_request_template.md 22 | CONTRIBUTING.md 23 | 24 | web/node_modules/* 25 | web/package.json 26 | web/package-lock.json 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: 
-------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: 'bug' 6 | assignees: '' 7 | --- 8 | Checklist: 9 | 10 | * [ ] I've included the version 11 | * [ ] I've included reproduction steps 12 | * [ ] I've included any config 13 | * [ ] I've included the logs 14 | 15 | 16 | ## What Happened 17 | 18 | ## What you expected to happen 19 | 20 | ## How to reproduce it (as minimally and precisely as possible) 21 | 22 | ## Anything else we should know 23 | 24 | ## Environment 25 | 26 | * [ ] Production 27 | * [ ] Staging 28 | * [ ] Local 29 | 30 | ## Error Message / Logs 31 | 32 | --- 33 | 34 | ## Message from Maintainers 35 | 36 | If you are impacted by this bug please add a :thumbsup: reaction to this issue! 37 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement_proposal.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Enhancement proposal 3 | about: Propose an enhancement for this project 4 | title: '' 5 | labels: 'enhancement' 6 | assignees: '' 7 | --- 8 | # Description of feature/ functionality 9 | 10 | (Be sure to include the reasoning if this is not part of a bigger project. Link to spec, notion page, Zeplin etc) 11 | 12 | ## Risks & dependencies 13 | 14 | (Consider customer facing, internal and deployment) 15 | 16 | ## Acceptance Criteria 17 | (What needs to happen for this ticket to be closed?) 18 | 19 | ## Estimation of dev task size 20 | 21 | - [ ] Small 22 | - [ ] Medium 23 | - [ ] Large 24 | 25 | ## Who needs to test this? 
26 | 27 | - [ ] Dev 28 | - [ ] UI 29 | - [ ] UX 30 | - [ ] Data science 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask a question regarding this project 4 | title: '' 5 | labels: 'question' 6 | assignees: '' 7 | --- 8 | 9 | 10 | # Summary 11 | 12 | What do you want to know about this project? 13 | 14 | # Motivation 15 | 16 | Why do you need to know this, any examples or use cases you could include? 17 | 18 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Please include a summary of the changes this PR introduces for the codebase. 4 | Also specify if some sections need special attention, and why you want to introduce this change. 5 | 6 | Make sure to split changes across multiple pull requests, as we won't review bundled pull requests. 7 | 8 | Finally, make sure your PR follows our code of conduct before posting (Check our [contributing guidelines](CONTRIBUTING.md) if you're not sure). 9 | 10 | ## Type of change 11 | 12 | Please delete options that are not relevant. 13 | 14 | - [ ] :bug: Bug fix (Add `Fix #(issue)` to your PR) 15 | - [ ] :sparkles: New feature 16 | - [ ] :fire: Breaking change 17 | - [ ] :memo: Documentation update 18 | 19 | # How Has This Been Tested? 20 | 21 | Please describe the tests that you ran to verify your changes. Provide instructions so we can run the tests. 
Please also list any relevant details for your test configuration: 22 | 23 | # Checklist: 24 | 25 | - [ ] My code follows the style guidelines of this project (pep8 AND pyflakes) 26 | - [ ] I have commented my code, particularly in hard-to-understand areas 27 | - [ ] If needed, I changed related parts of the documentation 28 | - [ ] I included tests in my PR 29 | - [ ] New and existing unit tests pass locally with my changes 30 | - [ ] Any dependent changes have been merged and published in downstream modules 31 | - [ ] If my PR aims to fix an issue, I referenced it using `#(issue)` 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *__pycache__ 3 | *.DS_Store 4 | .idea 5 | 6 | *.egg-info 7 | **/.cache/* 8 | 9 | *.csv 10 | *.pdf 11 | *.txt 12 | 13 | !package.json 14 | !base/tests/pdfs/* 15 | !keywords.txt 16 | !section_keywords.txt 17 | !**/requirements.* 18 | 19 | **/node_modules 20 | **/epmc-metadata.json.gz 21 | *env 22 | 23 | argo/secrets/minikube/argo/aws/* 24 | !argo/secrets/minikube/argo/aws/.gitkeep 25 | 26 | web/build/web/static/* 27 | !web/build/web/static/.gitkeep 28 | !web/docs/* 29 | venv 30 | .env 31 | package.lock 32 | web/config/dev.config.toml 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Wellcome Trust 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the 
following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /__attic__/docs/README.md: -------------------------------------------------------------------------------- 1 | # Reach documentation 2 | 3 | Directories: 4 | 5 | - [design](./design) 6 | -------------------------------------------------------------------------------- /__attic__/docs/design/00_Header.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/00_Header.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/1.1_Colour_.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.1_Colour_.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/1.2_Typography.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.2_Typography.jpg 
-------------------------------------------------------------------------------- /__attic__/docs/design/1.3_Header.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.3_Header.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/1.4_Footer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.4_Footer.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/1.5_Breadcrumbs_2_levels_.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.5_Breadcrumbs_2_levels_.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/1.5_Breadcrumbs_3_levels_.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.5_Breadcrumbs_3_levels_.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/1.6_CTA_Links_.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.6_CTA_Links_.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/1.7_Search.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.7_Search.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/1.8_Tables.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.8_Tables.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/design-system.md: -------------------------------------------------------------------------------- 1 | # Reach – Design System 2 | Created by Data Labs at Wellcome Trust 3 | 4 | 5 | ![](00_Header.jpg) 6 | 7 | 8 | 9 | 10 | # 1.1. Intro 11 | ---------- 12 | 13 | The present document provides a high level overview of the Reach brand and style. [Spectre.CSS](https://picturepan2.github.io/spectre/index.html) is used as the CSS framework, but the UI components should be adapted to provide a unique user experience to the website. 14 | 15 | 16 | 17 | # 1.2. Colour Palette 18 | ---------- 19 | 20 | The web colour palette is taken largely from the Wellcome brand book ([Data Viz section](https://company-57536.frontify.com/d/gFEfjydViLRJ/wellcome-brand-book#/visuals/dataviz-elements-and-rationale)) with certain additions to accommodate the web environment. 21 | 22 | 23 | ![](1.1_Colour_.jpg) 24 | 25 | 26 | 27 | 28 | # 1.3. Typography 29 | ---------- 30 | 31 | Wellcome's brand fonts (Wellcome & Helvetica Neue) are used alternatively throughout the website with no exceptions. 32 | 33 | 34 | ![](1.2_Typography.jpg) 35 | 36 | 37 | 38 | 39 | # 1.4. Grid System 40 | ---------- 41 | 42 | The grid system follows [Spectre CSS framework](https://picturepan2.github.io/spectre/layout/responsive.html). 
43 | 44 | 45 | | XS | 320 – 480px | padding left/right 11px | 46 | | --- | ------------ | ------------------------------------------------------ | 47 | | SM | 481 – 600px | TBC | 48 | | MD | 601 – 840px | TBC | 49 | | LG | 961 – 1280px | TBC | 50 | | XL | 961 – 1280px | 12 columns : gutter 22px : padding left/right 11px | 51 | | XXL | > 1280px | 12 columns : gutter 22px | 52 | 53 | 54 | 55 | 56 | # 1.5. UI Components 57 | 58 | 59 | ## 1.5.1. Buttons 60 | ---------- 61 | 62 | 63 | ![](1.6_CTA_Links_.jpg) 64 | 65 | 66 | 67 | 68 | ## 1.5.2. Header 69 | ---------- 70 | ![](1.3_Header.jpg) 71 | 72 | 73 | 74 | ## 1.5.3. Footer 75 | ---------- 76 | ![](1.4_Footer.jpg) 77 | 78 | 79 | 80 | ## 1.5.6. Breadcrumbs > 2 levels 81 | ---------- 82 | 83 | In the breadcrumb trail, the breadcrumb corresponding to the current page **should not be a link**. 84 | 85 | 86 | ![](1.5_Breadcrumbs_2_levels_.jpg) 87 | 88 | 89 | 90 | 91 | ## 1.5.7. Breadcrumbs > 3 levels (Search results pages) 92 | ---------- 93 | 94 | The **search term is excluded from breadcrumbs** (to avoid long pages on mobile) 95 | 96 | 97 | ![](1.5_Breadcrumbs_3_levels_.jpg) 98 | 99 | 100 | 101 | 102 | ## 1.5.8. Search 103 | ---------- 104 | 105 | 106 | ![](1.7_Search.jpg) 107 | 108 | 109 | 110 | ## 1.5.9. Tables 111 | ---------- 112 | 113 | 114 | ![](1.8_Tables.jpg) 115 | 116 | 117 | -------------------------------------------------------------------------------- /__attic__/docs/design/mockups.md: -------------------------------------------------------------------------------- 1 | # Internal assets 2 | 3 | We've put what we can into the public repo. However, some things from 4 | design are elsewhere, since it was easier for design to use tools like 5 | Google Drive & Invision. 6 | 7 | All raw design assets are kept in the [Data Labs Team Drive under "User 8 | Experience/UX & 9 | UI](https://drive.google.com/drive/u/0/folders/1kN5-MbDUGK1YdSw430T_mDhdQdPzGCms). 10 | Most are in Sketch format. 
11 | 12 | UI devs working at Wellcome should also be to view these files (at least 13 | as of time of upload) in 14 | [invision](https://projects.invisionapp.com/d/main?origin=v7#/projects/prototypes/17303255). 15 | 16 | -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556632095254_00_Header2x.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556632095254_00_Header2x.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556633077597_1.2_Typography.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556633077597_1.2_Typography.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556639329062_1.1_Colour+palletex2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556639329062_1.1_Colour+palletex2.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641211609_1.7_Search.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641211609_1.7_Search.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641251273_1.5_Breadcrumbs_3+levels.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641251273_1.5_Breadcrumbs_3+levels.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641260288_1.5_Breadcrumbs_2+levels.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641260288_1.5_Breadcrumbs_2+levels.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641267412_1.4_Footer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641267412_1.4_Footer.jpg -------------------------------------------------------------------------------- 
/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641277300_1.3_Header.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641277300_1.3_Header.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556642898755_1.6_Buttons.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556642898755_1.6_Buttons.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556805892800_1.8_Tables.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556805892800_1.8_Tables.jpg -------------------------------------------------------------------------------- /__attic__/docs/problems.md: -------------------------------------------------------------------------------- 1 | # Reach Output Problems 2 | 3 | The 10 most cited Wellcome Trust publications as predicted by the Reach tool were looked at. In total these 10 publications were cited 154 times in 62 different policy documents. We looked into these citations and found issues with 103 of them. The issues fell into 6 categories and are given in the table below. 
4 | 5 | | GitHub Issue Number | Problem | Example | Solution | Proportion of problematic citations | 6 | | --- | --- | --- | --- | --- | 7 | | [#180](https://github.com/wellcometrust/reach/issues/180) | Text found in wrongly identified references section | A table with the row name "Treatment of severe malaria", was identified as a reference title since this table was at the end of the references section and got included in the scraped. | Improve extracting section | 64/103 | 8 | | [#181](https://github.com/wellcometrust/reach/issues/181) | Text found not in a reference during the exact text search | "attention deficit hyperactivity disorder" was found in the text of several documents and identified as a match to a paper with the same name. | Length threshold to exact matcher | 20/103 | 9 | | [#182](https://github.com/wellcometrust/reach/issues/182) | Reference repeated in the policy document | A citation for "Disability-adjusted life years (DALYs) for 291 diseases and injuries in 21 regions, 1990-2010: a systematic analysis for the Global Burden of Disease Study 2010" came up in two references sections of a policy document | Deduplicate repeats or decide to keep them in | 8/103 | 10 | | [#183](https://github.com/wellcometrust/reach/issues/183) | Duplicate reference found even though no duplicate found in policy document | A citation for "Trends in adult body-mass index in 200 countries from 1975 to 2014: a pooled analysis of 1698 population-based measurement studies with 1377-1396" only occurred once in a policy document, but the Reach output said it came up twice. | ? | 4/103 | 11 | | - | False positive - parsed reference matched to a similar but different reference | The Reach tool identified a publication entitled "Attention deficit hyperactivity disorder" from 2006, however in the policy document this reference was to a similarly titled paper from 1998. 
| Increase text similarity and length thresholds | 4/103 | 12 | | [#180](https://github.com/wellcometrust/reach/issues/180) | Text found in a reference during the exact text search | A citation for "Attention deficit hyperactivity disorder" was in the references section of a policy document, however it was only found in the exact text search and not the fuzzy match search. | Improve extracting section | 3/103 | 13 | -------------------------------------------------------------------------------- /__attic__/pipeline/reach-evaluator/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM reach.base 2 | 3 | WORKDIR /opt/reach 4 | 5 | COPY ./requirements.txt /opt/reach/requirements.evaluator.txt 6 | 7 | RUN pip install -U pip && \ 8 | python3 -m pip install -r /opt/reach/requirements.evaluator.txt 9 | 10 | 11 | COPY ./evaluator_task.py /opt/reach/evaluator_task.py 12 | 13 | # Give execution rights to the entrypoint Python script 14 | RUN chmod +x /opt/reach/evaluator_task.py 15 | 16 | ENTRYPOINT ["/opt/reach/evaluator_task.py"] 17 | -------------------------------------------------------------------------------- /__attic__/pipeline/reach-evaluator/requirements.txt: -------------------------------------------------------------------------------- 1 | https://datalabs-public.s3.eu-west-2.amazonaws.com/reach_evaluator/reach_evaluator-2020.1.1-py3-none-any.whl -------------------------------------------------------------------------------- /argo/00-namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: argo 5 | -------------------------------------------------------------------------------- /argo/README.md: -------------------------------------------------------------------------------- 1 | # Argo & Reach 2 | Reach's pipeline is deployed in production using Argo. 3 | These files are for local runs and development. 
4 | 5 | ## How to run Reach's workflows 6 | To run this pipeline locally, you'll need: 7 | 8 | - Docker 9 | - Minikube 10 | - Python >= 3.6 11 | - The Argo cli (recommended but optionnal) 12 | 13 | If it is the first time you use minikube with your AWS account, please configure your ECR credentials: 14 | ``` 15 | minikube addons enable registry-creds 16 | minikube addons configure registry-creds 17 | ``` 18 | 19 | To build the required images, go to the root folder and run the following: 20 | ``` 21 | make docker-build 22 | ``` 23 | 24 | To install Argo to your selected cluster (this will install Argo to a namespace `argo`, so make sure it's available before running these commands or change it beforehand): 25 | ``` 26 | kubectl apply -f argo/00-namespace.yaml 27 | kubectl apply -f argo/argo.yaml 28 | kubectl apply -f argo/elasticsearch.yaml 29 | kubectl apply -f argo/psqlinit.yaml 30 | kubectl apply -f argo/postgres.yamls 31 | ``` 32 | 33 | 34 | You can then run your workflows as follows: 35 | ``` 36 | # this is the example workflow for WHO IRIS 37 | argo submit -n argo argo/reach-who.yaml 38 | ``` 39 | 40 | ## Using this infrastructure with the web application 41 | Reach's web application only relies on Postgresql. 
To expose it and make it usable locally (or within the `docker-compose` local deployment), run: 42 | ``` 43 | kubectl port-forward -n argo postgres-0 5432:5432 44 | ``` 45 | -------------------------------------------------------------------------------- /argo/postgres.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: argo-postgres-volume 5 | namespace: argo 6 | labels: 7 | type: local 8 | spec: 9 | storageClassName: manual 10 | capacity: 11 | storage: 5Gi 12 | accessModes: 13 | - ReadWriteOnce 14 | hostPath: 15 | path: "/mnt/data" 16 | --- 17 | apiVersion: v1 18 | kind: PersistentVolumeClaim 19 | metadata: 20 | name: argo-postgres-claim 21 | namespace: argo 22 | spec: 23 | storageClassName: manual 24 | accessModes: 25 | - ReadWriteOnce 26 | resources: 27 | requests: 28 | storage: 5Gi 29 | --- 30 | apiVersion: apps/v1 31 | kind: StatefulSet 32 | metadata: 33 | name: postgres 34 | namespace: argo 35 | annotations: 36 | kubernetes.io/change-cause: N/A 37 | spec: 38 | selector: 39 | matchLabels: 40 | app: postgres 41 | serviceName: "postgres" 42 | replicas: 1 43 | template: 44 | metadata: 45 | labels: 46 | app: postgres 47 | spec: 48 | containers: 49 | - name: postgresql 50 | image: postgres:12.2-alpine 51 | ports: 52 | - containerPort: 5432 53 | env: 54 | - name: POSTGRES_PASSWORD 55 | value: development 56 | - name: POSTGRES_DB 57 | value: warehouse 58 | volumeMounts: 59 | - name: argo-postgres-claim 60 | mountPath: /var/lib/postgresql/datalabs 61 | - name: psqlinit 62 | mountPath: /docker-entrypoint-initdb.d 63 | volumes: 64 | - name: argo-postgres-claim 65 | persistentVolumeClaim: 66 | claimName: argo-postgres-claim 67 | - name: psqlinit 68 | configMap: 69 | name: psqlinit 70 | --- 71 | apiVersion: v1 72 | kind: Service 73 | metadata: 74 | name: postgres 75 | namespace: argo 76 | labels: 77 | app: postgres 78 | spec: 79 | ports: 80 | - port: 5432 81 | 
targetPort: 5432 82 | protocol: TCP 83 | selector: 84 | app: postgres 85 | -------------------------------------------------------------------------------- /argo/psqlinit.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | create.sql: |2 4 | 5 | CREATE SCHEMA IF NOT EXISTS warehouse; 6 | 7 | CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; 8 | 9 | kind: ConfigMap 10 | metadata: 11 | creationTimestamp: "2020-05-15T15:42:31Z" 12 | managedFields: 13 | - apiVersion: v1 14 | fieldsType: FieldsV1 15 | fieldsV1: 16 | f:data: 17 | .: {} 18 | f:create.sql: {} 19 | manager: kubectl 20 | operation: Update 21 | time: "2020-05-15T15:42:31Z" 22 | name: psqlinit 23 | namespace: argo 24 | selfLink: /api/v1/namespaces/argo/configmaps/psqlinit 25 | -------------------------------------------------------------------------------- /argo/secrets/minikube/argo/aws/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/argo/secrets/minikube/argo/aws/.gitkeep -------------------------------------------------------------------------------- /base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use a basic Python image, but current Debian 2 | FROM python:3.6-slim-stretch 3 | 4 | # Build UTF8 locale to avoid encoding issues with Scrapy encoding 5 | # C.UTF-8 is the new en_US.UTF-8. 
6 | ENV LC_ALL=C.UTF-8 7 | ENV LANG=C.UTF-8 8 | ENV LANGUAGE=C.UTF-8 9 | 10 | WORKDIR /opt/reach 11 | 12 | COPY ./requirements.txt /opt/reach/requirements.txt 13 | 14 | # Poppler is needed to run pdftotext convertion 15 | RUN apt-get update -yqq && \ 16 | apt-get install -yqq --no-install-recommends \ 17 | build-essential \ 18 | libpoppler-cpp-dev \ 19 | poppler-utils \ 20 | locales && \ 21 | apt-get -q clean && \ 22 | locale-gen C.UTF-8 && \ 23 | pip install -U pip && \ 24 | python3 -m pip install -r /opt/reach/requirements.txt && \ 25 | apt-get remove --purge -y build-essential 26 | 27 | 28 | COPY ./safe_import.py /opt/reach/safe_import.py 29 | COPY ./hooks /opt/reach/hooks 30 | COPY ./elastic /opt/reach/elastic 31 | COPY ./tests /opt/reach/tests 32 | -------------------------------------------------------------------------------- /base/elastic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/base/elastic/__init__.py -------------------------------------------------------------------------------- /base/elastic/count.py: -------------------------------------------------------------------------------- 1 | """ 2 | Minimal CLI for counting records in ES. 3 | """ 4 | 5 | from . import common 6 | 7 | if __name__ == '__main__': 8 | parser = common.create_argument_parser(__doc__.strip()) 9 | parser.add_argument('index_name') 10 | args = parser.parse_args() 11 | es = common.es_from_args(args) 12 | print(common.count_es(es, args.index_name)) 13 | 14 | -------------------------------------------------------------------------------- /base/elastic/epmc_metadata.py: -------------------------------------------------------------------------------- 1 | """ 2 | Inserts EPMC metadata into Elasticsearch. 
3 | 4 | Sample URL for testing: 5 | 6 | s3://datalabs-staging/airflow/output/open-research/epmc-metadata/epmc-metadata.json.gz 7 | """ 8 | 9 | import json 10 | import logging 11 | import functools 12 | 13 | from . import common 14 | 15 | CHUNK_SIZE = 1000 # tuned for small(ish) size of pub metadata 16 | 17 | 18 | def to_es_action(es_index, line): 19 | d = json.loads(line) 20 | return { 21 | "_index": es_index, 22 | "doc": d, 23 | } 24 | 25 | 26 | def clean_es(es, es_index, organisation): 27 | """ Ensure an empty index exists. """ 28 | common.recreate_index(es, es_index) 29 | 30 | 31 | def insert_file(f, es, es_index, organisation, max_items=None): 32 | """ 33 | Inserts EPMC metadata from a json.gz file into Elasticsearch. 34 | 35 | Args: 36 | f: json.gz file object 37 | es: a living connection to elacticsearch 38 | max_items: maximum number of records to insert, or None 39 | """ 40 | logging.info( 41 | 'epmc_metadata.insert_file: f=%s es=%s max_items=%s', 42 | f, es, max_items) 43 | to_es_func = functools.partial(to_es_action, es_index) 44 | return common.insert_actions( 45 | es, 46 | common.yield_actions(f, to_es_func, max_items), 47 | CHUNK_SIZE, 48 | ) 49 | 50 | 51 | if __name__ == '__main__': 52 | def insert_func(f, es, max_items=None): 53 | return insert_file(f, es, 'policy-test-epmc-metadata', 54 | max_items=max_items) 55 | count = common.insert_from_argv( 56 | __doc__.strip(), clean_es, insert_file) 57 | logging.info('Imported %d pubs into ES', count) 58 | -------------------------------------------------------------------------------- /base/hooks/sentry.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | import os 3 | 4 | import sentry_sdk 5 | 6 | 7 | def init_sentry_sdk(sentry_dsn): 8 | kwargs = { 9 | 'integrations': [], # we'll add celery & flask eventually here 10 | 'default_integrations': True, 11 | } 12 | sentry_sdk.init(sentry_dsn) 13 | 14 | 15 | def report_exception(f): 16 | """ 
Minimal decorator for reporting exceptions that occur within a 17 | function. Does not support generators.""" 18 | @wraps(f) 19 | def wrapped_f(*args, **kwargs): 20 | try: 21 | return f(*args, **kwargs) 22 | except: 23 | sentry_sdk.capture_exception() 24 | raise 25 | 26 | return wrapped_f 27 | 28 | 29 | # SENTRY_DSN must be present at import time. If we don't have it then, 30 | # we won't have it later either. 31 | init_sentry_sdk(os.environ['SENTRY_DSN']) 32 | -------------------------------------------------------------------------------- /base/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | sentry-sdk 3 | elasticsearch 4 | -------------------------------------------------------------------------------- /base/safe_import.py: -------------------------------------------------------------------------------- 1 | """ 2 | Prevents multiple threads from trying to import at the same time and 3 | hitting an import lock. Implemented because airflow's web server 4 | regularly re-imports all DAGs and all tasks therein -- and 5 | unfortunately, our tasks import so many dependencies that reloading them 6 | takes enough time (by some random distribution) that the Airflow times 7 | out imports, resulting in an endless stream of sentry reports from 8 | within gunicorn. 9 | 10 | So, we've moved "slow" imports, especially those pulling in ML libraries 11 | such as scipy or even pandas, into the execute() method of our tasks. 12 | 13 | This is almost always something you should NEVER do, because imports can 14 | only be trusted not to lock if they're done from the main thread and on 15 | module load. (And an import lock in Python tends not to (or never?) 16 | resolve itself.) But, not much choice, at least for now. And, it turns 17 | out that in our execution model, the celery executor spawns subprocesses 18 | to run each task. So, we shouldn't ever have an issue. 19 | 20 | Hope isn't a strategy, though. 
So, here's a context manager to use, so 21 | that we'll know if we were going to hit an import lock. 22 | 23 | Sample usage:: 24 | 25 | @report_exception 26 | def execute(self): 27 | with safe_import: 28 | from reach.rainbowpony import pony_ai 29 | 30 | # do things with pony_ai here. 31 | 32 | """ 33 | 34 | from contextlib import contextmanager 35 | from threading import Lock 36 | 37 | # Not a re-entrant lock b/c we believe imports of this sort should 38 | # only happen once from the calling thread. 39 | SAFE_IMPORT_LOCK = Lock() 40 | 41 | @contextmanager 42 | def safe_import(): 43 | """ 44 | Context manager for ensuring that only one thread is importing 45 | at a time. If two threads enter this context, the second will fail 46 | with an exception so that we can't get caught in an import lock. 47 | """ 48 | acquired = SAFE_IMPORT_LOCK.acquire(blocking=False) 49 | try: 50 | if not acquired: 51 | # NB: we could, instead, just wait here. But the invariant 52 | # we're expecting is that, thanks to how the celery executor 53 | # works, only one call to execute() should happen at a time, 54 | # because only one thread should ever be running. 
55 | raise Exception('Multiple imports attempted at once!') 56 | yield 57 | finally: 58 | if acquired: 59 | SAFE_IMPORT_LOCK.release() 60 | -------------------------------------------------------------------------------- /base/tests/common.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | 4 | def get_path(p): 5 | return os.path.join( 6 | os.path.dirname(__file__), 7 | p 8 | ) 9 | 10 | TEST_PDF = get_path('pdfs/test_pdf.pdf') 11 | TEST_PDF_MULTIPAGE = get_path('pdfs/test_pdf_multipage.pdf') 12 | TEST_PDF_PAGE_NUMBER = get_path('pdfs/test_pdf_page_number.pdf') 13 | TEST_XML = get_path('xml/test_xml.xml') 14 | -------------------------------------------------------------------------------- /base/tests/mock_sites/parliament/1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Search results 5 | 6 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /base/tests/mock_sites/parliament/2.html: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | The Animal Feed (Amendment) (EU Exit) Regulations 2019 7 | 8 | 9 | PDF table of contents 10 | 11 | -------------------------------------------------------------------------------- /base/tests/mock_sites/unicef/1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Publications Archives - UNICEF DATA 5 | 6 | 7 |
8 |

9 | Including Everyone: Strengthening the collection and use of data about persons with disabilities in humanitarian situations 10 |

11 |
12 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /base/tests/mock_sites/unicef/2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | A Right to be Heard - Listening to children and young people on the move - UNICEF DATA 5 | 6 | 7 |

Download

8 | 9 | 10 | -------------------------------------------------------------------------------- /base/tests/mock_sites/who/1.html: -------------------------------------------------------------------------------- 1 | 2 | IRIS Home 3 | 4 | 22 | 23 | -------------------------------------------------------------------------------- /base/tests/pdfs/test_pdf.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/base/tests/pdfs/test_pdf.pdf -------------------------------------------------------------------------------- /base/tests/pdfs/test_pdf_multipage.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/base/tests/pdfs/test_pdf_multipage.pdf -------------------------------------------------------------------------------- /base/tests/pdfs/test_pdf_page_number.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/base/tests/pdfs/test_pdf_page_number.pdf -------------------------------------------------------------------------------- /base/tests/xml/test_xml.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Test Page 1 6 | All bold line. 7 | Partly bold line. 8 | All italic line. 9 | Partly italic line. 10 | 11 | 12 | 13 | 14 | TestPage 2 15 | All bold line 16 | Partly bold line. 17 | All italic line 18 | Partly italic line. 
19 | 20 | 21 | -------------------------------------------------------------------------------- /buildspec.yml: -------------------------------------------------------------------------------- 1 | version: 0.1 2 | 3 | phases: 4 | build: 5 | commands: 6 | - "echo resolved source version: $CODEBUILD_RESOLVED_SOURCE_VERSION" 7 | - "echo source version: $CODEBUILD_SOURCE_VERSION" 8 | - make docker-push-all 9 | - make push-web 10 | 11 | artifacts: 12 | files: [] 13 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | # This will handle the deployment of a local web application, postgresql 3 | # database and elasticsearch single-node cluster 4 | 5 | services: 6 | web: 7 | build: 8 | context: ./web 9 | dockerfile: Dockerfile 10 | image: uk.ac.wellcome/reach:latest 11 | ports: 12 | - 127.0.0.1:8081:8081 13 | environment: 14 | AWS_ACCESS_KEY_ID: "${AWS_ACCESS_KEY_ID}" 15 | AWS_SECRET_ACCESS_KEY: "${AWS_SECRET_ACCESS_KEY}" 16 | SENTRY_DSN: "${SENTRY_DSN}" 17 | STATIC_ROOT: /opt/reach/build/web/static 18 | DOCS_STATIC_ROOT: /opt/reach/web/docs/build/html/_static 19 | DB_HOST: "host.docker.internal" 20 | DB_PORT: 5432 21 | DB_NAME: "warehouse" 22 | DB_USER: "postgres" 23 | DB_PASSWORD: "development" 24 | 25 | command: 26 | - gunicorn 27 | - --bind=0.0.0.0:8081 28 | - --reload 29 | - web:application 30 | volumes: 31 | - ./web/web:/opt/reach/web/ 32 | deploy: 33 | resources: 34 | limits: 35 | memory: "64M" 36 | -------------------------------------------------------------------------------- /docs/antora.yml: -------------------------------------------------------------------------------- 1 | name: reach 2 | title: Reach 3 | version: "0.0.1" 4 | nav: 5 | - modules/ROOT/nav.adoc 6 | 7 | -------------------------------------------------------------------------------- /docs/modules/ROOT/nav.adoc: 
-------------------------------------------------------------------------------- 1 | * xref:index.adoc[] 2 | -------------------------------------------------------------------------------- /docs/modules/ROOT/pages/index.adoc: -------------------------------------------------------------------------------- 1 | = Reach 2 | -------------------------------------------------------------------------------- /pipeline/reach-es-extractor/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM reach.base 2 | 3 | WORKDIR /opt/reach 4 | 5 | COPY ./requirements.txt /opt/reach/requirements.extracter.txt 6 | 7 | RUN pip install -U pip && \ 8 | python3 -m pip install -r /opt/reach/requirements.extracter.txt 9 | 10 | 11 | COPY ./extract_refs_task.py /opt/reach/extract_refs_task.py 12 | COPY ./refparse /opt/reach/refparse 13 | 14 | # Give execution rights to the entrypoint Python script 15 | RUN chmod +x /opt/reach/extract_refs_task.py 16 | 17 | ENTRYPOINT ["/opt/reach/extract_refs_task.py"] 18 | -------------------------------------------------------------------------------- /pipeline/reach-es-extractor/Dockerfile.test: -------------------------------------------------------------------------------- 1 | FROM reach.base 2 | 3 | WORKDIR /opt/reach 4 | 5 | COPY ./requirements.txt /opt/reach/requirements.extracter.txt 6 | 7 | RUN pip install -U pip && \ 8 | python3 -m pip install -r /opt/reach/requirements.extracter.txt 9 | 10 | 11 | COPY ./extract_refs_task.py /opt/reach/extract_refs_task.py 12 | COPY ./refparse /opt/reach/refparse 13 | 14 | # Give execution rights to the entrypoint Python script 15 | RUN chmod +x /opt/reach/extract_refs_task.py 16 | -------------------------------------------------------------------------------- /pipeline/reach-es-extractor/refparse/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/pipeline/reach-es-extractor/refparse/__init__.py -------------------------------------------------------------------------------- /pipeline/reach-es-extractor/refparse/algo_evaluation/compare_found_sections.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | """ 3 | Utility for comparing predicted and actual reference sections. 4 | 5 | Takes the scarpe_data.csv produced by evaluate_algo.py, and produces an 6 | interactive dashboard through which actual and predicted references sections 7 | can be compared. 8 | 9 | Requires streamlit>=0.47.3 10 | 11 | pip3 install streamlit 12 | streamlit run compare_found_sections.py 13 | """ 14 | 15 | import numpy as np 16 | import pandas as pd 17 | import streamlit as st 18 | 19 | # Load scrape_date produced by evaluate_algo.py 20 | 21 | data = pd.read_csv("./scrape_data.csv") 22 | 23 | # Drop examples for which no comparison can be made 24 | 25 | data.dropna(subset=["Predicted text", "Actual text"], inplace=True) 26 | 27 | # Add sidebar 28 | 29 | st.sidebar.title("Reference section explorer") 30 | 31 | # Create selector for file hash in sidebar. 
32 | 33 | pdf_file = st.sidebar.selectbox("pdf file", data["File"].to_list()) 34 | 35 | lev = data.loc[data["File"] == pdf_file, ["lev_distance"]].iloc[0]["lev_distance"] 36 | comment = st.sidebar.text_area("Comment about the prediction") 37 | actual = data.loc[data["File"] == pdf_file, ["Actual text"]].iloc[0]["Actual text"] 38 | predicted = data.loc[data["File"] == pdf_file, ["Predicted text"]].iloc[0]["Predicted text"] 39 | 40 | # Produce a line which can easily be copied and pasted into a markdown table 41 | 42 | st.write("Copy the line below into a markdown table:") 43 | st.write(f"|{pdf_file}|{len(actual)}|{len(predicted)}|{np.round(lev, 2)}|{comment}|") 44 | 45 | st.table(data.loc[data["File"] == pdf_file, ["Actual text" ,"Predicted text"]]) 46 | -------------------------------------------------------------------------------- /pipeline/reach-es-extractor/refparse/algo_evaluation/data_evaluate/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/pipeline/reach-es-extractor/refparse/algo_evaluation/data_evaluate/.gitkeep -------------------------------------------------------------------------------- /pipeline/reach-es-extractor/refparse/algo_evaluation/evaluate_settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | from refparse.settings import BaseSettings 3 | 4 | class TestSettings(BaseSettings): 5 | 6 | FOLDER_PREFIX = os.path.join( 7 | os.path.dirname(__file__), 8 | "../algo_evaluation/data_evaluate" 9 | ) 10 | LOG_FILE_PREFIX = './algo_evaluation/results' 11 | 12 | # Variables for find section evaluation data 13 | LEVENSHTEIN_DIST_SCRAPER_THRESHOLD = 0.3 14 | SCRAPE_DATA_PDF_FOLDER_NAME = "pdfs" 15 | SCRAPE_DATA_REF_PDF_FOLDER_NAME = "pdf_sections" 16 | SCRAPE_DATA_PROVIDERS_FILE_NAME = "pdf_providers.csv" 17 | 18 | # Variables for split section evaluation data 19 | 
SPLIT_SECTION_SIMILARITY_THRESHOLD = 40 20 | NUM_REFS_FILE_NAME = "split_section_test_data.csv" 21 | NUM_REFS_TEXT_FOLDER_NAME = "scraped_references_sections" 22 | 23 | # Variables for parse evaluation data 24 | LEVENSHTEIN_DIST_PARSE_THRESHOLD = 0.3 25 | MODEL_FILE_TYPE = 'pickle' 26 | MODEL_FILE_PREFIX = './reference_parser_models/' 27 | MODEL_FILE_NAME = 'reference_parser_pipeline.pkl' 28 | PARSE_REFERENCE_FILE_NAME = "actual_reference_structures_sample.csv" 29 | 30 | # Variables for match evaluation data 31 | EVAL_PUB_DATA_FILE_NAME = "epmc-metadata.json" 32 | EVAL_MATCH_NUMBER = 100000 33 | EVAL_SAMPLE_MATCH_NUMBER = 1000 34 | LENGTH_THRESHOLD = 50 35 | MATCH_THRESHOLD = 0.8 36 | 37 | settings = TestSettings() 38 | -------------------------------------------------------------------------------- /pipeline/reach-es-extractor/refparse/algo_evaluation/exploratory/negative_cosines_hist_2019-07-01-1211.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/pipeline/reach-es-extractor/refparse/algo_evaluation/exploratory/negative_cosines_hist_2019-07-01-1211.png -------------------------------------------------------------------------------- /pipeline/reach-es-extractor/refparse/algo_evaluation/exploratory/negative_cosines_len_scatter_2019-07-01-1211.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/pipeline/reach-es-extractor/refparse/algo_evaluation/exploratory/negative_cosines_len_scatter_2019-07-01-1211.png -------------------------------------------------------------------------------- /pipeline/reach-es-extractor/refparse/algo_evaluation/exploratory/thresholds_F1Score_negative_heatmap_2019-07-01-1211.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/pipeline/reach-es-extractor/refparse/algo_evaluation/exploratory/thresholds_F1Score_negative_heatmap_2019-07-01-1211.png -------------------------------------------------------------------------------- /pipeline/reach-es-extractor/refparse/algo_evaluation/exploratory/thresholds_Precision_negative_heatmap_2019-07-01-1211.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/pipeline/reach-es-extractor/refparse/algo_evaluation/exploratory/thresholds_Precision_negative_heatmap_2019-07-01-1211.png -------------------------------------------------------------------------------- /pipeline/reach-es-extractor/refparse/algo_evaluation/exploratory/thresholds_Recall_negative_heatmap_2019-07-01-1211.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/pipeline/reach-es-extractor/refparse/algo_evaluation/exploratory/thresholds_Recall_negative_heatmap_2019-07-01-1211.png -------------------------------------------------------------------------------- /pipeline/reach-es-extractor/refparse/algo_evaluation/exploratory/title_lengths_2019-07-01-1211.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/pipeline/reach-es-extractor/refparse/algo_evaluation/exploratory/title_lengths_2019-07-01-1211.png -------------------------------------------------------------------------------- /pipeline/reach-es-extractor/refparse/algo_evaluation/results/.gitkeep: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/pipeline/reach-es-extractor/refparse/algo_evaluation/results/.gitkeep -------------------------------------------------------------------------------- /pipeline/reach-es-extractor/refparse/parse_latest.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code lets you run the reference parser with the 3 | latest scraped documents for an input organisations. 4 | e.g. 5 | python parse_latest.py msf 6 | which will parse and match the latest msf scrape in S3 7 | with the uber wellcome publications stored in S3 8 | """ 9 | 10 | from argparse import ArgumentParser 11 | from urllib.parse import urlparse 12 | import os 13 | import logging 14 | 15 | import boto3 16 | 17 | from .refparse import parse_references, create_argparser 18 | from .settings import settings 19 | 20 | parser = ArgumentParser(description=__doc__.strip()) 21 | 22 | ORG_NAMES = ( 23 | 'gov_uk', 24 | 'msf', 25 | 'nice', 26 | 'parliament', 27 | 'unicef', 28 | 'who_iris' 29 | ) 30 | 31 | if __name__ == "__main__": 32 | logger = settings.logger 33 | logger.setLevel(logging.INFO) 34 | 35 | parser = create_argparser(__doc__.strip()) 36 | parser.add_argument('org_name', choices=ORG_NAMES) 37 | 38 | args = parser.parse_args() 39 | org = args.org_name 40 | 41 | s3prefix = os.path.join(settings.SCRAPER_RESULTS_BASEDIR, org) 42 | u = urlparse(s3prefix) 43 | bucket_name, prefix = u.netloc, u.path[1:] 44 | 45 | s3 = boto3.resource('s3') 46 | bucket = s3.Bucket(bucket_name) 47 | 48 | # Get the most recently scraped filename 49 | key_name, obj = max( 50 | (obj.key, obj) for obj in bucket.objects.filter(Prefix=prefix).all() 51 | ) 52 | 53 | if args.output_url.startswith('file://'): 54 | # The output subfolder will be the name of the organisation 55 | # and the date of scrape (which is the name of the file) 56 | output_url = '{}/{}_{}'.format( 57 | args.output_url, 58 | org, 59 | 
import os
import logging


class BaseSettings:
    """Default (DEV) configuration for the reference parser."""

    logger = logging.getLogger(__name__)

    DEBUG = True

    # Thresholds used by the structuring / fuzzy-matching steps.
    PREDICTION_PROBABILITY_THRESHOLD = 0.75
    FUZZYMATCH_SIMILARITY_THRESHOLD = 0.8

    BUCKET = "datalabs-data"

    SCRAPER_RESULTS_BASEDIR = "s3://{}/scraper-results".format(BUCKET)
    SCRAPER_RESULTS_DIR = SCRAPER_RESULTS_BASEDIR
    SCRAPER_RESULTS_FILENAME = ''

    LOCAL_OUTPUT_DIR = 'local_output'
    STRUCTURED_REFS_FILENAME = 'structured_references.json'
    MATCHED_REFS_FILENAME = 'matched_references.json'

    MIN_CHAR_LIMIT = 20
    MATCH_TITLE_LENGTH_THRESHOLD = 40

    # Legacy reference class names expected by downstream consumers.
    REF_CLASSES = [
        'Authors', 'Journal', 'Volume', 'Issue', 'Pagination', 'Title',
        'PubYear',
    ]
    # Component names emitted by the Deep Reference Parser.
    DRP_REF_COMPONENTS = ['title', 'year', 'author']
    # Maps Deep Reference Parser component names onto the legacy names;
    # the legacy names are kept to avoid possible downstream errors.
    COMPONENT_NAME_MAP = {'title': 'Title', 'year': 'PubYear', 'author': 'Authors'}


class ProdSettings(BaseSettings):
    """Production configuration: S3-backed, no debug."""

    DEBUG = False
    S3 = True


class LocalSettings(BaseSettings):
    """Local development: scraper results come from the local filesystem."""

    DEBUG = True
    S3 = False
    SCRAPER_RESULTS_DIR = "scraper-results"


# The active settings class is chosen via the REF_PARSER_SETTINGS
# environment variable (DEV / LOCAL / PROD); defaults to LOCAL.
settings_mode = {
    'DEV': BaseSettings,
    'LOCAL': LocalSettings,
    'PROD': ProdSettings,
}
settings = settings_mode[os.environ.get('REF_PARSER_SETTINGS', 'LOCAL')]
import unittest
import pytest

from refparse.utils import structure_reference


class TestStructure(unittest.TestCase):
    """Checks for structure_reference(), which folds (token, label) pairs
    into a dict keyed by the legacy reference class names."""

    # Shared fixture: a typical MSF reference split into labelled tokens.
    MSF_REFERENCE = [
        ('Medecins', 'author'), ('Sans', 'author'), ('Frontières', 'author'),
        ('.', 'o'), ('TB', 'title'), ('Spot', 'title'), ('Report', 'title'),
        ('.', 'o'), ('2011', 'year'),
    ]

    def test_empty_components(self):
        structured = structure_reference([])
        self.assertEqual(structured.get('Title'), '', "Should be ''")

    def test_size(self):
        structured = structure_reference([])
        self.assertEqual(len(structured), 7, "Should be 7 classes predicted")

    def test_string_component(self):
        structured = structure_reference(self.MSF_REFERENCE)
        self.assertEqual(isinstance(structured['Title'], str), True, "Should be a string")

    def test_normal_components(self):
        structured = structure_reference(self.MSF_REFERENCE)
        self.assertEqual(structured['Title'], 'TB Spot Report', "Should be 'TB Spot Report'")

    def test_split_title(self):
        # Title tokens separated by a non-title token are still joined.
        components = [('TB', 'title'), ('Spot', 'author'), ('Report', 'title')]
        structured = structure_reference(components)
        self.assertEqual(structured['Title'], 'TB Report', "Should be 'TB Report'")
import re

# Compiled once at import time; clean_text() runs over every document
# section and every publication title. Raw-string patterns replace the
# original "\s{1,}" literal, which was an invalid escape sequence in a
# non-raw string (a DeprecationWarning, and an error in future Pythons).
_WHITESPACE_RE = re.compile(r"\s+")
_NON_ALPHANUM_RE = re.compile(r"[^A-Za-z0-9 ]")


class ExactMatcher:
    """Matches academic publications to policy documents by looking for
    the publication title, verbatim, inside the documents' section text.

    Args:
        sectioned_documents: iterable of objects with ``id`` and
            ``section`` attributes (a document and its section text).
        title_length_threshold: publication titles shorter than this
            (after cleaning) are skipped entirely, since short titles
            match too easily.
    """

    def __init__(self, sectioned_documents, title_length_threshold):
        self.texts = [
            (doc.id, self.clean_text(doc.section))
            for doc in sectioned_documents
        ]
        self.title_length_threshold = title_length_threshold

    def clean_text(self, string):
        """
        Input:
            -A string
        Output:
            -A string, lower-cased, with white space normalised to single
             spaces and non-alphanumeric characters removed
        Cleans up text such that it can easily be searched
        """
        # \s+ also covers newlines, so the separate "\n" substitution of
        # the original implementation is no longer needed.
        string = _WHITESPACE_RE.sub(" ", string)
        string = _NON_ALPHANUM_RE.sub("", string)
        return string.lower()

    def match(self, publication):
        """
        Input:
            publication: dict that contains title and uber_id of an
                academic publication
        Yields:
            matched_reference: dict that links an academic publication
                with a policy document
        """
        publication_title = self.clean_text(publication['title'])
        # Very short titles produce spurious exact matches; yield nothing.
        if len(publication_title) < self.title_length_threshold:
            return

        for doc_id, text in self.texts:
            if publication_title in text:
                yield {
                    'Document id': doc_id,
                    'Matched title': publication_title,
                    'Matched publication id': publication['uber_id'],
                    'Match algorithm': 'Exact match'
                }
import datetime
import re

from refparse.settings import settings


def structure_reference(reference_components):
    """Collapse (token, component) predictions into a single reference dict.

    Args:
        reference_components: list of ``(token, component)`` tuples as
            emitted by the deep reference parser,
            e.g. ``[('TB', 'title'), ('2011', 'year')]``.

    Returns:
        dict keyed by the legacy reference class names
        (``settings.REF_CLASSES``). Classes that were not predicted map to
        ``''``, which keeps the schema stable for possible downstream
        consumers.

    TO DO: Evaluate how often the same component type is predicted
    but not next to one another,
    e.g. components = ['title', 'title', 'year', 'title']
    """
    # Start with every class empty so callers always see the full schema.
    # (The original implementation also built unused ref_tokens /
    # ref_components lists here; they have been removed.)
    structured_reference = {ref_class: '' for ref_class in settings.REF_CLASSES}
    for component in settings.DRP_REF_COMPONENTS:
        structured_reference[
            settings.COMPONENT_NAME_MAP.get(component, component)
        ] = ' '.join(
            token for token, predicted in reference_components
            if predicted == component
        )

    return structured_reference
# Field size limits applied before serialisation — presumably the
# downstream storage column widths (TODO confirm against the schema).
_MAX_TITLE_LEN = 1024
_MAX_FIELD_LEN = 256


def _clip(value, limit):
    """Truncate ``value`` to ``limit`` characters if it is a non-empty
    string; return it unchanged otherwise."""
    if value and isinstance(value, str):
        return value[:limit]
    return value


def serialise_matched_reference(data, current_timestamp):
    """Serialise the data matched by the model.

    Args:
        data: dict with 'WT_Ref_Id', 'Cosine_Similarity' and 'Document id'.
        current_timestamp: creation timestamp recorded on the row.
    """
    return {
        'publication_id': data['WT_Ref_Id'],
        'cosine_similarity': data['Cosine_Similarity'],
        'datetime_creation': current_timestamp,
        'document_hash': data['Document id'],
    }


def serialise_reference(data, current_timestamp):
    """Serialise the data parsed by the model.

    String fields are truncated (title to 1024 characters, everything else
    to 256) and a non-integer 'PubYear' becomes ``None``. Unlike the
    original implementation, the caller's ``data`` dict is not mutated and
    a missing 'PubYear' key no longer raises KeyError; 'Document id'
    remains required.
    """
    title = data.get('Title')
    if title and len(title) > _MAX_TITLE_LEN:
        title = title[:_MAX_TITLE_LEN]

    pub_year = data.get('PubYear')
    if not isinstance(pub_year, int):
        pub_year = None

    return {
        'author': _clip(data.get('Authors'), _MAX_FIELD_LEN),
        'issue': _clip(data.get('Issue'), _MAX_FIELD_LEN),
        'journal': _clip(data.get('Journal'), _MAX_FIELD_LEN),
        'pub_year': pub_year,
        'pagination': _clip(data.get('Pagination'), _MAX_FIELD_LEN),
        'title': title,
        'file_hash': _clip(data['Document id'], _MAX_FIELD_LEN),
        'datetime_creation': current_timestamp,
        'volume': _clip(data.get('Volume'), _MAX_FIELD_LEN),
    }
-------------------------------------------------------------------------------- 1 | FROM reach.base 2 | 3 | WORKDIR /opt/reach 4 | 5 | COPY ./requirements.txt /opt/reach/requirements.indexer.txt 6 | 7 | RUN pip install -U pip && \ 8 | python3 -m pip install -r /opt/reach/requirements.indexer.txt 9 | 10 | 11 | COPY ./index_task.py /opt/reach/index_task.py 12 | 13 | # Give execution rights to the entrypoint Python script 14 | RUN chmod +x /opt/reach/index_task.py 15 | 16 | ENTRYPOINT ["/opt/reach/index_task.py"] 17 | -------------------------------------------------------------------------------- /pipeline/reach-es-indexer/requirements.txt: -------------------------------------------------------------------------------- 1 | elasticsearch 2 | -------------------------------------------------------------------------------- /pipeline/reach-fuzzy-matcher/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM reach.base 2 | 3 | WORKDIR /opt/reach 4 | 5 | COPY ./requirements.txt /opt/reach/requirements.fuzzymatcher.txt 6 | 7 | RUN pip install -U pip && \ 8 | python3 -m pip install -r /opt/reach/requirements.fuzzymatcher.txt 9 | 10 | 11 | COPY ./fuzzymatcher_task.py /opt/reach/fuzzymatcher_task.py 12 | 13 | # Give execution rights to the entrypoint Python script 14 | RUN chmod +x /opt/reach/fuzzymatcher_task.py 15 | 16 | ENTRYPOINT ["/opt/reach/fuzzymatcher_task.py"] 17 | -------------------------------------------------------------------------------- /pipeline/reach-fuzzy-matcher/requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/pipeline/reach-fuzzy-matcher/requirements.txt -------------------------------------------------------------------------------- /pipeline/reach-parser/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM 
reach.base 2 | 3 | WORKDIR /opt/reach 4 | 5 | COPY ./requirements.txt /opt/reach/requirements.parser.txt 6 | 7 | RUN pip install -U pip && \ 8 | python3 -m pip install -r /opt/reach/requirements.parser.txt 9 | 10 | 11 | COPY ./parser_task.py /opt/reach/parser_task.py 12 | COPY ./pdf_parser /opt/reach/pdf_parser 13 | COPY ./normalizer /opt/reach/normalizer 14 | 15 | # Give execution rights to the entrypoint Python script 16 | RUN chmod +x /opt/reach/parser_task.py 17 | 18 | ENTRYPOINT ["/opt/reach/parser_task.py"] 19 | -------------------------------------------------------------------------------- /pipeline/reach-parser/Dockerfile.test: -------------------------------------------------------------------------------- 1 | FROM reach.base 2 | 3 | WORKDIR /opt/reach 4 | 5 | COPY ./requirements.txt /opt/reach/requirements.parser.txt 6 | 7 | RUN pip install -U pip && \ 8 | python3 -m pip install -r /opt/reach/requirements.parser.txt 9 | 10 | 11 | COPY ./parser_task.py /opt/reach/parser_task.py 12 | COPY ./pdf_parser /opt/reach/pdf_parser 13 | COPY ./normalizer /opt/reach/normalizer 14 | 15 | # Give execution rights to the entrypoint Python script 16 | RUN chmod +x /opt/reach/parser_task.py 17 | -------------------------------------------------------------------------------- /pipeline/reach-parser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/pipeline/reach-parser/__init__.py -------------------------------------------------------------------------------- /pipeline/reach-parser/normalizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/pipeline/reach-parser/normalizer/__init__.py -------------------------------------------------------------------------------- 
class ParsePdfOperator:
    """Parses every scraped PDF for an organisation and writes the parser
    output to S3. (The original docstring — "Pulls data from the
    dimensions.ai" — was a copy-paste error.)

    Args:
        organisation: The organisation whose documents should be parsed.
        src_s3_dir: s3:// URL of the directory holding the scraped PDFs.
        dst_s3_key: s3:// URL the parser output is written to.
    """

    def __init__(self, organisation, src_s3_dir, dst_s3_key):
        self.organisation = organisation
        self.src_s3_dir = src_s3_dir
        self.dst_s3_key = dst_s3_key

        self.client = s3hook.S3Hook()

    @report_exception
    def execute(self):
        """Run the PDF parser over every document in ``src_s3_dir``.

        Raises:
            ValueError: if either S3 location is not an s3:// URL.
        """
        os.environ.setdefault(
            'SCRAPY_SETTINGS_MODULE',
            'scraper.wsf_scraping.settings'
        )
        # Fail fast with an actionable message instead of the original
        # bare ValueError.
        if not self.src_s3_dir.startswith('s3://'):
            raise ValueError(
                "src_s3_dir must start with 's3://', got %r" % self.src_s3_dir
            )
        if not self.dst_s3_key.startswith('s3://'):
            raise ValueError(
                "dst_s3_key must start with 's3://', got %r" % self.dst_s3_key
            )

        pdf_parser_main.parse_all_pdf(
            self.organisation,
            self.src_s3_dir,
            self.dst_s3_key,
        )
68 | ) 69 | 70 | args = arg_parser.parse_args() 71 | 72 | # Create an intermediate folder in s3 for raw parser output 73 | parser_dst_key = args.dst_s3_key.replace( 74 | "_normalized", 75 | "_raw", 76 | ) 77 | 78 | parser = ParsePdfOperator( 79 | args.organisation, 80 | args.src_s3_dir, 81 | parser_dst_key 82 | ) 83 | parser.execute() 84 | 85 | normalizer = PolicyNameNormalizerOperator( 86 | args.organisation, 87 | parser_dst_key, 88 | args.dst_s3_key 89 | ) 90 | 91 | normalizer.normalize() 92 | -------------------------------------------------------------------------------- /pipeline/reach-parser/pdf_parser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/pipeline/reach-parser/pdf_parser/__init__.py -------------------------------------------------------------------------------- /pipeline/reach-parser/pdf_parser/objects/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/pipeline/reach-parser/pdf_parser/objects/__init__.py -------------------------------------------------------------------------------- /pipeline/reach-parser/pdf_parser/resources/keywords.txt: -------------------------------------------------------------------------------- 1 | # 1.Wellcome places 2 | 3 | Sanger 4 | Wellcome 5 | MOP 6 | FARR 7 | Crick 8 | Hilleman Institute 9 | Night Star 10 | Syncona 11 | 12 | # 2. Coalitions 13 | 14 | UPD 15 | CARI 16 | O’Neil review 17 | Cepi 18 | AESA 19 | Gavi 20 | India Alliance 21 | CarbX 22 | GLOPID/R 23 | Science Media Centre 24 | NC3R 25 | MQ 26 | National Stem Learning Centre 27 | 28 | # 3. Programmes 29 | 30 | Deltas 31 | H3 Africa 32 | 10000 Genome 33 | Genomics England 34 | HRCS 35 | 36 | # 4. Individuals we fund 37 | 38 | 39 | # 5. 
import unittest

from lxml import etree

from pdf_parser.pdf_parse import parse_pdf_document
from pdf_parser.tools.extraction import (_find_elements,
                                         _flatten_text,
                                         _flatten_fontspec)
from tests.common import TEST_PDF, TEST_XML


class TestTools(unittest.TestCase):
    """Exercises section-element extraction against the test PDF fixture."""

    def setUp(self):
        self.test_file = open(TEST_PDF, 'rb')
        self.pdf_file_object, _, _, errors = parse_pdf_document(self.test_file)
        assert not errors

    def tearDown(self):
        self.test_file.close()

    def test_element_finder(self):
        # The fixture PDF contains no 'Reference' section.
        elements = _find_elements(self.pdf_file_object, 'Reference')
        self.assertEqual(elements, [])


class TestFlattenTools(unittest.TestCase):
    """Exercises the XML flattening helpers against the test XML fixture."""

    def setUp(self):
        self.test_file = open(TEST_XML, 'r')
        tree = etree.parse(self.test_file)
        self.fontspecs = tree.xpath('//fontspec')
        self.texts = tree.xpath('//text')

    def tearDown(self):
        # Previously commented out, leaking one file handle per test.
        self.test_file.close()

    def test_flatten_text(self):
        text = _flatten_text(self.texts[0])
        self.assertEqual(text, "Test Page 1")
        self.assertIs(type(text), str)

    def test_flatten_texts(self):
        """ Ensure that _flatten_text adequately captures text with formatting.
        """
        texts = [_flatten_text(i) for i in self.texts]
        self.assertIs(type(texts), list)
        self.assertIs(len(texts), 10)
        self.assertEqual(texts[1], 'All bold line.')
        self.assertEqual(texts[2], 'Partly bold line.')
        self.assertEqual(texts[3], 'All italic line.')
        self.assertEqual(texts[4], 'Partly italic line.')

    def test_flatten_fontspec(self):
        font_map = _flatten_fontspec(self.fontspecs)
        self.assertEqual(len(font_map), 2)
        self.assertIs(type(font_map), dict)
./scrapy.cfg /etc/reach/scrapy.cfg 12 | COPY ./spider_task.py /opt/reach/spider_task.py 13 | COPY ./wsf_scraping /opt/reach/wsf_scraping 14 | 15 | # Give execution rights to the entrypoint Python script 16 | RUN chmod +x /opt/reach/spider_task.py 17 | 18 | ENTRYPOINT ["/opt/reach/spider_task.py"] 19 | -------------------------------------------------------------------------------- /pipeline/reach-scraper/Dockerfile.test: -------------------------------------------------------------------------------- 1 | FROM reach.base 2 | 3 | WORKDIR /opt/reach 4 | 5 | COPY ./requirements.txt /opt/reach/requirements.scraper.txt 6 | 7 | RUN pip install -U pip && \ 8 | python3 -m pip install -r /opt/reach/requirements.scraper.txt 9 | 10 | 11 | COPY ./scrapy.cfg /etc/reach/scrapy.cfg 12 | COPY ./spider_task.py /opt/reach/spider_task.py 13 | COPY ./wsf_scraping /opt/reach/wsf_scraping 14 | 15 | # Give execution rights to the entrypoint Python script 16 | RUN chmod +x /opt/reach/spider_task.py 17 | -------------------------------------------------------------------------------- /pipeline/reach-scraper/README.md: -------------------------------------------------------------------------------- 1 | # scraper 2 | 3 | A web scraper tool to get data for evaluating Wellcome impact. 
4 | 5 | ## What do we scrape 6 | 7 | 8 | | Organisation | What is scraped | Years | 9 | |--------------|-------------------------------------------------------------------------------------|-------------| 10 | | WHO | Everything on apps.who.int/iris | 2012 - 2019 | 11 | | NICE | All the Guidances and evidences | 2000 - 2019 | 12 | | MSF | All the reports and activity reports | 2007 - 2019 | 13 | | GOV | Everything from gov.uk/government/publications | 1945 - 2019 | 14 | | UNICEF | Everything from data.unicef.org/resources/resource-type/[publications and guidance] | 2010 - 2019 | 15 | | Parliament | Everything from search-material.parliament.uk | 1984 - 2019 | 16 | 17 | 18 | 19 | ## Output Formatting 20 | 21 | The outputed file is meant to contain a number a different fields, which 22 | can vary depending on the scraper provider. 23 | 24 | It will always have the following attributes, though: 25 | 26 | |Unique|Attribute|Description| 27 | |------|---------|-----------| 28 | | |title | a string containing the document title| 29 | |* |uri | the url of the document| 30 | | |pdf | the name of the file| 31 | | |sections | a json object of section names, containing the text extracted from matching sections| 32 | | |keywords | a json object of keywords, containing the text extracted from matching text| 33 | |* |hash | a md5 digest of the file| 34 | | |provider | the provider from where the file has been downloaded| 35 | | |date_scraped | the date (YYYYMMDD) when the article has been scraped| 36 | 37 | Some providers will have additional parameters: 38 | 39 | ### WHO 40 | 41 | |Attribute|Description| 42 | |---------|-----------| 43 | |year | the publication year of the document| 44 | |types | an array containing the WHO type associated with the document| 45 | |subjects | an array containing the WHO subjects of the document| 46 | |authors | an array containing the authors (from WHO)| 47 | 48 | ### Nice 49 | 50 | |Attribute|Description| 51 | |---------|-----------| 52 | |year 
| the publication year of the document| 53 | 54 | ### Parliament 55 | 56 | |Attribute|Description| 57 | |---------|-----------| 58 | |year | the publication year of the document| 59 | |types | the type of the document | 60 | -------------------------------------------------------------------------------- /pipeline/reach-scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/pipeline/reach-scraper/__init__.py -------------------------------------------------------------------------------- /pipeline/reach-scraper/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.4' 2 | 3 | x-airflow-image: &airflow-image 4 | 160358319781.dkr.ecr.eu-west-1.amazonaws.com/uk.ac.wellcome/reach:latest 5 | 6 | x-env: &env 7 | AWS_ACCESS_KEY_ID: "${AWS_ACCESS_KEY_ID}" 8 | AWS_SECRET_ACCESS_KEY: "${AWS_SECRET_ACCESS_KEY}" 9 | AWS_SESSION_TOKEN: "${AWS_SESSION_TOKEN}" 10 | 11 | SENTRY_DSN: "${SENTRY_DSN}" 12 | 13 | services: 14 | scraper_msf: 15 | image: uk.ac.wellcome/reach/scraper:latest 16 | environment: *env 17 | entrypoint: 18 | - /opt/scraper/spider_task.py 19 | - s3://datalabs-dev/scraper/split-container/ 20 | - msf 21 | -------------------------------------------------------------------------------- /pipeline/reach-scraper/pg_exists.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | """ 4 | Tests for whether something exists in postgres or not. 
def check_table(dsn, tablename):
    """Return 0 (shell-style success) if ``tablename`` exists in the
    ``public`` schema of the database at ``dsn``, else 1.

    The connection is closed explicitly: psycopg2's ``with connection``
    block only wraps a transaction — it commits/rolls back but does NOT
    close the connection (the original leaked it; harmless for a
    short-lived script, but wrong).
    """
    con = psycopg2.connect(dsn, connect_timeout=CONNECT_TIMEOUT)
    try:
        with con:
            with con.cursor() as c:
                c.execute(
                    'SELECT 1 FROM pg_tables '
                    'WHERE schemaname = %s AND tablename = %s',
                    ('public', tablename)
                )
                if c.fetchone() == (1,):
                    return 0
        return 1
    finally:
        con.close()
def test_connection(dsn):
    """Open a short-lived connection to ``dsn`` and run a trivial query.

    Raises ``psycopg2.OperationalError`` if postgres is not reachable.
    """
    with psycopg2.connect(dsn, connect_timeout=CONNECT_TIMEOUT) as con:
        with con.cursor() as c:
            c.execute('SELECT 1')


def pg_isready(dsn, timeout, success_secs):
    """Poll postgres until it has been continuously ready for
    ``success_secs`` seconds, or ``timeout`` seconds have elapsed.

    Requiring a sustained window of successful queries (rather than a
    single one) accommodates init scripts that restart the database
    shortly after it first accepts connections (see module docstring).

    Returns:
        0 if postgres stayed ready for a full ``success_secs`` window,
        1 if the overall ``timeout`` expired first.
    """
    start = time.time()
    while time.time() - start < timeout:
        try:
            # Inner loop: postgres must keep answering for the whole
            # success window; any OperationalError restarts the window
            # via the outer loop.
            success_start = time.time()
            while time.time() - success_start < success_secs:
                if time.time() - start > timeout:
                    # Overall deadline expired mid-window.
                    return 1
                test_connection(dsn)
                logging.debug('pg_isready: successful connect')
                time.sleep(POLL_WAIT)
            return 0
        except psycopg2.OperationalError as e:
            logging.debug('pg_isready: %s', e)
            time.sleep(POLL_WAIT)
    return 1
class AjaxContract(Contract):
    """Add headers to a contract request so that it becomes an AJAX request.

    Sets ``X-Requested-With: XMLHttpRequest`` plus a nice.org.uk referer —
    presumably to mirror the NICE spider's AJAX pagination requests
    (confirm against the spider).
    """
    name = "ajax"

    def adjust_request_args(self, kwargs):
        # Called by scrapy's contract machinery with the request kwargs;
        # must return the (possibly modified) kwargs dict.
        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'referer': 'https://www.nice.org.uk/guidance/published'
        }
        kwargs['headers'] = headers
        return kwargs
class ManifestFeedStorage(BlockingFeedStorage):
    """Scrapy feed storage that keeps the scrape manifest in Amazon S3.

    It is given the information about the PDF files scraped in the
    pipeline and processes it to update the manifest file in S3. The
    PDF files themselves are saved to S3 in the pipeline.py file.
    """

    def __init__(self, url):
        """Initialise the feed storage with the destination feed URI.

        Args:
            url: S3 key URL under which the manifest is stored.
        """
        self.logger = logging.getLogger(__name__)
        self.dst_key_url = url
        # Set in open(); needed later to namespace the manifest by spider.
        self.spider = None

    def open(self, spider):
        """Called automatically by scrapy to receive items returned by
        the pipeline. Initialises the object with an S3 file-system
        backend and records the spider (whose name identifies the
        organisation).

        Should always return the parent class's open() result.
        """
        self.spider = spider
        self.file_system = S3Hook()
        return super(ManifestFeedStorage, self).open(spider)

    @report_exception
    def _store_in_thread(self, data_file):
        """
        Uploads our manifest file to S3.

        Called in Twisted's thread pool using
        twisted.internet.deferToThread. Thus the explicit exception
        reporting above.

        Args:
            data_file: file object holding the serialized feed output
                (provided by BlockingFeedStorage).
        """
        self.logger.info('Updating the manifest at {dst_key_url}'.format(
            dst_key_url=self.dst_key_url,
        ))
        try:
            self.file_system.update_manifest(
                data_file,
                self.dst_key_url,
                self.spider.name
            )
        except Exception as e:
            # If it went bad, we need to inform the spider back in
            # Twisted space, so that eventually the calling airflow task
            # can find out, too.
            self.logger.error('ManifestFeedStorage error: %s', e)
            result = threads.blockingCallFromThread(
                reactor,
                self.spider.crawler.signals.send_catch_log,
                signal=manifest_storage_error,
                exception=e
            )
            self.logger.info('send_catch_log: %s', result)
            raise
# -*- coding: utf-8 -*-
import logging
import os
from datetime import datetime

# Get feed configuration from environment variable. Default to debug
FEED_CONFIG = os.environ.get('SCRAPY_FEED_CONFIG', 'DEBUG')
BOT_NAME = 'wsf_scraper'

SPIDER_MODULES = ['wsf_scraping.spiders']
NEWSPIDER_MODULE = 'wsf_scraping.spiders'

# Custom contracts for spider testing
SPIDER_CONTRACTS = {
    'wsf_scraping.contracts.AjaxContract': 10,
}
ITEM_PIPELINES = {
    'wsf_scraping.pipelines.WsfScrapingPipeline': 10,
}
FEED_STORAGES = {
    'manifests3': 'wsf_scraping.feed_storage.ManifestFeedStorage',
    'local': 'wsf_scraping.feed_storage.ManifestFeedStorage',
}

SPIDER_MIDDLEWARES = {
    'wsf_scraping.middlewares.ReachDisallowedHostMiddleware': 450,
}

LOG_LEVEL = 'INFO'
LOG_FORMATTER = 'wsf_scraping.middlewares.PoliteLogFormatter'

# Set pdfminer log to WARNING
logging.basicConfig()
logging.getLogger("pdfminer").setLevel(logging.WARNING)

DUPEFILTER_CLASS = 'scrapy.dupefilters.RFPDupeFilter'
# Use a physical (disk-backed) queue: slower, but adds reliability
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'

# Crawl responsibly by identifying yourself (and your website)
USER_AGENT = 'Wellcome Reach Scraper (datalabs-ops@wellcome.ac.uk)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 5
CONCURRENT_REQUESTS_PER_DOMAIN = 5
RETRY_ENABLED = True
RETRY_TIMES = 3
# 0 disables scrapy's download size warning/limit (scraped PDFs can be large)
DOWNLOAD_WARNSIZE = 0
DOWNLOAD_MAXSIZE = 0
DOWNLOAD_TIMEOUT = 20
DOWNLOAD_FAIL_ON_DATALOSS = True
DOWNLOAD_DELAY = 0.25

HTTPCACHE_ENABLED = False

AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 0.1
AUTOTHROTTLE_MAX_DELAY = 0.5
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Disable cookies
COOKIES_ENABLED = False

# Maximum number of articles to scrape; -1 means no limit
MAX_ARTICLE = int(os.environ.get('MAX_ARTICLE', '-1'))

# who_iris and who_iris_single_page dedicated settings
WHO_IRIS_RPP = 250
WHO_IRIS_LIMIT = False
if 'WHO_IRIS_YEARS' in os.environ:
    WHO_IRIS_YEARS = [
        int(x) for x in os.environ['WHO_IRIS_YEARS'].split(',')
    ]
else:
    WHO_IRIS_YEARS = list(range(2012, datetime.now().year + 1))

# nice dedicated settings
NICE_GET_HISTORY = False
NICE_GET_EVIDENCES = False

KEYWORDS_CONTEXT = 0

# Jsonlines are cleaner for big feeds
FEED_FORMAT = 'jsonlines'
FEED_EXPORT_ENCODING = 'utf-8'
FEED_TEMPDIR = '/tmp/'

# By default, log the results in a local folder
FEED_URI = os.environ.get('SCRAPY_FEED_URI', 'local:///tmp/%(name)s')

DATABASE_URL = os.environ.get('DATABASE_URL')
import scrapy
from .base_spider import BaseSpider


class MsfSpider(BaseSpider):
    """Spider for the MSF UK website's activity-report and report listings."""

    name = 'msf'

    def start_requests(self):
        """Set up the initial request to the website to scrape."""

        urls = [
            'https://www.msf.org.uk/activity-reports',
            'https://www.msf.org.uk/reports',
        ]

        for url in urls:
            # The two listing pages have different markup, so they get
            # different parse callbacks.
            callback = self.parse
            if "/reports" in url:
                callback = self.parse_reports

            self.logger.info('Initial url: %s', url)
            yield scrapy.Request(
                url=url,
                errback=self.on_error,
                callback=callback,
            )

    def parse(self, response):
        """ Parse activity-reports pages.

        @url https://www.msf.org.uk/activity-reports
        @returns items 0 0
        @returns requests 10
        """

        # TODO: Can pull document title from image alt or title properties

        doc_links = list(response.css('.field-item p'))

        for item in doc_links:
            url = item.xpath('.//a[@class="btn"]/@href').extract_first()
            image_alt = item.xpath('.//img[@class="media-element file-default"]/@alt').extract_first()

            if self._is_valid_pdf_url(url):
                data_dict = {
                    'source_page': response.url,
                    'page_title': response.xpath('/html/head/title/text()').extract_first(),
                    'title': image_alt
                }
                yield scrapy.Request(
                    url=response.urljoin(url),
                    errback=self.on_error,
                    callback=self.save_pdf,
                    meta={'data_dict': data_dict}
                )

    def parse_reports(self, response):
        """ Parse the reports listing page.

        Args:
            response: the reports page response.
        Yields:
            One request per valid PDF link, handled by save_pdf.
        """

        doc_links = list(response.css('.field-item a'))

        page_title = response.xpath('/html/head/title/text()').extract_first()

        for item in doc_links:
            url = item.xpath('@href').extract_first()
            if self._is_valid_pdf_url(url):
                # BUGFIX: build a fresh dict per request. The previous
                # code mutated one shared data_dict across iterations,
                # so by the time scrapy processed the queued requests
                # every one of them carried the *last* link's title.
                data_dict = {
                    'source_page': response.url,
                    'page_title': page_title,
                    'title': item.xpath('text()').extract_first(),
                }
                yield scrapy.Request(
                    url=response.urljoin(url),
                    errback=self.on_error,
                    callback=self.save_pdf,
                    dont_filter=True,
                    meta={'data_dict': data_dict}
                )
class Crawler:
    """Bare-bones stand-in for scrapy's Crawler, used by the spider tests."""

    class Stats:
        """Stats stub: every lookup reports that nothing was recorded."""

        @staticmethod
        def get_value(*args):
            # Accept and ignore any arguments; tests only need a None back.
            return None

    # A single shared instance is fine — the stub holds no state.
    stats = Stats()
'content-type': b'application/pdf' 33 | } 34 | request = Request('http://foo.bar/documents/document.pdf', meta=meta) 35 | self.pdf_response = Response( 36 | 'http://foo.bar/documents/document.pdf', 37 | body=self.test_file.read(), 38 | request=request, 39 | headers=headers 40 | ) 41 | 42 | def tearDown(self): 43 | self.test_file.close() 44 | 45 | def test_base_spider(self): 46 | """Tests if, given a pdf-like response containing a data_dict metadata, 47 | the save_pdf method does: 48 | - Create a NamedTemporaryFile 49 | - Return an item 50 | """ 51 | 52 | res = self.spider.save_pdf(self.pdf_response) 53 | self.assertTrue(res) 54 | self.assertTrue('foo' == res['title']) 55 | -------------------------------------------------------------------------------- /test_target/README.md: -------------------------------------------------------------------------------- 1 | # Scrape Target 2 | 3 | A basic HTTP endpoint serving HTML pages for testing the scraper 4 | 5 | -------------------------------------------------------------------------------- /test_target/inner_page.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Inner Page 5 | 6 | 7 | 8 | 9 | 10 |
11 | 12 |
13 | 14 |

Some Page Title

15 | 16 |

Spicy jalapeno bacon ipsum dolor amet buffalo leberkas spare ribs chuck ball tip short ribs hamburger. Capicola drumstick chicken, swine turkey picanha frankfurter jowl shank landjaeger. Rump leberkas beef ribs bacon flank shankle. Pastrami porchetta tongue spare ribs ball tip shoulder strip steak doner ham hock sausage. Prosciutto cupim shoulder, ham hock pork chop capicola pig andouille shank pork loin salami doner pork belly.

17 | 18 |

Spicy jalapeno bacon ipsum dolor amet buffalo leberkas spare ribs chuck ball tip short ribs hamburger. Capicola drumstick chicken, swine turkey picanha frankfurter jowl shank landjaeger. Rump leberkas beef ribs bacon flank shankle. Pastrami porchetta tongue spare ribs ball tip shoulder strip steak doner ham hock sausage. Prosciutto cupim shoulder, ham hock pork chop capicola pig andouille shank pork loin salami doner pork belly.

19 | 20 |

Spicy jalapeno bacon ipsum dolor amet buffalo leberkas spare ribs chuck ball tip short ribs hamburger. Capicola drumstick chicken, swine turkey picanha frankfurter jowl shank landjaeger. Rump leberkas beef ribs bacon flank shankle. Pastrami porchetta tongue spare ribs ball tip shoulder strip steak doner ham hock sausage. Prosciutto cupim shoulder, ham hock pork chop capicola pig andouille shank pork loin salami doner pork belly.

21 | 22 |
23 | 24 | Download Now 25 | 26 |
27 | 28 |
29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /test_target/page.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Demo Page 5 | 6 | 7 | 8 | 9 | 10 |
11 | 12 |
13 | 14 |

Some Page Title

15 | 16 |

Spicy jalapeno bacon ipsum dolor amet buffalo leberkas spare ribs chuck ball tip short ribs hamburger. Capicola drumstick chicken, swine turkey picanha frankfurter jowl shank landjaeger. Rump leberkas beef ribs bacon flank shankle. Pastrami porchetta tongue spare ribs ball tip shoulder strip steak doner ham hock sausage. Prosciutto cupim shoulder, ham hock pork chop capicola pig andouille shank pork loin salami doner pork belly.

17 | 18 |

Spicy jalapeno bacon ipsum dolor amet buffalo leberkas spare ribs chuck ball tip short ribs hamburger. Capicola drumstick chicken, swine turkey picanha frankfurter jowl shank landjaeger. Rump leberkas beef ribs bacon flank shankle. Pastrami porchetta tongue spare ribs ball tip shoulder strip steak doner ham hock sausage. Prosciutto cupim shoulder, ham hock pork chop capicola pig andouille shank pork loin salami doner pork belly.

19 | 20 |

Spicy jalapeno bacon ipsum dolor amet buffalo leberkas spare ribs chuck ball tip short ribs hamburger. Capicola drumstick chicken, swine turkey picanha frankfurter jowl shank landjaeger. Rump leberkas beef ribs bacon flank shankle. Pastrami porchetta tongue spare ribs ball tip shoulder strip steak doner ham hock sausage. Prosciutto cupim shoulder, ham hock pork chop capicola pig andouille shank pork loin salami doner pork belly.

21 | 22 |
23 | 24 | View Page 25 | View Page 2 26 | View Page 3 27 | 28 |
29 | 30 |
import os

import tornado.ioloop
import tornado.web

PORT = 8888

class MainHandler(tornado.web.RequestHandler):
    """Serves the scrape-target landing page."""
    def get(self):
        self.render("page.html")

class PageHandler(tornado.web.RequestHandler):
    """Serves the inner page that every /page* route points at."""
    def get(self):
        self.render("inner_page.html")

class RobotHandler(tornado.web.RequestHandler):
    """Serves robots.txt so robots.txt-obeying crawlers will proceed."""
    def get(self):
        self.render("robots.txt")

def make_app():
    """Build the tornado Application with all test-target routes."""
    # Idiom fix: os.path.dirname is clearer and more portable than
    # manually splitting the absolute path on "/".
    basedir = os.path.dirname(os.path.abspath(__file__))
    return tornado.web.Application([
        (r"/", MainHandler,),
        (r"/robots.txt", RobotHandler,),
        (r"/page", PageHandler,),
        (r"/page2", PageHandler,),
        (r"/page3", PageHandler,),
        (r"/static/(.*)", tornado.web.StaticFileHandler, {'path': basedir}),
    ])

if __name__ == "__main__":
    app = make_app()
    app.listen(PORT)
    print("### Test Scrape Target")
    # Consistency fix: derive the banner from PORT instead of repeating
    # the value as a hard-coded literal.
    print("    localhost:%d" % PORT)
    tornado.ioloop.IOLoop.current().start()
# Build/dev helper targets for the Reach web front-end.
PYTHON := ${PWD}/venv/bin/python
GUNICORN := ${PWD}/venv/bin/gunicorn
STATIC_ROOT := ${PWD}/build/web/static
DOCS_STATIC_ROOT := ${PWD}/docs/build/html/_static
SENTRY_DSN := ""
CMD_ARGS := "--bind=127.0.0.1 --workers=1 --reload"
CONFIG_FILE := ${PWD}/config/dev.config.toml
APP_OUT_DIR := ${STATIC_ROOT}/js
# BUGFIX: this line previously read `APP_OUT_DIR := ${CSS_OUT_DIR}/css`,
# which clobbered APP_OUT_DIR and left CSS_OUT_DIR (used by the
# watch-styles target) undefined/empty.
CSS_OUT_DIR := ${STATIC_ROOT}/css


.PHONY: setup
setup:
	python3 -m venv --copies venv
	./venv/bin/pip install -r requirements.txt

.PHONY: run-server
run-server:
	CONFIG_FILE=${PWD}/config/dev.config.toml ${GUNICORN} web:application

.PHONY: watch
watch: watch-styles watch-app

.PHONY: watch-app
watch-app:
	parcel watch web/src/js/app.js --out-dir ${APP_OUT_DIR}

.PHONY: watch-styles
watch-styles:
	parcel watch web/src/css/style.less --out-dir ${CSS_OUT_DIR}


.PHONY: run
run: run-server watch
4 | 5 | DESTDIR=$1 6 | if [ -z "$DESTDIR" ]; then 7 | echo "Usage: $0 /path/to/static/vendor" >&2 8 | exit 1 9 | fi 10 | 11 | 12 | SPECTRE_VERSION=0.5.8 13 | 14 | 15 | # Spectre CSS 16 | mkdir -p $DESTDIR/spectre-${SPECTRE_VERSION} 17 | curl -L https://github.com/picturepan2/spectre/archive/v${SPECTRE_VERSION}.tar.gz \ 18 | | tar -C $DESTDIR/spectre-${SPECTRE_VERSION} \ 19 | -xzf - \ 20 | --strip-components 2 \ 21 | spectre-${SPECTRE_VERSION}/dist 22 | -------------------------------------------------------------------------------- /web/config/docker.config.toml: -------------------------------------------------------------------------------- 1 | debug = true 2 | static_root = "../build/web/static" 3 | docs_static_root = "docs/build/html/_static" 4 | 5 | [database] 6 | db_port = 5432 7 | db_name = "" 8 | db_host = "" 9 | db_user = "" 10 | db_password = "" 11 | min_conns = 1 12 | max_conns = 30 13 | 14 | 15 | [sentry] 16 | dsn = "" 17 | 18 | [analytics] 19 | ga_code = null 20 | hotjar_code = null 21 | 22 | [github] 23 | github_token = "" 24 | github_user = "" 25 | -------------------------------------------------------------------------------- /web/config/local.config.toml: -------------------------------------------------------------------------------- 1 | debug = true 2 | static_root = "../build/web/static" 3 | docs_static_root = "docs/build/html/_static" 4 | 5 | [database] 6 | db_port = 5432 7 | db_name = "" 8 | db_host = "" 9 | db_user = "" 10 | db_password = "" 11 | min_conns = 1 12 | max_conns = 30 13 | 14 | [sentry] 15 | dsn = "" 16 | 17 | [analytics] 18 | ga_code = "" 19 | hotjar_code = "" 20 | 21 | [github] 22 | github_token = "" 23 | github_user = "" 24 | -------------------------------------------------------------------------------- /web/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "reach-web", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "main.js", 6 | "scripts": { 7 | 
"test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "author": "", 10 | "license": "MIT", 11 | "devDependencies": { 12 | "@babel/core": "^7.10.5", 13 | "@babel/plugin-transform-arrow-functions": "^7.10.4", 14 | "@babel/plugin-transform-for-of": "^7.10.4", 15 | "@babel/plugin-transform-typeof-symbol": "^7.10.4", 16 | "@babel/preset-env": "^7.10.4" 17 | }, 18 | "dependencies": { 19 | "core-js": "^3.6.5", 20 | "less": "^3.12.2" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /web/requirements.txt: -------------------------------------------------------------------------------- 1 | falcon 2 | gunicorn 3 | jinja2 4 | sentry-sdk 5 | psycopg2-binary 6 | toml 7 | uuid 8 | requests 9 | -------------------------------------------------------------------------------- /web/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | # Current directory must be data-labs repo before running setup.py! 
import psycopg2
from contextlib import contextmanager

from psycopg2.extras import RealDictCursor
from psycopg2.pool import ThreadedConnectionPool

from web import config as conf

# Lazily-created, process-wide connection pool (see create_pool()).
pool = None
# NOTE(review): these module constants appear unused — pool sizing is
# taken from conf.CONFIG.min_conns / max_conns instead. Kept for
# backward compatibility with any external importers.
MIN_CONNS = 1
MAX_CONNS = 30

def create_pool():
    """Create the global connection pool on first use and return it.

    Pool sizing and connection parameters come from the application
    config (conf.CONFIG).
    """
    global pool

    if pool is None:
        pool = ThreadedConnectionPool(
            conf.CONFIG.min_conns,
            conf.CONFIG.max_conns,
            database=conf.CONFIG.db_name,
            user=conf.CONFIG.db_user,
            password=conf.CONFIG.db_password,
            host=conf.CONFIG.db_host,
            port=conf.CONFIG.db_port,
        )
    return pool

@contextmanager
def get_db_connection():
    """Yields a database connection from the pool.

    The connection is returned to the pool when the block exits, even
    if the caller raises.
    """
    connection = None
    try:
        if pool is None:
            create_pool()
        connection = pool.getconn()
        yield connection
    finally:
        # BUGFIX: only return a connection we actually obtained. The old
        # code called pool.putconn(connection) unconditionally, which
        # itself raised (masking the original exception) whenever
        # create_pool() or getconn() had failed and connection was None.
        if connection is not None:
            pool.putconn(connection)

@contextmanager
def get_db_cur(commit=False, name=None):
    """ Yields a cursor against the database

    Args:
        commit: Whether to commit at the end of a transaction
        name: Optional name, making this a server-side (named) cursor
    """
    with get_db_connection() as connection:
        cursor = connection.cursor(cursor_factory=RealDictCursor, name=name)
        try:
            yield cursor
            if commit:
                connection.commit()
        finally:
            cursor.close()
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /web/web/docs/README.md: -------------------------------------------------------------------------------- 1 | # Reach API Documentation 2 | 3 | ## How to contribute 4 | 5 | ### Requirements: 6 | - Python > 3.6 7 | - Virtualenv 8 | - Pip 9 | 10 | ### Install the documentation stack: 11 | 12 | ``` 13 | virtualenv env -p python3 14 | source env/bin/activate 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | ### Build the documentation 19 | 20 | Sphinx accepts two types of files: 21 | - `.rst`: re:Structured files 22 | - `.md`: Markdown formatted files 23 | 24 | While both are allowed, for consistency, writing .md files is recommended. 25 | Once all files are written, add them by name to index.rst and run `make html`. 26 | -------------------------------------------------------------------------------- /web/web/docs/build/doctrees/api.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/doctrees/api.doctree -------------------------------------------------------------------------------- /web/web/docs/build/doctrees/environment.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/doctrees/environment.pickle -------------------------------------------------------------------------------- /web/web/docs/build/doctrees/index.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/doctrees/index.doctree 
-------------------------------------------------------------------------------- /web/web/docs/build/doctrees/intro.doctree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/doctrees/intro.doctree -------------------------------------------------------------------------------- /web/web/docs/build/html/.buildinfo: -------------------------------------------------------------------------------- 1 | # Sphinx build info version 1 2 | # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. 3 | config: 06336bc7aab796cbac45a22359d55abd 4 | tags: 645f666f9bcd5a90fca523b33c5a78b7 5 | -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/documentation_options.js: -------------------------------------------------------------------------------- 1 | var DOCUMENTATION_OPTIONS = { 2 | URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), 3 | VERSION: '2020.01.01', 4 | LANGUAGE: 'None', 5 | COLLAPSE_INDEX: false, 6 | BUILDER: 'html', 7 | FILE_SUFFIX: '.html', 8 | LINK_SUFFIX: '.html', 9 | HAS_SOURCE: true, 10 | SOURCELINK_SUFFIX: '.txt', 11 | NAVIGATION_WITH_KEYS: false 12 | }; -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/file.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/file.png -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Inconsolata-Bold.ttf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Inconsolata-Bold.ttf -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Inconsolata-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Inconsolata-Regular.ttf -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Inconsolata.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Inconsolata.ttf -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato-Bold.ttf -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato-Regular.ttf -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato/lato-bold.eot: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato/lato-bold.eot -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato/lato-bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato/lato-bold.ttf -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato/lato-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato/lato-bold.woff -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato/lato-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato/lato-bold.woff2 -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato/lato-bolditalic.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato/lato-bolditalic.eot -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato/lato-bolditalic.ttf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato/lato-bolditalic.ttf -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato/lato-bolditalic.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato/lato-bolditalic.woff -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato/lato-bolditalic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato/lato-bolditalic.woff2 -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato/lato-italic.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato/lato-italic.eot -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato/lato-italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato/lato-italic.ttf -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato/lato-italic.woff: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato/lato-italic.woff -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato/lato-italic.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato/lato-italic.woff2 -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato/lato-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato/lato-regular.eot -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato/lato-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato/lato-regular.ttf -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato/lato-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato/lato-regular.woff -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/Lato/lato-regular.woff2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/Lato/lato-regular.woff2 -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/RobotoSlab-Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/RobotoSlab-Bold.ttf -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/RobotoSlab-Regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/RobotoSlab-Regular.ttf -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.eot -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.ttf -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/RobotoSlab/roboto-slab-v7-bold.woff2 -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.eot -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.ttf -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff2: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/RobotoSlab/roboto-slab-v7-regular.woff2 -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/fontawesome-webfont.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/fontawesome-webfont.eot -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/fontawesome-webfont.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/fontawesome-webfont.ttf -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/fontawesome-webfont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/fontawesome-webfont.woff -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/fonts/fontawesome-webfont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/fonts/fontawesome-webfont.woff2 -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/minus.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/minus.png -------------------------------------------------------------------------------- /web/web/docs/build/html/_static/plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/_static/plus.png -------------------------------------------------------------------------------- /web/web/docs/build/html/objects.inv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/docs/build/html/objects.inv -------------------------------------------------------------------------------- /web/web/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /web/web/docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx 2 | recommonmark 3 | sphinx_rtd_theme 4 | -------------------------------------------------------------------------------- /web/web/docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'reach-web-api' 21 | copyright = '2020, Datalabs' 22 | author = 'Datalabs' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '2020.01.01' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 
33 | extensions = [ 34 | 'recommonmark', 35 | ] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # List of patterns, relative to source directory, that match files and 41 | # directories to ignore when looking for source files. 42 | # This pattern also affects html_static_path and html_extra_path. 43 | exclude_patterns = [] 44 | 45 | source_suffix = ['.md', '.rst'] 46 | 47 | # -- Options for HTML output ------------------------------------------------- 48 | 49 | # The theme to use for HTML and HTML Help pages. See the documentation for 50 | # a list of builtin themes. 51 | # 52 | html_theme = 'sphinx_rtd_theme' 53 | 54 | # Add any paths that contain custom static files (such as style sheets) here, 55 | # relative to this directory. They are copied after the builtin static files, 56 | # so a file named "default.css" will overwrite the builtin "default.css". 57 | html_static_path = ['_static'] 58 | -------------------------------------------------------------------------------- /web/web/docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. rech-web-api documentation master file, created by 2 | sphinx-quickstart on Tue Jan 21 17:40:24 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Reach's web API documentation! 7 | ========================================= 8 | 9 | .. 
toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | intro 14 | api 15 | -------------------------------------------------------------------------------- /web/web/docs/source/intro.md: -------------------------------------------------------------------------------- 1 | # Wellcome Reach 2 | 3 | Wellcome Reach is an open source service for discovering how research 4 | publications are cited in global policy documents, including those 5 | produced by policy organizations such as the WHO, MSF, and the UK 6 | government. Key parts of it include: 7 | 8 | 1. Web scrapers for pulling PDF "policy documents" from policy 9 | organizations, 10 | 1. A reference parser for extracting references from these documents, 11 | 1. A task for sourcing publications from Europe PMC (EPMC), 12 | 1. A task for matching policy document references to EPMC publications, 13 | 1. An Airflow installation for automating the above tasks, and 14 | 1. A web application for searching and retrieving data from the datasets 15 | produced above. 16 | 17 | Wellcome Reach is written in Python and developed using docker-compose. 18 | It's deployed into Kubernetes. 19 | 20 | Although parts of the Wellcome Reach have been in use at Wellcome since 21 | mid-2018, the project has only been open source since March 2019. Given 22 | these early days, please be patient as various parts of it are made 23 | accessible to external users. All issues and pull requests are welcome. 
24 | 25 | 26 | ## Further reading 27 | - [Github repository](https://github.com/wellcometrust/reach) 28 | -------------------------------------------------------------------------------- /web/web/src/css/about.less: -------------------------------------------------------------------------------- 1 | @import "variables"; 2 | 3 | #about-page, 4 | #how-it-works-page { 5 | h3.category-title { 6 | font-weight: bold; 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /web/web/src/css/contact.less: -------------------------------------------------------------------------------- 1 | .contact-form { 2 | margin-bottom: 0; 3 | margin-left: auto; 4 | margin-right: auto; 5 | 6 | .ctc-form-field { 7 | display: flex; 8 | flex-direction: column; 9 | margin-bottom: 18px; 10 | } 11 | 12 | 13 | label { 14 | flex: 1; 15 | max-width: 30%; 16 | min-width: 30%; 17 | display: block; 18 | font-size: 1rem; 19 | letter-spacing: 0.5px; 20 | line-height: 1.5rem; 21 | color: #292929; 22 | font-family: Helvetica Neue, Helvetica, Arial, sans-serif; 23 | margin-bottom: 0.5rem; 24 | } 25 | 26 | .ctc-form-input { 27 | flex: 2; 28 | display: block; 29 | 30 | input[type="text"], input[type="email"] { 31 | width: 100%; 32 | height: 2.75rem; 33 | border: 1px solid #CCCCCC; 34 | color: #333; 35 | text-indent: 3px; 36 | } 37 | 38 | textarea { 39 | width: 100%; 40 | resize: none; 41 | height: 200px; 42 | overflow-y: auto; 43 | border: 1px solid #CCCCCC; 44 | color: #333333; 45 | padding: 4px; 46 | } 47 | } 48 | 49 | 50 | .ctc-controls { 51 | display: flex; 52 | flex-direction: row; 53 | justify-content: flex-end; 54 | align-items: flex-end; 55 | 56 | .ctc-note { 57 | flex: 1; 58 | display: flex; 59 | justify-content: flex-start; 60 | align-items: flex-start; 61 | flex-direction: column; 62 | 63 | 64 | p { 65 | margin: 0; 66 | padding: 0; 67 | font-size: 12px; 68 | color: #CCC; 69 | } 70 | } 71 | 72 | .ctct-submit { 73 | flex: 0; 74 | white-space: nowrap; 75 | 
display: inline-block; 76 | width: 9.875rem; 77 | height: 2.75rem; 78 | border-radius: 2px; 79 | background-color: #006272; 80 | color: white; 81 | padding-left: 21px; 82 | padding-right: 21px; 83 | border: none; 84 | cursor: pointer; 85 | transition: all 0.3s ease-in-out; 86 | 87 | &:disabled { 88 | background: #CCC; 89 | 90 | &:hover { 91 | background: #CCC !important; 92 | } 93 | } 94 | 95 | &:hover { 96 | background-color: #005361; 97 | } 98 | } 99 | 100 | } 101 | 102 | } 103 | 104 | 105 | 106 | #ctc-result-success { 107 | display: none; 108 | text-align: center; 109 | } 110 | 111 | #ctc-result-failure { 112 | display: none; 113 | text-align: center; 114 | } 115 | -------------------------------------------------------------------------------- /web/web/src/css/footer.less: -------------------------------------------------------------------------------- 1 | @import "variables"; 2 | 3 | footer { 4 | /* height: 6.25rem; */ 5 | /* line-height: 6.25rem; */ 6 | padding: 2.625rem 0; 7 | font-size: @smallFontSize; 8 | width: 100%; 9 | } 10 | 11 | footer.home { 12 | position: relative; 13 | background-color: white; 14 | } 15 | 16 | #wellcome-logo-container{ 17 | 18 | display: inline-block; 19 | margin-right: @smallFontSize; 20 | 21 | #wellcome-logo { 22 | height: 1.6rem; 23 | width: 1.6rem; 24 | vertical-align: middle; 25 | } 26 | 27 | } 28 | 29 | footer a, footer p { 30 | font-size: @smallFontSize; 31 | text-decoration: none; 32 | display: inline-block; 33 | } 34 | 35 | footer p { 36 | a { 37 | text-decoration: underline; 38 | } 39 | } 40 | 41 | /* Offsets the grid */ 42 | @media screen and (min-width: 780px) { 43 | footer { 44 | text-align: left; 45 | } 46 | } 47 | 48 | @media screen and (max-width: 780px) { 49 | footer { 50 | text-align: center; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /web/web/src/css/header.less: -------------------------------------------------------------------------------- 1 | @import 
"variables"; 2 | 3 | 4 | 5 | @media screen and (min-width: 1280px) { 6 | header.navbar { 7 | padding: 0 5.75rem; 8 | } 9 | 10 | } 11 | 12 | @media screen and (max-width: 1280px) { 13 | header.navbar { 14 | padding: 0 @smallPadding; 15 | } 16 | 17 | } 18 | 19 | header.navbar { 20 | color: white; 21 | background-color: @cyanDark; 22 | height: 3.75rem; 23 | 24 | img { 25 | color: white; 26 | font-weight: normal; 27 | margin: 0; 28 | height: 3.5rem; 29 | } 30 | 31 | a { 32 | white-space: nowrap; 33 | text-decoration: none; 34 | font-weight: normal; 35 | } 36 | 37 | #navbar-links a { 38 | margin: 0 @smallPadding; 39 | line-height: 3.5rem; 40 | } 41 | 42 | #navbar-links a:hover { 43 | color: @cyanLight; 44 | } 45 | 46 | #navbar-links a.active { 47 | border-bottom: .25rem solid white; 48 | padding-top: 0.25rem; 49 | line-height: 3.25rem; 50 | } 51 | 52 | } 53 | 54 | header.navbar.home { 55 | height: 5rem; 56 | 57 | #navbar-links a { 58 | line-height: 4.75rem; 59 | } 60 | 61 | img { 62 | margin-top: 8px; 63 | height: 3.5rem; 64 | } 65 | 66 | } 67 | 68 | .btn.cta-link { 69 | background-color: @cyanDark; 70 | border-radius: 28px; 71 | font-size: @smallFontSize; 72 | line-height: 2.75rem !important; 73 | height: 2.75rem; 74 | border: 1px solid white; 75 | text-decoration: none; 76 | color: white; 77 | margin: 0; 78 | padding: 0 @smallPadding; 79 | text-align: center; 80 | font-weight: normal; 81 | font-stretch: normal; 82 | font-style: normal; 83 | letter-spacing: normal; 84 | } 85 | 86 | .btn.cta-link:hover { 87 | background-color: @cyanPrimary; 88 | color: white !important; 89 | } 90 | 91 | /* Override Spectre default */ 92 | .breadcrumb .breadcrumb-item:not(:last-child) a { 93 | color: @cyanPrimary; 94 | } 95 | 96 | 97 | .breadcrumb .breadcrumb-item:not(:first-child)::before { 98 | color: @greyDark; 99 | content: ">"; 100 | padding-right: .4rem; 101 | } 102 | 103 | .breadcrumb .breadcrumb-item { 104 | color: @greyDark; 105 | font-size: @smallFontSize; 106 | } 107 | 108 | 
#breadcrumbs { 109 | background: white; 110 | border-bottom: 1px solid @greyLight; 111 | } 112 | 113 | 114 | @media screen and (min-width: 1280px) { 115 | #breadcrumbs { 116 | padding: 0 @guidePadding; 117 | } 118 | } 119 | 120 | @media screen and (max-width: 1280px) { 121 | #breadcrumbs { 122 | padding: 0 1.375rem; 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /web/web/src/css/home.less: -------------------------------------------------------------------------------- 1 | @import "variables"; 2 | 3 | body.home { 4 | background-image: none; 5 | } 6 | 7 | .bg2 { 8 | background-color: @backgroundTint2; 9 | } 10 | 11 | @media screen and (min-width: 840px) { 12 | 13 | } 14 | 15 | @media screen and (max-width: 840px) { 16 | 17 | } 18 | 19 | /* 20 | * Page structures 21 | */ 22 | section#hero, section#about-us { 23 | display: flex; 24 | } 25 | 26 | .hero-picture { 27 | position: relative; 28 | 29 | .img-container { 30 | background-image: url(../images/reach_site_view.png); 31 | background-repeat: no-repeat; 32 | background-size: 100%; 33 | box-shadow: inset 0 -100px 20px -25px @backgroundTint1, 34 | 10px -10px 10px -15px black, 35 | -10px -10px 10px -15px black; 36 | min-height: 40vh; 37 | } 38 | } 39 | 40 | section#hero { 41 | background-color: @backgroundTint2; 42 | background-image: linear-gradient(-177deg, @backgroundTint2 70%, @backgroundTint1 calc(70% + 2px)); 43 | color: white; 44 | /* .container { 45 | min-height: 50%; 46 | } */ 47 | } 48 | 49 | #hero-picture-container { 50 | position: relative; 51 | bottom: 0; 52 | /* height: 90%; 53 | 54 | .column, .column .hero-picture { 55 | height: 90%; 56 | } */ 57 | } 58 | 59 | section#hero h1 { 60 | font-family: Wellcome, Helvetica, Arial, sans-serif; 61 | } 62 | 63 | section#scroll-arrow { 64 | position: absolute; 65 | width: 100%; 66 | bottom: 0; 67 | 68 | .container { 69 | position: relative; 70 | bottom: 0; 71 | } 72 | } 73 | 74 | 75 | section#hero { 76 | 
min-height: 85vh; 77 | } 78 | 79 | section#home-header { 80 | height: 5vh; 81 | min-height: 5rem; 82 | background-color: @backgroundTint2; 83 | } 84 | 85 | 86 | section#about-reach { 87 | background-image: url(../images/Shape_01.svg); 88 | background-repeat: no-repeat; 89 | background-size: cover; 90 | 91 | a { 92 | font-size: @baseFontSize; 93 | } 94 | } 95 | 96 | section#about-us { 97 | background-image: url(../images/Shape_02.svg); 98 | background-repeat: no-repeat; 99 | background-size: cover; 100 | } 101 | 102 | .home-hero-view { 103 | height: 100vh; 104 | } 105 | 106 | #about-us h3, #about-reach h3 { 107 | font-weight: bold; 108 | } 109 | -------------------------------------------------------------------------------- /web/web/src/css/icons.less: -------------------------------------------------------------------------------- 1 | @import "variables"; 2 | 3 | .icn.icn-search::before { 4 | color: white; 5 | fill: white; 6 | margin-bottom: -10px; 7 | content: url(../images/Icon_Search_16px.svg); 8 | } 9 | 10 | .icn.icn-download::before { 11 | color: white; 12 | fill: white; 13 | margin-bottom: -4px; 14 | content: url(../images/Icon_Download_16px.svg); 15 | } 16 | 17 | .icn.icn-research-paper::before { 18 | color: black; 19 | fill: black; 20 | margin-bottom: -4px; 21 | content: url(../images/Icon_Research_24px.svg); 22 | } 23 | 24 | .icn.icn-sort { 25 | color: grey; 26 | fill: grey; 27 | margin-bottom: -3px; 28 | padding: 4px 4px 0 4px; 29 | content: url(../images/Icon_Chevron_Double.svg); 30 | } 31 | 32 | .icn.icn-sorted { 33 | margin-bottom: -1px; 34 | margin-left: 4px; 35 | padding: 4px 4px 0 4px; 36 | content: url(../images/Icon_Arrow_down.svg); 37 | } 38 | 39 | .icn.icn-sorted-asc { 40 | transform: rotate(180deg); 41 | } 42 | 43 | .icn.icn-info { 44 | margin-bottom: -4px; 45 | margin-right: 4px; 46 | content: url(../images/Icon_Info.svg); 47 | } 48 | 49 | .icn.icn-new-page { 50 | margin-bottom: -3px; 51 | margin-right: -4px; 52 | content: 
url(../images/Icon_new_window.svg); 53 | } 54 | 55 | .icn.icn-chevron-left { 56 | margin-bottom: 2px; 57 | transform: rotate(90deg); 58 | filter: invert(25%) sepia(95%) saturate(911%) hue-rotate(153deg) brightness(93%) contrast(101%); 59 | content: url(../images/Icon_Chevron_Down.svg); 60 | } 61 | 62 | .icn.icn-chevron-right { 63 | margin-bottom: 2px; 64 | filter: invert(25%) sepia(95%) saturate(911%) hue-rotate(153deg) brightness(93%) contrast(101%); 65 | transform: rotate(270deg); 66 | content: url(../images/Icon_Chevron_Down.svg); 67 | } 68 | 69 | .icn-down { 70 | transform: rotate(0deg); 71 | color: @greyDark; 72 | } 73 | .icn-up { 74 | transform: rotate(180deg); 75 | color: @cyanLight; 76 | } 77 | -------------------------------------------------------------------------------- /web/web/src/css/search.less: -------------------------------------------------------------------------------- 1 | @import "variables"; 2 | 3 | .search-box { 4 | background: @backgroundTint1; 5 | } 6 | 7 | .search-tips { 8 | margin: @mediumPadding 0; 9 | } 10 | 11 | .search-tips .btn.help { 12 | background-color: @cyanPrimary; 13 | margin-top: 0; 14 | } 15 | 16 | .insights { 17 | padding: @mediumPadding 0; 18 | } 19 | 20 | .feedback-box { 21 | background-color: @backgroundTint2; 22 | padding: @smallPadding; 23 | } 24 | 25 | .form-label { 26 | margin-bottom: 0.5rem; 27 | font-size: .875rem; 28 | color: #292929; 29 | } 30 | 31 | /* Help tooltip */ 32 | .help { 33 | width: 22px; 34 | height: 22px; 35 | padding: 0; 36 | margin-left: @smallFontSize; 37 | color: white; 38 | font-size: 14px; 39 | font-weight: bold; 40 | border-radius: 50%; 41 | line-height: 22px; 42 | text-align: center; 43 | background-color: @cyanPrimary; 44 | } 45 | 46 | .popover-container { 47 | display: block; 48 | opacity: 1; 49 | transform: translate(-50%, -100%) scale(1); 50 | } 51 | 52 | .popover-container svg.arrow { 53 | margin: -1em auto 0 auto; 54 | width: 2em; 55 | height: 1em; 56 | } 57 | 58 | .popover-container 
.card { 59 | color: @greyDark; 60 | } 61 | -------------------------------------------------------------------------------- /web/web/src/css/variables.less: -------------------------------------------------------------------------------- 1 | /* 2 | * Colors and typefaces 3 | */ 4 | 5 | 6 | /* 7 | * Colors 8 | */ 9 | @cyanLight: #009BB2; 10 | @cyanPrimary: #006272; 11 | @cyanDark: #005361; 12 | @cyanFocused: #E5EFF1; 13 | 14 | @backgroundTint1: #F1FCFD; 15 | @backgroundTint2: #DCF4F9; 16 | @backgroundTint3: #BEEBF4; 17 | 18 | @greyLight: #CCCCCC; 19 | @greyLink: #767676; 20 | @greyDark: #292929; 21 | @greyTable: #E6E6E6; 22 | 23 | 24 | /* 25 | * Fonts 26 | */ 27 | 28 | :root { 29 | font-size: 16px; 30 | } 31 | 32 | @h1FontSize: 2rem; 33 | @h2FontSize: 1.5rem; 34 | @h3FontSize: 1.25rem; 35 | @h4FontSize: 1rem; 36 | @baseFontSize: 1rem; 37 | @smallFontSize: 0.875rem; 38 | 39 | @font-face { 40 | font-family: Wellcome; 41 | /* NB: this will be inlined by postcss-url */ 42 | src: url("./wellcome-bold-webfont.woff2") format("woff2"); 43 | } 44 | 45 | /* 46 | * Common spacings 47 | */ 48 | 49 | @guidePadding: 5.75rem; 50 | @heroPadding: 2.5rem; 51 | @mediumPadding: 1.5rem; 52 | @smallPadding: 1rem; 53 | -------------------------------------------------------------------------------- /web/web/src/css/wellcome-bold-webfont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/css/wellcome-bold-webfont.woff2 -------------------------------------------------------------------------------- /web/web/src/favicon/android-icon-144x144.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/android-icon-144x144.png -------------------------------------------------------------------------------- 
/web/web/src/favicon/android-icon-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/android-icon-192x192.png -------------------------------------------------------------------------------- /web/web/src/favicon/android-icon-36x36.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/android-icon-36x36.png -------------------------------------------------------------------------------- /web/web/src/favicon/android-icon-48x48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/android-icon-48x48.png -------------------------------------------------------------------------------- /web/web/src/favicon/android-icon-72x72.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/android-icon-72x72.png -------------------------------------------------------------------------------- /web/web/src/favicon/android-icon-96x96.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/android-icon-96x96.png -------------------------------------------------------------------------------- /web/web/src/favicon/apple-icon-114x114.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/apple-icon-114x114.png 
-------------------------------------------------------------------------------- /web/web/src/favicon/apple-icon-120x120.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/apple-icon-120x120.png -------------------------------------------------------------------------------- /web/web/src/favicon/apple-icon-144x144.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/apple-icon-144x144.png -------------------------------------------------------------------------------- /web/web/src/favicon/apple-icon-152x152.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/apple-icon-152x152.png -------------------------------------------------------------------------------- /web/web/src/favicon/apple-icon-180x180.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/apple-icon-180x180.png -------------------------------------------------------------------------------- /web/web/src/favicon/apple-icon-57x57.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/apple-icon-57x57.png -------------------------------------------------------------------------------- /web/web/src/favicon/apple-icon-60x60.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/apple-icon-60x60.png -------------------------------------------------------------------------------- /web/web/src/favicon/apple-icon-72x72.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/apple-icon-72x72.png -------------------------------------------------------------------------------- /web/web/src/favicon/apple-icon-76x76.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/apple-icon-76x76.png -------------------------------------------------------------------------------- /web/web/src/favicon/apple-icon-precomposed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/apple-icon-precomposed.png -------------------------------------------------------------------------------- /web/web/src/favicon/apple-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/apple-icon.png -------------------------------------------------------------------------------- /web/web/src/favicon/browserconfig.xml: -------------------------------------------------------------------------------- 1 | 2 | #ffffff -------------------------------------------------------------------------------- /web/web/src/favicon/favicon-16x16.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/favicon-16x16.png -------------------------------------------------------------------------------- /web/web/src/favicon/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/favicon-32x32.png -------------------------------------------------------------------------------- /web/web/src/favicon/favicon-96x96.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/favicon-96x96.png -------------------------------------------------------------------------------- /web/web/src/favicon/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/favicon.ico -------------------------------------------------------------------------------- /web/web/src/favicon/manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "App", 3 | "icons": [ 4 | { 5 | "src": "\/android-icon-36x36.png", 6 | "sizes": "36x36", 7 | "type": "image\/png", 8 | "density": "0.75" 9 | }, 10 | { 11 | "src": "\/android-icon-48x48.png", 12 | "sizes": "48x48", 13 | "type": "image\/png", 14 | "density": "1.0" 15 | }, 16 | { 17 | "src": "\/android-icon-72x72.png", 18 | "sizes": "72x72", 19 | "type": "image\/png", 20 | "density": "1.5" 21 | }, 22 | { 23 | "src": "\/android-icon-96x96.png", 24 | "sizes": "96x96", 25 | "type": "image\/png", 26 | "density": "2.0" 27 | }, 28 | { 29 | "src": "\/android-icon-144x144.png", 30 | "sizes": "144x144", 31 | "type": "image\/png", 32 | "density": "3.0" 33 | }, 
34 | { 35 | "src": "\/android-icon-192x192.png", 36 | "sizes": "192x192", 37 | "type": "image\/png", 38 | "density": "4.0" 39 | } 40 | ] 41 | } -------------------------------------------------------------------------------- /web/web/src/favicon/ms-icon-144x144.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/ms-icon-144x144.png -------------------------------------------------------------------------------- /web/web/src/favicon/ms-icon-150x150.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/ms-icon-150x150.png -------------------------------------------------------------------------------- /web/web/src/favicon/ms-icon-310x310.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/ms-icon-310x310.png -------------------------------------------------------------------------------- /web/web/src/favicon/ms-icon-70x70.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/favicon/ms-icon-70x70.png -------------------------------------------------------------------------------- /web/web/src/images/Icon_ New-window.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Icon/ New-window 5 | Created with Sketch. 
6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /web/web/src/images/Icon_About_Accuracy_100px.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Icon_About_Accuracy_100px 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /web/web/src/images/Icon_About_Open-source_100px.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Icon_About_Open-source_100px 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /web/web/src/images/Icon_About_Transparent_100px.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Icon_About_Transparent_100px 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /web/web/src/images/Icon_Arrow_down.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Icon/Arrow/down 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /web/web/src/images/Icon_Chevron_Double.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Icon/Chevron/Double 5 | Created with Sketch. 
6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /web/web/src/images/Icon_Chevron_Down.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Icon/Chevron/Down 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /web/web/src/images/Icon_Download_16px.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Icon_Download_16px 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /web/web/src/images/Icon_Info.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Icon/Add 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /web/web/src/images/Icon_Menu_16px.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Icon_Menu_16px 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /web/web/src/images/Icon_Policy_24px.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Icon_Policy_24px 5 | Created with Sketch. 
6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /web/web/src/images/Icon_Research_24px.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Icon_Research_24px 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /web/web/src/images/Icon_Scroll-arow.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Icon_Scroll-arow_ 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /web/web/src/images/Icon_Search_16px.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Icon_Search_16px 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /web/web/src/images/Icon_Sort-by_16px.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Icon_Sort-by_16px 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /web/web/src/images/Icon_new_window.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Icon/ New-window 5 | Created with Sketch. 
6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /web/web/src/images/Image_Product-shot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/images/Image_Product-shot.png -------------------------------------------------------------------------------- /web/web/src/images/Shape_01.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Shape_01 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /web/web/src/images/Shape_02.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Shape_02 5 | Created with Sketch. 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /web/web/src/images/reach_site_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/web/web/src/images/reach_site_view.png -------------------------------------------------------------------------------- /web/web/src/images/wave.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /web/web/src/images/wellcome-logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 15 | -------------------------------------------------------------------------------- /web/web/src/images/white-wave.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 12 | 13 | 14 | 15 | 16 | 17 
| -------------------------------------------------------------------------------- /web/web/src/js/app.js: -------------------------------------------------------------------------------- 1 | import "core-js/stable"; 2 | import "core-js/stable/array"; 3 | 4 | import clearSearch from './clearSearch.js'; 5 | import policyTable from './policyTable.js'; 6 | import citationsTable from './citationsTable.js'; 7 | import contact from "./v.contact"; 8 | import home from './home.js'; 9 | 10 | document.addEventListener('DOMContentLoaded', function(event) { 11 | String.prototype.toTitleCase = function() { 12 | let lower = this.valueOf().toLowerCase(); 13 | return lower.replace(/^\w/, c => c.toUpperCase());; 14 | }; 15 | 16 | clearSearch(); 17 | policyTable(); 18 | citationsTable(); 19 | home(); 20 | contact(); 21 | 22 | // Tracking 23 | const headerLinks = document.getElementsByClassName('navbar'); 24 | headerLinks.forEach(item => { 25 | item.addEventListener('click', (e) => { 26 | if (e.target.tagName == "A") { 27 | gtag('event', 'Internal click', { 28 | event_category: 'Header', 29 | event_label: e.target.innerHTML 30 | }); 31 | } 32 | }); 33 | }); 34 | 35 | const footerLinks = document.getElementsByTagName('footer'); 36 | footerLinks.forEach(item => { 37 | item.addEventListener('click', (e) => { 38 | if (e.target.tagName == "A") { 39 | gtag('event', 'Internal click', { 40 | event_category: 'Footer', 41 | event_label: e.target.innerHTML 42 | }); 43 | } 44 | }); 45 | }); 46 | 47 | const resultsContactLink = document.getElementById('search-results-contact'); 48 | if (resultsContactLink) { 49 | resultsContactLink.addEventListener('click', (e) => { 50 | let source = (e.target.getAttribute('data-from') == "citations")? 
"Discover citations":"Browse pol docs"; 51 | gtag('event', 'Click', { 52 | event_category: source, 53 | event_label: 'Email: search results' 54 | }); 55 | }); 56 | } 57 | }); 58 | -------------------------------------------------------------------------------- /web/web/src/js/clearSearch.js: -------------------------------------------------------------------------------- 1 | const clearSearch = (reach) => { 2 | let clearButton = document.getElementById('search-clear'); 3 | let searchInput = document.getElementById('search-term'); 4 | if (clearButton) { 5 | clearButton.addEventListener('click', () => { 6 | searchInput.value = ''; 7 | }); 8 | 9 | } 10 | }; 11 | 12 | export default clearSearch; 13 | -------------------------------------------------------------------------------- /web/web/src/js/home.js: -------------------------------------------------------------------------------- 1 | const home = () => { 2 | const startButton = document.getElementById('start-button'); 3 | if (startButton) { 4 | 5 | startButton.addEventListener("click", (e) => { 6 | e.preventDefault(); 7 | document.getElementById('discover-reach').scrollIntoView({behavior: "smooth", block: "start"}); 8 | }); 9 | } 10 | } 11 | 12 | export default home; 13 | -------------------------------------------------------------------------------- /web/web/src/js/templates/no_results.js: -------------------------------------------------------------------------------- 1 | const getNoResultsTemplate = (term, source) => { 2 | 3 | let noResultsTitle = ``; 4 | let formLabel = ``; 5 | let formAction = ``; 6 | let formSubmit = ``; 7 | 8 | if (source == 'policies') { 9 | noResultsTitle = `Your search for "${term}" in policy documents did not return any results`; 10 | formLabel = `Search by topic, research area or policy document title`; 11 | formAction = `/search/policy-docs`; 12 | formSubmit = `Browse policy documents`; 13 | } else { 14 | noResultsTitle = `Your search for "${term}" in citations did not return any 
results`; 15 | formLabel = `Search by scientific publication title, topic or journal`; 16 | formAction = `/search/citations`; 17 | formSubmit = `Discover citations`; 18 | } 19 | 20 | const template = ` 21 |
22 |
23 |
24 |
25 |

${noResultsTitle}

26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |

${formLabel}

36 |
37 |
38 |
39 |
40 | 41 | 42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |

Search tips

51 |
    52 |
  • Check your spelling
  • 53 |
  • Broaden your search by using fewer words or more general terms
  • 54 |
  • Try searching by topic, area of work or institute
  • 55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 | 67 |
68 |
69 |
70 |
71 |
72 | `; 73 | 74 | return template; 75 | }; 76 | 77 | export default getNoResultsTemplate; 78 | -------------------------------------------------------------------------------- /web/web/src/w-avatar-pitch-1.svg: -------------------------------------------------------------------------------- 1 | 2 | image/svg+xml -------------------------------------------------------------------------------- /web/web/templates/search/citations.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | 4 | {% block header %} 5 | 6 | 17 | 18 | {% endblock %} 19 | 20 | {% block main %} 21 | 22 | 30 | 31 |
32 |
33 |
34 |
35 |
36 |
37 |

Discover scientific publications that
have been cited in policy documents

38 |
39 |
40 |
41 |
42 |
43 |
44 |

Reach uses machine learning to find where health policy
organisations are using scientific research.

45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |

Search by scientific publication title, topic or journal

53 |
54 |
55 | 56 | 57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 | {% endblock %} 65 | -------------------------------------------------------------------------------- /web/web/templates/search/policy-docs.html: -------------------------------------------------------------------------------- 1 | {% extends 'base.html' %} 2 | 3 | 4 | {% block header %} 5 | 6 | 17 | 18 | {% endblock %} 19 | 20 | {% block main %} 21 | 22 | 30 | 31 |
32 |
33 |
34 |
35 |
36 |
37 |

Browse our collection of
over 129k policy documents

38 |
39 |
40 |
41 |
42 |
43 |
44 |

We source policy documents from UNICEF, Médecins Sans Frontières (MSF),
45 | National Institute for Clinical Excellence (NICE), the World Health Organisation (WHO),
46 | the UK government and the UK parliament.

47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |

Search by topic, research area or policy document title

55 |
56 |
57 | 58 | 59 |
60 |
61 |
62 |
63 |
64 |
65 |
# {% endblock %}
# --------------------------------------------------------------------------
# /web/web/tests/test_template.py
# --------------------------------------------------------------------------
from reach.web.views import template


def test_to_template_names():
    """Table-driven check of the URL-path -> Jinja template-name mapping."""
    cases = [
        ('/', ('index.html',)),
        ('/foo', ('foo.html', 'foo/index.html')),
        ('/foo.html', ('foo.html', 'foo/index.html')),
        ('/foo/gar', ('foo/gar.html', 'foo/gar/index.html')),
        # Templates starting with '_' hold macros and must not be served.
        ('/_macros.html', tuple()),
    ]
    for path, expected in cases:
        assert expected == template.to_template_names(path)

# --------------------------------------------------------------------------
# /web/web/utils.py
# --------------------------------------------------------------------------
import collections
import time

import falcon

# Bundle of rate-limit settings passed from the hook factory to _rate_db().
Argument = collections.namedtuple("Argument", ('resource', 'window_size',
                                               'per_second', 'error_message'))


class _RateLimitDB(object):
    """In-process sliding-window call log, keyed by user then resource.

    State is a class-level nested defaultdict mapping
    user -> resource name -> list of call timestamps.
    """

    _RATE_LIMIT_DB = collections.defaultdict(
        lambda: collections.defaultdict(list)
    )

    @staticmethod
    def filter(user, resource_name, window_size):
        # Drop timestamps that have fallen out of the sliding window.
        p = _RateLimitDB._RATE_LIMIT_DB[user][resource_name]
        t = time.time()
        exp_int = t - window_size
        p = [s for s in p if s >= exp_int]
        _RateLimitDB._RATE_LIMIT_DB[user][resource_name] = p

    @staticmethod
    def add_call(user, resource_name):
        # Record the current call's timestamp.
        _RateLimitDB._RATE_LIMIT_DB[user][resource_name].append(
            time.time()
        )

    @staticmethod
    def check_for(user, argument):
        """Record one call and return True when the caller is over the limit.

        The rate is the average calls-per-second over the window
        (count / window_size) compared against argument.per_second.
        """
        _RateLimitDB.filter(user, argument.resource, argument.window_size)
        _RateLimitDB.add_call(user, argument.resource)
        p = len(_RateLimitDB._RATE_LIMIT_DB[user][argument.resource])
        return (p / argument.window_size) > argument.per_second


def _rate_db(req, resp, argument):
    # Identify callers by forwarded host so limits apply per client behind
    # a proxy. FIX: removed a stray debug print("RATE_LIMITED") left here;
    # raising the exception is the signal.
    if _RateLimitDB.check_for(req.forwarded_host, argument):
        resp.status = falcon.HTTP_429
        raise falcon.HTTPTooManyRequests(argument.error_message)


def rate_limit(per_second=30, resource=u'default', window_size=10,
               error_message="429 Too Many Requests"):
    """Falcon before-hook factory enforcing a sliding-window rate limit.

    Args:
        per_second: maximum average calls per second over the window.
        resource: logical bucket name so distinct endpoints get
            independent counters.
        window_size: window length in seconds.
        error_message: message for the raised HTTPTooManyRequests.

    Returns:
        A hook callable with Falcon's (req, resp, resource, params)
        signature.
    """
    arg = Argument(resource, window_size, per_second, error_message)

    def hook(req, resp, resource, params):
        _rate_db(req, resp, arg)

    return hook

# --------------------------------------------------------------------------
# /web/web/views/__init__.py
# --------------------------------------------------------------------------
from .search import SearchCitations
from .search import SearchPolicies
from .search import ExportCitationsSearch
from .search import ExportPoliciesSearch

from .api import ApiSearchCitations
from .api import ApiSearchPolicies

from .contact import ContactView

# --------------------------------------------------------------------------
# /web/web/views/api/__init__.py
# --------------------------------------------------------------------------
from .api_search_citations import ApiSearchCitations
from .api_search_policies import ApiSearchPolicies

# --------------------------------------------------------------------------
# /web/web/views/api/utils.py
# --------------------------------------------------------------------------
import json
import datetime
import uuid


class JSONEncoder(json.JSONEncoder):
    """json.JSONEncoder that serialises dates/datetimes as ISO-8601 strings,
    UUIDs as strings, and stringifies anything else json cannot handle.
    """

    def default(self, obj):
        if isinstance(obj, (datetime.datetime, datetime.date)):
            return obj.isoformat()
        if isinstance(obj, uuid.UUID):
            return str(obj)
        # Catch-all: stringify any other unserialisable value. FIX: the
        # original had a trailing `return json.JSONEncoder.default(self,
        # obj)` after this unconditional return — unreachable dead code,
        # now removed (behavior unchanged).
        return str(obj)

# --------------------------------------------------------------------------
# /web/web/views/apidocs.py:
# --------------------------------------------------------------------------
import os
import jinja2
import falcon


# NOTE(review): "Ressource" is a typo but the class name is part of the
# public interface — kept as-is so callers don't break.
class APIDocRessource(object):
    """
    Serves HTML templates. Note that templates are read from the FS for
    every request.
    """

    def __init__(self, template_dir, context=None):
        self.env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(template_dir),
            autoescape=jinja2.select_autoescape(['html']),
        )
        if context is not None:
            self.context = context
        else:
            self.context = {}

    def render_template(self, resp, tname):
        # Strip the 'api/docs' mount prefix so '/api/docs/foo' maps to the
        # 'foo' template. NOTE(review): a request path of exactly
        # '/api/docs' yields '' here and to_template_names() raises
        # ValueError — confirm the router never produces that path.
        tname = to_template_names(tname.replace('api/docs', ''))
        try:
            template = self.env.select_template(tname)
            resp.body = template.render(**self.context)
            resp.content_type = 'text/html'
        except jinja2.TemplateNotFound:
            resp.status = falcon.HTTP_404
            return

    def on_get(self, req, resp, name):
        self.render_template(resp, req.path)


def to_template_names(path):
    """
    Maps HTTP request paths to Jinja template paths.

    Args:
        path: path portion of HTTP GET request

    Returns:
        Tuple of file paths that Jinja should search for.

    Raises:
        ValueError: if path does not start with '/'.
    """

    if not path.startswith('/'):
        raise ValueError
    path = path[1:]  # remove leading /, jinja won't want it

    if os.path.basename(path).startswith('_'):
        # Macros are kept in templates starting with _; don't allow
        # access to them.
        return tuple()

    if path == '':
        return ('index.html',)

    if path.endswith('/'):
        return (
            path[:-1] + '.html',
            os.path.join(path, 'index.html'),
        )

    if path.endswith('.html'):
        return (
            path,
            os.path.join(path[:-5], 'index.html'),
        )

    return (
        path + '.html',
        os.path.join(path, 'index.html'),
    )

# --------------------------------------------------------------------------
# /web/web/views/robotstxt.py
# --------------------------------------------------------------------------
""" Serve GET /robots.txt. """

# Allow all indexing following launch in July 2020
# (cf.
# https://medium.com/wellcome-data-labs/introducing-reach-find-and-track-research-being-put-into-action-dec2a2fca93b)
ROBOTS_TXT = \
"""User-agent: *
Allow: /
"""


class RobotsTxtResource(object):
    """Static resource answering GET /robots.txt with an allow-all policy."""

    def on_get(self, req, resp):
        resp.body = ROBOTS_TXT
        resp.content_type = 'text/plain'

# --------------------------------------------------------------------------
# /web/web/views/search/__init__.py
# --------------------------------------------------------------------------
from .citations import SearchCitations
from .policies import SearchPolicies
from .export_citations import ExportCitationsSearch
from .export_policies import ExportPoliciesSearch

# --------------------------------------------------------------------------
# /web/web/views/search/citations.py
# --------------------------------------------------------------------------
import logging
import json

import falcon

from web.db import get_db_cur
from web.views import template

logger = logging.getLogger(__name__)


class SearchCitations(template.TemplateResource):
    """ Search through publications returning a list of publications with
    inlined policies that have cited the publication, search rank.
    """
    def __init__(self, template_dir, context=None):
        super(SearchCitations, self).__init__(template_dir, context)

    def on_get(self, req, resp):
        logger.info("Requesting some citations")

        # No query parameters: render the empty search form.
        if not req.params:
            super(SearchCitations, self).render_template(
                resp,
                "/search/citations",
            )
            return

        term = req.params.get("terms", "")

        # Pass the search term through to the results template.
        self.context.update(dict(
            term=term
        ))

        super(SearchCitations, self).render_template(
            resp,
            "/results/citations",
        )

# --------------------------------------------------------------------------
# /web/web/views/search/policies.py
# --------------------------------------------------------------------------
import logging
import json

import falcon

from web.views import template

logger = logging.getLogger(__name__)


class SearchPolicies(template.TemplateResource):
    """ Search through policy documents, rendering either the search form
    (no query) or the results template with the requested term.
    """
    def __init__(self, template_dir, context=None):
        super(SearchPolicies, self).__init__(template_dir, context)

    def on_get(self, req, resp):
        logger.info("Requesting some policies")

        # No query parameters: render the empty search form.
        if not req.params:
            super(SearchPolicies, self).render_template(
                resp,
                "/search/policy-docs",
            )
            return

        # NOTE(review): default here is None while SearchCitations uses ""
        # — confirm the results template treats both the same before
        # unifying.
        term = req.params.get("terms", None)

        self.context.update(dict(
            term=term
        ))

        super(SearchPolicies, self).render_template(
            resp,
            "/results/policy-docs",
        )

# --------------------------------------------------------------------------
# /web/web/views/template.py
# --------------------------------------------------------------------------
import os
import jinja2
import falcon


class TemplateResource(object):
    """
    Serves HTML templates. Note that templates are read from the FS for
    every request.
    """

    def __init__(self, template_dir, context=None):
        # Imported here (not at module level), presumably to avoid an
        # import cycle with web.config — TODO confirm.
        from web.config import CONFIG
        self.env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(template_dir),
            autoescape=jinja2.select_autoescape(['html']),
        )

        # Analytics identifiers exposed to every template.
        self.env.globals.update(ga_code=CONFIG.ga_code)
        self.env.globals.update(hotjar_code=CONFIG.hotjar_code)

        if context is not None:
            self.context = context
        else:
            self.context = {}

    def render_template(self, resp, tname):
        tname = to_template_names(tname)
        try:
            template = self.env.select_template(tname)
            resp.body = template.render(**self.context)
            resp.content_type = 'text/html'
        except jinja2.TemplateNotFound:
            resp.status = falcon.HTTP_404
            return

    def on_get(self, req, resp):
        self.render_template(resp, req.path)


def to_template_names(path):
    """
    Maps HTTP request paths to Jinja template paths.

    Args:
        path: path portion of HTTP GET request

    Returns:
        Tuple of file paths that Jinja should search for.

    Raises:
        ValueError: if path does not start with '/'.
    """

    if not path.startswith('/'):
        raise ValueError
    path = path[1:]  # remove leading /, jinja won't want it

    if os.path.basename(path).startswith('_'):
        # Macros are kept in templates starting with _; don't allow
        # access to them.
        return tuple()

    if path == '':
        return ('index.html',)

    if path.endswith('/'):
        return (
            path[:-1] + '.html',
            os.path.join(path, 'index.html'),
        )

    if path.endswith('.html'):
        return (
            path,
            os.path.join(path[:-5], 'index.html'),
        )

    return (
        path + '.html',
        os.path.join(path, 'index.html'),
    )

# --------------------------------------------------------------------------
# /web/web/wsgi.py
# --------------------------------------------------------------------------
import os

import toml

from . import api


class Configuration:
    def __init__(self):
        """
        Parses webapp configuration from the environment. Key variables:

        - ELASTICSEARCH_HOST
        - ELASTICSEARCH_EXPLAIN
        - ELASTICSEARCH_POLICYDOCS_INDEX
        - ELASTICSEARCH_CITATIONS_INDEX
        - STATIC_ROOT
        """

        # FIX: use .get() — os.environ['DATABASE_URL'] raised KeyError for
        # a missing variable, so the explicit error below was unreachable.
        self.database_url = os.environ.get('DATABASE_URL')
        if not self.database_url:
            raise Exception(
                "Database URL not found. DATABASE_URL=%r" %
                self.database_url
            )

        self.static_root = os.environ.get('STATIC_ROOT')
        if not self.static_root or not os.path.isdir(self.static_root):
            raise Exception(
                "No static directory found. STATIC_ROOT=%r" %
                self.static_root
            )

        self.docs_static_root = os.environ.get('DOCS_STATIC_ROOT')
        if not self.docs_static_root or not os.path.isdir(
            self.docs_static_root
        ):
            raise Exception(
                "No docs static directory found. DOCS_STATIC_ROOT=%r" %
                self.docs_static_root
            )


def parse_config_file():
    """Load the TOML file named by CONFIG_FILE, or return {} if unset.

    Relative paths are resolved against this module's directory.
    """
    config_path = os.environ.get("CONFIG_FILE", None)

    if config_path is None:
        return {}

    if not config_path.startswith("/"):
        # FIX: os.path.basedir does not exist (AttributeError at runtime);
        # os.path.dirname(__file__) is the intended module directory.
        config_path = os.path.join(os.path.dirname(__file__), config_path)

    config_data = toml.load(config_path)

    return config_data


config = parse_config_file()
application = api.create_api(config)
# --------------------------------------------------------------------------