├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── enhancement_proposal.md │ └── question.md └── pull_request_template.md ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── __attic__ ├── docs │ ├── README.md │ ├── design │ │ ├── 00_Header.jpg │ │ ├── 1.1_Colour_.jpg │ │ ├── 1.2_Typography.jpg │ │ ├── 1.3_Header.jpg │ │ ├── 1.4_Footer.jpg │ │ ├── 1.5_Breadcrumbs_2_levels_.jpg │ │ ├── 1.5_Breadcrumbs_3_levels_.jpg │ │ ├── 1.6_CTA_Links_.jpg │ │ ├── 1.7_Search.jpg │ │ ├── 1.8_Tables.jpg │ │ ├── design-system.md │ │ ├── mockups.md │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556632095254_00_Header2x.jpg │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556633077597_1.2_Typography.jpg │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556639329062_1.1_Colour+palletex2.jpg │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641211609_1.7_Search.jpg │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641251273_1.5_Breadcrumbs_3+levels.jpg │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641260288_1.5_Breadcrumbs_2+levels.jpg │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641267412_1.4_Footer.jpg │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641277300_1.3_Header.jpg │ │ ├── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556642898755_1.6_Buttons.jpg │ │ └── s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556805892800_1.8_Tables.jpg │ ├── problems.md │ └── s3-layout.md └── pipeline │ └── reach-evaluator │ ├── Dockerfile │ ├── evaluator_task.py │ └── requirements.txt ├── argo ├── 00-namespace.yaml ├── README.md ├── argo.yaml ├── elasticsearch.yaml ├── postgres.yaml ├── psqlinit.yaml ├── reach-msf.yaml ├── reach-populate-pg.yaml └── secrets │ ├── minikube │ 
└── argo │ │ └── aws │ │ └── .gitkeep │ └── sync_secrets.py ├── base ├── Dockerfile ├── elastic │ ├── __init__.py │ ├── common.py │ ├── count.py │ ├── epmc_metadata.py │ ├── fulltext_docs.py │ ├── fuzzy_matched_citations.py │ ├── import_refs_from_s3.py │ └── import_sections_from_s3.py ├── hooks │ ├── s3hook.py │ └── sentry.py ├── requirements.txt ├── safe_import.py └── tests │ ├── common.py │ ├── mock_sites │ ├── gov │ │ ├── 1.html │ │ ├── 2.html │ │ └── 3.html │ ├── msf │ │ └── 1.html │ ├── nice │ │ ├── 1.html │ │ └── 2.html │ ├── parliament │ │ ├── 1.html │ │ └── 2.html │ ├── unicef │ │ ├── 1.html │ │ └── 2.html │ └── who │ │ ├── 1.html │ │ └── 2.html │ ├── pdfs │ ├── test_pdf.pdf │ ├── test_pdf_multipage.pdf │ └── test_pdf_page_number.pdf │ └── xml │ └── test_xml.xml ├── buildspec.yml ├── docker-compose.yaml ├── docs ├── antora.yml └── modules │ └── ROOT │ ├── nav.adoc │ └── pages │ └── index.adoc ├── export_wellcome_env.py ├── pipeline ├── reach-es-extractor │ ├── Dockerfile │ ├── Dockerfile.test │ ├── extract_refs_task.py │ ├── refparse │ │ ├── README.md │ │ ├── __init__.py │ │ ├── algo_evaluation │ │ │ ├── compare_found_sections.py │ │ │ ├── data_evaluate │ │ │ │ └── .gitkeep │ │ │ ├── evaluate_find_section.py │ │ │ ├── evaluate_match_references.py │ │ │ ├── evaluate_parse.py │ │ │ ├── evaluate_settings.py │ │ │ ├── evaluate_split_section.py │ │ │ ├── evaluation.md │ │ │ ├── exploratory │ │ │ │ ├── investigate_match_thresholds.py │ │ │ │ ├── negative_cosines_hist_2019-07-01-1211.png │ │ │ │ ├── negative_cosines_len_scatter_2019-07-01-1211.png │ │ │ │ ├── thresholds_F1Score_negative_heatmap_2019-07-01-1211.png │ │ │ │ ├── thresholds_Precision_negative_heatmap_2019-07-01-1211.png │ │ │ │ ├── thresholds_Recall_negative_heatmap_2019-07-01-1211.png │ │ │ │ └── title_lengths_2019-07-01-1211.png │ │ │ └── results │ │ │ │ └── .gitkeep │ │ ├── evaluate_algo.py │ │ ├── merge_results.py │ │ ├── parse_latest.py │ │ ├── reference_parser_models │ │ │ └── 
reference_parser_pipeline.pkl │ │ ├── refparse.py │ │ ├── settings.py │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── test_config_multitask.ini │ │ │ ├── test_exact_match.py │ │ │ ├── test_fuzzy_match.py │ │ │ └── test_split_parse.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── exact_match.py │ │ │ ├── file_manager.py │ │ │ ├── fuzzy_match.py │ │ │ ├── parse.py │ │ │ ├── s3.py │ │ │ └── serialiser.py │ └── requirements.txt ├── reach-es-indexer │ ├── Dockerfile │ ├── index_task.py │ └── requirements.txt ├── reach-fuzzy-matcher │ ├── Dockerfile │ ├── fuzzymatcher_task.py │ └── requirements.txt ├── reach-parser │ ├── Dockerfile │ ├── Dockerfile.test │ ├── __init__.py │ ├── normalizer │ │ ├── __init__.py │ │ └── title_normalizer.py │ ├── parser_task.py │ ├── pdf_parser │ │ ├── __init__.py │ │ ├── main.py │ │ ├── objects │ │ │ ├── PdfObjects.py │ │ │ └── __init__.py │ │ ├── pdf_parse.py │ │ ├── resources │ │ │ ├── keywords.txt │ │ │ └── section_keywords.txt │ │ ├── tests │ │ │ ├── __init__.py │ │ │ ├── test_pdf_objects.py │ │ │ └── test_pdf_parser_tools.py │ │ └── tools │ │ │ ├── __init__.py │ │ │ ├── dbTools.py │ │ │ └── extraction.py │ └── requirements.txt └── reach-scraper │ ├── Dockerfile │ ├── Dockerfile.test │ ├── README.md │ ├── __init__.py │ ├── docker-compose.yaml │ ├── pg_exists.py │ ├── pg_isready.py │ ├── requirements.txt │ ├── scrapy.cfg │ ├── spider_task.py │ └── wsf_scraping │ ├── __init__.py │ ├── contracts.py │ ├── feed_storage.py │ ├── filter.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ ├── spiders │ ├── __init__.py │ ├── acme_spider.py │ ├── base_spider.py │ ├── gov_spider.py │ ├── msf_spider.py │ ├── nice_spider.py │ ├── parliament_spider.py │ ├── unicef_spider.py │ └── who_iris_spider.py │ └── tests │ ├── __init__.py │ ├── test_gov_spider.py │ ├── test_msf_spider.py │ ├── test_nice_spider.py │ ├── test_parliament_spider.py │ ├── test_scraper_spiders.py │ ├── test_unicef_spider.py │ └── test_who_spider.py ├── 
requirements.txt ├── test_target ├── README.md ├── inner_page.html ├── page.html └── target_server.py └── web ├── .babelrc ├── .dockerignore ├── .eslintrc.json ├── Dockerfile ├── Makefile ├── bin └── update_vendor.sh ├── config ├── docker.config.toml └── local.config.toml ├── package-lock.json ├── package.json ├── requirements.txt ├── setup.py └── web ├── __init__.py ├── api.py ├── config.py ├── db.py ├── docs ├── .gitignore ├── Makefile ├── README.md ├── build │ ├── doctrees │ │ ├── api.doctree │ │ ├── environment.pickle │ │ ├── index.doctree │ │ └── intro.doctree │ └── html │ │ ├── .buildinfo │ │ ├── _static │ │ ├── basic.css │ │ ├── css │ │ │ ├── badge_only.css │ │ │ └── theme.css │ │ ├── doctools.js │ │ ├── documentation_options.js │ │ ├── file.png │ │ ├── fonts │ │ │ ├── Inconsolata-Bold.ttf │ │ │ ├── Inconsolata-Regular.ttf │ │ │ ├── Inconsolata.ttf │ │ │ ├── Lato-Bold.ttf │ │ │ ├── Lato-Regular.ttf │ │ │ ├── Lato │ │ │ │ ├── lato-bold.eot │ │ │ │ ├── lato-bold.ttf │ │ │ │ ├── lato-bold.woff │ │ │ │ ├── lato-bold.woff2 │ │ │ │ ├── lato-bolditalic.eot │ │ │ │ ├── lato-bolditalic.ttf │ │ │ │ ├── lato-bolditalic.woff │ │ │ │ ├── lato-bolditalic.woff2 │ │ │ │ ├── lato-italic.eot │ │ │ │ ├── lato-italic.ttf │ │ │ │ ├── lato-italic.woff │ │ │ │ ├── lato-italic.woff2 │ │ │ │ ├── lato-regular.eot │ │ │ │ ├── lato-regular.ttf │ │ │ │ ├── lato-regular.woff │ │ │ │ └── lato-regular.woff2 │ │ │ ├── RobotoSlab-Bold.ttf │ │ │ ├── RobotoSlab-Regular.ttf │ │ │ ├── RobotoSlab │ │ │ │ ├── roboto-slab-v7-bold.eot │ │ │ │ ├── roboto-slab-v7-bold.ttf │ │ │ │ ├── roboto-slab-v7-bold.woff │ │ │ │ ├── roboto-slab-v7-bold.woff2 │ │ │ │ ├── roboto-slab-v7-regular.eot │ │ │ │ ├── roboto-slab-v7-regular.ttf │ │ │ │ ├── roboto-slab-v7-regular.woff │ │ │ │ └── roboto-slab-v7-regular.woff2 │ │ │ ├── fontawesome-webfont.eot │ │ │ ├── fontawesome-webfont.svg │ │ │ ├── fontawesome-webfont.ttf │ │ │ ├── fontawesome-webfont.woff │ │ │ └── fontawesome-webfont.woff2 │ │ ├── jquery-3.5.1.js │ │ 
├── jquery.js │ │ ├── js │ │ │ ├── modernizr.min.js │ │ │ └── theme.js │ │ ├── language_data.js │ │ ├── minus.png │ │ ├── plus.png │ │ ├── pygments.css │ │ ├── searchtools.js │ │ ├── underscore-1.3.1.js │ │ └── underscore.js │ │ ├── api.html │ │ ├── genindex.html │ │ ├── index.html │ │ ├── intro.html │ │ ├── objects.inv │ │ ├── search.html │ │ └── searchindex.js ├── make.bat ├── requirements.txt └── source │ ├── api.md │ ├── conf.py │ ├── index.rst │ └── intro.md ├── src ├── css │ ├── about.less │ ├── contact.less │ ├── footer.less │ ├── header.less │ ├── home.less │ ├── icons.less │ ├── results.less │ ├── search.less │ ├── style.less │ ├── variables.less │ └── wellcome-bold-webfont.woff2 ├── favicon │ ├── android-icon-144x144.png │ ├── android-icon-192x192.png │ ├── android-icon-36x36.png │ ├── android-icon-48x48.png │ ├── android-icon-72x72.png │ ├── android-icon-96x96.png │ ├── apple-icon-114x114.png │ ├── apple-icon-120x120.png │ ├── apple-icon-144x144.png │ ├── apple-icon-152x152.png │ ├── apple-icon-180x180.png │ ├── apple-icon-57x57.png │ ├── apple-icon-60x60.png │ ├── apple-icon-72x72.png │ ├── apple-icon-76x76.png │ ├── apple-icon-precomposed.png │ ├── apple-icon.png │ ├── browserconfig.xml │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ ├── favicon-96x96.png │ ├── favicon.ico │ ├── manifest.json │ ├── ms-icon-144x144.png │ ├── ms-icon-150x150.png │ ├── ms-icon-310x310.png │ └── ms-icon-70x70.png ├── images │ ├── Icon_ New-window.svg │ ├── Icon_About_Accuracy_100px.svg │ ├── Icon_About_Open-source_100px.svg │ ├── Icon_About_Transparent_100px.svg │ ├── Icon_Arrow_down.svg │ ├── Icon_Chevron_Double.svg │ ├── Icon_Chevron_Down.svg │ ├── Icon_Download_16px.svg │ ├── Icon_How_Download_160px.svg │ ├── Icon_How_Extract_160px.svg │ ├── Icon_How_Match_160px.svg │ ├── Icon_Info.svg │ ├── Icon_Menu_16px.svg │ ├── Icon_Policy_24px.svg │ ├── Icon_Research_24px.svg │ ├── Icon_Scroll-arow.svg │ ├── Icon_Search_16px.svg │ ├── Icon_Sort-by_16px.svg │ ├── 
Icon_new_window.svg │ ├── Illustration_Glass.svg │ ├── Illustration_Papers.svg │ ├── Image_Product-shot.png │ ├── Shape_01.svg │ ├── Shape_02.svg │ ├── Wellcome_logo.svg │ ├── reach_alpha_branding.svg │ ├── reach_site_view.png │ ├── wave.svg │ ├── wellcome-logo.svg │ └── white-wave.svg ├── js │ ├── app.js │ ├── citationsTable.js │ ├── clearSearch.js │ ├── home.js │ ├── policyTable.js │ ├── resultsCommon.js │ ├── templates │ │ └── no_results.js │ └── v.contact.js ├── vendor │ └── spectre-0.5.8 │ │ ├── spectre-exp.css │ │ ├── spectre-exp.min.css │ │ ├── spectre-icons.css │ │ ├── spectre-icons.min.css │ │ ├── spectre.css │ │ └── spectre.min.css └── w-avatar-pitch-1.svg ├── templates ├── about.html ├── base.html ├── contact.html ├── how-it-works.html ├── index.html ├── privacy.html ├── results │ ├── citations.html │ └── policy-docs.html └── search │ ├── citations.html │ └── policy-docs.html ├── tests ├── test_search_api.py └── test_template.py ├── utils.py ├── views ├── __init__.py ├── api │ ├── __init__.py │ ├── api_search_citations.py │ ├── api_search_policies.py │ └── utils.py ├── apidocs.py ├── contact.py ├── opt_search.py ├── robotstxt.py ├── search │ ├── __init__.py │ ├── citations.py │ ├── export_citations.py │ ├── export_policies.py │ └── policies.py ├── search_exports.py └── template.py └── wsgi.py /.dockerignore: -------------------------------------------------------------------------------- 1 | **/*.pyc 2 | **/__pycache__/* 3 | .git 4 | .gitignore 5 | .idea 6 | .pytest_cache 7 | 8 | Dockerfile 9 | 10 | __attic__ 11 | 12 | argo 13 | 14 | web/build/web/static/* 15 | 16 | reach/refparse/algo_evaluation/data_evaluate/* 17 | 18 | **/env 19 | **/venv 20 | **/docs 21 | pull_request_template.md 22 | CONTRIBUTING.md 23 | 24 | web/node_modules/* 25 | web/package.json 26 | web/package-lock.json 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: 
-------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: 'bug' 6 | assignees: '' 7 | --- 8 | Checklist: 9 | 10 | * [ ] I've included the version 11 | * [ ] I've included reproduction steps 12 | * [ ] I've included any config 13 | * [ ] I've included the logs 14 | 15 | 16 | ## What Happened 17 | 18 | ## What you expected to happen 19 | 20 | ## How to reproduce it (as minimally and precisely as possible) 21 | 22 | ## Anything else we should know 23 | 24 | ## Environment 25 | 26 | * [ ] Production 27 | * [ ] Staging 28 | * [ ] Local 29 | 30 | ## Error Message / Logs 31 | 32 | --- 33 | 34 | ## Message from Maintainers 35 | 36 | If you are impacted by this bug please add a :thumbsup: reaction to this issue! 37 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement_proposal.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Enhancement proposal 3 | about: Propose an enhancement for this project 4 | title: '' 5 | labels: 'enhancement' 6 | assignees: '' 7 | --- 8 | # Description of feature/ functionality 9 | 10 | (Be sure to include the reasoning if this is not part of a bigger project. Link to spec, notion page, Zeplin etc) 11 | 12 | ## Risks & dependencies 13 | 14 | (Consider customer facing, internal and deployment) 15 | 16 | ## Acceptance Criteria 17 | (What needs to happen for this ticket to be closed?) 18 | 19 | ## Estimation of dev task size 20 | 21 | - [ ] Small 22 | - [ ] Medium 23 | - [ ] Large 24 | 25 | ## Who needs to test this? 
26 | 27 | - [ ] Dev 28 | - [ ] UI 29 | - [ ] UX 30 | - [ ] Data science 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask a question regarding this project 4 | title: '' 5 | labels: 'question' 6 | assignees: '' 7 | --- 8 | 9 | 10 | # Summary 11 | 12 | What do you want to know about this project? 13 | 14 | # Motivation 15 | 16 | Why do you need to know this, any examples or use cases you could include? 17 | 18 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Please include a summary of the changes this PR introduces for the codebase. 4 | Also specify if some sections need special attention, and why you want to introduce this change. 5 | 6 | Make sure to split changes across multiple pull requests, as we won't review bundled pull requests. 7 | 8 | Finally, make sure your PR follows our code of conduct before posting (Check our [contributing guidelines](CONTRIBUTING.md) if you're not sure). 9 | 10 | ## Type of change 11 | 12 | Please delete options that are not relevant. 13 | 14 | - [ ] :bug: Bug fix (Add `Fix #(issue)` to your PR) 15 | - [ ] :sparkles: New feature 16 | - [ ] :fire: Breaking change 17 | - [ ] :memo: Documentation update 18 | 19 | # How Has This Been Tested? 20 | 21 | Please describe the tests that you ran to verify your changes. Provide instructions so we can run the tests. 
Please also list any relevant details for your test configuration: 22 | 23 | # Checklist: 24 | 25 | - [ ] My code follows the style guidelines of this project (pep8 AND pyflakes) 26 | - [ ] I have commented my code, particularly in hard-to-understand areas 27 | - [ ] If needed, I changed related parts of the documentation 28 | - [ ] I included tests in my PR 29 | - [ ] New and existing unit tests pass locally with my changes 30 | - [ ] Any dependent changes have been merged and published in downstream modules 31 | - [ ] If my PR aims to fix an issue, I referenced it using `#(issue)` 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *__pycache__ 3 | *.DS_Store 4 | .idea 5 | 6 | *.egg-info 7 | **/.cache/* 8 | 9 | *.csv 10 | *.pdf 11 | *.txt 12 | 13 | !package.json 14 | !base/tests/pdfs/* 15 | !keywords.txt 16 | !section_keywords.txt 17 | !**/requirements.* 18 | 19 | **/node_modules 20 | **/epmc-metadata.json.gz 21 | *env 22 | 23 | argo/secrets/minikube/argo/aws/* 24 | !argo/secrets/minikube/argo/aws/.gitkeep 25 | 26 | web/build/web/static/* 27 | !web/build/web/static/.gitkeep 28 | !web/docs/* 29 | venv 30 | .env 31 | package.lock 32 | web/config/dev.config.toml 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Wellcome Trust 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the 
following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /__attic__/docs/README.md: -------------------------------------------------------------------------------- 1 | # Reach documentation 2 | 3 | Directories: 4 | 5 | - [design](./design) 6 | -------------------------------------------------------------------------------- /__attic__/docs/design/00_Header.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/00_Header.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/1.1_Colour_.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.1_Colour_.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/1.2_Typography.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.2_Typography.jpg 
-------------------------------------------------------------------------------- /__attic__/docs/design/1.3_Header.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.3_Header.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/1.4_Footer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.4_Footer.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/1.5_Breadcrumbs_2_levels_.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.5_Breadcrumbs_2_levels_.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/1.5_Breadcrumbs_3_levels_.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.5_Breadcrumbs_3_levels_.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/1.6_CTA_Links_.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.6_CTA_Links_.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/1.7_Search.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.7_Search.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/1.8_Tables.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/1.8_Tables.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/design-system.md: -------------------------------------------------------------------------------- 1 | # Reach – Design System 2 | Created by Data Labs at Wellcome Trust 3 | 4 | 5 |  6 | 7 | 8 | 9 | 10 | # 1.1. Intro 11 | ---------- 12 | 13 | The present document provides a high level overview of the Reach brand and style. [Spectre.CSS](https://picturepan2.github.io/spectre/index.html) is used as the CSS framework, but the UI components should be adapted to provide a unique user experience to the website. 14 | 15 | 16 | 17 | # 1.2. Colour Palette 18 | ---------- 19 | 20 | The web colour palette is taken largely from the Wellcome brand book ([Data Viz section](https://company-57536.frontify.com/d/gFEfjydViLRJ/wellcome-brand-book#/visuals/dataviz-elements-and-rationale)) with certain additions to accommodate the web environment. 21 | 22 | 23 |  24 | 25 | 26 | 27 | 28 | # 1.3. Typography 29 | ---------- 30 | 31 | Wellcome's brand fonts (Wellcome & Helvetica Neue) are used alternatively throughout the website with no exceptions. 32 | 33 | 34 |  35 | 36 | 37 | 38 | 39 | # 1.4. Grid System 40 | ---------- 41 | 42 | The grid system follows [Spectre CSS framework](https://picturepan2.github.io/spectre/layout/responsive.html). 
43 | 44 | 45 | | XS | 320 – 480px | padding left/right 11px | 46 | | --- | ------------ | ------------------------------------------------------ | 47 | | SM | 481 – 600px | TBC | 48 | | MD | 601 – 840px | TBC | 49 | | LG | 961 – 1280px | TBC | 50 | | XL | 961 – 1280px | 12 columns : gutter 22px : padding left/right 11px | 51 | | XXL | > 1280px | 12 columns : gutter 22px | 52 | 53 | 54 | 55 | 56 | # 1.5. UI Components 57 | 58 | 59 | ## 1.5.1. Buttons 60 | ---------- 61 | 62 | 63 |  64 | 65 | 66 | 67 | 68 | ## 1.5.2. Header 69 | ---------- 70 |  71 | 72 | 73 | 74 | ## 1.5.3. Footer 75 | ---------- 76 |  77 | 78 | 79 | 80 | ## 1.5.6. Breadcrumbs > 2 levels 81 | ---------- 82 | 83 | In the breadcrumb trail, the breadcrumb corresponding to the current page **should not be a link**. 84 | 85 | 86 |  87 | 88 | 89 | 90 | 91 | ## 1.5.7. Breadcrumbs > 3 levels (Search results pages) 92 | ---------- 93 | 94 | The **search term is excluded from breadcrumbs** (to avoid long pages on mobile) 95 | 96 | 97 |  98 | 99 | 100 | 101 | 102 | ## 1.5.8. Search 103 | ---------- 104 | 105 | 106 |  107 | 108 | 109 | 110 | ## 1.5.9. Tables 111 | ---------- 112 | 113 | 114 |  115 | 116 | 117 | -------------------------------------------------------------------------------- /__attic__/docs/design/mockups.md: -------------------------------------------------------------------------------- 1 | # Internal assets 2 | 3 | We've put what we can into the public repo. However, some things from 4 | design are elsewhere, since it was easier for design to use tools like 5 | Google Drive & Invision. 6 | 7 | All raw design assets are kept in the [Data Labs Team Drive under "User 8 | Experience/UX & 9 | UI](https://drive.google.com/drive/u/0/folders/1kN5-MbDUGK1YdSw430T_mDhdQdPzGCms). 10 | Most are in Sketch format. 
11 | 12 | UI devs working at Wellcome should also be able to view these files (at least 13 | as of time of upload) in 14 | [invision](https://projects.invisionapp.com/d/main?origin=v7#/projects/prototypes/17303255). 15 | 16 | -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556632095254_00_Header2x.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556632095254_00_Header2x.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556633077597_1.2_Typography.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556633077597_1.2_Typography.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556639329062_1.1_Colour+palletex2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556639329062_1.1_Colour+palletex2.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641211609_1.7_Search.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641211609_1.7_Search.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641251273_1.5_Breadcrumbs_3+levels.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641251273_1.5_Breadcrumbs_3+levels.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641260288_1.5_Breadcrumbs_2+levels.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641260288_1.5_Breadcrumbs_2+levels.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641267412_1.4_Footer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641267412_1.4_Footer.jpg -------------------------------------------------------------------------------- 
/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641277300_1.3_Header.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556641277300_1.3_Header.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556642898755_1.6_Buttons.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556642898755_1.6_Buttons.jpg -------------------------------------------------------------------------------- /__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556805892800_1.8_Tables.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/__attic__/docs/design/s_80E0FAE63D8FEACBC2D84BC148149813EA5AB8AA2E1FDA99090E7199EBBFE5D9_1556805892800_1.8_Tables.jpg -------------------------------------------------------------------------------- /__attic__/docs/problems.md: -------------------------------------------------------------------------------- 1 | # Reach Output Problems 2 | 3 | The 10 most cited Wellcome Trust publications as predicted by the Reach tool were looked at. In total these 10 publications were cited 154 times in 62 different policy documents. We looked into these citations and found issues with 103 of them. The issues fell into 6 categories and are given in the table below. 
4 | 5 | | GitHub Issue Number | Problem | Example | Solution | Proportion of problematic citations | 6 | | --- | --- | --- | --- | --- | 7 | | [#180](https://github.com/wellcometrust/reach/issues/180) | Text found in wrongly identified references section | A table with the row name "Treatment of severe malaria", was identified as a reference title since this table was at the end of the references section and got included in the scraped text. | Improve extracting section | 64/103 | 8 | | [#181](https://github.com/wellcometrust/reach/issues/181) | Text found not in a reference during the exact text search | "attention deficit hyperactivity disorder" was found in the text of several documents and identified as a match to a paper with the same name. | Length threshold to exact matcher | 20/103 | 9 | | [#182](https://github.com/wellcometrust/reach/issues/182) | Reference repeated in the policy document | A citation for "Disability-adjusted life years (DALYs) for 291 diseases and injuries in 21 regions, 1990-2010: a systematic analysis for the Global Burden of Disease Study 2010" came up in two references sections of a policy document | Deduplicate repeats or decide to keep them in | 8/103 | 10 | | [#183](https://github.com/wellcometrust/reach/issues/183) | Duplicate reference found even though no duplicate found in policy document | A citation for "Trends in adult body-mass index in 200 countries from 1975 to 2014: a pooled analysis of 1698 population-based measurement studies with 1377-1396" only occurred once in a policy document, but the Reach output said it came up twice. | ? | 4/103 | 11 | | - | False positive - parsed reference matched to a similar but different reference | The Reach tool identified a publication entitled "Attention deficit hyperactivity disorder" from 2006, however in the policy document this reference was to a similarly titled paper from 1998. 
| Increase text similarity and length thresholds | 4/103 | 12 | | [#180](https://github.com/wellcometrust/reach/issues/180) | Text found in a reference during the exact text search | A citation for "Attention deficit hyperactivity disorder" was in the references section of a policy document, however it was only found in the exact text search and not the fuzzy match search. | Improve extracting section | 3/103 | 13 | -------------------------------------------------------------------------------- /__attic__/pipeline/reach-evaluator/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM reach.base 2 | 3 | WORKDIR /opt/reach 4 | 5 | COPY ./requirements.txt /opt/reach/requirements.evaluator.txt 6 | 7 | RUN pip install -U pip && \ 8 | python3 -m pip install -r /opt/reach/requirements.evaluator.txt 9 | 10 | 11 | COPY ./evaluator_task.py /opt/reach/evaluator_task.py 12 | 13 | # Give execution rights to the entrypoint Python script 14 | RUN chmod +x /opt/reach/evaluator_task.py 15 | 16 | ENTRYPOINT ["/opt/reach/evaluator_task.py"] 17 | -------------------------------------------------------------------------------- /__attic__/pipeline/reach-evaluator/requirements.txt: -------------------------------------------------------------------------------- 1 | https://datalabs-public.s3.eu-west-2.amazonaws.com/reach_evaluator/reach_evaluator-2020.1.1-py3-none-any.whl -------------------------------------------------------------------------------- /argo/00-namespace.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: argo 5 | -------------------------------------------------------------------------------- /argo/README.md: -------------------------------------------------------------------------------- 1 | # Argo & Reach 2 | Reach's pipeline is deployed in production using Argo. 3 | These files are for local runs and development. 
4 | 5 | ## How to run Reach's workflows 6 | To run this pipeline locally, you'll need: 7 | 8 | - Docker 9 | - Minikube 10 | - Python >= 3.6 11 | - The Argo cli (recommended but optional) 12 | 13 | If it is the first time you use minikube with your AWS account, please configure your ECR credentials: 14 | ``` 15 | minikube addons enable registry-creds 16 | minikube addons configure registry-creds 17 | ``` 18 | 19 | To build the required images, go to the root folder and run the following: 20 | ``` 21 | make docker-build 22 | ``` 23 | 24 | To install Argo to your selected cluster (this will install Argo to a namespace `argo`, so make sure it's available before running these commands or change it beforehand): 25 | ``` 26 | kubectl apply -f argo/00-namespace.yaml 27 | kubectl apply -f argo/argo.yaml 28 | kubectl apply -f argo/elasticsearch.yaml 29 | kubectl apply -f argo/psqlinit.yaml 30 | kubectl apply -f argo/postgres.yaml 31 | ``` 32 | 33 | 34 | You can then run your workflows as follows: 35 | ``` 36 | # this is the example workflow for WHO IRIS 37 | argo submit -n argo argo/reach-who.yaml 38 | ``` 39 | 40 | ## Using this infrastructure with the web application 41 | Reach's web application only relies on Postgresql. 
To expose it and make it usable locally (or within the `docker-compose` local deployment), run: 42 | ``` 43 | kubectl port-forward -n argo postgres-0 5432:5432 44 | ``` 45 | -------------------------------------------------------------------------------- /argo/postgres.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: argo-postgres-volume 5 | namespace: argo 6 | labels: 7 | type: local 8 | spec: 9 | storageClassName: manual 10 | capacity: 11 | storage: 5Gi 12 | accessModes: 13 | - ReadWriteOnce 14 | hostPath: 15 | path: "/mnt/data" 16 | --- 17 | apiVersion: v1 18 | kind: PersistentVolumeClaim 19 | metadata: 20 | name: argo-postgres-claim 21 | namespace: argo 22 | spec: 23 | storageClassName: manual 24 | accessModes: 25 | - ReadWriteOnce 26 | resources: 27 | requests: 28 | storage: 5Gi 29 | --- 30 | apiVersion: apps/v1 31 | kind: StatefulSet 32 | metadata: 33 | name: postgres 34 | namespace: argo 35 | annotations: 36 | kubernetes.io/change-cause: N/A 37 | spec: 38 | selector: 39 | matchLabels: 40 | app: postgres 41 | serviceName: "postgres" 42 | replicas: 1 43 | template: 44 | metadata: 45 | labels: 46 | app: postgres 47 | spec: 48 | containers: 49 | - name: postgresql 50 | image: postgres:12.2-alpine 51 | ports: 52 | - containerPort: 5432 53 | env: 54 | - name: POSTGRES_PASSWORD 55 | value: development 56 | - name: POSTGRES_DB 57 | value: warehouse 58 | volumeMounts: 59 | - name: argo-postgres-claim 60 | mountPath: /var/lib/postgresql/datalabs 61 | - name: psqlinit 62 | mountPath: /docker-entrypoint-initdb.d 63 | volumes: 64 | - name: argo-postgres-claim 65 | persistentVolumeClaim: 66 | claimName: argo-postgres-claim 67 | - name: psqlinit 68 | configMap: 69 | name: psqlinit 70 | --- 71 | apiVersion: v1 72 | kind: Service 73 | metadata: 74 | name: postgres 75 | namespace: argo 76 | labels: 77 | app: postgres 78 | spec: 79 | ports: 80 | - port: 5432 81 | 
targetPort: 5432 82 | protocol: TCP 83 | selector: 84 | app: postgres 85 | -------------------------------------------------------------------------------- /argo/psqlinit.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | create.sql: |2 4 | 5 | CREATE SCHEMA IF NOT EXISTS warehouse; 6 | 7 | CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; 8 | 9 | kind: ConfigMap 10 | metadata: 11 | creationTimestamp: "2020-05-15T15:42:31Z" 12 | managedFields: 13 | - apiVersion: v1 14 | fieldsType: FieldsV1 15 | fieldsV1: 16 | f:data: 17 | .: {} 18 | f:create.sql: {} 19 | manager: kubectl 20 | operation: Update 21 | time: "2020-05-15T15:42:31Z" 22 | name: psqlinit 23 | namespace: argo 24 | selfLink: /api/v1/namespaces/argo/configmaps/psqlinit 25 | -------------------------------------------------------------------------------- /argo/secrets/minikube/argo/aws/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/argo/secrets/minikube/argo/aws/.gitkeep -------------------------------------------------------------------------------- /base/Dockerfile: -------------------------------------------------------------------------------- 1 | # Use a basic Python image, but current Debian 2 | FROM python:3.6-slim-stretch 3 | 4 | # Build UTF8 locale to avoid encoding issues with Scrapy encoding 5 | # C.UTF-8 is the new en_US.UTF-8. 
6 | ENV LC_ALL=C.UTF-8 7 | ENV LANG=C.UTF-8 8 | ENV LANGUAGE=C.UTF-8 9 | 10 | WORKDIR /opt/reach 11 | 12 | COPY ./requirements.txt /opt/reach/requirements.txt 13 | 14 | # Poppler is needed to run pdftotext conversion 15 | RUN apt-get update -yqq && \ 16 | apt-get install -yqq --no-install-recommends \ 17 | build-essential \ 18 | libpoppler-cpp-dev \ 19 | poppler-utils \ 20 | locales && \ 21 | apt-get -q clean && \ 22 | locale-gen C.UTF-8 && \ 23 | pip install -U pip && \ 24 | python3 -m pip install -r /opt/reach/requirements.txt && \ 25 | apt-get remove --purge -y build-essential 26 | 27 | 28 | COPY ./safe_import.py /opt/reach/safe_import.py 29 | COPY ./hooks /opt/reach/hooks 30 | COPY ./elastic /opt/reach/elastic 31 | COPY ./tests /opt/reach/tests 32 | -------------------------------------------------------------------------------- /base/elastic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wellcometrust/reach/1aa42c7d8aaf0a91d033af8448a33f37563b0365/base/elastic/__init__.py -------------------------------------------------------------------------------- /base/elastic/count.py: -------------------------------------------------------------------------------- 1 | """ 2 | Minimal CLI for counting records in ES. 3 | """ 4 | 5 | from . import common 6 | 7 | if __name__ == '__main__': 8 | parser = common.create_argument_parser(__doc__.strip()) 9 | parser.add_argument('index_name') 10 | args = parser.parse_args() 11 | es = common.es_from_args(args) 12 | print(common.count_es(es, args.index_name)) 13 | 14 | -------------------------------------------------------------------------------- /base/elastic/epmc_metadata.py: -------------------------------------------------------------------------------- 1 | """ 2 | Inserts EPMC metadata into Elasticsearch. 
3 | 4 | Sample URL for testing: 5 | 6 | s3://datalabs-staging/airflow/output/open-research/epmc-metadata/epmc-metadata.json.gz 7 | """ 8 | 9 | import json 10 | import logging 11 | import functools 12 | 13 | from . import common 14 | 15 | CHUNK_SIZE = 1000 # tuned for small(ish) size of pub metadata 16 | 17 | 18 | def to_es_action(es_index, line): 19 | d = json.loads(line) 20 | return { 21 | "_index": es_index, 22 | "doc": d, 23 | } 24 | 25 | 26 | def clean_es(es, es_index, organisation): 27 | """ Ensure an empty index exists. """ 28 | common.recreate_index(es, es_index) 29 | 30 | 31 | def insert_file(f, es, es_index, organisation, max_items=None): 32 | """ 33 | Inserts EPMC metadata from a json.gz file into Elasticsearch. 34 | 35 | Args: 36 | f: json.gz file object 37 | es: a living connection to elasticsearch 38 | max_items: maximum number of records to insert, or None 39 | """ 40 | logging.info( 41 | 'epmc_metadata.insert_file: f=%s es=%s max_items=%s', 42 | f, es, max_items) 43 | to_es_func = functools.partial(to_es_action, es_index) 44 | return common.insert_actions( 45 | es, 46 | common.yield_actions(f, to_es_func, max_items), 47 | CHUNK_SIZE, 48 | ) 49 | 50 | 51 | if __name__ == '__main__': 52 | def insert_func(f, es, max_items=None): 53 | return insert_file(f, es, 'policy-test-epmc-metadata', 54 | max_items=max_items) 55 | count = common.insert_from_argv( 56 | __doc__.strip(), clean_es, insert_file) 57 | logging.info('Imported %d pubs into ES', count) 58 | -------------------------------------------------------------------------------- /base/hooks/sentry.py: -------------------------------------------------------------------------------- 1 | from functools import wraps 2 | import os 3 | 4 | import sentry_sdk 5 | 6 | 7 | def init_sentry_sdk(sentry_dsn): 8 | kwargs = { 9 | 'integrations': [], # we'll add celery & flask eventually here 10 | 'default_integrations': True, 11 | } 12 | sentry_sdk.init(sentry_dsn) 13 | 14 | 15 | def report_exception(f): 16 | """ 
Minimal decorator for reporting exceptions that occur within a 17 | function. Does not support generators.""" 18 | @wraps(f) 19 | def wrapped_f(*args, **kwargs): 20 | try: 21 | return f(*args, **kwargs) 22 | except: 23 | sentry_sdk.capture_exception() 24 | raise 25 | 26 | return wrapped_f 27 | 28 | 29 | # SENTRY_DSN must be present at import time. If we don't have it then, 30 | # we won't have it later either. 31 | init_sentry_sdk(os.environ['SENTRY_DSN']) 32 | -------------------------------------------------------------------------------- /base/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | sentry-sdk 3 | elasticsearch 4 | -------------------------------------------------------------------------------- /base/safe_import.py: -------------------------------------------------------------------------------- 1 | """ 2 | Prevents multiple threads from trying to import at the same time and 3 | hitting an import lock. Implemented because airflow's web server 4 | regularly re-imports all DAGs and all tasks therein -- and 5 | unfortunately, our tasks import so many dependencies that reloading them 6 | takes enough time (by some random distribution) that the Airflow times 7 | out imports, resulting in an endless stream of sentry reports from 8 | within gunicorn. 9 | 10 | So, we've moved "slow" imports, especially those pulling in ML libraries 11 | such as scipy or even pandas, into the execute() method of our tasks. 12 | 13 | This is almost always something you should NEVER do, because imports can 14 | only be trusted not to lock if they're done from the main thread and on 15 | module load. (And an import lock in Python tends not to (or never?) 16 | resolve itself.) But, not much choice, at least for now. And, it turns 17 | out that in our execution model, the celery executor spawns subprocesses 18 | to run each task. So, we shouldn't ever have an issue. 19 | 20 | Hope isn't a strategy, though. 
So, here's a context manager to use, so 21 | that we'll know if we were going to hit an import lock. 22 | 23 | Sample usage:: 24 | 25 | @report_exception 26 | def execute(self): 27 | with safe_import(): 28 | from reach.rainbowpony import pony_ai 29 | 30 | # do things with pony_ai here. 31 | 32 | """ 33 | 34 | from contextlib import contextmanager 35 | from threading import Lock 36 | 37 | # Not a re-entrant lock b/c we believe imports of this sort should 38 | # only happen once from the calling thread. 39 | SAFE_IMPORT_LOCK = Lock() 40 | 41 | @contextmanager 42 | def safe_import(): 43 | """ 44 | Context manager for ensuring that only one thread is importing 45 | at a time. If two threads enter this context, the second will fail 46 | with an exception so that we can't get caught in an import lock. 47 | """ 48 | acquired = SAFE_IMPORT_LOCK.acquire(blocking=False) 49 | try: 50 | if not acquired: 51 | # NB: we could, instead, just wait here. But the invariant 52 | # we're expecting is that, thanks to how the celery executor 53 | # works, only one call to execute() should happen at a time, 54 | # because only one thread should ever be running. 
55 | raise Exception('Multiple imports attempted at once!') 56 | yield 57 | finally: 58 | if acquired: 59 | SAFE_IMPORT_LOCK.release() 60 | -------------------------------------------------------------------------------- /base/tests/common.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | 3 | 4 | def get_path(p): 5 | return os.path.join( 6 | os.path.dirname(__file__), 7 | p 8 | ) 9 | 10 | TEST_PDF = get_path('pdfs/test_pdf.pdf') 11 | TEST_PDF_MULTIPAGE = get_path('pdfs/test_pdf_multipage.pdf') 12 | TEST_PDF_PAGE_NUMBER = get_path('pdfs/test_pdf_page_number.pdf') 13 | TEST_XML = get_path('xml/test_xml.xml') 14 | -------------------------------------------------------------------------------- /base/tests/mock_sites/parliament/1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 |14 | <p>To ask the Secretary of State for Transport, what steps his Department is taking to support British citizens driving in EU countries after 29 March 2019 who do not hold a green card.</p> 15 |
16 |17 | 06 Feb 2019 | 18 | Written questions | 19 | House of Commons | 20 | 214040 21 |
22 |