├── maven ├── datasets │ ├── __init__.py │ ├── coronavirus │ │ ├── __init__.py │ │ ├── README.md │ │ └── csse.py │ └── general_election │ │ ├── __init__.py │ │ ├── uk_2017_results.py │ │ ├── uk_2015_results.py │ │ ├── uk_2010_results.py │ │ ├── uk_2019_model.py │ │ ├── uk_2015_model.py │ │ ├── uk_2017_model.py │ │ ├── uk_polls.py │ │ ├── README.md │ │ └── base.py ├── __init__.py ├── get.py └── utils.py ├── MANIFEST.in ├── setup.cfg ├── dev-requirements.in ├── .gitignore ├── AUTHORS.md ├── requirements.txt ├── tests ├── test_get.py ├── datasets │ ├── coronavirus │ │ └── test_csse.py │ └── general_election │ │ ├── test_uk_results.py │ │ └── test_uk_models.py └── test_utils.py ├── setup.py ├── dev-requirements.txt ├── README.md ├── CHANGELOG.md └── LICENSE /maven/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE Pipfile Pipfile.lock 2 | -------------------------------------------------------------------------------- /maven/__init__.py: -------------------------------------------------------------------------------- 1 | from . import utils 2 | from .get import get 3 | 4 | __version__ = "0.1.0" 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [tool:pytest] 5 | collect_ignore = ['setup.py'] 6 | -------------------------------------------------------------------------------- /dev-requirements.in: -------------------------------------------------------------------------------- 1 | -c requirements.txt 2 | ipython==7.16.3 3 | pip-tools==4.2.0 4 | pytest==5.2.2 5 | -------------------------------------------------------------------------------- /maven/datasets/coronavirus/__init__.py: -------------------------------------------------------------------------------- 1 | from .csse import CSSE 2 | 3 | __all__ = [ 4 | "CSSE", 5 | ] 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .eggs/ 2 | .venv/ 3 | build/ 4 | data/ 5 | dist/ 6 | maven.egg-info/ 7 | 8 | *.pyc 9 | 10 | # IDE ignores 11 | .vscode 12 | 13 | # Checklists 14 | DEPLOY 15 | REVIEW 16 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | # Credits 2 | 3 | ## Development Lead 4 | * John Sandall [@john-sandall](https://github.com/john-sandall) 5 | 6 | ## Contributors 7 | * [@joy-rosie](https://github.com/joy-rosie) 8 | * [@cruzzoe](https://github.com/cruzzoe) 9 | * [@tomviner](https://github.com/tomviner) 10 | * [@geo7](https://github.com/geo7/) 11 | * [@JosephSutcliffe](https://github.com/JosephSutcliffe/) 12 | * [@dwood023](https://github.com/dwood023/) 13 | -------------------------------------------------------------------------------- /maven/datasets/general_election/__init__.py: -------------------------------------------------------------------------------- 1 | from .uk_2010_results import UK2010Results 2 | from .uk_2015_model import UK2015Model 3 | from .uk_2015_results import UK2015Results 4 | from .uk_2017_model import UK2017Model 5 | from .uk_2017_results 
import UK2017Results 6 | from .uk_2019_model import UK2019Model 7 | from .uk_polls import UKPolls 8 | 9 | __all__ = [ 10 | "UK2010Results", 11 | "UK2015Model", 12 | "UK2015Results", 13 | "UK2017Model", 14 | "UK2017Results", 15 | "UK2019Model", 16 | "UKPolls", 17 | ] 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile 6 | # 7 | certifi==2019.6.16 # via requests 8 | chardet==3.0.4 # via requests 9 | idna==2.8 # via requests 10 | numpy==1.16.4 # via pandas 11 | pandas==1.0.0 # via maven (setup.py) 12 | python-dateutil==2.8.0 # via pandas 13 | pytz==2019.1 # via pandas 14 | requests==2.22.0 # via maven (setup.py) 15 | six==1.12.0 # via python-dateutil 16 | urllib3==1.25.3 # via requests 17 | xlrd==1.2.0 # via maven (setup.py) 18 | -------------------------------------------------------------------------------- /tests/test_get.py: -------------------------------------------------------------------------------- 1 | """ 2 | Running tests in development: 3 | $ cd /path/to/repo 4 | $ python -m pytest 5 | 6 | Running tests against installed version (either `pip install .` or `pip install maven`): 7 | $ cd /path/to/repo 8 | $ pytest 9 | """ 10 | 11 | import maven 12 | import pytest 13 | 14 | 15 | def test_nonexisting_identifier(): 16 | with pytest.raises(KeyError): 17 | maven.get("this-identifier-will-never-exist", data_directory="./data/") 18 | 19 | 20 | def test_nothing_happens(): 21 | """Setting retrieve=False and process=False should do nothing.""" 22 | maven.get("general-election/UK/2010/results", retrieve=False, process=False) 23 | -------------------------------------------------------------------------------- /tests/datasets/coronavirus/test_csse.py: -------------------------------------------------------------------------------- 1 | """ 2 | Running tests in development: 3 | $ cd /path/to/repo 4 | $ python -m pytest ./tests/datasets/coronavirus/test_csse.py 5 | 6 | Running tests against installed version (either `pip install .` or `pip install maven`): 7 | $ cd /path/to/repo 8 | $ pytest ./tests/datasets/coronavirus/test_csse.py 9 | """ 10 | 11 | from pathlib import Path 12 | 13 | import pandas as pd 14 | 15 | import maven 16 | 17 | 18 | def test_csse(): 19 | identifier = "coronavirus/CSSE" 20 | maven.get(identifier, data_directory="./data/") 21 | # CSSE_country.csv 22 | processed_filename = "CSSE_country.csv" 23 | df = pd.read_csv(Path("./data") / identifier / "processed" / processed_filename) 24 | assert df.columns.tolist() == ["date", "country_region", "confirmed", "deaths", "recovered"] 25 | # CSSE_country_province.csv 26 | processed_filename = "CSSE_country_province.csv" 27 | df = pd.read_csv(Path("./data") / identifier / "processed" / processed_filename) 28 | assert df.columns.tolist() == [ 29 | "date", 30 | "country_region", 31 | "province_state", 32 | "lat", 33 | "lon", 34 | "confirmed", 35 | "deaths", 36 | "recovered", 37 | ] 38 | -------------------------------------------------------------------------------- /maven/datasets/coronavirus/README.md: -------------------------------------------------------------------------------- 1 | # Coronavirus (COVID-19) datasets 2 | 3 | If you have any questions about these datasets please [contact me @John_Sandall](https://twitter.com/John_Sandall) on Twitter. 
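## Usage
A minimal sketch of fetching and loading this dataset (paths assume the default `./data/` layout used elsewhere in this repo, with processed files written to `data/coronavirus/CSSE/processed/`):

```python
import pandas as pd

import maven

# Download the raw CSSE time series and build the processed CSVs.
maven.get("coronavirus/CSSE", data_directory="./data/")

# Load the country-level daily figures (see the data dictionary below for columns).
df = pd.read_csv("./data/coronavirus/CSSE/processed/CSSE_country.csv", parse_dates=["date"])
print(df.tail())
```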
4 | 5 | 6 | ## Sources 7 | We aim to source our data directly from the most authoritative data provider, falling back to less authoritative sources where a primary source isn't available. 8 | 9 | Global providers/aggregators: 10 | - [Johns Hopkins Center for Systems Science and Engineering](https://github.com/CSSEGISandData/COVID-19/). 11 | 12 | 13 | ## Data dictionaries 14 | 15 | #### **`coronavirus/CSSE`** 16 | 17 | ##### `CSSE_country_province.csv` 18 | | Column | Type | Description | Example | 19 | | -- | -- | -- | -- | 20 | | `date` | date | Date | `2020-03-13` | 21 | | `country_region` | str | Country/Region | `US` | 22 | | `province_state` | str | Province/State | `Washington` | 23 | | `lat` | float | Latitude | `47.4009` | 24 | | `lon` | float | Longitude | `-121.4905` | 25 | | `confirmed` | int | Confirmed cases | `568` | 26 | | `deaths` | int | Fatalities | `37` | 27 | | `recovered` | int | Recovered | `1` | 28 | 29 | ##### `CSSE_country.csv` 30 | | Column | Type | Description | Example | 31 | | -- | -- | -- | -- | 32 | | `date` | date | Date | `2020-03-13` | 33 | | `country_region` | str | Country/Region | `US` | 34 | | `confirmed` | int | Confirmed cases | `2179` | 35 | | `deaths` | int | Fatalities | `47` | 36 | | `recovered` | int | Recovered | `12` | 37 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as f: 4 | long_description = f.read() 5 | 6 | 7 | setuptools.setup( 8 | name="maven", 9 | version="0.1.0", 10 | description=( 11 | "Maven's goal is to reduce the time data scientists spend on data cleaning and preparation " 12 | "by providing easy access to open datasets in both raw and processed formats." 
13 | ), 14 | long_description=long_description, 15 | long_description_content_type="text/markdown", 16 | keywords="maven open data etl pipeline", 17 | author="John Sandall", 18 | author_email="contact@coefficient.ai", 19 | url="https://github.com/john-sandall/maven", 20 | packages=setuptools.find_packages(), 21 | include_package_data=True, 22 | install_requires=["pandas==1.0.0", "requests==2.22.0", "xlrd==1.2.0",], 23 | python_requires="==3.7.*", 24 | setup_requires=["pytest-runner"], 25 | test_suite="tests", 26 | tests_require=["pytest"], 27 | license="Apache 2.0", 28 | zip_safe=False, 29 | classifiers=[ 30 | "Development Status :: 2 - Pre-Alpha", 31 | "Programming Language :: Python", 32 | "Programming Language :: Python :: 3", 33 | "Programming Language :: Python :: 3.7", 34 | "License :: OSI Approved :: Apache Software License", 35 | "Operating System :: OS Independent", 36 | "Natural Language :: English", 37 | "Intended Audience :: Developers", 38 | ], 39 | ) 40 | -------------------------------------------------------------------------------- /tests/datasets/general_election/test_uk_results.py: -------------------------------------------------------------------------------- 1 | """ 2 | Running tests in development: 3 | $ cd /path/to/repo 4 | $ python -m pytest ./tests/datasets/general_election/test_uk_results.py 5 | 6 | Running tests against installed version (either `pip install .` or `pip install maven`): 7 | $ cd /path/to/repo 8 | $ pytest ./tests/datasets/general_election/test_uk_results.py 9 | """ 10 | 11 | from pathlib import Path 12 | 13 | import pandas as pd 14 | 15 | import maven 16 | 17 | 18 | def check_uk_hoc_results_data(identifier, processed_filename): 19 | maven.get(identifier, data_directory="./data/") 20 | df = pd.read_csv(Path("./data") / identifier / "processed" / processed_filename) 21 | assert df.shape == (8450, 11) 22 | assert df.columns.tolist() == [ 23 | "ons_id", 24 | "constituency", 25 | "county", 26 | "region", 27 | "country", 28 | "electorate", 29 | "total_votes", 30 | "turnout", 31 | "party", 32 | "votes", 33 | "voteshare", 34 | ] 35 | 36 | 37 | def test_uk_2010_results(): 38 | check_uk_hoc_results_data( 39 | identifier="general-election/UK/2010/results", processed_filename="general_election-uk-2010-results.csv" 40 | ) 41 | 42 | 43 | def test_uk_2015_results(): 44 | check_uk_hoc_results_data( 45 | identifier="general-election/UK/2015/results", processed_filename="general_election-uk-2015-results.csv" 46 | ) 47 | 48 | 49 | def test_uk_2017_results(): 50 | check_uk_hoc_results_data( 51 | identifier="general-election/UK/2017/results", processed_filename="general_election-uk-2017-results.csv" 52 | ) 53 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile dev-requirements.in 6 | # 7 | atomicwrites==1.3.0 8 | # via pytest 9 | attrs==19.3.0 10 | # via pytest 11 | backcall==0.1.0 12 | # via ipython 13 | click==7.0 14 | # via pip-tools 15 | decorator==4.4.1 16 | # via 17 | # ipython 18 | # traitlets 19 | importlib-metadata==0.23 20 | # via 21 | # pluggy 22 | # pytest 23 | ipython==7.16.3 24 | # via -r dev-requirements.in 25 | ipython-genutils==0.2.0 26 | # via traitlets 27 | jedi==0.15.1 28 | # via ipython 29 | more-itertools==7.2.0 30 | # via pytest 31 | packaging==19.2 32 | # via pytest 33 | parso==0.5.1 34 | # via jedi 35 | pexpect==4.7.0 36 | # via ipython 37 | 
pickleshare==0.7.5 38 | # via ipython 39 | pip-tools==4.2.0 40 | # via -r dev-requirements.in 41 | pluggy==0.13.0 42 | # via pytest 43 | prompt-toolkit==2.0.10 44 | # via ipython 45 | ptyprocess==0.6.0 46 | # via pexpect 47 | py==1.8.0 48 | # via pytest 49 | pygments==2.4.2 50 | # via ipython 51 | pyparsing==2.4.5 52 | # via packaging 53 | pytest==5.2.2 54 | # via -r dev-requirements.in 55 | six==1.12.0 56 | # via 57 | # -c requirements.txt 58 | # packaging 59 | # pip-tools 60 | # prompt-toolkit 61 | # traitlets 62 | traitlets==4.3.3 63 | # via ipython 64 | wcwidth==0.1.7 65 | # via 66 | # prompt-toolkit 67 | # pytest 68 | zipp==0.6.0 69 | # via importlib-metadata 70 | 71 | # The following packages are considered to be unsafe in a requirements file: 72 | # setuptools 73 | -------------------------------------------------------------------------------- /maven/datasets/general_election/uk_2017_results.py: -------------------------------------------------------------------------------- 1 | """ 2 | Results data for the United Kingdom's 2017 General Election. 3 | 4 | Usage: 5 | >>> import maven 6 | >>> maven.get('general-election/UK/2017/results', data_directory='./data/') 7 | 8 | 9 | Sources: 10 | - http://researchbriefings.files.parliament.uk/documents/CBP-8647/1918-2017election_results.csv 11 | - From https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647 12 | 13 | Other sources: 14 | - https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-7186 15 | - http://researchbriefings.files.parliament.uk/documents/CBP-7979/HoC-GE2017-constituency-results.csv 16 | """ 17 | 18 | from pathlib import Path 19 | 20 | from maven.datasets.general_election.base import UKResults 21 | 22 | 23 | class UK2017Results(UKResults): 24 | """Handles results data for the United Kingdom's 2017 General Election.""" 25 | 26 | def __init__(self, directory=Path("data/general-election/UK/2017/results")): 27 | super(UK2017Results, self).__init__(directory=directory) 28 | self.directory = Path(directory) 29 | self.sources = [ 30 | # url, filename, checksum 31 | ( 32 | "http://researchbriefings.files.parliament.uk/documents/CBP-8647/", 33 | "1918-2017election_results_by_pcon.xlsx", 34 | "a1e4628945574639b541b21bada2531c", 35 | ), 36 | ] 37 | self.target = ("general_election-uk-2017-results.csv", "c7e1fde647e55f9d4567cb81e62c782a") # filename, checksum 38 | self.verbose_name = "UK 2017 General Election results" 39 | self.year = "2017" 40 | -------------------------------------------------------------------------------- /maven/datasets/general_election/uk_2015_results.py: -------------------------------------------------------------------------------- 1 | """ 2 | Results data for the United Kingdom's 2015 General Election. 3 | 4 | Usage: 5 | >>> import maven 6 | >>> maven.get('general-election/UK/2015/results', data_directory='./data/') 7 | 8 | 9 | Sources: 10 | - http://researchbriefings.files.parliament.uk/documents/CBP-8647/1918-2017election_results.csv 11 | - From https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647 12 | 13 | Deprecated sources: 14 | - http://www.electoralcommission.org.uk/__data/assets/file/0004/191650/2015-UK-general-election-data-results-WEB.zip 15 | 16 | Notes: 17 | - 2015-UK-general-election-data-results-WEB.zip has a lot more detailed data. 
18 | """ 19 | 20 | from pathlib import Path 21 | 22 | from maven.datasets.general_election.base import UKResults 23 | 24 | 25 | class UK2015Results(UKResults): 26 | """Handles results data for the United Kingdom's 2015 General Election.""" 27 | 28 | def __init__(self, directory=Path("data/general-election/UK/2015/results")): 29 | super(UK2015Results, self).__init__(directory=directory) 30 | self.directory = Path(directory) 31 | self.sources = [ 32 | # url, filename, checksum 33 | ( 34 | "http://researchbriefings.files.parliament.uk/documents/CBP-8647/", 35 | "1918-2017election_results_by_pcon.xlsx", 36 | "a1e4628945574639b541b21bada2531c", 37 | ), 38 | ] 39 | self.target = ("general_election-uk-2015-results.csv", "9a785cb19275e4dbc79da67eece6067f") # filename, checksum 40 | self.verbose_name = "UK 2015 General Election results" 41 | self.year = "2015" 42 | -------------------------------------------------------------------------------- /maven/datasets/general_election/uk_2010_results.py: -------------------------------------------------------------------------------- 1 | """ 2 | Results data for the United Kingdom's 2010 General Election. 3 | 4 | Usage: 5 | >>> import maven 6 | >>> maven.get('general-election/UK/2010/results', data_directory='./data/') 7 | 8 | 9 | Sources: 10 | - http://researchbriefings.files.parliament.uk/documents/CBP-8647/1918-2017election_results.csv 11 | - From https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647 12 | 13 | Deprecated sources: 14 | - http://www.electoralcommission.org.uk/__data/assets/excel_doc/0003/105726/GE2010-results-flatfile-website.xls 15 | - https://s3-eu-west-1.amazonaws.com/sixfifty/GE2010-results-flatfile-website.xls 16 | 17 | Notes: 18 | - GE2010-results-flatfile-website.xls is currently the only known source with a full list of votes for ALL parties. 19 | """ 20 | 21 | from pathlib import Path 22 | 23 | from maven.datasets.general_election.base import UKResults 24 | 25 | 26 | class UK2010Results(UKResults): 27 | """Handles results data for the United Kingdom's 2010 General Election.""" 28 | 29 | def __init__(self, directory=Path("data/general-election/UK/2010/results")): 30 | super(UK2010Results, self).__init__(directory=directory) 31 | self.directory = Path(directory) 32 | self.sources = [ 33 | # url, filename, checksum 34 | ( 35 | "http://researchbriefings.files.parliament.uk/documents/CBP-8647/", 36 | "1918-2017election_results_by_pcon.xlsx", 37 | "a1e4628945574639b541b21bada2531c", 38 | ), 39 | ] 40 | self.target = ("general_election-uk-2010-results.csv", "954a0916f5ce791ca566484ce566088d") # filename, checksum 41 | self.verbose_name = "UK 2010 General Election results" 42 | self.year = "2010" 43 | -------------------------------------------------------------------------------- /maven/get.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main data getting functionality. Maps data identifiers to data pipeline classes. 3 | 4 | Example usage: 5 | > import maven 6 | > maven.get('general-election/UK/2015/results', data_directory='./data/') 7 | """ 8 | 9 | from pathlib import Path 10 | 11 | from .datasets import coronavirus, general_election 12 | 13 | 14 | def get(name, data_directory=Path("."), retrieve=True, process=True): 15 | """Core data getter function. 16 | 17 | Args: 18 | name (str): Name of dataset to retrieve/process. 19 | data_directory (str or pathlib.PosixPath): Path to directory where datasets will be saved (either as string 20 | a pathlib Path). 
21 | retrieve (bool): Toggle dataset retrieval. 22 | process (bool): Toggle dataset processing. 23 | 24 | Returns: Nothing (datasets are placed into current working directory). 25 | """ 26 | mapper = { 27 | "coronavirus/CSSE": coronavirus.CSSE, 28 | "general-election/UK/2010/results": general_election.UK2010Results, 29 | "general-election/UK/2015/model": general_election.UK2015Model, 30 | "general-election/UK/2015/results": general_election.UK2015Results, 31 | "general-election/UK/2017/model": general_election.UK2017Model, 32 | "general-election/UK/2017/results": general_election.UK2017Results, 33 | # "general-election/UK/2019/model": general_election.UK2019Model, 34 | "general-election/UK/polls": general_election.UKPolls, 35 | } 36 | if name not in mapper: 37 | raise KeyError(f"'{name}' not found in datasets.") 38 | 39 | if isinstance(data_directory, str): 40 | data_directory = Path(data_directory) 41 | pipeline = mapper[name](directory=(data_directory / name)) 42 | 43 | if retrieve: 44 | pipeline.retrieve() 45 | if process: 46 | pipeline.process() 47 | -------------------------------------------------------------------------------- /maven/datasets/general_election/uk_2019_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Model-ready dataset for the United Kingdom's 2019 General Election. 3 | 4 | Usage: 5 | > import maven 6 | > maven.get('general-election/UK/2019/model', data_directory='./data/') 7 | """ 8 | import os 9 | from pathlib import Path 10 | 11 | import pandas as pd 12 | 13 | from maven.datasets.general_election.base import UKModel 14 | 15 | 16 | class UK2019Model(UKModel): 17 | """Generates model-ready data for the United Kingdom's 2019 General Election.""" 18 | 19 | def __init__(self, directory=Path("data/general-election/UK/2019/model")): 20 | super(UK2019Model, self).__init__(directory=directory) # inherit base __init__ but override default directory 21 | self.sources = [ 22 | # tuples of (url, filename, checksum) 23 | ( 24 | "general-election/UK/2017/results", 25 | "general_election-uk-2017-results.csv", 26 | "c7e1fde647e55f9d4567cb81e62c782a", 27 | ), 28 | ("general-election/UK/polls", "general_election-uk-polls.csv", "cbc3c19a376b4ab632f122008f593799"), 29 | ("general-election/UK/polls", "general_election-london-polls.csv", "cd28ebb7233b808796535fc0b572304e"), 30 | ("general-election/UK/polls", "general_election-scotland-polls.csv", "6c2ba92e2325de0e22a208fb0b3e95fc"), 31 | ("general-election/UK/polls", "general_election-wales-polls.csv", "6857df3c18df525d5e59a6bf1170b10c"), 32 | ("general-election/UK/polls", "general_election-ni-polls.csv", "46bbe5e9dc29d4b3042837fe4c16ca07"), 33 | ] 34 | self.retrieve_all = True 35 | self.verbose_name = "UK2019Model" 36 | self.year = 2019 37 | self.last_date = pd.to_datetime("2017-06-08") 38 | self.now_date = pd.to_datetime("2019-12-12") 39 | self.last = self.last_date.year 40 | self.now = self.now_date.year 41 | -------------------------------------------------------------------------------- /maven/datasets/general_election/uk_2015_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Model-ready dataset for the United Kingdom's 2015 General Election. 
3 | 4 | Usage: 5 | > import maven 6 | > maven.get('general-election/UK/2015/model', data_directory='./data/') 7 | """ 8 | import os 9 | from pathlib import Path 10 | 11 | import pandas as pd 12 | 13 | from maven.datasets.general_election.base import UKModel 14 | 15 | 16 | class UK2015Model(UKModel): 17 | """Generates model-ready data for the United Kingdom's 2015 General Election.""" 18 | 19 | def __init__(self, directory=Path("data/general-election/UK/2015/model")): 20 | super(UK2015Model, self).__init__(directory=directory) # inherit base __init__ but override default directory 21 | self.sources = [ 22 | # tuples of (url, filename, checksum) 23 | ( 24 | "general-election/UK/2010/results", 25 | "general_election-uk-2010-results.csv", 26 | "954a0916f5ce791ca566484ce566088d", 27 | ), 28 | ( 29 | "general-election/UK/2015/results", 30 | "general_election-uk-2015-results.csv", 31 | "9a785cb19275e4dbc79da67eece6067f", 32 | ), 33 | ("general-election/UK/polls", "general_election-uk-polls.csv", "cbc3c19a376b4ab632f122008f593799"), 34 | ("general-election/UK/polls", "general_election-london-polls.csv", "cd28ebb7233b808796535fc0b572304e"), 35 | ("general-election/UK/polls", "general_election-scotland-polls.csv", "6c2ba92e2325de0e22a208fb0b3e95fc"), 36 | ("general-election/UK/polls", "general_election-wales-polls.csv", "6857df3c18df525d5e59a6bf1170b10c"), 37 | ("general-election/UK/polls", "general_election-ni-polls.csv", "46bbe5e9dc29d4b3042837fe4c16ca07"), 38 | ] 39 | self.retrieve_all = True 40 | self.verbose_name = "UK2015Model" 41 | self.year = 2015 42 | self.last_date = pd.to_datetime("2010-05-06") 43 | self.now_date = pd.to_datetime("2015-05-07") 44 | self.last = self.last_date.year 45 | self.now = self.now_date.year 46 | -------------------------------------------------------------------------------- /maven/datasets/general_election/uk_2017_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Model-ready dataset for the United Kingdom's 2017 General Election. 
3 | 4 | Usage: 5 | > import maven 6 | > maven.get('general-election/UK/2017/model', data_directory='./data/') 7 | """ 8 | import os 9 | from pathlib import Path 10 | 11 | import pandas as pd 12 | 13 | from maven.datasets.general_election.base import UKModel 14 | 15 | 16 | class UK2017Model(UKModel): 17 | """Generates model-ready data for the United Kingdom's 2017 General Election.""" 18 | 19 | def __init__(self, directory=Path("data/general-election/UK/2017/model")): 20 | super(UK2017Model, self).__init__(directory=directory) # inherit base __init__ but override default directory 21 | self.sources = [ 22 | # tuples of (url, filename, checksum) 23 | ( 24 | "general-election/UK/2015/results", 25 | "general_election-uk-2015-results.csv", 26 | "9a785cb19275e4dbc79da67eece6067f", 27 | ), 28 | ( 29 | "general-election/UK/2017/results", 30 | "general_election-uk-2017-results.csv", 31 | "c7e1fde647e55f9d4567cb81e62c782a", 32 | ), 33 | ("general-election/UK/polls", "general_election-uk-polls.csv", "cbc3c19a376b4ab632f122008f593799"), 34 | ("general-election/UK/polls", "general_election-london-polls.csv", "cd28ebb7233b808796535fc0b572304e"), 35 | ("general-election/UK/polls", "general_election-scotland-polls.csv", "6c2ba92e2325de0e22a208fb0b3e95fc"), 36 | ("general-election/UK/polls", "general_election-wales-polls.csv", "6857df3c18df525d5e59a6bf1170b10c"), 37 | ("general-election/UK/polls", "general_election-ni-polls.csv", "46bbe5e9dc29d4b3042837fe4c16ca07"), 38 | ] 39 | self.retrieve_all = True 40 | self.verbose_name = "UK2017Model" 41 | self.year = 2017 42 | self.last_date = pd.to_datetime("2015-05-07") 43 | self.now_date = pd.to_datetime("2017-06-08") 44 | self.last = self.last_date.year 45 | self.now = self.now_date.year 46 | -------------------------------------------------------------------------------- /tests/datasets/general_election/test_uk_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Running tests in development: 3 | $ cd /path/to/repo 4 | $ python -m pytest ./tests/datasets/general_election/test_uk_models.py 5 | 6 | Running tests against installed version (either `pip install .` or `pip install maven`): 7 | $ cd /path/to/repo 8 | $ pytest ./tests/datasets/general_election/test_uk_models.py 9 | """ 10 | 11 | from pathlib import Path 12 | 13 | import pandas as pd 14 | 15 | import maven 16 | 17 | 18 | def check_uk_model_output(identifier, output_file): 19 | maven.get(identifier, data_directory="./data/") 20 | df = pd.read_csv(Path("./data") / identifier / "processed" / output_file) 21 | geo_columns = [] 22 | target_columns = [] 23 | if "geo_polls_now" in df.columns: 24 | geo_columns += [ 25 | "geo_polls_now", 26 | "geo_voteshare_last", 27 | "geo_swing", 28 | "geo_swing_forecast", 29 | "geo_swing_winner", 30 | ] 31 | if "total_votes_now" in df.columns: 32 | target_columns += [ 33 | "total_votes_now", 34 | "turnout_now", 35 | "votes_now", 36 | "voteshare_now", 37 | "winner_now", 38 | ] 39 | column_list = ( 40 | [ 41 | "ons_id", 42 | "constituency", 43 | "county", 44 | "region", 45 | "geo", 46 | "country", 47 | "electorate", 48 | "total_votes_last", 49 | "turnout_last", 50 | "party", 51 | "votes_last", 52 | "voteshare_last", 53 | "winner_last", 54 | "won_here_last", 55 | "national_voteshare_last", 56 | "national_polls_now", 57 | "national_swing", 58 | "national_swing_forecast", 59 | "national_swing_winner", 60 | ] 61 | + geo_columns 62 | + target_columns 63 | ) 64 | assert df.shape == (7800, len(column_list)) 65 | assert 
df.columns.tolist() == column_list 66 | 67 | 68 | # TODO: Can't find general_election-london-polls.csv 69 | # def test_uk_2015_model(): 70 | # check_uk_model_output( 71 | # identifier="general-election/UK/2015/model", 72 | # output_file="general_election-uk-2015-model.csv", 73 | # ) 74 | 75 | 76 | # TODO: Can't find general_election-london-polls.csv 77 | # def test_uk_2017_model(): 78 | # check_uk_model_output( 79 | # identifier="general-election/UK/2017/model", 80 | # output_file="general_election-uk-2017-model.csv", 81 | # ) 82 | 83 | 84 | # TODO: Disable for now, investigate later 85 | # def test_uk_2019_model(): 86 | # check_uk_model_output( 87 | # identifier="general-election/UK/2019/model", 88 | # output_file="general_election-uk-2019-model.csv", 89 | # ) 90 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Running tests in development: 3 | $ cd /path/to/repo 4 | $ python -m pytest 5 | 6 | Running tests against installed version (either `pip install .` or `pip install maven`): 7 | $ cd /path/to/repo 8 | $ pytest 9 | """ 10 | import os 11 | from functools import partial 12 | from pathlib import Path 13 | 14 | import requests 15 | 16 | import pytest 17 | from maven import utils 18 | 19 | 20 | class MockResponse: 21 | """requests.get() returns an object of class Response. Let's mock that and add: 22 | - status_code attribute 23 | - content attribute 24 | """ 25 | 26 | status_code = 200 27 | content = b"some content" 28 | 29 | 30 | def test_sanitise(): 31 | assert utils.sanitise("Vote Count") == "vote_count" 32 | 33 | 34 | def test_calculate_md5_checksum(tmpdir): 35 | filepath = tmpdir / "file.txt" 36 | with open(filepath, "w") as f: 37 | f.write("some content") 38 | assert utils.calculate_md5_checksum(filename=filepath) == "9893532233caff98cd083a116b013c0b" 39 | 40 | 41 | def test_fetch_url(monkeypatch, tmpdir): 42 | """Ref: https://docs.pytest.org/en/latest/monkeypatch.html""" 43 | 44 | def mock_get(*args, **kwargs): 45 | return MockResponse() 46 | 47 | monkeypatch.setattr(requests, "get", mock_get) # replace requests.get() with our mock_get() 48 | utils.fetch_url(url="https://fakeurl", filename="fakefile.txt", target_dir=Path(tmpdir)) 49 | with open(tmpdir / "fakefile.txt", "rb") as f: 50 | assert f.read() == b"some content" 51 | 52 | 53 | def test_retrieve_from_cache_if_exists(tmpdir): 54 | def _create_file(target_dir): 55 | """Puts file.txt in the target_dir""" 56 | with open(target_dir / "file.txt", "w") as f: 57 | f.write("some content") 58 | 59 | # Put it there for now. 
60 | _create_file(target_dir=tmpdir) 61 | 62 | # Test basic usage 63 | utils.retrieve_from_cache_if_exists( 64 | filename="file.txt", 65 | target_dir=Path(tmpdir), 66 | processing_fn=None, 67 | md5_checksum=None, 68 | caching_enabled=True, 69 | verbose=False, 70 | ) 71 | # Test incorrect MD5 72 | with pytest.warns(UserWarning): 73 | utils.retrieve_from_cache_if_exists( 74 | filename="file.txt", 75 | target_dir=Path(tmpdir), 76 | processing_fn=None, 77 | md5_checksum="badchecksum", 78 | caching_enabled=True, 79 | verbose=True, 80 | ) 81 | # Remove file & put it there via processing_fn 82 | os.remove(tmpdir / "file.txt") 83 | utils.retrieve_from_cache_if_exists( 84 | filename="file.txt", 85 | target_dir=Path(tmpdir), 86 | processing_fn=partial(_create_file, target_dir=tmpdir), 87 | md5_checksum=None, 88 | caching_enabled=True, 89 | verbose=True, 90 | ) 91 | -------------------------------------------------------------------------------- /maven/datasets/coronavirus/csse.py: -------------------------------------------------------------------------------- 1 | """ 2 | Coronavirus CSSE data from https://github.com/CSSEGISandData/COVID-19/ 3 | 4 | Usage: 5 | >>> import maven 6 | >>> maven.get('coronavirus/CSSE', data_directory='./data/') 7 | 8 | 9 | Sources: 10 | - https://github.com/CSSEGISandData/COVID-19/ 11 | """ 12 | import os 13 | from pathlib import Path 14 | 15 | import pandas as pd 16 | 17 | from maven import utils 18 | 19 | 20 | class CSSE(utils.Pipeline): 21 | """Handle CSSE data from https://github.com/CSSEGISandData/COVID-19/""" 22 | 23 | def __init__(self, directory=Path("data/coronavirus/CSSE")): 24 | # inherit base __init__ but override default directory 25 | super(CSSE, self).__init__(directory=directory) 26 | # Source & targets 27 | base_url = ( 28 | "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/" 29 | "csse_covid_19_data/csse_covid_19_time_series/" 30 | ) 31 | self.sources = [ 32 | # url, filename, checksum 33 | (base_url, "time_series_19-covid-Confirmed.csv", "09b6dfc1ee244ba652b8639f0aa2f093"), 34 | (base_url, "time_series_19-covid-Deaths.csv", "69a9dfa8a901c8f0bbe0f6499db8641c"), 35 | (base_url, "time_series_19-covid-Recovered.csv", "4d1c1d4f1c45514e3562cb42ef2729c7"), 36 | ] 37 | self.targets = [ 38 | # filename, checksum( 39 | ("CSSE_country_province.csv", "bfce6bf16571fbb3004f9e5eee7b9e30"), 40 | ("CSSE_country.csv", "b5b3ed6fc75f323593fd7710a4262e1b"), 41 | ] 42 | # Config 43 | self.rename_source = False 44 | self.retrieve_all = True 45 | self.cache = True 46 | self.verbose = False 47 | self.verbose_name = "CSSE" 48 | 49 | def process(self): 50 | """Process CSSE data.""" 51 | target_dir = self.directory / "processed" 52 | os.makedirs(target_dir, exist_ok=True) # create directory if it doesn't exist 53 | 54 | def process_and_export(): 55 | """Either caching disabled or file not yet processed; process regardless.""" 56 | data = {} 57 | for metric in ["Confirmed", "Deaths", "Recovered"]: 58 | df = pd.read_csv(self.directory / "raw" / f"time_series_19-covid-{metric}.csv") 59 | # Pivot all to long 60 | id_vars = ["Province/State", "Country/Region", "Lat", "Long"] 61 | value_vars = list(set(df.columns) - set(id_vars)) 62 | df = df.melt( 63 | id_vars=id_vars, value_vars=value_vars, var_name="date", value_name=metric 64 | ) 65 | df["date"] = pd.to_datetime(df.date, format="%m/%d/%y") 66 | data[metric] = df.copy() 67 | 68 | # Merge together 69 | df_country_province = pd.merge( 70 | data["Confirmed"], 71 | data["Deaths"], 72 | how="outer", 73 | 
on=["Province/State", "Country/Region", "Lat", "Long", "date"], 74 | ).merge( 75 | data["Recovered"], 76 | how="outer", 77 | on=["Province/State", "Country/Region", "Lat", "Long", "date"], 78 | ) 79 | 80 | # Clean 81 | df_country_province.columns = utils.sanitise( 82 | df_country_province.columns, replace={"long": "lon"} 83 | ) 84 | df_country_province = df_country_province[ 85 | [ 86 | "date", 87 | "country_region", 88 | "province_state", 89 | "lat", 90 | "lon", 91 | "confirmed", 92 | "deaths", 93 | "recovered", 94 | ] 95 | ].sort_values(["date", "country_region", "province_state"]) 96 | 97 | # Country-level data 98 | df_country = ( 99 | df_country_province.groupby(["date", "country_region"])[ 100 | ["confirmed", "deaths", "recovered"] 101 | ] 102 | .sum() 103 | .reset_index() 104 | ) 105 | 106 | # Export 107 | print(f"Exporting dataset to {target_dir.resolve()}") 108 | df_country_province.to_csv(target_dir / "CSSE_country_province.csv", index=False) 109 | df_country.to_csv(target_dir / "CSSE_country.csv", index=False) 110 | 111 | for filename, checksum in self.targets: 112 | utils.retrieve_from_cache_if_exists( 113 | filename=filename, 114 | target_dir=target_dir, 115 | processing_fn=process_and_export, 116 | md5_checksum=checksum, 117 | caching_enabled=self.cache, 118 | verbose=self.verbose, 119 | ) 120 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Maven 2 | > /meɪvən/ – a trusted expert who seeks to pass timely and relevant knowledge on to others. 3 | 4 | Maven's goal is to reduce the time data scientists spend on data cleaning and preparation by providing easy access to open datasets in both raw and processed formats. 5 | 6 | Maven was built to: 7 | 8 | - **Improve availability and integrity of open data** by eliminating data issues, adding common identifiers, and reshaping data to become model-ready. 9 | - **Source data in its rawest form** from the most authoritative data provider available with all transformations available as open source code to enhance integrity and trust. 10 | - **Honour data licences wherever possible** whilst avoiding potential issues relating to re-distribution of data (especially open datasets where no clear licence is provided) by performing all data retrieval and processing on-device. 11 | 12 | 13 | ## Install 14 | ``` 15 | pip install maven 16 | ``` 17 | 18 | 19 | ## Usage 20 | ```python 21 | import maven 22 | maven.get('general-election/UK/2017/results', data_directory='./data/') 23 | ``` 24 | 25 | 26 | ## Datasets 27 | Data dictionaries for all datasets are available by clicking on the dataset's name. 
28 | 29 | | Dataset | Description | Date | Source | Licence | 30 | | -- | -- | -- | -- | -- | 31 | | **Coronavirus Datasets** | 32 | | [**`coronavirus/CSSE`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/coronavirus) | Daily CSSE cases/deaths/recovered by country/region/state | Updated daily | [Johns Hopkins Center for Systems Science and Engineering](https://github.com/CSSEGISandData/COVID-19/) | [See "Terms of Use" on CSSE repo](https://github.com/CSSEGISandData/COVID-19/) | 33 | | **UK Political Datasets** | 34 | | [**`general-election/UK/2010/results`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/general_election) | UK 2010 General Election results | 6th May 2010 | [House of Commons Library](https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647) | [Open Parliament Licence v3.0](https://www.parliament.uk/site-information/copyright-parliament/open-parliament-licence/) | 35 | | [**`general-election/UK/2015/results`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/general_election) | UK 2015 General Election results | 7th May 2015 | [House of Commons Library](https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647) | [Open Parliament Licence v3.0](https://www.parliament.uk/site-information/copyright-parliament/open-parliament-licence/) | 36 | | [**`general-election/UK/2017/results`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/general_election) | UK 2017 General Election results | 8th June 2017 | [House of Commons Library](https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647) | [Open Parliament Licence v3.0](https://www.parliament.uk/site-information/copyright-parliament/open-parliament-licence/) | 37 | | [**`general-election/UK/2015/model`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/general_election) | Model-ready datasets for forecasting the 2015 UK General Election | 2010 & 2015 data | [uk_2015_model.py](https://github.com/john-sandall/maven/blob/master/maven/datasets/general_election/uk_2015_model.py) | Mixed | 38 | | [**`general-election/UK/2017/model`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/general_election) | Model-ready datasets for forecasting the 2017 UK General Election | 2015 & 2017 data | [uk_2017_model.py](https://github.com/john-sandall/maven/blob/master/maven/datasets/general_election/uk_2017_model.py) | Mixed | 39 | | [**`general-election/UK/polls`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/general_election) | UK General Election opinion polling | May 2005 - June 2017 | [SixFifty](https://github.com/six50/pipeline/tree/master/data/polls/) | Unknown | 40 | 41 | 42 | 43 | ## Running tests 44 | To run tests against an installed version (either `pip install .` or `pip install maven`): 45 | ``` 46 | $ cd /path/to/repo 47 | $ pytest 48 | ``` 49 | 50 | To run tests whilst in development: 51 | ``` 52 | $ cd /path/to/repo 53 | $ python -m pytest 54 | ``` 55 | 56 | 57 | ## Licences 58 | | Name | Description | Attribution Statement | 59 | | -- | -- | -- | 60 | | [Open Parliament Licence](http://www.parliament.uk/site-information/copyright/open-parliament-licence/) | Free to copy, publish, distribute, transmit, adapt and exploit commercially or non-commercially. See URL for full details. | Contains Parliamentary information licensed under the Open Parliament Licence v3.0. 
| 61 | | [Open Government Licence](http://www.nationalarchives.gov.uk/doc/open-government-licence/version/2/) | Free to copy, publish, distribute, transmit, adapt and exploit commercially and non-commercially. See URL for full details. | Contains public sector information licensed under the Open Government Licence v2.0. | 62 | 63 | 64 | ## Contributing 65 | Maven was designed for your contributions! 66 | 67 | 1. Check for open issues or open a fresh issue to start a discussion around your idea or a bug. 68 | 2. Fork [the repository](https://github.com/john-sandall/maven) on GitHub to start making your changes to the master branch (or branch off of it). 69 | 3. For new datasets ensure the processed dataset is fully documented with a data dictionary. For new features and bugs, please write a test which shows that the bug was fixed or that the feature works as expected. 70 | 4. Send a [pull request](https://help.github.com/en/articles/creating-a-pull-request-from-a-fork) and bug the maintainer until it gets merged and published. 😄 71 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | 8 | ## [Unreleased] 9 | 10 | ## [0.1.0] - 2020-02-03 11 | ### Changed 12 | - Model-ready datasets can now be "prediction-only" (i.e. for use pre-election when we don't know results). 13 | - Model-ready datasets include UKIP and BXP as part of "Other" until a better solution can be found. 14 | - Various changes to enable a better regional UNS forecast: 15 | - better handling of NI parties; 16 | - regional poll-of-polls goes back a month to incorporate large sample regional polling and not just sub-samples; 17 | - MRP sample sizes are disregarded for weighted poll-of-polls; 18 | - missing sample sizes (such as for polls derived from PollBase) are imputed using mean sample size within the same region; 19 | ### Added 20 | - Merged SixFifty UK polling data (detailed inc. sample sizes) up to June 2017 with Mark Pack's PollBase which has less columns but all polls up to Dec 2019. 21 | - Incorporated regional polling & regional sub-samples for December 2019 from SixFifty. 22 | - `general-election/UK/2019/model`: added model-ready dataset including UNS and regional UNS forecasts for the 2019 UK General Election. 23 | 24 | ## [0.0.12] - 2020-02-03 25 | ### Changed 26 | - `general-election/UK/2015/model`: model-ready dataset for just the 2015 UK General Election. 27 | ### Added 28 | - `general-election/UK/2017/model`: model-ready dataset for the 2017 UK General Election. 29 | 30 | ## [0.0.11] - 2020-02-02 31 | ### Added 32 | - Updated & refactored polling pipeline code. 33 | - Updated & refactored pipeline for building model-ready datasets for 2015/2017 UK general elections. 34 | 35 | ## [0.0.10] - 2020-01-26 36 | ### Added 37 | - Raw datasets are now cached on download, and processed datasets cached after processing, and always checked against MD5 for integrity. 38 | - Tests now exist for utils.py 39 | 40 | ## [0.0.9] - 2020-01-26 41 | ### Added 42 | - UK 2017 General Election dataset (**`general-election/UK/2017/results`**). 43 | - Some tests (that really need caching!). 
44 | ### Changed 45 | - Now using data from the [House of Commons Library](https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647). 46 | - The basic processed election results are now "long form" with less but more standardised information. 47 | - The full election results are (for now) no longer provided. 48 | - Lots of refactoring with some new base classes & utils making it faster to add new datasets. 49 | 50 | ## [0.0.8] - 2019-11-14 51 | ### Fixes 52 | - Electoral Commission [no longer hosts 2010 GE results](https://github.com/john-sandall/maven/pull/15) so use our fallback until a new primary can be found. 53 | - Fixed URL to EC's 2015 GE results. 54 | 55 | ## [0.0.7] - 2019-11-14 56 | ### Added 57 | - Tests added to setup.py. 58 | ### Changed 59 | - Switched to using [pip-tools](https://github.com/jazzband/pip-tools) instead of Pipenv for generating requirements.txt & locking dependencies. 60 | 61 | ## [0.0.6] - 2019-07-13 62 | ### Added 63 | - `general-election/UK/2015/model`: model-ready datasets for the 2015/2017 UK General Elections. 64 | - Regional polling datasets. 65 | 66 | ## [0.0.5] - 2019-07-13 67 | ### Added 68 | - Basic tests for `get.py` 69 | - Additional processing for the GE2015 results pipeline to generate a more useful dataset for common election modelling tasks. 70 | - Added `general-election/UK/2010/results` dataset. 71 | ### Changed 72 | - API design for dataset identifiers to use dash/slash instead of underscore/dash and capitalised country codes to make it clearer these will be ISO 3166 Alpha-2 codes, e.g. `general_election-gb-2015-results` -> `general-election/GB/2015/results`. 73 | - Changed GB to UK everywhere as these results are full UK results including Northern Ireland. 74 | 75 | ## [0.0.4] - 2019-07-07 76 | ### Fixes 77 | - Fixed relative imports and switch to using a class for each dataset. 78 | 79 | ## [0.0.3] - 2019-07-07 80 | ### Added 81 | - Improved README. 82 | 83 | ## [0.0.2] - 2019-07-07 84 | ### Added 85 | - UK 2015 General Election dataset (**`general_election-gb-2015-results`**). 86 | - Proper README plus data dictionary. 87 | - MANIFEST.in plus additional packaging info and this changelog. 
88 | 89 | ## [0.0.1] - 2019-07-07 90 | ### Added 91 | - Barebones functionality, Python package requirements (setup.py, Pipfile, .gitignore, LICENSE) 92 | 93 | 94 | [Unreleased]: https://github.com/john-sandall/maven/compare/v0.1.0...HEAD 95 | [0.1.0]: https://github.com/john-sandall/maven/compare/v0.0.12...v0.1.0 96 | [0.0.12]: https://github.com/john-sandall/maven/compare/v0.0.11...v0.0.12 97 | [0.0.11]: https://github.com/john-sandall/maven/compare/v0.0.10...v0.0.11 98 | [0.0.10]: https://github.com/john-sandall/maven/compare/v0.0.9...v0.0.10 99 | [0.0.9]: https://github.com/john-sandall/maven/compare/v0.0.8...v0.0.9 100 | [0.0.8]: https://github.com/john-sandall/maven/compare/v0.0.7...v0.0.8 101 | [0.0.7]: https://github.com/john-sandall/maven/compare/v0.0.6...v0.0.7 102 | [0.0.6]: https://github.com/john-sandall/maven/compare/v0.0.5...v0.0.6 103 | [0.0.5]: https://github.com/john-sandall/maven/compare/v0.0.4...v0.0.5 104 | [0.0.4]: https://github.com/john-sandall/maven/compare/v0.0.3...v0.0.4 105 | [0.0.3]: https://github.com/john-sandall/maven/compare/v0.0.2...v0.0.3 106 | [0.0.2]: https://github.com/john-sandall/maven/compare/v0.0.1...v0.0.2 107 | [0.0.1]: https://github.com/john-sandall/maven/releases/tag/v0.0.1 108 | -------------------------------------------------------------------------------- /maven/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various helper functions. 3 | """ 4 | import hashlib 5 | import os 6 | import shutil 7 | import warnings 8 | from functools import partial 9 | from pathlib import Path 10 | from urllib.parse import urlparse 11 | 12 | import pandas as pd 13 | import requests 14 | 15 | import maven 16 | 17 | ######### 18 | # GENERAL 19 | ######### 20 | 21 | 22 | def sanitise(x, replace=None): 23 | if isinstance(x, str): 24 | out = x.lower().replace(" ", "_").replace("/", "_") 25 | if replace and out in replace: 26 | out = replace[out] 27 | return out 28 | elif isinstance(x, (list, pd.core.indexes.base.Index, pd.core.series.Series)): 29 | return [sanitise(element, replace=replace) for element in x] 30 | else: 31 | raise TypeError(f"Unexpected type encountered in sanitise: type(x) == '{type(x)}'") 32 | 33 | 34 | def calculate_md5_checksum(filename): 35 | """ 36 | Calculate the checksum of the file, exactly same as md5-sum linux util. 
37 | Code from https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/downloader.py 38 | """ 39 | hash_md5 = hashlib.md5() 40 | with open(filename, "rb") as f: 41 | for chunk in iter(lambda: f.read(4096), b""): 42 | hash_md5.update(chunk) 43 | return hash_md5.hexdigest() 44 | 45 | 46 | def is_url(url): 47 | """Source: https://stackoverflow.com/a/52455972""" 48 | try: 49 | result = urlparse(url) 50 | return all([result.scheme, result.netloc]) 51 | except ValueError: 52 | return False 53 | 54 | 55 | def fetch_url(url, filename, target_dir, rename_file=False): 56 | """Download filename from url into target_dir.""" 57 | if rename_file: 58 | url_to_retrieve = url 59 | else: 60 | url_to_retrieve = url + filename 61 | response = requests.get(url_to_retrieve) 62 | if response.status_code != 200: 63 | warnings.warn( 64 | f"Received status {response.status_code} when trying to retrieve {url}{filename}" 65 | ) 66 | # Save to file 67 | with open(target_dir / filename, "wb") as f: 68 | f.write(response.content) 69 | print(f"Successfully downloaded {filename} into {target_dir.resolve()}") 70 | return target_dir / filename 71 | 72 | 73 | def get_and_copy(identifier, filename, target_dir): 74 | """Run maven.get(identifier) and copy filename from identifier/processed/ data 75 | into target/ directory. 76 | """ 77 | # target_dir by default is data/general-election/UK/2015/model 78 | subdirectories_below = str(target_dir).count("/") 79 | go_up = "/".join([".." for _ in range(subdirectories_below)]) 80 | data_directory = (target_dir / go_up).resolve() # sensible guess? 81 | maven.get(identifier, data_directory=data_directory) 82 | source = data_directory / identifier / "processed" 83 | print(f"Copying {filename} from {source} -> {target_dir}.") 84 | shutil.copyfile(src=source / filename, dst=target_dir / filename) 85 | 86 | 87 | def retrieve_from_cache_if_exists( 88 | filename, target_dir, processing_fn, md5_checksum=None, caching_enabled=True, verbose=False 89 | ): 90 | """Retrieve filename from target_dir if it exists, otherwise execute processing_fn. 91 | 92 | Raises a warning if the retrieved/processed file's checksum doesn't match the expected MD5. 93 | """ 94 | if caching_enabled and (target_dir / filename).exists(): 95 | # Check if it's already in target_dir. 96 | print(f"Cached file {filename} is already in {target_dir.resolve()}") 97 | else: 98 | # Either caching disabled or file not there yet. 99 | processing_fn() 100 | 101 | # File should now be there. Let's check checksums. 102 | downloaded_file_md5_checksum = calculate_md5_checksum(target_dir / filename) 103 | if verbose: 104 | print(f"Checksum for {filename}: {downloaded_file_md5_checksum}") 105 | if md5_checksum and downloaded_file_md5_checksum != md5_checksum: 106 | warnings.warn(f"MD5 checksum doesn't match for {filename}") 107 | 108 | 109 | ################## 110 | # PIPELINE CLASSES 111 | ################## 112 | 113 | 114 | class Pipeline: 115 | """Generic class for retrieving & processing datasets with built-in caching & MD5 checking.""" 116 | 117 | def __init__(self, directory): 118 | self.directory = Path(directory) 119 | self.sources = [] # tuples of (url, filename, checksum) 120 | self.rename_source = False 121 | self.retrieve_all = False 122 | self.target = (None, None) 123 | self.verbose_name = "" 124 | self.year = None 125 | self.verbose = False 126 | self.cache = True 127 | 128 | def retrieve(self): 129 | """ 130 | Retrieve data from self.sources into self.directory / 'raw' and validate against checksum. 
131 | """ 132 | target_dir = self.directory / "raw" 133 | os.makedirs(target_dir, exist_ok=True) # create directory if it doesn't exist 134 | for url, filename, md5_checksum in self.sources: 135 | if is_url(url): 136 | processing_fn = partial( 137 | fetch_url, 138 | url=url, 139 | filename=filename, 140 | target_dir=target_dir, 141 | rename_file=self.rename_source, 142 | ) 143 | else: 144 | processing_fn = partial( 145 | get_and_copy, identifier=url, filename=filename, target_dir=target_dir 146 | ) 147 | retrieve_from_cache_if_exists( 148 | filename=filename, 149 | target_dir=target_dir, 150 | processing_fn=processing_fn, 151 | md5_checksum=md5_checksum, 152 | caching_enabled=self.cache, 153 | verbose=self.verbose, 154 | ) 155 | if not self.retrieve_all: # retrieve just the first dataset 156 | return 157 | if self.retrieve_all: # all datasets retrieved 158 | return 159 | else: # retrieving first dataset only but all fallbacks failed 160 | raise RuntimeError(f"Unable to download {self.verbose_name} data.") 161 | 162 | def process(self): 163 | pass 164 | -------------------------------------------------------------------------------- /maven/datasets/general_election/uk_polls.py: -------------------------------------------------------------------------------- 1 | """ 2 | General Election polling data for the United Kingdom. 3 | 4 | Usage: 5 | > import maven 6 | > maven.get('general-election/UK/polls', data_directory='./data/') 7 | 8 | Sources: 9 | - SixFifty polling data: https://github.com/six50/pipeline/tree/master/data/polls/ 10 | - https://s3-eu-west-1.amazonaws.com/sixfifty/polls.csv 11 | - https://s3-eu-west-1.amazonaws.com/sixfifty/polls_london.csv 12 | - https://s3-eu-west-1.amazonaws.com/sixfifty/polls_scotland.csv 13 | - https://s3-eu-west-1.amazonaws.com/sixfifty/polls_wales.csv 14 | - https://s3-eu-west-1.amazonaws.com/sixfifty/polls_ni.csv 15 | - PollBase: https://www.markpack.org.uk/opinion-polls/ 16 | """ 17 | import os 18 | from pathlib import Path 19 | 20 | import numpy as np 21 | import pandas as pd 22 | 23 | from maven import utils 24 | from maven.datasets.general_election.base import Pipeline 25 | 26 | 27 | class UKPolls(Pipeline): 28 | """Handles General Election polling data for the United Kingdom. 
29 | 30 | Mark Pack's PollBase : https://www.markpack.org.uk/opinion-polls/ 31 | """ 32 | 33 | def __init__(self, directory=Path("data/general-election/UK/polls")): 34 | super(UKPolls, self).__init__( 35 | directory=directory 36 | ) # inherit base __init__ but override default directory 37 | self.sources = [ 38 | # tuples of (url, filename, checksum) 39 | ( 40 | "https://3859gp38qzh51h504x6gvv0o-wpengine.netdna-ssl.com/files/2020/01/", 41 | "PollBase-Q4-2019.xlsx", 42 | "81e9dd972f17d0b4f572e7da6c4c497f", 43 | ), 44 | ( 45 | "https://s3-eu-west-1.amazonaws.com/sixfifty/", 46 | "polls.csv", 47 | "8c32b623346c8c0faa603bc76c4d7fd1", 48 | ), 49 | ( 50 | "https://s3-eu-west-1.amazonaws.com/sixfifty/", 51 | "polls_london.csv", 52 | "cd28ebb7233b808796535fc0b572304e", 53 | ), 54 | ( 55 | "https://s3-eu-west-1.amazonaws.com/sixfifty/", 56 | "polls_scotland.csv", 57 | "6c2ba92e2325de0e22a208fb0b3e95fc", 58 | ), 59 | ( 60 | "https://s3-eu-west-1.amazonaws.com/sixfifty/", 61 | "polls_wales.csv", 62 | "6857df3c18df525d5e59a6bf1170b10c", 63 | ), 64 | ( 65 | "https://s3-eu-west-1.amazonaws.com/sixfifty/", 66 | "polls_ni.csv", 67 | "46bbe5e9dc29d4b3042837fe4c16ca07", 68 | ), 69 | ] 70 | self.retrieve_all = True 71 | self.target = ( 72 | "general_election-uk-polls.csv", 73 | "cbc3c19a376b4ab632f122008f593799", 74 | ) # filename, checksum 75 | self.verbose_name = "UKPolls" 76 | 77 | def process(self): 78 | """Process UK polling data.""" 79 | filename = self.sources[0][1] 80 | processed_results_location = self.directory / "processed" / self.target[0] 81 | os.makedirs( 82 | self.directory / "processed", exist_ok=True 83 | ) # create directory if it doesn't exist 84 | 85 | def process_and_export(): 86 | # Read in PollBase 87 | df = pd.read_excel( 88 | self.directory / "raw" / filename, 89 | sheet_name="17-19", 90 | usecols="A:C,G:H,I,K,M,O,Q,S,U,Y", 91 | ) 92 | 93 | # Clean it up 94 | df.columns = utils.sanitise( 95 | df.columns, 96 | replace={ 97 | "polling": "company", 98 | "publisher": "client", 99 | "unnamed:_24": "method", 100 | "green": "grn", 101 | "tig_cuk": "chuk", 102 | }, 103 | ) 104 | df["year"] = df.year.replace({"?": 2019}).ffill().astype(int) 105 | df["month"] = df.month.ffill() 106 | df = df[df["fieldwork"].notnull()].copy() 107 | df["day_from"] = df.fieldwork.apply( 108 | lambda x: str(x).split("-")[0].replace("?", "") 109 | if "-" in str(x) 110 | else str(x).replace("?", "") 111 | ) 112 | df["day_to"] = df.fieldwork.apply( 113 | lambda x: str(x).split("-")[1].replace("?", "") 114 | if "-" in str(x) 115 | else str(x).replace("?", "") 116 | ) 117 | df["from"] = pd.to_datetime( 118 | df.apply(lambda row: f"{row.year}-{row.month}-{row.day_from}", axis=1) 119 | ) 120 | df["to"] = pd.to_datetime( 121 | df.apply(lambda row: f"{row.year}-{row.month}-{row.day_to}", axis=1) 122 | ) 123 | 124 | # Fix month & year in df['to'] where e.g. 
fieldwork is "30-3 Jan" 125 | month_shifted = ( 126 | df.year.astype(str) 127 | + "-" 128 | + ((df.to.dt.month + 1) % 12).astype(str).replace("0", "12") 129 | + "-" 130 | + df.day_to.astype(str) 131 | ) 132 | year_needs_shifting = month_shifted.apply(lambda x: str(x).split("-")[1]) == "1" 133 | month_shifted.loc[year_needs_shifting] = ( 134 | ((df.loc[year_needs_shifting, "year"]).astype(int) + 1) 135 | .astype(str) 136 | .replace("0", "12") 137 | + "-" 138 | + ((df.to.dt.month + 1) % 12).astype(str) 139 | + "-" 140 | + df.day_to.astype(str) 141 | ) 142 | df.loc[df["from"] > df["to"], "to"] = month_shifted.loc[df["from"] > df["to"]] 143 | df["to"] = pd.to_datetime(df.to) 144 | 145 | # Divide numbers by 100 146 | for party in ["con", "lab", "ld", "ukip", "grn", "chuk", "bxp"]: 147 | df[party] = df[party].replace(" ", np.nan).astype(float) / 100 148 | 149 | # Prepare for merge with SixFifty data 150 | df["sample_size"] = np.nan 151 | df["snp"] = np.nan 152 | df["pdf"] = np.nan 153 | columns = [ 154 | "company", 155 | "client", 156 | "method", 157 | "from", 158 | "to", 159 | "sample_size", 160 | "con", 161 | "lab", 162 | "ld", 163 | "ukip", 164 | "grn", 165 | "chuk", 166 | "bxp", 167 | "snp", 168 | "pdf", 169 | ] 170 | df = df[columns].copy().sort_values("to") 171 | 172 | # Read in SixFifty polling data (2005 -> June 2017) 173 | df_sixfifty = pd.read_csv( 174 | self.directory / "raw" / "polls.csv", parse_dates=["from", "to"] 175 | ) 176 | df_sixfifty["chuk"] = np.nan 177 | df_sixfifty["bxp"] = np.nan 178 | df_sixfifty = df_sixfifty[columns].copy().sort_values("to") 179 | 180 | # Merge 181 | df_sixfifty = df_sixfifty[df_sixfifty.to < df.to.min()].copy() 182 | assert df_sixfifty.to.max() < df.to.min() 183 | df_polls = pd.concat([df_sixfifty, df], axis=0) 184 | 185 | # Export 186 | print(f"Exporting dataset to {processed_results_location.resolve()}") 187 | df_polls.to_csv(processed_results_location, index=False) 188 | 189 | utils.retrieve_from_cache_if_exists( 190 | filename=self.target[0], 191 | target_dir=(self.directory / "processed"), 192 | processing_fn=process_and_export, 193 | md5_checksum=self.target[1], 194 | caching_enabled=self.cache, 195 | verbose=self.verbose, 196 | ) 197 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /maven/datasets/general_election/README.md: -------------------------------------------------------------------------------- 1 | # General Election datasets 2 | 3 | If you have any questions about these datasets please [contact me @John_Sandall](https://twitter.com/John_Sandall) on Twitter. 4 | 5 | 6 | ## Sources 7 | We aim to source our data directly from the most authoritative data provider, falling back to less authoritative sources where a primary source isn't available. By country: 8 | - **United Kingdom:** [House of Commons Library](https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647). 9 | 10 | 11 | ## Data dictionaries 12 | 13 | #### **`general-election/UK/2015/model`** 14 | | Column | Type | Description | Example | 15 | | -- | -- | -- | -- | 16 | | Constituency-level factors | 17 | | `ons_id` | str | ONS constituency identifier | `E14000530` | 18 | | `constituency` | str | Constituency name | `ALDERSHOT` | 19 | | `county` | str | County:{`Avon`, `Bedfordshire`, and 44 more} | `Hampshire` | 20 | | `region` | str | Region:{`East Midlands`, `Eastern`, `London`, `North East`, `North West`, `Northern Ireland`, `Scotland`, `South East`, `South West`, `Wales`, `West Midlands`, `Yorkshire and The Humber`} | `South East` | 21 | | `geo` | str | Geographic region (aggregated level between region and country) | `england_not_london` | 22 | | `country`| str | Country:{`England`, `Northern Ireland`, `Scotland`, `Wales`} | `England` | 23 | | 2010 election data | 24 | | `electorate` | int | Electorate | `72430` | 25 | | `total_votes_last` | int | Total valid votes counted in this constituency in the 2010 election | `45384` | 26 | | `turnout_last` | float | Turnout in this constituency in the 2010 election | `0.635052123` | 27 | | `party` | str | Party:{`apni`, `con`, `dup`, `grn`, `lab`, `ld`, `other`, `pc`, `sdlp`, `sf`, `snp`, `ukip`, `uup`} | `con` | 28 | | `votes_last` | int | Votes counted for this party in this constituency in 2010 | `21203` | 29 | | `voteshare_last` | float | Percentage voteshare for this party in this constituency in 2010 | `0.467191` | 30 | | `winner_last` | str | Party that won in this constituency in 2010 | `con` | 31 | | `won_here_last` | bool | Did this party win in this constituency in 2010 | `True` | 32 | | `national_voteshare_last` | float | Percentage of national voteshare for this party from 2010 results | `0.360542872` | 33 | | 2015 pre-election data | 34 | | `national_polls_now` | float | Percentage of national voteshare for this party from 2015 pre-election polling | `0.338181818` | 35 | | `national_swing` | float | Uplift in national voteshare for this party between 2010 results and 2015 polling | `-0.062020512` | 36 | | `national_swing_forecast` | float | Projected voteshare for this party in this constituency using a UNS model | `0.438215651` | 37 | | `national_swing_winner` | str | Projected winner in this constituency using `national_swing_forecast` | `con` | 38 | | 2015 post-election data | 39 | | `total_votes_now` | int | Total valid votes counted in this constituency in the 2015 election | `46191` | 40 | | `turnout_now` | float | Turnout in this constituency in 2015 election | `0.637732984` | 41 | | `votes_now` | int | Total votes counted for this party in this constituency in 2015 | `23369` | 42 | | `voteshare_now` | float | Percentage voteshare for this party in this constituency in 2015 | `0.505921067` | 43 | | `winner_now` | str | Party that won in
this constituency in 2015 | `con` | 44 | 45 | 46 | #### **`general-election/UK/2017/model`** 47 | | Column | Type | Description | Example | 48 | | -- | -- | -- | -- | 49 | | Constituency-level factors | 50 | | `ons_id` | int | ONS constituency identifier | `E14000530` | 51 | | `constituency` | str | Constituency name | `ALDERSHOT` | 52 | | `county` | str | County:{`Avon`, `Bedfordshire`, and 44 more} | `Hampshire` | 53 | | `region` | str | Region:{`East Midlands`, `Eastern`, `London`, `North East`, `North West`, `Northern Ireland`, `Scotland`, `South East`, `South West`, `Wales`, `West Midlands`, `Yorkshire and The Humber`} | `South East` | 54 | | `geo` | str | Geographic region (aggregated level between region and country) | `england_not_london` | 55 | | `country`| str | Country:{`England`, `Northern Ireland`, `Scotland`, `Wales`} | `England` | 56 | | 2015 election data | 57 | | `electorate` | int | Electorate | `76205` | 58 | | `total_votes_last` | int | Total valid votes counted in this constituency in the 2015 election | `46191` | 59 | | `turnout_last` | float | Turnout in this constituency in the 2015 election | `0.637732984` | 60 | | `party` | str | Party:{`apni`, `con`, `dup`, `grn`, `lab`, `ld`, `other`, `pc`, `sdlp`, `sf`, `snp`, `ukip`, `uup`} | `con` | 61 | | `votes_last` | int | Votes counted for this party in this constituency in 2015 | `23369` | 62 | | `voteshare_last` | float | Percentage voteshare for this party in this constituency in 2015 | `0.505921067` | 63 | | `winner_last` | str | Party that won in this constituency in 2015 | `con` | 64 | | `won_here_last` | bool | Did this party win in this constituency in 2015 | `True` | 65 | | `national_voteshare_last` | float | Percentage of national voteshare for this party from 2015 results | `0.368095115` | 66 | | 2017 pre-election data | 67 | | `national_polls_now` | float | Percentage of national voteshare for this party from 2017 pre-election polling | `0.42729587` | 68 | | `national_swing` | float | Uplift in national voteshare for this party between 2015 results and 2017 polling | `0.160830048` | 69 | | `national_swing_forecast` | str | Projected voteshare for this party in this constituency using a UNS model | `0.587288376` | 70 | | `national_swing_winner` | str | Projected winner in this constituency using `national_swing_forecast` | `con` | 71 | | 2015/2017 regional data | 72 | | `geo_polls_now` | float | Percentage of regional voteshare for this party from 2017 pre-election polling | `0.470077263` | 73 | | `geo_voteshare_last` | float | Percentage of regional voteshare for this party from 2015 results | `0.418216805` | 74 | | `geo_swing` | float | Uplift in regional voteshare for this party between 2015 results and 2017 polling | `0.124003764` | 75 | | `geo_swing_forecast` | float | Projected voteshare for this party in this constituency using a regional UNS model | `0.568657183` | 76 | | `geo_swing_winner` | str | Projected winner in this constituency using `geo_swing_forecast` | `con` | 77 | | 2017 post-election data | 78 | | `total_votes_now` | int | Total valid votes counted in this constituency in the 2017 election | `48950` | 79 | | `turnout_now` | float | Turnout in this constituency in 2017 election | `0.642346303` | 80 | | `votes_now` | int | Total votes counted for this party in this constituency in 2017 | `26950` | 81 | | `voteshare_now` | float | Percentage voteshare for this party in this constituency in 2017 | `0.550561798` | 82 | | `winner_now` | str | Party that won in this constituency in 2017 | `con` 
| 83 | 84 | 85 | #### **`general-election/UK/2019/model`** 86 | | Column | Type | Description | Example | 87 | | -- | -- | -- | -- | 88 | | Constituency-level factors | 89 | | `ons_id` | int | ONS constituency identifier | `E14000530` | 90 | | `constituency` | str | Constituency name | `ALDERSHOT` | 91 | | `county` | str | County:{`Avon`, `Bedfordshire`, and 44 more} | `Hampshire` | 92 | | `region` | str | Region:{`East Midlands`, `Eastern`, `London`, `North East`, `North West`, `Northern Ireland`, `Scotland`, `South East`, `South West`, `Wales`, `West Midlands`, `Yorkshire and The Humber`} | `South East` | 93 | | `geo` | str | Geographic region (aggregated level between region and country) | `england_not_london` | 94 | | `country`| str | Country:{`England`, `Northern Ireland`, `Scotland`, `Wales`} | `England` | 95 | | 2017 election data | 96 | | `electorate` | int | Electorate | `76205` | 97 | | `total_votes_last` | int | Total valid votes counted in this constituency in the 2017 election | `48950` | 98 | | `turnout_last` | float | Turnout in this constituency in the 2017 election | `0.642346303` | 99 | | `party` | str | Party:{`apni`, `con`, `dup`, `grn`, `lab`, `ld`, `other`, `pc`, `sdlp`, `sf`, `snp`, `ukip`, `uup`} | `con` | 100 | | `votes_last` | int | Votes counted for this party in this constituency in 2017 | `26950` | 101 | | `voteshare_last` | float | Percentage voteshare for this party in this constituency in 2017 | `0.550561798` | 102 | | `winner_last` | str | Party that won in this constituency in 2017 | `con` | 103 | | `won_here_last` | bool | Did this party win in this constituency in 2017 | `True` | 104 | | `national_voteshare_last` | float | Percentage of national voteshare for this party from 2017 results | `0.423444482` | 105 | | 2019 pre-election data | 106 | | `national_polls_now` | float | Percentage of national voteshare for this party from 2019 pre-election polling | `0.396538462` | 107 | | `national_swing` | float | Uplift in national voteshare for this party between 2017 results and 2019 polling | `-0.063540845` | 108 | | `national_swing_forecast` | str | Projected voteshare for this party in this constituency using a UNS model | `0.515578636` | 109 | | `national_swing_winner` | str | Projected winner in this constituency using `national_swing_forecast` | `con` | 110 | | 2017/2019 regional data | 111 | | `geo_polls_now` | float | Percentage of regional voteshare for this party from 2019 pre-election polling | `0.429089129` | 112 | | `geo_voteshare_last` | float | Percentage of regional voteshare for this party from 2017 results | `0.474642379` | 113 | | `geo_swing` | float | Uplift in regional voteshare for this party between 2017 results and 2019 polling | `-0.095973837` | 114 | | `geo_swing_forecast` | float | Projected voteshare for this party in this constituency using a regional UNS model | `0.497722269` | 115 | | `geo_swing_winner` | str | Projected winner in this constituency using `geo_swing_forecast` | `con` | 116 | 117 | 118 | #### **`general-election/UK/2010/results`** 119 | | Column | Type | Description | Example | 120 | | -- | -- | -- | -- | 121 | | `ons_id` | str | Standardised constituency identifier | `E14000530` | 122 | | `constituency` | str | Constituency name | `ALDERSHOT` | 123 | | `county` | str | County name | `Hampshire` | 124 | | `region` | str | Region:{`East Midlands`, `Eastern`, `London`, `North East`, `North West`, `Northern Ireland`, `Scotland`, `South East`, `South West`, `Wales`, `West Midlands`, `Yorkshire and The Humber`} | `South 
East` | 125 | | `country` | str | Country:{`England`, `Northern Ireland`, `Scotland`, `Wales`} | `England` | 126 | | `electorate` | int | Electorate | `71465` | 127 | | `total_votes` | int | Total valid votes counted in this constituency | `45384` | 128 | | `turnout` | float | Turnout in this constituency | `0.635052123` | 129 | | `party` | str | Name of political party (lower-cased & abbreviated) | `con` | 130 | | `votes` | int | Votes for this party | `21203` | 131 | | `voteshare` | float | Vote share for this party within the constituency | `0.467191081` | 132 | 133 | 134 | #### **`general-election/UK/2015/results`** 135 | | Column | Type | Description | Example | 136 | | -- | -- | -- | -- | 137 | | `ons_id` | str | Standardised constituency identifier | `E14000530` | 138 | | `constituency` | str | Constituency name | `ALDERSHOT` | 139 | | `county` | str | County name | `Hampshire` | 140 | | `region` | str | Region:{`East Midlands`, `Eastern`, `London`, `North East`, `North West`, `Northern Ireland`, `Scotland`, `South East`, `South West`, `Wales`, `West Midlands`, `Yorkshire and The Humber`} | `South East` | 141 | | `country` | str | Country:{`England`, `Northern Ireland`, `Scotland`, `Wales`} | `England` | 142 | | `electorate` | int | Electorate | `72430` | 143 | | `total_votes` | int | Total valid votes counted in this constituency | `46191` | 144 | | `turnout` | float | Turnout in this constituency | `0.637732984` | 145 | | `party` | str | Name of political party (lower-cased & abbreviated) | `con` | 146 | | `votes` | int | Votes for this party | `23369` | 147 | | `voteshare` | float | Vote share for this party within the constituency | `0.505921067` | 148 | 149 | 150 | #### **`general-election/UK/2017/results`** 151 | | Column | Type | Description | Example | 152 | | -- | -- | -- | -- | 153 | | `ons_id` | str | Standardised constituency identifier | `E14000530` | 154 | | `constituency` | str | Constituency name | `ALDERSHOT` | 155 | | `county` | str | County name | `Hampshire` | 156 | | `region` | str | Region:{`East Midlands`, `Eastern`, `London`, `North East`, `North West`, `Northern Ireland`, `Scotland`, `South East`, `South West`, `Wales`, `West Midlands`, `Yorkshire and The Humber`} | `South East` | 157 | | `country` | str | Country:{`England`, `Northern Ireland`, `Scotland`, `Wales`} | `England` | 158 | | `electorate` | int | Electorate | `76205` | 159 | | `total_votes` | int | Total valid votes counted in this constituency | `48950` | 160 | | `turnout` | float | Turnout in this constituency | `0.642346303` | 161 | | `party` | str | Name of political party (lower-cased & abbreviated) | `con` | 162 | | `votes` | int | Votes for this party | `26950` | 163 | | `voteshare` | float | Vote share for this party within the constituency | `0.550561798` | 164 | 165 | 166 | #### **`general-election/UK/polls`** 167 | | Column | Type | Description | Example | 168 | | -- | -- | -- | -- | 169 | | `company` | str | Name of company conducting opinion poll | `Ipsos MORI Phone` | 170 | | `client` | str | Name of client/publisher commissioning the poll | `Evening Standard` | 171 | | `method` | str | Methodology: {`Online`, `Phone`, `Mobile`} | `Phone` | 172 | | `from` | date | Date fieldwork started | `2017-06-06` | 173 | | `to` | date | Date fieldwork completed | `2017-06-07` | 174 | | `sample_size` | int | Sample size of poll | `1291` | 175 | | `con` | float | National percentage voteshare for the Conservative party | `0.44` | 176 | | `lab` | float | National percentage voteshare for the 
Labour party | `0.36` | 177 | | `ld` | float | National percentage voteshare for the Liberal Democrat party | `0.07` | 178 | | `ukip` | float | National percentage voteshare for UKIP | `0.04` | 179 | | `grn` | float | National percentage voteshare for the Green party | `0.02` | 180 | | `snp` | float | National percentage voteshare for the SNP | `0.05` | 181 | | `pdf` | str | Download URL of PDF tables containing raw data | `https://www.ipsos.com/sites/default/files/2017-06/pm-election-2017-final-tables.pdf` | 182 | -------------------------------------------------------------------------------- /maven/datasets/general_election/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base classes. 3 | """ 4 | import os 5 | from functools import partial 6 | from pathlib import Path 7 | 8 | import pandas as pd 9 | 10 | from maven import utils 11 | 12 | 13 | class Pipeline: 14 | """Generic class for retrieving & processing datasets with built-in caching & MD5 checking.""" 15 | 16 | def __init__(self, directory): 17 | self.directory = Path(directory) 18 | self.sources = [] # tuples of (url, filename, checksum) 19 | self.retrieve_all = False 20 | self.target = (None, None) 21 | self.verbose_name = "" 22 | self.year = None 23 | self.verbose = False 24 | self.cache = True 25 | 26 | def retrieve(self): 27 | """Retrieve data from self.sources into self.directory / 'raw' and validate against checksum.""" 28 | target_dir = self.directory / "raw" 29 | os.makedirs(target_dir, exist_ok=True) # create directory if it doesn't exist 30 | for url, filename, md5_checksum in self.sources: 31 | if utils.is_url(url): 32 | processing_fn = partial( 33 | utils.fetch_url, url=url, filename=filename, target_dir=target_dir 34 | ) 35 | else: 36 | processing_fn = partial( 37 | utils.get_and_copy, identifier=url, filename=filename, target_dir=target_dir 38 | ) 39 | utils.retrieve_from_cache_if_exists( 40 | filename=filename, 41 | target_dir=target_dir, 42 | processing_fn=processing_fn, 43 | md5_checksum=md5_checksum, 44 | caching_enabled=self.cache, 45 | verbose=self.verbose, 46 | ) 47 | if not self.retrieve_all: # retrieve just the first dataset 48 | return 49 | if self.retrieve_all: # all datasets retrieved 50 | return 51 | else: # retrieving first dataset only but all fallbacks failed 52 | raise RuntimeError(f"Unable to download {self.verbose_name} data.") 53 | 54 | def process(self): 55 | pass 56 | 57 | 58 | class UKResults(Pipeline): 59 | """Handles results data for UK General Elections.""" 60 | 61 | @staticmethod 62 | def process_hoc_sheet(input_file, data_dir, sheet_name): 63 | # Import general election results 64 | print(f"Read and clean {input_file}") 65 | parties = [ 66 | "Con", 67 | "LD", 68 | "Lab", 69 | "UKIP", 70 | "Grn", 71 | "SNP", 72 | "PC", 73 | "DUP", 74 | "SF", 75 | "SDLP", 76 | "UUP", 77 | "APNI", 78 | "Other", 79 | ] 80 | results = pd.read_excel( 81 | data_dir / "raw" / input_file, 82 | sheet_name=sheet_name, 83 | skiprows=4, 84 | header=None, 85 | skipfooter=19, 86 | ) 87 | assert results.shape == (650, 49) 88 | 89 | # Specify columns (spread across multiple rows in Excel) 90 | cols = ["", "id", "Constituency", "County", "Country/Region", "Country", "Electorate", ""] 91 | for party in parties: 92 | cols += [f"{party}_Votes", f"{party}_Voteshare", ""] 93 | cols += ["Total votes", "Turnout"] 94 | results.columns = cols 95 | 96 | # Some basic data quality checks 97 | for party in parties: 98 | assert ( 99 | results[f"{party}_Voteshare"] - 
results[f"{party}_Votes"] / results["Total votes"] 100 | ).sum() == 0 101 | assert ( 102 | results[[f"{party}_Votes" for party in parties]].fillna(0.0).sum(axis=1) 103 | == results["Total votes"] 104 | ).all() 105 | assert ((results["Total votes"] / results["Electorate"]) == results["Turnout"]).all() 106 | 107 | # Drop blank columns plus those that can be calculated 108 | cols_to_drop = [""] + [c for c in cols if "Voteshare" in c] + ["Total votes", "Turnout"] 109 | results = results.drop(columns=cols_to_drop) 110 | 111 | # Sanitise column names 112 | results.columns = utils.sanitise(results.columns) 113 | results = results.rename(columns={"id": "ons_id", "country_region": "region"}) 114 | results.columns = [c.replace("_votes", "") for c in results.columns] 115 | 116 | # Reshape to long 117 | results_long = pd.melt( 118 | results, 119 | id_vars=["ons_id", "constituency", "county", "region", "country", "electorate"], 120 | var_name="party", 121 | value_name="votes", 122 | ) 123 | assert results.shape == (650, 19) 124 | assert results_long.shape == (650 * len(parties), 19 - len(parties) + 2) 125 | 126 | # Sort by (ons_id, party) 127 | results_long["party"] = pd.Categorical( 128 | results_long.party, categories=pd.Series(parties).apply(utils.sanitise), ordered=True 129 | ) 130 | results_long = results_long.sort_values(["ons_id", "party"]).reset_index(drop=True) 131 | 132 | # Re-add total_votes & voteshare 133 | results_long["total_votes"] = results_long.ons_id.map( 134 | results_long.groupby("ons_id").votes.sum().astype(int) 135 | ) 136 | results_long["voteshare"] = results_long["votes"] / results_long["total_votes"] 137 | results_long["turnout"] = results_long["total_votes"] / results_long["electorate"] 138 | 139 | # Reorder cols for export 140 | results_long = results_long[ 141 | [ 142 | "ons_id", 143 | "constituency", 144 | "county", 145 | "region", 146 | "country", 147 | "electorate", 148 | "total_votes", 149 | "turnout", 150 | "party", 151 | "votes", 152 | "voteshare", 153 | ] 154 | ].copy() 155 | 156 | return results_long 157 | 158 | def process(self): 159 | """Process results data for a UK General Election.""" 160 | filename = self.sources[0][1] 161 | processed_results_location = self.directory / "processed" / self.target[0] 162 | os.makedirs( 163 | self.directory / "processed", exist_ok=True 164 | ) # create directory if it doesn't exist 165 | 166 | def process_and_export(): 167 | # Either caching disabled or file not yet processed; process regardless. 168 | results = self.process_hoc_sheet( 169 | input_file=filename, data_dir=self.directory, sheet_name=str(self.year) 170 | ) 171 | # Export 172 | print(f"Exporting dataset to {processed_results_location.resolve()}") 173 | results.to_csv(processed_results_location, index=False) 174 | 175 | utils.retrieve_from_cache_if_exists( 176 | filename=self.target[0], 177 | target_dir=(self.directory / "processed"), 178 | processing_fn=process_and_export, 179 | md5_checksum=self.target[1], 180 | caching_enabled=self.cache, 181 | verbose=self.verbose, 182 | ) 183 | 184 | 185 | class UKModel(Pipeline): 186 | """Generates model-ready data for UK General Elections.""" 187 | 188 | # geos sit between region and country (e.g. 
"england_not_london") and map to things we can extract from polls 189 | geos = ["uk", "scotland", "wales", "ni", "london"] 190 | geo_lookup = { 191 | "Northern Ireland": "ni", 192 | "Scotland": "scotland", 193 | "Wales": "wales", 194 | "London": "london", 195 | "South East": "england_not_london", 196 | "West Midlands": "england_not_london", 197 | "North West": "england_not_london", 198 | "East Midlands": "england_not_london", 199 | "Yorkshire and The Humber": "england_not_london", 200 | "Eastern": "england_not_london", 201 | "South West": "england_not_london", 202 | "North East": "england_not_london", 203 | } 204 | 205 | results_seat_count = { 206 | 2010: { 207 | "con": 306, 208 | "lab": 258, 209 | "ld": 57, 210 | "dup": 8, 211 | "snp": 6, 212 | "sf": 5, 213 | "pc": 3, 214 | "sdlp": 3, 215 | "grn": 1, 216 | "apni": 1, 217 | "other": 2, # {'speaker': 'John Bercow', 'independent': 'Sylvia Hermon'} 218 | }, 219 | 2015: { 220 | "con": 330, 221 | "lab": 232, 222 | "snp": 56, 223 | "ld": 8, 224 | "dup": 8, 225 | "sf": 4, 226 | "pc": 3, 227 | "sdlp": 3, 228 | "uup": 2, 229 | "ukip": 1, 230 | "grn": 1, 231 | "other": 2, # {'speaker': 'John Bercow', 'independent': 'Sylvia Hermon'} 232 | }, 233 | 2017: { 234 | "con": 317, 235 | "lab": 262, 236 | "snp": 35, 237 | "ld": 12, 238 | "dup": 10, 239 | "sf": 7, 240 | "pc": 4, 241 | "grn": 1, 242 | "other": 2, # {'speaker': 'John Bercow', 'independent': 'Sylvia Hermon'} 243 | }, 244 | } 245 | 246 | winner_fixes = { 247 | 2010: [ 248 | # https://en.wikipedia.org/wiki/Fermanagh_and_South_Tyrone_(UK_Parliament_constituency) 249 | ( 250 | "N06000007", 251 | "sf", 252 | ), # SF = 21,304, Independent Unionist (with DUP support) = 21,300, Independent = 188 253 | ] 254 | } 255 | 256 | # Define these to make them available as expected attributes. 257 | last_date = None 258 | now_date = None 259 | last = None 260 | now = None 261 | prediction_only = False 262 | 263 | def load_results_data(self): 264 | """Load UK General Election results for consecutive elections with one row / party / constituency and add: 265 | - `geo`: geo this constituency is in (e.g. `scotland`, `england_not_london`) 266 | - `winner`: winner per constituency (derived from data, with corrections to match reported results) 267 | - `won_here`: did this party win this seat? 268 | 269 | Args: 270 | last (int): Year of prior election (used to provide historical trend information). 271 | now (int): Year of election to be modelled. 272 | 273 | Returns: dict containing key-value pairs of (year, pd.DataFrame) of results. 
274 | """ 275 | # Define these for code readability 276 | last = self.last 277 | now = self.now 278 | 279 | # Import general election results 280 | results = {} 281 | results[last] = pd.read_csv( 282 | self.directory / "raw" / f"general_election-uk-{last}-results.csv" 283 | ) 284 | try: 285 | results[now] = pd.read_csv( 286 | self.directory / "raw" / f"general_election-uk-{now}-results.csv" 287 | ) 288 | except FileNotFoundError: 289 | self.prediction_only = True 290 | 291 | # Add geos 292 | results[last]["geo"] = results[last].region.map(self.geo_lookup) 293 | if not self.prediction_only: 294 | results[now]["geo"] = results[now].region.map(self.geo_lookup) 295 | 296 | # Check constituencies are mergeable 297 | assert ( 298 | results[last].sort_values("ons_id").ons_id 299 | == results[now].sort_values("ons_id").ons_id 300 | ).all() 301 | 302 | # Add the winner for the results 303 | if self.prediction_only: 304 | years = [last] 305 | else: 306 | years = [last, now] 307 | for year in years: 308 | res = results[year].copy() 309 | winners = self.calculate_winners(res, "voteshare") 310 | res["winner"] = res.ons_id.map(winners) 311 | 312 | # Apply fixes 313 | if year in self.winner_fixes: 314 | for ons_id, actual_winner in self.winner_fixes[year]: 315 | res.loc[res.ons_id == ons_id, "winner"] = actual_winner 316 | 317 | # Check this matches the results on record 318 | seat_count = res[["ons_id", "winner"]].drop_duplicates().groupby("winner").size() 319 | assert dict(seat_count) == self.results_seat_count[year] 320 | 321 | # Add boolean per row for if this party won this seat 322 | res["won_here"] = res.party == res.winner 323 | 324 | # Remove UKIP to deal with Brexit Party voteshare matching problems 325 | # TODO: This is not a great solution, need a better way to map in BXP for modelling 2019. 326 | res_list = [] 327 | for constituency in res.ons_id.unique(): 328 | res_con = res[res.ons_id == constituency].copy() 329 | for metric in ["votes", "voteshare"]: 330 | res_con.loc[res_con.party == "other", metric] = ( 331 | res_con.loc[res_con.party == "other", metric].sum() 332 | + res_con.loc[res_con.party == "ukip", metric].sum() 333 | ) 334 | res_list.append(res_con.query('party != "ukip"').copy()) 335 | res = pd.concat(res_list, axis=0) 336 | 337 | results[year] = res.copy() 338 | 339 | return results 340 | 341 | def load_polling_data(self): 342 | """Load polling data for UK General Elections.""" 343 | polls = {} 344 | for geo in self.geos: 345 | poll_df = pd.read_csv( 346 | self.directory / "raw" / f"general_election-{geo}-polls.csv", parse_dates=["to"] 347 | ).sort_values("to") 348 | poll_df.columns = utils.sanitise( 349 | poll_df.columns, 350 | replace={"ulster_unionist_party": "uup", "sinn_fein": "sf", "alliance": "apni"}, 351 | ) 352 | polls[geo] = poll_df 353 | 354 | return polls 355 | 356 | @staticmethod 357 | def calculate_poll_of_polls(polls, from_date, to_date): 358 | return polls[(polls.to >= from_date) & (polls.to < to_date)].groupby("company").tail(1) 359 | 360 | def get_regional_and_national_poll_of_polls(self, polls): 361 | """Takes straight average across each pollster's final poll in last week prior to election day. 362 | Repeat for regions, if regional polling is available. 
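        When regional polling is available, each geo is combined using a sample-size-weighted
        average (an MRP is treated as a single poll with the largest non-MRP sample size); for
        example, final polls of 1,000 and 3,000 respondents carry weights 0.25 and 0.75. When
        regional polling is missing, only an unweighted UK-level poll-of-polls is produced.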
363 | """ 364 | election_day = self.now_date 365 | one_week_before = election_day - pd.Timedelta(days=7) 366 | one_month_before = election_day - pd.Timedelta(days=30) 367 | 368 | # Use single last poll from each pollster in final week of polling then average out 369 | final_polls = {} 370 | for geo in self.geos: 371 | period_before = one_week_before if geo == "uk" else one_month_before 372 | final_polls[geo] = self.calculate_poll_of_polls( 373 | polls=polls[geo], from_date=period_before, to_date=election_day 374 | ) 375 | # Consider MRPs equivalent to a large poll 376 | final_polls[geo].loc[final_polls[geo].method == "MRP", "sample_size"] = ( 377 | final_polls[geo].query('method != "MRP"').sample_size.max() 378 | ) 379 | # Handle missing sample sizes 380 | mean_sample_size = final_polls[geo].query('method != "MRP"').sample_size.mean() 381 | if pd.isnull(mean_sample_size): 382 | mean_sample_size = 1 383 | final_polls[geo]["sample_size"] = final_polls[geo].sample_size.fillna(mean_sample_size) 384 | 385 | # Calculate regional polling 386 | regional_polling_missing = any(final_polls[geo].empty for geo in self.geos) 387 | 388 | # Regional polling is missing, just calculate UK-level polling only. 389 | if regional_polling_missing: 390 | # TODO: Check how this affects 2015/2017 models 391 | parties = ["con", "lab", "ld", "ukip", "grn", "chuk", "bxp", "snp"] 392 | # Create new polls dictionary by geo containing simple average across all pollsters 393 | national_polling = final_polls["uk"].mean().loc[parties] 394 | # We don't yet have regional polling in 2015 for Scotland, Wales, NI, London - add as other. 395 | national_polling["other"] = 1 - national_polling.sum() 396 | poll_of_polls = {"uk": national_polling} 397 | # Turn into dataframe 398 | polls_df_list = [] 399 | for geo in poll_of_polls: 400 | polls_df_list.append( 401 | pd.DataFrame( 402 | { 403 | "geo": geo, 404 | "party": poll_of_polls[geo].index, 405 | "voteshare": poll_of_polls[geo], 406 | } 407 | ).reset_index(drop=True) 408 | ) 409 | polls_df = pd.concat(polls_df_list, axis=0) 410 | 411 | # We have polling for all regions. 
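        # In the regional branch below, each geo's recent polls (final week for the UK, final
        # month for the regions) are averaged with weights proportional to sample_size;
        # England-excluding-London is then backed out by subtracting the weighted Scotland,
        # Wales, NI and London estimates from the UK-wide estimate using Survation's regional
        # sample sizes, Plaid Cymru's UK share is rescaled from the Wales poll, and each geo is
        # topped up with "other" and normalised to sum to 1.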
412 | else: 413 | parties = { 414 | # TODO: Add ["chuk", "bxp", "ukip"] to uk, scotland, wales, london 415 | "uk": ["con", "lab", "ld", "grn", "snp"], 416 | "scotland": ["con", "lab", "ld", "snp", "grn"], 417 | "wales": ["con", "lab", "ld", "pc", "grn"], 418 | "ni": ["dup", "uup", "sf", "sdlp", "apni", "grn", "con"], 419 | "london": ["con", "lab", "ld", "grn"], 420 | "england_not_london": ["con", "lab", "ld", "grn"], 421 | } 422 | all_parties = set(x for y in parties.values() for x in y) 423 | poll_of_polls = {} 424 | for geo in self.geos: 425 | sample_size_weights = ( 426 | final_polls[geo].sample_size / final_polls[geo].sample_size.sum() 427 | ) 428 | weighted_poll_of_polls = ( 429 | final_polls[geo][parties[geo]] 430 | .multiply(sample_size_weights, axis=0) 431 | .sum() 432 | .reindex(all_parties, fill_value=0.0) 433 | ) 434 | poll_of_polls[geo] = weighted_poll_of_polls 435 | 436 | # Estimate polling for England excluding London 437 | # survation_wts from http://survation.com/wp-content/uploads/2017/06/Final-MoS-Post-BBC-Event-Poll-020617SWCH-1c0d4h9.pdf 438 | survation_wts = pd.Series({"scotland": 85, "england": 881, "wales": 67, "ni": 16}) 439 | survation_wts["uk"] = survation_wts.sum() 440 | survation_wts["london"] = 137 441 | survation_wts["england_not_london"] = survation_wts.england - survation_wts.london 442 | 443 | england_not_london = poll_of_polls["uk"] * survation_wts["uk"] 444 | for geo in ["scotland", "wales", "ni", "london"]: 445 | england_not_london = england_not_london.sub( 446 | poll_of_polls[geo] * survation_wts[geo], fill_value=0.0 447 | ) 448 | england_not_london /= survation_wts["england_not_london"] 449 | england_not_london.loc[["pc", "snp"]] = 0.0 450 | poll_of_polls["england_not_london"] = england_not_london 451 | 452 | # Fix PC (Plaid Cymru) for UK 453 | poll_of_polls["uk"]["pc"] = ( 454 | poll_of_polls["wales"]["pc"] * survation_wts["wales"] / survation_wts["uk"] 455 | ) 456 | 457 | # Add Other & normalise 458 | for geo in self.geos + ["england_not_london"]: 459 | poll_of_polls[geo]["other"] = max( 460 | 1 - poll_of_polls[geo].sum(), 0 461 | ) # weighted means can sum > 1 462 | poll_of_polls[geo] = poll_of_polls[geo] / poll_of_polls[geo].sum() 463 | 464 | # Export 465 | polls_df_list = [] 466 | for geo in poll_of_polls: 467 | polls_df_list.append( 468 | pd.DataFrame( 469 | { 470 | "geo": geo, 471 | "party": poll_of_polls[geo].index, 472 | "voteshare": poll_of_polls[geo], 473 | } 474 | ).reset_index(drop=True) 475 | ) 476 | polls_df = pd.concat(polls_df_list, axis=0) 477 | 478 | return polls_df 479 | 480 | @staticmethod 481 | def combine_results_and_polls(results, polls): 482 | """Merge national polling, and geo-level polling if available, into results dataframe.""" 483 | # Merge into previous election's results to calculate swing 484 | results = ( 485 | results.merge( 486 | right=polls.query('geo == "uk"')[["party", "voteshare"]].rename( 487 | columns={"voteshare": "national_polls"} 488 | ), 489 | on="party", 490 | how="outer", 491 | ) 492 | .sort_values(["ons_id", "party"]) 493 | .reset_index(drop=True) 494 | ) 495 | # If we have geo-polls, add those too 496 | if set(polls.geo.unique()) != {"uk"}: 497 | results = ( 498 | results.merge( 499 | right=polls.query('geo != "uk"')[["geo", "party", "voteshare"]].rename( 500 | columns={"voteshare": "geo_polls"} 501 | ), 502 | on=["geo", "party"], 503 | how="outer", 504 | ) 505 | .sort_values(["ons_id", "party"]) 506 | .reset_index(drop=True) 507 | ) 508 | 509 | return results 510 | 511 | @staticmethod 512 | def 
calculate_winners(df, voteshare_col): 513 | """Assumes df has `ons_id` and `party` columns.""" 514 | return ( 515 | df.sort_values(voteshare_col, ascending=False) 516 | .groupby("ons_id") 517 | .head(1)[["ons_id", "party"]] 518 | .set_index("ons_id") 519 | .party 520 | ) 521 | 522 | def calculate_national_swing(self, results): 523 | """Uses previous election results plus current polling to calculate: 524 | - `national_voteshare`: per party 525 | - `national_swing`: from previous voteshare to current polling 526 | - `national_swing_forecast`: forecasted voteshare per party per seat 527 | - `national_swing_winner`: per seat 528 | 529 | Returns: updated results dataframe with new columns. 530 | """ 531 | # Calculate national voteshare 532 | national_voteshare_by_party = results.groupby("party").votes.sum() / results.votes.sum() 533 | results["national_voteshare"] = results.party.map(national_voteshare_by_party) 534 | 535 | # Calculate swing between last election results and latest poll-of-polls 536 | results["national_swing"] = (results.national_polls / results.national_voteshare) - 1 537 | 538 | # Forecast is previous result multiplied by swing uplift 539 | results["national_swing_forecast"] = results.voteshare * (1 + results.national_swing) 540 | 541 | # Predict the winner in each constituency using national_swing_forecast 542 | # Note: these are pointless for NI as polls/swings are all aggregated under "other" but results 543 | # are given per major party. 544 | national_swing_winners = self.calculate_winners(results, "national_swing_forecast") 545 | results["national_swing_winner"] = results.ons_id.map(national_swing_winners) 546 | 547 | return results 548 | 549 | def calculate_geo_swing(self, results): 550 | """Calculate geo-Level voteshare + swing inc. all parties. Adds: 551 | - `geo_voteshare`: geo-level voteshare (per party). 552 | - `geo_swing`: swing from previous geo_voteshare to current geo-polling. 553 | - `geo_swing_forecast`: geo-swing based forecast per party per seat. 554 | - `geo_swing_winner`: per seat. 555 | 556 | Returns: updated results dataframe with new columns. 
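        For example (matching the general-election/UK/2017/model data dictionary): a party with a
        geo_voteshare of 0.4182 in 2015 that polls 0.4701 in that geo ahead of 2017 has
        geo_swing = 0.4701 / 0.4182 - 1 ≈ 0.124, so a seat where it previously took 0.5059 of the
        vote gets geo_swing_forecast ≈ 0.5059 * 1.124 ≈ 0.5687.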
557 | """ 558 | 559 | # Calculate geo-level voteshare 560 | votes_by_geo = results.groupby("geo").votes.sum().reset_index() 561 | votes_by_geo_by_party = ( 562 | results.groupby(["geo", "party"]) 563 | .votes.sum() 564 | .reset_index() 565 | .merge(votes_by_geo, on="geo", how="left", suffixes=("", "_geo")) 566 | ) 567 | votes_by_geo_by_party["geo_voteshare"] = ( 568 | votes_by_geo_by_party.votes / votes_by_geo_by_party.votes_geo 569 | ) 570 | results = results.merge( 571 | votes_by_geo_by_party[["geo", "party", "geo_voteshare"]], 572 | on=["geo", "party"], 573 | how="left", 574 | ) 575 | 576 | # Calculate geo-swing between last election results and latest geo-polls 577 | results["geo_swing"] = (results.geo_polls / results.geo_voteshare) - 1 578 | 579 | # Forecast is previous result multiplied by swing uplift 580 | results["geo_swing_forecast"] = results.voteshare * (1 + results.geo_swing) 581 | 582 | # Predict the winner in each constituency using geo_swing_forecast 583 | geo_swing_winners = self.calculate_winners(results, "geo_swing_forecast") 584 | results["geo_swing_winner"] = results.ons_id.map(geo_swing_winners) 585 | 586 | return results 587 | 588 | def export_model_ready_dataframe(self, results_dict): 589 | """Create ML-ready dataframe and export.""" 590 | 591 | # Cols to select from results dfs 592 | if self.prediction_only: 593 | df_cols_last = [ 594 | "ons_id", 595 | "constituency", 596 | "county", 597 | "region", 598 | "geo", 599 | "country", 600 | "electorate", 601 | "total_votes", 602 | "turnout", 603 | "party", 604 | "votes", 605 | "voteshare", 606 | "national_polls", 607 | "national_voteshare", 608 | "national_swing", 609 | "national_swing_forecast", 610 | "national_swing_winner", 611 | "winner", 612 | "won_here", 613 | ] 614 | else: 615 | df_cols_now = [ 616 | "ons_id", 617 | "constituency", 618 | "county", 619 | "region", 620 | "geo", 621 | "country", 622 | "electorate", 623 | "total_votes", 624 | "turnout", 625 | "party", 626 | "votes", 627 | "voteshare", 628 | "winner", 629 | ] 630 | df_cols_last = [ 631 | "ons_id", 632 | "party", 633 | "total_votes", 634 | "turnout", 635 | "votes", 636 | "voteshare", 637 | "national_polls", 638 | "national_voteshare", 639 | "national_swing", 640 | "national_swing_forecast", 641 | "national_swing_winner", 642 | "winner", 643 | "won_here", 644 | ] 645 | 646 | # Add geo polling if available 647 | df_cols_final_geo = [] 648 | if "geo_polls" in results_dict[self.last].columns: 649 | df_cols_last += [ 650 | "geo_polls", 651 | "geo_voteshare", 652 | "geo_swing", 653 | "geo_swing_forecast", 654 | "geo_swing_winner", 655 | ] 656 | df_cols_final_geo += [ 657 | "geo_polls_now", 658 | "geo_voteshare_last", 659 | "geo_swing", 660 | "geo_swing_forecast", 661 | "geo_swing_winner", 662 | ] 663 | 664 | # Specify cols to use in exported df 665 | df_cols_final = [ 666 | # Constant per constituency 667 | "ons_id", 668 | "constituency", 669 | "county", 670 | "region", 671 | "geo", 672 | "country", 673 | "electorate", 674 | "total_votes_last", 675 | "turnout_last", 676 | # Constant per party (per constituency) 677 | "party", 678 | "votes_last", 679 | "voteshare_last", 680 | "winner_last", 681 | "won_here_last", 682 | "national_voteshare_last", 683 | "national_polls_now", 684 | "national_swing", 685 | "national_swing_forecast", 686 | "national_swing_winner", 687 | ] + df_cols_final_geo 688 | 689 | # If we have results data for now, let's add that 690 | if not self.prediction_only: 691 | df_cols_final += [ 692 | # Target 693 | "total_votes_now", 694 | 
"turnout_now", 695 | "votes_now", 696 | "voteshare_now", 697 | "winner_now", 698 | ] 699 | 700 | # Build dataframe for export 701 | if self.prediction_only: 702 | df = ( 703 | results_dict[self.last][df_cols_last] 704 | .rename( 705 | columns={ 706 | "total_votes": "total_votes_last", 707 | "turnout": "turnout_last", 708 | "votes": "votes_last", 709 | "voteshare": "voteshare_last", 710 | "national_polls": "national_polls_now", 711 | "geo_polls": "geo_polls_now", 712 | "national_voteshare": "national_voteshare_last", 713 | "geo_voteshare": "geo_voteshare_last", 714 | "winner": "winner_last", 715 | "won_here": "won_here_last", 716 | } 717 | ) 718 | .filter(df_cols_final) 719 | ) 720 | else: 721 | df_cols_final = ( 722 | [ 723 | # Constant per constituency 724 | "ons_id", 725 | "constituency", 726 | "county", 727 | "region", 728 | "geo", 729 | "country", 730 | "electorate", 731 | "total_votes_last", 732 | "turnout_last", 733 | # Constant per party (per constituency) 734 | "party", 735 | "votes_last", 736 | "voteshare_last", 737 | "winner_last", 738 | "won_here_last", 739 | "national_voteshare_last", 740 | "national_polls_now", 741 | "national_swing", 742 | "national_swing_forecast", 743 | "national_swing_winner", 744 | ] 745 | + df_cols_final_geo 746 | + [ 747 | # Target 748 | "total_votes_now", 749 | "turnout_now", 750 | "votes_now", 751 | "voteshare_now", 752 | "winner_now", 753 | ] 754 | ) 755 | df = ( 756 | results_dict[self.now][df_cols_now] 757 | .rename( 758 | columns={ 759 | "total_votes": "total_votes_now", 760 | "turnout": "turnout_now", 761 | "votes": "votes_now", 762 | "voteshare": "voteshare_now", 763 | "winner": "winner_now", 764 | } 765 | ) 766 | .merge( 767 | # Note: even though polling represents "now", they're in results[last] to calculate swings. 768 | results_dict[self.last][df_cols_last].rename( 769 | columns={ 770 | "total_votes": "total_votes_last", 771 | "turnout": "turnout_last", 772 | "votes": "votes_last", 773 | "voteshare": "voteshare_last", 774 | "national_polls": "national_polls_now", 775 | "geo_polls": "geo_polls_now", 776 | "national_voteshare": "national_voteshare_last", 777 | "geo_voteshare": "geo_voteshare_last", 778 | "winner": "winner_last", 779 | "won_here": "won_here_last", 780 | } 781 | ), 782 | on=["ons_id", "party"], 783 | how="inner", 784 | validate="1:1", 785 | ) 786 | .filter(df_cols_final) 787 | ) 788 | 789 | return df 790 | 791 | def process(self): 792 | """Process results data from consecutive UK General Elections (e.g. 2010 and 2015) into a single model-ready 793 | dataset ready for predicting the later (e.g. 2015) election.""" 794 | processed_directory = self.directory / "processed" 795 | os.makedirs(processed_directory, exist_ok=True) # create directory if it doesn't exist 796 | 797 | # Import general election results & polling data 798 | results_dict = self.load_results_data() 799 | polls_full = self.load_polling_data() 800 | 801 | # Calculate poll of polls 802 | polls = self.get_regional_and_national_poll_of_polls(polls=polls_full) 803 | 804 | # Merge polls into previous election results dataframe 805 | results_dict[self.last] = self.combine_results_and_polls( 806 | results=results_dict[self.last], polls=polls 807 | ) 808 | 809 | # Add into previous election results: national voteshare, national swing (vs current polling), 810 | # national swing forecast (per party per seat) and national swing forecast winner (per seat). 
811 | results_dict[self.last] = self.calculate_national_swing(results_dict[self.last]) 812 | 813 | # If we have geo-polling for previous election, also calculate a geo-level swing forecast. 814 | if "geo_polls" in results_dict[self.last].columns: 815 | results_dict[self.last] = self.calculate_geo_swing(results_dict[self.last]) 816 | 817 | # Create ML-ready dataframe and export 818 | model_df = self.export_model_ready_dataframe(results_dict=results_dict) 819 | 820 | print(f"Exporting {self.last}->{self.now} model dataset to {processed_directory.resolve()}") 821 | model_df.to_csv( 822 | processed_directory / f"general_election-uk-{self.now}-model.csv", index=False 823 | ) 824 | --------------------------------------------------------------------------------