├── maven ├── datasets │ ├── __init__.py │ ├── coronavirus │ │ ├── __init__.py │ │ ├── README.md │ │ └── csse.py │ └── general_election │ │ ├── __init__.py │ │ ├── uk_2017_results.py │ │ ├── uk_2015_results.py │ │ ├── uk_2010_results.py │ │ ├── uk_2019_model.py │ │ ├── uk_2015_model.py │ │ ├── uk_2017_model.py │ │ ├── uk_polls.py │ │ ├── README.md │ │ └── base.py ├── __init__.py ├── get.py └── utils.py ├── MANIFEST.in ├── setup.cfg ├── dev-requirements.in ├── .gitignore ├── AUTHORS.md ├── requirements.txt ├── tests ├── test_get.py ├── datasets │ ├── coronavirus │ │ └── test_csse.py │ └── general_election │ │ ├── test_uk_results.py │ │ └── test_uk_models.py └── test_utils.py ├── setup.py ├── dev-requirements.txt ├── README.md ├── CHANGELOG.md └── LICENSE /maven/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md LICENSE Pipfile Pipfile.lock 2 | -------------------------------------------------------------------------------- /maven/__init__.py: -------------------------------------------------------------------------------- 1 | from . import utils 2 | from .get import get 3 | 4 | __version__ = "0.1.0" 5 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [tool:pytest] 5 | collect_ignore = ['setup.py'] 6 | -------------------------------------------------------------------------------- /dev-requirements.in: -------------------------------------------------------------------------------- 1 | -c requirements.txt 2 | ipython==7.16.3 3 | pip-tools==4.2.0 4 | pytest==5.2.2 5 | -------------------------------------------------------------------------------- /maven/datasets/coronavirus/__init__.py: -------------------------------------------------------------------------------- 1 | from .csse import CSSE 2 | 3 | __all__ = [ 4 | "CSSE", 5 | ] 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .eggs/ 2 | .venv/ 3 | build/ 4 | data/ 5 | dist/ 6 | maven.egg-info/ 7 | 8 | *.pyc 9 | 10 | # IDE ignores 11 | .vscode 12 | 13 | # Checklists 14 | DEPLOY 15 | REVIEW 16 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | # Credits 2 | 3 | ## Development Lead 4 | * John Sandall [@john-sandall](https://github.com/john-sandall) 5 | 6 | ## Contributors 7 | * [@joy-rosie](https://github.com/joy-rosie) 8 | * [@cruzzoe](https://github.com/cruzzoe) 9 | * [@tomviner](https://github.com/tomviner) 10 | * [@geo7](https://github.com/geo7/) 11 | * [@JosephSutcliffe](https://github.com/JosephSutcliffe/) 12 | * [@dwood023](https://github.com/dwood023/) 13 | -------------------------------------------------------------------------------- /maven/datasets/general_election/__init__.py: -------------------------------------------------------------------------------- 1 | from .uk_2010_results import UK2010Results 2 | from .uk_2015_model import UK2015Model 3 | from .uk_2015_results import UK2015Results 4 | from .uk_2017_model import UK2017Model 5 | from .uk_2017_results 
import UK2017Results 6 | from .uk_2019_model import UK2019Model 7 | from .uk_polls import UKPolls 8 | 9 | __all__ = [ 10 | "UK2010Results", 11 | "UK2015Model", 12 | "UK2015Results", 13 | "UK2017Model", 14 | "UK2017Results", 15 | "UK2019Model", 16 | "UKPolls", 17 | ] 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile 6 | # 7 | certifi==2019.6.16 # via requests 8 | chardet==3.0.4 # via requests 9 | idna==2.8 # via requests 10 | numpy==1.16.4 # via pandas 11 | pandas==1.0.0 # via maven (setup.py) 12 | python-dateutil==2.8.0 # via pandas 13 | pytz==2019.1 # via pandas 14 | requests==2.22.0 # via maven (setup.py) 15 | six==1.12.0 # via python-dateutil 16 | urllib3==1.25.3 # via requests 17 | xlrd==1.2.0 # via maven (setup.py) 18 | -------------------------------------------------------------------------------- /tests/test_get.py: -------------------------------------------------------------------------------- 1 | """ 2 | Running tests in development: 3 | $ cd /path/to/repo 4 | $ python -m pytest 5 | 6 | Running tests against installed version (either `pip install .` or `pip install maven`): 7 | $ cd /path/to/repo 8 | $ pytest 9 | """ 10 | 11 | import maven 12 | import pytest 13 | 14 | 15 | def test_nonexisting_identifier(): 16 | with pytest.raises(KeyError): 17 | maven.get("this-identifier-will-never-exist", data_directory="./data/") 18 | 19 | 20 | def test_nothing_happens(): 21 | """Setting retrieve=False and process=False should do nothing.""" 22 | maven.get("general-election/UK/2010/results", retrieve=False, process=False) 23 | -------------------------------------------------------------------------------- /tests/datasets/coronavirus/test_csse.py: -------------------------------------------------------------------------------- 1 | """ 2 | Running tests in development: 3 | $ cd /path/to/repo 4 | $ python -m pytest ./tests/datasets/coronavirus/test_csse.py 5 | 6 | Running tests against installed version (either `pip install .` or `pip install maven`): 7 | $ cd /path/to/repo 8 | $ pytest ./tests/datasets/coronavirus/test_csse.py 9 | """ 10 | 11 | from pathlib import Path 12 | 13 | import pandas as pd 14 | 15 | import maven 16 | 17 | 18 | def test_csse(): 19 | identifier = "coronavirus/CSSE" 20 | maven.get(identifier, data_directory="./data/") 21 | # CSSE_country.csv 22 | processed_filename = "CSSE_country.csv" 23 | df = pd.read_csv(Path("./data") / identifier / "processed" / processed_filename) 24 | assert df.columns.tolist() == ["date", "country_region", "confirmed", "deaths", "recovered"] 25 | # CSSE_country_province.csv 26 | processed_filename = "CSSE_country_province.csv" 27 | df = pd.read_csv(Path("./data") / identifier / "processed" / processed_filename) 28 | assert df.columns.tolist() == [ 29 | "date", 30 | "country_region", 31 | "province_state", 32 | "lat", 33 | "lon", 34 | "confirmed", 35 | "deaths", 36 | "recovered", 37 | ] 38 | -------------------------------------------------------------------------------- /maven/datasets/coronavirus/README.md: -------------------------------------------------------------------------------- 1 | # Coronavirus (COVID-19) datasets 2 | 3 | If you have any questions about these datasets please [contact me @John_Sandall](https://twitter.com/John_Sandall) on Twitter. 
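## Usage
A minimal sketch of fetching and loading this dataset (paths assume the default `./data/` layout used elsewhere in this repo, with processed files written to `data/coronavirus/CSSE/processed/`):

```python
import pandas as pd

import maven

# Download the raw CSSE time series and build the processed CSVs.
maven.get("coronavirus/CSSE", data_directory="./data/")

# Load the country-level daily figures (see the data dictionary below for columns).
df = pd.read_csv("./data/coronavirus/CSSE/processed/CSSE_country.csv", parse_dates=["date"])
print(df.tail())
```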
4 | 5 | 6 | ## Sources 7 | We aim to source our data directly from the most authoritative data provider, falling back to less authoritative sources where a primary source isn't available. 8 | 9 | Global providers/aggregators: 10 | - [Johns Hopkins Center for Systems Science and Engineering](https://github.com/CSSEGISandData/COVID-19/). 11 | 12 | 13 | ## Data dictionaries 14 | 15 | #### **`coronavirus/CSSE`** 16 | 17 | ##### `CSSE_country_province.csv` 18 | | Column | Type | Description | Example | 19 | | -- | -- | -- | -- | 20 | | `date` | date | Date | `2020-03-13` | 21 | | `country_region` | str | Country/Region | `US` | 22 | | `province_state` | str | Province/State | `Washington` | 23 | | `lat` | float | Latitude | `47.4009` | 24 | | `lon` | float | Longitude | `-121.4905` | 25 | | `confirmed` | int | Confirmed cases | `568` | 26 | | `deaths` | int | Fatalities | `37` | 27 | | `recovered` | int | Recovered | `1` | 28 | 29 | ##### `CSSE_country.csv` 30 | | Column | Type | Description | Example | 31 | | -- | -- | -- | -- | 32 | | `date` | date | Date | `2020-03-13` | 33 | | `country_region` | str | Country/Region | `US` | 34 | | `confirmed` | int | Confirmed cases | `2179` | 35 | | `deaths` | int | Fatalities | `47` | 36 | | `recovered` | int | Recovered | `12` | 37 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as f: 4 | long_description = f.read() 5 | 6 | 7 | setuptools.setup( 8 | name="maven", 9 | version="0.1.0", 10 | description=( 11 | "Maven's goal is to reduce the time data scientists spend on data cleaning and preparation " 12 | "by providing easy access to open datasets in both raw and processed formats." 
13 | ), 14 | long_description=long_description, 15 | long_description_content_type="text/markdown", 16 | keywords="maven open data etl pipeline", 17 | author="John Sandall", 18 | author_email="contact@coefficient.ai", 19 | url="https://github.com/john-sandall/maven", 20 | packages=setuptools.find_packages(), 21 | include_package_data=True, 22 | install_requires=["pandas==1.0.0", "requests==2.22.0", "xlrd==1.2.0",], 23 | python_requires="==3.7.*", 24 | setup_requires=["pytest-runner"], 25 | test_suite="tests", 26 | tests_require=["pytest"], 27 | license="Apache 2.0", 28 | zip_safe=False, 29 | classifiers=[ 30 | "Development Status :: 2 - Pre-Alpha", 31 | "Programming Language :: Python", 32 | "Programming Language :: Python :: 3", 33 | "Programming Language :: Python :: 3.7", 34 | "License :: OSI Approved :: Apache Software License", 35 | "Operating System :: OS Independent", 36 | "Natural Language :: English", 37 | "Intended Audience :: Developers", 38 | ], 39 | ) 40 | -------------------------------------------------------------------------------- /tests/datasets/general_election/test_uk_results.py: -------------------------------------------------------------------------------- 1 | """ 2 | Running tests in development: 3 | $ cd /path/to/repo 4 | $ python -m pytest ./tests/datasets/general_election/test_uk_results.py 5 | 6 | Running tests against installed version (either `pip install .` or `pip install maven`): 7 | $ cd /path/to/repo 8 | $ pytest ./tests/datasets/general_election/test_uk_results.py 9 | """ 10 | 11 | from pathlib import Path 12 | 13 | import pandas as pd 14 | 15 | import maven 16 | 17 | 18 | def check_uk_hoc_results_data(identifier, processed_filename): 19 | maven.get(identifier, data_directory="./data/") 20 | df = pd.read_csv(Path("./data") / identifier / "processed" / processed_filename) 21 | assert df.shape == (8450, 11) 22 | assert df.columns.tolist() == [ 23 | "ons_id", 24 | "constituency", 25 | "county", 26 | "region", 27 | "country", 28 | "electorate", 29 | "total_votes", 30 | "turnout", 31 | "party", 32 | "votes", 33 | "voteshare", 34 | ] 35 | 36 | 37 | def test_uk_2010_results(): 38 | check_uk_hoc_results_data( 39 | identifier="general-election/UK/2010/results", processed_filename="general_election-uk-2010-results.csv" 40 | ) 41 | 42 | 43 | def test_uk_2015_results(): 44 | check_uk_hoc_results_data( 45 | identifier="general-election/UK/2015/results", processed_filename="general_election-uk-2015-results.csv" 46 | ) 47 | 48 | 49 | def test_uk_2017_results(): 50 | check_uk_hoc_results_data( 51 | identifier="general-election/UK/2017/results", processed_filename="general_election-uk-2017-results.csv" 52 | ) 53 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile dev-requirements.in 6 | # 7 | atomicwrites==1.3.0 8 | # via pytest 9 | attrs==19.3.0 10 | # via pytest 11 | backcall==0.1.0 12 | # via ipython 13 | click==7.0 14 | # via pip-tools 15 | decorator==4.4.1 16 | # via 17 | # ipython 18 | # traitlets 19 | importlib-metadata==0.23 20 | # via 21 | # pluggy 22 | # pytest 23 | ipython==7.16.3 24 | # via -r dev-requirements.in 25 | ipython-genutils==0.2.0 26 | # via traitlets 27 | jedi==0.15.1 28 | # via ipython 29 | more-itertools==7.2.0 30 | # via pytest 31 | packaging==19.2 32 | # via pytest 33 | parso==0.5.1 34 | # via jedi 35 | pexpect==4.7.0 36 | # via ipython 37 | 
pickleshare==0.7.5 38 | # via ipython 39 | pip-tools==4.2.0 40 | # via -r dev-requirements.in 41 | pluggy==0.13.0 42 | # via pytest 43 | prompt-toolkit==2.0.10 44 | # via ipython 45 | ptyprocess==0.6.0 46 | # via pexpect 47 | py==1.8.0 48 | # via pytest 49 | pygments==2.4.2 50 | # via ipython 51 | pyparsing==2.4.5 52 | # via packaging 53 | pytest==5.2.2 54 | # via -r dev-requirements.in 55 | six==1.12.0 56 | # via 57 | # -c requirements.txt 58 | # packaging 59 | # pip-tools 60 | # prompt-toolkit 61 | # traitlets 62 | traitlets==4.3.3 63 | # via ipython 64 | wcwidth==0.1.7 65 | # via 66 | # prompt-toolkit 67 | # pytest 68 | zipp==0.6.0 69 | # via importlib-metadata 70 | 71 | # The following packages are considered to be unsafe in a requirements file: 72 | # setuptools 73 | -------------------------------------------------------------------------------- /maven/datasets/general_election/uk_2017_results.py: -------------------------------------------------------------------------------- 1 | """ 2 | Results data for the United Kingdom's 2017 General Election. 3 | 4 | Usage: 5 | >>> import maven 6 | >>> maven.get('general-election/UK/2017/results', data_directory='./data/') 7 | 8 | 9 | Sources: 10 | - http://researchbriefings.files.parliament.uk/documents/CBP-8647/1918-2017election_results.csv 11 | - From https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647 12 | 13 | Other sources: 14 | - https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-7186 15 | - http://researchbriefings.files.parliament.uk/documents/CBP-7979/HoC-GE2017-constituency-results.csv 16 | """ 17 | 18 | from pathlib import Path 19 | 20 | from maven.datasets.general_election.base import UKResults 21 | 22 | 23 | class UK2017Results(UKResults): 24 | """Handles results data for the United Kingdom's 2017 General Election.""" 25 | 26 | def __init__(self, directory=Path("data/general-election/UK/2017/results")): 27 | super(UK2017Results, self).__init__(directory=directory) 28 | self.directory = Path(directory) 29 | self.sources = [ 30 | # url, filename, checksum 31 | ( 32 | "http://researchbriefings.files.parliament.uk/documents/CBP-8647/", 33 | "1918-2017election_results_by_pcon.xlsx", 34 | "a1e4628945574639b541b21bada2531c", 35 | ), 36 | ] 37 | self.target = ("general_election-uk-2017-results.csv", "c7e1fde647e55f9d4567cb81e62c782a") # filename, checksum 38 | self.verbose_name = "UK 2017 General Election results" 39 | self.year = "2017" 40 | -------------------------------------------------------------------------------- /maven/datasets/general_election/uk_2015_results.py: -------------------------------------------------------------------------------- 1 | """ 2 | Results data for the United Kingdom's 2015 General Election. 3 | 4 | Usage: 5 | >>> import maven 6 | >>> maven.get('general-election/UK/2015/results', data_directory='./data/') 7 | 8 | 9 | Sources: 10 | - http://researchbriefings.files.parliament.uk/documents/CBP-8647/1918-2017election_results.csv 11 | - From https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647 12 | 13 | Deprecated sources: 14 | - http://www.electoralcommission.org.uk/__data/assets/file/0004/191650/2015-UK-general-election-data-results-WEB.zip 15 | 16 | Notes: 17 | - 2015-UK-general-election-data-results-WEB.zip has a lot more detailed data. 
18 | """ 19 | 20 | from pathlib import Path 21 | 22 | from maven.datasets.general_election.base import UKResults 23 | 24 | 25 | class UK2015Results(UKResults): 26 | """Handles results data for the United Kingdom's 2015 General Election.""" 27 | 28 | def __init__(self, directory=Path("data/general-election/UK/2015/results")): 29 | super(UK2015Results, self).__init__(directory=directory) 30 | self.directory = Path(directory) 31 | self.sources = [ 32 | # url, filename, checksum 33 | ( 34 | "http://researchbriefings.files.parliament.uk/documents/CBP-8647/", 35 | "1918-2017election_results_by_pcon.xlsx", 36 | "a1e4628945574639b541b21bada2531c", 37 | ), 38 | ] 39 | self.target = ("general_election-uk-2015-results.csv", "9a785cb19275e4dbc79da67eece6067f") # filename, checksum 40 | self.verbose_name = "UK 2015 General Election results" 41 | self.year = "2015" 42 | -------------------------------------------------------------------------------- /maven/datasets/general_election/uk_2010_results.py: -------------------------------------------------------------------------------- 1 | """ 2 | Results data for the United Kingdom's 2010 General Election. 3 | 4 | Usage: 5 | >>> import maven 6 | >>> maven.get('general-election/UK/2010/results', data_directory='./data/') 7 | 8 | 9 | Sources: 10 | - http://researchbriefings.files.parliament.uk/documents/CBP-8647/1918-2017election_results.csv 11 | - From https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647 12 | 13 | Deprecated sources: 14 | - http://www.electoralcommission.org.uk/__data/assets/excel_doc/0003/105726/GE2010-results-flatfile-website.xls 15 | - https://s3-eu-west-1.amazonaws.com/sixfifty/GE2010-results-flatfile-website.xls 16 | 17 | Notes: 18 | - GE2010-results-flatfile-website.xls is currently the only known source with a full list of votes for ALL parties. 19 | """ 20 | 21 | from pathlib import Path 22 | 23 | from maven.datasets.general_election.base import UKResults 24 | 25 | 26 | class UK2010Results(UKResults): 27 | """Handles results data for the United Kingdom's 2010 General Election.""" 28 | 29 | def __init__(self, directory=Path("data/general-election/UK/2010/results")): 30 | super(UK2010Results, self).__init__(directory=directory) 31 | self.directory = Path(directory) 32 | self.sources = [ 33 | # url, filename, checksum 34 | ( 35 | "http://researchbriefings.files.parliament.uk/documents/CBP-8647/", 36 | "1918-2017election_results_by_pcon.xlsx", 37 | "a1e4628945574639b541b21bada2531c", 38 | ), 39 | ] 40 | self.target = ("general_election-uk-2010-results.csv", "954a0916f5ce791ca566484ce566088d") # filename, checksum 41 | self.verbose_name = "UK 2010 General Election results" 42 | self.year = "2010" 43 | -------------------------------------------------------------------------------- /maven/get.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main data getting functionality. Maps data identifiers to data pipeline classes. 3 | 4 | Example usage: 5 | > import maven 6 | > maven.get('general-election/UK/2015/results', data_directory='./data/') 7 | """ 8 | 9 | from pathlib import Path 10 | 11 | from .datasets import coronavirus, general_election 12 | 13 | 14 | def get(name, data_directory=Path("."), retrieve=True, process=True): 15 | """Core data getter function. 16 | 17 | Args: 18 | name (str): Name of dataset to retrieve/process. 19 | data_directory (str or pathlib.PosixPath): Path to directory where datasets will be saved (either as string 20 | a pathlib Path). 
21 | retrieve (bool): Toggle dataset retrieval. 22 | process (bool): Toggle dataset processing. 23 | 24 | Returns: Nothing (datasets are placed into current working directory). 25 | """ 26 | mapper = { 27 | "coronavirus/CSSE": coronavirus.CSSE, 28 | "general-election/UK/2010/results": general_election.UK2010Results, 29 | "general-election/UK/2015/model": general_election.UK2015Model, 30 | "general-election/UK/2015/results": general_election.UK2015Results, 31 | "general-election/UK/2017/model": general_election.UK2017Model, 32 | "general-election/UK/2017/results": general_election.UK2017Results, 33 | # "general-election/UK/2019/model": general_election.UK2019Model, 34 | "general-election/UK/polls": general_election.UKPolls, 35 | } 36 | if name not in mapper: 37 | raise KeyError(f"'{name}' not found in datasets.") 38 | 39 | if isinstance(data_directory, str): 40 | data_directory = Path(data_directory) 41 | pipeline = mapper[name](directory=(data_directory / name)) 42 | 43 | if retrieve: 44 | pipeline.retrieve() 45 | if process: 46 | pipeline.process() 47 | -------------------------------------------------------------------------------- /maven/datasets/general_election/uk_2019_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Model-ready dataset for the United Kingdom's 2019 General Election. 3 | 4 | Usage: 5 | > import maven 6 | > maven.get('general-election/UK/2019/model', data_directory='./data/') 7 | """ 8 | import os 9 | from pathlib import Path 10 | 11 | import pandas as pd 12 | 13 | from maven.datasets.general_election.base import UKModel 14 | 15 | 16 | class UK2019Model(UKModel): 17 | """Generates model-ready data for the United Kingdom's 2019 General Election.""" 18 | 19 | def __init__(self, directory=Path("data/general-election/UK/2019/model")): 20 | super(UK2019Model, self).__init__(directory=directory) # inherit base __init__ but override default directory 21 | self.sources = [ 22 | # tuples of (url, filename, checksum) 23 | ( 24 | "general-election/UK/2017/results", 25 | "general_election-uk-2017-results.csv", 26 | "c7e1fde647e55f9d4567cb81e62c782a", 27 | ), 28 | ("general-election/UK/polls", "general_election-uk-polls.csv", "cbc3c19a376b4ab632f122008f593799"), 29 | ("general-election/UK/polls", "general_election-london-polls.csv", "cd28ebb7233b808796535fc0b572304e"), 30 | ("general-election/UK/polls", "general_election-scotland-polls.csv", "6c2ba92e2325de0e22a208fb0b3e95fc"), 31 | ("general-election/UK/polls", "general_election-wales-polls.csv", "6857df3c18df525d5e59a6bf1170b10c"), 32 | ("general-election/UK/polls", "general_election-ni-polls.csv", "46bbe5e9dc29d4b3042837fe4c16ca07"), 33 | ] 34 | self.retrieve_all = True 35 | self.verbose_name = "UK2019Model" 36 | self.year = 2019 37 | self.last_date = pd.to_datetime("2017-06-08") 38 | self.now_date = pd.to_datetime("2019-12-12") 39 | self.last = self.last_date.year 40 | self.now = self.now_date.year 41 | -------------------------------------------------------------------------------- /maven/datasets/general_election/uk_2015_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Model-ready dataset for the United Kingdom's 2015 General Election. 
3 | 4 | Usage: 5 | > import maven 6 | > maven.get('general-election/UK/2015/model', data_directory='./data/') 7 | """ 8 | import os 9 | from pathlib import Path 10 | 11 | import pandas as pd 12 | 13 | from maven.datasets.general_election.base import UKModel 14 | 15 | 16 | class UK2015Model(UKModel): 17 | """Generates model-ready data for the United Kingdom's 2015 General Election.""" 18 | 19 | def __init__(self, directory=Path("data/general-election/UK/2015/model")): 20 | super(UK2015Model, self).__init__(directory=directory) # inherit base __init__ but override default directory 21 | self.sources = [ 22 | # tuples of (url, filename, checksum) 23 | ( 24 | "general-election/UK/2010/results", 25 | "general_election-uk-2010-results.csv", 26 | "954a0916f5ce791ca566484ce566088d", 27 | ), 28 | ( 29 | "general-election/UK/2015/results", 30 | "general_election-uk-2015-results.csv", 31 | "9a785cb19275e4dbc79da67eece6067f", 32 | ), 33 | ("general-election/UK/polls", "general_election-uk-polls.csv", "cbc3c19a376b4ab632f122008f593799"), 34 | ("general-election/UK/polls", "general_election-london-polls.csv", "cd28ebb7233b808796535fc0b572304e"), 35 | ("general-election/UK/polls", "general_election-scotland-polls.csv", "6c2ba92e2325de0e22a208fb0b3e95fc"), 36 | ("general-election/UK/polls", "general_election-wales-polls.csv", "6857df3c18df525d5e59a6bf1170b10c"), 37 | ("general-election/UK/polls", "general_election-ni-polls.csv", "46bbe5e9dc29d4b3042837fe4c16ca07"), 38 | ] 39 | self.retrieve_all = True 40 | self.verbose_name = "UK2015Model" 41 | self.year = 2015 42 | self.last_date = pd.to_datetime("2010-05-06") 43 | self.now_date = pd.to_datetime("2015-05-07") 44 | self.last = self.last_date.year 45 | self.now = self.now_date.year 46 | -------------------------------------------------------------------------------- /maven/datasets/general_election/uk_2017_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | Model-ready dataset for the United Kingdom's 2017 General Election. 
3 | 4 | Usage: 5 | > import maven 6 | > maven.get('general-election/UK/2017/model', data_directory='./data/') 7 | """ 8 | import os 9 | from pathlib import Path 10 | 11 | import pandas as pd 12 | 13 | from maven.datasets.general_election.base import UKModel 14 | 15 | 16 | class UK2017Model(UKModel): 17 | """Generates model-ready data for the United Kingdom's 2017 General Election.""" 18 | 19 | def __init__(self, directory=Path("data/general-election/UK/2017/model")): 20 | super(UK2017Model, self).__init__(directory=directory) # inherit base __init__ but override default directory 21 | self.sources = [ 22 | # tuples of (url, filename, checksum) 23 | ( 24 | "general-election/UK/2015/results", 25 | "general_election-uk-2015-results.csv", 26 | "9a785cb19275e4dbc79da67eece6067f", 27 | ), 28 | ( 29 | "general-election/UK/2017/results", 30 | "general_election-uk-2017-results.csv", 31 | "c7e1fde647e55f9d4567cb81e62c782a", 32 | ), 33 | ("general-election/UK/polls", "general_election-uk-polls.csv", "cbc3c19a376b4ab632f122008f593799"), 34 | ("general-election/UK/polls", "general_election-london-polls.csv", "cd28ebb7233b808796535fc0b572304e"), 35 | ("general-election/UK/polls", "general_election-scotland-polls.csv", "6c2ba92e2325de0e22a208fb0b3e95fc"), 36 | ("general-election/UK/polls", "general_election-wales-polls.csv", "6857df3c18df525d5e59a6bf1170b10c"), 37 | ("general-election/UK/polls", "general_election-ni-polls.csv", "46bbe5e9dc29d4b3042837fe4c16ca07"), 38 | ] 39 | self.retrieve_all = True 40 | self.verbose_name = "UK2017Model" 41 | self.year = 2017 42 | self.last_date = pd.to_datetime("2015-05-07") 43 | self.now_date = pd.to_datetime("2017-06-08") 44 | self.last = self.last_date.year 45 | self.now = self.now_date.year 46 | -------------------------------------------------------------------------------- /tests/datasets/general_election/test_uk_models.py: -------------------------------------------------------------------------------- 1 | """ 2 | Running tests in development: 3 | $ cd /path/to/repo 4 | $ python -m pytest ./tests/datasets/general_election/test_uk_models.py 5 | 6 | Running tests against installed version (either `pip install .` or `pip install maven`): 7 | $ cd /path/to/repo 8 | $ pytest ./tests/datasets/general_election/test_uk_models.py 9 | """ 10 | 11 | from pathlib import Path 12 | 13 | import pandas as pd 14 | 15 | import maven 16 | 17 | 18 | def check_uk_model_output(identifier, output_file): 19 | maven.get(identifier, data_directory="./data/") 20 | df = pd.read_csv(Path("./data") / identifier / "processed" / output_file) 21 | geo_columns = [] 22 | target_columns = [] 23 | if "geo_polls_now" in df.columns: 24 | geo_columns += [ 25 | "geo_polls_now", 26 | "geo_voteshare_last", 27 | "geo_swing", 28 | "geo_swing_forecast", 29 | "geo_swing_winner", 30 | ] 31 | if "total_votes_now" in df.columns: 32 | target_columns += [ 33 | "total_votes_now", 34 | "turnout_now", 35 | "votes_now", 36 | "voteshare_now", 37 | "winner_now", 38 | ] 39 | column_list = ( 40 | [ 41 | "ons_id", 42 | "constituency", 43 | "county", 44 | "region", 45 | "geo", 46 | "country", 47 | "electorate", 48 | "total_votes_last", 49 | "turnout_last", 50 | "party", 51 | "votes_last", 52 | "voteshare_last", 53 | "winner_last", 54 | "won_here_last", 55 | "national_voteshare_last", 56 | "national_polls_now", 57 | "national_swing", 58 | "national_swing_forecast", 59 | "national_swing_winner", 60 | ] 61 | + geo_columns 62 | + target_columns 63 | ) 64 | assert df.shape == (7800, len(column_list)) 65 | assert 
df.columns.tolist() == column_list 66 | 67 | 68 | # TODO: Can't find general_election-london-polls.csv 69 | # def test_uk_2015_model(): 70 | # check_uk_model_output( 71 | # identifier="general-election/UK/2015/model", 72 | # output_file="general_election-uk-2015-model.csv", 73 | # ) 74 | 75 | 76 | # TODO: Can't find general_election-london-polls.csv 77 | # def test_uk_2017_model(): 78 | # check_uk_model_output( 79 | # identifier="general-election/UK/2017/model", 80 | # output_file="general_election-uk-2017-model.csv", 81 | # ) 82 | 83 | 84 | # TODO: Disable for now, investigate later 85 | # def test_uk_2019_model(): 86 | # check_uk_model_output( 87 | # identifier="general-election/UK/2019/model", 88 | # output_file="general_election-uk-2019-model.csv", 89 | # ) 90 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Running tests in development: 3 | $ cd /path/to/repo 4 | $ python -m pytest 5 | 6 | Running tests against installed version (either `pip install .` or `pip install maven`): 7 | $ cd /path/to/repo 8 | $ pytest 9 | """ 10 | import os 11 | from functools import partial 12 | from pathlib import Path 13 | 14 | import requests 15 | 16 | import pytest 17 | from maven import utils 18 | 19 | 20 | class MockResponse: 21 | """requests.get() returns an object of class Response. Let's mock that and add: 22 | - status_code attribute 23 | - content attribute 24 | """ 25 | 26 | status_code = 200 27 | content = b"some content" 28 | 29 | 30 | def test_sanitise(): 31 | assert utils.sanitise("Vote Count") == "vote_count" 32 | 33 | 34 | def test_calculate_md5_checksum(tmpdir): 35 | filepath = tmpdir / "file.txt" 36 | with open(filepath, "w") as f: 37 | f.write("some content") 38 | assert utils.calculate_md5_checksum(filename=filepath) == "9893532233caff98cd083a116b013c0b" 39 | 40 | 41 | def test_fetch_url(monkeypatch, tmpdir): 42 | """Ref: https://docs.pytest.org/en/latest/monkeypatch.html""" 43 | 44 | def mock_get(*args, **kwargs): 45 | return MockResponse() 46 | 47 | monkeypatch.setattr(requests, "get", mock_get) # replace requests.get() with our mock_get() 48 | utils.fetch_url(url="https://fakeurl", filename="fakefile.txt", target_dir=Path(tmpdir)) 49 | with open(tmpdir / "fakefile.txt", "rb") as f: 50 | assert f.read() == b"some content" 51 | 52 | 53 | def test_retrieve_from_cache_if_exists(tmpdir): 54 | def _create_file(target_dir): 55 | """Puts file.txt in the target_dir""" 56 | with open(target_dir / "file.txt", "w") as f: 57 | f.write("some content") 58 | 59 | # Put it there for now. 
60 | _create_file(target_dir=tmpdir) 61 | 62 | # Test basic usage 63 | utils.retrieve_from_cache_if_exists( 64 | filename="file.txt", 65 | target_dir=Path(tmpdir), 66 | processing_fn=None, 67 | md5_checksum=None, 68 | caching_enabled=True, 69 | verbose=False, 70 | ) 71 | # Test incorrect MD5 72 | with pytest.warns(UserWarning): 73 | utils.retrieve_from_cache_if_exists( 74 | filename="file.txt", 75 | target_dir=Path(tmpdir), 76 | processing_fn=None, 77 | md5_checksum="badchecksum", 78 | caching_enabled=True, 79 | verbose=True, 80 | ) 81 | # Remove file & put it there via processing_fn 82 | os.remove(tmpdir / "file.txt") 83 | utils.retrieve_from_cache_if_exists( 84 | filename="file.txt", 85 | target_dir=Path(tmpdir), 86 | processing_fn=partial(_create_file, target_dir=tmpdir), 87 | md5_checksum=None, 88 | caching_enabled=True, 89 | verbose=True, 90 | ) 91 | -------------------------------------------------------------------------------- /maven/datasets/coronavirus/csse.py: -------------------------------------------------------------------------------- 1 | """ 2 | Coronavirus CSSE data from https://github.com/CSSEGISandData/COVID-19/ 3 | 4 | Usage: 5 | >>> import maven 6 | >>> maven.get('coronavirus/CSSE', data_directory='./data/') 7 | 8 | 9 | Sources: 10 | - https://github.com/CSSEGISandData/COVID-19/ 11 | """ 12 | import os 13 | from pathlib import Path 14 | 15 | import pandas as pd 16 | 17 | from maven import utils 18 | 19 | 20 | class CSSE(utils.Pipeline): 21 | """Handle CSSE data from https://github.com/CSSEGISandData/COVID-19/""" 22 | 23 | def __init__(self, directory=Path("data/coronavirus/CSSE")): 24 | # inherit base __init__ but override default directory 25 | super(CSSE, self).__init__(directory=directory) 26 | # Source & targets 27 | base_url = ( 28 | "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/" 29 | "csse_covid_19_data/csse_covid_19_time_series/" 30 | ) 31 | self.sources = [ 32 | # url, filename, checksum 33 | (base_url, "time_series_19-covid-Confirmed.csv", "09b6dfc1ee244ba652b8639f0aa2f093"), 34 | (base_url, "time_series_19-covid-Deaths.csv", "69a9dfa8a901c8f0bbe0f6499db8641c"), 35 | (base_url, "time_series_19-covid-Recovered.csv", "4d1c1d4f1c45514e3562cb42ef2729c7"), 36 | ] 37 | self.targets = [ 38 | # filename, checksum( 39 | ("CSSE_country_province.csv", "bfce6bf16571fbb3004f9e5eee7b9e30"), 40 | ("CSSE_country.csv", "b5b3ed6fc75f323593fd7710a4262e1b"), 41 | ] 42 | # Config 43 | self.rename_source = False 44 | self.retrieve_all = True 45 | self.cache = True 46 | self.verbose = False 47 | self.verbose_name = "CSSE" 48 | 49 | def process(self): 50 | """Process CSSE data.""" 51 | target_dir = self.directory / "processed" 52 | os.makedirs(target_dir, exist_ok=True) # create directory if it doesn't exist 53 | 54 | def process_and_export(): 55 | """Either caching disabled or file not yet processed; process regardless.""" 56 | data = {} 57 | for metric in ["Confirmed", "Deaths", "Recovered"]: 58 | df = pd.read_csv(self.directory / "raw" / f"time_series_19-covid-{metric}.csv") 59 | # Pivot all to long 60 | id_vars = ["Province/State", "Country/Region", "Lat", "Long"] 61 | value_vars = list(set(df.columns) - set(id_vars)) 62 | df = df.melt( 63 | id_vars=id_vars, value_vars=value_vars, var_name="date", value_name=metric 64 | ) 65 | df["date"] = pd.to_datetime(df.date, format="%m/%d/%y") 66 | data[metric] = df.copy() 67 | 68 | # Merge together 69 | df_country_province = pd.merge( 70 | data["Confirmed"], 71 | data["Deaths"], 72 | how="outer", 73 | 
on=["Province/State", "Country/Region", "Lat", "Long", "date"], 74 | ).merge( 75 | data["Recovered"], 76 | how="outer", 77 | on=["Province/State", "Country/Region", "Lat", "Long", "date"], 78 | ) 79 | 80 | # Clean 81 | df_country_province.columns = utils.sanitise( 82 | df_country_province.columns, replace={"long": "lon"} 83 | ) 84 | df_country_province = df_country_province[ 85 | [ 86 | "date", 87 | "country_region", 88 | "province_state", 89 | "lat", 90 | "lon", 91 | "confirmed", 92 | "deaths", 93 | "recovered", 94 | ] 95 | ].sort_values(["date", "country_region", "province_state"]) 96 | 97 | # Country-level data 98 | df_country = ( 99 | df_country_province.groupby(["date", "country_region"])[ 100 | ["confirmed", "deaths", "recovered"] 101 | ] 102 | .sum() 103 | .reset_index() 104 | ) 105 | 106 | # Export 107 | print(f"Exporting dataset to {target_dir.resolve()}") 108 | df_country_province.to_csv(target_dir / "CSSE_country_province.csv", index=False) 109 | df_country.to_csv(target_dir / "CSSE_country.csv", index=False) 110 | 111 | for filename, checksum in self.targets: 112 | utils.retrieve_from_cache_if_exists( 113 | filename=filename, 114 | target_dir=target_dir, 115 | processing_fn=process_and_export, 116 | md5_checksum=checksum, 117 | caching_enabled=self.cache, 118 | verbose=self.verbose, 119 | ) 120 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Maven 2 | > /meɪvən/ – a trusted expert who seeks to pass timely and relevant knowledge on to others. 3 | 4 | Maven's goal is to reduce the time data scientists spend on data cleaning and preparation by providing easy access to open datasets in both raw and processed formats. 5 | 6 | Maven was built to: 7 | 8 | - **Improve availability and integrity of open data** by eliminating data issues, adding common identifiers, and reshaping data to become model-ready. 9 | - **Source data in its rawest form** from the most authoritative data provider available with all transformations available as open source code to enhance integrity and trust. 10 | - **Honour data licences wherever possible** whilst avoiding potential issues relating to re-distribution of data (especially open datasets where no clear licence is provided) by performing all data retrieval and processing on-device. 11 | 12 | 13 | ## Install 14 | ``` 15 | pip install maven 16 | ``` 17 | 18 | 19 | ## Usage 20 | ```python 21 | import maven 22 | maven.get('general-election/UK/2017/results', data_directory='./data/') 23 | ``` 24 | 25 | 26 | ## Datasets 27 | Data dictionaries for all datasets are available by clicking on the dataset's name. 
28 | 29 | | Dataset | Description | Date | Source | Licence | 30 | | -- | -- | -- | -- | -- | 31 | | **Coronavirus Datasets** | 32 | | [**`coronavirus/CSSE`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/coronavirus) | Daily CSSE cases/deaths/recovered by country/region/state | Updated daily | [Johns Hopkins Center for Systems Science and Engineering](https://github.com/CSSEGISandData/COVID-19/) | [See "Terms of Use" on CSSE repo](https://github.com/CSSEGISandData/COVID-19/) | 33 | | **UK Political Datasets** | 34 | | [**`general-election/UK/2010/results`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/general_election) | UK 2010 General Election results | 6th May 2010 | [House of Commons Library](https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647) | [Open Parliament Licence v3.0](https://www.parliament.uk/site-information/copyright-parliament/open-parliament-licence/) | 35 | | [**`general-election/UK/2015/results`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/general_election) | UK 2015 General Election results | 7th May 2015 | [House of Commons Library](https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647) | [Open Parliament Licence v3.0](https://www.parliament.uk/site-information/copyright-parliament/open-parliament-licence/) | 36 | | [**`general-election/UK/2017/results`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/general_election) | UK 2017 General Election results | 8th June 2017 | [House of Commons Library](https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647) | [Open Parliament Licence v3.0](https://www.parliament.uk/site-information/copyright-parliament/open-parliament-licence/) | 37 | | [**`general-election/UK/2015/model`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/general_election) | Model-ready datasets for forecasting the 2015 UK General Election | 2010 & 2015 data | [uk_2015_model.py](https://github.com/john-sandall/maven/blob/master/maven/datasets/general_election/uk_2015_model.py) | Mixed | 38 | | [**`general-election/UK/2017/model`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/general_election) | Model-ready datasets for forecasting the 2017 UK General Election | 2015 & 2017 data | [uk_2017_model.py](https://github.com/john-sandall/maven/blob/master/maven/datasets/general_election/uk_2017_model.py) | Mixed | 39 | | [**`general-election/UK/polls`**](https://github.com/john-sandall/maven/tree/master/maven/datasets/general_election) | UK General Election opinion polling | May 2005 - June 2017 | [SixFifty](https://github.com/six50/pipeline/tree/master/data/polls/) | Unknown | 40 | 41 | 42 | 43 | ## Running tests 44 | To run tests against an installed version (either `pip install .` or `pip install maven`): 45 | ``` 46 | $ cd /path/to/repo 47 | $ pytest 48 | ``` 49 | 50 | To run tests whilst in development: 51 | ``` 52 | $ cd /path/to/repo 53 | $ python -m pytest 54 | ``` 55 | 56 | 57 | ## Licences 58 | | Name | Description | Attribution Statement | 59 | | -- | -- | -- | 60 | | [Open Parliament Licence](http://www.parliament.uk/site-information/copyright/open-parliament-licence/) | Free to copy, publish, distribute, transmit, adapt and exploit commercially or non-commercially. See URL for full details. | Contains Parliamentary information licensed under the Open Parliament Licence v3.0. 
| 61 | | [Open Government Licence](http://www.nationalarchives.gov.uk/doc/open-government-licence/version/2/) | Free to copy, publish, distribute, transmit, adapt and exploit commercially and non-commercially. See URL for full details. | Contains public sector information licensed under the Open Government Licence v2.0. | 62 | 63 | 64 | ## Contributing 65 | Maven was designed for your contributions! 66 | 67 | 1. Check for open issues or open a fresh issue to start a discussion around your idea or a bug. 68 | 2. Fork [the repository](https://github.com/john-sandall/maven) on GitHub to start making your changes to the master branch (or branch off of it). 69 | 3. For new datasets ensure the processed dataset is fully documented with a data dictionary. For new features and bugs, please write a test which shows that the bug was fixed or that the feature works as expected. 70 | 4. Send a [pull request](https://help.github.com/en/articles/creating-a-pull-request-from-a-fork) and bug the maintainer until it gets merged and published. 😄 71 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | 8 | ## [Unreleased] 9 | 10 | ## [0.1.0] - 2020-02-03 11 | ### Changed 12 | - Model-ready datasets can now be "prediction-only" (i.e. for use pre-election when we don't know results). 13 | - Model-ready datasets include UKIP and BXP as part of "Other" until a better solution can be found. 14 | - Various changes to enable a better regional UNS forecast: 15 | - better handling of NI parties; 16 | - regional poll-of-polls goes back a month to incorporate large sample regional polling and not just sub-samples; 17 | - MRP sample sizes are disregarded for weighted poll-of-polls; 18 | - missing sample sizes (such as for polls derived from PollBase) are imputed using mean sample size within the same region; 19 | ### Added 20 | - Merged SixFifty UK polling data (detailed inc. sample sizes) up to June 2017 with Mark Pack's PollBase which has less columns but all polls up to Dec 2019. 21 | - Incorporated regional polling & regional sub-samples for December 2019 from SixFifty. 22 | - `general-election/UK/2019/model`: added model-ready dataset including UNS and regional UNS forecasts for the 2019 UK General Election. 23 | 24 | ## [0.0.12] - 2020-02-03 25 | ### Changed 26 | - `general-election/UK/2015/model`: model-ready dataset for just the 2015 UK General Election. 27 | ### Added 28 | - `general-election/UK/2017/model`: model-ready dataset for the 2017 UK General Election. 29 | 30 | ## [0.0.11] - 2020-02-02 31 | ### Added 32 | - Updated & refactored polling pipeline code. 33 | - Updated & refactored pipeline for building model-ready datasets for 2015/2017 UK general elections. 34 | 35 | ## [0.0.10] - 2020-01-26 36 | ### Added 37 | - Raw datasets are now cached on download, and processed datasets cached after processing, and always checked against MD5 for integrity. 38 | - Tests now exist for utils.py 39 | 40 | ## [0.0.9] - 2020-01-26 41 | ### Added 42 | - UK 2017 General Election dataset (**`general-election/UK/2017/results`**). 43 | - Some tests (that really need caching!). 
44 | ### Changed 45 | - Now using data from the [House of Commons Library](https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647). 46 | - The basic processed election results are now "long form" with less but more standardised information. 47 | - The full election results are (for now) no longer provided. 48 | - Lots of refactoring with some new base classes & utils making it faster to add new datasets. 49 | 50 | ## [0.0.8] - 2019-11-14 51 | ### Fixes 52 | - Electoral Commission [no longer hosts 2010 GE results](https://github.com/john-sandall/maven/pull/15) so use our fallback until a new primary can be found. 53 | - Fixed URL to EC's 2015 GE results. 54 | 55 | ## [0.0.7] - 2019-11-14 56 | ### Added 57 | - Tests added to setup.py. 58 | ### Changed 59 | - Switched to using [pip-tools](https://github.com/jazzband/pip-tools) instead of Pipenv for generating requirements.txt & locking dependencies. 60 | 61 | ## [0.0.6] - 2019-07-13 62 | ### Added 63 | - `general-election/UK/2015/model`: model-ready datasets for the 2015/2017 UK General Elections. 64 | - Regional polling datasets. 65 | 66 | ## [0.0.5] - 2019-07-13 67 | ### Added 68 | - Basic tests for `get.py` 69 | - Additional processing for the GE2015 results pipeline to generate a more useful dataset for common election modelling tasks. 70 | - Added `general-election/UK/2010/results` dataset. 71 | ### Changed 72 | - API design for dataset identifiers to use dash/slash instead of underscore/dash and capitalised country codes to make it clearer these will be ISO 3166 Alpha-2 codes, e.g. `general_election-gb-2015-results` -> `general-election/GB/2015/results`. 73 | - Changed GB to UK everywhere as these results are full UK results including Northern Ireland. 74 | 75 | ## [0.0.4] - 2019-07-07 76 | ### Fixes 77 | - Fixed relative imports and switch to using a class for each dataset. 78 | 79 | ## [0.0.3] - 2019-07-07 80 | ### Added 81 | - Improved README. 82 | 83 | ## [0.0.2] - 2019-07-07 84 | ### Added 85 | - UK 2015 General Election dataset (**`general_election-gb-2015-results`**). 86 | - Proper README plus data dictionary. 87 | - MANIFEST.in plus additional packaging info and this changelog. 
88 | 89 | ## [0.0.1] - 2019-07-07 90 | ### Added 91 | - Barebones functionality, Python package requirements (setup.py, Pipfile, .gitignore, LICENSE) 92 | 93 | 94 | [Unreleased]: https://github.com/john-sandall/maven/compare/v0.1.0...HEAD 95 | [0.1.0]: https://github.com/john-sandall/maven/compare/v0.0.12...v0.1.0 96 | [0.0.12]: https://github.com/john-sandall/maven/compare/v0.0.11...v0.0.12 97 | [0.0.11]: https://github.com/john-sandall/maven/compare/v0.0.10...v0.0.11 98 | [0.0.10]: https://github.com/john-sandall/maven/compare/v0.0.9...v0.0.10 99 | [0.0.9]: https://github.com/john-sandall/maven/compare/v0.0.8...v0.0.9 100 | [0.0.8]: https://github.com/john-sandall/maven/compare/v0.0.7...v0.0.8 101 | [0.0.7]: https://github.com/john-sandall/maven/compare/v0.0.6...v0.0.7 102 | [0.0.6]: https://github.com/john-sandall/maven/compare/v0.0.5...v0.0.6 103 | [0.0.5]: https://github.com/john-sandall/maven/compare/v0.0.4...v0.0.5 104 | [0.0.4]: https://github.com/john-sandall/maven/compare/v0.0.3...v0.0.4 105 | [0.0.3]: https://github.com/john-sandall/maven/compare/v0.0.2...v0.0.3 106 | [0.0.2]: https://github.com/john-sandall/maven/compare/v0.0.1...v0.0.2 107 | [0.0.1]: https://github.com/john-sandall/maven/releases/tag/v0.0.1 108 | -------------------------------------------------------------------------------- /maven/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various helper functions. 3 | """ 4 | import hashlib 5 | import os 6 | import shutil 7 | import warnings 8 | from functools import partial 9 | from pathlib import Path 10 | from urllib.parse import urlparse 11 | 12 | import pandas as pd 13 | import requests 14 | 15 | import maven 16 | 17 | ######### 18 | # GENERAL 19 | ######### 20 | 21 | 22 | def sanitise(x, replace=None): 23 | if isinstance(x, str): 24 | out = x.lower().replace(" ", "_").replace("/", "_") 25 | if replace and out in replace: 26 | out = replace[out] 27 | return out 28 | elif isinstance(x, (list, pd.core.indexes.base.Index, pd.core.series.Series)): 29 | return [sanitise(element, replace=replace) for element in x] 30 | else: 31 | raise TypeError(f"Unexpected type encountered in sanitise: type(x) == '{type(x)}'") 32 | 33 | 34 | def calculate_md5_checksum(filename): 35 | """ 36 | Calculate the checksum of the file, exactly same as md5-sum linux util. 
37 | Code from https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/downloader.py 38 | """ 39 | hash_md5 = hashlib.md5() 40 | with open(filename, "rb") as f: 41 | for chunk in iter(lambda: f.read(4096), b""): 42 | hash_md5.update(chunk) 43 | return hash_md5.hexdigest() 44 | 45 | 46 | def is_url(url): 47 | """Source: https://stackoverflow.com/a/52455972""" 48 | try: 49 | result = urlparse(url) 50 | return all([result.scheme, result.netloc]) 51 | except ValueError: 52 | return False 53 | 54 | 55 | def fetch_url(url, filename, target_dir, rename_file=False): 56 | """Download filename from url into target_dir.""" 57 | if rename_file: 58 | url_to_retrieve = url 59 | else: 60 | url_to_retrieve = url + filename 61 | response = requests.get(url_to_retrieve) 62 | if response.status_code != 200: 63 | warnings.warn( 64 | f"Received status {response.status_code} when trying to retrieve {url}{filename}" 65 | ) 66 | # Save to file 67 | with open(target_dir / filename, "wb") as f: 68 | f.write(response.content) 69 | print(f"Successfully downloaded {filename} into {target_dir.resolve()}") 70 | return target_dir / filename 71 | 72 | 73 | def get_and_copy(identifier, filename, target_dir): 74 | """Run maven.get(identifier) and copy filename from identifier/processed/ data 75 | into target/ directory. 76 | """ 77 | # target_dir by default is data/general-election/UK/2015/model 78 | subdirectories_below = str(target_dir).count("/") 79 | go_up = "/".join([".." for _ in range(subdirectories_below)]) 80 | data_directory = (target_dir / go_up).resolve() # sensible guess? 81 | maven.get(identifier, data_directory=data_directory) 82 | source = data_directory / identifier / "processed" 83 | print(f"Copying {filename} from {source} -> {target_dir}.") 84 | shutil.copyfile(src=source / filename, dst=target_dir / filename) 85 | 86 | 87 | def retrieve_from_cache_if_exists( 88 | filename, target_dir, processing_fn, md5_checksum=None, caching_enabled=True, verbose=False 89 | ): 90 | """Retrieve filename from target_dir if it exists, otherwise execute processing_fn. 91 | 92 | Raises a warning if the retrieved/processed file's checksum doesn't match the expected MD5. 93 | """ 94 | if caching_enabled and (target_dir / filename).exists(): 95 | # Check if it's already in target_dir. 96 | print(f"Cached file {filename} is already in {target_dir.resolve()}") 97 | else: 98 | # Either caching disabled or file not there yet. 99 | processing_fn() 100 | 101 | # File should now be there. Let's check checksums. 102 | downloaded_file_md5_checksum = calculate_md5_checksum(target_dir / filename) 103 | if verbose: 104 | print(f"Checksum for {filename}: {downloaded_file_md5_checksum}") 105 | if md5_checksum and downloaded_file_md5_checksum != md5_checksum: 106 | warnings.warn(f"MD5 checksum doesn't match for {filename}") 107 | 108 | 109 | ################## 110 | # PIPELINE CLASSES 111 | ################## 112 | 113 | 114 | class Pipeline: 115 | """Generic class for retrieving & processing datasets with built-in caching & MD5 checking.""" 116 | 117 | def __init__(self, directory): 118 | self.directory = Path(directory) 119 | self.sources = [] # tuples of (url, filename, checksum) 120 | self.rename_source = False 121 | self.retrieve_all = False 122 | self.target = (None, None) 123 | self.verbose_name = "" 124 | self.year = None 125 | self.verbose = False 126 | self.cache = True 127 | 128 | def retrieve(self): 129 | """ 130 | Retrieve data from self.sources into self.directory / 'raw' and validate against checksum. 
131 | """ 132 | target_dir = self.directory / "raw" 133 | os.makedirs(target_dir, exist_ok=True) # create directory if it doesn't exist 134 | for url, filename, md5_checksum in self.sources: 135 | if is_url(url): 136 | processing_fn = partial( 137 | fetch_url, 138 | url=url, 139 | filename=filename, 140 | target_dir=target_dir, 141 | rename_file=self.rename_source, 142 | ) 143 | else: 144 | processing_fn = partial( 145 | get_and_copy, identifier=url, filename=filename, target_dir=target_dir 146 | ) 147 | retrieve_from_cache_if_exists( 148 | filename=filename, 149 | target_dir=target_dir, 150 | processing_fn=processing_fn, 151 | md5_checksum=md5_checksum, 152 | caching_enabled=self.cache, 153 | verbose=self.verbose, 154 | ) 155 | if not self.retrieve_all: # retrieve just the first dataset 156 | return 157 | if self.retrieve_all: # all datasets retrieved 158 | return 159 | else: # retrieving first dataset only but all fallbacks failed 160 | raise RuntimeError(f"Unable to download {self.verbose_name} data.") 161 | 162 | def process(self): 163 | pass 164 | -------------------------------------------------------------------------------- /maven/datasets/general_election/uk_polls.py: -------------------------------------------------------------------------------- 1 | """ 2 | General Election polling data for the United Kingdom. 3 | 4 | Usage: 5 | > import maven 6 | > maven.get('general-election/UK/polls', data_directory='./data/') 7 | 8 | Sources: 9 | - SixFifty polling data: https://github.com/six50/pipeline/tree/master/data/polls/ 10 | - https://s3-eu-west-1.amazonaws.com/sixfifty/polls.csv 11 | - https://s3-eu-west-1.amazonaws.com/sixfifty/polls_london.csv 12 | - https://s3-eu-west-1.amazonaws.com/sixfifty/polls_scotland.csv 13 | - https://s3-eu-west-1.amazonaws.com/sixfifty/polls_wales.csv 14 | - https://s3-eu-west-1.amazonaws.com/sixfifty/polls_ni.csv 15 | - PollBase: https://www.markpack.org.uk/opinion-polls/ 16 | """ 17 | import os 18 | from pathlib import Path 19 | 20 | import numpy as np 21 | import pandas as pd 22 | 23 | from maven import utils 24 | from maven.datasets.general_election.base import Pipeline 25 | 26 | 27 | class UKPolls(Pipeline): 28 | """Handles General Election polling data for the United Kingdom. 
29 | 30 | Mark Pack's PollBase : https://www.markpack.org.uk/opinion-polls/ 31 | """ 32 | 33 | def __init__(self, directory=Path("data/general-election/UK/polls")): 34 | super(UKPolls, self).__init__( 35 | directory=directory 36 | ) # inherit base __init__ but override default directory 37 | self.sources = [ 38 | # tuples of (url, filename, checksum) 39 | ( 40 | "https://3859gp38qzh51h504x6gvv0o-wpengine.netdna-ssl.com/files/2020/01/", 41 | "PollBase-Q4-2019.xlsx", 42 | "81e9dd972f17d0b4f572e7da6c4c497f", 43 | ), 44 | ( 45 | "https://s3-eu-west-1.amazonaws.com/sixfifty/", 46 | "polls.csv", 47 | "8c32b623346c8c0faa603bc76c4d7fd1", 48 | ), 49 | ( 50 | "https://s3-eu-west-1.amazonaws.com/sixfifty/", 51 | "polls_london.csv", 52 | "cd28ebb7233b808796535fc0b572304e", 53 | ), 54 | ( 55 | "https://s3-eu-west-1.amazonaws.com/sixfifty/", 56 | "polls_scotland.csv", 57 | "6c2ba92e2325de0e22a208fb0b3e95fc", 58 | ), 59 | ( 60 | "https://s3-eu-west-1.amazonaws.com/sixfifty/", 61 | "polls_wales.csv", 62 | "6857df3c18df525d5e59a6bf1170b10c", 63 | ), 64 | ( 65 | "https://s3-eu-west-1.amazonaws.com/sixfifty/", 66 | "polls_ni.csv", 67 | "46bbe5e9dc29d4b3042837fe4c16ca07", 68 | ), 69 | ] 70 | self.retrieve_all = True 71 | self.target = ( 72 | "general_election-uk-polls.csv", 73 | "cbc3c19a376b4ab632f122008f593799", 74 | ) # filename, checksum 75 | self.verbose_name = "UKPolls" 76 | 77 | def process(self): 78 | """Process UK polling data.""" 79 | filename = self.sources[0][1] 80 | processed_results_location = self.directory / "processed" / self.target[0] 81 | os.makedirs( 82 | self.directory / "processed", exist_ok=True 83 | ) # create directory if it doesn't exist 84 | 85 | def process_and_export(): 86 | # Read in PollBase 87 | df = pd.read_excel( 88 | self.directory / "raw" / filename, 89 | sheet_name="17-19", 90 | usecols="A:C,G:H,I,K,M,O,Q,S,U,Y", 91 | ) 92 | 93 | # Clean it up 94 | df.columns = utils.sanitise( 95 | df.columns, 96 | replace={ 97 | "polling": "company", 98 | "publisher": "client", 99 | "unnamed:_24": "method", 100 | "green": "grn", 101 | "tig_cuk": "chuk", 102 | }, 103 | ) 104 | df["year"] = df.year.replace({"?": 2019}).ffill().astype(int) 105 | df["month"] = df.month.ffill() 106 | df = df[df["fieldwork"].notnull()].copy() 107 | df["day_from"] = df.fieldwork.apply( 108 | lambda x: str(x).split("-")[0].replace("?", "") 109 | if "-" in str(x) 110 | else str(x).replace("?", "") 111 | ) 112 | df["day_to"] = df.fieldwork.apply( 113 | lambda x: str(x).split("-")[1].replace("?", "") 114 | if "-" in str(x) 115 | else str(x).replace("?", "") 116 | ) 117 | df["from"] = pd.to_datetime( 118 | df.apply(lambda row: f"{row.year}-{row.month}-{row.day_from}", axis=1) 119 | ) 120 | df["to"] = pd.to_datetime( 121 | df.apply(lambda row: f"{row.year}-{row.month}-{row.day_to}", axis=1) 122 | ) 123 | 124 | # Fix month & year in df['to'] where e.g. 
fieldwork is "30-3 Jan" 125 | month_shifted = ( 126 | df.year.astype(str) 127 | + "-" 128 | + ((df.to.dt.month + 1) % 12).astype(str).replace("0", "12") 129 | + "-" 130 | + df.day_to.astype(str) 131 | ) 132 | year_needs_shifting = month_shifted.apply(lambda x: str(x).split("-")[1]) == "1" 133 | month_shifted.loc[year_needs_shifting] = ( 134 | ((df.loc[year_needs_shifting, "year"]).astype(int) + 1) 135 | .astype(str) 136 | .replace("0", "12") 137 | + "-" 138 | + ((df.to.dt.month + 1) % 12).astype(str) 139 | + "-" 140 | + df.day_to.astype(str) 141 | ) 142 | df.loc[df["from"] > df["to"], "to"] = month_shifted.loc[df["from"] > df["to"]] 143 | df["to"] = pd.to_datetime(df.to) 144 | 145 | # Divide numbers by 100 146 | for party in ["con", "lab", "ld", "ukip", "grn", "chuk", "bxp"]: 147 | df[party] = df[party].replace(" ", np.nan).astype(float) / 100 148 | 149 | # Prepare for merge with SixFifty data 150 | df["sample_size"] = np.nan 151 | df["snp"] = np.nan 152 | df["pdf"] = np.nan 153 | columns = [ 154 | "company", 155 | "client", 156 | "method", 157 | "from", 158 | "to", 159 | "sample_size", 160 | "con", 161 | "lab", 162 | "ld", 163 | "ukip", 164 | "grn", 165 | "chuk", 166 | "bxp", 167 | "snp", 168 | "pdf", 169 | ] 170 | df = df[columns].copy().sort_values("to") 171 | 172 | # Read in SixFifty polling data (2005 -> June 2017) 173 | df_sixfifty = pd.read_csv( 174 | self.directory / "raw" / "polls.csv", parse_dates=["from", "to"] 175 | ) 176 | df_sixfifty["chuk"] = np.nan 177 | df_sixfifty["bxp"] = np.nan 178 | df_sixfifty = df_sixfifty[columns].copy().sort_values("to") 179 | 180 | # Merge 181 | df_sixfifty = df_sixfifty[df_sixfifty.to < df.to.min()].copy() 182 | assert df_sixfifty.to.max() < df.to.min() 183 | df_polls = pd.concat([df_sixfifty, df], axis=0) 184 | 185 | # Export 186 | print(f"Exporting dataset to {processed_results_location.resolve()}") 187 | df_polls.to_csv(processed_results_location, index=False) 188 | 189 | utils.retrieve_from_cache_if_exists( 190 | filename=self.target[0], 191 | target_dir=(self.directory / "processed"), 192 | processing_fn=process_and_export, 193 | md5_checksum=self.target[1], 194 | caching_enabled=self.cache, 195 | verbose=self.verbose, 196 | ) 197 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /maven/datasets/general_election/README.md: -------------------------------------------------------------------------------- 1 | # General Election datasets 2 | 3 | If you have any questions about these datasets please [contact me @John_Sandall](https://twitter.com/John_Sandall) on Twitter. 4 | 5 | 6 | ## Sources 7 | We aim to source our data directly from the most authoritative data provider, falling back to less authoritative sources where a primary source isn't available. By country: 8 | - **United Kingdom:** [House of Commons Library](https://researchbriefings.parliament.uk/ResearchBriefing/Summary/CBP-8647). 9 | 10 | 11 | ## Data dictionaries 12 | 13 | #### **`general-election/UK/2015/model`** 14 | | Column | Type | Description | Example | 15 | | -- | -- | -- | -- | 16 | | Constituency-level factors | 17 | | `ons_id` | str | ONS constituency identifier | `E14000530` | 18 | | `constituency` | str | Constituency name | `ALDERSHOT` | 19 | | `county` | str | County:{`Avon`, `Bedfordshire`, and 44 more} | `Hampshire` | 20 | | `region` | str | Region:{`East Midlands`, `Eastern`, `London`, `North East`, `North West`, `Northern Ireland`, `Scotland`, `South East`, `South West`, `Wales`, `West Midlands`, `Yorkshire and The Humber`} | `South East` | 21 | | `geo` | str | Geographic region (aggregated level between region and country) | `england_not_london` | 22 | | `country`| str | Country:{`England`, `Northern Ireland`, `Scotland`, `Wales`} | `England` | 23 | | 2010 election data | 24 | | `electorate` | int | Electorate | `72430` | 25 | | `total_votes_last` | int | Total valid votes counted in this constituency in the 2010 election | `45384` | 26 | | `turnout_last` | float | Turnout in this constituency in the 2010 election | `0.635052123` | 27 | | `party` | str | Party:{`apni`, `con`, `dup`, `grn`, `lab`, `ld`, `other`, `pc`, `sdlp`, `sf`, `snp`, `ukip`, `uup`} | `con` | 28 | | `votes_last` | int | Votes counted for this party in this constituency in 2010 | `21203` | 29 | | `voteshare_last` | float | Percentage voteshare for this party in this constituency in 2010 | `0.467191` | 30 | | `winner_last` | str | Party that won in this constituency in 2010 | `con` | 31 | | `won_here_last` | bool | Did this party win in this constituency in 2010 | `True` | 32 | | `national_voteshare_last` | float | Percentage of national voteshare for this party from 2010 results | `0.360542872` | 33 | | 2015 pre-election data | 34 | | `national_polls_now` | float | Percentage of national voteshare for this party from 2015 pre-election polling | `0.338181818` | 35 | | `national_swing` | float | Uplift in national voteshare for this party between 2010 results and 2015 polling | `-0.062020512` | 36 | | `national_swing_forecast` | float | Projected voteshare for this party in this constituency using a UNS model | `0.438215651` | 37 | | `national_swing_winner` | str | Projected winner in this constituency using `national_swing_forecast` | `con` | 38 | | 2015 post-election data | 39 | | `total_votes_now` | int | Total valid votes counted in this constituency in the 2015 election | `46191` | 40 | | `turnout_now` | float | Turnout in this constituency in 2015 election | `0.637732984` | 41 | | `votes_now` | int | Total votes counted for this party in this constituency in 2015 | `23369` | 42 | | `voteshare_now` | float | Percentage voteshare for this party in this constituency in 2015 | `0.505921067` | 43 | | `winner_now` | str | Party that won in
this constituency in 2015 | `con` | 44 | 45 | 46 | #### **`general-election/UK/2017/model`** 47 | | Column | Type | Description | Example | 48 | | -- | -- | -- | -- | 49 | | Constituency-level factors | 50 | | `ons_id` | int | ONS constituency identifier | `E14000530` | 51 | | `constituency` | str | Constituency name | `ALDERSHOT` | 52 | | `county` | str | County:{`Avon`, `Bedfordshire`, and 44 more} | `Hampshire` | 53 | | `region` | str | Region:{`East Midlands`, `Eastern`, `London`, `North East`, `North West`, `Northern Ireland`, `Scotland`, `South East`, `South West`, `Wales`, `West Midlands`, `Yorkshire and The Humber`} | `South East` | 54 | | `geo` | str | Geographic region (aggregated level between region and country) | `england_not_london` | 55 | | `country`| str | Country:{`England`, `Northern Ireland`, `Scotland`, `Wales`} | `England` | 56 | | 2015 election data | 57 | | `electorate` | int | Electorate | `76205` | 58 | | `total_votes_last` | int | Total valid votes counted in this constituency in the 2015 election | `46191` | 59 | | `turnout_last` | float | Turnout in this constituency in the 2015 election | `0.637732984` | 60 | | `party` | str | Party:{`apni`, `con`, `dup`, `grn`, `lab`, `ld`, `other`, `pc`, `sdlp`, `sf`, `snp`, `ukip`, `uup`} | `con` | 61 | | `votes_last` | int | Votes counted for this party in this constituency in 2015 | `23369` | 62 | | `voteshare_last` | float | Percentage voteshare for this party in this constituency in 2015 | `0.505921067` | 63 | | `winner_last` | str | Party that won in this constituency in 2015 | `con` | 64 | | `won_here_last` | bool | Did this party win in this constituency in 2015 | `True` | 65 | | `national_voteshare_last` | float | Percentage of national voteshare for this party from 2015 results | `0.368095115` | 66 | | 2017 pre-election data | 67 | | `national_polls_now` | float | Percentage of national voteshare for this party from 2017 pre-election polling | `0.42729587` | 68 | | `national_swing` | float | Uplift in national voteshare for this party between 2015 results and 2017 polling | `0.160830048` | 69 | | `national_swing_forecast` | str | Projected voteshare for this party in this constituency using a UNS model | `0.587288376` | 70 | | `national_swing_winner` | str | Projected winner in this constituency using `national_swing_forecast` | `con` | 71 | | 2015/2017 regional data | 72 | | `geo_polls_now` | float | Percentage of regional voteshare for this party from 2017 pre-election polling | `0.470077263` | 73 | | `geo_voteshare_last` | float | Percentage of regional voteshare for this party from 2015 results | `0.418216805` | 74 | | `geo_swing` | float | Uplift in regional voteshare for this party between 2015 results and 2017 polling | `0.124003764` | 75 | | `geo_swing_forecast` | float | Projected voteshare for this party in this constituency using a regional UNS model | `0.568657183` | 76 | | `geo_swing_winner` | str | Projected winner in this constituency using `geo_swing_forecast` | `con` | 77 | | 2017 post-election data | 78 | | `total_votes_now` | int | Total valid votes counted in this constituency in the 2017 election | `48950` | 79 | | `turnout_now` | float | Turnout in this constituency in 2017 election | `0.642346303` | 80 | | `votes_now` | int | Total votes counted for this party in this constituency in 2017 | `26950` | 81 | | `voteshare_now` | float | Percentage voteshare for this party in this constituency in 2017 | `0.550561798` | 82 | | `winner_now` | str | Party that won in this constituency in 2017 | `con` 
| 83 | 84 | 85 | #### **`general-election/UK/2019/model`** 86 | | Column | Type | Description | Example | 87 | | -- | -- | -- | -- | 88 | | Constituency-level factors | 89 | | `ons_id` | int | ONS constituency identifier | `E14000530` | 90 | | `constituency` | str | Constituency name | `ALDERSHOT` | 91 | | `county` | str | County:{`Avon`, `Bedfordshire`, and 44 more} | `Hampshire` | 92 | | `region` | str | Region:{`East Midlands`, `Eastern`, `London`, `North East`, `North West`, `Northern Ireland`, `Scotland`, `South East`, `South West`, `Wales`, `West Midlands`, `Yorkshire and The Humber`} | `South East` | 93 | | `geo` | str | Geographic region (aggregated level between region and country) | `england_not_london` | 94 | | `country`| str | Country:{`England`, `Northern Ireland`, `Scotland`, `Wales`} | `England` | 95 | | 2017 election data | 96 | | `electorate` | int | Electorate | `76205` | 97 | | `total_votes_last` | int | Total valid votes counted in this constituency in the 2017 election | `48950` | 98 | | `turnout_last` | float | Turnout in this constituency in the 2017 election | `0.642346303` | 99 | | `party` | str | Party:{`apni`, `con`, `dup`, `grn`, `lab`, `ld`, `other`, `pc`, `sdlp`, `sf`, `snp`, `ukip`, `uup`} | `con` | 100 | | `votes_last` | int | Votes counted for this party in this constituency in 2017 | `26950` | 101 | | `voteshare_last` | float | Percentage voteshare for this party in this constituency in 2017 | `0.550561798` | 102 | | `winner_last` | str | Party that won in this constituency in 2017 | `con` | 103 | | `won_here_last` | bool | Did this party win in this constituency in 2017 | `True` | 104 | | `national_voteshare_last` | float | Percentage of national voteshare for this party from 2017 results | `0.423444482` | 105 | | 2019 pre-election data | 106 | | `national_polls_now` | float | Percentage of national voteshare for this party from 2019 pre-election polling | `0.396538462` | 107 | | `national_swing` | float | Uplift in national voteshare for this party between 2017 results and 2019 polling | `-0.063540845` | 108 | | `national_swing_forecast` | str | Projected voteshare for this party in this constituency using a UNS model | `0.515578636` | 109 | | `national_swing_winner` | str | Projected winner in this constituency using `national_swing_forecast` | `con` | 110 | | 2017/2019 regional data | 111 | | `geo_polls_now` | float | Percentage of regional voteshare for this party from 2019 pre-election polling | `0.429089129` | 112 | | `geo_voteshare_last` | float | Percentage of regional voteshare for this party from 2017 results | `0.474642379` | 113 | | `geo_swing` | float | Uplift in regional voteshare for this party between 2017 results and 2019 polling | `-0.095973837` | 114 | | `geo_swing_forecast` | float | Projected voteshare for this party in this constituency using a regional UNS model | `0.497722269` | 115 | | `geo_swing_winner` | str | Projected winner in this constituency using `geo_swing_forecast` | `con` | 116 | 117 | 118 | #### **`general-election/UK/2010/results`** 119 | | Column | Type | Description | Example | 120 | | -- | -- | -- | -- | 121 | | `ons_id` | str | Standardised constituency identifier | `E14000530` | 122 | | `constituency` | str | Constituency name | `ALDERSHOT` | 123 | | `county` | str | County name | `Hampshire` | 124 | | `region` | str | Region:{`East Midlands`, `Eastern`, `London`, `North East`, `North West`, `Northern Ireland`, `Scotland`, `South East`, `South West`, `Wales`, `West Midlands`, `Yorkshire and The Humber`} | `South 
East` | 125 | | `country` | str | Country:{`England`, `Northern Ireland`, `Scotland`, `Wales`} | `England` | 126 | | `electorate` | int | Electorate | `71465` | 127 | | `total_votes` | int | Total valid votes counted in this constituency | `45384` | 128 | | `turnout` | float | Turnout in this constituency | `0.635052123` | 129 | | `party` | str | Name of political party (lower-cased & abbreviated) | `con` | 130 | | `votes` | int | Votes for this party | `21203` | 131 | | `voteshare` | float | Vote share for this party within the constituency | `0.467191081` | 132 | 133 | 134 | #### **`general-election/UK/2015/results`** 135 | | Column | Type | Description | Example | 136 | | -- | -- | -- | -- | 137 | | `ons_id` | str | Standardised constituency identifier | `E14000530` | 138 | | `constituency` | str | Constituency name | `ALDERSHOT` | 139 | | `county` | str | County name | `Hampshire` | 140 | | `region` | str | Region:{`East Midlands`, `Eastern`, `London`, `North East`, `North West`, `Northern Ireland`, `Scotland`, `South East`, `South West`, `Wales`, `West Midlands`, `Yorkshire and The Humber`} | `South East` | 141 | | `country` | str | Country:{`England`, `Northern Ireland`, `Scotland`, `Wales`} | `England` | 142 | | `electorate` | int | Electorate | `72430` | 143 | | `total_votes` | int | Total valid votes counted in this constituency | `46191` | 144 | | `turnout` | float | Turnout in this constituency | `0.637732984` | 145 | | `party` | str | Name of political party (lower-cased & abbreviated) | `con` | 146 | | `votes` | int | Votes for this party | `23369` | 147 | | `voteshare` | float | Vote share for this party within the constituency | `0.505921067` | 148 | 149 | 150 | #### **`general-election/UK/2017/results`** 151 | | Column | Type | Description | Example | 152 | | -- | -- | -- | -- | 153 | | `ons_id` | str | Standardised constituency identifier | `E14000530` | 154 | | `constituency` | str | Constituency name | `ALDERSHOT` | 155 | | `county` | str | County name | `Hampshire` | 156 | | `region` | str | Region:{`East Midlands`, `Eastern`, `London`, `North East`, `North West`, `Northern Ireland`, `Scotland`, `South East`, `South West`, `Wales`, `West Midlands`, `Yorkshire and The Humber`} | `South East` | 157 | | `country` | str | Country:{`England`, `Northern Ireland`, `Scotland`, `Wales`} | `England` | 158 | | `electorate` | int | Electorate | `76205` | 159 | | `total_votes` | int | Total valid votes counted in this constituency | `48950` | 160 | | `turnout` | float | Turnout in this constituency | `0.642346303` | 161 | | `party` | str | Name of political party (lower-cased & abbreviated) | `con` | 162 | | `votes` | int | Votes for this party | `26950` | 163 | | `voteshare` | float | Vote share for this party within the constituency | `0.550561798` | 164 | 165 | 166 | #### **`general-election/UK/polls`** 167 | | Column | Type | Description | Example | 168 | | -- | -- | -- | -- | 169 | | `company` | str | Name of company conducting opinion poll | `Ipsos MORI Phone` | 170 | | `client` | str | Name of client/publisher commissioning the poll | `Evening Standard` | 171 | | `method` | str | Methodology: {`Online`, `Phone`, `Mobile`} | `Phone` | 172 | | `from` | date | Date fieldwork started | `2017-06-06` | 173 | | `to` | date | Date fieldwork completed | `2017-06-07` | 174 | | `sample_size` | int | Sample size of poll | `1291` | 175 | | `con` | float | National percentage voteshare for the Conservative party | `0.44` | 176 | | `lab` | float | National percentage voteshare for the 
Labour party | `0.36` | 177 | | `ld` | float | National percentage voteshare for the Liberal Democrat party | `0.07` | 178 | | `ukip` | float | National percentage voteshare for UKIP | `0.04` | 179 | | `grn` | float | National percentage voteshare for the Green party | `0.02` | 180 | | `snp` | float | National percentage voteshare for the SNP | `0.05` | 181 | | `pdf` | str | Download URL of PDF tables containing raw data | `https://www.ipsos.com/sites/default/files/2017-06/pm-election-2017-final-tables.pdf` | 182 | -------------------------------------------------------------------------------- /maven/datasets/general_election/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base classes. 3 | """ 4 | import os 5 | from functools import partial 6 | from pathlib import Path 7 | 8 | import pandas as pd 9 | 10 | from maven import utils 11 | 12 | 13 | class Pipeline: 14 | """Generic class for retrieving & processing datasets with built-in caching & MD5 checking.""" 15 | 16 | def __init__(self, directory): 17 | self.directory = Path(directory) 18 | self.sources = [] # tuples of (url, filename, checksum) 19 | self.retrieve_all = False 20 | self.target = (None, None) 21 | self.verbose_name = "" 22 | self.year = None 23 | self.verbose = False 24 | self.cache = True 25 | 26 | def retrieve(self): 27 | """Retrieve data from self.sources into self.directory / 'raw' and validate against checksum.""" 28 | target_dir = self.directory / "raw" 29 | os.makedirs(target_dir, exist_ok=True) # create directory if it doesn't exist 30 | for url, filename, md5_checksum in self.sources: 31 | if utils.is_url(url): 32 | processing_fn = partial( 33 | utils.fetch_url, url=url, filename=filename, target_dir=target_dir 34 | ) 35 | else: 36 | processing_fn = partial( 37 | utils.get_and_copy, identifier=url, filename=filename, target_dir=target_dir 38 | ) 39 | utils.retrieve_from_cache_if_exists( 40 | filename=filename, 41 | target_dir=target_dir, 42 | processing_fn=processing_fn, 43 | md5_checksum=md5_checksum, 44 | caching_enabled=self.cache, 45 | verbose=self.verbose, 46 | ) 47 | if not self.retrieve_all: # retrieve just the first dataset 48 | return 49 | if self.retrieve_all: # all datasets retrieved 50 | return 51 | else: # retrieving first dataset only but all fallbacks failed 52 | raise RuntimeError(f"Unable to download {self.verbose_name} data.") 53 | 54 | def process(self): 55 | pass 56 | 57 | 58 | class UKResults(Pipeline): 59 | """Handles results data for UK General Elections.""" 60 | 61 | @staticmethod 62 | def process_hoc_sheet(input_file, data_dir, sheet_name): 63 | # Import general election results 64 | print(f"Read and clean {input_file}") 65 | parties = [ 66 | "Con", 67 | "LD", 68 | "Lab", 69 | "UKIP", 70 | "Grn", 71 | "SNP", 72 | "PC", 73 | "DUP", 74 | "SF", 75 | "SDLP", 76 | "UUP", 77 | "APNI", 78 | "Other", 79 | ] 80 | results = pd.read_excel( 81 | data_dir / "raw" / input_file, 82 | sheet_name=sheet_name, 83 | skiprows=4, 84 | header=None, 85 | skipfooter=19, 86 | ) 87 | assert results.shape == (650, 49) 88 | 89 | # Specify columns (spread across multiple rows in Excel) 90 | cols = ["", "id", "Constituency", "County", "Country/Region", "Country", "Electorate", ""] 91 | for party in parties: 92 | cols += [f"{party}_Votes", f"{party}_Voteshare", ""] 93 | cols += ["Total votes", "Turnout"] 94 | results.columns = cols 95 | 96 | # Some basic data quality checks 97 | for party in parties: 98 | assert ( 99 | results[f"{party}_Voteshare"] - 
results[f"{party}_Votes"] / results["Total votes"] 100 | ).sum() == 0 101 | assert ( 102 | results[[f"{party}_Votes" for party in parties]].fillna(0.0).sum(axis=1) 103 | == results["Total votes"] 104 | ).all() 105 | assert ((results["Total votes"] / results["Electorate"]) == results["Turnout"]).all() 106 | 107 | # Drop blank columns plus those that can be calculated 108 | cols_to_drop = [""] + [c for c in cols if "Voteshare" in c] + ["Total votes", "Turnout"] 109 | results = results.drop(columns=cols_to_drop) 110 | 111 | # Sanitise column names 112 | results.columns = utils.sanitise(results.columns) 113 | results = results.rename(columns={"id": "ons_id", "country_region": "region"}) 114 | results.columns = [c.replace("_votes", "") for c in results.columns] 115 | 116 | # Reshape to long 117 | results_long = pd.melt( 118 | results, 119 | id_vars=["ons_id", "constituency", "county", "region", "country", "electorate"], 120 | var_name="party", 121 | value_name="votes", 122 | ) 123 | assert results.shape == (650, 19) 124 | assert results_long.shape == (650 * len(parties), 19 - len(parties) + 2) 125 | 126 | # Sort by (ons_id, party) 127 | results_long["party"] = pd.Categorical( 128 | results_long.party, categories=pd.Series(parties).apply(utils.sanitise), ordered=True 129 | ) 130 | results_long = results_long.sort_values(["ons_id", "party"]).reset_index(drop=True) 131 | 132 | # Re-add total_votes & voteshare 133 | results_long["total_votes"] = results_long.ons_id.map( 134 | results_long.groupby("ons_id").votes.sum().astype(int) 135 | ) 136 | results_long["voteshare"] = results_long["votes"] / results_long["total_votes"] 137 | results_long["turnout"] = results_long["total_votes"] / results_long["electorate"] 138 | 139 | # Reorder cols for export 140 | results_long = results_long[ 141 | [ 142 | "ons_id", 143 | "constituency", 144 | "county", 145 | "region", 146 | "country", 147 | "electorate", 148 | "total_votes", 149 | "turnout", 150 | "party", 151 | "votes", 152 | "voteshare", 153 | ] 154 | ].copy() 155 | 156 | return results_long 157 | 158 | def process(self): 159 | """Process results data for a UK General Election.""" 160 | filename = self.sources[0][1] 161 | processed_results_location = self.directory / "processed" / self.target[0] 162 | os.makedirs( 163 | self.directory / "processed", exist_ok=True 164 | ) # create directory if it doesn't exist 165 | 166 | def process_and_export(): 167 | # Either caching disabled or file not yet processed; process regardless. 168 | results = self.process_hoc_sheet( 169 | input_file=filename, data_dir=self.directory, sheet_name=str(self.year) 170 | ) 171 | # Export 172 | print(f"Exporting dataset to {processed_results_location.resolve()}") 173 | results.to_csv(processed_results_location, index=False) 174 | 175 | utils.retrieve_from_cache_if_exists( 176 | filename=self.target[0], 177 | target_dir=(self.directory / "processed"), 178 | processing_fn=process_and_export, 179 | md5_checksum=self.target[1], 180 | caching_enabled=self.cache, 181 | verbose=self.verbose, 182 | ) 183 | 184 | 185 | class UKModel(Pipeline): 186 | """Generates model-ready data for UK General Elections.""" 187 | 188 | # geos sit between region and country (e.g. 
"england_not_london") and map to things we can extract from polls 189 | geos = ["uk", "scotland", "wales", "ni", "london"] 190 | geo_lookup = { 191 | "Northern Ireland": "ni", 192 | "Scotland": "scotland", 193 | "Wales": "wales", 194 | "London": "london", 195 | "South East": "england_not_london", 196 | "West Midlands": "england_not_london", 197 | "North West": "england_not_london", 198 | "East Midlands": "england_not_london", 199 | "Yorkshire and The Humber": "england_not_london", 200 | "Eastern": "england_not_london", 201 | "South West": "england_not_london", 202 | "North East": "england_not_london", 203 | } 204 | 205 | results_seat_count = { 206 | 2010: { 207 | "con": 306, 208 | "lab": 258, 209 | "ld": 57, 210 | "dup": 8, 211 | "snp": 6, 212 | "sf": 5, 213 | "pc": 3, 214 | "sdlp": 3, 215 | "grn": 1, 216 | "apni": 1, 217 | "other": 2, # {'speaker': 'John Bercow', 'independent': 'Sylvia Hermon'} 218 | }, 219 | 2015: { 220 | "con": 330, 221 | "lab": 232, 222 | "snp": 56, 223 | "ld": 8, 224 | "dup": 8, 225 | "sf": 4, 226 | "pc": 3, 227 | "sdlp": 3, 228 | "uup": 2, 229 | "ukip": 1, 230 | "grn": 1, 231 | "other": 2, # {'speaker': 'John Bercow', 'independent': 'Sylvia Hermon'} 232 | }, 233 | 2017: { 234 | "con": 317, 235 | "lab": 262, 236 | "snp": 35, 237 | "ld": 12, 238 | "dup": 10, 239 | "sf": 7, 240 | "pc": 4, 241 | "grn": 1, 242 | "other": 2, # {'speaker': 'John Bercow', 'independent': 'Sylvia Hermon'} 243 | }, 244 | } 245 | 246 | winner_fixes = { 247 | 2010: [ 248 | # https://en.wikipedia.org/wiki/Fermanagh_and_South_Tyrone_(UK_Parliament_constituency) 249 | ( 250 | "N06000007", 251 | "sf", 252 | ), # SF = 21,304, Independent Unionist (with DUP support) = 21,300, Independent = 188 253 | ] 254 | } 255 | 256 | # Define these to make them available as expected attributes. 257 | last_date = None 258 | now_date = None 259 | last = None 260 | now = None 261 | prediction_only = False 262 | 263 | def load_results_data(self): 264 | """Load UK General Election results for consecutive elections with one row / party / constituency and add: 265 | - `geo`: geo this constituency is in (e.g. `scotland`, `england_not_london`) 266 | - `winner`: winner per constituency (derived from data, with corrections to match reported results) 267 | - `won_here`: did this party win this seat? 268 | 269 | Args: 270 | last (int): Year of prior election (used to provide historical trend information). 271 | now (int): Year of election to be modelled. 272 | 273 | Returns: dict containing key-value pairs of (year, pd.DataFrame) of results. 
274 | """ 275 | # Define these for code readability 276 | last = self.last 277 | now = self.now 278 | 279 | # Import general election results 280 | results = {} 281 | results[last] = pd.read_csv( 282 | self.directory / "raw" / f"general_election-uk-{last}-results.csv" 283 | ) 284 | try: 285 | results[now] = pd.read_csv( 286 | self.directory / "raw" / f"general_election-uk-{now}-results.csv" 287 | ) 288 | except FileNotFoundError: 289 | self.prediction_only = True 290 | 291 | # Add geos 292 | results[last]["geo"] = results[last].region.map(self.geo_lookup) 293 | if not self.prediction_only: 294 | results[now]["geo"] = results[now].region.map(self.geo_lookup) 295 | 296 | # Check constituencies are mergeable 297 | assert ( 298 | results[last].sort_values("ons_id").ons_id 299 | == results[now].sort_values("ons_id").ons_id 300 | ).all() 301 | 302 | # Add the winner for the results 303 | if self.prediction_only: 304 | years = [last] 305 | else: 306 | years = [last, now] 307 | for year in years: 308 | res = results[year].copy() 309 | winners = self.calculate_winners(res, "voteshare") 310 | res["winner"] = res.ons_id.map(winners) 311 | 312 | # Apply fixes 313 | if year in self.winner_fixes: 314 | for ons_id, actual_winner in self.winner_fixes[year]: 315 | res.loc[res.ons_id == ons_id, "winner"] = actual_winner 316 | 317 | # Check this matches the results on record 318 | seat_count = res[["ons_id", "winner"]].drop_duplicates().groupby("winner").size() 319 | assert dict(seat_count) == self.results_seat_count[year] 320 | 321 | # Add boolean per row for if this party won this seat 322 | res["won_here"] = res.party == res.winner 323 | 324 | # Remove UKIP to deal with Brexit Party voteshare matching problems 325 | # TODO: This is not a great solution, need a better way to map in BXP for modelling 2019. 326 | res_list = [] 327 | for constituency in res.ons_id.unique(): 328 | res_con = res[res.ons_id == constituency].copy() 329 | for metric in ["votes", "voteshare"]: 330 | res_con.loc[res_con.party == "other", metric] = ( 331 | res_con.loc[res_con.party == "other", metric].sum() 332 | + res_con.loc[res_con.party == "ukip", metric].sum() 333 | ) 334 | res_list.append(res_con.query('party != "ukip"').copy()) 335 | res = pd.concat(res_list, axis=0) 336 | 337 | results[year] = res.copy() 338 | 339 | return results 340 | 341 | def load_polling_data(self): 342 | """Load polling data for UK General Elections.""" 343 | polls = {} 344 | for geo in self.geos: 345 | poll_df = pd.read_csv( 346 | self.directory / "raw" / f"general_election-{geo}-polls.csv", parse_dates=["to"] 347 | ).sort_values("to") 348 | poll_df.columns = utils.sanitise( 349 | poll_df.columns, 350 | replace={"ulster_unionist_party": "uup", "sinn_fein": "sf", "alliance": "apni"}, 351 | ) 352 | polls[geo] = poll_df 353 | 354 | return polls 355 | 356 | @staticmethod 357 | def calculate_poll_of_polls(polls, from_date, to_date): 358 | return polls[(polls.to >= from_date) & (polls.to < to_date)].groupby("company").tail(1) 359 | 360 | def get_regional_and_national_poll_of_polls(self, polls): 361 | """Takes straight average across each pollster's final poll in last week prior to election day. 362 | Repeat for regions, if regional polling is available. 
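        When regional polling is available, each geo is combined using a sample-size-weighted
        average (an MRP is treated as a single poll with the largest non-MRP sample size); for
        example, final polls of 1,000 and 3,000 respondents carry weights 0.25 and 0.75. When
        regional polling is missing, only an unweighted UK-level poll-of-polls is produced.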
363 | """ 364 | election_day = self.now_date 365 | one_week_before = election_day - pd.Timedelta(days=7) 366 | one_month_before = election_day - pd.Timedelta(days=30) 367 | 368 | # Use single last poll from each pollster in final week of polling then average out 369 | final_polls = {} 370 | for geo in self.geos: 371 | period_before = one_week_before if geo == "uk" else one_month_before 372 | final_polls[geo] = self.calculate_poll_of_polls( 373 | polls=polls[geo], from_date=period_before, to_date=election_day 374 | ) 375 | # Consider MRPs equivalent to a large poll 376 | final_polls[geo].loc[final_polls[geo].method == "MRP", "sample_size"] = ( 377 | final_polls[geo].query('method != "MRP"').sample_size.max() 378 | ) 379 | # Handle missing sample sizes 380 | mean_sample_size = final_polls[geo].query('method != "MRP"').sample_size.mean() 381 | if pd.isnull(mean_sample_size): 382 | mean_sample_size = 1 383 | final_polls[geo]["sample_size"] = final_polls[geo].sample_size.fillna(mean_sample_size) 384 | 385 | # Calculate regional polling 386 | regional_polling_missing = any(final_polls[geo].empty for geo in self.geos) 387 | 388 | # Regional polling is missing, just calculate UK-level polling only. 389 | if regional_polling_missing: 390 | # TODO: Check how this affects 2015/2017 models 391 | parties = ["con", "lab", "ld", "ukip", "grn", "chuk", "bxp", "snp"] 392 | # Create new polls dictionary by geo containing simple average across all pollsters 393 | national_polling = final_polls["uk"].mean().loc[parties] 394 | # We don't yet have regional polling in 2015 for Scotland, Wales, NI, London - add as other. 395 | national_polling["other"] = 1 - national_polling.sum() 396 | poll_of_polls = {"uk": national_polling} 397 | # Turn into dataframe 398 | polls_df_list = [] 399 | for geo in poll_of_polls: 400 | polls_df_list.append( 401 | pd.DataFrame( 402 | { 403 | "geo": geo, 404 | "party": poll_of_polls[geo].index, 405 | "voteshare": poll_of_polls[geo], 406 | } 407 | ).reset_index(drop=True) 408 | ) 409 | polls_df = pd.concat(polls_df_list, axis=0) 410 | 411 | # We have polling for all regions. 
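        # In the regional branch below, each geo's recent polls (final week for the UK, final
        # month for the regions) are averaged with weights proportional to sample_size;
        # England-excluding-London is then backed out by subtracting the weighted Scotland,
        # Wales, NI and London estimates from the UK-wide estimate using Survation's regional
        # sample sizes, Plaid Cymru's UK share is rescaled from the Wales poll, and each geo is
        # topped up with "other" and normalised to sum to 1.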
412 | else: 413 | parties = { 414 | # TODO: Add ["chuk", "bxp", "ukip"] to uk, scotland, wales, london 415 | "uk": ["con", "lab", "ld", "grn", "snp"], 416 | "scotland": ["con", "lab", "ld", "snp", "grn"], 417 | "wales": ["con", "lab", "ld", "pc", "grn"], 418 | "ni": ["dup", "uup", "sf", "sdlp", "apni", "grn", "con"], 419 | "london": ["con", "lab", "ld", "grn"], 420 | "england_not_london": ["con", "lab", "ld", "grn"], 421 | } 422 | all_parties = set(x for y in parties.values() for x in y) 423 | poll_of_polls = {} 424 | for geo in self.geos: 425 | sample_size_weights = ( 426 | final_polls[geo].sample_size / final_polls[geo].sample_size.sum() 427 | ) 428 | weighted_poll_of_polls = ( 429 | final_polls[geo][parties[geo]] 430 | .multiply(sample_size_weights, axis=0) 431 | .sum() 432 | .reindex(all_parties, fill_value=0.0) 433 | ) 434 | poll_of_polls[geo] = weighted_poll_of_polls 435 | 436 | # Estimate polling for England excluding London 437 | # survation_wts from http://survation.com/wp-content/uploads/2017/06/Final-MoS-Post-BBC-Event-Poll-020617SWCH-1c0d4h9.pdf 438 | survation_wts = pd.Series({"scotland": 85, "england": 881, "wales": 67, "ni": 16}) 439 | survation_wts["uk"] = survation_wts.sum() 440 | survation_wts["london"] = 137 441 | survation_wts["england_not_london"] = survation_wts.england - survation_wts.london 442 | 443 | england_not_london = poll_of_polls["uk"] * survation_wts["uk"] 444 | for geo in ["scotland", "wales", "ni", "london"]: 445 | england_not_london = england_not_london.sub( 446 | poll_of_polls[geo] * survation_wts[geo], fill_value=0.0 447 | ) 448 | england_not_london /= survation_wts["england_not_london"] 449 | england_not_london.loc[["pc", "snp"]] = 0.0 450 | poll_of_polls["england_not_london"] = england_not_london 451 | 452 | # Fix PC (Plaid Cymru) for UK 453 | poll_of_polls["uk"]["pc"] = ( 454 | poll_of_polls["wales"]["pc"] * survation_wts["wales"] / survation_wts["uk"] 455 | ) 456 | 457 | # Add Other & normalise 458 | for geo in self.geos + ["england_not_london"]: 459 | poll_of_polls[geo]["other"] = max( 460 | 1 - poll_of_polls[geo].sum(), 0 461 | ) # weighted means can sum > 1 462 | poll_of_polls[geo] = poll_of_polls[geo] / poll_of_polls[geo].sum() 463 | 464 | # Export 465 | polls_df_list = [] 466 | for geo in poll_of_polls: 467 | polls_df_list.append( 468 | pd.DataFrame( 469 | { 470 | "geo": geo, 471 | "party": poll_of_polls[geo].index, 472 | "voteshare": poll_of_polls[geo], 473 | } 474 | ).reset_index(drop=True) 475 | ) 476 | polls_df = pd.concat(polls_df_list, axis=0) 477 | 478 | return polls_df 479 | 480 | @staticmethod 481 | def combine_results_and_polls(results, polls): 482 | """Merge national polling, and geo-level polling if available, into results dataframe.""" 483 | # Merge into previous election's results to calculate swing 484 | results = ( 485 | results.merge( 486 | right=polls.query('geo == "uk"')[["party", "voteshare"]].rename( 487 | columns={"voteshare": "national_polls"} 488 | ), 489 | on="party", 490 | how="outer", 491 | ) 492 | .sort_values(["ons_id", "party"]) 493 | .reset_index(drop=True) 494 | ) 495 | # If we have geo-polls, add those too 496 | if set(polls.geo.unique()) != {"uk"}: 497 | results = ( 498 | results.merge( 499 | right=polls.query('geo != "uk"')[["geo", "party", "voteshare"]].rename( 500 | columns={"voteshare": "geo_polls"} 501 | ), 502 | on=["geo", "party"], 503 | how="outer", 504 | ) 505 | .sort_values(["ons_id", "party"]) 506 | .reset_index(drop=True) 507 | ) 508 | 509 | return results 510 | 511 | @staticmethod 512 | def 
calculate_winners(df, voteshare_col): 513 | """Assumes df has `ons_id` and `party` columns.""" 514 | return ( 515 | df.sort_values(voteshare_col, ascending=False) 516 | .groupby("ons_id") 517 | .head(1)[["ons_id", "party"]] 518 | .set_index("ons_id") 519 | .party 520 | ) 521 | 522 | def calculate_national_swing(self, results): 523 | """Uses previous election results plus current polling to calculate: 524 | - `national_voteshare`: per party 525 | - `national_swing`: from previous voteshare to current polling 526 | - `national_swing_forecast`: forecasted voteshare per party per seat 527 | - `national_swing_winner`: per seat 528 | 529 | Returns: updated results dataframe with new columns. 530 | """ 531 | # Calculate national voteshare 532 | national_voteshare_by_party = results.groupby("party").votes.sum() / results.votes.sum() 533 | results["national_voteshare"] = results.party.map(national_voteshare_by_party) 534 | 535 | # Calculate swing between last election results and latest poll-of-polls 536 | results["national_swing"] = (results.national_polls / results.national_voteshare) - 1 537 | 538 | # Forecast is previous result multiplied by swing uplift 539 | results["national_swing_forecast"] = results.voteshare * (1 + results.national_swing) 540 | 541 | # Predict the winner in each constituency using national_swing_forecast 542 | # Note: these are pointless for NI as polls/swings are all aggregated under "other" but results 543 | # are given per major party. 544 | national_swing_winners = self.calculate_winners(results, "national_swing_forecast") 545 | results["national_swing_winner"] = results.ons_id.map(national_swing_winners) 546 | 547 | return results 548 | 549 | def calculate_geo_swing(self, results): 550 | """Calculate geo-Level voteshare + swing inc. all parties. Adds: 551 | - `geo_voteshare`: geo-level voteshare (per party). 552 | - `geo_swing`: swing from previous geo_voteshare to current geo-polling. 553 | - `geo_swing_forecast`: geo-swing based forecast per party per seat. 554 | - `geo_swing_winner`: per seat. 555 | 556 | Returns: updated results dataframe with new columns. 
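        For example (matching the general-election/UK/2017/model data dictionary): a party with a
        geo_voteshare of 0.4182 in 2015 that polls 0.4701 in that geo ahead of 2017 has
        geo_swing = 0.4701 / 0.4182 - 1 ≈ 0.124, so a seat where it previously took 0.5059 of the
        vote gets geo_swing_forecast ≈ 0.5059 * 1.124 ≈ 0.5687.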
557 | """ 558 | 559 | # Calculate geo-level voteshare 560 | votes_by_geo = results.groupby("geo").votes.sum().reset_index() 561 | votes_by_geo_by_party = ( 562 | results.groupby(["geo", "party"]) 563 | .votes.sum() 564 | .reset_index() 565 | .merge(votes_by_geo, on="geo", how="left", suffixes=("", "_geo")) 566 | ) 567 | votes_by_geo_by_party["geo_voteshare"] = ( 568 | votes_by_geo_by_party.votes / votes_by_geo_by_party.votes_geo 569 | ) 570 | results = results.merge( 571 | votes_by_geo_by_party[["geo", "party", "geo_voteshare"]], 572 | on=["geo", "party"], 573 | how="left", 574 | ) 575 | 576 | # Calculate geo-swing between last election results and latest geo-polls 577 | results["geo_swing"] = (results.geo_polls / results.geo_voteshare) - 1 578 | 579 | # Forecast is previous result multiplied by swing uplift 580 | results["geo_swing_forecast"] = results.voteshare * (1 + results.geo_swing) 581 | 582 | # Predict the winner in each constituency using geo_swing_forecast 583 | geo_swing_winners = self.calculate_winners(results, "geo_swing_forecast") 584 | results["geo_swing_winner"] = results.ons_id.map(geo_swing_winners) 585 | 586 | return results 587 | 588 | def export_model_ready_dataframe(self, results_dict): 589 | """Create ML-ready dataframe and export.""" 590 | 591 | # Cols to select from results dfs 592 | if self.prediction_only: 593 | df_cols_last = [ 594 | "ons_id", 595 | "constituency", 596 | "county", 597 | "region", 598 | "geo", 599 | "country", 600 | "electorate", 601 | "total_votes", 602 | "turnout", 603 | "party", 604 | "votes", 605 | "voteshare", 606 | "national_polls", 607 | "national_voteshare", 608 | "national_swing", 609 | "national_swing_forecast", 610 | "national_swing_winner", 611 | "winner", 612 | "won_here", 613 | ] 614 | else: 615 | df_cols_now = [ 616 | "ons_id", 617 | "constituency", 618 | "county", 619 | "region", 620 | "geo", 621 | "country", 622 | "electorate", 623 | "total_votes", 624 | "turnout", 625 | "party", 626 | "votes", 627 | "voteshare", 628 | "winner", 629 | ] 630 | df_cols_last = [ 631 | "ons_id", 632 | "party", 633 | "total_votes", 634 | "turnout", 635 | "votes", 636 | "voteshare", 637 | "national_polls", 638 | "national_voteshare", 639 | "national_swing", 640 | "national_swing_forecast", 641 | "national_swing_winner", 642 | "winner", 643 | "won_here", 644 | ] 645 | 646 | # Add geo polling if available 647 | df_cols_final_geo = [] 648 | if "geo_polls" in results_dict[self.last].columns: 649 | df_cols_last += [ 650 | "geo_polls", 651 | "geo_voteshare", 652 | "geo_swing", 653 | "geo_swing_forecast", 654 | "geo_swing_winner", 655 | ] 656 | df_cols_final_geo += [ 657 | "geo_polls_now", 658 | "geo_voteshare_last", 659 | "geo_swing", 660 | "geo_swing_forecast", 661 | "geo_swing_winner", 662 | ] 663 | 664 | # Specify cols to use in exported df 665 | df_cols_final = [ 666 | # Constant per constituency 667 | "ons_id", 668 | "constituency", 669 | "county", 670 | "region", 671 | "geo", 672 | "country", 673 | "electorate", 674 | "total_votes_last", 675 | "turnout_last", 676 | # Constant per party (per constituency) 677 | "party", 678 | "votes_last", 679 | "voteshare_last", 680 | "winner_last", 681 | "won_here_last", 682 | "national_voteshare_last", 683 | "national_polls_now", 684 | "national_swing", 685 | "national_swing_forecast", 686 | "national_swing_winner", 687 | ] + df_cols_final_geo 688 | 689 | # If we have results data for now, let's add that 690 | if not self.prediction_only: 691 | df_cols_final += [ 692 | # Target 693 | "total_votes_now", 694 | 
"turnout_now", 695 | "votes_now", 696 | "voteshare_now", 697 | "winner_now", 698 | ] 699 | 700 | # Build dataframe for export 701 | if self.prediction_only: 702 | df = ( 703 | results_dict[self.last][df_cols_last] 704 | .rename( 705 | columns={ 706 | "total_votes": "total_votes_last", 707 | "turnout": "turnout_last", 708 | "votes": "votes_last", 709 | "voteshare": "voteshare_last", 710 | "national_polls": "national_polls_now", 711 | "geo_polls": "geo_polls_now", 712 | "national_voteshare": "national_voteshare_last", 713 | "geo_voteshare": "geo_voteshare_last", 714 | "winner": "winner_last", 715 | "won_here": "won_here_last", 716 | } 717 | ) 718 | .filter(df_cols_final) 719 | ) 720 | else: 721 | df_cols_final = ( 722 | [ 723 | # Constant per constituency 724 | "ons_id", 725 | "constituency", 726 | "county", 727 | "region", 728 | "geo", 729 | "country", 730 | "electorate", 731 | "total_votes_last", 732 | "turnout_last", 733 | # Constant per party (per constituency) 734 | "party", 735 | "votes_last", 736 | "voteshare_last", 737 | "winner_last", 738 | "won_here_last", 739 | "national_voteshare_last", 740 | "national_polls_now", 741 | "national_swing", 742 | "national_swing_forecast", 743 | "national_swing_winner", 744 | ] 745 | + df_cols_final_geo 746 | + [ 747 | # Target 748 | "total_votes_now", 749 | "turnout_now", 750 | "votes_now", 751 | "voteshare_now", 752 | "winner_now", 753 | ] 754 | ) 755 | df = ( 756 | results_dict[self.now][df_cols_now] 757 | .rename( 758 | columns={ 759 | "total_votes": "total_votes_now", 760 | "turnout": "turnout_now", 761 | "votes": "votes_now", 762 | "voteshare": "voteshare_now", 763 | "winner": "winner_now", 764 | } 765 | ) 766 | .merge( 767 | # Note: even though polling represents "now", they're in results[last] to calculate swings. 768 | results_dict[self.last][df_cols_last].rename( 769 | columns={ 770 | "total_votes": "total_votes_last", 771 | "turnout": "turnout_last", 772 | "votes": "votes_last", 773 | "voteshare": "voteshare_last", 774 | "national_polls": "national_polls_now", 775 | "geo_polls": "geo_polls_now", 776 | "national_voteshare": "national_voteshare_last", 777 | "geo_voteshare": "geo_voteshare_last", 778 | "winner": "winner_last", 779 | "won_here": "won_here_last", 780 | } 781 | ), 782 | on=["ons_id", "party"], 783 | how="inner", 784 | validate="1:1", 785 | ) 786 | .filter(df_cols_final) 787 | ) 788 | 789 | return df 790 | 791 | def process(self): 792 | """Process results data from consecutive UK General Elections (e.g. 2010 and 2015) into a single model-ready 793 | dataset ready for predicting the later (e.g. 2015) election.""" 794 | processed_directory = self.directory / "processed" 795 | os.makedirs(processed_directory, exist_ok=True) # create directory if it doesn't exist 796 | 797 | # Import general election results & polling data 798 | results_dict = self.load_results_data() 799 | polls_full = self.load_polling_data() 800 | 801 | # Calculate poll of polls 802 | polls = self.get_regional_and_national_poll_of_polls(polls=polls_full) 803 | 804 | # Merge polls into previous election results dataframe 805 | results_dict[self.last] = self.combine_results_and_polls( 806 | results=results_dict[self.last], polls=polls 807 | ) 808 | 809 | # Add into previous election results: national voteshare, national swing (vs current polling), 810 | # national swing forecast (per party per seat) and national swing forecast winner (per seat). 
811 | results_dict[self.last] = self.calculate_national_swing(results_dict[self.last]) 812 | 813 | # If we have geo-polling for previous election, also calculate a geo-level swing forecast. 814 | if "geo_polls" in results_dict[self.last].columns: 815 | results_dict[self.last] = self.calculate_geo_swing(results_dict[self.last]) 816 | 817 | # Create ML-ready dataframe and export 818 | model_df = self.export_model_ready_dataframe(results_dict=results_dict) 819 | 820 | print(f"Exporting {self.last}->{self.now} model dataset to {processed_directory.resolve()}") 821 | model_df.to_csv( 822 | processed_directory / f"general_election-uk-{self.now}-model.csv", index=False 823 | ) 824 | --------------------------------------------------------------------------------