├── .gitignore ├── .pylintrc ├── LICENSE.md ├── README.md ├── docs ├── .gitignore ├── Makefile ├── README.md ├── build_docs.sh ├── conf.py ├── human_source │ ├── coderef.rst │ └── manual.rst ├── index.rst └── source │ ├── modules.rst │ ├── views.apps.data.missing.rst │ ├── views.apps.data.rst │ ├── views.apps.ensemble.rst │ ├── views.apps.evaluation.rst │ ├── views.apps.extras.rst │ ├── views.apps.model.rst │ ├── views.apps.pipeline.rst │ ├── views.apps.rst │ ├── views.apps.slurm.rst │ ├── views.apps.transforms.rst │ ├── views.database.rst │ ├── views.database.skeleton.rst │ ├── views.database.sources.acled.legacy.rst │ ├── views.database.sources.acled.rst │ ├── views.database.sources.cdum.rst │ ├── views.database.sources.fvp.rst │ ├── views.database.sources.ged.legacy.rst │ ├── views.database.sources.ged.rst │ ├── views.database.sources.icgcw.rst │ ├── views.database.sources.pgdata.rst │ ├── views.database.sources.reign.rst │ ├── views.database.sources.rst │ ├── views.database.sources.spei.rst │ ├── views.database.sources.vdem.rst │ ├── views.database.sources.wdi.rst │ ├── views.rst │ ├── views.specs.data.rst │ ├── views.specs.models.rst │ ├── views.specs.periods.rst │ ├── views.specs.rst │ └── views.utils.rst ├── env_static.yaml ├── install_views2.sh ├── misc ├── README.md ├── defaults.yaml ├── environment.yaml ├── freeze_env.sh └── pytest.ini ├── projects ├── model_development │ ├── README.md │ └── example.ipynb ├── monthly_report │ └── changelog.md ├── plots │ └── example_maps.ipynb ├── prediction_competition │ ├── README.md │ └── benchmark_notebook.ipynb ├── replication_jpr_2020 │ ├── README.md │ └── gitlab_mirror │ │ └── views_jpr_2020_code.zip └── workshop │ └── presentation.ipynb ├── run_tools.sh ├── runners ├── README.md ├── export_data.py ├── import_data.py ├── predict.py ├── predict_slurm.py ├── refresh_data.py ├── refresh_data_slurm.py ├── train_all_local.py ├── train_all_slurm.py ├── train_model.py ├── train_slurm.py └── update_database.py ├── setup.py ├── tests ├── README.md ├── test_app_model_api.py ├── test_calibration.py ├── test_db.py ├── test_misc_utils.py ├── test_specs.py ├── test_structure.py ├── test_transforms_api.py ├── test_transforms_lib.py ├── test_utils.py └── test_utils_data.py └── views ├── __init__.py ├── apps ├── __init__.py ├── data │ ├── README.md │ ├── __init__.py │ ├── api.py │ ├── export_readme │ │ └── README.md │ ├── missing │ │ ├── __init__.py │ │ ├── amelia.py │ │ ├── amelia_template.R │ │ └── missing.py │ └── public.py ├── ensemble │ ├── __init__.py │ ├── ebma.py │ └── templates │ │ ├── install_ebma.R │ │ └── run_ebma.R ├── evaluation │ ├── __init__.py │ ├── feature_importance.py │ └── lib.py ├── extras │ ├── __init__.py │ └── extras.py ├── model │ ├── __init__.py │ ├── api.py │ ├── calibration.py │ └── crosslevel.py ├── pipeline │ ├── README.md │ ├── __init__.py │ ├── ensembles_cm.py │ ├── ensembles_pgm.py │ ├── models_cm.py │ ├── models_pgm.py │ ├── predict.py │ └── train.py ├── plot │ ├── __init__.py │ └── maps.py ├── slurm │ ├── __init__.py │ ├── slurm.py │ └── templates │ │ ├── runfile_core.txt │ │ └── runfile_node.txt ├── transforms │ ├── __init__.py │ └── lib.py └── xgb │ └── lib.py ├── config.py ├── database ├── README.md ├── __init__.py ├── common.py ├── skeleton │ ├── __init__.py │ ├── create_skeleton.sql │ └── skeleton.py └── sources │ ├── __init__.py │ ├── acled │ ├── __init__.py │ ├── acled.py │ ├── acled.sql │ └── legacy │ │ ├── __init__.py │ │ ├── acled.py │ │ └── prepare_acled.sql │ ├── cdum │ ├── __init__.py │ └── cdum.py │ ├── fvp 
│ ├── __init__.py │ ├── fvp.py │ └── spec.yaml │ ├── ged │ ├── __init__.py │ ├── ged.py │ ├── ged.sql │ └── legacy │ │ ├── __init__.py │ │ ├── ged.py │ │ ├── impute.py │ │ └── prepare_ged.sql │ ├── icgcw │ ├── __init__.py │ ├── fetch.py │ ├── icgcw.py │ └── spec.yaml │ ├── pgdata │ ├── __init__.py │ ├── fetch.py │ ├── pgdata.py │ └── spec.yaml │ ├── reign │ ├── __init__.py │ ├── reign.py │ └── spec.yaml │ ├── spei │ ├── __init__.py │ ├── cleanup.sql │ ├── pg_ug.sql │ ├── spei.py │ └── stage.sql │ ├── vdem │ ├── __init__.py │ └── vdem.py │ └── wdi │ ├── __init__.py │ └── wdi.py ├── specs ├── README.md ├── __init__.py ├── data │ ├── README.md │ ├── __init__.py │ ├── parsed_datasets.py │ ├── solver.py │ └── spec.yaml ├── models │ ├── README.md │ ├── __init__.py │ ├── am.yaml │ ├── cm.yaml │ ├── featlists_hh.yaml │ ├── pgm.yaml │ └── solver.py └── periods │ ├── __init__.py │ └── periods.yaml └── utils ├── __init__.py ├── data.py ├── db.py ├── io.py ├── log.py ├── misc.py ├── mocker.py └── stats.py /.gitignore: -------------------------------------------------------------------------------- 1 | sourceme.sh 2 | __pycache__* 3 | *.egg-info 4 | .ipynb_checkpoints 5 | .mypy_cache 6 | storage 7 | *.sublime* 8 | .DS_Store -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | disable=bad-continuation, # Conflicts with black 3 | logging-format-interpolation, # f-strings are better than % 4 | 5 | 6 | good-names=log, 7 | df, 8 | f, 9 | i, 10 | zf, 11 | s, 12 | y, 13 | df_X, 14 | ln, 15 | t, 16 | log, 17 | tp, 18 | tn, 19 | fp, 20 | fn -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ViEWS2 2 | 3 | Getting started 4 | 5 | Download and install miniconda3: https://docs.conda.io/en/latest/miniconda.html 6 | After you have conda installed, in your terminal run 7 | 8 | ./install_views2.sh 9 | 10 | This will create a conda environment called views2 and install the views package there. 11 | To fetch the latest public data run 12 | 13 | conda activate views2 14 | python runners/import_data.py --fetch 15 | 16 | To start using ViEWS code simply run 17 | 18 | conda activate views2 19 | jupyter notebook 20 | 21 | A web browser should open with the jupyter notebook browser. 22 | If you wish to take part in the prediction competition, see projects/prediction_competition/ 23 | An example notebook to get you started modelling is in projects/model_development/examply.ipynb. 24 | 25 | We develop ViEWS on Mac and Linux computers, the procedure is slightly different for Windows and we haven't developed a streamlined process for it yet. 26 | 27 | To open the HTML documentation from here on MacOS run 28 | 29 | ./run_tools.sh 30 | open docs/_build/html/index.html 31 | 32 | And it will take you to the locally built html documementation in your default browser. 
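If you prefer to drive the data import from Python instead of the runner script, the same steps can be scripted directly. Below is a minimal sketch based on runners/import_data.py; the function and dataset names are taken from that runner and assume the views2 environment is active and that you are online.

    import views

    # Download the latest public data snapshot into the scratch directory
    path_zip = views.apps.data.public.fetch_latest_zip_from_website(
        path_dir_destination=views.DIR_SCRATCH
    )

    # Load the tables and geometries from the zip into the local cache
    views.apps.data.public.import_tables_and_geoms(
        tables=views.TABLES, geometries=views.GEOMETRIES, path_zip=path_zip
    )

    # Rebuild one of the standard datasets from the freshly imported tables
    views.DATASETS["cm_africa_imp_0"].refresh(do_transforms=False)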
33 | 34 | To view .pdf documentation (a work in progress) see https://views.pcr.uu.se/download/docs/views.pdf -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | _static 3 | _templates -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | To build the documentation cd to this directory and run 4 | 5 | `sphinx-apidoc -o source/ ../views` 6 | `make html` 7 | 8 | Or just run the run_tools.sh script in the root of the repo. It does this for you. 9 | 10 | Human written source files should go in human_source. 11 | Leave the `source` directory to sphinx-apidoc so that we can delete and rebuild it should it break. 12 | 13 | ## PDF 14 | To build a pdf make sure you have latexpdf installed (miktex worked for me) and run 15 | 16 | make latexpdf 17 | 18 | You will get a views.pdf in `_build/latex/views.pdf`. 19 | -------------------------------------------------------------------------------- /docs/build_docs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | clear 4 | 5 | # Stop on non-zero exit 6 | set -e 7 | 8 | echo "Initalising conda for this shell" 9 | eval "$(conda shell.bash hook)" 10 | conda activate views2 11 | 12 | echo "Generating docs" 13 | # Clear existing generated docs 14 | rm -f source/* 15 | # Auto-generate new docs 16 | # --module-frist makes Package __init__ come before all the submodules 17 | # See https://www.sphinx-doc.org/en/master/man/sphinx-apidoc.html#options 18 | sphinx-apidoc --module-first -o source/ ../views 19 | # Make HTML docs 20 | make html 21 | # Make PDF with latex 22 | make latexpdf -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
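# For the ViEWS docs the repository root (one directory up from docs/) is
# appended below, so that autodoc can import the views package when the
# sphinx-apidoc generated pages are built.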
12 | # 13 | import os 14 | import sys 15 | sys.path.append(os.path.abspath('..')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'ViEWS' 21 | copyright = '2020, ViEWS Team' 22 | author = 'ViEWS Team' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '2.0' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.napoleon' 36 | ] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # List of patterns, relative to source directory, that match files and 42 | # directories to ignore when looking for source files. 43 | # This pattern also affects html_static_path and html_extra_path. 44 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 45 | 46 | 47 | # -- Options for HTML output ------------------------------------------------- 48 | 49 | # The theme to use for HTML and HTML Help pages. See the documentation for 50 | # a list of builtin themes. 51 | # 52 | html_theme = 'alabaster' 53 | 54 | # Add any paths that contain custom static files (such as style sheets) here, 55 | # relative to this directory. They are copied after the builtin static files, 56 | # so a file named "default.css" will overwrite the builtin "default.css". 57 | html_static_path = ['_static'] -------------------------------------------------------------------------------- /docs/human_source/coderef.rst: -------------------------------------------------------------------------------- 1 | Code reference 2 | ============== 3 | 4 | Here you will hopefully soon find auto-generated documentation from the code itself. A bit broken at the moment. Sorry. 5 | 6 | * :ref:`modindex` 7 | * :ref:`genindex` 8 | * :ref:`search` 9 | 10 | 11 | .. automodule:: views 12 | :members: -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. ViEWS documentation master file, created by 2 | sphinx-quickstart on Mon May 18 23:56:52 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | 7 | ViEWS 8 | ===== 9 | 10 | ViEWS is a project focused on forecasting political violence. 11 | To do this we fit models to data and use those models to predict the likelihood of future conflict. 12 | Most of the code in this package is focused on this task and provides a framework for doing this correctly and easily. 13 | 14 | This document is a work in progress. 15 | We apologise for any confusing formatting or layout. 16 | 17 | 18 | 19 | .. toctree:: 20 | :maxdepth: 3 21 | :caption: Contents: 22 | 23 | human_source/manual 24 | human_source/coderef 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | views 2 | ===== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | views 8 | -------------------------------------------------------------------------------- /docs/source/views.apps.data.missing.rst: -------------------------------------------------------------------------------- 1 | views.apps.data.missing package 2 | =============================== 3 | 4 | .. automodule:: views.apps.data.missing 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.apps.data.missing.amelia module 13 | ------------------------------------- 14 | 15 | .. automodule:: views.apps.data.missing.amelia 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.apps.data.missing.missing module 21 | -------------------------------------- 22 | 23 | .. automodule:: views.apps.data.missing.missing 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/source/views.apps.data.rst: -------------------------------------------------------------------------------- 1 | views.apps.data package 2 | ======================= 3 | 4 | .. automodule:: views.apps.data 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | :maxdepth: 4 14 | 15 | views.apps.data.missing 16 | 17 | Submodules 18 | ---------- 19 | 20 | views.apps.data.api module 21 | -------------------------- 22 | 23 | .. automodule:: views.apps.data.api 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | views.apps.data.public module 29 | ----------------------------- 30 | 31 | .. automodule:: views.apps.data.public 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | -------------------------------------------------------------------------------- /docs/source/views.apps.ensemble.rst: -------------------------------------------------------------------------------- 1 | views.apps.ensemble package 2 | =========================== 3 | 4 | .. automodule:: views.apps.ensemble 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.apps.ensemble.ebma module 13 | ------------------------------- 14 | 15 | .. automodule:: views.apps.ensemble.ebma 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.apps.evaluation.rst: -------------------------------------------------------------------------------- 1 | views.apps.evaluation package 2 | ============================= 3 | 4 | .. automodule:: views.apps.evaluation 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.apps.evaluation.feature\_importance module 13 | ------------------------------------------------ 14 | 15 | .. automodule:: views.apps.evaluation.feature_importance 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.apps.evaluation.lib module 21 | -------------------------------- 22 | 23 | .. automodule:: views.apps.evaluation.lib 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/source/views.apps.extras.rst: -------------------------------------------------------------------------------- 1 | views.apps.extras package 2 | ========================= 3 | 4 | .. 
automodule:: views.apps.extras 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.apps.extras.extras module 13 | ------------------------------- 14 | 15 | .. automodule:: views.apps.extras.extras 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.apps.model.rst: -------------------------------------------------------------------------------- 1 | views.apps.model package 2 | ======================== 3 | 4 | .. automodule:: views.apps.model 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.apps.model.api module 13 | --------------------------- 14 | 15 | .. automodule:: views.apps.model.api 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.apps.model.calibration module 21 | ----------------------------------- 22 | 23 | .. automodule:: views.apps.model.calibration 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | views.apps.model.crosslevel module 29 | ---------------------------------- 30 | 31 | .. automodule:: views.apps.model.crosslevel 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | -------------------------------------------------------------------------------- /docs/source/views.apps.pipeline.rst: -------------------------------------------------------------------------------- 1 | views.apps.pipeline package 2 | =========================== 3 | 4 | .. automodule:: views.apps.pipeline 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.apps.pipeline.ensembles\_cm module 13 | ---------------------------------------- 14 | 15 | .. automodule:: views.apps.pipeline.ensembles_cm 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.apps.pipeline.ensembles\_pgm module 21 | ----------------------------------------- 22 | 23 | .. automodule:: views.apps.pipeline.ensembles_pgm 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | views.apps.pipeline.models\_cm module 29 | ------------------------------------- 30 | 31 | .. automodule:: views.apps.pipeline.models_cm 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | views.apps.pipeline.models\_pgm module 37 | -------------------------------------- 38 | 39 | .. automodule:: views.apps.pipeline.models_pgm 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | views.apps.pipeline.predict module 45 | ---------------------------------- 46 | 47 | .. automodule:: views.apps.pipeline.predict 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | 52 | views.apps.pipeline.train module 53 | -------------------------------- 54 | 55 | .. automodule:: views.apps.pipeline.train 56 | :members: 57 | :undoc-members: 58 | :show-inheritance: 59 | 60 | -------------------------------------------------------------------------------- /docs/source/views.apps.rst: -------------------------------------------------------------------------------- 1 | views.apps package 2 | ================== 3 | 4 | .. automodule:: views.apps 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. 
toctree:: 13 | :maxdepth: 4 14 | 15 | views.apps.data 16 | views.apps.ensemble 17 | views.apps.evaluation 18 | views.apps.extras 19 | views.apps.model 20 | views.apps.pipeline 21 | views.apps.slurm 22 | views.apps.transforms 23 | -------------------------------------------------------------------------------- /docs/source/views.apps.slurm.rst: -------------------------------------------------------------------------------- 1 | views.apps.slurm package 2 | ======================== 3 | 4 | .. automodule:: views.apps.slurm 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.apps.slurm.slurm module 13 | ----------------------------- 14 | 15 | .. automodule:: views.apps.slurm.slurm 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.apps.transforms.rst: -------------------------------------------------------------------------------- 1 | views.apps.transforms package 2 | ============================= 3 | 4 | .. automodule:: views.apps.transforms 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.apps.transforms.lib module 13 | -------------------------------- 14 | 15 | .. automodule:: views.apps.transforms.lib 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.database.rst: -------------------------------------------------------------------------------- 1 | views.database package 2 | ====================== 3 | 4 | .. automodule:: views.database 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | :maxdepth: 4 14 | 15 | views.database.skeleton 16 | views.database.sources 17 | 18 | Submodules 19 | ---------- 20 | 21 | views.database.common module 22 | ---------------------------- 23 | 24 | .. automodule:: views.database.common 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | 29 | -------------------------------------------------------------------------------- /docs/source/views.database.skeleton.rst: -------------------------------------------------------------------------------- 1 | views.database.skeleton package 2 | =============================== 3 | 4 | .. automodule:: views.database.skeleton 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.skeleton.skeleton module 13 | --------------------------------------- 14 | 15 | .. automodule:: views.database.skeleton.skeleton 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.acled.legacy.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.acled.legacy package 2 | =========================================== 3 | 4 | .. automodule:: views.database.sources.acled.legacy 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.acled.legacy.acled module 13 | ------------------------------------------------ 14 | 15 | .. 
automodule:: views.database.sources.acled.legacy.acled 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.acled.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.acled package 2 | ==================================== 3 | 4 | .. automodule:: views.database.sources.acled 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | :maxdepth: 4 14 | 15 | views.database.sources.acled.legacy 16 | 17 | Submodules 18 | ---------- 19 | 20 | views.database.sources.acled.acled module 21 | ----------------------------------------- 22 | 23 | .. automodule:: views.database.sources.acled.acled 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.cdum.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.cdum package 2 | =================================== 3 | 4 | .. automodule:: views.database.sources.cdum 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.cdum.cdum module 13 | --------------------------------------- 14 | 15 | .. automodule:: views.database.sources.cdum.cdum 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.fvp.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.fvp package 2 | ================================== 3 | 4 | .. automodule:: views.database.sources.fvp 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.fvp.fvp module 13 | ------------------------------------- 14 | 15 | .. automodule:: views.database.sources.fvp.fvp 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.ged.legacy.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.ged.legacy package 2 | ========================================= 3 | 4 | .. automodule:: views.database.sources.ged.legacy 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.ged.legacy.ged module 13 | -------------------------------------------- 14 | 15 | .. automodule:: views.database.sources.ged.legacy.ged 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.database.sources.ged.legacy.impute module 21 | ----------------------------------------------- 22 | 23 | .. automodule:: views.database.sources.ged.legacy.impute 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.ged.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.ged package 2 | ================================== 3 | 4 | .. 
automodule:: views.database.sources.ged 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | :maxdepth: 4 14 | 15 | views.database.sources.ged.legacy 16 | 17 | Submodules 18 | ---------- 19 | 20 | views.database.sources.ged.ged module 21 | ------------------------------------- 22 | 23 | .. automodule:: views.database.sources.ged.ged 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.icgcw.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.icgcw package 2 | ==================================== 3 | 4 | .. automodule:: views.database.sources.icgcw 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.icgcw.fetch module 13 | ----------------------------------------- 14 | 15 | .. automodule:: views.database.sources.icgcw.fetch 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.database.sources.icgcw.icgcw module 21 | ----------------------------------------- 22 | 23 | .. automodule:: views.database.sources.icgcw.icgcw 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.pgdata.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.pgdata package 2 | ===================================== 3 | 4 | .. automodule:: views.database.sources.pgdata 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.pgdata.fetch module 13 | ------------------------------------------ 14 | 15 | .. automodule:: views.database.sources.pgdata.fetch 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.database.sources.pgdata.pgdata module 21 | ------------------------------------------- 22 | 23 | .. automodule:: views.database.sources.pgdata.pgdata 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.reign.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.reign package 2 | ==================================== 3 | 4 | .. automodule:: views.database.sources.reign 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.reign.reign module 13 | ----------------------------------------- 14 | 15 | .. automodule:: views.database.sources.reign.reign 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.rst: -------------------------------------------------------------------------------- 1 | views.database.sources package 2 | ============================== 3 | 4 | .. automodule:: views.database.sources 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. 
toctree:: 13 | :maxdepth: 4 14 | 15 | views.database.sources.acled 16 | views.database.sources.cdum 17 | views.database.sources.fvp 18 | views.database.sources.ged 19 | views.database.sources.icgcw 20 | views.database.sources.pgdata 21 | views.database.sources.reign 22 | views.database.sources.spei 23 | views.database.sources.vdem 24 | views.database.sources.wdi 25 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.spei.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.spei package 2 | =================================== 3 | 4 | .. automodule:: views.database.sources.spei 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.spei.spei module 13 | --------------------------------------- 14 | 15 | .. automodule:: views.database.sources.spei.spei 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.vdem.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.vdem package 2 | =================================== 3 | 4 | .. automodule:: views.database.sources.vdem 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.vdem.vdem module 13 | --------------------------------------- 14 | 15 | .. automodule:: views.database.sources.vdem.vdem 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.wdi.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.wdi package 2 | ================================== 3 | 4 | .. automodule:: views.database.sources.wdi 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.wdi.wdi module 13 | ------------------------------------- 14 | 15 | .. automodule:: views.database.sources.wdi.wdi 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.rst: -------------------------------------------------------------------------------- 1 | views package 2 | ============= 3 | 4 | .. automodule:: views 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | :maxdepth: 4 14 | 15 | views.apps 16 | views.database 17 | views.specs 18 | views.utils 19 | 20 | Submodules 21 | ---------- 22 | 23 | views.config module 24 | ------------------- 25 | 26 | .. automodule:: views.config 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | -------------------------------------------------------------------------------- /docs/source/views.specs.data.rst: -------------------------------------------------------------------------------- 1 | views.specs.data package 2 | ======================== 3 | 4 | .. automodule:: views.specs.data 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.specs.data.parsed\_datasets module 13 | ---------------------------------------- 14 | 15 | .. 
automodule:: views.specs.data.parsed_datasets 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.specs.data.solver module 21 | ------------------------------ 22 | 23 | .. automodule:: views.specs.data.solver 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/source/views.specs.models.rst: -------------------------------------------------------------------------------- 1 | views.specs.models package 2 | ========================== 3 | 4 | .. automodule:: views.specs.models 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.specs.models.solver module 13 | -------------------------------- 14 | 15 | .. automodule:: views.specs.models.solver 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.specs.periods.rst: -------------------------------------------------------------------------------- 1 | views.specs.periods package 2 | =========================== 3 | 4 | .. automodule:: views.specs.periods 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/views.specs.rst: -------------------------------------------------------------------------------- 1 | views.specs package 2 | =================== 3 | 4 | .. automodule:: views.specs 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | :maxdepth: 4 14 | 15 | views.specs.data 16 | views.specs.models 17 | views.specs.periods 18 | -------------------------------------------------------------------------------- /docs/source/views.utils.rst: -------------------------------------------------------------------------------- 1 | views.utils package 2 | =================== 3 | 4 | .. automodule:: views.utils 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.utils.data module 13 | ----------------------- 14 | 15 | .. automodule:: views.utils.data 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.utils.db module 21 | --------------------- 22 | 23 | .. automodule:: views.utils.db 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | views.utils.io module 29 | --------------------- 30 | 31 | .. automodule:: views.utils.io 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | views.utils.log module 37 | ---------------------- 38 | 39 | .. automodule:: views.utils.log 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | views.utils.misc module 45 | ----------------------- 46 | 47 | .. automodule:: views.utils.misc 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | 52 | views.utils.mocker module 53 | ------------------------- 54 | 55 | .. automodule:: views.utils.mocker 56 | :members: 57 | :undoc-members: 58 | :show-inheritance: 59 | 60 | views.utils.stats module 61 | ------------------------ 62 | 63 | .. 
automodule:: views.utils.stats 64 | :members: 65 | :undoc-members: 66 | :show-inheritance: 67 | 68 | -------------------------------------------------------------------------------- /install_views2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Stop on error 4 | set -e 5 | echo "Started installing views." 6 | echo "Initalising conda for this shell" 7 | eval "$(conda shell.bash hook)" 8 | 9 | echo "Updating conda" 10 | conda update --all --yes 11 | echo "Removing existing views2 env" 12 | conda remove --name views2 --all --yes 13 | echo "Creating env from env_static.yaml" 14 | # @TODO: Change back to env_static.yaml asap when we have working "builds" for linux 15 | conda env create -f misc/environment.yaml 16 | echo "Activating env" 17 | conda activate views2 18 | echo "Running pip install --editable . to install the views package" 19 | pip install --editable . 20 | 21 | echo "Creating storage directory here" 22 | mkdir -p ./storage 23 | 24 | # Copy the default config file to default config dir ~/.views2/ 25 | if [ ! -f ./config.yaml ]; 26 | then 27 | echo "No current ./config.yaml found, copying the defaults" 28 | cp ./misc/defaults.yaml ./config.yaml 29 | else 30 | echo "./config.yaml already exists, not changing it" 31 | fi 32 | 33 | echo "Great success, you can now do \" conda activate views2 \" in your shell and get started." -------------------------------------------------------------------------------- /misc/README.md: -------------------------------------------------------------------------------- 1 | # Misc? 2 | 3 | ## Static dependencies list 4 | 5 | To avoid issues with breaking changes from updated dependencies the 6 | main installer now uses a static list of versioned dependencies in env_static.yaml. 7 | No more updated dependencies suddenly breaking code. 8 | 9 | If you want to add a dependency: 10 | 11 | * add it to environment.yaml in this dir, 12 | * recreate the views2 environment with (from this dir): 13 | 14 | conda remove --name views2 --all --yes 15 | conda env create -f environment.yaml 16 | 17 | Then run 18 | 19 | ./freeze_env.sh 20 | 21 | to update env_static.yaml. 22 | -------------------------------------------------------------------------------- /misc/defaults.yaml: -------------------------------------------------------------------------------- 1 | default_database: views 2 | databases: 3 | views: 4 | user: username # CHANGE ME! 
5 | host: janus 6 | dbname: views 7 | port: 5432 8 | use_ssl: True 9 | ssl_cert: "~/.postgres/postgresql.crt" 10 | ssl_key: "~/.postgres/postgresql.key" 11 | ssl_rootcert: "~/.postgres/root.crt" 12 | local: 13 | user: username 14 | host: 127.0.0.1 15 | dbname: postgres 16 | port: 5432 17 | use_ssl: False 18 | password: "" 19 | 20 | dirs: 21 | storage: "" # Emtpy string will default to the storage directory in the repo 22 | scratch: "" # Emtpy string will default to the storage/scratch/ directory in the repo 23 | 24 | slurm: 25 | username: "" 26 | project: "" 27 | 28 | qualtrics: 29 | token: "QUALTRICS_TOKEN" 30 | 31 | -------------------------------------------------------------------------------- /misc/environment.yaml: -------------------------------------------------------------------------------- 1 | name: views2 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python == 3.8 6 | - pip # for the pip-only deps 7 | - jupyter # jupyter notebooks 8 | - jupyterlab # jupyter lab, much nicer notebooks 9 | - numpy 10 | - pandas >= 1.0 11 | - scipy # more math 12 | - scikit-learn #models 13 | - statsmodels # models 14 | - matplotlib # plotting 15 | - psycopg2 #postgres io 16 | - sqlalchemy # postgres io 17 | - ipython # interactive terminal 18 | - pyyaml # YAML IO 19 | - joblib # easier multiprocessing and a faster pickler 20 | - h5py # hdf5 file for dynasim aggregation 21 | - pylint == 2.4.4 # linting, freeze at 2.4.4 as 2.5 gave errors 22 | - beautifulsoup4 # Web scraping 23 | - lxml # parser for bs4 24 | - xarray # SPEI loading 25 | - numba # For fast stuff 26 | - pyarrow # For Parquet IO in pandas 27 | - psutil # For physical core detection, thread's don't help dynasim 28 | - descartes # Basic Geopandas Plotting 29 | - xlrd # Dependency pd.read_excel 30 | - xlwt # For formatting (old) Excel files 31 | - seaborn # Fancy plotting 32 | # These packages are only available from pip or have later versions there 33 | - pip: 34 | - black # Code formatting 35 | - coverage # Test coverage report 36 | - flake8 # Grumpy linter 37 | - geoalchemy2 # Geometry types for pushing geodataframes to postgres 38 | - geopandas 39 | - html5lib # Reign loader bs4 parser 40 | - libpysal 41 | - mypy # Type checking 42 | - netcdf4 # SPEI loading 43 | - pytest # Testing 44 | - requests # read the web 45 | - sphinx # Docs 46 | - xgboost # pip has later versions than conda 47 | - contextily 48 | - pdpbox -------------------------------------------------------------------------------- /misc/freeze_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Overwrite env_static.yaml with the latest versions of depencies from your env. 4 | # Make sure to run all the tests before committing an env_static.yaml with 5 | # newer packages so that we are all working on the same versions. 
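# Note: the export below writes to ../env_static.yaml, i.e. the env_static.yaml
# in the repository root that install_views2.sh is meant to create the
# views2 environment from.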
6 | 7 | echo "Initalising conda for this shell" 8 | eval "$(conda shell.bash hook)" 9 | conda activate views2 10 | conda env export --no-builds | grep -v "prefix" > ../env_static.yaml 11 | -------------------------------------------------------------------------------- /misc/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore:Using or importing the ABCs:DeprecationWarning -------------------------------------------------------------------------------- /projects/model_development/README.md: -------------------------------------------------------------------------------- 1 | # Model development -------------------------------------------------------------------------------- /projects/monthly_report/changelog.md: -------------------------------------------------------------------------------- 1 | # r_2021_03_01 2 | 3 | **General changes (to merge into master)**: 4 | * Reduced cores per job by one via `mem_per_job` in `monthly.py` to avoid crashing multiprocessing pool. 5 | * Simplified argparse of `train_model.py` and `train_slurm.py` to allow listing of steps and models like `--model model_a model_b` rather than `--model model_a --model model_b`. 6 | * `get_files_latest_fetch` in `common.py` fetched first rather than last item in sorted list. This has applied to ICGCW and REIGN. Now fetches the correct latest version of the data. 7 | * Allowed passing 0 to `tlag` in transforms lib for tlag_0 variables. 8 | * Adding current violent history input to the model specs. Zero time-lagged to avoid step-shifting our outcome in model api. See specific changes to column sets below. 9 | 10 | **Changes at cm**: 11 | * Added to `cfshort`: 12 | ``` 13 | - tlags_0_ged_dummy_sb_ns_os 14 | - ged_best_sb_ns_os 15 | - greq_5_ged_best_sb_ns_os 16 | - tlags_0_greq_5_ged_best_sb_ns_os 17 | - tlags_0_greq_25_ged_best_sb_ns_os 18 | - greq_100_ged_best_sb_ns_os 19 | - tlags_0_greq_100_ged_best_sb_ns_os 20 | ``` 21 | * Added to `cflong`: 22 | ``` 23 | - tlags_0_ged_dummy_sb_ns_os 24 | - ged_best_sb_ns_os 25 | - greq_5_ged_best_sb_ns_os 26 | - tlags_0_greq_5_ged_best_sb_ns_os 27 | - tlags_0_greq_25_ged_best_sb_ns_os 28 | - greq_100_ged_best_sb_ns_os 29 | - tlags_0_greq_100_ged_best_sb_ns_os 30 | ``` 31 | 32 | **Changes at pgm**: 33 | 34 | * Added to `legacy_hist_common`: 35 | ``` 36 | - tlags_0_ged_dummy_sb_ns_os 37 | - ged_best_sb_ns_os # TODO? 38 | - tlags_0_greq_5_ged_best_sb_ns_os 39 | - tlags_0_greq_25_ged_best_sb_ns_os 40 | - tlags_0_greq_100_ged_best_sb_ns_os 41 | - acled_protest 42 | ``` 43 | * Added colset `acled_protest`: 44 | ``` 45 | - acled_dummy_pr 46 | - tlag_0_acled_dummy_pr 47 | - acled_count_pr 48 | ``` 49 | 50 | 51 | **Retrained models at cm**: 52 | * cm_sb_cfshort 53 | * cm_sb_cflong 54 | * cm_sb_acled_violence 55 | * cm_sb_acled_protest 56 | * cm_sbonset24_25_all 57 | * cm_sb_all_global 58 | 59 | 60 | **Retrained models at pgm**: 61 | * pgm_sb_hist_legacy 62 | * pgm_sb_allthemes 63 | * pgm_sb_onset24_100_all 64 | * pgm_sb_onset24_1_all 65 | * pgm_sb_all_gxgb -------------------------------------------------------------------------------- /projects/prediction_competition/README.md: -------------------------------------------------------------------------------- 1 | # Welcome 2 | 3 | Welcome to the ViEWS prediction competition. 4 | 5 | # Getting started 6 | 7 | See the README.md in the root of this repository for installing and starting the jupyter notebook server. 
8 | Then navigate to this directory in the jupyter notebook browser window and open benchmark_notebook.ipynb 9 | 10 | Good luck! -------------------------------------------------------------------------------- /projects/replication_jpr_2020/README.md: -------------------------------------------------------------------------------- 1 | # JPR 2020 replication 2 | 3 | This directory contains a .zip of the code in the repository used to publish the ViEWS paper in JPR in 2020. -------------------------------------------------------------------------------- /projects/replication_jpr_2020/gitlab_mirror/views_jpr_2020_code.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UppsalaConflictDataProgram/OpenViEWS2/7eb3e63c8c046de31f70cd56f417fadf03686f5a/projects/replication_jpr_2020/gitlab_mirror/views_jpr_2020_code.zip -------------------------------------------------------------------------------- /run_tools.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | clear 4 | 5 | # Stop on non-zero exit 6 | # We don't want to lint if the tests fail 7 | set -e 8 | 9 | echo "Initalising conda for this shell" 10 | eval "$(conda shell.bash hook)" 11 | conda activate views2 12 | 13 | 14 | echo "Black" 15 | black -l 79 views 16 | black -l 79 projects 17 | black -l 79 tests 18 | black -l 79 runners 19 | 20 | echo "mypy views" 21 | mypy views 22 | #echo "mypy projects" 23 | # mypy projects/* 24 | mypy runners 25 | echo "mypy tests" 26 | mypy tests 27 | 28 | 29 | echo "Running pytest with coverage" 30 | coverage run --source views -m pytest -c misc/pytest.ini tests/ 31 | coverage report --show-missing 32 | 33 | # Allow non-zero exit for lints 34 | set +e 35 | 36 | echo "flake8" 37 | # Ignores are for black conflicts, black wins 38 | flake8 --ignore=E203,W503 views 39 | flake8 --ignore=E203,W503 projects 40 | 41 | echo "pylint" 42 | pylint views 43 | 44 | echo "Generating docs" 45 | # Clear existing generated docs 46 | rm -f docs/source/* 47 | # Auto-generate new docs 48 | # --module-frist makes Package __init__ come before all the submodules 49 | # See https://www.sphinx-doc.org/en/master/man/sphinx-apidoc.html#options 50 | sphinx-apidoc --module-first -o docs/source/ views 51 | # Make HTML docs 52 | make -C docs/ html 53 | 54 | git status 55 | -------------------------------------------------------------------------------- /runners/README.md: -------------------------------------------------------------------------------- 1 | # Runners 2 | 3 | Runners are entrypoint scripts to ViEWS functionality for 4 | 5 | * Training 6 | * Predicting 7 | * Evaluating 8 | 9 | They should be as simple as possible, with complexity handled in the apps themselves. 10 | Entrypoints should only handle 11 | 12 | * Dealing with execution context (slurm, conda etc). 
13 | * Parsing arguments 14 | * Logging 15 | * Executing the correct functionality from modules 16 | 17 | -------------------------------------------------------------------------------- /runners/export_data.py: -------------------------------------------------------------------------------- 1 | """ Refresh all datasets that are defined by the specs """ 2 | 3 | import logging 4 | import views 5 | 6 | logging.basicConfig( 7 | level=logging.DEBUG, 8 | format=views.config.LOGFMT, 9 | handlers=[ 10 | logging.FileHandler(views.utils.log.get_log_path(__file__)), 11 | logging.StreamHandler(), 12 | ], 13 | ) 14 | log = logging.getLogger(__name__) 15 | 16 | 17 | def run_export_tables_and_geoms() -> None: 18 | views.apps.data.public.export_tables_and_geoms( 19 | tables=views.TABLES, 20 | geometries=views.GEOMETRIES, 21 | dir_output=views.DIR_SCRATCH, 22 | ) 23 | 24 | 25 | def main(): 26 | run_export_tables_and_geoms() 27 | 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /runners/import_data.py: -------------------------------------------------------------------------------- 1 | """ Import data to local cache """ 2 | 3 | import argparse 4 | import logging 5 | from typing import Optional, Tuple 6 | import views 7 | 8 | logging.basicConfig( 9 | level=logging.DEBUG, 10 | format=views.config.LOGFMT, 11 | handlers=[ 12 | logging.FileHandler(views.utils.log.get_log_path(__file__)), 13 | logging.StreamHandler(), 14 | ], 15 | ) 16 | log = logging.getLogger(__name__) 17 | 18 | 19 | def parse_args() -> Tuple[Optional[str], bool, bool]: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument( 22 | "--path_zip", type=str, help="Path to zip to import from", 23 | ) 24 | parser.add_argument( 25 | "--fetch", action="store_true", help="Fetch from website." 26 | ) 27 | parser.add_argument( 28 | "--datasets", action="store_true", help="Refresh datasets." 
29 | ) 30 | args = parser.parse_args() 31 | 32 | if args.path_zip and args.fetch: 33 | raise RuntimeError("Pass in --path_zip or --fetch, not both.") 34 | 35 | return args.path_zip, args.fetch, args.datasets 36 | 37 | 38 | def run_import_tables_and_geoms(path_zip) -> None: 39 | views.apps.data.public.import_tables_and_geoms( 40 | tables=views.TABLES, geometries=views.GEOMETRIES, path_zip=path_zip, 41 | ) 42 | 43 | 44 | def refresh_datasets() -> None: 45 | 46 | log.info("Started refreshing all datasets.") 47 | 48 | datasets_to_update = [ 49 | "cm_global_imp_0", 50 | "cm_africa_imp_0", 51 | "pgm_africa_imp_0", 52 | ] 53 | for dataset_name in datasets_to_update: 54 | log.info(f"Started refreshing dataset {dataset_name}") 55 | views.DATASETS[dataset_name].refresh(do_transforms=False) 56 | 57 | log.info("Finished refreshing all imp_0 datasets.") 58 | 59 | 60 | def main() -> None: 61 | 62 | path_zip, do_fetch, do_datasets = parse_args() 63 | 64 | if do_fetch: 65 | path_zip = views.apps.data.public.fetch_latest_zip_from_website( 66 | path_dir_destination=views.DIR_SCRATCH 67 | ) 68 | 69 | run_import_tables_and_geoms(path_zip) 70 | refresh_datasets() 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /runners/predict.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import logging 5 | from typing import Tuple, List 6 | 7 | from views import DATASETS 8 | from views.apps.model import api 9 | from views.apps.pipeline import ( 10 | predict, 11 | models_cm, 12 | models_pgm, 13 | ensembles_cm, 14 | ensembles_pgm, 15 | ) 16 | from views.config import LOGFMT 17 | from views.utils.log import get_log_path 18 | from views.utils.data import assign_into_df 19 | 20 | 21 | logging.basicConfig( 22 | level=logging.DEBUG, 23 | format=LOGFMT, 24 | handlers=[ 25 | logging.FileHandler(get_log_path(__file__)), 26 | logging.StreamHandler(), 27 | ], 28 | ) 29 | 30 | log = logging.getLogger(__name__) 31 | 32 | 33 | def predict_cm_models(run_id: str, n_cores: int) -> None: 34 | """ Predict with all CM models """ 35 | dataset = DATASETS["cm_africa_imp_0"] 36 | models = models_cm.all_cm_models 37 | predict.predict_models( 38 | models=models, dataset=dataset, run_id=run_id, n_cores=n_cores 39 | ) 40 | 41 | 42 | def predict_pgm_models(run_id: str, n_cores: int) -> None: 43 | """ Predict with all PGM models """ 44 | dataset = DATASETS["pgm_africa_imp_0"] 45 | models = models_pgm.all_pgm_models 46 | predict.predict_models( 47 | models=models, dataset=dataset, run_id=run_id, n_cores=n_cores 48 | ) 49 | 50 | 51 | def predict_cm_ensembles(run_id: str, n_cores: int) -> None: 52 | """ Predict with all CM ensembles """ 53 | ensembles = ensembles_cm.all_cm_ensembles 54 | dataset = DATASETS["cm_africa_imp_0"] 55 | predict.predict_ensembles(ensembles, dataset, run_id, n_cores=n_cores) 56 | 57 | 58 | def predict_pgm_ensembles(run_id: str, n_cores: int) -> None: 59 | """ Predict with all PGM ensembles """ 60 | ensembles = ensembles_pgm.all_pgm_ensembles 61 | dataset = DATASETS["pgm_africa_imp_0"] 62 | predict.predict_ensembles(ensembles, dataset, run_id, n_cores=n_cores) 63 | 64 | 65 | def predict_pgm_ensembles_and_constituent(run_id: str, n_cores: int) -> None: 66 | """ Predict all PGM ensembles and their constituent models """ 67 | log.info(f"Predicting PGM ensembles and their constituent models") 68 | ensembles = ensembles_pgm.all_pgm_ensembles 69 | models: 
List[api.Model] = [] 70 | for ensemble in ensembles: 71 | for model in ensemble.models: 72 | if not any([m for m in models if m.name == model.name]): 73 | models.append(model) 74 | 75 | dataset = DATASETS["pgm_africa_imp_0"] 76 | predict.predict_models( 77 | models=models, dataset=dataset, run_id=run_id, n_cores=n_cores 78 | ) 79 | predict.predict_ensembles( 80 | ensembles=ensembles, dataset=dataset, run_id=run_id, n_cores=n_cores 81 | ) 82 | 83 | 84 | def predict_cm_ensembles_and_constituent(run_id: str, n_cores: int) -> None: 85 | """ Predict all cm ensembles and their constituent models """ 86 | log.info(f"Predicting CM ensembles and their constituent models") 87 | ensembles = ensembles_cm.all_cm_ensembles 88 | models: List[api.Model] = [] 89 | for ensemble in ensembles: 90 | for model in ensemble.models: 91 | if not any([m for m in models if m.name == model.name]): 92 | models.append(model) 93 | 94 | dataset = DATASETS["cm_africa_imp_0"] 95 | predict.predict_models( 96 | models=models, dataset=dataset, run_id=run_id, n_cores=n_cores 97 | ) 98 | predict.predict_ensembles( 99 | ensembles=ensembles, dataset=dataset, run_id=run_id, n_cores=n_cores 100 | ) 101 | 102 | 103 | def parse_args() -> Tuple[str, bool, bool, bool, bool, int]: 104 | parser = argparse.ArgumentParser() 105 | parser.add_argument( 106 | "--pgm", action="store_true", help="Predict PGM models?" 107 | ) 108 | parser.add_argument("--cm", action="store_true", help="Predict CM models?") 109 | parser.add_argument( 110 | "--run_id", type=str, help="Run ID to predict for", required=True 111 | ) 112 | parser.add_argument( 113 | "--model", action="store_true", help="Make model predictions" 114 | ) 115 | parser.add_argument( 116 | "--ensemble", action="store_true", help="Make ensemble predictions" 117 | ) 118 | parser.add_argument("--n_cores", type=int, choices=range(0, 40), default=4) 119 | args = parser.parse_args() 120 | 121 | return ( 122 | args.run_id, 123 | args.pgm, 124 | args.cm, 125 | args.model, 126 | args.ensemble, 127 | args.n_cores, 128 | ) 129 | 130 | 131 | def main(): 132 | run_id, do_pgm, do_cm, do_model, do_ensemble, n_cores = parse_args() 133 | log.info( 134 | f"predict running with flags " 135 | f"run_id {run_id} do_pgm {do_pgm} do_cm {do_cm} do_model " 136 | f"{do_model} do_ensemble {do_ensemble}" 137 | ) 138 | 139 | if do_model and do_ensemble: 140 | if do_cm: 141 | predict_cm_ensembles_and_constituent(run_id, n_cores) 142 | if do_pgm: 143 | predict_pgm_ensembles_and_constituent(run_id, n_cores) 144 | elif do_model: 145 | if do_cm: 146 | predict_cm_models(run_id, n_cores) 147 | if do_pgm: 148 | predict_pgm_models(run_id, n_cores) 149 | elif do_ensemble: 150 | if do_cm: 151 | predict_cm_ensembles(run_id, n_cores) 152 | if do_pgm: 153 | predict_pgm_ensembles(run_id, n_cores) 154 | else: 155 | log.info(f"Nothing to do! 
Run predict.py --help to show args.") 156 | 157 | 158 | if __name__ == "__main__": 159 | try: 160 | main() 161 | except: 162 | log.exception(f"Something broke") 163 | raise 164 | -------------------------------------------------------------------------------- /runners/predict_slurm.py: -------------------------------------------------------------------------------- 1 | """ Train all models via slurm """ 2 | import argparse 3 | import os 4 | import sys 5 | import logging 6 | from typing import Tuple 7 | 8 | from views.apps.slurm.slurm import run_command 9 | from views.config import LOGFMT 10 | from views.utils.log import get_log_path 11 | 12 | logging.basicConfig( 13 | level=logging.DEBUG, 14 | format=LOGFMT, 15 | handlers=[ 16 | logging.FileHandler(get_log_path(__file__)), 17 | logging.StreamHandler(), 18 | ], 19 | ) 20 | 21 | 22 | def parse_args() -> Tuple[str, bool, bool, int, int]: 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--pgm", action="store_true", help="Predict PGM?") 25 | parser.add_argument("--cm", action="store_true", help="Predict CM?") 26 | parser.add_argument( 27 | "--run_id", type=str, help="Run ID to predict for", required=True 28 | ) 29 | parser.add_argument("--n_cores", type=int, choices=range(0, 40), default=4) 30 | parser.add_argument("--hours", type=int, choices=range(0, 128), default=24) 31 | args = parser.parse_args() 32 | 33 | return args.run_id, args.pgm, args.cm, args.n_cores, args.hours 34 | 35 | 36 | def _build_command(loa: str, run_id: str, n_cores: int) -> str: 37 | path_runner = os.path.join( 38 | os.path.dirname(os.path.abspath(__file__)), "predict.py" 39 | ) 40 | path_exec = sys.executable 41 | return f"{path_exec} {path_runner} --{loa} --run_id {run_id} --model --ensemble --n_cores {n_cores}" 42 | 43 | 44 | def main() -> None: 45 | 46 | run_id, do_pgm, do_cm, n_cores, hours = parse_args() 47 | 48 | if do_cm: 49 | cmd = _build_command(loa="cm", run_id=run_id, n_cores=n_cores) 50 | run_command(cmd, hours=hours) 51 | 52 | if do_pgm: 53 | cmd = _build_command(loa="pgm", run_id=run_id, n_cores=n_cores) 54 | run_command(cmd, hours=hours) 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /runners/refresh_data.py: -------------------------------------------------------------------------------- 1 | """ Refresh all datasets that are defined by the specs """ 2 | 3 | from typing import Tuple 4 | import argparse 5 | import logging 6 | 7 | import views 8 | 9 | logging.basicConfig( 10 | level=logging.DEBUG, 11 | format=views.config.LOGFMT, 12 | handlers=[ 13 | logging.FileHandler(views.utils.log.get_log_path(__file__)), 14 | logging.StreamHandler(), 15 | ], 16 | ) 17 | log = logging.getLogger(__name__) 18 | 19 | 20 | def parse_args() -> Tuple[bool, bool, bool]: 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("--all", action="store_true", help="refresh all") 24 | parser.add_argument( 25 | "--geom", action="store_true", help="refresh geometries" 26 | ) 27 | parser.add_argument("--tables", action="store_true", help="refresh tables") 28 | parser.add_argument( 29 | "--datasets", action="store_true", help="refresh datasets" 30 | ) 31 | args = parser.parse_args() 32 | 33 | do_geom = args.geom 34 | do_tables = args.tables 35 | do_datasets = args.datasets 36 | if args.all: 37 | do_geom, do_tables, do_datasets = True, True, True 38 | 39 | if not any([do_geom, do_tables, do_datasets]): 40 | log.info("Nothing to do, see python refresh_data.py --help 
for args.") 41 | 42 | return do_geom, do_tables, do_datasets 43 | 44 | 45 | def refresh_geometries() -> None: 46 | log.info(f"Refreshing all Geometries") 47 | for geometry in views.GEOMETRIES.values(): 48 | geometry.refresh() 49 | log.info("Finished refreshing all Geometries") 50 | 51 | 52 | def refresh_tables() -> None: 53 | log.info(f"Refreshing all Tables") 54 | for table in views.TABLES.values(): 55 | table.refresh() 56 | log.info("Finished refreshing all Tables") 57 | 58 | 59 | def refresh_datasets() -> None: 60 | log.info(f"Refreshing all Datasets") 61 | for dataset in views.DATASETS.values(): 62 | dataset.refresh() 63 | log.info("Finished refreshing all Datasets") 64 | 65 | 66 | def refresh_all(): 67 | do_geom, do_tables, do_datasets = parse_args() 68 | if do_geom: 69 | refresh_geometries() 70 | if do_tables: 71 | refresh_tables() 72 | if do_datasets: 73 | refresh_datasets() 74 | 75 | 76 | if __name__ == "__main__": 77 | refresh_all() 78 | -------------------------------------------------------------------------------- /runners/refresh_data_slurm.py: -------------------------------------------------------------------------------- 1 | """ Refresh data via slurm """ 2 | 3 | import os 4 | import sys 5 | import logging 6 | 7 | from views.apps.slurm.slurm import run_command 8 | from views.config import LOGFMT 9 | from views.utils.log import get_log_path 10 | 11 | logging.basicConfig( 12 | level=logging.DEBUG, 13 | format=LOGFMT, 14 | handlers=[ 15 | logging.FileHandler(get_log_path(__file__)), 16 | logging.StreamHandler(), 17 | ], 18 | ) 19 | 20 | log = logging.getLogger(__name__) 21 | 22 | 23 | def _build_cmd_refresh_data() -> str: 24 | """ Just get a shell command for starting refersh_data.py """ 25 | 26 | path_runner = os.path.join( 27 | os.path.dirname(os.path.abspath(__file__)), "refresh_data.py" 28 | ) 29 | path_exec = sys.executable 30 | cmd = f"{path_exec} {path_runner} --all" 31 | return cmd 32 | 33 | 34 | def main() -> None: 35 | run_command(command=_build_cmd_refresh_data(), hours=24) 36 | 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /runners/train_all_local.py: -------------------------------------------------------------------------------- 1 | """ Train all models locally """ 2 | 3 | import argparse 4 | import os 5 | import sys 6 | import logging 7 | from typing import Tuple 8 | 9 | from views import DATASETS 10 | from views.apps.model import api 11 | from views.apps.pipeline import models_cm, models_pgm 12 | from views.config import LOGFMT 13 | from views.utils.log import get_log_path 14 | 15 | logging.basicConfig( 16 | level=logging.DEBUG, 17 | format=LOGFMT, 18 | handlers=[ 19 | logging.FileHandler(get_log_path(__file__)), 20 | logging.StreamHandler(), 21 | ], 22 | ) 23 | 24 | log = logging.getLogger(__name__) 25 | 26 | 27 | def parse_args() -> Tuple[bool, bool]: 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--pgm", action="store_true", help="Train PGM models?") 30 | parser.add_argument("--cm", action="store_true", help="Train CM models?") 31 | args = parser.parse_args() 32 | 33 | return args.pgm, args.cm 34 | 35 | 36 | def main() -> None: 37 | do_pgm, do_cm = parse_args() 38 | 39 | if do_pgm: 40 | for model in models_pgm.all_pgm_models: 41 | df = DATASETS["flat_pgm_africa_1"].df 42 | model.fit_estimators(df) 43 | model.save() 44 | 45 | if do_cm: 46 | for model in models_cm.all_cm_models: 47 | if "train_africa" in model.tags: 48 | df = 
DATASETS["flat_cm_africa_1"].df 49 | elif "train_global" in model.tags: 50 | df = DATASETS["flat_cm_global_1"].df 51 | model.fit_estimators(df) 52 | model.save() 53 | 54 | 55 | if __name__ == "__main__": 56 | try: 57 | main() 58 | except: 59 | log.exception(f"Training failed for some reason.") 60 | -------------------------------------------------------------------------------- /runners/train_all_slurm.py: -------------------------------------------------------------------------------- 1 | """ Train all models via slurm """ 2 | 3 | import argparse 4 | import os 5 | import sys 6 | import logging 7 | from typing import Tuple 8 | 9 | from views.apps.model import api 10 | from views.apps.pipeline import models_cm, models_pgm 11 | from views.apps.slurm.slurm import run_command 12 | from views.config import LOGFMT 13 | from views.utils.log import get_log_path 14 | 15 | logging.basicConfig( 16 | level=logging.DEBUG, 17 | format=LOGFMT, 18 | handlers=[ 19 | logging.FileHandler(get_log_path(__file__)), 20 | logging.StreamHandler(), 21 | ], 22 | ) 23 | 24 | log = logging.getLogger(__name__) 25 | 26 | 27 | def parse_args() -> Tuple[bool, bool]: 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--pgm", action="store_true", help="Train PGM models?") 30 | parser.add_argument("--cm", action="store_true", help="Train CM models?") 31 | args = parser.parse_args() 32 | 33 | return args.pgm, args.cm 34 | 35 | 36 | def _build_cmd_train_model(model: api.Model, dataset: str, loa: str) -> str: 37 | path_runner = os.path.join( 38 | os.path.dirname(os.path.abspath(__file__)), "train_model.py" 39 | ) 40 | path_exec = sys.executable 41 | cmd = ( 42 | f"{path_exec} {path_runner} " 43 | f"--model {model.name} " 44 | f"--dataset {dataset} " 45 | f"--loa {loa} " 46 | ) 47 | return cmd 48 | 49 | 50 | def main() -> None: 51 | 52 | train_pgm, train_cm = parse_args() 53 | 54 | # CM 55 | if train_cm: 56 | log.info(f"--cm was passed, training all CM models.") 57 | for model in models_cm.all_cm_models: 58 | if "train_africa" in model.tags: 59 | cmd = _build_cmd_train_model( 60 | model, dataset="flat_cm_africa_1", loa="cm" 61 | ) 62 | elif "train_global" in model.tags: 63 | cmd = _build_cmd_train_model( 64 | model, dataset="flat_cm_global_1", loa="cm" 65 | ) 66 | run_command(cmd) 67 | 68 | # PGM 69 | if train_pgm: 70 | log.info(f"--pgm was passed, training all pgm models.") 71 | for model in models_pgm.all_pgm_models: 72 | cmd = _build_cmd_train_model( 73 | model, dataset="flat_pgm_africa_1", loa="pgm" 74 | ) 75 | run_command(cmd, hours=48) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /runners/train_model.py: -------------------------------------------------------------------------------- 1 | """ Command line interface for model training """ 2 | from typing import Tuple 3 | from typing_extensions import Literal 4 | import argparse 5 | import logging 6 | 7 | from views.apps.pipeline import train 8 | from views.config import LOGFMT 9 | from views.utils.log import get_log_path, logtime 10 | 11 | logging.basicConfig( 12 | level=logging.DEBUG, 13 | format=LOGFMT, 14 | handlers=[ 15 | logging.FileHandler(get_log_path(__file__)), 16 | logging.StreamHandler(), 17 | ], 18 | ) 19 | 20 | 21 | def parse_args() -> Tuple[str, str, str]: 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument( 24 | "--loa", type=str, help="Level of analysis, either cm or pgm" 25 | ) 26 | parser.add_argument("--model", type=str, help="name of 
model to train") 27 | parser.add_argument("--dataset", type=str, help="name of dataset") 28 | 29 | args = parser.parse_args() 30 | 31 | assert args.loa in ["am", "cm", "pgm"] 32 | loa: Literal["am", "cm", "pgm"] = args.loa 33 | model: str = args.model 34 | dataset = args.dataset 35 | 36 | return loa, model, dataset 37 | 38 | 39 | @logtime 40 | def main(): 41 | loa, model, dataset = parse_args() 42 | train.train_and_store_model_by_name(loa, model, dataset) 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /runners/train_slurm.py: -------------------------------------------------------------------------------- 1 | """ Train on slurm """ 2 | 3 | import argparse 4 | import os 5 | import sys 6 | import logging 7 | from typing import Tuple, List 8 | 9 | from views.apps.model import api 10 | from views.apps.pipeline import models_cm, models_pgm 11 | from views.apps.slurm.slurm import run_command 12 | from views.config import LOGFMT 13 | from views.utils.log import get_log_path 14 | 15 | logging.basicConfig( 16 | level=logging.DEBUG, 17 | format=LOGFMT, 18 | handlers=[ 19 | logging.FileHandler(get_log_path(__file__)), 20 | logging.StreamHandler(), 21 | ], 22 | ) 23 | 24 | log = logging.getLogger(__name__) 25 | 26 | 27 | def parse_args() -> Tuple[bool, bool, bool, List[str]]: 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--pgm", action="store_true", help="Train PGM models?") 30 | parser.add_argument("--cm", action="store_true", help="Train CM models?") 31 | parser.add_argument( 32 | "--all", 33 | action="store_true", 34 | help="Train all models for selected LOAs?", 35 | ) 36 | parser.add_argument( 37 | "--model", 38 | action="append", 39 | help="Train a particular model. 
Pass multiple times for multiple models", 40 | ) 41 | args = parser.parse_args() 42 | 43 | if args.all and args.model: 44 | raise RuntimeError(f"Can't have --all and --model") 45 | 46 | # We don't know which LOA to train for 47 | if args.model and args.cm and args.pgm: 48 | raise RuntimeError(f"Can't have --model, --cm and --pgm") 49 | 50 | return args.pgm, args.cm, args.all, args.model 51 | 52 | 53 | def _build_cmd_train_model(modelname: str, dataset: str, loa: str) -> str: 54 | path_runner = os.path.join( 55 | os.path.dirname(os.path.abspath(__file__)), "train_model.py" 56 | ) 57 | path_exec = sys.executable 58 | cmd = ( 59 | f"{path_exec} {path_runner} " 60 | f"--model {modelname} " 61 | f"--dataset {dataset} " 62 | f"--loa {loa} " 63 | ) 64 | return cmd 65 | 66 | 67 | def main() -> None: 68 | 69 | pgm, cm, train_all, modelnames = parse_args() 70 | 71 | if modelnames: 72 | for modelname in modelnames: 73 | if pgm: 74 | if not modelname in models_pgm.all_pgm_models_by_name: 75 | raise RuntimeError(f"Couldn't find model name {modelname}") 76 | cmd = _build_cmd_train_model( 77 | modelname, dataset="flat_pgm_africa_1", loa="pgm", 78 | ) 79 | elif cm: 80 | # Check we have model 81 | if not modelname in models_cm.all_cm_models_by_name: 82 | raise RuntimeError(f"Couldn't find model name {modelname}") 83 | 84 | model = models_cm.all_cm_models_by_name[modelname] 85 | if "train_africa" in model.tags: 86 | cmd = _build_cmd_train_model( 87 | model.name, dataset="flat_cm_africa_1", loa="cm" 88 | ) 89 | elif "train_global" in model.tags: 90 | cmd = _build_cmd_train_model( 91 | model.name, dataset="flat_cm_global_1", loa="cm" 92 | ) 93 | run_command(cmd) 94 | 95 | # CM 96 | if cm and train_all: 97 | log.info(f"--cm and --all was passed, training all CM models.") 98 | for model in models_cm.all_cm_models: 99 | if "train_africa" in model.tags: 100 | cmd = _build_cmd_train_model( 101 | model.name, dataset="flat_cm_africa_1", loa="cm" 102 | ) 103 | elif "train_global" in model.tags: 104 | cmd = _build_cmd_train_model( 105 | model.name, dataset="flat_cm_global_1", loa="cm" 106 | ) 107 | run_command(cmd) 108 | 109 | # PGM 110 | if pgm and train_all: 111 | log.info(f"--pgm and --all was passed, training all pgm models.") 112 | for model in models_pgm.all_pgm_models: 113 | cmd = _build_cmd_train_model( 114 | model.name, dataset="flat_pgm_africa_1", loa="pgm" 115 | ) 116 | run_command(cmd, hours=48) 117 | 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /runners/update_database.py: -------------------------------------------------------------------------------- 1 | """ Update a source in the database """ 2 | from typing import Tuple 3 | import argparse 4 | import logging 5 | import views 6 | from views.database.sources import ( 7 | acled, 8 | cdum, 9 | fvp, 10 | ged, 11 | icgcw, 12 | pgdata, 13 | reign, 14 | spei, 15 | vdem, 16 | wdi, 17 | ) 18 | 19 | logging.basicConfig( 20 | level=logging.DEBUG, 21 | format=views.config.LOGFMT, 22 | handlers=[ 23 | logging.FileHandler(views.utils.log.get_log_path(__file__)), 24 | logging.StreamHandler(), 25 | ], 26 | ) 27 | 28 | log = logging.getLogger(__name__) 29 | 30 | 31 | def parse_args() -> Tuple[ 32 | bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool 33 | ]: 34 | parser = argparse.ArgumentParser() 35 | 36 | parser.add_argument( 37 | "--nofetch", action="store_true", help="No fetch, only load." 
38 | ) 39 | parser.add_argument("--wdi", action="store_true", help="Update WDI") 40 | parser.add_argument("--vdem", action="store_true", help="Update VDEM") 41 | parser.add_argument("--acled", action="store_true", help="Update ACLED") 42 | parser.add_argument("--ged", action="store_true", help="Update GED") 43 | parser.add_argument("--icgcw", action="store_true", help="Update ICGCW") 44 | parser.add_argument( 45 | "--pgdata", action="store_true", help="Update Priogrid" 46 | ) 47 | parser.add_argument("--spei", action="store_true", help="Update SPEI") 48 | parser.add_argument("--fvp", action="store_true", help="Update FVP") 49 | parser.add_argument( 50 | "--cdum", action="store_true", help="Update country dummies" 51 | ) 52 | parser.add_argument("--reign", action="store_true", help="Update REIGN") 53 | 54 | args = parser.parse_args() 55 | 56 | return ( 57 | args.nofetch, 58 | args.wdi, 59 | args.vdem, 60 | args.acled, 61 | args.ged, 62 | args.icgcw, 63 | args.pgdata, 64 | args.spei, 65 | args.fvp, 66 | args.cdum, 67 | args.reign, 68 | ) 69 | 70 | 71 | def main(): 72 | 73 | ( 74 | nofetch, 75 | do_wdi, 76 | do_vdem, 77 | do_acled, 78 | do_ged, 79 | do_icgcw, 80 | do_pgdata, 81 | do_spei, 82 | do_fvp, 83 | do_cdum, 84 | do_reign, 85 | ) = parse_args() 86 | 87 | if do_wdi: 88 | if not nofetch: 89 | wdi.fetch_wdi() 90 | wdi.load_wdi() 91 | 92 | if do_vdem: 93 | if not nofetch: 94 | vdem.fetch_vdem() 95 | vdem.load_vdem() 96 | 97 | if do_acled: 98 | if not nofetch: 99 | acled.fetch_acled() 100 | acled.load_acled() 101 | 102 | if do_ged: 103 | if not nofetch: 104 | ged.fetch_ged() 105 | ged.load_ged() 106 | 107 | if do_icgcw: 108 | if not nofetch: 109 | icgcw.fetch_icgcw() 110 | icgcw.load_icgcw() 111 | 112 | if do_pgdata: 113 | if not nofetch: 114 | pgdata.fetch_pgdata() 115 | pgdata.load_pgdata() 116 | 117 | if do_spei: 118 | if not nofetch: 119 | spei.fetch_spei() 120 | spei.load_spei() 121 | 122 | if do_fvp: 123 | if not nofetch: 124 | fvp.fetch_fvp() 125 | fvp.load_fvp() 126 | 127 | if do_cdum: 128 | if not nofetch: 129 | cdum.fetch_cdum() 130 | cdum.load_cdum() 131 | 132 | if do_reign: 133 | if not nofetch: 134 | reign.fetch_reign() 135 | reign.load_reign() 136 | 137 | 138 | if __name__ == "__main__": 139 | try: 140 | main() 141 | except: 142 | log.exception(f"Something went wrong in update_database.py") 143 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ Setup file for the views project """ 2 | import os 3 | from setuptools import setup 4 | 5 | def main(): 6 | """ Do the setup """ 7 | setup(name='views', 8 | version='0.0.1', 9 | author="ViEWS Team", 10 | install_requires=[]) 11 | 12 | if __name__ == "__main__": 13 | main() 14 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Tests 2 | 3 | Tests live here and should be run before pushing. 4 | Run the full suite with the `run_tools.sh` script in the root of the repo. 
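If you only want the pytest portion of the suite while iterating on a single module, a minimal sketch is shown below (this assumes `pytest` is installed in the active environment; `run_tools.sh` remains the canonical entry point):

```python
# Run the test directory directly with pytest and exit with its status code.
import sys

import pytest

sys.exit(pytest.main(["tests/", "-v"]))
```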
5 | -------------------------------------------------------------------------------- /tests/test_calibration.py: -------------------------------------------------------------------------------- 1 | import pandas as pd # type: ignore 2 | import numpy as np # type: ignore 3 | from views.apps.model.calibration import calibrate_real 4 | from views.utils import mocker 5 | 6 | 7 | def test_calibrate_real_perfect_on_calib() -> None: 8 | """ Test that calibrated values when test=calib match perfectly """ 9 | 10 | df = mocker.DfMocker(datatypes=["reals"]).df 11 | 12 | s_calibrated = calibrate_real( 13 | s_test_pred=df["r_a"], 14 | s_calib_pred=df["r_a"], 15 | s_calib_actual=df["r_b"], 16 | ) 17 | 18 | assert np.isclose(s_calibrated.mean(), df["r_b"].mean()) 19 | assert np.isclose(s_calibrated.std(), df["r_b"].std()) 20 | 21 | 22 | def test_calibrate_real_scales_right_way() -> None: 23 | """ Test that calibration shifts mean the right way """ 24 | 25 | calib_pred = [100, 200] # <- Off by factor 0.5 26 | calib_actual = [50, 100] 27 | test_pred = [200, 400] 28 | test_expected = [100, 200] # <- test_pred * 0.5 29 | 30 | s_calibrated = calibrate_real( 31 | s_test_pred=pd.Series(test_pred), 32 | s_calib_pred=pd.Series(calib_pred), 33 | s_calib_actual=pd.Series(calib_actual), 34 | ) 35 | 36 | assert all(s_calibrated == pd.Series(test_expected)) 37 | -------------------------------------------------------------------------------- /tests/test_db.py: -------------------------------------------------------------------------------- 1 | """ Tests for views.utils.db 2 | 3 | @TODO: Add a testing db... 4 | """ 5 | import pytest # type: ignore 6 | from views.utils import db 7 | 8 | 9 | def test_unpack_fqtable() -> None: 10 | """ Test unpack fqtable """ 11 | assert db._unpack_fqtable("schema.table") == ("schema", "table") 12 | -------------------------------------------------------------------------------- /tests/test_misc_utils.py: -------------------------------------------------------------------------------- 1 | """ Test misc utils that don't fit anywhere else """ 2 | from views.utils import misc 3 | 4 | 5 | def test_lists_disjoint() -> None: 6 | a = [1, 2] 7 | b = [3, 4] 8 | c = [5, 6] 9 | d = [6, 7] # 6 shared with c 10 | assert not misc.lists_disjoint([a, b, c, d]) 11 | assert misc.lists_disjoint([a, b, c]) 12 | -------------------------------------------------------------------------------- /tests/test_specs.py: -------------------------------------------------------------------------------- 1 | """ Test the specs interface """ 2 | import pytest # type: ignore 3 | import yaml 4 | from views.specs import models 5 | 6 | SPEC_TEST = yaml.safe_load( 7 | """ 8 | colsets: 9 | colset_z: 10 | - zeus 11 | colset_a: 12 | - asda 13 | - bobo 14 | colset_b: 15 | - bertil 16 | - cesar 17 | colset_c: 18 | - cesar 19 | - david 20 | colset_steve: 21 | - steven 22 | - dave 23 | themes: 24 | theme_a: 25 | - colset_a 26 | - colset_b 27 | theme_b: 28 | - colset_b 29 | - colset_c 30 | theme_nested: 31 | - theme_a 32 | - theme_b 33 | theme_supernested: 34 | - colset_steve 35 | - theme_nested 36 | 37 | formulas: 38 | o: 39 | col_outcome: asda 40 | cols_features: theme_supernested 41 | """ 42 | ) 43 | 44 | SPEC_TEST_BROKEN = yaml.safe_load( 45 | """ 46 | colsets: 47 | colset_a: 48 | - asda 49 | - bobo 50 | colset_b: 51 | - bertil 52 | - cesar 53 | themes: 54 | theme_a: 55 | - colset_a 56 | - colset_b 57 | - missing_key 58 | 59 | formulas: 60 | o: 61 | col_outcome: asda 62 | cols_features: theme_supernested 63 | """ 64 | ) 65 | 66 | 
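# How the nested spec above should flatten (derived from the expected values
# asserted in test_solver below, not from the solver implementation itself):
#   theme_supernested -> colset_steve + theme_nested
#   theme_nested      -> theme_a + theme_b
#   theme_a + theme_b -> colset_a + colset_b + colset_c
# yielding the deduplicated, sorted feature list
#   ["asda", "bertil", "bobo", "cesar", "dave", "david", "steven"].
# colset_z is defined but never referenced by formula "o", so "zeus" is absent.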
67 | def test_spec_models(): 68 | assert isinstance(models.cm, dict) 69 | # assert isinstance(models.solver.solved_cm(), dict) 70 | 71 | 72 | def test_solver(): 73 | """ Test that solver solves properly """ 74 | filled_formulas = models.solver.solve_formulas(SPEC_TEST) 75 | 76 | wanted = ["steven", "dave", "asda", "bobo", "bertil", "cesar", "david"] 77 | assert filled_formulas["o"]["cols_features"] == sorted(wanted) 78 | 79 | with pytest.raises(RuntimeError) as excinfo: 80 | 81 | _ = models.solver.solve_formulas(SPEC_TEST_BROKEN) 82 | assert "No match for missing_key in" in str(excinfo.value) 83 | -------------------------------------------------------------------------------- /tests/test_structure.py: -------------------------------------------------------------------------------- 1 | """ Test import structure """ 2 | 3 | 4 | def test_import_views() -> None: 5 | """ Test that views can be imported from the top """ 6 | import views 7 | -------------------------------------------------------------------------------- /tests/test_transforms_api.py: -------------------------------------------------------------------------------- 1 | import pandas as pd # type: ignore 2 | import pytest # type: ignore 3 | from views.apps.data import api 4 | from views.apps.transforms import lib 5 | from views.utils import mocker 6 | 7 | 8 | def test_col_cols_ok() -> None: 9 | 10 | # These are ok 11 | t = api.Transform(name="testname", f="rollmax", cols=["a", "b"]) 12 | t = api.Transform(name="testname", f="rollmax", col="a") 13 | 14 | 15 | def test_col_col_not_ok() -> None: 16 | # Not OK, col is list 17 | with pytest.raises(TypeError) as exc: 18 | t = api.Transform(name="testname", f="rollmax", col=["a", "b"]) 19 | assert "col should be string" in str(exc.value) 20 | 21 | 22 | def test_col_cols_not_ok() -> None: 23 | # Not OK, cols is str 24 | with pytest.raises(TypeError) as exc: 25 | t = api.Transform(name="testname", f="rollmax", cols="a") 26 | assert "col should be string" in str(exc.value) 27 | 28 | 29 | def test_f_unknown() -> None: 30 | with pytest.raises(KeyError) as exc: 31 | t = api.Transform(name="testname", f="unknown", col="a") 32 | assert "following values of f are recognised:" in str(exc.value) 33 | 34 | 35 | def test_f_missing() -> None: 36 | with pytest.raises(KeyError) as exc: 37 | t = api.Transform(name="testname", col="a") 38 | assert "Transformer needs a 'f' field" in str(exc.value) 39 | 40 | 41 | def test_compute() -> None: 42 | t = api.Transform(name="testname", f="tlag", col="b_a", time=1) 43 | df = mocker.DfMocker().df 44 | 45 | s_t = t.compute(df) 46 | s_raw = lib.tlag(s=df["b_a"], time=1) 47 | pd.testing.assert_series_equal(s_t, s_raw) 48 | -------------------------------------------------------------------------------- /tests/test_transforms_lib.py: -------------------------------------------------------------------------------- 1 | import pandas as pd # type: ignore 2 | from views.apps.transforms import lib 3 | 4 | 5 | def test_onset() -> None: 6 | """ Test onset formulation """ 7 | 8 | c_id = pd.Series([1, 2, 3], name="c_id") 9 | t = pd.Series(list(range(1, 11)), name="t") 10 | events_c1 = [0, 0, 0, 0, 1, 1, 1, 0, 0, 0] # Events 11 | onsets_c1 = [0, 0, 0, 0, 1, 0, 0, 0, 0, 0] # Wanted onsets 12 | onspos_c1 = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0] # Wanted onsets_possible 13 | 14 | events_c2 = [0, 1, 0, 1, 0, 0, 1, 0, 0, 0] 15 | onsets_c2 = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] 16 | onspos_c2 = [1, 1, 0, 0, 0, 0, 0, 0, 0, 0] 17 | 18 | events_c3 = [0, 0, 1, 1, 1, 1, 0, 0, 0, 1] 19 | onsets_c3 = [0, 
0, 1, 0, 0, 0, 0, 0, 0, 1] 20 | onspos_c3 = [1, 1, 1, 0, 0, 0, 0, 0, 0, 1] 21 | 22 | df = ( 23 | pd.DataFrame( 24 | { 25 | "event": events_c1 + events_c2 + events_c3, 26 | "wanted_onset_possible_3": onspos_c1 + onspos_c2 + onspos_c3, 27 | "wanted_onset_3": onsets_c1 + onsets_c2 + onsets_c3, 28 | }, 29 | index=pd.MultiIndex.from_product([c_id, t]), 30 | ) 31 | .swaplevel() 32 | .sort_index() 33 | ) 34 | 35 | df["onset_possible_3"] = lib.onset_possible(s=df["event"], window=3) 36 | df["onset_3"] = lib.onset(s=df["event"], window=3) 37 | 38 | pd.testing.assert_series_equal( 39 | df["onset_3"], df["wanted_onset_3"], check_names=False 40 | ) 41 | pd.testing.assert_series_equal( 42 | df["onset_possible_3"], 43 | df["wanted_onset_possible_3"], 44 | check_names=False, 45 | ) 46 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | """ Tests for utilities """ 2 | from views import utils 3 | 4 | 5 | def test_passing() -> None: 6 | pass 7 | -------------------------------------------------------------------------------- /tests/test_utils_data.py: -------------------------------------------------------------------------------- 1 | import pytest # type: ignore 2 | import pandas as pd # type: ignore 3 | 4 | from views.utils.mocker import DfMocker 5 | from views.utils import data 6 | 7 | 8 | def test_assign_into_df() -> None: 9 | 10 | df_a = DfMocker(n_t=20).df 11 | df_b = df_a.copy() 12 | df_into = df_a.loc[:, []].copy() 13 | 14 | # Test we get the full frame if we give all times 15 | df_into = data.assign_into_df(df_to=df_into, df_from=df_a.loc[0:9]) 16 | df_into = data.assign_into_df(df_to=df_into, df_from=df_a.loc[10:19]) 17 | pd.testing.assert_frame_equal(df_a, df_into, check_dtype=False) 18 | 19 | # Test we get missing if we don't give all cols 20 | df_into = df_a.loc[:, []].copy() 21 | df_into = data.assign_into_df(df_to=df_into, df_from=df_a.loc[0:3]) 22 | df_into = data.assign_into_df(df_to=df_into, df_from=df_a.loc[10:19]) 23 | with pytest.raises(AssertionError): 24 | pd.testing.assert_frame_equal(df_a, df_into, check_dtype=False) 25 | -------------------------------------------------------------------------------- /views/__init__.py: -------------------------------------------------------------------------------- 1 | """ The views package """ 2 | 3 | __all__ = [ 4 | "apps", 5 | "database", 6 | "specs", 7 | "utils", 8 | "ROOTDIR", 9 | "DATASETS", 10 | "DIR_STORAGE", 11 | "DIR_SCRATCH", 12 | "Model", 13 | "Ensemble", 14 | "Period", 15 | "Downsampling", 16 | "Transform", 17 | ] 18 | 19 | import os 20 | 21 | from . 
import apps, config, database, specs, utils 22 | from .apps.model.api import Model, Ensemble, Period, Downsampling 23 | from .apps.data.api import Transform 24 | 25 | ROOTDIR = os.path.dirname(__file__) 26 | DIR_STORAGE = config.DIR_STORAGE 27 | DIR_SCRATCH = config.DIR_SCRATCH 28 | GEOMETRIES = specs.data.GEOMETRIES 29 | TABLES = specs.data.TABLES 30 | DATASETS = specs.data.DATASETS 31 | 32 | 33 | def _setup_dirstructure() -> None: 34 | """ Setup storage directory structure """ 35 | dirs = [ 36 | DIR_STORAGE, 37 | os.path.join(DIR_STORAGE, "data", "datasets"), 38 | os.path.join(DIR_STORAGE, "data", "geometries"), 39 | os.path.join(DIR_STORAGE, "data", "raw"), 40 | os.path.join(DIR_STORAGE, "data", "tables"), 41 | os.path.join(DIR_STORAGE, "logs"), 42 | os.path.join(DIR_STORAGE, "logs"), 43 | os.path.join(DIR_STORAGE, "models"), 44 | os.path.join(DIR_STORAGE, "pipeline", "predictions"), 45 | os.path.join(DIR_STORAGE, "scratch"), 46 | ] 47 | for path_dir in dirs: 48 | utils.io.create_directory(path_dir) 49 | 50 | 51 | _setup_dirstructure() 52 | -------------------------------------------------------------------------------- /views/apps/__init__.py: -------------------------------------------------------------------------------- 1 | """ Views applications """ 2 | __all__ = [ 3 | "data", 4 | "ensemble", 5 | "evaluation", 6 | "extras", 7 | "model", 8 | "pipeline", 9 | "slurm", 10 | "transforms", 11 | "plot", 12 | ] 13 | from . import ( 14 | data, 15 | ensemble, 16 | evaluation, 17 | extras, 18 | model, 19 | pipeline, 20 | slurm, 21 | transforms, 22 | plot, 23 | ) 24 | -------------------------------------------------------------------------------- /views/apps/data/README.md: -------------------------------------------------------------------------------- 1 | ## Dataset 2 | 3 | 4 | 5 | ## Transforms 6 | 7 | ViEWS has a number of transformation functions built in. 8 | For implementation details see the file views/apps/transforms/lib.py 9 | where each transformation is defined in python code. 10 | 11 | The naming convention is simple, source columns are prefixed with transformation names and parameters of the transformation. 12 | For example: `tlag_1_ged_dummy_sb` is the time lag of 1 month of ged_dummy_sb. 13 | Transformations can of course be chained. 14 | For example: `time_since_greq_100_ged_best_sb` is the time since ged_best_sb (the best estimate of state-based deaths from GED) was greater or equal to 100. 15 | Notice that order matters. 16 | For example: `splag_1_1_time_since_ged_dummy_sb` is the first order spatial lag of time since ged_dummy_sb. This becomes a very large number as the spatial lag is the sum across the neighboring cells, which evaluates to a sum across many times_since. 17 | This is different from `time_since_splag_1_1_ged_dummy_sb` which evaluates to the time since any neighboring cell had a ged_dummy_sb event. 18 | 19 | ### summ (sum) 20 | 21 | Compute the sum of columns. 
Names should 22 | 23 | ### product (product) 24 | 25 | ### delta (delta) 26 | 27 | ### greater_or_equal (greq) 28 | 29 | ### smaller_or_equal (smeq) 30 | 31 | ### in_range (in_range) 32 | 33 | ### tlag (tlag) 34 | 35 | ### tlead (tlead) 36 | 37 | ### moving_average (ma) 38 | 39 | ### cweq (cweq) 40 | 41 | ### time_since (time_since) 42 | 43 | ### decay (decay) 44 | 45 | ### mean (mean) 46 | 47 | ### ln (ln) 48 | 49 | ### demean (demean) 50 | 51 | ### rollmax (rollmax) 52 | 53 | ### onset_possible (onset_possible) 54 | 55 | ### onset (onset) 56 | 57 | ### distance_to_event (spdist) 58 | 59 | ### spacetime_distance_to_event (stdist) 60 | 61 | ### spatial_lag (splag) 62 | -------------------------------------------------------------------------------- /views/apps/data/__init__.py: -------------------------------------------------------------------------------- 1 | """ Dataset and transforms API """ 2 | 3 | __all__ = [ 4 | "GeomCountry", 5 | "GeomPriogrid", 6 | "Dataset", 7 | "Transform", 8 | "export_tables_and_geoms", 9 | "import_tables_and_geoms", 10 | "fetch_latest_zip_from_website", 11 | ] 12 | from .api import GeomCountry, GeomPriogrid, Dataset, Transform 13 | from .public import ( 14 | export_tables_and_geoms, 15 | import_tables_and_geoms, 16 | fetch_latest_zip_from_website, 17 | ) 18 | -------------------------------------------------------------------------------- /views/apps/data/export_readme/README.md: -------------------------------------------------------------------------------- 1 | # Views Tables and Geoms 2 | 3 | This is a data export from the ViEWS project. 4 | Python code is available for joining these files into usable datasets at priogrid-month and country-month level and computing a large set of transformations on them. 5 | See https://github.com/UppsalaConflictDataProgram/OpenViEWS2 for instructions on how to get started. 6 | 7 | If you don't wish to use the python tooling but instead prepare your own data, read on. 8 | There are three types of files here: 9 | 10 | * Skeleton tables, that represent a level of analysis in ViEWS and hold identifiers. 11 | * Data tables, that hold imputed source data at their native level of analysis. 12 | * Geometries in .geojson format to enable plotting and spatial transformations. 13 | 14 | The key identifiers are: 15 | 16 | * country_id, which corresponds to ids from cshapes. 17 | * pg_id, PRIO-GRID ids. 18 | * year 19 | * month_id, an incremental month identifier where 1 is 1980-01 20 | 21 | Skeletons are available for: 22 | 23 | * priogrid-month (pgm) 24 | * priogrid-year (pgy) 25 | * country-month (cm) 26 | * country-year (cy) 27 | 28 | Included data sources are: 29 | 30 | * ACLED, from https://www.acleddata.com/ 31 | * FVP, a custom dataset of CY data. 32 | * GED, from the UCDP 33 | * pgdata, from PRIOGRID 34 | * REIGN, from https://oefdatascience.github.io/REIGN.github.io 35 | * SPEI, from https://spei.csic.es/map/maps.html 36 | * VDEM, from https://www.v-dem.net/en/ 37 | * WDI, from http://datatopics.worldbank.org/world-development-indicators/ 38 | 39 | Data tables containing "\_imp\_sklearn\_number" are imputed using scikit-learn in 5 different imputations and should have no missingness in numeric columns. 40 | Data tables ending in \_part\_number are partitioned to work around column number limitations in our database. 41 | 42 | To construct a usable dataset from these, start with a skeleton table and then join in the data sources that you want. 
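For instance, a minimal pandas sketch of such a join (the file names here are placeholders for the actual members of the zip you downloaded; the join keys are the identifiers documented above):

```python
import pandas as pd

# A priogrid-month skeleton plus one imputed data table, joined on the
# documented identifiers. Replace the file names with the real ones.
skeleton = pd.read_csv("skeleton_pgm.csv")
ged = pd.read_csv("ged_pgm_imp_sklearn_1.csv")

df = skeleton.merge(ged, on=["pg_id", "month_id"], how="left")
df = df.set_index(["month_id", "pg_id"]).sort_index()
```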
-------------------------------------------------------------------------------- /views/apps/data/missing/__init__.py: -------------------------------------------------------------------------------- 1 | """ Missing data management """ 2 | 3 | __all__ = [ 4 | "extrapolate", 5 | "fill_groups_with_time_means", 6 | "fill_with_group_and_global_means", 7 | "impute_amelia", 8 | "impute_mice_generator", 9 | "list_totally_missing", 10 | ] 11 | 12 | from .amelia import impute_amelia 13 | from .missing import ( 14 | extrapolate, 15 | fill_groups_with_time_means, 16 | fill_with_group_and_global_means, 17 | impute_mice_generator, 18 | list_totally_missing, 19 | ) 20 | -------------------------------------------------------------------------------- /views/apps/data/missing/amelia.py: -------------------------------------------------------------------------------- 1 | """ Amelia python-R wrapper """ 2 | from typing import List 3 | import logging 4 | import multiprocessing as mp 5 | import os 6 | import string 7 | import subprocess 8 | import tempfile 9 | 10 | import pandas as pd # type: ignore 11 | 12 | from views.utils import data 13 | from views.utils.log import logtime 14 | 15 | log = logging.getLogger(__name__) 16 | 17 | 18 | def run_subproc(cmd): 19 | """ Run cmd in subprocess and log output to debug """ 20 | 21 | log.info(f"Running cmd: {cmd}") 22 | with subprocess.Popen( 23 | cmd, 24 | stdout=subprocess.PIPE, 25 | stderr=subprocess.STDOUT, 26 | bufsize=1, 27 | universal_newlines=True, 28 | ) as p: 29 | for line in p.stdout: 30 | log.debug(line.strip("\n")) 31 | 32 | if p.returncode != 0: 33 | raise subprocess.CalledProcessError(p.returncode, p.args) 34 | 35 | 36 | # pylint: disable=too-many-locals 37 | @logtime 38 | def impute_amelia(df: pd.DataFrame, n_imp: int) -> List[pd.DataFrame]: 39 | """ Wrapper for calling Amelia in an R subprocess 40 | 41 | Args: 42 | df: Dataframe with MultiIndex set 43 | n_imp: Number of imputations to perform 44 | Return: 45 | dfs: List of imputed dataframes 46 | """ 47 | 48 | def read_template(): 49 | this_dir = os.path.dirname(os.path.abspath(__file__)) 50 | path_template = os.path.join(this_dir, "amelia_template.R") 51 | with open(path_template, "r") as f: 52 | template_str = f.read() 53 | 54 | template = string.Template(template_str) 55 | 56 | return template 57 | 58 | log.info("Started impute_amelia()") 59 | 60 | data.check_has_multiindex(df) 61 | timevar, groupvar = df.index.names 62 | 63 | log.debug(f"n_imp: {n_imp}") 64 | log.debug(f"timevar: {timevar}") 65 | log.debug(f"groupvar: {groupvar}") 66 | log.debug(f"df shape: {df.shape}") 67 | log.debug(f"Share missing: {df.isnull().mean().mean()}") 68 | 69 | with tempfile.TemporaryDirectory() as tempdir: 70 | 71 | path_csv_in = os.path.join(tempdir, "input.csv") 72 | path_rscript = os.path.join(tempdir, "impute_script.R") 73 | path_out_stem = os.path.join(tempdir, "imputed_") 74 | 75 | values = { 76 | "PATH_CSV_INPUT": path_csv_in, 77 | "PATH_CSV_OUTPUT_STEM": path_out_stem, 78 | "TIMEVAR": timevar, 79 | "GROUPVAR": groupvar, 80 | "N_IMP": n_imp, 81 | "N_CPUS": mp.cpu_count(), 82 | } 83 | 84 | template = read_template() 85 | rscript = template.substitute(values) 86 | 87 | df.to_csv(path_csv_in, index=True) 88 | log.info(f"Wrote {path_csv_in}") 89 | 90 | with open(path_rscript, "w") as f: 91 | f.write(rscript) 92 | log.info(f"Wrote {path_rscript}") 93 | log.debug(rscript) 94 | 95 | cmd = ["Rscript", path_rscript] 96 | run_subproc(cmd) 97 | 98 | dfs = [] 99 | for i in range(n_imp): 100 | path_imputed = 
f"{path_out_stem}{i+1}.csv" 101 | df_imp = pd.read_csv(path_imputed) 102 | df_imp = df_imp.drop(columns=["Unnamed: 0"]) 103 | df_imp = df_imp.set_index([timevar, groupvar]) 104 | dfs.append(df_imp) 105 | log.info(f"Read {path_imputed}") 106 | 107 | log.info("Finished impute_amelia()") 108 | return dfs 109 | -------------------------------------------------------------------------------- /views/apps/data/missing/amelia_template.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Install Amelia if we don't have it 4 | if (!require("Amelia")) install.packages("Amelia", repos="https://ftp.acc.umu.se/mirror/CRAN/") 5 | library("Amelia") 6 | 7 | library("foreign") 8 | library("methods") 9 | library("parallel") 10 | 11 | find_bounds <- function(df){ 12 | print("finding bounds") 13 | lower <- c() 14 | upper <- c() 15 | for (i in 1:length(df)) { 16 | lower <- c(lower, min(df[,i], na.rm=T)) 17 | upper <- c(upper, max(df[,i], na.rm=T)) 18 | } 19 | 20 | varnr <- c(1:ncol(df)) 21 | lower <- lower[varnr] 22 | upper <- upper[varnr] 23 | bounds <- matrix(cbind(varnr,lower,upper),ncol(df)) 24 | 25 | return(bounds) 26 | 27 | } 28 | 29 | keep_only_varying <- function(df){ 30 | print("Removing non-varying columns from dataframe") 31 | # Find variance to remove non-varying variables. 32 | variances <- sapply(df, var, na.rm = TRUE) 33 | 34 | # Some vars are all missing, they get variance NA, give them zero instead 35 | variances[is.na(variances)] <- 0 36 | 37 | names.zero.variance <- colnames(df[variances == 0]) 38 | names.positive.variance <- colnames(df[variances > 0]) 39 | df <- df[names.positive.variance] 40 | 41 | return(df) 42 | } 43 | 44 | keep_only_numerics <- function(df){ 45 | print("Removing non-numeric columns from dataframe") 46 | numerics <- sapply(df, is.numeric) 47 | df <- df[numerics] 48 | 49 | return(df) 50 | } 51 | 52 | 53 | time_start <- Sys.time() 54 | print("Starting amelia imputation script") 55 | 56 | path_csv_input <- "${PATH_CSV_INPUT}" 57 | path_csv_output_stem <- "${PATH_CSV_OUTPUT_STEM}" 58 | timevar <- "${TIMEVAR}" 59 | groupvar <- "${GROUPVAR}" 60 | n_imp <- ${N_IMP} 61 | n_cpus <- ${N_CPUS} 62 | 63 | print(paste("path_csv_input", path_csv_input)) 64 | print(paste("path_csv_output_stem", path_csv_output_stem)) 65 | print(paste("timevar", timevar)) 66 | print(paste("groupvar", groupvar)) 67 | print(paste("n_imp", n_imp)) 68 | print(paste("n_cpus", n_cpus)) 69 | 70 | df <- read.csv(path_csv_input) 71 | 72 | # # Drop all vars that don't vary 73 | # df <- keep_only_varying(df) 74 | # # Drop all non-numeric vars 75 | # df <- keep_only_numerics(df) 76 | 77 | nominals <- c() 78 | 79 | # Find the bounds of each var, we don't want never-before seen values 80 | bounds <- find_bounds(df) 81 | 82 | # Run the imputation 83 | obj_amelia <- amelia(df, 84 | m = n_imp, 85 | ts = timevar, 86 | cs = groupvar, 87 | noms = nominals, 88 | p2s = 2, 89 | polytime = 1, 90 | intercs = TRUE, 91 | empri = .1*nrow(df), 92 | bounds = bounds, 93 | max.resample = 1000, 94 | parallel = "multicore", 95 | ncpus = n_cpus) 96 | 97 | print("Finished imputing") 98 | 99 | write.amelia(obj=obj_amelia, 100 | file.stem = path_csv_output_stem, format = "csv") 101 | print("Saved imputed datasets") 102 | 103 | time_end <- Sys.time() 104 | time_total = time_end - time_start 105 | print("FINISHED!") 106 | print(paste("Total runtime:", time_total)) 107 | 108 | -------------------------------------------------------------------------------- /views/apps/data/public.py: 
-------------------------------------------------------------------------------- 1 | """ Data publication interface 2 | 3 | Data is published as .csv and .geojson files in a .zip archive. 4 | These formats were chosen because they are the most common and can be 5 | read by all. 6 | Functions in this module take a zip file and cache the data in the 7 | views structure as parquet and geojson files. 8 | 9 | """ 10 | from typing import Dict, Union, List 11 | import datetime 12 | import os 13 | import tempfile 14 | import logging 15 | 16 | from views.utils import io 17 | from .api import Table, GeomCountry, GeomPriogrid 18 | 19 | log = logging.getLogger(__name__) 20 | 21 | 22 | def _date_now() -> str: 23 | """ Get current UTC time """ 24 | return datetime.datetime.utcnow().strftime("%Y%m%d") 25 | 26 | 27 | def export_tables_and_geoms( 28 | tables: Dict[str, Table], 29 | geometries: Dict[str, Union[GeomCountry, GeomPriogrid]], 30 | dir_output: str, 31 | ) -> str: 32 | """ Export tables and geometries to timestamped zip in dir_output """ 33 | path_zip = os.path.join( 34 | dir_output, f"views_tables_and_geoms_{_date_now()}.zip" 35 | ) 36 | log.info(f"Started exporting tables and geoms to {path_zip}") 37 | with tempfile.TemporaryDirectory() as tempdir: 38 | paths: List[str] = [] 39 | for table in tables.values(): 40 | path = os.path.join(tempdir, f"{table.name}.csv") 41 | io.df_to_csv(df=table.df, path=path) 42 | paths.append(path) 43 | 44 | for geom in geometries.values(): 45 | # Make sure we have gdf locally 46 | _ = geom.gdf 47 | paths.append(geom.path) 48 | 49 | # Add the README to the zip. 50 | paths.append( 51 | os.path.join( 52 | os.path.dirname(__file__), "export_readme", "README.md" 53 | ) 54 | ) 55 | 56 | io.make_zipfile( 57 | path_zip=path_zip, paths_members=paths, 58 | ) 59 | log.info(f"Finished exporting tables and geoms to {path_zip}") 60 | return path_zip 61 | 62 | 63 | def import_tables_and_geoms( 64 | tables: Dict[str, Table], 65 | geometries: Dict[str, Union[GeomCountry, GeomPriogrid]], 66 | path_zip: str, 67 | ) -> None: 68 | """ Import tables and geometries to local cache structure from zip """ 69 | 70 | log.info(f"Started initialising cache from zip at {path_zip}") 71 | with tempfile.TemporaryDirectory() as tempdir: 72 | io.unpack_zipfile(path_zip=path_zip, destination=tempdir) 73 | 74 | for geom in geometries.values(): 75 | path = os.path.join(tempdir, geom.fname) 76 | if os.path.isfile(path): 77 | geom.init_cache_from_geojson(path=path) 78 | else: 79 | log.debug(f"No matching .geojson for {geom.name}") 80 | 81 | for table in tables.values(): 82 | path = os.path.join(tempdir, f"{table.name}.csv") 83 | if os.path.isfile(path): 84 | table.init_cache_from_csv(path) 85 | else: 86 | raise RuntimeError(f"No matching .csv for {table.name}") 87 | log.info(f"Finished initialising cache from zip at {path_zip}") 88 | 89 | 90 | def fetch_latest_zip_from_website(path_dir_destination: str) -> str: 91 | """ Fetch the latest zip from the website """ 92 | 93 | # Update this 94 | url_base = "https://views.pcr.uu.se/download/datasets" 95 | fnames = [ 96 | fname 97 | for fname in io.list_files_in_webdir(url=url_base) 98 | if fname.startswith("views_tables_and_geoms_") 99 | ] 100 | log.debug(f"Found {fnames} that look like views_tables_and_geoms_") 101 | fname_latest = sorted(fnames).pop() 102 | log.debug(f"Latest file looks like: {fname_latest}") 103 | url = f"{url_base}/{fname_latest}" 104 | path = os.path.join(path_dir_destination, fname_latest) 105 | io.fetch_url_to_file(url, path) 106 | return 
path 107 | -------------------------------------------------------------------------------- /views/apps/ensemble/__init__.py: -------------------------------------------------------------------------------- 1 | """ Ensemble functionality """ 2 | __all__ = ["run_ebma"] 3 | from .ebma import run_ebma 4 | -------------------------------------------------------------------------------- /views/apps/ensemble/ebma.py: -------------------------------------------------------------------------------- 1 | """ Wrapper for EBMA """ 2 | 3 | from typing import Any, Dict, Tuple, List 4 | 5 | import logging 6 | import os 7 | import string 8 | import tempfile 9 | import subprocess 10 | 11 | import pandas as pd # type: ignore 12 | 13 | from views.utils import data as datautils 14 | 15 | log = logging.getLogger(__name__) 16 | 17 | 18 | def _read_template() -> string.Template: 19 | this_dir = os.path.dirname(os.path.abspath(__file__)) 20 | with open(os.path.join(this_dir, "templates", "run_ebma.R"), "r") as f: 21 | template_str = f.read() 22 | 23 | template = string.Template(template_str) 24 | 25 | return template 26 | 27 | 28 | def _run_subproc(cmd: List[str]) -> None: 29 | """ Run cmd in subprocess and log output to debug """ 30 | 31 | log.debug(f"Running cmd: {cmd}") 32 | p: Any 33 | with subprocess.Popen( 34 | cmd, 35 | stdout=subprocess.PIPE, 36 | stderr=subprocess.STDOUT, 37 | bufsize=1, 38 | universal_newlines=True, 39 | ) as p: 40 | for line in p.stdout: 41 | log.debug(line.strip("\n")) 42 | 43 | if p.returncode != 0: 44 | raise subprocess.CalledProcessError(p.returncode, p.args) 45 | 46 | 47 | # pylint: disable=too-many-arguments, too-many-locals 48 | def run_ebma( 49 | df_calib: pd.DataFrame, 50 | df_test: pd.DataFrame, 51 | s_calib_actual: pd.Series, 52 | tolerance: float = 0.001, 53 | shrinkage: float = 3, 54 | const: float = 0.01, 55 | maxiter: int = 10_000, 56 | ) -> Tuple[pd.Series, Dict[str, float]]: 57 | """ Compute EBMA predictions and weights using wrapped R EBMAforecast 58 | 59 | Args: 60 | df_calib: Dataframe with constituent models predictions for calibration 61 | df_test: Dataframe with constituent model 62 | predictions for test period 63 | s_calib_actual: Series with actuals for the calibration partition 64 | tolerance: See R docs 65 | shrinkage: See R docs 66 | const: See R docs 67 | maxiter: See R docs 68 | Returns: 69 | s_ebma: Series with ebma predictions 70 | weights: Dictionary of model weights 71 | 72 | R docs at: 73 | https://cran.r-project.org/web/packages/EBMAforecast/EBMAforecast.pdf 74 | 75 | Ensure df_calib, df_test and s_calib_actual have multiindex set. 
76 | 77 | """ 78 | 79 | # Copy data so we don't mess with callers data 80 | df_calib = df_calib.copy() 81 | df_test = df_test.copy() 82 | s_calib_actual = s_calib_actual.copy() 83 | s_calib_actual.name = "actual" 84 | 85 | # Make sure we're all indexed as expected 86 | datautils.check_has_multiindex(df_calib) 87 | datautils.check_has_multiindex(df_test) 88 | datautils.check_has_multiindex(s_calib_actual) 89 | 90 | if not len(s_calib_actual) == len(df_calib): 91 | msg = "Number of rows in df_calib and s_calib_actual don't match" 92 | raise RuntimeError(msg) 93 | 94 | offset = 1e-10 95 | upper = 1 - offset 96 | lower = 0 + offset 97 | 98 | # Sort indexes so they're aligned 99 | # Clip predictions 100 | df_calib = df_calib.sort_index().clip(lower, upper) 101 | df_test = df_test.sort_index().clip(lower, upper) 102 | df_calib_actual = pd.DataFrame(s_calib_actual.sort_index()) 103 | 104 | with tempfile.TemporaryDirectory() as tempdir: 105 | 106 | path_csv_calib = os.path.join(tempdir, "calib.csv") 107 | path_csv_test = os.path.join(tempdir, "test.csv") 108 | path_csv_actuals = os.path.join(tempdir, "actuals.csv") 109 | path_csv_ebma = os.path.join(tempdir, "ebma.csv") 110 | path_csv_weights = os.path.join(tempdir, "weights.csv") 111 | path_rscript = os.path.join(tempdir, "ebma_script.R") 112 | 113 | values = { 114 | "PATH_CSV_ACTUALS": path_csv_actuals, 115 | "PATH_CSV_CALIB": path_csv_calib, 116 | "PATH_CSV_TEST": path_csv_test, 117 | "PATH_CSV_EBMA": path_csv_ebma, 118 | "PATH_CSV_WEIGHTS": path_csv_weights, 119 | "PARAM_TOLERANCE": tolerance, 120 | "PARAM_SHRINKAGE": shrinkage, 121 | "PARAM_CONST": const, 122 | "PARAM_MAXITER": maxiter, 123 | } 124 | 125 | template = _read_template() 126 | rscript = template.substitute(values) 127 | 128 | df_calib.to_csv(path_csv_calib, index=False) 129 | df_test.to_csv(path_csv_test, index=False) 130 | df_calib_actual.to_csv(path_csv_actuals, index=False) 131 | 132 | with open(path_rscript, "w") as f: 133 | f.write(rscript) 134 | cmd = ["Rscript", path_rscript] 135 | _run_subproc(cmd) 136 | 137 | df_ebma = pd.read_csv(path_csv_ebma) 138 | df_weights = pd.read_csv(path_csv_weights) 139 | 140 | df_ebma.index = df_test.index 141 | s_ebma = df_ebma["x"] 142 | s_ebma.name = "ebma" 143 | 144 | s_weights = df_weights["x"] 145 | s_weights.index = df_calib.columns 146 | weights_dict = s_weights.to_dict() 147 | 148 | return s_ebma, weights_dict 149 | -------------------------------------------------------------------------------- /views/apps/ensemble/templates/install_ebma.R: -------------------------------------------------------------------------------- 1 | # Install dependencies 2 | install.packages("separationplot", repos="https://ftp.acc.umu.se/mirror/CRAN/") 3 | install.packages("plyr", repos="https://ftp.acc.umu.se/mirror/CRAN/") 4 | install.packages("Hmisc", repos="https://ftp.acc.umu.se/mirror/CRAN/") 5 | install.packages("abind", repos="https://ftp.acc.umu.se/mirror/CRAN/") 6 | install.packages("Rcpp", repos="https://ftp.acc.umu.se/mirror/CRAN/") 7 | # Install EBMAforecast from the CRAN archive 8 | install.packages("https://cran.r-project.org/src/contrib/Archive/EBMAforecast/EBMAforecast_0.52.tar.gz", repos = NULL, type="source") -------------------------------------------------------------------------------- /views/apps/ensemble/templates/run_ebma.R: -------------------------------------------------------------------------------- 1 | library("EBMAforecast") 2 | 3 | # These are templated values 4 | path_calib_actuals <- "${PATH_CSV_ACTUALS}" 5 | 
path_csv_calib<- "${PATH_CSV_CALIB}" 6 | path_csv_test <- "${PATH_CSV_TEST}" 7 | path_ebma <- "${PATH_CSV_EBMA}" 8 | path_weights <- "${PATH_CSV_WEIGHTS}" 9 | 10 | 11 | 12 | y_actual <- read.csv(path_calib_actuals, header=TRUE) 13 | df_calib <- read.csv(path_csv_calib, header=TRUE) 14 | df_test = read.csv(path_csv_test, header=TRUE) 15 | 16 | colnames <- c(colnames(df_test)) 17 | 18 | # Equal weights by default 19 | n_models = ncol(df_calib) 20 | initial_weights <- rep((1/n_models), times=n_models) 21 | # logit, normal, binary 22 | param_model <- "logit" 23 | 24 | # Defaults 25 | param_tolerance <- ${PARAM_TOLERANCE} 26 | param_shrinkage <- ${PARAM_SHRINKAGE} 27 | param_const <- ${PARAM_CONST} 28 | param_maxiter <- ${PARAM_MAXITER} 29 | 30 | print("Started making forecast data") 31 | fd <- EBMAforecast::makeForecastData( 32 | .predCalibration=df_calib, 33 | .predTest=df_test, 34 | .outcomeCalibration=y_actual$$actual, #double $$ for template 35 | .modelNames=colnames 36 | ) 37 | 38 | print("Started calibrateEnsemble") 39 | ebma <- EBMAforecast::calibrateEnsemble( 40 | fd, 41 | model=param_model, 42 | tol=param_tolerance, 43 | exp=param_shrinkage, 44 | const=param_const, 45 | W=initial_weights, 46 | maxIter=param_maxiter 47 | ) 48 | 49 | 50 | ebma_prediction <- ebma@predTest[, "EBMA", ] 51 | weights <- ebma@modelWeights 52 | 53 | print("Writing result csvs") 54 | write.csv(weights, path_weights, row.names=FALSE) 55 | write.csv(ebma_prediction, path_ebma, row.names=FALSE) -------------------------------------------------------------------------------- /views/apps/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | """Evaluations module""" 2 | __all__ = ["lib"] 3 | from . import lib 4 | -------------------------------------------------------------------------------- /views/apps/evaluation/feature_importance.py: -------------------------------------------------------------------------------- 1 | """Feature importances module""" 2 | 3 | from typing import List, Dict 4 | import os 5 | import logging 6 | from datetime import datetime 7 | 8 | import pandas as pd # type: ignore 9 | import joblib # type: ignore 10 | from sklearn.ensemble import RandomForestRegressor # type: ignore 11 | 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | def get_feature_importance_from_pickle( 17 | path_pickle: str, features: List[str], period: str, step: int 18 | ) -> Dict[str, float]: 19 | """ Get feature importance from pickle at path. 20 | 21 | Args: 22 | path: Path to pickled RandomForestRegressor. 23 | features: List of feature names. 24 | period: Which period (str). 25 | step: Which step (int). 26 | Returns: 27 | fi_dict: A dictionary of feature importance scores. 28 | """ 29 | fi_dict = {} 30 | if os.path.isfile(path_pickle): 31 | log.debug(f"Started reading {path_pickle}") 32 | try: 33 | model = joblib.load(path_pickle) 34 | model = model.estimators[period][step] 35 | log.debug(f"Finished reading {path_pickle}") 36 | # Only populate if it's a RandomForestRegressor 37 | if isinstance(model, RandomForestRegressor): 38 | importances = model.feature_importances_ 39 | for feature, value in zip(features, importances): 40 | fi_dict[feature] = value 41 | 42 | except EOFError: 43 | log.warning(f"Couldn't read {path_pickle}") 44 | 45 | return fi_dict 46 | 47 | 48 | def reorder_fi_dict(fi_dict: Dict[str, float], top: int = None) -> Dict: 49 | """ Get feature importances in an ordered (desc) table and write .tex. 
50 | 51 | Args: 52 | fi_dict: Dictionary of feature importances, {feature: importance}. 53 | top (optional): Top number of feature importances to include. 54 | Returns: 55 | fi_dict: Ordered tab dictionary of feature importance scores, i.e. 56 | {"feature": [features], "importance": [importances]}. 57 | """ 58 | desc = dict( 59 | sorted(fi_dict.items(), key=lambda item: item[1], reverse=True) 60 | ) 61 | 62 | top_desc = {k: desc[k] for k in list(desc)[:top]} if top else desc 63 | 64 | featimps_tabular = { 65 | "feature": [k for k, v in top_desc.items()], 66 | "importance": [v for k, v in top_desc.items()], 67 | } 68 | 69 | return featimps_tabular 70 | 71 | 72 | def write_fi_tex(df: pd.DataFrame, path: str): 73 | """ Write feature importances df to .tex with info added. 74 | 75 | Args: 76 | df: pd.DataFrame containing importances per row, indexed on feature. 77 | path: Full path including filename to write .tex to. 78 | """ 79 | tex = df.to_latex() 80 | # Add meta information. 81 | now = datetime.now().strftime("%Y/%m/%d %H:%M:%S") 82 | meta = f""" 83 | %Output created by feature_importance.py. 84 | %Produced on {now}, written to {path}. 85 | \\ 86 | """ 87 | tex = meta + tex 88 | 89 | with open(path, "w") as f: 90 | f.write(tex) 91 | log.info(f"Wrote feature importances to .tex under {path}.") 92 | -------------------------------------------------------------------------------- /views/apps/evaluation/lib.py: -------------------------------------------------------------------------------- 1 | """ Evaluation library. """ 2 | import pandas as pd # type: ignore 3 | import numpy as np # type: ignore 4 | 5 | from sklearn import metrics # type: ignore 6 | 7 | 8 | # real 9 | def mean_squared_error(actuals: pd.Series, preds: pd.Series) -> float: 10 | """Computes MSE given array of actuals and probs.""" 11 | return metrics.mean_squared_error(y_true=actuals, y_pred=preds) 12 | 13 | 14 | def log_loss(actuals: pd.Series, preds: pd.Series) -> float: 15 | """Computes the log loss score given array of actuals and probs.""" 16 | return metrics.log_loss(y_true=actuals, y_pred=preds) 17 | 18 | 19 | def tadda_score( 20 | y_deltas: pd.Series, 21 | f_deltas: pd.Series, 22 | epsilon=1, 23 | smooth_penalty=False, 24 | element_by_element=False, 25 | ) -> float: 26 | """ Computes TADDA given array of y deltas and f deltas. 27 | 28 | Args: 29 | y_deltas: 1d np.ndarray of length N holding the actual changes. 30 | f_deltas: 1d np.ndarray of length N holding the forecasted values. 31 | epsilon: a positive scalar that defines the target around actual values 32 | where values are "close enough". 33 | smooth_penalty: when y[i] =/- epsilon(E) crosses 0, there is a jump in 34 | TADDA. This can be smoothed away by only penalizing directional 35 | chance by |f-E| (False by default). 36 | element_by_element: return the mean of the individual contributions if 37 | False or the vector of individual TADDA values if True (False by 38 | default). 39 | Returns: 40 | scalar if element_by_element is False, and np.array if 41 | element_by_element is True. 
42 | """ 43 | 44 | def sign_not_equal(y_deltas, f_deltas): 45 | # 0 is treated as both pos and neg: returns 0 (not not equal) when y=0 46 | y_sign = np.where(y_deltas > 0.0, 1, 0) 47 | f_sign = np.where(f_deltas > 0.0, 1, 0) 48 | return np.where((y_sign == f_sign) | (np.equal(y_deltas, 0.0)), 0, 1) 49 | 50 | term1 = np.abs(y_deltas - f_deltas) 51 | 52 | if not smooth_penalty: 53 | sign_equality = sign_not_equal(y_deltas, f_deltas) 54 | over_epsilon = np.where(np.abs(y_deltas - f_deltas) > epsilon, 1, 0) 55 | term2 = np.abs(f_deltas) * sign_equality * over_epsilon 56 | else: 57 | cutoff = np.where( 58 | np.abs(y_deltas) < epsilon, np.abs(np.abs(y_deltas) - epsilon), 0 59 | ) 60 | sign_equality = sign_not_equal(y_deltas, f_deltas) 61 | over_epsilon = np.where(np.abs(y_deltas - f_deltas) > epsilon, 1, 0) 62 | term2 = ( 63 | np.abs(np.abs(f_deltas) - cutoff) * sign_equality * over_epsilon 64 | ) 65 | 66 | return term1 + term2 if element_by_element else (term1 + term2).mean() 67 | 68 | 69 | # real 70 | def mean_absolute_error(actuals: pd.Series, preds: pd.Series) -> float: 71 | """Computes MAE given array of actuals and preds.""" 72 | return metrics.mean_absolute_error(y_true=actuals, y_pred=preds) 73 | 74 | 75 | # real 76 | def r2_score(actuals: pd.Series, preds: pd.Series) -> float: 77 | """Computes r2 given array of actuals and preds.""" 78 | return metrics.r2_score(y_true=actuals, y_pred=preds) 79 | 80 | 81 | # prob 82 | def average_precision(actuals: pd.Series, probs: pd.Series) -> float: 83 | """Computes AUPR given array of actuals and probs.""" 84 | return metrics.average_precision_score(y_true=actuals, y_score=probs) 85 | 86 | 87 | # prob 88 | def area_under_roc(actuals: pd.Series, probs: pd.Series) -> float: 89 | """Computes AUROC given array of actuals and probs.""" 90 | return metrics.roc_auc_score(y_true=actuals, y_score=probs) 91 | 92 | 93 | # prob 94 | def brier(actuals: pd.Series, probs: pd.Series) -> float: 95 | """Computes brier score given array of actuals and probs.""" 96 | return metrics.brier_score_loss(y_true=actuals, y_prob=probs) 97 | 98 | 99 | # @TODO: These need bool predictions, apply a threshold to a prob maybe? 
100 | # bool 101 | def accuracy(actuals: pd.Series, preds: pd.Series) -> float: 102 | """Computes accuracy from series of actuals and predictions with 103 | single threshold applied.""" 104 | return metrics.accuracy_score(y_true=actuals, y_pred=preds) 105 | 106 | 107 | # bool 108 | def precision(actuals: pd.Series, preds: pd.Series) -> float: 109 | """Computes precision from confusion matrix.""" 110 | return metrics.precision_score(y_true=actuals, y_pred=preds) 111 | 112 | 113 | # bool 114 | def recall(actuals: pd.Series, preds: pd.Series) -> float: 115 | """Computes recall from confusion matrix.""" 116 | return metrics.recall_score(y_true=actuals, y_pred=preds) 117 | 118 | 119 | # bool 120 | def f1_score(actuals: pd.Series, preds: pd.Series) -> float: 121 | """Computes F1-score given precision and recall.""" 122 | return metrics.f1_score(y_true=actuals, y_pred=preds) 123 | 124 | 125 | # bool 126 | def class_report(actuals: pd.Series, preds: pd.Series) -> str: 127 | """ Classification report """ 128 | return metrics.classification_report(y_true=actuals, y_pred=preds) 129 | -------------------------------------------------------------------------------- /views/apps/extras/__init__.py: -------------------------------------------------------------------------------- 1 | """ Extra modules for miscellaneous tasks such as data publication """ 2 | 3 | __all__ = [ 4 | "fetch_prediction_competition_data", 5 | "extract_and_package_data", 6 | "refresh_datasets_from_website", 7 | ] 8 | from .extras import ( 9 | fetch_prediction_competition_data, 10 | extract_and_package_data, 11 | refresh_datasets_from_website, 12 | ) 13 | -------------------------------------------------------------------------------- /views/apps/extras/extras.py: -------------------------------------------------------------------------------- 1 | """ Get the prediction competition data from the ViEWS website """ 2 | from typing import List, Optional, Dict 3 | import tempfile 4 | import logging 5 | import os 6 | from datetime import date 7 | 8 | from views.apps.data import api 9 | from views.utils import io 10 | from views.config import DIR_STORAGE 11 | from views.specs.data import DATASETS 12 | 13 | log = logging.getLogger() 14 | 15 | DIR_UPLOAD = os.path.join(DIR_STORAGE, "upload") 16 | io.create_directory(DIR_UPLOAD) 17 | 18 | 19 | def fetch_prediction_competition_data( 20 | fnames_want: Optional[List[str]] = None, 21 | ) -> Dict[str, str]: 22 | """ Fetch and unpack the prediction competition data""" 23 | 24 | fname_zip = "views_pred_comp_data_20200427.zip" 25 | url = f"https://views.pcr.uu.se/download/datasets/{fname_zip}" 26 | 27 | if not fnames_want: 28 | fnames_want = ["cm.csv", "pgm.csv"] 29 | 30 | dir_destination = os.path.join(DIR_STORAGE, "prediction_competition") 31 | paths_want = [ 32 | os.path.join(dir_destination, fname) for fname in fnames_want 33 | ] 34 | 35 | io.create_directory(dir_destination) 36 | 37 | if all([os.path.isfile(path) for path in paths_want]): 38 | log.info("Files are already where we need them") 39 | else: 40 | log.info(f"Fetching {fnames_want} from {url}") 41 | with tempfile.TemporaryDirectory() as tempdir: 42 | path_zip = os.path.join(tempdir, fname_zip) 43 | io.fetch_url_to_file(url=url, path=path_zip) 44 | paths_unzipped = io.unpack_zipfile(path_zip, destination=tempdir) 45 | paths_destination: List[str] = [] 46 | for path in paths_unzipped: 47 | fname = os.path.basename(path) 48 | if fname in fnames_want: 49 | path_destination = os.path.join(dir_destination, fname) 50 |
io.move_file(path_from=path, path_to=path_destination) 51 | paths_destination.append(path_destination) 52 | 53 | paths_missing = [ 54 | path for path in paths_want if path not in paths_destination 55 | ] 56 | if paths_missing: 57 | raise RuntimeError(f"Missing paths {paths_missing}") 58 | 59 | data = {os.path.basename(path): path for path in paths_want} 60 | 61 | return data 62 | 63 | 64 | def extract_and_package_data(): 65 | """ Get raw data from database, dump to files and zip it up """ 66 | with tempfile.TemporaryDirectory() as tempdir: 67 | paths = [] 68 | # Dump tables to csv 69 | for name, dataset in DATASETS.items(): 70 | fname = f"{name}.csv" 71 | path = os.path.join(tempdir, fname) 72 | dataset.export_raw_to_csv(path=path) 73 | paths.append(path) 74 | 75 | geom_c = api.GeomCountry() 76 | geom_c.refresh() 77 | paths.append(geom_c.path) 78 | geom_pg = api.GeomPriogrid() 79 | geom_pg.refresh() 80 | paths.append(geom_pg.path) 81 | 82 | today = date.today().strftime("%Y%m%d") 83 | fname_zip = f"data_export_{today}.zip" 84 | io.make_zipfile( 85 | path_zip=os.path.join(DIR_UPLOAD, fname_zip), paths_members=paths 86 | ) 87 | log.info(f"Wrote zip to {os.path.join(DIR_UPLOAD, fname_zip)}") 88 | log.info("Now go ahead and upload it to the webserver manually.") 89 | 90 | 91 | def refresh_datasets_from_website(fname_zip="data_export_20200513.zip"): 92 | """ Initialise local data cache from website public data """ 93 | 94 | url = f"https://views.pcr.uu.se/download/datasets/{fname_zip}" 95 | log.info(f"Fetching from {url}") 96 | with tempfile.TemporaryDirectory() as tempdir: 97 | path_zip = os.path.join(tempdir, fname_zip) 98 | io.fetch_url_to_file(url=url, path=path_zip) 99 | log.info("Done fetching. Unpacking zipfile.") 100 | _ = io.unpack_zipfile(path_zip, destination=tempdir) 101 | 102 | log.info("Initialising local geometries") 103 | geom_c = api.GeomCountry() 104 | geom_pg = api.GeomPriogrid() 105 | path_geom_c = os.path.join(tempdir, os.path.basename(geom_c.path)) 106 | path_geom_pg = os.path.join(tempdir, os.path.basename(geom_pg.path)) 107 | geom_c.init_cache_from_geojson(path_geom_c) 108 | geom_pg.init_cache_from_geojson(path_geom_pg) 109 | 110 | log.info("Initialising datasets.") 111 | for name, dataset in DATASETS.items(): 112 | fname = f"{name}.csv" 113 | path_csv = os.path.join(tempdir, fname) 114 | dataset.init_cache_from_csv(path_csv) 115 | log.info("Done initialising data, you can now use views.DATASETS") 116 | -------------------------------------------------------------------------------- /views/apps/model/__init__.py: -------------------------------------------------------------------------------- 1 | """ Model specification """ 2 | __all__ = ["api", "Model", "Ensemble", "Period", "Downsampling"] 3 | from .
import api 4 | from .api import Model, Ensemble, Period, Downsampling 5 | -------------------------------------------------------------------------------- /views/apps/model/calibration.py: -------------------------------------------------------------------------------- 1 | """ Calibration """ 2 | import logging 3 | from typing import Tuple 4 | import warnings 5 | import numpy as np # type: ignore 6 | import pandas as pd # type: ignore 7 | import statsmodels.api as sm # type: ignore 8 | 9 | from views.utils import stats 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | def _log_missing_indices(s: pd.Series) -> None: 15 | log.warning(f"Missing indices: {s.loc[s.isnull()].index}") 16 | 17 | 18 | def calibrate_real( 19 | s_test_pred: pd.Series, s_calib_pred: pd.Series, s_calib_actual: pd.Series 20 | ) -> pd.Series: 21 | """ Calibrate real value predictions 22 | 23 | Scaling parameters applied would, if applied to s_calib_pred, 24 | make them near-equal in mean and variance to s_calib_actual. 25 | 26 | For the case of transforming one set to have a given mean and std 27 | see: 28 | https://stats.stackexchange.com/questions/46429/transform-data-to-desired-mean-and-standard-deviation 29 | 30 | This case is slightly more involved as we want to shift the test 31 | predictions by parameters "learned" from comparing calibration 32 | predictions to actuals. 33 | 34 | 35 | """ 36 | 37 | # Compute standard deviation ratio 38 | std_ratio = s_calib_actual.std() / s_calib_pred.std() 39 | # Remoe the calib mean from test predictions 40 | s_test_demeaned = s_test_pred - s_calib_pred.mean() 41 | # Shift calib de-meaned test predictions by the calib actual mean 42 | # And scale to the std ratio 43 | s_test_pred_scaled = s_calib_actual.mean() + s_test_demeaned * std_ratio 44 | 45 | return s_test_pred_scaled 46 | 47 | 48 | def calibrate_prob( 49 | s_test_pred: pd.Series, s_calib_pred: pd.Series, s_calib_actual: pd.Series 50 | ) -> pd.Series: 51 | """ Calibrate s_test_pred 52 | 53 | First predictions are transformed into logodds. 54 | Then a logit model is fit on 55 | "actual_outcomes ~ alpha + beta*logodds(p_calib)". 
56 | Then alpha and beta are applied to test predictions like 57 | A = e^(alpha+(beta*p_test)) 58 | p_test_calibrated = A/(A+1) 59 | 60 | See: https://en.wikipedia.org/wiki/Logistic_regression 61 | 62 | """ 63 | 64 | def _get_scaling_params( 65 | s_calib_actual: pd.Series, s_calib: pd.Series 66 | ) -> Tuple[float, float]: 67 | """ Gets scaling params """ 68 | 69 | y = np.array(s_calib_actual) 70 | intercept = np.ones(len(s_calib)) 71 | X = np.array([intercept, s_calib]).T 72 | 73 | model = sm.Logit(y, X).fit(disp=0) 74 | beta_0 = model.params[0] 75 | beta_1 = model.params[1] 76 | 77 | return beta_0, beta_1 78 | 79 | def _apply_scaling_params( 80 | s_test: pd.Series, beta_0: float, beta_1: float 81 | ) -> pd.Series: 82 | """ Scale logodds in s_test using intercept and beta""" 83 | numerator = np.exp(beta_0 + (beta_1 * s_test)) 84 | denominator = numerator + 1 85 | scaled_probs = numerator / denominator 86 | 87 | return scaled_probs 88 | 89 | def _check_inputs( 90 | s_test_pred: pd.Series, 91 | s_calib_pred: pd.Series, 92 | s_calib_actual: pd.Series, 93 | ) -> None: 94 | """ Check that inputs have valid names and could be proabilities """ 95 | 96 | if ( 97 | s_test_pred.min() < 0 98 | or s_test_pred.max() > 1 99 | or s_calib_pred.min() < 0 100 | or s_calib_pred.max() > 1 101 | ): 102 | raise RuntimeError( 103 | "Probabilities outside (0,1) range were passed to calibrate" 104 | ) 105 | 106 | if not s_calib_pred.name == s_test_pred.name: 107 | warnings.warn(f"{s_calib_pred.name} != {s_test_pred.name}") 108 | if s_test_pred.isnull().sum() > 0: 109 | _log_missing_indices(s_test_pred) 110 | raise RuntimeError("Missing values in s_test_pred") 111 | if s_calib_pred.isnull().sum() > 0: 112 | _log_missing_indices(s_calib_pred) 113 | raise RuntimeError("Missing values in s_calib_pred") 114 | if s_calib_actual.isnull().sum() > 0: 115 | _log_missing_indices(s_calib_actual) 116 | raise RuntimeError("Missing values in s_calib_actual") 117 | 118 | if ( 119 | not len(s_calib_pred) == len(s_calib_actual) 120 | or len(s_calib_pred.index.difference(s_calib_actual.index)) > 0 121 | ): 122 | raise RuntimeError( 123 | f"len(s_calib_pred): {len(s_calib_pred)} " 124 | f"len(s_calib_actual): {len(s_calib_actual)} " 125 | f"index diff: " 126 | f"{s_calib_pred.index.difference(s_calib_actual.index)}" 127 | f"s_calib_pred.head() : {s_calib_pred.head()}" 128 | f"s_calib_pred.tail() : {s_calib_pred.tail()}" 129 | f"s_calib_actual.head() : {s_calib_actual.head()}" 130 | f"s_calib_actual.tail() : {s_calib_actual.tail()}" 131 | ) 132 | 133 | _check_inputs(s_test_pred, s_calib_pred, s_calib_actual) 134 | 135 | beta_0, beta_1 = _get_scaling_params( 136 | s_calib_actual=s_calib_actual, 137 | s_calib=stats.prob_to_logodds(s_calib_pred.copy()), 138 | ) 139 | if beta_1 < 0: 140 | warnings.warn(f"Beta_1 < 0. 
Very weak {s_calib_pred.name} ?") 141 | 142 | s_test_pred_scaled = _apply_scaling_params( 143 | stats.prob_to_logodds(s_test_pred.copy()), beta_0, beta_1 144 | ) 145 | return s_test_pred_scaled 146 | -------------------------------------------------------------------------------- /views/apps/model/crosslevel.py: -------------------------------------------------------------------------------- 1 | """ Cross level model functions """ 2 | # flake8: noqa 3 | # pylint: skip-file 4 | import pandas as pd # type: ignore 5 | from .api import Model 6 | 7 | 8 | # class CrossLevel: 9 | # def __init__(model_high_res: Model, model_low_res: Model): 10 | # self.model_high_res = model_high_res 11 | # self.model_low_res = model_low_res 12 | # self.steps = self.steps_in_common([model_high_res, model_low_res]) 13 | # self.steps = sorted(list(self.steps)) 14 | 15 | # @staticmethod 16 | # def steps_in_common(models: List[Model]): 17 | # """ Find steps that all models have in common """ 18 | # return sorted( 19 | # set.intersection(*[set(model.steps) for model in models]) 20 | # ) 21 | 22 | # def predict( 23 | # self, df_high_res: pd.DataFrame, df_low_res: pd.DataFrame 24 | # ) -> pd.Series: 25 | # """ Combine high and low res predictions """ 26 | # cols_ss_h = [self.model_high_res.cols_ss[step] for step in self.steps] 27 | # cols_ss_l = [self.model_low_res.cols_ss[step] for step in self.steps] 28 | # df_h = df_high_res[cols_ss_h] 29 | # df_l = df_low_res[cols_ss_l] 30 | 31 | 32 | # def fetch_df_links(): 33 | # """Get a df linking pg_ids to country_ids.""" 34 | 35 | # query = """ 36 | # SELECT pgm.priogrid_gid AS pg_id, 37 | # cm.country_id 38 | # FROM staging.priogrid_month AS pgm 39 | # INNER JOIN staging.country_month AS cm ON pgm.country_month_id = cm.id 40 | # --- Month 500 arbitrary choice 41 | # WHERE pgm.month_id = 500; 42 | # """ 43 | # return dbutils.query_to_df(query) 44 | 45 | 46 | # def compute_colaresi(df, col_pgm, col_cm): 47 | # """ Colaresian cross level probability """ 48 | 49 | # # Sum of high resolution probabilities for each low level area 50 | # sum_h_by_l = df.groupby(["month_id", "country_id"])[col_pgm].transform(sum) 51 | 52 | # # Low resolution prob multiplied by share of high res prob in particular area 53 | # joint_prob = df[col_cm] * (df[col_pgm] / sum_h_by_l) 54 | 55 | # return joint_prob 56 | 57 | 58 | # def crosslevel(df_pgm, df_cm, df_links, col_pgm, col_cm): 59 | # # Join in country_id 60 | # df = df_pgm[[col_pgm]].join(df_links.set_index(["pg_id"])[["country_id"]]) 61 | # df = df.reset_index().set_index(["month_id", "country_id"]) 62 | # df = ( 63 | # df.join(df_cm[[col_cm]]).reset_index().set_index(["month_id", "pg_id"]) 64 | # ) 65 | # s = compute_colaresi(df, col_pgm, col_cm) 66 | # share_missing = s.isnull().sum() / len(s) 67 | # if share_missing > 0.01: 68 | # raise RuntimeError( 69 | # f"Too much missing in prediction, something's wrong" 70 | # ) 71 | # s = s.fillna(s.mean()) 72 | # return s 73 | 74 | 75 | # if False: 76 | # df_links = fetch_df_links() 77 | # for step in [1, 6, 12, 24, 36]: 78 | # for outcome in ["sb", "ns", "os"]: 79 | # col_cl = f"ss.{outcome}_crosslevel.{step}" 80 | # col_pgm = ( 81 | # f"ss.{outcome}_xgb.{step}" # Use the allthemes model for pgm 82 | # ) 83 | # col_cm = f"ss.{outcome}_all_glob.{step}" # Use the all_glob model for CM 84 | # df_pgm_a[col_cl] = crosslevel( 85 | # df_pgm_a, df_cm_a, df_links, col_pgm, col_cm 86 | # ) 87 | # df_pgm_b[col_cl] = crosslevel( 88 | # df_pgm_b, df_cm_b, df_links, col_pgm, col_cm 89 | # ) 90 | # df_pgm_c[col_cl] = 
crosslevel( 91 | # df_pgm_c, df_cm_c, df_links, col_pgm, col_cm 92 | # ) 93 | -------------------------------------------------------------------------------- /views/apps/pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Pipeline 2 | 3 | The pipeline is what produces monthly forecasts. 4 | It does this in two steps: training and prediction. 5 | 6 | ## Training 7 | Training is done rarely and the fitted models are persisted. 8 | Training runs 9 | 10 | ## Prediction 11 | -------------------------------------------------------------------------------- /views/apps/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | """ Forecasting and training pipelines """ 2 | # __all__ = ["train", "models_cm", "models_pgm"] 3 | # from . import train, models_cm, models_pgm 4 | -------------------------------------------------------------------------------- /views/apps/pipeline/ensembles_cm.py: -------------------------------------------------------------------------------- 1 | """ All CM Ensemble objects 2 | 3 | 4 | The following models are included in the JPR 2020 CM 5 | ensemble: 6 | cm_sb_cflong 7 | cm_sb_acled_violence 8 | cm_sb_neibhist 9 | cm_sb_cdummies 10 | cm_sb_acled_protest 11 | cm_sb_reign_coups 12 | cm_sb_icgcw 13 | cm_sb_reign_drought 14 | cm_sb_reign_global 15 | cm_sb_vdem_global 16 | cm_sb_demog 17 | cm_sb_wdi_global 18 | cm_sb_all_global 19 | cm_sbonset24_25_all 20 | 21 | and are all included in the prelim ensembles below. 22 | @TODO: Not yet ready in this repo and to be added later: 23 | 24 | ds_25 25 | ds_dummy 26 | 27 | """ 28 | 29 | # pylint: disable=invalid-name 30 | 31 | 32 | from typing import Dict, List 33 | from views.apps.model.api import Ensemble, Model, Period 34 | from views.specs.periods import get_periods 35 | from .
import models_cm 36 | 37 | 38 | # The currently latest model development run id 39 | run_id = "d_2020_04_01" 40 | periods: List[Period] = get_periods(run_id=run_id) 41 | 42 | models_cm_sb_prelim: List[Model] = [ 43 | models_cm.cm_sb_cflong, 44 | models_cm.cm_sb_acled_violence, 45 | models_cm.cm_sb_neibhist, 46 | models_cm.cm_sb_cdummies, 47 | models_cm.cm_sb_acled_protest, 48 | models_cm.cm_sb_reign_coups, 49 | models_cm.cm_sb_icgcw, 50 | models_cm.cm_sb_reign_drought, 51 | models_cm.cm_sb_reign_global, 52 | models_cm.cm_sb_vdem_global, 53 | models_cm.cm_sb_demog, 54 | models_cm.cm_sb_wdi_global, 55 | models_cm.cm_sb_all_global, 56 | models_cm.cm_sbonset24_25_all, 57 | ] 58 | 59 | models_cm_ns_prelim: List[Model] = [ 60 | models_cm.cm_ns_cflong, 61 | models_cm.cm_ns_acled_violence, 62 | models_cm.cm_ns_neibhist, 63 | models_cm.cm_ns_cdummies, 64 | models_cm.cm_ns_acled_protest, 65 | models_cm.cm_ns_reign_coups, 66 | models_cm.cm_ns_icgcw, 67 | models_cm.cm_ns_reign_drought, 68 | models_cm.cm_ns_reign_global, 69 | models_cm.cm_ns_vdem_global, 70 | models_cm.cm_ns_demog, 71 | models_cm.cm_ns_wdi_global, 72 | models_cm.cm_ns_all_global, 73 | models_cm.cm_nsonset24_25_all, 74 | ] 75 | 76 | models_cm_os_prelim: List[Model] = [ 77 | models_cm.cm_os_cflong, 78 | models_cm.cm_os_acled_violence, 79 | models_cm.cm_os_neibhist, 80 | models_cm.cm_os_cdummies, 81 | models_cm.cm_os_acled_protest, 82 | models_cm.cm_os_reign_coups, 83 | models_cm.cm_os_icgcw, 84 | models_cm.cm_os_reign_drought, 85 | models_cm.cm_os_reign_global, 86 | models_cm.cm_os_vdem_global, 87 | models_cm.cm_os_demog, 88 | models_cm.cm_os_wdi_global, 89 | models_cm.cm_os_all_global, 90 | models_cm.cm_osonset24_25_all, 91 | ] 92 | 93 | cm_sb_prelim = Ensemble( 94 | name="cm_sb_prelim", 95 | models=models_cm_sb_prelim, 96 | method="ebma", 97 | outcome_type="prob", 98 | col_outcome="greq_25_ged_best_sb", 99 | periods=periods, 100 | delta_outcome=False, 101 | ) 102 | 103 | cm_ns_prelim = Ensemble( 104 | name="cm_ns_prelim", 105 | models=models_cm_ns_prelim, 106 | method="ebma", 107 | outcome_type="prob", 108 | col_outcome="greq_25_ged_best_ns", 109 | periods=periods, 110 | delta_outcome=False, 111 | ) 112 | cm_os_prelim = Ensemble( 113 | name="cm_os_prelim", 114 | models=models_cm_os_prelim, 115 | method="ebma", 116 | outcome_type="prob", 117 | col_outcome="greq_25_ged_best_os", 118 | periods=periods, 119 | delta_outcome=False, 120 | ) 121 | 122 | all_cm_ensembles: List[Ensemble] = [ 123 | cm_sb_prelim, 124 | cm_ns_prelim, 125 | cm_os_prelim, 126 | ] 127 | all_cm_ensembles_by_name: Dict[str, Ensemble] = dict() 128 | for ensemble in all_cm_ensembles: 129 | all_cm_ensembles_by_name[ensemble.name] = ensemble 130 | -------------------------------------------------------------------------------- /views/apps/pipeline/ensembles_pgm.py: -------------------------------------------------------------------------------- 1 | """ All PGM Ensemble objects 2 | 3 | The following models are included in the JPR 2020 PGM 4 | ensemble: 5 | allthemes 6 | hist_legacy 7 | onset24_100_all 8 | onset24_1_all 9 | pgd_natural 10 | pgd_social 11 | sptime 12 | 13 | These 4 are not included yet but will be when implemented in this repo 14 | ds_25 15 | ds_dummy 16 | xgb 17 | crosslevel 18 | 19 | ] 20 | """ 21 | 22 | # pylint: disable=invalid-name 23 | 24 | from typing import Dict, List 25 | from views.apps.model.api import Ensemble, Model, Period 26 | from views.specs.periods import get_periods 27 | from . 
import models_pgm 28 | 29 | 30 | # The currently latest model development run id 31 | run_id = "d_2020_04_01" 32 | periods: List[Period] = get_periods(run_id=run_id) 33 | 34 | 35 | models_pgm_sb_prelim: List[Model] = [ 36 | models_pgm.pgm_sb_hist_legacy, 37 | models_pgm.pgm_sb_allthemes, 38 | models_pgm.pgm_sb_onset24_100_all, 39 | models_pgm.pgm_sb_onset24_1_all, 40 | models_pgm.pgm_sb_pgd_natural, 41 | models_pgm.pgm_sb_pgd_social, 42 | models_pgm.pgm_sb_sptime, 43 | ] 44 | 45 | models_pgm_ns_prelim: List[Model] = [ 46 | models_pgm.pgm_ns_hist_legacy, 47 | models_pgm.pgm_ns_allthemes, 48 | models_pgm.pgm_ns_onset24_100_all, 49 | models_pgm.pgm_ns_onset24_1_all, 50 | models_pgm.pgm_ns_pgd_natural, 51 | models_pgm.pgm_ns_pgd_social, 52 | models_pgm.pgm_ns_sptime, 53 | ] 54 | 55 | models_pgm_os_prelim: List[Model] = [ 56 | models_pgm.pgm_os_hist_legacy, 57 | models_pgm.pgm_os_allthemes, 58 | models_pgm.pgm_os_onset24_100_all, 59 | models_pgm.pgm_os_onset24_1_all, 60 | models_pgm.pgm_os_pgd_natural, 61 | models_pgm.pgm_os_pgd_social, 62 | models_pgm.pgm_os_sptime, 63 | ] 64 | 65 | pgm_sb_prelim = Ensemble( 66 | name="pgm_sb_prelim", 67 | models=models_pgm_sb_prelim, 68 | method="average", 69 | outcome_type="prob", 70 | col_outcome="ged_dummy_sb", 71 | periods=periods, 72 | ) 73 | 74 | pgm_ns_prelim = Ensemble( 75 | name="pgm_ns_prelim", 76 | models=models_pgm_ns_prelim, 77 | method="average", 78 | outcome_type="prob", 79 | col_outcome="ged_dummy_ns", 80 | periods=periods, 81 | ) 82 | 83 | pgm_os_prelim = Ensemble( 84 | name="pgm_os_prelim", 85 | models=models_pgm_os_prelim, 86 | method="average", 87 | outcome_type="prob", 88 | col_outcome="ged_dummy_os", 89 | periods=periods, 90 | ) 91 | 92 | all_pgm_ensembles: List[Ensemble] = [ 93 | pgm_sb_prelim, 94 | pgm_ns_prelim, 95 | pgm_os_prelim, 96 | ] 97 | 98 | all_pgm_ensembles_by_name: Dict[str, Ensemble] = dict() 99 | for ensemble in all_pgm_ensembles: 100 | all_pgm_ensembles_by_name[ensemble.name] = ensemble 101 | -------------------------------------------------------------------------------- /views/apps/pipeline/train.py: -------------------------------------------------------------------------------- 1 | """ This module defines the training of all models used in ViEWS 2 | 3 | After it is run, all required models should be persisted on disk and 4 | ready for prediction. 5 | """ 6 | import logging 7 | 8 | from typing_extensions import Literal 9 | 10 | from views.specs.data import DATASETS 11 | from . 
import models_cm, models_pgm 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | def train_and_store_model_by_name( 17 | loa: Literal["am", "cm", "pgm"], model: str, dataset: str 18 | ) -> None: 19 | """ Lookup a model by name and fit, evaluate and store it """ 20 | 21 | if loa == "cm": 22 | model_object = models_cm.all_cm_models_by_name[model] 23 | elif loa == "pgm": 24 | model_object = models_pgm.all_pgm_models_by_name[model] 25 | else: 26 | raise NotImplementedError(f"cm and pgm models only yet, not {loa}") 27 | 28 | df = DATASETS[dataset].df 29 | model_object.fit_estimators(df) 30 | model_object.save() 31 | -------------------------------------------------------------------------------- /views/apps/plot/__init__.py: -------------------------------------------------------------------------------- 1 | """ Plotting modules """ 2 | 3 | __all__ = [ 4 | "MapData", 5 | "plot_map", 6 | ] 7 | 8 | from .maps import MapData, plot_map 9 | -------------------------------------------------------------------------------- /views/apps/slurm/__init__.py: -------------------------------------------------------------------------------- 1 | """ Slurm interface """ 2 | __all__ = ["run_command"] 3 | from .slurm import run_command 4 | -------------------------------------------------------------------------------- /views/apps/slurm/templates/runfile_core.txt: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -o ${LOGFILE_LOCATION} 3 | #SBATCH -A ${PROJECT_ID} 4 | #SBATCH -J ${NAME} 5 | #SBATCH -p core 6 | #SBATCH -n ${N_CORES} 7 | #SBATCH -t ${TIME} 8 | 9 | echo $$(date -u) - Starting job ${NAME} 10 | 11 | ${COMMAND} 12 | 13 | echo $$(date -u) - Finished job ${NAME} 14 | -------------------------------------------------------------------------------- /views/apps/slurm/templates/runfile_node.txt: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -o ${LOGFILE_LOCATION} 3 | #SBATCH -A ${PROJECT_ID} 4 | #SBATCH -J ${NAME} 5 | #SBATCH -p node 6 | #SBATCH -t ${TIME} 7 | 8 | echo $$(date -u) - Starting job ${NAME} 9 | 10 | ${COMMAND} 11 | 12 | echo $$(date -u) - Finished job ${NAME} 13 | -------------------------------------------------------------------------------- /views/apps/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | """ Data transformations """ 2 | __all__ = ["lib"] 3 | from . import lib 4 | -------------------------------------------------------------------------------- /views/apps/xgb/lib.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UppsalaConflictDataProgram/OpenViEWS2/7eb3e63c8c046de31f70cd56f417fadf03686f5a/views/apps/xgb/lib.py -------------------------------------------------------------------------------- /views/config.py: -------------------------------------------------------------------------------- 1 | """ Config module. 
Reads config.yaml in repo root and exposes vars """ 2 | from dataclasses import dataclass 3 | from typing import Any, Dict, Tuple, Optional 4 | import os 5 | import copy 6 | import json 7 | import yaml 8 | 9 | 10 | REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 11 | LOGFMT = "[%(asctime)s] - %(name)s:%(lineno)d - %(levelname)s - %(message)s" 12 | 13 | 14 | def _resolve(path: str) -> str: 15 | """ Resolve env vars and home in path """ 16 | return os.path.expanduser(os.path.expandvars(path)) 17 | 18 | 19 | # pylint: disable=too-many-instance-attributes 20 | @dataclass 21 | class Db: 22 | """ Holds connection options for connecting through sqlalchemy """ 23 | 24 | user: str 25 | host: str 26 | dbname: str 27 | port: int 28 | password: Optional[str] = None 29 | use_ssl: Optional[bool] = False 30 | ssl_cert: Optional[str] = None 31 | ssl_key: Optional[str] = None 32 | ssl_rootcert: Optional[str] = None 33 | 34 | @property 35 | def connectstring(self) -> str: 36 | """ Get a connectstring """ 37 | 38 | if self.password: 39 | userpart = f"{self.user}:{self.password}" 40 | else: 41 | userpart = self.user 42 | 43 | return f"postgresql://{userpart}@{self.host}:{self.port}/{self.dbname}" 44 | 45 | @property 46 | def connect_args(self) -> Dict[str, str]: 47 | """ Get dict of connect_args """ 48 | 49 | if self.use_ssl: 50 | assert self.ssl_cert 51 | assert self.ssl_key 52 | assert self.ssl_rootcert 53 | connectargs = { 54 | "sslmode": "require", 55 | "sslcert": _resolve(self.ssl_cert), 56 | "sslkey": _resolve(self.ssl_key), 57 | "sslrootcert": _resolve(self.ssl_rootcert), 58 | } 59 | else: 60 | connectargs = dict() 61 | 62 | return connectargs 63 | 64 | def __repr__(self): 65 | repdict = copy.copy(self.__dict__) 66 | 67 | # Never log the password 68 | if self.password: 69 | repdict["password"] = "******" 70 | repdict["connectstring"] = self.connectstring.replace( 71 | self.password, "******" 72 | ) 73 | 74 | return json.dumps(repdict) 75 | 76 | def __str__(self): 77 | return self.__repr__() 78 | 79 | 80 | def _get_configfile() -> Dict[str, Any]: 81 | """ Read the raw configfile """ 82 | with open(os.path.join(REPO_ROOT, "config.yaml"), "r") as f: 83 | return yaml.safe_load(f) 84 | 85 | 86 | def _get_dirs() -> Tuple[str, str]: 87 | """ Get and resolve all the directories in config.yaml """ 88 | config = _get_configfile() 89 | 90 | dir_storage = config["dirs"]["storage"] 91 | dir_scratch = config["dirs"]["scratch"] 92 | 93 | if not dir_storage: 94 | dir_storage = os.path.join(REPO_ROOT, "storage") 95 | 96 | if not dir_scratch: 97 | dir_scratch = os.path.join(dir_storage, "scratch") 98 | 99 | dir_storage = _resolve(dir_storage) 100 | dir_scratch = _resolve(dir_scratch) 101 | 102 | return dir_storage, dir_scratch 103 | 104 | 105 | def _get_databases() -> Dict[str, Db]: 106 | """ Get all the database configs in config.yaml """ 107 | config = _get_configfile() 108 | 109 | dbs = dict() 110 | for db_name, db_spec in config["databases"].items(): 111 | dbs[db_name] = Db(**db_spec) 112 | 113 | dbs["default"] = dbs[config["default_database"]] 114 | 115 | return dbs 116 | 117 | 118 | def _get_slurm_cfg() -> Dict[str, str]: 119 | config = _get_configfile() 120 | if "slurm" in config.keys(): 121 | slurm_cfg = config["slurm"] 122 | else: 123 | slurm_cfg = {"username": "", "project": ""} 124 | 125 | return slurm_cfg 126 | 127 | 128 | DIR_STORAGE, DIR_SCRATCH = _get_dirs() 129 | DATABASES = _get_databases() 130 | SLURM = _get_slurm_cfg() 131 | 
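# Usage sketch for the loaders above. The config.yaml layout below is inferred
# from _get_dirs(), _get_databases() and _get_slurm_cfg(); every value, and the
# database name "uppsala", is an illustrative placeholder:
#
#   dirs:
#     storage: ~/views_storage
#     scratch: ""              # empty falls back to <storage>/scratch
#   databases:
#     uppsala:
#       user: views_user
#       host: localhost
#       dbname: views
#       port: 5432
#   default_database: uppsala
#   slurm:
#     username: my_user
#     project: my_project
#
# With such a spec, Db exposes the sqlalchemy connectstring, e.g.
# Db(user="views_user", host="localhost", dbname="views", port=5432).connectstring
# evaluates to "postgresql://views_user@localhost:5432/views".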
-------------------------------------------------------------------------------- /views/database/README.md: -------------------------------------------------------------------------------- 1 | # Database 2 | This directory is proposed to be the new home of the views database. 3 | 4 | ## Structure 5 | 6 | A proposed structure is presented here. 7 | Each data sources lives in its own directory in sources. 8 | Sources that are versioned and might change the list of columns they provide are organised by version. 9 | A skeleton schema that builds -------------------------------------------------------------------------------- /views/database/__init__.py: -------------------------------------------------------------------------------- 1 | """ Database related functionality """ 2 | 3 | from . import sources, skeleton 4 | 5 | __all__ = ["sources", "skeleton"] 6 | -------------------------------------------------------------------------------- /views/database/common.py: -------------------------------------------------------------------------------- 1 | """ Common utils for database data management """ 2 | import logging 3 | from typing import List, Optional 4 | import tempfile 5 | import os 6 | from datetime import date 7 | 8 | from views.utils import io 9 | from ..config import DIR_STORAGE 10 | 11 | DIR_FETCHES = os.path.join(DIR_STORAGE, "data", "raw") 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | def get_path_tar(name: str) -> str: 16 | """ Get a path to a tarfile timestamped for today """ 17 | io.create_directory(DIR_FETCHES) 18 | today = date.today().strftime("%Y%m%d") 19 | return os.path.join(DIR_FETCHES, f"{name}_{today}.tar.xz") 20 | 21 | 22 | def fetch_source_simply( 23 | name: str, url: Optional[str] = None, urls: Optional[List[str]] = None 24 | ) -> None: 25 | """ Download file at url (or urls) and store in tarfile by name """ 26 | 27 | def _get_urls(url: Optional[str], urls: Optional[List[str]]) -> List[str]: 28 | """ If url return list of one, else pass through urls """ 29 | if url and urls: 30 | raise TypeError("Use url or urls, not both.") 31 | if url: 32 | # pylint: disable=redefined-argument-from-local 33 | urls = [url] 34 | assert isinstance(urls, list) 35 | 36 | return urls 37 | 38 | urls = _get_urls(url, urls) 39 | with tempfile.TemporaryDirectory() as tempdir: 40 | paths = [] 41 | for url in urls: # pylint: disable=redefined-argument-from-local 42 | fname = url.split("/")[-1] 43 | path_source = os.path.join(tempdir, fname) 44 | io.fetch_url_to_file(url, path=path_source) 45 | paths.append(path_source) 46 | io.make_tarfile(path_tar=get_path_tar(name), paths_members=paths) 47 | 48 | 49 | def get_files_latest_fetch(name, tempdir) -> List[str]: 50 | """ Get files from latest fetch 51 | 52 | Unpack the tarfile for the latest fetch for source name into tempdir 53 | and return paths. 
54 | """ 55 | log.debug(f"Getting files for latest fetch for {name}") 56 | paths_fetches = io.list_files_in_dir(path_dir=DIR_FETCHES) 57 | try: 58 | path_tar = [ 59 | path 60 | for path in sorted(paths_fetches) 61 | if os.path.basename(path).startswith(name) 62 | ].pop(0) 63 | log.debug(f"Got {path_tar} as latest {name} of {paths_fetches}") 64 | except IndexError: 65 | log.exception(f"Couldn't find a latest fetch for {name}.") 66 | raise 67 | 68 | paths = io.unpack_tarfile(path_tar=path_tar, dir_destination=tempdir) 69 | return paths 70 | -------------------------------------------------------------------------------- /views/database/skeleton/__init__.py: -------------------------------------------------------------------------------- 1 | """ The database skeleton schema """ 2 | __all__ = ["build_skeleton"] 3 | 4 | from .skeleton import build_skeleton 5 | -------------------------------------------------------------------------------- /views/database/skeleton/create_skeleton.sql: -------------------------------------------------------------------------------- 1 | -- Create a skeleton schema with identifiers and geographic extent only. 2 | -- To be used down the line in joining data in pandas 3 | 4 | DROP SCHEMA IF EXISTS skeleton CASCADE; 5 | CREATE SCHEMA skeleton; 6 | 7 | -- PGY 8 | CREATE TABLE skeleton.pgy_global AS 9 | SELECT pgy.priogrid_gid AS pg_id, 10 | y.year, 11 | cy.country_id, 12 | pg.in_africa, 13 | c.name AS country_name 14 | FROM staging.priogrid_year AS pgy 15 | INNER JOIN staging.year AS y ON pgy.year_id = y.year 16 | -- LEFT here because pgy-cy mapping stops 17 | LEFT JOIN staging.country_year AS cy ON pgy.country_year_id = cy.id 18 | INNER JOIN staging.priogrid AS pg ON pgy.priogrid_gid = pg.gid 19 | INNER JOIN staging.country AS c ON c.id=cy.country_id 20 | WHERE y.year < 2031; 21 | 22 | CREATE TABLE skeleton.pgy_africa AS 23 | SELECT * 24 | FROM skeleton.pgy_global 25 | WHERE in_africa = TRUE; 26 | 27 | -- PGM 28 | CREATE TABLE skeleton.pgm_global AS 29 | SELECT pgm.priogrid_gid AS pg_id, 30 | m.id AS month_id, 31 | m.year_id AS year, 32 | m.month AS month, 33 | cm.country_id country_id, 34 | pg.in_africa, 35 | c.name AS country_name 36 | FROM staging.priogrid_month AS pgm 37 | INNER JOIN staging.month AS m ON m.id = pgm.month_id 38 | INNER JOIN staging.priogrid_year AS pgy ON pgy.year_id = m.year_id AND pgy.priogrid_gid = pgm.priogrid_gid 39 | INNER JOIN staging.country_month AS cm ON pgm.country_month_id = cm.id 40 | INNER JOIN staging.priogrid AS pg ON pg.gid = pgm.priogrid_gid 41 | INNER JOIN staging.country AS c ON c.id=cm.country_id 42 | WHERE m.year_id < 2031; 43 | 44 | CREATE TABLE skeleton.pgm_africa AS 45 | SELECT * 46 | FROM skeleton.pgm_global 47 | WHERE in_africa = TRUE; 48 | 49 | -- CY 50 | DROP TABLE IF EXISTS skeleton.cy_global; 51 | CREATE TABLE skeleton.cy_global AS 52 | SELECT c.id AS country_id, 53 | c.in_africa, 54 | c.name AS country_name, 55 | y.year 56 | FROM staging.country AS c 57 | CROSS JOIN staging.year AS y 58 | WHERE c.gweyear = 2016 59 | AND y.year < 2031; 60 | 61 | CREATE TABLE skeleton.cy_africa AS 62 | SELECT * 63 | FROM skeleton.cy_global 64 | WHERE in_africa = 1; 65 | 66 | -- CM 67 | CREATE TABLE skeleton.cm_global AS 68 | SELECT c.id AS country_id, 69 | m.year_id AS year, 70 | m.id AS month_id, 71 | m.month, 72 | c.name AS country_name, 73 | c.in_africa 74 | FROM staging.country AS c 75 | CROSS JOIN staging.month AS m 76 | WHERE c.gweyear = 2016 77 | AND m.year_id < 2031; 78 | 79 | CREATE TABLE skeleton.cm_africa AS 80 | SELECT * 
81 | FROM skeleton.cm_global 82 | WHERE in_africa = 1; 83 | 84 | -------------------------------------------------------------------------------- /views/database/skeleton/skeleton.py: -------------------------------------------------------------------------------- 1 | """ Skeleton building code """ 2 | import os 3 | import logging 4 | from views.utils import db 5 | 6 | log = logging.getLogger(__name__) 7 | 8 | 9 | def build_skeleton() -> None: 10 | """ Build skeleton schema by executing create_skeleton.sql """ 11 | log.info("Started rebuilding skeleton schema.") 12 | path_query = os.path.join(os.path.dirname(__file__), "create_skeleton.sql") 13 | with open(path_query, "r") as f: 14 | query = f.read() 15 | db.execute_query(query) 16 | log.info("Finished rebuilding skeleton schema.") 17 | -------------------------------------------------------------------------------- /views/database/sources/__init__.py: -------------------------------------------------------------------------------- 1 | """ Data sources for the database """ 2 | from . import acled, cdum, fvp, ged, icgcw, pgdata, spei, vdem, wdi 3 | 4 | __all__ = [ 5 | "acled", 6 | "cdum", 7 | "fvp", 8 | "ged", 9 | "icgcw", 10 | "pgdata", 11 | "spei", 12 | "vdem", 13 | "wdi", 14 | ] 15 | -------------------------------------------------------------------------------- /views/database/sources/acled/__init__.py: -------------------------------------------------------------------------------- 1 | """ ACLED package """ 2 | __all__ = ["load_acled", "fetch_acled"] 3 | from .acled import load_acled, fetch_acled 4 | -------------------------------------------------------------------------------- /views/database/sources/acled/acled.py: -------------------------------------------------------------------------------- 1 | """ ACLED data loader, depends on original DB implementation 2 | 3 | # TODO: Rewrite to hold all ACLED loading logic. 
4 | """ 5 | import os 6 | import logging 7 | from views.utils import db, io 8 | from .legacy import load_acled as load_legacy_acled 9 | 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | def fetch_acled() -> None: 15 | """ Do nothing, ACLED still fetched by old code """ 16 | 17 | 18 | def load_acled() -> None: 19 | """ Code that brings acled to staging yet to be merged """ 20 | 21 | log.info("Started loading ACLED.") 22 | 23 | load_legacy_acled( 24 | from_date="2020-01-01", from_month_id=483, to_month_id=484 25 | ) 26 | 27 | db.drop_schema("acled") 28 | db.create_schema("acled") 29 | 30 | db.execute_query( 31 | query=io.read_file( 32 | path=os.path.join(os.path.dirname(__file__), "acled.sql") 33 | ) 34 | ) 35 | log.info("Finished loading ACLED.") 36 | -------------------------------------------------------------------------------- /views/database/sources/acled/acled.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE acled.cm AS 2 | SELECT cm.month_id, 3 | cm.country_id, 4 | coalesce(cm.acled_count_pr, 0) AS acled_count_pr, 5 | coalesce(cm.acled_count_sb, 0) AS acled_count_sb, 6 | coalesce(cm.acled_count_ns, 0) AS acled_count_ns, 7 | coalesce(cm.acled_count_os, 0) AS acled_count_os 8 | FROM staging.country_month AS cm; 9 | 10 | CREATE TABLE acled.pgm AS 11 | SELECT pgm.month_id, 12 | pgm.priogrid_gid AS pg_id, 13 | coalesce(pgm.acled_count_pr, 0) AS acled_count_pr, 14 | coalesce(pgm.acled_count_sb, 0) AS acled_count_sb, 15 | coalesce(pgm.acled_count_ns, 0) AS acled_count_ns, 16 | coalesce(pgm.acled_count_os, 0) AS acled_count_os, 17 | coalesce(pgm.acled_fat_sb, 0) AS acled_fat_sb, 18 | coalesce(pgm.acled_fat_ns, 0) AS acled_fat_ns, 19 | coalesce(pgm.acled_fat_os, 0) AS acled_fat_os, 20 | coalesce(pgm.acled_fat_pr, 0) AS acled_fat_pr 21 | FROM staging.priogrid_month AS pgm; -------------------------------------------------------------------------------- /views/database/sources/acled/legacy/__init__.py: -------------------------------------------------------------------------------- 1 | """ Legacy ACLED """ 2 | __all__ = ["load_acled"] 3 | from .acled import load_acled 4 | -------------------------------------------------------------------------------- /views/database/sources/acled/legacy/prepare_acled.sql: -------------------------------------------------------------------------------- 1 | -- Rebuilds preflight.acled_full and prefligh.acled 2 | 3 | DROP TABLE IF EXISTS preflight.acled_full; 4 | DROP TABLE IF EXISTS preflight.acled; 5 | 6 | CREATE TABLE preflight.acled_full AS 7 | WITH month_acled AS 8 | ( 9 | SELECT *, 10 | EXTRACT(MONTH FROM event_date :: DATE) AS month, 11 | public.priogrid(latitude::float4, longitude::float4) AS priogrid_gid 12 | FROM dataprep.acled 13 | WHERE latitude::float BETWEEN -180 AND 180 14 | AND longitude::float BETWEEN -90 AND 90 15 | ), 16 | month_acled2 AS 17 | ( 18 | SELECT month_acled.*, 19 | staging.month.id AS month_id 20 | FROM month_acled, 21 | staging.month 22 | WHERE (month_acled.year :: INT = staging.month.year_id AND 23 | month_acled.month = staging.month.month) 24 | ) 25 | SELECT * 26 | FROM month_acled2; 27 | 28 | 29 | 30 | ALTER TABLE preflight.acled_full 31 | ADD COLUMN type_of_violence INT; 32 | 33 | ALTER TABLE preflight.acled_full 34 | ADD COLUMN type_of_protest TEXT; 35 | 36 | -- 1. We are emulating UCDP/ViEWS StateBased category using ACLED data. 37 | -- i.e. Military Forces vs. others/other Military Forces, only "battles" and "remote violence" 38 | -- no civilians involved. 
39 | -- TODO: shelling and remote violence may need to be treated differently 40 | UPDATE preflight.acled_full 41 | SET type_of_violence = 1 42 | WHERE (event_type ILIKE '%%battle%%' OR event_type ILIKE '%%remote%%') 43 | AND actor1 || actor2 ILIKE '%%military forces%%' 44 | AND actor1 || actor2 NOT ILIKE '%%civilians%%'; 45 | 46 | 47 | 48 | -- 2. We are emulating UCDP/ViEWS StateBased category using ACLED data. 49 | -- i.e. no military forces, no civilians, only "battles" and "remote violence" 50 | -- UCDP''s artificial organizational criteria are not included and cannot for now be included 51 | UPDATE preflight.acled_full 52 | SET type_of_violence = 2 53 | WHERE (event_type ILIKE '%%battle%%' OR event_type ILIKE '%%remote%%') 54 | AND actor1 || actor2 NOT ILIKE '%%military forces%%' 55 | AND actor1 || actor2 NOT ILIKE '%%civilians%%'; 56 | 57 | 58 | 59 | -- 3: Emulate UCDP/Views OneSided category. 60 | -- Remote violence, battle and violence against civilians 61 | -- TODO: This may be improved using a better division of "Remote Violence" 62 | UPDATE preflight.acled_full 63 | SET type_of_violence = 3 64 | WHERE (event_type ILIKE '%%battle%%' OR event_type ILIKE '%%remote%%' OR event_type ILIKE '%%civi%%') 65 | AND actor1 || actor2 ILIKE '%%civilians%%'; 66 | 67 | -- 4: Protests 68 | -- The entire protest category, as is 69 | UPDATE preflight.acled_full 70 | SET type_of_violence = 4 71 | WHERE event_type ILIKE '%%protest%%'; 72 | 73 | UPDATE preflight.acled_full 74 | SET type_of_protest = 'p' 75 | WHERE type_of_violence = 4 76 | AND (inter1::int = 6 OR inter2::int = 6); 77 | 78 | 79 | 80 | UPDATE preflight.acled_full 81 | SET type_of_protest = COALESCE (type_of_protest, '') || 'r' 82 | WHERE 83 | type_of_violence=4 84 | AND (inter1::INT =5 85 | OR inter2::INT =5); 86 | 87 | 88 | 89 | UPDATE preflight.acled_full 90 | SET type_of_protest = COALESCE(type_of_protest, '') || 'x' 91 | WHERE event_type ILIKE '%violence against civi%' 92 | AND interaction::int IN (15, 16, 25, 26, 35, 36, 45, 46); 93 | 94 | UPDATE preflight.acled_full 95 | SET type_of_protest = COALESCE(type_of_protest, '') || 'y' 96 | WHERE event_type ILIKE '%violence against civi%' 97 | AND interaction::int IN (15, 16); 98 | 99 | 100 | 101 | -- We are only using events precise enough to have locations within PGM cells 102 | -- Thus, we exclude geo_precision 3 which indicates "larger area" 103 | -- (unclear what that means but during testing, it was nearly always ADM1 or higher. 
104 | 105 | 106 | CREATE TABLE preflight.acled AS 107 | SELECT * 108 | FROM preflight.acled_full 109 | WHERE geo_precision::int < 3; 110 | 111 | 112 | 113 | ALTER TABLE preflight.acled 114 | ADD PRIMARY KEY (index); 115 | ALTER TABLE preflight.acled_full 116 | ADD PRIMARY KEY (index); 117 | CREATE INDEX acled_idx ON preflight.acled (priogrid_gid, month_id, type_of_violence); 118 | CREATE INDEX acled_full_idx ON preflight.acled_full (priogrid_gid, month_id, type_of_violence); 119 | CREATE INDEX acled2_idx ON preflight.acled (priogrid_gid, month_id, type_of_violence, type_of_protest); 120 | CREATE INDEX acled2_full_idx ON preflight.acled_full (priogrid_gid, month_id, type_of_violence, type_of_protest); 121 | 122 | -------------------------------------------------------------------------------- /views/database/sources/cdum/__init__.py: -------------------------------------------------------------------------------- 1 | """ Country dummy package """ 2 | __all__ = ["fetch_cdum", "load_cdum"] 3 | from .cdum import fetch_cdum, load_cdum 4 | -------------------------------------------------------------------------------- /views/database/sources/cdum/cdum.py: -------------------------------------------------------------------------------- 1 | """ Country dummy module """ 2 | import pandas as pd # type: ignore 3 | from views.utils import db 4 | 5 | 6 | def fetch_cdum() -> None: 7 | """ Nothing to fetch for country dummies """ 8 | 9 | 10 | def load_cdum() -> None: 11 | """ Load country dummies """ 12 | 13 | df = db.db_to_df(fqtable="staging.country", cols=["id"], ids=["id"]) 14 | df = df.reset_index().rename(columns={"id": "country_id"}) 15 | df["to_dummy"] = df["country_id"] 16 | df = df.set_index(["country_id"]) 17 | df = pd.get_dummies(df.to_dummy, prefix="cdum") 18 | db.drop_schema("cdum") 19 | db.create_schema("cdum") 20 | db.df_to_db(fqtable="cdum.c", df=df) 21 | -------------------------------------------------------------------------------- /views/database/sources/fvp/__init__.py: -------------------------------------------------------------------------------- 1 | """ Future of Violent politics package """ 2 | __all__ = ["fetch_fvp", "load_fvp"] 3 | from .fvp import fetch_fvp, load_fvp 4 | -------------------------------------------------------------------------------- /views/database/sources/fvp/fvp.py: -------------------------------------------------------------------------------- 1 | """ Future of violent politics module """ 2 | import logging 3 | import os 4 | import tempfile 5 | 6 | from sklearn.tree import DecisionTreeRegressor # type: ignore 7 | 8 | from views.utils import io, db 9 | from views.database import common 10 | from views.apps.data import missing 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | def fetch_fvp(): 16 | """ FVP data is in the Dropbox 17 | 18 | # TODO: Store properly 19 | """ 20 | print("FVP MUST BE FETCHED MANUALLY! 
IT'S IN THE DROPBOX.") 21 | 22 | 23 | def load_fvp(): 24 | """ Load FVP data """ 25 | log.info("Started loading FVP") 26 | with tempfile.TemporaryDirectory() as tempdir: 27 | _ = common.get_files_latest_fetch(name="fvp", tempdir=tempdir) 28 | df = io.csv_to_df(path=os.path.join(tempdir, "MasterData.csv")) 29 | 30 | df = df.drop(columns=["Conflict"]) 31 | df = df.rename(columns=lambda col: col.lower()) 32 | df = df.set_index(["year", "gwno"]) 33 | 34 | spec = io.load_yaml( 35 | path=os.path.join(os.path.dirname(__file__), "spec.yaml") 36 | ) 37 | df = df[spec["cols"]] 38 | 39 | log.debug("Fetching df_keys") 40 | query = "SELECT id AS country_id, gwcode AS gwno FROM staging.country;" 41 | df = df.join( 42 | db.query_to_df(query=query) 43 | .sort_values(by="country_id", ascending=False) 44 | .drop_duplicates(subset=["gwno"]) 45 | .set_index(["gwno"]) 46 | ) 47 | 48 | log.debug("Joining to skeleton") 49 | df = db.db_to_df( 50 | fqtable="skeleton.cy_global", 51 | ids=["year", "country_id"], 52 | cols=["year", "country_id"], 53 | ).join(df.reset_index().set_index(["year", "country_id"]), how="left") 54 | 55 | df = df.drop(columns=["gwno"]) 56 | 57 | # Add consistent fvp_ prefix 58 | df = df.rename( 59 | columns=lambda col: col if col.startswith("fvp_") else f"fvp_{col}" 60 | ) 61 | df = df.sort_index(axis=1).sort_index(axis=0) 62 | 63 | # Push raw 64 | db.create_schema("fvp_v2") 65 | db.df_to_db(fqtable="fvp_v2.cy_unimp", df=df) 66 | 67 | # Extrapolate before imputing 68 | df = missing.extrapolate(df) 69 | 70 | # Impute and push 71 | for i, df_imp in enumerate( 72 | missing.impute_mice_generator( 73 | df=df, 74 | n_imp=10, 75 | estimator=DecisionTreeRegressor(max_features="sqrt"), 76 | parallel=True, 77 | ) 78 | ): 79 | db.df_to_db(df=df_imp, fqtable=f"fvp_v2.cy_imp_sklearn_{i}") 80 | 81 | log.info("Finished loading FVP") 82 | -------------------------------------------------------------------------------- /views/database/sources/fvp/spec.yaml: -------------------------------------------------------------------------------- 1 | cols: 2 | - fvp_auto 3 | - fvp_demo 4 | - fvp_democracy 5 | - fvp_electoral 6 | - fvp_liberal 7 | - fvp_participatory 8 | - fvp_regime3c 9 | - fvp_semi 10 | - gdp200 11 | - gdpcap_nonoilrent 12 | - gdpcap_oilrent 13 | - gdppc200 14 | - govt 15 | - grgdpcap_nonoilrent 16 | - grgdpcap_oilrent 17 | - grgdppercapita200 18 | - grpop200 19 | - lngdp200 20 | - lngdpcap_nonoilrent 21 | - lngdpcap_oilrent 22 | - lngdppercapita200 23 | - lnoilrent 24 | - lnpop200 25 | - ltimeindep 26 | - ssp2_edu_sec_15_24_prop 27 | - prop_diexpo 28 | - prop_discexclpowless 29 | - prop_discriminated 30 | - prop_dominant 31 | - prop_excluded 32 | - prop_irrelevant 33 | - prop_junpart 34 | - prop_powerless 35 | - prop_selfexclusion 36 | - prop_senpart 37 | - population200 38 | - ssp2_urban_share_iiasa 39 | - timesincepreindepwar 40 | - timesinceregimechange 41 | - indepyear 42 | - timeindep -------------------------------------------------------------------------------- /views/database/sources/ged/__init__.py: -------------------------------------------------------------------------------- 1 | """ GED package """ 2 | __all__ = ["fetch_ged", "load_ged"] 3 | from .ged import fetch_ged, load_ged 4 | -------------------------------------------------------------------------------- /views/database/sources/ged/ged.py: -------------------------------------------------------------------------------- 1 | """ Ged loader, depends on original DB implementation 2 | 3 | # TODO: Rewrite to hold all loading logic 4 |
""" 5 | import os 6 | import logging 7 | from views.utils import db, io 8 | from .legacy import load_ged as load_legacy_ged 9 | 10 | log = logging.getLogger(__name__) 11 | 12 | 13 | def fetch_ged() -> None: 14 | """ Do nothing, GED still fetched by old code """ 15 | 16 | 17 | def load_ged() -> None: 18 | """ Collect imputed and unimputed GED """ 19 | 20 | log.info("Started loading GED.") 21 | 22 | load_legacy_ged("20.9.4", 484, 484) # 2020-04 23 | 24 | db.drop_schema("ged") 25 | db.create_schema("ged") 26 | db.execute_query( 27 | query=io.read_file( 28 | path=os.path.join(os.path.dirname(__file__), "ged.sql") 29 | ) 30 | ) 31 | log.info("Finished loading GED.") 32 | -------------------------------------------------------------------------------- /views/database/sources/ged/ged.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE ged.cm AS 2 | SELECT cm.month_id, 3 | cm.country_id, 4 | cm.ged_best_sb, 5 | cm.ged_best_ns, 6 | cm.ged_best_os, 7 | cm.ged_count_sb, 8 | cm.ged_count_ns, 9 | cm.ged_count_os 10 | FROM staging.country_month AS cm; 11 | 12 | CREATE TABLE ged.pgm_unimp 13 | AS 14 | SELECT pgm.month_id, 15 | pgm.priogrid_gid AS pg_id, 16 | pgm.ged_best_sb, 17 | pgm.ged_best_ns, 18 | pgm.ged_best_os, 19 | pgm.ged_count_sb, 20 | pgm.ged_count_ns, 21 | pgm.ged_count_os, 22 | public.to_dummy(pgm.ged_count_sb) AS ged_dummy_sb, 23 | public.to_dummy(pgm.ged_count_ns) AS ged_dummy_ns, 24 | public.to_dummy(pgm.ged_count_os) AS ged_dummy_os 25 | FROM staging.priogrid_month AS pgm; 26 | 27 | CREATE TABLE ged.pgm_geoimp_0 28 | AS 29 | SELECT pgm.month_id, 30 | pgm.priogrid_gid AS pg_id, 31 | pgm.ged_best_sb, 32 | pgm.ged_best_ns, 33 | pgm.ged_best_os, 34 | pgm.ged_count_sb, 35 | pgm.ged_count_ns, 36 | pgm.ged_count_os, 37 | pgm_imp.ged_sb_dummy_1 AS ged_dummy_sb, 38 | pgm_imp.ged_ns_dummy_1 AS ged_dummy_ns, 39 | pgm_imp.ged_os_dummy_1 AS ged_dummy_os 40 | FROM staging.priogrid_month AS pgm 41 | LEFT JOIN left_imputation.pgm AS pgm_imp 42 | ON pgm_imp.priogrid_gid = pgm.priogrid_gid AND pgm_imp.month_id = pgm.month_id; 43 | 44 | CREATE TABLE ged.pgm_geoimp_1 45 | AS 46 | SELECT pgm.month_id, 47 | pgm.priogrid_gid AS pg_id, 48 | pgm.ged_best_sb, 49 | pgm.ged_best_ns, 50 | pgm.ged_best_os, 51 | pgm.ged_count_sb, 52 | pgm.ged_count_ns, 53 | pgm.ged_count_os, 54 | pgm_imp.ged_sb_dummy_2 AS ged_dummy_sb, 55 | pgm_imp.ged_ns_dummy_2 AS ged_dummy_ns, 56 | pgm_imp.ged_os_dummy_2 AS ged_dummy_os 57 | FROM staging.priogrid_month AS pgm 58 | LEFT JOIN left_imputation.pgm AS pgm_imp 59 | ON pgm_imp.priogrid_gid = pgm.priogrid_gid AND pgm_imp.month_id = pgm.month_id; 60 | 61 | CREATE TABLE ged.pgm_geoimp_2 62 | AS 63 | SELECT pgm.month_id, 64 | pgm.priogrid_gid AS pg_id, 65 | pgm.ged_best_sb, 66 | pgm.ged_best_ns, 67 | pgm.ged_best_os, 68 | pgm.ged_count_sb, 69 | pgm.ged_count_ns, 70 | pgm.ged_count_os, 71 | pgm_imp.ged_sb_dummy_3 AS ged_dummy_sb, 72 | pgm_imp.ged_ns_dummy_3 AS ged_dummy_ns, 73 | pgm_imp.ged_os_dummy_3 AS ged_dummy_os 74 | FROM staging.priogrid_month AS pgm 75 | LEFT JOIN left_imputation.pgm AS pgm_imp 76 | ON pgm_imp.priogrid_gid = pgm.priogrid_gid AND pgm_imp.month_id = pgm.month_id; 77 | 78 | CREATE TABLE ged.pgm_geoimp_3 79 | AS 80 | SELECT pgm.month_id, 81 | pgm.priogrid_gid AS pg_id, 82 | pgm.ged_best_sb, 83 | pgm.ged_best_ns, 84 | pgm.ged_best_os, 85 | pgm.ged_count_sb, 86 | pgm.ged_count_ns, 87 | pgm.ged_count_os, 88 | pgm_imp.ged_sb_dummy_4 AS ged_dummy_sb, 89 | pgm_imp.ged_ns_dummy_4 AS ged_dummy_ns, 90 | pgm_imp.ged_os_dummy_4 AS 
ged_dummy_os 91 | FROM staging.priogrid_month AS pgm 92 | LEFT JOIN left_imputation.pgm AS pgm_imp 93 | ON pgm_imp.priogrid_gid = pgm.priogrid_gid AND pgm_imp.month_id = pgm.month_id; 94 | 95 | CREATE TABLE ged.pgm_geoimp_4 96 | AS 97 | SELECT pgm.month_id, 98 | pgm.priogrid_gid AS pg_id, 99 | pgm.ged_best_sb, 100 | pgm.ged_best_ns, 101 | pgm.ged_best_os, 102 | pgm.ged_count_sb, 103 | pgm.ged_count_ns, 104 | pgm.ged_count_os, 105 | pgm_imp.ged_sb_dummy_5 AS ged_dummy_sb, 106 | pgm_imp.ged_ns_dummy_5 AS ged_dummy_ns, 107 | pgm_imp.ged_os_dummy_5 AS ged_dummy_os 108 | FROM staging.priogrid_month AS pgm 109 | LEFT JOIN left_imputation.pgm AS pgm_imp 110 | ON pgm_imp.priogrid_gid = pgm.priogrid_gid AND pgm_imp.month_id = pgm.month_id; 111 | -------------------------------------------------------------------------------- /views/database/sources/ged/legacy/__init__.py: -------------------------------------------------------------------------------- 1 | """ Legacy GED """ 2 | __all__ = ["load_ged"] 3 | from .ged import load_ged 4 | -------------------------------------------------------------------------------- /views/database/sources/ged/legacy/prepare_ged.sql: -------------------------------------------------------------------------------- 1 | -- Drop existing attached 2 | DROP TABLE IF EXISTS preflight.ged_attached_full; 3 | DROP TABLE IF EXISTS preflight.ged_attached; 4 | 5 | -- Create preflight.ged_attached 6 | CREATE TABLE preflight.ged_attached AS 7 | ( 8 | WITH month_ged AS 9 | ( 10 | SELECT *, 11 | EXTRACT(MONTH FROM date_start :: DATE) AS month_start, 12 | EXTRACT(MONTH FROM date_end :: DATE) AS month_end 13 | FROM dataprep.ged 14 | ), 15 | month_ged_start AS 16 | ( 17 | SELECT month_ged.*, 18 | staging.month.id AS month_id_start 19 | FROM month_ged, 20 | staging.month 21 | WHERE (month_ged.year :: INT = staging.month.year_id AND 22 | month_ged.month_start = staging.month.month) 23 | ), 24 | month_ged_full AS 25 | ( 26 | SELECT month_ged_start.*, 27 | staging.month.id AS month_id_end 28 | FROM month_ged_start, 29 | staging.month 30 | WHERE (month_ged_start.year :: INT = staging.month.year_id AND 31 | month_ged_start.month_end = staging.month.month) 32 | ) 33 | SELECT * 34 | FROM month_ged_full 35 | ); 36 | 37 | -- Add ids 38 | ALTER TABLE preflight.ged_attached ADD PRIMARY KEY (id); 39 | ALTER TABLE preflight.ged_attached ADD COLUMN country_month_id_end bigint; 40 | ALTER TABLE preflight.ged_attached ADD COLUMN country_month_id_start bigint; 41 | ALTER TABLE preflight.ged_attached DROP COLUMN IF EXISTS geom; 42 | ALTER TABLE preflight.ged_attached ADD COLUMN geom geometry(point, 4326); 43 | UPDATE preflight.ged_attached 44 | SET geom=st_setsrid(st_geometryfromtext(geom_wkt), 4326) 45 | WHERE geom_wkt <> ''; 46 | 47 | -- Create preflight.ged_attached_full 48 | CREATE TABLE preflight.ged_attached_full AS SELECT * FROM preflight.ged_attached; 49 | 50 | 51 | DELETE FROM preflight.ged_attached WHERE where_prec IN (4,6,7); 52 | ALTER TABLE preflight.ged_attached_full ADD PRIMARY KEY (id); 53 | CREATE INDEX ged_attached_gidx ON preflight.ged_attached USING GIST(geom); 54 | CREATE INDEX ged_attached_idx ON preflight.ged_attached (priogrid_gid,month_id_end, type_of_violence); 55 | CREATE INDEX ged_attached_s_idx ON preflight.ged_attached (priogrid_gid,month_id_start, type_of_violence); 56 | CREATE INDEX ged_attached_full_gidx ON preflight.ged_attached_full USING GIST(geom); 57 | CREATE INDEX ged_attached_fullx_s_idx ON preflight.ged_attached_full (priogrid_gid,month_id_end, type_of_violence); 58 
| CREATE INDEX ged_attached_fullx_gidx ON preflight.ged_attached_full (priogrid_gid,month_id_start, type_of_violence); 59 | 60 | 61 | -- Update preflight.ged_attached_full 62 | WITH a AS 63 | (SELECT cm.*, c.gwcode 64 | FROM staging.country_month cm 65 | LEFT JOIN 66 | staging.country c ON (cm.country_id = c.id)) 67 | UPDATE preflight.ged_attached_full 68 | SET country_month_id_end=a.id 69 | FROM a 70 | WHERE (a.gwcode = ged_attached_full.country_id AND a.month_id = ged_attached_full.month_id_end); 71 | WITH a AS 72 | (SELECT cm.*, c.gwcode 73 | FROM staging.country_month cm 74 | LEFT JOIN 75 | staging.country c ON (cm.country_id = c.id)) 76 | UPDATE preflight.ged_attached_full 77 | SET country_month_id_start=a.id 78 | FROM a 79 | WHERE (a.gwcode = ged_attached_full.country_id AND a.month_id = ged_attached_full.month_id_start); -------------------------------------------------------------------------------- /views/database/sources/icgcw/__init__.py: -------------------------------------------------------------------------------- 1 | """ International Crisis Group - Crisis Watch Package """ 2 | __all__ = ["fetch_icgcw", "load_icgcw"] 3 | from .fetch import fetch_icgcw 4 | from .icgcw import load_icgcw 5 | -------------------------------------------------------------------------------- /views/database/sources/icgcw/fetch.py: -------------------------------------------------------------------------------- 1 | """Scrapes all ICG CrisisWatch to file """ 2 | 3 | # pylint: disable=too-many-arguments 4 | 5 | import os 6 | import tempfile 7 | import logging 8 | import datetime 9 | 10 | import requests # type: ignore 11 | from bs4 import BeautifulSoup # type: ignore 12 | 13 | from views.utils import io 14 | from views.database import common 15 | 16 | log = logging.getLogger(__name__) 17 | 18 | 19 | def check_if_more_pages(path_html): 20 | """ True if more pages to fetch indicated in path_html """ 21 | 22 | log.debug(f"Checking if more pages in {path_html}") 23 | 24 | with open(path_html, "r", encoding="utf-8") as f: 25 | soup = BeautifulSoup(f.read(), "html.parser") 26 | search = { 27 | "class": "c-crisiswatch-entry [ o-container o-container--m u-pr ]" 28 | } 29 | matches = soup.find_all("div", search) 30 | 31 | # matches is a list, if len is zero it evaluates to False 32 | if matches: 33 | more_pages = True 34 | log.debug("Found more pages.") 35 | else: 36 | more_pages = False 37 | log.debug("No more pages.") 38 | 39 | return more_pages 40 | 41 | 42 | def fetch_page_content(url, page, from_year, from_month, to_year, to_month): 43 | """ Fetch page contents """ 44 | params = { 45 | "date_range": "custom", 46 | "page": page, 47 | "from_year": from_year, 48 | "from_month": from_month, 49 | "to_year": to_year, 50 | "to_month": to_month, 51 | } 52 | headers = {"User-Agent": "Mozilla/5.0"} # Header because 504 otherwise 53 | req = requests.get(url=url, params=params, timeout=60, headers=headers) 54 | log.debug(f"GET {req.url}") 55 | log.debug(f"Status code: {req.status_code}") 56 | content = req.content 57 | 58 | return content 59 | 60 | 61 | def fetch_page_to_file(url, path_dir, page, y_start, m_start, y_end, m_end): 62 | """ Fetch page at url with time params to file in path_dir """ 63 | 64 | # Pad with some zeros 65 | m_start = str(m_start).zfill(2) 66 | m_end = str(m_end).zfill(2) 67 | 68 | content = fetch_page_content(url, page, y_start, m_start, y_end, m_end) 69 | 70 | fname = f"{y_start}.{m_start}_{y_end}.{m_end}_p{str(page).zfill(4)}.html" 71 | path = os.path.join(path_dir, fname) 72 | with 
open(path, "wb") as f: 73 | f.write(content) 74 | log.info(f"Wrote {path}") 75 | 76 | return path 77 | 78 | 79 | def fetch_pages(url, path_dir, y_start=2004, m_start=1): 80 | """ Fetch pages from y_start-m_start until today to path_dir """ 81 | y_end = datetime.date.today().year 82 | m_end = datetime.date.today().month 83 | 84 | paths = [] 85 | more_pages = True 86 | page = 0 87 | while more_pages: 88 | log.debug(f"Page: {page}") 89 | path = fetch_page_to_file( 90 | url, path_dir, page, y_start, m_start, y_end, m_end 91 | ) 92 | paths.append(path) 93 | more_pages = check_if_more_pages(path_html=path) 94 | page = page + 1 95 | 96 | return paths 97 | 98 | 99 | def fetch_icgcw(): 100 | """ Fetch icgcw to fetch library """ 101 | with tempfile.TemporaryDirectory() as tempdir: 102 | paths = fetch_pages( 103 | url="https://www.crisisgroup.org/crisiswatch/database", 104 | path_dir=tempdir, 105 | ) 106 | io.make_tarfile( 107 | paths_members=paths, path_tar=common.get_path_tar(name="icgcw") 108 | ) 109 | 110 | 111 | if __name__ == "__main__": 112 | fetch_icgcw() 113 | -------------------------------------------------------------------------------- /views/database/sources/icgcw/spec.yaml: -------------------------------------------------------------------------------- 1 | cname_fixes: 2 | - old: "Israel/Palestine" 3 | new: "Israel" 4 | - old: "China (internal)" 5 | new: "China" 6 | - old: "Western Sahara" 7 | new: "Morocco" 8 | - old: "Democratic Republic of Congo" 9 | new: "Congo, DRC" 10 | - old: "Somaliland" 11 | new: "Somalia" 12 | - old: "C\u00f4te d\u2019Ivoire" 13 | new: "Cote d'Ivoire" 14 | - old: "India (non-Kashmir)" 15 | new: "India" 16 | - old: "Russia/North Caucasus" 17 | new: "Russia" 18 | - old: "Bosnia And Herzegovina" 19 | new: "Bosnia and Herzegovina" 20 | - old: "Nagorno-Karabakh (Azerbaijan)" 21 | new: "Azerbaijan" 22 | - old: "Chechnya (Russia)" 23 | new: "Russia" 24 | - old: "Basque Country (Spain)" 25 | new: "Spain" 26 | - old: "Corsica" 27 | new: "France" 28 | - old: "Northern Ireland (UK)" 29 | new: "United Kingdom" 30 | - old: "Comoros Islands" 31 | new: "Comoros" 32 | - old: "Taiwan Strait" 33 | new: "China" 34 | - old: "Timor-Leste" 35 | new: "Timor Leste" 36 | - old: "Republic of Congo" 37 | new: "Congo" 38 | - old: "Solomon Islands" 39 | new: "Solomon Is." 40 | - old: "Gambia" 41 | new: "The Gambia" 42 | - old: "Abkhazia (Georgia)" 43 | new: "Georgia" 44 | - old: "UK" 45 | new: "United Kingdom" 46 | - old: "North Macedonia" 47 | new: "Macedonia" 48 | - old: "Central Africa" 49 | new: "Central African Republic" 50 | - old: "U.S." 
51 | new: "United States" 52 | - old: "Kashmir" 53 | new: India/Pakistan 54 | - old: "Korean Peninsula" 55 | new: "North Korea/South Korea" 56 | - old: "Northern Territories (Russia" 57 | new: Russia 58 | - old: "Japan)" 59 | new: "Japan" 60 | drops: 61 | - "South China Sea" 62 | - "Gulf and Arabian Peninsula" 63 | - "Kuril Islands" 64 | cols_data: 65 | - alerts 66 | - opportunities 67 | - deteriorated 68 | - improved 69 | - unobserved 70 | -------------------------------------------------------------------------------- /views/database/sources/pgdata/__init__.py: -------------------------------------------------------------------------------- 1 | """ Priogrid Data Package """ 2 | from .fetch import fetch_pgdata 3 | from .pgdata import load_pgdata 4 | 5 | __all__ = ["fetch_pgdata", "load_pgdata"] 6 | -------------------------------------------------------------------------------- /views/database/sources/pgdata/fetch.py: -------------------------------------------------------------------------------- 1 | """ Fetch priogrid data from their API """ 2 | 3 | from typing import Any, Dict, List 4 | import os 5 | import tempfile 6 | import json 7 | import logging 8 | import time 9 | import random 10 | import multiprocessing as mp 11 | 12 | import requests 13 | 14 | from views.utils import io 15 | from views.database import common 16 | 17 | log = logging.getLogger(__name__) 18 | 19 | URL_BASE = "https://grid.prio.org/api" 20 | 21 | 22 | def fetch_variable( 23 | varinfo: Dict[Any, Any], dir_destination: str, try_number: int = 1 24 | ) -> str: 25 | """ Fetch a single variable from API """ 26 | 27 | url = varinfo["url"] 28 | params = varinfo["payload"] 29 | log.debug(f"Fetching {url} with params {params} try_number {try_number}") 30 | 31 | try: 32 | data = requests.get(url=url, params=params).json() 33 | except json.decoder.JSONDecodeError: 34 | time.sleep(2 ** try_number + random.random() * 0.01) 35 | data = fetch_variable( 36 | varinfo, dir_destination, try_number=try_number + 1 37 | ) 38 | 39 | path = os.path.join(dir_destination, f"{varinfo['name']}.json") 40 | io.dict_to_json(data, path) 41 | return path 42 | 43 | 44 | def fetch_data( 45 | varinfos: List[Dict[Any, Any]], dir_destination: str 46 | ) -> List[str]: 47 | """ Fetch all the data to dir_destination""" 48 | 49 | with mp.Pool(processes=mp.cpu_count()) as pool: 50 | results = [] 51 | for varinfo in varinfos: 52 | results.append( 53 | pool.apply_async(fetch_variable, (varinfo, dir_destination,)) 54 | ) 55 | paths = [result.get() for result in results] 56 | 57 | return paths 58 | 59 | 60 | def fetch_varinfos() -> List[Dict[Any, Any]]: 61 | """ Update varinfo dictionaries with API endpoint URLs """ 62 | 63 | varinfos = requests.get(f"{URL_BASE}/variables").json() 64 | varinfos = varinfos.copy() 65 | for varinfo in varinfos: 66 | url = f"{URL_BASE}/data/{varinfo['id']}" 67 | if varinfo["type"] == "yearly": 68 | payload = {k: varinfo[k] for k in ("startYear", "endYear")} 69 | elif varinfo["type"] == "static": 70 | payload = {} 71 | 72 | varinfo.update({"url": url, "payload": payload}) 73 | 74 | return varinfos 75 | 76 | 77 | def fetch_pgdata() -> None: 78 | """ Fetch priogrid data from API """ 79 | 80 | path_tar = common.get_path_tar(name="pgdata") 81 | 82 | log.info("Started fetching pgdata") 83 | 84 | grid = requests.get(f"{URL_BASE}/data/basegrid").json() 85 | varinfos = fetch_varinfos() 86 | 87 | with tempfile.TemporaryDirectory() as tempdir: 88 | 89 | path_grid = os.path.join(tempdir, "basegrid.json") 90 | path_varinfos = 
os.path.join(tempdir, "varinfos.json") 91 | io.dict_to_json(data=grid, path=path_grid) 92 | io.dict_to_json(data=varinfos, path=path_varinfos) 93 | paths_data = fetch_data(varinfos=varinfos, dir_destination=tempdir) 94 | 95 | paths_all = paths_data + [path_varinfos] + [path_grid] 96 | 97 | io.make_tarfile(path_tar=path_tar, paths_members=paths_all) 98 | 99 | log.info("Finished fetching pgdata") 100 | -------------------------------------------------------------------------------- /views/database/sources/pgdata/spec.yaml: -------------------------------------------------------------------------------- 1 | # There's a core var called gid, but they're already indexed by gid 2 | # So we drop this one to avoid duplicates. 3 | excludes_core: 4 | - gid 5 | 6 | # _y and _s columns have nulls for no grids without data, fill with zero 7 | nulls_to_zero: 8 | - diamprim_y 9 | - diamsec_y 10 | - drug_y 11 | - gem_y 12 | - goldplacer_y 13 | - goldsurface_y 14 | - goldvein_y 15 | - petroleum_y 16 | - diamprim_s 17 | - diamsec_s 18 | - gem_s 19 | - goldplacer_s 20 | - goldsurface_s 21 | - goldvein_s 22 | - petroleum_s 23 | 24 | # Yearly data often stops early, fill it forward instead of zeroing 25 | # This ffilling happens before zeroing the nulls. 26 | cols_ffill: 27 | - diamprim_y 28 | - diamsec_y 29 | - drug_y 30 | - gem_y 31 | - goldplacer_y 32 | - goldsurface_y 33 | - goldvein_y 34 | - petroleum_y 35 | 36 | prefix: pgd 37 | public_tables: 38 | pgy: pgdata.pgy 39 | cols_data: 40 | - agri_gc 41 | - agri_ih 42 | - aquaveg_gc 43 | - barren_gc 44 | - barren_ih 45 | - bdist3 46 | - capdist 47 | - cmr_mean 48 | - diamprim # combined as max(_s, _y) 49 | - diamsec # combined as max(_s, _y) 50 | - drug_y 51 | - excluded 52 | - forest_gc 53 | - forest_ih 54 | - gcp_mer 55 | - gem # combined as max(_s, _y) 56 | - goldplacer # combined as max(_s, _y) 57 | - goldsurface # combined as max(_s, _y) 58 | - goldvein # combined as max(_s, _y) 59 | - grass_ih 60 | - gwarea 61 | - harvarea 62 | - herb_gc 63 | - imr_mean 64 | - irrig_sum 65 | - landarea 66 | - maincrop 67 | - mountains_mean 68 | - nlights_calib_mean 69 | - pasture_ih 70 | - petroleum # combined as max(_s, _y) 71 | - pop_gpw_sum 72 | - savanna_ih 73 | - shrub_gc 74 | - shrub_ih 75 | - temp 76 | - ttime_mean 77 | - urban_gc 78 | - urban_ih 79 | - water_gc 80 | - water_ih 81 | # Following cols not used 82 | # - bdist1 83 | # - bdist2 84 | # - cmr_max 85 | # - cmr_min 86 | # - cmr_sd 87 | # - droughtcrop_speibase 88 | # - droughtcrop_speigdm 89 | # - droughtcrop_spi 90 | # - droughtend_speibase 91 | # - droughtend_speigdm 92 | # - droughtend_spi 93 | # - droughtstart_speibase 94 | # - droughtstart_speigdm 95 | # - droughtstart_spi 96 | # - droughtyr_speibase 97 | # - droughtyr_speigdm 98 | # - droughtyr_spi 99 | # - gcp_ppp 100 | # - gcp_qual 101 | # - growend 102 | # - growstart 103 | # - gwno 104 | # - imr_max 105 | # - imr_min 106 | # - imr_sd 107 | # - irrig_max 108 | # - irrig_min 109 | # - irrig_sd 110 | # - nlights_max 111 | # - nlights_mean 112 | # - nlights_min 113 | # - nlights_sd 114 | # - pop_gpw_max 115 | # - pop_gpw_min 116 | # - pop_gpw_sd 117 | # - pop_hyd_max 118 | # - pop_hyd_min 119 | # - pop_hyd_sd 120 | # - pop_hyd_sum 121 | # - prec_gpcc 122 | # - prec_gpcp 123 | # - rainseas 124 | # - ttime_max 125 | # - ttime_min 126 | # - ttime_sd 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /views/database/sources/reign/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ Reign package """ 2 | __all__ = ["fetch_reign", "load_reign"] 3 | from .reign import fetch_reign, load_reign 4 | -------------------------------------------------------------------------------- /views/database/sources/reign/reign.py: -------------------------------------------------------------------------------- 1 | """ Reign """ 2 | import os 3 | import tempfile 4 | import logging 5 | from typing import Any, Dict 6 | import requests 7 | import pandas as pd # type: ignore 8 | import bs4 # type: ignore 9 | 10 | from views.apps.data import missing 11 | from views.database import common 12 | from views.utils import io, db 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | 17 | def fetch_reign() -> None: 18 | """ Fetch REIGN data """ 19 | 20 | def get_latest_data_url(url_report) -> str: 21 | html_doc = requests.get(url_report).content 22 | soup = bs4.BeautifulSoup(html_doc, "html5lib") 23 | container = soup.find("div", {"class": "post-container"}) 24 | url_data = container.find("a", href=True)["href"] 25 | log.debug(f"url_data: {url_data}") 26 | 27 | if not url_data.endswith(".csv"): 28 | raise RuntimeError(f"Reign link doesn't look like .csv {url_data}") 29 | 30 | return url_data 31 | 32 | log.debug("Started fetching reign") 33 | url_base = "https://oefdatascience.github.io/REIGN.github.io" 34 | url_report = f"{url_base}/menu/reign_current.html" 35 | url = get_latest_data_url(url_report=url_report) 36 | common.fetch_source_simply(name="reign", url=url) 37 | log.debug("Finished fetching reign") 38 | 39 | 40 | def fix_ccodes(df: pd.DataFrame, spec: Dict[str, Any]) -> pd.DataFrame: 41 | """ Fix country codes as defined by spec ccode_replaces """ 42 | log.debug("Fixing ccodes") 43 | 44 | fixes = spec["ccode_replaces"] 45 | for fix_name, values in fixes.items(): 46 | old = values["old"] 47 | new = values["new"] 48 | df.loc[df.ccode == old, "ccode"] = new 49 | log.debug(f"Replaced ccode {old} with {new} for {fix_name}") 50 | 51 | log.debug("Dropping duplicate country-months for leadership changes.") 52 | dropdup_cols = ["ccode", "year", "month"] 53 | # Some messages are too big even for debug... 
54 | # msg = df[df.duplicated(subset=dropdup_cols, keep=False)].to_string() 55 | # log.debug(msg) 56 | df = df.sort_values("tenure_months") 57 | len_df_predrop = len(df) 58 | df = df.drop_duplicates(subset=dropdup_cols, keep="first") 59 | len_df_postdrop = len(df) 60 | 61 | log.debug(f"Dropped {len_df_predrop - len_df_postdrop} duplicate obs") 62 | 63 | return df 64 | 65 | 66 | def encode_govt_dummies(df: pd.DataFrame) -> pd.DataFrame: 67 | """ Encode government dummies """ 68 | log.debug("Encoding reign government dummies") 69 | 70 | def cleanup_govtype_name(name): 71 | """ Remove " ", "-", "/" from government type strings """ 72 | name = name.lower() 73 | name = name.replace(" ", "_").replace("-", "_").replace("/", "_") 74 | name = name.replace("__", "_").replace("__", "_") 75 | return name 76 | 77 | df["government"] = df["government"].apply(cleanup_govtype_name) 78 | df_gov = pd.get_dummies(df["government"], prefix="gov") 79 | log.debug(f"Adding dummy cols {list(df_gov.columns)}") 80 | df = df.join(df_gov) 81 | return df 82 | 83 | 84 | def load_reign() -> None: 85 | """ Load reign """ 86 | log.info("Started loading reign.") 87 | 88 | spec = io.load_yaml(os.path.join(os.path.dirname(__file__), "spec.yaml")) 89 | with tempfile.TemporaryDirectory() as tempdir: 90 | paths = common.get_files_latest_fetch(name="reign", tempdir=tempdir) 91 | path_csv = [path for path in paths if path.endswith(".csv")].pop() 92 | df = io.csv_to_df(path=path_csv) 93 | 94 | df = fix_ccodes(df, spec) 95 | df = encode_govt_dummies(df) 96 | 97 | df = df.set_index(["year", "month", "ccode"]) 98 | df = df.join( 99 | db.query_to_df( 100 | query=""" 101 | SELECT id AS country_id, gwcode AS ccode 102 | FROM staging.country WHERE gweyear=2016; 103 | """ 104 | ).set_index(["ccode"]) 105 | ) 106 | df = df.join( 107 | db.query_to_df( 108 | query=""" 109 | SELECT id AS month_id, year_id AS year, month FROM staging.month; 110 | """ 111 | ).set_index(["year", "month"]) 112 | ) 113 | df = df.reset_index().set_index(["month_id", "country_id"]) 114 | df = df.drop( 115 | columns=["year", "month", "ccode", "country", "government", "leader"] 116 | ) 117 | 118 | df_skeleton = db.db_to_df( 119 | fqtable="skeleton.cm_global", 120 | cols=["month_id", "country_id"], 121 | ids=["month_id", "country_id"], 122 | ) 123 | len_skel = len(df_skeleton) 124 | df = df_skeleton.join(df, how="left") 125 | if not len(df) == len_skel: 126 | raise RuntimeError(f"Join not correct, {len_skel} != {len(df)}") 127 | 128 | df = df.add_prefix("reign_") 129 | 130 | db.drop_schema("reign_v2") 131 | db.create_schema("reign_v2") 132 | db.df_to_db(df=df, fqtable="reign_v2.cm_unimp") 133 | 134 | db.df_to_db( 135 | df=missing.fill_groups_with_time_means(missing.extrapolate(df)), 136 | fqtable="reign_v2.cm_extrapolated", 137 | ) 138 | 139 | log.info("Finished loading reign.") 140 | -------------------------------------------------------------------------------- /views/database/sources/reign/spec.yaml: -------------------------------------------------------------------------------- 1 | ccode_replaces: 2 | germany: 3 | old: 255 4 | new: 260 5 | yemen: 6 | old: 679 7 | new: 678 8 | nauru: 9 | old: 970 10 | new: 971 11 | kiribati: 12 | old: 946 13 | new: 970 14 | tuvalu: 15 | old: 947 16 | new: 973 17 | tonga: 18 | old: 955 19 | new: 972 20 | serbia: 21 | old: 345 22 | new: 340 -------------------------------------------------------------------------------- /views/database/sources/spei/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ SPEI package """ 2 | __all__ = ["fetch_spei", "load_spei"] 3 | from .spei import fetch_spei, load_spei 4 | -------------------------------------------------------------------------------- /views/database/sources/spei/cleanup.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE spei_v2.spei_1; 2 | DROP TABLE spei_v2.spei_2; 3 | DROP TABLE spei_v2.spei_3; 4 | DROP TABLE spei_v2.spei_4; 5 | DROP TABLE spei_v2.spei_5; 6 | DROP TABLE spei_v2.spei_6; 7 | DROP TABLE spei_v2.spei_7; 8 | DROP TABLE spei_v2.spei_8; 9 | DROP TABLE spei_v2.spei_9; 10 | DROP TABLE spei_v2.spei_10; 11 | DROP TABLE spei_v2.spei_11; 12 | DROP TABLE spei_v2.spei_12; 13 | DROP TABLE spei_v2.spei_13; 14 | DROP TABLE spei_v2.spei_14; 15 | DROP TABLE spei_v2.spei_15; 16 | DROP TABLE spei_v2.spei_16; 17 | DROP TABLE spei_v2.spei_17; 18 | DROP TABLE spei_v2.spei_18; 19 | DROP TABLE spei_v2.spei_19; 20 | DROP TABLE spei_v2.spei_20; 21 | DROP TABLE spei_v2.spei_21; 22 | DROP TABLE spei_v2.spei_22; 23 | DROP TABLE spei_v2.spei_23; 24 | DROP TABLE spei_v2.spei_24; 25 | DROP TABLE spei_v2.spei_25; 26 | DROP TABLE spei_v2.spei_26; 27 | DROP TABLE spei_v2.spei_27; 28 | DROP TABLE spei_v2.spei_28; 29 | DROP TABLE spei_v2.spei_29; 30 | DROP TABLE spei_v2.spei_30; 31 | DROP TABLE spei_v2.spei_31; 32 | DROP TABLE spei_v2.spei_32; 33 | DROP TABLE spei_v2.spei_33; 34 | DROP TABLE spei_v2.spei_34; 35 | DROP TABLE spei_v2.spei_35; 36 | DROP TABLE spei_v2.spei_36; 37 | DROP TABLE spei_v2.spei_37; 38 | DROP TABLE spei_v2.spei_38; 39 | DROP TABLE spei_v2.spei_39; 40 | DROP TABLE spei_v2.spei_40; 41 | DROP TABLE spei_v2.spei_41; 42 | DROP TABLE spei_v2.spei_42; 43 | DROP TABLE spei_v2.spei_43; 44 | DROP TABLE spei_v2.spei_44; 45 | DROP TABLE spei_v2.spei_45; 46 | DROP TABLE spei_v2.spei_46; 47 | DROP TABLE spei_v2.spei_47; 48 | DROP TABLE spei_v2.spei_48; 49 | DROP TABLE spei_v2.pg_ug; 50 | -------------------------------------------------------------------------------- /views/database/sources/spei/pg_ug.sql: -------------------------------------------------------------------------------- 1 | -- Create a grid similar to priogrid but with 1x1 degree resolution 2 | -- Priogrid is 0.5x0.5 degree resolution. 
3 | -- SPEI comes in at 1x1 resolution so we use this to map SPEI to pg_ids 4 | 5 | CREATE OR REPLACE FUNCTION 6 | ST_CreateFishnet( 7 | -- PARAMETERS 8 | nrow integer, ncol integer, 9 | ysize float8, xsize float8, 10 | y0 float8 DEFAULT 0, x0 float8 DEFAULT 0, 11 | srid integer DEFAULT 4326, 12 | OUT "row" integer, OUT col integer, 13 | OUT geom geometry) 14 | -- RETURNS 15 | RETURNS SETOF record AS 16 | -- PROCESS 17 | $$ 18 | SELECT i + 1 AS row, j + 1 AS col, ST_SetSRID(ST_Translate(cell, j * $3 + $5, i * $4 + $6), $7) AS geom 19 | FROM generate_series(0, $1 - 1) AS j, 20 | generate_series(0, $2 - 1) AS i, 21 | (SELECT ('POLYGON((0 0, 0 '||$4||', '||$3||' '||$4||', '||$3||' 0,0 0))')::geometry AS cell) AS foo; 22 | $$ LANGUAGE sql IMMUTABLE STRICT; 23 | 24 | 25 | -- Create global 1x1 grid 26 | DROP TABLE IF EXISTS spei_v2.unigrid_world; 27 | CREATE TABLE spei_v2.unigrid_world ( 28 | gid serial NOT NULL, 29 | "row" integer, 30 | col integer, 31 | cell geometry(Polygon, 4326), 32 | CONSTRAINT unigrid_pkey PRIMARY KEY (gid)); 33 | INSERT INTO spei_v2.unigrid_world ("row", col, cell) SELECT * FROM ST_CreateFishnet(360, 180, 1.0, 1.0, -180, -90, 4326) AS cells; 34 | CREATE INDEX ON spei_v2.unigrid_world USING GIST (cell); 35 | 36 | 37 | -- Create table of pg_ids to ug_ids 38 | DROP TABLE IF EXISTS spei_v2.pg_ug; 39 | CREATE TABLE spei_v2.pg_ug AS 40 | SELECT pg.gid AS pg_id, 41 | ug.gid AS ug_id 42 | FROM staging.priogrid AS pg, 43 | spei_v2.unigrid_world as ug 44 | -- Returns true if no point in pg.geom is outside of ug.cell, otherwise false. 45 | -- ug.cells cover pg.geometries 46 | WHERE ST_Covers(ug.cell, pg.geom); 47 | 48 | DROP TABLE spei_v2.unigrid_world; -------------------------------------------------------------------------------- /views/database/sources/vdem/__init__.py: -------------------------------------------------------------------------------- 1 | """ VDEM package """ 2 | __all__ = ["fetch_vdem", "load_vdem"] 3 | from .vdem import fetch_vdem, load_vdem 4 | -------------------------------------------------------------------------------- /views/database/sources/wdi/__init__.py: -------------------------------------------------------------------------------- 1 | """ WDI package """ 2 | __all__ = ["fetch_wdi", "load_wdi"] 3 | from .wdi import fetch_wdi, load_wdi 4 | -------------------------------------------------------------------------------- /views/specs/README.md: -------------------------------------------------------------------------------- 1 | # specs 2 | 3 | ViEWS has many definitions that should be the same throughout the project: 4 | 5 | * Models: which features go into which models? 6 | * Periods: what are the time limits for training, calibrating and predicting? 7 | 8 | This module provides a system of specfiles to use as references throughout. -------------------------------------------------------------------------------- /views/specs/__init__.py: -------------------------------------------------------------------------------- 1 | """ All production specs should be placed and accessed through here """ 2 | __all__ = ["data", "models", "periods"] 3 | from .
import data, models, periods 4 | -------------------------------------------------------------------------------- /views/specs/data/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UppsalaConflictDataProgram/OpenViEWS2/7eb3e63c8c046de31f70cd56f417fadf03686f5a/views/specs/data/README.md -------------------------------------------------------------------------------- /views/specs/data/__init__.py: -------------------------------------------------------------------------------- 1 | """ Specification of datasets and transformations """ 2 | from typing import Any, Dict, Union 3 | 4 | from views.apps.data import api 5 | 6 | from . import parsed_datasets 7 | 8 | 9 | def build_geometries() -> Dict[str, Any]: 10 | """ Just expose our custom geometries as dict to be consistent """ 11 | geometries = { 12 | "GeomPriogrid": api.GeomPriogrid(), 13 | "GeomCountry": api.GeomCountry(), 14 | } 15 | return geometries 16 | 17 | 18 | GEOMETRIES: Dict[ 19 | str, Union[api.GeomPriogrid, api.GeomCountry] 20 | ] = build_geometries() 21 | TABLES: Dict[str, api.Table] = parsed_datasets.build_tables() 22 | DATASETS: Dict[str, api.Dataset] = parsed_datasets.build_datasets() 23 | -------------------------------------------------------------------------------- /views/specs/data/parsed_datasets.py: -------------------------------------------------------------------------------- 1 | """ Parsers for Table and Dataset Dicts from spec.yaml """ 2 | from typing import Dict 3 | import os 4 | from views.utils import io 5 | from views.apps.data.api import Dataset, Table 6 | from . import solver 7 | 8 | 9 | def build_tables() -> Dict[str, Table]: 10 | """ Build Table objects from spec.yaml in this dir """ 11 | specs = io.load_yaml(os.path.join(os.path.dirname(__file__), "spec.yaml")) 12 | # Build tables dict 13 | tables: Dict[str, Table] = dict() 14 | for fqtable, spec in specs["tables"].items(): 15 | tables[fqtable] = Table(fqtable=fqtable, ids=spec["ids"]) 16 | 17 | return tables 18 | 19 | 20 | def build_datasets() -> Dict[str, Dataset]: 21 | """ Build Datasets from spec.yaml in this dir """ 22 | specs = io.load_yaml(os.path.join(os.path.dirname(__file__), "spec.yaml")) 23 | tables: Dict[str, Table] = build_tables() 24 | 25 | # Build transformsets dict 26 | datasets: Dict[str, Dataset] = dict() 27 | for name, spec in specs["datasets"].items(): 28 | dataset = Dataset( 29 | name=name, 30 | ids=spec["ids"], 31 | table_skeleton=tables[spec["table_skeleton"]], 32 | tables=[tables[table] for table in spec["tables"]], 33 | loa=spec["loa"], 34 | cols=spec["cols"] if "cols" in spec.keys() else None, 35 | transforms=solver.make_transforms_ordered(spec["transforms"]), 36 | balance=spec["balance"], 37 | ) 38 | datasets[name] = dataset 39 | 40 | return datasets 41 | -------------------------------------------------------------------------------- /views/specs/data/solver.py: -------------------------------------------------------------------------------- 1 | """ Specification solver for transformations 2 | 3 | A user can specify a set of transformations as a dictionary. 4 | make_transforms_ordered() returns a dependency ordered list of 5 | the corresponding Transform() instances. 
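A minimal sketch of such a spec (illustrative only: the exact keyword
arguments accepted by api.Transform are defined in views.apps.data.api;
this module only relies on each spec carrying cols_input):

    specs = {
        "ln_ged_best_sb": {"f": "ln", "cols_input": ["ged_best_sb"]},
        "delta_ln_ged_best_sb": {"f": "delta", "cols_input": ["ln_ged_best_sb"]},
    }
    transforms = make_transforms_ordered(specs)

Because delta_ln_ged_best_sb lists ln_ged_best_sb in its cols_input, the
solver places ln_ged_best_sb first in the returned list.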
6 | """ 7 | from typing import Any, Dict, List 8 | 9 | from views.apps.data import api 10 | 11 | 12 | def _get_cols_source(transforms: List[api.Transform]) -> List[str]: 13 | """ Get a list of source columns needed for a list of Tranforms """ 14 | 15 | all_names = [transform.name for transform in transforms] 16 | all_cols = [] 17 | for transform in transforms: 18 | for col in transform.cols_input: 19 | all_cols.append(col) 20 | 21 | # Dedup 22 | all_cols = sorted(list(set(all_cols))) 23 | cols_source = [col for col in all_cols if col not in all_names] 24 | cols_source = sorted(cols_source) 25 | 26 | return cols_source 27 | 28 | 29 | def _order_transforms(transforms: List[api.Transform]) -> List[api.Transform]: 30 | """ Order transformations so they are done in dependency order """ 31 | 32 | def names(tasks): 33 | return [task.name for task in tasks] 34 | 35 | ordered: List[api.Transform] = list() 36 | while transforms: 37 | progress = False 38 | for task in transforms: 39 | # if task has deps in the other transforms that haven't 40 | # been solved themselves wait 41 | if any( 42 | [ 43 | col in names(transforms) and col not in names(ordered) 44 | for col in task.cols_input 45 | ] 46 | ): 47 | pass 48 | else: 49 | ordered.append(task) 50 | transforms.remove(task) 51 | progress = True 52 | if not progress: 53 | raise RuntimeError( 54 | "No progress, transform spec broken." 55 | f"Ordered (OK): {ordered}" 56 | f"Remaining: {transforms}" 57 | ) 58 | 59 | return ordered 60 | 61 | 62 | def make_transforms_ordered( 63 | specs: Dict[str, Dict[str, Any]] 64 | ) -> List[api.Transform]: 65 | """ Make dependency ordered list of Transform objects """ 66 | transforms = [api.Transform(name, **spec) for name, spec in specs.items()] 67 | transforms = _order_transforms(transforms) 68 | return transforms 69 | -------------------------------------------------------------------------------- /views/specs/models/README.md: -------------------------------------------------------------------------------- 1 | # Model specs 2 | 3 | ViEWS has a lot of big models with very many features. 4 | Keeping track of which column goes where can be very difficult. 5 | This module provides three master spec files, am.yaml, cm.yaml and pgm.yaml 6 | to attempt to keep track of them. 7 | 8 | The idea is to group columns into a hierarchy of 9 | * colsets, that list plain columns 10 | * themes, that groups colsets and other themes 11 | * formulas, that resolve a list of columns from the above 12 | 13 | Colsets, or column sets, are simply lists of columns with a name. 14 | Themes are made of colsets or by combining themes and colsets. 15 | Finally, formulas map all columns from a theme or colset to an outcome column. 16 | By applying the solver to these spec files we get solved formulas. 17 | They have a name, a col_outcome and a list of cols_features, which is found by recursively looking them up through themes and colsets. 18 | For a minimal example see tests/test_specs.py 19 | 20 | -------------------------------------------------------------------------------- /views/specs/models/__init__.py: -------------------------------------------------------------------------------- 1 | """ Defines which columns go into which models """ 2 | from typing import Dict, Any 3 | import os 4 | from views.utils import io 5 | from . 
import solver 6 | 7 | _THIS_DIR = os.path.dirname(__file__) 8 | 9 | cm: Dict[Any, Any] = solver.solve_formulas( 10 | io.load_yaml(os.path.join(_THIS_DIR, "cm.yaml")) 11 | ) 12 | pgm: Dict[Any, Any] = solver.solve_formulas( 13 | io.load_yaml(os.path.join(_THIS_DIR, "pgm.yaml")) 14 | ) 15 | 16 | __all__ = ["cm", "pgm"] 17 | -------------------------------------------------------------------------------- /views/specs/models/am.yaml: -------------------------------------------------------------------------------- 1 | colsets: {} 2 | themes: {} 3 | formulas: {} -------------------------------------------------------------------------------- /views/specs/models/solver.py: -------------------------------------------------------------------------------- 1 | """ Model specification solver """ 2 | 3 | import os 4 | from typing import Any, Dict, List, Union 5 | 6 | from views.utils import io 7 | 8 | 9 | def solve_formulas( 10 | spec: Dict[Any, Any] 11 | ) -> Dict[str, Dict[str, Union[List[str], str]]]: 12 | """ Solve the colsets, themes and formulas from spec """ 13 | 14 | def solve_theme( 15 | name_theme: str, colsets: Dict[str, str], themes: Dict[str, str] 16 | ) -> List[str]: 17 | """ Get the resolved list of columns for a single theme """ 18 | 19 | cols_theme = list() 20 | refs = themes[name_theme] 21 | for ref in refs: 22 | 23 | # If the reference is to a colset just get the cols 24 | if ref in colsets.keys(): 25 | for col in colsets[ref]: 26 | if col not in cols_theme: # avoid dups 27 | cols_theme.append(col) 28 | 29 | # Recursive lookup for themes 30 | elif ref in themes.keys(): 31 | for col in solve_theme(ref, colsets, themes): 32 | if col not in cols_theme: # avoid dups 33 | cols_theme.append(col) 34 | 35 | else: 36 | raise RuntimeError( 37 | f"{ref} not found in {colsets.keys()} or {themes.keys()}" 38 | ) 39 | 40 | return sorted(cols_theme) 41 | 42 | def solve_themes_and_colsets(spec: Dict[Any, Any]) -> Dict[str, List[str]]: 43 | """ Solve the themes by looking up names from colsets or themes """ 44 | 45 | solved_themes = dict() 46 | for name_theme in spec["themes"].keys(): 47 | solved_themes[name_theme] = solve_theme( 48 | name_theme, spec["colsets"], spec["themes"] 49 | ) 50 | for name_colset, colset in spec["colsets"].items(): 51 | solved_themes[name_colset] = colset 52 | return solved_themes 53 | 54 | assert list(spec.keys()) == ["colsets", "themes", "formulas"] 55 | 56 | solved_themes: Dict[str, List[str]] = solve_themes_and_colsets(spec) 57 | solved_formulas = dict() 58 | for name_formula, formula in spec["formulas"].items(): 59 | solved_formulas[name_formula] = { 60 | "col_outcome": formula["col_outcome"], 61 | "cols_features": sorted(solved_themes[formula["cols_features"]]), 62 | } 63 | return solved_formulas 64 | 65 | 66 | def solved_cm() -> Dict[str, Dict[str, Union[List[str], str]]]: 67 | """ Get solved CM formulas from cm.yaml """ 68 | spec = io.load_yaml(os.path.join(os.path.dirname(__file__), "cm.yaml")) 69 | formulas = solve_formulas(spec) 70 | return formulas 71 | 72 | 73 | def solved_pgm() -> Dict[str, Dict[str, Union[List[str], str]]]: 74 | """ Get solved PGM formulas from pgm.yaml """ 75 | spec = io.load_yaml(os.path.join(os.path.dirname(__file__), "pgm.yaml")) 76 | formulas = solve_formulas(spec) 77 | return formulas 78 | -------------------------------------------------------------------------------- /views/specs/periods/__init__.py: -------------------------------------------------------------------------------- 1 | """ Defines the training and prediction periods for each model run """ 2 | from
typing import Dict, List 3 | import os 4 | from views.utils import io 5 | from views.apps.model import api 6 | 7 | 8 | def get_periods(run_id: str) -> List[api.Period]: 9 | """ Get periods for a particular run as list """ 10 | _this_dir = os.path.dirname(__file__) 11 | spec = io.load_yaml(os.path.join(_this_dir, "periods.yaml"))["runs"] 12 | 13 | spec_run = spec[run_id] 14 | periods = [] 15 | for period_name, data in spec_run.items(): 16 | period = api.Period( 17 | name=period_name, 18 | train_start=data["train"]["start"], 19 | train_end=data["train"]["end"], 20 | predict_start=data["predict"]["start"], 21 | predict_end=data["predict"]["end"], 22 | ) 23 | periods.append(period) 24 | return periods 25 | 26 | 27 | def get_periods_by_name(run_id: str) -> Dict[str, api.Period]: 28 | """ Get periods for a particular run as name-index dict """ 29 | periods_list = get_periods(run_id) 30 | periods_by_name = dict() 31 | for period in periods_list: 32 | periods_by_name[period.name] = period 33 | 34 | return periods_by_name 35 | 36 | 37 | __all__ = ["get_periods", "get_periods_by_name"] 38 | -------------------------------------------------------------------------------- /views/specs/periods/periods.yaml: -------------------------------------------------------------------------------- 1 | # This file defines the time limits used in ViEWS 2 | # they are organised by each run 3 | # 4 | 5 | runs: 6 | 7 | # First model development run 8 | # All models will be trained here 9 | # When yearly data is released in 2020.05 or 2020.06 this should be 10 | # copied and re-run under a new name so that we are training on 11 | # latest yearly-release data 12 | # We don't want to move C train back to end at 2018.12 now though 13 | # because we have published results trained on data up op 2019.12 14 | # already. 15 | d_2020_04_01: 16 | A: # Calibration period for B 17 | train: 18 | start: 121 # 1990.01 19 | end: 396 # 2012.12 20 | predict: 21 | start: 397 # 2013.01 22 | end: 432 # 2015.12, 23 | B: # Evaluation period. Calibration for C. 24 | train: 25 | start: 121 # 1990.01 26 | end: 432 # 2015.12 27 | predict: 28 | start: 433 # 2016.01, 29 | end: 468 # 2018.12, last month yearly data 30 | C: 31 | train: 32 | start: 121 # 1990.01 33 | end: 480 # 2019.12, last month latest data 34 | predict: 35 | start: 483 # 2020.03 36 | end: 520 # 2023.04 37 | 38 | 39 | d_2020_05_01_prelim: 40 | # A preliminary run for UN Covid19 report 41 | # Includes the A partition so as to not break any compatibility for now 42 | A: # Calibration period for B 43 | train: 44 | start: 121 # 1990.01 45 | end: 396 # 2012.12 46 | predict: 47 | start: 397 # 2013.01 48 | end: 432 # 2015.12, 49 | B: # Evaluation period. Calibration for C. 50 | train: 51 | start: 121 # 1990.01 52 | end: 432 # 2015.12 53 | predict: 54 | start: 433 # 2016.01, 55 | end: 468 # 2018.12, last month yearly data 56 | C: 57 | train: 58 | start: 121 # 1990.01 59 | end: 480 # 2019.12 60 | predict: 61 | start: 484 # 2020.04 62 | end: 521 # 2023.05 63 | 64 | d_2020_06_01_prelim: 65 | # A preliminary run with OSA only, XGB and DS not merged yet =( 66 | # Includes the A partition so as to not break any compatibility for now 67 | A: # Calibration period for B 68 | train: 69 | start: 121 # 1990.01 70 | end: 396 # 2012.12 71 | predict: 72 | start: 397 # 2013.01 73 | end: 432 # 2015.12, 74 | B: # Evaluation period. Calibration for C. 
75 | train: 76 | start: 121 # 1990.01 77 | end: 432 # 2015.12 78 | predict: 79 | start: 433 # 2016.01, 80 | end: 468 # 2018.12, last month yearly data 81 | C: 82 | train: 83 | start: 121 # 1990.01 84 | end: 480 # 2019.12 85 | predict: 86 | start: 485 # 2020.05 87 | end: 522 # 2023.06 88 | 89 | 90 | r_2020_02_01: 91 | B: 92 | train: 93 | start: 121 # 1990.01 94 | end: 432 # 2015.12 95 | predict: 96 | start: 433 # 2016.01, 97 | end: 468 # 2018.12, last month yearly data 98 | C: 99 | train: 100 | start: 121 # 1990.01 101 | end: 480 # 2019.12 102 | predict: 103 | start: 481 # 2020.01 104 | end: 518 # 2023.02 # 38 months of forecast 105 | 106 | r_2020_03_01: 107 | B: 108 | train: 109 | start: 121 # 1990.01 110 | end: 432 # 2015.12 111 | predict: 112 | start: 433 # 2016.01, 113 | end: 468 # 2018.12, last month yearly data 114 | C: 115 | train: 116 | start: 121 # 1990.01 117 | end: 480 # 2019.12 118 | predict: 119 | start: 482 # 2020.02 120 | end: 519 # 2023.03 121 | 122 | r_2020_04_01: 123 | B: 124 | train: 125 | start: 121 # 1990.01 126 | end: 432 # 2015.12 127 | predict: 128 | start: 433 # 2016.01, 129 | end: 468 # 2018.12, last month yearly data 130 | C: 131 | train: 132 | start: 121 # 1990.01 133 | end: 480 # 2019.12 134 | predict: 135 | start: 483 # 2020.03 136 | end: 520 # 2023.04 137 | 138 | r_2020_05_01: 139 | B: 140 | train: 141 | start: 121 # 1990.01 142 | end: 432 # 2015.12 143 | predict: 144 | start: 433 # 2016.01, 145 | end: 468 # 2018.12, last month yearly data 146 | C: 147 | train: 148 | start: 121 # 1990.01 149 | end: 480 # 2019.12 150 | predict: 151 | start: 484 # 2020.04 152 | end: 521 # 2023.05 153 | 154 | r_2020_06_01: 155 | B: 156 | train: 157 | start: 121 # 1990.01 158 | end: 432 # 2015.12 159 | predict: 160 | start: 433 # 2016.01, 161 | end: 468 # 2018.12, last month yearly data 162 | C: 163 | train: 164 | start: 121 # 1990.01 165 | end: 480 # 2019.12 166 | predict: 167 | start: 485 # 2020.05 168 | end: 522 # 2023.06 169 | 170 | r_2020_07_01: 171 | B: 172 | train: 173 | start: 121 # 1990.01 174 | end: 432 # 2015.12 175 | predict: 176 | start: 433 # 2016.01, 177 | end: 468 # 2018.12, last month yearly data 178 | C: 179 | train: 180 | start: 121 # 1990.01 181 | end: 480 # 2019.12 182 | predict: 183 | start: 486 # 2020.06 184 | end: 523 # 2023.07 -------------------------------------------------------------------------------- /views/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ Common utilities """ 2 | __all__ = [ 3 | "data", 4 | "db", 5 | "io", 6 | "log", 7 | "misc", 8 | "mocker", 9 | "stats", 10 | ] 11 | 12 | from . import data, db, io, log, misc, mocker, stats 13 | -------------------------------------------------------------------------------- /views/utils/data.py: -------------------------------------------------------------------------------- 1 | """ Common data utilities """ 2 | from typing import List, Union 3 | import logging 4 | 5 | import numpy as np # type: ignore 6 | import pandas as pd # type: ignore 7 | 8 | log = logging.getLogger(__name__) 9 | 10 | 11 | def resample( 12 | df: pd.DataFrame, 13 | cols: List[str], 14 | share_positives: float, 15 | share_negatives: float, 16 | threshold=0, 17 | ): 18 | """ Resample a dataframe with respect to cols 19 | 20 | Resampling is a technique for changing the positive/negative balance 21 | of a dataframe. Positives are rows where any of the specified cols 22 | are greater than the threshold. 
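A sketch of intended use (column name and shares are illustrative):

    df_balanced = resample(
        df, cols=["ged_dummy_sb"], share_positives=1.0, share_negatives=0.1
    )
    # Keeps all positive rows and a ~10% random sample of the negative
    # rows; shares above 1 sample with replacement (upsampling).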
Resampling is useful for highly unbalanced 23 | datasets where positive outcomes are rare. 24 | 25 | """ 26 | # If both shares are 1 just return the unaltered df 27 | if share_positives == 1 and share_negatives == 1: 28 | return df 29 | 30 | # Negatives are rows where all cols are close to zero 31 | mask_negatives = np.isclose(df[cols], threshold).max(axis=1) 32 | # Positives are all the others 33 | mask_positives = ~mask_negatives 34 | 35 | df_positives = df.loc[mask_positives] 36 | df_negatives = df.loc[mask_negatives] 37 | 38 | len_positives = len(df_positives) 39 | len_negatives = len(df_negatives) 40 | 41 | n_positives_wanted = int(share_positives * len_positives) 42 | n_negatives_wanted = int(share_negatives * len_negatives) 43 | 44 | replacement_pos = share_positives > 1 45 | replacement_neg = share_negatives > 1 46 | df = pd.concat( 47 | [ 48 | df_positives.sample(n=n_positives_wanted, replace=replacement_pos), 49 | df_negatives.sample(n=n_negatives_wanted, replace=replacement_neg), 50 | ] 51 | ) 52 | return df 53 | 54 | 55 | def check_has_multiindex(data: Union[pd.Series, pd.DataFrame]) -> None: 56 | """ Raise RuntimeError if data doesn't have a MultiIndex """ 57 | if not isinstance(data.index, pd.MultiIndex): 58 | msg = ( 59 | "Data is lacking a multiindex that was expected. " 60 | "Set the index with df.set_index([timevar, groupvar])." 61 | ) 62 | raise RuntimeError(msg) 63 | 64 | 65 | def balance_panel_last_t(df: pd.DataFrame) -> pd.DataFrame: 66 | """ Balance a multiindexed dataframe panel. 67 | 68 | The balanced index has observations for all groups present at the 69 | last t. 70 | Assumes df is indexed with timevar as index level 0, and groupvar 71 | at index level 1. 72 | 73 | Args: 74 | df: Dataframe with multiindex to balance 75 | Returns: 76 | df: A reindexed dataframe 77 | """ 78 | log.debug(f"Balancing index of panel with shape {df.shape}") 79 | check_has_multiindex(df) 80 | 81 | # Reset the index to actual values, 82 | # Needed in case data has been subsetted with .loc before 83 | # If this isn't done, df.index.levels[0].max() gets the 84 | # pre-subsetting max 85 | df = df.reset_index().set_index(df.index.names).sort_index() 86 | 87 | return df.reindex( 88 | pd.MultiIndex.from_product( 89 | [ 90 | df.index.levels[0].unique(), 91 | df.loc[df.index.levels[0].max()].index.unique(), 92 | ], 93 | names=df.index.names, 94 | ) 95 | ).sort_index() 96 | 97 | 98 | def assign_into_df(df_to: pd.DataFrame, df_from: pd.DataFrame) -> pd.DataFrame: 99 | """ Assign all columns from df_from into df_to 100 | 101 | Only assigns non-missing values from df_from, meaning the 102 | same column can be inserted multiple times and values be 103 | retained if the row coverage is different between calls. 104 | So a df_a with col_a covering months 100-110 and df_b with col_a covering 105 | months 111-120 could be assigned into a single df which would get 106 | values of col_a for months 100 - 120.
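A minimal sketch of that pattern (df_a and df_b as described above):

    df = assign_into_df(df_to=df, df_from=df_a)  # col_a, months 100-110
    df = assign_into_df(df_to=df, df_from=df_b)  # col_a, months 111-120
    # df now holds col_a for months 100-120; values written by the first
    # call are kept because only non-missing values are assigned.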
107 | """ 108 | 109 | for col in df_from: 110 | log.debug(f"Inserting col {col}") 111 | # Get a Series of the col for all rows 112 | s = df_from.loc[:, col] 113 | # Get the "is not null" boolean series to use as mask, ~ is NOT 114 | mask = ~s.isnull() 115 | # Get the index from that mask, 116 | # ix is now index labels of rows with (not missing) data 117 | ix = s.loc[mask].index 118 | df_to.loc[ix, col] = s.loc[ix] 119 | return df_to 120 | 121 | 122 | def rebuild_index(data: pd.DataFrame) -> pd.DataFrame: 123 | """ Rebuild the index of the dataframe 124 | 125 | Sometimes we construct new dataframes from old ones or subset 126 | dataframes by time. The contents of the df.index of the new 127 | dataframes then still contain the full set of values from the old 128 | df. This function rebuilds the index to only have the actual 129 | values with rows. 130 | """ 131 | check_has_multiindex(data) 132 | return data.reset_index().set_index(data.index.names).sort_index() 133 | -------------------------------------------------------------------------------- /views/utils/log.py: -------------------------------------------------------------------------------- 1 | """ Logging utils """ 2 | from functools import wraps 3 | import datetime 4 | import logging 5 | import os 6 | import time 7 | import uuid 8 | 9 | from views.config import DIR_STORAGE 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | def utc_now() -> str: 15 | """ Get current UTC time """ 16 | return datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S") 17 | 18 | 19 | def get_log_path(caller_path: str) -> str: 20 | """ Get unique and timestamped path to a logfile """ 21 | name = os.path.basename(caller_path).replace(".py", "") 22 | # Hopefully unique filename with timestamp and part of a uuid 23 | fname = f"{name}_{utc_now()}_{str(uuid.uuid4()).split('-')[0]}.log" 24 | path = os.path.join(DIR_STORAGE, "logs", fname) 25 | print(f"Logging to {path}") 26 | return path 27 | 28 | 29 | def logtime(func): 30 | """This decorator logs the execution time for the decorated function.""" 31 | 32 | @wraps(func) 33 | def wrapper(*args, **kwargs): 34 | start = time.time() 35 | result = func(*args, **kwargs) 36 | end = time.time() 37 | log.debug("{} ran in {}s".format(func.__name__, round(end - start, 2))) 38 | return result 39 | 40 | return wrapper 41 | -------------------------------------------------------------------------------- /views/utils/misc.py: -------------------------------------------------------------------------------- 1 | """ Misc utils that don't fit anyhwere else """ 2 | from typing import Any, List 3 | 4 | 5 | def lists_disjoint(lists: List[List[Any]]) -> bool: 6 | """ Do lists share any elements""" 7 | disjoint = True 8 | for i, base_list in enumerate(lists): 9 | lists_to_check = lists[i + 1 :] 10 | for to_check in lists_to_check: 11 | if not set(base_list).isdisjoint(to_check): 12 | disjoint = False 13 | return disjoint 14 | -------------------------------------------------------------------------------- /views/utils/stats.py: -------------------------------------------------------------------------------- 1 | """ Statistical utils 2 | 3 | #@TODO: Figure out numpy / pandas types here 4 | """ 5 | from typing import Any 6 | import warnings 7 | 8 | import numpy as np # type: ignore 9 | 10 | 11 | def prob_to_odds(p: Any, clip=True) -> Any: 12 | """ Cast probability into odds """ 13 | 14 | if isinstance(p, list): 15 | p = np.array(p) 16 | 17 | if clip: 18 | offset = 1e-10 19 | offset = 1e-10 20 | upper = 1 - offset 21 | lower = 0 + 
offset 22 | p = np.clip(p, lower, upper) 23 | 24 | # Check for probs >= 1 because the odds of p = 1 are infinite, which might break things 25 | if np.any(p >= 1): 26 | msg = "probs >= 1 passed to prob_to_odds, expect infs" 27 | warnings.warn(msg) 28 | 29 | odds = p / (1 - p) 30 | return odds 31 | 32 | 33 | def prob_to_logodds(p: Any) -> Any: 34 | """ Cast probability to log-odds """ 35 | return np.log(prob_to_odds(p)) 36 | 37 | 38 | def odds_to_prob(odds: Any) -> Any: 39 | """ Cast odds ratio to probability """ 40 | return odds / (odds + 1) 41 | 42 | 43 | def logodds_to_prob(logodds: Any) -> Any: 44 | """ Cast logodds to probability """ 45 | return odds_to_prob(np.exp(logodds)) 46 | --------------------------------------------------------------------------------
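A quick round-trip sketch of the helpers in views/utils/stats.py (values are
illustrative; assumes the views package is installed and importable):

    import numpy as np
    from views.utils import stats

    p = np.array([0.01, 0.5, 0.99])
    odds = stats.prob_to_odds(p)  # p / (1 - p), clipped away from 0 and 1
    assert np.allclose(stats.odds_to_prob(odds), p)
    assert np.allclose(stats.logodds_to_prob(stats.prob_to_logodds(p)), p)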