├── .gitignore ├── .pylintrc ├── LICENSE.md ├── README.md ├── docs ├── .gitignore ├── Makefile ├── README.md ├── build_docs.sh ├── conf.py ├── human_source │ ├── coderef.rst │ └── manual.rst ├── index.rst └── source │ ├── modules.rst │ ├── views.apps.data.missing.rst │ ├── views.apps.data.rst │ ├── views.apps.ensemble.rst │ ├── views.apps.evaluation.rst │ ├── views.apps.extras.rst │ ├── views.apps.model.rst │ ├── views.apps.pipeline.rst │ ├── views.apps.rst │ ├── views.apps.slurm.rst │ ├── views.apps.transforms.rst │ ├── views.database.rst │ ├── views.database.skeleton.rst │ ├── views.database.sources.acled.legacy.rst │ ├── views.database.sources.acled.rst │ ├── views.database.sources.cdum.rst │ ├── views.database.sources.fvp.rst │ ├── views.database.sources.ged.legacy.rst │ ├── views.database.sources.ged.rst │ ├── views.database.sources.icgcw.rst │ ├── views.database.sources.pgdata.rst │ ├── views.database.sources.reign.rst │ ├── views.database.sources.rst │ ├── views.database.sources.spei.rst │ ├── views.database.sources.vdem.rst │ ├── views.database.sources.wdi.rst │ ├── views.rst │ ├── views.specs.data.rst │ ├── views.specs.models.rst │ ├── views.specs.periods.rst │ ├── views.specs.rst │ └── views.utils.rst ├── env_static.yaml ├── install_views2.sh ├── misc ├── README.md ├── defaults.yaml ├── environment.yaml ├── freeze_env.sh └── pytest.ini ├── projects ├── model_development │ ├── README.md │ └── example.ipynb ├── monthly_report │ └── changelog.md ├── plots │ └── example_maps.ipynb ├── prediction_competition │ ├── README.md │ └── benchmark_notebook.ipynb ├── replication_jpr_2020 │ ├── README.md │ └── gitlab_mirror │ │ └── views_jpr_2020_code.zip └── workshop │ └── presentation.ipynb ├── run_tools.sh ├── runners ├── README.md ├── export_data.py ├── import_data.py ├── predict.py ├── predict_slurm.py ├── refresh_data.py ├── refresh_data_slurm.py ├── train_all_local.py ├── train_all_slurm.py ├── train_model.py ├── train_slurm.py └── update_database.py ├── setup.py ├── tests ├── README.md ├── test_app_model_api.py ├── test_calibration.py ├── test_db.py ├── test_misc_utils.py ├── test_specs.py ├── test_structure.py ├── test_transforms_api.py ├── test_transforms_lib.py ├── test_utils.py └── test_utils_data.py └── views ├── __init__.py ├── apps ├── __init__.py ├── data │ ├── README.md │ ├── __init__.py │ ├── api.py │ ├── export_readme │ │ └── README.md │ ├── missing │ │ ├── __init__.py │ │ ├── amelia.py │ │ ├── amelia_template.R │ │ └── missing.py │ └── public.py ├── ensemble │ ├── __init__.py │ ├── ebma.py │ └── templates │ │ ├── install_ebma.R │ │ └── run_ebma.R ├── evaluation │ ├── __init__.py │ ├── feature_importance.py │ └── lib.py ├── extras │ ├── __init__.py │ └── extras.py ├── model │ ├── __init__.py │ ├── api.py │ ├── calibration.py │ └── crosslevel.py ├── pipeline │ ├── README.md │ ├── __init__.py │ ├── ensembles_cm.py │ ├── ensembles_pgm.py │ ├── models_cm.py │ ├── models_pgm.py │ ├── predict.py │ └── train.py ├── plot │ ├── __init__.py │ └── maps.py ├── slurm │ ├── __init__.py │ ├── slurm.py │ └── templates │ │ ├── runfile_core.txt │ │ └── runfile_node.txt ├── transforms │ ├── __init__.py │ └── lib.py └── xgb │ └── lib.py ├── config.py ├── database ├── README.md ├── __init__.py ├── common.py ├── skeleton │ ├── __init__.py │ ├── create_skeleton.sql │ └── skeleton.py └── sources │ ├── __init__.py │ ├── acled │ ├── __init__.py │ ├── acled.py │ ├── acled.sql │ └── legacy │ │ ├── __init__.py │ │ ├── acled.py │ │ └── prepare_acled.sql │ ├── cdum │ ├── __init__.py │ └── cdum.py │ ├── fvp 
│ ├── __init__.py │ ├── fvp.py │ └── spec.yaml │ ├── ged │ ├── __init__.py │ ├── ged.py │ ├── ged.sql │ └── legacy │ │ ├── __init__.py │ │ ├── ged.py │ │ ├── impute.py │ │ └── prepare_ged.sql │ ├── icgcw │ ├── __init__.py │ ├── fetch.py │ ├── icgcw.py │ └── spec.yaml │ ├── pgdata │ ├── __init__.py │ ├── fetch.py │ ├── pgdata.py │ └── spec.yaml │ ├── reign │ ├── __init__.py │ ├── reign.py │ └── spec.yaml │ ├── spei │ ├── __init__.py │ ├── cleanup.sql │ ├── pg_ug.sql │ ├── spei.py │ └── stage.sql │ ├── vdem │ ├── __init__.py │ └── vdem.py │ └── wdi │ ├── __init__.py │ └── wdi.py ├── specs ├── README.md ├── __init__.py ├── data │ ├── README.md │ ├── __init__.py │ ├── parsed_datasets.py │ ├── solver.py │ └── spec.yaml ├── models │ ├── README.md │ ├── __init__.py │ ├── am.yaml │ ├── cm.yaml │ ├── featlists_hh.yaml │ ├── pgm.yaml │ └── solver.py └── periods │ ├── __init__.py │ └── periods.yaml └── utils ├── __init__.py ├── data.py ├── db.py ├── io.py ├── log.py ├── misc.py ├── mocker.py └── stats.py /.gitignore: -------------------------------------------------------------------------------- 1 | sourceme.sh 2 | __pycache__* 3 | *.egg-info 4 | .ipynb_checkpoints 5 | .mypy_cache 6 | storage 7 | *.sublime* 8 | .DS_Store -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | disable=bad-continuation, # Conflicts with black 3 | logging-format-interpolation, # f-strings are better than % 4 | 5 | 6 | good-names=log, 7 | df, 8 | f, 9 | i, 10 | zf, 11 | s, 12 | y, 13 | df_X, 14 | ln, 15 | t, 16 | log, 17 | tp, 18 | tn, 19 | fp, 20 | fn -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ViEWS2 2 | 3 | Getting started 4 | 5 | Download and install miniconda3: https://docs.conda.io/en/latest/miniconda.html 6 | After you have conda installed, in your terminal run 7 | 8 | ./install_views2.sh 9 | 10 | This will create a conda environment called views2 and install the views package there. 11 | To fetch the latest public data run 12 | 13 | conda activate views2 14 | python runners/import_data.py --fetch 15 | 16 | To start using ViEWS code simply run 17 | 18 | conda activate views2 19 | jupyter notebook 20 | 21 | A web browser should open with the jupyter notebook browser. 22 | If you wish to take part in the prediction competition, see projects/prediction_competition/ 23 | An example notebook to get you started modelling is in projects/model_development/examply.ipynb. 24 | 25 | We develop ViEWS on Mac and Linux computers, the procedure is slightly different for Windows and we haven't developed a streamlined process for it yet. 26 | 27 | To open the HTML documentation from here on MacOS run 28 | 29 | ./run_tools.sh 30 | open docs/_build/html/index.html 31 | 32 | And it will take you to the locally built html documementation in your default browser. 
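If you prefer to drive the data import from Python instead of the runner script, the same steps can be scripted directly. Below is a minimal sketch based on runners/import_data.py; the function and dataset names are taken from that runner and assume the views2 environment is active and that you are online.

    import views

    # Download the latest public data snapshot into the scratch directory
    path_zip = views.apps.data.public.fetch_latest_zip_from_website(
        path_dir_destination=views.DIR_SCRATCH
    )

    # Load the tables and geometries from the zip into the local cache
    views.apps.data.public.import_tables_and_geoms(
        tables=views.TABLES, geometries=views.GEOMETRIES, path_zip=path_zip
    )

    # Rebuild one of the standard datasets from the freshly imported tables
    views.DATASETS["cm_africa_imp_0"].refresh(do_transforms=False)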
33 | 34 | To view .pdf documentation (a work in progress) see https://views.pcr.uu.se/download/docs/views.pdf -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | _static 3 | _templates -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | To build the documentation cd to this directory and run 4 | 5 | `sphinx-apidoc -o source/ ../views` 6 | `make html` 7 | 8 | Or just run the run_tools.sh script in the root of the repo. It does this for you. 9 | 10 | Human written source files should go in human_source. 11 | Leave the `source` directory to sphinx-apidoc so that we can delete and rebuild it should it break. 12 | 13 | ## PDF 14 | To build a pdf make sure you have latexpdf installed (miktex worked for me) and run 15 | 16 | make latexpdf 17 | 18 | You will get a views.pdf in `_build/latex/views.pdf`. 19 | -------------------------------------------------------------------------------- /docs/build_docs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | clear 4 | 5 | # Stop on non-zero exit 6 | set -e 7 | 8 | echo "Initalising conda for this shell" 9 | eval "$(conda shell.bash hook)" 10 | conda activate views2 11 | 12 | echo "Generating docs" 13 | # Clear existing generated docs 14 | rm -f source/* 15 | # Auto-generate new docs 16 | # --module-frist makes Package __init__ come before all the submodules 17 | # See https://www.sphinx-doc.org/en/master/man/sphinx-apidoc.html#options 18 | sphinx-apidoc --module-first -o source/ ../views 19 | # Make HTML docs 20 | make html 21 | # Make PDF with latex 22 | make latexpdf -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
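# For the ViEWS docs the repository root (one directory up from docs/) is
# appended below, so that autodoc can import the views package when the
# sphinx-apidoc generated pages are built.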
12 | # 13 | import os 14 | import sys 15 | sys.path.append(os.path.abspath('..')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'ViEWS' 21 | copyright = '2020, ViEWS Team' 22 | author = 'ViEWS Team' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '2.0' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.napoleon' 36 | ] 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # List of patterns, relative to source directory, that match files and 42 | # directories to ignore when looking for source files. 43 | # This pattern also affects html_static_path and html_extra_path. 44 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 45 | 46 | 47 | # -- Options for HTML output ------------------------------------------------- 48 | 49 | # The theme to use for HTML and HTML Help pages. See the documentation for 50 | # a list of builtin themes. 51 | # 52 | html_theme = 'alabaster' 53 | 54 | # Add any paths that contain custom static files (such as style sheets) here, 55 | # relative to this directory. They are copied after the builtin static files, 56 | # so a file named "default.css" will overwrite the builtin "default.css". 57 | html_static_path = ['_static'] -------------------------------------------------------------------------------- /docs/human_source/coderef.rst: -------------------------------------------------------------------------------- 1 | Code reference 2 | ============== 3 | 4 | Here you will hopefully soon find auto-generated documentation from the code itself. A bit broken at the moment. Sorry. 5 | 6 | * :ref:`modindex` 7 | * :ref:`genindex` 8 | * :ref:`search` 9 | 10 | 11 | .. automodule:: views 12 | :members: -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. ViEWS documentation master file, created by 2 | sphinx-quickstart on Mon May 18 23:56:52 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | 7 | ViEWS 8 | ===== 9 | 10 | ViEWS is a project focused on forecasting political violence. 11 | To do this we fit models to data and use those models to predict the likelihood of future conflict. 12 | Most of the code in this package is focused on this task and provides a framework for doing this correctly and easily. 13 | 14 | This document is a work in progress. 15 | We apologise for any confusing formatting or layout. 16 | 17 | 18 | 19 | .. toctree:: 20 | :maxdepth: 3 21 | :caption: Contents: 22 | 23 | human_source/manual 24 | human_source/coderef 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | views 2 | ===== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | views 8 | -------------------------------------------------------------------------------- /docs/source/views.apps.data.missing.rst: -------------------------------------------------------------------------------- 1 | views.apps.data.missing package 2 | =============================== 3 | 4 | .. automodule:: views.apps.data.missing 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.apps.data.missing.amelia module 13 | ------------------------------------- 14 | 15 | .. automodule:: views.apps.data.missing.amelia 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.apps.data.missing.missing module 21 | -------------------------------------- 22 | 23 | .. automodule:: views.apps.data.missing.missing 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/source/views.apps.data.rst: -------------------------------------------------------------------------------- 1 | views.apps.data package 2 | ======================= 3 | 4 | .. automodule:: views.apps.data 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | :maxdepth: 4 14 | 15 | views.apps.data.missing 16 | 17 | Submodules 18 | ---------- 19 | 20 | views.apps.data.api module 21 | -------------------------- 22 | 23 | .. automodule:: views.apps.data.api 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | views.apps.data.public module 29 | ----------------------------- 30 | 31 | .. automodule:: views.apps.data.public 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | -------------------------------------------------------------------------------- /docs/source/views.apps.ensemble.rst: -------------------------------------------------------------------------------- 1 | views.apps.ensemble package 2 | =========================== 3 | 4 | .. automodule:: views.apps.ensemble 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.apps.ensemble.ebma module 13 | ------------------------------- 14 | 15 | .. automodule:: views.apps.ensemble.ebma 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.apps.evaluation.rst: -------------------------------------------------------------------------------- 1 | views.apps.evaluation package 2 | ============================= 3 | 4 | .. automodule:: views.apps.evaluation 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.apps.evaluation.feature\_importance module 13 | ------------------------------------------------ 14 | 15 | .. automodule:: views.apps.evaluation.feature_importance 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.apps.evaluation.lib module 21 | -------------------------------- 22 | 23 | .. automodule:: views.apps.evaluation.lib 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/source/views.apps.extras.rst: -------------------------------------------------------------------------------- 1 | views.apps.extras package 2 | ========================= 3 | 4 | .. 
automodule:: views.apps.extras 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.apps.extras.extras module 13 | ------------------------------- 14 | 15 | .. automodule:: views.apps.extras.extras 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.apps.model.rst: -------------------------------------------------------------------------------- 1 | views.apps.model package 2 | ======================== 3 | 4 | .. automodule:: views.apps.model 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.apps.model.api module 13 | --------------------------- 14 | 15 | .. automodule:: views.apps.model.api 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.apps.model.calibration module 21 | ----------------------------------- 22 | 23 | .. automodule:: views.apps.model.calibration 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | views.apps.model.crosslevel module 29 | ---------------------------------- 30 | 31 | .. automodule:: views.apps.model.crosslevel 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | -------------------------------------------------------------------------------- /docs/source/views.apps.pipeline.rst: -------------------------------------------------------------------------------- 1 | views.apps.pipeline package 2 | =========================== 3 | 4 | .. automodule:: views.apps.pipeline 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.apps.pipeline.ensembles\_cm module 13 | ---------------------------------------- 14 | 15 | .. automodule:: views.apps.pipeline.ensembles_cm 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.apps.pipeline.ensembles\_pgm module 21 | ----------------------------------------- 22 | 23 | .. automodule:: views.apps.pipeline.ensembles_pgm 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | views.apps.pipeline.models\_cm module 29 | ------------------------------------- 30 | 31 | .. automodule:: views.apps.pipeline.models_cm 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | views.apps.pipeline.models\_pgm module 37 | -------------------------------------- 38 | 39 | .. automodule:: views.apps.pipeline.models_pgm 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | views.apps.pipeline.predict module 45 | ---------------------------------- 46 | 47 | .. automodule:: views.apps.pipeline.predict 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | 52 | views.apps.pipeline.train module 53 | -------------------------------- 54 | 55 | .. automodule:: views.apps.pipeline.train 56 | :members: 57 | :undoc-members: 58 | :show-inheritance: 59 | 60 | -------------------------------------------------------------------------------- /docs/source/views.apps.rst: -------------------------------------------------------------------------------- 1 | views.apps package 2 | ================== 3 | 4 | .. automodule:: views.apps 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. 
toctree:: 13 | :maxdepth: 4 14 | 15 | views.apps.data 16 | views.apps.ensemble 17 | views.apps.evaluation 18 | views.apps.extras 19 | views.apps.model 20 | views.apps.pipeline 21 | views.apps.slurm 22 | views.apps.transforms 23 | -------------------------------------------------------------------------------- /docs/source/views.apps.slurm.rst: -------------------------------------------------------------------------------- 1 | views.apps.slurm package 2 | ======================== 3 | 4 | .. automodule:: views.apps.slurm 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.apps.slurm.slurm module 13 | ----------------------------- 14 | 15 | .. automodule:: views.apps.slurm.slurm 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.apps.transforms.rst: -------------------------------------------------------------------------------- 1 | views.apps.transforms package 2 | ============================= 3 | 4 | .. automodule:: views.apps.transforms 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.apps.transforms.lib module 13 | -------------------------------- 14 | 15 | .. automodule:: views.apps.transforms.lib 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.database.rst: -------------------------------------------------------------------------------- 1 | views.database package 2 | ====================== 3 | 4 | .. automodule:: views.database 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | :maxdepth: 4 14 | 15 | views.database.skeleton 16 | views.database.sources 17 | 18 | Submodules 19 | ---------- 20 | 21 | views.database.common module 22 | ---------------------------- 23 | 24 | .. automodule:: views.database.common 25 | :members: 26 | :undoc-members: 27 | :show-inheritance: 28 | 29 | -------------------------------------------------------------------------------- /docs/source/views.database.skeleton.rst: -------------------------------------------------------------------------------- 1 | views.database.skeleton package 2 | =============================== 3 | 4 | .. automodule:: views.database.skeleton 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.skeleton.skeleton module 13 | --------------------------------------- 14 | 15 | .. automodule:: views.database.skeleton.skeleton 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.acled.legacy.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.acled.legacy package 2 | =========================================== 3 | 4 | .. automodule:: views.database.sources.acled.legacy 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.acled.legacy.acled module 13 | ------------------------------------------------ 14 | 15 | .. 
automodule:: views.database.sources.acled.legacy.acled 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.acled.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.acled package 2 | ==================================== 3 | 4 | .. automodule:: views.database.sources.acled 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | :maxdepth: 4 14 | 15 | views.database.sources.acled.legacy 16 | 17 | Submodules 18 | ---------- 19 | 20 | views.database.sources.acled.acled module 21 | ----------------------------------------- 22 | 23 | .. automodule:: views.database.sources.acled.acled 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.cdum.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.cdum package 2 | =================================== 3 | 4 | .. automodule:: views.database.sources.cdum 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.cdum.cdum module 13 | --------------------------------------- 14 | 15 | .. automodule:: views.database.sources.cdum.cdum 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.fvp.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.fvp package 2 | ================================== 3 | 4 | .. automodule:: views.database.sources.fvp 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.fvp.fvp module 13 | ------------------------------------- 14 | 15 | .. automodule:: views.database.sources.fvp.fvp 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.ged.legacy.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.ged.legacy package 2 | ========================================= 3 | 4 | .. automodule:: views.database.sources.ged.legacy 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.ged.legacy.ged module 13 | -------------------------------------------- 14 | 15 | .. automodule:: views.database.sources.ged.legacy.ged 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.database.sources.ged.legacy.impute module 21 | ----------------------------------------------- 22 | 23 | .. automodule:: views.database.sources.ged.legacy.impute 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.ged.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.ged package 2 | ================================== 3 | 4 | .. 
automodule:: views.database.sources.ged 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | :maxdepth: 4 14 | 15 | views.database.sources.ged.legacy 16 | 17 | Submodules 18 | ---------- 19 | 20 | views.database.sources.ged.ged module 21 | ------------------------------------- 22 | 23 | .. automodule:: views.database.sources.ged.ged 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.icgcw.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.icgcw package 2 | ==================================== 3 | 4 | .. automodule:: views.database.sources.icgcw 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.icgcw.fetch module 13 | ----------------------------------------- 14 | 15 | .. automodule:: views.database.sources.icgcw.fetch 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.database.sources.icgcw.icgcw module 21 | ----------------------------------------- 22 | 23 | .. automodule:: views.database.sources.icgcw.icgcw 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.pgdata.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.pgdata package 2 | ===================================== 3 | 4 | .. automodule:: views.database.sources.pgdata 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.pgdata.fetch module 13 | ------------------------------------------ 14 | 15 | .. automodule:: views.database.sources.pgdata.fetch 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.database.sources.pgdata.pgdata module 21 | ------------------------------------------- 22 | 23 | .. automodule:: views.database.sources.pgdata.pgdata 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.reign.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.reign package 2 | ==================================== 3 | 4 | .. automodule:: views.database.sources.reign 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.reign.reign module 13 | ----------------------------------------- 14 | 15 | .. automodule:: views.database.sources.reign.reign 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.rst: -------------------------------------------------------------------------------- 1 | views.database.sources package 2 | ============================== 3 | 4 | .. automodule:: views.database.sources 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. 
toctree:: 13 | :maxdepth: 4 14 | 15 | views.database.sources.acled 16 | views.database.sources.cdum 17 | views.database.sources.fvp 18 | views.database.sources.ged 19 | views.database.sources.icgcw 20 | views.database.sources.pgdata 21 | views.database.sources.reign 22 | views.database.sources.spei 23 | views.database.sources.vdem 24 | views.database.sources.wdi 25 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.spei.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.spei package 2 | =================================== 3 | 4 | .. automodule:: views.database.sources.spei 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.spei.spei module 13 | --------------------------------------- 14 | 15 | .. automodule:: views.database.sources.spei.spei 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.vdem.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.vdem package 2 | =================================== 3 | 4 | .. automodule:: views.database.sources.vdem 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.vdem.vdem module 13 | --------------------------------------- 14 | 15 | .. automodule:: views.database.sources.vdem.vdem 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.database.sources.wdi.rst: -------------------------------------------------------------------------------- 1 | views.database.sources.wdi package 2 | ================================== 3 | 4 | .. automodule:: views.database.sources.wdi 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.database.sources.wdi.wdi module 13 | ------------------------------------- 14 | 15 | .. automodule:: views.database.sources.wdi.wdi 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.rst: -------------------------------------------------------------------------------- 1 | views package 2 | ============= 3 | 4 | .. automodule:: views 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | :maxdepth: 4 14 | 15 | views.apps 16 | views.database 17 | views.specs 18 | views.utils 19 | 20 | Submodules 21 | ---------- 22 | 23 | views.config module 24 | ------------------- 25 | 26 | .. automodule:: views.config 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | -------------------------------------------------------------------------------- /docs/source/views.specs.data.rst: -------------------------------------------------------------------------------- 1 | views.specs.data package 2 | ======================== 3 | 4 | .. automodule:: views.specs.data 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.specs.data.parsed\_datasets module 13 | ---------------------------------------- 14 | 15 | .. 
automodule:: views.specs.data.parsed_datasets 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.specs.data.solver module 21 | ------------------------------ 22 | 23 | .. automodule:: views.specs.data.solver 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | -------------------------------------------------------------------------------- /docs/source/views.specs.models.rst: -------------------------------------------------------------------------------- 1 | views.specs.models package 2 | ========================== 3 | 4 | .. automodule:: views.specs.models 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.specs.models.solver module 13 | -------------------------------- 14 | 15 | .. automodule:: views.specs.models.solver 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | -------------------------------------------------------------------------------- /docs/source/views.specs.periods.rst: -------------------------------------------------------------------------------- 1 | views.specs.periods package 2 | =========================== 3 | 4 | .. automodule:: views.specs.periods 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/views.specs.rst: -------------------------------------------------------------------------------- 1 | views.specs package 2 | =================== 3 | 4 | .. automodule:: views.specs 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Subpackages 10 | ----------- 11 | 12 | .. toctree:: 13 | :maxdepth: 4 14 | 15 | views.specs.data 16 | views.specs.models 17 | views.specs.periods 18 | -------------------------------------------------------------------------------- /docs/source/views.utils.rst: -------------------------------------------------------------------------------- 1 | views.utils package 2 | =================== 3 | 4 | .. automodule:: views.utils 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | Submodules 10 | ---------- 11 | 12 | views.utils.data module 13 | ----------------------- 14 | 15 | .. automodule:: views.utils.data 16 | :members: 17 | :undoc-members: 18 | :show-inheritance: 19 | 20 | views.utils.db module 21 | --------------------- 22 | 23 | .. automodule:: views.utils.db 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | views.utils.io module 29 | --------------------- 30 | 31 | .. automodule:: views.utils.io 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | 36 | views.utils.log module 37 | ---------------------- 38 | 39 | .. automodule:: views.utils.log 40 | :members: 41 | :undoc-members: 42 | :show-inheritance: 43 | 44 | views.utils.misc module 45 | ----------------------- 46 | 47 | .. automodule:: views.utils.misc 48 | :members: 49 | :undoc-members: 50 | :show-inheritance: 51 | 52 | views.utils.mocker module 53 | ------------------------- 54 | 55 | .. automodule:: views.utils.mocker 56 | :members: 57 | :undoc-members: 58 | :show-inheritance: 59 | 60 | views.utils.stats module 61 | ------------------------ 62 | 63 | .. 
automodule:: views.utils.stats 64 | :members: 65 | :undoc-members: 66 | :show-inheritance: 67 | 68 | -------------------------------------------------------------------------------- /install_views2.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Stop on error 4 | set -e 5 | echo "Started installing views." 6 | echo "Initalising conda for this shell" 7 | eval "$(conda shell.bash hook)" 8 | 9 | echo "Updating conda" 10 | conda update --all --yes 11 | echo "Removing existing views2 env" 12 | conda remove --name views2 --all --yes 13 | echo "Creating env from env_static.yaml" 14 | # @TODO: Change back to env_static.yaml asap when we have working "builds" for linux 15 | conda env create -f misc/environment.yaml 16 | echo "Activating env" 17 | conda activate views2 18 | echo "Running pip install --editable . to install the views package" 19 | pip install --editable . 20 | 21 | echo "Creating storage directory here" 22 | mkdir -p ./storage 23 | 24 | # Copy the default config file to default config dir ~/.views2/ 25 | if [ ! -f ./config.yaml ]; 26 | then 27 | echo "No current ./config.yaml found, copying the defaults" 28 | cp ./misc/defaults.yaml ./config.yaml 29 | else 30 | echo "./config.yaml already exists, not changing it" 31 | fi 32 | 33 | echo "Great success, you can now do \" conda activate views2 \" in your shell and get started." -------------------------------------------------------------------------------- /misc/README.md: -------------------------------------------------------------------------------- 1 | # Misc? 2 | 3 | ## Static dependencies list 4 | 5 | To avoid issues with breaking changes from updated dependencies the 6 | main installer now uses a static list of versioned dependencies in env_static.yaml. 7 | No more updated dependencies suddenly breaking code. 8 | 9 | If you want to add a dependency: 10 | 11 | * add it to environment.yaml in this dir, 12 | * recreate the views2 environment with (from this dir): 13 | 14 | conda remove --name views2 --all --yes 15 | conda env create -f environment.yaml 16 | 17 | Then run 18 | 19 | ./freeze_env.sh 20 | 21 | to update env_static.yaml. 22 | -------------------------------------------------------------------------------- /misc/defaults.yaml: -------------------------------------------------------------------------------- 1 | default_database: views 2 | databases: 3 | views: 4 | user: username # CHANGE ME! 
5 | host: janus 6 | dbname: views 7 | port: 5432 8 | use_ssl: True 9 | ssl_cert: "~/.postgres/postgresql.crt" 10 | ssl_key: "~/.postgres/postgresql.key" 11 | ssl_rootcert: "~/.postgres/root.crt" 12 | local: 13 | user: username 14 | host: 127.0.0.1 15 | dbname: postgres 16 | port: 5432 17 | use_ssl: False 18 | password: "" 19 | 20 | dirs: 21 | storage: "" # Emtpy string will default to the storage directory in the repo 22 | scratch: "" # Emtpy string will default to the storage/scratch/ directory in the repo 23 | 24 | slurm: 25 | username: "" 26 | project: "" 27 | 28 | qualtrics: 29 | token: "QUALTRICS_TOKEN" 30 | 31 | -------------------------------------------------------------------------------- /misc/environment.yaml: -------------------------------------------------------------------------------- 1 | name: views2 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python == 3.8 6 | - pip # for the pip-only deps 7 | - jupyter # jupyter notebooks 8 | - jupyterlab # jupyter lab, much nicer notebooks 9 | - numpy 10 | - pandas >= 1.0 11 | - scipy # more math 12 | - scikit-learn #models 13 | - statsmodels # models 14 | - matplotlib # plotting 15 | - psycopg2 #postgres io 16 | - sqlalchemy # postgres io 17 | - ipython # interactive terminal 18 | - pyyaml # YAML IO 19 | - joblib # easier multiprocessing and a faster pickler 20 | - h5py # hdf5 file for dynasim aggregation 21 | - pylint == 2.4.4 # linting, freeze at 2.4.4 as 2.5 gave errors 22 | - beautifulsoup4 # Web scraping 23 | - lxml # parser for bs4 24 | - xarray # SPEI loading 25 | - numba # For fast stuff 26 | - pyarrow # For Parquet IO in pandas 27 | - psutil # For physical core detection, thread's don't help dynasim 28 | - descartes # Basic Geopandas Plotting 29 | - xlrd # Dependency pd.read_excel 30 | - xlwt # For formatting (old) Excel files 31 | - seaborn # Fancy plotting 32 | # These packages are only available from pip or have later versions there 33 | - pip: 34 | - black # Code formatting 35 | - coverage # Test coverage report 36 | - flake8 # Grumpy linter 37 | - geoalchemy2 # Geometry types for pushing geodataframes to postgres 38 | - geopandas 39 | - html5lib # Reign loader bs4 parser 40 | - libpysal 41 | - mypy # Type checking 42 | - netcdf4 # SPEI loading 43 | - pytest # Testing 44 | - requests # read the web 45 | - sphinx # Docs 46 | - xgboost # pip has later versions than conda 47 | - contextily 48 | - pdpbox -------------------------------------------------------------------------------- /misc/freeze_env.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Overwrite env_static.yaml with the latest versions of depencies from your env. 4 | # Make sure to run all the tests before committing an env_static.yaml with 5 | # newer packages so that we are all working on the same versions. 
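# Note: the export below writes to ../env_static.yaml, i.e. the env_static.yaml
# in the repository root that install_views2.sh is meant to create the
# views2 environment from.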
6 | 7 | echo "Initalising conda for this shell" 8 | eval "$(conda shell.bash hook)" 9 | conda activate views2 10 | conda env export --no-builds | grep -v "prefix" > ../env_static.yaml 11 | -------------------------------------------------------------------------------- /misc/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore:Using or importing the ABCs:DeprecationWarning -------------------------------------------------------------------------------- /projects/model_development/README.md: -------------------------------------------------------------------------------- 1 | # Model development -------------------------------------------------------------------------------- /projects/monthly_report/changelog.md: -------------------------------------------------------------------------------- 1 | # r_2021_03_01 2 | 3 | **General changes (to merge into master)**: 4 | * Reduced cores per job by one via `mem_per_job` in `monthly.py` to avoid crashing multiprocessing pool. 5 | * Simplified argparse of `train_model.py` and `train_slurm.py` to allow listing of steps and models like `--model model_a model_b` rather than `--model model_a --model model_b`. 6 | * `get_files_latest_fetch` in `common.py` fetched first rather than last item in sorted list. This has applied to ICGCW and REIGN. Now fetches the correct latest version of the data. 7 | * Allowed passing 0 to `tlag` in transforms lib for tlag_0 variables. 8 | * Adding current violent history input to the model specs. Zero time-lagged to avoid step-shifting our outcome in model api. See specific changes to column sets below. 9 | 10 | **Changes at cm**: 11 | * Added to `cfshort`: 12 | ``` 13 | - tlags_0_ged_dummy_sb_ns_os 14 | - ged_best_sb_ns_os 15 | - greq_5_ged_best_sb_ns_os 16 | - tlags_0_greq_5_ged_best_sb_ns_os 17 | - tlags_0_greq_25_ged_best_sb_ns_os 18 | - greq_100_ged_best_sb_ns_os 19 | - tlags_0_greq_100_ged_best_sb_ns_os 20 | ``` 21 | * Added to `cflong`: 22 | ``` 23 | - tlags_0_ged_dummy_sb_ns_os 24 | - ged_best_sb_ns_os 25 | - greq_5_ged_best_sb_ns_os 26 | - tlags_0_greq_5_ged_best_sb_ns_os 27 | - tlags_0_greq_25_ged_best_sb_ns_os 28 | - greq_100_ged_best_sb_ns_os 29 | - tlags_0_greq_100_ged_best_sb_ns_os 30 | ``` 31 | 32 | **Changes at pgm**: 33 | 34 | * Added to `legacy_hist_common`: 35 | ``` 36 | - tlags_0_ged_dummy_sb_ns_os 37 | - ged_best_sb_ns_os # TODO? 38 | - tlags_0_greq_5_ged_best_sb_ns_os 39 | - tlags_0_greq_25_ged_best_sb_ns_os 40 | - tlags_0_greq_100_ged_best_sb_ns_os 41 | - acled_protest 42 | ``` 43 | * Added colset `acled_protest`: 44 | ``` 45 | - acled_dummy_pr 46 | - tlag_0_acled_dummy_pr 47 | - acled_count_pr 48 | ``` 49 | 50 | 51 | **Retrained models at cm**: 52 | * cm_sb_cfshort 53 | * cm_sb_cflong 54 | * cm_sb_acled_violence 55 | * cm_sb_acled_protest 56 | * cm_sbonset24_25_all 57 | * cm_sb_all_global 58 | 59 | 60 | **Retrained models at pgm**: 61 | * pgm_sb_hist_legacy 62 | * pgm_sb_allthemes 63 | * pgm_sb_onset24_100_all 64 | * pgm_sb_onset24_1_all 65 | * pgm_sb_all_gxgb -------------------------------------------------------------------------------- /projects/prediction_competition/README.md: -------------------------------------------------------------------------------- 1 | # Welcome 2 | 3 | Welcome to the ViEWS prediction competition. 4 | 5 | # Getting started 6 | 7 | See the README.md in the root of this repository for installing and starting the jupyter notebook server. 
8 | Then navigate to this directory in the jupyter notebook browser window and open benchmark_notebook.ipynb 9 | 10 | Good luck! -------------------------------------------------------------------------------- /projects/replication_jpr_2020/README.md: -------------------------------------------------------------------------------- 1 | # JPR 2020 replication 2 | 3 | This directory contains a .zip of the code in the repository used to publish the ViEWS paper in JPR in 2020. -------------------------------------------------------------------------------- /projects/replication_jpr_2020/gitlab_mirror/views_jpr_2020_code.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UppsalaConflictDataProgram/OpenViEWS2/7eb3e63c8c046de31f70cd56f417fadf03686f5a/projects/replication_jpr_2020/gitlab_mirror/views_jpr_2020_code.zip -------------------------------------------------------------------------------- /run_tools.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | clear 4 | 5 | # Stop on non-zero exit 6 | # We don't want to lint if the tests fail 7 | set -e 8 | 9 | echo "Initalising conda for this shell" 10 | eval "$(conda shell.bash hook)" 11 | conda activate views2 12 | 13 | 14 | echo "Black" 15 | black -l 79 views 16 | black -l 79 projects 17 | black -l 79 tests 18 | black -l 79 runners 19 | 20 | echo "mypy views" 21 | mypy views 22 | #echo "mypy projects" 23 | # mypy projects/* 24 | mypy runners 25 | echo "mypy tests" 26 | mypy tests 27 | 28 | 29 | echo "Running pytest with coverage" 30 | coverage run --source views -m pytest -c misc/pytest.ini tests/ 31 | coverage report --show-missing 32 | 33 | # Allow non-zero exit for lints 34 | set +e 35 | 36 | echo "flake8" 37 | # Ignores are for black conflicts, black wins 38 | flake8 --ignore=E203,W503 views 39 | flake8 --ignore=E203,W503 projects 40 | 41 | echo "pylint" 42 | pylint views 43 | 44 | echo "Generating docs" 45 | # Clear existing generated docs 46 | rm -f docs/source/* 47 | # Auto-generate new docs 48 | # --module-frist makes Package __init__ come before all the submodules 49 | # See https://www.sphinx-doc.org/en/master/man/sphinx-apidoc.html#options 50 | sphinx-apidoc --module-first -o docs/source/ views 51 | # Make HTML docs 52 | make -C docs/ html 53 | 54 | git status 55 | -------------------------------------------------------------------------------- /runners/README.md: -------------------------------------------------------------------------------- 1 | # Runners 2 | 3 | Runners are entrypoint scripts to ViEWS functionality for 4 | 5 | * Training 6 | * Predicting 7 | * Evaluating 8 | 9 | They should be as simple as possible, with complexity handled in the apps themselves. 10 | Entrypoints should only handle 11 | 12 | * Dealing with execution context (slurm, conda etc). 
13 | * Parsing arguments 14 | * Logging 15 | * Executing the correct functionality from modules 16 | 17 | -------------------------------------------------------------------------------- /runners/export_data.py: -------------------------------------------------------------------------------- 1 | """ Refresh all datasets that are defined by the specs """ 2 | 3 | import logging 4 | import views 5 | 6 | logging.basicConfig( 7 | level=logging.DEBUG, 8 | format=views.config.LOGFMT, 9 | handlers=[ 10 | logging.FileHandler(views.utils.log.get_log_path(__file__)), 11 | logging.StreamHandler(), 12 | ], 13 | ) 14 | log = logging.getLogger(__name__) 15 | 16 | 17 | def run_export_tables_and_geoms() -> None: 18 | views.apps.data.public.export_tables_and_geoms( 19 | tables=views.TABLES, 20 | geometries=views.GEOMETRIES, 21 | dir_output=views.DIR_SCRATCH, 22 | ) 23 | 24 | 25 | def main(): 26 | run_export_tables_and_geoms() 27 | 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /runners/import_data.py: -------------------------------------------------------------------------------- 1 | """ Import data to local cache """ 2 | 3 | import argparse 4 | import logging 5 | from typing import Optional, Tuple 6 | import views 7 | 8 | logging.basicConfig( 9 | level=logging.DEBUG, 10 | format=views.config.LOGFMT, 11 | handlers=[ 12 | logging.FileHandler(views.utils.log.get_log_path(__file__)), 13 | logging.StreamHandler(), 14 | ], 15 | ) 16 | log = logging.getLogger(__name__) 17 | 18 | 19 | def parse_args() -> Tuple[Optional[str], bool, bool]: 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument( 22 | "--path_zip", type=str, help="Path to zip to import from", 23 | ) 24 | parser.add_argument( 25 | "--fetch", action="store_true", help="Fetch from website." 26 | ) 27 | parser.add_argument( 28 | "--datasets", action="store_true", help="Refresh datasets." 
29 | ) 30 | args = parser.parse_args() 31 | 32 | if args.path_zip and args.fetch: 33 | raise RuntimeError("Pass in --path_zip or --fetch, not both.") 34 | 35 | return args.path_zip, args.fetch, args.datasets 36 | 37 | 38 | def run_import_tables_and_geoms(path_zip) -> None: 39 | views.apps.data.public.import_tables_and_geoms( 40 | tables=views.TABLES, geometries=views.GEOMETRIES, path_zip=path_zip, 41 | ) 42 | 43 | 44 | def refresh_datasets() -> None: 45 | 46 | log.info("Started refreshing all datasets.") 47 | 48 | datasets_to_update = [ 49 | "cm_global_imp_0", 50 | "cm_africa_imp_0", 51 | "pgm_africa_imp_0", 52 | ] 53 | for dataset_name in datasets_to_update: 54 | log.info(f"Started refreshing dataset {dataset_name}") 55 | views.DATASETS[dataset_name].refresh(do_transforms=False) 56 | 57 | log.info("Finished refreshing all imp_0 datasets.") 58 | 59 | 60 | def main() -> None: 61 | 62 | path_zip, do_fetch, do_datasets = parse_args() 63 | 64 | if do_fetch: 65 | path_zip = views.apps.data.public.fetch_latest_zip_from_website( 66 | path_dir_destination=views.DIR_SCRATCH 67 | ) 68 | 69 | run_import_tables_and_geoms(path_zip) 70 | refresh_datasets() 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /runners/predict.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import logging 5 | from typing import Tuple, List 6 | 7 | from views import DATASETS 8 | from views.apps.model import api 9 | from views.apps.pipeline import ( 10 | predict, 11 | models_cm, 12 | models_pgm, 13 | ensembles_cm, 14 | ensembles_pgm, 15 | ) 16 | from views.config import LOGFMT 17 | from views.utils.log import get_log_path 18 | from views.utils.data import assign_into_df 19 | 20 | 21 | logging.basicConfig( 22 | level=logging.DEBUG, 23 | format=LOGFMT, 24 | handlers=[ 25 | logging.FileHandler(get_log_path(__file__)), 26 | logging.StreamHandler(), 27 | ], 28 | ) 29 | 30 | log = logging.getLogger(__name__) 31 | 32 | 33 | def predict_cm_models(run_id: str, n_cores: int) -> None: 34 | """ Predict with all CM models """ 35 | dataset = DATASETS["cm_africa_imp_0"] 36 | models = models_cm.all_cm_models 37 | predict.predict_models( 38 | models=models, dataset=dataset, run_id=run_id, n_cores=n_cores 39 | ) 40 | 41 | 42 | def predict_pgm_models(run_id: str, n_cores: int) -> None: 43 | """ Predict with all PGM models """ 44 | dataset = DATASETS["pgm_africa_imp_0"] 45 | models = models_pgm.all_pgm_models 46 | predict.predict_models( 47 | models=models, dataset=dataset, run_id=run_id, n_cores=n_cores 48 | ) 49 | 50 | 51 | def predict_cm_ensembles(run_id: str, n_cores: int) -> None: 52 | """ Predict with all CM ensembles """ 53 | ensembles = ensembles_cm.all_cm_ensembles 54 | dataset = DATASETS["cm_africa_imp_0"] 55 | predict.predict_ensembles(ensembles, dataset, run_id, n_cores=n_cores) 56 | 57 | 58 | def predict_pgm_ensembles(run_id: str, n_cores: int) -> None: 59 | """ Predict with all PGM ensembles """ 60 | ensembles = ensembles_pgm.all_pgm_ensembles 61 | dataset = DATASETS["pgm_africa_imp_0"] 62 | predict.predict_ensembles(ensembles, dataset, run_id, n_cores=n_cores) 63 | 64 | 65 | def predict_pgm_ensembles_and_constituent(run_id: str, n_cores: int) -> None: 66 | """ Predict all PGM ensembles and their constituent models """ 67 | log.info(f"Predicting PGM ensembles and their constituent models") 68 | ensembles = ensembles_pgm.all_pgm_ensembles 69 | models: 
List[api.Model] = [] 70 | for ensemble in ensembles: 71 | for model in ensemble.models: 72 | if not any([m for m in models if m.name == model.name]): 73 | models.append(model) 74 | 75 | dataset = DATASETS["pgm_africa_imp_0"] 76 | predict.predict_models( 77 | models=models, dataset=dataset, run_id=run_id, n_cores=n_cores 78 | ) 79 | predict.predict_ensembles( 80 | ensembles=ensembles, dataset=dataset, run_id=run_id, n_cores=n_cores 81 | ) 82 | 83 | 84 | def predict_cm_ensembles_and_constituent(run_id: str, n_cores: int) -> None: 85 | """ Predict all cm ensembles and their constituent models """ 86 | log.info(f"Predicting CM ensembles and their constituent models") 87 | ensembles = ensembles_cm.all_cm_ensembles 88 | models: List[api.Model] = [] 89 | for ensemble in ensembles: 90 | for model in ensemble.models: 91 | if not any([m for m in models if m.name == model.name]): 92 | models.append(model) 93 | 94 | dataset = DATASETS["cm_africa_imp_0"] 95 | predict.predict_models( 96 | models=models, dataset=dataset, run_id=run_id, n_cores=n_cores 97 | ) 98 | predict.predict_ensembles( 99 | ensembles=ensembles, dataset=dataset, run_id=run_id, n_cores=n_cores 100 | ) 101 | 102 | 103 | def parse_args() -> Tuple[str, bool, bool, bool, bool, int]: 104 | parser = argparse.ArgumentParser() 105 | parser.add_argument( 106 | "--pgm", action="store_true", help="Predict PGM models?" 107 | ) 108 | parser.add_argument("--cm", action="store_true", help="Predict CM models?") 109 | parser.add_argument( 110 | "--run_id", type=str, help="Run ID to predict for", required=True 111 | ) 112 | parser.add_argument( 113 | "--model", action="store_true", help="Make model predictions" 114 | ) 115 | parser.add_argument( 116 | "--ensemble", action="store_true", help="Make ensemble predictions" 117 | ) 118 | parser.add_argument("--n_cores", type=int, choices=range(0, 40), default=4) 119 | args = parser.parse_args() 120 | 121 | return ( 122 | args.run_id, 123 | args.pgm, 124 | args.cm, 125 | args.model, 126 | args.ensemble, 127 | args.n_cores, 128 | ) 129 | 130 | 131 | def main(): 132 | run_id, do_pgm, do_cm, do_model, do_ensemble, n_cores = parse_args() 133 | log.info( 134 | f"predict running with flags " 135 | f"run_id {run_id} do_pgm {do_pgm} do_cm {do_cm} do_model " 136 | f"{do_model} do_ensemble {do_ensemble}" 137 | ) 138 | 139 | if do_model and do_ensemble: 140 | if do_cm: 141 | predict_cm_ensembles_and_constituent(run_id, n_cores) 142 | if do_pgm: 143 | predict_pgm_ensembles_and_constituent(run_id, n_cores) 144 | elif do_model: 145 | if do_cm: 146 | predict_cm_models(run_id, n_cores) 147 | if do_pgm: 148 | predict_pgm_models(run_id, n_cores) 149 | elif do_ensemble: 150 | if do_cm: 151 | predict_cm_ensembles(run_id, n_cores) 152 | if do_pgm: 153 | predict_pgm_ensembles(run_id, n_cores) 154 | else: 155 | log.info(f"Nothing to do! 
Run predict.py --help to show args.") 156 | 157 | 158 | if __name__ == "__main__": 159 | try: 160 | main() 161 | except: 162 | log.exception(f"Something broke") 163 | raise 164 | -------------------------------------------------------------------------------- /runners/predict_slurm.py: -------------------------------------------------------------------------------- 1 | """ Train all models via slurm """ 2 | import argparse 3 | import os 4 | import sys 5 | import logging 6 | from typing import Tuple 7 | 8 | from views.apps.slurm.slurm import run_command 9 | from views.config import LOGFMT 10 | from views.utils.log import get_log_path 11 | 12 | logging.basicConfig( 13 | level=logging.DEBUG, 14 | format=LOGFMT, 15 | handlers=[ 16 | logging.FileHandler(get_log_path(__file__)), 17 | logging.StreamHandler(), 18 | ], 19 | ) 20 | 21 | 22 | def parse_args() -> Tuple[str, bool, bool, int, int]: 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--pgm", action="store_true", help="Predict PGM?") 25 | parser.add_argument("--cm", action="store_true", help="Predict CM?") 26 | parser.add_argument( 27 | "--run_id", type=str, help="Run ID to predict for", required=True 28 | ) 29 | parser.add_argument("--n_cores", type=int, choices=range(0, 40), default=4) 30 | parser.add_argument("--hours", type=int, choices=range(0, 128), default=24) 31 | args = parser.parse_args() 32 | 33 | return args.run_id, args.pgm, args.cm, args.n_cores, args.hours 34 | 35 | 36 | def _build_command(loa: str, run_id: str, n_cores: int) -> str: 37 | path_runner = os.path.join( 38 | os.path.dirname(os.path.abspath(__file__)), "predict.py" 39 | ) 40 | path_exec = sys.executable 41 | return f"{path_exec} {path_runner} --{loa} --run_id {run_id} --model --ensemble --n_cores {n_cores}" 42 | 43 | 44 | def main() -> None: 45 | 46 | run_id, do_pgm, do_cm, n_cores, hours = parse_args() 47 | 48 | if do_cm: 49 | cmd = _build_command(loa="cm", run_id=run_id, n_cores=n_cores) 50 | run_command(cmd, hours=hours) 51 | 52 | if do_pgm: 53 | cmd = _build_command(loa="pgm", run_id=run_id, n_cores=n_cores) 54 | run_command(cmd, hours=hours) 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /runners/refresh_data.py: -------------------------------------------------------------------------------- 1 | """ Refresh all datasets that are defined by the specs """ 2 | 3 | from typing import Tuple 4 | import argparse 5 | import logging 6 | 7 | import views 8 | 9 | logging.basicConfig( 10 | level=logging.DEBUG, 11 | format=views.config.LOGFMT, 12 | handlers=[ 13 | logging.FileHandler(views.utils.log.get_log_path(__file__)), 14 | logging.StreamHandler(), 15 | ], 16 | ) 17 | log = logging.getLogger(__name__) 18 | 19 | 20 | def parse_args() -> Tuple[bool, bool, bool]: 21 | 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument("--all", action="store_true", help="refresh all") 24 | parser.add_argument( 25 | "--geom", action="store_true", help="refresh geometries" 26 | ) 27 | parser.add_argument("--tables", action="store_true", help="refresh tables") 28 | parser.add_argument( 29 | "--datasets", action="store_true", help="refresh datasets" 30 | ) 31 | args = parser.parse_args() 32 | 33 | do_geom = args.geom 34 | do_tables = args.tables 35 | do_datasets = args.datasets 36 | if args.all: 37 | do_geom, do_tables, do_datasets = True, True, True 38 | 39 | if not any([do_geom, do_tables, do_datasets]): 40 | log.info("Nothing to do, see python refresh_data.py --help 
for args.") 41 | 42 | return do_geom, do_tables, do_datasets 43 | 44 | 45 | def refresh_geometries() -> None: 46 | log.info(f"Refreshing all Geometries") 47 | for geometry in views.GEOMETRIES.values(): 48 | geometry.refresh() 49 | log.info("Finished refreshing all Geometries") 50 | 51 | 52 | def refresh_tables() -> None: 53 | log.info(f"Refreshing all Tables") 54 | for table in views.TABLES.values(): 55 | table.refresh() 56 | log.info("Finished refreshing all Tables") 57 | 58 | 59 | def refresh_datasets() -> None: 60 | log.info(f"Refreshing all Datasets") 61 | for dataset in views.DATASETS.values(): 62 | dataset.refresh() 63 | log.info("Finished refreshing all Datasets") 64 | 65 | 66 | def refresh_all(): 67 | do_geom, do_tables, do_datasets = parse_args() 68 | if do_geom: 69 | refresh_geometries() 70 | if do_tables: 71 | refresh_tables() 72 | if do_datasets: 73 | refresh_datasets() 74 | 75 | 76 | if __name__ == "__main__": 77 | refresh_all() 78 | -------------------------------------------------------------------------------- /runners/refresh_data_slurm.py: -------------------------------------------------------------------------------- 1 | """ Refresh data via slurm """ 2 | 3 | import os 4 | import sys 5 | import logging 6 | 7 | from views.apps.slurm.slurm import run_command 8 | from views.config import LOGFMT 9 | from views.utils.log import get_log_path 10 | 11 | logging.basicConfig( 12 | level=logging.DEBUG, 13 | format=LOGFMT, 14 | handlers=[ 15 | logging.FileHandler(get_log_path(__file__)), 16 | logging.StreamHandler(), 17 | ], 18 | ) 19 | 20 | log = logging.getLogger(__name__) 21 | 22 | 23 | def _build_cmd_refresh_data() -> str: 24 | """ Just get a shell command for starting refersh_data.py """ 25 | 26 | path_runner = os.path.join( 27 | os.path.dirname(os.path.abspath(__file__)), "refresh_data.py" 28 | ) 29 | path_exec = sys.executable 30 | cmd = f"{path_exec} {path_runner} --all" 31 | return cmd 32 | 33 | 34 | def main() -> None: 35 | run_command(command=_build_cmd_refresh_data(), hours=24) 36 | 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /runners/train_all_local.py: -------------------------------------------------------------------------------- 1 | """ Train all models locally """ 2 | 3 | import argparse 4 | import os 5 | import sys 6 | import logging 7 | from typing import Tuple 8 | 9 | from views import DATASETS 10 | from views.apps.model import api 11 | from views.apps.pipeline import models_cm, models_pgm 12 | from views.config import LOGFMT 13 | from views.utils.log import get_log_path 14 | 15 | logging.basicConfig( 16 | level=logging.DEBUG, 17 | format=LOGFMT, 18 | handlers=[ 19 | logging.FileHandler(get_log_path(__file__)), 20 | logging.StreamHandler(), 21 | ], 22 | ) 23 | 24 | log = logging.getLogger(__name__) 25 | 26 | 27 | def parse_args() -> Tuple[bool, bool]: 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--pgm", action="store_true", help="Train PGM models?") 30 | parser.add_argument("--cm", action="store_true", help="Train CM models?") 31 | args = parser.parse_args() 32 | 33 | return args.pgm, args.cm 34 | 35 | 36 | def main() -> None: 37 | do_pgm, do_cm = parse_args() 38 | 39 | if do_pgm: 40 | for model in models_pgm.all_pgm_models: 41 | df = DATASETS["flat_pgm_africa_1"].df 42 | model.fit_estimators(df) 43 | model.save() 44 | 45 | if do_cm: 46 | for model in models_cm.all_cm_models: 47 | if "train_africa" in model.tags: 48 | df = 
DATASETS["flat_cm_africa_1"].df 49 | elif "train_global" in model.tags: 50 | df = DATASETS["flat_cm_global_1"].df 51 | model.fit_estimators(df) 52 | model.save() 53 | 54 | 55 | if __name__ == "__main__": 56 | try: 57 | main() 58 | except: 59 | log.exception(f"Training failed for some reason.") 60 | -------------------------------------------------------------------------------- /runners/train_all_slurm.py: -------------------------------------------------------------------------------- 1 | """ Train all models via slurm """ 2 | 3 | import argparse 4 | import os 5 | import sys 6 | import logging 7 | from typing import Tuple 8 | 9 | from views.apps.model import api 10 | from views.apps.pipeline import models_cm, models_pgm 11 | from views.apps.slurm.slurm import run_command 12 | from views.config import LOGFMT 13 | from views.utils.log import get_log_path 14 | 15 | logging.basicConfig( 16 | level=logging.DEBUG, 17 | format=LOGFMT, 18 | handlers=[ 19 | logging.FileHandler(get_log_path(__file__)), 20 | logging.StreamHandler(), 21 | ], 22 | ) 23 | 24 | log = logging.getLogger(__name__) 25 | 26 | 27 | def parse_args() -> Tuple[bool, bool]: 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--pgm", action="store_true", help="Train PGM models?") 30 | parser.add_argument("--cm", action="store_true", help="Train CM models?") 31 | args = parser.parse_args() 32 | 33 | return args.pgm, args.cm 34 | 35 | 36 | def _build_cmd_train_model(model: api.Model, dataset: str, loa: str) -> str: 37 | path_runner = os.path.join( 38 | os.path.dirname(os.path.abspath(__file__)), "train_model.py" 39 | ) 40 | path_exec = sys.executable 41 | cmd = ( 42 | f"{path_exec} {path_runner} " 43 | f"--model {model.name} " 44 | f"--dataset {dataset} " 45 | f"--loa {loa} " 46 | ) 47 | return cmd 48 | 49 | 50 | def main() -> None: 51 | 52 | train_pgm, train_cm = parse_args() 53 | 54 | # CM 55 | if train_cm: 56 | log.info(f"--cm was passed, training all CM models.") 57 | for model in models_cm.all_cm_models: 58 | if "train_africa" in model.tags: 59 | cmd = _build_cmd_train_model( 60 | model, dataset="flat_cm_africa_1", loa="cm" 61 | ) 62 | elif "train_global" in model.tags: 63 | cmd = _build_cmd_train_model( 64 | model, dataset="flat_cm_global_1", loa="cm" 65 | ) 66 | run_command(cmd) 67 | 68 | # PGM 69 | if train_pgm: 70 | log.info(f"--pgm was passed, training all pgm models.") 71 | for model in models_pgm.all_pgm_models: 72 | cmd = _build_cmd_train_model( 73 | model, dataset="flat_pgm_africa_1", loa="pgm" 74 | ) 75 | run_command(cmd, hours=48) 76 | 77 | 78 | if __name__ == "__main__": 79 | main() 80 | -------------------------------------------------------------------------------- /runners/train_model.py: -------------------------------------------------------------------------------- 1 | """ Command line interface for model training """ 2 | from typing import Tuple 3 | from typing_extensions import Literal 4 | import argparse 5 | import logging 6 | 7 | from views.apps.pipeline import train 8 | from views.config import LOGFMT 9 | from views.utils.log import get_log_path, logtime 10 | 11 | logging.basicConfig( 12 | level=logging.DEBUG, 13 | format=LOGFMT, 14 | handlers=[ 15 | logging.FileHandler(get_log_path(__file__)), 16 | logging.StreamHandler(), 17 | ], 18 | ) 19 | 20 | 21 | def parse_args() -> Tuple[str, str, str]: 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument( 24 | "--loa", type=str, help="Level of analysis, either cm or pgm" 25 | ) 26 | parser.add_argument("--model", type=str, help="name of 
model to train") 27 | parser.add_argument("--dataset", type=str, help="name of dataset") 28 | 29 | args = parser.parse_args() 30 | 31 | assert args.loa in ["am", "cm", "pgm"] 32 | loa: Literal["am", "cm", "pgm"] = args.loa 33 | model: str = args.model 34 | dataset = args.dataset 35 | 36 | return loa, model, dataset 37 | 38 | 39 | @logtime 40 | def main(): 41 | loa, model, dataset = parse_args() 42 | train.train_and_store_model_by_name(loa, model, dataset) 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /runners/train_slurm.py: -------------------------------------------------------------------------------- 1 | """ Train on slurm """ 2 | 3 | import argparse 4 | import os 5 | import sys 6 | import logging 7 | from typing import Tuple, List 8 | 9 | from views.apps.model import api 10 | from views.apps.pipeline import models_cm, models_pgm 11 | from views.apps.slurm.slurm import run_command 12 | from views.config import LOGFMT 13 | from views.utils.log import get_log_path 14 | 15 | logging.basicConfig( 16 | level=logging.DEBUG, 17 | format=LOGFMT, 18 | handlers=[ 19 | logging.FileHandler(get_log_path(__file__)), 20 | logging.StreamHandler(), 21 | ], 22 | ) 23 | 24 | log = logging.getLogger(__name__) 25 | 26 | 27 | def parse_args() -> Tuple[bool, bool, bool, List[str]]: 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument("--pgm", action="store_true", help="Train PGM models?") 30 | parser.add_argument("--cm", action="store_true", help="Train CM models?") 31 | parser.add_argument( 32 | "--all", 33 | action="store_true", 34 | help="Train all models for selected LOAs?", 35 | ) 36 | parser.add_argument( 37 | "--model", 38 | action="append", 39 | help="Train a particular model. 
Pass multiple times for multiple models", 40 | ) 41 | args = parser.parse_args() 42 | 43 | if args.all and args.model: 44 | raise RuntimeError(f"Can't have --all and --model") 45 | 46 | # We don't know which LOA to train for 47 | if args.model and args.cm and args.pgm: 48 | raise RuntimeError(f"Can't have --model, --cm and --pgm") 49 | 50 | return args.pgm, args.cm, args.all, args.model 51 | 52 | 53 | def _build_cmd_train_model(modelname: str, dataset: str, loa: str) -> str: 54 | path_runner = os.path.join( 55 | os.path.dirname(os.path.abspath(__file__)), "train_model.py" 56 | ) 57 | path_exec = sys.executable 58 | cmd = ( 59 | f"{path_exec} {path_runner} " 60 | f"--model {modelname} " 61 | f"--dataset {dataset} " 62 | f"--loa {loa} " 63 | ) 64 | return cmd 65 | 66 | 67 | def main() -> None: 68 | 69 | pgm, cm, train_all, modelnames = parse_args() 70 | 71 | if modelnames: 72 | for modelname in modelnames: 73 | if pgm: 74 | if not modelname in models_pgm.all_pgm_models_by_name: 75 | raise RuntimeError(f"Couldn't find model name {modelname}") 76 | cmd = _build_cmd_train_model( 77 | modelname, dataset="flat_pgm_africa_1", loa="pgm", 78 | ) 79 | elif cm: 80 | # Check we have model 81 | if not modelname in models_cm.all_cm_models_by_name: 82 | raise RuntimeError(f"Couldn't find model name {modelname}") 83 | 84 | model = models_cm.all_cm_models_by_name[modelname] 85 | if "train_africa" in model.tags: 86 | cmd = _build_cmd_train_model( 87 | model.name, dataset="flat_cm_africa_1", loa="cm" 88 | ) 89 | elif "train_global" in model.tags: 90 | cmd = _build_cmd_train_model( 91 | model.name, dataset="flat_cm_global_1", loa="cm" 92 | ) 93 | run_command(cmd) 94 | 95 | # CM 96 | if cm and train_all: 97 | log.info(f"--cm and --all was passed, training all CM models.") 98 | for model in models_cm.all_cm_models: 99 | if "train_africa" in model.tags: 100 | cmd = _build_cmd_train_model( 101 | model.name, dataset="flat_cm_africa_1", loa="cm" 102 | ) 103 | elif "train_global" in model.tags: 104 | cmd = _build_cmd_train_model( 105 | model.name, dataset="flat_cm_global_1", loa="cm" 106 | ) 107 | run_command(cmd) 108 | 109 | # PGM 110 | if pgm and train_all: 111 | log.info(f"--pgm and --all was passed, training all pgm models.") 112 | for model in models_pgm.all_pgm_models: 113 | cmd = _build_cmd_train_model( 114 | model.name, dataset="flat_pgm_africa_1", loa="pgm" 115 | ) 116 | run_command(cmd, hours=48) 117 | 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /runners/update_database.py: -------------------------------------------------------------------------------- 1 | """ Update a source in the database """ 2 | from typing import Tuple 3 | import argparse 4 | import logging 5 | import views 6 | from views.database.sources import ( 7 | acled, 8 | cdum, 9 | fvp, 10 | ged, 11 | icgcw, 12 | pgdata, 13 | reign, 14 | spei, 15 | vdem, 16 | wdi, 17 | ) 18 | 19 | logging.basicConfig( 20 | level=logging.DEBUG, 21 | format=views.config.LOGFMT, 22 | handlers=[ 23 | logging.FileHandler(views.utils.log.get_log_path(__file__)), 24 | logging.StreamHandler(), 25 | ], 26 | ) 27 | 28 | log = logging.getLogger(__name__) 29 | 30 | 31 | def parse_args() -> Tuple[ 32 | bool, bool, bool, bool, bool, bool, bool, bool, bool, bool, bool 33 | ]: 34 | parser = argparse.ArgumentParser() 35 | 36 | parser.add_argument( 37 | "--nofetch", action="store_true", help="No fetch, only load." 
38 | ) 39 | parser.add_argument("--wdi", action="store_true", help="Update WDI") 40 | parser.add_argument("--vdem", action="store_true", help="Update VDEM") 41 | parser.add_argument("--acled", action="store_true", help="Update ACLED") 42 | parser.add_argument("--ged", action="store_true", help="Update GED") 43 | parser.add_argument("--icgcw", action="store_true", help="Update ICGCW") 44 | parser.add_argument( 45 | "--pgdata", action="store_true", help="Update Priogrid" 46 | ) 47 | parser.add_argument("--spei", action="store_true", help="Update SPEI") 48 | parser.add_argument("--fvp", action="store_true", help="Update FVP") 49 | parser.add_argument( 50 | "--cdum", action="store_true", help="Update country dummies" 51 | ) 52 | parser.add_argument("--reign", action="store_true", help="Update REIGN") 53 | 54 | args = parser.parse_args() 55 | 56 | return ( 57 | args.nofetch, 58 | args.wdi, 59 | args.vdem, 60 | args.acled, 61 | args.ged, 62 | args.icgcw, 63 | args.pgdata, 64 | args.spei, 65 | args.fvp, 66 | args.cdum, 67 | args.reign, 68 | ) 69 | 70 | 71 | def main(): 72 | 73 | ( 74 | nofetch, 75 | do_wdi, 76 | do_vdem, 77 | do_acled, 78 | do_ged, 79 | do_icgcw, 80 | do_pgdata, 81 | do_spei, 82 | do_fvp, 83 | do_cdum, 84 | do_reign, 85 | ) = parse_args() 86 | 87 | if do_wdi: 88 | if not nofetch: 89 | wdi.fetch_wdi() 90 | wdi.load_wdi() 91 | 92 | if do_vdem: 93 | if not nofetch: 94 | vdem.fetch_vdem() 95 | vdem.load_vdem() 96 | 97 | if do_acled: 98 | if not nofetch: 99 | acled.fetch_acled() 100 | acled.load_acled() 101 | 102 | if do_ged: 103 | if not nofetch: 104 | ged.fetch_ged() 105 | ged.load_ged() 106 | 107 | if do_icgcw: 108 | if not nofetch: 109 | icgcw.fetch_icgcw() 110 | icgcw.load_icgcw() 111 | 112 | if do_pgdata: 113 | if not nofetch: 114 | pgdata.fetch_pgdata() 115 | pgdata.load_pgdata() 116 | 117 | if do_spei: 118 | if not nofetch: 119 | spei.fetch_spei() 120 | spei.load_spei() 121 | 122 | if do_fvp: 123 | if not nofetch: 124 | fvp.fetch_fvp() 125 | fvp.load_fvp() 126 | 127 | if do_cdum: 128 | if not nofetch: 129 | cdum.fetch_cdum() 130 | cdum.load_cdum() 131 | 132 | if do_reign: 133 | if not nofetch: 134 | reign.fetch_reign() 135 | reign.load_reign() 136 | 137 | 138 | if __name__ == "__main__": 139 | try: 140 | main() 141 | except: 142 | log.exception(f"Something went wrong in update_database.py") 143 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ Setup file for the views project """ 2 | import os 3 | from setuptools import setup 4 | 5 | def main(): 6 | """ Do the setup """ 7 | setup(name='views', 8 | version='0.0.1', 9 | author="ViEWS Team", 10 | install_requires=[]) 11 | 12 | if __name__ == "__main__": 13 | main() 14 | -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- 1 | # Tests 2 | 3 | Tests live here and should be run before pushing. 4 | Run the full suite with the `run_tools.sh` script in the root of the repo. 
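If you only want the pytest portion of the suite while iterating on a single module, a minimal sketch is shown below (this assumes `pytest` is installed in the active environment; `run_tools.sh` remains the canonical entry point):

```python
# Run the test directory directly with pytest and exit with its status code.
import sys

import pytest

sys.exit(pytest.main(["tests/", "-v"]))
```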
5 | -------------------------------------------------------------------------------- /tests/test_calibration.py: -------------------------------------------------------------------------------- 1 | import pandas as pd # type: ignore 2 | import numpy as np # type: ignore 3 | from views.apps.model.calibration import calibrate_real 4 | from views.utils import mocker 5 | 6 | 7 | def test_calibrate_real_perfect_on_calib() -> None: 8 | """ Test that calibrated values when test=calib match perfectly """ 9 | 10 | df = mocker.DfMocker(datatypes=["reals"]).df 11 | 12 | s_calibrated = calibrate_real( 13 | s_test_pred=df["r_a"], 14 | s_calib_pred=df["r_a"], 15 | s_calib_actual=df["r_b"], 16 | ) 17 | 18 | assert np.isclose(s_calibrated.mean(), df["r_b"].mean()) 19 | assert np.isclose(s_calibrated.std(), df["r_b"].std()) 20 | 21 | 22 | def test_calibrate_real_scales_right_way() -> None: 23 | """ Test that calibration shifts mean the right way """ 24 | 25 | calib_pred = [100, 200] # <- Off by factor 0.5 26 | calib_actual = [50, 100] 27 | test_pred = [200, 400] 28 | test_expected = [100, 200] # <- test_pred * 0.5 29 | 30 | s_calibrated = calibrate_real( 31 | s_test_pred=pd.Series(test_pred), 32 | s_calib_pred=pd.Series(calib_pred), 33 | s_calib_actual=pd.Series(calib_actual), 34 | ) 35 | 36 | assert all(s_calibrated == pd.Series(test_expected)) 37 | -------------------------------------------------------------------------------- /tests/test_db.py: -------------------------------------------------------------------------------- 1 | """ Tests for views.utils.db 2 | 3 | @TODO: Add a testing db... 4 | """ 5 | import pytest # type: ignore 6 | from views.utils import db 7 | 8 | 9 | def test_unpack_fqtable() -> None: 10 | """ Test unpack fqtable """ 11 | assert db._unpack_fqtable("schema.table") == ("schema", "table") 12 | -------------------------------------------------------------------------------- /tests/test_misc_utils.py: -------------------------------------------------------------------------------- 1 | """ Test misc utils that don't fit anywhere else """ 2 | from views.utils import misc 3 | 4 | 5 | def test_lists_disjoint() -> None: 6 | a = [1, 2] 7 | b = [3, 4] 8 | c = [5, 6] 9 | d = [6, 7] # 6 shared with c 10 | assert not misc.lists_disjoint([a, b, c, d]) 11 | assert misc.lists_disjoint([a, b, c]) 12 | -------------------------------------------------------------------------------- /tests/test_specs.py: -------------------------------------------------------------------------------- 1 | """ Test the specs interface """ 2 | import pytest # type: ignore 3 | import yaml 4 | from views.specs import models 5 | 6 | SPEC_TEST = yaml.safe_load( 7 | """ 8 | colsets: 9 | colset_z: 10 | - zeus 11 | colset_a: 12 | - asda 13 | - bobo 14 | colset_b: 15 | - bertil 16 | - cesar 17 | colset_c: 18 | - cesar 19 | - david 20 | colset_steve: 21 | - steven 22 | - dave 23 | themes: 24 | theme_a: 25 | - colset_a 26 | - colset_b 27 | theme_b: 28 | - colset_b 29 | - colset_c 30 | theme_nested: 31 | - theme_a 32 | - theme_b 33 | theme_supernested: 34 | - colset_steve 35 | - theme_nested 36 | 37 | formulas: 38 | o: 39 | col_outcome: asda 40 | cols_features: theme_supernested 41 | """ 42 | ) 43 | 44 | SPEC_TEST_BROKEN = yaml.safe_load( 45 | """ 46 | colsets: 47 | colset_a: 48 | - asda 49 | - bobo 50 | colset_b: 51 | - bertil 52 | - cesar 53 | themes: 54 | theme_a: 55 | - colset_a 56 | - colset_b 57 | - missing_key 58 | 59 | formulas: 60 | o: 61 | col_outcome: asda 62 | cols_features: theme_supernested 63 | """ 64 | ) 65 | 66 | 
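# How the nested spec above should flatten (derived from the expected values
# asserted in test_solver below, not from the solver implementation itself):
#   theme_supernested -> colset_steve + theme_nested
#   theme_nested      -> theme_a + theme_b
#   theme_a + theme_b -> colset_a + colset_b + colset_c
# yielding the deduplicated, sorted feature list
#   ["asda", "bertil", "bobo", "cesar", "dave", "david", "steven"].
# colset_z is defined but never referenced by formula "o", so "zeus" is absent.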
67 | def test_spec_models(): 68 | assert isinstance(models.cm, dict) 69 | # assert isinstance(models.solver.solved_cm(), dict) 70 | 71 | 72 | def test_solver(): 73 | """ Test that solver solves properly """ 74 | filled_formulas = models.solver.solve_formulas(SPEC_TEST) 75 | 76 | wanted = ["steven", "dave", "asda", "bobo", "bertil", "cesar", "david"] 77 | assert filled_formulas["o"]["cols_features"] == sorted(wanted) 78 | 79 | with pytest.raises(RuntimeError) as excinfo: 80 | 81 | _ = models.solver.solve_formulas(SPEC_TEST_BROKEN) 82 | assert "No match for missing_key in" in str(excinfo.value) 83 | -------------------------------------------------------------------------------- /tests/test_structure.py: -------------------------------------------------------------------------------- 1 | """ Test import structure """ 2 | 3 | 4 | def test_import_views() -> None: 5 | """ Test that views can be imported from the top """ 6 | import views 7 | -------------------------------------------------------------------------------- /tests/test_transforms_api.py: -------------------------------------------------------------------------------- 1 | import pandas as pd # type: ignore 2 | import pytest # type: ignore 3 | from views.apps.data import api 4 | from views.apps.transforms import lib 5 | from views.utils import mocker 6 | 7 | 8 | def test_col_cols_ok() -> None: 9 | 10 | # These are ok 11 | t = api.Transform(name="testname", f="rollmax", cols=["a", "b"]) 12 | t = api.Transform(name="testname", f="rollmax", col="a") 13 | 14 | 15 | def test_col_col_not_ok() -> None: 16 | # Not OK, col is list 17 | with pytest.raises(TypeError) as exc: 18 | t = api.Transform(name="testname", f="rollmax", col=["a", "b"]) 19 | assert "col should be string" in str(exc.value) 20 | 21 | 22 | def test_col_cols_not_ok() -> None: 23 | # Not OK, cols is str 24 | with pytest.raises(TypeError) as exc: 25 | t = api.Transform(name="testname", f="rollmax", cols="a") 26 | assert "col should be string" in str(exc.value) 27 | 28 | 29 | def test_f_unknown() -> None: 30 | with pytest.raises(KeyError) as exc: 31 | t = api.Transform(name="testname", f="unknown", col="a") 32 | assert "following values of f are recognised:" in str(exc.value) 33 | 34 | 35 | def test_f_missing() -> None: 36 | with pytest.raises(KeyError) as exc: 37 | t = api.Transform(name="testname", col="a") 38 | assert "Transformer needs a 'f' field" in str(exc.value) 39 | 40 | 41 | def test_compute() -> None: 42 | t = api.Transform(name="testname", f="tlag", col="b_a", time=1) 43 | df = mocker.DfMocker().df 44 | 45 | s_t = t.compute(df) 46 | s_raw = lib.tlag(s=df["b_a"], time=1) 47 | pd.testing.assert_series_equal(s_t, s_raw) 48 | -------------------------------------------------------------------------------- /tests/test_transforms_lib.py: -------------------------------------------------------------------------------- 1 | import pandas as pd # type: ignore 2 | from views.apps.transforms import lib 3 | 4 | 5 | def test_onset() -> None: 6 | """ Test onset formulation """ 7 | 8 | c_id = pd.Series([1, 2, 3], name="c_id") 9 | t = pd.Series(list(range(1, 11)), name="t") 10 | events_c1 = [0, 0, 0, 0, 1, 1, 1, 0, 0, 0] # Events 11 | onsets_c1 = [0, 0, 0, 0, 1, 0, 0, 0, 0, 0] # Wanted onsets 12 | onspos_c1 = [1, 1, 1, 1, 1, 0, 0, 0, 0, 0] # Wanted onsets_possible 13 | 14 | events_c2 = [0, 1, 0, 1, 0, 0, 1, 0, 0, 0] 15 | onsets_c2 = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0] 16 | onspos_c2 = [1, 1, 0, 0, 0, 0, 0, 0, 0, 0] 17 | 18 | events_c3 = [0, 0, 1, 1, 1, 1, 0, 0, 0, 1] 19 | onsets_c3 = [0, 
0, 1, 0, 0, 0, 0, 0, 0, 1] 20 | onspos_c3 = [1, 1, 1, 0, 0, 0, 0, 0, 0, 1] 21 | 22 | df = ( 23 | pd.DataFrame( 24 | { 25 | "event": events_c1 + events_c2 + events_c3, 26 | "wanted_onset_possible_3": onspos_c1 + onspos_c2 + onspos_c3, 27 | "wanted_onset_3": onsets_c1 + onsets_c2 + onsets_c3, 28 | }, 29 | index=pd.MultiIndex.from_product([c_id, t]), 30 | ) 31 | .swaplevel() 32 | .sort_index() 33 | ) 34 | 35 | df["onset_possible_3"] = lib.onset_possible(s=df["event"], window=3) 36 | df["onset_3"] = lib.onset(s=df["event"], window=3) 37 | 38 | pd.testing.assert_series_equal( 39 | df["onset_3"], df["wanted_onset_3"], check_names=False 40 | ) 41 | pd.testing.assert_series_equal( 42 | df["onset_possible_3"], 43 | df["wanted_onset_possible_3"], 44 | check_names=False, 45 | ) 46 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | """ Tests for utilities """ 2 | from views import utils 3 | 4 | 5 | def test_passing() -> None: 6 | pass 7 | -------------------------------------------------------------------------------- /tests/test_utils_data.py: -------------------------------------------------------------------------------- 1 | import pytest # type: ignore 2 | import pandas as pd # type: ignore 3 | 4 | from views.utils.mocker import DfMocker 5 | from views.utils import data 6 | 7 | 8 | def test_assign_into_df() -> None: 9 | 10 | df_a = DfMocker(n_t=20).df 11 | df_b = df_a.copy() 12 | df_into = df_a.loc[:, []].copy() 13 | 14 | # Test we get the full frame if we give all times 15 | df_into = data.assign_into_df(df_to=df_into, df_from=df_a.loc[0:9]) 16 | df_into = data.assign_into_df(df_to=df_into, df_from=df_a.loc[10:19]) 17 | pd.testing.assert_frame_equal(df_a, df_into, check_dtype=False) 18 | 19 | # Test we get missing if we don't give all cols 20 | df_into = df_a.loc[:, []].copy() 21 | df_into = data.assign_into_df(df_to=df_into, df_from=df_a.loc[0:3]) 22 | df_into = data.assign_into_df(df_to=df_into, df_from=df_a.loc[10:19]) 23 | with pytest.raises(AssertionError): 24 | pd.testing.assert_frame_equal(df_a, df_into, check_dtype=False) 25 | -------------------------------------------------------------------------------- /views/__init__.py: -------------------------------------------------------------------------------- 1 | """ The views package """ 2 | 3 | __all__ = [ 4 | "apps", 5 | "database", 6 | "specs", 7 | "utils", 8 | "ROOTDIR", 9 | "DATASETS", 10 | "DIR_STORAGE", 11 | "DIR_SCRATCH", 12 | "Model", 13 | "Ensemble", 14 | "Period", 15 | "Downsampling", 16 | "Transform", 17 | ] 18 | 19 | import os 20 | 21 | from . 
import apps, config, database, specs, utils 22 | from .apps.model.api import Model, Ensemble, Period, Downsampling 23 | from .apps.data.api import Transform 24 | 25 | ROOTDIR = os.path.dirname(__file__) 26 | DIR_STORAGE = config.DIR_STORAGE 27 | DIR_SCRATCH = config.DIR_SCRATCH 28 | GEOMETRIES = specs.data.GEOMETRIES 29 | TABLES = specs.data.TABLES 30 | DATASETS = specs.data.DATASETS 31 | 32 | 33 | def _setup_dirstructure() -> None: 34 | """ Setup storage directory structure """ 35 | dirs = [ 36 | DIR_STORAGE, 37 | os.path.join(DIR_STORAGE, "data", "datasets"), 38 | os.path.join(DIR_STORAGE, "data", "geometries"), 39 | os.path.join(DIR_STORAGE, "data", "raw"), 40 | os.path.join(DIR_STORAGE, "data", "tables"), 41 | os.path.join(DIR_STORAGE, "logs"), 42 | os.path.join(DIR_STORAGE, "logs"), 43 | os.path.join(DIR_STORAGE, "models"), 44 | os.path.join(DIR_STORAGE, "pipeline", "predictions"), 45 | os.path.join(DIR_STORAGE, "scratch"), 46 | ] 47 | for path_dir in dirs: 48 | utils.io.create_directory(path_dir) 49 | 50 | 51 | _setup_dirstructure() 52 | -------------------------------------------------------------------------------- /views/apps/__init__.py: -------------------------------------------------------------------------------- 1 | """ Views applications """ 2 | __all__ = [ 3 | "data", 4 | "ensemble", 5 | "evaluation", 6 | "extras", 7 | "model", 8 | "pipeline", 9 | "slurm", 10 | "transforms", 11 | "plot", 12 | ] 13 | from . import ( 14 | data, 15 | ensemble, 16 | evaluation, 17 | extras, 18 | model, 19 | pipeline, 20 | slurm, 21 | transforms, 22 | plot, 23 | ) 24 | -------------------------------------------------------------------------------- /views/apps/data/README.md: -------------------------------------------------------------------------------- 1 | ## Dataset 2 | 3 | 4 | 5 | ## Transforms 6 | 7 | ViEWS has a number of transformation functions built in. 8 | For implementation details see the file views/apps/transforms/lib.py 9 | where each transformation is defined in python code. 10 | 11 | The naming convention is simple, source columns are prefixed with transformation names and parameters of the transformation. 12 | For example: `tlag_1_ged_dummy_sb` is the time lag of 1 month of ged_dummy_sb. 13 | Transformations can of course be chained. 14 | For example: `time_since_greq_100_ged_best_sb` is the time since ged_best_sb (the best estimate of state-based deaths from GED) was greater or equal to 100. 15 | Notice that order matters. 16 | For example: `splag_1_1_time_since_ged_dummy_sb` is the first order spatial lag of time since ged_dummy_sb. This becomes a very large number as the spatial lag is the sum across the neighboring cells, which evaluates to a sum across many times_since. 17 | This is different from `time_since_splag_1_1_ged_dummy_sb` which evaluates to the time since any neighboring cell had a ged_dummy_sb event. 18 | 19 | ### summ (sum) 20 | 21 | Compute the sum of columns. 
Names should 22 | 23 | ### product (product) 24 | 25 | ### delta (delta) 26 | 27 | ### greater_or_equal (greq) 28 | 29 | ### smaller_or_equal (smeq) 30 | 31 | ### in_range (in_range) 32 | 33 | ### tlag (tlag) 34 | 35 | ### tlead (tlead) 36 | 37 | ### moving_average (ma) 38 | 39 | ### cweq (cweq) 40 | 41 | ### time_since (time_since) 42 | 43 | ### decay (decay) 44 | 45 | ### mean (mean) 46 | 47 | ### ln (ln) 48 | 49 | ### demean (demean) 50 | 51 | ### rollmax (rollmax) 52 | 53 | ### onset_possible (onset_possible) 54 | 55 | ### onset (onset) 56 | 57 | ### distance_to_event (spdist) 58 | 59 | ### spacetime_distance_to_event (stdist) 60 | 61 | ### spatial_lag (splag) 62 | -------------------------------------------------------------------------------- /views/apps/data/__init__.py: -------------------------------------------------------------------------------- 1 | """ Dataset and transforms API """ 2 | 3 | __all__ = [ 4 | "GeomCountry", 5 | "GeomPriogrid", 6 | "Dataset", 7 | "Transform", 8 | "export_tables_and_geoms", 9 | "import_tables_and_geoms", 10 | "fetch_latest_zip_from_website", 11 | ] 12 | from .api import GeomCountry, GeomPriogrid, Dataset, Transform 13 | from .public import ( 14 | export_tables_and_geoms, 15 | import_tables_and_geoms, 16 | fetch_latest_zip_from_website, 17 | ) 18 | -------------------------------------------------------------------------------- /views/apps/data/export_readme/README.md: -------------------------------------------------------------------------------- 1 | # Views Tables and Geoms 2 | 3 | This is a data export from the ViEWS project. 4 | Python code is available for joining these files into usable datasets at priogrid-month and country-month level and computing a large set of transformations on them. 5 | See https://github.com/UppsalaConflictDataProgram/OpenViEWS2 for instructions on how to get started. 6 | 7 | If you don't wish to use the python tooling but instead prepare your own data, read on. 8 | There are three types of files here: 9 | 10 | * Skeleton tables, that represent a level of analysis in ViEWS and hold identifiers. 11 | * Data tables, that hold imputed source data at their native level of analysis. 12 | * Geometries in .geojson format to enable plotting and spatial transformations. 13 | 14 | The key identifiers are: 15 | 16 | * country_id, which corresponds to ids from cshapes. 17 | * pg_id, PRIO-GRID ids. 18 | * year 19 | * month_id, an incremental month identifier where 1 is 1980-01 20 | 21 | Skeletons are available for: 22 | 23 | * priogrid-month (pgm) 24 | * priogrid-year (pgy) 25 | * country-month (cm) 26 | * country-year (cy) 27 | 28 | Included data sources are: 29 | 30 | * ACLED, from https://www.acleddata.com/ 31 | * FVP, a custom dataset of CY data. 32 | * GED, from the UCDP 33 | * pgdata, from PRIOGRID 34 | * REIGN, from https://oefdatascience.github.io/REIGN.github.io 35 | * SPEI, from https://spei.csic.es/map/maps.html 36 | * VDEM, from https://www.v-dem.net/en/ 37 | * WDI, from http://datatopics.worldbank.org/world-development-indicators/ 38 | 39 | Data tables containing "\_imp\_sklearn\_number" are imputed using scikit-learn in 5 different imputations and should have no missingness in numeric columns. 40 | Data tables ending in \_part\_number are partitioned to work around column number limitations in our database. 41 | 42 | To construct a usable dataset from these, start with a skeleton table and then join in the data sources that you want. 
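For instance, a minimal pandas sketch of such a join (the file names here are placeholders for the actual members of the zip you downloaded; the join keys are the identifiers documented above):

```python
import pandas as pd

# A priogrid-month skeleton plus one imputed data table, joined on the
# documented identifiers. Replace the file names with the real ones.
skeleton = pd.read_csv("skeleton_pgm.csv")
ged = pd.read_csv("ged_pgm_imp_sklearn_1.csv")

df = skeleton.merge(ged, on=["pg_id", "month_id"], how="left")
df = df.set_index(["month_id", "pg_id"]).sort_index()
```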
-------------------------------------------------------------------------------- /views/apps/data/missing/__init__.py: -------------------------------------------------------------------------------- 1 | """ Missing data management """ 2 | 3 | __all__ = [ 4 | "extrapolate", 5 | "fill_groups_with_time_means", 6 | "fill_with_group_and_global_means", 7 | "impute_amelia", 8 | "impute_mice_generator", 9 | "list_totally_missing", 10 | ] 11 | 12 | from .amelia import impute_amelia 13 | from .missing import ( 14 | extrapolate, 15 | fill_groups_with_time_means, 16 | fill_with_group_and_global_means, 17 | impute_mice_generator, 18 | list_totally_missing, 19 | ) 20 | -------------------------------------------------------------------------------- /views/apps/data/missing/amelia.py: -------------------------------------------------------------------------------- 1 | """ Amelia python-R wrapper """ 2 | from typing import List 3 | import logging 4 | import multiprocessing as mp 5 | import os 6 | import string 7 | import subprocess 8 | import tempfile 9 | 10 | import pandas as pd # type: ignore 11 | 12 | from views.utils import data 13 | from views.utils.log import logtime 14 | 15 | log = logging.getLogger(__name__) 16 | 17 | 18 | def run_subproc(cmd): 19 | """ Run cmd in subprocess and log output to debug """ 20 | 21 | log.info(f"Running cmd: {cmd}") 22 | with subprocess.Popen( 23 | cmd, 24 | stdout=subprocess.PIPE, 25 | stderr=subprocess.STDOUT, 26 | bufsize=1, 27 | universal_newlines=True, 28 | ) as p: 29 | for line in p.stdout: 30 | log.debug(line.strip("\n")) 31 | 32 | if p.returncode != 0: 33 | raise subprocess.CalledProcessError(p.returncode, p.args) 34 | 35 | 36 | # pylint: disable=too-many-locals 37 | @logtime 38 | def impute_amelia(df: pd.DataFrame, n_imp: int) -> List[pd.DataFrame]: 39 | """ Wrapper for calling Amelia in an R subprocess 40 | 41 | Args: 42 | df: Dataframe with MultiIndex set 43 | n_imp: Number of imputations to perform 44 | Return: 45 | dfs: List of imputed dataframes 46 | """ 47 | 48 | def read_template(): 49 | this_dir = os.path.dirname(os.path.abspath(__file__)) 50 | path_template = os.path.join(this_dir, "amelia_template.R") 51 | with open(path_template, "r") as f: 52 | template_str = f.read() 53 | 54 | template = string.Template(template_str) 55 | 56 | return template 57 | 58 | log.info("Started impute_amelia()") 59 | 60 | data.check_has_multiindex(df) 61 | timevar, groupvar = df.index.names 62 | 63 | log.debug(f"n_imp: {n_imp}") 64 | log.debug(f"timevar: {timevar}") 65 | log.debug(f"groupvar: {groupvar}") 66 | log.debug(f"df shape: {df.shape}") 67 | log.debug(f"Share missing: {df.isnull().mean().mean()}") 68 | 69 | with tempfile.TemporaryDirectory() as tempdir: 70 | 71 | path_csv_in = os.path.join(tempdir, "input.csv") 72 | path_rscript = os.path.join(tempdir, "impute_script.R") 73 | path_out_stem = os.path.join(tempdir, "imputed_") 74 | 75 | values = { 76 | "PATH_CSV_INPUT": path_csv_in, 77 | "PATH_CSV_OUTPUT_STEM": path_out_stem, 78 | "TIMEVAR": timevar, 79 | "GROUPVAR": groupvar, 80 | "N_IMP": n_imp, 81 | "N_CPUS": mp.cpu_count(), 82 | } 83 | 84 | template = read_template() 85 | rscript = template.substitute(values) 86 | 87 | df.to_csv(path_csv_in, index=True) 88 | log.info(f"Wrote {path_csv_in}") 89 | 90 | with open(path_rscript, "w") as f: 91 | f.write(rscript) 92 | log.info(f"Wrote {path_rscript}") 93 | log.debug(rscript) 94 | 95 | cmd = ["Rscript", path_rscript] 96 | run_subproc(cmd) 97 | 98 | dfs = [] 99 | for i in range(n_imp): 100 | path_imputed = 
f"{path_out_stem}{i+1}.csv" 101 | df_imp = pd.read_csv(path_imputed) 102 | df_imp = df_imp.drop(columns=["Unnamed: 0"]) 103 | df_imp = df_imp.set_index([timevar, groupvar]) 104 | dfs.append(df_imp) 105 | log.info(f"Read {path_imputed}") 106 | 107 | log.info("Finished impute_amelia()") 108 | return dfs 109 | -------------------------------------------------------------------------------- /views/apps/data/missing/amelia_template.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Install Amelia if we don't have it 4 | if (!require("Amelia")) install.packages("Amelia", repos="https://ftp.acc.umu.se/mirror/CRAN/") 5 | library("Amelia") 6 | 7 | library("foreign") 8 | library("methods") 9 | library("parallel") 10 | 11 | find_bounds <- function(df){ 12 | print("finding bounds") 13 | lower <- c() 14 | upper <- c() 15 | for (i in 1:length(df)) { 16 | lower <- c(lower, min(df[,i], na.rm=T)) 17 | upper <- c(upper, max(df[,i], na.rm=T)) 18 | } 19 | 20 | varnr <- c(1:ncol(df)) 21 | lower <- lower[varnr] 22 | upper <- upper[varnr] 23 | bounds <- matrix(cbind(varnr,lower,upper),ncol(df)) 24 | 25 | return(bounds) 26 | 27 | } 28 | 29 | keep_only_varying <- function(df){ 30 | print("Removing non-varying columns from dataframe") 31 | # Find variance to remove non-varying variables. 32 | variances <- sapply(df, var, na.rm = TRUE) 33 | 34 | # Some vars are all missing, they get variance NA, give them zero instead 35 | variances[is.na(variances)] <- 0 36 | 37 | names.zero.variance <- colnames(df[variances == 0]) 38 | names.positive.variance <- colnames(df[variances > 0]) 39 | df <- df[names.positive.variance] 40 | 41 | return(df) 42 | } 43 | 44 | keep_only_numerics <- function(df){ 45 | print("Removing non-numeric columns from dataframe") 46 | numerics <- sapply(df, is.numeric) 47 | df <- df[numerics] 48 | 49 | return(df) 50 | } 51 | 52 | 53 | time_start <- Sys.time() 54 | print("Starting amelia imputation script") 55 | 56 | path_csv_input <- "${PATH_CSV_INPUT}" 57 | path_csv_output_stem <- "${PATH_CSV_OUTPUT_STEM}" 58 | timevar <- "${TIMEVAR}" 59 | groupvar <- "${GROUPVAR}" 60 | n_imp <- ${N_IMP} 61 | n_cpus <- ${N_CPUS} 62 | 63 | print(paste("path_csv_input", path_csv_input)) 64 | print(paste("path_csv_output_stem", path_csv_output_stem)) 65 | print(paste("timevar", timevar)) 66 | print(paste("groupvar", groupvar)) 67 | print(paste("n_imp", n_imp)) 68 | print(paste("n_cpus", n_cpus)) 69 | 70 | df <- read.csv(path_csv_input) 71 | 72 | # # Drop all vars that don't vary 73 | # df <- keep_only_varying(df) 74 | # # Drop all non-numeric vars 75 | # df <- keep_only_numerics(df) 76 | 77 | nominals <- c() 78 | 79 | # Find the bounds of each var, we don't want never-before seen values 80 | bounds <- find_bounds(df) 81 | 82 | # Run the imputation 83 | obj_amelia <- amelia(df, 84 | m = n_imp, 85 | ts = timevar, 86 | cs = groupvar, 87 | noms = nominals, 88 | p2s = 2, 89 | polytime = 1, 90 | intercs = TRUE, 91 | empri = .1*nrow(df), 92 | bounds = bounds, 93 | max.resample = 1000, 94 | parallel = "multicore", 95 | ncpus = n_cpus) 96 | 97 | print("Finished imputing") 98 | 99 | write.amelia(obj=obj_amelia, 100 | file.stem = path_csv_output_stem, format = "csv") 101 | print("Saved imputed datasets") 102 | 103 | time_end <- Sys.time() 104 | time_total = time_end - time_start 105 | print("FINISHED!") 106 | print(paste("Total runtime:", time_total)) 107 | 108 | -------------------------------------------------------------------------------- /views/apps/data/public.py: 
-------------------------------------------------------------------------------- 1 | """ Data publication interface 2 | 3 | Data is published as .csv and .geojson files in a .zip archive. 4 | These formats were chosen because they are the most common and can be 5 | read by all. 6 | Functions in this module take a zip file and cache the data in the 7 | views structure as parquet and geojson files. 8 | 9 | """ 10 | from typing import Dict, Union, List 11 | import datetime 12 | import os 13 | import tempfile 14 | import logging 15 | 16 | from views.utils import io 17 | from .api import Table, GeomCountry, GeomPriogrid 18 | 19 | log = logging.getLogger(__name__) 20 | 21 | 22 | def _date_now() -> str: 23 | """ Get current UTC time """ 24 | return datetime.datetime.utcnow().strftime("%Y%m%d") 25 | 26 | 27 | def export_tables_and_geoms( 28 | tables: Dict[str, Table], 29 | geometries: Dict[str, Union[GeomCountry, GeomPriogrid]], 30 | dir_output: str, 31 | ) -> str: 32 | """ Export tables and geometries to timestamped zip in dir_output """ 33 | path_zip = os.path.join( 34 | dir_output, f"views_tables_and_geoms_{_date_now()}.zip" 35 | ) 36 | log.info(f"Started exporting tables and geoms to {path_zip}") 37 | with tempfile.TemporaryDirectory() as tempdir: 38 | paths: List[str] = [] 39 | for table in tables.values(): 40 | path = os.path.join(tempdir, f"{table.name}.csv") 41 | io.df_to_csv(df=table.df, path=path) 42 | paths.append(path) 43 | 44 | for geom in geometries.values(): 45 | # Make sure we have gdf locally 46 | _ = geom.gdf 47 | paths.append(geom.path) 48 | 49 | # Add the README to the zip. 50 | paths.append( 51 | os.path.join( 52 | os.path.dirname(__file__), "export_readme", "README.md" 53 | ) 54 | ) 55 | 56 | io.make_zipfile( 57 | path_zip=path_zip, paths_members=paths, 58 | ) 59 | log.info(f"Finished exporting tables and geoms to {path_zip}") 60 | return path_zip 61 | 62 | 63 | def import_tables_and_geoms( 64 | tables: Dict[str, Table], 65 | geometries: Dict[str, Union[GeomCountry, GeomPriogrid]], 66 | path_zip: str, 67 | ) -> None: 68 | """ Import tables and geometries to local cache structure from zip """ 69 | 70 | log.info(f"Started initialising cache from zip at {path_zip}") 71 | with tempfile.TemporaryDirectory() as tempdir: 72 | io.unpack_zipfile(path_zip=path_zip, destination=tempdir) 73 | 74 | for geom in geometries.values(): 75 | path = os.path.join(tempdir, geom.fname) 76 | if os.path.isfile(path): 77 | geom.init_cache_from_geojson(path=path) 78 | else: 79 | log.debug(f"No matching .geojson for {geom.name}") 80 | 81 | for table in tables.values(): 82 | path = os.path.join(tempdir, f"{table.name}.csv") 83 | if os.path.isfile(path): 84 | table.init_cache_from_csv(path) 85 | else: 86 | raise RuntimeError(f"No matching .csv for {table.name}") 87 | log.info(f"Finished initialising cache from zip at {path_zip}") 88 | 89 | 90 | def fetch_latest_zip_from_website(path_dir_destination: str) -> str: 91 | """ Fetch the latest zip from the website """ 92 | 93 | # Update this 94 | url_base = "https://views.pcr.uu.se/download/datasets" 95 | fnames = [ 96 | fname 97 | for fname in io.list_files_in_webdir(url=url_base) 98 | if fname.startswith("views_tables_and_geoms_") 99 | ] 100 | log.debug(f"Found {fnames} that look like views_tables_and_geoms_") 101 | fname_latest = sorted(fnames).pop() 102 | log.debug(f"Latest file looks like: {fname_latest}") 103 | url = f"{url_base}/{fname_latest}" 104 | path = os.path.join(path_dir_destination, fname_latest) 105 | io.fetch_url_to_file(url, path) 106 | return 
path 107 | -------------------------------------------------------------------------------- /views/apps/ensemble/__init__.py: -------------------------------------------------------------------------------- 1 | """ Ensemble functionality """ 2 | __all__ = ["run_ebma"] 3 | from .ebma import run_ebma 4 | -------------------------------------------------------------------------------- /views/apps/ensemble/ebma.py: -------------------------------------------------------------------------------- 1 | """ Wrapper for EBMA """ 2 | 3 | from typing import Any, Dict, Tuple, List 4 | 5 | import logging 6 | import os 7 | import string 8 | import tempfile 9 | import subprocess 10 | 11 | import pandas as pd # type: ignore 12 | 13 | from views.utils import data as datautils 14 | 15 | log = logging.getLogger(__name__) 16 | 17 | 18 | def _read_template() -> string.Template: 19 | this_dir = os.path.dirname(os.path.abspath(__file__)) 20 | with open(os.path.join(this_dir, "templates", "run_ebma.R"), "r") as f: 21 | template_str = f.read() 22 | 23 | template = string.Template(template_str) 24 | 25 | return template 26 | 27 | 28 | def _run_subproc(cmd: List[str]) -> None: 29 | """ Run cmd in subprocess and log output to debug """ 30 | 31 | log.debug(f"Running cmd: {cmd}") 32 | p: Any 33 | with subprocess.Popen( 34 | cmd, 35 | stdout=subprocess.PIPE, 36 | stderr=subprocess.STDOUT, 37 | bufsize=1, 38 | universal_newlines=True, 39 | ) as p: 40 | for line in p.stdout: 41 | log.debug(line.strip("\n")) 42 | 43 | if p.returncode != 0: 44 | raise subprocess.CalledProcessError(p.returncode, p.args) 45 | 46 | 47 | # pylint: disable=too-many-arguments, too-many-locals 48 | def run_ebma( 49 | df_calib: pd.DataFrame, 50 | df_test: pd.DataFrame, 51 | s_calib_actual: pd.Series, 52 | tolerance: float = 0.001, 53 | shrinkage: float = 3, 54 | const: float = 0.01, 55 | maxiter: int = 10_000, 56 | ) -> Tuple[pd.Series, Dict[str, float]]: 57 | """ Compute EBMA predictions and weights using wrapped R EBMAforecast 58 | 59 | Args: 60 | df_calib: Dataframe with constituent models predictions for calibration 61 | df_test: Dataframe with constituent model 62 | predictions for test period 63 | s_calib_actual: Series with actuals for the calibration partition 64 | tolerance: See R docs 65 | shrinkage: See R docs 66 | const: See R docs 67 | maxiter: See R docs 68 | Returns: 69 | s_ebma: Series with ebma predictions 70 | weights: Dictionary of model weights 71 | 72 | R docs at: 73 | https://cran.r-project.org/web/packages/EBMAforecast/EBMAforecast.pdf 74 | 75 | Ensure df_calib, df_test and s_calib_actual have multiindex set. 
76 | 77 | """ 78 | 79 | # Copy data so we don't mess with callers data 80 | df_calib = df_calib.copy() 81 | df_test = df_test.copy() 82 | s_calib_actual = s_calib_actual.copy() 83 | s_calib_actual.name = "actual" 84 | 85 | # Make sure we're all indexed as expected 86 | datautils.check_has_multiindex(df_calib) 87 | datautils.check_has_multiindex(df_test) 88 | datautils.check_has_multiindex(s_calib_actual) 89 | 90 | if not len(s_calib_actual) == len(df_calib): 91 | msg = "Number of rows in df_calib and s_calib_actual don't match" 92 | raise RuntimeError(msg) 93 | 94 | offset = 1e-10 95 | upper = 1 - offset 96 | lower = 0 + offset 97 | 98 | # Sort indexes so they're aligned 99 | # Clip predictions 100 | df_calib = df_calib.sort_index().clip(lower, upper) 101 | df_test = df_test.sort_index().clip(lower, upper) 102 | df_calib_actual = pd.DataFrame(s_calib_actual.sort_index()) 103 | 104 | with tempfile.TemporaryDirectory() as tempdir: 105 | 106 | path_csv_calib = os.path.join(tempdir, "calib.csv") 107 | path_csv_test = os.path.join(tempdir, "test.csv") 108 | path_csv_actuals = os.path.join(tempdir, "actuals.csv") 109 | path_csv_ebma = os.path.join(tempdir, "ebma.csv") 110 | path_csv_weights = os.path.join(tempdir, "weights.csv") 111 | path_rscript = os.path.join(tempdir, "ebma_script.R") 112 | 113 | values = { 114 | "PATH_CSV_ACTUALS": path_csv_actuals, 115 | "PATH_CSV_CALIB": path_csv_calib, 116 | "PATH_CSV_TEST": path_csv_test, 117 | "PATH_CSV_EBMA": path_csv_ebma, 118 | "PATH_CSV_WEIGHTS": path_csv_weights, 119 | "PARAM_TOLERANCE": tolerance, 120 | "PARAM_SHRINKAGE": shrinkage, 121 | "PARAM_CONST": const, 122 | "PARAM_MAXITER": maxiter, 123 | } 124 | 125 | template = _read_template() 126 | rscript = template.substitute(values) 127 | 128 | df_calib.to_csv(path_csv_calib, index=False) 129 | df_test.to_csv(path_csv_test, index=False) 130 | df_calib_actual.to_csv(path_csv_actuals, index=False) 131 | 132 | with open(path_rscript, "w") as f: 133 | f.write(rscript) 134 | cmd = ["Rscript", path_rscript] 135 | _run_subproc(cmd) 136 | 137 | df_ebma = pd.read_csv(path_csv_ebma) 138 | df_weights = pd.read_csv(path_csv_weights) 139 | 140 | df_ebma.index = df_test.index 141 | s_ebma = df_ebma["x"] 142 | s_ebma.name = "ebma" 143 | 144 | s_weights = df_weights["x"] 145 | s_weights.index = df_calib.columns 146 | weights_dict = s_weights.to_dict() 147 | 148 | return s_ebma, weights_dict 149 | -------------------------------------------------------------------------------- /views/apps/ensemble/templates/install_ebma.R: -------------------------------------------------------------------------------- 1 | # Install dependencies 2 | install.packages("separationplot", repos="https://ftp.acc.umu.se/mirror/CRAN/") 3 | install.packages("plyr", repos="https://ftp.acc.umu.se/mirror/CRAN/") 4 | install.packages("Hmisc", repos="https://ftp.acc.umu.se/mirror/CRAN/") 5 | install.packages("abind", repos="https://ftp.acc.umu.se/mirror/CRAN/") 6 | install.packages("Rcpp", repos="https://ftp.acc.umu.se/mirror/CRAN/") 7 | # Install EBMAforecast from the CRAN archive 8 | install.packages("https://cran.r-project.org/src/contrib/Archive/EBMAforecast/EBMAforecast_0.52.tar.gz", repos = NULL, type="source") -------------------------------------------------------------------------------- /views/apps/ensemble/templates/run_ebma.R: -------------------------------------------------------------------------------- 1 | library("EBMAforecast") 2 | 3 | # These are templated values 4 | path_calib_actuals <- "${PATH_CSV_ACTUALS}" 5 | 
path_csv_calib<- "${PATH_CSV_CALIB}" 6 | path_csv_test <- "${PATH_CSV_TEST}" 7 | path_ebma <- "${PATH_CSV_EBMA}" 8 | path_weights <- "${PATH_CSV_WEIGHTS}" 9 | 10 | 11 | 12 | y_actual <- read.csv(path_calib_actuals, header=TRUE) 13 | df_calib <- read.csv(path_csv_calib, header=TRUE) 14 | df_test = read.csv(path_csv_test, header=TRUE) 15 | 16 | colnames <- c(colnames(df_test)) 17 | 18 | # Equal weights by default 19 | n_models = ncol(df_calib) 20 | initial_weights <- rep((1/n_models), times=n_models) 21 | # logit, normal, binary 22 | param_model <- "logit" 23 | 24 | # Defaults 25 | param_tolerance <- ${PARAM_TOLERANCE} 26 | param_shrinkage <- ${PARAM_SHRINKAGE} 27 | param_const <- ${PARAM_CONST} 28 | param_maxiter <- ${PARAM_MAXITER} 29 | 30 | print("Started making forecast data") 31 | fd <- EBMAforecast::makeForecastData( 32 | .predCalibration=df_calib, 33 | .predTest=df_test, 34 | .outcomeCalibration=y_actual$$actual, #double $$ for template 35 | .modelNames=colnames 36 | ) 37 | 38 | print("Started calibrateEnsemble") 39 | ebma <- EBMAforecast::calibrateEnsemble( 40 | fd, 41 | model=param_model, 42 | tol=param_tolerance, 43 | exp=param_shrinkage, 44 | const=param_const, 45 | W=initial_weights, 46 | maxIter=param_maxiter 47 | ) 48 | 49 | 50 | ebma_prediction <- ebma@predTest[, "EBMA", ] 51 | weights <- ebma@modelWeights 52 | 53 | print("Writing result csvs") 54 | write.csv(weights, path_weights, row.names=FALSE) 55 | write.csv(ebma_prediction, path_ebma, row.names=FALSE) -------------------------------------------------------------------------------- /views/apps/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | """Evaluations module""" 2 | __all__ = ["lib"] 3 | from . import lib 4 | -------------------------------------------------------------------------------- /views/apps/evaluation/feature_importance.py: -------------------------------------------------------------------------------- 1 | """Feature importances module""" 2 | 3 | from typing import List, Dict 4 | import os 5 | import logging 6 | from datetime import datetime 7 | 8 | import pandas as pd # type: ignore 9 | import joblib # type: ignore 10 | from sklearn.ensemble import RandomForestRegressor # type: ignore 11 | 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | def get_feature_importance_from_pickle( 17 | path_pickle: str, features: List[str], period: str, step: int 18 | ) -> Dict[str, float]: 19 | """ Get feature importance from pickle at path. 20 | 21 | Args: 22 | path: Path to pickled RandomForestRegressor. 23 | features: List of feature names. 24 | period: Which period (str). 25 | step: Which step (int). 26 | Returns: 27 | fi_dict: A dictionary of feature importance scores. 28 | """ 29 | fi_dict = {} 30 | if os.path.isfile(path_pickle): 31 | log.debug(f"Started reading {path_pickle}") 32 | try: 33 | model = joblib.load(path_pickle) 34 | model = model.estimators[period][step] 35 | log.debug(f"Finished reading {path_pickle}") 36 | # Only populate if it's a RandomForestRegressor 37 | if isinstance(model, RandomForestRegressor): 38 | importances = model.feature_importances_ 39 | for feature, value in zip(features, importances): 40 | fi_dict[feature] = value 41 | 42 | except EOFError: 43 | log.warning(f"Couldn't read {path_pickle}") 44 | 45 | return fi_dict 46 | 47 | 48 | def reorder_fi_dict(fi_dict: Dict[str, float], top: int = None) -> Dict: 49 | """ Get feature importances in an ordered (desc) table and write .tex. 
50 | 51 | Args: 52 | fi_dict: Dictionary of feature importances, {feature: importance}. 53 | top (optional): Top number of feature importances to include. 54 | Returns: 55 | fi_dict: Ordered tab dictionary of feature importance scores, i.e. 56 | {"feature": [features], "importance": [importances]}. 57 | """ 58 | desc = dict( 59 | sorted(fi_dict.items(), key=lambda item: item[1], reverse=True) 60 | ) 61 | 62 | top_desc = {k: desc[k] for k in list(desc)[:top]} if top else desc 63 | 64 | featimps_tabular = { 65 | "feature": [k for k, v in top_desc.items()], 66 | "importance": [v for k, v in top_desc.items()], 67 | } 68 | 69 | return featimps_tabular 70 | 71 | 72 | def write_fi_tex(df: pd.DataFrame, path: str): 73 | """ Write feature importances df to .tex with info added. 74 | 75 | Args: 76 | df: pd.DataFrame containing importances per row, indexed on feature. 77 | path: Full path including filename to write .tex to. 78 | """ 79 | tex = df.to_latex() 80 | # Add meta information. 81 | now = datetime.now().strftime("%Y/%m/%d %H:%M:%S") 82 | meta = f""" 83 | %Output created by feature_importance.py. 84 | %Produced on {now}, written to {path}. 85 | \\ 86 | """ 87 | tex = meta + tex 88 | 89 | with open(path, "w") as f: 90 | f.write(tex) 91 | log.info(f"Wrote feature importances to .tex under {path}.") 92 | -------------------------------------------------------------------------------- /views/apps/evaluation/lib.py: -------------------------------------------------------------------------------- 1 | """ Evaluation library. """ 2 | import pandas as pd # type: ignore 3 | import numpy as np # type: ignore 4 | 5 | from sklearn import metrics # type: ignore 6 | 7 | 8 | # real 9 | def mean_squared_error(actuals: pd.Series, preds: pd.Series) -> float: 10 | """Computes MSE given array of actuals and probs.""" 11 | return metrics.mean_squared_error(y_true=actuals, y_pred=preds) 12 | 13 | 14 | def log_loss(actuals: pd.Series, preds: pd.Series) -> float: 15 | """Computes the log loss score given array of actuals and probs.""" 16 | return metrics.log_loss(y_true=actuals, y_pred=preds) 17 | 18 | 19 | def tadda_score( 20 | y_deltas: pd.Series, 21 | f_deltas: pd.Series, 22 | epsilon=1, 23 | smooth_penalty=False, 24 | element_by_element=False, 25 | ) -> float: 26 | """ Computes TADDA given array of y deltas and f deltas. 27 | 28 | Args: 29 | y_deltas: 1d np.ndarray of length N holding the actual changes. 30 | f_deltas: 1d np.ndarray of length N holding the forecasted values. 31 | epsilon: a positive scalar that defines the target around actual values 32 | where values are "close enough". 33 | smooth_penalty: when y[i] =/- epsilon(E) crosses 0, there is a jump in 34 | TADDA. This can be smoothed away by only penalizing directional 35 | chance by |f-E| (False by default). 36 | element_by_element: return the mean of the individual contributions if 37 | False or the vector of individual TADDA values if True (False by 38 | default). 39 | Returns: 40 | scalar if element_by_element is False, and np.array if 41 | element_by_element is True. 
42 | """ 43 | 44 | def sign_not_equal(y_deltas, f_deltas): 45 | # 0 is treated as both pos and neg: returns 0 (not not equal) when y=0 46 | y_sign = np.where(y_deltas > 0.0, 1, 0) 47 | f_sign = np.where(f_deltas > 0.0, 1, 0) 48 | return np.where((y_sign == f_sign) | (np.equal(y_deltas, 0.0)), 0, 1) 49 | 50 | term1 = np.abs(y_deltas - f_deltas) 51 | 52 | if not smooth_penalty: 53 | sign_equality = sign_not_equal(y_deltas, f_deltas) 54 | over_epsilon = np.where(np.abs(y_deltas - f_deltas) > epsilon, 1, 0) 55 | term2 = np.abs(f_deltas) * sign_equality * over_epsilon 56 | else: 57 | cutoff = np.where( 58 | np.abs(y_deltas) < epsilon, np.abs(np.abs(y_deltas) - epsilon), 0 59 | ) 60 | sign_equality = sign_not_equal(y_deltas, f_deltas) 61 | over_epsilon = np.where(np.abs(y_deltas - f_deltas) > epsilon, 1, 0) 62 | term2 = ( 63 | np.abs(np.abs(f_deltas) - cutoff) * sign_equality * over_epsilon 64 | ) 65 | 66 | return term1 + term2 if element_by_element else (term1 + term2).mean() 67 | 68 | 69 | # real 70 | def mean_absolute_error(actuals: pd.Series, preds: pd.Series) -> float: 71 | """Computes MAE given array of actuals and preds.""" 72 | return metrics.mean_absolute_error(y_true=actuals, y_pred=preds) 73 | 74 | 75 | # real 76 | def r2_score(actuals: pd.Series, preds: pd.Series) -> float: 77 | """Computes r2 given array of actuals and preds.""" 78 | return metrics.r2_score(y_true=actuals, y_pred=preds) 79 | 80 | 81 | # prob 82 | def average_precision(actuals: pd.Series, probs: pd.Series) -> float: 83 | """Computes AUPR given array of actuals and probs.""" 84 | return metrics.average_precision_score(y_true=actuals, y_score=probs) 85 | 86 | 87 | # prob 88 | def area_under_roc(actuals: pd.Series, probs: pd.Series) -> float: 89 | """Computes AUROC given array of actuals and probs.""" 90 | return metrics.roc_auc_score(y_true=actuals, y_score=probs) 91 | 92 | 93 | # prob 94 | def brier(actuals: pd.Series, probs: pd.Series) -> float: 95 | """Computes brier score given array of actuals and probs.""" 96 | return metrics.brier_score_loss(y_true=actuals, y_prob=probs) 97 | 98 | 99 | # @TODO: These need bool predictions, apply a threshold to a prob maybe? 
100 | # bool 101 | def accuracy(actuals: pd.Series, preds: pd.Series) -> float: 102 | """Computes accuracy from series of actuals and predictions with 103 | single threshold applied.""" 104 | return metrics.accuracy_score(y_true=actuals, y_pred=preds) 105 | 106 | 107 | # bool 108 | def precision(actuals: pd.Series, preds: pd.Series) -> float: 109 | """Computes precision from confusion matrix.""" 110 | return metrics.precision_score(y_true=actuals, y_pred=preds) 111 | 112 | 113 | # bool 114 | def recall(actuals: pd.Series, preds: pd.Series) -> float: 115 | """Computes recall from confusion matrix.""" 116 | return metrics.recall_score(y_true=actuals, y_pred=preds) 117 | 118 | 119 | # bool 120 | def f1_score(actuals: pd.Series, preds: pd.Series) -> float: 121 | """Computes F1-score given precision and recall.""" 122 | return metrics.f1_score(y_true=actuals, y_pred=preds) 123 | 124 | 125 | # bool 126 | def class_report(actuals: pd.Series, preds: pd.Series) -> str: 127 | """ Classification report """ 128 | return metrics.classification_report(y_true=actuals, y_pred=preds) 129 | -------------------------------------------------------------------------------- /views/apps/extras/__init__.py: -------------------------------------------------------------------------------- 1 | """ Extra modules for miscellaneous tasks such as data publication """ 2 | 3 | __all__ = [ 4 | "fetch_prediction_competition_data", 5 | "extract_and_package_data", 6 | "refresh_datasets_from_website", 7 | ] 8 | from .extras import ( 9 | fetch_prediction_competition_data, 10 | extract_and_package_data, 11 | refresh_datasets_from_website, 12 | ) 13 | -------------------------------------------------------------------------------- /views/apps/extras/extras.py: -------------------------------------------------------------------------------- 1 | """ Get the prediction competition data from the ViEWS website """ 2 | from typing import List, Optional, Dict 3 | import tempfile 4 | import logging 5 | import os 6 | from datetime import date 7 | 8 | from views.apps.data import api 9 | from views.utils import io 10 | from views.config import DIR_STORAGE 11 | from views.specs.data import DATASETS 12 | 13 | log = logging.getLogger() 14 | 15 | DIR_UPLOAD = os.path.join(DIR_STORAGE, "upload") 16 | io.create_directory(DIR_UPLOAD) 17 | 18 | 19 | def fetch_prediction_competition_data( 20 | fnames_want: Optional[List[str]] = None, 21 | ) -> Dict[str, str]: 22 | """ Fetch and unpack the prediction competition data""" 23 | 24 | fname_zip = "views_pred_comp_data_20200427.zip" 25 | url = f"https://views.pcr.uu.se/download/datasets/{fname_zip}" 26 | 27 | if not fnames_want: 28 | fnames_want = ["cm.csv", "pgm.csv"] 29 | 30 | dir_destination = os.path.join(DIR_STORAGE, "prediction_competition") 31 | paths_want = [ 32 | os.path.join(dir_destination, fname) for fname in fnames_want 33 | ] 34 | 35 | io.create_directory(dir_destination) 36 | 37 | if all([os.path.isfile(path) for path in paths_want]): 38 | log.info("Files are already where we need them") 39 | else: 40 | log.info(f"Fetching {fnames_want} from {url}") 41 | with tempfile.TemporaryDirectory() as tempdir: 42 | path_zip = os.path.join(tempdir, fname_zip) 43 | io.fetch_url_to_file(url=url, path=path_zip) 44 | paths_unzipped = io.unpack_zipfile(path_zip, destination=tempdir) 45 | paths_destination: List[str] = [] 46 | for path in paths_unzipped: 47 | fname = os.path.basename(path) 48 | if fname in fnames_want: 49 | path_destination = os.path.join(dir_destination, fname) 50 |
io.move_file(path_from=path, path_to=path_destination) 51 | paths_destination.append(path_destination) 52 | 53 | paths_missing = [ 54 | path for path in paths_want if path not in paths_destination 55 | ] 56 | if paths_missing: 57 | raise RuntimeError(f"Missing paths {paths_missing}") 58 | 59 | data = {os.path.basename(path): path for path in paths_want} 60 | 61 | return data 62 | 63 | 64 | def extract_and_package_data(): 65 | """ Get raw data from database, dump to files and zip it up """ 66 | with tempfile.TemporaryDirectory() as tempdir: 67 | paths = [] 68 | # Dump tables to csv 69 | for name, dataset in DATASETS.items(): 70 | fname = f"{name}.csv" 71 | path = os.path.join(tempdir, fname) 72 | dataset.export_raw_to_csv(path=path) 73 | paths.append(path) 74 | 75 | geom_c = api.GeomCountry() 76 | geom_c.refresh() 77 | paths.append(geom_c.path) 78 | geom_pg = api.GeomPriogrid() 79 | geom_pg.refresh() 80 | paths.append(geom_pg.path) 81 | 82 | today = date.today().strftime("%Y%m%d") 83 | fname_zip = f"data_export_{today}.zip" 84 | io.make_zipfile( 85 | path_zip=os.path.join(DIR_UPLOAD, fname_zip), paths_members=paths 86 | ) 87 | log.info(f"Wrote zip to {os.path.join(DIR_UPLOAD, fname_zip)}") 88 | log.info("Now go ahead and upload it to the webserver manually.") 89 | 90 | 91 | def refresh_datasets_from_website(fname_zip="data_export_20200513.zip"): 92 | """ Initialise local data cache from website public data """ 93 | 94 | url = f"https://views.pcr.uu.se/download/datasets/{fname_zip}" 95 | log.info(f"Fetching from {url}") 96 | with tempfile.TemporaryDirectory() as tempdir: 97 | path_zip = os.path.join(tempdir, fname_zip) 98 | io.fetch_url_to_file(url=url, path=path_zip) 99 | log.info("Done fetching. Unpacking zipfile.") 100 | _ = io.unpack_zipfile(path_zip, destination=tempdir) 101 | 102 | log.info("Initialising local geometries") 103 | geom_c = api.GeomCountry() 104 | geom_pg = api.GeomPriogrid() 105 | path_geom_c = os.path.join(tempdir, os.path.basename(geom_c.path)) 106 | path_geom_pg = os.path.join(tempdir, os.path.basename(geom_pg.path)) 107 | geom_c.init_cache_from_geojson(path_geom_c) 108 | geom_pg.init_cache_from_geojson(path_geom_pg) 109 | 110 | log.info("Initialising datasets.") 111 | for name, dataset in DATASETS.items(): 112 | fname = f"{name}.csv" 113 | path_csv = os.path.join(tempdir, fname) 114 | dataset.init_cache_from_csv(path_csv) 115 | log.info("Done initialising data, you can now use views.DATASETS") 116 | -------------------------------------------------------------------------------- /views/apps/model/__init__.py: -------------------------------------------------------------------------------- 1 | """ Model specification """ 2 | __all__ = ["api", "Model", "Ensemble", "Period", "Downsampling"] 3 | from .
import api 4 | from .api import Model, Ensemble, Period, Downsampling 5 | -------------------------------------------------------------------------------- /views/apps/model/calibration.py: -------------------------------------------------------------------------------- 1 | """ Calibration """ 2 | import logging 3 | from typing import Tuple 4 | import warnings 5 | import numpy as np # type: ignore 6 | import pandas as pd # type: ignore 7 | import statsmodels.api as sm # type: ignore 8 | 9 | from views.utils import stats 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | def _log_missing_indices(s: pd.Series) -> None: 15 | log.warning(f"Missing indices: {s.loc[s.isnull()].index}") 16 | 17 | 18 | def calibrate_real( 19 | s_test_pred: pd.Series, s_calib_pred: pd.Series, s_calib_actual: pd.Series 20 | ) -> pd.Series: 21 | """ Calibrate real value predictions 22 | 23 | Scaling parameters applied would, if applied to s_calib_pred, 24 | make them near-equal in mean and variance to s_calib_actual. 25 | 26 | For the case of transforming one set to have a given mean and std 27 | see: 28 | https://stats.stackexchange.com/questions/46429/transform-data-to-desired-mean-and-standard-deviation 29 | 30 | This case is slightly more involved as we want to shift the test 31 | predictions by parameters "learned" from comparing calibration 32 | predictions to actuals. 33 | 34 | 35 | """ 36 | 37 | # Compute standard deviation ratio 38 | std_ratio = s_calib_actual.std() / s_calib_pred.std() 39 | # Remoe the calib mean from test predictions 40 | s_test_demeaned = s_test_pred - s_calib_pred.mean() 41 | # Shift calib de-meaned test predictions by the calib actual mean 42 | # And scale to the std ratio 43 | s_test_pred_scaled = s_calib_actual.mean() + s_test_demeaned * std_ratio 44 | 45 | return s_test_pred_scaled 46 | 47 | 48 | def calibrate_prob( 49 | s_test_pred: pd.Series, s_calib_pred: pd.Series, s_calib_actual: pd.Series 50 | ) -> pd.Series: 51 | """ Calibrate s_test_pred 52 | 53 | First predictions are transformed into logodds. 54 | Then a logit model is fit on 55 | "actual_outcomes ~ alpha + beta*logodds(p_calib)". 
56 | Then alpha and beta are applied to test predictions like 57 | A = e^(alpha+(beta*p_test)) 58 | p_test_calibrated = A/(A+1) 59 | 60 | See: https://en.wikipedia.org/wiki/Logistic_regression 61 | 62 | """ 63 | 64 | def _get_scaling_params( 65 | s_calib_actual: pd.Series, s_calib: pd.Series 66 | ) -> Tuple[float, float]: 67 | """ Gets scaling params """ 68 | 69 | y = np.array(s_calib_actual) 70 | intercept = np.ones(len(s_calib)) 71 | X = np.array([intercept, s_calib]).T 72 | 73 | model = sm.Logit(y, X).fit(disp=0) 74 | beta_0 = model.params[0] 75 | beta_1 = model.params[1] 76 | 77 | return beta_0, beta_1 78 | 79 | def _apply_scaling_params( 80 | s_test: pd.Series, beta_0: float, beta_1: float 81 | ) -> pd.Series: 82 | """ Scale logodds in s_test using intercept and beta""" 83 | numerator = np.exp(beta_0 + (beta_1 * s_test)) 84 | denominator = numerator + 1 85 | scaled_probs = numerator / denominator 86 | 87 | return scaled_probs 88 | 89 | def _check_inputs( 90 | s_test_pred: pd.Series, 91 | s_calib_pred: pd.Series, 92 | s_calib_actual: pd.Series, 93 | ) -> None: 94 | """ Check that inputs have valid names and could be proabilities """ 95 | 96 | if ( 97 | s_test_pred.min() < 0 98 | or s_test_pred.max() > 1 99 | or s_calib_pred.min() < 0 100 | or s_calib_pred.max() > 1 101 | ): 102 | raise RuntimeError( 103 | "Probabilities outside (0,1) range were passed to calibrate" 104 | ) 105 | 106 | if not s_calib_pred.name == s_test_pred.name: 107 | warnings.warn(f"{s_calib_pred.name} != {s_test_pred.name}") 108 | if s_test_pred.isnull().sum() > 0: 109 | _log_missing_indices(s_test_pred) 110 | raise RuntimeError("Missing values in s_test_pred") 111 | if s_calib_pred.isnull().sum() > 0: 112 | _log_missing_indices(s_calib_pred) 113 | raise RuntimeError("Missing values in s_calib_pred") 114 | if s_calib_actual.isnull().sum() > 0: 115 | _log_missing_indices(s_calib_actual) 116 | raise RuntimeError("Missing values in s_calib_actual") 117 | 118 | if ( 119 | not len(s_calib_pred) == len(s_calib_actual) 120 | or len(s_calib_pred.index.difference(s_calib_actual.index)) > 0 121 | ): 122 | raise RuntimeError( 123 | f"len(s_calib_pred): {len(s_calib_pred)} " 124 | f"len(s_calib_actual): {len(s_calib_actual)} " 125 | f"index diff: " 126 | f"{s_calib_pred.index.difference(s_calib_actual.index)}" 127 | f"s_calib_pred.head() : {s_calib_pred.head()}" 128 | f"s_calib_pred.tail() : {s_calib_pred.tail()}" 129 | f"s_calib_actual.head() : {s_calib_actual.head()}" 130 | f"s_calib_actual.tail() : {s_calib_actual.tail()}" 131 | ) 132 | 133 | _check_inputs(s_test_pred, s_calib_pred, s_calib_actual) 134 | 135 | beta_0, beta_1 = _get_scaling_params( 136 | s_calib_actual=s_calib_actual, 137 | s_calib=stats.prob_to_logodds(s_calib_pred.copy()), 138 | ) 139 | if beta_1 < 0: 140 | warnings.warn(f"Beta_1 < 0. 
Very weak {s_calib_pred.name} ?") 141 | 142 | s_test_pred_scaled = _apply_scaling_params( 143 | stats.prob_to_logodds(s_test_pred.copy()), beta_0, beta_1 144 | ) 145 | return s_test_pred_scaled 146 | -------------------------------------------------------------------------------- /views/apps/model/crosslevel.py: -------------------------------------------------------------------------------- 1 | """ Cross level model functions """ 2 | # flake8: noqa 3 | # pylint: skip-file 4 | import pandas as pd # type: ignore 5 | from .api import Model 6 | 7 | 8 | # class CrossLevel: 9 | # def __init__(model_high_res: Model, model_low_res: Model): 10 | # self.model_high_res = model_high_res 11 | # self.model_low_res = model_low_res 12 | # self.steps = self.steps_in_common([model_high_res, model_low_res]) 13 | # self.steps = sorted(list(self.steps)) 14 | 15 | # @staticmethod 16 | # def steps_in_common(models: List[Model]): 17 | # """ Find steps that all models have in common """ 18 | # return sorted( 19 | # set.intersection(*[set(model.steps) for model in models]) 20 | # ) 21 | 22 | # def predict( 23 | # self, df_high_res: pd.DataFrame, df_low_res: pd.DataFrame 24 | # ) -> pd.Series: 25 | # """ Combine high and low res predictions """ 26 | # cols_ss_h = [self.model_high_res.cols_ss[step] for step in self.steps] 27 | # cols_ss_l = [self.model_low_res.cols_ss[step] for step in self.steps] 28 | # df_h = df_high_res[cols_ss_h] 29 | # df_l = df_low_res[cols_ss_l] 30 | 31 | 32 | # def fetch_df_links(): 33 | # """Get a df linking pg_ids to country_ids.""" 34 | 35 | # query = """ 36 | # SELECT pgm.priogrid_gid AS pg_id, 37 | # cm.country_id 38 | # FROM staging.priogrid_month AS pgm 39 | # INNER JOIN staging.country_month AS cm ON pgm.country_month_id = cm.id 40 | # --- Month 500 arbitrary choice 41 | # WHERE pgm.month_id = 500; 42 | # """ 43 | # return dbutils.query_to_df(query) 44 | 45 | 46 | # def compute_colaresi(df, col_pgm, col_cm): 47 | # """ Colaresian cross level probability """ 48 | 49 | # # Sum of high resolution probabilities for each low level area 50 | # sum_h_by_l = df.groupby(["month_id", "country_id"])[col_pgm].transform(sum) 51 | 52 | # # Low resolution prob multiplied by share of high res prob in particular area 53 | # joint_prob = df[col_cm] * (df[col_pgm] / sum_h_by_l) 54 | 55 | # return joint_prob 56 | 57 | 58 | # def crosslevel(df_pgm, df_cm, df_links, col_pgm, col_cm): 59 | # # Join in country_id 60 | # df = df_pgm[[col_pgm]].join(df_links.set_index(["pg_id"])[["country_id"]]) 61 | # df = df.reset_index().set_index(["month_id", "country_id"]) 62 | # df = ( 63 | # df.join(df_cm[[col_cm]]).reset_index().set_index(["month_id", "pg_id"]) 64 | # ) 65 | # s = compute_colaresi(df, col_pgm, col_cm) 66 | # share_missing = s.isnull().sum() / len(s) 67 | # if share_missing > 0.01: 68 | # raise RuntimeError( 69 | # f"Too much missing in prediction, something's wrong" 70 | # ) 71 | # s = s.fillna(s.mean()) 72 | # return s 73 | 74 | 75 | # if False: 76 | # df_links = fetch_df_links() 77 | # for step in [1, 6, 12, 24, 36]: 78 | # for outcome in ["sb", "ns", "os"]: 79 | # col_cl = f"ss.{outcome}_crosslevel.{step}" 80 | # col_pgm = ( 81 | # f"ss.{outcome}_xgb.{step}" # Use the allthemes model for pgm 82 | # ) 83 | # col_cm = f"ss.{outcome}_all_glob.{step}" # Use the all_glob model for CM 84 | # df_pgm_a[col_cl] = crosslevel( 85 | # df_pgm_a, df_cm_a, df_links, col_pgm, col_cm 86 | # ) 87 | # df_pgm_b[col_cl] = crosslevel( 88 | # df_pgm_b, df_cm_b, df_links, col_pgm, col_cm 89 | # ) 90 | # df_pgm_c[col_cl] = 
crosslevel( 91 | # df_pgm_c, df_cm_c, df_links, col_pgm, col_cm 92 | # ) 93 | -------------------------------------------------------------------------------- /views/apps/pipeline/README.md: -------------------------------------------------------------------------------- 1 | # Pipeline 2 | 3 | The pipeline is what produces monthly forecasts. 4 | It does this in two steps: training and prediction. 5 | 6 | ## Training 7 | Training is done rarely and the fitted models are persisted. 8 | Training runs 9 | 10 | ## Prediction 11 | -------------------------------------------------------------------------------- /views/apps/pipeline/__init__.py: -------------------------------------------------------------------------------- 1 | """ Forecasting and training pipelines """ 2 | # __all__ = ["train", "models_cm", "models_pgm"] 3 | # from . import train, models_cm, models_pgm 4 | -------------------------------------------------------------------------------- /views/apps/pipeline/ensembles_cm.py: -------------------------------------------------------------------------------- 1 | """ All CM Ensemble objects 2 | 3 | 4 | The following models are included in the JPR 2020 CM 5 | ensemble: 6 | cm_sb_cflong 7 | cm_sb_acled_violence 8 | cm_sb_neibhist 9 | cm_sb_cdummies 10 | cm_sb_acled_protest 11 | cm_sb_reign_coups 12 | cm_sb_icgcw 13 | cm_sb_reign_drought 14 | cm_sb_reign_global 15 | cm_sb_vdem_global 16 | cm_sb_demog 17 | cm_sb_wdi_global 18 | cm_sb_all_global 19 | cm_sbonset24_25_all 20 | 21 | and are all included in the prelim ensembles below. 22 | @TODO: Not yet ready in this repo and to be added later: 23 | 24 | ds_25 25 | ds_dummy 26 | 27 | """ 28 | 29 | # pylint: disable=invalid-name 30 | 31 | 32 | from typing import Dict, List 33 | from views.apps.model.api import Ensemble, Model, Period 34 | from views.specs.periods import get_periods 35 | from .
import models_cm 36 | 37 | 38 | # The currently latest model development run id 39 | run_id = "d_2020_04_01" 40 | periods: List[Period] = get_periods(run_id=run_id) 41 | 42 | models_cm_sb_prelim: List[Model] = [ 43 | models_cm.cm_sb_cflong, 44 | models_cm.cm_sb_acled_violence, 45 | models_cm.cm_sb_neibhist, 46 | models_cm.cm_sb_cdummies, 47 | models_cm.cm_sb_acled_protest, 48 | models_cm.cm_sb_reign_coups, 49 | models_cm.cm_sb_icgcw, 50 | models_cm.cm_sb_reign_drought, 51 | models_cm.cm_sb_reign_global, 52 | models_cm.cm_sb_vdem_global, 53 | models_cm.cm_sb_demog, 54 | models_cm.cm_sb_wdi_global, 55 | models_cm.cm_sb_all_global, 56 | models_cm.cm_sbonset24_25_all, 57 | ] 58 | 59 | models_cm_ns_prelim: List[Model] = [ 60 | models_cm.cm_ns_cflong, 61 | models_cm.cm_ns_acled_violence, 62 | models_cm.cm_ns_neibhist, 63 | models_cm.cm_ns_cdummies, 64 | models_cm.cm_ns_acled_protest, 65 | models_cm.cm_ns_reign_coups, 66 | models_cm.cm_ns_icgcw, 67 | models_cm.cm_ns_reign_drought, 68 | models_cm.cm_ns_reign_global, 69 | models_cm.cm_ns_vdem_global, 70 | models_cm.cm_ns_demog, 71 | models_cm.cm_ns_wdi_global, 72 | models_cm.cm_ns_all_global, 73 | models_cm.cm_nsonset24_25_all, 74 | ] 75 | 76 | models_cm_os_prelim: List[Model] = [ 77 | models_cm.cm_os_cflong, 78 | models_cm.cm_os_acled_violence, 79 | models_cm.cm_os_neibhist, 80 | models_cm.cm_os_cdummies, 81 | models_cm.cm_os_acled_protest, 82 | models_cm.cm_os_reign_coups, 83 | models_cm.cm_os_icgcw, 84 | models_cm.cm_os_reign_drought, 85 | models_cm.cm_os_reign_global, 86 | models_cm.cm_os_vdem_global, 87 | models_cm.cm_os_demog, 88 | models_cm.cm_os_wdi_global, 89 | models_cm.cm_os_all_global, 90 | models_cm.cm_osonset24_25_all, 91 | ] 92 | 93 | cm_sb_prelim = Ensemble( 94 | name="cm_sb_prelim", 95 | models=models_cm_sb_prelim, 96 | method="ebma", 97 | outcome_type="prob", 98 | col_outcome="greq_25_ged_best_sb", 99 | periods=periods, 100 | delta_outcome=False, 101 | ) 102 | 103 | cm_ns_prelim = Ensemble( 104 | name="cm_ns_prelim", 105 | models=models_cm_ns_prelim, 106 | method="ebma", 107 | outcome_type="prob", 108 | col_outcome="greq_25_ged_best_ns", 109 | periods=periods, 110 | delta_outcome=False, 111 | ) 112 | cm_os_prelim = Ensemble( 113 | name="cm_os_prelim", 114 | models=models_cm_os_prelim, 115 | method="ebma", 116 | outcome_type="prob", 117 | col_outcome="greq_25_ged_best_os", 118 | periods=periods, 119 | delta_outcome=False, 120 | ) 121 | 122 | all_cm_ensembles: List[Ensemble] = [ 123 | cm_sb_prelim, 124 | cm_ns_prelim, 125 | cm_os_prelim, 126 | ] 127 | all_cm_ensembles_by_name: Dict[str, Ensemble] = dict() 128 | for ensemble in all_cm_ensembles: 129 | all_cm_ensembles_by_name[ensemble.name] = ensemble 130 | -------------------------------------------------------------------------------- /views/apps/pipeline/ensembles_pgm.py: -------------------------------------------------------------------------------- 1 | """ All PGM Ensemble objects 2 | 3 | The following models are included in the JPR 2020 PGM 4 | ensemble: 5 | allthemes 6 | hist_legacy 7 | onset24_100_all 8 | onset24_1_all 9 | pgd_natural 10 | pgd_social 11 | sptime 12 | 13 | These 4 are not included yet but will be when implemented in this repo 14 | ds_25 15 | ds_dummy 16 | xgb 17 | crosslevel 18 | 19 | ] 20 | """ 21 | 22 | # pylint: disable=invalid-name 23 | 24 | from typing import Dict, List 25 | from views.apps.model.api import Ensemble, Model, Period 26 | from views.specs.periods import get_periods 27 | from . 
import models_pgm 28 | 29 | 30 | # The currently latest model development run id 31 | run_id = "d_2020_04_01" 32 | periods: List[Period] = get_periods(run_id=run_id) 33 | 34 | 35 | models_pgm_sb_prelim: List[Model] = [ 36 | models_pgm.pgm_sb_hist_legacy, 37 | models_pgm.pgm_sb_allthemes, 38 | models_pgm.pgm_sb_onset24_100_all, 39 | models_pgm.pgm_sb_onset24_1_all, 40 | models_pgm.pgm_sb_pgd_natural, 41 | models_pgm.pgm_sb_pgd_social, 42 | models_pgm.pgm_sb_sptime, 43 | ] 44 | 45 | models_pgm_ns_prelim: List[Model] = [ 46 | models_pgm.pgm_ns_hist_legacy, 47 | models_pgm.pgm_ns_allthemes, 48 | models_pgm.pgm_ns_onset24_100_all, 49 | models_pgm.pgm_ns_onset24_1_all, 50 | models_pgm.pgm_ns_pgd_natural, 51 | models_pgm.pgm_ns_pgd_social, 52 | models_pgm.pgm_ns_sptime, 53 | ] 54 | 55 | models_pgm_os_prelim: List[Model] = [ 56 | models_pgm.pgm_os_hist_legacy, 57 | models_pgm.pgm_os_allthemes, 58 | models_pgm.pgm_os_onset24_100_all, 59 | models_pgm.pgm_os_onset24_1_all, 60 | models_pgm.pgm_os_pgd_natural, 61 | models_pgm.pgm_os_pgd_social, 62 | models_pgm.pgm_os_sptime, 63 | ] 64 | 65 | pgm_sb_prelim = Ensemble( 66 | name="pgm_sb_prelim", 67 | models=models_pgm_sb_prelim, 68 | method="average", 69 | outcome_type="prob", 70 | col_outcome="ged_dummy_sb", 71 | periods=periods, 72 | ) 73 | 74 | pgm_ns_prelim = Ensemble( 75 | name="pgm_ns_prelim", 76 | models=models_pgm_ns_prelim, 77 | method="average", 78 | outcome_type="prob", 79 | col_outcome="ged_dummy_ns", 80 | periods=periods, 81 | ) 82 | 83 | pgm_os_prelim = Ensemble( 84 | name="pgm_os_prelim", 85 | models=models_pgm_os_prelim, 86 | method="average", 87 | outcome_type="prob", 88 | col_outcome="ged_dummy_os", 89 | periods=periods, 90 | ) 91 | 92 | all_pgm_ensembles: List[Ensemble] = [ 93 | pgm_sb_prelim, 94 | pgm_ns_prelim, 95 | pgm_os_prelim, 96 | ] 97 | 98 | all_pgm_ensembles_by_name: Dict[str, Ensemble] = dict() 99 | for ensemble in all_pgm_ensembles: 100 | all_pgm_ensembles_by_name[ensemble.name] = ensemble 101 | -------------------------------------------------------------------------------- /views/apps/pipeline/train.py: -------------------------------------------------------------------------------- 1 | """ This module defines the training of all models used in ViEWS 2 | 3 | After it is run, all required models should be persisted on disk and 4 | ready for prediction. 5 | """ 6 | import logging 7 | 8 | from typing_extensions import Literal 9 | 10 | from views.specs.data import DATASETS 11 | from . 
import models_cm, models_pgm 12 | 13 | log = logging.getLogger(__name__) 14 | 15 | 16 | def train_and_store_model_by_name( 17 | loa: Literal["am", "cm", "pgm"], model: str, dataset: str 18 | ) -> None: 19 | """ Lookup a model by name and fit, evaluate and store it """ 20 | 21 | if loa == "cm": 22 | model_object = models_cm.all_cm_models_by_name[model] 23 | elif loa == "pgm": 24 | model_object = models_pgm.all_pgm_models_by_name[model] 25 | else: 26 | raise NotImplementedError(f"cm and pgm models only yet, not {loa}") 27 | 28 | df = DATASETS[dataset].df 29 | model_object.fit_estimators(df) 30 | model_object.save() 31 | -------------------------------------------------------------------------------- /views/apps/plot/__init__.py: -------------------------------------------------------------------------------- 1 | """ Plotting modules """ 2 | 3 | __all__ = [ 4 | "MapData", 5 | "plot_map", 6 | ] 7 | 8 | from .maps import MapData, plot_map 9 | -------------------------------------------------------------------------------- /views/apps/slurm/__init__.py: -------------------------------------------------------------------------------- 1 | """ Slurm interface """ 2 | __all__ = ["run_command"] 3 | from .slurm import run_command 4 | -------------------------------------------------------------------------------- /views/apps/slurm/templates/runfile_core.txt: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -o ${LOGFILE_LOCATION} 3 | #SBATCH -A ${PROJECT_ID} 4 | #SBATCH -J ${NAME} 5 | #SBATCH -p core 6 | #SBATCH -n ${N_CORES} 7 | #SBATCH -t ${TIME} 8 | 9 | echo $$(date -u) - Starting job ${NAME} 10 | 11 | ${COMMAND} 12 | 13 | echo $$(date -u) - Finished job ${NAME} 14 | -------------------------------------------------------------------------------- /views/apps/slurm/templates/runfile_node.txt: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | #SBATCH -o ${LOGFILE_LOCATION} 3 | #SBATCH -A ${PROJECT_ID} 4 | #SBATCH -J ${NAME} 5 | #SBATCH -p node 6 | #SBATCH -t ${TIME} 7 | 8 | echo $$(date -u) - Starting job ${NAME} 9 | 10 | ${COMMAND} 11 | 12 | echo $$(date -u) - Finished job ${NAME} 13 | -------------------------------------------------------------------------------- /views/apps/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | """ Data transformations """ 2 | __all__ = ["lib"] 3 | from . import lib 4 | -------------------------------------------------------------------------------- /views/apps/xgb/lib.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UppsalaConflictDataProgram/OpenViEWS2/7eb3e63c8c046de31f70cd56f417fadf03686f5a/views/apps/xgb/lib.py -------------------------------------------------------------------------------- /views/config.py: -------------------------------------------------------------------------------- 1 | """ Config module. 
Reads config.yaml in repo root and exposes vars """ 2 | from dataclasses import dataclass 3 | from typing import Any, Dict, Tuple, Optional 4 | import os 5 | import copy 6 | import json 7 | import yaml 8 | 9 | 10 | REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) 11 | LOGFMT = "[%(asctime)s] - %(name)s:%(lineno)d - %(levelname)s - %(message)s" 12 | 13 | 14 | def _resolve(path: str) -> str: 15 | """ Resolve env vars and home in path """ 16 | return os.path.expanduser(os.path.expandvars(path)) 17 | 18 | 19 | # pylint: disable=too-many-instance-attributes 20 | @dataclass 21 | class Db: 22 | """ Holds connection options for connecting through sqlalchemy """ 23 | 24 | user: str 25 | host: str 26 | dbname: str 27 | port: int 28 | password: Optional[str] = None 29 | use_ssl: Optional[bool] = False 30 | ssl_cert: Optional[str] = None 31 | ssl_key: Optional[str] = None 32 | ssl_rootcert: Optional[str] = None 33 | 34 | @property 35 | def connectstring(self) -> str: 36 | """ Get a connectstring """ 37 | 38 | if self.password: 39 | userpart = f"{self.user}:{self.password}" 40 | else: 41 | userpart = self.user 42 | 43 | return f"postgresql://{userpart}@{self.host}:{self.port}/{self.dbname}" 44 | 45 | @property 46 | def connect_args(self) -> Dict[str, str]: 47 | """ Get dict of connect_args """ 48 | 49 | if self.use_ssl: 50 | assert self.ssl_cert 51 | assert self.ssl_key 52 | assert self.ssl_rootcert 53 | connectargs = { 54 | "sslmode": "require", 55 | "sslcert": _resolve(self.ssl_cert), 56 | "sslkey": _resolve(self.ssl_key), 57 | "sslrootcert": _resolve(self.ssl_rootcert), 58 | } 59 | else: 60 | connectargs = dict() 61 | 62 | return connectargs 63 | 64 | def __repr__(self): 65 | repdict = copy.copy(self.__dict__) 66 | 67 | # Never log the password 68 | if self.password: 69 | repdict["password"] = "******" 70 | repdict["connectstring"] = self.connectstring.replace( 71 | self.password, "******" 72 | ) 73 | 74 | return json.dumps(repdict) 75 | 76 | def __str__(self): 77 | return self.__repr__() 78 | 79 | 80 | def _get_configfile() -> Dict[str, Any]: 81 | """ Read the raw configfile """ 82 | with open(os.path.join(REPO_ROOT, "config.yaml"), "r") as f: 83 | return yaml.safe_load(f) 84 | 85 | 86 | def _get_dirs() -> Tuple[str, str]: 87 | """ Get and resolve all the directories in config.yaml """ 88 | config = _get_configfile() 89 | 90 | dir_storage = config["dirs"]["storage"] 91 | dir_scratch = config["dirs"]["scratch"] 92 | 93 | if not dir_storage: 94 | dir_storage = os.path.join(REPO_ROOT, "storage") 95 | 96 | if not dir_scratch: 97 | dir_scratch = os.path.join(dir_storage, "scratch") 98 | 99 | dir_storage = _resolve(dir_storage) 100 | dir_scratch = _resolve(dir_scratch) 101 | 102 | return dir_storage, dir_scratch 103 | 104 | 105 | def _get_databases() -> Dict[str, Db]: 106 | """ Get all the database configs in config.yaml """ 107 | config = _get_configfile() 108 | 109 | dbs = dict() 110 | for db_name, db_spec in config["databases"].items(): 111 | dbs[db_name] = Db(**db_spec) 112 | 113 | dbs["default"] = dbs[config["default_database"]] 114 | 115 | return dbs 116 | 117 | 118 | def _get_slurm_cfg() -> Dict[str, str]: 119 | config = _get_configfile() 120 | if "slurm" in config.keys(): 121 | slurm_cfg = config["slurm"] 122 | else: 123 | slurm_cfg = {"username": "", "project": ""} 124 | 125 | return slurm_cfg 126 | 127 | 128 | DIR_STORAGE, DIR_SCRATCH = _get_dirs() 129 | DATABASES = _get_databases() 130 | SLURM = _get_slurm_cfg() 131 | 
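# Usage sketch for the loaders above. The config.yaml layout below is inferred
# from _get_dirs(), _get_databases() and _get_slurm_cfg(); every value, and the
# database name "uppsala", is an illustrative placeholder:
#
#   dirs:
#     storage: ~/views_storage
#     scratch: ""              # empty falls back to <storage>/scratch
#   databases:
#     uppsala:
#       user: views_user
#       host: localhost
#       dbname: views
#       port: 5432
#   default_database: uppsala
#   slurm:
#     username: my_user
#     project: my_project
#
# With such a spec, Db exposes the sqlalchemy connectstring, e.g.
# Db(user="views_user", host="localhost", dbname="views", port=5432).connectstring
# evaluates to "postgresql://views_user@localhost:5432/views".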
-------------------------------------------------------------------------------- /views/database/README.md: -------------------------------------------------------------------------------- 1 | # Database 2 | This directory is proposed to be the new home of the views database. 3 | 4 | ## Structure 5 | 6 | A proposed structure is presented here. 7 | Each data sources lives in its own directory in sources. 8 | Sources that are versioned and might change the list of columns they provide are organised by version. 9 | A skeleton schema that builds -------------------------------------------------------------------------------- /views/database/__init__.py: -------------------------------------------------------------------------------- 1 | """ Database related functionality """ 2 | 3 | from . import sources, skeleton 4 | 5 | __all__ = ["sources", "skeleton"] 6 | -------------------------------------------------------------------------------- /views/database/common.py: -------------------------------------------------------------------------------- 1 | """ Common utils for database data management """ 2 | import logging 3 | from typing import List, Optional 4 | import tempfile 5 | import os 6 | from datetime import date 7 | 8 | from views.utils import io 9 | from ..config import DIR_STORAGE 10 | 11 | DIR_FETCHES = os.path.join(DIR_STORAGE, "data", "raw") 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | def get_path_tar(name: str) -> str: 16 | """ Get a path to a tarfile timestamped for today """ 17 | io.create_directory(DIR_FETCHES) 18 | today = date.today().strftime("%Y%m%d") 19 | return os.path.join(DIR_FETCHES, f"{name}_{today}.tar.xz") 20 | 21 | 22 | def fetch_source_simply( 23 | name: str, url: Optional[str] = None, urls: Optional[List[str]] = None 24 | ) -> None: 25 | """ Download file at url (or urls) and store in tarfile by name """ 26 | 27 | def _get_urls(url: Optional[str], urls: Optional[List[str]]) -> List[str]: 28 | """ If url return list of one, else pass through urls """ 29 | if url and urls: 30 | raise TypeError("Use url or urls, not both.") 31 | if url: 32 | # pylint: disable=redefined-argument-from-local 33 | urls = [url] 34 | assert isinstance(urls, list) 35 | 36 | return urls 37 | 38 | urls = _get_urls(url, urls) 39 | with tempfile.TemporaryDirectory() as tempdir: 40 | paths = [] 41 | for url in urls: # pylint: disable=redefined-argument-from-local 42 | fname = url.split("/")[-1] 43 | path_source = os.path.join(tempdir, fname) 44 | io.fetch_url_to_file(url, path=path_source) 45 | paths.append(path_source) 46 | io.make_tarfile(path_tar=get_path_tar(name), paths_members=paths) 47 | 48 | 49 | def get_files_latest_fetch(name, tempdir) -> List[str]: 50 | """ Get files from latest fetch 51 | 52 | Unpack the tarfile for the latest fetch for source name into tempdir 53 | and return paths. 
54 | """ 55 | log.debug(f"Getting files for latest fetch for {name}") 56 | paths_fetches = io.list_files_in_dir(path_dir=DIR_FETCHES) 57 | try: 58 | path_tar = [ 59 | path 60 | for path in sorted(paths_fetches) 61 | if os.path.basename(path).startswith(name) 62 | ].pop(0) 63 | log.debug(f"Got {path_tar} as latest {name} of {paths_fetches}") 64 | except IndexError: 65 | log.exception(f"Couldn't find a latest fetch for {name}.") 66 | raise 67 | 68 | paths = io.unpack_tarfile(path_tar=path_tar, dir_destination=tempdir) 69 | return paths 70 | -------------------------------------------------------------------------------- /views/database/skeleton/__init__.py: -------------------------------------------------------------------------------- 1 | """ The database skeleton schema """ 2 | __all__ = ["build_skeleton"] 3 | 4 | from .skeleton import build_skeleton 5 | -------------------------------------------------------------------------------- /views/database/skeleton/create_skeleton.sql: -------------------------------------------------------------------------------- 1 | -- Create a skeleton schema with identifiers and geographic extent only. 2 | -- To be used down the line in joining data in pandas 3 | 4 | DROP SCHEMA IF EXISTS skeleton CASCADE; 5 | CREATE SCHEMA skeleton; 6 | 7 | -- PGY 8 | CREATE TABLE skeleton.pgy_global AS 9 | SELECT pgy.priogrid_gid AS pg_id, 10 | y.year, 11 | cy.country_id, 12 | pg.in_africa, 13 | c.name AS country_name 14 | FROM staging.priogrid_year AS pgy 15 | INNER JOIN staging.year AS y ON pgy.year_id = y.year 16 | -- LEFT here because pgy-cy mapping stops 17 | LEFT JOIN staging.country_year AS cy ON pgy.country_year_id = cy.id 18 | INNER JOIN staging.priogrid AS pg ON pgy.priogrid_gid = pg.gid 19 | INNER JOIN staging.country AS c ON c.id=cy.country_id 20 | WHERE y.year < 2031; 21 | 22 | CREATE TABLE skeleton.pgy_africa AS 23 | SELECT * 24 | FROM skeleton.pgy_global 25 | WHERE in_africa = TRUE; 26 | 27 | -- PGM 28 | CREATE TABLE skeleton.pgm_global AS 29 | SELECT pgm.priogrid_gid AS pg_id, 30 | m.id AS month_id, 31 | m.year_id AS year, 32 | m.month AS month, 33 | cm.country_id country_id, 34 | pg.in_africa, 35 | c.name AS country_name 36 | FROM staging.priogrid_month AS pgm 37 | INNER JOIN staging.month AS m ON m.id = pgm.month_id 38 | INNER JOIN staging.priogrid_year AS pgy ON pgy.year_id = m.year_id AND pgy.priogrid_gid = pgm.priogrid_gid 39 | INNER JOIN staging.country_month AS cm ON pgm.country_month_id = cm.id 40 | INNER JOIN staging.priogrid AS pg ON pg.gid = pgm.priogrid_gid 41 | INNER JOIN staging.country AS c ON c.id=cm.country_id 42 | WHERE m.year_id < 2031; 43 | 44 | CREATE TABLE skeleton.pgm_africa AS 45 | SELECT * 46 | FROM skeleton.pgm_global 47 | WHERE in_africa = TRUE; 48 | 49 | -- CY 50 | DROP TABLE IF EXISTS skeleton.cy_global; 51 | CREATE TABLE skeleton.cy_global AS 52 | SELECT c.id AS country_id, 53 | c.in_africa, 54 | c.name AS country_name, 55 | y.year 56 | FROM staging.country AS c 57 | CROSS JOIN staging.year AS y 58 | WHERE c.gweyear = 2016 59 | AND y.year < 2031; 60 | 61 | CREATE TABLE skeleton.cy_africa AS 62 | SELECT * 63 | FROM skeleton.cy_global 64 | WHERE in_africa = 1; 65 | 66 | -- CM 67 | CREATE TABLE skeleton.cm_global AS 68 | SELECT c.id AS country_id, 69 | m.year_id AS year, 70 | m.id AS month_id, 71 | m.month, 72 | c.name AS country_name, 73 | c.in_africa 74 | FROM staging.country AS c 75 | CROSS JOIN staging.month AS m 76 | WHERE c.gweyear = 2016 77 | AND m.year_id < 2031; 78 | 79 | CREATE TABLE skeleton.cm_africa AS 80 | SELECT * 
81 | FROM skeleton.cm_global 82 | WHERE in_africa = 1; 83 | 84 | -------------------------------------------------------------------------------- /views/database/skeleton/skeleton.py: -------------------------------------------------------------------------------- 1 | """ Skeleton building code """ 2 | import os 3 | import logging 4 | from views.utils import db 5 | 6 | log = logging.getLogger(__name__) 7 | 8 | 9 | def build_skeleton() -> None: 10 | """ Build skeleton schema by executing create_skeleton.sql """ 11 | log.info("Started rebuilding skeleton schema.") 12 | path_query = os.path.join(os.path.dirname(__file__), "create_skeleton.sql") 13 | with open(path_query, "r") as f: 14 | query = f.read() 15 | db.execute_query(query) 16 | log.info("Finished rebuilding skeleton schema.") 17 | -------------------------------------------------------------------------------- /views/database/sources/__init__.py: -------------------------------------------------------------------------------- 1 | """ Data sources for the database """ 2 | from . import acled, cdum, fvp, ged, icgcw, pgdata, spei, vdem, wdi 3 | 4 | __all__ = [ 5 | "acled", 6 | "cdum", 7 | "fvp", 8 | "ged", 9 | "icgcw", 10 | "pgdata", 11 | "spei", 12 | "vdem", 13 | "wdi", 14 | ] 15 | -------------------------------------------------------------------------------- /views/database/sources/acled/__init__.py: -------------------------------------------------------------------------------- 1 | """ ACLED package """ 2 | __all__ = ["load_acled", "fetch_acled"] 3 | from .acled import load_acled, fetch_acled 4 | -------------------------------------------------------------------------------- /views/database/sources/acled/acled.py: -------------------------------------------------------------------------------- 1 | """ ACLED data loader, depends on original DB implementation 2 | 3 | # TODO: Rewrite to hold all ACLED loading logic. 
4 | """ 5 | import os 6 | import logging 7 | from views.utils import db, io 8 | from .legacy import load_acled as load_legacy_acled 9 | 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | def fetch_acled() -> None: 15 | """ Do nothing, ACLED still fetched by old code """ 16 | 17 | 18 | def load_acled() -> None: 19 | """ Code that brings acled to staging yet to be merged """ 20 | 21 | log.info("Started loading ACLED.") 22 | 23 | load_legacy_acled( 24 | from_date="2020-01-01", from_month_id=483, to_month_id=484 25 | ) 26 | 27 | db.drop_schema("acled") 28 | db.create_schema("acled") 29 | 30 | db.execute_query( 31 | query=io.read_file( 32 | path=os.path.join(os.path.dirname(__file__), "acled.sql") 33 | ) 34 | ) 35 | log.info("Finished loading ACLED.") 36 | -------------------------------------------------------------------------------- /views/database/sources/acled/acled.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE acled.cm AS 2 | SELECT cm.month_id, 3 | cm.country_id, 4 | coalesce(cm.acled_count_pr, 0) AS acled_count_pr, 5 | coalesce(cm.acled_count_sb, 0) AS acled_count_sb, 6 | coalesce(cm.acled_count_ns, 0) AS acled_count_ns, 7 | coalesce(cm.acled_count_os, 0) AS acled_count_os 8 | FROM staging.country_month AS cm; 9 | 10 | CREATE TABLE acled.pgm AS 11 | SELECT pgm.month_id, 12 | pgm.priogrid_gid AS pg_id, 13 | coalesce(pgm.acled_count_pr, 0) AS acled_count_pr, 14 | coalesce(pgm.acled_count_sb, 0) AS acled_count_sb, 15 | coalesce(pgm.acled_count_ns, 0) AS acled_count_ns, 16 | coalesce(pgm.acled_count_os, 0) AS acled_count_os, 17 | coalesce(pgm.acled_fat_sb, 0) AS acled_fat_sb, 18 | coalesce(pgm.acled_fat_ns, 0) AS acled_fat_ns, 19 | coalesce(pgm.acled_fat_os, 0) AS acled_fat_os, 20 | coalesce(pgm.acled_fat_pr, 0) AS acled_fat_pr 21 | FROM staging.priogrid_month AS pgm; -------------------------------------------------------------------------------- /views/database/sources/acled/legacy/__init__.py: -------------------------------------------------------------------------------- 1 | """ Legacy ACLED """ 2 | __all__ = ["load_acled"] 3 | from .acled import load_acled 4 | -------------------------------------------------------------------------------- /views/database/sources/acled/legacy/prepare_acled.sql: -------------------------------------------------------------------------------- 1 | -- Rebuilds preflight.acled_full and prefligh.acled 2 | 3 | DROP TABLE IF EXISTS preflight.acled_full; 4 | DROP TABLE IF EXISTS preflight.acled; 5 | 6 | CREATE TABLE preflight.acled_full AS 7 | WITH month_acled AS 8 | ( 9 | SELECT *, 10 | EXTRACT(MONTH FROM event_date :: DATE) AS month, 11 | public.priogrid(latitude::float4, longitude::float4) AS priogrid_gid 12 | FROM dataprep.acled 13 | WHERE latitude::float BETWEEN -180 AND 180 14 | AND longitude::float BETWEEN -90 AND 90 15 | ), 16 | month_acled2 AS 17 | ( 18 | SELECT month_acled.*, 19 | staging.month.id AS month_id 20 | FROM month_acled, 21 | staging.month 22 | WHERE (month_acled.year :: INT = staging.month.year_id AND 23 | month_acled.month = staging.month.month) 24 | ) 25 | SELECT * 26 | FROM month_acled2; 27 | 28 | 29 | 30 | ALTER TABLE preflight.acled_full 31 | ADD COLUMN type_of_violence INT; 32 | 33 | ALTER TABLE preflight.acled_full 34 | ADD COLUMN type_of_protest TEXT; 35 | 36 | -- 1. We are emulating UCDP/ViEWS StateBased category using ACLED data. 37 | -- i.e. Military Forces vs. others/other Military Forces, only "battles" and "remote violence" 38 | -- no civilians involved. 
39 | -- TODO: shelling and remote violence may need to be treated differently 40 | UPDATE preflight.acled_full 41 | SET type_of_violence = 1 42 | WHERE (event_type ILIKE '%%battle%%' OR event_type ILIKE '%%remote%%') 43 | AND actor1 || actor2 ILIKE '%%military forces%%' 44 | AND actor1 || actor2 NOT ILIKE '%%civilians%%'; 45 | 46 | 47 | 48 | -- 2. We are emulating UCDP/ViEWS StateBased category using ACLED data. 49 | -- i.e. no military forces, no civilians, only "battles" and "remote violence" 50 | -- UCDP''s artificial organizational criteria are not included and cannot for now be included 51 | UPDATE preflight.acled_full 52 | SET type_of_violence = 2 53 | WHERE (event_type ILIKE '%%battle%%' OR event_type ILIKE '%%remote%%') 54 | AND actor1 || actor2 NOT ILIKE '%%military forces%%' 55 | AND actor1 || actor2 NOT ILIKE '%%civilians%%'; 56 | 57 | 58 | 59 | -- 3: Emulate UCDP/Views OneSided category. 60 | -- Remote violence, battle and violence against civilians 61 | -- TODO: This may be improved using a better division of "Remote Violence" 62 | UPDATE preflight.acled_full 63 | SET type_of_violence = 3 64 | WHERE (event_type ILIKE '%%battle%%' OR event_type ILIKE '%%remote%%' OR event_type ILIKE '%%civi%%') 65 | AND actor1 || actor2 ILIKE '%%civilians%%'; 66 | 67 | -- 4: Protests 68 | -- The entire protest category, as is 69 | UPDATE preflight.acled_full 70 | SET type_of_violence = 4 71 | WHERE event_type ILIKE '%%protest%%'; 72 | 73 | UPDATE preflight.acled_full 74 | SET type_of_protest = 'p' 75 | WHERE type_of_violence = 4 76 | AND (inter1::int = 6 OR inter2::int = 6); 77 | 78 | 79 | 80 | UPDATE preflight.acled_full 81 | SET type_of_protest = COALESCE (type_of_protest, '') || 'r' 82 | WHERE 83 | type_of_violence=4 84 | AND (inter1::INT =5 85 | OR inter2::INT =5); 86 | 87 | 88 | 89 | UPDATE preflight.acled_full 90 | SET type_of_protest = COALESCE(type_of_protest, '') || 'x' 91 | WHERE event_type ILIKE '%violence against civi%' 92 | AND interaction::int IN (15, 16, 25, 26, 35, 36, 45, 46); 93 | 94 | UPDATE preflight.acled_full 95 | SET type_of_protest = COALESCE(type_of_protest, '') || 'y' 96 | WHERE event_type ILIKE '%violence against civi%' 97 | AND interaction::int IN (15, 16); 98 | 99 | 100 | 101 | -- We are only using events precise enough to have locations within PGM cells 102 | -- Thus, we exclude geo_precision 3 which indicates "larger area" 103 | -- (unclear what that means but during testing, it was nearly always ADM1 or higher. 
104 | 105 | 106 | CREATE TABLE preflight.acled AS 107 | SELECT * 108 | FROM preflight.acled_full 109 | WHERE geo_precision::int < 3; 110 | 111 | 112 | 113 | ALTER TABLE preflight.acled 114 | ADD PRIMARY KEY (index); 115 | ALTER TABLE preflight.acled_full 116 | ADD PRIMARY KEY (index); 117 | CREATE INDEX acled_idx ON preflight.acled (priogrid_gid, month_id, type_of_violence); 118 | CREATE INDEX acled_full_idx ON preflight.acled_full (priogrid_gid, month_id, type_of_violence); 119 | CREATE INDEX acled2_idx ON preflight.acled (priogrid_gid, month_id, type_of_violence, type_of_protest); 120 | CREATE INDEX acled2_full_idx ON preflight.acled_full (priogrid_gid, month_id, type_of_violence, type_of_protest); 121 | 122 | -------------------------------------------------------------------------------- /views/database/sources/cdum/__init__.py: -------------------------------------------------------------------------------- 1 | """ Country dummy package """ 2 | __all__ = ["fetch_cdum", "load_cdum"] 3 | from .cdum import fetch_cdum, load_cdum 4 | -------------------------------------------------------------------------------- /views/database/sources/cdum/cdum.py: -------------------------------------------------------------------------------- 1 | """ Country dummy module """ 2 | import pandas as pd # type: ignore 3 | from views.utils import db 4 | 5 | 6 | def fetch_cdum() -> None: 7 | """ Nothing to fetch for country dummies """ 8 | 9 | 10 | def load_cdum() -> None: 11 | """ Load country dummies """ 12 | 13 | df = db.db_to_df(fqtable="staging.country", cols=["id"], ids=["id"]) 14 | df = df.reset_index().rename(columns={"id": "country_id"}) 15 | df["to_dummy"] = df["country_id"] 16 | df = df.set_index(["country_id"]) 17 | df = pd.get_dummies(df.to_dummy, prefix="cdum") 18 | db.drop_schema("cdum") 19 | db.create_schema("cdum") 20 | db.df_to_db(fqtable="cdum.c", df=df) 21 | -------------------------------------------------------------------------------- /views/database/sources/fvp/__init__.py: -------------------------------------------------------------------------------- 1 | """ Future of Violent politics package """ 2 | __all__ = ["fetch_fvp", "load_fvp"] 3 | from .fvp import fetch_fvp, load_fvp 4 | -------------------------------------------------------------------------------- /views/database/sources/fvp/fvp.py: -------------------------------------------------------------------------------- 1 | """ Future of violent politics module """ 2 | import logging 3 | import os 4 | import tempfile 5 | 6 | from sklearn.tree import DecisionTreeRegressor # type: ignore 7 | 8 | from views.utils import io, db 9 | from views.database import common 10 | from views.apps.data import missing 11 | 12 | log = logging.getLogger(__name__) 13 | 14 | 15 | def fetch_fvp(): 16 | """ FVP data is in the Dropbox 17 | 18 | # TODO: Store properly 19 | """ 20 | print("FVP MUST BE FETCHED MANUALLY! 
IT'S IN THE DROPBOX.") 21 | 22 | 23 | def load_fvp(): 24 | """ Load FVP data """ 25 | log.info("Started loading FVP") 26 | with tempfile.TemporaryDirectory() as tempdir: 27 | _ = common.get_files_latest_fetch(name="fvp", tempdir=tempdir) 28 | df = io.csv_to_df(path=os.path.join(tempdir, "MasterData.csv")) 29 | 30 | df = df.drop(columns=["Conflict"]) 31 | df = df.rename(columns=lambda col: col.lower()) 32 | df = df.set_index(["year", "gwno"]) 33 | 34 | spec = io.load_yaml( 35 | path=os.path.join(os.path.dirname(__file__), "spec.yaml") 36 | ) 37 | df = df[spec["cols"]] 38 | 39 | log.debug("Fetching df_keys") 40 | query = "SELECT id AS country_id, gwcode AS gwno FROM staging.country;" 41 | df = df.join( 42 | db.query_to_df(query=query) 43 | .sort_values(by="country_id", ascending=False) 44 | .drop_duplicates(subset=["gwno"]) 45 | .set_index(["gwno"]) 46 | ) 47 | 48 | log.debug("Joining to skeleton") 49 | df = db.db_to_df( 50 | fqtable="skeleton.cy_global", 51 | ids=["year", "country_id"], 52 | cols=["year", "country_id"], 53 | ).join(df.reset_index().set_index(["year", "country_id"]), how="left") 54 | 55 | df = df.drop(columns=["gwno"]) 56 | 57 | # Add consistent fvp_ prefix 58 | df = df.rename( 59 | columns=lambda col: col if col.startswith("fvp_") else f"fvp_{col}" 60 | ) 61 | df = df.sort_index(axis=1).sort_index(axis=0) 62 | 63 | # Push raw 64 | db.create_schema("fvp_v2") 65 | db.df_to_db(fqtable="fvp_v2.cy_unimp", df=df) 66 | 67 | # Extrapolate before imputing 68 | df = missing.extrapolate(df) 69 | 70 | # Impute and push 71 | for i, df_imp in enumerate( 72 | missing.impute_mice_generator( 73 | df=df, 74 | n_imp=10, 75 | estimator=DecisionTreeRegressor(max_features="sqrt"), 76 | parallel=True, 77 | ) 78 | ): 79 | db.df_to_db(df=df_imp, fqtable=f"fvp_v2.cy_imp_sklearn_{i}") 80 | 81 | log.info("Finished loading FVP") 82 | -------------------------------------------------------------------------------- /views/database/sources/fvp/spec.yaml: -------------------------------------------------------------------------------- 1 | cols: 2 | - fvp_auto 3 | - fvp_demo 4 | - fvp_democracy 5 | - fvp_electoral 6 | - fvp_liberal 7 | - fvp_participatory 8 | - fvp_regime3c 9 | - fvp_semi 10 | - gdp200 11 | - gdpcap_nonoilrent 12 | - gdpcap_oilrent 13 | - gdppc200 14 | - govt 15 | - grgdpcap_nonoilrent 16 | - grgdpcap_oilrent 17 | - grgdppercapita200 18 | - grpop200 19 | - lngdp200 20 | - lngdpcap_nonoilrent 21 | - lngdpcap_oilrent 22 | - lngdppercapita200 23 | - lnoilrent 24 | - lnpop200 25 | - ltimeindep 26 | - ssp2_edu_sec_15_24_prop 27 | - prop_diexpo 28 | - prop_discexclpowless 29 | - prop_discriminated 30 | - prop_dominant 31 | - prop_excluded 32 | - prop_irrelevant 33 | - prop_junpart 34 | - prop_powerless 35 | - prop_selfexclusion 36 | - prop_senpart 37 | - population200 38 | - ssp2_urban_share_iiasa 39 | - timesincepreindepwar 40 | - timesinceregimechange 41 | - indepyear 42 | - timeindep -------------------------------------------------------------------------------- /views/database/sources/ged/__init__.py: -------------------------------------------------------------------------------- 1 | """ GED package """ 2 | __all__ = ["fetch_ged", "load_ged"] 3 | from .ged import fetch_ged, load_ged 4 | -------------------------------------------------------------------------------- /views/database/sources/ged/ged.py: -------------------------------------------------------------------------------- 1 | """ Ged loader, depends on original DB implementation 2 | 3 | # TODO: Rewrite to hold all loading logic 4 |
""" 5 | import os 6 | import logging 7 | from views.utils import db, io 8 | from .legacy import load_ged as load_legacy_ged 9 | 10 | log = logging.getLogger(__name__) 11 | 12 | 13 | def fetch_ged() -> None: 14 | """ Do nothing, GED still fetched by old code """ 15 | 16 | 17 | def load_ged() -> None: 18 | """ Collect imputed and unimputed GED """ 19 | 20 | log.info("Started loading GED.") 21 | 22 | load_legacy_ged("20.9.4", 484, 484) # 2020-04 23 | 24 | db.drop_schema("ged") 25 | db.create_schema("ged") 26 | db.execute_query( 27 | query=io.read_file( 28 | path=os.path.join(os.path.dirname(__file__), "ged.sql") 29 | ) 30 | ) 31 | log.info("Finished loading GED.") 32 | -------------------------------------------------------------------------------- /views/database/sources/ged/ged.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE ged.cm AS 2 | SELECT cm.month_id, 3 | cm.country_id, 4 | cm.ged_best_sb, 5 | cm.ged_best_ns, 6 | cm.ged_best_os, 7 | cm.ged_count_sb, 8 | cm.ged_count_ns, 9 | cm.ged_count_os 10 | FROM staging.country_month AS cm; 11 | 12 | CREATE TABLE ged.pgm_unimp 13 | AS 14 | SELECT pgm.month_id, 15 | pgm.priogrid_gid AS pg_id, 16 | pgm.ged_best_sb, 17 | pgm.ged_best_ns, 18 | pgm.ged_best_os, 19 | pgm.ged_count_sb, 20 | pgm.ged_count_ns, 21 | pgm.ged_count_os, 22 | public.to_dummy(pgm.ged_count_sb) AS ged_dummy_sb, 23 | public.to_dummy(pgm.ged_count_ns) AS ged_dummy_ns, 24 | public.to_dummy(pgm.ged_count_os) AS ged_dummy_os 25 | FROM staging.priogrid_month AS pgm; 26 | 27 | CREATE TABLE ged.pgm_geoimp_0 28 | AS 29 | SELECT pgm.month_id, 30 | pgm.priogrid_gid AS pg_id, 31 | pgm.ged_best_sb, 32 | pgm.ged_best_ns, 33 | pgm.ged_best_os, 34 | pgm.ged_count_sb, 35 | pgm.ged_count_ns, 36 | pgm.ged_count_os, 37 | pgm_imp.ged_sb_dummy_1 AS ged_dummy_sb, 38 | pgm_imp.ged_ns_dummy_1 AS ged_dummy_ns, 39 | pgm_imp.ged_os_dummy_1 AS ged_dummy_os 40 | FROM staging.priogrid_month AS pgm 41 | LEFT JOIN left_imputation.pgm AS pgm_imp 42 | ON pgm_imp.priogrid_gid = pgm.priogrid_gid AND pgm_imp.month_id = pgm.month_id; 43 | 44 | CREATE TABLE ged.pgm_geoimp_1 45 | AS 46 | SELECT pgm.month_id, 47 | pgm.priogrid_gid AS pg_id, 48 | pgm.ged_best_sb, 49 | pgm.ged_best_ns, 50 | pgm.ged_best_os, 51 | pgm.ged_count_sb, 52 | pgm.ged_count_ns, 53 | pgm.ged_count_os, 54 | pgm_imp.ged_sb_dummy_2 AS ged_dummy_sb, 55 | pgm_imp.ged_ns_dummy_2 AS ged_dummy_ns, 56 | pgm_imp.ged_os_dummy_2 AS ged_dummy_os 57 | FROM staging.priogrid_month AS pgm 58 | LEFT JOIN left_imputation.pgm AS pgm_imp 59 | ON pgm_imp.priogrid_gid = pgm.priogrid_gid AND pgm_imp.month_id = pgm.month_id; 60 | 61 | CREATE TABLE ged.pgm_geoimp_2 62 | AS 63 | SELECT pgm.month_id, 64 | pgm.priogrid_gid AS pg_id, 65 | pgm.ged_best_sb, 66 | pgm.ged_best_ns, 67 | pgm.ged_best_os, 68 | pgm.ged_count_sb, 69 | pgm.ged_count_ns, 70 | pgm.ged_count_os, 71 | pgm_imp.ged_sb_dummy_3 AS ged_dummy_sb, 72 | pgm_imp.ged_ns_dummy_3 AS ged_dummy_ns, 73 | pgm_imp.ged_os_dummy_3 AS ged_dummy_os 74 | FROM staging.priogrid_month AS pgm 75 | LEFT JOIN left_imputation.pgm AS pgm_imp 76 | ON pgm_imp.priogrid_gid = pgm.priogrid_gid AND pgm_imp.month_id = pgm.month_id; 77 | 78 | CREATE TABLE ged.pgm_geoimp_3 79 | AS 80 | SELECT pgm.month_id, 81 | pgm.priogrid_gid AS pg_id, 82 | pgm.ged_best_sb, 83 | pgm.ged_best_ns, 84 | pgm.ged_best_os, 85 | pgm.ged_count_sb, 86 | pgm.ged_count_ns, 87 | pgm.ged_count_os, 88 | pgm_imp.ged_sb_dummy_4 AS ged_dummy_sb, 89 | pgm_imp.ged_ns_dummy_4 AS ged_dummy_ns, 90 | pgm_imp.ged_os_dummy_4 AS 
ged_dummy_os 91 | FROM staging.priogrid_month AS pgm 92 | LEFT JOIN left_imputation.pgm AS pgm_imp 93 | ON pgm_imp.priogrid_gid = pgm.priogrid_gid AND pgm_imp.month_id = pgm.month_id; 94 | 95 | CREATE TABLE ged.pgm_geoimp_4 96 | AS 97 | SELECT pgm.month_id, 98 | pgm.priogrid_gid AS pg_id, 99 | pgm.ged_best_sb, 100 | pgm.ged_best_ns, 101 | pgm.ged_best_os, 102 | pgm.ged_count_sb, 103 | pgm.ged_count_ns, 104 | pgm.ged_count_os, 105 | pgm_imp.ged_sb_dummy_5 AS ged_dummy_sb, 106 | pgm_imp.ged_ns_dummy_5 AS ged_dummy_ns, 107 | pgm_imp.ged_os_dummy_5 AS ged_dummy_os 108 | FROM staging.priogrid_month AS pgm 109 | LEFT JOIN left_imputation.pgm AS pgm_imp 110 | ON pgm_imp.priogrid_gid = pgm.priogrid_gid AND pgm_imp.month_id = pgm.month_id; 111 | -------------------------------------------------------------------------------- /views/database/sources/ged/legacy/__init__.py: -------------------------------------------------------------------------------- 1 | """ Legacy GED """ 2 | __all__ = ["load_ged"] 3 | from .ged import load_ged 4 | -------------------------------------------------------------------------------- /views/database/sources/ged/legacy/prepare_ged.sql: -------------------------------------------------------------------------------- 1 | -- Drop existing attached 2 | DROP TABLE IF EXISTS preflight.ged_attached_full; 3 | DROP TABLE IF EXISTS preflight.ged_attached; 4 | 5 | -- Create preflight.ged_attached 6 | CREATE TABLE preflight.ged_attached AS 7 | ( 8 | WITH month_ged AS 9 | ( 10 | SELECT *, 11 | EXTRACT(MONTH FROM date_start :: DATE) AS month_start, 12 | EXTRACT(MONTH FROM date_end :: DATE) AS month_end 13 | FROM dataprep.ged 14 | ), 15 | month_ged_start AS 16 | ( 17 | SELECT month_ged.*, 18 | staging.month.id AS month_id_start 19 | FROM month_ged, 20 | staging.month 21 | WHERE (month_ged.year :: INT = staging.month.year_id AND 22 | month_ged.month_start = staging.month.month) 23 | ), 24 | month_ged_full AS 25 | ( 26 | SELECT month_ged_start.*, 27 | staging.month.id AS month_id_end 28 | FROM month_ged_start, 29 | staging.month 30 | WHERE (month_ged_start.year :: INT = staging.month.year_id AND 31 | month_ged_start.month_end = staging.month.month) 32 | ) 33 | SELECT * 34 | FROM month_ged_full 35 | ); 36 | 37 | -- Add ids 38 | ALTER TABLE preflight.ged_attached ADD PRIMARY KEY (id); 39 | ALTER TABLE preflight.ged_attached ADD COLUMN country_month_id_end bigint; 40 | ALTER TABLE preflight.ged_attached ADD COLUMN country_month_id_start bigint; 41 | ALTER TABLE preflight.ged_attached DROP COLUMN IF EXISTS geom; 42 | ALTER TABLE preflight.ged_attached ADD COLUMN geom geometry(point, 4326); 43 | UPDATE preflight.ged_attached 44 | SET geom=st_setsrid(st_geometryfromtext(geom_wkt), 4326) 45 | WHERE geom_wkt <> ''; 46 | 47 | -- Create preflight.ged_attached_full 48 | CREATE TABLE preflight.ged_attached_full AS SELECT * FROM preflight.ged_attached; 49 | 50 | 51 | DELETE FROM preflight.ged_attached WHERE where_prec IN (4,6,7); 52 | ALTER TABLE preflight.ged_attached_full ADD PRIMARY KEY (id); 53 | CREATE INDEX ged_attached_gidx ON preflight.ged_attached USING GIST(geom); 54 | CREATE INDEX ged_attached_idx ON preflight.ged_attached (priogrid_gid,month_id_end, type_of_violence); 55 | CREATE INDEX ged_attached_s_idx ON preflight.ged_attached (priogrid_gid,month_id_start, type_of_violence); 56 | CREATE INDEX ged_attached_full_gidx ON preflight.ged_attached_full USING GIST(geom); 57 | CREATE INDEX ged_attached_fullx_s_idx ON preflight.ged_attached_full (priogrid_gid,month_id_end, type_of_violence); 58 
| CREATE INDEX ged_attached_fullx_gidx ON preflight.ged_attached_full (priogrid_gid,month_id_start, type_of_violence); 59 | 60 | 61 | -- Update preflight.ged_attached_full 62 | WITH a AS 63 | (SELECT cm.*, c.gwcode 64 | FROM staging.country_month cm 65 | LEFT JOIN 66 | staging.country c ON (cm.country_id = c.id)) 67 | UPDATE preflight.ged_attached_full 68 | SET country_month_id_end=a.id 69 | FROM a 70 | WHERE (a.gwcode = ged_attached_full.country_id AND a.month_id = ged_attached_full.month_id_end); 71 | WITH a AS 72 | (SELECT cm.*, c.gwcode 73 | FROM staging.country_month cm 74 | LEFT JOIN 75 | staging.country c ON (cm.country_id = c.id)) 76 | UPDATE preflight.ged_attached_full 77 | SET country_month_id_start=a.id 78 | FROM a 79 | WHERE (a.gwcode = ged_attached_full.country_id AND a.month_id = ged_attached_full.month_id_start); -------------------------------------------------------------------------------- /views/database/sources/icgcw/__init__.py: -------------------------------------------------------------------------------- 1 | """ International Crisis Group - Crisis Watch Package """ 2 | __all__ = ["fetch_icgcw", "load_icgcw"] 3 | from .fetch import fetch_icgcw 4 | from .icgcw import load_icgcw 5 | -------------------------------------------------------------------------------- /views/database/sources/icgcw/fetch.py: -------------------------------------------------------------------------------- 1 | """Scrapes all ICG CrisisWatch to file """ 2 | 3 | # pylint: disable=too-many-arguments 4 | 5 | import os 6 | import tempfile 7 | import logging 8 | import datetime 9 | 10 | import requests # type: ignore 11 | from bs4 import BeautifulSoup # type: ignore 12 | 13 | from views.utils import io 14 | from views.database import common 15 | 16 | log = logging.getLogger(__name__) 17 | 18 | 19 | def check_if_more_pages(path_html): 20 | """ True if more pages to fetch indicated in path_html """ 21 | 22 | log.debug(f"Checking if more pages in {path_html}") 23 | 24 | with open(path_html, "r", encoding="utf-8") as f: 25 | soup = BeautifulSoup(f.read(), "html.parser") 26 | search = { 27 | "class": "c-crisiswatch-entry [ o-container o-container--m u-pr ]" 28 | } 29 | matches = soup.find_all("div", search) 30 | 31 | # matches is a list, if len is zero it evaluates to False 32 | if matches: 33 | more_pages = True 34 | log.debug("Found more pages.") 35 | else: 36 | more_pages = False 37 | log.debug("No more pages.") 38 | 39 | return more_pages 40 | 41 | 42 | def fetch_page_content(url, page, from_year, from_month, to_year, to_month): 43 | """ Fetch page contents """ 44 | params = { 45 | "date_range": "custom", 46 | "page": page, 47 | "from_year": from_year, 48 | "from_month": from_month, 49 | "to_year": to_year, 50 | "to_month": to_month, 51 | } 52 | headers = {"User-Agent": "Mozilla/5.0"} # Header because 504 otherwise 53 | req = requests.get(url=url, params=params, timeout=60, headers=headers) 54 | log.debug(f"GET {req.url}") 55 | log.debug(f"Status code: {req.status_code}") 56 | content = req.content 57 | 58 | return content 59 | 60 | 61 | def fetch_page_to_file(url, path_dir, page, y_start, m_start, y_end, m_end): 62 | """ Fetch page at url with time params to file in path_dir """ 63 | 64 | # Pad with some zeros 65 | m_start = str(m_start).zfill(2) 66 | m_end = str(m_end).zfill(2) 67 | 68 | content = fetch_page_content(url, page, y_start, m_start, y_end, m_end) 69 | 70 | fname = f"{y_start}.{m_start}_{y_end}.{m_end}_p{str(page).zfill(4)}.html" 71 | path = os.path.join(path_dir, fname) 72 | with 
open(path, "wb") as f: 73 | f.write(content) 74 | log.info(f"Wrote {path}") 75 | 76 | return path 77 | 78 | 79 | def fetch_pages(url, path_dir, y_start=2004, m_start=1): 80 | """ Fetch pages from y_start-m_start until today to path_dir """ 81 | y_end = datetime.date.today().year 82 | m_end = datetime.date.today().month 83 | 84 | paths = [] 85 | more_pages = True 86 | page = 0 87 | while more_pages: 88 | log.debug(f"Page: {page}") 89 | path = fetch_page_to_file( 90 | url, path_dir, page, y_start, m_start, y_end, m_end 91 | ) 92 | paths.append(path) 93 | more_pages = check_if_more_pages(path_html=path) 94 | page = page + 1 95 | 96 | return paths 97 | 98 | 99 | def fetch_icgcw(): 100 | """ Fetch icgcw to fetch library """ 101 | with tempfile.TemporaryDirectory() as tempdir: 102 | paths = fetch_pages( 103 | url="https://www.crisisgroup.org/crisiswatch/database", 104 | path_dir=tempdir, 105 | ) 106 | io.make_tarfile( 107 | paths_members=paths, path_tar=common.get_path_tar(name="icgcw") 108 | ) 109 | 110 | 111 | if __name__ == "__main__": 112 | fetch_icgcw() 113 | -------------------------------------------------------------------------------- /views/database/sources/icgcw/spec.yaml: -------------------------------------------------------------------------------- 1 | cname_fixes: 2 | - old: "Israel/Palestine" 3 | new: "Israel" 4 | - old: "China (internal)" 5 | new: "China" 6 | - old: "Western Sahara" 7 | new: "Morocco" 8 | - old: "Democratic Republic of Congo" 9 | new: "Congo, DRC" 10 | - old: "Somaliland" 11 | new: "Somalia" 12 | - old: "C\u00f4te d\u2019Ivoire" 13 | new: "Cote d'Ivoire" 14 | - old: "India (non-Kashmir)" 15 | new: "India" 16 | - old: "Russia/North Caucasus" 17 | new: "Russia" 18 | - old: "Bosnia And Herzegovina" 19 | new: "Bosnia and Herzegovina" 20 | - old: "Nagorno-Karabakh (Azerbaijan)" 21 | new: "Azerbaijan" 22 | - old: "Chechnya (Russia)" 23 | new: "Russia" 24 | - old: "Basque Country (Spain)" 25 | new: "Spain" 26 | - old: "Corsica" 27 | new: "France" 28 | - old: "Northern Ireland (UK)" 29 | new: "United Kingdom" 30 | - old: "Comoros Islands" 31 | new: "Comoros" 32 | - old: "Taiwan Strait" 33 | new: "China" 34 | - old: "Timor-Leste" 35 | new: "Timor Leste" 36 | - old: "Republic of Congo" 37 | new: "Congo" 38 | - old: "Solomon Islands" 39 | new: "Solomon Is." 40 | - old: "Gambia" 41 | new: "The Gambia" 42 | - old: "Abkhazia (Georgia)" 43 | new: "Georgia" 44 | - old: "UK" 45 | new: "United Kingdom" 46 | - old: "North Macedonia" 47 | new: "Macedonia" 48 | - old: "Central Africa" 49 | new: "Central African Republic" 50 | - old: "U.S." 
51 | new: "United States" 52 | - old: "Kashmir" 53 | new: India/Pakistan 54 | - old: "Korean Peninsula" 55 | new: "North Korea/South Korea" 56 | - old: "Northern Territories (Russia" 57 | new: Russia 58 | - old: "Japan)" 59 | new: "Japan" 60 | drops: 61 | - "South China Sea" 62 | - "Gulf and Arabian Peninsula" 63 | - "Kuril Islands" 64 | cols_data: 65 | - alerts 66 | - opportunities 67 | - deteriorated 68 | - improved 69 | - unobserved 70 | -------------------------------------------------------------------------------- /views/database/sources/pgdata/__init__.py: -------------------------------------------------------------------------------- 1 | """ Priogrid Data Package """ 2 | from .fetch import fetch_pgdata 3 | from .pgdata import load_pgdata 4 | 5 | __all__ = ["fetch_pgdata", "load_pgdata"] 6 | -------------------------------------------------------------------------------- /views/database/sources/pgdata/fetch.py: -------------------------------------------------------------------------------- 1 | """ Fetch priogrid data from their API """ 2 | 3 | from typing import Any, Dict, List 4 | import os 5 | import tempfile 6 | import json 7 | import logging 8 | import time 9 | import random 10 | import multiprocessing as mp 11 | 12 | import requests 13 | 14 | from views.utils import io 15 | from views.database import common 16 | 17 | log = logging.getLogger(__name__) 18 | 19 | URL_BASE = "https://grid.prio.org/api" 20 | 21 | 22 | def fetch_variable( 23 | varinfo: Dict[Any, Any], dir_destination: str, try_number: int = 1 24 | ) -> str: 25 | """ Fetch a single variable from API """ 26 | 27 | url = varinfo["url"] 28 | params = varinfo["payload"] 29 | log.debug(f"Fetching {url} with params {params} try_number {try_number}") 30 | 31 | try: 32 | data = requests.get(url=url, params=params).json() 33 | except json.decoder.JSONDecodeError: 34 | time.sleep(2 ** try_number + random.random() * 0.01) 35 | data = fetch_variable( 36 | varinfo, dir_destination, try_number=try_number + 1 37 | ) 38 | 39 | path = os.path.join(dir_destination, f"{varinfo['name']}.json") 40 | io.dict_to_json(data, path) 41 | return path 42 | 43 | 44 | def fetch_data( 45 | varinfos: List[Dict[Any, Any]], dir_destination: str 46 | ) -> List[str]: 47 | """ Fetch all the data to dir_destination""" 48 | 49 | with mp.Pool(processes=mp.cpu_count()) as pool: 50 | results = [] 51 | for varinfo in varinfos: 52 | results.append( 53 | pool.apply_async(fetch_variable, (varinfo, dir_destination,)) 54 | ) 55 | paths = [result.get() for result in results] 56 | 57 | return paths 58 | 59 | 60 | def fetch_varinfos() -> List[Dict[Any, Any]]: 61 | """ Update varinfo dictionaries with API endpoint URLs """ 62 | 63 | varinfos = requests.get(f"{URL_BASE}/variables").json() 64 | varinfos = varinfos.copy() 65 | for varinfo in varinfos: 66 | url = f"{URL_BASE}/data/{varinfo['id']}" 67 | if varinfo["type"] == "yearly": 68 | payload = {k: varinfo[k] for k in ("startYear", "endYear")} 69 | elif varinfo["type"] == "static": 70 | payload = {} 71 | 72 | varinfo.update({"url": url, "payload": payload}) 73 | 74 | return varinfos 75 | 76 | 77 | def fetch_pgdata() -> None: 78 | """ Fetch priogrid data from API """ 79 | 80 | path_tar = common.get_path_tar(name="pgdata") 81 | 82 | log.info("Started fetching pgdata") 83 | 84 | grid = requests.get(f"{URL_BASE}/data/basegrid").json() 85 | varinfos = fetch_varinfos() 86 | 87 | with tempfile.TemporaryDirectory() as tempdir: 88 | 89 | path_grid = os.path.join(tempdir, "basegrid.json") 90 | path_varinfos = 
os.path.join(tempdir, "varinfos.json") 91 | io.dict_to_json(data=grid, path=path_grid) 92 | io.dict_to_json(data=varinfos, path=path_varinfos) 93 | paths_data = fetch_data(varinfos=varinfos, dir_destination=tempdir) 94 | 95 | paths_all = paths_data + [path_varinfos] + [path_grid] 96 | 97 | io.make_tarfile(path_tar=path_tar, paths_members=paths_all) 98 | 99 | log.info("Finished fetching pgdata") 100 | -------------------------------------------------------------------------------- /views/database/sources/pgdata/spec.yaml: -------------------------------------------------------------------------------- 1 | # There's a core var called gid, but they're already indexed by gid 2 | # So we drop this one to avoid duplicates. 3 | excludes_core: 4 | - gid 5 | 6 | # _y and _s columns have nulls for no grids without data, fill with zero 7 | nulls_to_zero: 8 | - diamprim_y 9 | - diamsec_y 10 | - drug_y 11 | - gem_y 12 | - goldplacer_y 13 | - goldsurface_y 14 | - goldvein_y 15 | - petroleum_y 16 | - diamprim_s 17 | - diamsec_s 18 | - gem_s 19 | - goldplacer_s 20 | - goldsurface_s 21 | - goldvein_s 22 | - petroleum_s 23 | 24 | # Yearly data often stops early, fill it forward instead of zeroing 25 | # This ffilling happens before zeroing the nulls. 26 | cols_ffill: 27 | - diamprim_y 28 | - diamsec_y 29 | - drug_y 30 | - gem_y 31 | - goldplacer_y 32 | - goldsurface_y 33 | - goldvein_y 34 | - petroleum_y 35 | 36 | prefix: pgd 37 | public_tables: 38 | pgy: pgdata.pgy 39 | cols_data: 40 | - agri_gc 41 | - agri_ih 42 | - aquaveg_gc 43 | - barren_gc 44 | - barren_ih 45 | - bdist3 46 | - capdist 47 | - cmr_mean 48 | - diamprim # combined as max(_s, _y) 49 | - diamsec # combined as max(_s, _y) 50 | - drug_y 51 | - excluded 52 | - forest_gc 53 | - forest_ih 54 | - gcp_mer 55 | - gem # combined as max(_s, _y) 56 | - goldplacer # combined as max(_s, _y) 57 | - goldsurface # combined as max(_s, _y) 58 | - goldvein # combined as max(_s, _y) 59 | - grass_ih 60 | - gwarea 61 | - harvarea 62 | - herb_gc 63 | - imr_mean 64 | - irrig_sum 65 | - landarea 66 | - maincrop 67 | - mountains_mean 68 | - nlights_calib_mean 69 | - pasture_ih 70 | - petroleum # combined as max(_s, _y) 71 | - pop_gpw_sum 72 | - savanna_ih 73 | - shrub_gc 74 | - shrub_ih 75 | - temp 76 | - ttime_mean 77 | - urban_gc 78 | - urban_ih 79 | - water_gc 80 | - water_ih 81 | # Following cols not used 82 | # - bdist1 83 | # - bdist2 84 | # - cmr_max 85 | # - cmr_min 86 | # - cmr_sd 87 | # - droughtcrop_speibase 88 | # - droughtcrop_speigdm 89 | # - droughtcrop_spi 90 | # - droughtend_speibase 91 | # - droughtend_speigdm 92 | # - droughtend_spi 93 | # - droughtstart_speibase 94 | # - droughtstart_speigdm 95 | # - droughtstart_spi 96 | # - droughtyr_speibase 97 | # - droughtyr_speigdm 98 | # - droughtyr_spi 99 | # - gcp_ppp 100 | # - gcp_qual 101 | # - growend 102 | # - growstart 103 | # - gwno 104 | # - imr_max 105 | # - imr_min 106 | # - imr_sd 107 | # - irrig_max 108 | # - irrig_min 109 | # - irrig_sd 110 | # - nlights_max 111 | # - nlights_mean 112 | # - nlights_min 113 | # - nlights_sd 114 | # - pop_gpw_max 115 | # - pop_gpw_min 116 | # - pop_gpw_sd 117 | # - pop_hyd_max 118 | # - pop_hyd_min 119 | # - pop_hyd_sd 120 | # - pop_hyd_sum 121 | # - prec_gpcc 122 | # - prec_gpcp 123 | # - rainseas 124 | # - ttime_max 125 | # - ttime_min 126 | # - ttime_sd 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /views/database/sources/reign/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ Reign package """ 2 | __all__ = ["fetch_reign", "load_reign"] 3 | from .reign import fetch_reign, load_reign 4 | -------------------------------------------------------------------------------- /views/database/sources/reign/reign.py: -------------------------------------------------------------------------------- 1 | """ Reign """ 2 | import os 3 | import tempfile 4 | import logging 5 | from typing import Any, Dict 6 | import requests 7 | import pandas as pd # type: ignore 8 | import bs4 # type: ignore 9 | 10 | from views.apps.data import missing 11 | from views.database import common 12 | from views.utils import io, db 13 | 14 | log = logging.getLogger(__name__) 15 | 16 | 17 | def fetch_reign() -> None: 18 | """ Fetch REIGN data """ 19 | 20 | def get_latest_data_url(url_report) -> str: 21 | html_doc = requests.get(url_report).content 22 | soup = bs4.BeautifulSoup(html_doc, "html5lib") 23 | container = soup.find("div", {"class": "post-container"}) 24 | url_data = container.find("a", href=True)["href"] 25 | log.debug(f"url_data: {url_data}") 26 | 27 | if not url_data.endswith(".csv"): 28 | raise RuntimeError(f"Reign link doesn't look like .csv {url_data}") 29 | 30 | return url_data 31 | 32 | log.debug("Started fetching reign") 33 | url_base = "https://oefdatascience.github.io/REIGN.github.io" 34 | url_report = f"{url_base}/menu/reign_current.html" 35 | url = get_latest_data_url(url_report=url_report) 36 | common.fetch_source_simply(name="reign", url=url) 37 | log.debug("Finished fetching reign") 38 | 39 | 40 | def fix_ccodes(df: pd.DataFrame, spec: Dict[str, Any]) -> pd.DataFrame: 41 | """ Fix country codes as defined by spec ccode_replaces """ 42 | log.debug("Fixing ccodes") 43 | 44 | fixes = spec["ccode_replaces"] 45 | for fix_name, values in fixes.items(): 46 | old = values["old"] 47 | new = values["new"] 48 | df.loc[df.ccode == old, "ccode"] = new 49 | log.debug(f"Replaced ccode {old} with {new} for {fix_name}") 50 | 51 | log.debug("Dropping duplicate country-months for leadership changes.") 52 | dropdup_cols = ["ccode", "year", "month"] 53 | # Some messages are too big even for debug... 
54 | # msg = df[df.duplicated(subset=dropdup_cols, keep=False)].to_string() 55 | # log.debug(msg) 56 | df = df.sort_values("tenure_months") 57 | len_df_predrop = len(df) 58 | df = df.drop_duplicates(subset=dropdup_cols, keep="first") 59 | len_df_postdrop = len(df) 60 | 61 | log.debug(f"Dropped {len_df_predrop - len_df_postdrop} duplicate obs") 62 | 63 | return df 64 | 65 | 66 | def encode_govt_dummies(df: pd.DataFrame) -> pd.DataFrame: 67 | """ Encode government dummies """ 68 | log.debug("Encoding reign government dummies") 69 | 70 | def cleanup_govtype_name(name): 71 | """ Remove " ", "-", "/" from government type strings """ 72 | name = name.lower() 73 | name = name.replace(" ", "_").replace("-", "_").replace("/", "_") 74 | name = name.replace("__", "_").replace("__", "_") 75 | return name 76 | 77 | df["government"] = df["government"].apply(cleanup_govtype_name) 78 | df_gov = pd.get_dummies(df["government"], prefix="gov") 79 | log.debug(f"Adding dummy cols {list(df_gov.columns)}") 80 | df = df.join(df_gov) 81 | return df 82 | 83 | 84 | def load_reign() -> None: 85 | """ Load reign """ 86 | log.info("Started loading reign.") 87 | 88 | spec = io.load_yaml(os.path.join(os.path.dirname(__file__), "spec.yaml")) 89 | with tempfile.TemporaryDirectory() as tempdir: 90 | paths = common.get_files_latest_fetch(name="reign", tempdir=tempdir) 91 | path_csv = [path for path in paths if path.endswith(".csv")].pop() 92 | df = io.csv_to_df(path=path_csv) 93 | 94 | df = fix_ccodes(df, spec) 95 | df = encode_govt_dummies(df) 96 | 97 | df = df.set_index(["year", "month", "ccode"]) 98 | df = df.join( 99 | db.query_to_df( 100 | query=""" 101 | SELECT id AS country_id, gwcode AS ccode 102 | FROM staging.country WHERE gweyear=2016; 103 | """ 104 | ).set_index(["ccode"]) 105 | ) 106 | df = df.join( 107 | db.query_to_df( 108 | query=""" 109 | SELECT id AS month_id, year_id AS year, month FROM staging.month; 110 | """ 111 | ).set_index(["year", "month"]) 112 | ) 113 | df = df.reset_index().set_index(["month_id", "country_id"]) 114 | df = df.drop( 115 | columns=["year", "month", "ccode", "country", "government", "leader"] 116 | ) 117 | 118 | df_skeleton = db.db_to_df( 119 | fqtable="skeleton.cm_global", 120 | cols=["month_id", "country_id"], 121 | ids=["month_id", "country_id"], 122 | ) 123 | len_skel = len(df_skeleton) 124 | df = df_skeleton.join(df, how="left") 125 | if not len(df) == len_skel: 126 | raise RuntimeError(f"Join not correct, {len_skel} != {len(df)}") 127 | 128 | df = df.add_prefix("reign_") 129 | 130 | db.drop_schema("reign_v2") 131 | db.create_schema("reign_v2") 132 | db.df_to_db(df=df, fqtable="reign_v2.cm_unimp") 133 | 134 | db.df_to_db( 135 | df=missing.fill_groups_with_time_means(missing.extrapolate(df)), 136 | fqtable="reign_v2.cm_extrapolated", 137 | ) 138 | 139 | log.info("Finished loading reign.") 140 | -------------------------------------------------------------------------------- /views/database/sources/reign/spec.yaml: -------------------------------------------------------------------------------- 1 | ccode_replaces: 2 | germany: 3 | old: 255 4 | new: 260 5 | yemen: 6 | old: 679 7 | new: 678 8 | nauru: 9 | old: 970 10 | new: 971 11 | kiribati: 12 | old: 946 13 | new: 970 14 | tuvalu: 15 | old: 947 16 | new: 973 17 | tonga: 18 | old: 955 19 | new: 972 20 | serbia: 21 | old: 345 22 | new: 340 -------------------------------------------------------------------------------- /views/database/sources/spei/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ SPEI package """ 2 | __all__ = ["fetch_spei", "load_spei"] 3 | from .spei import fetch_spei, load_spei 4 | -------------------------------------------------------------------------------- /views/database/sources/spei/cleanup.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE spei_v2.spei_1; 2 | DROP TABLE spei_v2.spei_2; 3 | DROP TABLE spei_v2.spei_3; 4 | DROP TABLE spei_v2.spei_4; 5 | DROP TABLE spei_v2.spei_5; 6 | DROP TABLE spei_v2.spei_6; 7 | DROP TABLE spei_v2.spei_7; 8 | DROP TABLE spei_v2.spei_8; 9 | DROP TABLE spei_v2.spei_9; 10 | DROP TABLE spei_v2.spei_10; 11 | DROP TABLE spei_v2.spei_11; 12 | DROP TABLE spei_v2.spei_12; 13 | DROP TABLE spei_v2.spei_13; 14 | DROP TABLE spei_v2.spei_14; 15 | DROP TABLE spei_v2.spei_15; 16 | DROP TABLE spei_v2.spei_16; 17 | DROP TABLE spei_v2.spei_17; 18 | DROP TABLE spei_v2.spei_18; 19 | DROP TABLE spei_v2.spei_19; 20 | DROP TABLE spei_v2.spei_20; 21 | DROP TABLE spei_v2.spei_21; 22 | DROP TABLE spei_v2.spei_22; 23 | DROP TABLE spei_v2.spei_23; 24 | DROP TABLE spei_v2.spei_24; 25 | DROP TABLE spei_v2.spei_25; 26 | DROP TABLE spei_v2.spei_26; 27 | DROP TABLE spei_v2.spei_27; 28 | DROP TABLE spei_v2.spei_28; 29 | DROP TABLE spei_v2.spei_29; 30 | DROP TABLE spei_v2.spei_30; 31 | DROP TABLE spei_v2.spei_31; 32 | DROP TABLE spei_v2.spei_32; 33 | DROP TABLE spei_v2.spei_33; 34 | DROP TABLE spei_v2.spei_34; 35 | DROP TABLE spei_v2.spei_35; 36 | DROP TABLE spei_v2.spei_36; 37 | DROP TABLE spei_v2.spei_37; 38 | DROP TABLE spei_v2.spei_38; 39 | DROP TABLE spei_v2.spei_39; 40 | DROP TABLE spei_v2.spei_40; 41 | DROP TABLE spei_v2.spei_41; 42 | DROP TABLE spei_v2.spei_42; 43 | DROP TABLE spei_v2.spei_43; 44 | DROP TABLE spei_v2.spei_44; 45 | DROP TABLE spei_v2.spei_45; 46 | DROP TABLE spei_v2.spei_46; 47 | DROP TABLE spei_v2.spei_47; 48 | DROP TABLE spei_v2.spei_48; 49 | DROP TABLE spei_v2.pg_ug; 50 | -------------------------------------------------------------------------------- /views/database/sources/spei/pg_ug.sql: -------------------------------------------------------------------------------- 1 | -- Create a grid similar to priogrid but with 1x1 degree resolution 2 | -- Priogrid is 0.5x0.5 degree resolution. 
3 | -- SPEI comes in at 1x1 resolution so we use this to map SPEI to pg_ids 4 | 5 | CREATE OR REPLACE FUNCTION 6 | ST_CreateFishnet( 7 | -- PARAMETERS 8 | nrow integer, ncol integer, 9 | ysize float8, xsize float8, 10 | y0 float8 DEFAULT 0, x0 float8 DEFAULT 0, 11 | srid integer DEFAULT 4326, 12 | OUT "row" integer, OUT col integer, 13 | OUT geom geometry) 14 | -- RETURNS 15 | RETURNS SETOF record AS 16 | -- PROCESS 17 | $$ 18 | SELECT i + 1 AS row, j + 1 AS col, ST_SetSRID(ST_Translate(cell, j * $3 + $5, i * $4 + $6), $7) AS geom 19 | FROM generate_series(0, $1 - 1) AS j, 20 | generate_series(0, $2 - 1) AS i, 21 | (SELECT ('POLYGON((0 0, 0 '||$4||', '||$3||' '||$4||', '||$3||' 0,0 0))')::geometry AS cell) AS foo; 22 | $$ LANGUAGE sql IMMUTABLE STRICT; 23 | 24 | 25 | -- Create global 1x1 grid 26 | DROP TABLE IF EXISTS spei_v2.unigrid_world; 27 | CREATE TABLE spei_v2.unigrid_world ( 28 | gid serial NOT NULL, 29 | "row" integer, 30 | col integer, 31 | cell geometry(Polygon, 4326), 32 | CONSTRAINT unigrid_pkey PRIMARY KEY (gid)); 33 | INSERT INTO spei_v2.unigrid_world ("row", col, cell) SELECT * FROM ST_CreateFishnet(360, 180, 1.0, 1.0, -180, -90, 4326) AS cells; 34 | CREATE INDEX ON spei_v2.unigrid_world USING GIST (cell); 35 | 36 | 37 | -- Create table of pg_ids to ug_ids 38 | DROP TABLE IF EXISTS spei_v2.pg_ug; 39 | CREATE TABLE spei_v2.pg_ug AS 40 | SELECT pg.gid AS pg_id, 41 | ug.gid AS ug_id 42 | FROM staging.priogrid AS pg, 43 | spei_v2.unigrid_world as ug 44 | -- Returns true if no point in pg.geom is outside of ug.cell, otherwise false. 45 | -- ug.cells cover pg.geometries 46 | WHERE ST_Covers(ug.cell, pg.geom); 47 | 48 | DROP TABLE spei_v2.unigrid_world; -------------------------------------------------------------------------------- /views/database/sources/vdem/__init__.py: -------------------------------------------------------------------------------- 1 | """ VDEM package """ 2 | __all__ = ["fetch_vdem", "load_vdem"] 3 | from .vdem import fetch_vdem, load_vdem 4 | -------------------------------------------------------------------------------- /views/database/sources/wdi/__init__.py: -------------------------------------------------------------------------------- 1 | """ WDI package """ 2 | __all__ = ["fetch_wdi", "load_wdi"] 3 | from .wdi import fetch_wdi, load_wdi 4 | -------------------------------------------------------------------------------- /views/specs/README.md: -------------------------------------------------------------------------------- 1 | # specs 2 | 3 | ViEWS has many definitions that should be the same throughout the project: 4 | 5 | * Models: which features go into which models? 6 | * Periods: what are the time limits for training, calibrating and predicting? 7 | 8 | This module provides a system of specfiles to use as references throughout. -------------------------------------------------------------------------------- /views/specs/__init__.py: -------------------------------------------------------------------------------- 1 | """ All production specs should be placed and accessed through here """ 2 | __all__ = ["data", "models", "periods"] 3 | from .
import data, models, periods 4 | -------------------------------------------------------------------------------- /views/specs/data/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/UppsalaConflictDataProgram/OpenViEWS2/7eb3e63c8c046de31f70cd56f417fadf03686f5a/views/specs/data/README.md -------------------------------------------------------------------------------- /views/specs/data/__init__.py: -------------------------------------------------------------------------------- 1 | """ Specification of datasets and transformations """ 2 | from typing import Any, Dict, Union 3 | 4 | from views.apps.data import api 5 | 6 | from . import parsed_datasets 7 | 8 | 9 | def build_geometries() -> Dict[str, Any]: 10 | """ Just expose our custom geometries as dict to be consistent """ 11 | geometries = { 12 | "GeomPriogrid": api.GeomPriogrid(), 13 | "GeomCountry": api.GeomCountry(), 14 | } 15 | return geometries 16 | 17 | 18 | GEOMETRIES: Dict[ 19 | str, Union[api.GeomPriogrid, api.GeomCountry] 20 | ] = build_geometries() 21 | TABLES: Dict[str, api.Table] = parsed_datasets.build_tables() 22 | DATASETS: Dict[str, api.Dataset] = parsed_datasets.build_datasets() 23 | -------------------------------------------------------------------------------- /views/specs/data/parsed_datasets.py: -------------------------------------------------------------------------------- 1 | """ Parsers for Table and Dataset Dicts from spec.yaml """ 2 | from typing import Dict 3 | import os 4 | from views.utils import io 5 | from views.apps.data.api import Dataset, Table 6 | from . import solver 7 | 8 | 9 | def build_tables() -> Dict[str, Table]: 10 | """ Build Table objects from spec.yaml in this dir """ 11 | specs = io.load_yaml(os.path.join(os.path.dirname(__file__), "spec.yaml")) 12 | # Build tables dict 13 | tables: Dict[str, Table] = dict() 14 | for fqtable, spec in specs["tables"].items(): 15 | tables[fqtable] = Table(fqtable=fqtable, ids=spec["ids"]) 16 | 17 | return tables 18 | 19 | 20 | def build_datasets() -> Dict[str, Dataset]: 21 | """ Build Datasets from spec.yaml in this dir """ 22 | specs = io.load_yaml(os.path.join(os.path.dirname(__file__), "spec.yaml")) 23 | tables: Dict[str, Table] = build_tables() 24 | 25 | # Build transformsets dict 26 | datasets: Dict[str, Dataset] = dict() 27 | for name, spec in specs["datasets"].items(): 28 | dataset = Dataset( 29 | name=name, 30 | ids=spec["ids"], 31 | table_skeleton=tables[spec["table_skeleton"]], 32 | tables=[tables[table] for table in spec["tables"]], 33 | loa=spec["loa"], 34 | cols=spec["cols"] if "cols" in spec.keys() else None, 35 | transforms=solver.make_transforms_ordered(spec["transforms"]), 36 | balance=spec["balance"], 37 | ) 38 | datasets[name] = dataset 39 | 40 | return datasets 41 | -------------------------------------------------------------------------------- /views/specs/data/solver.py: -------------------------------------------------------------------------------- 1 | """ Specification solver for transformations 2 | 3 | A user can specify a set of transformations as a dictionary. 4 | make_transforms_ordered() returns a dependency ordered list of 5 | the corresponding Transform() instances. 
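A minimal sketch of such a spec (illustrative only: the exact keyword
arguments accepted by api.Transform are defined in views.apps.data.api;
this module only relies on each spec carrying cols_input):

    specs = {
        "ln_ged_best_sb": {"f": "ln", "cols_input": ["ged_best_sb"]},
        "delta_ln_ged_best_sb": {"f": "delta", "cols_input": ["ln_ged_best_sb"]},
    }
    transforms = make_transforms_ordered(specs)

Because delta_ln_ged_best_sb lists ln_ged_best_sb in its cols_input, the
solver places ln_ged_best_sb first in the returned list.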
6 | """ 7 | from typing import Any, Dict, List 8 | 9 | from views.apps.data import api 10 | 11 | 12 | def _get_cols_source(transforms: List[api.Transform]) -> List[str]: 13 | """ Get a list of source columns needed for a list of Tranforms """ 14 | 15 | all_names = [transform.name for transform in transforms] 16 | all_cols = [] 17 | for transform in transforms: 18 | for col in transform.cols_input: 19 | all_cols.append(col) 20 | 21 | # Dedup 22 | all_cols = sorted(list(set(all_cols))) 23 | cols_source = [col for col in all_cols if col not in all_names] 24 | cols_source = sorted(cols_source) 25 | 26 | return cols_source 27 | 28 | 29 | def _order_transforms(transforms: List[api.Transform]) -> List[api.Transform]: 30 | """ Order transformations so they are done in dependency order """ 31 | 32 | def names(tasks): 33 | return [task.name for task in tasks] 34 | 35 | ordered: List[api.Transform] = list() 36 | while transforms: 37 | progress = False 38 | for task in transforms: 39 | # if task has deps in the other transforms that haven't 40 | # been solved themselves wait 41 | if any( 42 | [ 43 | col in names(transforms) and col not in names(ordered) 44 | for col in task.cols_input 45 | ] 46 | ): 47 | pass 48 | else: 49 | ordered.append(task) 50 | transforms.remove(task) 51 | progress = True 52 | if not progress: 53 | raise RuntimeError( 54 | "No progress, transform spec broken." 55 | f"Ordered (OK): {ordered}" 56 | f"Remaining: {transforms}" 57 | ) 58 | 59 | return ordered 60 | 61 | 62 | def make_transforms_ordered( 63 | specs: Dict[str, Dict[str, Any]] 64 | ) -> List[api.Transform]: 65 | """ Make dependency ordered list of Transform objects """ 66 | transforms = [api.Transform(name, **spec) for name, spec in specs.items()] 67 | transforms = _order_transforms(transforms) 68 | return transforms 69 | -------------------------------------------------------------------------------- /views/specs/models/README.md: -------------------------------------------------------------------------------- 1 | # Model specs 2 | 3 | ViEWS has a lot of big models with very many features. 4 | Keeping track of which column goes where can be very difficult. 5 | This module provides three master spec files, am.yaml, cm.yaml and pgm.yaml 6 | to attempt to keep track of them. 7 | 8 | The idea is to group columns into a hierarchy of 9 | * colsets, that list plain columns 10 | * themes, that groups colsets and other themes 11 | * formulas, that resolve a list of columns from the above 12 | 13 | Colsets, or column sets, are simply lists of columns with a name. 14 | Themes are made of colsets or by combining themes and colsets. 15 | Finally, formulas map all columns from a theme or colset to an outcome column. 16 | By applying the solver to these spec files we get solved formulas. 17 | They have a name, a col_outcome and a list of cols_features, which is found by recursively looking them up through themes and colsets. 18 | For a minimal example see tests/test_specs.py 19 | 20 | -------------------------------------------------------------------------------- /views/specs/models/__init__.py: -------------------------------------------------------------------------------- 1 | """ Defines which columns go into which models """ 2 | from typing import Dict, Any 3 | import os 4 | from views.utils import io 5 | from . 
import solver 6 | 7 | _THIS_DIR = os.path.dirname(__file__) 8 | 9 | cm: Dict[Any, Any] = solver.solve_formulas( 10 | io.load_yaml(os.path.join(_THIS_DIR, "cm.yaml")) 11 | ) 12 | pgm: Dict[Any, Any] = solver.solve_formulas( 13 | io.load_yaml(os.path.join(_THIS_DIR, "pgm.yaml")) 14 | ) 15 | 16 | __all__ = ["cm", "pgm"] 17 | -------------------------------------------------------------------------------- /views/specs/models/am.yaml: -------------------------------------------------------------------------------- 1 | colsets: {} 2 | themes: {} 3 | formulas: {} -------------------------------------------------------------------------------- /views/specs/models/solver.py: -------------------------------------------------------------------------------- 1 | """ Model specification solver """ 2 | 3 | import os 4 | from typing import Any, Dict, List, Union 5 | 6 | from views.utils import io 7 | 8 | 9 | def solve_formulas( 10 | spec: Dict[Any, Any] 11 | ) -> Dict[str, Dict[str, Union[List[str], str]]]: 12 | """ Solve the colsets, themes and formulas from spec """ 13 | 14 | def solve_theme( 15 | name_theme: str, colsets: Dict[str, str], themes: Dict[str, str] 16 | ) -> List[str]: 17 | """ Get the resolved list of columns for a single theme """ 18 | 19 | cols_theme = list() 20 | refs = themes[name_theme] 21 | for ref in refs: 22 | 23 | # If the reference is to a colset just get the cols 24 | if ref in colsets.keys(): 25 | for col in colsets[ref]: 26 | if col not in cols_theme: # avoid dups 27 | cols_theme.append(col) 28 | 29 | # Recursive lookup for themes 30 | elif ref in themes.keys(): 31 | for col in solve_theme(ref, colsets, themes): 32 | if col not in cols_theme: # avoid dups 33 | cols_theme.append(col) 34 | 35 | else: 36 | raise RuntimeError( 37 | f"{ref} not found in {colsets.keys()} or {themes.keys()}" 38 | ) 39 | 40 | return sorted(cols_theme) 41 | 42 | def solve_themes_and_colsets(spec: Dict[Any, Any]) -> Dict[str, List[str]]: 43 | """ Solve the themes by looking up names from colsets or themes """ 44 | 45 | solved_themes = dict() 46 | for name_theme in spec["themes"].keys(): 47 | solved_themes[name_theme] = solve_theme( 48 | name_theme, spec["colsets"], spec["themes"] 49 | ) 50 | for name_colset, colset in spec["colsets"].items(): 51 | solved_themes[name_colset] = colset 52 | return solved_themes 53 | 54 | assert list(spec.keys()) == ["colsets", "themes", "formulas"] 55 | 56 | solved_themes: Dict[str, List[str]] = solve_themes_and_colsets(spec) 57 | solved_formulas = dict() 58 | for name_formula, formula in spec["formulas"].items(): 59 | solved_formulas[name_formula] = { 60 | "col_outcome": formula["col_outcome"], 61 | "cols_features": sorted(solved_themes[formula["cols_features"]]), 62 | } 63 | return solved_formulas 64 | 65 | 66 | def solved_cm() -> Dict[str, Dict[str, Union[List[str], str]]]: 67 | """ Get solved CM formulas from cm.yaml """ 68 | spec = io.load_yaml(os.path.join(os.path.dirname(__file__), "cm.yaml")) 69 | formulas = solve_formulas(spec) 70 | return formulas 71 | 72 | 73 | def solved_pgm() -> Dict[str, Dict[str, Union[List[str], str]]]: 74 | """ Get solved PGM formulas from pgm.yaml """ 75 | spec = io.load_yaml(os.path.join(os.path.dirname(__file__), "pgm.yaml")) 76 | formulas = solve_formulas(spec) 77 | return formulas 78 | -------------------------------------------------------------------------------- /views/specs/periods/__init__.py: -------------------------------------------------------------------------------- 1 | """ Defines the training and prediction periods for each model run """ 2 | from
typing import Dict, List 3 | import os 4 | from views.utils import io 5 | from views.apps.model import api 6 | 7 | 8 | def get_periods(run_id: str) -> List[api.Period]: 9 | """ Get periods for a particular run as list """ 10 | _this_dir = os.path.dirname(__file__) 11 | spec = io.load_yaml(os.path.join(_this_dir, "periods.yaml"))["runs"] 12 | 13 | spec_run = spec[run_id] 14 | periods = [] 15 | for period_name, data in spec_run.items(): 16 | period = api.Period( 17 | name=period_name, 18 | train_start=data["train"]["start"], 19 | train_end=data["train"]["end"], 20 | predict_start=data["predict"]["start"], 21 | predict_end=data["predict"]["end"], 22 | ) 23 | periods.append(period) 24 | return periods 25 | 26 | 27 | def get_periods_by_name(run_id: str) -> Dict[str, api.Period]: 28 | """ Get periods for a particular run as name-index dict """ 29 | periods_list = get_periods(run_id) 30 | periods_by_name = dict() 31 | for period in periods_list: 32 | periods_by_name[period.name] = period 33 | 34 | return periods_by_name 35 | 36 | 37 | __all__ = ["get_periods", "get_periods_by_name"] 38 | -------------------------------------------------------------------------------- /views/specs/periods/periods.yaml: -------------------------------------------------------------------------------- 1 | # This file defines the time limits used in ViEWS 2 | # they are organised by each run 3 | # 4 | 5 | runs: 6 | 7 | # First model development run 8 | # All models will be trained here 9 | # When yearly data is released in 2020.05 or 2020.06 this should be 10 | # copied and re-run under a new name so that we are training on 11 | # latest yearly-release data 12 | # We don't want to move C train back to end at 2018.12 now though 13 | # because we have published results trained on data up op 2019.12 14 | # already. 15 | d_2020_04_01: 16 | A: # Calibration period for B 17 | train: 18 | start: 121 # 1990.01 19 | end: 396 # 2012.12 20 | predict: 21 | start: 397 # 2013.01 22 | end: 432 # 2015.12, 23 | B: # Evaluation period. Calibration for C. 24 | train: 25 | start: 121 # 1990.01 26 | end: 432 # 2015.12 27 | predict: 28 | start: 433 # 2016.01, 29 | end: 468 # 2018.12, last month yearly data 30 | C: 31 | train: 32 | start: 121 # 1990.01 33 | end: 480 # 2019.12, last month latest data 34 | predict: 35 | start: 483 # 2020.03 36 | end: 520 # 2023.04 37 | 38 | 39 | d_2020_05_01_prelim: 40 | # A preliminary run for UN Covid19 report 41 | # Includes the A partition so as to not break any compatibility for now 42 | A: # Calibration period for B 43 | train: 44 | start: 121 # 1990.01 45 | end: 396 # 2012.12 46 | predict: 47 | start: 397 # 2013.01 48 | end: 432 # 2015.12, 49 | B: # Evaluation period. Calibration for C. 50 | train: 51 | start: 121 # 1990.01 52 | end: 432 # 2015.12 53 | predict: 54 | start: 433 # 2016.01, 55 | end: 468 # 2018.12, last month yearly data 56 | C: 57 | train: 58 | start: 121 # 1990.01 59 | end: 480 # 2019.12 60 | predict: 61 | start: 484 # 2020.04 62 | end: 521 # 2023.05 63 | 64 | d_2020_06_01_prelim: 65 | # A preliminary run with OSA only, XGB and DS not merged yet =( 66 | # Includes the A partition so as to not break any compatibility for now 67 | A: # Calibration period for B 68 | train: 69 | start: 121 # 1990.01 70 | end: 396 # 2012.12 71 | predict: 72 | start: 397 # 2013.01 73 | end: 432 # 2015.12, 74 | B: # Evaluation period. Calibration for C. 
75 | train: 76 | start: 121 # 1990.01 77 | end: 432 # 2015.12 78 | predict: 79 | start: 433 # 2016.01, 80 | end: 468 # 2018.12, last month yearly data 81 | C: 82 | train: 83 | start: 121 # 1990.01 84 | end: 480 # 2019.12 85 | predict: 86 | start: 485 # 2020.05 87 | end: 522 # 2023.06 88 | 89 | 90 | r_2020_02_01: 91 | B: 92 | train: 93 | start: 121 # 1990.01 94 | end: 432 # 2015.12 95 | predict: 96 | start: 433 # 2016.01, 97 | end: 468 # 2018.12, last month yearly data 98 | C: 99 | train: 100 | start: 121 # 1990.01 101 | end: 480 # 2019.12 102 | predict: 103 | start: 481 # 2020.01 104 | end: 518 # 2023.02 # 38 months of forecast 105 | 106 | r_2020_03_01: 107 | B: 108 | train: 109 | start: 121 # 1990.01 110 | end: 432 # 2015.12 111 | predict: 112 | start: 433 # 2016.01, 113 | end: 468 # 2018.12, last month yearly data 114 | C: 115 | train: 116 | start: 121 # 1990.01 117 | end: 480 # 2019.12 118 | predict: 119 | start: 482 # 2020.02 120 | end: 519 # 2023.03 121 | 122 | r_2020_04_01: 123 | B: 124 | train: 125 | start: 121 # 1990.01 126 | end: 432 # 2015.12 127 | predict: 128 | start: 433 # 2016.01, 129 | end: 468 # 2018.12, last month yearly data 130 | C: 131 | train: 132 | start: 121 # 1990.01 133 | end: 480 # 2019.12 134 | predict: 135 | start: 483 # 2020.03 136 | end: 520 # 2023.04 137 | 138 | r_2020_05_01: 139 | B: 140 | train: 141 | start: 121 # 1990.01 142 | end: 432 # 2015.12 143 | predict: 144 | start: 433 # 2016.01, 145 | end: 468 # 2018.12, last month yearly data 146 | C: 147 | train: 148 | start: 121 # 1990.01 149 | end: 480 # 2019.12 150 | predict: 151 | start: 484 # 2020.04 152 | end: 521 # 2023.05 153 | 154 | r_2020_06_01: 155 | B: 156 | train: 157 | start: 121 # 1990.01 158 | end: 432 # 2015.12 159 | predict: 160 | start: 433 # 2016.01, 161 | end: 468 # 2018.12, last month yearly data 162 | C: 163 | train: 164 | start: 121 # 1990.01 165 | end: 480 # 2019.12 166 | predict: 167 | start: 485 # 2020.05 168 | end: 522 # 2023.06 169 | 170 | r_2020_07_01: 171 | B: 172 | train: 173 | start: 121 # 1990.01 174 | end: 432 # 2015.12 175 | predict: 176 | start: 433 # 2016.01, 177 | end: 468 # 2018.12, last month yearly data 178 | C: 179 | train: 180 | start: 121 # 1990.01 181 | end: 480 # 2019.12 182 | predict: 183 | start: 486 # 2020.06 184 | end: 523 # 2023.07 -------------------------------------------------------------------------------- /views/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ Common utilities """ 2 | __all__ = [ 3 | "data", 4 | "db", 5 | "io", 6 | "log", 7 | "misc", 8 | "mocker", 9 | "stats", 10 | ] 11 | 12 | from . import data, db, io, log, misc, mocker, stats 13 | -------------------------------------------------------------------------------- /views/utils/data.py: -------------------------------------------------------------------------------- 1 | """ Common data utilities """ 2 | from typing import List, Union 3 | import logging 4 | 5 | import numpy as np # type: ignore 6 | import pandas as pd # type: ignore 7 | 8 | log = logging.getLogger(__name__) 9 | 10 | 11 | def resample( 12 | df: pd.DataFrame, 13 | cols: List[str], 14 | share_positives: float, 15 | share_negatives: float, 16 | threshold=0, 17 | ): 18 | """ Resample a dataframe with respect to cols 19 | 20 | Resampling is a technique for changing the positive/negative balance 21 | of a dataframe. Positives are rows where any of the specified cols 22 | are greater than the threshold. 
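A sketch of intended use (column name and shares are illustrative):

    df_balanced = resample(
        df, cols=["ged_dummy_sb"], share_positives=1.0, share_negatives=0.1
    )
    # Keeps all positive rows and a ~10% random sample of the negative
    # rows; shares above 1 sample with replacement (upsampling).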
Resampling is useful for highly unbalanced 23 | datasets where positive outcomes are rare. 24 | 25 | """ 26 | # If both shares are 1 just return the unaltered df 27 | if share_positives == 1 and share_negatives == 1: 28 | return df 29 | 30 | # Negatives are rows where all cols are close to zero 31 | mask_negatives = np.isclose(df[cols], threshold).max(axis=1) 32 | # Positives are all the others 33 | mask_positives = ~mask_negatives 34 | 35 | df_positives = df.loc[mask_positives] 36 | df_negatives = df.loc[mask_negatives] 37 | 38 | len_positives = len(df_positives) 39 | len_negatives = len(df_negatives) 40 | 41 | n_positives_wanted = int(share_positives * len_positives) 42 | n_negatives_wanted = int(share_negatives * len_negatives) 43 | 44 | replacement_pos = share_positives > 1 45 | replacement_neg = share_negatives > 1 46 | df = pd.concat( 47 | [ 48 | df_positives.sample(n=n_positives_wanted, replace=replacement_pos), 49 | df_negatives.sample(n=n_negatives_wanted, replace=replacement_neg), 50 | ] 51 | ) 52 | return df 53 | 54 | 55 | def check_has_multiindex(data: Union[pd.Series, pd.DataFrame]) -> None: 56 | """ Raise RuntimeError if data doesn't have a MultiIndex """ 57 | if not isinstance(data.index, pd.MultiIndex): 58 | msg = ( 59 | "Data is lacking a multiindex that was expected. " 60 | "Set the index with df.set_index([timevar, groupvar])." 61 | ) 62 | raise RuntimeError(msg) 63 | 64 | 65 | def balance_panel_last_t(df: pd.DataFrame) -> pd.DataFrame: 66 | """ Balance a multiindexed dataframe panel. 67 | 68 | The balanced index has observations for all groups present at the 69 | last t. 70 | Assumes df is indexed with timevar as index level 0, and groupvar 71 | at index level 1. 72 | 73 | Args: 74 | df: Dataframe with multiindex to balance 75 | Returns: 76 | df: A reindexed dataframe 77 | """ 78 | log.debug(f"Balancing index of panel with shape {df.shape}") 79 | check_has_multiindex(df) 80 | 81 | # Reset the index to actual values, 82 | # Needed in case data has been subsetted with .loc before 83 | # If this isn't done, df.index.levels[0].max() gets the 84 | # pre-subsetting max 85 | df = df.reset_index().set_index(df.index.names).sort_index() 86 | 87 | return df.reindex( 88 | pd.MultiIndex.from_product( 89 | [ 90 | df.index.levels[0].unique(), 91 | df.loc[df.index.levels[0].max()].index.unique(), 92 | ], 93 | names=df.index.names, 94 | ) 95 | ).sort_index() 96 | 97 | 98 | def assign_into_df(df_to: pd.DataFrame, df_from: pd.DataFrame) -> pd.DataFrame: 99 | """ Assign all columns from df_from into df_to 100 | 101 | Only assigns non-missing values from df_from, meaning the 102 | same column can be inserted multiple times and values be 103 | retained if the row coverage is different between calls. 104 | So a df_a with col_a covering months 100-110 and df_b with col_a covering 105 | months 111-120 could be assigned into a single df which would get 106 | values of col_a for months 100 - 120.
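A minimal sketch of that pattern (df_a and df_b as described above):

    df = assign_into_df(df_to=df, df_from=df_a)  # col_a, months 100-110
    df = assign_into_df(df_to=df, df_from=df_b)  # col_a, months 111-120
    # df now holds col_a for months 100-120; values written by the first
    # call are kept because only non-missing values are assigned.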
107 | """ 108 | 109 | for col in df_from: 110 | log.debug(f"Inserting col {col}") 111 | # Get a Series of the col for all rows 112 | s = df_from.loc[:, col] 113 | # Get the "is not null" boolean series to use as mask, ~ is NOT 114 | mask = ~s.isnull() 115 | # Get the index from that mask, 116 | # ix is now index labels of rows with (not missing) data 117 | ix = s.loc[mask].index 118 | df_to.loc[ix, col] = s.loc[ix] 119 | return df_to 120 | 121 | 122 | def rebuild_index(data: pd.DataFrame) -> pd.DataFrame: 123 | """ Rebuild the index of the dataframe 124 | 125 | Sometimes we construct new dataframes from old ones or subset 126 | dataframes by time. The contents of the df.index of the new 127 | dataframes then still contain the full set of values from the old 128 | df. This function rebuilds the index to only have the actual 129 | values with rows. 130 | """ 131 | check_has_multiindex(data) 132 | return data.reset_index().set_index(data.index.names).sort_index() 133 | -------------------------------------------------------------------------------- /views/utils/log.py: -------------------------------------------------------------------------------- 1 | """ Logging utils """ 2 | from functools import wraps 3 | import datetime 4 | import logging 5 | import os 6 | import time 7 | import uuid 8 | 9 | from views.config import DIR_STORAGE 10 | 11 | log = logging.getLogger(__name__) 12 | 13 | 14 | def utc_now() -> str: 15 | """ Get current UTC time """ 16 | return datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S") 17 | 18 | 19 | def get_log_path(caller_path: str) -> str: 20 | """ Get unique and timestamped path to a logfile """ 21 | name = os.path.basename(caller_path).replace(".py", "") 22 | # Hopefully unique filename with timestamp and part of a uuid 23 | fname = f"{name}_{utc_now()}_{str(uuid.uuid4()).split('-')[0]}.log" 24 | path = os.path.join(DIR_STORAGE, "logs", fname) 25 | print(f"Logging to {path}") 26 | return path 27 | 28 | 29 | def logtime(func): 30 | """This decorator logs the execution time for the decorated function.""" 31 | 32 | @wraps(func) 33 | def wrapper(*args, **kwargs): 34 | start = time.time() 35 | result = func(*args, **kwargs) 36 | end = time.time() 37 | log.debug("{} ran in {}s".format(func.__name__, round(end - start, 2))) 38 | return result 39 | 40 | return wrapper 41 | -------------------------------------------------------------------------------- /views/utils/misc.py: -------------------------------------------------------------------------------- 1 | """ Misc utils that don't fit anyhwere else """ 2 | from typing import Any, List 3 | 4 | 5 | def lists_disjoint(lists: List[List[Any]]) -> bool: 6 | """ Do lists share any elements""" 7 | disjoint = True 8 | for i, base_list in enumerate(lists): 9 | lists_to_check = lists[i + 1 :] 10 | for to_check in lists_to_check: 11 | if not set(base_list).isdisjoint(to_check): 12 | disjoint = False 13 | return disjoint 14 | -------------------------------------------------------------------------------- /views/utils/stats.py: -------------------------------------------------------------------------------- 1 | """ Statistical utils 2 | 3 | #@TODO: Figure out numpy / pandas types here 4 | """ 5 | from typing import Any 6 | import warnings 7 | 8 | import numpy as np # type: ignore 9 | 10 | 11 | def prob_to_odds(p: Any, clip=True) -> Any: 12 | """ Cast probability into odds """ 13 | 14 | if isinstance(p, list): 15 | p = np.array(p) 16 | 17 | if clip: 18 | offset = 1e-10 19 | offset = 1e-10 20 | upper = 1 - offset 21 | lower = 0 + 
offset 22 | p = np.clip(p, lower, upper) 23 | 24 | # Check for probs >= 1 because the odds of p = 1 are infinite, which might break things 25 | if np.any(p >= 1): 26 | msg = "probs >= 1 passed to prob_to_odds, expect infs" 27 | warnings.warn(msg) 28 | 29 | odds = p / (1 - p) 30 | return odds 31 | 32 | 33 | def prob_to_logodds(p: Any) -> Any: 34 | """ Cast probability to log-odds """ 35 | return np.log(prob_to_odds(p)) 36 | 37 | 38 | def odds_to_prob(odds: Any) -> Any: 39 | """ Cast odds ratio to probability """ 40 | return odds / (odds + 1) 41 | 42 | 43 | def logodds_to_prob(logodds: Any) -> Any: 44 | """ Cast logodds to probability """ 45 | return odds_to_prob(np.exp(logodds)) 46 | --------------------------------------------------------------------------------
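A quick round-trip sketch of the helpers in views/utils/stats.py (values are
illustrative; assumes the views package is installed and importable):

    import numpy as np
    from views.utils import stats

    p = np.array([0.01, 0.5, 0.99])
    odds = stats.prob_to_odds(p)  # p / (1 - p), clipped away from 0 and 1
    assert np.allclose(stats.odds_to_prob(odds), p)
    assert np.allclose(stats.logodds_to_prob(stats.prob_to_logodds(p)), p)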