├── dashboard-dataviz ├── README.md ├── dashboard │ ├── text_inputs │ │ └── README.md │ ├── data_inputs_for_dashboard │ │ └── README.md │ ├── preparing_data_for_dashboard │ │ ├── README.md │ │ ├── 01_clean_spatial_data │ │ │ ├── README.md │ │ │ ├── clean_adm3_file.R │ │ │ └── clean_adm2_file.R │ │ ├── 03_dashboard_data_prep │ │ │ ├── prep_subs_obs_totals_data.R │ │ │ ├── data_to_github.R │ │ │ └── README.md │ │ ├── 02_clean_telecom_data │ │ │ ├── clean_i3_subscribers_data.R │ │ │ ├── clean_i5_net_movement_data.R │ │ │ ├── clean_i5_movement_inout_data.R │ │ │ ├── clean_i7_distance_traveled.R │ │ │ └── README.md │ │ └── _dash_master.R │ ├── functions.R │ ├── styles.css │ └── README.md └── figures │ ├── _master_figures.R │ ├── i3_figures.R │ ├── i5_net_figures.R │ └── i5_into_out.R ├── data-checks ├── Archive │ ├── patch_cleaning.py │ ├── Descr-exploratory │ │ ├── draf.py │ │ ├── i5-plot.py │ │ └── fb-comparisson-draft.py │ ├── globals.py │ ├── quick_checks │ │ ├── check_subscribers.R │ │ └── ward_neighbors_tower_down.R │ ├── usage_outliers.py │ ├── i10-check.py │ ├── MASTER.py │ ├── 02_summary_stats.py │ ├── 03_i_specific_checks_i1_admin2.py │ ├── data_files_comparisson.py │ ├── 01_completenes_checks.py │ └── od_scaling.py └── README.md ├── cdr-aggregation ├── docker-compose.yml ├── notebooks │ ├── modules │ │ ├── setup.py │ │ ├── README.md │ │ ├── folder_utils.py │ │ ├── import_packages.py │ │ ├── utilities.py │ │ ├── flowminder_aggregator.py │ │ ├── voronoi.py │ │ ├── outliers.py │ │ ├── aggregator.py │ │ ├── tower_clustering.py │ │ └── sql_code_aggregates.py │ ├── folder_setup.py │ ├── README.md │ ├── folder_setup.ipynb │ └── aggregation_master.py ├── docker │ └── Dockerfile ├── config_file_template.py └── config_file_template_hive.py ├── data-panel ├── Archive │ ├── _master.py │ ├── usage_outliers.py │ ├── 02_clean.py │ ├── panel_draft2.py │ └── panel_draft.py ├── 01_construct.py └── utils.py └── .gitignore /dashboard-dataviz/README.md: -------------------------------------------------------------------------------- 1 | # Dashboard and Figures -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/text_inputs/README.md: -------------------------------------------------------------------------------- 1 | # Text Inputs -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/data_inputs_for_dashboard/README.md: -------------------------------------------------------------------------------- 1 | # Data Inputs for Dashboard -------------------------------------------------------------------------------- /data-checks/Archive/patch_cleaning.py: -------------------------------------------------------------------------------- 1 | 2 | # Cleaning 3 | 4 | fi = pd.read_csv(ICUST_adm3_path + file_name) 5 | -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/preparing_data_for_dashboard/README.md: -------------------------------------------------------------------------------- 1 | # Files for data visualization and dashboards 2 | -------------------------------------------------------------------------------- /cdr-aggregation/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | 4 | jupyter: 5 | build: 6 | context: . 
7 | dockerfile: ./docker/Dockerfile 8 | image: sebxwolf/cdr_aggregation_pyspark:v1 9 | container_name: cdr_aggregation 10 | ports: 11 | - "8888:8888" 12 | - "4040:4040" 13 | volumes: 14 | - ./:/home/jovyan/work 15 | -------------------------------------------------------------------------------- /data-panel/Archive/_master.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # PANEl MASTER 3 | #-----------------------------------------------------------------# 4 | 5 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/" 6 | DATA_POC = DATA_path + "proof-of-concept/" 7 | DATA_panel = DATA_POC + "panel_indicators/" 8 | DATA_panel_raw = DATA_panel + 'raw/' 9 | DATA_panel_clean = DATA_panel + 'clean/' -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/setup.py: -------------------------------------------------------------------------------- 1 | # all the modules we need 2 | 3 | from modules.import_packages import * 4 | from modules.DataSource import * 5 | from modules.utilities import * 6 | from modules.aggregator import * 7 | from modules.flowminder_aggregator import * 8 | from modules.priority_aggregator import * 9 | from modules.custom_aggregator import * 10 | from modules.scaled_aggregator import * 11 | from modules.sql_code_aggregates import * 12 | from modules.folder_utils import * 13 | -------------------------------------------------------------------------------- /cdr-aggregation/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jupyter/pyspark-notebook:dc9744740e12 2 | 3 | RUN python --version 4 | 5 | RUN conda install --quiet --yes -c \ 6 | conda-forge jupyter_contrib_nbextensions jupyter_nbextensions_configurator \ 7 | geopandas folium descartes 8 | 9 | RUN pip install -U folium \ 10 | geovoronoi \ 11 | geopy 12 | 13 | RUN jupyter labextension install @jupyterlab/toc 14 | 15 | VOLUME /home/jovyan/work 16 | WORKDIR /home/jovyan/work 17 | -------------------------------------------------------------------------------- /data-checks/Archive/Descr-exploratory/draf.py: -------------------------------------------------------------------------------- 1 | # Indicator 1 panel data 2 | i1 = pd.read_csv( OUT_hfcs + 'Sheet comp panel/i1_admin3.csv') 3 | i1 = i1[i1.region != '99999'] 4 | 5 | i3 = pd.read_csv( OUT_hfcs + 'Sheet comp panel/i3_admin3.csv') 6 | i3 = i3[i3.region != '99999'] 7 | 8 | i1['date'] = pd.to_datetime(i1['hour']).dt.date 9 | i3['date'] = pd.to_datetime(i3['day']).dt.date 10 | 11 | 12 | # Number of calls per day 13 | i1_day = i1.groupby(['date', 'region'])['count_p'].sum().reset_index() 14 | 15 | # Merge 16 | i13 = i1_day.merge(i3[['date', 'count_p', 'region']].rename(columns = {'count_p' : 'subscribers'}), 17 | on = ['date', 'region']) 18 | 19 | np.mean(i13['count_p']/i13['subscribers']) -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/folder_setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[ ]: 5 | 6 | 7 | import datetime as dt 8 | from modules.DataSource import * 9 | from modules.folder_utils import * 10 | 11 | 12 | # In[ ]: 13 | 14 | 15 | #Set relative file path to config file 16 | config_file = '../config_file.py' 17 | exec(open(config_file).read()) 18 | 19 | 20 | # 
In[ ]: 21 | 22 | 23 | #Create the DataSource object and show config 24 | ds = DataSource(datasource_configs) 25 | ds.show_config() 26 | 27 | 28 | # In[ ]: 29 | 30 | 31 | #Setup all required data folders 32 | setup_folder(ds) 33 | 34 | 35 | # In[ ]: 36 | 37 | 38 | #Check if required data folders already exists 39 | check_folders(ds) 40 | 41 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Notebook organization 2 | 3 | The [aggregation_master.py](https://github.com/worldbank/covid-mobile-data/tree/cdr-master/cdr-aggregation/notebooks/aggregation_master.py) script is currently set to run all flowminder, priority and scaled indicators. Additional custom indicators are left out. 4 | 5 | The [aggregation_master.ipynb](https://github.com/worldbank/covid-mobile-data/tree/cdr-master/cdr-aggregation/notebooks/aggregation_master.py) notebook does the same and can be used for data exploration, too. 6 | 7 | The [aggregation_master_databricks.py](https://github.com/worldbank/covid-mobile-data/tree/cdr-master/cdr-aggregation/notebooks/aggregation_master_databricks.py) notebook is customised for databricks. 8 | -------------------------------------------------------------------------------- /cdr-aggregation/config_file_template.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import * 2 | schema = StructType([ 3 | StructField("msisdn", IntegerType(), True), 4 | StructField("call_datetime", StringType(), True), #load as string, will be turned into datetime in standardize_csv_files() 5 | StructField("location_id", StringType(), True) 6 | ]) 7 | 8 | datasource_configs = { 9 | "base_path": "/home/jovyan/work/data", #folder path used in this docker env 10 | "country_code": "", 11 | "telecom_alias": "", 12 | "schema" : schema, 13 | "data_paths" : ["*.csv"], 14 | "filestub": "", 15 | "geofiles": {}, 16 | "shapefiles": ['admin2','admin3', 'voronoi'], 17 | "dates": {'start_date' : dt.datetime(2020,2,1), 18 | 'end_date' : dt.datetime(2020,3,31)} 19 | } 20 | -------------------------------------------------------------------------------- /data-checks/Archive/globals.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # DATA CHECKS - Globals 3 | #-----------------------------------------------------------------# 4 | 5 | # This file contains settings and globals used across data checks 6 | # files 7 | 8 | # LIBRARIES 9 | import os 10 | import re 11 | import pandas as pd 12 | import numpy as np 13 | import datetime as dt 14 | 15 | import seaborn as sns; sns.set() 16 | from matplotlib import rcParams 17 | import matplotlib.pyplot as plt 18 | 19 | from bokeh.plotting import figure, output_file, show 20 | from bokeh.models import Span 21 | from bokeh.io import export_png 22 | 23 | 24 | # GLOBALS 25 | 26 | # File paths 27 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/" 28 | OUT_path = DATA_path + 'proof-of-concept/outputs/' 29 | 30 | # Default values 31 | missing_values = ['99999',''] -------------------------------------------------------------------------------- /cdr-aggregation/config_file_template_hive.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import * 2 | schema = StructType([ 3 | 
StructField("msisdn", IntegerType(), True),
 4 |     StructField("call_datetime", StringType(), True), #load as string, will be turned into datetime in standardize_csv_files()
 5 |     StructField("location_id", StringType(), True)
 6 | ])
 7 | 
 8 | datasource_configs = {
 9 |     "base_path": "path_to_folder/data", #folder path used in this docker env
10 |     "hive_warehouse_location": "path_to_hive_warehouse",
11 |     "spark_mode": 'hive',
12 |     "hive_vars":{ 'msisdn' : 'col1',
13 |                   'call_datetime': 'col2',
14 |                   'location_id': 'col3',
15 |                   'calls': 'table'},
16 |     "country_code": "",
17 |     "telecom_alias": "",
18 |     "schema" : schema,
19 |     "data_paths" : ["*.csv"],
20 |     "filestub": "",
21 |     "geofiles": {},
22 |     "shapefiles": ['admin2','admin3', 'voronoi'],
23 |     "dates": {'start_date' : dt.datetime(2020,2,1),
24 |               'end_date' : dt.datetime(2020,3,31)}
25 | }
26 | 
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/01_clean_spatial_data/README.md:
--------------------------------------------------------------------------------
 1 | # Clean Spatial Data
 2 | 
 3 | Cleans spatial datasets:
 4 | 1. Aggregate units when needed (e.g., aggregating wards)
 5 | 2. Add additional variables (e.g., area)
 6 | 3. Standardize variable names
 7 | 4. Order spatial data by region
 8 | 
 9 | ### Standardize Variable Names
10 | Each spatial dataset should have standardized variable names. Standardizing
11 | variable names helps ensure different units (e.g., admin2, admin3) can be
12 | easily switched in the dashboard.
13 | 
14 | | variable | format | example | description |
15 | |---|---|---|---|
16 | | region | string | ZW123456 | Unique identifier of the spatial unit |
17 | | name | string | Name | Spatial unit name |
18 | | area | numeric | 1234 | Area of the spatial unit in square kilometers |
19 | | province | string | Name | Name of the province |
20 | 
21 | ### Order Spatial Data
22 | Spatial datasets are ordered by region. When cleaning other datasets at the
23 | region level, we also order by region and ensure all regions are present. This
24 | ensures that no reordering needs to be done in the dashboard.
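
For reference, a minimal sketch of this convention in R (the object name `adm` and the raw
column name `NAME_2` are placeholders for whatever the raw shapefile provides; the cleaning
scripts in this folder are the authoritative version):

```r
# Hypothetical raw input: a SpatialPolygonsDataFrame `adm` with a NAME_2 column
adm@data <- adm@data %>%
  dplyr::rename(name = NAME_2) %>%   # standardized unit name
  dplyr::mutate(region = name)       # unique identifier of the spatial unit

adm$area     <- geosphere::areaPolygon(adm) / 1000^2   # area in square kilometers
adm$province <- NA                                      # fill in if available

# Order by region so the dashboard never needs to reorder
adm <- adm[order(as.character(adm$region)), ]
```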
25 | 
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/01_clean_spatial_data/clean_adm3_file.R:
--------------------------------------------------------------------------------
 1 | # Clean ADM3 File
 2 | 
 3 | # Load Data --------------------------------------------------------------------
 4 | # LOAD DATA HERE
 5 | 
 6 | # Subset/Add Variables ---------------------------------------------------------
 7 | adm3@data <- adm3@data %>%
 8 |   dplyr::select(NAME_3) %>%
 9 |   dplyr::rename(name = NAME_3) %>%
10 |   dplyr::mutate(region = name)
11 | 
12 | adm3$area <- geosphere::areaPolygon(adm3) / 1000^2
13 | 
14 | # Simplify (to speed up plotting) ----------------------------------------------
15 | # For ms_simplify, the polygon IDs and the data row names need to match
16 | pid <- sapply(slot(adm3, "polygons"), function(x) slot(x, "ID"))
17 | row.names(adm3) <- pid
18 | 
19 | adm3 <- rmapshaper::ms_simplify(adm3)
20 | 
21 | # Arrange ----------------------------------------------------------------------
22 | #### Order by region
23 | adm3$region <- adm3$region %>% as.character()
24 | adm3 <- adm3[order(adm3$region),]
25 | 
26 | # Export -----------------------------------------------------------------------
27 | saveRDS(adm3, file.path(GEO_PATH, "adm3.Rds"))
28 | 
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/01_clean_spatial_data/clean_adm2_file.R:
--------------------------------------------------------------------------------
 1 | # Clean ADM2 File
 2 | 
 3 | # Load Data --------------------------------------------------------------------
 4 | # LOAD DATA HERE
 5 | 
 6 | # Subset/Add Variables ---------------------------------------------------------
 7 | adm2@data <- adm2@data %>%
 8 |   dplyr::select(NAME_2) %>%
 9 |   dplyr::rename(name = NAME_2) %>%
10 |   dplyr::mutate(region = name)
11 | 
12 | adm2$area <- geosphere::areaPolygon(adm2) / 1000^2
13 | 
14 | adm2$province <- NA
15 | 
16 | # Simplify (to speed up plotting) ----------------------------------------------
17 | # For ms_simplify, the polygon IDs and the data row names need to match
18 | pid <- sapply(slot(adm2, "polygons"), function(x) slot(x, "ID"))
19 | row.names(adm2) <- pid
20 | 
21 | adm2 <- rmapshaper::ms_simplify(adm2)
22 | 
23 | # Arrange ----------------------------------------------------------------------
24 | #### Order by region
25 | adm2$region <- adm2$region %>% as.character()
26 | adm2 <- adm2[order(adm2$region),]
27 | 
28 | # Export -----------------------------------------------------------------------
29 | saveRDS(adm2, file.path(GEO_PATH, "adm2.Rds"))
30 | 
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/functions.R:
--------------------------------------------------------------------------------
 1 | # Functions ====================================================================
 2 | 
 3 | #### Log values with negatives
 4 | # Define a function to take the log of values that can deal with negative
 5 | # values: take the absolute value, log it, then reapply the negative sign.
 6 | log_neg <- function(values){
 7 |   # Log that takes into account zero. Only for logging values for
 8 |   # displaying!
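  # Example: log_neg(c(10, 0, -10)) returns c(log(11), 0, -log(11)), i.e. a symmetric log transform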
9 | 10 | values_pos_index <- (values > 0) %in% T # %in% T to account for NAs 11 | values_neg_index <- (values <= 0) %in% T 12 | 13 | values_pos_log <- log(values[values_pos_index]+1) 14 | values_neg_log <- -log(-(values[values_neg_index])+1) 15 | 16 | values[values_pos_index] <- values_pos_log 17 | values[values_neg_index] <- values_neg_log 18 | 19 | return(values) 20 | } 21 | 22 | as.character.htmlwidget <- function(x, ...) { 23 | htmltools::HTML( 24 | htmltools:::as.character.shiny.tag.list( 25 | htmlwidgets:::as.tags.htmlwidget( 26 | x 27 | ), 28 | ... 29 | ) 30 | ) 31 | } 32 | 33 | add_deps <- function(dtbl, name, pkg = name) { 34 | tagList( 35 | dtbl, 36 | htmlwidgets::getDependency(name, pkg) 37 | ) 38 | } -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/styles.css: -------------------------------------------------------------------------------- 1 | input[type="number"] { 2 | max-width: 80%; 3 | } 4 | 5 | div.outer { 6 | position: fixed; 7 | top: 50px; 8 | left: 0; 9 | right: 0; 10 | bottom: 0; 11 | overflow: hidden; 12 | padding: 0; 13 | } 14 | 15 | /* Customize fonts */ 16 | body, label, input, button, select { 17 | font-family: 'Helvetica Neue', Helvetica; 18 | font-weight: 200; 19 | } 20 | h1, h2, h3, h4 { font-weight: 400; } 21 | 22 | #controls { 23 | /* Appearance */ 24 | background-color: white; 25 | padding: 0 20px 20px 20px; 26 | cursor: move; 27 | /* Fade out while not hovering */ 28 | opacity: 0.76; 29 | zoom: 0.95; 30 | transition: opacity 0ms 0ms; 31 | } 32 | #controls:hover { 33 | /* Fade in while hovering */ 34 | opacity: 0.99; 35 | transition-delay: 0; 36 | } 37 | 38 | #logo { 39 | /* Appearance */ 40 | background-color: transparent; 41 | cursor: move; 42 | /* Fade out while not hovering */ 43 | opacity: 0.25; 44 | zoom: 0.9; 45 | transition: opacity 500ms 1s; 46 | } 47 | 48 | #logo:hover { 49 | /* Fade in while hovering */ 50 | opacity: 0.95; 51 | transition-delay: 0; 52 | } 53 | 54 | #img-id{ 55 | position: fixed; 56 | right: 10px; 57 | top: 5px; 58 | } -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/preparing_data_for_dashboard/03_dashboard_data_prep/prep_subs_obs_totals_data.R: -------------------------------------------------------------------------------- 1 | # Prep Subscribers / Observations Total Data 2 | 3 | # Prep datsets for line graphs on about page. 
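# Both totals below are computed from the hourly indicator_01_02 result file, aggregated to the day level.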
 4 | 
 5 | # Subscribers ------------------------------------------------------------------
 6 | subs_adm2 <- read.csv(file.path(RAW_INDICATORS, paste0("indicator_01_02_adm3_hour_result.csv")),
 7 |                       stringsAsFactors=F)
 8 | 
 9 | subs_adm2 <- subs_adm2 %>%
10 |   group_by(pdate) %>%
11 |   dplyr::summarise(Subscribers = sum(totalimei)) %>%
12 |   dplyr::rename(Date = pdate) %>%
13 |   mutate(Date = Date %>% ymd)
14 | 
15 | saveRDS(subs_adm2, file.path(DASHBOARD_DATA_ONEDRIVE_PATH,"subscribers_total.Rds"))
16 | 
17 | # Observations -----------------------------------------------------------------
18 | obs_adm2 <- read.csv(file.path(RAW_INDICATORS, paste0("indicator_01_02_adm3_hour_result.csv")),
19 |                      stringsAsFactors=F)
20 | 
21 | obs_adm2 <- obs_adm2 %>%
22 |   group_by(pdate) %>%
23 |   dplyr::summarise(Observations = sum(total)) %>%
24 |   dplyr::rename(Date = pdate) %>%
25 |   mutate(Date = Date %>% ymd)
26 | 
27 | saveRDS(obs_adm2, file.path(DASHBOARD_DATA_ONEDRIVE_PATH,"observations_total.Rds"))
28 | 
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/modules/README.md:
--------------------------------------------------------------------------------
 1 | # Module organization
 2 | 
 3 | ## Aggregation
 4 | The base class `aggregator` defined in [aggregator.py](https://github.com/worldbank/covid-mobile-data/tree/cdr-master/cdr-aggregation/notebooks/modules/aggregator.py) implements methods and attributes shared by all aggregator classes. At the next level, `flowminder_aggregator` and `priority_aggregator` implement SQL queries from [Flowminder](https://github.com/Flowminder) and priority indicators designed by this task force (written in PySpark), respectively. Beyond that, the classes `scaled_aggregator` and `custom_aggregator` implement priority indicators scaled by a resident count, and additional custom PySpark indicators, respectively. Both inherit from the `priority_aggregator` class.
 5 | 
 6 | ```
 7 | |-- aggregator
 8 | |   |-- flowminder_aggregator
 9 | |   |-- priority_aggregator
10 | |   |-- scaled_aggregator
11 | |   |-- custom_aggregator
12 | ```
13 | 
14 | ## Clustering and tessellation
15 | The `voronoi` and `tower_clustering` modules implement Voronoi tessellation given tower locations; these are used in the setup phase to create tower-to-region mappings.
16 | 
17 | ## Outlier analysis
18 | The `outliers` module can be used to study outlier observations.
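
The aggregation hierarchy above can also be read as a Python skeleton. This is schematic only
(constructor arguments and methods are omitted; it is not the actual module code):

```python
class aggregator:                                  # shared attributes and methods
    ...

class flowminder_aggregator(aggregator):           # Flowminder SQL indicators
    ...

class priority_aggregator(aggregator):             # priority indicators written in PySpark
    ...

class scaled_aggregator(priority_aggregator):      # priority indicators scaled by resident counts
    ...

class custom_aggregator(priority_aggregator):      # additional custom PySpark indicators
    ...
```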
19 | -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/preparing_data_for_dashboard/03_dashboard_data_prep/data_to_github.R: -------------------------------------------------------------------------------- 1 | # Transfer dashboard data from OneDrive to Github 2 | 3 | 4 | ## Remove previous files in github 5 | REMOVE_PREVIOUS_FILES <- F 6 | 7 | if(REMOVE_PREVIOUS_FILES){ 8 | temp <- list.files(DASHBOARD_DATA_GITHUB_PATH, 9 | full.names = T, 10 | pattern = "*.Rds") %>% 11 | lapply(file.remove) 12 | 13 | } 14 | 15 | 16 | # Move telecom data to github folder ------------------------------------------- 17 | i <- 1 18 | 19 | telecom_files <- list.files(DASHBOARD_DATA_ONEDRIVE_PATH, pattern = "*.Rds") 20 | 21 | #telecom_files <- telecom_files[grepl("spark", telecom_files)] 22 | 23 | temp <- telecom_files %>% 24 | lapply(function(file_i){ 25 | if((i %% 100) %in% 0) print(paste(i, "/", length(telecom_files))) 26 | i <<- i + 1 27 | 28 | file.copy(file.path(DASHBOARD_DATA_ONEDRIVE_PATH, file_i), 29 | paste0(DASHBOARD_DATA_GITHUB_PATH, "/"), 30 | overwrite=T) 31 | }) 32 | 33 | 34 | # Move geofiles to github folder ----------------------------------------------- 35 | for(file_i in list.files(GEO_PATH)){ 36 | file.copy(file.path(GEO_PATH, file_i), 37 | paste0(DASHBOARD_DATA_GITHUB_PATH, "/"), 38 | overwrite=T) 39 | } 40 | 41 | 42 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/folder_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | ###################################### 3 | # Folder setup methods - written for the jupyter notebook docker image 4 | 5 | #Loops over the requred folders for teh datasource and create any missing folders 6 | def setup_folder(datasource): 7 | #Loop over required paths, and return true 8 | for folder in datasource.required_folders(): 9 | test_folder(folder, create_if_not_exist=True) 10 | return True 11 | 12 | #Check if all required folders exist without creating them 13 | def check_folders(datasource): 14 | return_boolean = True 15 | #loop over required folders 16 | for folder in datasource.required_folders(): 17 | if not test_folder(folder, create_if_not_exist=False): 18 | print("Folder '{}' is required but does not exist".format(folder)) 19 | return_boolean = False 20 | return return_boolean 21 | 22 | #Utility that check if folder exist 23 | def test_folder(path, create_if_not_exist): 24 | #If folder exists return true 25 | if os.path.exists(path): return True 26 | #Else: if create_if_not_exist is true then create folder and return true 27 | elif create_if_not_exist: 28 | os.makedirs(path) 29 | return True 30 | #Else: Folder does not exist and folder is not created, return false 31 | else: return False 32 | -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/preparing_data_for_dashboard/03_dashboard_data_prep/README.md: -------------------------------------------------------------------------------- 1 | # Dashboard Data Prep 2 | 3 | Due to the high volume of data, data transformations (e.g., aggregating, filtering, etc) are done outside of the dashboard in order to minimize the processing and data needed to be loaded in memory at any point as the dashboard is running. 
These scripts filter the cleaned telecom data into individual datasets so that no additional filtering or transformations need to be applied within the dashboard; the dashboard can just read the files then immediately use the data in the map, line graph and table. Here, we create smaller datasets that contain the same variables as above. Indicators include density, movement in, movement out, mean distance traveled, etc. 4 | 5 | The following datasets are made. 6 | 7 | | Dataset Type | Naming Convention | Description | 8 | | --- | --- | --- | 9 | | unit-level | [Unit Name]\_[Indicator Name]\_[Daily/Weekly]\_[Date/Week].Rds | For a given day or week, this dataset contains information for all wards or districts for a specified indicator. For O-D level datasets, values are aggregated to the specified origin or destination unit (eg, movement into unit from all other units). | 10 | | time-level | [Unit Name]\_[Indicator Name]\_[Daily/Weekly]\_[Unit Name].Rds | For a given admin unit, this dataset contains a time series of values for a specified indicator. | 11 | | unit-time-level | [Unit Name]\_[Indicator Name]\_[Daily/Weekly]\_[Unit Name]\_[Date/Week].Rds | These datasets are only used for O-D variables. The show, for a given origin or destination unit, the movement in or out of that unit to all other units for the specified day/week. | 12 | 13 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/import_packages.py: -------------------------------------------------------------------------------- 1 | # Imports necessary packages and sets some global vars 2 | ### spark etc 3 | # import rarfile 4 | 5 | import os, pyspark, time, sys 6 | import pyspark.sql.functions as F 7 | from pyspark.sql.functions import pandas_udf, PandasUDFType 8 | from pyspark import * 9 | from pyspark.sql import * 10 | from pyspark.rdd import * 11 | from pyspark.ml import * 12 | from pyspark.sql.types import ArrayType 13 | from pyspark.sql.types import IntegerType 14 | from pyspark.sql.types import DoubleType 15 | from pyspark.sql.types import FloatType 16 | 17 | ### data wrangling 18 | import pandas as pd 19 | import glob 20 | import shutil 21 | pd.options.display.float_format = '{:,.0f}'.format 22 | # pd.set_option("display.max_rows", 100) 23 | pd.options.display.max_columns = None 24 | import datetime as dt 25 | import numpy as np 26 | from random import sample, seed 27 | seed(510) 28 | # timezone = dt.timezone(offset = -dt.timedelta(hours=5), name = "America/Bogota") 29 | timezone = dt.timezone(offset = -dt.timedelta(hours=0), name = "Africa/Harare") 30 | import re 31 | #import fiona 32 | #import geopandas as gpd 33 | import copy 34 | from collections import Counter 35 | from shapely import wkt 36 | 37 | ### plotting 38 | import matplotlib.pyplot as plt 39 | import matplotlib.dates as mdates 40 | import seaborn as sns 41 | #import folium 42 | #import gif 43 | #from folium.plugins import HeatMap, DualMap, Fullscreen 44 | #from folium.features import DivIcon 45 | #from branca.element import Template, MacroElement 46 | import locale 47 | from matplotlib.ticker import FuncFormatter 48 | import matplotlib.lines as mlines 49 | font = {'family' : 'Calibri', 50 | 'weight' : 'normal', 51 | 'size' : 18} 52 | import matplotlib 53 | -------------------------------------------------------------------------------- /dashboard-dataviz/figures/_master_figures.R: -------------------------------------------------------------------------------- 1 | # Master R Script for 
Prepping Data for Dashboard 2 | 3 | #### Packages #### ============================================================= 4 | library(tidyverse) 5 | library(sf) 6 | library(sp) 7 | library(plotly) 8 | library(stargazer) 9 | library(knitr) 10 | library(gridExtra) 11 | library(leaflet) 12 | library(ggpubr) 13 | library(purrr) 14 | library(parallel) 15 | library(pbmcapply) 16 | library(rgeos) 17 | library(rgdal) 18 | library(sp) 19 | library(rmapshaper) 20 | library(raster) 21 | library(geosphere) 22 | library(lubridate) 23 | library(data.table) 24 | library(mapview) 25 | library(hrbrthemes) 26 | 27 | #### File paths #### =========================================================== 28 | 29 | # Define Root Paths ------------------------------------------------------------ 30 | if(Sys.info()[["user"]] == "robmarty") PROJECT_PATH <- "~/Documents/World Bank/Sveta Milusheva - COVID 19 Results" 31 | if(Sys.info()[["user"]] == "wb519128") PROJECT_PATH <- "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results" 32 | if(Sys.info()[["user"]] == "WB521633") PROJECT_PATH <- "C:/Users/wb521633/WBG/Sveta Milusheva - COVID 19 Results" 33 | 34 | if(Sys.info()[["user"]] == "robmarty") GITHUB_PATH <- "~/Documents/Github/covid-mobile-data" 35 | if(Sys.info()[["user"]] == "wb519128") GITHUB_PATH <- "C:/Users/wb519128/Github/covid-mobile-data" 36 | if(Sys.info()[["user"]] == "WB521633") GITHUB_PATH <- "C:/Users/wb521633/Documents/Github/covid-mobile-data" 37 | 38 | # Define Paths from Root ------------------------------------------------------- 39 | CLEAN_DATA_ADM2_PATH <- file.path(PROJECT_PATH, "proof-of-concept", "files_for_dashboard", "files_clean", "adm2") 40 | CLEAN_DATA_ADM3_PATH <- file.path(PROJECT_PATH, "proof-of-concept", "files_for_dashboard", "files_clean", "adm3") 41 | figures_path <- file.path(PROJECT_PATH, "proof-of-concept", "outputs", "figures") 42 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/folder_setup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import datetime as dt\n", 10 | "from modules.DataSource import *\n", 11 | "from modules.folder_utils import *" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "#Set relative file path to config file\n", 21 | "config_file = '../config_file.py'\n", 22 | "exec(open(config_file).read())" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "scrolled": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "#Create the DataSource object and show config\n", 34 | "ds = DataSource(datasource_configs)\n", 35 | "ds.show_config()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "#Setup all required data folders\n", 45 | "setup_folder(ds)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "#Check if required data folders already exists\n", 55 | "check_folders(ds)" 56 | ] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "display_name": "Python 3", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 
3
69 |    },
70 |    "file_extension": ".py",
71 |    "mimetype": "text/x-python",
72 |    "name": "python",
73 |    "nbconvert_exporter": "python",
74 |    "pygments_lexer": "ipython3",
75 |    "version": "3.7.6"
76 |   }
77 |  },
78 |  "nbformat": 4,
79 |  "nbformat_minor": 4
80 | }
81 | 
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/modules/utilities.py:
--------------------------------------------------------------------------------
 1 | 
 2 | ############# Utility functions used throughout
 3 | import os
 4 | if os.environ['HOME'] != '/root':
 5 |     from modules.import_packages import *
 6 |     from modules.DataSource import *
 7 |     databricks = False
 8 | else:
 9 |     databricks = True
10 | 
11 | def save_and_load_parquet(df, filename, ds):
12 |     # write parquet
13 |     df.write.mode('overwrite').parquet(filename)
14 |     # load parquet
15 |     df = ds.spark.read.format("parquet").load(filename)
16 |     return df
17 | 
18 | def save_csv(matrix, path, filename):
19 |     # write to csv
20 |     matrix.repartition(1).write.mode('overwrite').format('com.databricks.spark.csv') \
21 |       .save(os.path.join(path, filename), header = 'true')
22 |     # move one folder up and rename to human-legible .csv name
23 |     if databricks:
24 |         dbutils.fs.mv(dbutils.fs.ls(path + '/' + filename)[-1].path,
25 |                       path + '/' + filename + '.csv')
26 |         # remove the old folder
27 |         dbutils.fs.rm(path + '/' + filename + '/', recurse = True)
28 | 
29 |     else:
30 |         os.rename(glob.glob(os.path.join(path, filename + '/*.csv'))[0],
31 |                   os.path.join(path, filename + '.csv'))
32 |         shutil.rmtree(os.path.join(path, filename))
33 | 
34 | ############# Windows for window functions
35 | 
36 | # window by msisdn (subscriber), ordered by call time
37 | user_window = Window\
38 |     .partitionBy('msisdn').orderBy('call_datetime')
39 | 
40 | # window by msisdn, starting with the last transaction
41 | user_window_rev = Window\
42 |     .partitionBy('msisdn').orderBy(F.desc('call_datetime'))
43 | 
44 | # msisdn-date window
45 | user_date_window = Window\
46 |     .partitionBy('msisdn', 'call_date').orderBy('call_datetime')
47 | 
48 | # msisdn-date window, starting with the last transaction of the day
49 | user_date_window_rev = Window\
50 |     .partitionBy('msisdn', 'call_date').orderBy(F.desc('call_datetime'))
51 | 
52 | 
53 | ############# Plotting
54 | 
55 | def zero_to_nan(values):
56 |     """Replace every 0 with 'nan' and return a copy."""
57 |     values[ values==0 ] = np.nan
58 |     return values
59 | 
60 | def fill_zero_dates(pd_df):
61 |     pd_df = pd_df[~pd_df.index.isnull()].sort_index()
62 |     msisdnx = pd.date_range(pd_df.index[0], pd_df.index[-1])
63 |     pd_df = pd_df.reindex(msisdnx, fill_value= 0)
64 |     return pd_df
65 | 
--------------------------------------------------------------------------------
/data-checks/README.md:
--------------------------------------------------------------------------------
 1 | # Data Checks
 2 | 
 3 | This folder contains code for running basic checks of aggregated CDR indicators. The data quality checks are intended to achieve the following:
 4 | 1. **Ensure the data is complete.** This means that there are no missing values in two main dimensions: spatial: all admin areas should have data; and temporal: all time slots (month, day and hour) should have data. This check is required for all indicators.
 5 | 2. **Cell tower down checks**. This is a special type of missing data, where data may be missing because a cell tower was down. This check is also likely to be required for all indicators.
 6 | 3. **Consistency checks**. This check can be done for a single indicator to check for several things.
But it can also be done across indicators to ensure that total numbers are consistent.
 7 | 
 8 | ## Requirements
 9 | 
10 | - Python3
11 | - pandas
12 | - numpy
13 | - plotly
14 | 
15 | ## Basic usage:
16 | 
17 | ```bash
18 | $ git clone git@github.com:worldbank/covid-mobile-data.git
19 | $ cd covid-mobile-data/data-checks/
20 | $ python checker.py --Path path/to/indicators
21 |                     [--prefix "your_prefix_"]
22 |                     [--outputs path/to/outputs]
23 | ```
24 | 
25 | ## Custom usage:
26 | You can create an instance of the checker class to customize any of the default values.
27 | 
28 | ```python
29 | from checker import *
30 | 
31 | check = checker(path = 'path/to/indicators',
32 |                 outputs_path = 'path/to/outputs',
33 |                 level = 'subfolder',
34 |                 ind_dict = {'i1' : 'transactions_per_hour.csv',
35 |                             'i3' : 'unique_subscrivers_per_day.csv',
36 |                             'i5' : 'origin_destination_connection_matrix_per_day.csv'},
37 |                 prefix = 'your_prefix_',
38 |                 col_names_dict = {
39 |                     'i1': {'Time':'hour',
40 |                            'Geography':'region',
41 |                            'Count':'count'},
42 |                     'i3': {'Time':'day',
43 |                            'Geography':'region',
44 |                            'Count':'count'},
45 |                     'i5': {'Time':'connection_date',
46 |                            'Geography_from':'region_from',
47 |                            'Geography_to':'region_to',
48 |                            'Count':'total_count'} })
49 | ```
50 | 
--------------------------------------------------------------------------------
/dashboard-dataviz/figures/i3_figures.R:
--------------------------------------------------------------------------------
 1 | # i3 Figures
 2 | 
 3 | unit <- "districts"
 4 | 
 5 | # Load Data --------------------------------------------------------------------
 6 | if(unit %in% "wards"){
 7 |   CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH
 8 | }
 9 | 
10 | if(unit %in% "districts"){
11 |   CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH
12 | }
13 | 
14 | data <- readRDS(file.path(CLEAN_DATA_PATH, "i3_daily.Rds"))
15 | 
16 | data <- data %>%
17 |   group_by(region) %>%
18 |   mutate(value_pre = mean(value[date < "2020-03-30"]),
19 |          value_post = mean(value[date > "2020-03-30"])) %>%
20 |   ungroup() %>%
21 |   mutate(value_change = value_post - value_pre) %>%
22 |   mutate(value_change_rank = rank(value_change))
23 | 
24 | data$value_change_rank[is.na(data$value_change)] <- NA
25 | 
26 | # Figures ----------------------------------------------------------------------
27 | rank_high <- data$value_change_rank %>% unique() %>% sort() %>% head(5)
28 | 
29 | p_high <- data %>%
30 |   dplyr::filter(value_change_rank %in% rank_high) %>%
31 |   ggplot(aes(x = date, y = value)) +
32 |   geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) +
33 |   geom_line() +
34 |   labs(x = "",
35 |        y = "Number of Subscribers",
36 |        title = "Largest Decreases") +
37 |   facet_wrap(~name,
38 |              scales = "free_y",
39 |              nrow = 1) +
40 |   theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
41 |         strip.text.x = element_text(face = "bold"))
42 | p_high
43 | 
44 | rank_low <- data$value_change_rank %>% unique() %>% sort() %>% tail(5)
45 | 
46 | p_low <- data %>%
47 |   dplyr::filter(value_change_rank %in% rank_low) %>%
48 |   ggplot(aes(x = date, y = value)) +
49 |   geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) +
50 |   geom_line() +
51 |   labs(x = "",
52 |        y = "",
53 |        title = "Largest Increases") +
54 |   facet_wrap(~name,
55 |              scales = "free_y",
56 |              nrow = 1) +
57 |   theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
58 |         strip.text.x = element_text(face = "bold"))
59 | 
60 | p_all <- ggarrange(p_high, p_low, nrow = 2)
61 | ggsave(p_all, filename = 
file.path(figures_path, 62 | paste0(unit, "_subsc_top_chng.png")), 63 | height = 5, width=12) 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /data-panel/01_construct.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # CREATE PANEL 3 | #-----------------------------------------------------------------# 4 | 5 | # This code creates panel datasets combinig different versions of 6 | # indicator files. 7 | 8 | from utils import * 9 | from panel_constructor import * 10 | 11 | #-----------------------------------------------------------------# 12 | # Settings 13 | 14 | EXPORT = False 15 | 16 | #-------------------# 17 | # Indicator dataframe 18 | 19 | # Load list of indicators to make it easier to bulk load files 20 | indicators_df = pd.read_csv('path/to/indicators_list.csv') 21 | 22 | 23 | #-------------------# 24 | # Set default values 25 | levels_dict = { 1: [3], 26 | 2: [3], 27 | 3: [2,3], 28 | 4: ['country'], 29 | 5: [2,3], 30 | 6: [3], 31 | 7: [2,3], 32 | 8: [2,3], 33 | 9: [2,3], 34 | 10: [2,3], 35 | 11: [2,3]} 36 | 37 | 38 | #-----------------------------------------------------------------# 39 | # Load indicators and create comparisson "dirty" panel 40 | 41 | indicators = panel_constructor(levels_dict, indicators_df) 42 | 43 | # Create class instance 44 | # If no levels dictionary is provided, it will use the default, which is all of them! 45 | # indicators = panel_constructor() 46 | 47 | # Run panel creation 48 | indicators.dirty_panel() 49 | 50 | #-----------------------------------------------------------------# 51 | # Load usage outliers file 52 | 53 | # This file is created in data-checks 54 | i1_ag_df_tower_down = pd.read_csv("/path/to/usage-outliers/file") 55 | 56 | #-----------------------------------------------------------------# 57 | # Export comparison panel 58 | 59 | if EXPORT: 60 | indicators.export('/export/path/') 61 | 62 | #-----------------------------------------------------------------# 63 | # Create clean panel 64 | 65 | # This replaces the old panel attribute with the clean version, with 66 | # standardized column names 67 | 68 | indicators.clean_panel(i1_ag_df_tower_down) 69 | 70 | #-----------------------------------------------------------------# 71 | 72 | 73 | indicators.add_other_provider(mno_path = "/path/to/other/mno/indicator/folder", 74 | mno_suffix = '_mno') 75 | 76 | 77 | #-----------------------------------------------------------------# 78 | # Export 79 | if EXPORT: 80 | indicators.export('/export/path/') 81 | -------------------------------------------------------------------------------- /dashboard-dataviz/figures/i5_net_figures.R: -------------------------------------------------------------------------------- 1 | # i3 Figures 2 | 3 | unit <- "wards" 4 | 5 | # Load Data -------------------------------------------------------------------- 6 | if(unit %in% "wards"){ 7 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH 8 | } 9 | 10 | if(unit %in% "districts"){ 11 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH 12 | } 13 | 14 | data <- readRDS(file.path(CLEAN_DATA_PATH, "i5_net_daily.Rds")) 15 | 16 | data <- data %>% 17 | group_by(region) %>% 18 | mutate(value_pre = mean(value[date < "2020-03-30"], na.rm = T), 19 | value_post = mean(value[date > "2020-03-30"], na.rm = T)) %>% 20 | ungroup() %>% 21 | mutate(value_change = value_post - value_pre) %>% 22 | mutate(value_change_rank = rank(value_change)) 23 | 24 | 
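# rank() gives NA changes a rank by default (na.last = TRUE), so reset those ranks to NA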
data$value_change_rank[is.na(data$value_change)] <- NA
25 | 
26 | data <- data[!is.na(data$date),]
27 | data$date <- data$date %>% as.Date()
28 | 
29 | # Figures ----------------------------------------------------------------------
30 | rank_high <- data$value_change_rank %>% unique() %>% sort() %>% head(5)
31 | 
32 | p_high <- data %>%
33 |   dplyr::filter(value_change_rank %in% rank_high) %>%
34 |   ggplot(aes(x = date, y = value)) +
35 |   geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) +
36 |   geom_line() +
37 |   labs(x = "",
38 |        y = "Number of Subscribers",
39 |        title = "Largest Decreases") +
40 |   facet_wrap(~name,
41 |              scales = "free_y",
42 |              nrow = 1) +
43 |   theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
44 |         strip.text.x = element_text(face = "bold"))
45 | p_high
46 | 
47 | rank_low <- data$value_change_rank %>% unique() %>% sort() %>% tail(5)
48 | 
49 | p_low <- data %>%
50 |   dplyr::filter(value_change_rank %in% rank_low) %>%
51 |   ggplot(aes(x = date, y = value)) +
52 |   geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) +
53 |   geom_line() +
54 |   labs(x = "",
55 |        y = "",
56 |        title = "Largest Increases") +
57 |   facet_wrap(~name,
58 |              scales = "free_y",
59 |              nrow = 1) +
60 |   theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
61 |         strip.text.x = element_text(face = "bold"))
62 | 
63 | p_all <- ggarrange(p_high, p_low, nrow = 2)
64 | ggsave(p_all, filename = file.path(figures_path,
65 |                                    paste0(unit, "_netmovement_top_chng.png")),
66 |        height = 5, width=12)
67 | 
68 | 
69 | data$value[data$date < "2020-03-30"] %>% log() %>% hist()
70 | 
--------------------------------------------------------------------------------
/data-checks/Archive/Descr-exploratory/i5-plot.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import os
 3 | import pandas as pd
 4 | import numpy as np
 5 | import datetime as dt
 6 | import time
 7 | 
 8 | from bokeh.plotting import figure, output_file, show
 9 | from bokeh.models import Span
10 | from bokeh.io import export_png
11 | 
12 | #-----------------------------------------------------------------#
13 | # Folder structure
14 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/"
15 | DATA_POC = DATA_path + "proof-of-concept/"
16 | DATA_Panel = DATA_POC + "panel_indicators/"
17 | OUT_path = DATA_POC + "outputs/"
18 | 
19 | 
20 | #-----------------------------------------------------------------#
21 | # Load data
22 | 
23 | i5 = pd.read_csv(DATA_Panel + 'i5_admin2.csv')
24 | 
25 | 
26 | #-----------------------------------------------------------------#
27 | # Process data
28 | i5 = i5[['connection_date', 'region_from', 'region_to', 'od_count_p', 'subscriber_count_p', 'total_count_p']]
29 | 
30 | i5['date'] = pd.to_datetime(i5['connection_date']).dt.date
31 | i5['month'] = pd.to_datetime(i5['connection_date']).dt.month
32 | 
33 | 
34 | i5_agg = i5\
35 |     .groupby('date')\
36 |     .agg({'region_from' : pd.Series.nunique ,
37 |           'region_to' : pd.Series.nunique,
38 |           'subscriber_count_p' : np.sum,
39 |           'total_count_p' : np.sum})\
40 |     .reset_index()\
41 |     .sort_values('date')
42 | 
43 | i5_agg_month = i5\
44 |     .groupby('month')\
45 |     .agg({'subscriber_count_p' : np.sum,
46 |           'total_count_p' : np.sum})\
47 |     .reset_index()\
48 |     .sort_values('month')
49 | 
50 | #-----------------------------------------------------------------#
51 | # Plot
52 | 
53 | p = figure(title="Total Daily Movement Between Districts on a Given Day",
plot_width=800, 55 | plot_height=500, 56 | x_axis_type='datetime') 57 | p.circle(i5_agg['date'], 58 | i5_agg['subscriber_count_p']) 59 | 60 | # Add lockdown dates vertical line 61 | 62 | vline1 = Span(location= dt.date(2020, 3, 27), 63 | dimension='height', 64 | line_color='black', 65 | line_dash='dashed') 66 | vline2 = Span(location= dt.date(2020, 3, 30), 67 | dimension='height', 68 | line_color='black', 69 | line_dash='dashed') 70 | 71 | p.renderers.extend([vline1, vline2]) 72 | 73 | # Additional formatting 74 | p.left[0].formatter.use_scientific = False 75 | p.toolbar.logo = None 76 | p.toolbar_location = None 77 | p.xaxis.axis_label = "Date" 78 | p.yaxis.axis_label = "Movement Day" 79 | p.title.text_font_size = '15pt' 80 | p.xaxis.axis_label_text_font_size = "12pt" 81 | p.yaxis.axis_label_text_font_size = "12pt" 82 | p.yaxis.major_label_text_font_size = "10pt" 83 | p.xaxis.major_label_text_font_size = "10pt" 84 | 85 | # Display plot 86 | show(p) 87 | 88 | # Export 89 | export_png(p, 90 | filename= OUT_path + "all_movement.png") 91 | -------------------------------------------------------------------------------- /data-panel/Archive/usage_outliers.py: -------------------------------------------------------------------------------- 1 | 2 | #-----------------------------------------------------------------# 3 | # Settings 4 | 5 | import os 6 | import re 7 | import copy 8 | import pandas as pd 9 | import numpy as np 10 | import datetime as dt 11 | 12 | 13 | EXPORT = True 14 | 15 | # Number of hours below avg, used as a trashold to 16 | # define a tower down 17 | htrahshold = -3 18 | 19 | 20 | #-----------------------------------------------------------------# 21 | # Process data 22 | i1p = copy.deepcopy(i1.panel) 23 | 24 | i1p['date'] = pd.to_datetime(i1p['hour']).dt.date 25 | i1p['hour_int'] = pd.to_datetime(i1p['hour']).dt.hour 26 | 27 | 28 | # Number of observations per ward that is total number of hours 29 | i1freq = i1p.groupby('region').size() 30 | 31 | i1freq = i1freq.reset_index() 32 | i1freq.columns = ['region', 'freq'] 33 | 34 | # Select wards with less than 12h on average 35 | i1_low_total_hours = i1freq[i1freq['freq'] < (12*i1p.date.nunique())] 36 | 37 | i1_low_total_hours = i1_low_total_hours\ 38 | .rename(columns = {'freq' : 'total_hours'}) 39 | # # Proportion of wards with at least one tower down 40 | # freq[freq < 1392].count()/len(set(i1['region'])) 41 | 42 | # # Proportion of wards with very 43 | # freq[freq < 700].count() 44 | # freq[freq < 700].count()/len(set(i1['region'])) 45 | 46 | # Export 47 | if(EXPORT): 48 | (i1_low_total_hours 49 | .to_csv(OUT_hfcs + 'wards_with_low_hours_I1.csv', 50 | index = False) ) 51 | 52 | #-----------------------------------------------------------------# 53 | # USAGE OUTILERS: Indicator wards and days with towers down 54 | 55 | # Number of hours with transactions per region day 56 | hours_per_day = i1p.groupby(['region', 'date']).size() 57 | 58 | hours_per_day = hours_per_day.reset_index() # ger regions to be a column 59 | hours_per_day.columns = ['region', 'date', 'hcount'] 60 | 61 | 62 | # Average hours per day per region 63 | avg_hours = (hours_per_day.groupby(['region']) 64 | .mean() 65 | .rename(columns={'hcount' :'avg_hours' })) 66 | 67 | # Create region day data set 68 | i1_ag_df = hours_per_day.merge(avg_hours, 69 | on = 'region') 70 | 71 | # Difference from average usage per hour 72 | i1_ag_df['h_diff'] = i1_ag_df['hcount'] - i1_ag_df['avg_hours'] 73 | 74 | # Create data only with pairs of wards and days potential 75 | # towers 
down 76 | i1_ag_df_tower_down = i1_ag_df[i1_ag_df['h_diff'] < htrahshold] 77 | 78 | # Read me text 79 | readme_text = "This file contains a combinations of wards and days that are assumed to have a tower down." 80 | readme_text += "If a day has " + str(abs(htrahshold)) 81 | readme_text += " hours with any calls below the daily avergage for that ward," 82 | readme_text += " it is considered to have a trower down at some point that day." 83 | 84 | # Export 85 | if(EXPORT): 86 | (i1_ag_df_tower_down 87 | .to_csv(OUT_hfcs + 'days_wards_with_low_hours_I1_panel.csv', 88 | index = False) ) 89 | # Read me file 90 | file = open(OUT_hfcs + "days_wards_with_low_hours_I1_README.txt", "w") 91 | file.write(readme_text) 92 | file.close() 93 | -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/preparing_data_for_dashboard/02_clean_telecom_data/clean_i3_subscribers_data.R: -------------------------------------------------------------------------------- 1 | # Clean i3 for Dashboard 2 | 3 | unit <- "adm2" 4 | for(unit in c("adm2", "adm3")){ 5 | 6 | # Load Data / Set Paths ------------------------------------------------------ 7 | df_day <- read.csv(file.path(RAW_INDICATORS, paste0("indicator_03_",unit,"_day_result.csv")), 8 | stringsAsFactors=F) 9 | admin_sp <- readRDS(file.path(GEO_PATH, paste0(unit, ".Rds"))) 10 | 11 | if(unit %in% "adm2"){ 12 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH 13 | } 14 | if(unit %in% "adm3"){ 15 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH 16 | } 17 | 18 | # Daily ---------------------------------------------------------------------- 19 | df_day_clean <- df_day %>% 20 | 21 | tp_standardize_vars("pdate", unit, "totalimei") %>% 22 | 23 | # Clean datset 24 | tp_clean_date() %>% 25 | tp_fill_regions(admin_sp) %>% 26 | tp_complete_date_region() %>% 27 | tp_add_polygon_data(admin_sp) %>% 28 | 29 | # Interpolate/Clean Values 30 | tp_interpolate_outliers(NAs_as_zero = T, outlier_sd=3) %>% 31 | tp_replace_zeros(NAs_as_zero = T) %>% 32 | tp_less15_NA() %>% 33 | 34 | # Percent change 35 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, "i3_daily_base.csv"), 36 | baseline_date = BASELINE_DATE) %>% 37 | tp_add_percent_change() %>% 38 | 39 | # Add labels 40 | tp_add_label_level(timeunit = "day", OD = F) %>% 41 | tp_add_label_baseline(timeunit = "day", OD = F) %>% 42 | 43 | # Add density 44 | mutate(density = value / area) 45 | 46 | ## Export 47 | saveRDS(df_day_clean, file.path(CLEAN_DATA_PATH, "i3_daily.Rds")) 48 | write.csv(df_day_clean, file.path(CLEAN_DATA_PATH, "i3_daily.csv"), row.names=F) 49 | 50 | # Weekly --------------------------------------------------------------------- 51 | print("week") 52 | 53 | df_week_clean <- df_day_clean %>% 54 | 55 | tp_standardize_vars("date", "region", "value") %>% 56 | 57 | # Clean datset 58 | tp_clean_week() %>% 59 | tp_agg_day_to_week(fun = "mean") %>% 60 | tp_complete_date_region() %>% 61 | tp_add_polygon_data(admin_sp) %>% 62 | 63 | # Interpolate/Clean Values 64 | tp_interpolate_outliers(NAs_as_zero = T) %>% 65 | tp_replace_zeros(NAs_as_zero = T) %>% 66 | tp_less15_NA() %>% 67 | 68 | # Percent change 69 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, "i3_weekly_base.csv"), 70 | type = "weekly", 71 | baseline_date = BASELINE_DATE) %>% 72 | tp_add_percent_change() %>% 73 | 74 | # Add labels 75 | tp_add_label_level(timeunit = "week", OD = F) %>% 76 | tp_add_label_baseline(timeunit = "week", OD = F) %>% 77 | 78 | # Add density 79 | mutate(density = value / area) 
80 | 81 | ## Export 82 | saveRDS(df_week_clean, file.path(CLEAN_DATA_PATH, "i3_weekly.Rds")) 83 | write.csv(df_week_clean, file.path(CLEAN_DATA_PATH, "i3_weekly.csv"), row.names=F) 84 | 85 | } 86 | 87 | 88 | -------------------------------------------------------------------------------- /data-panel/utils.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # Panel utils 3 | #-----------------------------------------------------------------# 4 | 5 | import os 6 | import re 7 | import copy 8 | import pandas as pd 9 | import numpy as np 10 | import datetime as dt 11 | 12 | #-----------------------------------------------------------------# 13 | # General functions 14 | 15 | def clean(data, index_cols): 16 | na_list = [np.nan, '', '99999', 99999, float("inf")] 17 | data = data[~data[index_cols].isin(na_list).any(axis ='columns')] 18 | return(data) 19 | 20 | #-----------------------------------------------------------------# 21 | # Clean panel function 22 | 23 | # Remove low usage outliers assuming these are towers down and 24 | # trims columns 25 | 26 | def clean_columns(indicator, timevar): 27 | # Remove comparison columns 28 | keepcols = copy.deepcopy(indicator.index_cols) 29 | keepcols.extend(indicator.panel.filter(like='_p', axis=1).columns.to_list()) 30 | new_df = indicator.panel[keepcols] 31 | # Rename columns 32 | new_df.columns = new_df.columns.str.strip('_p') 33 | # Create time variables 34 | new_df['date'] = pd.to_datetime(new_df[timevar]).dt.date 35 | return new_df 36 | 37 | def remove_towers_down(df, region_vars, outliers_df): 38 | # Process outliers file 39 | # outliers_df = copy.deepcopy(i1_ag_df_tower_down) # created in usage_outliers.py 40 | outliers_df = outliers_df\ 41 | .drop(['hcount', 'avg_hours', 'h_diff'], axis = 1)\ 42 | .rename(columns = {'region':'region_right'}) 43 | outliers_df['flag'] = 1 44 | # Merge outliers 45 | if len(region_vars) == 1: 46 | new_df = df\ 47 | .merge(outliers_df, 48 | left_on = ['date', region_vars[0]], 49 | right_on = ['date', 'region_right'], 50 | how = 'outer')\ 51 | .drop(['region_right'], axis = 1) 52 | else: 53 | new_df = df\ 54 | .merge(outliers_df, 55 | left_on = ['date', region_vars[0]], 56 | right_on = ['date', 'region_right'], 57 | how = 'outer')\ 58 | .drop(['region_right'], axis = 1)\ 59 | .merge(outliers_df, 60 | left_on = ['date', region_vars[1]], 61 | right_on = ['date', 'region_right'], 62 | how = 'outer')\ 63 | .drop(['region_right'], axis = 1) 64 | # Flag if either is true 65 | new_df['flag'] = ((new_df['flag_x'] == 1) | (new_df['flag_y'] == 1)).astype(int) 66 | new_df = new_df.drop(['flag_x', 'flag_y'], axis =1) 67 | # Drop outliers and processual columns 68 | new_df = new_df[~(new_df['flag'] == 1)].drop(['flag'], axis = 1) 69 | return new_df 70 | 71 | def clean_pipeline(indicator, timevar, region_vars, outliers_df): 72 | return remove_towers_down( 73 | clean_columns(indicator, 74 | timevar = timevar), 75 | region_vars = region_vars, 76 | outliers_df = outliers_df) -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/preparing_data_for_dashboard/02_clean_telecom_data/clean_i5_net_movement_data.R: -------------------------------------------------------------------------------- 1 | # Clean Subscribers Data 2 | 3 | # Depends on: clean_movement_inout_data.R 4 | 5 | unit <- "adm2" 6 | timeunit <- "daily" 7 | for(unit in c("adm2", "adm3")){ 8 | for(timeunit in 
c("daily", "weekly")){ 9 | 10 | print(paste(unit, timeunit, "--------------------------------------------")) 11 | 12 | # Set parameters ------------------------------------------------------------- 13 | admin_sp <- readRDS(file.path(GEO_PATH, paste0(unit, ".Rds"))) 14 | 15 | if(unit %in% "adm2"){ 16 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH 17 | } 18 | 19 | if(unit %in% "adm3"){ 20 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH 21 | } 22 | 23 | # Clean ---------------------------------------------------------------------- 24 | df <- readRDS(file.path(CLEAN_DATA_PATH, 25 | paste0("i5_", 26 | timeunit, 27 | ".Rds"))) %>% 28 | as.data.table() 29 | 30 | ## Aggregate Origin 31 | df_orign <- df[, .(value = sum(value, na.rm=T)), 32 | by = list(region_origin, date)] 33 | 34 | names(df_orign)[names(df_orign) %in% "region_origin"] <- "region" 35 | names(df_orign)[names(df_orign) %in% "value"] <- "value_origin" 36 | 37 | ## Aggregate Destination 38 | df_dest <- df[, .(value = sum(value, na.rm=T)), 39 | by = list(region_dest, date)] 40 | 41 | names(df_dest)[names(df_dest) %in% "region_dest"] <- "region" 42 | names(df_dest)[names(df_dest) %in% "value"] <- "value_dest" 43 | 44 | ## Merge 45 | df_day_clean <- merge(df_orign, df_dest, by=c("region", "date")) %>% 46 | as.data.frame() 47 | 48 | ## Prep data 49 | df_day_clean <- df_day_clean %>% 50 | 51 | dplyr::mutate(value = value_dest - value_origin) %>% 52 | 53 | tp_standardize_vars("date", "region", "value") %>% 54 | 55 | # Clean Data 56 | tp_fill_regions(admin_sp) %>% 57 | tp_complete_date_region() %>% 58 | tp_add_polygon_data(admin_sp) %>% 59 | 60 | # Percent change 61 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, 62 | paste0("i5_net_",timeunit,"_base.csv")), 63 | type = timeunit) %>% 64 | tp_add_percent_change() %>% 65 | 66 | # Add labels 67 | tp_add_label_level(timeunit = timeunit, OD = F) %>% 68 | tp_add_label_baseline(timeunit = timeunit, OD = F) 69 | 70 | 71 | ## Export 72 | saveRDS(df_day_clean, file.path(CLEAN_DATA_PATH, 73 | paste0("i5_net_", 74 | timeunit, 75 | ".Rds"))) 76 | 77 | write.csv(df_day_clean, file.path(CLEAN_DATA_PATH, 78 | paste0("i5_net_", 79 | timeunit, 80 | ".csv")), 81 | row.names=F) 82 | 83 | 84 | 85 | } 86 | } 87 | 88 | 89 | -------------------------------------------------------------------------------- /data-panel/Archive/02_clean.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # PANEL CLEAN 3 | #-----------------------------------------------------------------# 4 | 5 | #-----------------------------------------------------------------# 6 | # Settings 7 | 8 | import pandas as pd 9 | 10 | EXPORT = False 11 | 12 | # Number of hours below avg, used as a trashold to 13 | # define a tower down 14 | htrahshold = -3 15 | 16 | #-----------------------------------------------------------------# 17 | # Import data 18 | 19 | i1 = pd.read_csv( DATA_panel + 'i1_admin3.csv') 20 | 21 | #-----------------------------------------------------------------# 22 | # Process data 23 | 24 | i1['date'] = pd.to_datetime(i1['hour']).dt.date 25 | i1['hour_int'] = pd.to_datetime(i1['hour']).dt.hour 26 | 27 | 28 | 29 | #-----------------------------------------------------------------# 30 | # USAGE OUTILERS: Wards with very little data 31 | 32 | # Number of observations per ward that is total number of hours 33 | i1freq = i1.groupby('region').size() 34 | 35 | i1freq = i1freq.reset_index() 36 | i1freq.columns = ['region', 
'freq'] 37 | 38 | # Select wards with less than 12h on average 39 | i1_low_total_hours = i1freq[i1freq['freq'] < (12*i1.date.nunique())] 40 | 41 | i1_low_total_hours = i1_low_total_hours\ 42 | .rename(columns = {'freq' : 'total_hours'}) 43 | # # Proportion of wards with at least one tower down 44 | # freq[freq < 1392].count()/len(set(i1['region'])) 45 | 46 | # # Proportion of wards with very 47 | # freq[freq < 700].count() 48 | # freq[freq < 700].count()/len(set(i1['region'])) 49 | 50 | # Export 51 | if(EXPORT): 52 | (i1_low_total_hours 53 | .to_csv(OUT_hfcs + 'wards_with_low_hours_I1.csv', 54 | index = False) ) 55 | 56 | #-----------------------------------------------------------------# 57 | # USAGE OUTILERS: Indicator wards and days with towers down 58 | 59 | # Number of hours with transactions per region day 60 | hours_per_day = i1.groupby(['region', 'date']).size() 61 | 62 | hours_per_day = hours_per_day.reset_index() # ger regions to be a column 63 | hours_per_day.columns = ['region', 'date', 'hcount'] 64 | 65 | 66 | # Average hours per day per region 67 | avg_hours = (hours_per_day.groupby(['region']) 68 | .mean() 69 | .rename(columns={'hcount' :'avg_hours' })) 70 | 71 | # Create region day data set 72 | i1_ag_df = hours_per_day.merge(avg_hours, 73 | on = 'region') 74 | 75 | # Difference from average usage per hour 76 | i1_ag_df['h_diff'] = i1_ag_df['hcount'] - i1_ag_df['avg_hours'] 77 | 78 | # Create data only with pairs of wards and days potential 79 | # towers down 80 | i1_ag_df_tower_down = i1_ag_df[i1_ag_df['h_diff'] < htrahshold] 81 | 82 | # Read me text 83 | readme_text = "This file contains a combinations of wards and days that are assumed to have a tower down." 84 | readme_text += "If a day has " + str(abs(htrahshold)) 85 | readme_text += " hours with any calls below the daily avergage for that ward," 86 | readme_text += " it is considered to have a trower down at some point that day." 87 | 88 | # Export 89 | if(EXPORT): 90 | (i1_ag_df_tower_down 91 | .to_csv(OUT_hfcs + 'days_wards_with_low_hours_I1_panel.csv', 92 | index = False) ) 93 | # Read me file 94 | file = open(OUT_hfcs + "days_wards_with_low_hours_I1_README.txt", "w") 95 | file.write(readme_text) 96 | file.close() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ######################################################################## 2 | # 3 | # Based on DIME .gitignore template. 
Follow the instructions in the URL 4 | # below to set up this template in your own repository 5 | # https://github.com/worldbank/dime-github-trainings/tree/master/GitHub-resources/DIME-GitHub-Templates 6 | # 7 | # Note that if you are using GitKraken, you need to use version 5.x or more 8 | # recent for this template to work properly 9 | # 10 | ######################################################################## 11 | 12 | ####################### 13 | # Start by ignoring everything, and below we are explicitly saying 14 | # what to not ignore 15 | * 16 | 17 | ####################### 18 | # List of files with GitHub functionality anywhere in the repo 19 | # that we do not want to ignore 20 | 21 | # These files include GitHub settings 22 | !.gitignore 23 | !.gitattributes 24 | 25 | # Keep markdown files used for documentation on GitHub 26 | !README.md 27 | !CONTRIBUTING.md 28 | !LICENSE* 29 | 30 | ####################### 31 | # For performance reasons, if a folder is already ignored, then 32 | # GitHub does not check the content for that folder for matches 33 | # with additional rules. The line below includes folder in the 34 | # top folder (but not their content), so that anything matching 35 | # the rules below will still not be ignored. 36 | !*/ 37 | 38 | ####################### 39 | # The following file types are code that should always be 40 | # included no matter where in the repository folder they are 41 | # located unless you explicitly ignore that folder 42 | 43 | # Stata 44 | !/**/*.do 45 | !/**/*.ado 46 | 47 | # R 48 | !/**/*.R 49 | !/**/*.Rmd 50 | 51 | # LaTeX 52 | !/**/*.tex 53 | !/**/*.bib 54 | 55 | # Python 56 | !/**/*.py 57 | !/**/*.ipynb 58 | # Still ignore .ipynb files in checkpoint folders 59 | .ipynb_checkpoints 60 | 61 | # Matlab 62 | !/**/*.m 63 | 64 | # Markdown 65 | !/**/*.md 66 | 67 | # Julia 68 | !/**/*.jl 69 | 70 | # CSS 71 | !/**/*.css 72 | 73 | # Docker 74 | !/**/*.yml 75 | !/**/docker/* 76 | 77 | ####################### 78 | # Include some additional file formats in any output folder. You might have 79 | # to change the name of the Output folder to whatever it is called in your 80 | # project, but we strongly recommend that you only include these files in 81 | # a subset of the folders where you are certain no private data is ever stored. 82 | !/**/Output/**/*.txt 83 | !/**/Output/**/*.csv 84 | !/**/Output/**/*.xml 85 | !/**/Output/**/*.eps 86 | !/**/Output/**/*.svg 87 | 88 | ####################### 89 | # Include all the files with passwords or tokens here. All files named 90 | # password or passwords are with this template ignored no matter which 91 | # format you are using. Additionally, all content in any folder called 92 | # password or passwords are also ignored. NOTE that your project might be 93 | # using different names and then you must edit the lines below accordingly. 94 | password.* 95 | passwords.* 96 | password/ 97 | passwords/ 98 | 99 | generate_password.R 100 | generate_password* 101 | 102 | 103 | 104 | ####################### 105 | # Explicitly exclude data methods and sources description which should 106 | # be kept private. These are already excluded from above lines, just 107 | # including in case change. 
108 | data_methods.txt 109 | data_source_description.txt 110 | /**/notebooks/ignored_scripts* 111 | /**/config_file.py 112 | /**/**/.DS_Storec 113 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/flowminder_aggregator.py: -------------------------------------------------------------------------------- 1 | import os 2 | if os.environ['HOME'] != '/root': 3 | from modules.DataSource import * 4 | from modules.sql_code_aggregates import * 5 | from modules.aggregator import * 6 | databricks = False 7 | else: 8 | databricks = True 9 | 10 | # Databricks notebook source 11 | class flowminder_aggregator(aggregator): 12 | """Class to handle sql aggregations of flowminder code. 13 | For the original sql code from flowminder see https://github.com/Flowminder/COVID-19 14 | 15 | Attributes 16 | ---------- 17 | result_stub : a string. File path where to save results 18 | datasource : an instance of DataSource class. Holds all dataframes and paths required 19 | regions : a pyspark dataframe. Admin level this aggregator will be used for 20 | intermediate_tables : a list. Names of tables that we don't want written to csv 21 | calls : a pyspark dataframe. pyspcdr data 22 | cells : a pyspark dataframe. admin region to tower mapping 23 | spark : an initialised spark connection. spark connection this aggregator should use 24 | dates : a dictionary. dates the aggregator should run over 25 | sql_code : a string. the flowminder sql code to be used 26 | 27 | 28 | Methods 29 | ------- 30 | run_and_save_all(table_name) 31 | runs run_and_save on the list of all flowminder queries at once 32 | 33 | run_save_and_rename_all() 34 | runs run_and_save_all and then renames the csv files created and 35 | moves them to their parent folder 36 | 37 | attempt_aggregation(indicators_to_produce = 'all', no_of_attempts = 4) 38 | - attempts aggregation of all flowminder indicators 39 | - tries mutiple times (this is relevant for databricks env, 40 | but should be dropped going forward and replaced by a more 41 | solid handling of databricks timeouts) 42 | 43 | 44 | """ 45 | 46 | def __init__(self, 47 | result_stub, 48 | datasource, 49 | regions, 50 | intermediate_tables = ['home_locations']): 51 | """ 52 | Parameters 53 | ---------- 54 | result_stub : where to save results 55 | datasource : holds all dataframes and paths required 56 | regions : admin level this aggregator will be used for 57 | intermediate_tables : tables that we don't want written to csv 58 | """ 59 | # initiate with parent init 60 | super().__init__(result_stub,datasource,regions) 61 | 62 | def run_and_save_all(self): 63 | for table_name in self.table_names: 64 | df = self.spark.sql(self.sql_code[table_name]) 65 | self.save_and_report(df, table_name) 66 | 67 | def run_save_and_rename_all(self): 68 | self.run_and_save_all() 69 | self.rename_all_csvs() 70 | 71 | 72 | def attempt_aggregation(self, indicators_to_produce = 'all'): 73 | try: 74 | # all indicators 75 | if indicators_to_produce == 'all': 76 | self.run_save_and_rename_all() 77 | 78 | # single indicator 79 | else: 80 | for table in indicators_to_produce.keys(): 81 | table_name = indicators_to_produce[table] 82 | print('--> Producing: ' + table_name) 83 | self.run_save_and_rename(table_name + '_per_' + indicators_to_produce[table_name]) 84 | print('Indicators saved.') 85 | 86 | except Exception as e: 87 | print(e) 88 | -------------------------------------------------------------------------------- 
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/02_clean_telecom_data/clean_i5_movement_inout_data.R: -------------------------------------------------------------------------------- 1 | # Clean i5 Data for Dashboard 2 | 3 | EXPORT <- T 4 | 5 | unit = "adm2" 6 | for(unit in c("adm2", "adm3")){ 7 | 8 | # Load Data / Set Paths ------------------------------------------------------ 9 | df_day <- read.csv(file.path(RAW_INDICATORS, paste0("indicator_05_",unit,"_day_result.csv")), 10 | stringsAsFactors=F) 11 | admin_sp <- readRDS(file.path(GEO_PATH, paste0(unit, ".Rds"))) 12 | 13 | if(unit %in% "adm2"){ 14 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH 15 | } 16 | 17 | if(unit %in% "adm3"){ 18 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH 19 | } 20 | 21 | #### Remove small observations 22 | # If less than 15, make NA. Doing this now removes some region-pairs. For 23 | # example, if a o-d pair has a value less than 15 for every time period, 24 | # we don't considered here and helps improve code speed both here and in 25 | # the script to prepare data for dashboard. 26 | df_day <- df_day[df_day$totalOD > 15,] 27 | 28 | # Daily ---------------------------------------------------------------------- 29 | #### Process data for dashboard 30 | df_day_clean <- df_day %>% 31 | 32 | tp_standardize_vars_od("pdate", 33 | unit, 34 | paste0("N_", unit), 35 | "totalOD") %>% 36 | 37 | # Clean datset 38 | tp_clean_date() %>% 39 | tp_complete_date_region_od() %>% 40 | tp_add_polygon_data_od(admin_sp) %>% 41 | 42 | # Interpolate/Clean Values 43 | tp_interpolate_outliers(NAs_as_zero = F) %>% 44 | #tp_replace_zeros(NAs_as_zero = T) %>% 45 | tp_less15_NA() %>% 46 | 47 | # Percent change 48 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, "i5_daily_base.csv"), 49 | baseline_date = BASELINE_DATE) %>% 50 | tp_add_percent_change() %>% 51 | 52 | # Add labels 53 | tp_add_label_level(timeunit = "day", OD = T) %>% 54 | tp_add_label_baseline(timeunit = "day", OD = T) 55 | 56 | ## Export 57 | saveRDS(df_day_clean, file.path(CLEAN_DATA_PATH, "i5_daily.Rds")) 58 | write.csv(df_day_clean, file.path(CLEAN_DATA_PATH, "i5_daily.csv"), row.names=F) 59 | 60 | # Weekly --------------------------------------------------------------------- 61 | print("week") 62 | 63 | df_week_clean <- df_day_clean %>% 64 | 65 | dplyr::select(date, region_origin, region_dest, value) %>% 66 | 67 | tp_standardize_vars_od("date", "region_origin", "region_dest", "value") %>% 68 | 69 | # Clean datset 70 | tp_clean_week() %>% 71 | tp_agg_day_to_week_od() %>% 72 | tp_complete_date_region_od() %>% 73 | tp_add_polygon_data_od(admin_sp) %>% 74 | 75 | # Interpolate/Clean Values 76 | #tp_interpolate_outliers(NAs_as_zero = F) %>% 77 | #tp_replace_zeros(NAs_as_zero = T) %>% 78 | tp_less15_NA() %>% 79 | 80 | # Percent change 81 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, "i5_weekly_base.csv"), 82 | type = "weekly", 83 | baseline_date = BASELINE_DATE) %>% 84 | tp_add_percent_change() %>% 85 | 86 | # Add labels 87 | tp_add_label_level(timeunit = "week", OD = T) %>% 88 | tp_add_label_baseline(timeunit = "week", OD = T) 89 | 90 | ## Export 91 | saveRDS(df_week_clean, file.path(CLEAN_DATA_PATH, "i5_weekly.Rds")) 92 | write.csv(df_week_clean, file.path(CLEAN_DATA_PATH, "i5_weekly.csv"), row.names=F) 93 | 94 | } 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /data-checks/Archive/quick_checks/check_subscribers.R: 
-------------------------------------------------------------------------------- 1 | # Check subscribers data 2 | 3 | FIG_PATH <- file.path(PROJECT_PATH, "proof-of-concept", 4 | "outputs", "data-checks", "figures_indicators", "subscribers_daily") 5 | 6 | # Load Data -------------------------------------------------------------------- 7 | # FILE PATHS NEED TO BE UPDATED 8 | ISAAC_DATA_PATH_2 <- file.path(PROJECT_PATH, "Isaac-results", "Isaac_apr_may", "admin2_flowminder") 9 | ISAAC_DATA_PATH_3 <- file.path(PROJECT_PATH, "Isaac-results", "Isaac_apr_may", "admin3_flowminder") 10 | 11 | #### Raw Data 12 | df_day_adm2_raw <- read.csv(file.path(ISAAC_DATA_PATH_2, 13 | "count_unique_subscribers_per_region_per_day.csv"), 14 | stringsAsFactors=F) %>% 15 | dplyr::rename(value_raw = subscriber_count, 16 | date = visit_date) %>% 17 | dplyr::mutate(region = region %>% as.character(), 18 | date = date %>% as.Date()) 19 | 20 | df_week_adm2_raw <- read.csv(file.path(ISAAC_DATA_PATH_2, 21 | "count_unique_subscribers_per_region_per_week.csv"), 22 | stringsAsFactors=F) %>% 23 | dplyr::rename(value_raw = subscriber_count, 24 | date = visit_week) %>% 25 | dplyr::mutate(region = region %>% as.character()) 26 | 27 | df_day_adm3_raw <- read.csv(file.path(ISAAC_DATA_PATH_3, 28 | "count_unique_subscribers_per_region_per_day.csv"), 29 | stringsAsFactors=F) %>% 30 | dplyr::rename(value_raw = subscriber_count, 31 | date = visit_date) %>% 32 | dplyr::mutate(region = region %>% as.character(), 33 | date = date %>% as.Date()) 34 | 35 | df_week_adm3_raw <- read.csv(file.path(ISAAC_DATA_PATH_3, 36 | "count_unique_subscribers_per_region_per_week.csv"), 37 | stringsAsFactors=F) %>% 38 | dplyr::rename(value_raw = subscriber_count, 39 | date = visit_week) %>% 40 | dplyr::mutate(region = region %>% as.character()) 41 | 42 | #### Cleaned Data 43 | df_day_adm2 <- readRDS(file.path(CLEAN_DATA_ADM2_PATH, 44 | "count_unique_subscribers_per_region_per_day.Rds")) %>% 45 | left_join(df_day_adm2_raw, by=c("date", "region")) 46 | 47 | df_week_adm2 <- readRDS(file.path(CLEAN_DATA_ADM2_PATH, 48 | "count_unique_subscribers_per_region_per_week.Rds")) 49 | 50 | df_day_adm3 <- readRDS(file.path(CLEAN_DATA_ADM3_PATH, 51 | "count_unique_subscribers_per_region_per_day.Rds")) %>% 52 | left_join(df_day_adm3_raw, by=c("date", "region")) %>% 53 | mutate(value_raw = value_raw %>% as.numeric()) 54 | 55 | df_week_adm3 <- readRDS(file.path(CLEAN_DATA_ADM3_PATH, 56 | "count_unique_subscribers_per_region_per_week.Rds")) 57 | 58 | # Trends Over Time ------------------------------------------------------------- 59 | df_day_adm2 %>% 60 | group_by(date) %>% 61 | summarise(value = sum(value), 62 | value_raw = sum(value_raw)) %>% 63 | ggplot() + 64 | geom_line(aes(x=date, y=value), color="black") + 65 | geom_point(aes(x=date, y=value), color="black") + 66 | geom_vline(xintercept = as.Date("2020-03-27"), color="red") 67 | 68 | lapply(unique(df_day_adm3$province), function(province_i){ 69 | print(province_i) 70 | 71 | p <- df_day_adm3 %>% 72 | filter(province %in% province_i) %>% 73 | ggplot(aes(x=date)) + 74 | geom_line(aes(y=value_raw), color="red", alpha=0.2, size=1.5) + 75 | geom_line(aes(y=value)) + 76 | facet_wrap(~region, 77 | scales = "free_y") 78 | ggsave(p, filename = file.path(FIG_PATH, paste0(province_i, ".png")), height = 25, width = 25) 79 | 80 | return(NULL) 81 | }) 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- 
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/02_clean_telecom_data/clean_i7_distance_traveled.R: -------------------------------------------------------------------------------- 1 | # Clean Subscribers Data 2 | 3 | unit = "adm2" 4 | metric = "avg_dist" 5 | for(unit in c("adm2", "adm3")){ 6 | for(metric in c("avg_dist", "stddev")){ 7 | 8 | print(paste(unit, metric, "---------------------------------------------")) 9 | 10 | # Load Data / Set Paths ------------------------------------------------------ 11 | df_day <- read.csv(file.path(RAW_INDICATORS, paste0("indicator_07_home_",unit,"_day_result.csv")), 12 | stringsAsFactors=F) 13 | admin_sp <- readRDS(file.path(GEO_PATH, paste0(unit, ".Rds"))) 14 | 15 | if(unit %in% "adm2"){ 16 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH 17 | 18 | df_day <- clean_moz_names(df_day, 19 | name = "H_adm2", 20 | name_higher = "H_adm1", 21 | type = "adm2") 22 | 23 | } 24 | if(unit %in% "adm3"){ 25 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH 26 | 27 | df_day <- clean_moz_names(df_day, 28 | name = "H_adm3", 29 | name_higher = "H_adm2", 30 | type = "adm3") 31 | 32 | } 33 | 34 | # Daily ---------------------------------------------------------------------- 35 | print("day") 36 | 37 | df_day_clean <- df_day %>% 38 | 39 | tp_standardize_vars("pdate", paste0("H_", unit), metric) %>% 40 | 41 | # Clean datset 42 | tp_clean_date() %>% 43 | tp_fill_regions(admin_sp) %>% 44 | tp_complete_date_region() %>% 45 | tp_add_polygon_data(admin_sp) %>% 46 | 47 | # Interpolate/Clean Values 48 | tp_interpolate_outliers(NAs_as_zero = T, outlier_replace="both") %>% 49 | tp_replace_zeros(NAs_as_zero = T) %>% 50 | tp_less15_NA(threshold = 0) %>% 51 | 52 | # Percent change 53 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, paste0("i7_",metric,"_daily_base.csv"))) %>% 54 | tp_add_percent_change() %>% 55 | 56 | # Add labels 57 | tp_add_label_level(timeunit = "day", OD = F) %>% 58 | tp_add_label_baseline(timeunit = "day", OD = F) 59 | 60 | ## Export 61 | saveRDS(df_day_clean, file.path(CLEAN_DATA_PATH, paste0("i7_daily_",metric,".Rds"))) 62 | write.csv(df_day_clean, file.path(CLEAN_DATA_PATH, paste0("i7_daily_",metric,".csv")), row.names=F) 63 | 64 | 65 | # Weekly --------------------------------------------------------------------- 66 | print("week") 67 | 68 | df_week_clean <- df_day_clean %>% 69 | 70 | dplyr::select(date, region, value) %>% 71 | 72 | tp_standardize_vars("date", "region", "value") %>% 73 | 74 | # Clean datset 75 | tp_clean_week() %>% 76 | tp_agg_day_to_week(fun="mean") %>% 77 | tp_fill_regions(admin_sp) %>% 78 | tp_complete_date_region() %>% 79 | tp_add_polygon_data(admin_sp) %>% 80 | 81 | # Interpolate/Clean Values 82 | #tp_interpolate_outliers(NAs_as_zero = T) %>% 83 | #tp_replace_zeros(NAs_as_zero = T) %>% 84 | #tp_less15_NA() %>% 85 | 86 | # Percent change 87 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, paste0("i7_",metric,"_weekly_base.csv")), 88 | type = "weekly") %>% 89 | tp_add_percent_change() %>% 90 | 91 | # Add labels 92 | tp_add_label_level(timeunit = "week", OD = F) %>% 93 | tp_add_label_baseline(timeunit = "week", OD = F) 94 | 95 | 96 | ## Export 97 | saveRDS(df_week_clean, file.path(CLEAN_DATA_PATH, 98 | paste0("i7_weekly_",metric,".Rds"))) 99 | write.csv(df_week_clean, file.path(CLEAN_DATA_PATH, 100 | paste0("i7_weekly_",metric,".csv")), 101 | row.names=F) 102 | 103 | 104 | } 105 | } 106 | 107 | -------------------------------------------------------------------------------- 
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/_dash_master.R: -------------------------------------------------------------------------------- 1 | # Master R Script for Prepping Data for Dashboard 2 | # Mozambique 3 | 4 | #### Settings #### ============================================================= 5 | options(rsconnect.max.bundle.files = 400000) 6 | 7 | CLEAN_SPATIAL_DATA <- F 8 | CLEAN_TELECOM_DATA <- F 9 | PREP_DATA_FOR_DASH <- T 10 | 11 | BASELINE_DATE <- "2020-03-31" 12 | 13 | #### Packages #### ============================================================= 14 | library(tidyverse) 15 | library(sparkline) 16 | library(sf) 17 | library(sp) 18 | library(plotly) 19 | library(stargazer) 20 | library(knitr) 21 | library(gridExtra) 22 | library(leaflet) 23 | library(ggpubr) 24 | library(purrr) 25 | library(parallel) 26 | library(pbmcapply) 27 | library(rgeos) 28 | library(rgdal) 29 | library(sp) 30 | library(rmapshaper) 31 | library(raster) 32 | library(geosphere) 33 | library(lubridate) 34 | library(data.table) 35 | library(mapview) 36 | library(bcrypt) 37 | 38 | #### File paths #### =========================================================== 39 | 40 | # Define Root Paths ------------------------------------------------------------ 41 | if(Sys.info()[["user"]] == "robmarty") PROJECT_PATH <- "~/Documents/World Bank/Sveta Milusheva - COVID 19 Results" 42 | if(Sys.info()[["user"]] == "wb519128") PROJECT_PATH <- "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results" 43 | if(Sys.info()[["user"]] == "WB521633") PROJECT_PATH <- "C:/Users/wb521633/WBG/Sveta Milusheva - COVID 19 Results" 44 | 45 | if(Sys.info()[["user"]] == "robmarty") GITHUB_PATH <- "~/Documents/Github/covid-mobile-dashboards" 46 | if(Sys.info()[["user"]] == "wb519128") GITHUB_PATH <- "C:/Users/wb519128/Github/covid-mobile-dashboards" 47 | if(Sys.info()[["user"]] == "WB521633") GITHUB_PATH <- "C:/Users/wb521633/Documents/Github/covid-mobile-dashboards" 48 | 49 | # Define Paths from Root ------------------------------------------------------- 50 | GADM_PATH <- "PATH-HERE" 51 | GEO_PATH <- "PATH-HERE" 52 | 53 | CLEAN_DATA_ADM2_PATH <- "PATH-HERE" 54 | CLEAN_DATA_ADM3_PATH <- "PATH-HERE" 55 | 56 | DASHBOARD_DATA_ONEDRIVE_PATH <- "PATH-HERE" 57 | DASHBOARD_DATA_GITHUB_PATH <- "PATH-HERE" 58 | 59 | PREP_DATA_CODE_PATH <- "PATH-HERE" 60 | 61 | #### Functions #### ============================================================ 62 | source(file.path(GITHUB_PATH, "dashboard-dataviz", "dashboards", 63 | "_tp_functions.R")) 64 | 65 | source(file.path(GITHUB_PATH, "dashboard-dataviz", "dashboards", 66 | "_prep_data_for_dash_functions.R")) 67 | 68 | 69 | #### Scripts #### ============================================================== 70 | 71 | # 1. Prepare Spatial Data ------------------------------------------------------ 72 | if(CLEAN_SPATIAL_DATA){ 73 | source(file.path(PREP_DATA_CODE_PATH, "01_clean_spatial_data", "download_gadm.R")) 74 | source(file.path(PREP_DATA_CODE_PATH, "01_clean_spatial_data", "clean_adm2_file.R")) 75 | source(file.path(PREP_DATA_CODE_PATH, "01_clean_spatial_data", "clean_adm3_file.R")) 76 | } 77 | 78 | # 2. 
Prepare Spatial Data ------------------------------------------------------ 79 | if(CLEAN_TELECOM_DATA){ 80 | source(file.path(PREP_DATA_CODE_PATH, "02_clean_telecom_data", "clean_i3_subscribers_data.R")) 81 | source(file.path(PREP_DATA_CODE_PATH, "02_clean_telecom_data", "clean_i5_movement_inout_data.R")) 82 | source(file.path(PREP_DATA_CODE_PATH, "02_clean_telecom_data", "clean_i5_net_movement_data.R")) 83 | source(file.path(PREP_DATA_CODE_PATH, "02_clean_telecom_data", "clean_i7_distance_traveled.R")) 84 | } 85 | 86 | # 3. Prep Data for Dashboard --------------------------------------------------- 87 | if(PREP_DATA_FOR_DASH){ 88 | source(file.path(PREP_DATA_CODE_PATH, "03_dashboard_data_prep", "prep_subs_obs_totals_data.R")) 89 | source(file.path(PREP_DATA_CODE_PATH, "03_dashboard_data_prep", "prep_telecom_agg_data.R")) 90 | source(file.path(PREP_DATA_CODE_PATH, "03_dashboard_data_prep", "data_to_github.R")) 91 | } 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /data-checks/Archive/usage_outliers.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # Outliers and towers down 3 | #-----------------------------------------------------------------# 4 | 5 | # This code depends on MASTER.py to run as file path objects are 6 | # defined there 7 | 8 | 9 | #-----------------------------------------------------------------# 10 | # TO DO: 11 | 12 | # Identify regions with very sparse use 13 | # 1. Count obs per region 14 | # 2. Count obs per region per day 15 | 16 | # Identify regions with normal use and big valleys of usage, that 17 | # would probably indicate a tower being down 18 | 19 | #-----------------------------------------------------------------# 20 | # Settings 21 | 22 | import pandas as pd 23 | 24 | EXPORT = False 25 | TEMP_PANEL = True 26 | # Number of hours below avg, used as a trashold to 27 | # define a tower down 28 | htrahshold = -3 29 | 30 | #-----------------------------------------------------------------# 31 | # Import data 32 | 33 | if TEMP_PANEL: 34 | i1 = pd.read_csv( OUT_hfcs + 'Sheet comp panel/i1_admin3.csv') 35 | else: 36 | i1 = pd.read_csv(I1_Adm3_path + "transactions_per_hour.csv") 37 | 38 | i1 = i1[i1.region != '99999'] 39 | # Wards data 40 | 41 | 42 | 43 | # Hourly transactions per region 44 | 45 | # Unique subscribers per hour 46 | # i2a3 = pd.read_csv(I2_Adm3_path + "unique_subscribers_per_hour.csv") 47 | # i2t = pd.read_csv(I2_towercluster_path + "unique_subscribers_per_hour.csv") 48 | 49 | 50 | #-----------------------------------------------------------------# 51 | # Process data 52 | 53 | i1['date'] = pd.to_datetime(i1['hour']).dt.date 54 | i1['hour_int'] = pd.to_datetime(i1['hour']).dt.hour 55 | 56 | 57 | #-----------------------------------------------------------------# 58 | # Wards with very little data 59 | 60 | # Number of observations per ward that is total number of hours 61 | i1freq = i1.groupby('region').size() 62 | 63 | i1freq = i1freq.reset_index() 64 | i1freq.columns = ['region', 'freq'] 65 | 66 | # Select wards with less than 12h on average 67 | i1_low_total_hours = i1freq[i1freq['freq'] < (12*i1.date.nunique())] 68 | 69 | i1_low_total_hours = i1_low_total_hours\ 70 | .rename(columns = {'freq' : 'total_hours'}) 71 | # # Proportion of wards with at least one tower down 72 | # freq[freq < 
1392].count()/len(set(i1['region'])) 73 | 74 | # # Proportion of wards with very 75 | # freq[freq < 700].count() 76 | # freq[freq < 700].count()/len(set(i1['region'])) 77 | 78 | # Export 79 | if(EXPORT): 80 | (i1_low_total_hours 81 | .to_csv(OUT_hfcs + 'wards_with_low_hours_I1.csv', 82 | index = False) ) 83 | 84 | #-----------------------------------------------------------------# 85 | # Indicator wards and days with towers down 86 | 87 | # Number of hours with transactions per region day 88 | hours_per_day = i1.groupby(['region', 'date']).size() 89 | 90 | hours_per_day = hours_per_day.reset_index() # ger regions to be a column 91 | hours_per_day.columns = ['region', 'date', 'hcount'] 92 | 93 | 94 | # Average hours per day per region 95 | avg_hours = (hours_per_day.groupby(['region']) 96 | .mean() 97 | .rename(columns={'hcount' :'avg_hours' })) 98 | 99 | # Create region day data set 100 | i1_ag_df = hours_per_day.merge(avg_hours, 101 | on = 'region') 102 | 103 | # Difference from average usage per hour 104 | i1_ag_df['h_diff'] = i1_ag_df['hcount'] - i1_ag_df['avg_hours'] 105 | 106 | # Create data only with pairs of wards and days potential 107 | # towers down 108 | i1_ag_df_tower_down = i1_ag_df[i1_ag_df['h_diff'] < htrahshold] 109 | 110 | # Read me text 111 | readme_text = "This file contains a combinations of wards and days that are assumed to have a tower down." 112 | readme_text += "If a day has " + str(abs(htrahshold)) 113 | readme_text += " hours with any calls below the daily avergage for that ward," 114 | readme_text += " it is considered to have a trower down at some point that day." 115 | 116 | # Export 117 | if(EXPORT): 118 | (i1_ag_df_tower_down 119 | .to_csv(OUT_hfcs + 'days_wards_with_low_hours_I1_panel.csv', 120 | index = False) ) 121 | # Read me file 122 | file = open(OUT_hfcs + "days_wards_with_low_hours_I1_README.txt", "w") 123 | file.write(readme_text) 124 | file.close() 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /data-checks/Archive/i10-check.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # dbutils.fs.ls('/mnt/') 3 | # dbutils.fs.refreshMounts() 4 | 5 | # COMMAND ---------- 6 | 7 | import pyspark.sql.functions as F 8 | from pyspark.sql.functions import to_timestamp 9 | from pyspark.sql.types import * 10 | from pyspark.sql.window import Window 11 | 12 | # Constat definitions 13 | privacy_filter = 15 14 | missing_value_code = 99999 15 | cutoff_days = 7 16 | max_duration = 21 17 | 18 | user_window = Window\ 19 | .partitionBy('msisdn').orderBy('call_datetime') 20 | 21 | 22 | # COMMAND ---------- 23 | 24 | # dbutils.fs.ls('/mnt/COVID19Data/Sveta Milusheva - mar20') 25 | base_path = '/mnt/COVID19Data/Sveta Milusheva - mar20/' 26 | geo_path = 'mnt/COVID19Data/proof-of-concept/support-data/geo-files/' 27 | 28 | # COMMAND ---------- 29 | 30 | # Load tower mapping to districts 31 | cells = spark.read.format("csv")\ 32 | .option("header", "true")\ 33 | .load(geo_path + 'zw_admin3_tower_map.csv') 34 | 35 | # COMMAND ---------- 36 | 37 | cells.show() 38 | 39 | # COMMAND ---------- 40 | 41 | # Set default schema 42 | schema = StructType([ 43 | StructField("msisdn", IntegerType(), True), 44 | StructField("call_datetime", StringType(), True), #load as string, will be turned into datetime in standardize_csv_files() 45 | StructField("location_id", StringType(), True) 46 | ]) 47 | 48 | # Import one day at a time 49 | 50 | mar20 = 
spark.read.format("csv")\ 51 | .option("header", "true")\ 52 | .load(base_path + 'MOH_EWZ_20200320.csv', schema = schema) 53 | 54 | mar21 = spark.read.format("csv")\ 55 | .option("header", "true")\ 56 | .load(base_path + 'MOH_EWZ_20200320.csv', schema = schema) 57 | 58 | 59 | 60 | # COMMAND ---------- 61 | 62 | # Process data 63 | 64 | def create_vars(df, cells): 65 | # Loading variables 66 | df = df.withColumn("call_datetime", to_timestamp("call_datetime","dd/MM/yyyy HH:mm:ss")) 67 | #get call_date from call_datetime 68 | df = df.withColumn('call_date', df.call_datetime.cast('date')) 69 | 70 | # Recreate analysis variables 71 | df = df.join(cells, df.location_id == cells.cell_id, how = 'left').drop('cell_id')\ 72 | .orderBy('msisdn', 'call_datetime')\ 73 | .withColumn('region_lag', F.lag('region').over(user_window))\ 74 | .withColumn('region_lead', F.lead('region').over(user_window))\ 75 | .withColumn('call_datetime_lag', F.lag('call_datetime').over(user_window))\ 76 | .withColumn('call_datetime_lead', F.lead('call_datetime').over(user_window))\ 77 | .withColumn('hour_of_day', F.hour('call_datetime').cast('byte'))\ 78 | .withColumn('hour', F.date_trunc('hour', F.col('call_datetime')))\ 79 | .withColumn('week', F.date_trunc('week', F.col('call_datetime')))\ 80 | .withColumn('month', F.date_trunc('month', F.col('call_datetime')))\ 81 | .withColumn('constant', F.lit(1).cast('byte'))\ 82 | .withColumn('day', F.date_trunc('day', F.col('call_datetime')))\ 83 | .na.fill({'region' : missing_value_code , 84 | 'region_lag' : missing_value_code , 85 | 'region_lead' : missing_value_code }) 86 | 87 | return df 88 | 89 | mar20 = create_vars(mar20, cells) 90 | mar21 = create_vars(mar21, cells) 91 | 92 | # COMMAND ---------- 93 | 94 | mar20.columns 95 | 96 | # COMMAND ---------- 97 | 98 | # Create simple OD matrix 99 | def simp_od(df): 100 | 101 | # Kepp if region and region_lag/lead are not the same 102 | df = df.where((F.col('region_lag') != F.col('region')) | (F.col('region_lead') != F.col('region')) | (F.col('call_datetime_lead').isNull())) 103 | 104 | # Aggregate total sum by region and region_lag 105 | agg_df = df.groupby('region', 'region_lag')\ 106 | .agg(F.count("*")) 107 | 108 | return agg_df 109 | 110 | m20_agg = simp_od(mar20) 111 | m21_agg = simp_od(mar21) 112 | 113 | # COMMAND ---------- 114 | 115 | m20_agg.show() 116 | 117 | # COMMAND ---------- 118 | 119 | # mar20.show() 120 | 121 | # COMMAND ---------- 122 | 123 | 124 | # 1. Merge with tower mapping to wards 125 | 126 | # 2. Recreate vars 127 | 128 | # 4. 
129 | 130 | 131 | # COMMAND ---------- 132 | 133 | a 134 | 135 | # COMMAND ---------- 136 | 137 | 138 | 139 | # COMMAND ---------- 140 | 141 | test_df = spark.read\ 142 | .option('header', 'true')\ 143 | .option('inferSchema', 'true')\ 144 | .csv('/mnt/COVID19Data/proof-of-concept/new/ZW/telecel/world_bank_cdr_new.csv') 145 | dd 146 | 147 | # COMMAND ---------- 148 | 149 | test_df.printSchema() 150 | -------------------------------------------------------------------------------- /data-checks/Archive/MASTER.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # DATA CHECKS MASTER 3 | #-----------------------------------------------------------------# 4 | 5 | # This script sets file paths and (will) map all processes for checking 6 | # incoming data 7 | 8 | #-----------------------------------------------------------------# 9 | #### Settings 10 | 11 | import os 12 | import re 13 | import pandas as pd 14 | import numpy as np 15 | import datetime as dt 16 | 17 | import seaborn as sns; sns.set() 18 | from matplotlib import rcParams 19 | import matplotlib.pyplot as plt 20 | 21 | #-----------------------------------------------------------------# 22 | #### Set file paths 23 | 24 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/" 25 | DATA_POC = DATA_path + "proof-of-concept/" 26 | DATA_GIS = DATA_POC + 'geo_files/' 27 | 28 | DATA_DB_raw_indicators = DATA_POC + "databricks-results/zw/" 29 | DATA_dashboad_clean = DATA_POC + "/files_for_dashboard/files_clean/" 30 | 31 | DATA_dash_clean_a2 = DATA_dashboad_clean + "adm2/" 32 | DATA_dash_clean_a3 = DATA_dashboad_clean + "adm3/" 33 | 34 | #---------------# 35 | # Main indicators 36 | 37 | # Transactions per hour 38 | I1_path = DATA_DB_raw_indicators + "indicator 1/" 39 | I1_Adm3_path = I1_path + "admin3/" 40 | 41 | 42 | # Unique subcribers per hour 43 | I2_path = DATA_DB_raw_indicators + "indicator 2/" 44 | I2_Adm3_path = I2_path + "admin3/" 45 | I2_towercluster_path = I2_path + "tower_cluster/" 46 | 47 | 48 | # Unique subscribers per day 49 | I3_path = DATA_DB_raw_indicators + "indicator 3/" 50 | I3_Adm2_path = I3_path + "admin2/" 51 | I3_Adm3_path = I3_path + "admin3/" 52 | 53 | # Ratio of residents active that day based on those present 54 | # during baseline 55 | I4_path = DATA_DB_raw_indicators + "indicator 4/" 56 | I4_Adm2_path = I4_path + 'admin2/' 57 | I4_Adm3_path = I4_path + 'admin3/' 58 | 59 | # OD matrix 60 | I5_path = DATA_DB_raw_indicators + "indicator 5/" 61 | I5_Adm2_path = I5_path + "admin2/" 62 | I5_Adm3_path = I5_path + "admin3/" 63 | 64 | # Residents living in area 65 | I6_path = DATA_DB_raw_indicators + "indicator 6/" 66 | I6_Adm2_path = I6_path + "admin2/" 67 | I6_Adm3_path = I6_path + "admin3/" 68 | 69 | # Mean and Standard Deviation of distance 70 | # traveled (by home location) day 71 | I7_path = DATA_DB_raw_indicators + "indicator 7/" 72 | I7_Adm2_path = I7_path + "admin2/" 73 | I7_Adm3_path = I7_path + "admin3/" 74 | 75 | # Mean and Standard Deviation of distance 76 | # traveled (by home location) week 77 | I8_path = DATA_DB_raw_indicators + "indicator 8/" 78 | I8_Adm2_path = I5_path + "admin2/" 79 | I8_Adm3_path = I5_path + "admin3/" 80 | 81 | # Daily locations based on Home Region with 82 | # average stay time and SD of stay time 83 | I9_path = DATA_DB_raw_indicators + "indicator 9/" 84 | I9_Adm2_path = I9_path + "admin2/" 85 | I9_Adm3_path = I9_path + "admin3/" 86 | 87 | #Simple Origin 
Destination Matrix - trips 88 | # between consecutive in time regions with time 89 | I10_path = DATA_DB_raw_indicators + "indicator 10/" 90 | I10_Adm2_path = I5_path + "admin2/" 91 | I10_Adm3_path = I5_path + "admin3/" 92 | 93 | #---------------------# 94 | # Flowminder indicators 95 | FLOWM_path = DATA_DB_raw_indicators + "flowminder indicators/" 96 | FLOWM_adm2_path = FLOWM_path + "admin2/" 97 | FLOWM_adm3_path = FLOWM_path + "admin3/" 98 | 99 | #-------------------# 100 | # External indicators 101 | 102 | # Update file path 103 | IRESULTS = DATA_path + "Isaac-results/" 104 | 105 | IFLOW_path = IRESULTS + "flowminder/" 106 | ICUST_path = IRESULTS + "custom/" 107 | 108 | # Flowminder 109 | IFLOWM_adm2_path = IFLOW_path + "admin2/" 110 | IFLOWM_adm3_path = IFLOW_path + "admin3/" 111 | 112 | # Custum 113 | ICUST_adm2_path = ICUST_path + "admin2/" 114 | ICUST_adm3_path = ICUST_path + "admin3/" 115 | 116 | 117 | #---------------# 118 | # Outputs 119 | OUT_path = DATA_POC + "outputs/" 120 | OUT_plots = OUT_path + "Figures/" 121 | OUT_hfcs = OUT_path + "data-checks/" 122 | # OUT_hfcs_sheets = OUT_hfcs + "Sheet differences/" 123 | 124 | #-----------------------------------------------------------------# 125 | # Indicator dataframes 126 | 127 | # Load list of internal indicators to make it 128 | # easier to bulk load files 129 | internal_indicators = pd\ 130 | .read_csv(DATA_POC + 'documentation/indicators_list.csv') 131 | 132 | # Since sheet contains relative paths add path global 133 | # to have absolute paths 134 | internal_indicators['path'] = DATA_path + internal_indicators['path'] -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/README.md: -------------------------------------------------------------------------------- 1 | # Dashboard 2 | 3 | This dashboard is build using R Shiny. 4 | 5 | # Preparing Data for Dashboard 6 | 7 | `01_preparing_data_for_dashboard` contains three folders with scripts for cleaning and preparing data for the dashboard. 8 | 9 | ## Clean Spatial Data 10 | 11 | The files in `01_clean_spatial_data` clean spatial polygons to be used in the dashboard and subsequent cleaning steps. The following cleaning steps are conducted: 12 | 13 | 1. Aggregate units when needed (e.g., aggregating wards) 14 | 2. Add additional variables (e.g., area) 15 | 3. Standardize variable names 16 | 4. Orders spatial data by region 17 | 18 | #### Standardize Variable Names 19 | Each spatial dataset should have standardized variable names. Standardizing 20 | variable names helps ensure different units (eg, admin2, admin3) can be 21 | easily switched in the dashboard 22 | 23 | | variable | format | example | description | 24 | |---|---|---|---| 25 | | region | string | ZONE123456 | Unique identifier of the spatial unit | 26 | | name | string | name-here | Spatial unit name | 27 | | area | numeric | 1234 | Area of the spatial unit in kilometers squared | 28 | | adm1| string | name-here | Name of the province | 29 | 30 | #### Order Spatial Data 31 | Spatial datasets are ordered by region. When cleaning other datasets at the 32 | region level, we also order by region and ensure all regions are present. This 33 | ensures that no reordering needs to be done in the dashboard. 34 | 35 | ## Clean Telecom Data 36 | 37 | The files in `02_clean_telecom_data` clean telecom data. They clean variable values (eg, accounting for outliers), standardize variable names and add variables needed for the dashboard. 
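As a quick illustration of what the cleaned output looks like, each cleaned file can be read directly and should carry the standardized variables documented in the table below. This is a minimal sketch, not part of the cleaning scripts: it assumes the path objects defined in `_dash_master.R`, and the exact set of additional columns (e.g., polygon attributes such as `area` or `adm1`) varies by indicator.

```r
# Minimal sketch (illustration only): read one cleaned file and confirm the
# standardized columns are present. CLEAN_DATA_ADM3_PATH is defined in
# _dash_master.R; i3_weekly.Rds is one of the files exported by the cleaning
# scripts in 02_clean_telecom_data.
df <- readRDS(file.path(CLEAN_DATA_ADM3_PATH, "i3_weekly.Rds"))

standard_vars <- c("region", "name", "date", "value",
                   "value_lag", "value_base",
                   "value_perchange_base", "value_zscore_base",
                   "label_level", "label_base")

stopifnot(all(standard_vars %in% names(df)))
```

Because every cleaned indicator exposes the same columns, the dashboard and the scripts in `03_dashboard_data_prep` can switch between indicators and admin units without special-casing.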
38 | 39 | #### Dataset 40 | 41 | A number of indicators are cleaned. To facilitate further processing for the datasets 42 | to be used in the dashboard, all cleaned datasets have the following standardized 43 | variables: 44 | 45 | | variable | format | example | description | 46 | |---|---|---|---| 47 | | region | string | ZONE123456 | Unique identifier of the spatial unit | 48 | | name | string | Name1 | Spatial unit name | 49 | | date | date or string | 2020-02-01 | The date | 50 | | value | numeric | 1000 | Value (e.g., number of subscribers, number of trips, distance traveled) | 51 | | value_lag | numeric | 1000 | Value from the previous time period | 52 | | value_base | numeric | 1000 | Baseline value | 53 | | value_perchange_base | numeric | 50 | Percent change from baseline | 54 | | value_zscore_base | numeric | 50 | Z-score change since baseline | 55 | | label_level | string | Name1
This day's value: 1000
... | Label for when level of variable is shown | 56 | | label_base | string | Name1<br>
This day's value: 1000
... | Label for when change since baseline value is shown. | 57 | 58 | ## Dashboard Data Prep 59 | 60 | The files in `03_dashboard_data_prep` further process data into datasets that are used for the dashboard. Due to the high volume of data, data transformations (e.g., aggregating, filtering, etc) are done outside of the dashboard in order to minimize the processing and data needed to be loaded in memory at any point as the dashboard is running. These scripts filter the cleaned telecom data into individual datasets so that no additional filtering or transformations need to be applied within the dashboard; the dashboard can just read the files and then immediately use the data in the map, line graph and table. Here, we create smaller datasets that contain the same variables as above. Indicators include density, movement in, movement out, mean distance traveled, etc. 61 | 62 | The following datasets are made. 63 | 64 | | Dataset Type | Naming Convention | Description | 65 | | --- | --- | --- | 66 | | unit-level | [Unit Type (eg, ADM1, ADM2, etc)]\_[Indicator Name]\_[Daily/Weekly]\_[Date/Week].Rds | For a given day or week, this dataset contains information for all units for a specified indicator. For O-D level datasets, values are aggregated to the specified origin or destination unit (eg, movement into unit from all other units). | 67 | | time-level | [Unit Type (eg, ADM1, ADM2, etc)]\_[Indicator Name]\_[Daily/Weekly]\_[Unit Name].Rds | For a given admin unit, this dataset contains a time series of values for a specified indicator. | 68 | | unit-time-level | [Unit Type (eg, ADM1, ADM2, etc)]\_[Indicator Name]\_[Daily/Weekly]\_[Unit Name]\_[Date/Week].Rds | These datasets are only used for O-D variables. They show, for a given origin or destination unit, the movement in or out of that unit to all other units for the specified day/week.
| 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /data-checks/Archive/02_summary_stats.py: -------------------------------------------------------------------------------- 1 | 2 | #-----------------------------------------------------------------# 3 | # Exploratory analysis 4 | #-----------------------------------------------------------------# 5 | #-----------------------------------------------------------------# 6 | #### Settings 7 | 8 | from globals import * 9 | 10 | #-----------------------------------------------------------------# 11 | #### Set file paths 12 | 13 | DATA_GIS = DATA_path + 'proof-of-concept/geo_files/' 14 | INDICATORS_path = DATA_path + "proof-of-concept/panel_indicators/clean/" 15 | 16 | #-----------------------------------------------------------------# 17 | #### Load data 18 | 19 | i1 = pd.read_csv(INDICATORS_path + 'i1_3.csv') # Number of calls 20 | i3 = pd.read_csv(INDICATORS_path + 'i3_3.csv') # number of users 21 | i5 = pd.read_csv(INDICATORS_path + 'i5_3.csv') # orgin and destination 22 | i52 = pd.read_csv(INDICATORS_path + 'i5_2.csv') # orgin and destination 23 | 24 | i7 = pd.read_csv(INDICATORS_path + 'i7_3.csv') # distance travelled 25 | 26 | #-----------------------------------------------------------------# 27 | #### Aggregate data at the country level 28 | 29 | i1_agg = i1\ 30 | .groupby('date')\ 31 | .agg({'count' : np.sum})\ 32 | .reset_index()\ 33 | .sort_values('date') 34 | 35 | i3_agg = i3\ 36 | .groupby('date')\ 37 | .agg({'count' : np.sum})\ 38 | .reset_index()\ 39 | .sort_values('date')\ 40 | .rename(columns = {'count': 'subs'}) 41 | 42 | # Add number of subscribers to indicator 1 aggregated data 43 | i1_agg = i1_agg.merge(i3_agg, on = 'date') 44 | i1_agg['calls_p'] = i1_agg['count']/i1_agg['subs'] 45 | 46 | 47 | # OD matrix aggregated data 48 | i5_agg = i5\ 49 | .groupby('date')\ 50 | .agg({'subscriber_count' : np.mean, 51 | 'total_count' : np.sum, 52 | 'region_to': pd.Series.nunique, 53 | 'region_from': pd.Series.nunique})\ 54 | .reset_index()\ 55 | .sort_values('date') 56 | 57 | i5_agg = i5_agg.merge(i3_agg, on = 'date') 58 | i5_agg['moves_p_sub'] = i5_agg['subscriber_count']/i5_agg['subs'] 59 | 60 | 61 | #-----------------------------------------------------------------# 62 | # Comparisson between pre and post lockdown stats 63 | 64 | # Pre-post lockdown variables 65 | lockdown_date = np.datetime64(dt.date(2020, 3, 27)) 66 | 67 | i1['post'] = (i1['date'].astype('datetime64') > lockdown_date).astype(int) 68 | i3['post'] = (i3['date'].astype('datetime64') > lockdown_date).astype(int) 69 | i5['post'] = (i5['date'].astype('datetime64') > lockdown_date).astype(int) 70 | i7['post'] = (i7['date'].astype('datetime64') > lockdown_date).astype(int) 71 | 72 | i1_agg['post'] = (i1_agg['date'].astype('datetime64') > lockdown_date).astype(int) 73 | i5_agg['post'] = (i5_agg['date'].astype('datetime64') > lockdown_date).astype(int) 74 | i7['post'] = (i7['date'].astype('datetime64') > lockdown_date).astype(int) 75 | 76 | # Number of calls per user 77 | i1_agg['calls_p'].mean() 78 | i1_agg['calls_p'][i1_agg['post'] == 0].mean() 79 | i1_agg['calls_p'][i1_agg['post'] == 1].mean() 80 | 81 | # Number of districts visited? 
82 | i5_agg['moves_p_sub'].mean() 83 | i5_agg['moves_p_sub'][i5_agg['post'] == 0].mean() 84 | i5_agg['moves_p_sub'][i5_agg['post'] == 1].mean() 85 | 86 | # Average distance travelled 87 | i7['mean_distance'].mean() 88 | 89 | i7['mean_distance'][i7['post'] == 0].mean() 90 | i7['mean_distance'][i7['post'] == 1].mean() 91 | 92 | # Number of wards 93 | i5['subscriber_count'].mean() 94 | i5['subscriber_count'][i5['post'] == 0].sum() 95 | i5['subscriber_count'][i5['post'] == 1].mean() 96 | 97 | # Distance travelled 98 | i7['mean_distance'].mean() 99 | i7['mean_distance'][i7['post'] == 0].mean() 100 | i7['mean_distance'][i7['post'] == 1].mean() 101 | 102 | 103 | #-----------------------------------------------------------------# 104 | # Plot number of regions that received visitors per day 105 | 106 | import plotly.express as px 107 | 108 | fig = px.line(i5_agg, x="date", y="region_to") 109 | fig.show() 110 | 111 | 112 | #-----------------------------------------------------------------# 113 | # Compare regions that received and sent visitors 114 | 115 | import plotly.graph_objects as go 116 | 117 | # set up plotly figure 118 | fig = go.Figure() 119 | 120 | # add line / trace 1 to figure 121 | fig.add_trace(go.Scatter( 122 | x=i5_agg['date'], 123 | y=i5_agg['region_to'], 124 | marker=dict( 125 | color="blue" 126 | ))) 127 | fig.add_trace(go.Scatter( 128 | x=i5_agg['date'], 129 | y=i5_agg['region_from'], 130 | marker=dict( 131 | color="red" 132 | ))) 133 | 134 | fig.show() 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /data-checks/Archive/03_i_specific_checks_i1_admin2.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # Create Admin2 Indicator 1 3 | #-----------------------------------------------------------------# 4 | 5 | EXPORT = False 6 | 7 | # import shapely 8 | # import geojsonio 9 | import os 10 | import geopandas as gpd 11 | import matplotlib.pyplot as plt 12 | import plotly.graph_objects as go 13 | import plotly.express as px 14 | from plotly.subplots import make_subplots 15 | 16 | import seaborn as sns; sns.set() 17 | 18 | 19 | #-----------------------------------------------------------------# 20 | # Load data 21 | 22 | # Indicator 1 panel data 23 | i1 = pd.read_csv( OUT_hfcs + 'Sheet comp panel/i1_admin3.csv') 24 | i1 = i1[i1.region != '99999'] 25 | # Wards data 26 | wards = gpd.read_file(DATA_GIS + 'wards_aggregated.geojson') 27 | wd = wards[['ward_id', 'province_name', 'district_id', 'district_name']] 28 | 29 | #-----------------------------------------------------------------# 30 | # Create wards mapping into disctrics 31 | 32 | i1 = i1.merge(wd, left_on = 'region', right_on = 'ward_id') 33 | 34 | 35 | # Aggregate values by district 36 | i1_agg = i1.groupby(['district_id', 'district_name', 'hour']).agg(lambda x : sum(x)).reset_index() 37 | 38 | # Make sure hour is in date time 39 | i1_agg['hour'] = i1_agg['hour'].astype('datetime64') 40 | i1_agg['district_id'] = i1_agg['district_id'].astype('int') 41 | 42 | #-----------------------------------------------------------------# 43 | # Transactions per hour by district line plot. 
44 | 45 | # Line plot function definition 46 | def line_plot(reg_i, 47 | var = 'count_p', 48 | data = i1_agg, 49 | region = 'district_id', 50 | region_str = 'district_name', 51 | time = 'hour'): 52 | plt_data = data[data[region] == reg_i] 53 | fig = go.Figure() 54 | # Create line 55 | fig.add_trace(go.Scatter(x=plt_data[time], y=plt_data[var], 56 | mode='lines', 57 | name='lines')) 58 | # Additional formatting 59 | title = str(plt_data[region].iloc[0]) + plt_data[region_str].iloc[0] 60 | fig.update_layout( 61 | title=title, 62 | xaxis_title="Time", 63 | yaxis_title="Count", 64 | font=dict( 65 | # family="Courier New, monospace", 66 | size=18, 67 | color="#7f7f7f"), 68 | autosize=False, 69 | width=1200, 70 | height=700 71 | ) 72 | return(fig) 73 | 74 | # Districts list 75 | dists = list(set(i1_agg['district_id'])) 76 | 77 | # region_plt(d) 78 | # plt.show() 79 | 80 | # Loop over districts 81 | for d in dists: 82 | print(d) 83 | # Create plot 84 | plt_i = line_plot(d) 85 | # Export 86 | save_name = None 87 | save_name = 'i1_districts_count' + str(d) + '.png' 88 | plt_i.write_image(OUT_plots + 'daily_obs_region/' + save_name) 89 | 90 | 91 | #-----------------------------------------------------------------# 92 | # Transactions per hour by day. That is one plot per hour 93 | i1_agg['time'] = pd.to_datetime(i1_agg['hour']).dt.hour 94 | i1_agg['date'] = pd.to_datetime(i1_agg['hour']).dt.date 95 | 96 | 97 | def hourly_scatter(reg_i, 98 | var = 'count_p', 99 | data = i1_agg, 100 | region = 'district_id', 101 | region_str = 'district_name', 102 | time = 'date', 103 | facets = 'time'): 104 | # Subset data 105 | plt_data = data[data[region] == reg_i] 106 | # Create plot 107 | fig = px.scatter(plt_data, 108 | x= time, 109 | y = var, 110 | facet_col = facets, 111 | facet_col_wrap = 5, 112 | width=1200, 113 | height=700) 114 | # Additional formatting 115 | title = str(plt_data[region].iloc[0]) + ' - ' + plt_data[region_str].iloc[0] 116 | fig.update_layout(title_text= title) 117 | fig.update_yaxes(matches=None) 118 | fig.for_each_annotation(lambda a: a.update(text=a.text.replace("time=", ""))) 119 | # Format axis titles 120 | return(fig) 121 | 122 | # Loop over districts 123 | for d in dists: 124 | print(d) 125 | # Create plot 126 | plt_i = hourly_scatter(d) 127 | # Export 128 | save_name = None 129 | save_name = 'i1_hourly_obs_byhour' + str(d) + '.png' 130 | plt_i.write_image(OUT_plots + 'hourly_obs_by_hour_region/' + save_name) 131 | 132 | 133 | 134 | 135 | #-----------------------------------------------------------------# 136 | # Export data 137 | if EXPORT: 138 | i1_agg.to_csv(OUT_hfcs + 'Sheet comp panel/i1_admin2.csv', index = False) 139 | 140 | 141 | 142 | 143 | #-----------------------------------------------------------------# 144 | # DRAFT 145 | -------------------------------------------------------------------------------- /dashboard-dataviz/figures/i5_into_out.R: -------------------------------------------------------------------------------- 1 | # i3 Figures 2 | 3 | unit <- "wards" 4 | 5 | # Load Data -------------------------------------------------------------------- 6 | if(unit %in% "wards"){ 7 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH 8 | } 9 | 10 | if(unit %in% "districts"){ 11 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH 12 | } 13 | 14 | data <- readRDS(file.path(CLEAN_DATA_PATH, "i5_daily.Rds")) 15 | 16 | data_into <- data %>% 17 | group_by(region_dest, name_dest, date) %>% 18 | summarise(value = sum(value, na.rm=T)) %>% 19 | dplyr::rename(region = region_dest, 20 | name = name_dest) 
21 | 22 | data_out <- data %>% 23 | group_by(region_origin, name_origin, date) %>% 24 | summarise(value = sum(value, na.rm=T)) %>% 25 | dplyr::rename(region = region_origin, 26 | name = name_origin) 27 | 28 | ## 29 | data_into <- data_into %>% 30 | group_by(region) %>% 31 | mutate(value_pre = mean(value[date < "2020-03-30"], na.rm = T), 32 | value_post = mean(value[date > "2020-03-30"], na.rm = T)) %>% 33 | ungroup() %>% 34 | mutate(value_change = value_post - value_pre) %>% 35 | mutate(value_change_rank = rank(value_change)) 36 | data_into$value_change_rank[is.na(data_into$value_change)] <- NA 37 | 38 | data_out <- data_out %>% 39 | group_by(region) %>% 40 | mutate(value_pre = mean(value[date < "2020-03-30"], na.rm = T), 41 | value_post = mean(value[date > "2020-03-30"], na.rm = T)) %>% 42 | ungroup() %>% 43 | mutate(value_change = value_post - value_pre) %>% 44 | mutate(value_change_rank = rank(value_change)) 45 | data_out$value_change_rank[is.na(data_out$value_change)] <- NA 46 | 47 | 48 | ## FIX 49 | data_into <- data_into[!is.na(data_into$date),] 50 | data_into$date <- data_into$date %>% as.Date() 51 | 52 | data_out <- data_out[!is.na(data_out$date),] 53 | data_out$date <- data_out$date %>% as.Date() 54 | 55 | 56 | # Into ------------------------------------------------------------------------- 57 | rank_high <- data_into$value_change_rank %>% unique() %>% sort() %>% head(5) 58 | 59 | p_high <- data_into %>% 60 | dplyr::filter(value_change_rank %in% rank_high) %>% 61 | ggplot(aes(x = date, y = value)) + 62 | geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) + 63 | geom_line() + 64 | labs(x = "", 65 | y = "Number of Subscribers", 66 | title = "Largest Decreases") + 67 | facet_wrap(~name, 68 | scales = "free_y", 69 | nrow = 1) + 70 | theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12), 71 | strip.text.x = element_text(face = "bold")) 72 | p_high 73 | 74 | rank_low <- data_into$value_change_rank %>% unique() %>% sort() %>% tail(5) 75 | 76 | p_low <- data_into %>% 77 | dplyr::filter(value_change_rank %in% rank_low) %>% 78 | ggplot(aes(x = date, y = value)) + 79 | geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) + 80 | geom_line() + 81 | labs(x = "", 82 | y = "", 83 | title = "Largest Increases") + 84 | facet_wrap(~name, 85 | scales = "free_y", 86 | nrow = 1) + 87 | theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12), 88 | strip.text.x = element_text(face = "bold")) 89 | 90 | p_low 91 | 92 | p_all <- ggarrange(p_high, p_low, nrow = 2) 93 | ggsave(p_all, filename = file.path(figures_path, 94 | paste0(unit, "_netmovement_top_chng.png")), 95 | height = 5, width=12) 96 | 97 | 98 | 99 | # Out Of ------------------------------------------------------------------------- 100 | rank_high <- data_out$value_change_rank %>% unique() %>% sort() %>% head(5) 101 | 102 | p_high <- data_out %>% 103 | dplyr::filter(value_change_rank %in% rank_high) %>% 104 | ggplot(aes(x = date, y = value)) + 105 | geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) + 106 | geom_line() + 107 | labs(x = "", 108 | y = "Number of Subscribers", 109 | title = "Largest Decreases") + 110 | facet_wrap(~name, 111 | scales = "free_y", 112 | nrow = 1) + 113 | theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12), 114 | strip.text.x = element_text(face = "bold")) 115 | p_high 116 | 117 | rank_low <- data_out$value_change_rank %>% unique() %>% sort() %>% tail(5) 118 | 119 | p_low <- 
data_out %>% 120 | dplyr::filter(value_change_rank %in% rank_low) %>% 121 | ggplot(aes(x = date, y = value)) + 122 | geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) + 123 | geom_line() + 124 | labs(x = "", 125 | y = "", 126 | title = "Largest Increases") + 127 | facet_wrap(~name, 128 | scales = "free_y", 129 | nrow = 1) + 130 | theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12), 131 | strip.text.x = element_text(face = "bold")) 132 | 133 | p_low 134 | 135 | p_all <- ggarrange(p_high, p_low, nrow = 2) 136 | ggsave(p_all, filename = file.path(figures_path, 137 | paste0(unit, "_netmovement_top_chng.png")), 138 | height = 5, width=12) 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/preparing_data_for_dashboard/02_clean_telecom_data/README.md: -------------------------------------------------------------------------------- 1 | # Clean Aggregated Telecom Data 2 | 3 | These scripts clean and standardize aggregated telecom data. This is a necessary 4 | part of the process towards preparing datasets for the dashboard. 5 | 6 | ## Dataset 7 | 8 | A number of indicators are cleaned. To facilitate further processing for the datasets 9 | to be used in the dashboard, all cleaned datasets have the following standardized 10 | variables: 11 | 12 | | variable | format | example | description | 13 | |---|---|---|---| 14 | | region | string | ZONE123456 | Unique identifier of the spatial unit | 15 | | name | string | Name| Spatial unit name | 16 | | date | date or string | 2020-02-01| The date | 17 | | value | numeric | 1000 | Value (e.g., number of subscribers, number of trips, distance traveled) | 18 | | value_lag | numeric | 1000 | Value from the previous time period | 19 | | value_base | numeric | 1000 | Baseline value | 20 | | value_perchange_base | numeric | 50 | Percent change from baseline | 21 | | value_zscore_base | numeric | 50 | Z-score change since baseline | 22 | | label_level | string | Name
This day's value: 1000<br>... | Label for when level of variable is shown | 23 | | label_base | string | Name<br>This day's value: 1000<br>
... | Label for when change since baseline value is shown. | 24 | 25 | ## telecom prep [tp] functions 26 | 27 | The `_tp_functions.R` file defines a number of functions to help standardize 28 | the cleaning process. 29 | 30 | #### Set/Standardize Variables 31 | 32 | * __tp_standardize_vars:__ Renames the date, region and value variable names to 33 | `date`, `region` and `value`. The remaining `tp_` functions take these variable 34 | names as defaults. 35 | * __tp_standardize_vars_od:__ Renames variables for origin-destination matrices. 36 | Inputs include the date, region_origin, region_destination and value variables. This function 37 | standardizes those variables and creates a new variable that concatenates region_origin and 38 | region_destination as a unique identifier for the origin-destination pair. 39 | 40 | #### Clean Dataset 41 | 42 | * __tp_fill_regions:__ Checks for any regions that are missing in the telecom data that are in the polygon/admin data. Adds these regions to the dataset. 43 | * __tp_clean_day:__ If `date` is day of week, cleans into a `Date` variable. 44 | * __tp_clean_week:__ Transforms `date` to represent the week (e.g., `Feb 01 - Feb 07`). Handles 45 | both integers (e.g., week `6`) and groups day of week (e.g., `2020-02-01`) 46 | * __tp_agg_day_to_week:__ Aggregates the dataset from daily to weekly. 47 | * __tp_complete_date_region:__ Completes data with all data/region pairs. 48 | * __tp_complete_date_region_od:__ Completes data with all data/region pairs for 49 | origin-destination datasets. 50 | * __tp_add_polygon_data:__ Adds polygon data to dataset (primarily for `name`) 51 | * __tp_add_polygon_data_od:__ Adds polygon data to dataset for origin-destination data. 52 | Adds all polygon variables as `_origin` and `_dest` 53 | 54 | #### Clean Value Variable 55 | 56 | * __tp_interpolate_outliers:__ Interpolates outliers on the `value` variable. Includes 57 | options for replacing negative, positive or both types of outliers, and for what is considered 58 | and outlier. Defaults to 4 standard deviations. 59 | * __tp_replace_zeros:__ Interpolates values of zero. Only interpolates when the 60 | number of zeros is equal to or less than `N_zero_thresh`. 61 | 62 | #### Add Variables 63 | 64 | * __tp_add_percent_change:__ Adds percent change from the last time period (day or week) 65 | on the `value` variable 66 | * __tp_add_baseline_comp_stats:__ Adds percent change and z-score change values 67 | compared to baseline using `value` variable. 68 | 69 | #### Add Labels for Leaflet 70 | 71 | * __tp_add_label_level:__ Adds label for the original (level) value to be used in 72 | leaflet in the dashboard. 73 | * __tp_add_label_baseline:__ Adds label for change metrics since baseline to be used 74 | in leaflet in the dashboard. 75 | 76 | 77 | ## Example cleaning 78 | 79 | The following shows an example of cleaning data. Here we have two datasets: 80 | 81 | 1. __df_day:__ Which is a daily dataset of the number of subscribers at the unit level and contains three 82 | relevant variables: `visit_date` (e.g., `2020-02-01T00:00:00.000Z`), `region` (e.g., `ZONE123456`) and 83 | `subscriber_count` (e.g., `1000`). 84 | 85 | 2. __admin_sp:__ Which is a SpatialPolygonsDataFrame of units. It contains the variables 86 | described in `01_clean_spatial_data` (i.e., `name`, `region`, `area` and `adm1`). 87 | 88 | ```r 89 | df_day_clean <- df_day %>% 90 | 91 | # Standardizes variable names so can avoid defining variable names in the 92 | # tp_ functions. 
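  # Argument order: date variable, region variable, value variable.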
93 | tp_standardize_vars("visit_date", "region", "subscriber_count") %>% 94 | 95 | # Clean dataset 96 | tp_clean_date() %>% 97 | tp_fill_regions(admin_sp) %>% 98 | tp_complete_date_region() %>% 99 | tp_add_polygon_data(admin_sp) %>% 100 | 101 | # Interpolate/Clean Values 102 | tp_interpolate_outliers(NAs_as_zero = T) %>% 103 | tp_replace_zeros(NAs_as_zero = T) %>% 104 | 105 | # Add change metrics 106 | tp_add_baseline_comp_stats() %>% 107 | tp_add_percent_change() %>% 108 | 109 | # Add labels 110 | tp_add_label_level(timeunit = "day", OD = F) %>% 111 | tp_add_label_baseline(timeunit = "day", OD = F) 112 | ``` 113 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/voronoi.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | from geovoronoi import voronoi_regions_from_coords 3 | 4 | import os 5 | if os.environ['HOME'] != '/root': 6 | from modules.tower_clustering import * 7 | 8 | ## Class to handle spark and df in session 9 | class voronoi_maker: 10 | """Class to handle all voronoi transformations and files for a specific df 11 | 12 | 13 | Attributes 14 | ---------- 15 | datasource : an instance of DataSource class. 16 | shape : a geopandas dataframe. Shapefile to use for clustering 17 | region_var : a string. Name of the region variable in the shapefile. 18 | sites : a string. Name of the attribute of datasource that holds the tower coordinates. 19 | spark_df : a pyspark dataframe. Holds the cdr data 20 | result_path : a string. Where to save results. 21 | clusterer : an instance of tower_clusterer. 22 | sites_df : a pyspark dataframe. Holds clustered sites. 23 | distances_pd_long : a pyspark dataframe. Holds distances between sites. 24 | sites : a pyspark dataframe. Clustered sites without NAs. 25 | 26 | Methods 27 | ------- 28 | make_voronoi() 29 | orchestrates all methods 30 | 31 | filter_towers_for_voronoi() 32 | we can't run on duplicates (location duplicates), so we have to filter them out first 33 | 34 | make_shape(towers_for_voronoi) 35 | makes a buffer around towers to create bubble shapes 36 | 37 | create_voronoi(towers_for_voronoi, shape) 38 | creats voronoi cells from tower list 39 | 40 | save_voronoi(poly_shapes) 41 | saves voronoi shape file and voronoi-tower mapping 42 | 43 | assign_to_spark_df() 44 | adds voronoi id to cdr records (not used currently) 45 | """ 46 | 47 | def __init__(self, 48 | datasource, 49 | shape, 50 | region_var, 51 | sites = 'tower_sites'): 52 | """ 53 | Parameters 54 | ---------- 55 | """ 56 | self.spark = datasource.spark 57 | self.datasource = datasource 58 | self.spark_df = datasource.parquet_df 59 | self.result_path = datasource.results_path 60 | self.clusterer = tower_clusterer(datasource, shape, region_var, sites) 61 | self.clusterer.cluster_towers() 62 | self.sites_df = self.clusterer.sites_with_clusters.loc[:,['cell_id', 'centroid_LAT', 'centroid_LNG']].rename(columns={'centroid_LAT' : 'LAT', 'centroid_LNG': 'LNG'}) 63 | self.distances_pd_long = self.clusterer.distances_pd_long 64 | if (self.sites_df.columns == ['cell_id', 'LAT', 'LNG']).all(): 65 | self.sites = self.sites_df[self.sites_df.LAT.notna()] 66 | else: 67 | raise 'The sites dataframe does not have the correct columns / column order. 
Should be cell_id, LAT, LNG' 68 | 69 | def make_voronoi(self): 70 | 71 | towers_for_voronoi = self.filter_towers_for_voronoi() 72 | shape, towers_for_voronoi = self.make_shape(towers_for_voronoi = towers_for_voronoi) 73 | poly_shapes = self.create_voronoi(shape = shape, towers_for_voronoi = towers_for_voronoi) 74 | self.save_voronoi(poly_shapes = poly_shapes) 75 | return self.voronoi_dict 76 | 77 | def filter_towers_for_voronoi(self): 78 | 79 | # get unique towers in data 80 | distinct_towers = self.spark_df.select('location_id').distinct().toPandas() 81 | 82 | # filter list of towers for unique towers 83 | self.sites = self.sites[self.sites.cell_id.isin(list(distinct_towers.location_id))] 84 | 85 | # Assign gpd 86 | self.towers = gpd.GeoDataFrame( 87 | self.sites, geometry = gpd.points_from_xy(self.sites.LNG, self.sites.LAT), crs = 'epsg:4326') 88 | 89 | # Find towers that are in same location 90 | self.towers.LAT = self.towers.LAT.apply(lambda x: round(x,4)) 91 | self.towers.LNG = self.towers.LNG.apply(lambda x: round(x,4)) 92 | towers_for_voronoi = self.towers[~self.towers.duplicated(subset = ['LAT', 'LNG'])] 93 | 94 | return towers_for_voronoi 95 | 96 | def make_shape(self, towers_for_voronoi): 97 | 98 | # Make border shape 99 | radians = 35 / 40000 * 360 100 | self.shape = towers_for_voronoi.buffer(radians).unary_union 101 | 102 | return self.shape, towers_for_voronoi 103 | 104 | def create_voronoi(self, towers_for_voronoi, shape): 105 | 106 | # Create np array of vertices 107 | points = towers_for_voronoi.loc[:,['LNG','LAT']].to_numpy() 108 | 109 | # Create voronoi shapes 110 | self.poly_shapes, pts, poly_to_pt_assignments = voronoi_regions_from_coords(points, shape) 111 | 112 | return self.poly_shapes 113 | 114 | def save_voronoi(self, poly_shapes): 115 | 116 | # Save voronoi 117 | self.voronoi_pd = pd.DataFrame(poly_shapes) 118 | self.voronoi_pd.columns =['geometry'] 119 | self.voronoi_gpd = deepcopy(self.voronoi_pd) 120 | self.voronoi_gpd = gpd.GeoDataFrame(self.voronoi_gpd, geometry = 'geometry', crs = 'epsg:4326') 121 | self.voronoi_pd['geometry'] = self.voronoi_pd.geometry.astype(str) 122 | self.voronoi_pd = self.voronoi_pd.reset_index() 123 | self.voronoi_pd.columns = ['region', 'geometry'] 124 | self.voronoi_pd = self.spark.createDataFrame(self.voronoi_pd) 125 | save_csv(self.voronoi_pd, self.result_path, self.datasource.country_code + '_voronoi_shapefile') 126 | 127 | # Match towers to voronoi so that all towers are assigned to a cell 128 | voronoi_towers = gpd.sjoin(self.voronoi_gpd, self.towers, op="intersects") 129 | self.voronoi_dict = voronoi_towers.drop(['geometry', 'LAT', 'LNG', 'index_right'], axis = 'columns') 130 | self.voronoi_dict = self.voronoi_dict.reset_index() 131 | self.voronoi_dict.columns = ['region', 'cell_id'] 132 | self.voronoi_dict = self.spark.createDataFrame(self.voronoi_dict) 133 | save_csv(self.voronoi_dict, self.result_path, self.datasource.country_code + '_voronoi_tower_map') 134 | 135 | def assign_to_spark_df(self): 136 | 137 | self.new_spark_df = self.spark_df.join(self.voronoi_dict, self.spark_df['location_id'] == self.voronoi_dict['cell_id'], how = 'left') 138 | return self.new_spark_df 139 | -------------------------------------------------------------------------------- /data-checks/Archive/Descr-exploratory/fb-comparisson-draft.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # CDR vs FB comparisson 3 | 
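# Compares CDR-based indicators (i3 unique subscribers, i5 movement)
# with Facebook population and movement data for admin-2 districts
# (country code 'ZW').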
#-----------------------------------------------------------------# 4 | # TO DO 5 | 6 | 7 | # Read documentation 8 | 9 | # Do same process for movement data 10 | 11 | # Look at the results 12 | 13 | # Do the merging with only overlaping dates 14 | 15 | #-----------------------------------------------------------------# 16 | # Settings 17 | 18 | import os 19 | import pandas as pd 20 | import numpy as np 21 | import glob 22 | 23 | base_path = 'C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/proof-of-concept/' 24 | 25 | fb_data = base_path + 'Facebook Data/' 26 | cdr_path = base_path + 'panel_indicators/' 27 | 28 | doc_path = base_path + 'documentation/' 29 | 30 | OUT_path = base_path + '/outputs/data-checks/' 31 | 32 | data_pop = fb_data + 'Population Administrative Regions/' 33 | data_mov = fb_data + 'Movement Admin Regions/' 34 | 35 | # File names need to be updated 36 | # # prefix = 'Coronavirus Disease Prevention Map Apr 16 2020 Id Movement between Administrative Regions__' 37 | # prefix_pop = 'Coronavirus Disease Prevention Map Apr 16 2020 Id Facebook Population (Administrative Regions)__' 38 | # prefix_mov = 'Coronavirus Disease Prevention Map Apr 16 2020 Id Movement between Administrative Regions__' 39 | 40 | 41 | #-----------------------------------------------------------------# 42 | # Load FB data 43 | 44 | # Population - Load all csv files in the folder 45 | files_pop = glob.glob(data_pop + prefix_pop + "*.csv") 46 | files_mov = glob.glob(data_mov + prefix_mov + "*.csv") 47 | fpop = pd.concat([pd.read_csv(f, encoding='latin1') for f in files_pop], ignore_index=True) 48 | fmov = pd.concat([pd.read_csv(f, encoding='latin1') for f in files_mov], ignore_index=True) 49 | 50 | 51 | # df1 = pd.read_csv(data_pop + prefix + '2020-06-24 0000.csv') 52 | # df2 = pd.read_csv(data_pop + prefix + '2020-06-24 0800.csv') 53 | # df3 = pd.read_csv(data_pop + prefix + '2020-06-24 1600.csv') 54 | 55 | #-----------------------------------------------------------------# 56 | # Load CDR data 57 | 58 | # Using i3, Count of unique subscribers, for now. 
Not sure how this 59 | # fb indicator was calculated, so it might make sense to use another 60 | # indicator 61 | cpop = pd.read_csv(cdr_path + 'i3_admin2.csv') 62 | 63 | 64 | # i5 65 | cmov = pd.read_csv(cdr_path + 'i5_admin2.csv') 66 | 67 | # load and merge keys for str name matching 68 | a2_keys = pd.read_csv(doc_path + 'keys_districts.csv') 69 | a2_keys = a2_keys[['id2', 'name2']] 70 | 71 | # process cdr population 72 | cp = cpop.merge(a2_keys, 73 | left_on= 'region', 74 | right_on = 'id2') 75 | 76 | cp['date'] = pd.to_datetime(cp['day']).dt.date 77 | 78 | cp = cp[['date', 'id2', 'name2','count_p']]\ 79 | .rename(columns = {'name2' : 'name', 80 | 'count_p' : 'count'}) 81 | 82 | # process cdr movement 83 | 84 | cmov['date'] = pd.to_datetime(cmov['connection_date']).dt.date 85 | 86 | 87 | cm = cmov\ 88 | .merge(a2_keys, 89 | left_on = 'region_from', 90 | right_on= 'id2')\ 91 | .rename(columns = {'name2' : 'st_name'})\ 92 | .merge(a2_keys, 93 | left_on = 'region_to', 94 | right_on= 'id2')\ 95 | .rename(columns = {'name2' : 'ed_name', 96 | 'total_count_p' : 'count'})\ 97 | [['date', 'st_name','ed_name', 'count']] 98 | 99 | 100 | #-----------------------------------------------------------------# 101 | # Process FB data 102 | 103 | def process(df, time, group_by, count): 104 | # Remove other countried 105 | df = df.loc[df['country'] == 'ZW'] 106 | # Date var 107 | df['date'] = pd.to_datetime(df[time]).dt.date 108 | # Group by 109 | gby = ['date'] 110 | gby.extend(group_by) 111 | # Aggregate 112 | agg = df\ 113 | .groupby(gby)\ 114 | .agg({count : np.sum})\ 115 | .reset_index() 116 | 117 | return agg 118 | 119 | 120 | fp = process(fpop, 'date_time', ['polygon_name'], 'n_crisis')\ 121 | .rename(columns = {'polygon_name' : 'name', 122 | 'n_crisis' : 'count'}) 123 | fm = process(fmov, 'date_time', ['start_polygon_name', 'end_polygon_name'], 'n_crisis')\ 124 | .rename(columns = {'start_polygon_name' : 'st_name', 125 | 'end_polygon_name' : 'ed_name', 126 | 'n_crisis' : 'count'}) 127 | 128 | #-----------------------------------------------------------------# 129 | # Merge 130 | 131 | # Make sure I'm comparing same period 132 | overlapping_dates = set(cp['date']).intersection(set(fp['date'])) 133 | 134 | fp = fp[fp['date'].isin(overlapping_dates)] 135 | cp = cp[cp['date'].isin(overlapping_dates)] 136 | 137 | # String matching corrections 138 | fp['name'].loc[fp['name'] == 'Hwedza'] = 'Wedza' 139 | fp['name'].loc[fp['name'] == 'Chirumanzu'] = 'Chirumhanzu' 140 | fp['name'].loc[fp['name'] == 'Bulilimamangwe'] = 'Bulilima (North)' 141 | 142 | 143 | 144 | # def agg_rank(df, gby = 'name'): 145 | # df = df.groupby(gby).agg('mean').reset_index() 146 | # df["rank"] = df["count"].rank(ascending = False) 147 | # return df.sort_values('rank') 148 | 149 | # foo 150 | 151 | 152 | # full_period_comp = cp\ 153 | # .merge(fp, 154 | # on = ['name', 'date'], 155 | # how = 'outer', 156 | # suffixes=('', '_fb'))\ 157 | # .sort_values('rank') 158 | 159 | 160 | 161 | 162 | #-----------------------------------------------------------------# 163 | # Aggregated merge 164 | 165 | # Create full period ranking 166 | def agg_rank(df, gby = 'name'): 167 | df = df.groupby(gby).agg('mean').reset_index() 168 | df["rank"] = df["count"].rank(ascending = False) 169 | return df.sort_values('rank') 170 | 171 | cp_rank = agg_rank(cp) 172 | fp_rank = agg_rank(fp) 173 | 174 | 175 | 176 | full_period_comp = cp_rank\ 177 | .merge(fp_rank, 178 | on = 'name', 179 | how = 'outer', 180 | suffixes=('', '_fb'))\ 181 | .sort_values('rank') 182 | 
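#-----------------------------------------------------------------#
# Illustrative addition (not part of the original draft): a quick
# rank-agreement summary for the full-period comparison built above.
# Uses the 'rank' and 'rank_fb' columns created by the merge.
rank_corr = full_period_comp[['rank', 'rank_fb']]\
    .corr(method='spearman')\
    .loc['rank', 'rank_fb']
print('Spearman rank correlation, CDR vs FB district ranks: {:.3f}'.format(rank_corr))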
183 | #-----------------------------------------------------------------# 184 | # Export 185 | 186 | # full_period_comp.to_csv(OUT_path + 'i3_fb_comp.csv', 187 | # index = False) 188 | 189 | 190 | # agg_rank(fm, ['st_name', 'ed_name']) 191 | # agg_rank(cm, ['st_name', 'ed_name']) 192 | 193 | fp_rank.sort_values('name') -------------------------------------------------------------------------------- /data-checks/Archive/data_files_comparisson.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # CSV comparisson 3 | #-----------------------------------------------------------------# 4 | 5 | import os 6 | import re 7 | import numpy as np 8 | import pandas as pd 9 | import datetime as dt 10 | import matplotlib.pyplot as plt 11 | # import seaborn as sns 12 | from datetime import datetime 13 | 14 | 15 | IRESULTS = DATA_path + "Isaac-results/" 16 | 17 | IFLOW = IRESULTS + "flowminder/" 18 | ICUST = IRESULTS + "custom/" 19 | 20 | #-----------------------------------------------------------------# 21 | # Load files 22 | 23 | filenames = os.listdir(IFLOW) 24 | 25 | # Custom indicatos 26 | # filenames = os.listdir(ICUST) 27 | 28 | #-----------------------------------------------------------------# 29 | # Make sure data is compatible 30 | 31 | # Masure order and date formats are the same 32 | def compat(data, 33 | timevar, 34 | standardize_time = False, 35 | regvar = 'region'): 36 | new_data = data 37 | 38 | # If has a date convert to standard 39 | if len(timevar) != 0: 40 | timevar = np.asscalar(np.array(timevar)) 41 | new_data[timevar] = pd.to_datetime(new_data[timevar]).dt.date 42 | # Make sure order is the same 43 | if len(data.columns) == 2: 44 | new_data = new_data.sort_values( by = [new_data.columns[0], new_data.columns[1] ]) 45 | else : 46 | new_data = new_data.sort_values( by = [new_data.columns[0], new_data.columns[1], new_data.columns[2] ]) 47 | return new_data 48 | 49 | 50 | # Comparisson outputs function 51 | def compare_dfs(df1,df2, filename = None, outputdf = False): 52 | # Set time var (GARMBIARRA WARNING) 53 | time = list(df1.columns[list(df1.columns.str.contains('date'))]) 54 | # Process data to be in the same format 55 | df1 = compat(df1, timevar = time) 56 | df2 = compat(df2, timevar = time) 57 | # Merge dfs 58 | index_cols = list(df1.columns[0:-1]) 59 | #Make sure merging columns are str 60 | df1[index_cols] = df1[index_cols].astype(str) 61 | df2[index_cols] = df2[index_cols].astype(str) 62 | cdf = df1.merge(df2, left_on = index_cols, right_on = index_cols) 63 | #--------------------# 64 | # Calculate differeces 65 | # Proportion of mismatches 66 | p_rows_diff = sum(cdf[cdf.columns[-1]] != cdf[cdf.columns[-2]])/cdf.shape[0] 67 | p_rows_diff = str(round(p_rows_diff, 4)*100) 68 | # Value difference 69 | cdf['pdiff'] = ((cdf[cdf.columns[-1]] - 70 | cdf[cdf.columns[-2]])/cdf[cdf.columns[-2]]) 71 | # Average difference 72 | avg_diff = str(round(cdf['pdiff'].mean(skipna = True), 4)*100) 73 | 74 | if outputdf: 75 | return(cdf) 76 | else: 77 | # Print report 78 | print(filename) 79 | print('N rows ours: ' + str(df1.shape[0]) ) 80 | print("N rows Isaac's: " + str(df2.shape[0])) 81 | print('Of matching rows:') 82 | print(' - Average difference of count column: ' + avg_diff + "%") 83 | print(' - Percentage rows that are different: ' + p_rows_diff + "%") 84 | print('\n') 85 | 86 | 87 | #-----------------------------------------------------------------# 88 | # Flowminder csvs 89 | for i 
in range(0, len(filenames)-1): 90 | file_i = filenames[i] 91 | # print(i) 92 | # print(filenames[i]) 93 | # Our file 94 | d1 = pd.read_csv(FLOWM_adm3_path + file_i) 95 | # I's file 96 | d2 = pd.read_csv(IFLOW + file_i) 97 | 98 | # Run comparisson 99 | print(i) 100 | print(filenames[i]) 101 | compare_dfs(d1,d2) 102 | 103 | #-----------------------------------------------------------------# 104 | # Custom indicatos csv 105 | 106 | # Indicator 1 # 107 | 108 | i1 = pd.read_csv(I1_Adm3_path + 'transactions_per_hour.csv') 109 | i1i = pd.read_csv(ICUST + 'transactions_per_hour.csv') 110 | 111 | # i1 = compat(i1, timevar = []) 112 | # i1i = compat(i1i, timevar = []) 113 | 114 | cdf = compare_dfs(i1,i1i, outputdf = True) 115 | cdf["diff_flag"] = cdf["count_x"] != cdf["count_y"] 116 | cdf['date'] = pd.to_datetime(cdf['hour']).dt.date 117 | 118 | 119 | foo = cdf[cdf['diff_flag']] 120 | 121 | foo.to_csv('C:/Users/wb519128/Desktop/i1_differences.csv', 122 | index = False) 123 | 124 | # Indicator 3 # 125 | 126 | i3 = pd.read_csv(I3_Adm3_path + 'unique_subscribers_per_day.csv') 127 | i3i = pd.read_csv(ICUST + 'unique_subscribers_per_day.csv') 128 | 129 | cdf3 = compare_dfs(i3,i3i, outputdf = True) 130 | cdf3["diff_flag"] = cdf3["count_x"] != cdf3["count_y"] 131 | cdf3['day'] = pd.to_datetime(cdf3['day']).dt.date 132 | 133 | 134 | cdf3[cdf3['day'] == dt.date(2020, 2, 3)] 135 | 136 | foo = cdf3[cdf3['diff_flag']] 137 | 138 | foo.to_csv('C:/Users/wb519128/Desktop/i3_differences.csv', 139 | index = False) 140 | 141 | 142 | 143 | # Indicator 5 # 144 | I5_Adm3_path 145 | 146 | i5 = pd.read_csv(I5_Adm3_path + 'origin_destination_connection_matrix_per_day.csv') 147 | i5i = pd.read_csv(ICUST + 'origin_destination_connection_matrix_per_day.csv') 148 | 149 | #cdf5 = compare_dfs(i5,i5i, outputdf = True) 150 | #cdf5["diff_flag"] = cdf5["total_count_x"] != cdf5["total_count_y"] 151 | 152 | compare_dfs(i5,i5i, outputdf = False) 153 | 154 | bar = i5.merge(i5i, on = ['connection_date', 'region_from', 'region_to']) 155 | bar["diff_flag"] = bar["od_count_x"] != bar["od_count_y"] 156 | 157 | diff_day_df = bar.groupby('connection_date').sum() 158 | diff_day_df = diff_day_df.reset_index() 159 | 160 | diff_day_df['day'] = pd.to_datetime(diff_day_df['connection_date']).dt.day 161 | 162 | plt.plot('day', 163 | 'diff', 164 | data = diff_day_df) 165 | 166 | # set(bar['connection_date']) 167 | # len(set(pd.to_datetime(foo['connection_date']).dt.date)) 168 | # len(set(foo['region_from'])) 169 | 170 | # # Absolute difference by day 171 | # bar['diff'] = bar['od_count_x']- bar['od_count_y'] 172 | 173 | 174 | 175 | # foo = bar[bar['diff_flag']] 176 | 177 | # foo['diff'] = foo['od_count_x']- foo['od_count_y'] 178 | # foo['diff'].mean() 179 | 180 | export_i5_merged = bar.rename( 181 | columns = { 182 | 'subscriber_count_x' : 'subscriber_count', 183 | 'subscriber_count_y' : 'subscriber_count_isaac', 184 | 'od_count_x': 'od_count_x', 185 | 'od_count_y': 'od_count_isaac', 186 | 'total_count_x' : 'total_count', 187 | 'total_count_y' : 'total_count_isaac'}) 188 | 189 | 190 | 191 | export_i5_merged\ 192 | .to_csv('C:/Users/wb519128/Desktop/i5_merged_with_Isaacs.csv', 193 | index = False) 194 | 195 | 196 | 197 | 198 | #-----------------------------------------------------------------# 199 | # DRAFT 200 | 201 | file_i = filenames[0] 202 | 203 | d1 = pd.read_csv(FLOWM_adm3_path + file_i) 204 | d2 = pd.read_csv(IFLOW + file_i) 205 | 206 | cdf = compare_dfs(d1,d2, outputdf = True) 207 | cdf["diff_flag"] = cdf["count_x"] != cdf["count_y"] 
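#-----------------------------------------------------------------#
# Illustrative addition (not part of the original draft): a one-line
# summary of how many merged rows were flagged as different in the
# draft comparison above.
n_flagged = int(cdf['diff_flag'].sum())
print('Rows compared: {:,}; rows flagged as different: {:,} ({:.2%})'.format(
    len(cdf), n_flagged, n_flagged / len(cdf)))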
-------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/outliers.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # Class to help counting outliers 3 | class outlier_counter: 4 | """Class to count outliers 5 | 6 | Attributes 7 | ---------- 8 | calls : a dataframe. which data to process 9 | spark : an initialised spark connection. 10 | thresholds : a dictionary with outlier thresholds to be used. 11 | 12 | Methods 13 | ------- 14 | count() 15 | count outliers and print results 16 | 17 | print_results(df) 18 | print results of outlier counts 19 | """ 20 | 21 | def __init__(self, 22 | calls, 23 | spark = spark, 24 | thresholds = {'min_transactions' : 3, 25 | 'max_avg_transactions' : 100, 26 | 'max_transactions_in_single_day' : 200}): 27 | """ 28 | Parameters 29 | ---------- 30 | 31 | """ 32 | self.calls = calls 33 | self.spark = spark 34 | self.counts = {} 35 | self.dfs = {} 36 | self.thresholds = thresholds 37 | 38 | 39 | def count(self): 40 | # count all records 41 | self.counts['all_records'] = self.calls.count() 42 | 43 | # count of days in dataframe 44 | self.counts['number_of_days'] = self.calls.select('call_date').distinct().count() 45 | 46 | # Count number of distinct users 47 | self.counts['distinct_ids'] = self.calls.select('msisdn').distinct().count() 48 | 49 | # Get # of records per user 50 | self.dfs['records_per_user'] = self.calls.groupby('msisdn').count() 51 | 52 | # Get # of records per user per day 53 | self.dfs['records_per_user_per_day'] = self.calls.groupby('msisdn', 'call_date').count() 54 | 55 | # Identify daily usage outlier msidsdn 56 | self.dfs['too_few_transactions'] = self.dfs['records_per_user']\ 57 | .where(F.col('count') < self.thresholds['min_transactions'])\ 58 | .select('msisdn').distinct() 59 | self.dfs['too_many_avg_transactions'] = self.dfs['records_per_user']\ 60 | .where(F.col('count') > (self.counts['number_of_days'] * \ 61 | self.thresholds['max_avg_transactions']))\ 62 | .select('msisdn').distinct() # more than __ calls and texts per day on average 63 | self.dfs['too_many_transactions_in_single_day'] = \ 64 | self.dfs['records_per_user_per_day']\ 65 | .where(F.col('count') > self.thresholds['max_transactions_in_single_day'])\ 66 | .select('msisdn').distinct() # more than __ calls and texts in a single day 67 | 68 | # Count the outlier accounts 69 | self.counts['too_few_transactions'] = \ 70 | self.dfs['too_few_transactions'].count() 71 | self.counts['too_many_avg_transactions'] = \ 72 | self.dfs['too_many_avg_transactions'].count() 73 | self.counts['too_many_transactions_in_single_day'] = \ 74 | self.dfs['too_many_transactions_in_single_day'].count() 75 | 76 | # Caclulate the outlier account fraction 77 | self.counts['too_few_transactions_fraction'] = \ 78 | self.counts['too_few_transactions'] / self.counts['distinct_ids'] 79 | self.counts['too_many_avg_transactions_fraction'] = \ 80 | self.counts['too_many_avg_transactions'] / self.counts['distinct_ids'] 81 | self.counts['too_many_transactions_in_single_day_fraction'] = \ 82 | self.counts['too_many_transactions_in_single_day'] / self.counts['distinct_ids'] 83 | 84 | # Keep only ids that aren't among the outlier accounts 85 | self.filtered_transactions = self.calls.join(self.dfs['too_few_transactions'], 86 | self.calls['msisdn'] == \ 87 | self.dfs['too_few_transactions']['msisdn'], 88 | how ='leftanti').select(self.calls.columns[0:]) 89 | self.filtered_transactions = 
self.filtered_transactions\ 90 | .join(self.dfs['too_many_avg_transactions'], 91 | self.filtered_transactions['msisdn'] == \ 92 | self.dfs['too_many_avg_transactions']['msisdn'], 93 | how ='leftanti')\ 94 | .select(self.filtered_transactions.columns[0:]) 95 | self.filtered_transactions = self.filtered_transactions\ 96 | .join(self.dfs['too_many_transactions_in_single_day'], 97 | self.filtered_transactions['msisdn'] == \ 98 | self.dfs['too_many_transactions_in_single_day']['msisdn'], 99 | how ='leftanti')\ 100 | .select(self.filtered_transactions.columns[0:]) 101 | 102 | # count how many we kept and dropped 103 | self.counts['filtered_transactions'] = self.filtered_transactions.count() 104 | self.counts['dropped_calls'] = \ 105 | self.counts['all_records'] - self.counts['filtered_transactions'] 106 | self.print_results() 107 | 108 | 109 | def print_results(self): 110 | print('Total number of unique SIMs: {:,}'.format(self.counts['distinct_ids'])) 111 | print('Number of SIMs with less than {} transactions: {:,}'\ 112 | .format(self.thresholds['min_transactions'], 113 | self.counts['too_few_transactions'])) 114 | print('Number of SIMs with more than {} transactions per day on average: {:,}'\ 115 | .format(self.thresholds['max_avg_transactions'], 116 | self.counts['too_many_avg_transactions'] )) 117 | print('Number of SIMs with more than {} transactions in a single day: {:,}'\ 118 | .format(self.thresholds['max_transactions_in_single_day'], 119 | self.counts['too_many_transactions_in_single_day'])) 120 | print('SIMs with less than {} transactions as a fraction of all accounts: {:.8f}'\ 121 | .format(self.thresholds['min_transactions'], 122 | self.counts['too_few_transactions_fraction'])) 123 | print('SIMs with more than {} transactions per day on average as a fraction of all accounts: {:.8f}'\ 124 | .format(self.thresholds['max_avg_transactions'], 125 | self.counts['too_many_avg_transactions_fraction'])) 126 | print('SIMs with more than {} transactions on a single day as a fraction of all accounts: {:.8f}'\ 127 | .format(self.thresholds['max_transactions_in_single_day'], 128 | self.counts['too_many_transactions_in_single_day_fraction'])) 129 | print('Number of transactions that would be kept: {:,}'\ 130 | .format(self.counts['filtered_transactions'])) 131 | print('Number of transactions that would be deleted: {:,}'\ 132 | .format(self.counts['dropped_calls'])) 133 | print('Fraction of transactions that would be deleted: {:.8f}'\ 134 | .format(self.counts['dropped_calls'] / self.counts['all_records'])) -------------------------------------------------------------------------------- /data-checks/Archive/01_completenes_checks.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # DATA CHECKS - Completeness checks 3 | #-----------------------------------------------------------------# 4 | 5 | #-----------------------------------------------------------------# 6 | # Settings 7 | 8 | from globals import * 9 | 10 | EXPORT_FIGURES = True 11 | 12 | # Default variable names 13 | timevar = 'hour' 14 | regvar = 'region' 15 | 16 | INDICATORS_path = DATA_path + 'isaac-results/Archive/e_23_07_2020_converage_23_05_to_30_06/' 17 | 18 | #-----------------------------------------------------------------# 19 | # Load data 20 | 21 | # Define loading function that depends on the existing folder 22 | # structure but also remove headers in the middle of the data if 23 | # if there is any 24 | def 
loadfiles(file_name, 25 | admin = 3, 26 | path = INDICATORS_path): 27 | print(file_name, admin) 28 | # Load external file 29 | folder = path + 'admin' + str(admin) + '/' 30 | de = None 31 | de = pd.read_csv(folder + file_name) 32 | # Patch cleannig of headers in the middle of the data 33 | c1_name = de.columns[0] 34 | de = de[~de[c1_name].astype(str).str.contains(c1_name)] 35 | return(de) 36 | 37 | 38 | # Indicator 1 39 | fi = loadfiles(file_name = 'transactions_per_hour.csv') 40 | 41 | # Indicator 2 42 | f2 = loadfiles('unique_subscribers_per_day.csv') 43 | 44 | # Indicator 5 45 | f5 = loadfiles('origin_destination_connection_matrix_per_day.csv') 46 | 47 | # Indicator 9 48 | f9 = loadfiles('week_home_vs_day_location_per_day.csv', admin = 2) 49 | 50 | 51 | #-----------------------------------------------------------------# 52 | # Processing data 53 | 54 | # Remove missings 55 | reg_missings_bol = fi['region'].isin(missing_values) 56 | fi_cl = fi[~reg_missings_bol] 57 | 58 | # Check for duplicates 59 | # sum(fi_cl.duplicated()) 60 | fi_cl['count'] = fi_cl['count'].astype(int) 61 | 62 | # Date vars 63 | fi_cl['date'] = pd.to_datetime(fi_cl['hour']).dt.date 64 | # fi_cl['hour'] = pd.to_datetime(fi_cl[timevar]).dt.hour 65 | # fi_cl['month'] = pd.to_datetime(fi_cl['date']).dt.month 66 | 67 | # Make sure dates are datetime 68 | fi_cl['hour'] = fi_cl['hour'].astype('datetime64') 69 | 70 | 71 | # I5 72 | f5['date'] = pd.to_datetime(f5['connection_date']).dt.date 73 | 74 | 75 | #-----------------------------------------------------------------# 76 | # Create aggregated datasets to the country level for ploting 77 | 78 | #---------------------------- 79 | # I1 - transactions per hour 80 | 81 | # Create plots data 82 | f1_agg_hour = fi_cl\ 83 | .groupby(['date', 'hour'])\ 84 | .agg({'region' : pd.Series.nunique , 85 | 'count' : np.sum})\ 86 | .reset_index()\ 87 | .sort_values(['date', 'hour'])\ 88 | .rename(columns = {'region' : 'n_regions'}) 89 | 90 | f1_agg_date = fi_cl\ 91 | .groupby('date')\ 92 | .agg({'region' : pd.Series.nunique , 93 | 'count' : np.sum})\ 94 | .reset_index()\ 95 | .sort_values(['date'])\ 96 | .rename(columns = {'region' : 'n_regions'}) 97 | 98 | #---------------------------- 99 | # I5 - OD matrix per day data 100 | 101 | f5['date'] = pd.to_datetime(f5['connection_date']).dt.date 102 | 103 | f5_agg_date = f5\ 104 | .groupby('date')\ 105 | .agg({'region_from' : pd.Series.nunique , 106 | 'region_to' : pd.Series.nunique, 107 | 'total_count' : np.sum})\ 108 | .reset_index()\ 109 | .sort_values('date') 110 | 111 | #---------------------------- 112 | # Complete dates and time 113 | 114 | # Create data sets with time indexes and fill blanks with 0s 115 | def time_complete(data, timevar = timevar, timefreq = 'D'): 116 | data[timevar] = data[timevar].astype('datetime64') 117 | full_time_range = pd.date_range(data[timevar].min(), 118 | data[timevar].max(), 119 | freq = timefreq) 120 | data = data.set_index(timevar) 121 | data = data.reindex(full_time_range, fill_value=0) 122 | return(data) 123 | 124 | f1_agg_date = time_complete(f1_agg_date, 'date') 125 | f1_agg_hour = time_complete(f1_agg_hour, 'hour', 'H') 126 | f5_agg_date = time_complete(f5_agg_date, 'date') 127 | 128 | #-----------------------------------------------------------------# 129 | # I1 - Day Plots 130 | 131 | # PLot number of regions with transactions per day. 
132 | 133 | # Number of regions plot 134 | plt.figure(figsize=(12, 6)) 135 | date_plot = sns.lineplot(f1_agg_date.index, 136 | f1_agg_date['n_regions']) 137 | # Export 138 | date_plot.figure.savefig(OUT_path + "i1_dates_ward_count.png") 139 | 140 | 141 | # Number of transactions plot 142 | plt.figure(figsize=(12, 6)) 143 | obs_per_day_plot = sns.lineplot( 144 | f1_agg_date.index, 145 | f1_agg_date['count']) 146 | # Export 147 | if EXPORT_FIGURES: 148 | obs_per_day_plot.figure.savefig(OUT_path + "i1_dates_n_obs.png") 149 | 150 | 151 | #-----------------------------------------------------------------# 152 | # I1 - Hour Plots 153 | 154 | # Plot total number of transactions per hour to check for outliers 155 | 156 | #------------------ 157 | # Number of regions 158 | plt.figure(figsize=(12, 6)) 159 | hour_plot = sns.lineplot( 160 | f1_agg_hour.index, 161 | f1_agg_hour['n_regions']) 162 | 163 | # Cosmetics 164 | # x_ticks = list(set(fi_agg_hour['hour'].astype(str)))[0:len(fi_agg_hour):5] 165 | # x_ticks.sort() 166 | # hour_plot.set_xticklabels(x_ticks) 167 | 168 | # Export 169 | if EXPORT_FIGURES: 170 | hour_plot.figure.savefig(OUT_path + "i1_hours_ward_count.png") 171 | 172 | #---------------------------- 173 | # Total count of transactions 174 | plt.figure(figsize=(12, 6)) 175 | obs_per_hour_plot = sns.lineplot( 176 | f1_agg_hour.index.values, 177 | f1_agg_hour['count']) 178 | 179 | # Cosmetics 180 | # x_ticks = list(set(fi_agg_hour['date'].astype(str)))[0:len(fi_agg_hour):5] 181 | # x_ticks.sort() 182 | # obs_per_hour_plot.set_xticklabels(x_ticks) 183 | 184 | # Export 185 | if EXPORT_FIGURES: 186 | obs_per_hour_plot.figure.savefig(OUT_path + "i1_hours_n_obs.png") 187 | 188 | 189 | # Table with hours 190 | # fi_obs_per_hour[fi_obs_per_hour['date'] == dt.date(2020, 4, 30)] 191 | # apr30 = f1_agg_hour[f1_agg_hour['date'] == dt.date(2020, 4, 30)] 192 | 193 | # apr30.to_csv(OUT_path + "i1_hour_apr30.csv", 194 | # index = False) 195 | 196 | 197 | #-----------------------------------------------------------------# 198 | # I5 - Day Plots 199 | 200 | # Plot total number of movements per day 201 | 202 | # plot total count 203 | f5_plot = sns.lineplot( 204 | f5_agg_date.index, 205 | f5_agg_date['total_count']) 206 | # Export 207 | if EXPORT_FIGURES: 208 | f5_plot.figure.savefig(OUT_path + "i5_dates_total_count.png") 209 | 210 | 211 | #-----------------------------------------------------------------# 212 | # I9 - Week plots 213 | 214 | 215 | # f9_plot = sns.lineplot( 216 | # f9_agg_date['week'], 217 | # f9_agg_date['mean_distance']) 218 | # # Export 219 | # f9_plot.figure.savefig(OUT_path + "i9_week_mean_distance.png") 220 | -------------------------------------------------------------------------------- /data-panel/Archive/panel_draft2.py: -------------------------------------------------------------------------------- 1 | 2 | # Custom suffixes? 3 | # Class?? 
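# Draft: builds admin-2 panels for i5 and i7 by appending externally
# produced extractions, switching source at fixed cut-over dates
# (2020-03-15 and 2020-04-01).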
4 | 5 | EXPORT = False 6 | 7 | #-----------------------------------------------------------------# 8 | # Settings 9 | 10 | import os 11 | import re 12 | import pandas as pd 13 | import numpy as np 14 | import datetime as dt 15 | 16 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/" 17 | DATA_POC = DATA_path + "proof-of-concept/" 18 | 19 | OUT_panel = DATA_POC + "panel_indicators/" 20 | 21 | 22 | # CHANGE: 23 | IRESULTS = DATA_path + "Isaac-results/" 24 | 25 | IFLOW_path = IRESULTS + "flowminder/" 26 | ICUST_path = IRESULTS + "custom/" 27 | 28 | INEW_PATH_2_mar = IRESULTS + "Archive/e_17_06_2020_coverage_03_to_04/admin2_priority/mar1-mar31/" 29 | INEW_PATH_2_apr = IRESULTS + "Archive/e_17_06_2020_coverage_03_to_04/admin2_priority/mar23-apr30/" 30 | 31 | INEW_PATH_3_mar = IRESULTS + "Archive/e_17_06_2020_coverage_03_to_04/admin3_priority/mar1-mar31/" 32 | INEW_PATH_3_apr = IRESULTS + "Archive/e_17_06_2020_coverage_03_to_04/admin3_priority/mar23-apr30/" 33 | 34 | 35 | IOLD_PATH_2_mar = IRESULTS + "custom/admin2/" 36 | IOLD_PATH_3_mar = IRESULTS + "Archive/e_08_06_2020_coverage_04_to_05/admin3_custom/" 37 | 38 | 39 | # Load list of internal indicators to make it 40 | # easier to bulk load files 41 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/" 42 | 43 | internal_indicators = pd.read_csv(DATA_POC + 'indicators_list.csv') 44 | internal_indicators['path'] = DATA_path + internal_indicators['path'] 45 | 46 | # Load files function 47 | def loadfiles(file_name, 48 | files_df = internal_indicators, 49 | admin = 3, 50 | path_external = None): 51 | if path_external is None: 52 | # Set intex 53 | idx = files_df[(files_df['file'] == file_name) & (files_df['level'] == admin)].index.values[0] # Load internal 54 | # Custom file names for i5, i7 and i9 55 | if file_name in ['mean_distance_per_day', 56 | 'origin_destination_connection_matrix_per_day', 57 | 'mean_distance_per_week', 58 | 'month_home_vs_day_location_per_day', 59 | 'week_home_vs_day_location_per_day']: 60 | file_name_i = file_name + '_7day_limit.csv' 61 | else: 62 | file_name_i = file_name + '.csv' 63 | # External names 64 | print(file_name, admin) 65 | # Load data 66 | d = None 67 | d = pd.read_csv(files_df['path'][idx] + file_name_i) 68 | else: 69 | print(file_name) 70 | file_name = file_name + '.csv' 71 | d = None 72 | d = pd.read_csv(path_external + file_name) 73 | # Patch clean of headers in the middle of the data 74 | c1_name = d.columns[0] 75 | d = d[~d[c1_name].astype(str).str.contains(c1_name)] 76 | # Turn everything to string for simplicity 77 | d.astype(str) 78 | return d 79 | 80 | # i1 = loadfiles('transactions_per_hour') 81 | 82 | # i1e = loadfiles('transactions_per_hour', 83 | # path_external= INEW_PATH_3_apr) 84 | 85 | # Drop custom missigs 86 | def drop_custna(data, columns): 87 | na_list = ['nan', '', '99999', float("inf")] 88 | for cols in columns: 89 | data = data[~(data[cols].isin(na_list))] 90 | return(data) 91 | 92 | # Clean function 93 | def clean(d, index_cols): 94 | # Remove missins 95 | d = d.dropna() 96 | # All but the last column 97 | #index_cols = list(d.columns[0:-1]) 98 | # d = drop_custna(d, index_cols) 99 | return(d) 100 | 101 | #-----------------------------------------------------------------# 102 | # Load indicators 103 | i5_index = ['connection_date', 'region_from', 'region_to'] 104 | 105 | i5 = loadfiles('origin_destination_connection_matrix_per_day', 106 | admin = 2) 107 | i5e_mar = loadfiles('origin_destination_connection_matrix_per_day', 108 | 
path_external= INEW_PATH_2_mar) 109 | i5e_apr = loadfiles('origin_destination_connection_matrix_per_day', 110 | path_external= INEW_PATH_2_apr) 111 | 112 | i7_index = ['home_region', 'day'] 113 | i7 = loadfiles('mean_distance_per_day', admin = 2) 114 | 115 | # March files where only rerun for i5 and i9 so I'm using the old extraction from feb to apr 116 | i7e_mar = loadfiles('mean_distance_per_day', 117 | path_external= IOLD_PATH_2_mar) 118 | i7e_apr = loadfiles('mean_distance_per_day', 119 | path_external= INEW_PATH_2_apr) 120 | 121 | #-----------------------------------------------------------------# 122 | # Panel 123 | # Create panel 124 | def panel(d, 125 | de, 126 | index_cols, 127 | #countvars, 128 | r_suffix = '_ecnt', 129 | timevar = None, 130 | how = 'outer'): 131 | if timevar is None: 132 | timevar = index_cols[0] 133 | # MAke sure time var is date 134 | d[timevar] = d[timevar].astype('datetime64') 135 | de[timevar] = de[timevar].astype('datetime64') 136 | # Join 137 | md = d.merge(de, 138 | on = index_cols, 139 | how = how, 140 | suffixes=('', r_suffix)) 141 | return md 142 | 143 | 144 | d1_bol = (p7['day'] >= np.datetime64(dt.date(2020, 3, 15))) 145 | d2_bol = (p7['day'] >= np.datetime64(dt.date(2020, 4, 1))) 146 | 147 | #--------# 148 | # i5 Panel 149 | p5 = panel(i5, i5e_mar, i5_index, timevar = 'connection_date') 150 | p5 = panel(p5, 151 | i5e_apr, 152 | i5_index, 153 | r_suffix= '_ecnt_apr', 154 | timevar = 'connection_date') 155 | 156 | d1_bol = (p5['connection_date'] >= np.datetime64(dt.date(2020, 3, 15))) 157 | d2_bol = (p5['connection_date'] >= np.datetime64(dt.date(2020, 4, 1))) 158 | 159 | 160 | countvars = ['subscriber_count','od_count', 'total_count'] 161 | for var in countvars: 162 | varname = var + '_p' 163 | # Base value as our indicator 164 | p5[varname] = p5[var] 165 | # Replace values based on dates 166 | p5.loc[d1_bol, varname] = p5.loc[d1_bol, var + '_ecnt'] 167 | p5.loc[d2_bol, varname] = p5.loc[d2_bol, var + '_ecnt_apr'] 168 | 169 | p5 = p5.dropna(subset = ['connection_date']).sort_values(i5_index) 170 | 171 | # p5.to_csv('C:/Users/wb519128/Desktop/i5_test.csv', index = False) 172 | 173 | if EXPORT: 174 | p5.to_csv(OUT_panel + 'i5_admin2_temp.csv', index = False) 175 | 176 | #--------# 177 | # i7 Panel 178 | p7 = panel(i7, i7e_mar, i7_index, timevar = 'day') 179 | p7 = panel(p7, 180 | i7e_apr, 181 | i7_index, 182 | r_suffix= '_ecnt_apr', 183 | timevar = 'day') 184 | 185 | 186 | d1_bol = (p7['day'] >= np.datetime64(dt.date(2020, 3, 15))) 187 | d2_bol = (p7['day'] >= np.datetime64(dt.date(2020, 4, 1))) 188 | 189 | countvars = ['mean_distance', 'stdev_distance'] 190 | for var in countvars: 191 | varname = var + '_p' 192 | # Base value as our indicator 193 | p7[varname] = p7[var] 194 | # Replace values based on dates 195 | p7.loc[d1_bol, varname] = p7.loc[d1_bol, var + '_ecnt'] 196 | p7.loc[d2_bol, varname] = p7.loc[d2_bol, var + '_ecnt_apr'] 197 | 198 | 199 | 200 | 201 | p7 = p7.dropna(subset = ['day']).sort_values(i7_index) 202 | 203 | # Export 204 | if EXPORT: 205 | p7.to_csv(OUT_panel + 'i7_admin2_temp.csv', index = False) 206 | 207 | 208 | # p7.to_csv('C:/Users/wb519128/Desktop/i7_test.csv', index = False) 209 | 210 | -------------------------------------------------------------------------------- /data-panel/Archive/panel_draft.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # CREATE PANEL 3 | 
#-----------------------------------------------------------------# 4 | 5 | # This file combines two different sources of the indicators created 6 | # in cdr-aggregation to create a panel. 7 | 8 | # Dates at which different sources are connected are specific to 9 | # each indicator and the particularities of those sources 10 | 11 | #-----------------------------------------------------------------# 12 | # TO DO 13 | 14 | # Rewrite load function 15 | 16 | # Reorganize file paths to remove dependency on MASTER.py 17 | 18 | #-----------------------------------------------------------------# 19 | # Settings 20 | 21 | import os 22 | import re 23 | import pandas as pd 24 | import numpy as np 25 | import datetime as dt 26 | 27 | #-----------------------------------------------------------------# 28 | # Globals 29 | 30 | # Default connection date. 31 | append_date = dt.date(2020, 3, 15) 32 | 33 | #-----------------------------------------------------------------# 34 | # Function definitions 35 | 36 | # Drop custom missigs 37 | def drop_custna(data, columns): 38 | na_list = ['nan', '', '99999', float("inf")] 39 | for cols in columns: 40 | data = data[~(data[cols].isin(na_list))] 41 | return(data) 42 | 43 | # Load files function 44 | def loadfiles(file_name, 45 | files_df = internal_indicators, 46 | admin = 3): 47 | # Set intex 48 | idx = files_df[(files_df['file'] == file_name) & (files_df['level'] == admin)].index.values[0] # Load internal 49 | # Custom file names for i5, i7 and i9 50 | if file_name in ['mean_distance_per_day', 51 | 'origin_destination_connection_matrix_per_day', 52 | 'mean_distance_per_week', 53 | 'month_home_vs_day_location_per_day', 54 | 'week_home_vs_day_location_per_day']: 55 | file_name_i = file_name + '_7day_limit.csv' 56 | else: 57 | file_name_i = file_name + '.csv' 58 | # External names 59 | file_name_e = file_name + '.csv' 60 | print(file_name, admin) 61 | # Load data 62 | d = None 63 | d = pd.read_csv(files_df['path'][idx] + file_name_i) 64 | # Load external 65 | if files_df['indicator'][idx] == 'flow': 66 | ext_path = IFLOW_path 67 | else: 68 | ext_path = ICUST_path 69 | # Load external file 70 | ext_folder = ext_path + 'admin' + str(files_df['level'][idx]) + '/' 71 | de = None 72 | de = pd.read_csv(ext_folder + file_name_e) 73 | # Patch cleannig of headers in the middle of the data 74 | c1_name = d.columns[0] 75 | de = de[~de[c1_name].astype(str).str.contains(c1_name)] 76 | return([d, de]) 77 | 78 | # Clean function 79 | def clean(d, index_cols): 80 | # Remove missins 81 | d = d.dropna() 82 | # All but the last column 83 | #index_cols = list(d.columns[0:-1]) 84 | d = drop_custna(d, index_cols) 85 | return(d) 86 | 87 | # Create panel 88 | def simp_panel(d, 89 | de, 90 | index_cols, 91 | #countvars, 92 | append_date, 93 | compare = False, 94 | timevar = None, 95 | how = 'outer'): 96 | if timevar is None: 97 | timevar = index_cols[0] 98 | # Clean 99 | d = clean(d, index_cols) 100 | de = clean(de, index_cols) 101 | # Join 102 | md = d.merge(de, 103 | on = index_cols, 104 | how = how, 105 | suffixes=('', '_ecnt')) 106 | # Replace count values with internal until the 7th of march and 107 | # external after 108 | countvars = list(set(d.columns) - set(index_cols)) 109 | for var in countvars: 110 | if compare: 111 | varname = var + '_p' 112 | else: 113 | varname = var 114 | 115 | md[varname] = np.where(pd.to_datetime(md[timevar]).dt.date <= append_date, 116 | md[var], 117 | md[var + '_ecnt']) 118 | # Remove other columns 119 | if not compare: 120 | md = 
md.filter(regex=r'^((?!_ecnt).)*$') 121 | # Return 122 | return md.sort_values(index_cols).dropna(subset= index_cols) 123 | 124 | #-----------------------------------------------------------------# 125 | # Load indicators 126 | 127 | # Define indicator class that 128 | class i_indicator: 129 | """ 130 | This class contains information to load indicator files both 131 | from our original indicators and externally created ones. 132 | 133 | load() method loads both datasets 134 | clean() method removes missings from both datasets 135 | """ 136 | def __init__(self, 137 | file_name, 138 | index_cols, 139 | admin = 3): 140 | self.file_name = file_name 141 | self.index_cols = index_cols 142 | self.admin = admin 143 | # Call methods when intializing 144 | self.load() 145 | self.clean() 146 | # Load data 147 | def load(self): 148 | self.data, self.data_e = loadfiles(self.file_name, 149 | admin = self.admin) 150 | # Clean data 151 | def clean(self): 152 | self.data = clean(self.data, self.index_cols) 153 | self.data_e = clean(self.data_e, self.index_cols) 154 | 155 | # Create panel data 156 | def create_panel(self, 157 | timevar = None, 158 | compare = False, 159 | append_date = append_date): 160 | panel = simp_panel(self.data, 161 | self.data_e, 162 | self.index_cols, 163 | append_date, 164 | compare = compare, 165 | timevar=timevar) 166 | return panel 167 | 168 | # Indicator 1 169 | # Sum across all observations in the given hour and lowest admin 170 | # area. 171 | i1 = i_indicator('transactions_per_hour', 172 | ['hour', 'region']) 173 | 174 | # Indicator 2 175 | # Sum all unique subscribers with an observation in the given 176 | # admin area and time period. 177 | i2 = i_indicator('unique_subscribers_per_hour', 178 | ['hour', 'region']) 179 | 180 | 181 | # Indicator 3 182 | # Sum all unique subscribers with an observation in the given 183 | # admin area and time period. 
184 | i3 = i_indicator('unique_subscribers_per_day', 185 | ['day', 'region']) 186 | 187 | # Indicator 4 188 | # i4 = i_indicator('percent_of_all_subscribers_active_per_day', 189 | # ['home_region', 'day']) 190 | 191 | # Indicator 5 192 | i5 = i_indicator('origin_destination_connection_matrix_per_day', 193 | ['connection_date', 'region_from', 'region_to']) 194 | # Indicator 7 195 | i7 = i_indicator('mean_distance_per_day', 196 | ['home_region', 'day']) 197 | 198 | # Indicator 8 199 | i8 = i_indicator('mean_distance_per_week', 200 | ['home_region', 'week']) 201 | 202 | # Indicator 9 203 | i9 = i_indicator('week_home_vs_day_location_per_day', 204 | ['region', 'home_region', 'day'], 205 | admin = 2) 206 | 207 | #-----------------------------------------------------------------# 208 | # Create panel 209 | 210 | # Make particular changes to indicators as needed here 211 | 212 | # Panel with defaults 213 | i_list = [i1, i2, i3, i5, i9] 214 | panel_list = list(map(lambda x: x.create_panel() , i_list)) 215 | 216 | # Custom arguments 217 | i7_p = i7.create_panel( timevar = 'day') 218 | 219 | #-----------------------------------------------------------------# 220 | # Export 221 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/aggregator.py: -------------------------------------------------------------------------------- 1 | import os 2 | if os.environ['HOME'] != '/root': 3 | from modules.DataSource import * 4 | from modules.sql_code_aggregates import * 5 | databricks = False 6 | else: 7 | databricks = True 8 | 9 | # Databricks notebook source 10 | class aggregator: 11 | """Class to handle aggregations. 12 | 13 | 14 | Attributes 15 | ---------- 16 | result_stub : a string. File path where to save results 17 | datasource : an instance of DataSource class. Holds all dataframes and paths required 18 | regions : a pyspark dataframe. Admin level this aggregator will be used for 19 | calls : a pyspark dataframe. cdr data 20 | cells : a pyspark dataframe. admin region to tower mapping 21 | spark : an initialised spark connection. spark connection this aggregator should use 22 | dates : a dictionary. 
dates the aggregator should run over 23 | intermediate_tables : tables that we don't want written to csv 24 | 25 | 26 | Methods 27 | ------- 28 | create_sql_dates() 29 | Convert the dates to strings to be used in the flowminder sql queries 30 | 31 | create_view(df, table_name) 32 | Creates a view of a dataframe 33 | 34 | save(table_name) 35 | Repartitions a dataframe into a single partition and writes it to a csv file 36 | 37 | save_and_report(table_name) 38 | Checks whether csv file exists before saving table_name to csv 39 | 40 | rename_csv(table_name) 41 | - rename a specific csv 42 | - move a csv to parent folder, rename it, then delete its remaining folder 43 | 44 | rename_all_csvs(table_name) 45 | renames all csvs at once 46 | 47 | rename_if_not_existing(table_name) 48 | rename only if the file doesn't exist as csv yet, handles errors 49 | 50 | check_if_file_exists(table_name) 51 | checks whether a csv exists before we re-create 52 | 53 | 54 | 55 | """ 56 | 57 | def __init__(self, 58 | result_stub, 59 | datasource, 60 | regions, 61 | intermediate_tables = ['home_locations']): 62 | """ 63 | Parameters 64 | ---------- 65 | result_stub : where to save results 66 | datasource : holds all dataframes and paths required 67 | regions : admin level this aggregator will be used for 68 | intermediate_tables : tables that we don't want written to csv 69 | """ 70 | self.datasource = datasource 71 | self.result_path = datasource.results_path + result_stub 72 | self.calls = datasource.parquet_df 73 | self.calls.createOrReplaceTempView('calls') 74 | self.cells = getattr(datasource, regions) 75 | self.cells.createOrReplaceTempView("cells") 76 | self.spark = datasource.spark 77 | self.dates = datasource.dates 78 | self.create_sql_dates() 79 | self.sql_code = write_sql_code(calls = self.calls, 80 | start_date = self.dates_sql['start_date'], 81 | end_date = self.dates_sql['end_date'], 82 | start_date_weeks = self.dates_sql['start_date_weeks'], 83 | end_date_weeks = self.dates_sql['end_date_weeks']) 84 | self.table_names = self.sql_code.keys() 85 | self.intermediate_tables = intermediate_tables 86 | 87 | def create_sql_dates(self): 88 | self.dates_sql = {'start_date' : "\'" + self.dates['start_date'].isoformat('-')[:10] + "\'", 89 | 'end_date' : "\'" + self.dates['end_date'].isoformat('-')[:10] + "\'", 90 | 'start_date_weeks' : "\'" + self.dates['start_date_weeks'].isoformat('-')[:10] + "\'", 91 | 'end_date_weeks' : "\'" + self.dates['end_date_weeks'].isoformat('-')[:10] + "\'"} 92 | 93 | def create_view(self, df, table_name): 94 | df.createOrReplaceTempView(table_name) 95 | 96 | def save(self, df, table_name): 97 | df.repartition(1).write.mode('overwrite').format('com.databricks.spark.csv') \ 98 | .save(os.path.join(self.result_path, table_name), header = 'true') 99 | 100 | def save_and_report(self, df, table_name): 101 | if table_name not in self.intermediate_tables: 102 | if self.check_if_file_exists(table_name): 103 | print('Skipped: ' + table_name) 104 | else: 105 | print('--> File does not exist. 
Saving: ' + table_name) 106 | self.save(df, table_name) 107 | else: 108 | print('Caching: home_locations') 109 | df.createOrReplaceTempView("home_locations") 110 | self.spark.sql('CACHE TABLE home_locations').collect() 111 | self.create_view(df, table_name) 112 | return table_name 113 | 114 | def rename_csv(self, table_name): 115 | # move one folder up and rename to human-legible .csv name 116 | if databricks: 117 | dbutils.fs.mv(dbutils.fs.ls(self.result_path + '/' + table_name)[-1].path, 118 | self.result_path + '/' + table_name + '.csv') 119 | # remove the old folder 120 | dbutils.fs.rm(self.result_path + '/' + table_name + '/', recurse = True) 121 | else: 122 | os.rename(glob.glob(os.path.join(self.result_path, table_name + '/*.csv'))[0], 123 | os.path.join(self.result_path, table_name + '.csv')) 124 | shutil.rmtree(os.path.join(self.result_path, table_name)) 125 | 126 | def save_and_rename_one(self, df, table_name): 127 | self.rename_if_not_existing(self.save_and_report(df, table_name)) 128 | 129 | def rename_all_csvs(self): 130 | for table_name in self.table_names: 131 | if table_name in self.intermediate_tables: 132 | pass 133 | else: 134 | self.rename_if_not_existing(table_name) 135 | 136 | def rename_if_not_existing(self, table_name): 137 | if databricks: 138 | try: 139 | # does the csv already exist 140 | dbutils.fs.ls(self.result_path + '/' + table_name + '.csv') 141 | except Exception as e: 142 | # the csv doesn't exist yet, move the file and delete the folder 143 | if 'java.io.FileNotFoundException' in str(e): 144 | print('--> Renaming: ' + table_name) 145 | self.rename_csv(table_name) 146 | else: 147 | raise 148 | else: 149 | if os.path.exists(self.result_path + '/' + table_name + '.csv'): 150 | pass 151 | else: 152 | print('--> Renaming: ' + table_name) 153 | self.rename_csv(table_name) 154 | 155 | def check_if_file_exists(self, table_name): 156 | if databricks: 157 | try: 158 | # does the folder exist? 159 | dbutils.fs.ls(self.result_path + '/' + table_name) 160 | return True 161 | except Exception as e: 162 | # the folder does not exist 163 | if 'java.io.FileNotFoundException' in str(e): 164 | try: 165 | # does the csv exist? 
166 | dbutils.fs.ls(self.result_path + '/' + table_name + '.csv') 167 | return True 168 | except Exception as e: 169 | # the csv does not exist 170 | if 'java.io.FileNotFoundException' in str(e): 171 | return False 172 | else: 173 | raise 174 | else: 175 | raise 176 | else: 177 | return os.path.exists(self.result_path + '/' + table_name) | \ 178 | os.path.exists(self.result_path + '/' + table_name + '.csv') 179 | -------------------------------------------------------------------------------- /data-checks/Archive/od_scaling.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # OD matrix scaling checks 3 | #-----------------------------------------------------------------# 4 | 5 | # This code depends on MASTER.py to run as file path objects are 6 | # defined there 7 | 8 | #-----------------------------------------------------------------# 9 | # Settings 10 | 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | import datetime 14 | import os 15 | 16 | 17 | #-----------------------------------------------------------------# 18 | # Load data 19 | od = pd.read_csv(I5_Adm3_path + 20 | "origin_destination_connection_matrix_per_day.csv") 21 | 22 | 23 | # Number of residents 24 | res = pd.read_csv(FLOWM_adm3_path + 25 | "home_location_counts_per_region.csv") 26 | 27 | # Active residents 28 | ares = pd.read_csv(FLOWM_adm3_path + 29 | "count_unique_active_residents_per_region_per_day.csv") 30 | 31 | # Number of calls 32 | cal = pd.read_csv(FLOWM_adm3_path + 33 | "total_calls_per_region_per_day.csv") 34 | 35 | 36 | #-----------------------------------------------------------------# 37 | # Process data 38 | 39 | # Create date variable 40 | def convert_dates(df,date_col ='connection_date'): 41 | df['date'] = pd.\ 42 | to_datetime(df[date_col]).\ 43 | dt.date 44 | return(df) 45 | 46 | od = convert_dates(od, 'connection_date') 47 | ares = convert_dates(ares, 'visit_date') 48 | cal = convert_dates(cal, 'call_date') 49 | 50 | #-----------------------------------------------------------------# 51 | # Create different scaling factors 52 | 53 | #--------------------# 54 | # Create new variables 55 | 56 | # Number of active subscribers over total residents 57 | ares = ares.merge(res.rename(columns={"subscriber_count" : "residents"}), 58 | on = 'region', 59 | how='outer') 60 | 61 | ares = ares.rename(columns={"subscriber_count" : 'active_res'}) 62 | 63 | # Check pp > 1 !!!! 
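# (Illustrative note, not in the original script: once p_active_res is
#  computed below, (ares['p_active_res'] > 1).mean() gives the share of
#  region-days where active subscribers exceed recorded residents.)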
64 | ares['p_active_res'] = ares['active_res']/ares['residents'] 65 | 66 | 67 | 68 | # Number of calls over residents 69 | cal = cal.merge(res.rename(columns={"subscriber_count" : "residents"}), 70 | on = 'region', 71 | how='outer') 72 | 73 | cal['p_cals'] = cal['total_calls']/cal['residents'] 74 | 75 | #------------------------------# 76 | # Add new variables to od matrix 77 | 78 | # Proportion of active residents in orig and dest 79 | od = od.\ 80 | merge(ares[['region','date', 'p_active_res']], 81 | left_on= ['region_from','date'], 82 | right_on= ['region', 'date'], 83 | how='left').\ 84 | rename(columns={'p_active_res' : 'p_active_res_O'}).\ 85 | drop(columns='region').\ 86 | merge(ares[['region','date', 'p_active_res']], 87 | left_on= ['region_to','date'], 88 | right_on= ['region', 'date'], 89 | how='left').\ 90 | rename(columns={'p_active_res' : 'p_active_res_D'}).\ 91 | drop(columns='region') 92 | 93 | 94 | # Proportion of calls per resident in orig and dest 95 | od = od.\ 96 | merge(cal[['region','date', 'p_cals']], 97 | left_on= ['region_from','date'], 98 | right_on= ['region', 'date'], 99 | how='left').\ 100 | rename(columns={'p_cals' : 'p_cals_O'}).\ 101 | drop(columns='region').\ 102 | merge(cal[['region','date', 'p_cals']], 103 | left_on= ['region_to','date'], 104 | right_on= ['region', 'date'], 105 | how='left').\ 106 | rename(columns={'p_cals' : 'p_cals_D'}).\ 107 | drop(columns='region') 108 | 109 | 110 | #-----------------# 111 | # Create indicators 112 | 113 | # Product of the proportions of active residents in origin and 114 | # destination 115 | od['w1'] = od['p_active_res_O'] * od['p_active_res_D'] 116 | 117 | 118 | # Sum of calls per resident in origin and destination 119 | od['w2'] = od['p_cals_O'] + od['p_cals_D'] 120 | 121 | 122 | # od['p_cals_O'].isnull().sum()/od.shape[0] 123 | # 0.5159950493247425 124 | 125 | #-----------------------------------------------------------------# 126 | # Create scaled values 127 | od['total_count_w1'] = od['total_count']/od['w1'] 128 | 129 | od['total_count_w2'] = od['total_count']/od['w2'] 130 | 131 | #-----------------------------------------------------------------# 132 | # Plot 133 | 134 | # Set origin region 135 | od1 = od[od['region_from'] == 'ZW102109'] 136 | 137 | # Select a set of destinations 138 | # od1_top_dest = ['ZW120435','ZW142513','ZW192205', 139 | # 'ZW130720','ZW170530' ] 140 | 141 | od1_top_dest = od1['region_to'].value_counts().head(9).index 142 | 143 | # Create plot df 144 | # p1_df = od1[od1['region_to'] == 'ZW120435'] 145 | p1_df = od1[od1['region_to'].isin(od1_top_dest)] 146 | p1_df.set_index(['date'],inplace=True) 147 | 148 | 149 | # Plot function that adds the plot directly to the grid 150 | def add_plts(dest_value, 151 | grid_pos, 152 | df = p1_df, 153 | dest_var = 'region_to', 154 | #x_axis = 'connection_date', 155 | y_axis = 'total_count'): 156 | 157 | df[df[dest_var] == dest_value].\ 158 | plot(y= y_axis, 159 | legend= False, 160 | ax = fig.add_subplot(grid_pos)) 161 | 162 | # Run plots 163 | # # Ugly hack.
Do this better if there is time 164 | # def plots_together(var): 165 | # fig, ax = plt.subplots(nrows=3,ncols=3) 166 | # fig = plt.figure() 167 | # gs = fig.add_gridspec(3, 3) 168 | 169 | # add_plts(od1_top_dest[0], gs[0, 0], y_axis = var) 170 | # add_plts(od1_top_dest[1], gs[0, 1], y_axis = var) 171 | # add_plts(od1_top_dest[2], gs[0, 2], y_axis = var) 172 | # add_plts(od1_top_dest[3], gs[1, 0], y_axis = var) 173 | # add_plts(od1_top_dest[4], gs[1, 1], y_axis = var) 174 | # add_plts(od1_top_dest[5], gs[1, 2], y_axis = var) 175 | # add_plts(od1_top_dest[6], gs[2, 0], y_axis = var) 176 | # add_plts(od1_top_dest[7], gs[2, 1], y_axis = var) 177 | # add_plts(od1_top_dest[8], gs[2, 2], y_axis = var) 178 | 179 | # return(fig) 180 | # # fig.savefig('C:/Users/wb519128/Desktop/' + var + '.png') 181 | 182 | # plots_together('total_count') 183 | 184 | var = 'total_count' 185 | 186 | # Set plot parameters 187 | fig, ax = plt.subplots(nrows=3,ncols=3) 188 | fig = plt.figure() 189 | gs = fig.add_gridspec(3, 3) 190 | 191 | 192 | add_plts(od1_top_dest[0], gs[0, 0], y_axis = var) 193 | add_plts(od1_top_dest[1], gs[0, 1], y_axis = var) 194 | add_plts(od1_top_dest[2], gs[0, 2], y_axis = var) 195 | add_plts(od1_top_dest[3], gs[1, 0], y_axis = var) 196 | add_plts(od1_top_dest[4], gs[1, 1], y_axis = var) 197 | add_plts(od1_top_dest[5], gs[1, 2], y_axis = var) 198 | add_plts(od1_top_dest[6], gs[2, 0], y_axis = var) 199 | add_plts(od1_top_dest[7], gs[2, 1], y_axis = var) 200 | add_plts(od1_top_dest[8], gs[2, 2], y_axis = var) 201 | 202 | # Export 203 | fig.savefig('C:/Users/wb519128/Desktop/' + var + '.png') 204 | 205 | 206 | var = 'total_count_w2' 207 | 208 | # Set plot parameters 209 | fig, ax = plt.subplots(nrows=3,ncols=3) 210 | fig = plt.figure() 211 | gs = fig.add_gridspec(3, 3) 212 | 213 | 214 | add_plts(od1_top_dest[0], gs[0, 0], y_axis = var) 215 | add_plts(od1_top_dest[1], gs[0, 1], y_axis = var) 216 | add_plts(od1_top_dest[2], gs[0, 2], y_axis = var) 217 | add_plts(od1_top_dest[3], gs[1, 0], y_axis = var) 218 | add_plts(od1_top_dest[4], gs[1, 1], y_axis = var) 219 | add_plts(od1_top_dest[5], gs[1, 2], y_axis = var) 220 | add_plts(od1_top_dest[6], gs[2, 0], y_axis = var) 221 | add_plts(od1_top_dest[7], gs[2, 1], y_axis = var) 222 | add_plts(od1_top_dest[8], gs[2, 2], y_axis = var) 223 | 224 | # Export 225 | fig.savefig('C:/Users/wb519128/Desktop/' + var + '.png') 226 | 227 | var = 'total_count_w1' 228 | 229 | # Set plot parameters 230 | fig, ax = plt.subplots(nrows=3,ncols=3) 231 | fig = plt.figure() 232 | gs = fig.add_gridspec(3, 3) 233 | 234 | 235 | add_plts(od1_top_dest[0], gs[0, 0], y_axis = var) 236 | add_plts(od1_top_dest[1], gs[0, 1], y_axis = var) 237 | add_plts(od1_top_dest[2], gs[0, 2], y_axis = var) 238 | add_plts(od1_top_dest[3], gs[1, 0], y_axis = var) 239 | add_plts(od1_top_dest[4], gs[1, 1], y_axis = var) 240 | add_plts(od1_top_dest[5], gs[1, 2], y_axis = var) 241 | add_plts(od1_top_dest[6], gs[2, 0], y_axis = var) 242 | add_plts(od1_top_dest[7], gs[2, 1], y_axis = var) 243 | add_plts(od1_top_dest[8], gs[2, 2], y_axis = var) 244 | 245 | # Export 246 | fig.savefig('C:/Users/wb519128/Desktop/' + var + '.png') 247 | 248 | 249 | # df = p1_df 250 | # dest_value = od1_top_dest[0] 251 | # dest_var = 'region_to' 252 | # x_axis = 'connection_date' 253 | # y_axis = 'total_count' 254 | 255 | # df[df[dest_var] == dest_value].\ 256 | # plot(y= y_axis, 257 | # legend= False, 258 | # fontsize=6, 259 | # rot= 30) 260 | # plt.show() 261 | 
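# Worked example of the scaling defined above (illustrative numbers only, not
# taken from the data): if 40% of the origin's residents and 50% of the
# destination's residents were active on a day (p_active_res_O = 0.4,
# p_active_res_D = 0.5), then w1 = 0.4 * 0.5 = 0.2 and an observed flow of
# total_count = 120 becomes total_count_w1 = 120 / 0.2 = 600. For w2, with
# 0.8 calls per resident in the origin and 1.2 in the destination,
# w2 = 0.8 + 1.2 = 2.0 and total_count_w2 = 120 / 2.0 = 60.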
-------------------------------------------------------------------------------- /data-checks/Archive/quick_checks/ward_neighbors_tower_down.R: -------------------------------------------------------------------------------- 1 | # Check subscribers data 2 | 3 | FIG_PATH <- file.path(PROJECT_PATH, "proof-of-concept", 4 | "outputs", "data-checks", "figures_indicators", "subscribers_neighbors_daily") 5 | 6 | FIG_PATH_OUTLIER <- file.path(PROJECT_PATH, "proof-of-concept", 7 | "outputs", "data-checks", "figures_indicators", "subscribers_neighbors_daily_outlier") 8 | 9 | # Load Data -------------------------------------------------------------------- 10 | ISAAC_DATA_PATH_2 <- file.path(PROJECT_PATH, "Isaac-results", "Isaac_apr_may", "admin2_flowminder") 11 | ISAAC_DATA_PATH_3 <- file.path(PROJECT_PATH, "Isaac-results", "Isaac_apr_may", "admin3_flowminder") 12 | 13 | #### Wards 14 | wards_sp <- readRDS(file.path(CLEAN_DATA_ADM3_PATH, "wards_aggregated.Rds")) 15 | 16 | #### Tower down 17 | towers_down <- read.csv(file.path(PROOF_CONCEPT_PATH, 18 | "outputs", 19 | "data-checks", 20 | "days_wards_with_low_hours_I1_panel.csv")) 21 | 22 | towers_down <- towers_down %>% 23 | dplyr::select(region, date) %>% 24 | mutate(tower_down = T) %>% 25 | mutate(date = date %>% as.character %>% as.Date(), 26 | region = region %>% as.character()) 27 | 28 | #### Raw Data 29 | df_day_adm2_raw <- read.csv(file.path(ISAAC_DATA_PATH_2, 30 | "count_unique_subscribers_per_region_per_day.csv"), 31 | stringsAsFactors=F) %>% 32 | dplyr::rename(value_raw = subscriber_count, 33 | date = visit_date) %>% 34 | dplyr::mutate(region = region %>% as.character(), 35 | date = date %>% as.Date()) 36 | 37 | df_week_adm2_raw <- read.csv(file.path(ISAAC_DATA_PATH_2, 38 | "count_unique_subscribers_per_region_per_week.csv"), 39 | stringsAsFactors=F) %>% 40 | dplyr::rename(value_raw = subscriber_count, 41 | date = visit_week) %>% 42 | dplyr::mutate(region = region %>% as.character()) 43 | 44 | df_day_adm3_raw <- read.csv(file.path(ISAAC_DATA_PATH_3, 45 | "count_unique_subscribers_per_region_per_day.csv"), 46 | stringsAsFactors=F) %>% 47 | dplyr::rename(value_raw = subscriber_count, 48 | date = visit_date) %>% 49 | dplyr::mutate(region = region %>% as.character(), 50 | date = date %>% as.Date()) 51 | 52 | df_week_adm3_raw <- read.csv(file.path(ISAAC_DATA_PATH_3, 53 | "count_unique_subscribers_per_region_per_week.csv"), 54 | stringsAsFactors=F) %>% 55 | dplyr::rename(value_raw = subscriber_count, 56 | date = visit_week) %>% 57 | dplyr::mutate(region = region %>% as.character()) 58 | 59 | #### Cleaned Data 60 | df_day_adm2 <- readRDS(file.path(CLEAN_DATA_ADM2_PATH, 61 | "count_unique_subscribers_per_region_per_day.Rds")) %>% 62 | left_join(df_day_adm2_raw, by=c("date", "region")) 63 | 64 | df_week_adm2 <- readRDS(file.path(CLEAN_DATA_ADM2_PATH, 65 | "count_unique_subscribers_per_region_per_week.Rds")) 66 | 67 | df_day_adm3 <- readRDS(file.path(CLEAN_DATA_ADM3_PATH, 68 | "count_unique_subscribers_per_region_per_day.Rds")) %>% 69 | left_join(df_day_adm3_raw, by=c("date", "region")) %>% 70 | mutate(value_raw = value_raw %>% as.numeric()) 71 | 72 | df_week_adm3 <- readRDS(file.path(CLEAN_DATA_ADM3_PATH, 73 | "count_unique_subscribers_per_region_per_week.Rds")) 74 | 75 | # Create Ward Neighbors -------------------------------------------------------- 76 | #### Region and id datasets 77 | ward_id_df <- wards_sp@data %>% 78 | dplyr::select(region) %>% 79 | mutate(id = 1:n()) 80 | 81 | #### Create neighbor matrix 82 | neighbor_df <- gTouches(wards_sp, 
byid=TRUE) %>% 83 | as.data.frame() %>% 84 | mutate(id = 1:n()) %>% 85 | pivot_longer(-id) %>% 86 | dplyr::rename(n_id = name, 87 | neighbors = value) %>% 88 | dplyr::mutate(n_id = n_id %>% as.numeric()) %>% 89 | 90 | # id_n (neighbor) region 91 | left_join(ward_id_df, by = c("n_id" = "id")) %>% 92 | dplyr::rename(n_region = region) %>% 93 | 94 | # id region 95 | left_join(ward_id_df, by = "id") %>% 96 | 97 | # restrict to neighbors 98 | filter(neighbors %in% T) 99 | 100 | #### Merge data to neighbor matrix 101 | ward_data <- df_day_adm3 %>% 102 | dplyr::select(region, date, value, value_raw) 103 | 104 | neighbor_df <- neighbor_df %>% 105 | 106 | # neighbor data 107 | left_join(ward_data, by = c("n_region" = "region")) %>% 108 | dplyr::rename(value_n = value, 109 | value_raw_n = value_raw) %>% 110 | 111 | # ward data 112 | left_join(ward_data, by = c("region", "date")) 113 | 114 | #### Merge in Neighbor down 115 | neighbor_df <- neighbor_df %>% 116 | left_join(towers_down, by = c("region", "date")) %>% 117 | 118 | # tower down on any day? 119 | group_by(region) %>% 120 | mutate(tower_down_anyday = (TRUE %in% tower_down)) %>% 121 | 122 | # restrict to observations where tower down on any day 123 | filter(tower_down_anyday %in% T) 124 | 125 | #### Merge in province 126 | prov_df <- wards_sp@data %>% 127 | dplyr::select(region, province) 128 | 129 | neighbor_df <- neighbor_df %>% 130 | left_join(prov_df, by="region") 131 | 132 | # Neighbor Stats --------------------------------------------------------------- 133 | # TODO: Not naming things well, should be value_n_raw_avg, for example 134 | #### Average neighbor value 135 | neighbor_df <- neighbor_df %>% 136 | group_by(region, date) %>% 137 | mutate(value_n_avg = mean(value_raw_n, na.rm=T)) 138 | 139 | #### Percen change of neighbor value from average 140 | neighbor_df <- neighbor_df %>% 141 | group_by(n_region) %>% 142 | mutate(region_n_value_avg = mean(value_raw_n, na.rm=T)) %>% 143 | mutate(region_n_value_pc = (value_raw_n - region_n_value_avg)/region_n_value_avg) %>% 144 | mutate(region_n_value_pc_max = max(region_n_value_pc, na.rm=T)) 145 | 146 | # Export Datset ---------------------------------------------------------------- 147 | #neighbor_df_clean <- neighbor_df %>% 148 | # dplyr::select(region, n_region, date, value_n) 149 | 150 | #head(neighbor_df) 151 | 152 | 153 | 154 | # Trends Over Time ------------------------------------------------------------- 155 | neighbor_df %>% 156 | filter(id %in% 10) %>% 157 | ggplot() + 158 | geom_vline(data = . %>% filter(tower_down), aes(xintercept = date), 159 | color = "gray50", size=2, alpha = 0.2) + 160 | geom_line(aes(x=date, y=value_raw_n, 161 | group=n_id %>% as.factor(), 162 | color=n_id %>% as.factor())) + 163 | geom_line(aes(x=date, y=value_raw), size=2, color="black") + 164 | theme_minimal() + 165 | theme(legend.position = "none") 166 | 167 | 168 | lapply(unique(neighbor_df$province), function(province_i){ 169 | print(province_i) 170 | 171 | p <- neighbor_df %>% 172 | filter(province %in% province_i) %>% 173 | ggplot() + 174 | geom_vline(data = . 
%>% filter(tower_down), aes(xintercept = date), 175 | color = "gray50", size=2, alpha = 0.2) + 176 | geom_line(aes(x=date, y=value_raw), size=1.5, color="black") + 177 | geom_line(aes(x=date, y=value_n_avg), size=1.5, color="red") + 178 | geom_line(aes(x=date, y=value_raw_n, 179 | group=n_id %>% as.factor(), 180 | color=n_id %>% as.factor()), 181 | size=.4) + 182 | theme_minimal() + 183 | theme(legend.position = "none") + 184 | facet_wrap(~region, 185 | scales = "free_y") 186 | 187 | ggsave(p, filename = file.path(FIG_PATH, paste0(province_i, ".png")), height = 25, width = 25) 188 | 189 | return(NULL) 190 | }) 191 | 192 | # Bad Cases ------------------------------------------------------------- 193 | for(percent in c(50, 75, 100)){ 194 | 195 | print(percent) 196 | 197 | neighbor_df_bad <- neighbor_df %>% 198 | mutate(keep = (tower_down %in% TRUE) & (region_n_value_pc > percent/100)) %>% 199 | group_by(region) %>% 200 | mutate(keep_any = (TRUE %in% keep)) %>% 201 | ungroup() %>% 202 | filter(keep_any %in% TRUE) %>% 203 | filter(region_n_value_pc_max > percent/100) 204 | 205 | p_bad <- neighbor_df_bad %>% 206 | ggplot() + 207 | geom_vline(data = . %>% filter(tower_down), aes(xintercept = date), 208 | color = "gray50", size=2, alpha = 0.2) + 209 | geom_line(aes(x=date, y=value_raw), size=1.75, color="black") + 210 | geom_line(aes(x=date, y=value_raw_n, 211 | group=n_id %>% as.factor(), 212 | color=n_id %>% as.factor()), 213 | size=1) + 214 | 215 | #geom_line(aes(x=date, y=value_n_avg), size=1.5, color="red") + 216 | theme_minimal() + 217 | theme(legend.position = "none") + 218 | facet_wrap(~region, 219 | scales = "free_y") 220 | 221 | ggsave(p_bad, filename = file.path(FIG_PATH_OUTLIER, paste0(percent, "percent_thresh.png")), height = 25, width = 25) 222 | } 223 | 224 | 225 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/tower_clustering.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import geopandas as gpd 3 | import numpy as np 4 | import pandas as pd 5 | from shapely.geometry import Polygon, LineString 6 | from sklearn.neighbors import DistanceMetric 7 | from scipy.spatial.distance import squareform 8 | from scipy.cluster.hierarchy import linkage 9 | from scipy.cluster.hierarchy import fcluster 10 | from copy import deepcopy 11 | import os 12 | if os.environ['HOME'] != '/root': 13 | from modules.utilities import * 14 | databricks = False 15 | else: 16 | databricks = True 17 | 18 | 19 | ## Class to handle spark and df in session 20 | class tower_clusterer: 21 | """Class to cluster towers together. 22 | 23 | 24 | Attributes 25 | ---------- 26 | datasource : an instance of DataSource class. 27 | shape : a geopandas dataframe. Shapefile to use for clustering 28 | region_var : a string. Name of the region variable in the shapefile. 29 | sites : a string. Name of the attribute of datasource that holds the tower coordinates. 30 | shape_df : a pyspark dataframe. Shapefile to use for clustering, in pyspark df. 31 | spark : an initialised spark connection 32 | spark_df : a pyspark dataframe. Holds the cdr data 33 | result_path : a string. Where to save results. 34 | filename : a string. Name for result file. 35 | dist : a string. Metric to use to calculate distances. 36 | sites : a pyspark dataframe. Code, Lat, Lng for all tower_sites 37 | sites_with_clusters : a pyspark dataframe. 
Clustered sites (once methods have run) 38 | 39 | 40 | 41 | Methods 42 | ------- 43 | cluster_towers() 44 | runs clustering algorithm 45 | 46 | get_centroids() 47 | computes centroids of clusters 48 | 49 | map_to_regions() 50 | maps cluster centroids to admin regions 51 | 52 | save_results() 53 | saves the results to csv 54 | 55 | """ 56 | 57 | def __init__(self, 58 | datasource, 59 | shape, 60 | region_var, 61 | sites = 'tower_sites'): 62 | """ 63 | Parameters 64 | ---------- 65 | datasource : an instance of DataSource class. 66 | shape : a geopandas dataframe. Shapefile to use for clustering 67 | region_var : a string. Name of the region variable in the shapefile. 68 | sites : a string. Name of the attribute of datasource that holds the tower coordinates. 69 | """ 70 | self.datasource = datasource 71 | self.spark = datasource.spark 72 | self.shape = getattr(datasource, shape + '_gpd') 73 | self.shape_df = getattr(datasource, shape) 74 | self.result_path = datasource.results_path 75 | self.filename = shape 76 | self.region_var = region_var 77 | self.dist = DistanceMetric.get_metric('haversine') 78 | sites_df = getattr(datasource, sites + '_pd') 79 | if (sites_df.columns == ['cell_id', 'LAT', 'LNG']).all(): 80 | self.sites = sites_df[sites_df.LAT.notna()] 81 | self.sites_with_clusters = self.sites 82 | else: 83 | raise 'The sites dataframe does not have the correct columns / \ 84 | column order. Should be cell_id, LAT, LNG' 85 | 86 | def cluster_towers(self): 87 | ## deepcopy sites since we will need it later on 88 | self.radians = deepcopy(self.sites) 89 | # convert degrees to radians 90 | self.radians['LAT'] = np.radians(self.sites['LAT']) 91 | self.radians['LNG'] = np.radians(self.sites['LNG']) 92 | # run clustering algorithm 93 | self.clusters = fcluster( 94 | linkage( 95 | squareform( 96 | self.dist.pairwise(self.radians[['LAT','LNG']]\ 97 | .to_numpy())*6373), method='ward'), t = 1, criterion = 'distance') 98 | self.sites_with_clusters = self.radians 99 | self.sites_with_clusters['cluster'] = self.clusters 100 | # compute centroids of clusters 101 | self.get_centroids() 102 | self.sites_with_clusters['LAT'] = np.rad2deg(self.sites_with_clusters['LAT']) 103 | self.sites_with_clusters['LNG'] = np.rad2deg(self.sites_with_clusters['LNG']) 104 | self.sites_with_clusters['centroid_LAT'] = \ 105 | np.rad2deg(self.sites_with_clusters['centroid_LAT']) 106 | self.sites_with_clusters['centroid_LNG'] = \ 107 | np.rad2deg(self.sites_with_clusters['centroid_LNG']) 108 | # put clusters in geodataframe 109 | self.sites_gpd = gpd.GeoDataFrame(self.sites_with_clusters, 110 | geometry=gpd.points_from_xy( 111 | self.sites_with_clusters.centroid_LNG, 112 | self.sites_with_clusters.centroid_LAT), 113 | crs = 'epsg:4326') 114 | # compute distances between cluters 115 | self.distances_pd = pd.DataFrame( 116 | self.dist.pairwise( 117 | np.radians( 118 | self.sites_with_clusters[['centroid_LAT','centroid_LNG']])\ 119 | .to_numpy())*6373, columns=self.sites_with_clusters.cell_id.unique(), 120 | index=self.sites_with_clusters.cell_id.unique()) 121 | # create long form of distance matrix 122 | distances = [] 123 | origin = [] 124 | destination = [] 125 | for a in self.distances_pd.index: 126 | for b in self.distances_pd.index: 127 | distances.append(self.distances_pd.loc[a,b]) 128 | origin.append(a) 129 | destination.append(b) 130 | self.distances_pd_long = pd.DataFrame(list(zip(distances, origin, destination)), 131 | columns =['distance', 'origin', 'destination']) 132 | # map clusters to regions 133 | 
self.map_to_regions() 134 | return self.save_results() 135 | 136 | def get_centroids(self): 137 | # loop through clusters to compute centroids 138 | for cluster_num in self.sites_with_clusters.cluster.unique(): 139 | subset = self.sites_with_clusters[self.sites_with_clusters.cluster == cluster_num] 140 | # use line method if we have only two towers in cluster 141 | if len(subset) == 2: 142 | line = LineString(subset.loc[:,['LNG', 'LAT']].to_numpy()) 143 | self.sites_with_clusters.loc[self.sites_with_clusters.cluster == \ 144 | cluster_num, 'centroid_LNG'] = line.interpolate(0.5, normalized = True).x 145 | self.sites_with_clusters.loc[self.sites_with_clusters.cluster == \ 146 | cluster_num, 'centroid_LAT'] = line.interpolate(0.5, normalized = True).y 147 | # use polygon method if we have more than two towers in cluster 148 | if len(subset) > 2: 149 | self.sites_with_clusters.loc[self.sites_with_clusters.cluster == \ 150 | cluster_num, 'centroid_LNG'] = \ 151 | Polygon(subset.loc[:,['LNG', 'LAT']].to_numpy()).convex_hull.centroid.x 152 | self.sites_with_clusters.loc[self.sites_with_clusters.cluster == \ 153 | cluster_num, 'centroid_LAT'] = \ 154 | Polygon(subset.loc[:,['LNG', 'LAT']].to_numpy()).convex_hull.centroid.y 155 | # replace NAs 156 | self.sites_with_clusters.loc[self.sites_with_clusters.centroid_LAT.isna(), 157 | 'centroid_LNG'] = \ 158 | self.sites_with_clusters.loc[self.sites_with_clusters.centroid_LAT.isna(), 'LNG'] 159 | self.sites_with_clusters.loc[self.sites_with_clusters.centroid_LAT.isna(), 160 | 'centroid_LAT'] = \ 161 | self.sites_with_clusters.loc[self.sites_with_clusters.centroid_LAT.isna(), 'LAT'] 162 | 163 | def map_to_regions(self): 164 | # spatial join clusteres with shapefile 165 | self.joined = gpd.sjoin(self.sites_gpd, self.shape, op="intersects") 166 | 167 | def save_results(self): 168 | # save results of mapping of clusters to regions 169 | self.joined = self.joined.rename(columns={self.region_var:'region'}) 170 | self.towers_regions_clusters_all_vars = \ 171 | self.joined.loc[:,['cell_id', 'LAT', 'LNG', 'centroid_LAT', 172 | 'centroid_LNG', 'region', 'cluster']] 173 | self.towers_regions_clusters_all_vars = \ 174 | self.spark.createDataFrame(self.towers_regions_clusters_all_vars) 175 | save_csv(self.towers_regions_clusters_all_vars, 176 | self.result_path, 177 | self.datasource.country_code + '_' + self.filename + '_tower_map_all_vars') 178 | # save results with only essential variables, for use in data processing 179 | self.towers_regions_clusters = \ 180 | self.joined.loc[:,['cell_id', 'region']] 181 | self.towers_regions_clusters = \ 182 | self.spark.createDataFrame(self.towers_regions_clusters) 183 | save_csv(self.towers_regions_clusters, 184 | self.result_path, 185 | self.datasource.country_code + '_' + self.filename + '_tower_map') 186 | # save distance matrix in long form 187 | self.distances_df_long = \ 188 | self.spark.createDataFrame(self.distances_pd_long) 189 | save_csv(self.distances_df_long, 190 | self.result_path, self.datasource.country_code + '_distances_pd_long') 191 | # save shapefile used, for dashboarding 192 | save_csv(self.shape_df, self.result_path, 193 | self.datasource.country_code + '_' + self.filename + '_shapefile') 194 | return self.towers_regions_clusters, self.distances_df_long 195 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/aggregation_master.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 
coding: utf-8 3 | 4 | # # Production of indicators for the COVID19 Mobility Task Force 5 | # 6 | # In this notebook we produce indicators for the [COVID19 Mobility Task Force](https://github.com/worldbank/covid-mobile-data). 7 | # 8 | # [Flowminder](https://covid19.flowminder.org) indicators are produced to increase the availability of comparable datasets across countries, and have been copied without modification from the [Flowminder COVID-19 github repository](https://github.com/Flowminder/COVID-19) (except for the start and end dates). These have been supplemented by a set of *priority* indicators with data for ingestion into the dashboard in this repository. 9 | # 10 | # We produce the indicators in the following three steps: 11 | # 12 | # - **Import code**: The code for the aggregation is included in the 'custom_aggregation' and 'flowminder_aggregation' scripts. 13 | # - **Import data**: 14 | # To set up the data import we need to place the CDR data files into the `data/new/CC/telco/` folder, where we replace `CC` with the country code and `telco` with the company abbreviation. 15 | # We also need to place csv files with the tower-region mapping and distance matrices into the `data/support-data/CC/telco/geofiles` folder, and then modify the `data/support_data/config_file.py` to specify: 16 | # - *geofiles*: the names of the geofiles, 17 | # - *country_code*: country code and company abbreviation, 18 | # - *telecom_alias*: the path to the `data` folder, 19 | # - *data_paths*: the names of the subfolders in `data/new/CC/telco/` that hold the csv files. Simply change this to `[*]` if you didn't create subfolders and want to load all files. 20 | # - *dates*: the start and end date of the data you want to produce the indicators for. 21 | # 22 | # For more information about the `config_file.py` settings, see the [github page](https://github.com/worldbank/covid-mobile-data/tree/master/cdr-aggregation); an illustrative sketch of these settings is shown below. 23 | # 24 | # - **Run aggregations**: By default, we produce all flowminder and priority indicators. We've included 4 retries in case of failure, which we have found helpful on Databricks but which is probably irrelevant in other settings. Note that before you can re-run these aggregations, you need to move the csv outputs that have been saved in `data/results/CC/telco/` in previous runs to another folder, else these indicators will be skipped. This prevents you from accidentally overwriting previous results. This way you can also delete only the files for the indicators you want to reproduce, and skip any indicators you don't want to reproduce. 25 | # 26 | # The outcome of this effort will be used to inform policy making using a [mobility indicator dashboard](https://github.com/worldbank/covid-mobile-data/tree/master/dashboard-dataviz). 
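#
# As a quick orientation, here is an illustrative sketch of those settings
# (placeholder values only -- the authoritative key names and value formats are
# given in `config_file_template.py` / `config_file_template_hive.py`; the sketch
# assumes the settings are collected in the `datasource_configs` object passed to
# `DataSource` below):
#
# ```python
# datasource_configs = {
#     'country_code':  'CC',              # country code and company abbreviation
#     'telecom_alias': '/path/to/data',   # path to the `data` folder
#     'data_paths':    ['*'],             # subfolders of data/new/CC/telco/ to load
#     'geofiles':      {...},             # names of the tower-region mapping and distance matrix csvs
#     'dates':         {...},             # start and end date of the data to aggregate
# }
# ```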
27 | 28 | # # Import code 29 | 30 | # In[1]: 31 | 32 | 33 | get_ipython().run_line_magic('load_ext', 'autoreload') 34 | get_ipython().run_line_magic('autoreload', '2') 35 | 36 | 37 | # In[2]: 38 | 39 | 40 | from modules.DataSource import * 41 | 42 | 43 | # In[3]: 44 | 45 | 46 | config_file = '../config_file.py' 47 | 48 | 49 | # In[4]: 50 | 51 | 52 | exec(open(config_file).read()) 53 | 54 | 55 | # In[5]: 56 | 57 | 58 | ds = DataSource(datasource_configs) 59 | ds.show_config() 60 | 61 | 62 | # In[6]: 63 | 64 | 65 | from modules.setup import * 66 | 67 | 68 | # # Import data 69 | 70 | # ## Load CDR data 71 | 72 | # ### Process/standardize raw data, save as parquet, and then load it 73 | 74 | # In[7]: 75 | 76 | 77 | # ds.standardize_csv_files(show=True) 78 | # ds.save_as_parquet() 79 | 80 | 81 | # In[8]: 82 | 83 | 84 | #ds.load_standardized_parquet_file() 85 | 86 | 87 | # ### Alternatively, specify and load hive table 88 | 89 | # In[9]: 90 | 91 | 92 | # # Specify and load hive data 93 | # ds.parquet_df = ds.spark.sql("""SELECT {} AS msisdn, 94 | # {} AS call_datetime, 95 | # {} AS location_id FROM {}""".format(ds.hive_vars['msisdn'], 96 | # ds.hive_vars['call_datetime'], 97 | # ds.hive_vars['location_id'], 98 | # ds.hive_vars['calls'])) 99 | 100 | 101 | # ### Or load a sample file 102 | 103 | # In[10]: 104 | 105 | 106 | ## Use this in case you want to sample the data and run the code on the sample 107 | 108 | # #ds.sample_and_save(number_of_ids=1000) 109 | ds.load_sample('sample_feb_mar2020') 110 | ds.parquet_df = ds.sample_df 111 | 112 | 113 | # ## Load geo data 114 | 115 | # In[11]: 116 | 117 | 118 | ds.load_geo_csvs() 119 | 120 | 121 | # In[12]: 122 | 123 | 124 | ## Use this in case you want to cluster the towers and create a distance matrix 125 | 126 | # ds.create_gpds() 127 | # from modules.tower_clustering import * 128 | # clusterer = tower_clusterer(ds, 'admin2', 'ID_2') 129 | # ds.admin2_tower_map, ds.distances = clusterer.cluster_towers() 130 | # clusterer = tower_clusterer(ds, 'admin3', 'ADM3_PCODE') 131 | # ds.admin3_tower_map, ds.distances = clusterer.cluster_towers() 132 | 133 | 134 | # In[13]: 135 | 136 | 137 | ## Use this in case you want to create a voronoi tesselation 138 | 139 | # from modules.voronoi import * 140 | # voronoi = voronoi_maker(ds, 'admin3', 'ADM3_PCODE') 141 | # ds.voronoi = voronoi.make_voronoi() 142 | 143 | 144 | # # Run aggregations 145 | 146 | # ## Flowminder indicators for admin2 147 | 148 | # In[14]: 149 | 150 | 151 | agg_flowminder_admin2 = flowminder_aggregator(result_stub = '/admin2/flowminder', 152 | datasource = ds, 153 | regions = 'admin2_tower_map') 154 | 155 | agg_flowminder_admin2.attempt_aggregation() 156 | 157 | 158 | # ## Flowminder indicators for admin3 159 | 160 | # In[15]: 161 | 162 | 163 | agg_flowminder_admin3 = flowminder_aggregator(result_stub = '/admin3/flowminder', 164 | datasource = ds, 165 | regions = 'admin3_tower_map') 166 | 167 | agg_flowminder_admin3.attempt_aggregation() 168 | 169 | 170 | # ## Priority indicators for admin2 171 | 172 | # In[16]: 173 | 174 | 175 | agg_priority_admin2 = priority_aggregator(result_stub = '/admin2/priority', 176 | datasource = ds, 177 | regions = 'admin2_tower_map') 178 | 179 | agg_priority_admin2.attempt_aggregation(indicators_to_produce = {'unique_subscribers_per_day' : ['unique_subscribers', 'day'], 180 | 'percent_of_all_subscribers_active_per_day' : ['percent_of_all_subscribers_active', 'day'], 181 | 'origin_destination_connection_matrix_per_day' : ['origin_destination_connection_matrix', 
'day'], 182 | 'mean_distance_per_day' : ['mean_distance', 'day'], 183 | 'mean_distance_per_week' : ['mean_distance', 'week'], 184 | 'origin_destination_matrix_time_per_day' : ['origin_destination_matrix_time', 'day'], 185 | 'home_vs_day_location_per_day' : ['home_vs_day_location_per_day', ['day','week']], 186 | 'home_vs_day_location_per_day' : ['home_vs_day_location_per_day', ['day','month']]}) 187 | 188 | 189 | # ## Priority indicators for admin3 190 | 191 | # In[17]: 192 | 193 | 194 | agg_priority_admin3 = priority_aggregator(result_stub = '/admin3/priority', 195 | datasource = ds, 196 | regions = 'admin3_tower_map') 197 | 198 | agg_priority_admin3.attempt_aggregation(indicators_to_produce = {'transactions_per_hour' : ['transactions', 'hour'], 199 | 'transactions_per_hour' : ['transactions', 'hour']}) 200 | 201 | 202 | # ## Scaled priority indicators for admin2 203 | 204 | # In[ ]: 205 | 206 | 207 | agg_scaled_admin2 = scaled_aggregator(result_stub = '/admin2/scaled', 208 | datasource = ds, 209 | regions = 'admin2_tower_map') 210 | 211 | agg_scaled_admin2.attempt_aggregation() 212 | 213 | 214 | # ## Priority indicators for tower-cluster 215 | 216 | # In[ ]: 217 | 218 | 219 | agg_priority_tower = priority_aggregator(result_stub = '/voronoi/priority', 220 | datasource = ds, 221 | regions = 'voronoi_tower_map') 222 | 223 | agg_priority_tower.attempt_aggregation(indicators_to_produce = {'unique_subscribers_per_hour' : ['unique_subscribers', 'hour'], 224 | 'mean_distance_per_day' : ['mean_distance', 'day'], 225 | 'mean_distance_per_week' : ['mean_distance', 'week']}) 226 | 227 | 228 | # In[ ]: 229 | 230 | 231 | agg_priority_tower_harare = priority_aggregator(result_stub = '/voronoi/priority/harare', 232 | datasource = ds, 233 | regions = 'voronoi_tower_map_harare') 234 | 235 | agg_priority_tower_harare.attempt_aggregation(indicators_to_produce = {'origin_destination_connection_matrix_per_day' : ['origin_destination_connection_matrix', 'day']}) 236 | 237 | 238 | # In[ ]: 239 | 240 | 241 | agg_priority_tower_bulawayo = priority_aggregator(result_stub = '/voronoi/priority/bulawayo', 242 | datasource = ds, 243 | regions = 'voronoi_tower_map_bulawayo') 244 | 245 | agg_priority_tower_bulawayo.attempt_aggregation(indicators_to_produce = {'origin_destination_connection_matrix_per_day' : ['origin_destination_connection_matrix', 'day']}) 246 | 247 | 248 | # # Produce script 249 | 250 | # In[ ]: 251 | 252 | 253 | get_ipython().system('jupyter nbconvert --to script *.ipynb') 254 | 255 | 256 | # In[ ]: 257 | 258 | 259 | 260 | 261 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/sql_code_aggregates.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | def write_sql_code(calls = 'calls', 3 | start_date = "\'2020-02-01\'", 4 | end_date = "\'2020-03-31\'", 5 | start_date_weeks = "\'2020-02-03\'", 6 | end_date_weeks = "\'2020-03-29\'"): 7 | 8 | sql_code = { 9 | # Aggregate 1 (April 1 version) 10 | 'count_unique_subscribers_per_region_per_day' : 11 | """ 12 | SELECT * FROM ( 13 | SELECT calls.call_date AS visit_date, 14 | cells.region AS region, 15 | count(DISTINCT msisdn) AS subscriber_count 16 | FROM calls 17 | INNER JOIN cells 18 | ON calls.location_id = cells.cell_id 19 | WHERE calls.call_date >= {} 20 | AND calls.call_date <= CURRENT_DATE 21 | GROUP BY 1, 2 22 | ) AS grouped 23 | WHERE grouped.subscriber_count >= 15 24 | """.format(start_date), 25 | 26 | # Intermediate 
Result - Home location 27 | 'home_locations' : 28 | """ 29 | SELECT msisdn, region FROM ( 30 | SELECT 31 | msisdn, 32 | region, 33 | row_number() OVER ( 34 | PARTITION BY msisdn 35 | ORDER BY total DESC, latest_date DESC 36 | ) AS daily_location_rank 37 | FROM ( 38 | 39 | SELECT msisdn, 40 | region, 41 | count(*) AS total, 42 | max(call_date) AS latest_date 43 | FROM ( 44 | SELECT calls.msisdn, 45 | cells.region, 46 | calls.call_date, 47 | row_number() OVER ( 48 | PARTITION BY calls.msisdn, calls.call_date 49 | ORDER BY calls.call_datetime DESC 50 | ) AS event_rank 51 | FROM calls 52 | INNER JOIN cells 53 | ON calls.location_id = cells.cell_id 54 | WHERE calls.call_date >= {} 55 | AND calls.call_date <= {} 56 | 57 | ) ranked_events 58 | 59 | WHERE event_rank = 1 60 | GROUP BY 1, 2 61 | 62 | ) times_visited 63 | ) ranked_locations 64 | WHERE daily_location_rank = 1 65 | """.format(start_date, end_date), 66 | 67 | # Aggregate 2 (April 1 version) 68 | 'count_unique_active_residents_per_region_per_day' : 69 | """ 70 | SELECT * FROM ( 71 | SELECT calls.call_date AS visit_date, 72 | cells.region AS region, 73 | count(DISTINCT calls.msisdn) AS subscriber_count 74 | FROM calls 75 | INNER JOIN cells 76 | ON calls.location_id = cells.cell_id 77 | INNER JOIN home_locations homes -- See intermediate_queries.sql for code to create the home_locations table 78 | ON calls.msisdn = homes.msisdn 79 | AND cells.region = homes.region 80 | GROUP BY 1, 2 81 | ) AS grouped 82 | WHERE grouped.subscriber_count >= 15""", 83 | 84 | 'count_unique_visitors_per_region_per_day' : 85 | """ 86 | SELECT * FROM ( 87 | SELECT all_visits.visit_date, 88 | all_visits.region, 89 | all_visits.subscriber_count - coalesce(home_visits.subscriber_count, 0) AS subscriber_count 90 | FROM count_unique_subscribers_per_region_per_day all_visits 91 | LEFT JOIN count_unique_active_residents_per_region_per_day home_visits 92 | ON all_visits.visit_date = home_visits.visit_date 93 | AND all_visits.region = home_visits.region 94 | ) AS visitors 95 | WHERE visitors.subscriber_count >= 15""", 96 | 97 | # Aggregate 3 (April 1 version) 98 | 'count_unique_subscribers_per_region_per_week' : 99 | """ 100 | SELECT * FROM ( 101 | SELECT extract(WEEK FROM calls.call_date) AS visit_week, 102 | cells.region AS region, 103 | count(DISTINCT calls.msisdn) AS subscriber_count 104 | FROM calls 105 | INNER JOIN cells 106 | ON calls.location_id = cells.cell_id 107 | WHERE calls.call_date >= {} 108 | AND calls.call_date <= {} 109 | GROUP BY 1, 2 110 | ) AS grouped 111 | WHERE grouped.subscriber_count >= 15 112 | """.format(start_date_weeks, end_date_weeks), 113 | 114 | # Aggregate 4 (April 1 version) 115 | 'count_unique_active_residents_per_region_per_week' : 116 | """ 117 | SELECT * FROM ( 118 | SELECT extract(WEEK FROM calls.call_date) AS visit_week, 119 | cells.region AS region, 120 | count(DISTINCT calls.msisdn) AS subscriber_count 121 | FROM calls 122 | INNER JOIN cells 123 | ON calls.location_id = cells.cell_id 124 | INNER JOIN home_locations homes -- See intermediate_queries.sql for code to create the home_locations table 125 | ON calls.msisdn = homes.msisdn 126 | AND cells.region = homes.region 127 | WHERE calls.call_date >= {} 128 | AND calls.call_date <= {} 129 | GROUP BY 1, 2 130 | ) AS grouped 131 | WHERE grouped.subscriber_count >= 15 132 | """.format(start_date_weeks, end_date_weeks), 133 | 134 | 'count_unique_visitors_per_region_per_week' : 135 | """ 136 | SELECT * FROM ( 137 | SELECT all_visits.visit_week, 138 | all_visits.region, 139 | 
all_visits.subscriber_count - coalesce(home_visits.subscriber_count, 0) AS subscriber_count 140 | FROM count_unique_subscribers_per_region_per_week all_visits 141 | LEFT JOIN count_unique_active_residents_per_region_per_week home_visits 142 | ON all_visits.visit_week = home_visits.visit_week 143 | AND all_visits.region = home_visits.region 144 | ) AS visitors 145 | WHERE visitors.subscriber_count >= 15""", 146 | 147 | # Aggregate 5 (April 1 version) 148 | 'regional_pair_connections_per_day' : 149 | """ 150 | SELECT * FROM ( 151 | SELECT connection_date, 152 | region1, 153 | region2, 154 | count(*) AS subscriber_count 155 | FROM ( 156 | 157 | SELECT t1.call_date AS connection_date, 158 | t1.msisdn AS msisdn, 159 | t1.region AS region1, 160 | t2.region AS region2 161 | FROM ( 162 | SELECT DISTINCT calls.msisdn, 163 | calls.call_date, 164 | cells.region 165 | FROM calls 166 | INNER JOIN cells 167 | ON calls.location_id = cells.cell_id 168 | WHERE calls.call_date >= {} 169 | AND calls.call_date <= CURRENT_DATE 170 | ) t1 171 | 172 | FULL OUTER JOIN 173 | 174 | ( 175 | SELECT DISTINCT calls.msisdn, 176 | calls.call_date, 177 | cells.region 178 | FROM calls 179 | INNER JOIN cells 180 | ON calls.location_id = cells.cell_id 181 | WHERE calls.call_date >= {} 182 | AND calls.call_date <= CURRENT_DATE 183 | ) t2 184 | 185 | ON t1.msisdn = t2.msisdn 186 | AND t1.call_date = t2.call_date 187 | WHERE t1.region < t2.region 188 | 189 | ) AS pair_connections 190 | GROUP BY 1, 2, 3 191 | ) AS grouped 192 | WHERE grouped.subscriber_count >= 15 193 | """.format(start_date, start_date), 194 | 195 | # Aggregate 6 (April 2 version) 196 | 'directed_regional_pair_connections_per_day' : 197 | """ 198 | WITH subscriber_locations AS ( 199 | SELECT calls.msisdn, 200 | calls.call_date, 201 | cells.region, 202 | min(calls.call_datetime) AS earliest_visit, 203 | max(calls.call_datetime) AS latest_visit 204 | FROM calls 205 | INNER JOIN cells 206 | ON calls.location_id = cells.cell_id 207 | WHERE calls.call_date >= {} 208 | AND calls.call_date <= CURRENT_DATE 209 | GROUP BY msisdn, call_date, region 210 | ) 211 | SELECT * FROM ( 212 | SELECT connection_date, 213 | region_from, 214 | region_to, 215 | count(*) AS subscriber_count 216 | FROM ( 217 | 218 | SELECT t1.call_date AS connection_date, 219 | t1.msisdn AS msisdn, 220 | t1.region AS region_from, 221 | t2.region AS region_to 222 | FROM subscriber_locations t1 223 | FULL OUTER JOIN subscriber_locations t2 224 | ON t1.msisdn = t2.msisdn 225 | AND t1.call_date = t2.call_date 226 | WHERE t1.region <> t2.region 227 | AND t1.earliest_visit < t2.latest_visit 228 | 229 | ) AS pair_connections 230 | GROUP BY 1, 2, 3 231 | ) AS grouped 232 | WHERE grouped.subscriber_count >= 15 233 | """.format(start_date), 234 | 235 | # Aggregate 7 (April 3 version) 236 | 'total_calls_per_region_per_day' : 237 | """ 238 | SELECT 239 | call_date, 240 | region, 241 | total_calls 242 | FROM ( 243 | SELECT calls.call_date AS call_date, 244 | cells.region AS region, 245 | count(DISTINCT msisdn) AS subscriber_count, 246 | count(*) AS total_calls 247 | FROM calls 248 | INNER JOIN cells 249 | ON calls.location_id = cells.cell_id 250 | WHERE calls.call_date >= {} 251 | AND calls.call_date <= CURRENT_DATE 252 | GROUP BY 1, 2 253 | ) AS grouped 254 | WHERE grouped.subscriber_count >= 15 255 | """.format(start_date), 256 | 257 | # Aggregate 8 (April 3 version) 258 | 'home_location_counts_per_region' : 259 | """ 260 | SELECT * FROM ( 261 | SELECT region, count(msisdn) AS subscriber_count 262 | FROM 
home_locations -- See intermediate_queries.sql for code to create the home_locations table 263 | GROUP BY region 264 | ) AS home_counts 265 | WHERE home_counts.subscriber_count >= 15"""} 266 | return sql_code 267 | --------------------------------------------------------------------------------