├── dashboard-dataviz
│   ├── README.md
│   ├── dashboard
│   │   ├── text_inputs
│   │   │   └── README.md
│   │   ├── data_inputs_for_dashboard
│   │   │   └── README.md
│   │   ├── preparing_data_for_dashboard
│   │   │   ├── README.md
│   │   │   ├── 01_clean_spatial_data
│   │   │   │   ├── README.md
│   │   │   │   ├── clean_adm3_file.R
│   │   │   │   └── clean_adm2_file.R
│   │   │   ├── 03_dashboard_data_prep
│   │   │   │   ├── prep_subs_obs_totals_data.R
│   │   │   │   ├── data_to_github.R
│   │   │   │   └── README.md
│   │   │   ├── 02_clean_telecom_data
│   │   │   │   ├── clean_i3_subscribers_data.R
│   │   │   │   ├── clean_i5_net_movement_data.R
│   │   │   │   ├── clean_i5_movement_inout_data.R
│   │   │   │   ├── clean_i7_distance_traveled.R
│   │   │   │   └── README.md
│   │   │   └── _dash_master.R
│   │   ├── functions.R
│   │   ├── styles.css
│   │   └── README.md
│   └── figures
│       ├── _master_figures.R
│       ├── i3_figures.R
│       ├── i5_net_figures.R
│       └── i5_into_out.R
├── data-checks
│   ├── Archive
│   │   ├── patch_cleaning.py
│   │   ├── Descr-exploratory
│   │   │   ├── draf.py
│   │   │   ├── i5-plot.py
│   │   │   └── fb-comparisson-draft.py
│   │   ├── globals.py
│   │   ├── quick_checks
│   │   │   ├── check_subscribers.R
│   │   │   └── ward_neighbors_tower_down.R
│   │   ├── usage_outliers.py
│   │   ├── i10-check.py
│   │   ├── MASTER.py
│   │   ├── 02_summary_stats.py
│   │   ├── 03_i_specific_checks_i1_admin2.py
│   │   ├── data_files_comparisson.py
│   │   ├── 01_completenes_checks.py
│   │   └── od_scaling.py
│   └── README.md
├── cdr-aggregation
│   ├── docker-compose.yml
│   ├── notebooks
│   │   ├── modules
│   │   │   ├── setup.py
│   │   │   ├── README.md
│   │   │   ├── folder_utils.py
│   │   │   ├── import_packages.py
│   │   │   ├── utilities.py
│   │   │   ├── flowminder_aggregator.py
│   │   │   ├── voronoi.py
│   │   │   ├── outliers.py
│   │   │   ├── aggregator.py
│   │   │   ├── tower_clustering.py
│   │   │   └── sql_code_aggregates.py
│   │   ├── folder_setup.py
│   │   ├── README.md
│   │   ├── folder_setup.ipynb
│   │   └── aggregation_master.py
│   ├── docker
│   │   └── Dockerfile
│   ├── config_file_template.py
│   └── config_file_template_hive.py
├── data-panel
│   ├── Archive
│   │   ├── _master.py
│   │   ├── usage_outliers.py
│   │   ├── 02_clean.py
│   │   ├── panel_draft2.py
│   │   └── panel_draft.py
│   ├── 01_construct.py
│   └── utils.py
└── .gitignore
/dashboard-dataviz/README.md:
--------------------------------------------------------------------------------
1 | # Dashboard and Figures
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/text_inputs/README.md:
--------------------------------------------------------------------------------
1 | # Text Inputs
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/data_inputs_for_dashboard/README.md:
--------------------------------------------------------------------------------
1 | # Data Inputs for Dashboard
--------------------------------------------------------------------------------
/data-checks/Archive/patch_cleaning.py:
--------------------------------------------------------------------------------
1 |
2 | # Cleaning
3 |
4 | fi = pd.read_csv(ICUST_adm3_path + file_name)
5 |
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/README.md:
--------------------------------------------------------------------------------
1 | # Files for data visualization and dashboards
2 |
--------------------------------------------------------------------------------
/cdr-aggregation/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 | services:
3 |
4 | jupyter:
5 | build:
6 | context: .
7 | dockerfile: ./docker/Dockerfile
8 | image: sebxwolf/cdr_aggregation_pyspark:v1
9 | container_name: cdr_aggregation
10 | ports:
11 | - "8888:8888"
12 | - "4040:4040"
13 | volumes:
14 | - ./:/home/jovyan/work
15 |
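16 | # Usage (assuming Docker and docker-compose are installed): run `docker-compose up` from this
17 | # folder, then open Jupyter at http://localhost:8888 using the token printed in the container logs.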
--------------------------------------------------------------------------------
/data-panel/Archive/_master.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------#
2 | # PANEL MASTER
3 | #-----------------------------------------------------------------#
4 |
5 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/"
6 | DATA_POC = DATA_path + "proof-of-concept/"
7 | DATA_panel = DATA_POC + "panel_indicators/"
8 | DATA_panel_raw = DATA_panel + 'raw/'
9 | DATA_panel_clean = DATA_panel + 'clean/'
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/modules/setup.py:
--------------------------------------------------------------------------------
1 | # all the modules we need
2 |
3 | from modules.import_packages import *
4 | from modules.DataSource import *
5 | from modules.utilities import *
6 | from modules.aggregator import *
7 | from modules.flowminder_aggregator import *
8 | from modules.priority_aggregator import *
9 | from modules.custom_aggregator import *
10 | from modules.scaled_aggregator import *
11 | from modules.sql_code_aggregates import *
12 | from modules.folder_utils import *
13 |
--------------------------------------------------------------------------------
/cdr-aggregation/docker/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM jupyter/pyspark-notebook:dc9744740e12
2 |
3 | RUN python --version
4 |
5 | RUN conda install --quiet --yes -c \
6 | conda-forge jupyter_contrib_nbextensions jupyter_nbextensions_configurator \
7 | geopandas folium descartes
8 |
9 | RUN pip install -U folium \
10 | geovoronoi \
11 | geopy
12 |
13 | RUN jupyter labextension install @jupyterlab/toc
14 |
15 | VOLUME /home/jovyan/work
16 | WORKDIR /home/jovyan/work
17 |
--------------------------------------------------------------------------------
/data-checks/Archive/Descr-exploratory/draf.py:
--------------------------------------------------------------------------------
1 | # Indicator 1 panel data
2 | i1 = pd.read_csv( OUT_hfcs + 'Sheet comp panel/i1_admin3.csv')
3 | i1 = i1[i1.region != '99999']
4 |
5 | i3 = pd.read_csv( OUT_hfcs + 'Sheet comp panel/i3_admin3.csv')
6 | i3 = i3[i3.region != '99999']
7 |
8 | i1['date'] = pd.to_datetime(i1['hour']).dt.date
9 | i3['date'] = pd.to_datetime(i3['day']).dt.date
10 |
11 |
12 | # Number of calls per day
13 | i1_day = i1.groupby(['date', 'region'])['count_p'].sum().reset_index()
14 |
15 | # Merge
16 | i13 = i1_day.merge(i3[['date', 'count_p', 'region']].rename(columns = {'count_p' : 'subscribers'}),
17 | on = ['date', 'region'])
18 |
19 | np.mean(i13['count_p']/i13['subscribers'])
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/folder_setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # In[ ]:
5 |
6 |
7 | import datetime as dt
8 | from modules.DataSource import *
9 | from modules.folder_utils import *
10 |
11 |
12 | # In[ ]:
13 |
14 |
15 | #Set relative file path to config file
16 | config_file = '../config_file.py'
17 | exec(open(config_file).read())
18 |
19 |
20 | # In[ ]:
21 |
22 |
23 | #Create the DataSource object and show config
24 | ds = DataSource(datasource_configs)
25 | ds.show_config()
26 |
27 |
28 | # In[ ]:
29 |
30 |
31 | #Setup all required data folders
32 | setup_folder(ds)
33 |
34 |
35 | # In[ ]:
36 |
37 |
38 | #Check if required data folders already exist
39 | check_folders(ds)
40 |
41 |
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/README.md:
--------------------------------------------------------------------------------
1 | # Notebook organization
2 |
3 | The [aggregation_master.py](https://github.com/worldbank/covid-mobile-data/tree/cdr-master/cdr-aggregation/notebooks/aggregation_master.py) script is currently set to run all flowminder, priority and scaled indicators. Additional custom indicators are left out.
4 |
5 | The [aggregation_master.ipynb](https://github.com/worldbank/covid-mobile-data/tree/cdr-master/cdr-aggregation/notebooks/aggregation_master.ipynb) notebook does the same and can be used for data exploration, too.
6 |
7 | The [aggregation_master_databricks.py](https://github.com/worldbank/covid-mobile-data/tree/cdr-master/cdr-aggregation/notebooks/aggregation_master_databricks.py) notebook is customised for Databricks.
8 |
--------------------------------------------------------------------------------
/cdr-aggregation/config_file_template.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql.types import *
2 | schema = StructType([
3 | StructField("msisdn", IntegerType(), True),
4 | StructField("call_datetime", StringType(), True), #load as string, will be turned into datetime in standardize_csv_files()
5 | StructField("location_id", StringType(), True)
6 | ])
7 |
8 | datasource_configs = {
9 | "base_path": "/home/jovyan/work/data", #folder path used in this docker env
10 | "country_code": "",
11 | "telecom_alias": "",
12 | "schema" : schema,
13 | "data_paths" : ["*.csv"],
14 | "filestub": "",
15 | "geofiles": {},
16 | "shapefiles": ['admin2','admin3', 'voronoi'],
17 | "dates": {'start_date' : dt.datetime(2020,2,1),
18 | 'end_date' : dt.datetime(2020,3,31)}
19 | }
20 |
--------------------------------------------------------------------------------
/data-checks/Archive/globals.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------#
2 | # DATA CHECKS - Globals
3 | #-----------------------------------------------------------------#
4 |
5 | # This file contains settings and globals used across data checks
6 | # files
7 |
8 | # LIBRARIES
9 | import os
10 | import re
11 | import pandas as pd
12 | import numpy as np
13 | import datetime as dt
14 |
15 | import seaborn as sns; sns.set()
16 | from matplotlib import rcParams
17 | import matplotlib.pyplot as plt
18 |
19 | from bokeh.plotting import figure, output_file, show
20 | from bokeh.models import Span
21 | from bokeh.io import export_png
22 |
23 |
24 | # GLOBALS
25 |
26 | # File paths
27 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/"
28 | OUT_path = DATA_path + 'proof-of-concept/outputs/'
29 |
30 | # Default values
31 | missing_values = ['99999','']
--------------------------------------------------------------------------------
/cdr-aggregation/config_file_template_hive.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql.types import *
2 | schema = StructType([
3 | StructField("msisdn", IntegerType(), True),
4 | StructField("call_datetime", StringType(), True), #load as string, will be turned into datetime in standardize_csv_files()
5 | StructField("location_id", StringType(), True)
6 | ])
7 |
8 | datasource_configs = {
9 | "base_path": "path_to_folder/data", #folder path used in this docker env
10 | "hive_warehouse_location": "path_to_hive_warehouse",
11 | "spark_mode": 'hive',
12 | "hive_vars":{ 'msisdn' : 'col1',
13 | 'call_datetime': 'col2',
14 | 'location_id': 'col3',
15 | 'calls': 'table'},
16 | "country_code": "",
17 | "telecom_alias": "",
18 | "schema" : schema,
19 | "data_paths" : ["*.csv"],
20 | "filestub": "",
21 | "geofiles": {},
22 | "shapefiles": ['admin2','admin3', 'voronoi'],
23 | "dates": {'start_date' : dt.datetime(2020,2,1),
24 | 'end_date' : dt.datetime(2020,3,31)}
25 | }
26 |
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/01_clean_spatial_data/README.md:
--------------------------------------------------------------------------------
1 | # Clean Spatial Data
2 |
3 | Cleans spatial datasets:
4 | 1. Aggregate units when needed (e.g., aggregating wards)
5 | 2. Add additional variables (e.g., area)
6 | 3. Standardize variable names
7 | 4. Order spatial data by region
8 |
9 | ### Standardize Variable Names
10 | Each spatial dataset should have standardized variable names. Standardizing
11 | variable names helps ensure different units (e.g., admin2, admin3) can be
12 | easily switched in the dashboard.
13 |
14 | | variable | format | example | description |
15 | |---|---|---|---|
16 | | region | string | ZW123456 | Unique identifier of the spatial unit |
17 | | name | string | Name | Spatial unit name |
18 | | area | numeric | 1234 | Area of the spatial unit in kilometers squared |
19 | | province | string | Name | Name of the province |
20 |
21 | ### Order Spatial Data
22 | Spatial datasets are ordered by region. When cleaning other datasets at the
23 | region level, we also order by region and ensure all regions are present. This
24 | ensures that no reordering needs to be done in the dashboard.
25 |
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/01_clean_spatial_data/clean_adm3_file.R:
--------------------------------------------------------------------------------
1 | # Clean ADM3 File
2 |
3 | # Load Data --------------------------------------------------------------------
4 | # LOAD DATA HERE
5 |
6 | # Subset/Add Variables ---------------------------------------------------------
7 | adm3@data <- adm3@data %>%
8 | dplyr::select(NAME_3) %>%
9 | dplyr::rename(name = NAME_3) %>%
10 | dplyr::mutate(region = name)
11 |
12 | adm3$area <- geosphere::areaPolygon(adm3) / 1000^2
13 |
14 | # Simplify (to speed up plotting) ----------------------------------------------
15 | # For ms_simplify, polygon IDs and other ID need to match
16 | pid <- sapply(slot(adm3, "polygons"), function(x) slot(x, "ID"))
17 | row.names(adm3) <- pid
18 |
19 | adm3 <- rmapshaper::ms_simplify(adm3)
20 |
21 | # Arrange ----------------------------------------------------------------------
22 | #### Order by region
23 | adm3$region <- adm3$region %>% as.character()
24 | adm3 <- adm3[order(adm3$region),]
25 |
26 | # Export -----------------------------------------------------------------------
27 | saveRDS(adm3, file.path(GEO_PATH, "adm3.Rds"))
28 |
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/01_clean_spatial_data/clean_adm2_file.R:
--------------------------------------------------------------------------------
1 | # Clean ADM2 File
2 |
3 | # Load Data --------------------------------------------------------------------
4 | # LOAD DATA HERE
5 |
6 | # Subset/Add Variables ---------------------------------------------------------
7 | adm2@data <- adm2@data %>%
8 | dplyr::select(NAME_2) %>%
9 | dplyr::rename(name = NAME_2) %>%
10 | dplyr::mutate(region = name)
11 |
12 | adm2$area <- geosphere::areaPolygon(adm2) / 1000^2
13 |
14 | adm2$province <- NA
15 |
16 | # Simplify (to speed up plotting) ----------------------------------------------
17 | # For ms_simplify, polygon IDs and other ID need to match
18 | pid <- sapply(slot(adm2, "polygons"), function(x) slot(x, "ID"))
19 | row.names(adm2) <- pid
20 |
21 | adm2 <- rmapshaper::ms_simplify(adm2)
22 |
23 | # Arrange ----------------------------------------------------------------------
24 | #### Order by region
25 | adm2$region <- adm2$region %>% as.character()
26 | adm2 <- adm2[order(adm2$region),]
27 |
28 | # Export -----------------------------------------------------------------------
29 | saveRDS(adm2, file.path(GEO_PATH, "adm2.Rds"))
30 |
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/functions.R:
--------------------------------------------------------------------------------
1 | # Functions ====================================================================
2 |
3 | #### Log values with negatives
4 | # Define a function to take the log of values that can be zero or negative:
5 | # it logs the absolute value (plus one), then reapplies the sign.
6 | log_neg <- function(values){
7 |   # Signed log that handles zero and negative values. Only for logging values
8 |   # that are displayed!
9 |
10 | values_pos_index <- (values > 0) %in% T # %in% T to account for NAs
11 | values_neg_index <- (values <= 0) %in% T
12 |
13 | values_pos_log <- log(values[values_pos_index]+1)
14 | values_neg_log <- -log(-(values[values_neg_index])+1)
15 |
16 | values[values_pos_index] <- values_pos_log
17 | values[values_neg_index] <- values_neg_log
18 |
19 | return(values)
20 | }
21 |
22 | as.character.htmlwidget <- function(x, ...) {
23 | htmltools::HTML(
24 | htmltools:::as.character.shiny.tag.list(
25 | htmlwidgets:::as.tags.htmlwidget(
26 | x
27 | ),
28 | ...
29 | )
30 | )
31 | }
32 |
33 | add_deps <- function(dtbl, name, pkg = name) {
34 | tagList(
35 | dtbl,
36 | htmlwidgets::getDependency(name, pkg)
37 | )
38 | }
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/styles.css:
--------------------------------------------------------------------------------
1 | input[type="number"] {
2 | max-width: 80%;
3 | }
4 |
5 | div.outer {
6 | position: fixed;
7 | top: 50px;
8 | left: 0;
9 | right: 0;
10 | bottom: 0;
11 | overflow: hidden;
12 | padding: 0;
13 | }
14 |
15 | /* Customize fonts */
16 | body, label, input, button, select {
17 | font-family: 'Helvetica Neue', Helvetica;
18 | font-weight: 200;
19 | }
20 | h1, h2, h3, h4 { font-weight: 400; }
21 |
22 | #controls {
23 | /* Appearance */
24 | background-color: white;
25 | padding: 0 20px 20px 20px;
26 | cursor: move;
27 | /* Fade out while not hovering */
28 | opacity: 0.76;
29 | zoom: 0.95;
30 | transition: opacity 0ms 0ms;
31 | }
32 | #controls:hover {
33 | /* Fade in while hovering */
34 | opacity: 0.99;
35 | transition-delay: 0;
36 | }
37 |
38 | #logo {
39 | /* Appearance */
40 | background-color: transparent;
41 | cursor: move;
42 | /* Fade out while not hovering */
43 | opacity: 0.25;
44 | zoom: 0.9;
45 | transition: opacity 500ms 1s;
46 | }
47 |
48 | #logo:hover {
49 | /* Fade in while hovering */
50 | opacity: 0.95;
51 | transition-delay: 0;
52 | }
53 |
54 | #img-id{
55 | position: fixed;
56 | right: 10px;
57 | top: 5px;
58 | }
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/03_dashboard_data_prep/prep_subs_obs_totals_data.R:
--------------------------------------------------------------------------------
1 | # Prep Subscribers / Observations Total Data
2 |
3 | # Prep datasets for line graphs on the About page.
4 |
5 | # Subscribers ------------------------------------------------------------------
6 | subs_adm2 <- read.csv(file.path(RAW_INDICATORS, paste0("indicator_01_02_adm3_hour_result.csv")),
7 | stringsAsFactors=F)
8 |
9 | subs_adm2 <- subs_adm2 %>%
10 | group_by(pdate) %>%
11 | dplyr::summarise(Subscribers = sum(totalimei)) %>%
12 | dplyr::rename(Date = pdate) %>%
13 | mutate(Date = Date %>% ymd)
14 |
15 | saveRDS(subs_adm2, file.path(DASHBOARD_DATA_ONEDRIVE_PATH,"subscribers_total.Rds"))
16 |
17 | # Observations -----------------------------------------------------------------
18 | obs_adm2 <- read.csv(file.path(RAW_INDICATORS, paste0("indicator_01_02_adm3_hour_result.csv")),
19 | stringsAsFactors=F)
20 |
21 | obs_adm2 <- obs_adm2 %>%
22 | group_by(pdate) %>%
23 | dplyr::summarise(Observations = sum(total)) %>%
24 | dplyr::rename(Date = pdate) %>%
25 | mutate(Date = Date %>% ymd)
26 |
27 | saveRDS(obs_adm2, file.path(DASHBOARD_DATA_ONEDRIVE_PATH,"observations_total.Rds"))
28 |
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/modules/README.md:
--------------------------------------------------------------------------------
1 | # Module organization
2 |
3 | ## Aggregation
4 | The base class `aggregator` defined in [aggregator.py](https://github.com/worldbank/covid-mobile-data/tree/cdr-master/cdr-aggregation/notebooks/modules/aggregator.py) implements the methods and attributes shared by all aggregator classes. At the next level, `flowminder_aggregator` implements the SQL queries from [Flowminder](https://github.com/Flowminder), while `priority_aggregator` implements the priority indicators designed by this task force, written in pyspark. Beyond that, `scaled_aggregator` implements priority indicators scaled by a resident count and `custom_aggregator` implements additional custom pyspark indicators; both inherit from the `priority_aggregator` class.
5 |
6 | ```
7 | |-- aggregator
8 | | |-- flowminder_aggregator
9 | | |-- priority_aggregator
10 | | |-- scaled_aggregator
11 | | |-- custom_aggregator
12 | ```
13 |
14 | ## Clustering and tessellation
15 | Modules `voronoi` and `tower_clustering` implement Voronoi tessellation given tower locations; these are used in the setup phase to create tower-to-region mappings.
16 |
17 | ## Outlier analysis
18 | Module `outliers` can be used to study outlier observations.
19 |
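20 | ## Aggregation class sketch
21 |
22 | As a reference for the hierarchy described under *Aggregation* above, here is a minimal, illustrative sketch of how the classes relate (the real attributes and methods live in the module files linked above):
23 |
24 | ```python
25 | class aggregator:                                  # shared methods and attributes
26 |     pass
27 |
28 | class flowminder_aggregator(aggregator):           # Flowminder SQL indicators
29 |     pass
30 |
31 | class priority_aggregator(aggregator):             # priority indicators written in pyspark
32 |     pass
33 |
34 | class scaled_aggregator(priority_aggregator):      # priority indicators scaled by resident counts
35 |     pass
36 |
37 | class custom_aggregator(priority_aggregator):      # additional custom pyspark indicators
38 |     pass
39 | ```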
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/03_dashboard_data_prep/data_to_github.R:
--------------------------------------------------------------------------------
1 | # Transfer dashboard data from OneDrive to Github
2 |
3 |
4 | ## Remove previous files in github
5 | REMOVE_PREVIOUS_FILES <- F
6 |
7 | if(REMOVE_PREVIOUS_FILES){
8 | temp <- list.files(DASHBOARD_DATA_GITHUB_PATH,
9 | full.names = T,
10 | pattern = "*.Rds") %>%
11 | lapply(file.remove)
12 |
13 | }
14 |
15 |
16 | # Move telecom data to github folder -------------------------------------------
17 | i <- 1
18 |
19 | telecom_files <- list.files(DASHBOARD_DATA_ONEDRIVE_PATH, pattern = "*.Rds")
20 |
21 | #telecom_files <- telecom_files[grepl("spark", telecom_files)]
22 |
23 | temp <- telecom_files %>%
24 | lapply(function(file_i){
25 | if((i %% 100) %in% 0) print(paste(i, "/", length(telecom_files)))
26 | i <<- i + 1
27 |
28 | file.copy(file.path(DASHBOARD_DATA_ONEDRIVE_PATH, file_i),
29 | paste0(DASHBOARD_DATA_GITHUB_PATH, "/"),
30 | overwrite=T)
31 | })
32 |
33 |
34 | # Move geofiles to github folder -----------------------------------------------
35 | for(file_i in list.files(GEO_PATH)){
36 | file.copy(file.path(GEO_PATH, file_i),
37 | paste0(DASHBOARD_DATA_GITHUB_PATH, "/"),
38 | overwrite=T)
39 | }
40 |
41 |
42 |
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/modules/folder_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | ######################################
3 | # Folder setup methods - written for the jupyter notebook docker image
4 |
5 | #Loops over the required folders for the datasource and creates any missing folders
6 | def setup_folder(datasource):
7 | #Loop over required paths, and return true
8 | for folder in datasource.required_folders():
9 | test_folder(folder, create_if_not_exist=True)
10 | return True
11 |
12 | #Check if all required folders exist without creating them
13 | def check_folders(datasource):
14 | return_boolean = True
15 | #loop over required folders
16 | for folder in datasource.required_folders():
17 | if not test_folder(folder, create_if_not_exist=False):
18 | print("Folder '{}' is required but does not exist".format(folder))
19 | return_boolean = False
20 | return return_boolean
21 |
22 | #Utility that checks if a folder exists
23 | def test_folder(path, create_if_not_exist):
24 | #If folder exists return true
25 | if os.path.exists(path): return True
26 | #Else: if create_if_not_exist is true then create folder and return true
27 | elif create_if_not_exist:
28 | os.makedirs(path)
29 | return True
30 | #Else: Folder does not exist and folder is not created, return false
31 | else: return False
32 |
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/03_dashboard_data_prep/README.md:
--------------------------------------------------------------------------------
1 | # Dashboard Data Prep
2 |
3 | Due to the high volume of data, data transformations (e.g., aggregating, filtering) are done outside of the dashboard in order to minimize the processing and the data that need to be loaded in memory while the dashboard is running. These scripts split the cleaned telecom data into individual datasets so that no additional filtering or transformations need to be applied within the dashboard; the dashboard can simply read the files and immediately use the data in the map, line graph and table. Each of these smaller datasets contains the same variables as the cleaned data. Indicators include density, movement in, movement out, mean distance traveled, etc.
4 |
5 | The following datasets are made.
6 |
7 | | Dataset Type | Naming Convention | Description |
8 | | --- | --- | --- |
9 | | unit-level | [Unit Name]\_[Indicator Name]\_[Daily/Weekly]\_[Date/Week].Rds | For a given day or week, this dataset contains information for all wards or districts for a specified indicator. For O-D level datasets, values are aggregated to the specified origin or destination unit (e.g., movement into a unit from all other units). |
10 | | time-level | [Unit Name]\_[Indicator Name]\_[Daily/Weekly]\_[Unit Name].Rds | For a given admin unit, this dataset contains a time series of values for a specified indicator. |
11 | | unit-time-level | [Unit Name]\_[Indicator Name]\_[Daily/Weekly]\_[Unit Name]\_[Date/Week].Rds | These datasets are only used for O-D variables. They show, for a given origin or destination unit, the movement into or out of that unit from/to all other units for the specified day/week. |
12 |
13 |
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/modules/import_packages.py:
--------------------------------------------------------------------------------
1 | # Imports necessary packages and sets some global vars
2 | ### spark etc
3 | # import rarfile
4 |
5 | import os, pyspark, time, sys
6 | import pyspark.sql.functions as F
7 | from pyspark.sql.functions import pandas_udf, PandasUDFType
8 | from pyspark import *
9 | from pyspark.sql import *
10 | from pyspark.rdd import *
11 | from pyspark.ml import *
12 | from pyspark.sql.types import ArrayType
13 | from pyspark.sql.types import IntegerType
14 | from pyspark.sql.types import DoubleType
15 | from pyspark.sql.types import FloatType
16 |
17 | ### data wrangling
18 | import pandas as pd
19 | import glob
20 | import shutil
21 | pd.options.display.float_format = '{:,.0f}'.format
22 | # pd.set_option("display.max_rows", 100)
23 | pd.options.display.max_columns = None
24 | import datetime as dt
25 | import numpy as np
26 | from random import sample, seed
27 | seed(510)
28 | # timezone = dt.timezone(offset = -dt.timedelta(hours=5), name = "America/Bogota")
29 | timezone = dt.timezone(offset = -dt.timedelta(hours=0), name = "Africa/Harare")
30 | import re
31 | #import fiona
32 | #import geopandas as gpd
33 | import copy
34 | from collections import Counter
35 | from shapely import wkt
36 |
37 | ### plotting
38 | import matplotlib.pyplot as plt
39 | import matplotlib.dates as mdates
40 | import seaborn as sns
41 | #import folium
42 | #import gif
43 | #from folium.plugins import HeatMap, DualMap, Fullscreen
44 | #from folium.features import DivIcon
45 | #from branca.element import Template, MacroElement
46 | import locale
47 | from matplotlib.ticker import FuncFormatter
48 | import matplotlib.lines as mlines
49 | font = {'family' : 'Calibri',
50 | 'weight' : 'normal',
51 | 'size' : 18}
52 | import matplotlib
53 |
--------------------------------------------------------------------------------
/dashboard-dataviz/figures/_master_figures.R:
--------------------------------------------------------------------------------
1 | # Master R Script for Creating Figures
2 |
3 | #### Packages #### =============================================================
4 | library(tidyverse)
5 | library(sf)
6 | library(sp)
7 | library(plotly)
8 | library(stargazer)
9 | library(knitr)
10 | library(gridExtra)
11 | library(leaflet)
12 | library(ggpubr)
13 | library(purrr)
14 | library(parallel)
15 | library(pbmcapply)
16 | library(rgeos)
17 | library(rgdal)
18 | library(sp)
19 | library(rmapshaper)
20 | library(raster)
21 | library(geosphere)
22 | library(lubridate)
23 | library(data.table)
24 | library(mapview)
25 | library(hrbrthemes)
26 |
27 | #### File paths #### ===========================================================
28 |
29 | # Define Root Paths ------------------------------------------------------------
30 | if(Sys.info()[["user"]] == "robmarty") PROJECT_PATH <- "~/Documents/World Bank/Sveta Milusheva - COVID 19 Results"
31 | if(Sys.info()[["user"]] == "wb519128") PROJECT_PATH <- "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results"
32 | if(Sys.info()[["user"]] == "WB521633") PROJECT_PATH <- "C:/Users/wb521633/WBG/Sveta Milusheva - COVID 19 Results"
33 |
34 | if(Sys.info()[["user"]] == "robmarty") GITHUB_PATH <- "~/Documents/Github/covid-mobile-data"
35 | if(Sys.info()[["user"]] == "wb519128") GITHUB_PATH <- "C:/Users/wb519128/Github/covid-mobile-data"
36 | if(Sys.info()[["user"]] == "WB521633") GITHUB_PATH <- "C:/Users/wb521633/Documents/Github/covid-mobile-data"
37 |
38 | # Define Paths from Root -------------------------------------------------------
39 | CLEAN_DATA_ADM2_PATH <- file.path(PROJECT_PATH, "proof-of-concept", "files_for_dashboard", "files_clean", "adm2")
40 | CLEAN_DATA_ADM3_PATH <- file.path(PROJECT_PATH, "proof-of-concept", "files_for_dashboard", "files_clean", "adm3")
41 | figures_path <- file.path(PROJECT_PATH, "proof-of-concept", "outputs", "figures")
42 |
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/folder_setup.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import datetime as dt\n",
10 | "from modules.DataSource import *\n",
11 | "from modules.folder_utils import *"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": null,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "#Set relative file path to config file\n",
21 | "config_file = '../config_file.py'\n",
22 | "exec(open(config_file).read())"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": null,
28 | "metadata": {
29 | "scrolled": true
30 | },
31 | "outputs": [],
32 | "source": [
33 | "#Create the DataSource object and show config\n",
34 | "ds = DataSource(datasource_configs)\n",
35 | "ds.show_config()"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": null,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "#Setup all required data folders\n",
45 | "setup_folder(ds)"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": null,
51 | "metadata": {},
52 | "outputs": [],
53 | "source": [
54 | "#Check if required data folders already exists\n",
55 | "check_folders(ds)"
56 | ]
57 | }
58 | ],
59 | "metadata": {
60 | "kernelspec": {
61 | "display_name": "Python 3",
62 | "language": "python",
63 | "name": "python3"
64 | },
65 | "language_info": {
66 | "codemirror_mode": {
67 | "name": "ipython",
68 | "version": 3
69 | },
70 | "file_extension": ".py",
71 | "mimetype": "text/x-python",
72 | "name": "python",
73 | "nbconvert_exporter": "python",
74 | "pygments_lexer": "ipython3",
75 | "version": "3.7.6"
76 | }
77 | },
78 | "nbformat": 4,
79 | "nbformat_minor": 4
80 | }
81 |
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/modules/utilities.py:
--------------------------------------------------------------------------------
1 |
2 | ############# Utility functions used throughout
3 | import os
4 | if os.environ['HOME'] != '/root':
5 | from modules.import_packages import *
6 | from modules.DataSource import *
7 | databricks = False
8 | else:
9 | databricks = True
10 |
11 | def save_and_load_parquet(df, filename, ds):
12 | # write parquet
13 | df.write.mode('overwrite').parquet(filename)
14 | #load parquet
15 | df = ds.spark.read.format("parquet").load(filename)
16 | return df
17 |
18 | def save_csv(matrix, path, filename):
19 | # write to csv
20 | matrix.repartition(1).write.mode('overwrite').format('com.databricks.spark.csv') \
21 | .save(os.path.join(path, filename), header = 'true')
22 | # move one folder up and rename to human-legible .csv name
23 | if databricks:
24 | dbutils.fs.mv(dbutils.fs.ls(path + '/' + filename)[-1].path,
25 | path + '/' + filename + '.csv')
26 | # remove the old folder
27 | dbutils.fs.rm(path + '/' + filename + '/', recurse = True)
28 |
29 | else:
30 | os.rename(glob.glob(os.path.join(path, filename + '/*.csv'))[0],
31 | os.path.join(path, filename + '.csv'))
32 | shutil.rmtree(os.path.join(path, filename))
33 |
34 | ############# Windows for window functions
35 |
36 | # window by user (msisdn)
37 | user_window = Window\
38 | .partitionBy('msisdn').orderBy('call_datetime')
39 |
40 | # window by user (msisdn), starting with the last transaction
41 | user_window_rev = Window\
42 | .partitionBy('msisdn').orderBy(F.desc('call_datetime'))
43 |
44 | # user date window
45 | user_date_window = Window\
46 | .partitionBy('msisdn', 'call_date').orderBy('call_datetime')
47 |
48 | # user date window starting from last date
49 | user_date_window_rev = Window\
50 | .partitionBy('msisdn', 'call_date').orderBy(F.desc('call_datetime'))
51 |
52 |
53 | ############# Plotting
54 |
55 | def zero_to_nan(values):
56 | """Replace every 0 with 'nan' and return a copy."""
57 | values[ values==0 ] = np.nan
58 | return values
59 |
60 | def fill_zero_dates(pd_df):
61 | pd_df = pd_df[~pd_df.index.isnull()].sort_index()
62 | msisdnx = pd.date_range(pd_df.index[0], pd_df.index[-1])
63 | pd_df = pd_df.reindex(msisdnx, fill_value= 0)
64 | return pd_df
65 |
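66 | # Example (illustrative): the windows above are meant for sequential, per-user calculations,
67 | # e.g. attaching each msisdn's previous location to every transaction:
68 | #   df = df.withColumn('prev_location', F.lag('location_id').over(user_window))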
--------------------------------------------------------------------------------
/data-checks/README.md:
--------------------------------------------------------------------------------
1 | # Data Checks
2 |
3 | This folder contains code for running basic checks of aggregated CDR indicators. The data quality checks are intended to achieve the following:
4 | 1. **Ensure the data is complete.** This means that there are no missing values in two main dimensions: spatial (all admin areas should have data) and temporal (all time slots, i.e. month, day and hour, should have data). This check is required for all indicators.
5 | 2. **Cell tower down checks.** This is a special type of missing data, where data may be missing because a cell tower was down. This check is required for all indicators.
6 | 3. **Consistency checks.** This check can be done for a single indicator to check for several things, and it can also be done across indicators to ensure that total numbers are consistent.
7 |
8 | ## Requirements
9 |
10 | - Python3
11 | - pandas
12 | - numpy
13 | - plotly
14 |
15 | ## Basic usage:
16 |
17 | ```bash
18 | $ git clone git@github.com:worldbank/covid-mobile-data.git
19 | $ cd covid-mobile-data/data-checks/
20 | $ python checker.py --Path path/to/indicators
21 | [--prefix "your_prefix_"]
22 | [--outputs path/to/outputs]
23 | ```
24 |
25 | ## Custom usage:
26 | You can create an instance of the checker class to customize any of the default values.
27 |
28 | ```python
29 | from checker import *
30 |
31 | check = checker(path = 'path/to/indicators',
32 | outputs_path = 'path/to/outputs',
33 | level = 'subfolder',
34 | ind_dict = {'i1' : 'transactions_per_hour.csv',
35 |                  'i3' : 'unique_subscribers_per_day.csv',
36 | 'i5' : 'origin_destination_connection_matrix_per_day.csv'},
37 | prefix = 'your_prefix_',
38 |                  col_names_dict = {
39 | 'i1': {'Time':'hour',
40 | 'Geography':'region',
41 | 'Count':'count'},
42 | 'i3': {'Time':'day',
43 | 'Geography':'region',
44 | 'Count':'count'},
45 | 'i5': {'Time':'connection_date',
46 | 'Geography_from':'region_from',
47 | 'Geography_to':'region_to',
48 | 'Count':'total_count'} })
49 | ```
50 |
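51 | ## Example: completeness check
52 |
53 | For reference, the completeness check (check 1 above) boils down to verifying that every admin area appears in every time slot. A minimal standalone sketch of that idea (not the `checker` implementation itself), assuming an i1-style file with the `hour`, `region` and `count` columns listed above:
54 |
55 | ```python
56 | import pandas as pd
57 |
58 | i1 = pd.read_csv('path/to/indicators/transactions_per_hour.csv')
59 |
60 | # All (hour, region) combinations that should exist if the data were complete
61 | expected = pd.MultiIndex.from_product(
62 |     [i1['hour'].unique(), i1['region'].unique()],
63 |     names=['hour', 'region'])
64 |
65 | # Combinations actually present in the file
66 | observed = pd.MultiIndex.from_frame(i1[['hour', 'region']].drop_duplicates())
67 |
68 | missing = expected.difference(observed)
69 | print('{} missing hour/region combinations'.format(len(missing)))
70 | ```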
--------------------------------------------------------------------------------
/dashboard-dataviz/figures/i3_figures.R:
--------------------------------------------------------------------------------
1 | # i3 Figures
2 |
3 | unit <- "districts"
4 |
5 | # Load Data --------------------------------------------------------------------
6 | if(unit %in% "wards"){
7 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH
8 | }
9 |
10 | if(unit %in% "districts"){
11 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH
12 | }
13 |
14 | data <- readRDS(file.path(CLEAN_DATA_PATH, "i3_daily.Rds"))
15 |
16 | data <- data %>%
17 | group_by(region) %>%
18 | mutate(value_pre = mean(value[date < "2020-03-30"]),
19 | value_post = mean(value[date > "2020-03-30"])) %>%
20 | ungroup() %>%
21 | mutate(value_change = value_post - value_pre) %>%
22 | mutate(value_change_rank = rank(value_change))
23 |
24 | data$value_change_rank[is.na(data$value_change)] <- NA
25 |
26 | # Figures ----------------------------------------------------------------------
27 | rank_high <- data$value_change_rank %>% unique() %>% sort() %>% head(5)
28 |
29 | p_high <- data %>%
30 | dplyr::filter(value_change_rank %in% rank_high) %>%
31 | ggplot(aes(x = date, y = value)) +
32 | geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) +
33 | geom_line() +
34 | labs(x = "",
35 | y = "Number of Subscribers",
36 | title = "Largest Decreases") +
37 | facet_wrap(~name,
38 | scales = "free_y",
39 | nrow = 1) +
40 | theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
41 | strip.text.x = element_text(face = "bold"))
42 | p_high
43 |
44 | rank_low <- data$value_change_rank %>% unique() %>% sort() %>% tail(5)
45 |
46 | p_low <- data %>%
47 | dplyr::filter(value_change_rank %in% rank_low) %>%
48 | ggplot(aes(x = date, y = value)) +
49 | geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) +
50 | geom_line() +
51 | labs(x = "",
52 | y = "",
53 | title = "Largest Increases") +
54 | facet_wrap(~name,
55 | scales = "free_y",
56 | nrow = 1) +
57 | theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
58 | strip.text.x = element_text(face = "bold"))
59 |
60 | p_all <- ggarrange(p_high, p_low, nrow = 2)
61 | ggsave(p_all, filename = file.path(figures_path,
62 | paste0(unit, "_subsc_top_chng.png")),
63 | height = 5, width=12)
64 |
65 |
66 |
67 |
--------------------------------------------------------------------------------
/data-panel/01_construct.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------#
2 | # CREATE PANEL
3 | #-----------------------------------------------------------------#
4 |
5 | # This code creates panel datasets combining different versions of
6 | # indicator files.
7 |
8 | from utils import *
9 | from panel_constructor import *
10 |
11 | #-----------------------------------------------------------------#
12 | # Settings
13 |
14 | EXPORT = False
15 |
16 | #-------------------#
17 | # Indicator dataframe
18 |
19 | # Load list of indicators to make it easier to bulk load files
20 | indicators_df = pd.read_csv('path/to/indicators_list.csv')
21 |
22 |
23 | #-------------------#
24 | # Set default values: indicator number -> admin levels to construct
25 | levels_dict = { 1: [3],
26 | 2: [3],
27 | 3: [2,3],
28 | 4: ['country'],
29 | 5: [2,3],
30 | 6: [3],
31 | 7: [2,3],
32 | 8: [2,3],
33 | 9: [2,3],
34 | 10: [2,3],
35 | 11: [2,3]}
36 |
37 |
38 | #-----------------------------------------------------------------#
39 | # Load indicators and create comparison "dirty" panel
40 |
41 | indicators = panel_constructor(levels_dict, indicators_df)
42 |
43 | # Create class instance
44 | # If no levels dictionary is provided, it will use the default, which is all of them!
45 | # indicators = panel_constructor()
46 |
47 | # Run panel creation
48 | indicators.dirty_panel()
49 |
50 | #-----------------------------------------------------------------#
51 | # Load usage outliers file
52 |
53 | # This file is created in data-checks
54 | i1_ag_df_tower_down = pd.read_csv("/path/to/usage-outliers/file")
55 |
56 | #-----------------------------------------------------------------#
57 | # Export comparison panel
58 |
59 | if EXPORT:
60 | indicators.export('/export/path/')
61 |
62 | #-----------------------------------------------------------------#
63 | # Create clean panel
64 |
65 | # This replaces the old panel attribute with the clean version, with
66 | # standardized column names
67 |
68 | indicators.clean_panel(i1_ag_df_tower_down)
69 |
70 | #-----------------------------------------------------------------#
71 |
72 |
73 | indicators.add_other_provider(mno_path = "/path/to/other/mno/indicator/folder",
74 | mno_suffix = '_mno')
75 |
76 |
77 | #-----------------------------------------------------------------#
78 | # Export
79 | if EXPORT:
80 | indicators.export('/export/path/')
81 |
--------------------------------------------------------------------------------
/dashboard-dataviz/figures/i5_net_figures.R:
--------------------------------------------------------------------------------
1 | # i5 Net Movement Figures
2 |
3 | unit <- "wards"
4 |
5 | # Load Data --------------------------------------------------------------------
6 | if(unit %in% "wards"){
7 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH
8 | }
9 |
10 | if(unit %in% "districts"){
11 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH
12 | }
13 |
14 | data <- readRDS(file.path(CLEAN_DATA_PATH, "i5_net_daily.Rds"))
15 |
16 | data <- data %>%
17 | group_by(region) %>%
18 | mutate(value_pre = mean(value[date < "2020-03-30"], na.rm = T),
19 | value_post = mean(value[date > "2020-03-30"], na.rm = T)) %>%
20 | ungroup() %>%
21 | mutate(value_change = value_post - value_pre) %>%
22 | mutate(value_change_rank = rank(value_change))
23 |
24 | data$value_change_rank[is.na(data$value_change)] <- NA
25 |
26 | data <- data[!is.na(data$date),]
27 | data$date <- data$date %>% as.Date()
28 |
29 | # Figures ----------------------------------------------------------------------
30 | rank_high <- data$value_change_rank %>% unique() %>% sort() %>% head(5)
31 |
32 | p_high <- data %>%
33 | dplyr::filter(value_change_rank %in% rank_high) %>%
34 | ggplot(aes(x = date, y = value)) +
35 | geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) +
36 | geom_line() +
37 | labs(x = "",
38 | y = "Number of Subscribers",
39 | title = "Largest Decreases") +
40 | facet_wrap(~name,
41 | scales = "free_y",
42 | nrow = 1) +
43 | theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
44 | strip.text.x = element_text(face = "bold"))
45 | p_high
46 |
47 | rank_low <- data$value_change_rank %>% unique() %>% sort() %>% tail(5)
48 |
49 | p_low <- data %>%
50 | dplyr::filter(value_change_rank %in% rank_low) %>%
51 | ggplot(aes(x = date, y = value)) +
52 | geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) +
53 | geom_line() +
54 | labs(x = "",
55 | y = "",
56 | title = "Largest Increases") +
57 | facet_wrap(~name,
58 | scales = "free_y",
59 | nrow = 1) +
60 | theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
61 | strip.text.x = element_text(face = "bold"))
62 |
63 | p_all <- ggarrange(p_high, p_low, nrow = 2)
64 | ggsave(p_all, filename = file.path(figures_path,
65 | paste0(unit, "_netmovement_top_chng.png")),
66 | height = 5, width=12)
67 |
68 |
69 | data$value[data$date < "2020-03-30"] %>% log() %>% hist()
70 |
--------------------------------------------------------------------------------
/data-checks/Archive/Descr-exploratory/i5-plot.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import pandas as pd
4 | import numpy as np
5 | import datetime as dt
6 | import time
7 |
8 | from bokeh.plotting import figure, output_file, show
9 | from bokeh.models import Span
10 | from bokeh.io import export_png
11 |
12 | #-----------------------------------------------------------------#
13 | # Folder structure
14 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/"
15 | DATA_POC = DATA_path + "proof-of-concept/"
16 | DATA_Panel = DATA_POC + "panel_indicators/"
17 | OUT_path = DATA_POC + "outputs/"
18 |
19 |
20 | #-----------------------------------------------------------------#
21 | # Load data
22 |
23 | i5 = pd.read_csv(DATA_Panel + 'i5_admin2.csv')
24 |
25 |
26 | #-----------------------------------------------------------------#
27 | # Process data
28 | i5 = i5[['connection_date', 'region_from', 'region_to', 'od_count_p', 'subscriber_count_p', 'total_count_p']]
29 |
30 | i5['date'] = pd.to_datetime(i5['connection_date']).dt.date
31 | i5['month'] = pd.to_datetime(i5['connection_date']).dt.month
32 |
33 |
34 | i5_agg = i5\
35 | .groupby('date')\
36 | .agg({'region_from' : pd.Series.nunique ,
37 | 'region_to' : pd.Series.nunique,
38 | 'subscriber_count_p' : np.sum,
39 | 'total_count_p' : np.sum})\
40 | .reset_index()\
41 | .sort_values('date')
42 |
43 | i5_agg_month = i5\
44 | .groupby('month')\
45 | .agg({'subscriber_count_p' : np.sum,
46 | 'total_count_p' : np.sum})\
47 | .reset_index()\
48 | .sort_values('month')
49 |
50 | #-----------------------------------------------------------------#
51 | # Plot
52 |
53 | p = figure(title="Total Daily Movement Between Districts on a Given Day",
54 | plot_width=800,
55 | plot_height=500,
56 | x_axis_type='datetime')
57 | p.circle(i5_agg['date'],
58 | i5_agg['subscriber_count_p'])
59 |
60 | # Add lockdown dates vertical line
61 |
62 | vline1 = Span(location= dt.date(2020, 3, 27),
63 | dimension='height',
64 | line_color='black',
65 | line_dash='dashed')
66 | vline2 = Span(location= dt.date(2020, 3, 30),
67 | dimension='height',
68 | line_color='black',
69 | line_dash='dashed')
70 |
71 | p.renderers.extend([vline1, vline2])
72 |
73 | # Additional formatting
74 | p.left[0].formatter.use_scientific = False
75 | p.toolbar.logo = None
76 | p.toolbar_location = None
77 | p.xaxis.axis_label = "Date"
78 | p.yaxis.axis_label = "Movement Day"
79 | p.title.text_font_size = '15pt'
80 | p.xaxis.axis_label_text_font_size = "12pt"
81 | p.yaxis.axis_label_text_font_size = "12pt"
82 | p.yaxis.major_label_text_font_size = "10pt"
83 | p.xaxis.major_label_text_font_size = "10pt"
84 |
85 | # Display plot
86 | show(p)
87 |
88 | # Export
89 | export_png(p,
90 | filename= OUT_path + "all_movement.png")
91 |
--------------------------------------------------------------------------------
/data-panel/Archive/usage_outliers.py:
--------------------------------------------------------------------------------
1 |
2 | #-----------------------------------------------------------------#
3 | # Settings
4 |
5 | import os
6 | import re
7 | import copy
8 | import pandas as pd
9 | import numpy as np
10 | import datetime as dt
11 |
12 |
13 | EXPORT = True
14 |
15 | # Number of hours below average, used as a threshold to
16 | # define a tower down
17 | htrahshold = -3
18 |
19 |
20 | #-----------------------------------------------------------------#
21 | # Process data
22 | i1p = copy.deepcopy(i1.panel)
23 |
24 | i1p['date'] = pd.to_datetime(i1p['hour']).dt.date
25 | i1p['hour_int'] = pd.to_datetime(i1p['hour']).dt.hour
26 |
27 |
28 | # Number of observations per ward that is total number of hours
29 | i1freq = i1p.groupby('region').size()
30 |
31 | i1freq = i1freq.reset_index()
32 | i1freq.columns = ['region', 'freq']
33 |
34 | # Select wards with less than 12h on average
35 | i1_low_total_hours = i1freq[i1freq['freq'] < (12*i1p.date.nunique())]
36 |
37 | i1_low_total_hours = i1_low_total_hours\
38 | .rename(columns = {'freq' : 'total_hours'})
39 | # # Proportion of wards with at least one tower down
40 | # freq[freq < 1392].count()/len(set(i1['region']))
41 |
42 | # # Proportion of wards with very
43 | # freq[freq < 700].count()
44 | # freq[freq < 700].count()/len(set(i1['region']))
45 |
46 | # Export
47 | if(EXPORT):
48 | (i1_low_total_hours
49 | .to_csv(OUT_hfcs + 'wards_with_low_hours_I1.csv',
50 | index = False) )
51 |
52 | #-----------------------------------------------------------------#
53 | # USAGE OUTLIERS: Identify wards and days with towers down
54 |
55 | # Number of hours with transactions per region day
56 | hours_per_day = i1p.groupby(['region', 'date']).size()
57 |
58 | hours_per_day = hours_per_day.reset_index() # get regions to be a column
59 | hours_per_day.columns = ['region', 'date', 'hcount']
60 |
61 |
62 | # Average hours per day per region
63 | avg_hours = (hours_per_day.groupby(['region'])
64 | .mean()
65 | .rename(columns={'hcount' :'avg_hours' }))
66 |
67 | # Create region day data set
68 | i1_ag_df = hours_per_day.merge(avg_hours,
69 | on = 'region')
70 |
71 | # Difference from average usage per hour
72 | i1_ag_df['h_diff'] = i1_ag_df['hcount'] - i1_ag_df['avg_hours']
73 |
74 | # Create data only with pairs of wards and days potential
75 | # towers down
76 | i1_ag_df_tower_down = i1_ag_df[i1_ag_df['h_diff'] < htrahshold]
77 |
78 | # Read me text
79 | readme_text = "This file contains a combinations of wards and days that are assumed to have a tower down."
80 | readme_text += "If a day has " + str(abs(htrahshold))
81 | readme_text += " hours with any calls below the daily avergage for that ward,"
82 | readme_text += " it is considered to have a trower down at some point that day."
83 |
84 | # Export
85 | if(EXPORT):
86 | (i1_ag_df_tower_down
87 | .to_csv(OUT_hfcs + 'days_wards_with_low_hours_I1_panel.csv',
88 | index = False) )
89 | # Read me file
90 | file = open(OUT_hfcs + "days_wards_with_low_hours_I1_README.txt", "w")
91 | file.write(readme_text)
92 | file.close()
93 |
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/02_clean_telecom_data/clean_i3_subscribers_data.R:
--------------------------------------------------------------------------------
1 | # Clean i3 for Dashboard
2 |
3 | unit <- "adm2"
4 | for(unit in c("adm2", "adm3")){
5 |
6 | # Load Data / Set Paths ------------------------------------------------------
7 | df_day <- read.csv(file.path(RAW_INDICATORS, paste0("indicator_03_",unit,"_day_result.csv")),
8 | stringsAsFactors=F)
9 | admin_sp <- readRDS(file.path(GEO_PATH, paste0(unit, ".Rds")))
10 |
11 | if(unit %in% "adm2"){
12 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH
13 | }
14 | if(unit %in% "adm3"){
15 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH
16 | }
17 |
18 | # Daily ----------------------------------------------------------------------
19 | df_day_clean <- df_day %>%
20 |
21 | tp_standardize_vars("pdate", unit, "totalimei") %>%
22 |
23 |   # Clean dataset
24 | tp_clean_date() %>%
25 | tp_fill_regions(admin_sp) %>%
26 | tp_complete_date_region() %>%
27 | tp_add_polygon_data(admin_sp) %>%
28 |
29 | # Interpolate/Clean Values
30 | tp_interpolate_outliers(NAs_as_zero = T, outlier_sd=3) %>%
31 | tp_replace_zeros(NAs_as_zero = T) %>%
32 | tp_less15_NA() %>%
33 |
34 | # Percent change
35 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, "i3_daily_base.csv"),
36 | baseline_date = BASELINE_DATE) %>%
37 | tp_add_percent_change() %>%
38 |
39 | # Add labels
40 | tp_add_label_level(timeunit = "day", OD = F) %>%
41 | tp_add_label_baseline(timeunit = "day", OD = F) %>%
42 |
43 | # Add density
44 | mutate(density = value / area)
45 |
46 | ## Export
47 | saveRDS(df_day_clean, file.path(CLEAN_DATA_PATH, "i3_daily.Rds"))
48 | write.csv(df_day_clean, file.path(CLEAN_DATA_PATH, "i3_daily.csv"), row.names=F)
49 |
50 | # Weekly ---------------------------------------------------------------------
51 | print("week")
52 |
53 | df_week_clean <- df_day_clean %>%
54 |
55 | tp_standardize_vars("date", "region", "value") %>%
56 |
57 |   # Clean dataset
58 | tp_clean_week() %>%
59 | tp_agg_day_to_week(fun = "mean") %>%
60 | tp_complete_date_region() %>%
61 | tp_add_polygon_data(admin_sp) %>%
62 |
63 | # Interpolate/Clean Values
64 | tp_interpolate_outliers(NAs_as_zero = T) %>%
65 | tp_replace_zeros(NAs_as_zero = T) %>%
66 | tp_less15_NA() %>%
67 |
68 | # Percent change
69 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, "i3_weekly_base.csv"),
70 | type = "weekly",
71 | baseline_date = BASELINE_DATE) %>%
72 | tp_add_percent_change() %>%
73 |
74 | # Add labels
75 | tp_add_label_level(timeunit = "week", OD = F) %>%
76 | tp_add_label_baseline(timeunit = "week", OD = F) %>%
77 |
78 | # Add density
79 | mutate(density = value / area)
80 |
81 | ## Export
82 | saveRDS(df_week_clean, file.path(CLEAN_DATA_PATH, "i3_weekly.Rds"))
83 | write.csv(df_week_clean, file.path(CLEAN_DATA_PATH, "i3_weekly.csv"), row.names=F)
84 |
85 | }
86 |
87 |
88 |
--------------------------------------------------------------------------------
/data-panel/utils.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------#
2 | # Panel utils
3 | #-----------------------------------------------------------------#
4 |
5 | import os
6 | import re
7 | import copy
8 | import pandas as pd
9 | import numpy as np
10 | import datetime as dt
11 |
12 | #-----------------------------------------------------------------#
13 | # General functions
14 |
15 | def clean(data, index_cols):
16 | na_list = [np.nan, '', '99999', 99999, float("inf")]
17 | data = data[~data[index_cols].isin(na_list).any(axis ='columns')]
18 | return(data)
19 |
20 | #-----------------------------------------------------------------#
21 | # Clean panel function
22 |
23 | # Remove low usage outliers assuming these are towers down and
24 | # trims columns
25 |
26 | def clean_columns(indicator, timevar):
27 | # Remove comparison columns
28 | keepcols = copy.deepcopy(indicator.index_cols)
29 | keepcols.extend(indicator.panel.filter(like='_p', axis=1).columns.to_list())
30 | new_df = indicator.panel[keepcols]
31 | # Rename columns
32 |     new_df.columns = new_df.columns.str.replace(r'_p$', '', regex=True)  # drop the '_p' suffix
33 | # Create time variables
34 | new_df['date'] = pd.to_datetime(new_df[timevar]).dt.date
35 | return new_df
36 |
37 | def remove_towers_down(df, region_vars, outliers_df):
38 | # Process outliers file
39 | # outliers_df = copy.deepcopy(i1_ag_df_tower_down) # created in usage_outliers.py
40 | outliers_df = outliers_df\
41 | .drop(['hcount', 'avg_hours', 'h_diff'], axis = 1)\
42 | .rename(columns = {'region':'region_right'})
43 | outliers_df['flag'] = 1
44 | # Merge outliers
45 | if len(region_vars) == 1:
46 | new_df = df\
47 | .merge(outliers_df,
48 | left_on = ['date', region_vars[0]],
49 | right_on = ['date', 'region_right'],
50 | how = 'outer')\
51 | .drop(['region_right'], axis = 1)
52 | else:
53 | new_df = df\
54 | .merge(outliers_df,
55 | left_on = ['date', region_vars[0]],
56 | right_on = ['date', 'region_right'],
57 | how = 'outer')\
58 | .drop(['region_right'], axis = 1)\
59 | .merge(outliers_df,
60 | left_on = ['date', region_vars[1]],
61 | right_on = ['date', 'region_right'],
62 | how = 'outer')\
63 | .drop(['region_right'], axis = 1)
64 |         # Flag if either the origin or destination region is an outlier
65 |         new_df['flag'] = ((new_df['flag_x'] == 1) | (new_df['flag_y'] == 1)).astype(int)
66 |         new_df = new_df.drop(['flag_x', 'flag_y'], axis = 1)
67 |     # Drop outlier rows and helper columns
68 | new_df = new_df[~(new_df['flag'] == 1)].drop(['flag'], axis = 1)
69 | return new_df
70 |
71 | def clean_pipeline(indicator, timevar, region_vars, outliers_df):
72 | return remove_towers_down(
73 | clean_columns(indicator,
74 | timevar = timevar),
75 | region_vars = region_vars,
76 | outliers_df = outliers_df)
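77 |
78 | # Example usage (illustrative; names follow 01_construct.py and the i5 column convention):
79 | #   i5_clean = clean_pipeline(i5, timevar = 'connection_date',
80 | #                             region_vars = ['region_from', 'region_to'],
81 | #                             outliers_df = i1_ag_df_tower_down)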
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/02_clean_telecom_data/clean_i5_net_movement_data.R:
--------------------------------------------------------------------------------
1 | # Clean i5 Net Movement Data
2 |
3 | # Depends on: clean_i5_movement_inout_data.R
4 |
5 | unit <- "adm2"
6 | timeunit <- "daily"
7 | for(unit in c("adm2", "adm3")){
8 | for(timeunit in c("daily", "weekly")){
9 |
10 | print(paste(unit, timeunit, "--------------------------------------------"))
11 |
12 | # Set parameters -------------------------------------------------------------
13 | admin_sp <- readRDS(file.path(GEO_PATH, paste0(unit, ".Rds")))
14 |
15 | if(unit %in% "adm2"){
16 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH
17 | }
18 |
19 | if(unit %in% "adm3"){
20 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH
21 | }
22 |
23 | # Clean ----------------------------------------------------------------------
24 | df <- readRDS(file.path(CLEAN_DATA_PATH,
25 | paste0("i5_",
26 | timeunit,
27 | ".Rds"))) %>%
28 | as.data.table()
29 |
30 | ## Aggregate Origin
31 | df_orign <- df[, .(value = sum(value, na.rm=T)),
32 | by = list(region_origin, date)]
33 |
34 | names(df_orign)[names(df_orign) %in% "region_origin"] <- "region"
35 | names(df_orign)[names(df_orign) %in% "value"] <- "value_origin"
36 |
37 | ## Aggregate Destination
38 | df_dest <- df[, .(value = sum(value, na.rm=T)),
39 | by = list(region_dest, date)]
40 |
41 | names(df_dest)[names(df_dest) %in% "region_dest"] <- "region"
42 | names(df_dest)[names(df_dest) %in% "value"] <- "value_dest"
43 |
44 | ## Merge
45 | df_day_clean <- merge(df_orign, df_dest, by=c("region", "date")) %>%
46 | as.data.frame()
47 |
48 | ## Prep data
49 | df_day_clean <- df_day_clean %>%
50 |
51 | dplyr::mutate(value = value_dest - value_origin) %>%
52 |
53 | tp_standardize_vars("date", "region", "value") %>%
54 |
55 | # Clean Data
56 | tp_fill_regions(admin_sp) %>%
57 | tp_complete_date_region() %>%
58 | tp_add_polygon_data(admin_sp) %>%
59 |
60 | # Percent change
61 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH,
62 | paste0("i5_net_",timeunit,"_base.csv")),
63 | type = timeunit) %>%
64 | tp_add_percent_change() %>%
65 |
66 | # Add labels
67 | tp_add_label_level(timeunit = timeunit, OD = F) %>%
68 | tp_add_label_baseline(timeunit = timeunit, OD = F)
69 |
70 |
71 | ## Export
72 | saveRDS(df_day_clean, file.path(CLEAN_DATA_PATH,
73 | paste0("i5_net_",
74 | timeunit,
75 | ".Rds")))
76 |
77 | write.csv(df_day_clean, file.path(CLEAN_DATA_PATH,
78 | paste0("i5_net_",
79 | timeunit,
80 | ".csv")),
81 | row.names=F)
82 |
83 |
84 |
85 | }
86 | }
87 |
88 |
89 |
--------------------------------------------------------------------------------
/data-panel/Archive/02_clean.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------#
2 | # PANEL CLEAN
3 | #-----------------------------------------------------------------#
4 |
5 | #-----------------------------------------------------------------#
6 | # Settings
7 |
8 | import pandas as pd
9 |
10 | EXPORT = False
11 |
12 | # Number of hours below avg, used as a threshold to
13 | # define a tower down
14 | htrahshold = -3
15 |
16 | #-----------------------------------------------------------------#
17 | # Import data
18 |
19 | i1 = pd.read_csv( DATA_panel + 'i1_admin3.csv')
20 |
21 | #-----------------------------------------------------------------#
22 | # Process data
23 |
24 | i1['date'] = pd.to_datetime(i1['hour']).dt.date
25 | i1['hour_int'] = pd.to_datetime(i1['hour']).dt.hour
26 |
27 |
28 |
29 | #-----------------------------------------------------------------#
30 | # USAGE OUTLIERS: Wards with very little data
31 |
32 | # Number of observations per ward that is total number of hours
33 | i1freq = i1.groupby('region').size()
34 |
35 | i1freq = i1freq.reset_index()
36 | i1freq.columns = ['region', 'freq']
37 |
38 | # Select wards with less than 12h on average
39 | i1_low_total_hours = i1freq[i1freq['freq'] < (12*i1.date.nunique())]
40 |
41 | i1_low_total_hours = i1_low_total_hours\
42 | .rename(columns = {'freq' : 'total_hours'})
43 | # # Proportion of wards with at least one tower down
44 | # freq[freq < 1392].count()/len(set(i1['region']))
45 |
46 | # # Proportion of wards with very
47 | # freq[freq < 700].count()
48 | # freq[freq < 700].count()/len(set(i1['region']))
49 |
50 | # Export
51 | if(EXPORT):
52 | (i1_low_total_hours
53 | .to_csv(OUT_hfcs + 'wards_with_low_hours_I1.csv',
54 | index = False) )
55 |
56 | #-----------------------------------------------------------------#
57 | # USAGE OUTLIERS: Indicator wards and days with towers down
58 |
59 | # Number of hours with transactions per region day
60 | hours_per_day = i1.groupby(['region', 'date']).size()
61 |
62 | hours_per_day = hours_per_day.reset_index() # get regions to be a column
63 | hours_per_day.columns = ['region', 'date', 'hcount']
64 |
65 |
66 | # Average hours per day per region
67 | avg_hours = (hours_per_day.groupby(['region'])
68 | .mean()
69 | .rename(columns={'hcount' :'avg_hours' }))
70 |
71 | # Create region day data set
72 | i1_ag_df = hours_per_day.merge(avg_hours,
73 | on = 'region')
74 |
75 | # Difference from average usage per hour
76 | i1_ag_df['h_diff'] = i1_ag_df['hcount'] - i1_ag_df['avg_hours']
77 |
78 | # Create data only with pairs of wards and days potential
79 | # towers down
80 | i1_ag_df_tower_down = i1_ag_df[i1_ag_df['h_diff'] < htrahshold]
81 |
82 | # Read me text
83 | readme_text = "This file contains combinations of wards and days that are assumed to have a tower down. "
84 | readme_text += "If a day has " + str(abs(htrahshold))
85 | readme_text += " hours with any calls below the daily average for that ward,"
86 | readme_text += " it is considered to have a tower down at some point that day."
87 |
88 | # Export
89 | if(EXPORT):
90 | (i1_ag_df_tower_down
91 | .to_csv(OUT_hfcs + 'days_wards_with_low_hours_I1_panel.csv',
92 | index = False) )
93 | # Read me file
94 | file = open(OUT_hfcs + "days_wards_with_low_hours_I1_README.txt", "w")
95 | file.write(readme_text)
96 | file.close()
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ########################################################################
2 | #
3 | # Based on DIME .gitignore template. Follow the instructions in the URL
4 | # below to set up this template in your own repository
5 | # https://github.com/worldbank/dime-github-trainings/tree/master/GitHub-resources/DIME-GitHub-Templates
6 | #
7 | # Note that if you are using GitKraken, you need to use version 5.x or more
8 | # recent for this template to work properly
9 | #
10 | ########################################################################
11 |
12 | #######################
13 | # Start by ignoring everything, and below we are explicitly saying
14 | # what to not ignore
15 | *
16 |
17 | #######################
18 | # List of files with GitHub functionality anywhere in the repo
19 | # that we do not want to ignore
20 |
21 | # These files include GitHub settings
22 | !.gitignore
23 | !.gitattributes
24 |
25 | # Keep markdown files used for documentation on GitHub
26 | !README.md
27 | !CONTRIBUTING.md
28 | !LICENSE*
29 |
30 | #######################
31 | # For performance reasons, if a folder is already ignored, then
32 | # Git does not check the content of that folder for matches
33 | # with additional rules. The line below includes folders in the
34 | # top folder (but not their content), so that anything matching
35 | # the rules below will still not be ignored.
36 | !*/
37 |
38 | #######################
39 | # The following file types are code that should always be
40 | # included no matter where in the repository folder they are
41 | # located unless you explicitly ignore that folder
42 |
43 | # Stata
44 | !/**/*.do
45 | !/**/*.ado
46 |
47 | # R
48 | !/**/*.R
49 | !/**/*.Rmd
50 |
51 | # LaTeX
52 | !/**/*.tex
53 | !/**/*.bib
54 |
55 | # Python
56 | !/**/*.py
57 | !/**/*.ipynb
58 | # Still ignore .ipynb files in checkpoint folders
59 | .ipynb_checkpoints
60 |
61 | # Matlab
62 | !/**/*.m
63 |
64 | # Markdown
65 | !/**/*.md
66 |
67 | # Julia
68 | !/**/*.jl
69 |
70 | # CSS
71 | !/**/*.css
72 |
73 | # Docker
74 | !/**/*.yml
75 | !/**/docker/*
76 |
77 | #######################
78 | # Include some additional file formats in any output folder. You might have
79 | # to change the name of the Output folder to whatever it is called in your
80 | # project, but we strongly recommend that you only include these files in
81 | # a subset of the folders where you are certain no private data is ever stored.
82 | !/**/Output/**/*.txt
83 | !/**/Output/**/*.csv
84 | !/**/Output/**/*.xml
85 | !/**/Output/**/*.eps
86 | !/**/Output/**/*.svg
87 |
88 | #######################
89 | # Include all the files with passwords or tokens here. All files named
90 | # password or passwords are ignored with this template, no matter which
91 | # format you are using. Additionally, all content in any folder called
92 | # password or passwords is also ignored. NOTE that your project might be
93 | # using different names, in which case you must edit the lines below accordingly.
94 | password.*
95 | passwords.*
96 | password/
97 | passwords/
98 |
99 | generate_password.R
100 | generate_password*
101 |
102 |
103 |
104 | #######################
105 | # Explicitly exclude the data methods and data source descriptions, which
106 | # should be kept private. These are already excluded by the lines above;
107 | # they are listed here in case those rules change.
108 | data_methods.txt
109 | data_source_description.txt
110 | /**/notebooks/ignored_scripts*
111 | /**/config_file.py
112 | /**/**/.DS_Store
113 |
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/modules/flowminder_aggregator.py:
--------------------------------------------------------------------------------
1 | import os
2 | if os.environ['HOME'] != '/root':
3 | from modules.DataSource import *
4 | from modules.sql_code_aggregates import *
5 | from modules.aggregator import *
6 | databricks = False
7 | else:
8 | databricks = True
9 |
10 | # Databricks notebook source
11 | class flowminder_aggregator(aggregator):
12 | """Class to handle sql aggregations of flowminder code.
13 | For the original sql code from flowminder see https://github.com/Flowminder/COVID-19
14 |
15 | Attributes
16 | ----------
17 | result_stub : a string. File path where to save results
18 | datasource : an instance of DataSource class. Holds all dataframes and paths required
19 | regions : a pyspark dataframe. Admin level this aggregator will be used for
20 | intermediate_tables : a list. Names of tables that we don't want written to csv
21 | calls : a pyspark dataframe. cdr data
22 | cells : a pyspark dataframe. admin region to tower mapping
23 | spark : an initialised spark connection. spark connection this aggregator should use
24 | dates : a dictionary. dates the aggregator should run over
25 | sql_code : a string. the flowminder sql code to be used
26 |
27 |
28 | Methods
29 | -------
30 | run_and_save_all()
31 | runs run_and_save on the list of all flowminder queries at once
32 |
33 | run_save_and_rename_all()
34 | runs run_and_save_all and then renames the csv files created and
35 | moves them to their parent folder
36 |
37 | attempt_aggregation(indicators_to_produce = 'all')
38 | - attempts aggregation of all flowminder indicators
39 | - tries multiple times (this is relevant for databricks env,
40 | but should be dropped going forward and replaced by a more
41 | solid handling of databricks timeouts)
42 |
43 |
44 | """
45 |
46 | def __init__(self,
47 | result_stub,
48 | datasource,
49 | regions,
50 | intermediate_tables = ['home_locations']):
51 | """
52 | Parameters
53 | ----------
54 | result_stub : where to save results
55 | datasource : holds all dataframes and paths required
56 | regions : admin level this aggregator will be used for
57 | intermediate_tables : tables that we don't want written to csv
58 | """
59 | # initiate with parent init
60 | super().__init__(result_stub,datasource,regions)
61 |
62 | def run_and_save_all(self):
63 | for table_name in self.table_names:
64 | df = self.spark.sql(self.sql_code[table_name])
65 | self.save_and_report(df, table_name)
66 |
67 | def run_save_and_rename_all(self):
68 | self.run_and_save_all()
69 | self.rename_all_csvs()
70 |
71 |
72 | def attempt_aggregation(self, indicators_to_produce = 'all'):
73 | try:
74 | # all indicators
75 | if indicators_to_produce == 'all':
76 | self.run_save_and_rename_all()
77 |
78 | # single indicator
79 | else:
80 | for table in indicators_to_produce.keys():
81 | table_name = indicators_to_produce[table]
82 | print('--> Producing: ' + table_name)
83 | self.run_save_and_rename(table_name + '_per_' + indicators_to_produce[table_name])
84 | print('Indicators saved.')
85 |
86 | except Exception as e:
87 | print(e)
88 |
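89 |
90 | # Illustrative usage (a sketch only; `datasource` and `regions` are assumed to
91 | # have been created by the notebook setup, e.g. aggregation_master.py, and the
92 | # result path is a placeholder):
93 | #
94 | # agg = flowminder_aggregator(result_stub = '/path/to/results',
95 | #                             datasource = datasource,
96 | #                             regions = regions)
97 | # agg.attempt_aggregation(indicators_to_produce = 'all')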
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/02_clean_telecom_data/clean_i5_movement_inout_data.R:
--------------------------------------------------------------------------------
1 | # Clean i5 Data for Dashboard
2 |
3 | EXPORT <- T
4 |
5 | unit = "adm2"
6 | for(unit in c("adm2", "adm3")){
7 |
8 | # Load Data / Set Paths ------------------------------------------------------
9 | df_day <- read.csv(file.path(RAW_INDICATORS, paste0("indicator_05_",unit,"_day_result.csv")),
10 | stringsAsFactors=F)
11 | admin_sp <- readRDS(file.path(GEO_PATH, paste0(unit, ".Rds")))
12 |
13 | if(unit %in% "adm2"){
14 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH
15 | }
16 |
17 | if(unit %in% "adm3"){
18 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH
19 | }
20 |
21 | #### Remove small observations
22 | # Drop rows where the count is 15 or below. Doing this now removes some
23 | # region pairs: if an o-d pair has a value of 15 or below in every time
24 | # period, it is not considered here, which helps improve code speed both
25 | # here and in the script that prepares data for the dashboard.
26 | df_day <- df_day[df_day$totalOD > 15,]
27 |
28 | # Daily ----------------------------------------------------------------------
29 | #### Process data for dashboard
30 | df_day_clean <- df_day %>%
31 |
32 | tp_standardize_vars_od("pdate",
33 | unit,
34 | paste0("N_", unit),
35 | "totalOD") %>%
36 |
37 | # Clean dataset
38 | tp_clean_date() %>%
39 | tp_complete_date_region_od() %>%
40 | tp_add_polygon_data_od(admin_sp) %>%
41 |
42 | # Interpolate/Clean Values
43 | tp_interpolate_outliers(NAs_as_zero = F) %>%
44 | #tp_replace_zeros(NAs_as_zero = T) %>%
45 | tp_less15_NA() %>%
46 |
47 | # Percent change
48 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, "i5_daily_base.csv"),
49 | baseline_date = BASELINE_DATE) %>%
50 | tp_add_percent_change() %>%
51 |
52 | # Add labels
53 | tp_add_label_level(timeunit = "day", OD = T) %>%
54 | tp_add_label_baseline(timeunit = "day", OD = T)
55 |
56 | ## Export
57 | saveRDS(df_day_clean, file.path(CLEAN_DATA_PATH, "i5_daily.Rds"))
58 | write.csv(df_day_clean, file.path(CLEAN_DATA_PATH, "i5_daily.csv"), row.names=F)
59 |
60 | # Weekly ---------------------------------------------------------------------
61 | print("week")
62 |
63 | df_week_clean <- df_day_clean %>%
64 |
65 | dplyr::select(date, region_origin, region_dest, value) %>%
66 |
67 | tp_standardize_vars_od("date", "region_origin", "region_dest", "value") %>%
68 |
69 | # Clean dataset
70 | tp_clean_week() %>%
71 | tp_agg_day_to_week_od() %>%
72 | tp_complete_date_region_od() %>%
73 | tp_add_polygon_data_od(admin_sp) %>%
74 |
75 | # Interpolate/Clean Values
76 | #tp_interpolate_outliers(NAs_as_zero = F) %>%
77 | #tp_replace_zeros(NAs_as_zero = T) %>%
78 | tp_less15_NA() %>%
79 |
80 | # Percent change
81 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, "i5_weekly_base.csv"),
82 | type = "weekly",
83 | baseline_date = BASELINE_DATE) %>%
84 | tp_add_percent_change() %>%
85 |
86 | # Add labels
87 | tp_add_label_level(timeunit = "week", OD = T) %>%
88 | tp_add_label_baseline(timeunit = "week", OD = T)
89 |
90 | ## Export
91 | saveRDS(df_week_clean, file.path(CLEAN_DATA_PATH, "i5_weekly.Rds"))
92 | write.csv(df_week_clean, file.path(CLEAN_DATA_PATH, "i5_weekly.csv"), row.names=F)
93 |
94 | }
95 |
96 |
97 |
98 |
99 |
100 |
101 |
--------------------------------------------------------------------------------
/data-checks/Archive/quick_checks/check_subscribers.R:
--------------------------------------------------------------------------------
1 | # Check subscribers data
2 |
3 | FIG_PATH <- file.path(PROJECT_PATH, "proof-of-concept",
4 | "outputs", "data-checks", "figures_indicators", "subscribers_daily")
5 |
6 | # Load Data --------------------------------------------------------------------
7 | # FILE PATHS NEED TO BE UPDATED
8 | ISAAC_DATA_PATH_2 <- file.path(PROJECT_PATH, "Isaac-results", "Isaac_apr_may", "admin2_flowminder")
9 | ISAAC_DATA_PATH_3 <- file.path(PROJECT_PATH, "Isaac-results", "Isaac_apr_may", "admin3_flowminder")
10 |
11 | #### Raw Data
12 | df_day_adm2_raw <- read.csv(file.path(ISAAC_DATA_PATH_2,
13 | "count_unique_subscribers_per_region_per_day.csv"),
14 | stringsAsFactors=F) %>%
15 | dplyr::rename(value_raw = subscriber_count,
16 | date = visit_date) %>%
17 | dplyr::mutate(region = region %>% as.character(),
18 | date = date %>% as.Date())
19 |
20 | df_week_adm2_raw <- read.csv(file.path(ISAAC_DATA_PATH_2,
21 | "count_unique_subscribers_per_region_per_week.csv"),
22 | stringsAsFactors=F) %>%
23 | dplyr::rename(value_raw = subscriber_count,
24 | date = visit_week) %>%
25 | dplyr::mutate(region = region %>% as.character())
26 |
27 | df_day_adm3_raw <- read.csv(file.path(ISAAC_DATA_PATH_3,
28 | "count_unique_subscribers_per_region_per_day.csv"),
29 | stringsAsFactors=F) %>%
30 | dplyr::rename(value_raw = subscriber_count,
31 | date = visit_date) %>%
32 | dplyr::mutate(region = region %>% as.character(),
33 | date = date %>% as.Date())
34 |
35 | df_week_adm3_raw <- read.csv(file.path(ISAAC_DATA_PATH_3,
36 | "count_unique_subscribers_per_region_per_week.csv"),
37 | stringsAsFactors=F) %>%
38 | dplyr::rename(value_raw = subscriber_count,
39 | date = visit_week) %>%
40 | dplyr::mutate(region = region %>% as.character())
41 |
42 | #### Cleaned Data
43 | df_day_adm2 <- readRDS(file.path(CLEAN_DATA_ADM2_PATH,
44 | "count_unique_subscribers_per_region_per_day.Rds")) %>%
45 | left_join(df_day_adm2_raw, by=c("date", "region"))
46 |
47 | df_week_adm2 <- readRDS(file.path(CLEAN_DATA_ADM2_PATH,
48 | "count_unique_subscribers_per_region_per_week.Rds"))
49 |
50 | df_day_adm3 <- readRDS(file.path(CLEAN_DATA_ADM3_PATH,
51 | "count_unique_subscribers_per_region_per_day.Rds")) %>%
52 | left_join(df_day_adm3_raw, by=c("date", "region")) %>%
53 | mutate(value_raw = value_raw %>% as.numeric())
54 |
55 | df_week_adm3 <- readRDS(file.path(CLEAN_DATA_ADM3_PATH,
56 | "count_unique_subscribers_per_region_per_week.Rds"))
57 |
58 | # Trends Over Time -------------------------------------------------------------
59 | df_day_adm2 %>%
60 | group_by(date) %>%
61 | summarise(value = sum(value),
62 | value_raw = sum(value_raw)) %>%
63 | ggplot() +
64 | geom_line(aes(x=date, y=value), color="black") +
65 | geom_point(aes(x=date, y=value), color="black") +
66 | geom_vline(xintercept = as.Date("2020-03-27"), color="red")
67 |
68 | lapply(unique(df_day_adm3$province), function(province_i){
69 | print(province_i)
70 |
71 | p <- df_day_adm3 %>%
72 | filter(province %in% province_i) %>%
73 | ggplot(aes(x=date)) +
74 | geom_line(aes(y=value_raw), color="red", alpha=0.2, size=1.5) +
75 | geom_line(aes(y=value)) +
76 | facet_wrap(~region,
77 | scales = "free_y")
78 | ggsave(p, filename = file.path(FIG_PATH, paste0(province_i, ".png")), height = 25, width = 25)
79 |
80 | return(NULL)
81 | })
82 |
83 |
84 |
85 |
86 |
87 |
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/02_clean_telecom_data/clean_i7_distance_traveled.R:
--------------------------------------------------------------------------------
1 | # Clean i7 Distance Traveled Data
2 |
3 | unit = "adm2"
4 | metric = "avg_dist"
5 | for(unit in c("adm2", "adm3")){
6 | for(metric in c("avg_dist", "stddev")){
7 |
8 | print(paste(unit, metric, "---------------------------------------------"))
9 |
10 | # Load Data / Set Paths ------------------------------------------------------
11 | df_day <- read.csv(file.path(RAW_INDICATORS, paste0("indicator_07_home_",unit,"_day_result.csv")),
12 | stringsAsFactors=F)
13 | admin_sp <- readRDS(file.path(GEO_PATH, paste0(unit, ".Rds")))
14 |
15 | if(unit %in% "adm2"){
16 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH
17 |
18 | df_day <- clean_moz_names(df_day,
19 | name = "H_adm2",
20 | name_higher = "H_adm1",
21 | type = "adm2")
22 |
23 | }
24 | if(unit %in% "adm3"){
25 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH
26 |
27 | df_day <- clean_moz_names(df_day,
28 | name = "H_adm3",
29 | name_higher = "H_adm2",
30 | type = "adm3")
31 |
32 | }
33 |
34 | # Daily ----------------------------------------------------------------------
35 | print("day")
36 |
37 | df_day_clean <- df_day %>%
38 |
39 | tp_standardize_vars("pdate", paste0("H_", unit), metric) %>%
40 |
41 | # Clean dataset
42 | tp_clean_date() %>%
43 | tp_fill_regions(admin_sp) %>%
44 | tp_complete_date_region() %>%
45 | tp_add_polygon_data(admin_sp) %>%
46 |
47 | # Interpolate/Clean Values
48 | tp_interpolate_outliers(NAs_as_zero = T, outlier_replace="both") %>%
49 | tp_replace_zeros(NAs_as_zero = T) %>%
50 | tp_less15_NA(threshold = 0) %>%
51 |
52 | # Percent change
53 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, paste0("i7_",metric,"_daily_base.csv"))) %>%
54 | tp_add_percent_change() %>%
55 |
56 | # Add labels
57 | tp_add_label_level(timeunit = "day", OD = F) %>%
58 | tp_add_label_baseline(timeunit = "day", OD = F)
59 |
60 | ## Export
61 | saveRDS(df_day_clean, file.path(CLEAN_DATA_PATH, paste0("i7_daily_",metric,".Rds")))
62 | write.csv(df_day_clean, file.path(CLEAN_DATA_PATH, paste0("i7_daily_",metric,".csv")), row.names=F)
63 |
64 |
65 | # Weekly ---------------------------------------------------------------------
66 | print("week")
67 |
68 | df_week_clean <- df_day_clean %>%
69 |
70 | dplyr::select(date, region, value) %>%
71 |
72 | tp_standardize_vars("date", "region", "value") %>%
73 |
74 | # Clean dataset
75 | tp_clean_week() %>%
76 | tp_agg_day_to_week(fun="mean") %>%
77 | tp_fill_regions(admin_sp) %>%
78 | tp_complete_date_region() %>%
79 | tp_add_polygon_data(admin_sp) %>%
80 |
81 | # Interpolate/Clean Values
82 | #tp_interpolate_outliers(NAs_as_zero = T) %>%
83 | #tp_replace_zeros(NAs_as_zero = T) %>%
84 | #tp_less15_NA() %>%
85 |
86 | # Percent change
87 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, paste0("i7_",metric,"_weekly_base.csv")),
88 | type = "weekly") %>%
89 | tp_add_percent_change() %>%
90 |
91 | # Add labels
92 | tp_add_label_level(timeunit = "week", OD = F) %>%
93 | tp_add_label_baseline(timeunit = "week", OD = F)
94 |
95 |
96 | ## Export
97 | saveRDS(df_week_clean, file.path(CLEAN_DATA_PATH,
98 | paste0("i7_weekly_",metric,".Rds")))
99 | write.csv(df_week_clean, file.path(CLEAN_DATA_PATH,
100 | paste0("i7_weekly_",metric,".csv")),
101 | row.names=F)
102 |
103 |
104 | }
105 | }
106 |
107 |
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/_dash_master.R:
--------------------------------------------------------------------------------
1 | # Master R Script for Prepping Data for Dashboard
2 | # Mozambique
3 |
4 | #### Settings #### =============================================================
5 | options(rsconnect.max.bundle.files = 400000)
6 |
7 | CLEAN_SPATIAL_DATA <- F
8 | CLEAN_TELECOM_DATA <- F
9 | PREP_DATA_FOR_DASH <- T
10 |
11 | BASELINE_DATE <- "2020-03-31"
12 |
13 | #### Packages #### =============================================================
14 | library(tidyverse)
15 | library(sparkline)
16 | library(sf)
17 | library(sp)
18 | library(plotly)
19 | library(stargazer)
20 | library(knitr)
21 | library(gridExtra)
22 | library(leaflet)
23 | library(ggpubr)
24 | library(purrr)
25 | library(parallel)
26 | library(pbmcapply)
27 | library(rgeos)
28 | library(rgdal)
29 | library(sp)
30 | library(rmapshaper)
31 | library(raster)
32 | library(geosphere)
33 | library(lubridate)
34 | library(data.table)
35 | library(mapview)
36 | library(bcrypt)
37 |
38 | #### File paths #### ===========================================================
39 |
40 | # Define Root Paths ------------------------------------------------------------
41 | if(Sys.info()[["user"]] == "robmarty") PROJECT_PATH <- "~/Documents/World Bank/Sveta Milusheva - COVID 19 Results"
42 | if(Sys.info()[["user"]] == "wb519128") PROJECT_PATH <- "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results"
43 | if(Sys.info()[["user"]] == "WB521633") PROJECT_PATH <- "C:/Users/wb521633/WBG/Sveta Milusheva - COVID 19 Results"
44 |
45 | if(Sys.info()[["user"]] == "robmarty") GITHUB_PATH <- "~/Documents/Github/covid-mobile-dashboards"
46 | if(Sys.info()[["user"]] == "wb519128") GITHUB_PATH <- "C:/Users/wb519128/Github/covid-mobile-dashboards"
47 | if(Sys.info()[["user"]] == "WB521633") GITHUB_PATH <- "C:/Users/wb521633/Documents/Github/covid-mobile-dashboards"
48 |
49 | # Define Paths from Root -------------------------------------------------------
50 | GADM_PATH <- "PATH-HERE"
51 | GEO_PATH <- "PATH-HERE"
52 |
53 | CLEAN_DATA_ADM2_PATH <- "PATH-HERE"
54 | CLEAN_DATA_ADM3_PATH <- "PATH-HERE"
55 |
56 | DASHBOARD_DATA_ONEDRIVE_PATH <- "PATH-HERE"
57 | DASHBOARD_DATA_GITHUB_PATH <- "PATH-HERE"
58 |
59 | PREP_DATA_CODE_PATH <- "PATH-HERE"
60 |
61 | #### Functions #### ============================================================
62 | source(file.path(GITHUB_PATH, "dashboard-dataviz", "dashboards",
63 | "_tp_functions.R"))
64 |
65 | source(file.path(GITHUB_PATH, "dashboard-dataviz", "dashboards",
66 | "_prep_data_for_dash_functions.R"))
67 |
68 |
69 | #### Scripts #### ==============================================================
70 |
71 | # 1. Prepare Spatial Data ------------------------------------------------------
72 | if(CLEAN_SPATIAL_DATA){
73 | source(file.path(PREP_DATA_CODE_PATH, "01_clean_spatial_data", "download_gadm.R"))
74 | source(file.path(PREP_DATA_CODE_PATH, "01_clean_spatial_data", "clean_adm2_file.R"))
75 | source(file.path(PREP_DATA_CODE_PATH, "01_clean_spatial_data", "clean_adm3_file.R"))
76 | }
77 |
78 | # 2. Clean Telecom Data --------------------------------------------------------
79 | if(CLEAN_TELECOM_DATA){
80 | source(file.path(PREP_DATA_CODE_PATH, "02_clean_telecom_data", "clean_i3_subscribers_data.R"))
81 | source(file.path(PREP_DATA_CODE_PATH, "02_clean_telecom_data", "clean_i5_movement_inout_data.R"))
82 | source(file.path(PREP_DATA_CODE_PATH, "02_clean_telecom_data", "clean_i5_net_movement_data.R"))
83 | source(file.path(PREP_DATA_CODE_PATH, "02_clean_telecom_data", "clean_i7_distance_traveled.R"))
84 | }
85 |
86 | # 3. Prep Data for Dashboard ---------------------------------------------------
87 | if(PREP_DATA_FOR_DASH){
88 | source(file.path(PREP_DATA_CODE_PATH, "03_dashboard_data_prep", "prep_subs_obs_totals_data.R"))
89 | source(file.path(PREP_DATA_CODE_PATH, "03_dashboard_data_prep", "prep_telecom_agg_data.R"))
90 | source(file.path(PREP_DATA_CODE_PATH, "03_dashboard_data_prep", "data_to_github.R"))
91 | }
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
--------------------------------------------------------------------------------
/data-checks/Archive/usage_outliers.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------#
2 | # Outliers and towers down
3 | #-----------------------------------------------------------------#
4 |
5 | # This code depends on MASTER.py to run as file path objects are
6 | # defined there
7 |
8 |
9 | #-----------------------------------------------------------------#
10 | # TO DO:
11 |
12 | # Identify regions with very sparse use
13 | # 1. Count obs per region
14 | # 2. Count obs per region per day
15 |
16 | # Identify regions with normal use and big valleys of usage, that
17 | # would probably indicate a tower being down
18 |
19 | #-----------------------------------------------------------------#
20 | # Settings
21 |
22 | import pandas as pd
23 |
24 | EXPORT = False
25 | TEMP_PANEL = True
26 | # Number of hours below avg, used as a threshold to
27 | # define a tower down
28 | htrahshold = -3
29 |
30 | #-----------------------------------------------------------------#
31 | # Import data
32 |
33 | if TEMP_PANEL:
34 | i1 = pd.read_csv( OUT_hfcs + 'Sheet comp panel/i1_admin3.csv')
35 | else:
36 | i1 = pd.read_csv(I1_Adm3_path + "transactions_per_hour.csv")
37 |
38 | i1 = i1[i1.region != '99999']
39 | # Wards data
40 |
41 |
42 |
43 | # Hourly transactions per region
44 |
45 | # Unique subscribers per hour
46 | # i2a3 = pd.read_csv(I2_Adm3_path + "unique_subscribers_per_hour.csv")
47 | # i2t = pd.read_csv(I2_towercluster_path + "unique_subscribers_per_hour.csv")
48 |
49 |
50 | #-----------------------------------------------------------------#
51 | # Process data
52 |
53 | i1['date'] = pd.to_datetime(i1['hour']).dt.date
54 | i1['hour_int'] = pd.to_datetime(i1['hour']).dt.hour
55 |
56 |
57 | #-----------------------------------------------------------------#
58 | # Wards with very little data
59 |
60 | # Number of observations per ward that is total number of hours
61 | i1freq = i1.groupby('region').size()
62 |
63 | i1freq = i1freq.reset_index()
64 | i1freq.columns = ['region', 'freq']
65 |
66 | # Select wards with less than 12h on average
67 | i1_low_total_hours = i1freq[i1freq['freq'] < (12*i1.date.nunique())]
68 |
69 | i1_low_total_hours = i1_low_total_hours\
70 | .rename(columns = {'freq' : 'total_hours'})
71 | # # Proportion of wards with at least one tower down
72 | # freq[freq < 1392].count()/len(set(i1['region']))
73 |
74 | # # Proportion of wards with very
75 | # freq[freq < 700].count()
76 | # freq[freq < 700].count()/len(set(i1['region']))
77 |
78 | # Export
79 | if(EXPORT):
80 | (i1_low_total_hours
81 | .to_csv(OUT_hfcs + 'wards_with_low_hours_I1.csv',
82 | index = False) )
83 |
84 | #-----------------------------------------------------------------#
85 | # Indicator wards and days with towers down
86 |
87 | # Number of hours with transactions per region day
88 | hours_per_day = i1.groupby(['region', 'date']).size()
89 |
90 | hours_per_day = hours_per_day.reset_index() # get regions to be a column
91 | hours_per_day.columns = ['region', 'date', 'hcount']
92 |
93 |
94 | # Average hours per day per region
95 | avg_hours = (hours_per_day.groupby(['region'])
96 | .mean()
97 | .rename(columns={'hcount' :'avg_hours' }))
98 |
99 | # Create region day data set
100 | i1_ag_df = hours_per_day.merge(avg_hours,
101 | on = 'region')
102 |
103 | # Difference from average usage per hour
104 | i1_ag_df['h_diff'] = i1_ag_df['hcount'] - i1_ag_df['avg_hours']
105 |
106 | # Create data only with pairs of wards and days potential
107 | # towers down
108 | i1_ag_df_tower_down = i1_ag_df[i1_ag_df['h_diff'] < htrahshold]
109 |
110 | # Read me text
111 | readme_text = "This file contains combinations of wards and days that are assumed to have a tower down. "
112 | readme_text += "If a day has " + str(abs(htrahshold))
113 | readme_text += " hours with any calls below the daily average for that ward,"
114 | readme_text += " it is considered to have a tower down at some point that day."
115 |
116 | # Export
117 | if(EXPORT):
118 | (i1_ag_df_tower_down
119 | .to_csv(OUT_hfcs + 'days_wards_with_low_hours_I1_panel.csv',
120 | index = False) )
121 | # Read me file
122 | file = open(OUT_hfcs + "days_wards_with_low_hours_I1_README.txt", "w")
123 | file.write(readme_text)
124 | file.close()
125 |
126 |
127 |
128 |
129 |
130 |
--------------------------------------------------------------------------------
/data-checks/Archive/i10-check.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # dbutils.fs.ls('/mnt/')
3 | # dbutils.fs.refreshMounts()
4 |
5 | # COMMAND ----------
6 |
7 | import pyspark.sql.functions as F
8 | from pyspark.sql.functions import to_timestamp
9 | from pyspark.sql.types import *
10 | from pyspark.sql.window import Window
11 |
12 | # Constant definitions
13 | privacy_filter = 15
14 | missing_value_code = 99999
15 | cutoff_days = 7
16 | max_duration = 21
17 |
18 | user_window = Window\
19 | .partitionBy('msisdn').orderBy('call_datetime')
20 |
21 |
22 | # COMMAND ----------
23 |
24 | # dbutils.fs.ls('/mnt/COVID19Data/Sveta Milusheva - mar20')
25 | base_path = '/mnt/COVID19Data/Sveta Milusheva - mar20/'
26 | geo_path = '/mnt/COVID19Data/proof-of-concept/support-data/geo-files/'
27 |
28 | # COMMAND ----------
29 |
30 | # Load tower mapping to districts
31 | cells = spark.read.format("csv")\
32 | .option("header", "true")\
33 | .load(geo_path + 'zw_admin3_tower_map.csv')
34 |
35 | # COMMAND ----------
36 |
37 | cells.show()
38 |
39 | # COMMAND ----------
40 |
41 | # Set default schema
42 | schema = StructType([
43 | StructField("msisdn", IntegerType(), True),
44 | StructField("call_datetime", StringType(), True), #load as string, will be turned into datetime in standardize_csv_files()
45 | StructField("location_id", StringType(), True)
46 | ])
47 |
48 | # Import one day at a time
49 |
50 | mar20 = spark.read.format("csv")\
51 | .option("header", "true")\
52 | .load(base_path + 'MOH_EWZ_20200320.csv', schema = schema)
53 |
54 | mar21 = spark.read.format("csv")\
55 | .option("header", "true")\
56 | .load(base_path + 'MOH_EWZ_20200320.csv', schema = schema) # note: this reads the same 20 March file as mar20 above
57 |
58 |
59 |
60 | # COMMAND ----------
61 |
62 | # Process data
63 |
64 | def create_vars(df, cells):
65 | # Loading variables
66 | df = df.withColumn("call_datetime", to_timestamp("call_datetime","dd/MM/yyyy HH:mm:ss"))
67 | #get call_date from call_datetime
68 | df = df.withColumn('call_date', df.call_datetime.cast('date'))
69 |
70 | # Recreate analysis variables
71 | df = df.join(cells, df.location_id == cells.cell_id, how = 'left').drop('cell_id')\
72 | .orderBy('msisdn', 'call_datetime')\
73 | .withColumn('region_lag', F.lag('region').over(user_window))\
74 | .withColumn('region_lead', F.lead('region').over(user_window))\
75 | .withColumn('call_datetime_lag', F.lag('call_datetime').over(user_window))\
76 | .withColumn('call_datetime_lead', F.lead('call_datetime').over(user_window))\
77 | .withColumn('hour_of_day', F.hour('call_datetime').cast('byte'))\
78 | .withColumn('hour', F.date_trunc('hour', F.col('call_datetime')))\
79 | .withColumn('week', F.date_trunc('week', F.col('call_datetime')))\
80 | .withColumn('month', F.date_trunc('month', F.col('call_datetime')))\
81 | .withColumn('constant', F.lit(1).cast('byte'))\
82 | .withColumn('day', F.date_trunc('day', F.col('call_datetime')))\
83 | .na.fill({'region' : missing_value_code ,
84 | 'region_lag' : missing_value_code ,
85 | 'region_lead' : missing_value_code })
86 |
87 | return df
88 |
89 | mar20 = create_vars(mar20, cells)
90 | mar21 = create_vars(mar21, cells)
91 |
92 | # COMMAND ----------
93 |
94 | mar20.columns
95 |
96 | # COMMAND ----------
97 |
98 | # Create simple OD matrix
99 | def simp_od(df):
100 |
101 | # Keep if region and region_lag/lead are not the same
102 | df = df.where((F.col('region_lag') != F.col('region')) | (F.col('region_lead') != F.col('region')) | (F.col('call_datetime_lead').isNull()))
103 |
104 | # Aggregate total sum by region and region_lag
105 | agg_df = df.groupby('region', 'region_lag')\
106 | .agg(F.count("*"))
107 |
108 | return agg_df
109 |
110 | m20_agg = simp_od(mar20)
111 | m21_agg = simp_od(mar21)
112 |
113 | # COMMAND ----------
114 |
115 | m20_agg.show()
116 |
117 | # COMMAND ----------
118 |
119 | # mar20.show()
120 |
121 | # COMMAND ----------
122 |
123 |
124 | # 1. Merge with tower mapping to wards
125 |
126 | # 2. Recreate vars
127 |
128 | # 4.
129 |
130 |
131 | # COMMAND ----------
132 |
133 |
134 |
135 | # COMMAND ----------
136 |
137 |
138 |
139 | # COMMAND ----------
140 |
141 | test_df = spark.read\
142 | .option('header', 'true')\
143 | .option('inferSchema', 'true')\
144 | .csv('/mnt/COVID19Data/proof-of-concept/new/ZW/telecel/world_bank_cdr_new.csv')
145 |
146 |
147 | # COMMAND ----------
148 |
149 | test_df.printSchema()
150 |
--------------------------------------------------------------------------------
/data-checks/Archive/MASTER.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------#
2 | # DATA CHECKS MASTER
3 | #-----------------------------------------------------------------#
4 |
5 | # This script sets file paths and (will) map all processes for checking
6 | # incoming data
7 |
8 | #-----------------------------------------------------------------#
9 | #### Settings
10 |
11 | import os
12 | import re
13 | import pandas as pd
14 | import numpy as np
15 | import datetime as dt
16 |
17 | import seaborn as sns; sns.set()
18 | from matplotlib import rcParams
19 | import matplotlib.pyplot as plt
20 |
21 | #-----------------------------------------------------------------#
22 | #### Set file paths
23 |
24 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/"
25 | DATA_POC = DATA_path + "proof-of-concept/"
26 | DATA_GIS = DATA_POC + 'geo_files/'
27 |
28 | DATA_DB_raw_indicators = DATA_POC + "databricks-results/zw/"
29 | DATA_dashboad_clean = DATA_POC + "/files_for_dashboard/files_clean/"
30 |
31 | DATA_dash_clean_a2 = DATA_dashboad_clean + "adm2/"
32 | DATA_dash_clean_a3 = DATA_dashboad_clean + "adm3/"
33 |
34 | #---------------#
35 | # Main indicators
36 |
37 | # Transactions per hour
38 | I1_path = DATA_DB_raw_indicators + "indicator 1/"
39 | I1_Adm3_path = I1_path + "admin3/"
40 |
41 |
42 | # Unique subscribers per hour
43 | I2_path = DATA_DB_raw_indicators + "indicator 2/"
44 | I2_Adm3_path = I2_path + "admin3/"
45 | I2_towercluster_path = I2_path + "tower_cluster/"
46 |
47 |
48 | # Unique subscribers per day
49 | I3_path = DATA_DB_raw_indicators + "indicator 3/"
50 | I3_Adm2_path = I3_path + "admin2/"
51 | I3_Adm3_path = I3_path + "admin3/"
52 |
53 | # Ratio of residents active that day based on those present
54 | # during baseline
55 | I4_path = DATA_DB_raw_indicators + "indicator 4/"
56 | I4_Adm2_path = I4_path + 'admin2/'
57 | I4_Adm3_path = I4_path + 'admin3/'
58 |
59 | # OD matrix
60 | I5_path = DATA_DB_raw_indicators + "indicator 5/"
61 | I5_Adm2_path = I5_path + "admin2/"
62 | I5_Adm3_path = I5_path + "admin3/"
63 |
64 | # Residents living in area
65 | I6_path = DATA_DB_raw_indicators + "indicator 6/"
66 | I6_Adm2_path = I6_path + "admin2/"
67 | I6_Adm3_path = I6_path + "admin3/"
68 |
69 | # Mean and Standard Deviation of distance
70 | # traveled (by home location) day
71 | I7_path = DATA_DB_raw_indicators + "indicator 7/"
72 | I7_Adm2_path = I7_path + "admin2/"
73 | I7_Adm3_path = I7_path + "admin3/"
74 |
75 | # Mean and Standard Deviation of distance
76 | # traveled (by home location) week
77 | I8_path = DATA_DB_raw_indicators + "indicator 8/"
78 | I8_Adm2_path = I8_path + "admin2/"
79 | I8_Adm3_path = I8_path + "admin3/"
80 |
81 | # Daily locations based on Home Region with
82 | # average stay time and SD of stay time
83 | I9_path = DATA_DB_raw_indicators + "indicator 9/"
84 | I9_Adm2_path = I9_path + "admin2/"
85 | I9_Adm3_path = I9_path + "admin3/"
86 |
87 | #Simple Origin Destination Matrix - trips
88 | # between consecutive in time regions with time
89 | I10_path = DATA_DB_raw_indicators + "indicator 10/"
90 | I10_Adm2_path = I10_path + "admin2/"
91 | I10_Adm3_path = I10_path + "admin3/"
92 |
93 | #---------------------#
94 | # Flowminder indicators
95 | FLOWM_path = DATA_DB_raw_indicators + "flowminder indicators/"
96 | FLOWM_adm2_path = FLOWM_path + "admin2/"
97 | FLOWM_adm3_path = FLOWM_path + "admin3/"
98 |
99 | #-------------------#
100 | # External indicators
101 |
102 | # Update file path
103 | IRESULTS = DATA_path + "Isaac-results/"
104 |
105 | IFLOW_path = IRESULTS + "flowminder/"
106 | ICUST_path = IRESULTS + "custom/"
107 |
108 | # Flowminder
109 | IFLOWM_adm2_path = IFLOW_path + "admin2/"
110 | IFLOWM_adm3_path = IFLOW_path + "admin3/"
111 |
112 | # Custom
113 | ICUST_adm2_path = ICUST_path + "admin2/"
114 | ICUST_adm3_path = ICUST_path + "admin3/"
115 |
116 |
117 | #---------------#
118 | # Outputs
119 | OUT_path = DATA_POC + "outputs/"
120 | OUT_plots = OUT_path + "Figures/"
121 | OUT_hfcs = OUT_path + "data-checks/"
122 | # OUT_hfcs_sheets = OUT_hfcs + "Sheet differences/"
123 |
124 | #-----------------------------------------------------------------#
125 | # Indicator dataframes
126 |
127 | # Load list of internal indicators to make it
128 | # easier to bulk load files
129 | internal_indicators = pd\
130 | .read_csv(DATA_POC + 'documentation/indicators_list.csv')
131 |
132 | # Since sheet contains relative paths add path global
133 | # to have absolute paths
134 | internal_indicators['path'] = DATA_path + internal_indicators['path']
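135 |
136 | # Illustrative usage (a sketch only): run this script first so the path objects
137 | # exist; a checking script can then read indicator files, e.g.:
138 | #
139 | # i1 = pd.read_csv(I1_Adm3_path + "transactions_per_hour.csv")
140 | # i2 = pd.read_csv(I2_Adm3_path + "unique_subscribers_per_hour.csv")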
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/README.md:
--------------------------------------------------------------------------------
1 | # Dashboard
2 |
3 | This dashboard is built using R Shiny.
4 |
5 | # Preparing Data for Dashboard
6 |
7 | `preparing_data_for_dashboard` contains three folders with scripts for cleaning and preparing data for the dashboard.
8 |
9 | ## Clean Spatial Data
10 |
11 | The files in `01_clean_spatial_data` clean spatial polygons to be used in the dashboard and subsequent cleaning steps. The following cleaning steps are conducted:
12 |
13 | 1. Aggregate units when needed (e.g., aggregating wards)
14 | 2. Add additional variables (e.g., area)
15 | 3. Standardize variable names
16 | 4. Order spatial data by region
17 |
18 | #### Standardize Variable Names
19 | Each spatial dataset should have standardized variable names. Standardizing
20 | variable names helps ensure different units (e.g., admin2, admin3) can be
21 | easily switched in the dashboard.
22 |
23 | | variable | format | example | description |
24 | |---|---|---|---|
25 | | region | string | ZONE123456 | Unique identifier of the spatial unit |
26 | | name | string | name-here | Spatial unit name |
27 | | area | numeric | 1234 | Area of the spatial unit in kilometers squared |
28 | | adm1| string | name-here | Name of the province |
29 |
30 | #### Order Spatial Data
31 | Spatial datasets are ordered by region. When cleaning other datasets at the
32 | region level, we also order by region and ensure all regions are present. This
33 | ensures that no reordering needs to be done in the dashboard.
34 |
35 | ## Clean Telecom Data
36 |
37 | The files in `02_clean_telecom_data` clean telecom data. They clean variable values (e.g., accounting for outliers), standardize variable names, and add variables needed for the dashboard.
38 |
39 | #### Dataset
40 |
41 | A number of indicators are cleaned. To facilitate further processing for the datasets
42 | to be used in the dashboard, all cleaned datasets have the following standardized
43 | variables:
44 |
45 | | variable | format | example | description |
46 | |---|---|---|---|
47 | | region | string | ZONE123456 | Unique identifier of the spatial unit |
48 | | name | string | Name1 | Spatial unit name |
49 | | date | date or string | 2020-02-01 | The date |
50 | | value | numeric | 1000 | Value (e.g., number of subscribers, number of trips, distance traveled) |
51 | | value_lag | numeric | 1000 | Value from the previous time period |
52 | | value_base | numeric | 1000 | Baseline value |
53 | | value_perchange_base | numeric | 50 | Percent change from baseline |
54 | | value_zscore_base | numeric | 50 | Z-score change since baseline |
55 | | label_level | string | Name1 <br> This day's value: 1000 <br> ... | Label for when the level of the variable is shown |
56 | | label_base | string | Name1 <br> This day's value: 1000 <br> ... | Label for when the change since the baseline value is shown |
57 |
58 | ## Dashboard Data Prep
59 |
60 | The files in `03_dashboard_data_prep` further process data into the datasets used by the dashboard. Because of the high volume of data, transformations (e.g., aggregating and filtering) are done outside of the dashboard to minimize the processing and the amount of data that must be loaded into memory while the dashboard is running. These scripts filter the cleaned telecom data into individual datasets so that no additional filtering or transformation needs to be applied within the dashboard; the dashboard can simply read a file and immediately use the data in the map, line graph and table. Here, we create smaller datasets that contain the same variables as above. Indicators include density, movement in, movement out, mean distance traveled, etc. A short illustrative example of how these files are named and read appears at the end of this README.
61 |
62 | The following datasets are made.
63 |
64 | | Dataset Type | Naming Convention | Description |
65 | | --- | --- | --- |
66 | | unit-level | [Unit Type (eg, ADM1, ADM2, etc)]\_[Indicator Name]\_[Daily/Weekly]\_[Date/Week].Rds | For a given day or week, this dataset contains information for all units for a specified indicator. For O-D level datasets, values are aggregated to the specified origin or destination unit (eg, movement into unit from all other units). |
67 | | time-level | [Unit Type (eg, ADM1, ADM2, etc)]\_[Indicator Name]\_[Daily/Weekly]\_[Unit Name].Rds | For a given admin unit, this dataset contains a time series of values for a specified indicator. |
68 | | unit-time-level | [Unit Type (eg, ADM1, ADM2, etc)]\_[Indicator Name]\_[Daily/Weekly]\_[Unit Name]\_[Date/Week].Rds | These datasets are only used for O-D variables. They show, for a given origin or destination unit, the movement in or out of that unit to all other units for the specified day/week. |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
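80 | #### Illustrative Example
81 |
82 | The sketch below shows how a unit-level dataset that follows the naming
83 | convention above might be read for use in the dashboard. The directory
84 | variable, indicator name and date are placeholders, not the exact paths used
85 | by the app.
86 |
87 | ```r
88 | # Unit-level dataset: all ADM2 units, one indicator, one day
89 | # [Unit Type]_[Indicator Name]_[Daily/Weekly]_[Date/Week].Rds
90 | file_name <- "ADM2_density_daily_2020-04-01.Rds"
91 | df <- readRDS(file.path(DASHBOARD_DATA_PATH, file_name))
92 |
93 | # The standardized variables are already present, so the data can be used
94 | # directly in the map, line graph and table with no further filtering.
95 | head(df[, c("region", "name", "date", "value", "value_perchange_base")])
96 | ```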
--------------------------------------------------------------------------------
/data-checks/Archive/02_summary_stats.py:
--------------------------------------------------------------------------------
1 |
2 | #-----------------------------------------------------------------#
3 | # Exploratory analysis
4 | #-----------------------------------------------------------------#
5 | #-----------------------------------------------------------------#
6 | #### Settings
7 |
8 | from globals import *
9 |
10 | #-----------------------------------------------------------------#
11 | #### Set file paths
12 |
13 | DATA_GIS = DATA_path + 'proof-of-concept/geo_files/'
14 | INDICATORS_path = DATA_path + "proof-of-concept/panel_indicators/clean/"
15 |
16 | #-----------------------------------------------------------------#
17 | #### Load data
18 |
19 | i1 = pd.read_csv(INDICATORS_path + 'i1_3.csv') # Number of calls
20 | i3 = pd.read_csv(INDICATORS_path + 'i3_3.csv') # number of users
21 | i5 = pd.read_csv(INDICATORS_path + 'i5_3.csv') # origin and destination
22 | i52 = pd.read_csv(INDICATORS_path + 'i5_2.csv') # origin and destination
23 |
24 | i7 = pd.read_csv(INDICATORS_path + 'i7_3.csv') # distance travelled
25 |
26 | #-----------------------------------------------------------------#
27 | #### Aggregate data at the country level
28 |
29 | i1_agg = i1\
30 | .groupby('date')\
31 | .agg({'count' : np.sum})\
32 | .reset_index()\
33 | .sort_values('date')
34 |
35 | i3_agg = i3\
36 | .groupby('date')\
37 | .agg({'count' : np.sum})\
38 | .reset_index()\
39 | .sort_values('date')\
40 | .rename(columns = {'count': 'subs'})
41 |
42 | # Add number of subscribers to indicator 1 aggregated data
43 | i1_agg = i1_agg.merge(i3_agg, on = 'date')
44 | i1_agg['calls_p'] = i1_agg['count']/i1_agg['subs']
45 |
46 |
47 | # OD matrix aggregated data
48 | i5_agg = i5\
49 | .groupby('date')\
50 | .agg({'subscriber_count' : np.mean,
51 | 'total_count' : np.sum,
52 | 'region_to': pd.Series.nunique,
53 | 'region_from': pd.Series.nunique})\
54 | .reset_index()\
55 | .sort_values('date')
56 |
57 | i5_agg = i5_agg.merge(i3_agg, on = 'date')
58 | i5_agg['moves_p_sub'] = i5_agg['subscriber_count']/i5_agg['subs']
59 |
60 |
61 | #-----------------------------------------------------------------#
62 | # Comparison between pre and post lockdown stats
63 |
64 | # Pre-post lockdown variables
65 | lockdown_date = np.datetime64(dt.date(2020, 3, 27))
66 |
67 | i1['post'] = (i1['date'].astype('datetime64') > lockdown_date).astype(int)
68 | i3['post'] = (i3['date'].astype('datetime64') > lockdown_date).astype(int)
69 | i5['post'] = (i5['date'].astype('datetime64') > lockdown_date).astype(int)
70 | i7['post'] = (i7['date'].astype('datetime64') > lockdown_date).astype(int)
71 |
72 | i1_agg['post'] = (i1_agg['date'].astype('datetime64') > lockdown_date).astype(int)
73 | i5_agg['post'] = (i5_agg['date'].astype('datetime64') > lockdown_date).astype(int)
74 | i7['post'] = (i7['date'].astype('datetime64') > lockdown_date).astype(int)
75 |
76 | # Number of calls per user
77 | i1_agg['calls_p'].mean()
78 | i1_agg['calls_p'][i1_agg['post'] == 0].mean()
79 | i1_agg['calls_p'][i1_agg['post'] == 1].mean()
80 |
81 | # Number of districts visited?
82 | i5_agg['moves_p_sub'].mean()
83 | i5_agg['moves_p_sub'][i5_agg['post'] == 0].mean()
84 | i5_agg['moves_p_sub'][i5_agg['post'] == 1].mean()
85 |
86 | # Average distance travelled
87 | i7['mean_distance'].mean()
88 |
89 | i7['mean_distance'][i7['post'] == 0].mean()
90 | i7['mean_distance'][i7['post'] == 1].mean()
91 |
92 | # Number of wards
93 | i5['subscriber_count'].mean()
94 | i5['subscriber_count'][i5['post'] == 0].sum()
95 | i5['subscriber_count'][i5['post'] == 1].mean()
96 |
97 | # Distance travelled
98 | i7['mean_distance'].mean()
99 | i7['mean_distance'][i7['post'] == 0].mean()
100 | i7['mean_distance'][i7['post'] == 1].mean()
101 |
102 |
103 | #-----------------------------------------------------------------#
104 | # Plot number of regions that received visitors per day
105 |
106 | import plotly.express as px
107 |
108 | fig = px.line(i5_agg, x="date", y="region_to")
109 | fig.show()
110 |
111 |
112 | #-----------------------------------------------------------------#
113 | # Compare regions that received and sent visitors
114 |
115 | import plotly.graph_objects as go
116 |
117 | # set up plotly figure
118 | fig = go.Figure()
119 |
120 | # add line / trace 1 to figure
121 | fig.add_trace(go.Scatter(
122 | x=i5_agg['date'],
123 | y=i5_agg['region_to'],
124 | marker=dict(
125 | color="blue"
126 | )))
127 | fig.add_trace(go.Scatter(
128 | x=i5_agg['date'],
129 | y=i5_agg['region_from'],
130 | marker=dict(
131 | color="red"
132 | )))
133 |
134 | fig.show()
135 |
136 |
137 |
138 |
--------------------------------------------------------------------------------
/data-checks/Archive/03_i_specific_checks_i1_admin2.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------#
2 | # Create Admin2 Indicator 1
3 | #-----------------------------------------------------------------#
4 |
5 | EXPORT = False
6 |
7 | # import shapely
8 | # import geojsonio
9 | import os
10 | import geopandas as gpd
11 | import matplotlib.pyplot as plt
12 | import plotly.graph_objects as go
13 | import plotly.express as px
14 | from plotly.subplots import make_subplots
15 |
16 | import seaborn as sns; sns.set()
17 |
18 |
19 | #-----------------------------------------------------------------#
20 | # Load data
21 |
22 | # Indicator 1 panel data
23 | i1 = pd.read_csv( OUT_hfcs + 'Sheet comp panel/i1_admin3.csv')
24 | i1 = i1[i1.region != '99999']
25 | # Wards data
26 | wards = gpd.read_file(DATA_GIS + 'wards_aggregated.geojson')
27 | wd = wards[['ward_id', 'province_name', 'district_id', 'district_name']]
28 |
29 | #-----------------------------------------------------------------#
30 | # Create wards mapping into districts
31 |
32 | i1 = i1.merge(wd, left_on = 'region', right_on = 'ward_id')
33 |
34 |
35 | # Aggregate values by district
36 | i1_agg = i1.groupby(['district_id', 'district_name', 'hour']).agg(lambda x : sum(x)).reset_index()
37 |
38 | # Make sure hour is in date time
39 | i1_agg['hour'] = i1_agg['hour'].astype('datetime64')
40 | i1_agg['district_id'] = i1_agg['district_id'].astype('int')
41 |
42 | #-----------------------------------------------------------------#
43 | # Transactions per hour by district line plot.
44 |
45 | # Line plot function definition
46 | def line_plot(reg_i,
47 | var = 'count_p',
48 | data = i1_agg,
49 | region = 'district_id',
50 | region_str = 'district_name',
51 | time = 'hour'):
52 | plt_data = data[data[region] == reg_i]
53 | fig = go.Figure()
54 | # Create line
55 | fig.add_trace(go.Scatter(x=plt_data[time], y=plt_data[var],
56 | mode='lines',
57 | name='lines'))
58 | # Additional formatting
59 | title = str(plt_data[region].iloc[0]) + plt_data[region_str].iloc[0]
60 | fig.update_layout(
61 | title=title,
62 | xaxis_title="Time",
63 | yaxis_title="Count",
64 | font=dict(
65 | # family="Courier New, monospace",
66 | size=18,
67 | color="#7f7f7f"),
68 | autosize=False,
69 | width=1200,
70 | height=700
71 | )
72 | return(fig)
73 |
74 | # Districts list
75 | dists = list(set(i1_agg['district_id']))
76 |
77 | # region_plt(d)
78 | # plt.show()
79 |
80 | # Loop over districts
81 | for d in dists:
82 | print(d)
83 | # Create plot
84 | plt_i = line_plot(d)
85 | # Export
86 | save_name = None
87 | save_name = 'i1_districts_count' + str(d) + '.png'
88 | plt_i.write_image(OUT_plots + 'daily_obs_region/' + save_name)
89 |
90 |
91 | #-----------------------------------------------------------------#
92 | # Transactions per hour by day. That is one plot per hour
93 | i1_agg['time'] = pd.to_datetime(i1_agg['hour']).dt.hour
94 | i1_agg['date'] = pd.to_datetime(i1_agg['hour']).dt.date
95 |
96 |
97 | def hourly_scatter(reg_i,
98 | var = 'count_p',
99 | data = i1_agg,
100 | region = 'district_id',
101 | region_str = 'district_name',
102 | time = 'date',
103 | facets = 'time'):
104 | # Subset data
105 | plt_data = data[data[region] == reg_i]
106 | # Create plot
107 | fig = px.scatter(plt_data,
108 | x= time,
109 | y = var,
110 | facet_col = facets,
111 | facet_col_wrap = 5,
112 | width=1200,
113 | height=700)
114 | # Additional formatting
115 | title = str(plt_data[region].iloc[0]) + ' - ' + plt_data[region_str].iloc[0]
116 | fig.update_layout(title_text= title)
117 | fig.update_yaxes(matches=None)
118 | fig.for_each_annotation(lambda a: a.update(text=a.text.replace("time=", "")))
119 | # Format axis titles
120 | return(fig)
121 |
122 | # Loop over districts
123 | for d in dists:
124 | print(d)
125 | # Create plot
126 | plt_i = hourly_scatter(d)
127 | # Export
128 | save_name = None
129 | save_name = 'i1_hourly_obs_byhour' + str(d) + '.png'
130 | plt_i.write_image(OUT_plots + 'hourly_obs_by_hour_region/' + save_name)
131 |
132 |
133 |
134 |
135 | #-----------------------------------------------------------------#
136 | # Export data
137 | if EXPORT:
138 | i1_agg.to_csv(OUT_hfcs + 'Sheet comp panel/i1_admin2.csv', index = False)
139 |
140 |
141 |
142 |
143 | #-----------------------------------------------------------------#
144 | # DRAFT
145 |
--------------------------------------------------------------------------------
/dashboard-dataviz/figures/i5_into_out.R:
--------------------------------------------------------------------------------
1 | # i5 Figures: Movement Into / Out of Regions
2 |
3 | unit <- "wards"
4 |
5 | # Load Data --------------------------------------------------------------------
6 | if(unit %in% "wards"){
7 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH
8 | }
9 |
10 | if(unit %in% "districts"){
11 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH
12 | }
13 |
14 | data <- readRDS(file.path(CLEAN_DATA_PATH, "i5_daily.Rds"))
15 |
16 | data_into <- data %>%
17 | group_by(region_dest, name_dest, date) %>%
18 | summarise(value = sum(value, na.rm=T)) %>%
19 | dplyr::rename(region = region_dest,
20 | name = name_dest)
21 |
22 | data_out <- data %>%
23 | group_by(region_origin, name_origin, date) %>%
24 | summarise(value = sum(value, na.rm=T)) %>%
25 | dplyr::rename(region = region_origin,
26 | name = name_origin)
27 |
28 | ##
29 | data_into <- data_into %>%
30 | group_by(region) %>%
31 | mutate(value_pre = mean(value[date < "2020-03-30"], na.rm = T),
32 | value_post = mean(value[date > "2020-03-30"], na.rm = T)) %>%
33 | ungroup() %>%
34 | mutate(value_change = value_post - value_pre) %>%
35 | mutate(value_change_rank = rank(value_change))
36 | data_into$value_change_rank[is.na(data_into$value_change)] <- NA
37 |
38 | data_out <- data_out %>%
39 | group_by(region) %>%
40 | mutate(value_pre = mean(value[date < "2020-03-30"], na.rm = T),
41 | value_post = mean(value[date > "2020-03-30"], na.rm = T)) %>%
42 | ungroup() %>%
43 | mutate(value_change = value_post - value_pre) %>%
44 | mutate(value_change_rank = rank(value_change))
45 | data_out$value_change_rank[is.na(data_out$value_change)] <- NA
46 |
47 |
48 | ## FIX
49 | data_into <- data_into[!is.na(data_into$date),]
50 | data_into$date <- data_into$date %>% as.Date()
51 |
52 | data_out <- data_out[!is.na(data_out$date),]
53 | data_out$date <- data_out$date %>% as.Date()
54 |
55 |
56 | # Into -------------------------------------------------------------------------
57 | rank_high <- data_into$value_change_rank %>% unique() %>% sort() %>% head(5)
58 |
59 | p_high <- data_into %>%
60 | dplyr::filter(value_change_rank %in% rank_high) %>%
61 | ggplot(aes(x = date, y = value)) +
62 | geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) +
63 | geom_line() +
64 | labs(x = "",
65 | y = "Number of Subscribers",
66 | title = "Largest Decreases") +
67 | facet_wrap(~name,
68 | scales = "free_y",
69 | nrow = 1) +
70 | theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
71 | strip.text.x = element_text(face = "bold"))
72 | p_high
73 |
74 | rank_low <- data_into$value_change_rank %>% unique() %>% sort() %>% tail(5)
75 |
76 | p_low <- data_into %>%
77 | dplyr::filter(value_change_rank %in% rank_low) %>%
78 | ggplot(aes(x = date, y = value)) +
79 | geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) +
80 | geom_line() +
81 | labs(x = "",
82 | y = "",
83 | title = "Largest Increases") +
84 | facet_wrap(~name,
85 | scales = "free_y",
86 | nrow = 1) +
87 | theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
88 | strip.text.x = element_text(face = "bold"))
89 |
90 | p_low
91 |
92 | p_all <- ggarrange(p_high, p_low, nrow = 2)
93 | ggsave(p_all, filename = file.path(figures_path,
94 |                                    paste0(unit, "_movement_into_top_chng.png")),
95 | height = 5, width=12)
96 |
97 |
98 |
99 | # Out Of -------------------------------------------------------------------------
100 | rank_high <- data_out$value_change_rank %>% unique() %>% sort() %>% head(5)
101 |
102 | p_high <- data_out %>%
103 | dplyr::filter(value_change_rank %in% rank_high) %>%
104 | ggplot(aes(x = date, y = value)) +
105 | geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) +
106 | geom_line() +
107 | labs(x = "",
108 | y = "Number of Subscribers",
109 | title = "Largest Decreases") +
110 | facet_wrap(~name,
111 | scales = "free_y",
112 | nrow = 1) +
113 | theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
114 | strip.text.x = element_text(face = "bold"))
115 | p_high
116 |
117 | rank_low <- data_out$value_change_rank %>% unique() %>% sort() %>% tail(5)
118 |
119 | p_low <- data_out %>%
120 | dplyr::filter(value_change_rank %in% rank_low) %>%
121 | ggplot(aes(x = date, y = value)) +
122 | geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) +
123 | geom_line() +
124 | labs(x = "",
125 | y = "",
126 | title = "Largest Increases") +
127 | facet_wrap(~name,
128 | scales = "free_y",
129 | nrow = 1) +
130 | theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
131 | strip.text.x = element_text(face = "bold"))
132 |
133 | p_low
134 |
135 | p_all <- ggarrange(p_high, p_low, nrow = 2)
136 | ggsave(p_all, filename = file.path(figures_path,
137 |                                    paste0(unit, "_movement_outof_top_chng.png")),
138 | height = 5, width=12)
139 |
140 |
141 |
142 |
143 |
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/02_clean_telecom_data/README.md:
--------------------------------------------------------------------------------
1 | # Clean Aggregated Telecom Data
2 |
3 | These scripts clean and standardize the aggregated telecom data, a necessary
4 | step in preparing the datasets for the dashboard.
5 |
6 | ## Dataset
7 |
8 | A number of indicators are cleaned. To facilitate further processing for the
9 | dashboard, all cleaned datasets share the following standardized
10 | variables:
11 |
12 | | variable | format | example | description |
13 | |---|---|---|---|
14 | | region | string | ZONE123456 | Unique identifier of the spatial unit |
15 | | name | string | Name| Spatial unit name |
16 | | date | date or string | 2020-02-01| The date |
17 | | value | numeric | 1000 | Value (e.g., number of subscribers, number of trips, distance traveled) |
18 | | value_lag | numeric | 1000 | Value from the previous time period |
19 | | value_base | numeric | 1000 | Baseline value |
20 | | value_perchange_base | numeric | 50 | Percent change from baseline |
21 | | value_zscore_base | numeric | 50 | Z-score change since baseline |
22 | | label_level | string | Name<br>This day's value: 1000<br>... | Label shown when the level of the variable is displayed |
23 | | label_base | string | Name<br>This day's value: 1000<br>... | Label shown when the change since baseline is displayed |
24 |
25 | ## telecom prep [tp] functions
26 |
27 | The `_tp_functions.R` file defines a number of functions to help standardize
28 | the cleaning process.
29 |
30 | #### Set/Standardize Variables
31 |
32 | * __tp_standardize_vars:__ Renames the date, region and value variable names to
33 | `date`, `region` and `value`. The remaining `tp_` functions take these variable
34 | names as defaults.
35 | * __tp_standardize_vars_od:__ Renames variables for origin-destination matrices.
36 | Inputs include the date, region_origin, region_destination and value variables. This function
37 | standardizes those variables and creates a new variable that concatenates region_origin and
38 | region_destination as a unique identifier for the origin-destination pair.
39 |
40 | #### Clean Dataset
41 |
42 | * __tp_fill_regions:__ Checks for any regions that are missing in the telecom data that are in the polygon/admin data. Adds these regions to the dataset.
43 | * __tp_clean_day:__ If `date` is at the daily level, cleans it into a `Date` variable.
44 | * __tp_clean_week:__ Transforms `date` to represent the week (e.g., `Feb 01 - Feb 07`). Handles
45 | both week integers (e.g., week `6`) and daily dates (e.g., `2020-02-01`), which it groups into weeks.
46 | * __tp_agg_day_to_week:__ Aggregates the dataset from daily to weekly.
47 | * __tp_complete_date_region:__ Completes data with all date/region pairs.
48 | * __tp_complete_date_region_od:__ Completes data with all date/region pairs for
49 | origin-destination datasets.
50 | * __tp_add_polygon_data:__ Adds polygon data to the dataset (primarily for `name`).
51 | * __tp_add_polygon_data_od:__ Adds polygon data to the dataset for origin-destination data.
52 | Adds all polygon variables with `_origin` and `_dest` suffixes.
53 |
54 | #### Clean Value Variable
55 |
56 | * __tp_interpolate_outliers:__ Interpolates outliers on the `value` variable. Includes
57 | options for replacing negative, positive or both types of outliers, and for what is considered
58 | an outlier. Defaults to 4 standard deviations.
59 | * __tp_replace_zeros:__ Interpolates values of zero. Only interpolates when the
60 | number of zeros is equal to or less than `N_zero_thresh`.
61 |
62 | #### Add Variables
63 |
64 | * __tp_add_percent_change:__ Adds percent change from the last time period (day or week)
65 | on the `value` variable.
66 | * __tp_add_baseline_comp_stats:__ Adds percent change and z-score change values
67 | compared to baseline using the `value` variable.
68 |
69 | #### Add Labels for Leaflet
70 |
71 | * __tp_add_label_level:__ Adds label for the original (level) value to be used in
72 | Leaflet in the dashboard.
73 | * __tp_add_label_baseline:__ Adds label for change metrics since baseline to be used
74 | in Leaflet in the dashboard.
75 |
76 |
77 | ## Example cleaning
78 |
79 | The following shows an example of cleaning data. Here we have two datasets:
80 |
81 | 1. __df_day:__ A daily dataset of the number of subscribers at the unit level, containing three
82 | relevant variables: `visit_date` (e.g., `2020-02-01T00:00:00.000Z`), `region` (e.g., `ZONE123456`) and
83 | `subscriber_count` (e.g., `1000`).
84 |
85 | 2. __admin_sp:__ A SpatialPolygonsDataFrame of the spatial units. It contains the variables
86 | described in `01_clean_spatial_data` (i.e., `name`, `region`, `area` and `adm1`).
87 |
88 | ```r
89 | df_day_clean <- df_day %>%
90 |
91 |   # Standardize variable names so we can avoid defining variable names in the
92 | # tp_ functions.
93 | tp_standardize_vars("visit_date", "region", "subscriber_count") %>%
94 |
95 | # Clean dataset
96 | tp_clean_date() %>%
97 | tp_fill_regions(admin_sp) %>%
98 | tp_complete_date_region() %>%
99 | tp_add_polygon_data(admin_sp) %>%
100 |
101 | # Interpolate/Clean Values
102 | tp_interpolate_outliers(NAs_as_zero = T) %>%
103 | tp_replace_zeros(NAs_as_zero = T) %>%
104 |
105 | # Add change metrics
106 | tp_add_baseline_comp_stats() %>%
107 | tp_add_percent_change() %>%
108 |
109 | # Add labels
110 | tp_add_label_level(timeunit = "day", OD = F) %>%
111 | tp_add_label_baseline(timeunit = "day", OD = F)
112 | ```
113 |
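114 | For origin-destination data (e.g., a daily matrix of trips between units), the
115 | same pipeline applies using the `_od` variants. The sketch below is illustrative
116 | only: `df_od` and the argument order of `tp_standardize_vars_od` are assumptions
117 | based on the descriptions above, so check `_tp_functions.R` before using it.
118 |
119 | ```r
120 | df_od_clean <- df_od %>%
121 |
122 |   # Standardize the date, origin, destination and value variable names
123 |   tp_standardize_vars_od("connection_date", "region_from", "region_to", "total_count") %>%
124 |
125 |   # Clean dataset
126 |   tp_clean_date() %>%
127 |   tp_complete_date_region_od() %>%
128 |   tp_add_polygon_data_od(admin_sp) %>%
129 |
130 |   # Add change metrics
131 |   tp_add_baseline_comp_stats() %>%
132 |   tp_add_percent_change() %>%
133 |
134 |   # Add labels
135 |   tp_add_label_level(timeunit = "day", OD = T) %>%
136 |   tp_add_label_baseline(timeunit = "day", OD = T)
137 | ```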
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/modules/voronoi.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | from geovoronoi import voronoi_regions_from_coords
3 |
4 | import os
5 | if os.environ['HOME'] != '/root':
6 | from modules.tower_clustering import *
7 |
8 | ## Class to handle spark and df in session
9 | class voronoi_maker:
10 | """Class to handle all voronoi transformations and files for a specific df
11 |
12 |
13 | Attributes
14 | ----------
15 | datasource : an instance of DataSource class.
16 | shape : a geopandas dataframe. Shapefile to use for clustering
17 | region_var : a string. Name of the region variable in the shapefile.
18 | sites : a string. Name of the attribute of datasource that holds the tower coordinates.
19 | spark_df : a pyspark dataframe. Holds the cdr data
20 | result_path : a string. Where to save results.
21 | clusterer : an instance of tower_clusterer.
22 | sites_df : a pyspark dataframe. Holds clustered sites.
23 | distances_pd_long : a pyspark dataframe. Holds distances between sites.
24 | sites : a pyspark dataframe. Clustered sites without NAs.
25 |
26 | Methods
27 | -------
28 | make_voronoi()
29 | orchestrates all methods
30 |
31 | filter_towers_for_voronoi()
32 | we can't run on duplicates (location duplicates), so we have to filter them out first
33 |
34 | make_shape(towers_for_voronoi)
35 | makes a buffer around towers to create bubble shapes
36 |
37 | create_voronoi(towers_for_voronoi, shape)
38 |         creates voronoi cells from tower list
39 |
40 | save_voronoi(poly_shapes)
41 | saves voronoi shape file and voronoi-tower mapping
42 |
43 | assign_to_spark_df()
44 | adds voronoi id to cdr records (not used currently)
45 | """
46 |
47 | def __init__(self,
48 | datasource,
49 | shape,
50 | region_var,
51 | sites = 'tower_sites'):
52 | """
53 | Parameters
54 | ----------
55 | """
56 | self.spark = datasource.spark
57 | self.datasource = datasource
58 | self.spark_df = datasource.parquet_df
59 | self.result_path = datasource.results_path
60 | self.clusterer = tower_clusterer(datasource, shape, region_var, sites)
61 | self.clusterer.cluster_towers()
62 | self.sites_df = self.clusterer.sites_with_clusters.loc[:,['cell_id', 'centroid_LAT', 'centroid_LNG']].rename(columns={'centroid_LAT' : 'LAT', 'centroid_LNG': 'LNG'})
63 | self.distances_pd_long = self.clusterer.distances_pd_long
64 | if (self.sites_df.columns == ['cell_id', 'LAT', 'LNG']).all():
65 | self.sites = self.sites_df[self.sites_df.LAT.notna()]
66 | else:
67 |             raise ValueError('The sites dataframe does not have the correct columns / column order. Should be cell_id, LAT, LNG')
68 |
69 | def make_voronoi(self):
70 |
71 | towers_for_voronoi = self.filter_towers_for_voronoi()
72 | shape, towers_for_voronoi = self.make_shape(towers_for_voronoi = towers_for_voronoi)
73 | poly_shapes = self.create_voronoi(shape = shape, towers_for_voronoi = towers_for_voronoi)
74 | self.save_voronoi(poly_shapes = poly_shapes)
75 | return self.voronoi_dict
76 |
77 | def filter_towers_for_voronoi(self):
78 |
79 | # get unique towers in data
80 | distinct_towers = self.spark_df.select('location_id').distinct().toPandas()
81 |
82 | # filter list of towers for unique towers
83 | self.sites = self.sites[self.sites.cell_id.isin(list(distinct_towers.location_id))]
84 |
85 | # Assign gpd
86 | self.towers = gpd.GeoDataFrame(
87 | self.sites, geometry = gpd.points_from_xy(self.sites.LNG, self.sites.LAT), crs = 'epsg:4326')
88 |
89 | # Find towers that are in same location
90 | self.towers.LAT = self.towers.LAT.apply(lambda x: round(x,4))
91 | self.towers.LNG = self.towers.LNG.apply(lambda x: round(x,4))
92 | towers_for_voronoi = self.towers[~self.towers.duplicated(subset = ['LAT', 'LNG'])]
93 |
94 | return towers_for_voronoi
95 |
96 | def make_shape(self, towers_for_voronoi):
97 |
98 |         # Make border shape: buffer towers by ~35km, converted from km to degrees
99 | radians = 35 / 40000 * 360
100 | self.shape = towers_for_voronoi.buffer(radians).unary_union
101 |
102 | return self.shape, towers_for_voronoi
103 |
104 | def create_voronoi(self, towers_for_voronoi, shape):
105 |
106 | # Create np array of vertices
107 | points = towers_for_voronoi.loc[:,['LNG','LAT']].to_numpy()
108 |
109 | # Create voronoi shapes
110 | self.poly_shapes, pts, poly_to_pt_assignments = voronoi_regions_from_coords(points, shape)
111 |
112 | return self.poly_shapes
113 |
114 | def save_voronoi(self, poly_shapes):
115 |
116 | # Save voronoi
117 | self.voronoi_pd = pd.DataFrame(poly_shapes)
118 | self.voronoi_pd.columns =['geometry']
119 | self.voronoi_gpd = deepcopy(self.voronoi_pd)
120 | self.voronoi_gpd = gpd.GeoDataFrame(self.voronoi_gpd, geometry = 'geometry', crs = 'epsg:4326')
121 | self.voronoi_pd['geometry'] = self.voronoi_pd.geometry.astype(str)
122 | self.voronoi_pd = self.voronoi_pd.reset_index()
123 | self.voronoi_pd.columns = ['region', 'geometry']
124 | self.voronoi_pd = self.spark.createDataFrame(self.voronoi_pd)
125 | save_csv(self.voronoi_pd, self.result_path, self.datasource.country_code + '_voronoi_shapefile')
126 |
127 | # Match towers to voronoi so that all towers are assigned to a cell
128 | voronoi_towers = gpd.sjoin(self.voronoi_gpd, self.towers, op="intersects")
129 | self.voronoi_dict = voronoi_towers.drop(['geometry', 'LAT', 'LNG', 'index_right'], axis = 'columns')
130 | self.voronoi_dict = self.voronoi_dict.reset_index()
131 | self.voronoi_dict.columns = ['region', 'cell_id']
132 | self.voronoi_dict = self.spark.createDataFrame(self.voronoi_dict)
133 | save_csv(self.voronoi_dict, self.result_path, self.datasource.country_code + '_voronoi_tower_map')
134 |
135 | def assign_to_spark_df(self):
136 |
137 | self.new_spark_df = self.spark_df.join(self.voronoi_dict, self.spark_df['location_id'] == self.voronoi_dict['cell_id'], how = 'left')
138 | return self.new_spark_df
139 |
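140 | # Example usage (a sketch, not part of the original module): assumes a DataSource
141 | # instance `ds` built from one of the config file templates, and a geopandas
142 | # dataframe `shape` of admin polygons with a region column named 'region'.
143 | #
144 | #   vm = voronoi_maker(ds, shape, region_var = 'region', sites = 'tower_sites')
145 | #   voronoi_tower_map = vm.make_voronoi()       # saves shapes/mapping and returns the tower map
146 | #   cdr_with_voronoi = vm.assign_to_spark_df()  # optionally tag cdr records with voronoi ids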
--------------------------------------------------------------------------------
/data-checks/Archive/Descr-exploratory/fb-comparisson-draft.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------#
2 | # CDR vs FB comparison
3 | #-----------------------------------------------------------------#
4 | # TO DO
5 |
6 |
7 | # Read documentation
8 |
9 | # Do same process for movement data
10 |
11 | # Look at the results
12 |
13 | # Do the merging with only overlapping dates
14 |
15 | #-----------------------------------------------------------------#
16 | # Settings
17 |
18 | import os
19 | import pandas as pd
20 | import numpy as np
21 | import glob
22 |
23 | base_path = 'C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/proof-of-concept/'
24 |
25 | fb_data = base_path + 'Facebook Data/'
26 | cdr_path = base_path + 'panel_indicators/'
27 |
28 | doc_path = base_path + 'documentation/'
29 |
30 | OUT_path = base_path + '/outputs/data-checks/'
31 |
32 | data_pop = fb_data + 'Population Administrative Regions/'
33 | data_mov = fb_data + 'Movement Admin Regions/'
34 |
35 | # File names need to be updated if the export prefix changes
36 | # # prefix = 'Coronavirus Disease Prevention Map Apr 16 2020 Id Movement between Administrative Regions__'
37 | prefix_pop = 'Coronavirus Disease Prevention Map Apr 16 2020 Id Facebook Population (Administrative Regions)__'
38 | prefix_mov = 'Coronavirus Disease Prevention Map Apr 16 2020 Id Movement between Administrative Regions__'
39 |
40 |
41 | #-----------------------------------------------------------------#
42 | # Load FB data
43 |
44 | # Population - Load all csv files in the folder
45 | files_pop = glob.glob(data_pop + prefix_pop + "*.csv")
46 | files_mov = glob.glob(data_mov + prefix_mov + "*.csv")
47 | fpop = pd.concat([pd.read_csv(f, encoding='latin1') for f in files_pop], ignore_index=True)
48 | fmov = pd.concat([pd.read_csv(f, encoding='latin1') for f in files_mov], ignore_index=True)
49 |
50 |
51 | # df1 = pd.read_csv(data_pop + prefix + '2020-06-24 0000.csv')
52 | # df2 = pd.read_csv(data_pop + prefix + '2020-06-24 0800.csv')
53 | # df3 = pd.read_csv(data_pop + prefix + '2020-06-24 1600.csv')
54 |
55 | #-----------------------------------------------------------------#
56 | # Load CDR data
57 |
58 | # Using i3, Count of unique subscribers, for now. Not sure how this
59 | # fb indicator was calculated, so it might make sense to use another
60 | # indicator
61 | cpop = pd.read_csv(cdr_path + 'i3_admin2.csv')
62 |
63 |
64 | # i5
65 | cmov = pd.read_csv(cdr_path + 'i5_admin2.csv')
66 |
67 | # load and merge keys for str name matching
68 | a2_keys = pd.read_csv(doc_path + 'keys_districts.csv')
69 | a2_keys = a2_keys[['id2', 'name2']]
70 |
71 | # process cdr population
72 | cp = cpop.merge(a2_keys,
73 | left_on= 'region',
74 | right_on = 'id2')
75 |
76 | cp['date'] = pd.to_datetime(cp['day']).dt.date
77 |
78 | cp = cp[['date', 'id2', 'name2','count_p']]\
79 | .rename(columns = {'name2' : 'name',
80 | 'count_p' : 'count'})
81 |
82 | # process cdr movement
83 |
84 | cmov['date'] = pd.to_datetime(cmov['connection_date']).dt.date
85 |
86 |
87 | cm = cmov\
88 | .merge(a2_keys,
89 | left_on = 'region_from',
90 | right_on= 'id2')\
91 | .rename(columns = {'name2' : 'st_name'})\
92 | .merge(a2_keys,
93 | left_on = 'region_to',
94 | right_on= 'id2')\
95 | .rename(columns = {'name2' : 'ed_name',
96 | 'total_count_p' : 'count'})\
97 | [['date', 'st_name','ed_name', 'count']]
98 |
99 |
100 | #-----------------------------------------------------------------#
101 | # Process FB data
102 |
103 | def process(df, time, group_by, count):
104 |     # Remove other countries (keep Zimbabwe only)
105 |     df = df.loc[df['country'] == 'ZW'].copy()
106 | # Date var
107 | df['date'] = pd.to_datetime(df[time]).dt.date
108 | # Group by
109 | gby = ['date']
110 | gby.extend(group_by)
111 | # Aggregate
112 | agg = df\
113 | .groupby(gby)\
114 | .agg({count : np.sum})\
115 | .reset_index()
116 |
117 | return agg
118 |
119 |
120 | fp = process(fpop, 'date_time', ['polygon_name'], 'n_crisis')\
121 | .rename(columns = {'polygon_name' : 'name',
122 | 'n_crisis' : 'count'})
123 | fm = process(fmov, 'date_time', ['start_polygon_name', 'end_polygon_name'], 'n_crisis')\
124 | .rename(columns = {'start_polygon_name' : 'st_name',
125 | 'end_polygon_name' : 'ed_name',
126 | 'n_crisis' : 'count'})
127 |
128 | #-----------------------------------------------------------------#
129 | # Merge
130 |
131 | # Make sure I'm comparing same period
132 | overlapping_dates = set(cp['date']).intersection(set(fp['date']))
133 |
134 | fp = fp[fp['date'].isin(overlapping_dates)]
135 | cp = cp[cp['date'].isin(overlapping_dates)]
136 |
137 | # String matching corrections
138 | fp.loc[fp['name'] == 'Hwedza', 'name'] = 'Wedza'
139 | fp.loc[fp['name'] == 'Chirumanzu', 'name'] = 'Chirumhanzu'
140 | fp.loc[fp['name'] == 'Bulilimamangwe', 'name'] = 'Bulilima (North)'
141 |
142 |
143 |
144 | # def agg_rank(df, gby = 'name'):
145 | # df = df.groupby(gby).agg('mean').reset_index()
146 | # df["rank"] = df["count"].rank(ascending = False)
147 | # return df.sort_values('rank')
148 |
149 | # foo
150 |
151 |
152 | # full_period_comp = cp\
153 | # .merge(fp,
154 | # on = ['name', 'date'],
155 | # how = 'outer',
156 | # suffixes=('', '_fb'))\
157 | # .sort_values('rank')
158 |
159 |
160 |
161 |
162 | #-----------------------------------------------------------------#
163 | # Aggregated merge
164 |
165 | # Create full period ranking
166 | def agg_rank(df, gby = 'name'):
167 | df = df.groupby(gby).agg('mean').reset_index()
168 | df["rank"] = df["count"].rank(ascending = False)
169 | return df.sort_values('rank')
170 |
171 | cp_rank = agg_rank(cp)
172 | fp_rank = agg_rank(fp)
173 |
174 |
175 |
176 | full_period_comp = cp_rank\
177 | .merge(fp_rank,
178 | on = 'name',
179 | how = 'outer',
180 | suffixes=('', '_fb'))\
181 | .sort_values('rank')
182 |
183 | #-----------------------------------------------------------------#
184 | # Export
185 |
186 | # full_period_comp.to_csv(OUT_path + 'i3_fb_comp.csv',
187 | # index = False)
188 |
189 |
190 | # agg_rank(fm, ['st_name', 'ed_name'])
191 | # agg_rank(cm, ['st_name', 'ed_name'])
192 |
193 | fp_rank.sort_values('name')
--------------------------------------------------------------------------------
/data-checks/Archive/data_files_comparisson.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------#
2 | # CSV comparison
3 | #-----------------------------------------------------------------#
4 |
5 | import os
6 | import re
7 | import numpy as np
8 | import pandas as pd
9 | import datetime as dt
10 | import matplotlib.pyplot as plt
11 | # import seaborn as sns
12 | from datetime import datetime
13 |
14 |
15 | IRESULTS = DATA_path + "Isaac-results/"
16 |
17 | IFLOW = IRESULTS + "flowminder/"
18 | ICUST = IRESULTS + "custom/"
19 |
20 | #-----------------------------------------------------------------#
21 | # Load files
22 |
23 | filenames = os.listdir(IFLOW)
24 |
25 | # Custom indicators
26 | # filenames = os.listdir(ICUST)
27 |
28 | #-----------------------------------------------------------------#
29 | # Make sure data is compatible
30 |
31 | # Make sure order and date formats are the same
32 | def compat(data,
33 | timevar,
34 | standardize_time = False,
35 | regvar = 'region'):
36 | new_data = data
37 |
38 | # If has a date convert to standard
39 | if len(timevar) != 0:
40 |         timevar = timevar[0]  # unpack the single-element list (np.asscalar is deprecated)
41 | new_data[timevar] = pd.to_datetime(new_data[timevar]).dt.date
42 | # Make sure order is the same
43 | if len(data.columns) == 2:
44 | new_data = new_data.sort_values( by = [new_data.columns[0], new_data.columns[1] ])
45 | else :
46 | new_data = new_data.sort_values( by = [new_data.columns[0], new_data.columns[1], new_data.columns[2] ])
47 | return new_data
48 |
49 |
50 | # Comparison outputs function
51 | def compare_dfs(df1,df2, filename = None, outputdf = False):
52 |     # Set time var (hack: infer the date column from its name)
53 | time = list(df1.columns[list(df1.columns.str.contains('date'))])
54 | # Process data to be in the same format
55 | df1 = compat(df1, timevar = time)
56 | df2 = compat(df2, timevar = time)
57 | # Merge dfs
58 | index_cols = list(df1.columns[0:-1])
59 | #Make sure merging columns are str
60 | df1[index_cols] = df1[index_cols].astype(str)
61 | df2[index_cols] = df2[index_cols].astype(str)
62 | cdf = df1.merge(df2, left_on = index_cols, right_on = index_cols)
63 | #--------------------#
64 |     # Calculate differences
65 |     # Proportion of mismatches
66 |     p_rows_diff = sum(cdf[cdf.columns[-1]] != cdf[cdf.columns[-2]])/cdf.shape[0]
67 |     p_rows_diff = str(round(p_rows_diff * 100, 2))
68 | # Value difference
69 | cdf['pdiff'] = ((cdf[cdf.columns[-1]] -
70 | cdf[cdf.columns[-2]])/cdf[cdf.columns[-2]])
71 | # Average difference
72 |     avg_diff = str(round(cdf['pdiff'].mean(skipna = True) * 100, 2))
73 |
74 | if outputdf:
75 | return(cdf)
76 | else:
77 | # Print report
78 | print(filename)
79 | print('N rows ours: ' + str(df1.shape[0]) )
80 | print("N rows Isaac's: " + str(df2.shape[0]))
81 | print('Of matching rows:')
82 | print(' - Average difference of count column: ' + avg_diff + "%")
83 | print(' - Percentage rows that are different: ' + p_rows_diff + "%")
84 | print('\n')
85 |
86 |
87 | #-----------------------------------------------------------------#
88 | # Flowminder csvs
89 | for i in range(len(filenames)):
90 | file_i = filenames[i]
91 | # print(i)
92 | # print(filenames[i])
93 | # Our file
94 | d1 = pd.read_csv(FLOWM_adm3_path + file_i)
95 |     # Isaac's file
96 | d2 = pd.read_csv(IFLOW + file_i)
97 |
98 | # Run comparisson
99 | print(i)
100 | print(filenames[i])
101 | compare_dfs(d1,d2)
102 |
103 | #-----------------------------------------------------------------#
104 | # Custom indicators csv
105 |
106 | # Indicator 1 #
107 |
108 | i1 = pd.read_csv(I1_Adm3_path + 'transactions_per_hour.csv')
109 | i1i = pd.read_csv(ICUST + 'transactions_per_hour.csv')
110 |
111 | # i1 = compat(i1, timevar = [])
112 | # i1i = compat(i1i, timevar = [])
113 |
114 | cdf = compare_dfs(i1,i1i, outputdf = True)
115 | cdf["diff_flag"] = cdf["count_x"] != cdf["count_y"]
116 | cdf['date'] = pd.to_datetime(cdf['hour']).dt.date
117 |
118 |
119 | foo = cdf[cdf['diff_flag']]
120 |
121 | foo.to_csv('C:/Users/wb519128/Desktop/i1_differences.csv',
122 | index = False)
123 |
124 | # Indicator 3 #
125 |
126 | i3 = pd.read_csv(I3_Adm3_path + 'unique_subscribers_per_day.csv')
127 | i3i = pd.read_csv(ICUST + 'unique_subscribers_per_day.csv')
128 |
129 | cdf3 = compare_dfs(i3,i3i, outputdf = True)
130 | cdf3["diff_flag"] = cdf3["count_x"] != cdf3["count_y"]
131 | cdf3['day'] = pd.to_datetime(cdf3['day']).dt.date
132 |
133 |
134 | cdf3[cdf3['day'] == dt.date(2020, 2, 3)]
135 |
136 | foo = cdf3[cdf3['diff_flag']]
137 |
138 | foo.to_csv('C:/Users/wb519128/Desktop/i3_differences.csv',
139 | index = False)
140 |
141 |
142 |
143 | # Indicator 5 #
144 | I5_Adm3_path
145 |
146 | i5 = pd.read_csv(I5_Adm3_path + 'origin_destination_connection_matrix_per_day.csv')
147 | i5i = pd.read_csv(ICUST + 'origin_destination_connection_matrix_per_day.csv')
148 |
149 | #cdf5 = compare_dfs(i5,i5i, outputdf = True)
150 | #cdf5["diff_flag"] = cdf5["total_count_x"] != cdf5["total_count_y"]
151 |
152 | compare_dfs(i5,i5i, outputdf = False)
153 |
154 | bar = i5.merge(i5i, on = ['connection_date', 'region_from', 'region_to'])
155 | bar["diff_flag"] = bar["od_count_x"] != bar["od_count_y"]
156 | bar["diff"] = bar["od_count_x"] - bar["od_count_y"]  # needed for the daily difference plot below
157 | diff_day_df = bar.groupby('connection_date').sum()
158 | diff_day_df = diff_day_df.reset_index()
159 |
160 | diff_day_df['day'] = pd.to_datetime(diff_day_df['connection_date']).dt.day
161 |
162 | plt.plot('day',
163 | 'diff',
164 | data = diff_day_df)
165 |
166 | # set(bar['connection_date'])
167 | # len(set(pd.to_datetime(foo['connection_date']).dt.date))
168 | # len(set(foo['region_from']))
169 |
170 | # # Absolute difference by day
171 | # bar['diff'] = bar['od_count_x']- bar['od_count_y']
172 |
173 |
174 |
175 | # foo = bar[bar['diff_flag']]
176 |
177 | # foo['diff'] = foo['od_count_x']- foo['od_count_y']
178 | # foo['diff'].mean()
179 |
180 | export_i5_merged = bar.rename(
181 | columns = {
182 | 'subscriber_count_x' : 'subscriber_count',
183 | 'subscriber_count_y' : 'subscriber_count_isaac',
184 | 'od_count_x': 'od_count_x',
185 | 'od_count_y': 'od_count_isaac',
186 | 'total_count_x' : 'total_count',
187 | 'total_count_y' : 'total_count_isaac'})
188 |
189 |
190 |
191 | export_i5_merged\
192 | .to_csv('C:/Users/wb519128/Desktop/i5_merged_with_Isaacs.csv',
193 | index = False)
194 |
195 |
196 |
197 |
198 | #-----------------------------------------------------------------#
199 | # DRAFT
200 |
201 | file_i = filenames[0]
202 |
203 | d1 = pd.read_csv(FLOWM_adm3_path + file_i)
204 | d2 = pd.read_csv(IFLOW + file_i)
205 |
206 | cdf = compare_dfs(d1,d2, outputdf = True)
207 | cdf["diff_flag"] = cdf["count_x"] != cdf["count_y"]
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/modules/outliers.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | # Class to help counting outliers
3 | class outlier_counter:
4 | """Class to count outliers
5 |
6 | Attributes
7 | ----------
8 | calls : a dataframe. which data to process
9 | spark : an initialised spark connection.
10 | thresholds : a dictionary with outlier thresholds to be used.
11 |
12 | Methods
13 | -------
14 | count()
15 | count outliers and print results
16 |
17 | print_results(df)
18 | print results of outlier counts
19 | """
20 |
21 | def __init__(self,
22 | calls,
23 | spark = spark,
24 | thresholds = {'min_transactions' : 3,
25 | 'max_avg_transactions' : 100,
26 | 'max_transactions_in_single_day' : 200}):
27 | """
28 | Parameters
29 | ----------
30 |         calls, spark, thresholds : documented in the class docstring above.
31 | """
32 | self.calls = calls
33 | self.spark = spark
34 | self.counts = {}
35 | self.dfs = {}
36 | self.thresholds = thresholds
37 |
38 |
39 | def count(self):
40 | # count all records
41 | self.counts['all_records'] = self.calls.count()
42 |
43 | # count of days in dataframe
44 | self.counts['number_of_days'] = self.calls.select('call_date').distinct().count()
45 |
46 | # Count number of distinct users
47 | self.counts['distinct_ids'] = self.calls.select('msisdn').distinct().count()
48 |
49 | # Get # of records per user
50 | self.dfs['records_per_user'] = self.calls.groupby('msisdn').count()
51 |
52 | # Get # of records per user per day
53 | self.dfs['records_per_user_per_day'] = self.calls.groupby('msisdn', 'call_date').count()
54 |
55 |         # Identify daily usage outlier msisdns
56 | self.dfs['too_few_transactions'] = self.dfs['records_per_user']\
57 | .where(F.col('count') < self.thresholds['min_transactions'])\
58 | .select('msisdn').distinct()
59 | self.dfs['too_many_avg_transactions'] = self.dfs['records_per_user']\
60 | .where(F.col('count') > (self.counts['number_of_days'] * \
61 | self.thresholds['max_avg_transactions']))\
62 | .select('msisdn').distinct() # more than __ calls and texts per day on average
63 | self.dfs['too_many_transactions_in_single_day'] = \
64 | self.dfs['records_per_user_per_day']\
65 | .where(F.col('count') > self.thresholds['max_transactions_in_single_day'])\
66 | .select('msisdn').distinct() # more than __ calls and texts in a single day
67 |
68 | # Count the outlier accounts
69 | self.counts['too_few_transactions'] = \
70 | self.dfs['too_few_transactions'].count()
71 | self.counts['too_many_avg_transactions'] = \
72 | self.dfs['too_many_avg_transactions'].count()
73 | self.counts['too_many_transactions_in_single_day'] = \
74 | self.dfs['too_many_transactions_in_single_day'].count()
75 |
76 |         # Calculate the outlier account fraction
77 | self.counts['too_few_transactions_fraction'] = \
78 | self.counts['too_few_transactions'] / self.counts['distinct_ids']
79 | self.counts['too_many_avg_transactions_fraction'] = \
80 | self.counts['too_many_avg_transactions'] / self.counts['distinct_ids']
81 | self.counts['too_many_transactions_in_single_day_fraction'] = \
82 | self.counts['too_many_transactions_in_single_day'] / self.counts['distinct_ids']
83 |
84 | # Keep only ids that aren't among the outlier accounts
85 | self.filtered_transactions = self.calls.join(self.dfs['too_few_transactions'],
86 | self.calls['msisdn'] == \
87 | self.dfs['too_few_transactions']['msisdn'],
88 | how ='leftanti').select(self.calls.columns[0:])
89 | self.filtered_transactions = self.filtered_transactions\
90 | .join(self.dfs['too_many_avg_transactions'],
91 | self.filtered_transactions['msisdn'] == \
92 | self.dfs['too_many_avg_transactions']['msisdn'],
93 | how ='leftanti')\
94 | .select(self.filtered_transactions.columns[0:])
95 | self.filtered_transactions = self.filtered_transactions\
96 | .join(self.dfs['too_many_transactions_in_single_day'],
97 | self.filtered_transactions['msisdn'] == \
98 | self.dfs['too_many_transactions_in_single_day']['msisdn'],
99 | how ='leftanti')\
100 | .select(self.filtered_transactions.columns[0:])
101 |
102 | # count how many we kept and dropped
103 | self.counts['filtered_transactions'] = self.filtered_transactions.count()
104 | self.counts['dropped_calls'] = \
105 | self.counts['all_records'] - self.counts['filtered_transactions']
106 | self.print_results()
107 |
108 |
109 | def print_results(self):
110 | print('Total number of unique SIMs: {:,}'.format(self.counts['distinct_ids']))
111 | print('Number of SIMs with less than {} transactions: {:,}'\
112 | .format(self.thresholds['min_transactions'],
113 | self.counts['too_few_transactions']))
114 | print('Number of SIMs with more than {} transactions per day on average: {:,}'\
115 | .format(self.thresholds['max_avg_transactions'],
116 | self.counts['too_many_avg_transactions'] ))
117 | print('Number of SIMs with more than {} transactions in a single day: {:,}'\
118 | .format(self.thresholds['max_transactions_in_single_day'],
119 | self.counts['too_many_transactions_in_single_day']))
120 | print('SIMs with less than {} transactions as a fraction of all accounts: {:.8f}'\
121 | .format(self.thresholds['min_transactions'],
122 | self.counts['too_few_transactions_fraction']))
123 | print('SIMs with more than {} transactions per day on average as a fraction of all accounts: {:.8f}'\
124 | .format(self.thresholds['max_avg_transactions'],
125 | self.counts['too_many_avg_transactions_fraction']))
126 | print('SIMs with more than {} transactions on a single day as a fraction of all accounts: {:.8f}'\
127 | .format(self.thresholds['max_transactions_in_single_day'],
128 | self.counts['too_many_transactions_in_single_day_fraction']))
129 | print('Number of transactions that would be kept: {:,}'\
130 | .format(self.counts['filtered_transactions']))
131 | print('Number of transactions that would be deleted: {:,}'\
132 | .format(self.counts['dropped_calls']))
133 | print('Fraction of transactions that would be deleted: {:.8f}'\
134 | .format(self.counts['dropped_calls'] / self.counts['all_records']))
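135 |
136 | # Example usage (a sketch, not part of the original module): assumes a pyspark
137 | # dataframe `calls` with 'msisdn' and 'call_date' columns, an active `spark`
138 | # session, and pyspark functions available as `F` (e.g., via import_packages.py).
139 | #
140 | #   counter = outlier_counter(calls, spark = spark)
141 | #   counter.count()                               # prints the outlier report
142 | #   clean_calls = counter.filtered_transactions   # records excluding outlier SIMs
143 | #   print(counter.counts['dropped_calls'])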
--------------------------------------------------------------------------------
/data-checks/Archive/01_completenes_checks.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------#
2 | # DATA CHECKS - Completeness checks
3 | #-----------------------------------------------------------------#
4 |
5 | #-----------------------------------------------------------------#
6 | # Settings
7 |
8 | from globals import *
9 |
10 | EXPORT_FIGURES = True
11 |
12 | # Default variable names
13 | timevar = 'hour'
14 | regvar = 'region'
15 |
16 | INDICATORS_path = DATA_path + 'isaac-results/Archive/e_23_07_2020_converage_23_05_to_30_06/'
17 |
18 | #-----------------------------------------------------------------#
19 | # Load data
20 |
21 | # Define loading function that depends on the existing folder
22 | # structure, and also removes headers that appear in the middle of the data,
23 | # if there are any
24 | def loadfiles(file_name,
25 | admin = 3,
26 | path = INDICATORS_path):
27 | print(file_name, admin)
28 | # Load external file
29 | folder = path + 'admin' + str(admin) + '/'
30 | de = None
31 | de = pd.read_csv(folder + file_name)
32 |     # Patch cleaning of headers in the middle of the data
33 | c1_name = de.columns[0]
34 | de = de[~de[c1_name].astype(str).str.contains(c1_name)]
35 | return(de)
36 |
37 |
38 | # Indicator 1
39 | fi = loadfiles(file_name = 'transactions_per_hour.csv')
40 |
41 | # Indicator 2
42 | f2 = loadfiles('unique_subscribers_per_day.csv')
43 |
44 | # Indicator 5
45 | f5 = loadfiles('origin_destination_connection_matrix_per_day.csv')
46 |
47 | # Indicator 9
48 | f9 = loadfiles('week_home_vs_day_location_per_day.csv', admin = 2)
49 |
50 |
51 | #-----------------------------------------------------------------#
52 | # Processing data
53 |
54 | # Remove missings
55 | reg_missings_bol = fi['region'].isin(missing_values)
56 | fi_cl = fi[~reg_missings_bol]
57 |
58 | # Check for duplicates
59 | # sum(fi_cl.duplicated())
60 | fi_cl['count'] = fi_cl['count'].astype(int)
61 |
62 | # Date vars
63 | fi_cl['date'] = pd.to_datetime(fi_cl['hour']).dt.date
64 | # fi_cl['hour'] = pd.to_datetime(fi_cl[timevar]).dt.hour
65 | # fi_cl['month'] = pd.to_datetime(fi_cl['date']).dt.month
66 |
67 | # Make sure dates are datetime
68 | fi_cl['hour'] = fi_cl['hour'].astype('datetime64')
69 |
70 |
71 | # I5
72 | f5['date'] = pd.to_datetime(f5['connection_date']).dt.date
73 |
74 |
75 | #-----------------------------------------------------------------#
76 | # Create aggregated datasets at the country level for plotting
77 |
78 | #----------------------------
79 | # I1 - transactions per hour
80 |
81 | # Create plots data
82 | f1_agg_hour = fi_cl\
83 | .groupby(['date', 'hour'])\
84 | .agg({'region' : pd.Series.nunique ,
85 | 'count' : np.sum})\
86 | .reset_index()\
87 | .sort_values(['date', 'hour'])\
88 | .rename(columns = {'region' : 'n_regions'})
89 |
90 | f1_agg_date = fi_cl\
91 | .groupby('date')\
92 | .agg({'region' : pd.Series.nunique ,
93 | 'count' : np.sum})\
94 | .reset_index()\
95 | .sort_values(['date'])\
96 | .rename(columns = {'region' : 'n_regions'})
97 |
98 | #----------------------------
99 | # I5 - OD matrix per day data
100 |
101 | f5['date'] = pd.to_datetime(f5['connection_date']).dt.date
102 |
103 | f5_agg_date = f5\
104 | .groupby('date')\
105 | .agg({'region_from' : pd.Series.nunique ,
106 | 'region_to' : pd.Series.nunique,
107 | 'total_count' : np.sum})\
108 | .reset_index()\
109 | .sort_values('date')
110 |
111 | #----------------------------
112 | # Complete dates and time
113 |
114 | # Create data sets with time indexes and fill blanks with 0s
115 | def time_complete(data, timevar = timevar, timefreq = 'D'):
116 | data[timevar] = data[timevar].astype('datetime64')
117 | full_time_range = pd.date_range(data[timevar].min(),
118 | data[timevar].max(),
119 | freq = timefreq)
120 | data = data.set_index(timevar)
121 | data = data.reindex(full_time_range, fill_value=0)
122 | return(data)
123 |
124 | f1_agg_date = time_complete(f1_agg_date, 'date')
125 | f1_agg_hour = time_complete(f1_agg_hour, 'hour', 'H')
126 | f5_agg_date = time_complete(f5_agg_date, 'date')
127 |
128 | #-----------------------------------------------------------------#
129 | # I1 - Day Plots
130 |
131 | # PLot number of regions with transactions per day.
132 |
133 | # Number of regions plot
134 | plt.figure(figsize=(12, 6))
135 | date_plot = sns.lineplot(f1_agg_date.index,
136 | f1_agg_date['n_regions'])
137 | # Export
138 | date_plot.figure.savefig(OUT_path + "i1_dates_ward_count.png")
139 |
140 |
141 | # Number of transactions plot
142 | plt.figure(figsize=(12, 6))
143 | obs_per_day_plot = sns.lineplot(
144 | f1_agg_date.index,
145 | f1_agg_date['count'])
146 | # Export
147 | if EXPORT_FIGURES:
148 | obs_per_day_plot.figure.savefig(OUT_path + "i1_dates_n_obs.png")
149 |
150 |
151 | #-----------------------------------------------------------------#
152 | # I1 - Hour Plots
153 |
154 | # Plot total number of transactions per hour to check for outliers
155 |
156 | #------------------
157 | # Number of regions
158 | plt.figure(figsize=(12, 6))
159 | hour_plot = sns.lineplot(
160 | f1_agg_hour.index,
161 | f1_agg_hour['n_regions'])
162 |
163 | # Cosmetics
164 | # x_ticks = list(set(fi_agg_hour['hour'].astype(str)))[0:len(fi_agg_hour):5]
165 | # x_ticks.sort()
166 | # hour_plot.set_xticklabels(x_ticks)
167 |
168 | # Export
169 | if EXPORT_FIGURES:
170 | hour_plot.figure.savefig(OUT_path + "i1_hours_ward_count.png")
171 |
172 | #----------------------------
173 | # Total count of transactions
174 | plt.figure(figsize=(12, 6))
175 | obs_per_hour_plot = sns.lineplot(
176 | f1_agg_hour.index.values,
177 | f1_agg_hour['count'])
178 |
179 | # Cosmetics
180 | # x_ticks = list(set(fi_agg_hour['date'].astype(str)))[0:len(fi_agg_hour):5]
181 | # x_ticks.sort()
182 | # obs_per_hour_plot.set_xticklabels(x_ticks)
183 |
184 | # Export
185 | if EXPORT_FIGURES:
186 | obs_per_hour_plot.figure.savefig(OUT_path + "i1_hours_n_obs.png")
187 |
188 |
189 | # Table with hours
190 | # fi_obs_per_hour[fi_obs_per_hour['date'] == dt.date(2020, 4, 30)]
191 | # apr30 = f1_agg_hour[f1_agg_hour['date'] == dt.date(2020, 4, 30)]
192 |
193 | # apr30.to_csv(OUT_path + "i1_hour_apr30.csv",
194 | # index = False)
195 |
196 |
197 | #-----------------------------------------------------------------#
198 | # I5 - Day Plots
199 |
200 | # Plot total number of movements per day
201 |
202 | # plot total count
203 | f5_plot = sns.lineplot(
204 | f5_agg_date.index,
205 | f5_agg_date['total_count'])
206 | # Export
207 | if EXPORT_FIGURES:
208 | f5_plot.figure.savefig(OUT_path + "i5_dates_total_count.png")
209 |
210 |
211 | #-----------------------------------------------------------------#
212 | # I9 - Week plots
213 |
214 |
215 | # f9_plot = sns.lineplot(
216 | # f9_agg_date['week'],
217 | # f9_agg_date['mean_distance'])
218 | # # Export
219 | # f9_plot.figure.savefig(OUT_path + "i9_week_mean_distance.png")
220 |
--------------------------------------------------------------------------------
/data-panel/Archive/panel_draft2.py:
--------------------------------------------------------------------------------
1 |
2 | # Custom suffixes?
3 | # Class??
4 |
5 | EXPORT = False
6 |
7 | #-----------------------------------------------------------------#
8 | # Settings
9 |
10 | import os
11 | import re
12 | import pandas as pd
13 | import numpy as np
14 | import datetime as dt
15 |
16 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/"
17 | DATA_POC = DATA_path + "proof-of-concept/"
18 |
19 | OUT_panel = DATA_POC + "panel_indicators/"
20 |
21 |
22 | # CHANGE:
23 | IRESULTS = DATA_path + "Isaac-results/"
24 |
25 | IFLOW_path = IRESULTS + "flowminder/"
26 | ICUST_path = IRESULTS + "custom/"
27 |
28 | INEW_PATH_2_mar = IRESULTS + "Archive/e_17_06_2020_coverage_03_to_04/admin2_priority/mar1-mar31/"
29 | INEW_PATH_2_apr = IRESULTS + "Archive/e_17_06_2020_coverage_03_to_04/admin2_priority/mar23-apr30/"
30 |
31 | INEW_PATH_3_mar = IRESULTS + "Archive/e_17_06_2020_coverage_03_to_04/admin3_priority/mar1-mar31/"
32 | INEW_PATH_3_apr = IRESULTS + "Archive/e_17_06_2020_coverage_03_to_04/admin3_priority/mar23-apr30/"
33 |
34 |
35 | IOLD_PATH_2_mar = IRESULTS + "custom/admin2/"
36 | IOLD_PATH_3_mar = IRESULTS + "Archive/e_08_06_2020_coverage_04_to_05/admin3_custom/"
37 |
38 |
39 | # Load list of internal indicators to make it
40 | # easier to bulk load files
41 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/"
42 |
43 | internal_indicators = pd.read_csv(DATA_POC + 'indicators_list.csv')
44 | internal_indicators['path'] = DATA_path + internal_indicators['path']
45 |
46 | # Load files function
47 | def loadfiles(file_name,
48 | files_df = internal_indicators,
49 | admin = 3,
50 | path_external = None):
51 | if path_external is None:
52 |         # Set index
53 | idx = files_df[(files_df['file'] == file_name) & (files_df['level'] == admin)].index.values[0] # Load internal
54 | # Custom file names for i5, i7 and i9
55 | if file_name in ['mean_distance_per_day',
56 | 'origin_destination_connection_matrix_per_day',
57 | 'mean_distance_per_week',
58 | 'month_home_vs_day_location_per_day',
59 | 'week_home_vs_day_location_per_day']:
60 | file_name_i = file_name + '_7day_limit.csv'
61 | else:
62 | file_name_i = file_name + '.csv'
63 | # External names
64 | print(file_name, admin)
65 | # Load data
66 | d = None
67 | d = pd.read_csv(files_df['path'][idx] + file_name_i)
68 | else:
69 | print(file_name)
70 | file_name = file_name + '.csv'
71 | d = None
72 | d = pd.read_csv(path_external + file_name)
73 | # Patch clean of headers in the middle of the data
74 | c1_name = d.columns[0]
75 | d = d[~d[c1_name].astype(str).str.contains(c1_name)]
76 | # Turn everything to string for simplicity
77 |         d = d.astype(str)
78 | return d
79 |
80 | # i1 = loadfiles('transactions_per_hour')
81 |
82 | # i1e = loadfiles('transactions_per_hour',
83 | # path_external= INEW_PATH_3_apr)
84 |
85 | # Drop custom missings
86 | def drop_custna(data, columns):
87 | na_list = ['nan', '', '99999', float("inf")]
88 | for cols in columns:
89 | data = data[~(data[cols].isin(na_list))]
90 | return(data)
91 |
92 | # Clean function
93 | def clean(d, index_cols):
94 |     # Remove missings
95 | d = d.dropna()
96 | # All but the last column
97 | #index_cols = list(d.columns[0:-1])
98 | # d = drop_custna(d, index_cols)
99 | return(d)
100 |
101 | #-----------------------------------------------------------------#
102 | # Load indicators
103 | i5_index = ['connection_date', 'region_from', 'region_to']
104 |
105 | i5 = loadfiles('origin_destination_connection_matrix_per_day',
106 | admin = 2)
107 | i5e_mar = loadfiles('origin_destination_connection_matrix_per_day',
108 | path_external= INEW_PATH_2_mar)
109 | i5e_apr = loadfiles('origin_destination_connection_matrix_per_day',
110 | path_external= INEW_PATH_2_apr)
111 |
112 | i7_index = ['home_region', 'day']
113 | i7 = loadfiles('mean_distance_per_day', admin = 2)
114 |
115 | # March files were only rerun for i5 and i9, so I'm using the old extraction from Feb to Apr
116 | i7e_mar = loadfiles('mean_distance_per_day',
117 | path_external= IOLD_PATH_2_mar)
118 | i7e_apr = loadfiles('mean_distance_per_day',
119 | path_external= INEW_PATH_2_apr)
120 |
121 | #-----------------------------------------------------------------#
122 | # Panel
123 | # Create panel
124 | def panel(d,
125 | de,
126 | index_cols,
127 | #countvars,
128 | r_suffix = '_ecnt',
129 | timevar = None,
130 | how = 'outer'):
131 | if timevar is None:
132 | timevar = index_cols[0]
133 |     # Make sure time var is a date
134 | d[timevar] = d[timevar].astype('datetime64')
135 | de[timevar] = de[timevar].astype('datetime64')
136 | # Join
137 | md = d.merge(de,
138 | on = index_cols,
139 | how = how,
140 | suffixes=('', r_suffix))
141 | return md
142 |
143 |
144 | # Date cutoffs for switching between extractions are defined below,
145 | # after each panel (p5, p7) has been constructed.
146 |
147 | #--------#
148 | # i5 Panel
149 | p5 = panel(i5, i5e_mar, i5_index, timevar = 'connection_date')
150 | p5 = panel(p5,
151 | i5e_apr,
152 | i5_index,
153 | r_suffix= '_ecnt_apr',
154 | timevar = 'connection_date')
155 |
156 | d1_bol = (p5['connection_date'] >= np.datetime64(dt.date(2020, 3, 15)))
157 | d2_bol = (p5['connection_date'] >= np.datetime64(dt.date(2020, 4, 1)))
158 |
159 |
160 | countvars = ['subscriber_count','od_count', 'total_count']
161 | for var in countvars:
162 | varname = var + '_p'
163 | # Base value as our indicator
164 | p5[varname] = p5[var]
165 | # Replace values based on dates
166 | p5.loc[d1_bol, varname] = p5.loc[d1_bol, var + '_ecnt']
167 | p5.loc[d2_bol, varname] = p5.loc[d2_bol, var + '_ecnt_apr']
168 |
169 | p5 = p5.dropna(subset = ['connection_date']).sort_values(i5_index)
170 |
171 | # p5.to_csv('C:/Users/wb519128/Desktop/i5_test.csv', index = False)
172 |
173 | if EXPORT:
174 | p5.to_csv(OUT_panel + 'i5_admin2_temp.csv', index = False)
175 |
176 | #--------#
177 | # i7 Panel
178 | p7 = panel(i7, i7e_mar, i7_index, timevar = 'day')
179 | p7 = panel(p7,
180 | i7e_apr,
181 | i7_index,
182 | r_suffix= '_ecnt_apr',
183 | timevar = 'day')
184 |
185 |
186 | d1_bol = (p7['day'] >= np.datetime64(dt.date(2020, 3, 15)))
187 | d2_bol = (p7['day'] >= np.datetime64(dt.date(2020, 4, 1)))
188 |
189 | countvars = ['mean_distance', 'stdev_distance']
190 | for var in countvars:
191 | varname = var + '_p'
192 | # Base value as our indicator
193 | p7[varname] = p7[var]
194 | # Replace values based on dates
195 | p7.loc[d1_bol, varname] = p7.loc[d1_bol, var + '_ecnt']
196 | p7.loc[d2_bol, varname] = p7.loc[d2_bol, var + '_ecnt_apr']
197 |
198 |
199 |
200 |
201 | p7 = p7.dropna(subset = ['day']).sort_values(i7_index)
202 |
203 | # Export
204 | if EXPORT:
205 | p7.to_csv(OUT_panel + 'i7_admin2_temp.csv', index = False)
206 |
207 |
208 | # p7.to_csv('C:/Users/wb519128/Desktop/i7_test.csv', index = False)
209 |
210 |
--------------------------------------------------------------------------------
/data-panel/Archive/panel_draft.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------#
2 | # CREATE PANEL
3 | #-----------------------------------------------------------------#
4 |
5 | # This file combines two different sources of the indicators created
6 | # in cdr-aggregation to create a panel.
7 |
8 | # Dates at which different sources are connected are specific to
9 | # each indicator and the particularities of those sources
10 |
11 | #-----------------------------------------------------------------#
12 | # TO DO
13 |
14 | # Rewrite load function
15 |
16 | # Reorganize file paths to remove dependency on MASTER.py
17 |
18 | #-----------------------------------------------------------------#
19 | # Settings
20 |
21 | import os
22 | import re
23 | import pandas as pd
24 | import numpy as np
25 | import datetime as dt
26 |
27 | #-----------------------------------------------------------------#
28 | # Globals
29 |
30 | # Default connection date.
31 | append_date = dt.date(2020, 3, 15)
32 |
33 | #-----------------------------------------------------------------#
34 | # Function definitions
35 |
36 | # Drop custom missings
37 | def drop_custna(data, columns):
38 | na_list = ['nan', '', '99999', float("inf")]
39 | for cols in columns:
40 | data = data[~(data[cols].isin(na_list))]
41 | return(data)
42 |
43 | # Load files function
44 | def loadfiles(file_name,
45 | files_df = internal_indicators,
46 | admin = 3):
47 |     # Set index
48 | idx = files_df[(files_df['file'] == file_name) & (files_df['level'] == admin)].index.values[0] # Load internal
49 | # Custom file names for i5, i7 and i9
50 | if file_name in ['mean_distance_per_day',
51 | 'origin_destination_connection_matrix_per_day',
52 | 'mean_distance_per_week',
53 | 'month_home_vs_day_location_per_day',
54 | 'week_home_vs_day_location_per_day']:
55 | file_name_i = file_name + '_7day_limit.csv'
56 | else:
57 | file_name_i = file_name + '.csv'
58 | # External names
59 | file_name_e = file_name + '.csv'
60 | print(file_name, admin)
61 | # Load data
62 | d = None
63 | d = pd.read_csv(files_df['path'][idx] + file_name_i)
64 | # Load external
65 | if files_df['indicator'][idx] == 'flow':
66 | ext_path = IFLOW_path
67 | else:
68 | ext_path = ICUST_path
69 | # Load external file
70 | ext_folder = ext_path + 'admin' + str(files_df['level'][idx]) + '/'
71 | de = None
72 | de = pd.read_csv(ext_folder + file_name_e)
73 |     # Patch cleaning of headers in the middle of the data
74 | c1_name = d.columns[0]
75 | de = de[~de[c1_name].astype(str).str.contains(c1_name)]
76 | return([d, de])
77 |
78 | # Clean function
79 | def clean(d, index_cols):
80 |     # Remove missings
81 | d = d.dropna()
82 | # All but the last column
83 | #index_cols = list(d.columns[0:-1])
84 | d = drop_custna(d, index_cols)
85 | return(d)
86 |
87 | # Create panel
88 | def simp_panel(d,
89 | de,
90 | index_cols,
91 | #countvars,
92 | append_date,
93 | compare = False,
94 | timevar = None,
95 | how = 'outer'):
96 | if timevar is None:
97 | timevar = index_cols[0]
98 | # Clean
99 | d = clean(d, index_cols)
100 | de = clean(de, index_cols)
101 | # Join
102 | md = d.merge(de,
103 | on = index_cols,
104 | how = how,
105 | suffixes=('', '_ecnt'))
106 |     # Replace count values with the internal source up to the append date and
107 |     # the external source after it
108 | countvars = list(set(d.columns) - set(index_cols))
109 | for var in countvars:
110 | if compare:
111 | varname = var + '_p'
112 | else:
113 | varname = var
114 |
115 | md[varname] = np.where(pd.to_datetime(md[timevar]).dt.date <= append_date,
116 | md[var],
117 | md[var + '_ecnt'])
118 | # Remove other columns
119 | if not compare:
120 | md = md.filter(regex=r'^((?!_ecnt).)*$')
121 | # Return
122 | return md.sort_values(index_cols).dropna(subset= index_cols)
123 |
124 | #-----------------------------------------------------------------#
125 | # Load indicators
126 |
127 | # Define an indicator class that loads and cleans both indicator sources
128 | class i_indicator:
129 | """
130 | This class contains information to load indicator files both
131 | from our original indicators and externally created ones.
132 |
133 | load() method loads both datasets
134 | clean() method removes missings from both datasets
135 | """
136 | def __init__(self,
137 | file_name,
138 | index_cols,
139 | admin = 3):
140 | self.file_name = file_name
141 | self.index_cols = index_cols
142 | self.admin = admin
143 | # Call methods when intializing
144 | self.load()
145 | self.clean()
146 | # Load data
147 | def load(self):
148 | self.data, self.data_e = loadfiles(self.file_name,
149 | admin = self.admin)
150 | # Clean data
151 | def clean(self):
152 | self.data = clean(self.data, self.index_cols)
153 | self.data_e = clean(self.data_e, self.index_cols)
154 |
155 | # Create panel data
156 | def create_panel(self,
157 | timevar = None,
158 | compare = False,
159 | append_date = append_date):
160 | panel = simp_panel(self.data,
161 | self.data_e,
162 | self.index_cols,
163 | append_date,
164 | compare = compare,
165 | timevar=timevar)
166 | return panel
167 |
168 | # Indicator 1
169 | # Sum across all observations in the given hour and lowest admin
170 | # area.
171 | i1 = i_indicator('transactions_per_hour',
172 | ['hour', 'region'])
173 |
174 | # Indicator 2
175 | # Sum all unique subscribers with an observation in the given
176 | # admin area and time period.
177 | i2 = i_indicator('unique_subscribers_per_hour',
178 | ['hour', 'region'])
179 |
180 |
181 | # Indicator 3
182 | # Sum all unique subscribers with an observation in the given
183 | # admin area and time period.
184 | i3 = i_indicator('unique_subscribers_per_day',
185 | ['day', 'region'])
186 |
187 | # Indicator 4
188 | # i4 = i_indicator('percent_of_all_subscribers_active_per_day',
189 | # ['home_region', 'day'])
190 |
191 | # Indicator 5
192 | i5 = i_indicator('origin_destination_connection_matrix_per_day',
193 | ['connection_date', 'region_from', 'region_to'])
194 | # Indicator 7
195 | i7 = i_indicator('mean_distance_per_day',
196 | ['home_region', 'day'])
197 |
198 | # Indicator 8
199 | i8 = i_indicator('mean_distance_per_week',
200 | ['home_region', 'week'])
201 |
202 | # Indicator 9
203 | i9 = i_indicator('week_home_vs_day_location_per_day',
204 | ['region', 'home_region', 'day'],
205 | admin = 2)
206 |
207 | #-----------------------------------------------------------------#
208 | # Create panel
209 |
210 | # Make particular changes to indicators as needed here
211 |
212 | # Panel with defaults
213 | i_list = [i1, i2, i3, i5, i9]
214 | panel_list = list(map(lambda x: x.create_panel() , i_list))
215 |
216 | # Custom arguments
217 | i7_p = i7.create_panel( timevar = 'day')
218 |
219 | #-----------------------------------------------------------------#
220 | # Export
221 |
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/modules/aggregator.py:
--------------------------------------------------------------------------------
1 | import os, glob, shutil
2 | if os.environ['HOME'] != '/root':
3 | from modules.DataSource import *
4 | from modules.sql_code_aggregates import *
5 | databricks = False
6 | else:
7 | databricks = True
8 |
9 | # Databricks notebook source
10 | class aggregator:
11 | """Class to handle aggregations.
12 |
13 |
14 | Attributes
15 | ----------
16 | result_stub : a string. File path where to save results
17 | datasource : an instance of DataSource class. Holds all dataframes and paths required
18 | regions : a pyspark dataframe. Admin level this aggregator will be used for
19 | calls : a pyspark dataframe. cdr data
20 | cells : a pyspark dataframe. admin region to tower mapping
21 | spark : an initialised spark connection. spark connection this aggregator should use
22 | dates : a dictionary. dates the aggregator should run over
23 | intermediate_tables : tables that we don't want written to csv
24 |
25 |
26 | Methods
27 | -------
28 | create_sql_dates()
29 | Convert the dates to strings to be used in the flowminder sql queries
30 |
31 | create_view(df, table_name)
32 | Creates a view of a dataframe
33 |
34 | save(table_name)
35 | Repartitions a dataframe into a single partition and writes it to a csv file
36 |
37 | save_and_report(table_name)
38 | Checks whether csv file exists before saving table_name to csv
39 |
40 | rename_csv(table_name)
41 | - rename a specific csv
42 | - move a csv to parent folder, rename it, then delete its remaining folder
43 |
44 | rename_all_csvs(table_name)
45 | renames all csvs at once
46 |
47 | rename_if_not_existing(table_name)
48 | rename only if the file doesn't exist as csv yet, handles errors
49 |
50 | check_if_file_exists(table_name)
51 | checks whether a csv exists before we re-create
52 |
53 |
54 |
55 | """
56 |
57 | def __init__(self,
58 | result_stub,
59 | datasource,
60 | regions,
61 | intermediate_tables = ['home_locations']):
62 | """
63 | Parameters
64 | ----------
65 | result_stub : where to save results
66 | datasource : holds all dataframes and paths required
67 | regions : admin level this aggregator will be used for
68 | intermediate_tables : tables that we don't want written to csv
69 | """
70 | self.datasource = datasource
71 | self.result_path = datasource.results_path + result_stub
72 | self.calls = datasource.parquet_df
73 | self.calls.createOrReplaceTempView('calls')
74 | self.cells = getattr(datasource, regions)
75 | self.cells.createOrReplaceTempView("cells")
76 | self.spark = datasource.spark
77 | self.dates = datasource.dates
78 | self.create_sql_dates()
79 | self.sql_code = write_sql_code(calls = self.calls,
80 | start_date = self.dates_sql['start_date'],
81 | end_date = self.dates_sql['end_date'],
82 | start_date_weeks = self.dates_sql['start_date_weeks'],
83 | end_date_weeks = self.dates_sql['end_date_weeks'])
84 | self.table_names = self.sql_code.keys()
85 | self.intermediate_tables = intermediate_tables
86 |
87 | def create_sql_dates(self):
88 | self.dates_sql = {'start_date' : "\'" + self.dates['start_date'].isoformat('-')[:10] + "\'",
89 | 'end_date' : "\'" + self.dates['end_date'].isoformat('-')[:10] + "\'",
90 | 'start_date_weeks' : "\'" + self.dates['start_date_weeks'].isoformat('-')[:10] + "\'",
91 | 'end_date_weeks' : "\'" + self.dates['end_date_weeks'].isoformat('-')[:10] + "\'"}
92 |
93 | def create_view(self, df, table_name):
94 | df.createOrReplaceTempView(table_name)
95 |
96 | def save(self, df, table_name):
97 | df.repartition(1).write.mode('overwrite').format('com.databricks.spark.csv') \
98 | .save(os.path.join(self.result_path, table_name), header = 'true')
99 |
100 | def save_and_report(self, df, table_name):
101 | if table_name not in self.intermediate_tables:
102 | if self.check_if_file_exists(table_name):
103 | print('Skipped: ' + table_name)
104 | else:
105 | print('--> File does not exist. Saving: ' + table_name)
106 | self.save(df, table_name)
107 | else:
108 | print('Caching: home_locations')
109 | df.createOrReplaceTempView("home_locations")
110 | self.spark.sql('CACHE TABLE home_locations').collect()
111 | self.create_view(df, table_name)
112 | return table_name
113 |
114 | def rename_csv(self, table_name):
115 | # move one folder up and rename to human-legible .csv name
116 | if databricks:
117 | dbutils.fs.mv(dbutils.fs.ls(self.result_path + '/' + table_name)[-1].path,
118 | self.result_path + '/' + table_name + '.csv')
119 | # remove the old folder
120 | dbutils.fs.rm(self.result_path + '/' + table_name + '/', recurse = True)
121 | else:
122 | os.rename(glob.glob(os.path.join(self.result_path, table_name + '/*.csv'))[0],
123 | os.path.join(self.result_path, table_name + '.csv'))
124 | shutil.rmtree(os.path.join(self.result_path, table_name))
125 |
126 | def save_and_rename_one(self, df, table_name):
127 | self.rename_if_not_existing(self.save_and_report(df, table_name))
128 |
129 | def rename_all_csvs(self):
130 | for table_name in self.table_names:
131 | if table_name in self.intermediate_tables:
132 | pass
133 | else:
134 | self.rename_if_not_existing(table_name)
135 |
136 | def rename_if_not_existing(self, table_name):
137 | if databricks:
138 | try:
139 | # does the csv already exist
140 | dbutils.fs.ls(self.result_path + '/' + table_name + '.csv')
141 | except Exception as e:
142 | # the csv doesn't exist yet, move the file and delete the folder
143 | if 'java.io.FileNotFoundException' in str(e):
144 | print('--> Renaming: ' + table_name)
145 | self.rename_csv(table_name)
146 | else:
147 | raise
148 | else:
149 | if os.path.exists(self.result_path + '/' + table_name + '.csv'):
150 | pass
151 | else:
152 | print('--> Renaming: ' + table_name)
153 | self.rename_csv(table_name)
154 |
155 | def check_if_file_exists(self, table_name):
156 | if databricks:
157 | try:
158 | # does the folder exist?
159 | dbutils.fs.ls(self.result_path + '/' + table_name)
160 | return True
161 | except Exception as e:
162 | # the folder does not exist
163 | if 'java.io.FileNotFoundException' in str(e):
164 | try:
165 | # does the csv exist?
166 | dbutils.fs.ls(self.result_path + '/' + table_name + '.csv')
167 | return True
168 | except Exception as e:
169 | # the csv does not exist
170 | if 'java.io.FileNotFoundException' in str(e):
171 | return False
172 | else:
173 | raise
174 | else:
175 | raise
176 | else:
177 | return os.path.exists(self.result_path + '/' + table_name) | \
178 | os.path.exists(self.result_path + '/' + table_name + '.csv')
179 |
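180 | # Usage sketch, commented out. The aggregation_master notebook instantiates aggregator
181 | # classes such as flowminder_aggregator with these same arguments; `ds` below stands for
182 | # an initialised DataSource instance and is an assumed name, not defined in this module.
183 | #
184 | # agg = aggregator(result_stub = '/admin2/flowminder',
185 | #                  datasource = ds,
186 | #                  regions = 'admin2_tower_map')
187 | # for table_name in agg.table_names:
188 | #     df = agg.spark.sql(agg.sql_code[table_name])
189 | #     agg.save_and_report(df, table_name)
190 | # agg.rename_all_csvs()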
--------------------------------------------------------------------------------
/data-checks/Archive/od_scaling.py:
--------------------------------------------------------------------------------
1 | #-----------------------------------------------------------------#
2 | # OD matrix scaling checks
3 | #-----------------------------------------------------------------#
4 |
5 | # This code depends on MASTER.py, since the file path objects it uses
6 | # are defined there
7 |
8 | #-----------------------------------------------------------------#
9 | # Settings
10 |
11 | import pandas as pd
12 | import matplotlib.pyplot as plt
13 | import datetime
14 | import os
15 |
16 |
17 | #-----------------------------------------------------------------#
18 | # Load data
19 | od = pd.read_csv(I5_Adm3_path +
20 | "origin_destination_connection_matrix_per_day.csv")
21 |
22 |
23 | # Number of residents
24 | res = pd.read_csv(FLOWM_adm3_path +
25 | "home_location_counts_per_region.csv")
26 |
27 | # Active residents
28 | ares = pd.read_csv(FLOWM_adm3_path +
29 | "count_unique_active_residents_per_region_per_day.csv")
30 |
31 | # Number of calls
32 | cal = pd.read_csv(FLOWM_adm3_path +
33 | "total_calls_per_region_per_day.csv")
34 |
35 |
36 | #-----------------------------------------------------------------#
37 | # Process data
38 |
39 | # Create date variable
40 | def convert_dates(df,date_col ='connection_date'):
41 | df['date'] = pd.\
42 | to_datetime(df[date_col]).\
43 | dt.date
44 | return(df)
45 |
46 | od = convert_dates(od, 'connection_date')
47 | ares = convert_dates(ares, 'visit_date')
48 | cal = convert_dates(cal, 'call_date')
49 |
50 | #-----------------------------------------------------------------#
51 | # Create different scaling factors
52 |
53 | #--------------------#
54 | # Create new variables
55 |
56 | # Number of active subscribers over total residents
57 | ares = ares.merge(res.rename(columns={"subscriber_count" : "residents"}),
58 | on = 'region',
59 | how='outer')
60 |
61 | ares = ares.rename(columns={"subscriber_count" : 'active_res'})
62 |
63 | # TODO: check whether p_active_res ever exceeds 1 (it should not)
64 | ares['p_active_res'] = ares['active_res']/ares['residents']
65 |
66 |
67 |
68 | # Number of calls over residents
69 | cal = cal.merge(res.rename(columns={"subscriber_count" : "residents"}),
70 | on = 'region',
71 | how='outer')
72 |
73 | cal['p_cals'] = cal['total_calls']/cal['residents']
74 |
75 | #------------------------------#
76 | # Add new variables to od matrix
77 |
78 | # Proportion of active residents in orig and dest
79 | od = od.\
80 | merge(ares[['region','date', 'p_active_res']],
81 | left_on= ['region_from','date'],
82 | right_on= ['region', 'date'],
83 | how='left').\
84 | rename(columns={'p_active_res' : 'p_active_res_O'}).\
85 | drop(columns='region').\
86 | merge(ares[['region','date', 'p_active_res']],
87 | left_on= ['region_to','date'],
88 | right_on= ['region', 'date'],
89 | how='left').\
90 | rename(columns={'p_active_res' : 'p_active_res_D'}).\
91 | drop(columns='region')
92 |
93 |
94 | # Proportion of calls per residents in orig and dest
95 | od = od.\
96 | merge(cal[['region','date', 'p_cals']],
97 | left_on= ['region_from','date'],
98 | right_on= ['region', 'date'],
99 | how='left').\
100 | rename(columns={'p_cals' : 'p_cals_O'}).\
101 | drop(columns='region').\
102 | merge(cal[['region','date', 'p_cals']],
103 | left_on= ['region_to','date'],
104 | right_on= ['region', 'date'],
105 | how='left').\
106 | rename(columns={'p_cals' : 'p_cals_D'}).\
107 | drop(columns='region')
108 |
109 |
110 | #-----------------#
111 | # Create indicators
112 |
113 | # Product of the proportions of active residents in origin and
114 | # destination
115 | od['w1'] = od['p_active_res_O'] * od['p_active_res_D']
116 |
117 |
118 | # Sum of calls per resident in origin and destination
119 | od['w2'] = od['p_cals_O'] + od['p_cals_D']
120 |
121 |
122 | # od['p_cals_O'].isnull().sum()/od.shape[0]
123 | # 0.5159950493247425
124 |
125 | #-----------------------------------------------------------------#
126 | # Create scaled values
127 | od['total_count_w1'] = od['total_count']/od['w1']
128 |
129 | od['total_count_w2'] = od['total_count']/od['w2']
130 |
131 | #-----------------------------------------------------------------#
132 | # Plot
133 |
134 | # Set origin region
135 | od1 = od[od['region_from'] == 'ZW102109']
136 |
137 | # Select a set of destinations
138 | # od1_top_dest = ['ZW120435','ZW142513','ZW192205',
139 | # 'ZW130720','ZW170530' ]
140 |
141 | od1_top_dest = od1['region_to'].value_counts().head(9).index
142 |
143 | # Create plot df
144 | # p1_df = od1[od1['region_to'] == 'ZW120435']
145 | p1_df = od1[od1['region_to'].isin(od1_top_dest)]
146 | p1_df.set_index(['date'],inplace=True)
147 |
148 |
149 | # Plot function that adds each destination's series directly to the figure grid
150 | def add_plts(dest_value,
151 | grid_pos,
152 | df = p1_df,
153 | dest_var = 'region_to',
154 | #x_axis = 'connection_date',
155 | y_axis = 'total_count'):
156 |
157 | df[df[dest_var] == dest_value].\
158 | plot(y= y_axis,
159 | legend= False,
160 | ax = fig.add_subplot(grid_pos))
161 |
162 | # Run plots
163 | # # Quick-and-dirty hack. Improve this if there is time
164 | # def plots_together(var):
165 | # fig, ax = plt.subplots(nrows=3,ncols=3)
166 | # fig = plt.figure()
167 | # gs = fig.add_gridspec(3, 3)
168 |
169 | # add_plts(od1_top_dest[0], gs[0, 0], y_axis = var)
170 | # add_plts(od1_top_dest[1], gs[0, 1], y_axis = var)
171 | # add_plts(od1_top_dest[2], gs[0, 2], y_axis = var)
172 | # add_plts(od1_top_dest[3], gs[1, 0], y_axis = var)
173 | # add_plts(od1_top_dest[4], gs[1, 1], y_axis = var)
174 | # add_plts(od1_top_dest[5], gs[1, 2], y_axis = var)
175 | # add_plts(od1_top_dest[6], gs[2, 0], y_axis = var)
176 | # add_plts(od1_top_dest[7], gs[2, 1], y_axis = var)
177 | # add_plts(od1_top_dest[8], gs[2, 2], y_axis = var)
178 |
179 | # return(fig)
180 | # # fig.savefig('C:/Users/wb519128/Desktop/' + var + '.png')
181 |
182 | # plots_together('total_count')
183 |
184 | var = 'total_count'
185 |
186 | # Set plot parameters
187 | fig, ax = plt.subplots(nrows=3,ncols=3)
188 | fig = plt.figure()
189 | gs = fig.add_gridspec(3, 3)
190 |
191 |
192 | add_plts(od1_top_dest[0], gs[0, 0], y_axis = var)
193 | add_plts(od1_top_dest[1], gs[0, 1], y_axis = var)
194 | add_plts(od1_top_dest[2], gs[0, 2], y_axis = var)
195 | add_plts(od1_top_dest[3], gs[1, 0], y_axis = var)
196 | add_plts(od1_top_dest[4], gs[1, 1], y_axis = var)
197 | add_plts(od1_top_dest[5], gs[1, 2], y_axis = var)
198 | add_plts(od1_top_dest[6], gs[2, 0], y_axis = var)
199 | add_plts(od1_top_dest[7], gs[2, 1], y_axis = var)
200 | add_plts(od1_top_dest[8], gs[2, 2], y_axis = var)
201 |
202 | # Export
203 | fig.savefig('C:/Users/wb519128/Desktop/' + var + '.png')
204 |
205 |
206 | var = 'total_count_w2'
207 |
208 | # Set plot parameters
209 | fig, ax = plt.subplots(nrows=3,ncols=3)
210 | fig = plt.figure()
211 | gs = fig.add_gridspec(3, 3)
212 |
213 |
214 | add_plts(od1_top_dest[0], gs[0, 0], y_axis = var)
215 | add_plts(od1_top_dest[1], gs[0, 1], y_axis = var)
216 | add_plts(od1_top_dest[2], gs[0, 2], y_axis = var)
217 | add_plts(od1_top_dest[3], gs[1, 0], y_axis = var)
218 | add_plts(od1_top_dest[4], gs[1, 1], y_axis = var)
219 | add_plts(od1_top_dest[5], gs[1, 2], y_axis = var)
220 | add_plts(od1_top_dest[6], gs[2, 0], y_axis = var)
221 | add_plts(od1_top_dest[7], gs[2, 1], y_axis = var)
222 | add_plts(od1_top_dest[8], gs[2, 2], y_axis = var)
223 |
224 | # Export
225 | fig.savefig('C:/Users/wb519128/Desktop/' + var + '.png')
226 |
227 | var = 'total_count_w1'
228 |
229 | # Set plot parameters
230 | fig, ax = plt.subplots(nrows=3,ncols=3)
231 | fig = plt.figure()
232 | gs = fig.add_gridspec(3, 3)
233 |
234 |
235 | add_plts(od1_top_dest[0], gs[0, 0], y_axis = var)
236 | add_plts(od1_top_dest[1], gs[0, 1], y_axis = var)
237 | add_plts(od1_top_dest[2], gs[0, 2], y_axis = var)
238 | add_plts(od1_top_dest[3], gs[1, 0], y_axis = var)
239 | add_plts(od1_top_dest[4], gs[1, 1], y_axis = var)
240 | add_plts(od1_top_dest[5], gs[1, 2], y_axis = var)
241 | add_plts(od1_top_dest[6], gs[2, 0], y_axis = var)
242 | add_plts(od1_top_dest[7], gs[2, 1], y_axis = var)
243 | add_plts(od1_top_dest[8], gs[2, 2], y_axis = var)
244 |
245 | # Export
246 | fig.savefig('C:/Users/wb519128/Desktop/' + var + '.png')
247 |
248 |
249 | # df = p1_df
250 | # dest_value = od1_top_dest[0]
251 | # dest_var = 'region_to'
252 | # x_axis = 'connection_date'
253 | # y_axis = 'total_count'
254 |
255 | # df[df[dest_var] == dest_value].\
256 | # plot(y= y_axis,
257 | # legend= False,
258 | # fontsize=6,
259 | # rot= 30)
260 | # plt.show()
261 |
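262 | # A more compact sketch of the three repeated plotting blocks above: loop over the
263 | # variables and grid positions instead of copying the block once per variable. Kept
264 | # commented out; the same behaviour as the blocks above is assumed, not verified.
265 | 
266 | # for var in ['total_count', 'total_count_w2', 'total_count_w1']:
267 | #     fig = plt.figure()
268 | #     gs = fig.add_gridspec(3, 3)
269 | #     for i, dest in enumerate(od1_top_dest):
270 | #         add_plts(dest, gs[i // 3, i % 3], y_axis = var)
271 | #     fig.savefig('C:/Users/wb519128/Desktop/' + var + '.png')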
--------------------------------------------------------------------------------
/data-checks/Archive/quick_checks/ward_neighbors_tower_down.R:
--------------------------------------------------------------------------------
1 | # Check subscriber counts in neighboring wards when a tower is down
2 |
3 | FIG_PATH <- file.path(PROJECT_PATH, "proof-of-concept",
4 | "outputs", "data-checks", "figures_indicators", "subscribers_neighbors_daily")
5 |
6 | FIG_PATH_OUTLIER <- file.path(PROJECT_PATH, "proof-of-concept",
7 | "outputs", "data-checks", "figures_indicators", "subscribers_neighbors_daily_outlier")
8 |
9 | # Load Data --------------------------------------------------------------------
10 | ISAAC_DATA_PATH_2 <- file.path(PROJECT_PATH, "Isaac-results", "Isaac_apr_may", "admin2_flowminder")
11 | ISAAC_DATA_PATH_3 <- file.path(PROJECT_PATH, "Isaac-results", "Isaac_apr_may", "admin3_flowminder")
12 |
13 | #### Wards
14 | wards_sp <- readRDS(file.path(CLEAN_DATA_ADM3_PATH, "wards_aggregated.Rds"))
15 |
16 | #### Tower down
17 | towers_down <- read.csv(file.path(PROOF_CONCEPT_PATH,
18 | "outputs",
19 | "data-checks",
20 | "days_wards_with_low_hours_I1_panel.csv"))
21 |
22 | towers_down <- towers_down %>%
23 | dplyr::select(region, date) %>%
24 | mutate(tower_down = T) %>%
25 | mutate(date = date %>% as.character %>% as.Date(),
26 | region = region %>% as.character())
27 |
28 | #### Raw Data
29 | df_day_adm2_raw <- read.csv(file.path(ISAAC_DATA_PATH_2,
30 | "count_unique_subscribers_per_region_per_day.csv"),
31 | stringsAsFactors=F) %>%
32 | dplyr::rename(value_raw = subscriber_count,
33 | date = visit_date) %>%
34 | dplyr::mutate(region = region %>% as.character(),
35 | date = date %>% as.Date())
36 |
37 | df_week_adm2_raw <- read.csv(file.path(ISAAC_DATA_PATH_2,
38 | "count_unique_subscribers_per_region_per_week.csv"),
39 | stringsAsFactors=F) %>%
40 | dplyr::rename(value_raw = subscriber_count,
41 | date = visit_week) %>%
42 | dplyr::mutate(region = region %>% as.character())
43 |
44 | df_day_adm3_raw <- read.csv(file.path(ISAAC_DATA_PATH_3,
45 | "count_unique_subscribers_per_region_per_day.csv"),
46 | stringsAsFactors=F) %>%
47 | dplyr::rename(value_raw = subscriber_count,
48 | date = visit_date) %>%
49 | dplyr::mutate(region = region %>% as.character(),
50 | date = date %>% as.Date())
51 |
52 | df_week_adm3_raw <- read.csv(file.path(ISAAC_DATA_PATH_3,
53 | "count_unique_subscribers_per_region_per_week.csv"),
54 | stringsAsFactors=F) %>%
55 | dplyr::rename(value_raw = subscriber_count,
56 | date = visit_week) %>%
57 | dplyr::mutate(region = region %>% as.character())
58 |
59 | #### Cleaned Data
60 | df_day_adm2 <- readRDS(file.path(CLEAN_DATA_ADM2_PATH,
61 | "count_unique_subscribers_per_region_per_day.Rds")) %>%
62 | left_join(df_day_adm2_raw, by=c("date", "region"))
63 |
64 | df_week_adm2 <- readRDS(file.path(CLEAN_DATA_ADM2_PATH,
65 | "count_unique_subscribers_per_region_per_week.Rds"))
66 |
67 | df_day_adm3 <- readRDS(file.path(CLEAN_DATA_ADM3_PATH,
68 | "count_unique_subscribers_per_region_per_day.Rds")) %>%
69 | left_join(df_day_adm3_raw, by=c("date", "region")) %>%
70 | mutate(value_raw = value_raw %>% as.numeric())
71 |
72 | df_week_adm3 <- readRDS(file.path(CLEAN_DATA_ADM3_PATH,
73 | "count_unique_subscribers_per_region_per_week.Rds"))
74 |
75 | # Create Ward Neighbors --------------------------------------------------------
76 | #### Region and id datasets
77 | ward_id_df <- wards_sp@data %>%
78 | dplyr::select(region) %>%
79 | mutate(id = 1:n())
80 |
81 | #### Create neighbor matrix
82 | neighbor_df <- gTouches(wards_sp, byid=TRUE) %>%
83 | as.data.frame() %>%
84 | mutate(id = 1:n()) %>%
85 | pivot_longer(-id) %>%
86 | dplyr::rename(n_id = name,
87 | neighbors = value) %>%
88 | dplyr::mutate(n_id = n_id %>% as.numeric()) %>%
89 |
90 | # id_n (neighbor) region
91 | left_join(ward_id_df, by = c("n_id" = "id")) %>%
92 | dplyr::rename(n_region = region) %>%
93 |
94 | # id region
95 | left_join(ward_id_df, by = "id") %>%
96 |
97 | # restrict to neighbors
98 | filter(neighbors %in% T)
99 |
100 | #### Merge data to neighbor matrix
101 | ward_data <- df_day_adm3 %>%
102 | dplyr::select(region, date, value, value_raw)
103 |
104 | neighbor_df <- neighbor_df %>%
105 |
106 | # neighbor data
107 | left_join(ward_data, by = c("n_region" = "region")) %>%
108 | dplyr::rename(value_n = value,
109 | value_raw_n = value_raw) %>%
110 |
111 | # ward data
112 | left_join(ward_data, by = c("region", "date"))
113 |
114 | #### Merge in Neighbor down
115 | neighbor_df <- neighbor_df %>%
116 | left_join(towers_down, by = c("region", "date")) %>%
117 |
118 | # tower down on any day?
119 | group_by(region) %>%
120 | mutate(tower_down_anyday = (TRUE %in% tower_down)) %>%
121 |
122 | # restrict to observations where tower down on any day
123 | filter(tower_down_anyday %in% T)
124 |
125 | #### Merge in province
126 | prov_df <- wards_sp@data %>%
127 | dplyr::select(region, province)
128 |
129 | neighbor_df <- neighbor_df %>%
130 | left_join(prov_df, by="region")
131 |
132 | # Neighbor Stats ---------------------------------------------------------------
133 | # TODO: Not naming things well, should be value_n_raw_avg, for example
134 | #### Average neighbor value
135 | neighbor_df <- neighbor_df %>%
136 | group_by(region, date) %>%
137 | mutate(value_n_avg = mean(value_raw_n, na.rm=T))
138 |
139 | #### Percent change of neighbor value from its average
140 | neighbor_df <- neighbor_df %>%
141 | group_by(n_region) %>%
142 | mutate(region_n_value_avg = mean(value_raw_n, na.rm=T)) %>%
143 | mutate(region_n_value_pc = (value_raw_n - region_n_value_avg)/region_n_value_avg) %>%
144 | mutate(region_n_value_pc_max = max(region_n_value_pc, na.rm=T))
145 |
146 | # Export Dataset ---------------------------------------------------------------
147 | #neighbor_df_clean <- neighbor_df %>%
148 | # dplyr::select(region, n_region, date, value_n)
149 |
150 | #head(neighbor_df)
151 |
152 |
153 |
154 | # Trends Over Time -------------------------------------------------------------
155 | neighbor_df %>%
156 | filter(id %in% 10) %>%
157 | ggplot() +
158 | geom_vline(data = . %>% filter(tower_down), aes(xintercept = date),
159 | color = "gray50", size=2, alpha = 0.2) +
160 | geom_line(aes(x=date, y=value_raw_n,
161 | group=n_id %>% as.factor(),
162 | color=n_id %>% as.factor())) +
163 | geom_line(aes(x=date, y=value_raw), size=2, color="black") +
164 | theme_minimal() +
165 | theme(legend.position = "none")
166 |
167 |
168 | lapply(unique(neighbor_df$province), function(province_i){
169 | print(province_i)
170 |
171 | p <- neighbor_df %>%
172 | filter(province %in% province_i) %>%
173 | ggplot() +
174 | geom_vline(data = . %>% filter(tower_down), aes(xintercept = date),
175 | color = "gray50", size=2, alpha = 0.2) +
176 | geom_line(aes(x=date, y=value_raw), size=1.5, color="black") +
177 | geom_line(aes(x=date, y=value_n_avg), size=1.5, color="red") +
178 | geom_line(aes(x=date, y=value_raw_n,
179 | group=n_id %>% as.factor(),
180 | color=n_id %>% as.factor()),
181 | size=.4) +
182 | theme_minimal() +
183 | theme(legend.position = "none") +
184 | facet_wrap(~region,
185 | scales = "free_y")
186 |
187 | ggsave(p, filename = file.path(FIG_PATH, paste0(province_i, ".png")), height = 25, width = 25)
188 |
189 | return(NULL)
190 | })
191 |
192 | # Bad Cases -------------------------------------------------------------
193 | for(percent in c(50, 75, 100)){
194 |
195 | print(percent)
196 |
197 | neighbor_df_bad <- neighbor_df %>%
198 | mutate(keep = (tower_down %in% TRUE) & (region_n_value_pc > percent/100)) %>%
199 | group_by(region) %>%
200 | mutate(keep_any = (TRUE %in% keep)) %>%
201 | ungroup() %>%
202 | filter(keep_any %in% TRUE) %>%
203 | filter(region_n_value_pc_max > percent/100)
204 |
205 | p_bad <- neighbor_df_bad %>%
206 | ggplot() +
207 | geom_vline(data = . %>% filter(tower_down), aes(xintercept = date),
208 | color = "gray50", size=2, alpha = 0.2) +
209 | geom_line(aes(x=date, y=value_raw), size=1.75, color="black") +
210 | geom_line(aes(x=date, y=value_raw_n,
211 | group=n_id %>% as.factor(),
212 | color=n_id %>% as.factor()),
213 | size=1) +
214 |
215 | #geom_line(aes(x=date, y=value_n_avg), size=1.5, color="red") +
216 | theme_minimal() +
217 | theme(legend.position = "none") +
218 | facet_wrap(~region,
219 | scales = "free_y")
220 |
221 | ggsave(p_bad, filename = file.path(FIG_PATH_OUTLIER, paste0(percent, "percent_thresh.png")), height = 25, width = 25)
222 | }
223 |
224 |
225 |
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/modules/tower_clustering.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | import geopandas as gpd
3 | import numpy as np
4 | import pandas as pd
5 | from shapely.geometry import Polygon, LineString
6 | from sklearn.neighbors import DistanceMetric
7 | from scipy.spatial.distance import squareform
8 | from scipy.cluster.hierarchy import linkage
9 | from scipy.cluster.hierarchy import fcluster
10 | from copy import deepcopy
11 | import os
12 | if os.environ['HOME'] != '/root':
13 | from modules.utilities import *
14 | databricks = False
15 | else:
16 | databricks = True
17 |
18 |
19 | ## Class to cluster towers and map clusters to admin regions
20 | class tower_clusterer:
21 | """Class to cluster towers together.
22 |
23 |
24 | Attributes
25 | ----------
26 | datasource : an instance of DataSource class.
27 | shape : a geopandas dataframe. Shapefile to use for clustering
28 | region_var : a string. Name of the region variable in the shapefile.
29 | sites : a string. Name of the attribute of datasource that holds the tower coordinates.
30 | shape_df : a pyspark dataframe. Shapefile to use for clustering, in pyspark df.
31 | spark : an initialised spark connection
32 | spark_df : a pyspark dataframe. Holds the cdr data
33 | result_path : a string. Where to save results.
34 | filename : a string. Name for result file.
35 | dist : a string. Metric to use to calculate distances.
36 |     sites : a pandas dataframe. cell_id, LAT, LNG for all tower sites
37 |     sites_with_clusters : a pandas dataframe. Clustered sites (once methods have run)
38 |
39 |
40 |
41 | Methods
42 | -------
43 | cluster_towers()
44 | runs clustering algorithm
45 |
46 | get_centroids()
47 | computes centroids of clusters
48 |
49 | map_to_regions()
50 | maps cluster centroids to admin regions
51 |
52 | save_results()
53 | saves the results to csv
54 |
55 | """
56 |
57 | def __init__(self,
58 | datasource,
59 | shape,
60 | region_var,
61 | sites = 'tower_sites'):
62 | """
63 | Parameters
64 | ----------
65 | datasource : an instance of DataSource class.
66 | shape : a geopandas dataframe. Shapefile to use for clustering
67 | region_var : a string. Name of the region variable in the shapefile.
68 | sites : a string. Name of the attribute of datasource that holds the tower coordinates.
69 | """
70 | self.datasource = datasource
71 | self.spark = datasource.spark
72 | self.shape = getattr(datasource, shape + '_gpd')
73 | self.shape_df = getattr(datasource, shape)
74 | self.result_path = datasource.results_path
75 | self.filename = shape
76 | self.region_var = region_var
77 | self.dist = DistanceMetric.get_metric('haversine')
78 | sites_df = getattr(datasource, sites + '_pd')
79 | if (sites_df.columns == ['cell_id', 'LAT', 'LNG']).all():
80 | self.sites = sites_df[sites_df.LAT.notna()]
81 | self.sites_with_clusters = self.sites
82 | else:
83 |             raise ValueError('The sites dataframe does not have the correct '
84 |                              'columns / column order. Should be cell_id, LAT, LNG')
85 |
86 | def cluster_towers(self):
87 | ## deepcopy sites since we will need it later on
88 | self.radians = deepcopy(self.sites)
89 | # convert degrees to radians
90 | self.radians['LAT'] = np.radians(self.sites['LAT'])
91 | self.radians['LNG'] = np.radians(self.sites['LNG'])
92 | # run clustering algorithm
93 | self.clusters = fcluster(
94 | linkage(
95 | squareform(
96 | self.dist.pairwise(self.radians[['LAT','LNG']]\
97 | .to_numpy())*6373), method='ward'), t = 1, criterion = 'distance')
98 | self.sites_with_clusters = self.radians
99 | self.sites_with_clusters['cluster'] = self.clusters
100 | # compute centroids of clusters
101 | self.get_centroids()
102 | self.sites_with_clusters['LAT'] = np.rad2deg(self.sites_with_clusters['LAT'])
103 | self.sites_with_clusters['LNG'] = np.rad2deg(self.sites_with_clusters['LNG'])
104 | self.sites_with_clusters['centroid_LAT'] = \
105 | np.rad2deg(self.sites_with_clusters['centroid_LAT'])
106 | self.sites_with_clusters['centroid_LNG'] = \
107 | np.rad2deg(self.sites_with_clusters['centroid_LNG'])
108 | # put clusters in geodataframe
109 | self.sites_gpd = gpd.GeoDataFrame(self.sites_with_clusters,
110 | geometry=gpd.points_from_xy(
111 | self.sites_with_clusters.centroid_LNG,
112 | self.sites_with_clusters.centroid_LAT),
113 | crs = 'epsg:4326')
114 |         # compute distances between clusters
115 | self.distances_pd = pd.DataFrame(
116 | self.dist.pairwise(
117 | np.radians(
118 | self.sites_with_clusters[['centroid_LAT','centroid_LNG']])\
119 | .to_numpy())*6373, columns=self.sites_with_clusters.cell_id.unique(),
120 | index=self.sites_with_clusters.cell_id.unique())
121 | # create long form of distance matrix
122 | distances = []
123 | origin = []
124 | destination = []
125 | for a in self.distances_pd.index:
126 | for b in self.distances_pd.index:
127 | distances.append(self.distances_pd.loc[a,b])
128 | origin.append(a)
129 | destination.append(b)
130 | self.distances_pd_long = pd.DataFrame(list(zip(distances, origin, destination)),
131 | columns =['distance', 'origin', 'destination'])
132 | # map clusters to regions
133 | self.map_to_regions()
134 | return self.save_results()
135 |
136 | def get_centroids(self):
137 | # loop through clusters to compute centroids
138 | for cluster_num in self.sites_with_clusters.cluster.unique():
139 | subset = self.sites_with_clusters[self.sites_with_clusters.cluster == cluster_num]
140 | # use line method if we have only two towers in cluster
141 | if len(subset) == 2:
142 | line = LineString(subset.loc[:,['LNG', 'LAT']].to_numpy())
143 | self.sites_with_clusters.loc[self.sites_with_clusters.cluster == \
144 | cluster_num, 'centroid_LNG'] = line.interpolate(0.5, normalized = True).x
145 | self.sites_with_clusters.loc[self.sites_with_clusters.cluster == \
146 | cluster_num, 'centroid_LAT'] = line.interpolate(0.5, normalized = True).y
147 | # use polygon method if we have more than two towers in cluster
148 | if len(subset) > 2:
149 | self.sites_with_clusters.loc[self.sites_with_clusters.cluster == \
150 | cluster_num, 'centroid_LNG'] = \
151 | Polygon(subset.loc[:,['LNG', 'LAT']].to_numpy()).convex_hull.centroid.x
152 | self.sites_with_clusters.loc[self.sites_with_clusters.cluster == \
153 | cluster_num, 'centroid_LAT'] = \
154 | Polygon(subset.loc[:,['LNG', 'LAT']].to_numpy()).convex_hull.centroid.y
155 | # replace NAs
156 | self.sites_with_clusters.loc[self.sites_with_clusters.centroid_LAT.isna(),
157 | 'centroid_LNG'] = \
158 | self.sites_with_clusters.loc[self.sites_with_clusters.centroid_LAT.isna(), 'LNG']
159 | self.sites_with_clusters.loc[self.sites_with_clusters.centroid_LAT.isna(),
160 | 'centroid_LAT'] = \
161 | self.sites_with_clusters.loc[self.sites_with_clusters.centroid_LAT.isna(), 'LAT']
162 |
163 | def map_to_regions(self):
164 |         # spatial join clusters with shapefile
165 | self.joined = gpd.sjoin(self.sites_gpd, self.shape, op="intersects")
166 |
167 | def save_results(self):
168 | # save results of mapping of clusters to regions
169 | self.joined = self.joined.rename(columns={self.region_var:'region'})
170 | self.towers_regions_clusters_all_vars = \
171 | self.joined.loc[:,['cell_id', 'LAT', 'LNG', 'centroid_LAT',
172 | 'centroid_LNG', 'region', 'cluster']]
173 | self.towers_regions_clusters_all_vars = \
174 | self.spark.createDataFrame(self.towers_regions_clusters_all_vars)
175 | save_csv(self.towers_regions_clusters_all_vars,
176 | self.result_path,
177 | self.datasource.country_code + '_' + self.filename + '_tower_map_all_vars')
178 | # save results with only essential variables, for use in data processing
179 | self.towers_regions_clusters = \
180 | self.joined.loc[:,['cell_id', 'region']]
181 | self.towers_regions_clusters = \
182 | self.spark.createDataFrame(self.towers_regions_clusters)
183 | save_csv(self.towers_regions_clusters,
184 | self.result_path,
185 | self.datasource.country_code + '_' + self.filename + '_tower_map')
186 | # save distance matrix in long form
187 | self.distances_df_long = \
188 | self.spark.createDataFrame(self.distances_pd_long)
189 | save_csv(self.distances_df_long,
190 | self.result_path, self.datasource.country_code + '_distances_pd_long')
191 | # save shapefile used, for dashboarding
192 | save_csv(self.shape_df, self.result_path,
193 | self.datasource.country_code + '_' + self.filename + '_shapefile')
194 | return self.towers_regions_clusters, self.distances_df_long
195 |
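196 | # Illustrative sketch, commented out, of the clustering step used in cluster_towers():
197 | # pairwise haversine distances converted to km (radians * ~6373 km Earth radius), Ward
198 | # linkage, and a flat cut at t = 1 km. The toy coordinates below are placeholders and
199 | # not part of the pipeline.
200 | #
201 | # import numpy as np
202 | # import pandas as pd
203 | # from sklearn.neighbors import DistanceMetric
204 | # from scipy.spatial.distance import squareform
205 | # from scipy.cluster.hierarchy import linkage, fcluster
206 | #
207 | # towers = pd.DataFrame({'LAT': [-17.820, -17.821, -20.150], 'LNG': [31.050, 31.051, 28.580]})
208 | # dist = DistanceMetric.get_metric('haversine')
209 | # km = dist.pairwise(np.radians(towers[['LAT', 'LNG']]).to_numpy()) * 6373
210 | # clusters = fcluster(linkage(squareform(km), method = 'ward'), t = 1, criterion = 'distance')
211 | # # the two towers ~150 m apart share a cluster; the distant third tower gets its own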
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/aggregation_master.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | # # Production of indicators for the COVID19 Mobility Task Force
5 | #
6 | # In this notebook we produce indicators for the [COVID19 Mobility Task Force](https://github.com/worldbank/covid-mobile-data).
7 | #
8 | # [Flowminder](https://covid19.flowminder.org) indicators are produced to increase the availability of comparable datasets across countries, and have been copied without modification from the [Flowminder COVID-19 github repository](https://github.com/Flowminder/COVID-19), except for the start and end dates. These have been supplemented by a set of *priority* indicators whose output is ingested into the dashboard in this repository.
9 | #
10 | # In this notebook we produce indicators in the following steps:
11 | #
12 | # - **Import code**: The code for the aggregation is included in the aggregation modules (such as `aggregator` and `flowminder_aggregator`) under `notebooks/modules`
13 | # - **Import data**:
14 | # To set up the data import we need to place the CDR data files into the `data/new/CC/telco/` folder, where we replace `CC` with the country code and `telco` with the company abbreviation.
15 | # We also need to place csv files with the tower-region mapping and distance matrices into the `data/support-data/CC/telco/geofiles` folder, and then modify the `data/support_data/config_file.py` to specify:
16 | # - *geofiles*: the names of the geofiles,
17 | # - *country_code*: country code and company abbreviation,
18 | # - *telecom_alias*: the path to the `data` folder,
19 | #     - *data_paths*: the names of the subfolders in `data/new/CC/telco/` that hold the csv files. Simply change this to `[*]` if you didn't create subfolders and want to load all files.
20 | # - *dates*: set the start and end date of the data you want to produce the indicators for.
21 | #
22 | # For more information about the `config_file.py` settings, see the [github page](https://github.com/worldbank/covid-mobile-data/tree/master/cdr-aggregation); a minimal sketch of these settings is included at the end of this script.
23 | #
24 | # - **Run aggregations**: By default, we produce all flowminder and priority indicators. We've included 4 re-tries in case of failure, which we have found helpful on databricks but which is probably irrelevant in other settings. Note that before you can re-run these aggregations, you need to move the csv outputs saved in `data/results/CC/telco/` by previous runs to another folder, else these indicators will be skipped. This prevents you from accidentally overwriting previous results. It also means you can delete the files only for the indicators you want to re-produce, and skip any indicators you don't want to re-produce.
25 | #
26 | # The outcome of this effort will be used to inform policy making using a [mobility indicator dashboard](https://github.com/worldbank/covid-mobile-data/tree/master/dashboard-dataviz).
27 |
28 | # # Import code
29 |
30 | # In[1]:
31 |
32 |
33 | get_ipython().run_line_magic('load_ext', 'autoreload')
34 | get_ipython().run_line_magic('autoreload', '2')
35 |
36 |
37 | # In[2]:
38 |
39 |
40 | from modules.DataSource import *
41 |
42 |
43 | # In[3]:
44 |
45 |
46 | config_file = '../config_file.py'
47 |
48 |
49 | # In[4]:
50 |
51 |
52 | exec(open(config_file).read())
53 |
54 |
55 | # In[5]:
56 |
57 |
58 | ds = DataSource(datasource_configs)
59 | ds.show_config()
60 |
61 |
62 | # In[6]:
63 |
64 |
65 | from modules.setup import *
66 |
67 |
68 | # # Import data
69 |
70 | # ## Load CDR data
71 |
72 | # ### Process/standardize raw data, save as parquet, and then load it
73 |
74 | # In[7]:
75 |
76 |
77 | # ds.standardize_csv_files(show=True)
78 | # ds.save_as_parquet()
79 |
80 |
81 | # In[8]:
82 |
83 |
84 | #ds.load_standardized_parquet_file()
85 |
86 |
87 | # ### Alternatively, specify and load hive table
88 |
89 | # In[9]:
90 |
91 |
92 | # # Specify and load hive data
93 | # ds.parquet_df = ds.spark.sql("""SELECT {} AS msisdn,
94 | # {} AS call_datetime,
95 | # {} AS location_id FROM {}""".format(ds.hive_vars['msisdn'],
96 | # ds.hive_vars['call_datetime'],
97 | # ds.hive_vars['location_id'],
98 | # ds.hive_vars['calls']))
99 |
100 |
101 | # ### Or load a sample file
102 |
103 | # In[10]:
104 |
105 |
106 | ## Use this in case you want to sample the data and run the code on the sample
107 |
108 | # #ds.sample_and_save(number_of_ids=1000)
109 | ds.load_sample('sample_feb_mar2020')
110 | ds.parquet_df = ds.sample_df
111 |
112 |
113 | # ## Load geo data
114 |
115 | # In[11]:
116 |
117 |
118 | ds.load_geo_csvs()
119 |
120 |
121 | # In[12]:
122 |
123 |
124 | ## Use this in case you want to cluster the towers and create a distance matrix
125 |
126 | # ds.create_gpds()
127 | # from modules.tower_clustering import *
128 | # clusterer = tower_clusterer(ds, 'admin2', 'ID_2')
129 | # ds.admin2_tower_map, ds.distances = clusterer.cluster_towers()
130 | # clusterer = tower_clusterer(ds, 'admin3', 'ADM3_PCODE')
131 | # ds.admin3_tower_map, ds.distances = clusterer.cluster_towers()
132 |
133 |
134 | # In[13]:
135 |
136 |
137 | ## Use this in case you want to create a voronoi tesselation
138 |
139 | # from modules.voronoi import *
140 | # voronoi = voronoi_maker(ds, 'admin3', 'ADM3_PCODE')
141 | # ds.voronoi = voronoi.make_voronoi()
142 |
143 |
144 | # # Run aggregations
145 |
146 | # ## Flowminder indicators for admin2
147 |
148 | # In[14]:
149 |
150 |
151 | agg_flowminder_admin2 = flowminder_aggregator(result_stub = '/admin2/flowminder',
152 | datasource = ds,
153 | regions = 'admin2_tower_map')
154 |
155 | agg_flowminder_admin2.attempt_aggregation()
156 |
157 |
158 | # ## Flowminder indicators for admin3
159 |
160 | # In[15]:
161 |
162 |
163 | agg_flowminder_admin3 = flowminder_aggregator(result_stub = '/admin3/flowminder',
164 | datasource = ds,
165 | regions = 'admin3_tower_map')
166 |
167 | agg_flowminder_admin3.attempt_aggregation()
168 |
169 |
170 | # ## Priority indicators for admin2
171 |
172 | # In[16]:
173 |
174 |
175 | agg_priority_admin2 = priority_aggregator(result_stub = '/admin2/priority',
176 | datasource = ds,
177 | regions = 'admin2_tower_map')
178 |
179 | agg_priority_admin2.attempt_aggregation(indicators_to_produce = {'unique_subscribers_per_day' : ['unique_subscribers', 'day'],
180 | 'percent_of_all_subscribers_active_per_day' : ['percent_of_all_subscribers_active', 'day'],
181 | 'origin_destination_connection_matrix_per_day' : ['origin_destination_connection_matrix', 'day'],
182 | 'mean_distance_per_day' : ['mean_distance', 'day'],
183 | 'mean_distance_per_week' : ['mean_distance', 'week'],
184 | 'origin_destination_matrix_time_per_day' : ['origin_destination_matrix_time', 'day'],
185 | 'home_vs_day_location_per_day' : ['home_vs_day_location_per_day', ['day','week']],
186 | 'home_vs_day_location_per_day' : ['home_vs_day_location_per_day', ['day','month']]})
187 |
188 |
189 | # ## Priority indicators for admin3
190 |
191 | # In[17]:
192 |
193 |
194 | agg_priority_admin3 = priority_aggregator(result_stub = '/admin3/priority',
195 | datasource = ds,
196 | regions = 'admin3_tower_map')
197 |
198 | agg_priority_admin3.attempt_aggregation(
199 |     indicators_to_produce = {'transactions_per_hour' : ['transactions', 'hour']})
200 |
201 |
202 | # ## Scaled priority indicators for admin2
203 |
204 | # In[ ]:
205 |
206 |
207 | agg_scaled_admin2 = scaled_aggregator(result_stub = '/admin2/scaled',
208 | datasource = ds,
209 | regions = 'admin2_tower_map')
210 |
211 | agg_scaled_admin2.attempt_aggregation()
212 |
213 |
214 | # ## Priority indicators for tower-cluster
215 |
216 | # In[ ]:
217 |
218 |
219 | agg_priority_tower = priority_aggregator(result_stub = '/voronoi/priority',
220 | datasource = ds,
221 | regions = 'voronoi_tower_map')
222 |
223 | agg_priority_tower.attempt_aggregation(indicators_to_produce = {'unique_subscribers_per_hour' : ['unique_subscribers', 'hour'],
224 | 'mean_distance_per_day' : ['mean_distance', 'day'],
225 | 'mean_distance_per_week' : ['mean_distance', 'week']})
226 |
227 |
228 | # In[ ]:
229 |
230 |
231 | agg_priority_tower_harare = priority_aggregator(result_stub = '/voronoi/priority/harare',
232 | datasource = ds,
233 | regions = 'voronoi_tower_map_harare')
234 |
235 | agg_priority_tower_harare.attempt_aggregation(indicators_to_produce = {'origin_destination_connection_matrix_per_day' : ['origin_destination_connection_matrix', 'day']})
236 |
237 |
238 | # In[ ]:
239 |
240 |
241 | agg_priority_tower_bulawayo = priority_aggregator(result_stub = '/voronoi/priority/bulawayo',
242 | datasource = ds,
243 | regions = 'voronoi_tower_map_bulawayo')
244 |
245 | agg_priority_tower_bulawayo.attempt_aggregation(indicators_to_produce = {'origin_destination_connection_matrix_per_day' : ['origin_destination_connection_matrix', 'day']})
246 |
247 |
248 | # # Produce script
249 |
250 | # In[ ]:
251 |
252 |
253 | get_ipython().system('jupyter nbconvert --to script *.ipynb')
254 |
255 |
256 | # In[ ]:
257 |
258 |
259 |
260 |
261 |
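262 | # A minimal sketch of the `config_file.py` settings described at the top of this notebook.
263 | # The authoritative key names live in `config_file_template.py` (and the hive variant) at
264 | # the root of `cdr-aggregation`; the keys and values below are illustrative placeholders.
265 | 
266 | # datasource_configs = {
267 | #     'country_code': 'CC',               # country code and company abbreviation
268 | #     'telecom_alias': '/path/to/data',   # path to the `data` folder
269 | #     'data_paths': ['*'],                # subfolders of data/new/CC/telco/ to load
270 | #     'geofiles': {},                     # names of the tower-region mapping and distance csvs
271 | #     'dates': {},                        # start and end dates to produce indicators for
272 | # }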
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/modules/sql_code_aggregates.py:
--------------------------------------------------------------------------------
1 | # Databricks notebook source
2 | def write_sql_code(calls = 'calls',
3 | start_date = "\'2020-02-01\'",
4 | end_date = "\'2020-03-31\'",
5 | start_date_weeks = "\'2020-02-03\'",
6 | end_date_weeks = "\'2020-03-29\'"):
7 |
8 | sql_code = {
9 | # Aggregate 1 (April 1 version)
10 | 'count_unique_subscribers_per_region_per_day' :
11 | """
12 | SELECT * FROM (
13 | SELECT calls.call_date AS visit_date,
14 | cells.region AS region,
15 | count(DISTINCT msisdn) AS subscriber_count
16 | FROM calls
17 | INNER JOIN cells
18 | ON calls.location_id = cells.cell_id
19 | WHERE calls.call_date >= {}
20 | AND calls.call_date <= CURRENT_DATE
21 | GROUP BY 1, 2
22 | ) AS grouped
23 | WHERE grouped.subscriber_count >= 15
24 | """.format(start_date),
25 |
26 | # Intermediate Result - Home location
27 | 'home_locations' :
28 | """
29 | SELECT msisdn, region FROM (
30 | SELECT
31 | msisdn,
32 | region,
33 | row_number() OVER (
34 | PARTITION BY msisdn
35 | ORDER BY total DESC, latest_date DESC
36 | ) AS daily_location_rank
37 | FROM (
38 |
39 | SELECT msisdn,
40 | region,
41 | count(*) AS total,
42 | max(call_date) AS latest_date
43 | FROM (
44 | SELECT calls.msisdn,
45 | cells.region,
46 | calls.call_date,
47 | row_number() OVER (
48 | PARTITION BY calls.msisdn, calls.call_date
49 | ORDER BY calls.call_datetime DESC
50 | ) AS event_rank
51 | FROM calls
52 | INNER JOIN cells
53 | ON calls.location_id = cells.cell_id
54 | WHERE calls.call_date >= {}
55 | AND calls.call_date <= {}
56 |
57 | ) ranked_events
58 |
59 | WHERE event_rank = 1
60 | GROUP BY 1, 2
61 |
62 | ) times_visited
63 | ) ranked_locations
64 | WHERE daily_location_rank = 1
65 | """.format(start_date, end_date),
66 |
67 | # Aggregate 2 (April 1 version)
68 | 'count_unique_active_residents_per_region_per_day' :
69 | """
70 | SELECT * FROM (
71 | SELECT calls.call_date AS visit_date,
72 | cells.region AS region,
73 | count(DISTINCT calls.msisdn) AS subscriber_count
74 | FROM calls
75 | INNER JOIN cells
76 | ON calls.location_id = cells.cell_id
77 | INNER JOIN home_locations homes -- See intermediate_queries.sql for code to create the home_locations table
78 | ON calls.msisdn = homes.msisdn
79 | AND cells.region = homes.region
80 | GROUP BY 1, 2
81 | ) AS grouped
82 | WHERE grouped.subscriber_count >= 15""",
83 |
84 | 'count_unique_visitors_per_region_per_day' :
85 | """
86 | SELECT * FROM (
87 | SELECT all_visits.visit_date,
88 | all_visits.region,
89 | all_visits.subscriber_count - coalesce(home_visits.subscriber_count, 0) AS subscriber_count
90 | FROM count_unique_subscribers_per_region_per_day all_visits
91 | LEFT JOIN count_unique_active_residents_per_region_per_day home_visits
92 | ON all_visits.visit_date = home_visits.visit_date
93 | AND all_visits.region = home_visits.region
94 | ) AS visitors
95 | WHERE visitors.subscriber_count >= 15""",
96 |
97 | # Aggregate 3 (April 1 version)
98 | 'count_unique_subscribers_per_region_per_week' :
99 | """
100 | SELECT * FROM (
101 | SELECT extract(WEEK FROM calls.call_date) AS visit_week,
102 | cells.region AS region,
103 | count(DISTINCT calls.msisdn) AS subscriber_count
104 | FROM calls
105 | INNER JOIN cells
106 | ON calls.location_id = cells.cell_id
107 | WHERE calls.call_date >= {}
108 | AND calls.call_date <= {}
109 | GROUP BY 1, 2
110 | ) AS grouped
111 | WHERE grouped.subscriber_count >= 15
112 | """.format(start_date_weeks, end_date_weeks),
113 |
114 | # Aggregate 4 (April 1 version)
115 | 'count_unique_active_residents_per_region_per_week' :
116 | """
117 | SELECT * FROM (
118 | SELECT extract(WEEK FROM calls.call_date) AS visit_week,
119 | cells.region AS region,
120 | count(DISTINCT calls.msisdn) AS subscriber_count
121 | FROM calls
122 | INNER JOIN cells
123 | ON calls.location_id = cells.cell_id
124 | INNER JOIN home_locations homes -- See intermediate_queries.sql for code to create the home_locations table
125 | ON calls.msisdn = homes.msisdn
126 | AND cells.region = homes.region
127 | WHERE calls.call_date >= {}
128 | AND calls.call_date <= {}
129 | GROUP BY 1, 2
130 | ) AS grouped
131 | WHERE grouped.subscriber_count >= 15
132 | """.format(start_date_weeks, end_date_weeks),
133 |
134 | 'count_unique_visitors_per_region_per_week' :
135 | """
136 | SELECT * FROM (
137 | SELECT all_visits.visit_week,
138 | all_visits.region,
139 | all_visits.subscriber_count - coalesce(home_visits.subscriber_count, 0) AS subscriber_count
140 | FROM count_unique_subscribers_per_region_per_week all_visits
141 | LEFT JOIN count_unique_active_residents_per_region_per_week home_visits
142 | ON all_visits.visit_week = home_visits.visit_week
143 | AND all_visits.region = home_visits.region
144 | ) AS visitors
145 | WHERE visitors.subscriber_count >= 15""",
146 |
147 | # Aggregate 5 (April 1 version)
148 | 'regional_pair_connections_per_day' :
149 | """
150 | SELECT * FROM (
151 | SELECT connection_date,
152 | region1,
153 | region2,
154 | count(*) AS subscriber_count
155 | FROM (
156 |
157 | SELECT t1.call_date AS connection_date,
158 | t1.msisdn AS msisdn,
159 | t1.region AS region1,
160 | t2.region AS region2
161 | FROM (
162 | SELECT DISTINCT calls.msisdn,
163 | calls.call_date,
164 | cells.region
165 | FROM calls
166 | INNER JOIN cells
167 | ON calls.location_id = cells.cell_id
168 | WHERE calls.call_date >= {}
169 | AND calls.call_date <= CURRENT_DATE
170 | ) t1
171 |
172 | FULL OUTER JOIN
173 |
174 | (
175 | SELECT DISTINCT calls.msisdn,
176 | calls.call_date,
177 | cells.region
178 | FROM calls
179 | INNER JOIN cells
180 | ON calls.location_id = cells.cell_id
181 | WHERE calls.call_date >= {}
182 | AND calls.call_date <= CURRENT_DATE
183 | ) t2
184 |
185 | ON t1.msisdn = t2.msisdn
186 | AND t1.call_date = t2.call_date
187 | WHERE t1.region < t2.region
188 |
189 | ) AS pair_connections
190 | GROUP BY 1, 2, 3
191 | ) AS grouped
192 | WHERE grouped.subscriber_count >= 15
193 | """.format(start_date, start_date),
194 |
195 | # Aggregate 6 (April 2 version)
196 | 'directed_regional_pair_connections_per_day' :
197 | """
198 | WITH subscriber_locations AS (
199 | SELECT calls.msisdn,
200 | calls.call_date,
201 | cells.region,
202 | min(calls.call_datetime) AS earliest_visit,
203 | max(calls.call_datetime) AS latest_visit
204 | FROM calls
205 | INNER JOIN cells
206 | ON calls.location_id = cells.cell_id
207 | WHERE calls.call_date >= {}
208 | AND calls.call_date <= CURRENT_DATE
209 | GROUP BY msisdn, call_date, region
210 | )
211 | SELECT * FROM (
212 | SELECT connection_date,
213 | region_from,
214 | region_to,
215 | count(*) AS subscriber_count
216 | FROM (
217 |
218 | SELECT t1.call_date AS connection_date,
219 | t1.msisdn AS msisdn,
220 | t1.region AS region_from,
221 | t2.region AS region_to
222 | FROM subscriber_locations t1
223 | FULL OUTER JOIN subscriber_locations t2
224 | ON t1.msisdn = t2.msisdn
225 | AND t1.call_date = t2.call_date
226 | WHERE t1.region <> t2.region
227 | AND t1.earliest_visit < t2.latest_visit
228 |
229 | ) AS pair_connections
230 | GROUP BY 1, 2, 3
231 | ) AS grouped
232 | WHERE grouped.subscriber_count >= 15
233 | """.format(start_date),
234 |
235 | # Aggregate 7 (April 3 version)
236 | 'total_calls_per_region_per_day' :
237 | """
238 | SELECT
239 | call_date,
240 | region,
241 | total_calls
242 | FROM (
243 | SELECT calls.call_date AS call_date,
244 | cells.region AS region,
245 | count(DISTINCT msisdn) AS subscriber_count,
246 | count(*) AS total_calls
247 | FROM calls
248 | INNER JOIN cells
249 | ON calls.location_id = cells.cell_id
250 | WHERE calls.call_date >= {}
251 | AND calls.call_date <= CURRENT_DATE
252 | GROUP BY 1, 2
253 | ) AS grouped
254 | WHERE grouped.subscriber_count >= 15
255 | """.format(start_date),
256 |
257 | # Aggregate 8 (April 3 version)
258 | 'home_location_counts_per_region' :
259 | """
260 | SELECT * FROM (
261 | SELECT region, count(msisdn) AS subscriber_count
262 | FROM home_locations -- See intermediate_queries.sql for code to create the home_locations table
263 | GROUP BY region
264 | ) AS home_counts
265 | WHERE home_counts.subscriber_count >= 15"""}
266 | return sql_code
267 |
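268 | # Usage sketch, commented out: roughly how aggregator.py consumes this dictionary. It
269 | # assumes `spark` is an initialised session on which the 'calls' and 'cells' temp views
270 | # have already been registered (aggregator.__init__ does this); `spark` is an assumed name.
271 | #
272 | # sql_code = write_sql_code(start_date = "'2020-02-01'", end_date = "'2020-03-31'",
273 | #                           start_date_weeks = "'2020-02-03'", end_date_weeks = "'2020-03-29'")
274 | # home_locations = spark.sql(sql_code['home_locations'])
275 | # home_locations.createOrReplaceTempView('home_locations')  # later aggregates join on this view
276 | # daily_subs = spark.sql(sql_code['count_unique_subscribers_per_region_per_day'])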
--------------------------------------------------------------------------------