├── dashboard-dataviz ├── README.md ├── dashboard │ ├── text_inputs │ │ └── README.md │ ├── data_inputs_for_dashboard │ │ └── README.md │ ├── preparing_data_for_dashboard │ │ ├── README.md │ │ ├── 01_clean_spatial_data │ │ │ ├── README.md │ │ │ ├── clean_adm3_file.R │ │ │ └── clean_adm2_file.R │ │ ├── 03_dashboard_data_prep │ │ │ ├── prep_subs_obs_totals_data.R │ │ │ ├── data_to_github.R │ │ │ └── README.md │ │ ├── 02_clean_telecom_data │ │ │ ├── clean_i3_subscribers_data.R │ │ │ ├── clean_i5_net_movement_data.R │ │ │ ├── clean_i5_movement_inout_data.R │ │ │ ├── clean_i7_distance_traveled.R │ │ │ └── README.md │ │ └── _dash_master.R │ ├── functions.R │ ├── styles.css │ └── README.md └── figures │ ├── _master_figures.R │ ├── i3_figures.R │ ├── i5_net_figures.R │ └── i5_into_out.R ├── data-checks ├── Archive │ ├── patch_cleaning.py │ ├── Descr-exploratory │ │ ├── draf.py │ │ ├── i5-plot.py │ │ └── fb-comparisson-draft.py │ ├── globals.py │ ├── quick_checks │ │ ├── check_subscribers.R │ │ └── ward_neighbors_tower_down.R │ ├── usage_outliers.py │ ├── i10-check.py │ ├── MASTER.py │ ├── 02_summary_stats.py │ ├── 03_i_specific_checks_i1_admin2.py │ ├── data_files_comparisson.py │ ├── 01_completenes_checks.py │ └── od_scaling.py └── README.md ├── cdr-aggregation ├── docker-compose.yml ├── notebooks │ ├── modules │ │ ├── setup.py │ │ ├── README.md │ │ ├── folder_utils.py │ │ ├── import_packages.py │ │ ├── utilities.py │ │ ├── flowminder_aggregator.py │ │ ├── voronoi.py │ │ ├── outliers.py │ │ ├── aggregator.py │ │ ├── tower_clustering.py │ │ └── sql_code_aggregates.py │ ├── folder_setup.py │ ├── README.md │ ├── folder_setup.ipynb │ └── aggregation_master.py ├── docker │ └── Dockerfile ├── config_file_template.py └── config_file_template_hive.py ├── data-panel ├── Archive │ ├── _master.py │ ├── usage_outliers.py │ ├── 02_clean.py │ ├── panel_draft2.py │ └── panel_draft.py ├── 01_construct.py └── utils.py └── .gitignore /dashboard-dataviz/README.md: -------------------------------------------------------------------------------- 1 | # Dashboard and Figures -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/text_inputs/README.md: -------------------------------------------------------------------------------- 1 | # Text Inputs -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/data_inputs_for_dashboard/README.md: -------------------------------------------------------------------------------- 1 | # Data Inputs for Dashboard -------------------------------------------------------------------------------- /data-checks/Archive/patch_cleaning.py: -------------------------------------------------------------------------------- 1 | 2 | # Cleaning 3 | 4 | fi = pd.read_csv(ICUST_adm3_path + file_name) 5 | -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/preparing_data_for_dashboard/README.md: -------------------------------------------------------------------------------- 1 | # Files for data visualization and dashboards 2 | -------------------------------------------------------------------------------- /cdr-aggregation/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | 4 | jupyter: 5 | build: 6 | context: . 
7 | dockerfile: ./docker/Dockerfile 8 | image: sebxwolf/cdr_aggregation_pyspark:v1 9 | container_name: cdr_aggregation 10 | ports: 11 | - "8888:8888" 12 | - "4040:4040" 13 | volumes: 14 | - ./:/home/jovyan/work 15 | -------------------------------------------------------------------------------- /data-panel/Archive/_master.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # PANEl MASTER 3 | #-----------------------------------------------------------------# 4 | 5 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/" 6 | DATA_POC = DATA_path + "proof-of-concept/" 7 | DATA_panel = DATA_POC + "panel_indicators/" 8 | DATA_panel_raw = DATA_panel + 'raw/' 9 | DATA_panel_clean = DATA_panel + 'clean/' -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/setup.py: -------------------------------------------------------------------------------- 1 | # all the modules we need 2 | 3 | from modules.import_packages import * 4 | from modules.DataSource import * 5 | from modules.utilities import * 6 | from modules.aggregator import * 7 | from modules.flowminder_aggregator import * 8 | from modules.priority_aggregator import * 9 | from modules.custom_aggregator import * 10 | from modules.scaled_aggregator import * 11 | from modules.sql_code_aggregates import * 12 | from modules.folder_utils import * 13 | -------------------------------------------------------------------------------- /cdr-aggregation/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jupyter/pyspark-notebook:dc9744740e12 2 | 3 | RUN python --version 4 | 5 | RUN conda install --quiet --yes -c \ 6 | conda-forge jupyter_contrib_nbextensions jupyter_nbextensions_configurator \ 7 | geopandas folium descartes 8 | 9 | RUN pip install -U folium \ 10 | geovoronoi \ 11 | geopy 12 | 13 | RUN jupyter labextension install @jupyterlab/toc 14 | 15 | VOLUME /home/jovyan/work 16 | WORKDIR /home/jovyan/work 17 | -------------------------------------------------------------------------------- /data-checks/Archive/Descr-exploratory/draf.py: -------------------------------------------------------------------------------- 1 | # Indicator 1 panel data 2 | i1 = pd.read_csv( OUT_hfcs + 'Sheet comp panel/i1_admin3.csv') 3 | i1 = i1[i1.region != '99999'] 4 | 5 | i3 = pd.read_csv( OUT_hfcs + 'Sheet comp panel/i3_admin3.csv') 6 | i3 = i3[i3.region != '99999'] 7 | 8 | i1['date'] = pd.to_datetime(i1['hour']).dt.date 9 | i3['date'] = pd.to_datetime(i3['day']).dt.date 10 | 11 | 12 | # Number of calls per day 13 | i1_day = i1.groupby(['date', 'region'])['count_p'].sum().reset_index() 14 | 15 | # Merge 16 | i13 = i1_day.merge(i3[['date', 'count_p', 'region']].rename(columns = {'count_p' : 'subscribers'}), 17 | on = ['date', 'region']) 18 | 19 | np.mean(i13['count_p']/i13['subscribers']) -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/folder_setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[ ]: 5 | 6 | 7 | import datetime as dt 8 | from modules.DataSource import * 9 | from modules.folder_utils import * 10 | 11 | 12 | # In[ ]: 13 | 14 | 15 | #Set relative file path to config file 16 | config_file = '../config_file.py' 17 | exec(open(config_file).read()) 18 | 19 | 20 | # 
In[ ]: 21 | 22 | 23 | #Create the DataSource object and show config 24 | ds = DataSource(datasource_configs) 25 | ds.show_config() 26 | 27 | 28 | # In[ ]: 29 | 30 | 31 | #Setup all required data folders 32 | setup_folder(ds) 33 | 34 | 35 | # In[ ]: 36 | 37 | 38 | #Check if required data folders already exists 39 | check_folders(ds) 40 | 41 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Notebook organization 2 | 3 | The [aggregation_master.py](https://github.com/worldbank/covid-mobile-data/tree/cdr-master/cdr-aggregation/notebooks/aggregation_master.py) script is currently set to run all flowminder, priority and scaled indicators. Additional custom indicators are left out. 4 | 5 | The [aggregation_master.ipynb](https://github.com/worldbank/covid-mobile-data/tree/cdr-master/cdr-aggregation/notebooks/aggregation_master.py) notebook does the same and can be used for data exploration, too. 6 | 7 | The [aggregation_master_databricks.py](https://github.com/worldbank/covid-mobile-data/tree/cdr-master/cdr-aggregation/notebooks/aggregation_master_databricks.py) notebook is customised for databricks. 8 | -------------------------------------------------------------------------------- /cdr-aggregation/config_file_template.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import * 2 | schema = StructType([ 3 | StructField("msisdn", IntegerType(), True), 4 | StructField("call_datetime", StringType(), True), #load as string, will be turned into datetime in standardize_csv_files() 5 | StructField("location_id", StringType(), True) 6 | ]) 7 | 8 | datasource_configs = { 9 | "base_path": "/home/jovyan/work/data", #folder path used in this docker env 10 | "country_code": "", 11 | "telecom_alias": "", 12 | "schema" : schema, 13 | "data_paths" : ["*.csv"], 14 | "filestub": "", 15 | "geofiles": {}, 16 | "shapefiles": ['admin2','admin3', 'voronoi'], 17 | "dates": {'start_date' : dt.datetime(2020,2,1), 18 | 'end_date' : dt.datetime(2020,3,31)} 19 | } 20 | -------------------------------------------------------------------------------- /data-checks/Archive/globals.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # DATA CHECKS - Globals 3 | #-----------------------------------------------------------------# 4 | 5 | # This file contains settings and globals used across data checks 6 | # files 7 | 8 | # LIBRARIES 9 | import os 10 | import re 11 | import pandas as pd 12 | import numpy as np 13 | import datetime as dt 14 | 15 | import seaborn as sns; sns.set() 16 | from matplotlib import rcParams 17 | import matplotlib.pyplot as plt 18 | 19 | from bokeh.plotting import figure, output_file, show 20 | from bokeh.models import Span 21 | from bokeh.io import export_png 22 | 23 | 24 | # GLOBALS 25 | 26 | # File paths 27 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/" 28 | OUT_path = DATA_path + 'proof-of-concept/outputs/' 29 | 30 | # Default values 31 | missing_values = ['99999',''] -------------------------------------------------------------------------------- /cdr-aggregation/config_file_template_hive.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import * 2 | schema = StructType([ 3 | 
StructField("msisdn", IntegerType(), True),
 4 |     StructField("call_datetime", StringType(), True), #load as string, will be turned into datetime in standardize_csv_files()
 5 |     StructField("location_id", StringType(), True)
 6 | ])
 7 | 
 8 | datasource_configs = {
 9 |     "base_path": "path_to_folder/data", #folder path used in this docker env
10 |     "hive_warehouse_location": "path_to_hive_warehouse",
11 |     "spark_mode": 'hive',
12 |     "hive_vars":{ 'msisdn' : 'col1',
13 |                   'call_datetime': 'col2',
14 |                   'location_id': 'col3',
15 |                   'calls': 'table'},
16 |     "country_code": "",
17 |     "telecom_alias": "",
18 |     "schema" : schema,
19 |     "data_paths" : ["*.csv"],
20 |     "filestub": "",
21 |     "geofiles": {},
22 |     "shapefiles": ['admin2','admin3', 'voronoi'],
23 |     "dates": {'start_date' : dt.datetime(2020,2,1),
24 |               'end_date' : dt.datetime(2020,3,31)}
25 | }
26 | 
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/01_clean_spatial_data/README.md:
--------------------------------------------------------------------------------
 1 | # Clean Spatial Data
 2 | 
 3 | Cleans spatial datasets:
 4 | 1. Aggregate units when needed (e.g., aggregating wards)
 5 | 2. Add additional variables (e.g., area)
 6 | 3. Standardize variable names
 7 | 4. Order spatial data by region
 8 | 
 9 | ### Standardize Variable Names
10 | Each spatial dataset should have standardized variable names. Standardizing
11 | variable names helps ensure different units (e.g., admin2, admin3) can be
12 | easily switched in the dashboard.
13 | 
14 | | variable | format | example | description |
15 | |---|---|---|---|
16 | | region | string | ZW123456 | Unique identifier of the spatial unit |
17 | | name | string | Name | Spatial unit name |
18 | | area | numeric | 1234 | Area of the spatial unit in square kilometers |
19 | | province | string | Name | Name of the province |
20 | 
21 | ### Order Spatial Data
22 | Spatial datasets are ordered by region. When cleaning other datasets at the
23 | region level, we also order by region and ensure all regions are present. This
24 | ensures that no reordering needs to be done in the dashboard.
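
For reference, a minimal sketch of this convention in R (the object name `adm` and the raw
column name `NAME_2` are placeholders for whatever the raw shapefile provides; the cleaning
scripts in this folder are the authoritative version):

```r
# Hypothetical raw input: a SpatialPolygonsDataFrame `adm` with a NAME_2 column
adm@data <- adm@data %>%
  dplyr::rename(name = NAME_2) %>%   # standardized unit name
  dplyr::mutate(region = name)       # unique identifier of the spatial unit

adm$area     <- geosphere::areaPolygon(adm) / 1000^2   # area in square kilometers
adm$province <- NA                                      # fill in if available

# Order by region so the dashboard never needs to reorder
adm <- adm[order(as.character(adm$region)), ]
```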
25 | 
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/01_clean_spatial_data/clean_adm3_file.R:
--------------------------------------------------------------------------------
 1 | # Clean ADM3 File
 2 | 
 3 | # Load Data --------------------------------------------------------------------
 4 | # LOAD DATA HERE
 5 | 
 6 | # Subset/Add Variables ---------------------------------------------------------
 7 | adm3@data <- adm3@data %>%
 8 |   dplyr::select(NAME_3) %>%
 9 |   dplyr::rename(name = NAME_3) %>%
10 |   dplyr::mutate(region = name)
11 | 
12 | adm3$area <- geosphere::areaPolygon(adm3) / 1000^2
13 | 
14 | # Simplify (to speed up plotting) ----------------------------------------------
15 | # For ms_simplify, the polygon IDs and the data row names need to match
16 | pid <- sapply(slot(adm3, "polygons"), function(x) slot(x, "ID"))
17 | row.names(adm3) <- pid
18 | 
19 | adm3 <- rmapshaper::ms_simplify(adm3)
20 | 
21 | # Arrange ----------------------------------------------------------------------
22 | #### Order by region
23 | adm3$region <- adm3$region %>% as.character()
24 | adm3 <- adm3[order(adm3$region),]
25 | 
26 | # Export -----------------------------------------------------------------------
27 | saveRDS(adm3, file.path(GEO_PATH, "adm3.Rds"))
28 | 
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/01_clean_spatial_data/clean_adm2_file.R:
--------------------------------------------------------------------------------
 1 | # Clean ADM2 File
 2 | 
 3 | # Load Data --------------------------------------------------------------------
 4 | # LOAD DATA HERE
 5 | 
 6 | # Subset/Add Variables ---------------------------------------------------------
 7 | adm2@data <- adm2@data %>%
 8 |   dplyr::select(NAME_2) %>%
 9 |   dplyr::rename(name = NAME_2) %>%
10 |   dplyr::mutate(region = name)
11 | 
12 | adm2$area <- geosphere::areaPolygon(adm2) / 1000^2
13 | 
14 | adm2$province <- NA
15 | 
16 | # Simplify (to speed up plotting) ----------------------------------------------
17 | # For ms_simplify, the polygon IDs and the data row names need to match
18 | pid <- sapply(slot(adm2, "polygons"), function(x) slot(x, "ID"))
19 | row.names(adm2) <- pid
20 | 
21 | adm2 <- rmapshaper::ms_simplify(adm2)
22 | 
23 | # Arrange ----------------------------------------------------------------------
24 | #### Order by region
25 | adm2$region <- adm2$region %>% as.character()
26 | adm2 <- adm2[order(adm2$region),]
27 | 
28 | # Export -----------------------------------------------------------------------
29 | saveRDS(adm2, file.path(GEO_PATH, "adm2.Rds"))
30 | 
--------------------------------------------------------------------------------
/dashboard-dataviz/dashboard/functions.R:
--------------------------------------------------------------------------------
 1 | # Functions ====================================================================
 2 | 
 3 | #### Log values with negatives
 4 | # Define a function to take the log of values that can deal with negative
 5 | # values: take the absolute value, log it, then reapply the negative sign.
 6 | log_neg <- function(values){
 7 |   # Log that takes into account zero. Only for logging values for
 8 |   # displaying!
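  # Example: log_neg(c(10, 0, -10)) returns c(log(11), 0, -log(11)), i.e. a symmetric log transform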
9 | 10 | values_pos_index <- (values > 0) %in% T # %in% T to account for NAs 11 | values_neg_index <- (values <= 0) %in% T 12 | 13 | values_pos_log <- log(values[values_pos_index]+1) 14 | values_neg_log <- -log(-(values[values_neg_index])+1) 15 | 16 | values[values_pos_index] <- values_pos_log 17 | values[values_neg_index] <- values_neg_log 18 | 19 | return(values) 20 | } 21 | 22 | as.character.htmlwidget <- function(x, ...) { 23 | htmltools::HTML( 24 | htmltools:::as.character.shiny.tag.list( 25 | htmlwidgets:::as.tags.htmlwidget( 26 | x 27 | ), 28 | ... 29 | ) 30 | ) 31 | } 32 | 33 | add_deps <- function(dtbl, name, pkg = name) { 34 | tagList( 35 | dtbl, 36 | htmlwidgets::getDependency(name, pkg) 37 | ) 38 | } -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/styles.css: -------------------------------------------------------------------------------- 1 | input[type="number"] { 2 | max-width: 80%; 3 | } 4 | 5 | div.outer { 6 | position: fixed; 7 | top: 50px; 8 | left: 0; 9 | right: 0; 10 | bottom: 0; 11 | overflow: hidden; 12 | padding: 0; 13 | } 14 | 15 | /* Customize fonts */ 16 | body, label, input, button, select { 17 | font-family: 'Helvetica Neue', Helvetica; 18 | font-weight: 200; 19 | } 20 | h1, h2, h3, h4 { font-weight: 400; } 21 | 22 | #controls { 23 | /* Appearance */ 24 | background-color: white; 25 | padding: 0 20px 20px 20px; 26 | cursor: move; 27 | /* Fade out while not hovering */ 28 | opacity: 0.76; 29 | zoom: 0.95; 30 | transition: opacity 0ms 0ms; 31 | } 32 | #controls:hover { 33 | /* Fade in while hovering */ 34 | opacity: 0.99; 35 | transition-delay: 0; 36 | } 37 | 38 | #logo { 39 | /* Appearance */ 40 | background-color: transparent; 41 | cursor: move; 42 | /* Fade out while not hovering */ 43 | opacity: 0.25; 44 | zoom: 0.9; 45 | transition: opacity 500ms 1s; 46 | } 47 | 48 | #logo:hover { 49 | /* Fade in while hovering */ 50 | opacity: 0.95; 51 | transition-delay: 0; 52 | } 53 | 54 | #img-id{ 55 | position: fixed; 56 | right: 10px; 57 | top: 5px; 58 | } -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/preparing_data_for_dashboard/03_dashboard_data_prep/prep_subs_obs_totals_data.R: -------------------------------------------------------------------------------- 1 | # Prep Subscribers / Observations Total Data 2 | 3 | # Prep datsets for line graphs on about page. 
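# Both totals below are computed from the hourly indicator_01_02 result file, aggregated to the day level.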
 4 | 
 5 | # Subscribers ------------------------------------------------------------------
 6 | subs_adm2 <- read.csv(file.path(RAW_INDICATORS, paste0("indicator_01_02_adm3_hour_result.csv")),
 7 |                       stringsAsFactors=F)
 8 | 
 9 | subs_adm2 <- subs_adm2 %>%
10 |   group_by(pdate) %>%
11 |   dplyr::summarise(Subscribers = sum(totalimei)) %>%
12 |   dplyr::rename(Date = pdate) %>%
13 |   mutate(Date = Date %>% ymd)
14 | 
15 | saveRDS(subs_adm2, file.path(DASHBOARD_DATA_ONEDRIVE_PATH,"subscribers_total.Rds"))
16 | 
17 | # Observations -----------------------------------------------------------------
18 | obs_adm2 <- read.csv(file.path(RAW_INDICATORS, paste0("indicator_01_02_adm3_hour_result.csv")),
19 |                      stringsAsFactors=F)
20 | 
21 | obs_adm2 <- obs_adm2 %>%
22 |   group_by(pdate) %>%
23 |   dplyr::summarise(Observations = sum(total)) %>%
24 |   dplyr::rename(Date = pdate) %>%
25 |   mutate(Date = Date %>% ymd)
26 | 
27 | saveRDS(obs_adm2, file.path(DASHBOARD_DATA_ONEDRIVE_PATH,"observations_total.Rds"))
28 | 
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/modules/README.md:
--------------------------------------------------------------------------------
 1 | # Module organization
 2 | 
 3 | ## Aggregation
 4 | The base class `aggregator` defined in [aggregator.py](https://github.com/worldbank/covid-mobile-data/tree/cdr-master/cdr-aggregation/notebooks/modules/aggregator.py) implements methods and attributes shared by all aggregator classes. At the next level, `flowminder_aggregator` and `priority_aggregator` implement SQL queries from [Flowminder](https://github.com/Flowminder) and priority indicators designed by this task force (written in PySpark), respectively. Beyond that, the classes `scaled_aggregator` and `custom_aggregator` implement priority indicators scaled by a resident count, and additional custom PySpark indicators, respectively. Both inherit from the `priority_aggregator` class.
 5 | 
 6 | ```
 7 | |-- aggregator
 8 | |   |-- flowminder_aggregator
 9 | |   |-- priority_aggregator
10 | |   |-- scaled_aggregator
11 | |   |-- custom_aggregator
12 | ```
13 | 
14 | ## Clustering and tessellation
15 | The `voronoi` and `tower_clustering` modules implement Voronoi tessellation given tower locations; these are used in the setup phase to create tower-to-region mappings.
16 | 
17 | ## Outlier analysis
18 | The `outliers` module can be used to study outlier observations.
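
The aggregation hierarchy above can also be read as a Python skeleton. This is schematic only
(constructor arguments and methods are omitted; it is not the actual module code):

```python
class aggregator:                                  # shared attributes and methods
    ...

class flowminder_aggregator(aggregator):           # Flowminder SQL indicators
    ...

class priority_aggregator(aggregator):             # priority indicators written in PySpark
    ...

class scaled_aggregator(priority_aggregator):      # priority indicators scaled by resident counts
    ...

class custom_aggregator(priority_aggregator):      # additional custom PySpark indicators
    ...
```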
19 | -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/preparing_data_for_dashboard/03_dashboard_data_prep/data_to_github.R: -------------------------------------------------------------------------------- 1 | # Transfer dashboard data from OneDrive to Github 2 | 3 | 4 | ## Remove previous files in github 5 | REMOVE_PREVIOUS_FILES <- F 6 | 7 | if(REMOVE_PREVIOUS_FILES){ 8 | temp <- list.files(DASHBOARD_DATA_GITHUB_PATH, 9 | full.names = T, 10 | pattern = "*.Rds") %>% 11 | lapply(file.remove) 12 | 13 | } 14 | 15 | 16 | # Move telecom data to github folder ------------------------------------------- 17 | i <- 1 18 | 19 | telecom_files <- list.files(DASHBOARD_DATA_ONEDRIVE_PATH, pattern = "*.Rds") 20 | 21 | #telecom_files <- telecom_files[grepl("spark", telecom_files)] 22 | 23 | temp <- telecom_files %>% 24 | lapply(function(file_i){ 25 | if((i %% 100) %in% 0) print(paste(i, "/", length(telecom_files))) 26 | i <<- i + 1 27 | 28 | file.copy(file.path(DASHBOARD_DATA_ONEDRIVE_PATH, file_i), 29 | paste0(DASHBOARD_DATA_GITHUB_PATH, "/"), 30 | overwrite=T) 31 | }) 32 | 33 | 34 | # Move geofiles to github folder ----------------------------------------------- 35 | for(file_i in list.files(GEO_PATH)){ 36 | file.copy(file.path(GEO_PATH, file_i), 37 | paste0(DASHBOARD_DATA_GITHUB_PATH, "/"), 38 | overwrite=T) 39 | } 40 | 41 | 42 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/folder_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | ###################################### 3 | # Folder setup methods - written for the jupyter notebook docker image 4 | 5 | #Loops over the requred folders for teh datasource and create any missing folders 6 | def setup_folder(datasource): 7 | #Loop over required paths, and return true 8 | for folder in datasource.required_folders(): 9 | test_folder(folder, create_if_not_exist=True) 10 | return True 11 | 12 | #Check if all required folders exist without creating them 13 | def check_folders(datasource): 14 | return_boolean = True 15 | #loop over required folders 16 | for folder in datasource.required_folders(): 17 | if not test_folder(folder, create_if_not_exist=False): 18 | print("Folder '{}' is required but does not exist".format(folder)) 19 | return_boolean = False 20 | return return_boolean 21 | 22 | #Utility that check if folder exist 23 | def test_folder(path, create_if_not_exist): 24 | #If folder exists return true 25 | if os.path.exists(path): return True 26 | #Else: if create_if_not_exist is true then create folder and return true 27 | elif create_if_not_exist: 28 | os.makedirs(path) 29 | return True 30 | #Else: Folder does not exist and folder is not created, return false 31 | else: return False 32 | -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/preparing_data_for_dashboard/03_dashboard_data_prep/README.md: -------------------------------------------------------------------------------- 1 | # Dashboard Data Prep 2 | 3 | Due to the high volume of data, data transformations (e.g., aggregating, filtering, etc) are done outside of the dashboard in order to minimize the processing and data needed to be loaded in memory at any point as the dashboard is running. 
These scripts filter the cleaned telecom data into individual datasets so that no additional filtering or transformations need to be applied within the dashboard; the dashboard can just read the files then immediately use the data in the map, line graph and table. Here, we create smaller datasets that contain the same variables as above. Indicators include density, movement in, movement out, mean distance traveled, etc. 4 | 5 | The following datasets are made. 6 | 7 | | Dataset Type | Naming Convention | Description | 8 | | --- | --- | --- | 9 | | unit-level | [Unit Name]\_[Indicator Name]\_[Daily/Weekly]\_[Date/Week].Rds | For a given day or week, this dataset contains information for all wards or districts for a specified indicator. For O-D level datasets, values are aggregated to the specified origin or destination unit (eg, movement into unit from all other units). | 10 | | time-level | [Unit Name]\_[Indicator Name]\_[Daily/Weekly]\_[Unit Name].Rds | For a given admin unit, this dataset contains a time series of values for a specified indicator. | 11 | | unit-time-level | [Unit Name]\_[Indicator Name]\_[Daily/Weekly]\_[Unit Name]\_[Date/Week].Rds | These datasets are only used for O-D variables. The show, for a given origin or destination unit, the movement in or out of that unit to all other units for the specified day/week. | 12 | 13 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/import_packages.py: -------------------------------------------------------------------------------- 1 | # Imports necessary packages and sets some global vars 2 | ### spark etc 3 | # import rarfile 4 | 5 | import os, pyspark, time, sys 6 | import pyspark.sql.functions as F 7 | from pyspark.sql.functions import pandas_udf, PandasUDFType 8 | from pyspark import * 9 | from pyspark.sql import * 10 | from pyspark.rdd import * 11 | from pyspark.ml import * 12 | from pyspark.sql.types import ArrayType 13 | from pyspark.sql.types import IntegerType 14 | from pyspark.sql.types import DoubleType 15 | from pyspark.sql.types import FloatType 16 | 17 | ### data wrangling 18 | import pandas as pd 19 | import glob 20 | import shutil 21 | pd.options.display.float_format = '{:,.0f}'.format 22 | # pd.set_option("display.max_rows", 100) 23 | pd.options.display.max_columns = None 24 | import datetime as dt 25 | import numpy as np 26 | from random import sample, seed 27 | seed(510) 28 | # timezone = dt.timezone(offset = -dt.timedelta(hours=5), name = "America/Bogota") 29 | timezone = dt.timezone(offset = -dt.timedelta(hours=0), name = "Africa/Harare") 30 | import re 31 | #import fiona 32 | #import geopandas as gpd 33 | import copy 34 | from collections import Counter 35 | from shapely import wkt 36 | 37 | ### plotting 38 | import matplotlib.pyplot as plt 39 | import matplotlib.dates as mdates 40 | import seaborn as sns 41 | #import folium 42 | #import gif 43 | #from folium.plugins import HeatMap, DualMap, Fullscreen 44 | #from folium.features import DivIcon 45 | #from branca.element import Template, MacroElement 46 | import locale 47 | from matplotlib.ticker import FuncFormatter 48 | import matplotlib.lines as mlines 49 | font = {'family' : 'Calibri', 50 | 'weight' : 'normal', 51 | 'size' : 18} 52 | import matplotlib 53 | -------------------------------------------------------------------------------- /dashboard-dataviz/figures/_master_figures.R: -------------------------------------------------------------------------------- 1 | # Master R Script for 
Prepping Data for Dashboard 2 | 3 | #### Packages #### ============================================================= 4 | library(tidyverse) 5 | library(sf) 6 | library(sp) 7 | library(plotly) 8 | library(stargazer) 9 | library(knitr) 10 | library(gridExtra) 11 | library(leaflet) 12 | library(ggpubr) 13 | library(purrr) 14 | library(parallel) 15 | library(pbmcapply) 16 | library(rgeos) 17 | library(rgdal) 18 | library(sp) 19 | library(rmapshaper) 20 | library(raster) 21 | library(geosphere) 22 | library(lubridate) 23 | library(data.table) 24 | library(mapview) 25 | library(hrbrthemes) 26 | 27 | #### File paths #### =========================================================== 28 | 29 | # Define Root Paths ------------------------------------------------------------ 30 | if(Sys.info()[["user"]] == "robmarty") PROJECT_PATH <- "~/Documents/World Bank/Sveta Milusheva - COVID 19 Results" 31 | if(Sys.info()[["user"]] == "wb519128") PROJECT_PATH <- "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results" 32 | if(Sys.info()[["user"]] == "WB521633") PROJECT_PATH <- "C:/Users/wb521633/WBG/Sveta Milusheva - COVID 19 Results" 33 | 34 | if(Sys.info()[["user"]] == "robmarty") GITHUB_PATH <- "~/Documents/Github/covid-mobile-data" 35 | if(Sys.info()[["user"]] == "wb519128") GITHUB_PATH <- "C:/Users/wb519128/Github/covid-mobile-data" 36 | if(Sys.info()[["user"]] == "WB521633") GITHUB_PATH <- "C:/Users/wb521633/Documents/Github/covid-mobile-data" 37 | 38 | # Define Paths from Root ------------------------------------------------------- 39 | CLEAN_DATA_ADM2_PATH <- file.path(PROJECT_PATH, "proof-of-concept", "files_for_dashboard", "files_clean", "adm2") 40 | CLEAN_DATA_ADM3_PATH <- file.path(PROJECT_PATH, "proof-of-concept", "files_for_dashboard", "files_clean", "adm3") 41 | figures_path <- file.path(PROJECT_PATH, "proof-of-concept", "outputs", "figures") 42 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/folder_setup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import datetime as dt\n", 10 | "from modules.DataSource import *\n", 11 | "from modules.folder_utils import *" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "#Set relative file path to config file\n", 21 | "config_file = '../config_file.py'\n", 22 | "exec(open(config_file).read())" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "scrolled": true 30 | }, 31 | "outputs": [], 32 | "source": [ 33 | "#Create the DataSource object and show config\n", 34 | "ds = DataSource(datasource_configs)\n", 35 | "ds.show_config()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "#Setup all required data folders\n", 45 | "setup_folder(ds)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "#Check if required data folders already exists\n", 55 | "check_folders(ds)" 56 | ] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "display_name": "Python 3", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 
3
69 |    },
70 |    "file_extension": ".py",
71 |    "mimetype": "text/x-python",
72 |    "name": "python",
73 |    "nbconvert_exporter": "python",
74 |    "pygments_lexer": "ipython3",
75 |    "version": "3.7.6"
76 |   }
77 |  },
78 |  "nbformat": 4,
79 |  "nbformat_minor": 4
80 | }
81 | 
--------------------------------------------------------------------------------
/cdr-aggregation/notebooks/modules/utilities.py:
--------------------------------------------------------------------------------
 1 | 
 2 | ############# Utility functions used throughout
 3 | import os
 4 | if os.environ['HOME'] != '/root':
 5 |     from modules.import_packages import *
 6 |     from modules.DataSource import *
 7 |     databricks = False
 8 | else:
 9 |     databricks = True
10 | 
11 | def save_and_load_parquet(df, filename, ds):
12 |     # write parquet
13 |     df.write.mode('overwrite').parquet(filename)
14 |     # load parquet
15 |     df = ds.spark.read.format("parquet").load(filename)
16 |     return df
17 | 
18 | def save_csv(matrix, path, filename):
19 |     # write to csv
20 |     matrix.repartition(1).write.mode('overwrite').format('com.databricks.spark.csv') \
21 |       .save(os.path.join(path, filename), header = 'true')
22 |     # move one folder up and rename to human-legible .csv name
23 |     if databricks:
24 |         dbutils.fs.mv(dbutils.fs.ls(path + '/' + filename)[-1].path,
25 |                       path + '/' + filename + '.csv')
26 |         # remove the old folder
27 |         dbutils.fs.rm(path + '/' + filename + '/', recurse = True)
28 | 
29 |     else:
30 |         os.rename(glob.glob(os.path.join(path, filename + '/*.csv'))[0],
31 |                   os.path.join(path, filename + '.csv'))
32 |         shutil.rmtree(os.path.join(path, filename))
33 | 
34 | ############# Windows for window functions
35 | 
36 | # window by msisdn (subscriber), ordered by call time
37 | user_window = Window\
38 |     .partitionBy('msisdn').orderBy('call_datetime')
39 | 
40 | # window by msisdn, starting with the last transaction
41 | user_window_rev = Window\
42 |     .partitionBy('msisdn').orderBy(F.desc('call_datetime'))
43 | 
44 | # msisdn-date window
45 | user_date_window = Window\
46 |     .partitionBy('msisdn', 'call_date').orderBy('call_datetime')
47 | 
48 | # msisdn-date window, starting with the last transaction of the day
49 | user_date_window_rev = Window\
50 |     .partitionBy('msisdn', 'call_date').orderBy(F.desc('call_datetime'))
51 | 
52 | 
53 | ############# Plotting
54 | 
55 | def zero_to_nan(values):
56 |     """Replace every 0 with 'nan' and return a copy."""
57 |     values[ values==0 ] = np.nan
58 |     return values
59 | 
60 | def fill_zero_dates(pd_df):
61 |     pd_df = pd_df[~pd_df.index.isnull()].sort_index()
62 |     msisdnx = pd.date_range(pd_df.index[0], pd_df.index[-1])
63 |     pd_df = pd_df.reindex(msisdnx, fill_value= 0)
64 |     return pd_df
65 | 
--------------------------------------------------------------------------------
/data-checks/README.md:
--------------------------------------------------------------------------------
 1 | # Data Checks
 2 | 
 3 | This folder contains code for running basic checks of aggregated CDR indicators. The data quality checks are intended to achieve the following:
 4 | 1. **Ensure the data is complete.** This means that there are no missing values in two main dimensions: spatial: all admin areas should have data; and temporal: all time slots (month, day and hour) should have data. This check is required for all indicators.
 5 | 2. **Cell tower down checks**. This is a special type of missing data, where data may be missing because a cell tower was down. This check is also likely to be required for all indicators.
 6 | 3. **Consistency checks**. This check can be done for a single indicator to check for several things.
But it can also be done across indicators to ensure that total numbers are consistent.
 7 | 
 8 | ## Requirements
 9 | 
10 | - Python3
11 | - pandas
12 | - numpy
13 | - plotly
14 | 
15 | ## Basic usage:
16 | 
17 | ```bash
18 | $ git clone git@github.com:worldbank/covid-mobile-data.git
19 | $ cd covid-mobile-data/data-checks/
20 | $ python checker.py --Path path/to/indicators
21 |                     [--prefix "your_prefix_"]
22 |                     [--outputs path/to/outputs]
23 | ```
24 | 
25 | ## Custom usage:
26 | You can create an instance of the checker class to customize any of the default values.
27 | 
28 | ```python
29 | from checker import *
30 | 
31 | check = checker(path = 'path/to/indicators',
32 |                 outputs_path = 'path/to/outputs',
33 |                 level = 'subfolder',
34 |                 ind_dict = {'i1' : 'transactions_per_hour.csv',
35 |                             'i3' : 'unique_subscrivers_per_day.csv',
36 |                             'i5' : 'origin_destination_connection_matrix_per_day.csv'},
37 |                 prefix = 'your_prefix_',
38 |                 col_names_dict = {
39 |                     'i1': {'Time':'hour',
40 |                            'Geography':'region',
41 |                            'Count':'count'},
42 |                     'i3': {'Time':'day',
43 |                            'Geography':'region',
44 |                            'Count':'count'},
45 |                     'i5': {'Time':'connection_date',
46 |                            'Geography_from':'region_from',
47 |                            'Geography_to':'region_to',
48 |                            'Count':'total_count'} })
49 | ```
50 | 
--------------------------------------------------------------------------------
/dashboard-dataviz/figures/i3_figures.R:
--------------------------------------------------------------------------------
 1 | # i3 Figures
 2 | 
 3 | unit <- "districts"
 4 | 
 5 | # Load Data --------------------------------------------------------------------
 6 | if(unit %in% "wards"){
 7 |   CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH
 8 | }
 9 | 
10 | if(unit %in% "districts"){
11 |   CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH
12 | }
13 | 
14 | data <- readRDS(file.path(CLEAN_DATA_PATH, "i3_daily.Rds"))
15 | 
16 | data <- data %>%
17 |   group_by(region) %>%
18 |   mutate(value_pre = mean(value[date < "2020-03-30"]),
19 |          value_post = mean(value[date > "2020-03-30"])) %>%
20 |   ungroup() %>%
21 |   mutate(value_change = value_post - value_pre) %>%
22 |   mutate(value_change_rank = rank(value_change))
23 | 
24 | data$value_change_rank[is.na(data$value_change)] <- NA
25 | 
26 | # Figures ----------------------------------------------------------------------
27 | rank_high <- data$value_change_rank %>% unique() %>% sort() %>% head(5)
28 | 
29 | p_high <- data %>%
30 |   dplyr::filter(value_change_rank %in% rank_high) %>%
31 |   ggplot(aes(x = date, y = value)) +
32 |   geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) +
33 |   geom_line() +
34 |   labs(x = "",
35 |        y = "Number of Subscribers",
36 |        title = "Largest Decreases") +
37 |   facet_wrap(~name,
38 |              scales = "free_y",
39 |              nrow = 1) +
40 |   theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
41 |         strip.text.x = element_text(face = "bold"))
42 | p_high
43 | 
44 | rank_low <- data$value_change_rank %>% unique() %>% sort() %>% tail(5)
45 | 
46 | p_low <- data %>%
47 |   dplyr::filter(value_change_rank %in% rank_low) %>%
48 |   ggplot(aes(x = date, y = value)) +
49 |   geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) +
50 |   geom_line() +
51 |   labs(x = "",
52 |        y = "",
53 |        title = "Largest Increases") +
54 |   facet_wrap(~name,
55 |              scales = "free_y",
56 |              nrow = 1) +
57 |   theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
58 |         strip.text.x = element_text(face = "bold"))
59 | 
60 | p_all <- ggarrange(p_high, p_low, nrow = 2)
61 | ggsave(p_all, filename = 
file.path(figures_path, 62 | paste0(unit, "_subsc_top_chng.png")), 63 | height = 5, width=12) 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /data-panel/01_construct.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # CREATE PANEL 3 | #-----------------------------------------------------------------# 4 | 5 | # This code creates panel datasets combinig different versions of 6 | # indicator files. 7 | 8 | from utils import * 9 | from panel_constructor import * 10 | 11 | #-----------------------------------------------------------------# 12 | # Settings 13 | 14 | EXPORT = False 15 | 16 | #-------------------# 17 | # Indicator dataframe 18 | 19 | # Load list of indicators to make it easier to bulk load files 20 | indicators_df = pd.read_csv('path/to/indicators_list.csv') 21 | 22 | 23 | #-------------------# 24 | # Set default values 25 | levels_dict = { 1: [3], 26 | 2: [3], 27 | 3: [2,3], 28 | 4: ['country'], 29 | 5: [2,3], 30 | 6: [3], 31 | 7: [2,3], 32 | 8: [2,3], 33 | 9: [2,3], 34 | 10: [2,3], 35 | 11: [2,3]} 36 | 37 | 38 | #-----------------------------------------------------------------# 39 | # Load indicators and create comparisson "dirty" panel 40 | 41 | indicators = panel_constructor(levels_dict, indicators_df) 42 | 43 | # Create class instance 44 | # If no levels dictionary is provided, it will use the default, which is all of them! 45 | # indicators = panel_constructor() 46 | 47 | # Run panel creation 48 | indicators.dirty_panel() 49 | 50 | #-----------------------------------------------------------------# 51 | # Load usage outliers file 52 | 53 | # This file is created in data-checks 54 | i1_ag_df_tower_down = pd.read_csv("/path/to/usage-outliers/file") 55 | 56 | #-----------------------------------------------------------------# 57 | # Export comparison panel 58 | 59 | if EXPORT: 60 | indicators.export('/export/path/') 61 | 62 | #-----------------------------------------------------------------# 63 | # Create clean panel 64 | 65 | # This replaces the old panel attribute with the clean version, with 66 | # standardized column names 67 | 68 | indicators.clean_panel(i1_ag_df_tower_down) 69 | 70 | #-----------------------------------------------------------------# 71 | 72 | 73 | indicators.add_other_provider(mno_path = "/path/to/other/mno/indicator/folder", 74 | mno_suffix = '_mno') 75 | 76 | 77 | #-----------------------------------------------------------------# 78 | # Export 79 | if EXPORT: 80 | indicators.export('/export/path/') 81 | -------------------------------------------------------------------------------- /dashboard-dataviz/figures/i5_net_figures.R: -------------------------------------------------------------------------------- 1 | # i3 Figures 2 | 3 | unit <- "wards" 4 | 5 | # Load Data -------------------------------------------------------------------- 6 | if(unit %in% "wards"){ 7 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH 8 | } 9 | 10 | if(unit %in% "districts"){ 11 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH 12 | } 13 | 14 | data <- readRDS(file.path(CLEAN_DATA_PATH, "i5_net_daily.Rds")) 15 | 16 | data <- data %>% 17 | group_by(region) %>% 18 | mutate(value_pre = mean(value[date < "2020-03-30"], na.rm = T), 19 | value_post = mean(value[date > "2020-03-30"], na.rm = T)) %>% 20 | ungroup() %>% 21 | mutate(value_change = value_post - value_pre) %>% 22 | mutate(value_change_rank = rank(value_change)) 23 | 24 | 
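# rank() gives NA changes a rank by default (na.last = TRUE), so reset those ranks to NA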
data$value_change_rank[is.na(data$value_change)] <- NA
25 | 
26 | data <- data[!is.na(data$date),]
27 | data$date <- data$date %>% as.Date()
28 | 
29 | # Figures ----------------------------------------------------------------------
30 | rank_high <- data$value_change_rank %>% unique() %>% sort() %>% head(5)
31 | 
32 | p_high <- data %>%
33 |   dplyr::filter(value_change_rank %in% rank_high) %>%
34 |   ggplot(aes(x = date, y = value)) +
35 |   geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) +
36 |   geom_line() +
37 |   labs(x = "",
38 |        y = "Number of Subscribers",
39 |        title = "Largest Decreases") +
40 |   facet_wrap(~name,
41 |              scales = "free_y",
42 |              nrow = 1) +
43 |   theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
44 |         strip.text.x = element_text(face = "bold"))
45 | p_high
46 | 
47 | rank_low <- data$value_change_rank %>% unique() %>% sort() %>% tail(5)
48 | 
49 | p_low <- data %>%
50 |   dplyr::filter(value_change_rank %in% rank_low) %>%
51 |   ggplot(aes(x = date, y = value)) +
52 |   geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) +
53 |   geom_line() +
54 |   labs(x = "",
55 |        y = "",
56 |        title = "Largest Increases") +
57 |   facet_wrap(~name,
58 |              scales = "free_y",
59 |              nrow = 1) +
60 |   theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
61 |         strip.text.x = element_text(face = "bold"))
62 | 
63 | p_all <- ggarrange(p_high, p_low, nrow = 2)
64 | ggsave(p_all, filename = file.path(figures_path,
65 |                                    paste0(unit, "_netmovement_top_chng.png")),
66 |        height = 5, width=12)
67 | 
68 | 
69 | data$value[data$date < "2020-03-30"] %>% log() %>% hist()
70 | 
--------------------------------------------------------------------------------
/data-checks/Archive/Descr-exploratory/i5-plot.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import os
 3 | import pandas as pd
 4 | import numpy as np
 5 | import datetime as dt
 6 | import time
 7 | 
 8 | from bokeh.plotting import figure, output_file, show
 9 | from bokeh.models import Span
10 | from bokeh.io import export_png
11 | 
12 | #-----------------------------------------------------------------#
13 | # Folder structure
14 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/"
15 | DATA_POC = DATA_path + "proof-of-concept/"
16 | DATA_Panel = DATA_POC + "panel_indicators/"
17 | OUT_path = DATA_POC + "outputs/"
18 | 
19 | 
20 | #-----------------------------------------------------------------#
21 | # Load data
22 | 
23 | i5 = pd.read_csv(DATA_Panel + 'i5_admin2.csv')
24 | 
25 | 
26 | #-----------------------------------------------------------------#
27 | # Process data
28 | i5 = i5[['connection_date', 'region_from', 'region_to', 'od_count_p', 'subscriber_count_p', 'total_count_p']]
29 | 
30 | i5['date'] = pd.to_datetime(i5['connection_date']).dt.date
31 | i5['month'] = pd.to_datetime(i5['connection_date']).dt.month
32 | 
33 | 
34 | i5_agg = i5\
35 |     .groupby('date')\
36 |     .agg({'region_from' : pd.Series.nunique ,
37 |           'region_to' : pd.Series.nunique,
38 |           'subscriber_count_p' : np.sum,
39 |           'total_count_p' : np.sum})\
40 |     .reset_index()\
41 |     .sort_values('date')
42 | 
43 | i5_agg_month = i5\
44 |     .groupby('month')\
45 |     .agg({'subscriber_count_p' : np.sum,
46 |           'total_count_p' : np.sum})\
47 |     .reset_index()\
48 |     .sort_values('month')
49 | 
50 | #-----------------------------------------------------------------#
51 | # Plot
52 | 
53 | p = figure(title="Total Daily Movement Between Districts on a Given Day",
plot_width=800, 55 | plot_height=500, 56 | x_axis_type='datetime') 57 | p.circle(i5_agg['date'], 58 | i5_agg['subscriber_count_p']) 59 | 60 | # Add lockdown dates vertical line 61 | 62 | vline1 = Span(location= dt.date(2020, 3, 27), 63 | dimension='height', 64 | line_color='black', 65 | line_dash='dashed') 66 | vline2 = Span(location= dt.date(2020, 3, 30), 67 | dimension='height', 68 | line_color='black', 69 | line_dash='dashed') 70 | 71 | p.renderers.extend([vline1, vline2]) 72 | 73 | # Additional formatting 74 | p.left[0].formatter.use_scientific = False 75 | p.toolbar.logo = None 76 | p.toolbar_location = None 77 | p.xaxis.axis_label = "Date" 78 | p.yaxis.axis_label = "Movement Day" 79 | p.title.text_font_size = '15pt' 80 | p.xaxis.axis_label_text_font_size = "12pt" 81 | p.yaxis.axis_label_text_font_size = "12pt" 82 | p.yaxis.major_label_text_font_size = "10pt" 83 | p.xaxis.major_label_text_font_size = "10pt" 84 | 85 | # Display plot 86 | show(p) 87 | 88 | # Export 89 | export_png(p, 90 | filename= OUT_path + "all_movement.png") 91 | -------------------------------------------------------------------------------- /data-panel/Archive/usage_outliers.py: -------------------------------------------------------------------------------- 1 | 2 | #-----------------------------------------------------------------# 3 | # Settings 4 | 5 | import os 6 | import re 7 | import copy 8 | import pandas as pd 9 | import numpy as np 10 | import datetime as dt 11 | 12 | 13 | EXPORT = True 14 | 15 | # Number of hours below avg, used as a trashold to 16 | # define a tower down 17 | htrahshold = -3 18 | 19 | 20 | #-----------------------------------------------------------------# 21 | # Process data 22 | i1p = copy.deepcopy(i1.panel) 23 | 24 | i1p['date'] = pd.to_datetime(i1p['hour']).dt.date 25 | i1p['hour_int'] = pd.to_datetime(i1p['hour']).dt.hour 26 | 27 | 28 | # Number of observations per ward that is total number of hours 29 | i1freq = i1p.groupby('region').size() 30 | 31 | i1freq = i1freq.reset_index() 32 | i1freq.columns = ['region', 'freq'] 33 | 34 | # Select wards with less than 12h on average 35 | i1_low_total_hours = i1freq[i1freq['freq'] < (12*i1p.date.nunique())] 36 | 37 | i1_low_total_hours = i1_low_total_hours\ 38 | .rename(columns = {'freq' : 'total_hours'}) 39 | # # Proportion of wards with at least one tower down 40 | # freq[freq < 1392].count()/len(set(i1['region'])) 41 | 42 | # # Proportion of wards with very 43 | # freq[freq < 700].count() 44 | # freq[freq < 700].count()/len(set(i1['region'])) 45 | 46 | # Export 47 | if(EXPORT): 48 | (i1_low_total_hours 49 | .to_csv(OUT_hfcs + 'wards_with_low_hours_I1.csv', 50 | index = False) ) 51 | 52 | #-----------------------------------------------------------------# 53 | # USAGE OUTILERS: Indicator wards and days with towers down 54 | 55 | # Number of hours with transactions per region day 56 | hours_per_day = i1p.groupby(['region', 'date']).size() 57 | 58 | hours_per_day = hours_per_day.reset_index() # ger regions to be a column 59 | hours_per_day.columns = ['region', 'date', 'hcount'] 60 | 61 | 62 | # Average hours per day per region 63 | avg_hours = (hours_per_day.groupby(['region']) 64 | .mean() 65 | .rename(columns={'hcount' :'avg_hours' })) 66 | 67 | # Create region day data set 68 | i1_ag_df = hours_per_day.merge(avg_hours, 69 | on = 'region') 70 | 71 | # Difference from average usage per hour 72 | i1_ag_df['h_diff'] = i1_ag_df['hcount'] - i1_ag_df['avg_hours'] 73 | 74 | # Create data only with pairs of wards and days potential 75 | # towers 
down 76 | i1_ag_df_tower_down = i1_ag_df[i1_ag_df['h_diff'] < htrahshold] 77 | 78 | # Read me text 79 | readme_text = "This file contains a combinations of wards and days that are assumed to have a tower down." 80 | readme_text += "If a day has " + str(abs(htrahshold)) 81 | readme_text += " hours with any calls below the daily avergage for that ward," 82 | readme_text += " it is considered to have a trower down at some point that day." 83 | 84 | # Export 85 | if(EXPORT): 86 | (i1_ag_df_tower_down 87 | .to_csv(OUT_hfcs + 'days_wards_with_low_hours_I1_panel.csv', 88 | index = False) ) 89 | # Read me file 90 | file = open(OUT_hfcs + "days_wards_with_low_hours_I1_README.txt", "w") 91 | file.write(readme_text) 92 | file.close() 93 | -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/preparing_data_for_dashboard/02_clean_telecom_data/clean_i3_subscribers_data.R: -------------------------------------------------------------------------------- 1 | # Clean i3 for Dashboard 2 | 3 | unit <- "adm2" 4 | for(unit in c("adm2", "adm3")){ 5 | 6 | # Load Data / Set Paths ------------------------------------------------------ 7 | df_day <- read.csv(file.path(RAW_INDICATORS, paste0("indicator_03_",unit,"_day_result.csv")), 8 | stringsAsFactors=F) 9 | admin_sp <- readRDS(file.path(GEO_PATH, paste0(unit, ".Rds"))) 10 | 11 | if(unit %in% "adm2"){ 12 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH 13 | } 14 | if(unit %in% "adm3"){ 15 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH 16 | } 17 | 18 | # Daily ---------------------------------------------------------------------- 19 | df_day_clean <- df_day %>% 20 | 21 | tp_standardize_vars("pdate", unit, "totalimei") %>% 22 | 23 | # Clean datset 24 | tp_clean_date() %>% 25 | tp_fill_regions(admin_sp) %>% 26 | tp_complete_date_region() %>% 27 | tp_add_polygon_data(admin_sp) %>% 28 | 29 | # Interpolate/Clean Values 30 | tp_interpolate_outliers(NAs_as_zero = T, outlier_sd=3) %>% 31 | tp_replace_zeros(NAs_as_zero = T) %>% 32 | tp_less15_NA() %>% 33 | 34 | # Percent change 35 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, "i3_daily_base.csv"), 36 | baseline_date = BASELINE_DATE) %>% 37 | tp_add_percent_change() %>% 38 | 39 | # Add labels 40 | tp_add_label_level(timeunit = "day", OD = F) %>% 41 | tp_add_label_baseline(timeunit = "day", OD = F) %>% 42 | 43 | # Add density 44 | mutate(density = value / area) 45 | 46 | ## Export 47 | saveRDS(df_day_clean, file.path(CLEAN_DATA_PATH, "i3_daily.Rds")) 48 | write.csv(df_day_clean, file.path(CLEAN_DATA_PATH, "i3_daily.csv"), row.names=F) 49 | 50 | # Weekly --------------------------------------------------------------------- 51 | print("week") 52 | 53 | df_week_clean <- df_day_clean %>% 54 | 55 | tp_standardize_vars("date", "region", "value") %>% 56 | 57 | # Clean datset 58 | tp_clean_week() %>% 59 | tp_agg_day_to_week(fun = "mean") %>% 60 | tp_complete_date_region() %>% 61 | tp_add_polygon_data(admin_sp) %>% 62 | 63 | # Interpolate/Clean Values 64 | tp_interpolate_outliers(NAs_as_zero = T) %>% 65 | tp_replace_zeros(NAs_as_zero = T) %>% 66 | tp_less15_NA() %>% 67 | 68 | # Percent change 69 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, "i3_weekly_base.csv"), 70 | type = "weekly", 71 | baseline_date = BASELINE_DATE) %>% 72 | tp_add_percent_change() %>% 73 | 74 | # Add labels 75 | tp_add_label_level(timeunit = "week", OD = F) %>% 76 | tp_add_label_baseline(timeunit = "week", OD = F) %>% 77 | 78 | # Add density 79 | mutate(density = value / area) 
80 | 81 | ## Export 82 | saveRDS(df_week_clean, file.path(CLEAN_DATA_PATH, "i3_weekly.Rds")) 83 | write.csv(df_week_clean, file.path(CLEAN_DATA_PATH, "i3_weekly.csv"), row.names=F) 84 | 85 | } 86 | 87 | 88 | -------------------------------------------------------------------------------- /data-panel/utils.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # Panel utils 3 | #-----------------------------------------------------------------# 4 | 5 | import os 6 | import re 7 | import copy 8 | import pandas as pd 9 | import numpy as np 10 | import datetime as dt 11 | 12 | #-----------------------------------------------------------------# 13 | # General functions 14 | 15 | def clean(data, index_cols): 16 | na_list = [np.nan, '', '99999', 99999, float("inf")] 17 | data = data[~data[index_cols].isin(na_list).any(axis ='columns')] 18 | return(data) 19 | 20 | #-----------------------------------------------------------------# 21 | # Clean panel function 22 | 23 | # Remove low usage outliers assuming these are towers down and 24 | # trims columns 25 | 26 | def clean_columns(indicator, timevar): 27 | # Remove comparison columns 28 | keepcols = copy.deepcopy(indicator.index_cols) 29 | keepcols.extend(indicator.panel.filter(like='_p', axis=1).columns.to_list()) 30 | new_df = indicator.panel[keepcols] 31 | # Rename columns 32 | new_df.columns = new_df.columns.str.strip('_p') 33 | # Create time variables 34 | new_df['date'] = pd.to_datetime(new_df[timevar]).dt.date 35 | return new_df 36 | 37 | def remove_towers_down(df, region_vars, outliers_df): 38 | # Process outliers file 39 | # outliers_df = copy.deepcopy(i1_ag_df_tower_down) # created in usage_outliers.py 40 | outliers_df = outliers_df\ 41 | .drop(['hcount', 'avg_hours', 'h_diff'], axis = 1)\ 42 | .rename(columns = {'region':'region_right'}) 43 | outliers_df['flag'] = 1 44 | # Merge outliers 45 | if len(region_vars) == 1: 46 | new_df = df\ 47 | .merge(outliers_df, 48 | left_on = ['date', region_vars[0]], 49 | right_on = ['date', 'region_right'], 50 | how = 'outer')\ 51 | .drop(['region_right'], axis = 1) 52 | else: 53 | new_df = df\ 54 | .merge(outliers_df, 55 | left_on = ['date', region_vars[0]], 56 | right_on = ['date', 'region_right'], 57 | how = 'outer')\ 58 | .drop(['region_right'], axis = 1)\ 59 | .merge(outliers_df, 60 | left_on = ['date', region_vars[1]], 61 | right_on = ['date', 'region_right'], 62 | how = 'outer')\ 63 | .drop(['region_right'], axis = 1) 64 | # Flag if either is true 65 | new_df['flag'] = ((new_df['flag_x'] == 1) | (new_df['flag_y'] == 1)).astype(int) 66 | new_df = new_df.drop(['flag_x', 'flag_y'], axis =1) 67 | # Drop outliers and processual columns 68 | new_df = new_df[~(new_df['flag'] == 1)].drop(['flag'], axis = 1) 69 | return new_df 70 | 71 | def clean_pipeline(indicator, timevar, region_vars, outliers_df): 72 | return remove_towers_down( 73 | clean_columns(indicator, 74 | timevar = timevar), 75 | region_vars = region_vars, 76 | outliers_df = outliers_df) -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/preparing_data_for_dashboard/02_clean_telecom_data/clean_i5_net_movement_data.R: -------------------------------------------------------------------------------- 1 | # Clean Subscribers Data 2 | 3 | # Depends on: clean_movement_inout_data.R 4 | 5 | unit <- "adm2" 6 | timeunit <- "daily" 7 | for(unit in c("adm2", "adm3")){ 8 | for(timeunit in 
c("daily", "weekly")){ 9 | 10 | print(paste(unit, timeunit, "--------------------------------------------")) 11 | 12 | # Set parameters ------------------------------------------------------------- 13 | admin_sp <- readRDS(file.path(GEO_PATH, paste0(unit, ".Rds"))) 14 | 15 | if(unit %in% "adm2"){ 16 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH 17 | } 18 | 19 | if(unit %in% "adm3"){ 20 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH 21 | } 22 | 23 | # Clean ---------------------------------------------------------------------- 24 | df <- readRDS(file.path(CLEAN_DATA_PATH, 25 | paste0("i5_", 26 | timeunit, 27 | ".Rds"))) %>% 28 | as.data.table() 29 | 30 | ## Aggregate Origin 31 | df_orign <- df[, .(value = sum(value, na.rm=T)), 32 | by = list(region_origin, date)] 33 | 34 | names(df_orign)[names(df_orign) %in% "region_origin"] <- "region" 35 | names(df_orign)[names(df_orign) %in% "value"] <- "value_origin" 36 | 37 | ## Aggregate Destination 38 | df_dest <- df[, .(value = sum(value, na.rm=T)), 39 | by = list(region_dest, date)] 40 | 41 | names(df_dest)[names(df_dest) %in% "region_dest"] <- "region" 42 | names(df_dest)[names(df_dest) %in% "value"] <- "value_dest" 43 | 44 | ## Merge 45 | df_day_clean <- merge(df_orign, df_dest, by=c("region", "date")) %>% 46 | as.data.frame() 47 | 48 | ## Prep data 49 | df_day_clean <- df_day_clean %>% 50 | 51 | dplyr::mutate(value = value_dest - value_origin) %>% 52 | 53 | tp_standardize_vars("date", "region", "value") %>% 54 | 55 | # Clean Data 56 | tp_fill_regions(admin_sp) %>% 57 | tp_complete_date_region() %>% 58 | tp_add_polygon_data(admin_sp) %>% 59 | 60 | # Percent change 61 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, 62 | paste0("i5_net_",timeunit,"_base.csv")), 63 | type = timeunit) %>% 64 | tp_add_percent_change() %>% 65 | 66 | # Add labels 67 | tp_add_label_level(timeunit = timeunit, OD = F) %>% 68 | tp_add_label_baseline(timeunit = timeunit, OD = F) 69 | 70 | 71 | ## Export 72 | saveRDS(df_day_clean, file.path(CLEAN_DATA_PATH, 73 | paste0("i5_net_", 74 | timeunit, 75 | ".Rds"))) 76 | 77 | write.csv(df_day_clean, file.path(CLEAN_DATA_PATH, 78 | paste0("i5_net_", 79 | timeunit, 80 | ".csv")), 81 | row.names=F) 82 | 83 | 84 | 85 | } 86 | } 87 | 88 | 89 | -------------------------------------------------------------------------------- /data-panel/Archive/02_clean.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # PANEL CLEAN 3 | #-----------------------------------------------------------------# 4 | 5 | #-----------------------------------------------------------------# 6 | # Settings 7 | 8 | import pandas as pd 9 | 10 | EXPORT = False 11 | 12 | # Number of hours below avg, used as a trashold to 13 | # define a tower down 14 | htrahshold = -3 15 | 16 | #-----------------------------------------------------------------# 17 | # Import data 18 | 19 | i1 = pd.read_csv( DATA_panel + 'i1_admin3.csv') 20 | 21 | #-----------------------------------------------------------------# 22 | # Process data 23 | 24 | i1['date'] = pd.to_datetime(i1['hour']).dt.date 25 | i1['hour_int'] = pd.to_datetime(i1['hour']).dt.hour 26 | 27 | 28 | 29 | #-----------------------------------------------------------------# 30 | # USAGE OUTILERS: Wards with very little data 31 | 32 | # Number of observations per ward that is total number of hours 33 | i1freq = i1.groupby('region').size() 34 | 35 | i1freq = i1freq.reset_index() 36 | i1freq.columns = ['region', 
'freq'] 37 | 38 | # Select wards with less than 12h on average 39 | i1_low_total_hours = i1freq[i1freq['freq'] < (12*i1.date.nunique())] 40 | 41 | i1_low_total_hours = i1_low_total_hours\ 42 | .rename(columns = {'freq' : 'total_hours'}) 43 | # # Proportion of wards with at least one tower down 44 | # freq[freq < 1392].count()/len(set(i1['region'])) 45 | 46 | # # Proportion of wards with very 47 | # freq[freq < 700].count() 48 | # freq[freq < 700].count()/len(set(i1['region'])) 49 | 50 | # Export 51 | if(EXPORT): 52 | (i1_low_total_hours 53 | .to_csv(OUT_hfcs + 'wards_with_low_hours_I1.csv', 54 | index = False) ) 55 | 56 | #-----------------------------------------------------------------# 57 | # USAGE OUTILERS: Indicator wards and days with towers down 58 | 59 | # Number of hours with transactions per region day 60 | hours_per_day = i1.groupby(['region', 'date']).size() 61 | 62 | hours_per_day = hours_per_day.reset_index() # ger regions to be a column 63 | hours_per_day.columns = ['region', 'date', 'hcount'] 64 | 65 | 66 | # Average hours per day per region 67 | avg_hours = (hours_per_day.groupby(['region']) 68 | .mean() 69 | .rename(columns={'hcount' :'avg_hours' })) 70 | 71 | # Create region day data set 72 | i1_ag_df = hours_per_day.merge(avg_hours, 73 | on = 'region') 74 | 75 | # Difference from average usage per hour 76 | i1_ag_df['h_diff'] = i1_ag_df['hcount'] - i1_ag_df['avg_hours'] 77 | 78 | # Create data only with pairs of wards and days potential 79 | # towers down 80 | i1_ag_df_tower_down = i1_ag_df[i1_ag_df['h_diff'] < htrahshold] 81 | 82 | # Read me text 83 | readme_text = "This file contains a combinations of wards and days that are assumed to have a tower down." 84 | readme_text += "If a day has " + str(abs(htrahshold)) 85 | readme_text += " hours with any calls below the daily avergage for that ward," 86 | readme_text += " it is considered to have a trower down at some point that day." 87 | 88 | # Export 89 | if(EXPORT): 90 | (i1_ag_df_tower_down 91 | .to_csv(OUT_hfcs + 'days_wards_with_low_hours_I1_panel.csv', 92 | index = False) ) 93 | # Read me file 94 | file = open(OUT_hfcs + "days_wards_with_low_hours_I1_README.txt", "w") 95 | file.write(readme_text) 96 | file.close() -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ######################################################################## 2 | # 3 | # Based on DIME .gitignore template. 
Follow the instructions in the URL 4 | # below to set up this template in your own repository 5 | # https://github.com/worldbank/dime-github-trainings/tree/master/GitHub-resources/DIME-GitHub-Templates 6 | # 7 | # Note that if you are using GitKraken, you need to use version 5.x or more 8 | # recent for this template to work properly 9 | # 10 | ######################################################################## 11 | 12 | ####################### 13 | # Start by ignoring everything, and below we are explicitly saying 14 | # what to not ignore 15 | * 16 | 17 | ####################### 18 | # List of files with GitHub functionality anywhere in the repo 19 | # that we do not want to ignore 20 | 21 | # These files include GitHub settings 22 | !.gitignore 23 | !.gitattributes 24 | 25 | # Keep markdown files used for documentation on GitHub 26 | !README.md 27 | !CONTRIBUTING.md 28 | !LICENSE* 29 | 30 | ####################### 31 | # For performance reasons, if a folder is already ignored, then 32 | # GitHub does not check the content for that folder for matches 33 | # with additional rules. The line below includes folder in the 34 | # top folder (but not their content), so that anything matching 35 | # the rules below will still not be ignored. 36 | !*/ 37 | 38 | ####################### 39 | # The following file types are code that should always be 40 | # included no matter where in the repository folder they are 41 | # located unless you explicitly ignore that folder 42 | 43 | # Stata 44 | !/**/*.do 45 | !/**/*.ado 46 | 47 | # R 48 | !/**/*.R 49 | !/**/*.Rmd 50 | 51 | # LaTeX 52 | !/**/*.tex 53 | !/**/*.bib 54 | 55 | # Python 56 | !/**/*.py 57 | !/**/*.ipynb 58 | # Still ignore .ipynb files in checkpoint folders 59 | .ipynb_checkpoints 60 | 61 | # Matlab 62 | !/**/*.m 63 | 64 | # Markdown 65 | !/**/*.md 66 | 67 | # Julia 68 | !/**/*.jl 69 | 70 | # CSS 71 | !/**/*.css 72 | 73 | # Docker 74 | !/**/*.yml 75 | !/**/docker/* 76 | 77 | ####################### 78 | # Include some additional file formats in any output folder. You might have 79 | # to change the name of the Output folder to whatever it is called in your 80 | # project, but we strongly recommend that you only include these files in 81 | # a subset of the folders where you are certain no private data is ever stored. 82 | !/**/Output/**/*.txt 83 | !/**/Output/**/*.csv 84 | !/**/Output/**/*.xml 85 | !/**/Output/**/*.eps 86 | !/**/Output/**/*.svg 87 | 88 | ####################### 89 | # Include all the files with passwords or tokens here. All files named 90 | # password or passwords are with this template ignored no matter which 91 | # format you are using. Additionally, all content in any folder called 92 | # password or passwords are also ignored. NOTE that your project might be 93 | # using different names and then you must edit the lines below accordingly. 94 | password.* 95 | passwords.* 96 | password/ 97 | passwords/ 98 | 99 | generate_password.R 100 | generate_password* 101 | 102 | 103 | 104 | ####################### 105 | # Explicitly exclude data methods and sources description which should 106 | # be kept private. These are already excluded from above lines, just 107 | # including in case change. 
108 | data_methods.txt 109 | data_source_description.txt 110 | /**/notebooks/ignored_scripts* 111 | /**/config_file.py 112 | /**/**/.DS_Storec 113 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/flowminder_aggregator.py: -------------------------------------------------------------------------------- 1 | import os 2 | if os.environ['HOME'] != '/root': 3 | from modules.DataSource import * 4 | from modules.sql_code_aggregates import * 5 | from modules.aggregator import * 6 | databricks = False 7 | else: 8 | databricks = True 9 | 10 | # Databricks notebook source 11 | class flowminder_aggregator(aggregator): 12 | """Class to handle sql aggregations of flowminder code. 13 | For the original sql code from flowminder see https://github.com/Flowminder/COVID-19 14 | 15 | Attributes 16 | ---------- 17 | result_stub : a string. File path where to save results 18 | datasource : an instance of DataSource class. Holds all dataframes and paths required 19 | regions : a pyspark dataframe. Admin level this aggregator will be used for 20 | intermediate_tables : a list. Names of tables that we don't want written to csv 21 | calls : a pyspark dataframe. pyspcdr data 22 | cells : a pyspark dataframe. admin region to tower mapping 23 | spark : an initialised spark connection. spark connection this aggregator should use 24 | dates : a dictionary. dates the aggregator should run over 25 | sql_code : a string. the flowminder sql code to be used 26 | 27 | 28 | Methods 29 | ------- 30 | run_and_save_all(table_name) 31 | runs run_and_save on the list of all flowminder queries at once 32 | 33 | run_save_and_rename_all() 34 | runs run_and_save_all and then renames the csv files created and 35 | moves them to their parent folder 36 | 37 | attempt_aggregation(indicators_to_produce = 'all', no_of_attempts = 4) 38 | - attempts aggregation of all flowminder indicators 39 | - tries mutiple times (this is relevant for databricks env, 40 | but should be dropped going forward and replaced by a more 41 | solid handling of databricks timeouts) 42 | 43 | 44 | """ 45 | 46 | def __init__(self, 47 | result_stub, 48 | datasource, 49 | regions, 50 | intermediate_tables = ['home_locations']): 51 | """ 52 | Parameters 53 | ---------- 54 | result_stub : where to save results 55 | datasource : holds all dataframes and paths required 56 | regions : admin level this aggregator will be used for 57 | intermediate_tables : tables that we don't want written to csv 58 | """ 59 | # initiate with parent init 60 | super().__init__(result_stub,datasource,regions) 61 | 62 | def run_and_save_all(self): 63 | for table_name in self.table_names: 64 | df = self.spark.sql(self.sql_code[table_name]) 65 | self.save_and_report(df, table_name) 66 | 67 | def run_save_and_rename_all(self): 68 | self.run_and_save_all() 69 | self.rename_all_csvs() 70 | 71 | 72 | def attempt_aggregation(self, indicators_to_produce = 'all'): 73 | try: 74 | # all indicators 75 | if indicators_to_produce == 'all': 76 | self.run_save_and_rename_all() 77 | 78 | # single indicator 79 | else: 80 | for table in indicators_to_produce.keys(): 81 | table_name = indicators_to_produce[table] 82 | print('--> Producing: ' + table_name) 83 | self.run_save_and_rename(table_name + '_per_' + indicators_to_produce[table_name]) 84 | print('Indicators saved.') 85 | 86 | except Exception as e: 87 | print(e) 88 | -------------------------------------------------------------------------------- 
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/02_clean_telecom_data/clean_i5_movement_inout_data.R: -------------------------------------------------------------------------------- 1 | # Clean i5 Data for Dashboard 2 | 3 | EXPORT <- T 4 | 5 | unit = "adm2" 6 | for(unit in c("adm2", "adm3")){ 7 | 8 | # Load Data / Set Paths ------------------------------------------------------ 9 | df_day <- read.csv(file.path(RAW_INDICATORS, paste0("indicator_05_",unit,"_day_result.csv")), 10 | stringsAsFactors=F) 11 | admin_sp <- readRDS(file.path(GEO_PATH, paste0(unit, ".Rds"))) 12 | 13 | if(unit %in% "adm2"){ 14 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH 15 | } 16 | 17 | if(unit %in% "adm3"){ 18 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH 19 | } 20 | 21 | #### Remove small observations 22 | # If less than 15, make NA. Doing this now removes some region-pairs. For 23 | # example, if a o-d pair has a value less than 15 for every time period, 24 | # we don't considered here and helps improve code speed both here and in 25 | # the script to prepare data for dashboard. 26 | df_day <- df_day[df_day$totalOD > 15,] 27 | 28 | # Daily ---------------------------------------------------------------------- 29 | #### Process data for dashboard 30 | df_day_clean <- df_day %>% 31 | 32 | tp_standardize_vars_od("pdate", 33 | unit, 34 | paste0("N_", unit), 35 | "totalOD") %>% 36 | 37 | # Clean datset 38 | tp_clean_date() %>% 39 | tp_complete_date_region_od() %>% 40 | tp_add_polygon_data_od(admin_sp) %>% 41 | 42 | # Interpolate/Clean Values 43 | tp_interpolate_outliers(NAs_as_zero = F) %>% 44 | #tp_replace_zeros(NAs_as_zero = T) %>% 45 | tp_less15_NA() %>% 46 | 47 | # Percent change 48 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, "i5_daily_base.csv"), 49 | baseline_date = BASELINE_DATE) %>% 50 | tp_add_percent_change() %>% 51 | 52 | # Add labels 53 | tp_add_label_level(timeunit = "day", OD = T) %>% 54 | tp_add_label_baseline(timeunit = "day", OD = T) 55 | 56 | ## Export 57 | saveRDS(df_day_clean, file.path(CLEAN_DATA_PATH, "i5_daily.Rds")) 58 | write.csv(df_day_clean, file.path(CLEAN_DATA_PATH, "i5_daily.csv"), row.names=F) 59 | 60 | # Weekly --------------------------------------------------------------------- 61 | print("week") 62 | 63 | df_week_clean <- df_day_clean %>% 64 | 65 | dplyr::select(date, region_origin, region_dest, value) %>% 66 | 67 | tp_standardize_vars_od("date", "region_origin", "region_dest", "value") %>% 68 | 69 | # Clean datset 70 | tp_clean_week() %>% 71 | tp_agg_day_to_week_od() %>% 72 | tp_complete_date_region_od() %>% 73 | tp_add_polygon_data_od(admin_sp) %>% 74 | 75 | # Interpolate/Clean Values 76 | #tp_interpolate_outliers(NAs_as_zero = F) %>% 77 | #tp_replace_zeros(NAs_as_zero = T) %>% 78 | tp_less15_NA() %>% 79 | 80 | # Percent change 81 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, "i5_weekly_base.csv"), 82 | type = "weekly", 83 | baseline_date = BASELINE_DATE) %>% 84 | tp_add_percent_change() %>% 85 | 86 | # Add labels 87 | tp_add_label_level(timeunit = "week", OD = T) %>% 88 | tp_add_label_baseline(timeunit = "week", OD = T) 89 | 90 | ## Export 91 | saveRDS(df_week_clean, file.path(CLEAN_DATA_PATH, "i5_weekly.Rds")) 92 | write.csv(df_week_clean, file.path(CLEAN_DATA_PATH, "i5_weekly.csv"), row.names=F) 93 | 94 | } 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /data-checks/Archive/quick_checks/check_subscribers.R: 
-------------------------------------------------------------------------------- 1 | # Check subscribers data 2 | 3 | FIG_PATH <- file.path(PROJECT_PATH, "proof-of-concept", 4 | "outputs", "data-checks", "figures_indicators", "subscribers_daily") 5 | 6 | # Load Data -------------------------------------------------------------------- 7 | # FILE PATHS NEED TO BE UPDATED 8 | ISAAC_DATA_PATH_2 <- file.path(PROJECT_PATH, "Isaac-results", "Isaac_apr_may", "admin2_flowminder") 9 | ISAAC_DATA_PATH_3 <- file.path(PROJECT_PATH, "Isaac-results", "Isaac_apr_may", "admin3_flowminder") 10 | 11 | #### Raw Data 12 | df_day_adm2_raw <- read.csv(file.path(ISAAC_DATA_PATH_2, 13 | "count_unique_subscribers_per_region_per_day.csv"), 14 | stringsAsFactors=F) %>% 15 | dplyr::rename(value_raw = subscriber_count, 16 | date = visit_date) %>% 17 | dplyr::mutate(region = region %>% as.character(), 18 | date = date %>% as.Date()) 19 | 20 | df_week_adm2_raw <- read.csv(file.path(ISAAC_DATA_PATH_2, 21 | "count_unique_subscribers_per_region_per_week.csv"), 22 | stringsAsFactors=F) %>% 23 | dplyr::rename(value_raw = subscriber_count, 24 | date = visit_week) %>% 25 | dplyr::mutate(region = region %>% as.character()) 26 | 27 | df_day_adm3_raw <- read.csv(file.path(ISAAC_DATA_PATH_3, 28 | "count_unique_subscribers_per_region_per_day.csv"), 29 | stringsAsFactors=F) %>% 30 | dplyr::rename(value_raw = subscriber_count, 31 | date = visit_date) %>% 32 | dplyr::mutate(region = region %>% as.character(), 33 | date = date %>% as.Date()) 34 | 35 | df_week_adm3_raw <- read.csv(file.path(ISAAC_DATA_PATH_3, 36 | "count_unique_subscribers_per_region_per_week.csv"), 37 | stringsAsFactors=F) %>% 38 | dplyr::rename(value_raw = subscriber_count, 39 | date = visit_week) %>% 40 | dplyr::mutate(region = region %>% as.character()) 41 | 42 | #### Cleaned Data 43 | df_day_adm2 <- readRDS(file.path(CLEAN_DATA_ADM2_PATH, 44 | "count_unique_subscribers_per_region_per_day.Rds")) %>% 45 | left_join(df_day_adm2_raw, by=c("date", "region")) 46 | 47 | df_week_adm2 <- readRDS(file.path(CLEAN_DATA_ADM2_PATH, 48 | "count_unique_subscribers_per_region_per_week.Rds")) 49 | 50 | df_day_adm3 <- readRDS(file.path(CLEAN_DATA_ADM3_PATH, 51 | "count_unique_subscribers_per_region_per_day.Rds")) %>% 52 | left_join(df_day_adm3_raw, by=c("date", "region")) %>% 53 | mutate(value_raw = value_raw %>% as.numeric()) 54 | 55 | df_week_adm3 <- readRDS(file.path(CLEAN_DATA_ADM3_PATH, 56 | "count_unique_subscribers_per_region_per_week.Rds")) 57 | 58 | # Trends Over Time ------------------------------------------------------------- 59 | df_day_adm2 %>% 60 | group_by(date) %>% 61 | summarise(value = sum(value), 62 | value_raw = sum(value_raw)) %>% 63 | ggplot() + 64 | geom_line(aes(x=date, y=value), color="black") + 65 | geom_point(aes(x=date, y=value), color="black") + 66 | geom_vline(xintercept = as.Date("2020-03-27"), color="red") 67 | 68 | lapply(unique(df_day_adm3$province), function(province_i){ 69 | print(province_i) 70 | 71 | p <- df_day_adm3 %>% 72 | filter(province %in% province_i) %>% 73 | ggplot(aes(x=date)) + 74 | geom_line(aes(y=value_raw), color="red", alpha=0.2, size=1.5) + 75 | geom_line(aes(y=value)) + 76 | facet_wrap(~region, 77 | scales = "free_y") 78 | ggsave(p, filename = file.path(FIG_PATH, paste0(province_i, ".png")), height = 25, width = 25) 79 | 80 | return(NULL) 81 | }) 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- 
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/02_clean_telecom_data/clean_i7_distance_traveled.R: -------------------------------------------------------------------------------- 1 | # Clean Subscribers Data 2 | 3 | unit = "adm2" 4 | metric = "avg_dist" 5 | for(unit in c("adm2", "adm3")){ 6 | for(metric in c("avg_dist", "stddev")){ 7 | 8 | print(paste(unit, metric, "---------------------------------------------")) 9 | 10 | # Load Data / Set Paths ------------------------------------------------------ 11 | df_day <- read.csv(file.path(RAW_INDICATORS, paste0("indicator_07_home_",unit,"_day_result.csv")), 12 | stringsAsFactors=F) 13 | admin_sp <- readRDS(file.path(GEO_PATH, paste0(unit, ".Rds"))) 14 | 15 | if(unit %in% "adm2"){ 16 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH 17 | 18 | df_day <- clean_moz_names(df_day, 19 | name = "H_adm2", 20 | name_higher = "H_adm1", 21 | type = "adm2") 22 | 23 | } 24 | if(unit %in% "adm3"){ 25 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH 26 | 27 | df_day <- clean_moz_names(df_day, 28 | name = "H_adm3", 29 | name_higher = "H_adm2", 30 | type = "adm3") 31 | 32 | } 33 | 34 | # Daily ---------------------------------------------------------------------- 35 | print("day") 36 | 37 | df_day_clean <- df_day %>% 38 | 39 | tp_standardize_vars("pdate", paste0("H_", unit), metric) %>% 40 | 41 | # Clean datset 42 | tp_clean_date() %>% 43 | tp_fill_regions(admin_sp) %>% 44 | tp_complete_date_region() %>% 45 | tp_add_polygon_data(admin_sp) %>% 46 | 47 | # Interpolate/Clean Values 48 | tp_interpolate_outliers(NAs_as_zero = T, outlier_replace="both") %>% 49 | tp_replace_zeros(NAs_as_zero = T) %>% 50 | tp_less15_NA(threshold = 0) %>% 51 | 52 | # Percent change 53 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, paste0("i7_",metric,"_daily_base.csv"))) %>% 54 | tp_add_percent_change() %>% 55 | 56 | # Add labels 57 | tp_add_label_level(timeunit = "day", OD = F) %>% 58 | tp_add_label_baseline(timeunit = "day", OD = F) 59 | 60 | ## Export 61 | saveRDS(df_day_clean, file.path(CLEAN_DATA_PATH, paste0("i7_daily_",metric,".Rds"))) 62 | write.csv(df_day_clean, file.path(CLEAN_DATA_PATH, paste0("i7_daily_",metric,".csv")), row.names=F) 63 | 64 | 65 | # Weekly --------------------------------------------------------------------- 66 | print("week") 67 | 68 | df_week_clean <- df_day_clean %>% 69 | 70 | dplyr::select(date, region, value) %>% 71 | 72 | tp_standardize_vars("date", "region", "value") %>% 73 | 74 | # Clean datset 75 | tp_clean_week() %>% 76 | tp_agg_day_to_week(fun="mean") %>% 77 | tp_fill_regions(admin_sp) %>% 78 | tp_complete_date_region() %>% 79 | tp_add_polygon_data(admin_sp) %>% 80 | 81 | # Interpolate/Clean Values 82 | #tp_interpolate_outliers(NAs_as_zero = T) %>% 83 | #tp_replace_zeros(NAs_as_zero = T) %>% 84 | #tp_less15_NA() %>% 85 | 86 | # Percent change 87 | tp_add_baseline_comp_stats(file_name = file.path(CLEAN_DATA_PATH, paste0("i7_",metric,"_weekly_base.csv")), 88 | type = "weekly") %>% 89 | tp_add_percent_change() %>% 90 | 91 | # Add labels 92 | tp_add_label_level(timeunit = "week", OD = F) %>% 93 | tp_add_label_baseline(timeunit = "week", OD = F) 94 | 95 | 96 | ## Export 97 | saveRDS(df_week_clean, file.path(CLEAN_DATA_PATH, 98 | paste0("i7_weekly_",metric,".Rds"))) 99 | write.csv(df_week_clean, file.path(CLEAN_DATA_PATH, 100 | paste0("i7_weekly_",metric,".csv")), 101 | row.names=F) 102 | 103 | 104 | } 105 | } 106 | 107 | -------------------------------------------------------------------------------- 
/dashboard-dataviz/dashboard/preparing_data_for_dashboard/_dash_master.R: -------------------------------------------------------------------------------- 1 | # Master R Script for Prepping Data for Dashboard 2 | # Mozambique 3 | 4 | #### Settings #### ============================================================= 5 | options(rsconnect.max.bundle.files = 400000) 6 | 7 | CLEAN_SPATIAL_DATA <- F 8 | CLEAN_TELECOM_DATA <- F 9 | PREP_DATA_FOR_DASH <- T 10 | 11 | BASELINE_DATE <- "2020-03-31" 12 | 13 | #### Packages #### ============================================================= 14 | library(tidyverse) 15 | library(sparkline) 16 | library(sf) 17 | library(sp) 18 | library(plotly) 19 | library(stargazer) 20 | library(knitr) 21 | library(gridExtra) 22 | library(leaflet) 23 | library(ggpubr) 24 | library(purrr) 25 | library(parallel) 26 | library(pbmcapply) 27 | library(rgeos) 28 | library(rgdal) 29 | library(sp) 30 | library(rmapshaper) 31 | library(raster) 32 | library(geosphere) 33 | library(lubridate) 34 | library(data.table) 35 | library(mapview) 36 | library(bcrypt) 37 | 38 | #### File paths #### =========================================================== 39 | 40 | # Define Root Paths ------------------------------------------------------------ 41 | if(Sys.info()[["user"]] == "robmarty") PROJECT_PATH <- "~/Documents/World Bank/Sveta Milusheva - COVID 19 Results" 42 | if(Sys.info()[["user"]] == "wb519128") PROJECT_PATH <- "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results" 43 | if(Sys.info()[["user"]] == "WB521633") PROJECT_PATH <- "C:/Users/wb521633/WBG/Sveta Milusheva - COVID 19 Results" 44 | 45 | if(Sys.info()[["user"]] == "robmarty") GITHUB_PATH <- "~/Documents/Github/covid-mobile-dashboards" 46 | if(Sys.info()[["user"]] == "wb519128") GITHUB_PATH <- "C:/Users/wb519128/Github/covid-mobile-dashboards" 47 | if(Sys.info()[["user"]] == "WB521633") GITHUB_PATH <- "C:/Users/wb521633/Documents/Github/covid-mobile-dashboards" 48 | 49 | # Define Paths from Root ------------------------------------------------------- 50 | GADM_PATH <- "PATH-HERE" 51 | GEO_PATH <- "PATH-HERE" 52 | 53 | CLEAN_DATA_ADM2_PATH <- "PATH-HERE" 54 | CLEAN_DATA_ADM3_PATH <- "PATH-HERE" 55 | 56 | DASHBOARD_DATA_ONEDRIVE_PATH <- "PATH-HERE" 57 | DASHBOARD_DATA_GITHUB_PATH <- "PATH-HERE" 58 | 59 | PREP_DATA_CODE_PATH <- "PATH-HERE" 60 | 61 | #### Functions #### ============================================================ 62 | source(file.path(GITHUB_PATH, "dashboard-dataviz", "dashboards", 63 | "_tp_functions.R")) 64 | 65 | source(file.path(GITHUB_PATH, "dashboard-dataviz", "dashboards", 66 | "_prep_data_for_dash_functions.R")) 67 | 68 | 69 | #### Scripts #### ============================================================== 70 | 71 | # 1. Prepare Spatial Data ------------------------------------------------------ 72 | if(CLEAN_SPATIAL_DATA){ 73 | source(file.path(PREP_DATA_CODE_PATH, "01_clean_spatial_data", "download_gadm.R")) 74 | source(file.path(PREP_DATA_CODE_PATH, "01_clean_spatial_data", "clean_adm2_file.R")) 75 | source(file.path(PREP_DATA_CODE_PATH, "01_clean_spatial_data", "clean_adm3_file.R")) 76 | } 77 | 78 | # 2. 
Prepare Spatial Data ------------------------------------------------------ 79 | if(CLEAN_TELECOM_DATA){ 80 | source(file.path(PREP_DATA_CODE_PATH, "02_clean_telecom_data", "clean_i3_subscribers_data.R")) 81 | source(file.path(PREP_DATA_CODE_PATH, "02_clean_telecom_data", "clean_i5_movement_inout_data.R")) 82 | source(file.path(PREP_DATA_CODE_PATH, "02_clean_telecom_data", "clean_i5_net_movement_data.R")) 83 | source(file.path(PREP_DATA_CODE_PATH, "02_clean_telecom_data", "clean_i7_distance_traveled.R")) 84 | } 85 | 86 | # 3. Prep Data for Dashboard --------------------------------------------------- 87 | if(PREP_DATA_FOR_DASH){ 88 | source(file.path(PREP_DATA_CODE_PATH, "03_dashboard_data_prep", "prep_subs_obs_totals_data.R")) 89 | source(file.path(PREP_DATA_CODE_PATH, "03_dashboard_data_prep", "prep_telecom_agg_data.R")) 90 | source(file.path(PREP_DATA_CODE_PATH, "03_dashboard_data_prep", "data_to_github.R")) 91 | } 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /data-checks/Archive/usage_outliers.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # Outliers and towers down 3 | #-----------------------------------------------------------------# 4 | 5 | # This code depends on MASTER.py to run as file path objects are 6 | # defined there 7 | 8 | 9 | #-----------------------------------------------------------------# 10 | # TO DO: 11 | 12 | # Identify regions with very sparse use 13 | # 1. Count obs per region 14 | # 2. Count obs per region per day 15 | 16 | # Identify regions with normal use and big valleys of usage, that 17 | # would probably indicate a tower being down 18 | 19 | #-----------------------------------------------------------------# 20 | # Settings 21 | 22 | import pandas as pd 23 | 24 | EXPORT = False 25 | TEMP_PANEL = True 26 | # Number of hours below avg, used as a trashold to 27 | # define a tower down 28 | htrahshold = -3 29 | 30 | #-----------------------------------------------------------------# 31 | # Import data 32 | 33 | if TEMP_PANEL: 34 | i1 = pd.read_csv( OUT_hfcs + 'Sheet comp panel/i1_admin3.csv') 35 | else: 36 | i1 = pd.read_csv(I1_Adm3_path + "transactions_per_hour.csv") 37 | 38 | i1 = i1[i1.region != '99999'] 39 | # Wards data 40 | 41 | 42 | 43 | # Hourly transactions per region 44 | 45 | # Unique subscribers per hour 46 | # i2a3 = pd.read_csv(I2_Adm3_path + "unique_subscribers_per_hour.csv") 47 | # i2t = pd.read_csv(I2_towercluster_path + "unique_subscribers_per_hour.csv") 48 | 49 | 50 | #-----------------------------------------------------------------# 51 | # Process data 52 | 53 | i1['date'] = pd.to_datetime(i1['hour']).dt.date 54 | i1['hour_int'] = pd.to_datetime(i1['hour']).dt.hour 55 | 56 | 57 | #-----------------------------------------------------------------# 58 | # Wards with very little data 59 | 60 | # Number of observations per ward that is total number of hours 61 | i1freq = i1.groupby('region').size() 62 | 63 | i1freq = i1freq.reset_index() 64 | i1freq.columns = ['region', 'freq'] 65 | 66 | # Select wards with less than 12h on average 67 | i1_low_total_hours = i1freq[i1freq['freq'] < (12*i1.date.nunique())] 68 | 69 | i1_low_total_hours = i1_low_total_hours\ 70 | .rename(columns = {'freq' : 'total_hours'}) 71 | # # Proportion of wards with at least one tower down 72 | # freq[freq < 
1392].count()/len(set(i1['region'])) 73 | 74 | # # Proportion of wards with very 75 | # freq[freq < 700].count() 76 | # freq[freq < 700].count()/len(set(i1['region'])) 77 | 78 | # Export 79 | if(EXPORT): 80 | (i1_low_total_hours 81 | .to_csv(OUT_hfcs + 'wards_with_low_hours_I1.csv', 82 | index = False) ) 83 | 84 | #-----------------------------------------------------------------# 85 | # Indicator wards and days with towers down 86 | 87 | # Number of hours with transactions per region day 88 | hours_per_day = i1.groupby(['region', 'date']).size() 89 | 90 | hours_per_day = hours_per_day.reset_index() # ger regions to be a column 91 | hours_per_day.columns = ['region', 'date', 'hcount'] 92 | 93 | 94 | # Average hours per day per region 95 | avg_hours = (hours_per_day.groupby(['region']) 96 | .mean() 97 | .rename(columns={'hcount' :'avg_hours' })) 98 | 99 | # Create region day data set 100 | i1_ag_df = hours_per_day.merge(avg_hours, 101 | on = 'region') 102 | 103 | # Difference from average usage per hour 104 | i1_ag_df['h_diff'] = i1_ag_df['hcount'] - i1_ag_df['avg_hours'] 105 | 106 | # Create data only with pairs of wards and days potential 107 | # towers down 108 | i1_ag_df_tower_down = i1_ag_df[i1_ag_df['h_diff'] < htrahshold] 109 | 110 | # Read me text 111 | readme_text = "This file contains a combinations of wards and days that are assumed to have a tower down." 112 | readme_text += "If a day has " + str(abs(htrahshold)) 113 | readme_text += " hours with any calls below the daily avergage for that ward," 114 | readme_text += " it is considered to have a trower down at some point that day." 115 | 116 | # Export 117 | if(EXPORT): 118 | (i1_ag_df_tower_down 119 | .to_csv(OUT_hfcs + 'days_wards_with_low_hours_I1_panel.csv', 120 | index = False) ) 121 | # Read me file 122 | file = open(OUT_hfcs + "days_wards_with_low_hours_I1_README.txt", "w") 123 | file.write(readme_text) 124 | file.close() 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /data-checks/Archive/i10-check.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # dbutils.fs.ls('/mnt/') 3 | # dbutils.fs.refreshMounts() 4 | 5 | # COMMAND ---------- 6 | 7 | import pyspark.sql.functions as F 8 | from pyspark.sql.functions import to_timestamp 9 | from pyspark.sql.types import * 10 | from pyspark.sql.window import Window 11 | 12 | # Constat definitions 13 | privacy_filter = 15 14 | missing_value_code = 99999 15 | cutoff_days = 7 16 | max_duration = 21 17 | 18 | user_window = Window\ 19 | .partitionBy('msisdn').orderBy('call_datetime') 20 | 21 | 22 | # COMMAND ---------- 23 | 24 | # dbutils.fs.ls('/mnt/COVID19Data/Sveta Milusheva - mar20') 25 | base_path = '/mnt/COVID19Data/Sveta Milusheva - mar20/' 26 | geo_path = 'mnt/COVID19Data/proof-of-concept/support-data/geo-files/' 27 | 28 | # COMMAND ---------- 29 | 30 | # Load tower mapping to districts 31 | cells = spark.read.format("csv")\ 32 | .option("header", "true")\ 33 | .load(geo_path + 'zw_admin3_tower_map.csv') 34 | 35 | # COMMAND ---------- 36 | 37 | cells.show() 38 | 39 | # COMMAND ---------- 40 | 41 | # Set default schema 42 | schema = StructType([ 43 | StructField("msisdn", IntegerType(), True), 44 | StructField("call_datetime", StringType(), True), #load as string, will be turned into datetime in standardize_csv_files() 45 | StructField("location_id", StringType(), True) 46 | ]) 47 | 48 | # Import one day at a time 49 | 50 | mar20 = 
spark.read.format("csv")\ 51 | .option("header", "true")\ 52 | .load(base_path + 'MOH_EWZ_20200320.csv', schema = schema) 53 | 54 | mar21 = spark.read.format("csv")\ 55 | .option("header", "true")\ 56 | .load(base_path + 'MOH_EWZ_20200320.csv', schema = schema) 57 | 58 | 59 | 60 | # COMMAND ---------- 61 | 62 | # Process data 63 | 64 | def create_vars(df, cells): 65 | # Loading variables 66 | df = df.withColumn("call_datetime", to_timestamp("call_datetime","dd/MM/yyyy HH:mm:ss")) 67 | #get call_date from call_datetime 68 | df = df.withColumn('call_date', df.call_datetime.cast('date')) 69 | 70 | # Recreate analysis variables 71 | df = df.join(cells, df.location_id == cells.cell_id, how = 'left').drop('cell_id')\ 72 | .orderBy('msisdn', 'call_datetime')\ 73 | .withColumn('region_lag', F.lag('region').over(user_window))\ 74 | .withColumn('region_lead', F.lead('region').over(user_window))\ 75 | .withColumn('call_datetime_lag', F.lag('call_datetime').over(user_window))\ 76 | .withColumn('call_datetime_lead', F.lead('call_datetime').over(user_window))\ 77 | .withColumn('hour_of_day', F.hour('call_datetime').cast('byte'))\ 78 | .withColumn('hour', F.date_trunc('hour', F.col('call_datetime')))\ 79 | .withColumn('week', F.date_trunc('week', F.col('call_datetime')))\ 80 | .withColumn('month', F.date_trunc('month', F.col('call_datetime')))\ 81 | .withColumn('constant', F.lit(1).cast('byte'))\ 82 | .withColumn('day', F.date_trunc('day', F.col('call_datetime')))\ 83 | .na.fill({'region' : missing_value_code , 84 | 'region_lag' : missing_value_code , 85 | 'region_lead' : missing_value_code }) 86 | 87 | return df 88 | 89 | mar20 = create_vars(mar20, cells) 90 | mar21 = create_vars(mar21, cells) 91 | 92 | # COMMAND ---------- 93 | 94 | mar20.columns 95 | 96 | # COMMAND ---------- 97 | 98 | # Create simple OD matrix 99 | def simp_od(df): 100 | 101 | # Kepp if region and region_lag/lead are not the same 102 | df = df.where((F.col('region_lag') != F.col('region')) | (F.col('region_lead') != F.col('region')) | (F.col('call_datetime_lead').isNull())) 103 | 104 | # Aggregate total sum by region and region_lag 105 | agg_df = df.groupby('region', 'region_lag')\ 106 | .agg(F.count("*")) 107 | 108 | return agg_df 109 | 110 | m20_agg = simp_od(mar20) 111 | m21_agg = simp_od(mar21) 112 | 113 | # COMMAND ---------- 114 | 115 | m20_agg.show() 116 | 117 | # COMMAND ---------- 118 | 119 | # mar20.show() 120 | 121 | # COMMAND ---------- 122 | 123 | 124 | # 1. Merge with tower mapping to wards 125 | 126 | # 2. Recreate vars 127 | 128 | # 4. 
129 | 130 | 131 | # COMMAND ---------- 132 | 133 | a 134 | 135 | # COMMAND ---------- 136 | 137 | 138 | 139 | # COMMAND ---------- 140 | 141 | test_df = spark.read\ 142 | .option('header', 'true')\ 143 | .option('inferSchema', 'true')\ 144 | .csv('/mnt/COVID19Data/proof-of-concept/new/ZW/telecel/world_bank_cdr_new.csv') 145 | dd 146 | 147 | # COMMAND ---------- 148 | 149 | test_df.printSchema() 150 | -------------------------------------------------------------------------------- /data-checks/Archive/MASTER.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # DATA CHECKS MASTER 3 | #-----------------------------------------------------------------# 4 | 5 | # This script sets file paths and (will) map all processes for checking 6 | # incoming data 7 | 8 | #-----------------------------------------------------------------# 9 | #### Settings 10 | 11 | import os 12 | import re 13 | import pandas as pd 14 | import numpy as np 15 | import datetime as dt 16 | 17 | import seaborn as sns; sns.set() 18 | from matplotlib import rcParams 19 | import matplotlib.pyplot as plt 20 | 21 | #-----------------------------------------------------------------# 22 | #### Set file paths 23 | 24 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/" 25 | DATA_POC = DATA_path + "proof-of-concept/" 26 | DATA_GIS = DATA_POC + 'geo_files/' 27 | 28 | DATA_DB_raw_indicators = DATA_POC + "databricks-results/zw/" 29 | DATA_dashboad_clean = DATA_POC + "/files_for_dashboard/files_clean/" 30 | 31 | DATA_dash_clean_a2 = DATA_dashboad_clean + "adm2/" 32 | DATA_dash_clean_a3 = DATA_dashboad_clean + "adm3/" 33 | 34 | #---------------# 35 | # Main indicators 36 | 37 | # Transactions per hour 38 | I1_path = DATA_DB_raw_indicators + "indicator 1/" 39 | I1_Adm3_path = I1_path + "admin3/" 40 | 41 | 42 | # Unique subcribers per hour 43 | I2_path = DATA_DB_raw_indicators + "indicator 2/" 44 | I2_Adm3_path = I2_path + "admin3/" 45 | I2_towercluster_path = I2_path + "tower_cluster/" 46 | 47 | 48 | # Unique subscribers per day 49 | I3_path = DATA_DB_raw_indicators + "indicator 3/" 50 | I3_Adm2_path = I3_path + "admin2/" 51 | I3_Adm3_path = I3_path + "admin3/" 52 | 53 | # Ratio of residents active that day based on those present 54 | # during baseline 55 | I4_path = DATA_DB_raw_indicators + "indicator 4/" 56 | I4_Adm2_path = I4_path + 'admin2/' 57 | I4_Adm3_path = I4_path + 'admin3/' 58 | 59 | # OD matrix 60 | I5_path = DATA_DB_raw_indicators + "indicator 5/" 61 | I5_Adm2_path = I5_path + "admin2/" 62 | I5_Adm3_path = I5_path + "admin3/" 63 | 64 | # Residents living in area 65 | I6_path = DATA_DB_raw_indicators + "indicator 6/" 66 | I6_Adm2_path = I6_path + "admin2/" 67 | I6_Adm3_path = I6_path + "admin3/" 68 | 69 | # Mean and Standard Deviation of distance 70 | # traveled (by home location) day 71 | I7_path = DATA_DB_raw_indicators + "indicator 7/" 72 | I7_Adm2_path = I7_path + "admin2/" 73 | I7_Adm3_path = I7_path + "admin3/" 74 | 75 | # Mean and Standard Deviation of distance 76 | # traveled (by home location) week 77 | I8_path = DATA_DB_raw_indicators + "indicator 8/" 78 | I8_Adm2_path = I5_path + "admin2/" 79 | I8_Adm3_path = I5_path + "admin3/" 80 | 81 | # Daily locations based on Home Region with 82 | # average stay time and SD of stay time 83 | I9_path = DATA_DB_raw_indicators + "indicator 9/" 84 | I9_Adm2_path = I9_path + "admin2/" 85 | I9_Adm3_path = I9_path + "admin3/" 86 | 87 | #Simple Origin 
Destination Matrix - trips 88 | # between consecutive in time regions with time 89 | I10_path = DATA_DB_raw_indicators + "indicator 10/" 90 | I10_Adm2_path = I5_path + "admin2/" 91 | I10_Adm3_path = I5_path + "admin3/" 92 | 93 | #---------------------# 94 | # Flowminder indicators 95 | FLOWM_path = DATA_DB_raw_indicators + "flowminder indicators/" 96 | FLOWM_adm2_path = FLOWM_path + "admin2/" 97 | FLOWM_adm3_path = FLOWM_path + "admin3/" 98 | 99 | #-------------------# 100 | # External indicators 101 | 102 | # Update file path 103 | IRESULTS = DATA_path + "Isaac-results/" 104 | 105 | IFLOW_path = IRESULTS + "flowminder/" 106 | ICUST_path = IRESULTS + "custom/" 107 | 108 | # Flowminder 109 | IFLOWM_adm2_path = IFLOW_path + "admin2/" 110 | IFLOWM_adm3_path = IFLOW_path + "admin3/" 111 | 112 | # Custum 113 | ICUST_adm2_path = ICUST_path + "admin2/" 114 | ICUST_adm3_path = ICUST_path + "admin3/" 115 | 116 | 117 | #---------------# 118 | # Outputs 119 | OUT_path = DATA_POC + "outputs/" 120 | OUT_plots = OUT_path + "Figures/" 121 | OUT_hfcs = OUT_path + "data-checks/" 122 | # OUT_hfcs_sheets = OUT_hfcs + "Sheet differences/" 123 | 124 | #-----------------------------------------------------------------# 125 | # Indicator dataframes 126 | 127 | # Load list of internal indicators to make it 128 | # easier to bulk load files 129 | internal_indicators = pd\ 130 | .read_csv(DATA_POC + 'documentation/indicators_list.csv') 131 | 132 | # Since sheet contains relative paths add path global 133 | # to have absolute paths 134 | internal_indicators['path'] = DATA_path + internal_indicators['path'] -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/README.md: -------------------------------------------------------------------------------- 1 | # Dashboard 2 | 3 | This dashboard is build using R Shiny. 4 | 5 | # Preparing Data for Dashboard 6 | 7 | `01_preparing_data_for_dashboard` contains three folders with scripts for cleaning and preparing data for the dashboard. 8 | 9 | ## Clean Spatial Data 10 | 11 | The files in `01_clean_spatial_data` clean spatial polygons to be used in the dashboard and subsequent cleaning steps. The following cleaning steps are conducted: 12 | 13 | 1. Aggregate units when needed (e.g., aggregating wards) 14 | 2. Add additional variables (e.g., area) 15 | 3. Standardize variable names 16 | 4. Orders spatial data by region 17 | 18 | #### Standardize Variable Names 19 | Each spatial dataset should have standardized variable names. Standardizing 20 | variable names helps ensure different units (eg, admin2, admin3) can be 21 | easily switched in the dashboard 22 | 23 | | variable | format | example | description | 24 | |---|---|---|---| 25 | | region | string | ZONE123456 | Unique identifier of the spatial unit | 26 | | name | string | name-here | Spatial unit name | 27 | | area | numeric | 1234 | Area of the spatial unit in kilometers squared | 28 | | adm1| string | name-here | Name of the province | 29 | 30 | #### Order Spatial Data 31 | Spatial datasets are ordered by region. When cleaning other datasets at the 32 | region level, we also order by region and ensure all regions are present. This 33 | ensures that no reordering needs to be done in the dashboard. 34 | 35 | ## Clean Telecom Data 36 | 37 | The files in `02_clean_telecom_data` clean telecom data. They clean variable values (eg, accounting for outliers), standardize variable names and add variables needed for the dashboard. 
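As a quick illustration of what the cleaned output looks like, each cleaned file can be read directly and should carry the standardized variables documented in the table below. This is a minimal sketch, not part of the cleaning scripts: it assumes the path objects defined in `_dash_master.R`, and the exact set of additional columns (e.g., polygon attributes such as `area` or `adm1`) varies by indicator.

```r
# Minimal sketch (illustration only): read one cleaned file and confirm the
# standardized columns are present. CLEAN_DATA_ADM3_PATH is defined in
# _dash_master.R; i3_weekly.Rds is one of the files exported by the cleaning
# scripts in 02_clean_telecom_data.
df <- readRDS(file.path(CLEAN_DATA_ADM3_PATH, "i3_weekly.Rds"))

standard_vars <- c("region", "name", "date", "value",
                   "value_lag", "value_base",
                   "value_perchange_base", "value_zscore_base",
                   "label_level", "label_base")

stopifnot(all(standard_vars %in% names(df)))
```

Because every cleaned indicator exposes the same columns, the dashboard and the scripts in `03_dashboard_data_prep` can switch between indicators and admin units without special-casing.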
38 | 39 | #### Dataset 40 | 41 | A number of indicators are cleaned. To facilitate further processing for the datasets 42 | to be used in the dashboard, all cleaned datasets have the following standardized 43 | variables: 44 | 45 | | variable | format | example | description | 46 | |---|---|---|---| 47 | | region | string | ZONE123456 | Unique identifier of the spatial unit | 48 | | name | string | Name1 | Spatial unit name | 49 | | date | date or string | 2020-02-01 | The date | 50 | | value | numeric | 1000 | Value (e.g., number of subscribers, number of trips, distance traveled) | 51 | | value_lag | numeric | 1000 | Value from the previous time period | 52 | | value_base | numeric | 1000 | Baseline value | 53 | | value_perchange_base | numeric | 50 | Percent change from baseline | 54 | | value_zscore_base | numeric | 50 | Z-score change since baseline | 55 | | label_level | string | Name1
This day's value: 1000
... | Label for when level of variable is shown | 56 | | label_base | string | Name1<br>
This day's value: 1000
... | Label for when change since baseline value is shown. | 57 | 58 | ## Dashboard Data Prep 59 | 60 | The files in `03_dashboard_data_prep` further process data into datasets that are used for the dashboard. Due to the high volume of data, data transformations (e.g., aggregating, filtering, etc) are done outside of the dashboard in order to minimize the processing and data needed to be loaded in memory at any point as the dashboard is running. These scripts filter the cleaned telecom data into individual datasets so that no additional filtering or transformations need to be applied within the dashboard; the dashboard can just read the files and then immediately use the data in the map, line graph and table. Here, we create smaller datasets that contain the same variables as above. Indicators include density, movement in, movement out, mean distance traveled, etc. 61 | 62 | The following datasets are made. 63 | 64 | | Dataset Type | Naming Convention | Description | 65 | | --- | --- | --- | 66 | | unit-level | [Unit Type (eg, ADM1, ADM2, etc)]\_[Indicator Name]\_[Daily/Weekly]\_[Date/Week].Rds | For a given day or week, this dataset contains information for all units for a specified indicator. For O-D level datasets, values are aggregated to the specified origin or destination unit (eg, movement into unit from all other units). | 67 | | time-level | [Unit Type (eg, ADM1, ADM2, etc)]\_[Indicator Name]\_[Daily/Weekly]\_[Unit Name].Rds | For a given admin unit, this dataset contains a time series of values for a specified indicator. | 68 | | unit-time-level | [Unit Type (eg, ADM1, ADM2, etc)]\_[Indicator Name]\_[Daily/Weekly]\_[Unit Name]\_[Date/Week].Rds | These datasets are only used for O-D variables. They show, for a given origin or destination unit, the movement in or out of that unit to all other units for the specified day/week.
| 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /data-checks/Archive/02_summary_stats.py: -------------------------------------------------------------------------------- 1 | 2 | #-----------------------------------------------------------------# 3 | # Exploratory analysis 4 | #-----------------------------------------------------------------# 5 | #-----------------------------------------------------------------# 6 | #### Settings 7 | 8 | from globals import * 9 | 10 | #-----------------------------------------------------------------# 11 | #### Set file paths 12 | 13 | DATA_GIS = DATA_path + 'proof-of-concept/geo_files/' 14 | INDICATORS_path = DATA_path + "proof-of-concept/panel_indicators/clean/" 15 | 16 | #-----------------------------------------------------------------# 17 | #### Load data 18 | 19 | i1 = pd.read_csv(INDICATORS_path + 'i1_3.csv') # Number of calls 20 | i3 = pd.read_csv(INDICATORS_path + 'i3_3.csv') # number of users 21 | i5 = pd.read_csv(INDICATORS_path + 'i5_3.csv') # orgin and destination 22 | i52 = pd.read_csv(INDICATORS_path + 'i5_2.csv') # orgin and destination 23 | 24 | i7 = pd.read_csv(INDICATORS_path + 'i7_3.csv') # distance travelled 25 | 26 | #-----------------------------------------------------------------# 27 | #### Aggregate data at the country level 28 | 29 | i1_agg = i1\ 30 | .groupby('date')\ 31 | .agg({'count' : np.sum})\ 32 | .reset_index()\ 33 | .sort_values('date') 34 | 35 | i3_agg = i3\ 36 | .groupby('date')\ 37 | .agg({'count' : np.sum})\ 38 | .reset_index()\ 39 | .sort_values('date')\ 40 | .rename(columns = {'count': 'subs'}) 41 | 42 | # Add number of subscribers to indicator 1 aggregated data 43 | i1_agg = i1_agg.merge(i3_agg, on = 'date') 44 | i1_agg['calls_p'] = i1_agg['count']/i1_agg['subs'] 45 | 46 | 47 | # OD matrix aggregated data 48 | i5_agg = i5\ 49 | .groupby('date')\ 50 | .agg({'subscriber_count' : np.mean, 51 | 'total_count' : np.sum, 52 | 'region_to': pd.Series.nunique, 53 | 'region_from': pd.Series.nunique})\ 54 | .reset_index()\ 55 | .sort_values('date') 56 | 57 | i5_agg = i5_agg.merge(i3_agg, on = 'date') 58 | i5_agg['moves_p_sub'] = i5_agg['subscriber_count']/i5_agg['subs'] 59 | 60 | 61 | #-----------------------------------------------------------------# 62 | # Comparisson between pre and post lockdown stats 63 | 64 | # Pre-post lockdown variables 65 | lockdown_date = np.datetime64(dt.date(2020, 3, 27)) 66 | 67 | i1['post'] = (i1['date'].astype('datetime64') > lockdown_date).astype(int) 68 | i3['post'] = (i3['date'].astype('datetime64') > lockdown_date).astype(int) 69 | i5['post'] = (i5['date'].astype('datetime64') > lockdown_date).astype(int) 70 | i7['post'] = (i7['date'].astype('datetime64') > lockdown_date).astype(int) 71 | 72 | i1_agg['post'] = (i1_agg['date'].astype('datetime64') > lockdown_date).astype(int) 73 | i5_agg['post'] = (i5_agg['date'].astype('datetime64') > lockdown_date).astype(int) 74 | i7['post'] = (i7['date'].astype('datetime64') > lockdown_date).astype(int) 75 | 76 | # Number of calls per user 77 | i1_agg['calls_p'].mean() 78 | i1_agg['calls_p'][i1_agg['post'] == 0].mean() 79 | i1_agg['calls_p'][i1_agg['post'] == 1].mean() 80 | 81 | # Number of districts visited? 
82 | i5_agg['moves_p_sub'].mean() 83 | i5_agg['moves_p_sub'][i5_agg['post'] == 0].mean() 84 | i5_agg['moves_p_sub'][i5_agg['post'] == 1].mean() 85 | 86 | # Average distance travelled 87 | i7['mean_distance'].mean() 88 | 89 | i7['mean_distance'][i7['post'] == 0].mean() 90 | i7['mean_distance'][i7['post'] == 1].mean() 91 | 92 | # Number of wards 93 | i5['subscriber_count'].mean() 94 | i5['subscriber_count'][i5['post'] == 0].sum() 95 | i5['subscriber_count'][i5['post'] == 1].mean() 96 | 97 | # Distance travelled 98 | i7['mean_distance'].mean() 99 | i7['mean_distance'][i7['post'] == 0].mean() 100 | i7['mean_distance'][i7['post'] == 1].mean() 101 | 102 | 103 | #-----------------------------------------------------------------# 104 | # Plot number of regions that received visitors per day 105 | 106 | import plotly.express as px 107 | 108 | fig = px.line(i5_agg, x="date", y="region_to") 109 | fig.show() 110 | 111 | 112 | #-----------------------------------------------------------------# 113 | # Compare regions that received and sent visitors 114 | 115 | import plotly.graph_objects as go 116 | 117 | # set up plotly figure 118 | fig = go.Figure() 119 | 120 | # add line / trace 1 to figure 121 | fig.add_trace(go.Scatter( 122 | x=i5_agg['date'], 123 | y=i5_agg['region_to'], 124 | marker=dict( 125 | color="blue" 126 | ))) 127 | fig.add_trace(go.Scatter( 128 | x=i5_agg['date'], 129 | y=i5_agg['region_from'], 130 | marker=dict( 131 | color="red" 132 | ))) 133 | 134 | fig.show() 135 | 136 | 137 | 138 | -------------------------------------------------------------------------------- /data-checks/Archive/03_i_specific_checks_i1_admin2.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # Create Admin2 Indicator 1 3 | #-----------------------------------------------------------------# 4 | 5 | EXPORT = False 6 | 7 | # import shapely 8 | # import geojsonio 9 | import os 10 | import geopandas as gpd 11 | import matplotlib.pyplot as plt 12 | import plotly.graph_objects as go 13 | import plotly.express as px 14 | from plotly.subplots import make_subplots 15 | 16 | import seaborn as sns; sns.set() 17 | 18 | 19 | #-----------------------------------------------------------------# 20 | # Load data 21 | 22 | # Indicator 1 panel data 23 | i1 = pd.read_csv( OUT_hfcs + 'Sheet comp panel/i1_admin3.csv') 24 | i1 = i1[i1.region != '99999'] 25 | # Wards data 26 | wards = gpd.read_file(DATA_GIS + 'wards_aggregated.geojson') 27 | wd = wards[['ward_id', 'province_name', 'district_id', 'district_name']] 28 | 29 | #-----------------------------------------------------------------# 30 | # Create wards mapping into disctrics 31 | 32 | i1 = i1.merge(wd, left_on = 'region', right_on = 'ward_id') 33 | 34 | 35 | # Aggregate values by district 36 | i1_agg = i1.groupby(['district_id', 'district_name', 'hour']).agg(lambda x : sum(x)).reset_index() 37 | 38 | # Make sure hour is in date time 39 | i1_agg['hour'] = i1_agg['hour'].astype('datetime64') 40 | i1_agg['district_id'] = i1_agg['district_id'].astype('int') 41 | 42 | #-----------------------------------------------------------------# 43 | # Transactions per hour by district line plot. 
44 | 45 | # Line plot function definition 46 | def line_plot(reg_i, 47 | var = 'count_p', 48 | data = i1_agg, 49 | region = 'district_id', 50 | region_str = 'district_name', 51 | time = 'hour'): 52 | plt_data = data[data[region] == reg_i] 53 | fig = go.Figure() 54 | # Create line 55 | fig.add_trace(go.Scatter(x=plt_data[time], y=plt_data[var], 56 | mode='lines', 57 | name='lines')) 58 | # Additional formatting 59 | title = str(plt_data[region].iloc[0]) + plt_data[region_str].iloc[0] 60 | fig.update_layout( 61 | title=title, 62 | xaxis_title="Time", 63 | yaxis_title="Count", 64 | font=dict( 65 | # family="Courier New, monospace", 66 | size=18, 67 | color="#7f7f7f"), 68 | autosize=False, 69 | width=1200, 70 | height=700 71 | ) 72 | return(fig) 73 | 74 | # Districts list 75 | dists = list(set(i1_agg['district_id'])) 76 | 77 | # region_plt(d) 78 | # plt.show() 79 | 80 | # Loop over districts 81 | for d in dists: 82 | print(d) 83 | # Create plot 84 | plt_i = line_plot(d) 85 | # Export 86 | save_name = None 87 | save_name = 'i1_districts_count' + str(d) + '.png' 88 | plt_i.write_image(OUT_plots + 'daily_obs_region/' + save_name) 89 | 90 | 91 | #-----------------------------------------------------------------# 92 | # Transactions per hour by day. That is one plot per hour 93 | i1_agg['time'] = pd.to_datetime(i1_agg['hour']).dt.hour 94 | i1_agg['date'] = pd.to_datetime(i1_agg['hour']).dt.date 95 | 96 | 97 | def hourly_scatter(reg_i, 98 | var = 'count_p', 99 | data = i1_agg, 100 | region = 'district_id', 101 | region_str = 'district_name', 102 | time = 'date', 103 | facets = 'time'): 104 | # Subset data 105 | plt_data = data[data[region] == reg_i] 106 | # Create plot 107 | fig = px.scatter(plt_data, 108 | x= time, 109 | y = var, 110 | facet_col = facets, 111 | facet_col_wrap = 5, 112 | width=1200, 113 | height=700) 114 | # Additional formatting 115 | title = str(plt_data[region].iloc[0]) + ' - ' + plt_data[region_str].iloc[0] 116 | fig.update_layout(title_text= title) 117 | fig.update_yaxes(matches=None) 118 | fig.for_each_annotation(lambda a: a.update(text=a.text.replace("time=", ""))) 119 | # Format axis titles 120 | return(fig) 121 | 122 | # Loop over districts 123 | for d in dists: 124 | print(d) 125 | # Create plot 126 | plt_i = hourly_scatter(d) 127 | # Export 128 | save_name = None 129 | save_name = 'i1_hourly_obs_byhour' + str(d) + '.png' 130 | plt_i.write_image(OUT_plots + 'hourly_obs_by_hour_region/' + save_name) 131 | 132 | 133 | 134 | 135 | #-----------------------------------------------------------------# 136 | # Export data 137 | if EXPORT: 138 | i1_agg.to_csv(OUT_hfcs + 'Sheet comp panel/i1_admin2.csv', index = False) 139 | 140 | 141 | 142 | 143 | #-----------------------------------------------------------------# 144 | # DRAFT 145 | -------------------------------------------------------------------------------- /dashboard-dataviz/figures/i5_into_out.R: -------------------------------------------------------------------------------- 1 | # i3 Figures 2 | 3 | unit <- "wards" 4 | 5 | # Load Data -------------------------------------------------------------------- 6 | if(unit %in% "wards"){ 7 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM3_PATH 8 | } 9 | 10 | if(unit %in% "districts"){ 11 | CLEAN_DATA_PATH <- CLEAN_DATA_ADM2_PATH 12 | } 13 | 14 | data <- readRDS(file.path(CLEAN_DATA_PATH, "i5_daily.Rds")) 15 | 16 | data_into <- data %>% 17 | group_by(region_dest, name_dest, date) %>% 18 | summarise(value = sum(value, na.rm=T)) %>% 19 | dplyr::rename(region = region_dest, 20 | name = name_dest) 
21 | 22 | data_out <- data %>% 23 | group_by(region_origin, name_origin, date) %>% 24 | summarise(value = sum(value, na.rm=T)) %>% 25 | dplyr::rename(region = region_origin, 26 | name = name_origin) 27 | 28 | ## 29 | data_into <- data_into %>% 30 | group_by(region) %>% 31 | mutate(value_pre = mean(value[date < "2020-03-30"], na.rm = T), 32 | value_post = mean(value[date > "2020-03-30"], na.rm = T)) %>% 33 | ungroup() %>% 34 | mutate(value_change = value_post - value_pre) %>% 35 | mutate(value_change_rank = rank(value_change)) 36 | data_into$value_change_rank[is.na(data_into$value_change)] <- NA 37 | 38 | data_out <- data_out %>% 39 | group_by(region) %>% 40 | mutate(value_pre = mean(value[date < "2020-03-30"], na.rm = T), 41 | value_post = mean(value[date > "2020-03-30"], na.rm = T)) %>% 42 | ungroup() %>% 43 | mutate(value_change = value_post - value_pre) %>% 44 | mutate(value_change_rank = rank(value_change)) 45 | data_out$value_change_rank[is.na(data_out$value_change)] <- NA 46 | 47 | 48 | ## FIX 49 | data_into <- data_into[!is.na(data_into$date),] 50 | data_into$date <- data_into$date %>% as.Date() 51 | 52 | data_out <- data_out[!is.na(data_out$date),] 53 | data_out$date <- data_out$date %>% as.Date() 54 | 55 | 56 | # Into ------------------------------------------------------------------------- 57 | rank_high <- data_into$value_change_rank %>% unique() %>% sort() %>% head(5) 58 | 59 | p_high <- data_into %>% 60 | dplyr::filter(value_change_rank %in% rank_high) %>% 61 | ggplot(aes(x = date, y = value)) + 62 | geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) + 63 | geom_line() + 64 | labs(x = "", 65 | y = "Number of Subscribers", 66 | title = "Largest Decreases") + 67 | facet_wrap(~name, 68 | scales = "free_y", 69 | nrow = 1) + 70 | theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12), 71 | strip.text.x = element_text(face = "bold")) 72 | p_high 73 | 74 | rank_low <- data_into$value_change_rank %>% unique() %>% sort() %>% tail(5) 75 | 76 | p_low <- data_into %>% 77 | dplyr::filter(value_change_rank %in% rank_low) %>% 78 | ggplot(aes(x = date, y = value)) + 79 | geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) + 80 | geom_line() + 81 | labs(x = "", 82 | y = "", 83 | title = "Largest Increases") + 84 | facet_wrap(~name, 85 | scales = "free_y", 86 | nrow = 1) + 87 | theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12), 88 | strip.text.x = element_text(face = "bold")) 89 | 90 | p_low 91 | 92 | p_all <- ggarrange(p_high, p_low, nrow = 2) 93 | ggsave(p_all, filename = file.path(figures_path, 94 | paste0(unit, "_netmovement_top_chng.png")), 95 | height = 5, width=12) 96 | 97 | 98 | 99 | # Out Of ------------------------------------------------------------------------- 100 | rank_high <- data_out$value_change_rank %>% unique() %>% sort() %>% head(5) 101 | 102 | p_high <- data_out %>% 103 | dplyr::filter(value_change_rank %in% rank_high) %>% 104 | ggplot(aes(x = date, y = value)) + 105 | geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) + 106 | geom_line() + 107 | labs(x = "", 108 | y = "Number of Subscribers", 109 | title = "Largest Decreases") + 110 | facet_wrap(~name, 111 | scales = "free_y", 112 | nrow = 1) + 113 | theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12), 114 | strip.text.x = element_text(face = "bold")) 115 | p_high 116 | 117 | rank_low <- data_out$value_change_rank %>% unique() %>% sort() %>% tail(5) 118 | 119 | p_low <- 
data_out %>% 120 | dplyr::filter(value_change_rank %in% rank_low) %>% 121 | ggplot(aes(x = date, y = value)) + 122 | geom_vline(aes(xintercept = "2020-03-30" %>% as.Date()), color="red", alpha = 0.7) + 123 | geom_line() + 124 | labs(x = "", 125 | y = "", 126 | title = "Largest Increases") + 127 | facet_wrap(~name, 128 | scales = "free_y", 129 | nrow = 1) + 130 | theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12), 131 | strip.text.x = element_text(face = "bold")) 132 | 133 | p_low 134 | 135 | p_all <- ggarrange(p_high, p_low, nrow = 2) 136 | ggsave(p_all, filename = file.path(figures_path, 137 | paste0(unit, "_netmovement_top_chng.png")), 138 | height = 5, width=12) 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /dashboard-dataviz/dashboard/preparing_data_for_dashboard/02_clean_telecom_data/README.md: -------------------------------------------------------------------------------- 1 | # Clean Aggregated Telecom Data 2 | 3 | These scripts clean and standardize aggregated telecom data. This is a necessary 4 | part of the process towards preparing datasets for the dashboard. 5 | 6 | ## Dataset 7 | 8 | A number of indicators are cleaned. To facilitate further processing for the datasets 9 | to be used in the dashboard, all cleaned datasets have the following standardized 10 | variables: 11 | 12 | | variable | format | example | description | 13 | |---|---|---|---| 14 | | region | string | ZONE123456 | Unique identifier of the spatial unit | 15 | | name | string | Name| Spatial unit name | 16 | | date | date or string | 2020-02-01| The date | 17 | | value | numeric | 1000 | Value (e.g., number of subscribers, number of trips, distance traveled) | 18 | | value_lag | numeric | 1000 | Value from the previous time period | 19 | | value_base | numeric | 1000 | Baseline value | 20 | | value_perchange_base | numeric | 50 | Percent change from baseline | 21 | | value_zscore_base | numeric | 50 | Z-score change since baseline | 22 | | label_level | string | Name
This day's value: 1000<br>... | Label for when level of variable is shown | 23 | | label_base | string | Name<br>This day's value: 1000<br>
... | Label for when change since baseline value is shown. | 24 | 25 | ## telecom prep [tp] functions 26 | 27 | The `_tp_functions.R` file defines a number of functions to help standardize 28 | the cleaning process. 29 | 30 | #### Set/Standardize Variables 31 | 32 | * __tp_standardize_vars:__ Renames the date, region and value variable names to 33 | `date`, `region` and `value`. The remaining `tp_` functions take these variable 34 | names as defaults. 35 | * __tp_standardize_vars_od:__ Renames variables for origin-destination matrices. 36 | Inputs include the date, region_origin, region_destination and value variables. This function 37 | standardizes those variables and creates a new variable that concatenates region_origin and 38 | region_destination as a unique identifier for the origin-destination pair. 39 | 40 | #### Clean Dataset 41 | 42 | * __tp_fill_regions:__ Checks for any regions that are missing in the telecom data that are in the polygon/admin data. Adds these regions to the dataset. 43 | * __tp_clean_day:__ If `date` is day of week, cleans into a `Date` variable. 44 | * __tp_clean_week:__ Transforms `date` to represent the week (e.g., `Feb 01 - Feb 07`). Handles 45 | both integers (e.g., week `6`) and groups day of week (e.g., `2020-02-01`) 46 | * __tp_agg_day_to_week:__ Aggregates the dataset from daily to weekly. 47 | * __tp_complete_date_region:__ Completes data with all data/region pairs. 48 | * __tp_complete_date_region_od:__ Completes data with all data/region pairs for 49 | origin-destination datasets. 50 | * __tp_add_polygon_data:__ Adds polygon data to dataset (primarily for `name`) 51 | * __tp_add_polygon_data_od:__ Adds polygon data to dataset for origin-destination data. 52 | Adds all polygon variables as `_origin` and `_dest` 53 | 54 | #### Clean Value Variable 55 | 56 | * __tp_interpolate_outliers:__ Interpolates outliers on the `value` variable. Includes 57 | options for replacing negative, positive or both types of outliers, and for what is considered 58 | and outlier. Defaults to 4 standard deviations. 59 | * __tp_replace_zeros:__ Interpolates values of zero. Only interpolates when the 60 | number of zeros is equal to or less than `N_zero_thresh`. 61 | 62 | #### Add Variables 63 | 64 | * __tp_add_percent_change:__ Adds percent change from the last time period (day or week) 65 | on the `value` variable 66 | * __tp_add_baseline_comp_stats:__ Adds percent change and z-score change values 67 | compared to baseline using `value` variable. 68 | 69 | #### Add Labels for Leaflet 70 | 71 | * __tp_add_label_level:__ Adds label for the original (level) value to be used in 72 | leaflet in the dashboard. 73 | * __tp_add_label_baseline:__ Adds label for change metrics since baseline to be used 74 | in leaflet in the dashboard. 75 | 76 | 77 | ## Example cleaning 78 | 79 | The following shows an example of cleaning data. Here we have two datasets: 80 | 81 | 1. __df_day:__ Which is a daily dataset of the number of subscribers at the unit level and contains three 82 | relevant variables: `visit_date` (e.g., `2020-02-01T00:00:00.000Z`), `region` (e.g., `ZONE123456`) and 83 | `subscriber_count` (e.g., `1000`). 84 | 85 | 2. __admin_sp:__ Which is a SpatialPolygonsDataFrame of units. It contains the variables 86 | described in `01_clean_spatial_data` (i.e., `name`, `region`, `area` and `adm1`). 87 | 88 | ```r 89 | df_day_clean <- df_day %>% 90 | 91 | # Standardizes variable names so can avoid defining variable names in the 92 | # tp_ functions. 
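  # Argument order: date variable, region variable, value variable.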
93 | tp_standardize_vars("visit_date", "region", "subscriber_count") %>% 94 | 95 | # Clean dataset 96 | tp_clean_date() %>% 97 | tp_fill_regions(admin_sp) %>% 98 | tp_complete_date_region() %>% 99 | tp_add_polygon_data(admin_sp) %>% 100 | 101 | # Interpolate/Clean Values 102 | tp_interpolate_outliers(NAs_as_zero = T) %>% 103 | tp_replace_zeros(NAs_as_zero = T) %>% 104 | 105 | # Add change metrics 106 | tp_add_baseline_comp_stats() %>% 107 | tp_add_percent_change() %>% 108 | 109 | # Add labels 110 | tp_add_label_level(timeunit = "day", OD = F) %>% 111 | tp_add_label_baseline(timeunit = "day", OD = F) 112 | ``` 113 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/voronoi.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | from geovoronoi import voronoi_regions_from_coords 3 | 4 | import os 5 | if os.environ['HOME'] != '/root': 6 | from modules.tower_clustering import * 7 | 8 | ## Class to handle spark and df in session 9 | class voronoi_maker: 10 | """Class to handle all voronoi transformations and files for a specific df 11 | 12 | 13 | Attributes 14 | ---------- 15 | datasource : an instance of DataSource class. 16 | shape : a geopandas dataframe. Shapefile to use for clustering 17 | region_var : a string. Name of the region variable in the shapefile. 18 | sites : a string. Name of the attribute of datasource that holds the tower coordinates. 19 | spark_df : a pyspark dataframe. Holds the cdr data 20 | result_path : a string. Where to save results. 21 | clusterer : an instance of tower_clusterer. 22 | sites_df : a pyspark dataframe. Holds clustered sites. 23 | distances_pd_long : a pyspark dataframe. Holds distances between sites. 24 | sites : a pyspark dataframe. Clustered sites without NAs. 25 | 26 | Methods 27 | ------- 28 | make_voronoi() 29 | orchestrates all methods 30 | 31 | filter_towers_for_voronoi() 32 | we can't run on duplicates (location duplicates), so we have to filter them out first 33 | 34 | make_shape(towers_for_voronoi) 35 | makes a buffer around towers to create bubble shapes 36 | 37 | create_voronoi(towers_for_voronoi, shape) 38 | creats voronoi cells from tower list 39 | 40 | save_voronoi(poly_shapes) 41 | saves voronoi shape file and voronoi-tower mapping 42 | 43 | assign_to_spark_df() 44 | adds voronoi id to cdr records (not used currently) 45 | """ 46 | 47 | def __init__(self, 48 | datasource, 49 | shape, 50 | region_var, 51 | sites = 'tower_sites'): 52 | """ 53 | Parameters 54 | ---------- 55 | """ 56 | self.spark = datasource.spark 57 | self.datasource = datasource 58 | self.spark_df = datasource.parquet_df 59 | self.result_path = datasource.results_path 60 | self.clusterer = tower_clusterer(datasource, shape, region_var, sites) 61 | self.clusterer.cluster_towers() 62 | self.sites_df = self.clusterer.sites_with_clusters.loc[:,['cell_id', 'centroid_LAT', 'centroid_LNG']].rename(columns={'centroid_LAT' : 'LAT', 'centroid_LNG': 'LNG'}) 63 | self.distances_pd_long = self.clusterer.distances_pd_long 64 | if (self.sites_df.columns == ['cell_id', 'LAT', 'LNG']).all(): 65 | self.sites = self.sites_df[self.sites_df.LAT.notna()] 66 | else: 67 | raise 'The sites dataframe does not have the correct columns / column order. 
Should be cell_id, LAT, LNG' 68 | 69 | def make_voronoi(self): 70 | 71 | towers_for_voronoi = self.filter_towers_for_voronoi() 72 | shape, towers_for_voronoi = self.make_shape(towers_for_voronoi = towers_for_voronoi) 73 | poly_shapes = self.create_voronoi(shape = shape, towers_for_voronoi = towers_for_voronoi) 74 | self.save_voronoi(poly_shapes = poly_shapes) 75 | return self.voronoi_dict 76 | 77 | def filter_towers_for_voronoi(self): 78 | 79 | # get unique towers in data 80 | distinct_towers = self.spark_df.select('location_id').distinct().toPandas() 81 | 82 | # filter list of towers for unique towers 83 | self.sites = self.sites[self.sites.cell_id.isin(list(distinct_towers.location_id))] 84 | 85 | # Assign gpd 86 | self.towers = gpd.GeoDataFrame( 87 | self.sites, geometry = gpd.points_from_xy(self.sites.LNG, self.sites.LAT), crs = 'epsg:4326') 88 | 89 | # Find towers that are in same location 90 | self.towers.LAT = self.towers.LAT.apply(lambda x: round(x,4)) 91 | self.towers.LNG = self.towers.LNG.apply(lambda x: round(x,4)) 92 | towers_for_voronoi = self.towers[~self.towers.duplicated(subset = ['LAT', 'LNG'])] 93 | 94 | return towers_for_voronoi 95 | 96 | def make_shape(self, towers_for_voronoi): 97 | 98 | # Make border shape 99 | radians = 35 / 40000 * 360 100 | self.shape = towers_for_voronoi.buffer(radians).unary_union 101 | 102 | return self.shape, towers_for_voronoi 103 | 104 | def create_voronoi(self, towers_for_voronoi, shape): 105 | 106 | # Create np array of vertices 107 | points = towers_for_voronoi.loc[:,['LNG','LAT']].to_numpy() 108 | 109 | # Create voronoi shapes 110 | self.poly_shapes, pts, poly_to_pt_assignments = voronoi_regions_from_coords(points, shape) 111 | 112 | return self.poly_shapes 113 | 114 | def save_voronoi(self, poly_shapes): 115 | 116 | # Save voronoi 117 | self.voronoi_pd = pd.DataFrame(poly_shapes) 118 | self.voronoi_pd.columns =['geometry'] 119 | self.voronoi_gpd = deepcopy(self.voronoi_pd) 120 | self.voronoi_gpd = gpd.GeoDataFrame(self.voronoi_gpd, geometry = 'geometry', crs = 'epsg:4326') 121 | self.voronoi_pd['geometry'] = self.voronoi_pd.geometry.astype(str) 122 | self.voronoi_pd = self.voronoi_pd.reset_index() 123 | self.voronoi_pd.columns = ['region', 'geometry'] 124 | self.voronoi_pd = self.spark.createDataFrame(self.voronoi_pd) 125 | save_csv(self.voronoi_pd, self.result_path, self.datasource.country_code + '_voronoi_shapefile') 126 | 127 | # Match towers to voronoi so that all towers are assigned to a cell 128 | voronoi_towers = gpd.sjoin(self.voronoi_gpd, self.towers, op="intersects") 129 | self.voronoi_dict = voronoi_towers.drop(['geometry', 'LAT', 'LNG', 'index_right'], axis = 'columns') 130 | self.voronoi_dict = self.voronoi_dict.reset_index() 131 | self.voronoi_dict.columns = ['region', 'cell_id'] 132 | self.voronoi_dict = self.spark.createDataFrame(self.voronoi_dict) 133 | save_csv(self.voronoi_dict, self.result_path, self.datasource.country_code + '_voronoi_tower_map') 134 | 135 | def assign_to_spark_df(self): 136 | 137 | self.new_spark_df = self.spark_df.join(self.voronoi_dict, self.spark_df['location_id'] == self.voronoi_dict['cell_id'], how = 'left') 138 | return self.new_spark_df 139 | -------------------------------------------------------------------------------- /data-checks/Archive/Descr-exploratory/fb-comparisson-draft.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # CDR vs FB comparisson 3 | 
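# Compares CDR-based indicators (i3 unique subscribers, i5 movement)
# with Facebook population and movement data for admin-2 districts
# (country code 'ZW').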
#-----------------------------------------------------------------# 4 | # TO DO 5 | 6 | 7 | # Read documentation 8 | 9 | # Do same process for movement data 10 | 11 | # Look at the results 12 | 13 | # Do the merging with only overlaping dates 14 | 15 | #-----------------------------------------------------------------# 16 | # Settings 17 | 18 | import os 19 | import pandas as pd 20 | import numpy as np 21 | import glob 22 | 23 | base_path = 'C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/proof-of-concept/' 24 | 25 | fb_data = base_path + 'Facebook Data/' 26 | cdr_path = base_path + 'panel_indicators/' 27 | 28 | doc_path = base_path + 'documentation/' 29 | 30 | OUT_path = base_path + '/outputs/data-checks/' 31 | 32 | data_pop = fb_data + 'Population Administrative Regions/' 33 | data_mov = fb_data + 'Movement Admin Regions/' 34 | 35 | # File names need to be updated 36 | # # prefix = 'Coronavirus Disease Prevention Map Apr 16 2020 Id Movement between Administrative Regions__' 37 | # prefix_pop = 'Coronavirus Disease Prevention Map Apr 16 2020 Id Facebook Population (Administrative Regions)__' 38 | # prefix_mov = 'Coronavirus Disease Prevention Map Apr 16 2020 Id Movement between Administrative Regions__' 39 | 40 | 41 | #-----------------------------------------------------------------# 42 | # Load FB data 43 | 44 | # Population - Load all csv files in the folder 45 | files_pop = glob.glob(data_pop + prefix_pop + "*.csv") 46 | files_mov = glob.glob(data_mov + prefix_mov + "*.csv") 47 | fpop = pd.concat([pd.read_csv(f, encoding='latin1') for f in files_pop], ignore_index=True) 48 | fmov = pd.concat([pd.read_csv(f, encoding='latin1') for f in files_mov], ignore_index=True) 49 | 50 | 51 | # df1 = pd.read_csv(data_pop + prefix + '2020-06-24 0000.csv') 52 | # df2 = pd.read_csv(data_pop + prefix + '2020-06-24 0800.csv') 53 | # df3 = pd.read_csv(data_pop + prefix + '2020-06-24 1600.csv') 54 | 55 | #-----------------------------------------------------------------# 56 | # Load CDR data 57 | 58 | # Using i3, Count of unique subscribers, for now. 
Not sure how this 59 | # fb indicator was calculated, so it might make sense to use another 60 | # indicator 61 | cpop = pd.read_csv(cdr_path + 'i3_admin2.csv') 62 | 63 | 64 | # i5 65 | cmov = pd.read_csv(cdr_path + 'i5_admin2.csv') 66 | 67 | # load and merge keys for str name matching 68 | a2_keys = pd.read_csv(doc_path + 'keys_districts.csv') 69 | a2_keys = a2_keys[['id2', 'name2']] 70 | 71 | # process cdr population 72 | cp = cpop.merge(a2_keys, 73 | left_on= 'region', 74 | right_on = 'id2') 75 | 76 | cp['date'] = pd.to_datetime(cp['day']).dt.date 77 | 78 | cp = cp[['date', 'id2', 'name2','count_p']]\ 79 | .rename(columns = {'name2' : 'name', 80 | 'count_p' : 'count'}) 81 | 82 | # process cdr movement 83 | 84 | cmov['date'] = pd.to_datetime(cmov['connection_date']).dt.date 85 | 86 | 87 | cm = cmov\ 88 | .merge(a2_keys, 89 | left_on = 'region_from', 90 | right_on= 'id2')\ 91 | .rename(columns = {'name2' : 'st_name'})\ 92 | .merge(a2_keys, 93 | left_on = 'region_to', 94 | right_on= 'id2')\ 95 | .rename(columns = {'name2' : 'ed_name', 96 | 'total_count_p' : 'count'})\ 97 | [['date', 'st_name','ed_name', 'count']] 98 | 99 | 100 | #-----------------------------------------------------------------# 101 | # Process FB data 102 | 103 | def process(df, time, group_by, count): 104 | # Remove other countried 105 | df = df.loc[df['country'] == 'ZW'] 106 | # Date var 107 | df['date'] = pd.to_datetime(df[time]).dt.date 108 | # Group by 109 | gby = ['date'] 110 | gby.extend(group_by) 111 | # Aggregate 112 | agg = df\ 113 | .groupby(gby)\ 114 | .agg({count : np.sum})\ 115 | .reset_index() 116 | 117 | return agg 118 | 119 | 120 | fp = process(fpop, 'date_time', ['polygon_name'], 'n_crisis')\ 121 | .rename(columns = {'polygon_name' : 'name', 122 | 'n_crisis' : 'count'}) 123 | fm = process(fmov, 'date_time', ['start_polygon_name', 'end_polygon_name'], 'n_crisis')\ 124 | .rename(columns = {'start_polygon_name' : 'st_name', 125 | 'end_polygon_name' : 'ed_name', 126 | 'n_crisis' : 'count'}) 127 | 128 | #-----------------------------------------------------------------# 129 | # Merge 130 | 131 | # Make sure I'm comparing same period 132 | overlapping_dates = set(cp['date']).intersection(set(fp['date'])) 133 | 134 | fp = fp[fp['date'].isin(overlapping_dates)] 135 | cp = cp[cp['date'].isin(overlapping_dates)] 136 | 137 | # String matching corrections 138 | fp['name'].loc[fp['name'] == 'Hwedza'] = 'Wedza' 139 | fp['name'].loc[fp['name'] == 'Chirumanzu'] = 'Chirumhanzu' 140 | fp['name'].loc[fp['name'] == 'Bulilimamangwe'] = 'Bulilima (North)' 141 | 142 | 143 | 144 | # def agg_rank(df, gby = 'name'): 145 | # df = df.groupby(gby).agg('mean').reset_index() 146 | # df["rank"] = df["count"].rank(ascending = False) 147 | # return df.sort_values('rank') 148 | 149 | # foo 150 | 151 | 152 | # full_period_comp = cp\ 153 | # .merge(fp, 154 | # on = ['name', 'date'], 155 | # how = 'outer', 156 | # suffixes=('', '_fb'))\ 157 | # .sort_values('rank') 158 | 159 | 160 | 161 | 162 | #-----------------------------------------------------------------# 163 | # Aggregated merge 164 | 165 | # Create full period ranking 166 | def agg_rank(df, gby = 'name'): 167 | df = df.groupby(gby).agg('mean').reset_index() 168 | df["rank"] = df["count"].rank(ascending = False) 169 | return df.sort_values('rank') 170 | 171 | cp_rank = agg_rank(cp) 172 | fp_rank = agg_rank(fp) 173 | 174 | 175 | 176 | full_period_comp = cp_rank\ 177 | .merge(fp_rank, 178 | on = 'name', 179 | how = 'outer', 180 | suffixes=('', '_fb'))\ 181 | .sort_values('rank') 182 | 
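#-----------------------------------------------------------------#
# Illustrative addition (not part of the original draft): a quick
# rank-agreement summary for the full-period comparison built above.
# Uses the 'rank' and 'rank_fb' columns created by the merge.
rank_corr = full_period_comp[['rank', 'rank_fb']]\
    .corr(method='spearman')\
    .loc['rank', 'rank_fb']
print('Spearman rank correlation, CDR vs FB district ranks: {:.3f}'.format(rank_corr))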
183 | #-----------------------------------------------------------------# 184 | # Export 185 | 186 | # full_period_comp.to_csv(OUT_path + 'i3_fb_comp.csv', 187 | # index = False) 188 | 189 | 190 | # agg_rank(fm, ['st_name', 'ed_name']) 191 | # agg_rank(cm, ['st_name', 'ed_name']) 192 | 193 | fp_rank.sort_values('name') -------------------------------------------------------------------------------- /data-checks/Archive/data_files_comparisson.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # CSV comparisson 3 | #-----------------------------------------------------------------# 4 | 5 | import os 6 | import re 7 | import numpy as np 8 | import pandas as pd 9 | import datetime as dt 10 | import matplotlib.pyplot as plt 11 | # import seaborn as sns 12 | from datetime import datetime 13 | 14 | 15 | IRESULTS = DATA_path + "Isaac-results/" 16 | 17 | IFLOW = IRESULTS + "flowminder/" 18 | ICUST = IRESULTS + "custom/" 19 | 20 | #-----------------------------------------------------------------# 21 | # Load files 22 | 23 | filenames = os.listdir(IFLOW) 24 | 25 | # Custom indicatos 26 | # filenames = os.listdir(ICUST) 27 | 28 | #-----------------------------------------------------------------# 29 | # Make sure data is compatible 30 | 31 | # Masure order and date formats are the same 32 | def compat(data, 33 | timevar, 34 | standardize_time = False, 35 | regvar = 'region'): 36 | new_data = data 37 | 38 | # If has a date convert to standard 39 | if len(timevar) != 0: 40 | timevar = np.asscalar(np.array(timevar)) 41 | new_data[timevar] = pd.to_datetime(new_data[timevar]).dt.date 42 | # Make sure order is the same 43 | if len(data.columns) == 2: 44 | new_data = new_data.sort_values( by = [new_data.columns[0], new_data.columns[1] ]) 45 | else : 46 | new_data = new_data.sort_values( by = [new_data.columns[0], new_data.columns[1], new_data.columns[2] ]) 47 | return new_data 48 | 49 | 50 | # Comparisson outputs function 51 | def compare_dfs(df1,df2, filename = None, outputdf = False): 52 | # Set time var (GARMBIARRA WARNING) 53 | time = list(df1.columns[list(df1.columns.str.contains('date'))]) 54 | # Process data to be in the same format 55 | df1 = compat(df1, timevar = time) 56 | df2 = compat(df2, timevar = time) 57 | # Merge dfs 58 | index_cols = list(df1.columns[0:-1]) 59 | #Make sure merging columns are str 60 | df1[index_cols] = df1[index_cols].astype(str) 61 | df2[index_cols] = df2[index_cols].astype(str) 62 | cdf = df1.merge(df2, left_on = index_cols, right_on = index_cols) 63 | #--------------------# 64 | # Calculate differeces 65 | # Proportion of mismatches 66 | p_rows_diff = sum(cdf[cdf.columns[-1]] != cdf[cdf.columns[-2]])/cdf.shape[0] 67 | p_rows_diff = str(round(p_rows_diff, 4)*100) 68 | # Value difference 69 | cdf['pdiff'] = ((cdf[cdf.columns[-1]] - 70 | cdf[cdf.columns[-2]])/cdf[cdf.columns[-2]]) 71 | # Average difference 72 | avg_diff = str(round(cdf['pdiff'].mean(skipna = True), 4)*100) 73 | 74 | if outputdf: 75 | return(cdf) 76 | else: 77 | # Print report 78 | print(filename) 79 | print('N rows ours: ' + str(df1.shape[0]) ) 80 | print("N rows Isaac's: " + str(df2.shape[0])) 81 | print('Of matching rows:') 82 | print(' - Average difference of count column: ' + avg_diff + "%") 83 | print(' - Percentage rows that are different: ' + p_rows_diff + "%") 84 | print('\n') 85 | 86 | 87 | #-----------------------------------------------------------------# 88 | # Flowminder csvs 89 | for i 
in range(0, len(filenames)-1): 90 | file_i = filenames[i] 91 | # print(i) 92 | # print(filenames[i]) 93 | # Our file 94 | d1 = pd.read_csv(FLOWM_adm3_path + file_i) 95 | # I's file 96 | d2 = pd.read_csv(IFLOW + file_i) 97 | 98 | # Run comparisson 99 | print(i) 100 | print(filenames[i]) 101 | compare_dfs(d1,d2) 102 | 103 | #-----------------------------------------------------------------# 104 | # Custom indicatos csv 105 | 106 | # Indicator 1 # 107 | 108 | i1 = pd.read_csv(I1_Adm3_path + 'transactions_per_hour.csv') 109 | i1i = pd.read_csv(ICUST + 'transactions_per_hour.csv') 110 | 111 | # i1 = compat(i1, timevar = []) 112 | # i1i = compat(i1i, timevar = []) 113 | 114 | cdf = compare_dfs(i1,i1i, outputdf = True) 115 | cdf["diff_flag"] = cdf["count_x"] != cdf["count_y"] 116 | cdf['date'] = pd.to_datetime(cdf['hour']).dt.date 117 | 118 | 119 | foo = cdf[cdf['diff_flag']] 120 | 121 | foo.to_csv('C:/Users/wb519128/Desktop/i1_differences.csv', 122 | index = False) 123 | 124 | # Indicator 3 # 125 | 126 | i3 = pd.read_csv(I3_Adm3_path + 'unique_subscribers_per_day.csv') 127 | i3i = pd.read_csv(ICUST + 'unique_subscribers_per_day.csv') 128 | 129 | cdf3 = compare_dfs(i3,i3i, outputdf = True) 130 | cdf3["diff_flag"] = cdf3["count_x"] != cdf3["count_y"] 131 | cdf3['day'] = pd.to_datetime(cdf3['day']).dt.date 132 | 133 | 134 | cdf3[cdf3['day'] == dt.date(2020, 2, 3)] 135 | 136 | foo = cdf3[cdf3['diff_flag']] 137 | 138 | foo.to_csv('C:/Users/wb519128/Desktop/i3_differences.csv', 139 | index = False) 140 | 141 | 142 | 143 | # Indicator 5 # 144 | I5_Adm3_path 145 | 146 | i5 = pd.read_csv(I5_Adm3_path + 'origin_destination_connection_matrix_per_day.csv') 147 | i5i = pd.read_csv(ICUST + 'origin_destination_connection_matrix_per_day.csv') 148 | 149 | #cdf5 = compare_dfs(i5,i5i, outputdf = True) 150 | #cdf5["diff_flag"] = cdf5["total_count_x"] != cdf5["total_count_y"] 151 | 152 | compare_dfs(i5,i5i, outputdf = False) 153 | 154 | bar = i5.merge(i5i, on = ['connection_date', 'region_from', 'region_to']) 155 | bar["diff_flag"] = bar["od_count_x"] != bar["od_count_y"] 156 | 157 | diff_day_df = bar.groupby('connection_date').sum() 158 | diff_day_df = diff_day_df.reset_index() 159 | 160 | diff_day_df['day'] = pd.to_datetime(diff_day_df['connection_date']).dt.day 161 | 162 | plt.plot('day', 163 | 'diff', 164 | data = diff_day_df) 165 | 166 | # set(bar['connection_date']) 167 | # len(set(pd.to_datetime(foo['connection_date']).dt.date)) 168 | # len(set(foo['region_from'])) 169 | 170 | # # Absolute difference by day 171 | # bar['diff'] = bar['od_count_x']- bar['od_count_y'] 172 | 173 | 174 | 175 | # foo = bar[bar['diff_flag']] 176 | 177 | # foo['diff'] = foo['od_count_x']- foo['od_count_y'] 178 | # foo['diff'].mean() 179 | 180 | export_i5_merged = bar.rename( 181 | columns = { 182 | 'subscriber_count_x' : 'subscriber_count', 183 | 'subscriber_count_y' : 'subscriber_count_isaac', 184 | 'od_count_x': 'od_count_x', 185 | 'od_count_y': 'od_count_isaac', 186 | 'total_count_x' : 'total_count', 187 | 'total_count_y' : 'total_count_isaac'}) 188 | 189 | 190 | 191 | export_i5_merged\ 192 | .to_csv('C:/Users/wb519128/Desktop/i5_merged_with_Isaacs.csv', 193 | index = False) 194 | 195 | 196 | 197 | 198 | #-----------------------------------------------------------------# 199 | # DRAFT 200 | 201 | file_i = filenames[0] 202 | 203 | d1 = pd.read_csv(FLOWM_adm3_path + file_i) 204 | d2 = pd.read_csv(IFLOW + file_i) 205 | 206 | cdf = compare_dfs(d1,d2, outputdf = True) 207 | cdf["diff_flag"] = cdf["count_x"] != cdf["count_y"] 
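#-----------------------------------------------------------------#
# Illustrative addition (not part of the original draft): a one-line
# summary of how many merged rows were flagged as different in the
# draft comparison above.
n_flagged = int(cdf['diff_flag'].sum())
print('Rows compared: {:,}; rows flagged as different: {:,} ({:.2%})'.format(
    len(cdf), n_flagged, n_flagged / len(cdf)))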
-------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/outliers.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | # Class to help counting outliers 3 | class outlier_counter: 4 | """Class to count outliers 5 | 6 | Attributes 7 | ---------- 8 | calls : a dataframe. which data to process 9 | spark : an initialised spark connection. 10 | thresholds : a dictionary with outlier thresholds to be used. 11 | 12 | Methods 13 | ------- 14 | count() 15 | count outliers and print results 16 | 17 | print_results(df) 18 | print results of outlier counts 19 | """ 20 | 21 | def __init__(self, 22 | calls, 23 | spark = spark, 24 | thresholds = {'min_transactions' : 3, 25 | 'max_avg_transactions' : 100, 26 | 'max_transactions_in_single_day' : 200}): 27 | """ 28 | Parameters 29 | ---------- 30 | 31 | """ 32 | self.calls = calls 33 | self.spark = spark 34 | self.counts = {} 35 | self.dfs = {} 36 | self.thresholds = thresholds 37 | 38 | 39 | def count(self): 40 | # count all records 41 | self.counts['all_records'] = self.calls.count() 42 | 43 | # count of days in dataframe 44 | self.counts['number_of_days'] = self.calls.select('call_date').distinct().count() 45 | 46 | # Count number of distinct users 47 | self.counts['distinct_ids'] = self.calls.select('msisdn').distinct().count() 48 | 49 | # Get # of records per user 50 | self.dfs['records_per_user'] = self.calls.groupby('msisdn').count() 51 | 52 | # Get # of records per user per day 53 | self.dfs['records_per_user_per_day'] = self.calls.groupby('msisdn', 'call_date').count() 54 | 55 | # Identify daily usage outlier msidsdn 56 | self.dfs['too_few_transactions'] = self.dfs['records_per_user']\ 57 | .where(F.col('count') < self.thresholds['min_transactions'])\ 58 | .select('msisdn').distinct() 59 | self.dfs['too_many_avg_transactions'] = self.dfs['records_per_user']\ 60 | .where(F.col('count') > (self.counts['number_of_days'] * \ 61 | self.thresholds['max_avg_transactions']))\ 62 | .select('msisdn').distinct() # more than __ calls and texts per day on average 63 | self.dfs['too_many_transactions_in_single_day'] = \ 64 | self.dfs['records_per_user_per_day']\ 65 | .where(F.col('count') > self.thresholds['max_transactions_in_single_day'])\ 66 | .select('msisdn').distinct() # more than __ calls and texts in a single day 67 | 68 | # Count the outlier accounts 69 | self.counts['too_few_transactions'] = \ 70 | self.dfs['too_few_transactions'].count() 71 | self.counts['too_many_avg_transactions'] = \ 72 | self.dfs['too_many_avg_transactions'].count() 73 | self.counts['too_many_transactions_in_single_day'] = \ 74 | self.dfs['too_many_transactions_in_single_day'].count() 75 | 76 | # Caclulate the outlier account fraction 77 | self.counts['too_few_transactions_fraction'] = \ 78 | self.counts['too_few_transactions'] / self.counts['distinct_ids'] 79 | self.counts['too_many_avg_transactions_fraction'] = \ 80 | self.counts['too_many_avg_transactions'] / self.counts['distinct_ids'] 81 | self.counts['too_many_transactions_in_single_day_fraction'] = \ 82 | self.counts['too_many_transactions_in_single_day'] / self.counts['distinct_ids'] 83 | 84 | # Keep only ids that aren't among the outlier accounts 85 | self.filtered_transactions = self.calls.join(self.dfs['too_few_transactions'], 86 | self.calls['msisdn'] == \ 87 | self.dfs['too_few_transactions']['msisdn'], 88 | how ='leftanti').select(self.calls.columns[0:]) 89 | self.filtered_transactions = 
self.filtered_transactions\ 90 | .join(self.dfs['too_many_avg_transactions'], 91 | self.filtered_transactions['msisdn'] == \ 92 | self.dfs['too_many_avg_transactions']['msisdn'], 93 | how ='leftanti')\ 94 | .select(self.filtered_transactions.columns[0:]) 95 | self.filtered_transactions = self.filtered_transactions\ 96 | .join(self.dfs['too_many_transactions_in_single_day'], 97 | self.filtered_transactions['msisdn'] == \ 98 | self.dfs['too_many_transactions_in_single_day']['msisdn'], 99 | how ='leftanti')\ 100 | .select(self.filtered_transactions.columns[0:]) 101 | 102 | # count how many we kept and dropped 103 | self.counts['filtered_transactions'] = self.filtered_transactions.count() 104 | self.counts['dropped_calls'] = \ 105 | self.counts['all_records'] - self.counts['filtered_transactions'] 106 | self.print_results() 107 | 108 | 109 | def print_results(self): 110 | print('Total number of unique SIMs: {:,}'.format(self.counts['distinct_ids'])) 111 | print('Number of SIMs with less than {} transactions: {:,}'\ 112 | .format(self.thresholds['min_transactions'], 113 | self.counts['too_few_transactions'])) 114 | print('Number of SIMs with more than {} transactions per day on average: {:,}'\ 115 | .format(self.thresholds['max_avg_transactions'], 116 | self.counts['too_many_avg_transactions'] )) 117 | print('Number of SIMs with more than {} transactions in a single day: {:,}'\ 118 | .format(self.thresholds['max_transactions_in_single_day'], 119 | self.counts['too_many_transactions_in_single_day'])) 120 | print('SIMs with less than {} transactions as a fraction of all accounts: {:.8f}'\ 121 | .format(self.thresholds['min_transactions'], 122 | self.counts['too_few_transactions_fraction'])) 123 | print('SIMs with more than {} transactions per day on average as a fraction of all accounts: {:.8f}'\ 124 | .format(self.thresholds['max_avg_transactions'], 125 | self.counts['too_many_avg_transactions_fraction'])) 126 | print('SIMs with more than {} transactions on a single day as a fraction of all accounts: {:.8f}'\ 127 | .format(self.thresholds['max_transactions_in_single_day'], 128 | self.counts['too_many_transactions_in_single_day_fraction'])) 129 | print('Number of transactions that would be kept: {:,}'\ 130 | .format(self.counts['filtered_transactions'])) 131 | print('Number of transactions that would be deleted: {:,}'\ 132 | .format(self.counts['dropped_calls'])) 133 | print('Fraction of transactions that would be deleted: {:.8f}'\ 134 | .format(self.counts['dropped_calls'] / self.counts['all_records'])) -------------------------------------------------------------------------------- /data-checks/Archive/01_completenes_checks.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # DATA CHECKS - Completeness checks 3 | #-----------------------------------------------------------------# 4 | 5 | #-----------------------------------------------------------------# 6 | # Settings 7 | 8 | from globals import * 9 | 10 | EXPORT_FIGURES = True 11 | 12 | # Default variable names 13 | timevar = 'hour' 14 | regvar = 'region' 15 | 16 | INDICATORS_path = DATA_path + 'isaac-results/Archive/e_23_07_2020_converage_23_05_to_30_06/' 17 | 18 | #-----------------------------------------------------------------# 19 | # Load data 20 | 21 | # Define loading function that depends on the existing folder 22 | # structure but also remove headers in the middle of the data if 23 | # if there is any 24 | def 
loadfiles(file_name, 25 | admin = 3, 26 | path = INDICATORS_path): 27 | print(file_name, admin) 28 | # Load external file 29 | folder = path + 'admin' + str(admin) + '/' 30 | de = None 31 | de = pd.read_csv(folder + file_name) 32 | # Patch cleannig of headers in the middle of the data 33 | c1_name = de.columns[0] 34 | de = de[~de[c1_name].astype(str).str.contains(c1_name)] 35 | return(de) 36 | 37 | 38 | # Indicator 1 39 | fi = loadfiles(file_name = 'transactions_per_hour.csv') 40 | 41 | # Indicator 2 42 | f2 = loadfiles('unique_subscribers_per_day.csv') 43 | 44 | # Indicator 5 45 | f5 = loadfiles('origin_destination_connection_matrix_per_day.csv') 46 | 47 | # Indicator 9 48 | f9 = loadfiles('week_home_vs_day_location_per_day.csv', admin = 2) 49 | 50 | 51 | #-----------------------------------------------------------------# 52 | # Processing data 53 | 54 | # Remove missings 55 | reg_missings_bol = fi['region'].isin(missing_values) 56 | fi_cl = fi[~reg_missings_bol] 57 | 58 | # Check for duplicates 59 | # sum(fi_cl.duplicated()) 60 | fi_cl['count'] = fi_cl['count'].astype(int) 61 | 62 | # Date vars 63 | fi_cl['date'] = pd.to_datetime(fi_cl['hour']).dt.date 64 | # fi_cl['hour'] = pd.to_datetime(fi_cl[timevar]).dt.hour 65 | # fi_cl['month'] = pd.to_datetime(fi_cl['date']).dt.month 66 | 67 | # Make sure dates are datetime 68 | fi_cl['hour'] = fi_cl['hour'].astype('datetime64') 69 | 70 | 71 | # I5 72 | f5['date'] = pd.to_datetime(f5['connection_date']).dt.date 73 | 74 | 75 | #-----------------------------------------------------------------# 76 | # Create aggregated datasets to the country level for ploting 77 | 78 | #---------------------------- 79 | # I1 - transactions per hour 80 | 81 | # Create plots data 82 | f1_agg_hour = fi_cl\ 83 | .groupby(['date', 'hour'])\ 84 | .agg({'region' : pd.Series.nunique , 85 | 'count' : np.sum})\ 86 | .reset_index()\ 87 | .sort_values(['date', 'hour'])\ 88 | .rename(columns = {'region' : 'n_regions'}) 89 | 90 | f1_agg_date = fi_cl\ 91 | .groupby('date')\ 92 | .agg({'region' : pd.Series.nunique , 93 | 'count' : np.sum})\ 94 | .reset_index()\ 95 | .sort_values(['date'])\ 96 | .rename(columns = {'region' : 'n_regions'}) 97 | 98 | #---------------------------- 99 | # I5 - OD matrix per day data 100 | 101 | f5['date'] = pd.to_datetime(f5['connection_date']).dt.date 102 | 103 | f5_agg_date = f5\ 104 | .groupby('date')\ 105 | .agg({'region_from' : pd.Series.nunique , 106 | 'region_to' : pd.Series.nunique, 107 | 'total_count' : np.sum})\ 108 | .reset_index()\ 109 | .sort_values('date') 110 | 111 | #---------------------------- 112 | # Complete dates and time 113 | 114 | # Create data sets with time indexes and fill blanks with 0s 115 | def time_complete(data, timevar = timevar, timefreq = 'D'): 116 | data[timevar] = data[timevar].astype('datetime64') 117 | full_time_range = pd.date_range(data[timevar].min(), 118 | data[timevar].max(), 119 | freq = timefreq) 120 | data = data.set_index(timevar) 121 | data = data.reindex(full_time_range, fill_value=0) 122 | return(data) 123 | 124 | f1_agg_date = time_complete(f1_agg_date, 'date') 125 | f1_agg_hour = time_complete(f1_agg_hour, 'hour', 'H') 126 | f5_agg_date = time_complete(f5_agg_date, 'date') 127 | 128 | #-----------------------------------------------------------------# 129 | # I1 - Day Plots 130 | 131 | # PLot number of regions with transactions per day. 
132 | 133 | # Number of regions plot 134 | plt.figure(figsize=(12, 6)) 135 | date_plot = sns.lineplot(f1_agg_date.index, 136 | f1_agg_date['n_regions']) 137 | # Export 138 | date_plot.figure.savefig(OUT_path + "i1_dates_ward_count.png") 139 | 140 | 141 | # Number of transactions plot 142 | plt.figure(figsize=(12, 6)) 143 | obs_per_day_plot = sns.lineplot( 144 | f1_agg_date.index, 145 | f1_agg_date['count']) 146 | # Export 147 | if EXPORT_FIGURES: 148 | obs_per_day_plot.figure.savefig(OUT_path + "i1_dates_n_obs.png") 149 | 150 | 151 | #-----------------------------------------------------------------# 152 | # I1 - Hour Plots 153 | 154 | # Plot total number of transactions per hour to check for outliers 155 | 156 | #------------------ 157 | # Number of regions 158 | plt.figure(figsize=(12, 6)) 159 | hour_plot = sns.lineplot( 160 | f1_agg_hour.index, 161 | f1_agg_hour['n_regions']) 162 | 163 | # Cosmetics 164 | # x_ticks = list(set(fi_agg_hour['hour'].astype(str)))[0:len(fi_agg_hour):5] 165 | # x_ticks.sort() 166 | # hour_plot.set_xticklabels(x_ticks) 167 | 168 | # Export 169 | if EXPORT_FIGURES: 170 | hour_plot.figure.savefig(OUT_path + "i1_hours_ward_count.png") 171 | 172 | #---------------------------- 173 | # Total count of transactions 174 | plt.figure(figsize=(12, 6)) 175 | obs_per_hour_plot = sns.lineplot( 176 | f1_agg_hour.index.values, 177 | f1_agg_hour['count']) 178 | 179 | # Cosmetics 180 | # x_ticks = list(set(fi_agg_hour['date'].astype(str)))[0:len(fi_agg_hour):5] 181 | # x_ticks.sort() 182 | # obs_per_hour_plot.set_xticklabels(x_ticks) 183 | 184 | # Export 185 | if EXPORT_FIGURES: 186 | obs_per_hour_plot.figure.savefig(OUT_path + "i1_hours_n_obs.png") 187 | 188 | 189 | # Table with hours 190 | # fi_obs_per_hour[fi_obs_per_hour['date'] == dt.date(2020, 4, 30)] 191 | # apr30 = f1_agg_hour[f1_agg_hour['date'] == dt.date(2020, 4, 30)] 192 | 193 | # apr30.to_csv(OUT_path + "i1_hour_apr30.csv", 194 | # index = False) 195 | 196 | 197 | #-----------------------------------------------------------------# 198 | # I5 - Day Plots 199 | 200 | # Plot total number of movements per day 201 | 202 | # plot total count 203 | f5_plot = sns.lineplot( 204 | f5_agg_date.index, 205 | f5_agg_date['total_count']) 206 | # Export 207 | if EXPORT_FIGURES: 208 | f5_plot.figure.savefig(OUT_path + "i5_dates_total_count.png") 209 | 210 | 211 | #-----------------------------------------------------------------# 212 | # I9 - Week plots 213 | 214 | 215 | # f9_plot = sns.lineplot( 216 | # f9_agg_date['week'], 217 | # f9_agg_date['mean_distance']) 218 | # # Export 219 | # f9_plot.figure.savefig(OUT_path + "i9_week_mean_distance.png") 220 | -------------------------------------------------------------------------------- /data-panel/Archive/panel_draft2.py: -------------------------------------------------------------------------------- 1 | 2 | # Custom suffixes? 3 | # Class?? 
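# Draft: builds admin-2 panels for i5 and i7 by appending externally
# produced extractions, switching source at fixed cut-over dates
# (2020-03-15 and 2020-04-01).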
4 | 5 | EXPORT = False 6 | 7 | #-----------------------------------------------------------------# 8 | # Settings 9 | 10 | import os 11 | import re 12 | import pandas as pd 13 | import numpy as np 14 | import datetime as dt 15 | 16 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/" 17 | DATA_POC = DATA_path + "proof-of-concept/" 18 | 19 | OUT_panel = DATA_POC + "panel_indicators/" 20 | 21 | 22 | # CHANGE: 23 | IRESULTS = DATA_path + "Isaac-results/" 24 | 25 | IFLOW_path = IRESULTS + "flowminder/" 26 | ICUST_path = IRESULTS + "custom/" 27 | 28 | INEW_PATH_2_mar = IRESULTS + "Archive/e_17_06_2020_coverage_03_to_04/admin2_priority/mar1-mar31/" 29 | INEW_PATH_2_apr = IRESULTS + "Archive/e_17_06_2020_coverage_03_to_04/admin2_priority/mar23-apr30/" 30 | 31 | INEW_PATH_3_mar = IRESULTS + "Archive/e_17_06_2020_coverage_03_to_04/admin3_priority/mar1-mar31/" 32 | INEW_PATH_3_apr = IRESULTS + "Archive/e_17_06_2020_coverage_03_to_04/admin3_priority/mar23-apr30/" 33 | 34 | 35 | IOLD_PATH_2_mar = IRESULTS + "custom/admin2/" 36 | IOLD_PATH_3_mar = IRESULTS + "Archive/e_08_06_2020_coverage_04_to_05/admin3_custom/" 37 | 38 | 39 | # Load list of internal indicators to make it 40 | # easier to bulk load files 41 | DATA_path = "C:/Users/wb519128/WBG/Sveta Milusheva - COVID 19 Results/" 42 | 43 | internal_indicators = pd.read_csv(DATA_POC + 'indicators_list.csv') 44 | internal_indicators['path'] = DATA_path + internal_indicators['path'] 45 | 46 | # Load files function 47 | def loadfiles(file_name, 48 | files_df = internal_indicators, 49 | admin = 3, 50 | path_external = None): 51 | if path_external is None: 52 | # Set intex 53 | idx = files_df[(files_df['file'] == file_name) & (files_df['level'] == admin)].index.values[0] # Load internal 54 | # Custom file names for i5, i7 and i9 55 | if file_name in ['mean_distance_per_day', 56 | 'origin_destination_connection_matrix_per_day', 57 | 'mean_distance_per_week', 58 | 'month_home_vs_day_location_per_day', 59 | 'week_home_vs_day_location_per_day']: 60 | file_name_i = file_name + '_7day_limit.csv' 61 | else: 62 | file_name_i = file_name + '.csv' 63 | # External names 64 | print(file_name, admin) 65 | # Load data 66 | d = None 67 | d = pd.read_csv(files_df['path'][idx] + file_name_i) 68 | else: 69 | print(file_name) 70 | file_name = file_name + '.csv' 71 | d = None 72 | d = pd.read_csv(path_external + file_name) 73 | # Patch clean of headers in the middle of the data 74 | c1_name = d.columns[0] 75 | d = d[~d[c1_name].astype(str).str.contains(c1_name)] 76 | # Turn everything to string for simplicity 77 | d.astype(str) 78 | return d 79 | 80 | # i1 = loadfiles('transactions_per_hour') 81 | 82 | # i1e = loadfiles('transactions_per_hour', 83 | # path_external= INEW_PATH_3_apr) 84 | 85 | # Drop custom missigs 86 | def drop_custna(data, columns): 87 | na_list = ['nan', '', '99999', float("inf")] 88 | for cols in columns: 89 | data = data[~(data[cols].isin(na_list))] 90 | return(data) 91 | 92 | # Clean function 93 | def clean(d, index_cols): 94 | # Remove missins 95 | d = d.dropna() 96 | # All but the last column 97 | #index_cols = list(d.columns[0:-1]) 98 | # d = drop_custna(d, index_cols) 99 | return(d) 100 | 101 | #-----------------------------------------------------------------# 102 | # Load indicators 103 | i5_index = ['connection_date', 'region_from', 'region_to'] 104 | 105 | i5 = loadfiles('origin_destination_connection_matrix_per_day', 106 | admin = 2) 107 | i5e_mar = loadfiles('origin_destination_connection_matrix_per_day', 108 | 
path_external= INEW_PATH_2_mar) 109 | i5e_apr = loadfiles('origin_destination_connection_matrix_per_day', 110 | path_external= INEW_PATH_2_apr) 111 | 112 | i7_index = ['home_region', 'day'] 113 | i7 = loadfiles('mean_distance_per_day', admin = 2) 114 | 115 | # March files where only rerun for i5 and i9 so I'm using the old extraction from feb to apr 116 | i7e_mar = loadfiles('mean_distance_per_day', 117 | path_external= IOLD_PATH_2_mar) 118 | i7e_apr = loadfiles('mean_distance_per_day', 119 | path_external= INEW_PATH_2_apr) 120 | 121 | #-----------------------------------------------------------------# 122 | # Panel 123 | # Create panel 124 | def panel(d, 125 | de, 126 | index_cols, 127 | #countvars, 128 | r_suffix = '_ecnt', 129 | timevar = None, 130 | how = 'outer'): 131 | if timevar is None: 132 | timevar = index_cols[0] 133 | # MAke sure time var is date 134 | d[timevar] = d[timevar].astype('datetime64') 135 | de[timevar] = de[timevar].astype('datetime64') 136 | # Join 137 | md = d.merge(de, 138 | on = index_cols, 139 | how = how, 140 | suffixes=('', r_suffix)) 141 | return md 142 | 143 | 144 | d1_bol = (p7['day'] >= np.datetime64(dt.date(2020, 3, 15))) 145 | d2_bol = (p7['day'] >= np.datetime64(dt.date(2020, 4, 1))) 146 | 147 | #--------# 148 | # i5 Panel 149 | p5 = panel(i5, i5e_mar, i5_index, timevar = 'connection_date') 150 | p5 = panel(p5, 151 | i5e_apr, 152 | i5_index, 153 | r_suffix= '_ecnt_apr', 154 | timevar = 'connection_date') 155 | 156 | d1_bol = (p5['connection_date'] >= np.datetime64(dt.date(2020, 3, 15))) 157 | d2_bol = (p5['connection_date'] >= np.datetime64(dt.date(2020, 4, 1))) 158 | 159 | 160 | countvars = ['subscriber_count','od_count', 'total_count'] 161 | for var in countvars: 162 | varname = var + '_p' 163 | # Base value as our indicator 164 | p5[varname] = p5[var] 165 | # Replace values based on dates 166 | p5.loc[d1_bol, varname] = p5.loc[d1_bol, var + '_ecnt'] 167 | p5.loc[d2_bol, varname] = p5.loc[d2_bol, var + '_ecnt_apr'] 168 | 169 | p5 = p5.dropna(subset = ['connection_date']).sort_values(i5_index) 170 | 171 | # p5.to_csv('C:/Users/wb519128/Desktop/i5_test.csv', index = False) 172 | 173 | if EXPORT: 174 | p5.to_csv(OUT_panel + 'i5_admin2_temp.csv', index = False) 175 | 176 | #--------# 177 | # i7 Panel 178 | p7 = panel(i7, i7e_mar, i7_index, timevar = 'day') 179 | p7 = panel(p7, 180 | i7e_apr, 181 | i7_index, 182 | r_suffix= '_ecnt_apr', 183 | timevar = 'day') 184 | 185 | 186 | d1_bol = (p7['day'] >= np.datetime64(dt.date(2020, 3, 15))) 187 | d2_bol = (p7['day'] >= np.datetime64(dt.date(2020, 4, 1))) 188 | 189 | countvars = ['mean_distance', 'stdev_distance'] 190 | for var in countvars: 191 | varname = var + '_p' 192 | # Base value as our indicator 193 | p7[varname] = p7[var] 194 | # Replace values based on dates 195 | p7.loc[d1_bol, varname] = p7.loc[d1_bol, var + '_ecnt'] 196 | p7.loc[d2_bol, varname] = p7.loc[d2_bol, var + '_ecnt_apr'] 197 | 198 | 199 | 200 | 201 | p7 = p7.dropna(subset = ['day']).sort_values(i7_index) 202 | 203 | # Export 204 | if EXPORT: 205 | p7.to_csv(OUT_panel + 'i7_admin2_temp.csv', index = False) 206 | 207 | 208 | # p7.to_csv('C:/Users/wb519128/Desktop/i7_test.csv', index = False) 209 | 210 | -------------------------------------------------------------------------------- /data-panel/Archive/panel_draft.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # CREATE PANEL 3 | 
#-----------------------------------------------------------------# 4 | 5 | # This file combines two different sources of the indicators created 6 | # in cdr-aggregation to create a panel. 7 | 8 | # Dates at which different sources are connected are specific to 9 | # each indicator and the particularities of those sources 10 | 11 | #-----------------------------------------------------------------# 12 | # TO DO 13 | 14 | # Rewrite load function 15 | 16 | # Reorganize file paths to remove dependency on MASTER.py 17 | 18 | #-----------------------------------------------------------------# 19 | # Settings 20 | 21 | import os 22 | import re 23 | import pandas as pd 24 | import numpy as np 25 | import datetime as dt 26 | 27 | #-----------------------------------------------------------------# 28 | # Globals 29 | 30 | # Default connection date. 31 | append_date = dt.date(2020, 3, 15) 32 | 33 | #-----------------------------------------------------------------# 34 | # Function definitions 35 | 36 | # Drop custom missigs 37 | def drop_custna(data, columns): 38 | na_list = ['nan', '', '99999', float("inf")] 39 | for cols in columns: 40 | data = data[~(data[cols].isin(na_list))] 41 | return(data) 42 | 43 | # Load files function 44 | def loadfiles(file_name, 45 | files_df = internal_indicators, 46 | admin = 3): 47 | # Set intex 48 | idx = files_df[(files_df['file'] == file_name) & (files_df['level'] == admin)].index.values[0] # Load internal 49 | # Custom file names for i5, i7 and i9 50 | if file_name in ['mean_distance_per_day', 51 | 'origin_destination_connection_matrix_per_day', 52 | 'mean_distance_per_week', 53 | 'month_home_vs_day_location_per_day', 54 | 'week_home_vs_day_location_per_day']: 55 | file_name_i = file_name + '_7day_limit.csv' 56 | else: 57 | file_name_i = file_name + '.csv' 58 | # External names 59 | file_name_e = file_name + '.csv' 60 | print(file_name, admin) 61 | # Load data 62 | d = None 63 | d = pd.read_csv(files_df['path'][idx] + file_name_i) 64 | # Load external 65 | if files_df['indicator'][idx] == 'flow': 66 | ext_path = IFLOW_path 67 | else: 68 | ext_path = ICUST_path 69 | # Load external file 70 | ext_folder = ext_path + 'admin' + str(files_df['level'][idx]) + '/' 71 | de = None 72 | de = pd.read_csv(ext_folder + file_name_e) 73 | # Patch cleannig of headers in the middle of the data 74 | c1_name = d.columns[0] 75 | de = de[~de[c1_name].astype(str).str.contains(c1_name)] 76 | return([d, de]) 77 | 78 | # Clean function 79 | def clean(d, index_cols): 80 | # Remove missins 81 | d = d.dropna() 82 | # All but the last column 83 | #index_cols = list(d.columns[0:-1]) 84 | d = drop_custna(d, index_cols) 85 | return(d) 86 | 87 | # Create panel 88 | def simp_panel(d, 89 | de, 90 | index_cols, 91 | #countvars, 92 | append_date, 93 | compare = False, 94 | timevar = None, 95 | how = 'outer'): 96 | if timevar is None: 97 | timevar = index_cols[0] 98 | # Clean 99 | d = clean(d, index_cols) 100 | de = clean(de, index_cols) 101 | # Join 102 | md = d.merge(de, 103 | on = index_cols, 104 | how = how, 105 | suffixes=('', '_ecnt')) 106 | # Replace count values with internal until the 7th of march and 107 | # external after 108 | countvars = list(set(d.columns) - set(index_cols)) 109 | for var in countvars: 110 | if compare: 111 | varname = var + '_p' 112 | else: 113 | varname = var 114 | 115 | md[varname] = np.where(pd.to_datetime(md[timevar]).dt.date <= append_date, 116 | md[var], 117 | md[var + '_ecnt']) 118 | # Remove other columns 119 | if not compare: 120 | md = 
md.filter(regex=r'^((?!_ecnt).)*$') 121 | # Return 122 | return md.sort_values(index_cols).dropna(subset= index_cols) 123 | 124 | #-----------------------------------------------------------------# 125 | # Load indicators 126 | 127 | # Define indicator class that 128 | class i_indicator: 129 | """ 130 | This class contains information to load indicator files both 131 | from our original indicators and externally created ones. 132 | 133 | load() method loads both datasets 134 | clean() method removes missings from both datasets 135 | """ 136 | def __init__(self, 137 | file_name, 138 | index_cols, 139 | admin = 3): 140 | self.file_name = file_name 141 | self.index_cols = index_cols 142 | self.admin = admin 143 | # Call methods when intializing 144 | self.load() 145 | self.clean() 146 | # Load data 147 | def load(self): 148 | self.data, self.data_e = loadfiles(self.file_name, 149 | admin = self.admin) 150 | # Clean data 151 | def clean(self): 152 | self.data = clean(self.data, self.index_cols) 153 | self.data_e = clean(self.data_e, self.index_cols) 154 | 155 | # Create panel data 156 | def create_panel(self, 157 | timevar = None, 158 | compare = False, 159 | append_date = append_date): 160 | panel = simp_panel(self.data, 161 | self.data_e, 162 | self.index_cols, 163 | append_date, 164 | compare = compare, 165 | timevar=timevar) 166 | return panel 167 | 168 | # Indicator 1 169 | # Sum across all observations in the given hour and lowest admin 170 | # area. 171 | i1 = i_indicator('transactions_per_hour', 172 | ['hour', 'region']) 173 | 174 | # Indicator 2 175 | # Sum all unique subscribers with an observation in the given 176 | # admin area and time period. 177 | i2 = i_indicator('unique_subscribers_per_hour', 178 | ['hour', 'region']) 179 | 180 | 181 | # Indicator 3 182 | # Sum all unique subscribers with an observation in the given 183 | # admin area and time period. 
184 | i3 = i_indicator('unique_subscribers_per_day', 185 | ['day', 'region']) 186 | 187 | # Indicator 4 188 | # i4 = i_indicator('percent_of_all_subscribers_active_per_day', 189 | # ['home_region', 'day']) 190 | 191 | # Indicator 5 192 | i5 = i_indicator('origin_destination_connection_matrix_per_day', 193 | ['connection_date', 'region_from', 'region_to']) 194 | # Indicator 7 195 | i7 = i_indicator('mean_distance_per_day', 196 | ['home_region', 'day']) 197 | 198 | # Indicator 8 199 | i8 = i_indicator('mean_distance_per_week', 200 | ['home_region', 'week']) 201 | 202 | # Indicator 9 203 | i9 = i_indicator('week_home_vs_day_location_per_day', 204 | ['region', 'home_region', 'day'], 205 | admin = 2) 206 | 207 | #-----------------------------------------------------------------# 208 | # Create panel 209 | 210 | # Make particular changes to indicators as needed here 211 | 212 | # Panel with defaults 213 | i_list = [i1, i2, i3, i5, i9] 214 | panel_list = list(map(lambda x: x.create_panel() , i_list)) 215 | 216 | # Custom arguments 217 | i7_p = i7.create_panel( timevar = 'day') 218 | 219 | #-----------------------------------------------------------------# 220 | # Export 221 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/aggregator.py: -------------------------------------------------------------------------------- 1 | import os 2 | if os.environ['HOME'] != '/root': 3 | from modules.DataSource import * 4 | from modules.sql_code_aggregates import * 5 | databricks = False 6 | else: 7 | databricks = True 8 | 9 | # Databricks notebook source 10 | class aggregator: 11 | """Class to handle aggregations. 12 | 13 | 14 | Attributes 15 | ---------- 16 | result_stub : a string. File path where to save results 17 | datasource : an instance of DataSource class. Holds all dataframes and paths required 18 | regions : a pyspark dataframe. Admin level this aggregator will be used for 19 | calls : a pyspark dataframe. cdr data 20 | cells : a pyspark dataframe. admin region to tower mapping 21 | spark : an initialised spark connection. spark connection this aggregator should use 22 | dates : a dictionary. 
dates the aggregator should run over 23 | intermediate_tables : tables that we don't want written to csv 24 | 25 | 26 | Methods 27 | ------- 28 | create_sql_dates() 29 | Convert the dates to strings to be used in the flowminder sql queries 30 | 31 | create_view(df, table_name) 32 | Creates a view of a dataframe 33 | 34 | save(table_name) 35 | Repartitions a dataframe into a single partition and writes it to a csv file 36 | 37 | save_and_report(table_name) 38 | Checks whether csv file exists before saving table_name to csv 39 | 40 | rename_csv(table_name) 41 | - rename a specific csv 42 | - move a csv to parent folder, rename it, then delete its remaining folder 43 | 44 | rename_all_csvs(table_name) 45 | renames all csvs at once 46 | 47 | rename_if_not_existing(table_name) 48 | rename only if the file doesn't exist as csv yet, handles errors 49 | 50 | check_if_file_exists(table_name) 51 | checks whether a csv exists before we re-create 52 | 53 | 54 | 55 | """ 56 | 57 | def __init__(self, 58 | result_stub, 59 | datasource, 60 | regions, 61 | intermediate_tables = ['home_locations']): 62 | """ 63 | Parameters 64 | ---------- 65 | result_stub : where to save results 66 | datasource : holds all dataframes and paths required 67 | regions : admin level this aggregator will be used for 68 | intermediate_tables : tables that we don't want written to csv 69 | """ 70 | self.datasource = datasource 71 | self.result_path = datasource.results_path + result_stub 72 | self.calls = datasource.parquet_df 73 | self.calls.createOrReplaceTempView('calls') 74 | self.cells = getattr(datasource, regions) 75 | self.cells.createOrReplaceTempView("cells") 76 | self.spark = datasource.spark 77 | self.dates = datasource.dates 78 | self.create_sql_dates() 79 | self.sql_code = write_sql_code(calls = self.calls, 80 | start_date = self.dates_sql['start_date'], 81 | end_date = self.dates_sql['end_date'], 82 | start_date_weeks = self.dates_sql['start_date_weeks'], 83 | end_date_weeks = self.dates_sql['end_date_weeks']) 84 | self.table_names = self.sql_code.keys() 85 | self.intermediate_tables = intermediate_tables 86 | 87 | def create_sql_dates(self): 88 | self.dates_sql = {'start_date' : "\'" + self.dates['start_date'].isoformat('-')[:10] + "\'", 89 | 'end_date' : "\'" + self.dates['end_date'].isoformat('-')[:10] + "\'", 90 | 'start_date_weeks' : "\'" + self.dates['start_date_weeks'].isoformat('-')[:10] + "\'", 91 | 'end_date_weeks' : "\'" + self.dates['end_date_weeks'].isoformat('-')[:10] + "\'"} 92 | 93 | def create_view(self, df, table_name): 94 | df.createOrReplaceTempView(table_name) 95 | 96 | def save(self, df, table_name): 97 | df.repartition(1).write.mode('overwrite').format('com.databricks.spark.csv') \ 98 | .save(os.path.join(self.result_path, table_name), header = 'true') 99 | 100 | def save_and_report(self, df, table_name): 101 | if table_name not in self.intermediate_tables: 102 | if self.check_if_file_exists(table_name): 103 | print('Skipped: ' + table_name) 104 | else: 105 | print('--> File does not exist. 
Saving: ' + table_name) 106 | self.save(df, table_name) 107 | else: 108 | print('Caching: home_locations') 109 | df.createOrReplaceTempView("home_locations") 110 | self.spark.sql('CACHE TABLE home_locations').collect() 111 | self.create_view(df, table_name) 112 | return table_name 113 | 114 | def rename_csv(self, table_name): 115 | # move one folder up and rename to human-legible .csv name 116 | if databricks: 117 | dbutils.fs.mv(dbutils.fs.ls(self.result_path + '/' + table_name)[-1].path, 118 | self.result_path + '/' + table_name + '.csv') 119 | # remove the old folder 120 | dbutils.fs.rm(self.result_path + '/' + table_name + '/', recurse = True) 121 | else: 122 | os.rename(glob.glob(os.path.join(self.result_path, table_name + '/*.csv'))[0], 123 | os.path.join(self.result_path, table_name + '.csv')) 124 | shutil.rmtree(os.path.join(self.result_path, table_name)) 125 | 126 | def save_and_rename_one(self, df, table_name): 127 | self.rename_if_not_existing(self.save_and_report(df, table_name)) 128 | 129 | def rename_all_csvs(self): 130 | for table_name in self.table_names: 131 | if table_name in self.intermediate_tables: 132 | pass 133 | else: 134 | self.rename_if_not_existing(table_name) 135 | 136 | def rename_if_not_existing(self, table_name): 137 | if databricks: 138 | try: 139 | # does the csv already exist 140 | dbutils.fs.ls(self.result_path + '/' + table_name + '.csv') 141 | except Exception as e: 142 | # the csv doesn't exist yet, move the file and delete the folder 143 | if 'java.io.FileNotFoundException' in str(e): 144 | print('--> Renaming: ' + table_name) 145 | self.rename_csv(table_name) 146 | else: 147 | raise 148 | else: 149 | if os.path.exists(self.result_path + '/' + table_name + '.csv'): 150 | pass 151 | else: 152 | print('--> Renaming: ' + table_name) 153 | self.rename_csv(table_name) 154 | 155 | def check_if_file_exists(self, table_name): 156 | if databricks: 157 | try: 158 | # does the folder exist? 159 | dbutils.fs.ls(self.result_path + '/' + table_name) 160 | return True 161 | except Exception as e: 162 | # the folder does not exist 163 | if 'java.io.FileNotFoundException' in str(e): 164 | try: 165 | # does the csv exist? 
166 | dbutils.fs.ls(self.result_path + '/' + table_name + '.csv') 167 | return True 168 | except Exception as e: 169 | # the csv does not exist 170 | if 'java.io.FileNotFoundException' in str(e): 171 | return False 172 | else: 173 | raise 174 | else: 175 | raise 176 | else: 177 | return os.path.exists(self.result_path + '/' + table_name) | \ 178 | os.path.exists(self.result_path + '/' + table_name + '.csv') 179 | -------------------------------------------------------------------------------- /data-checks/Archive/od_scaling.py: -------------------------------------------------------------------------------- 1 | #-----------------------------------------------------------------# 2 | # OD matrix scaling checks 3 | #-----------------------------------------------------------------# 4 | 5 | # This code depends on MASTER.py to run as file path objects are 6 | # defined there 7 | 8 | #-----------------------------------------------------------------# 9 | # Settings 10 | 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | import datetime 14 | import os 15 | 16 | 17 | #-----------------------------------------------------------------# 18 | # Load data 19 | od = pd.read_csv(I5_Adm3_path + 20 | "origin_destination_connection_matrix_per_day.csv") 21 | 22 | 23 | # Number of residents 24 | res = pd.read_csv(FLOWM_adm3_path + 25 | "home_location_counts_per_region.csv") 26 | 27 | # Active residents 28 | ares = pd.read_csv(FLOWM_adm3_path + 29 | "count_unique_active_residents_per_region_per_day.csv") 30 | 31 | # Number of calls 32 | cal = pd.read_csv(FLOWM_adm3_path + 33 | "total_calls_per_region_per_day.csv") 34 | 35 | 36 | #-----------------------------------------------------------------# 37 | # Process data 38 | 39 | # Create date variable 40 | def convert_dates(df,date_col ='connection_date'): 41 | df['date'] = pd.\ 42 | to_datetime(df[date_col]).\ 43 | dt.date 44 | return(df) 45 | 46 | od = convert_dates(od, 'connection_date') 47 | ares = convert_dates(ares, 'visit_date') 48 | cal = convert_dates(cal, 'call_date') 49 | 50 | #-----------------------------------------------------------------# 51 | # Create different scaling factors 52 | 53 | #--------------------# 54 | # Create new variables 55 | 56 | # Number of active subscribers over total residents 57 | ares = ares.merge(res.rename(columns={"subscriber_count" : "residents"}), 58 | on = 'region', 59 | how='outer') 60 | 61 | ares = ares.rename(columns={"subscriber_count" : 'active_res'}) 62 | 63 | # Check pp > 1 !!!! 
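# (Illustrative note, not in the original script: once p_active_res is
#  computed below, (ares['p_active_res'] > 1).mean() gives the share of
#  region-days where active subscribers exceed recorded residents.)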
64 | ares['p_active_res'] = ares['active_res']/ares['residents'] 65 | 66 | 67 | 68 | # Number of calls over residents 69 | cal = cal.merge(res.rename(columns={"subscriber_count" : "residents"}), 70 | on = 'region', 71 | how='outer') 72 | 73 | cal['p_cals'] = cal['total_calls']/cal['residents'] 74 | 75 | #------------------------------# 76 | # Add new variables to od matrix 77 | 78 | # Proportion of active residents in orig and dest 79 | od = od.\ 80 | merge(ares[['region','date', 'p_active_res']], 81 | left_on= ['region_from','date'], 82 | right_on= ['region', 'date'], 83 | how='left').\ 84 | rename(columns={'p_active_res' : 'p_active_res_O'}).\ 85 | drop(columns='region').\ 86 | merge(ares[['region','date', 'p_active_res']], 87 | left_on= ['region_to','date'], 88 | right_on= ['region', 'date'], 89 | how='left').\ 90 | rename(columns={'p_active_res' : 'p_active_res_D'}).\ 91 | drop(columns='region') 92 | 93 | 94 | # Proportion of calls per resident in orig and dest 95 | od = od.\ 96 | merge(cal[['region','date', 'p_cals']], 97 | left_on= ['region_from','date'], 98 | right_on= ['region', 'date'], 99 | how='left').\ 100 | rename(columns={'p_cals' : 'p_cals_O'}).\ 101 | drop(columns='region').\ 102 | merge(cal[['region','date', 'p_cals']], 103 | left_on= ['region_to','date'], 104 | right_on= ['region', 'date'], 105 | how='left').\ 106 | rename(columns={'p_cals' : 'p_cals_D'}).\ 107 | drop(columns='region') 108 | 109 | 110 | #-----------------# 111 | # Create indicators 112 | 113 | # Product of the proportions of active residents in origin and 114 | # destination 115 | od['w1'] = od['p_active_res_O'] * od['p_active_res_D'] 116 | 117 | 118 | # Sum of calls per resident in origin and destination 119 | od['w2'] = od['p_cals_O'] + od['p_cals_D'] 120 | 121 | 122 | # od['p_cals_O'].isnull().sum()/od.shape[0] 123 | # 0.5159950493247425 124 | 125 | #-----------------------------------------------------------------# 126 | # Create scaled values 127 | od['total_count_w1'] = od['total_count']/od['w1'] 128 | 129 | od['total_count_w2'] = od['total_count']/od['w2'] 130 | 131 | #-----------------------------------------------------------------# 132 | # Plot 133 | 134 | # Set origin region 135 | od1 = od[od['region_from'] == 'ZW102109'] 136 | 137 | # Select a set of destinations 138 | # od1_top_dest = ['ZW120435','ZW142513','ZW192205', 139 | # 'ZW130720','ZW170530' ] 140 | 141 | od1_top_dest = od1['region_to'].value_counts().head(9).index 142 | 143 | # Create plot df 144 | # p1_df = od1[od1['region_to'] == 'ZW120435'] 145 | p1_df = od1[od1['region_to'].isin(od1_top_dest)] 146 | p1_df.set_index(['date'],inplace=True) 147 | 148 | 149 | # Plot function that adds the plot directly to the grid 150 | def add_plts(dest_value, 151 | grid_pos, 152 | df = p1_df, 153 | dest_var = 'region_to', 154 | #x_axis = 'connection_date', 155 | y_axis = 'total_count'): 156 | 157 | df[df[dest_var] == dest_value].\ 158 | plot(y= y_axis, 159 | legend= False, 160 | ax = fig.add_subplot(grid_pos)) 161 | 162 | # Run plots 163 | # # Ugly hack.
Do this better if there is time 164 | # def plots_together(var): 165 | # fig, ax = plt.subplots(nrows=3,ncols=3) 166 | # fig = plt.figure() 167 | # gs = fig.add_gridspec(3, 3) 168 | 169 | # add_plts(od1_top_dest[0], gs[0, 0], y_axis = var) 170 | # add_plts(od1_top_dest[1], gs[0, 1], y_axis = var) 171 | # add_plts(od1_top_dest[2], gs[0, 2], y_axis = var) 172 | # add_plts(od1_top_dest[3], gs[1, 0], y_axis = var) 173 | # add_plts(od1_top_dest[4], gs[1, 1], y_axis = var) 174 | # add_plts(od1_top_dest[5], gs[1, 2], y_axis = var) 175 | # add_plts(od1_top_dest[6], gs[2, 0], y_axis = var) 176 | # add_plts(od1_top_dest[7], gs[2, 1], y_axis = var) 177 | # add_plts(od1_top_dest[8], gs[2, 2], y_axis = var) 178 | 179 | # return(fig) 180 | # # fig.savefig('C:/Users/wb519128/Desktop/' + var + '.png') 181 | 182 | # plots_together('total_count') 183 | 184 | var = 'total_count' 185 | 186 | # Set plot parameters 187 | fig, ax = plt.subplots(nrows=3,ncols=3) 188 | fig = plt.figure() 189 | gs = fig.add_gridspec(3, 3) 190 | 191 | 192 | add_plts(od1_top_dest[0], gs[0, 0], y_axis = var) 193 | add_plts(od1_top_dest[1], gs[0, 1], y_axis = var) 194 | add_plts(od1_top_dest[2], gs[0, 2], y_axis = var) 195 | add_plts(od1_top_dest[3], gs[1, 0], y_axis = var) 196 | add_plts(od1_top_dest[4], gs[1, 1], y_axis = var) 197 | add_plts(od1_top_dest[5], gs[1, 2], y_axis = var) 198 | add_plts(od1_top_dest[6], gs[2, 0], y_axis = var) 199 | add_plts(od1_top_dest[7], gs[2, 1], y_axis = var) 200 | add_plts(od1_top_dest[8], gs[2, 2], y_axis = var) 201 | 202 | # Export 203 | fig.savefig('C:/Users/wb519128/Desktop/' + var + '.png') 204 | 205 | 206 | var = 'total_count_w2' 207 | 208 | # Set plot parameters 209 | fig, ax = plt.subplots(nrows=3,ncols=3) 210 | fig = plt.figure() 211 | gs = fig.add_gridspec(3, 3) 212 | 213 | 214 | add_plts(od1_top_dest[0], gs[0, 0], y_axis = var) 215 | add_plts(od1_top_dest[1], gs[0, 1], y_axis = var) 216 | add_plts(od1_top_dest[2], gs[0, 2], y_axis = var) 217 | add_plts(od1_top_dest[3], gs[1, 0], y_axis = var) 218 | add_plts(od1_top_dest[4], gs[1, 1], y_axis = var) 219 | add_plts(od1_top_dest[5], gs[1, 2], y_axis = var) 220 | add_plts(od1_top_dest[6], gs[2, 0], y_axis = var) 221 | add_plts(od1_top_dest[7], gs[2, 1], y_axis = var) 222 | add_plts(od1_top_dest[8], gs[2, 2], y_axis = var) 223 | 224 | # Export 225 | fig.savefig('C:/Users/wb519128/Desktop/' + var + '.png') 226 | 227 | var = 'total_count_w1' 228 | 229 | # Set plot parameters 230 | fig, ax = plt.subplots(nrows=3,ncols=3) 231 | fig = plt.figure() 232 | gs = fig.add_gridspec(3, 3) 233 | 234 | 235 | add_plts(od1_top_dest[0], gs[0, 0], y_axis = var) 236 | add_plts(od1_top_dest[1], gs[0, 1], y_axis = var) 237 | add_plts(od1_top_dest[2], gs[0, 2], y_axis = var) 238 | add_plts(od1_top_dest[3], gs[1, 0], y_axis = var) 239 | add_plts(od1_top_dest[4], gs[1, 1], y_axis = var) 240 | add_plts(od1_top_dest[5], gs[1, 2], y_axis = var) 241 | add_plts(od1_top_dest[6], gs[2, 0], y_axis = var) 242 | add_plts(od1_top_dest[7], gs[2, 1], y_axis = var) 243 | add_plts(od1_top_dest[8], gs[2, 2], y_axis = var) 244 | 245 | # Export 246 | fig.savefig('C:/Users/wb519128/Desktop/' + var + '.png') 247 | 248 | 249 | # df = p1_df 250 | # dest_value = od1_top_dest[0] 251 | # dest_var = 'region_to' 252 | # x_axis = 'connection_date' 253 | # y_axis = 'total_count' 254 | 255 | # df[df[dest_var] == dest_value].\ 256 | # plot(y= y_axis, 257 | # legend= False, 258 | # fontsize=6, 259 | # rot= 30) 260 | # plt.show() 261 | 
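# Worked example of the scaling defined above (illustrative numbers only, not
# taken from the data): if 40% of the origin's residents and 50% of the
# destination's residents were active on a day (p_active_res_O = 0.4,
# p_active_res_D = 0.5), then w1 = 0.4 * 0.5 = 0.2 and an observed flow of
# total_count = 120 becomes total_count_w1 = 120 / 0.2 = 600. For w2, with
# 0.8 calls per resident in the origin and 1.2 in the destination,
# w2 = 0.8 + 1.2 = 2.0 and total_count_w2 = 120 / 2.0 = 60.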
-------------------------------------------------------------------------------- /data-checks/Archive/quick_checks/ward_neighbors_tower_down.R: -------------------------------------------------------------------------------- 1 | # Check subscribers data 2 | 3 | FIG_PATH <- file.path(PROJECT_PATH, "proof-of-concept", 4 | "outputs", "data-checks", "figures_indicators", "subscribers_neighbors_daily") 5 | 6 | FIG_PATH_OUTLIER <- file.path(PROJECT_PATH, "proof-of-concept", 7 | "outputs", "data-checks", "figures_indicators", "subscribers_neighbors_daily_outlier") 8 | 9 | # Load Data -------------------------------------------------------------------- 10 | ISAAC_DATA_PATH_2 <- file.path(PROJECT_PATH, "Isaac-results", "Isaac_apr_may", "admin2_flowminder") 11 | ISAAC_DATA_PATH_3 <- file.path(PROJECT_PATH, "Isaac-results", "Isaac_apr_may", "admin3_flowminder") 12 | 13 | #### Wards 14 | wards_sp <- readRDS(file.path(CLEAN_DATA_ADM3_PATH, "wards_aggregated.Rds")) 15 | 16 | #### Tower down 17 | towers_down <- read.csv(file.path(PROOF_CONCEPT_PATH, 18 | "outputs", 19 | "data-checks", 20 | "days_wards_with_low_hours_I1_panel.csv")) 21 | 22 | towers_down <- towers_down %>% 23 | dplyr::select(region, date) %>% 24 | mutate(tower_down = T) %>% 25 | mutate(date = date %>% as.character %>% as.Date(), 26 | region = region %>% as.character()) 27 | 28 | #### Raw Data 29 | df_day_adm2_raw <- read.csv(file.path(ISAAC_DATA_PATH_2, 30 | "count_unique_subscribers_per_region_per_day.csv"), 31 | stringsAsFactors=F) %>% 32 | dplyr::rename(value_raw = subscriber_count, 33 | date = visit_date) %>% 34 | dplyr::mutate(region = region %>% as.character(), 35 | date = date %>% as.Date()) 36 | 37 | df_week_adm2_raw <- read.csv(file.path(ISAAC_DATA_PATH_2, 38 | "count_unique_subscribers_per_region_per_week.csv"), 39 | stringsAsFactors=F) %>% 40 | dplyr::rename(value_raw = subscriber_count, 41 | date = visit_week) %>% 42 | dplyr::mutate(region = region %>% as.character()) 43 | 44 | df_day_adm3_raw <- read.csv(file.path(ISAAC_DATA_PATH_3, 45 | "count_unique_subscribers_per_region_per_day.csv"), 46 | stringsAsFactors=F) %>% 47 | dplyr::rename(value_raw = subscriber_count, 48 | date = visit_date) %>% 49 | dplyr::mutate(region = region %>% as.character(), 50 | date = date %>% as.Date()) 51 | 52 | df_week_adm3_raw <- read.csv(file.path(ISAAC_DATA_PATH_3, 53 | "count_unique_subscribers_per_region_per_week.csv"), 54 | stringsAsFactors=F) %>% 55 | dplyr::rename(value_raw = subscriber_count, 56 | date = visit_week) %>% 57 | dplyr::mutate(region = region %>% as.character()) 58 | 59 | #### Cleaned Data 60 | df_day_adm2 <- readRDS(file.path(CLEAN_DATA_ADM2_PATH, 61 | "count_unique_subscribers_per_region_per_day.Rds")) %>% 62 | left_join(df_day_adm2_raw, by=c("date", "region")) 63 | 64 | df_week_adm2 <- readRDS(file.path(CLEAN_DATA_ADM2_PATH, 65 | "count_unique_subscribers_per_region_per_week.Rds")) 66 | 67 | df_day_adm3 <- readRDS(file.path(CLEAN_DATA_ADM3_PATH, 68 | "count_unique_subscribers_per_region_per_day.Rds")) %>% 69 | left_join(df_day_adm3_raw, by=c("date", "region")) %>% 70 | mutate(value_raw = value_raw %>% as.numeric()) 71 | 72 | df_week_adm3 <- readRDS(file.path(CLEAN_DATA_ADM3_PATH, 73 | "count_unique_subscribers_per_region_per_week.Rds")) 74 | 75 | # Create Ward Neighbors -------------------------------------------------------- 76 | #### Region and id datasets 77 | ward_id_df <- wards_sp@data %>% 78 | dplyr::select(region) %>% 79 | mutate(id = 1:n()) 80 | 81 | #### Create neighbor matrix 82 | neighbor_df <- gTouches(wards_sp, 
byid=TRUE) %>% 83 | as.data.frame() %>% 84 | mutate(id = 1:n()) %>% 85 | pivot_longer(-id) %>% 86 | dplyr::rename(n_id = name, 87 | neighbors = value) %>% 88 | dplyr::mutate(n_id = n_id %>% as.numeric()) %>% 89 | 90 | # id_n (neighbor) region 91 | left_join(ward_id_df, by = c("n_id" = "id")) %>% 92 | dplyr::rename(n_region = region) %>% 93 | 94 | # id region 95 | left_join(ward_id_df, by = "id") %>% 96 | 97 | # restrict to neighbors 98 | filter(neighbors %in% T) 99 | 100 | #### Merge data to neighbor matrix 101 | ward_data <- df_day_adm3 %>% 102 | dplyr::select(region, date, value, value_raw) 103 | 104 | neighbor_df <- neighbor_df %>% 105 | 106 | # neighbor data 107 | left_join(ward_data, by = c("n_region" = "region")) %>% 108 | dplyr::rename(value_n = value, 109 | value_raw_n = value_raw) %>% 110 | 111 | # ward data 112 | left_join(ward_data, by = c("region", "date")) 113 | 114 | #### Merge in Neighbor down 115 | neighbor_df <- neighbor_df %>% 116 | left_join(towers_down, by = c("region", "date")) %>% 117 | 118 | # tower down on any day? 119 | group_by(region) %>% 120 | mutate(tower_down_anyday = (TRUE %in% tower_down)) %>% 121 | 122 | # restrict to observations where tower down on any day 123 | filter(tower_down_anyday %in% T) 124 | 125 | #### Merge in province 126 | prov_df <- wards_sp@data %>% 127 | dplyr::select(region, province) 128 | 129 | neighbor_df <- neighbor_df %>% 130 | left_join(prov_df, by="region") 131 | 132 | # Neighbor Stats --------------------------------------------------------------- 133 | # TODO: Not naming things well, should be value_n_raw_avg, for example 134 | #### Average neighbor value 135 | neighbor_df <- neighbor_df %>% 136 | group_by(region, date) %>% 137 | mutate(value_n_avg = mean(value_raw_n, na.rm=T)) 138 | 139 | #### Percen change of neighbor value from average 140 | neighbor_df <- neighbor_df %>% 141 | group_by(n_region) %>% 142 | mutate(region_n_value_avg = mean(value_raw_n, na.rm=T)) %>% 143 | mutate(region_n_value_pc = (value_raw_n - region_n_value_avg)/region_n_value_avg) %>% 144 | mutate(region_n_value_pc_max = max(region_n_value_pc, na.rm=T)) 145 | 146 | # Export Datset ---------------------------------------------------------------- 147 | #neighbor_df_clean <- neighbor_df %>% 148 | # dplyr::select(region, n_region, date, value_n) 149 | 150 | #head(neighbor_df) 151 | 152 | 153 | 154 | # Trends Over Time ------------------------------------------------------------- 155 | neighbor_df %>% 156 | filter(id %in% 10) %>% 157 | ggplot() + 158 | geom_vline(data = . %>% filter(tower_down), aes(xintercept = date), 159 | color = "gray50", size=2, alpha = 0.2) + 160 | geom_line(aes(x=date, y=value_raw_n, 161 | group=n_id %>% as.factor(), 162 | color=n_id %>% as.factor())) + 163 | geom_line(aes(x=date, y=value_raw), size=2, color="black") + 164 | theme_minimal() + 165 | theme(legend.position = "none") 166 | 167 | 168 | lapply(unique(neighbor_df$province), function(province_i){ 169 | print(province_i) 170 | 171 | p <- neighbor_df %>% 172 | filter(province %in% province_i) %>% 173 | ggplot() + 174 | geom_vline(data = . 
%>% filter(tower_down), aes(xintercept = date), 175 | color = "gray50", size=2, alpha = 0.2) + 176 | geom_line(aes(x=date, y=value_raw), size=1.5, color="black") + 177 | geom_line(aes(x=date, y=value_n_avg), size=1.5, color="red") + 178 | geom_line(aes(x=date, y=value_raw_n, 179 | group=n_id %>% as.factor(), 180 | color=n_id %>% as.factor()), 181 | size=.4) + 182 | theme_minimal() + 183 | theme(legend.position = "none") + 184 | facet_wrap(~region, 185 | scales = "free_y") 186 | 187 | ggsave(p, filename = file.path(FIG_PATH, paste0(province_i, ".png")), height = 25, width = 25) 188 | 189 | return(NULL) 190 | }) 191 | 192 | # Bad Cases ------------------------------------------------------------- 193 | for(percent in c(50, 75, 100)){ 194 | 195 | print(percent) 196 | 197 | neighbor_df_bad <- neighbor_df %>% 198 | mutate(keep = (tower_down %in% TRUE) & (region_n_value_pc > percent/100)) %>% 199 | group_by(region) %>% 200 | mutate(keep_any = (TRUE %in% keep)) %>% 201 | ungroup() %>% 202 | filter(keep_any %in% TRUE) %>% 203 | filter(region_n_value_pc_max > percent/100) 204 | 205 | p_bad <- neighbor_df_bad %>% 206 | ggplot() + 207 | geom_vline(data = . %>% filter(tower_down), aes(xintercept = date), 208 | color = "gray50", size=2, alpha = 0.2) + 209 | geom_line(aes(x=date, y=value_raw), size=1.75, color="black") + 210 | geom_line(aes(x=date, y=value_raw_n, 211 | group=n_id %>% as.factor(), 212 | color=n_id %>% as.factor()), 213 | size=1) + 214 | 215 | #geom_line(aes(x=date, y=value_n_avg), size=1.5, color="red") + 216 | theme_minimal() + 217 | theme(legend.position = "none") + 218 | facet_wrap(~region, 219 | scales = "free_y") 220 | 221 | ggsave(p_bad, filename = file.path(FIG_PATH_OUTLIER, paste0(percent, "percent_thresh.png")), height = 25, width = 25) 222 | } 223 | 224 | 225 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/tower_clustering.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | import geopandas as gpd 3 | import numpy as np 4 | import pandas as pd 5 | from shapely.geometry import Polygon, LineString 6 | from sklearn.neighbors import DistanceMetric 7 | from scipy.spatial.distance import squareform 8 | from scipy.cluster.hierarchy import linkage 9 | from scipy.cluster.hierarchy import fcluster 10 | from copy import deepcopy 11 | import os 12 | if os.environ['HOME'] != '/root': 13 | from modules.utilities import * 14 | databricks = False 15 | else: 16 | databricks = True 17 | 18 | 19 | ## Class to handle spark and df in session 20 | class tower_clusterer: 21 | """Class to cluster towers together. 22 | 23 | 24 | Attributes 25 | ---------- 26 | datasource : an instance of DataSource class. 27 | shape : a geopandas dataframe. Shapefile to use for clustering 28 | region_var : a string. Name of the region variable in the shapefile. 29 | sites : a string. Name of the attribute of datasource that holds the tower coordinates. 30 | shape_df : a pyspark dataframe. Shapefile to use for clustering, in pyspark df. 31 | spark : an initialised spark connection 32 | spark_df : a pyspark dataframe. Holds the cdr data 33 | result_path : a string. Where to save results. 34 | filename : a string. Name for result file. 35 | dist : a string. Metric to use to calculate distances. 36 | sites : a pyspark dataframe. Code, Lat, Lng for all tower_sites 37 | sites_with_clusters : a pyspark dataframe. 
Clustered sites (once methods have run) 38 | 39 | 40 | 41 | Methods 42 | ------- 43 | cluster_towers() 44 | runs clustering algorithm 45 | 46 | get_centroids() 47 | computes centroids of clusters 48 | 49 | map_to_regions() 50 | maps cluster centroids to admin regions 51 | 52 | save_results() 53 | saves the results to csv 54 | 55 | """ 56 | 57 | def __init__(self, 58 | datasource, 59 | shape, 60 | region_var, 61 | sites = 'tower_sites'): 62 | """ 63 | Parameters 64 | ---------- 65 | datasource : an instance of DataSource class. 66 | shape : a geopandas dataframe. Shapefile to use for clustering 67 | region_var : a string. Name of the region variable in the shapefile. 68 | sites : a string. Name of the attribute of datasource that holds the tower coordinates. 69 | """ 70 | self.datasource = datasource 71 | self.spark = datasource.spark 72 | self.shape = getattr(datasource, shape + '_gpd') 73 | self.shape_df = getattr(datasource, shape) 74 | self.result_path = datasource.results_path 75 | self.filename = shape 76 | self.region_var = region_var 77 | self.dist = DistanceMetric.get_metric('haversine') 78 | sites_df = getattr(datasource, sites + '_pd') 79 | if (sites_df.columns == ['cell_id', 'LAT', 'LNG']).all(): 80 | self.sites = sites_df[sites_df.LAT.notna()] 81 | self.sites_with_clusters = self.sites 82 | else: 83 | raise 'The sites dataframe does not have the correct columns / \ 84 | column order. Should be cell_id, LAT, LNG' 85 | 86 | def cluster_towers(self): 87 | ## deepcopy sites since we will need it later on 88 | self.radians = deepcopy(self.sites) 89 | # convert degrees to radians 90 | self.radians['LAT'] = np.radians(self.sites['LAT']) 91 | self.radians['LNG'] = np.radians(self.sites['LNG']) 92 | # run clustering algorithm 93 | self.clusters = fcluster( 94 | linkage( 95 | squareform( 96 | self.dist.pairwise(self.radians[['LAT','LNG']]\ 97 | .to_numpy())*6373), method='ward'), t = 1, criterion = 'distance') 98 | self.sites_with_clusters = self.radians 99 | self.sites_with_clusters['cluster'] = self.clusters 100 | # compute centroids of clusters 101 | self.get_centroids() 102 | self.sites_with_clusters['LAT'] = np.rad2deg(self.sites_with_clusters['LAT']) 103 | self.sites_with_clusters['LNG'] = np.rad2deg(self.sites_with_clusters['LNG']) 104 | self.sites_with_clusters['centroid_LAT'] = \ 105 | np.rad2deg(self.sites_with_clusters['centroid_LAT']) 106 | self.sites_with_clusters['centroid_LNG'] = \ 107 | np.rad2deg(self.sites_with_clusters['centroid_LNG']) 108 | # put clusters in geodataframe 109 | self.sites_gpd = gpd.GeoDataFrame(self.sites_with_clusters, 110 | geometry=gpd.points_from_xy( 111 | self.sites_with_clusters.centroid_LNG, 112 | self.sites_with_clusters.centroid_LAT), 113 | crs = 'epsg:4326') 114 | # compute distances between cluters 115 | self.distances_pd = pd.DataFrame( 116 | self.dist.pairwise( 117 | np.radians( 118 | self.sites_with_clusters[['centroid_LAT','centroid_LNG']])\ 119 | .to_numpy())*6373, columns=self.sites_with_clusters.cell_id.unique(), 120 | index=self.sites_with_clusters.cell_id.unique()) 121 | # create long form of distance matrix 122 | distances = [] 123 | origin = [] 124 | destination = [] 125 | for a in self.distances_pd.index: 126 | for b in self.distances_pd.index: 127 | distances.append(self.distances_pd.loc[a,b]) 128 | origin.append(a) 129 | destination.append(b) 130 | self.distances_pd_long = pd.DataFrame(list(zip(distances, origin, destination)), 131 | columns =['distance', 'origin', 'destination']) 132 | # map clusters to regions 133 | 
self.map_to_regions() 134 | return self.save_results() 135 | 136 | def get_centroids(self): 137 | # loop through clusters to compute centroids 138 | for cluster_num in self.sites_with_clusters.cluster.unique(): 139 | subset = self.sites_with_clusters[self.sites_with_clusters.cluster == cluster_num] 140 | # use line method if we have only two towers in cluster 141 | if len(subset) == 2: 142 | line = LineString(subset.loc[:,['LNG', 'LAT']].to_numpy()) 143 | self.sites_with_clusters.loc[self.sites_with_clusters.cluster == \ 144 | cluster_num, 'centroid_LNG'] = line.interpolate(0.5, normalized = True).x 145 | self.sites_with_clusters.loc[self.sites_with_clusters.cluster == \ 146 | cluster_num, 'centroid_LAT'] = line.interpolate(0.5, normalized = True).y 147 | # use polygon method if we have more than two towers in cluster 148 | if len(subset) > 2: 149 | self.sites_with_clusters.loc[self.sites_with_clusters.cluster == \ 150 | cluster_num, 'centroid_LNG'] = \ 151 | Polygon(subset.loc[:,['LNG', 'LAT']].to_numpy()).convex_hull.centroid.x 152 | self.sites_with_clusters.loc[self.sites_with_clusters.cluster == \ 153 | cluster_num, 'centroid_LAT'] = \ 154 | Polygon(subset.loc[:,['LNG', 'LAT']].to_numpy()).convex_hull.centroid.y 155 | # replace NAs 156 | self.sites_with_clusters.loc[self.sites_with_clusters.centroid_LAT.isna(), 157 | 'centroid_LNG'] = \ 158 | self.sites_with_clusters.loc[self.sites_with_clusters.centroid_LAT.isna(), 'LNG'] 159 | self.sites_with_clusters.loc[self.sites_with_clusters.centroid_LAT.isna(), 160 | 'centroid_LAT'] = \ 161 | self.sites_with_clusters.loc[self.sites_with_clusters.centroid_LAT.isna(), 'LAT'] 162 | 163 | def map_to_regions(self): 164 | # spatial join clusteres with shapefile 165 | self.joined = gpd.sjoin(self.sites_gpd, self.shape, op="intersects") 166 | 167 | def save_results(self): 168 | # save results of mapping of clusters to regions 169 | self.joined = self.joined.rename(columns={self.region_var:'region'}) 170 | self.towers_regions_clusters_all_vars = \ 171 | self.joined.loc[:,['cell_id', 'LAT', 'LNG', 'centroid_LAT', 172 | 'centroid_LNG', 'region', 'cluster']] 173 | self.towers_regions_clusters_all_vars = \ 174 | self.spark.createDataFrame(self.towers_regions_clusters_all_vars) 175 | save_csv(self.towers_regions_clusters_all_vars, 176 | self.result_path, 177 | self.datasource.country_code + '_' + self.filename + '_tower_map_all_vars') 178 | # save results with only essential variables, for use in data processing 179 | self.towers_regions_clusters = \ 180 | self.joined.loc[:,['cell_id', 'region']] 181 | self.towers_regions_clusters = \ 182 | self.spark.createDataFrame(self.towers_regions_clusters) 183 | save_csv(self.towers_regions_clusters, 184 | self.result_path, 185 | self.datasource.country_code + '_' + self.filename + '_tower_map') 186 | # save distance matrix in long form 187 | self.distances_df_long = \ 188 | self.spark.createDataFrame(self.distances_pd_long) 189 | save_csv(self.distances_df_long, 190 | self.result_path, self.datasource.country_code + '_distances_pd_long') 191 | # save shapefile used, for dashboarding 192 | save_csv(self.shape_df, self.result_path, 193 | self.datasource.country_code + '_' + self.filename + '_shapefile') 194 | return self.towers_regions_clusters, self.distances_df_long 195 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/aggregation_master.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 
coding: utf-8 3 | 4 | # # Production of indicators for the COVID19 Mobility Task Force 5 | # 6 | # In this notebook we produce indicators for the [COVID19 Mobility Task Force](https://github.com/worldbank/covid-mobile-data). 7 | # 8 | # [Flowminder](https://covid19.flowminder.org) indicators are produced to increase the availability of comparable datasets across countries, and have been copied without modification from the [Flowminder COVID-19 github repository](https://github.com/Flowminder/COVID-19) (except for the start and end dates). These have been supplemented by a set of *priority* indicators with data for ingestion into the dashboard in this repository. 9 | # 10 | # We produce the indicators in the following three steps: 11 | # 12 | # - **Import code**: The code for the aggregation is included in the 'custom_aggregation' and 'flowminder_aggregation' scripts. 13 | # - **Import data**: 14 | # To set up the data import we need to place the CDR data files into the `data/new/CC/telco/` folder, where we replace `CC` with the country code and `telco` with the company abbreviation. 15 | # We also need to place csv files with the tower-region mapping and distance matrices into the `data/support-data/CC/telco/geofiles` folder, and then modify the `data/support_data/config_file.py` to specify: 16 | # - *geofiles*: the names of the geofiles, 17 | # - *country_code*: country code and company abbreviation, 18 | # - *telecom_alias*: the path to the `data` folder, 19 | # - *data_paths*: the names of the subfolders in `data/new/CC/telco/` that hold the csv files. Simply change this to `[*]` if you didn't create subfolders and want to load all files. 20 | # - *dates*: the start and end date of the data you want to produce the indicators for. 21 | # 22 | # For more information about the `config_file.py` settings, see the [github page](https://github.com/worldbank/covid-mobile-data/tree/master/cdr-aggregation); an illustrative sketch of these settings is shown below. 23 | # 24 | # - **Run aggregations**: By default, we produce all flowminder and priority indicators. We've included 4 retries in case of failure, which we have found helpful on Databricks but which is probably irrelevant in other settings. Note that before you can re-run these aggregations, you need to move the csv outputs that have been saved in `data/results/CC/telco/` in previous runs to another folder, else these indicators will be skipped. This prevents you from accidentally overwriting previous results. This way you can also delete only the files for the indicators you want to reproduce, and skip any indicators you don't want to reproduce. 25 | # 26 | # The outcome of this effort will be used to inform policy making using a [mobility indicator dashboard](https://github.com/worldbank/covid-mobile-data/tree/master/dashboard-dataviz). 
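#
# As a quick orientation, here is an illustrative sketch of those settings
# (placeholder values only -- the authoritative key names and value formats are
# given in `config_file_template.py` / `config_file_template_hive.py`; the sketch
# assumes the settings are collected in the `datasource_configs` object passed to
# `DataSource` below):
#
# ```python
# datasource_configs = {
#     'country_code':  'CC',              # country code and company abbreviation
#     'telecom_alias': '/path/to/data',   # path to the `data` folder
#     'data_paths':    ['*'],             # subfolders of data/new/CC/telco/ to load
#     'geofiles':      {...},             # names of the tower-region mapping and distance matrix csvs
#     'dates':         {...},             # start and end date of the data to aggregate
# }
# ```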
27 | 28 | # # Import code 29 | 30 | # In[1]: 31 | 32 | 33 | get_ipython().run_line_magic('load_ext', 'autoreload') 34 | get_ipython().run_line_magic('autoreload', '2') 35 | 36 | 37 | # In[2]: 38 | 39 | 40 | from modules.DataSource import * 41 | 42 | 43 | # In[3]: 44 | 45 | 46 | config_file = '../config_file.py' 47 | 48 | 49 | # In[4]: 50 | 51 | 52 | exec(open(config_file).read()) 53 | 54 | 55 | # In[5]: 56 | 57 | 58 | ds = DataSource(datasource_configs) 59 | ds.show_config() 60 | 61 | 62 | # In[6]: 63 | 64 | 65 | from modules.setup import * 66 | 67 | 68 | # # Import data 69 | 70 | # ## Load CDR data 71 | 72 | # ### Process/standardize raw data, save as parquet, and then load it 73 | 74 | # In[7]: 75 | 76 | 77 | # ds.standardize_csv_files(show=True) 78 | # ds.save_as_parquet() 79 | 80 | 81 | # In[8]: 82 | 83 | 84 | #ds.load_standardized_parquet_file() 85 | 86 | 87 | # ### Alternatively, specify and load hive table 88 | 89 | # In[9]: 90 | 91 | 92 | # # Specify and load hive data 93 | # ds.parquet_df = ds.spark.sql("""SELECT {} AS msisdn, 94 | # {} AS call_datetime, 95 | # {} AS location_id FROM {}""".format(ds.hive_vars['msisdn'], 96 | # ds.hive_vars['call_datetime'], 97 | # ds.hive_vars['location_id'], 98 | # ds.hive_vars['calls'])) 99 | 100 | 101 | # ### Or load a sample file 102 | 103 | # In[10]: 104 | 105 | 106 | ## Use this in case you want to sample the data and run the code on the sample 107 | 108 | # #ds.sample_and_save(number_of_ids=1000) 109 | ds.load_sample('sample_feb_mar2020') 110 | ds.parquet_df = ds.sample_df 111 | 112 | 113 | # ## Load geo data 114 | 115 | # In[11]: 116 | 117 | 118 | ds.load_geo_csvs() 119 | 120 | 121 | # In[12]: 122 | 123 | 124 | ## Use this in case you want to cluster the towers and create a distance matrix 125 | 126 | # ds.create_gpds() 127 | # from modules.tower_clustering import * 128 | # clusterer = tower_clusterer(ds, 'admin2', 'ID_2') 129 | # ds.admin2_tower_map, ds.distances = clusterer.cluster_towers() 130 | # clusterer = tower_clusterer(ds, 'admin3', 'ADM3_PCODE') 131 | # ds.admin3_tower_map, ds.distances = clusterer.cluster_towers() 132 | 133 | 134 | # In[13]: 135 | 136 | 137 | ## Use this in case you want to create a voronoi tesselation 138 | 139 | # from modules.voronoi import * 140 | # voronoi = voronoi_maker(ds, 'admin3', 'ADM3_PCODE') 141 | # ds.voronoi = voronoi.make_voronoi() 142 | 143 | 144 | # # Run aggregations 145 | 146 | # ## Flowminder indicators for admin2 147 | 148 | # In[14]: 149 | 150 | 151 | agg_flowminder_admin2 = flowminder_aggregator(result_stub = '/admin2/flowminder', 152 | datasource = ds, 153 | regions = 'admin2_tower_map') 154 | 155 | agg_flowminder_admin2.attempt_aggregation() 156 | 157 | 158 | # ## Flowminder indicators for admin3 159 | 160 | # In[15]: 161 | 162 | 163 | agg_flowminder_admin3 = flowminder_aggregator(result_stub = '/admin3/flowminder', 164 | datasource = ds, 165 | regions = 'admin3_tower_map') 166 | 167 | agg_flowminder_admin3.attempt_aggregation() 168 | 169 | 170 | # ## Priority indicators for admin2 171 | 172 | # In[16]: 173 | 174 | 175 | agg_priority_admin2 = priority_aggregator(result_stub = '/admin2/priority', 176 | datasource = ds, 177 | regions = 'admin2_tower_map') 178 | 179 | agg_priority_admin2.attempt_aggregation(indicators_to_produce = {'unique_subscribers_per_day' : ['unique_subscribers', 'day'], 180 | 'percent_of_all_subscribers_active_per_day' : ['percent_of_all_subscribers_active', 'day'], 181 | 'origin_destination_connection_matrix_per_day' : ['origin_destination_connection_matrix', 
'day'], 182 | 'mean_distance_per_day' : ['mean_distance', 'day'], 183 | 'mean_distance_per_week' : ['mean_distance', 'week'], 184 | 'origin_destination_matrix_time_per_day' : ['origin_destination_matrix_time', 'day'], 185 | 'home_vs_day_location_per_day' : ['home_vs_day_location_per_day', ['day','week']], 186 | 'home_vs_day_location_per_day' : ['home_vs_day_location_per_day', ['day','month']]}) 187 | 188 | 189 | # ## Priority indicators for admin3 190 | 191 | # In[17]: 192 | 193 | 194 | agg_priority_admin3 = priority_aggregator(result_stub = '/admin3/priority', 195 | datasource = ds, 196 | regions = 'admin3_tower_map') 197 | 198 | agg_priority_admin3.attempt_aggregation(indicators_to_produce = {'transactions_per_hour' : ['transactions', 'hour'], 199 | 'transactions_per_hour' : ['transactions', 'hour']}) 200 | 201 | 202 | # ## Scaled priority indicators for admin2 203 | 204 | # In[ ]: 205 | 206 | 207 | agg_scaled_admin2 = scaled_aggregator(result_stub = '/admin2/scaled', 208 | datasource = ds, 209 | regions = 'admin2_tower_map') 210 | 211 | agg_scaled_admin2.attempt_aggregation() 212 | 213 | 214 | # ## Priority indicators for tower-cluster 215 | 216 | # In[ ]: 217 | 218 | 219 | agg_priority_tower = priority_aggregator(result_stub = '/voronoi/priority', 220 | datasource = ds, 221 | regions = 'voronoi_tower_map') 222 | 223 | agg_priority_tower.attempt_aggregation(indicators_to_produce = {'unique_subscribers_per_hour' : ['unique_subscribers', 'hour'], 224 | 'mean_distance_per_day' : ['mean_distance', 'day'], 225 | 'mean_distance_per_week' : ['mean_distance', 'week']}) 226 | 227 | 228 | # In[ ]: 229 | 230 | 231 | agg_priority_tower_harare = priority_aggregator(result_stub = '/voronoi/priority/harare', 232 | datasource = ds, 233 | regions = 'voronoi_tower_map_harare') 234 | 235 | agg_priority_tower_harare.attempt_aggregation(indicators_to_produce = {'origin_destination_connection_matrix_per_day' : ['origin_destination_connection_matrix', 'day']}) 236 | 237 | 238 | # In[ ]: 239 | 240 | 241 | agg_priority_tower_bulawayo = priority_aggregator(result_stub = '/voronoi/priority/bulawayo', 242 | datasource = ds, 243 | regions = 'voronoi_tower_map_bulawayo') 244 | 245 | agg_priority_tower_bulawayo.attempt_aggregation(indicators_to_produce = {'origin_destination_connection_matrix_per_day' : ['origin_destination_connection_matrix', 'day']}) 246 | 247 | 248 | # # Produce script 249 | 250 | # In[ ]: 251 | 252 | 253 | get_ipython().system('jupyter nbconvert --to script *.ipynb') 254 | 255 | 256 | # In[ ]: 257 | 258 | 259 | 260 | 261 | -------------------------------------------------------------------------------- /cdr-aggregation/notebooks/modules/sql_code_aggregates.py: -------------------------------------------------------------------------------- 1 | # Databricks notebook source 2 | def write_sql_code(calls = 'calls', 3 | start_date = "\'2020-02-01\'", 4 | end_date = "\'2020-03-31\'", 5 | start_date_weeks = "\'2020-02-03\'", 6 | end_date_weeks = "\'2020-03-29\'"): 7 | 8 | sql_code = { 9 | # Aggregate 1 (April 1 version) 10 | 'count_unique_subscribers_per_region_per_day' : 11 | """ 12 | SELECT * FROM ( 13 | SELECT calls.call_date AS visit_date, 14 | cells.region AS region, 15 | count(DISTINCT msisdn) AS subscriber_count 16 | FROM calls 17 | INNER JOIN cells 18 | ON calls.location_id = cells.cell_id 19 | WHERE calls.call_date >= {} 20 | AND calls.call_date <= CURRENT_DATE 21 | GROUP BY 1, 2 22 | ) AS grouped 23 | WHERE grouped.subscriber_count >= 15 24 | """.format(start_date), 25 | 26 | # Intermediate 
Result - Home location 27 | 'home_locations' : 28 | """ 29 | SELECT msisdn, region FROM ( 30 | SELECT 31 | msisdn, 32 | region, 33 | row_number() OVER ( 34 | PARTITION BY msisdn 35 | ORDER BY total DESC, latest_date DESC 36 | ) AS daily_location_rank 37 | FROM ( 38 | 39 | SELECT msisdn, 40 | region, 41 | count(*) AS total, 42 | max(call_date) AS latest_date 43 | FROM ( 44 | SELECT calls.msisdn, 45 | cells.region, 46 | calls.call_date, 47 | row_number() OVER ( 48 | PARTITION BY calls.msisdn, calls.call_date 49 | ORDER BY calls.call_datetime DESC 50 | ) AS event_rank 51 | FROM calls 52 | INNER JOIN cells 53 | ON calls.location_id = cells.cell_id 54 | WHERE calls.call_date >= {} 55 | AND calls.call_date <= {} 56 | 57 | ) ranked_events 58 | 59 | WHERE event_rank = 1 60 | GROUP BY 1, 2 61 | 62 | ) times_visited 63 | ) ranked_locations 64 | WHERE daily_location_rank = 1 65 | """.format(start_date, end_date), 66 | 67 | # Aggregate 2 (April 1 version) 68 | 'count_unique_active_residents_per_region_per_day' : 69 | """ 70 | SELECT * FROM ( 71 | SELECT calls.call_date AS visit_date, 72 | cells.region AS region, 73 | count(DISTINCT calls.msisdn) AS subscriber_count 74 | FROM calls 75 | INNER JOIN cells 76 | ON calls.location_id = cells.cell_id 77 | INNER JOIN home_locations homes -- See intermediate_queries.sql for code to create the home_locations table 78 | ON calls.msisdn = homes.msisdn 79 | AND cells.region = homes.region 80 | GROUP BY 1, 2 81 | ) AS grouped 82 | WHERE grouped.subscriber_count >= 15""", 83 | 84 | 'count_unique_visitors_per_region_per_day' : 85 | """ 86 | SELECT * FROM ( 87 | SELECT all_visits.visit_date, 88 | all_visits.region, 89 | all_visits.subscriber_count - coalesce(home_visits.subscriber_count, 0) AS subscriber_count 90 | FROM count_unique_subscribers_per_region_per_day all_visits 91 | LEFT JOIN count_unique_active_residents_per_region_per_day home_visits 92 | ON all_visits.visit_date = home_visits.visit_date 93 | AND all_visits.region = home_visits.region 94 | ) AS visitors 95 | WHERE visitors.subscriber_count >= 15""", 96 | 97 | # Aggregate 3 (April 1 version) 98 | 'count_unique_subscribers_per_region_per_week' : 99 | """ 100 | SELECT * FROM ( 101 | SELECT extract(WEEK FROM calls.call_date) AS visit_week, 102 | cells.region AS region, 103 | count(DISTINCT calls.msisdn) AS subscriber_count 104 | FROM calls 105 | INNER JOIN cells 106 | ON calls.location_id = cells.cell_id 107 | WHERE calls.call_date >= {} 108 | AND calls.call_date <= {} 109 | GROUP BY 1, 2 110 | ) AS grouped 111 | WHERE grouped.subscriber_count >= 15 112 | """.format(start_date_weeks, end_date_weeks), 113 | 114 | # Aggregate 4 (April 1 version) 115 | 'count_unique_active_residents_per_region_per_week' : 116 | """ 117 | SELECT * FROM ( 118 | SELECT extract(WEEK FROM calls.call_date) AS visit_week, 119 | cells.region AS region, 120 | count(DISTINCT calls.msisdn) AS subscriber_count 121 | FROM calls 122 | INNER JOIN cells 123 | ON calls.location_id = cells.cell_id 124 | INNER JOIN home_locations homes -- See intermediate_queries.sql for code to create the home_locations table 125 | ON calls.msisdn = homes.msisdn 126 | AND cells.region = homes.region 127 | WHERE calls.call_date >= {} 128 | AND calls.call_date <= {} 129 | GROUP BY 1, 2 130 | ) AS grouped 131 | WHERE grouped.subscriber_count >= 15 132 | """.format(start_date_weeks, end_date_weeks), 133 | 134 | 'count_unique_visitors_per_region_per_week' : 135 | """ 136 | SELECT * FROM ( 137 | SELECT all_visits.visit_week, 138 | all_visits.region, 139 | 
all_visits.subscriber_count - coalesce(home_visits.subscriber_count, 0) AS subscriber_count 140 | FROM count_unique_subscribers_per_region_per_week all_visits 141 | LEFT JOIN count_unique_active_residents_per_region_per_week home_visits 142 | ON all_visits.visit_week = home_visits.visit_week 143 | AND all_visits.region = home_visits.region 144 | ) AS visitors 145 | WHERE visitors.subscriber_count >= 15""", 146 | 147 | # Aggregate 5 (April 1 version) 148 | 'regional_pair_connections_per_day' : 149 | """ 150 | SELECT * FROM ( 151 | SELECT connection_date, 152 | region1, 153 | region2, 154 | count(*) AS subscriber_count 155 | FROM ( 156 | 157 | SELECT t1.call_date AS connection_date, 158 | t1.msisdn AS msisdn, 159 | t1.region AS region1, 160 | t2.region AS region2 161 | FROM ( 162 | SELECT DISTINCT calls.msisdn, 163 | calls.call_date, 164 | cells.region 165 | FROM calls 166 | INNER JOIN cells 167 | ON calls.location_id = cells.cell_id 168 | WHERE calls.call_date >= {} 169 | AND calls.call_date <= CURRENT_DATE 170 | ) t1 171 | 172 | FULL OUTER JOIN 173 | 174 | ( 175 | SELECT DISTINCT calls.msisdn, 176 | calls.call_date, 177 | cells.region 178 | FROM calls 179 | INNER JOIN cells 180 | ON calls.location_id = cells.cell_id 181 | WHERE calls.call_date >= {} 182 | AND calls.call_date <= CURRENT_DATE 183 | ) t2 184 | 185 | ON t1.msisdn = t2.msisdn 186 | AND t1.call_date = t2.call_date 187 | WHERE t1.region < t2.region 188 | 189 | ) AS pair_connections 190 | GROUP BY 1, 2, 3 191 | ) AS grouped 192 | WHERE grouped.subscriber_count >= 15 193 | """.format(start_date, start_date), 194 | 195 | # Aggregate 6 (April 2 version) 196 | 'directed_regional_pair_connections_per_day' : 197 | """ 198 | WITH subscriber_locations AS ( 199 | SELECT calls.msisdn, 200 | calls.call_date, 201 | cells.region, 202 | min(calls.call_datetime) AS earliest_visit, 203 | max(calls.call_datetime) AS latest_visit 204 | FROM calls 205 | INNER JOIN cells 206 | ON calls.location_id = cells.cell_id 207 | WHERE calls.call_date >= {} 208 | AND calls.call_date <= CURRENT_DATE 209 | GROUP BY msisdn, call_date, region 210 | ) 211 | SELECT * FROM ( 212 | SELECT connection_date, 213 | region_from, 214 | region_to, 215 | count(*) AS subscriber_count 216 | FROM ( 217 | 218 | SELECT t1.call_date AS connection_date, 219 | t1.msisdn AS msisdn, 220 | t1.region AS region_from, 221 | t2.region AS region_to 222 | FROM subscriber_locations t1 223 | FULL OUTER JOIN subscriber_locations t2 224 | ON t1.msisdn = t2.msisdn 225 | AND t1.call_date = t2.call_date 226 | WHERE t1.region <> t2.region 227 | AND t1.earliest_visit < t2.latest_visit 228 | 229 | ) AS pair_connections 230 | GROUP BY 1, 2, 3 231 | ) AS grouped 232 | WHERE grouped.subscriber_count >= 15 233 | """.format(start_date), 234 | 235 | # Aggregate 7 (April 3 version) 236 | 'total_calls_per_region_per_day' : 237 | """ 238 | SELECT 239 | call_date, 240 | region, 241 | total_calls 242 | FROM ( 243 | SELECT calls.call_date AS call_date, 244 | cells.region AS region, 245 | count(DISTINCT msisdn) AS subscriber_count, 246 | count(*) AS total_calls 247 | FROM calls 248 | INNER JOIN cells 249 | ON calls.location_id = cells.cell_id 250 | WHERE calls.call_date >= {} 251 | AND calls.call_date <= CURRENT_DATE 252 | GROUP BY 1, 2 253 | ) AS grouped 254 | WHERE grouped.subscriber_count >= 15 255 | """.format(start_date), 256 | 257 | # Aggregate 8 (April 3 version) 258 | 'home_location_counts_per_region' : 259 | """ 260 | SELECT * FROM ( 261 | SELECT region, count(msisdn) AS subscriber_count 262 | FROM 
home_locations -- See intermediate_queries.sql for code to create the home_locations table 263 | GROUP BY region 264 | ) AS home_counts 265 | WHERE home_counts.subscriber_count >= 15"""} 266 | return sql_code 267 | --------------------------------------------------------------------------------