├── .gitignore ├── Common ├── cdr_data.py ├── cdr_interpolation.py ├── cdr_origin_destination.py ├── cdr_statistics.py ├── config_object.py ├── helper.py ├── hive_connection.py └── hive_create_tables.py ├── Interpolation ├── README.md └── output_sample │ └── interpolation.csv ├── LICENSE ├── Origin_Destination ├── README.md └── output_sample │ └── origin_destination.tsv ├── README.md ├── Sample_Inputs ├── cdr_sample.csv └── mapping_sample.csv ├── Statistics ├── README.md ├── output_graphs │ ├── daily_avg_cdr.png │ ├── daily_cdr_by_call_type.png │ ├── daily_cdrs.png │ ├── daily_unique_avg_locations.png │ ├── daily_unique_locations.png │ ├── daily_unique_users.png │ └── user_data_histogram.png └── output_reports │ ├── css_file_data_stat.csv │ ├── css_provider_data_stat_daily.csv │ ├── css_provider_data_stat_monthly.csv │ ├── frequent_location_output_sample.png │ ├── japan._joined_ADMIN1.json │ ├── summary_stats.csv │ ├── zone_based_aggregations_level_ADMIN1.csv │ └── zone_based_aggregations_level_ADMIN2.csv ├── hive_init_commands ├── initial_hive_commands_interpolation.json ├── initial_hive_commands_od.json └── initial_hive_commands_stats.json ├── lib ├── Jama-1.0.2.jar ├── ajt-2.5.jar ├── cdrinterpolationlib.jar ├── cdrlibindicator.jar ├── cdrmobilitylib.jar ├── cdrmobilitylibge.jar ├── cdrmobilitylibjica.jar ├── commons-dbcp-1.4.jar ├── commons-lang-2.6.jar ├── commons-logging-1.1.3.jar ├── commons-math-2.1.jar ├── commons-pool-1.5.4.jar ├── gt-api-9.3.jar ├── gt-data-9.3.jar ├── gt-epsg-hsql-12.2.jar ├── gt-main-9.3.jar ├── gt-metadata-2.6.5.jar ├── gt-metadata-9.3.jar ├── gt-opengis-9.3.jar ├── gt-referencing-2.6.5.jar ├── gt-referencing-9.3.jar ├── gt-shapefile-2.6.5.jar ├── jahmm-0.6.2.jar ├── javaml-0.1.6.jar ├── jsr-275-1.0-beta-2.jar ├── jts-1.12.jar ├── jts-1.13.jar ├── jtsio-1.12.jar ├── libsvm.jar ├── pflow-hiveUDF.jar ├── postgis-jdbc-2.1.0SVN.jar ├── postgresql-9.3-1102.jdbc4.jar ├── vecmath-1.3.2.jar └── weka.jar ├── queries ├── cdr_and_mapping │ ├── create_consolidate_cdr.sql │ ├── create_mapping_admin.sql │ ├── create_preprocess_cdr.sql │ ├── create_preprocess_mapping.sql │ ├── create_raw_cdr.sql │ ├── create_raw_mapping.sql │ ├── insert_consolidate_cdr.sql │ ├── insert_consolidate_cdr_join.sql │ ├── insert_mapping_admin.sql │ ├── insert_preprocess_cdr.sql │ └── insert_preprocess_mapping.sql ├── interpolation │ ├── create_cdr_by_uid.sql │ ├── create_poi_relocation.sql │ ├── create_route_interpolation.sql │ ├── create_trip_24_hr_padding.sql │ ├── create_trip_format.sql │ ├── export_to_gps_format.sql │ ├── insert_cdr_by_uid.sql │ ├── insert_poi_relocation.sql │ ├── insert_route_interpolation.sql │ ├── insert_trip_24_hr_padding.sql │ └── insert_trip_format.sql ├── origin_destination │ ├── create_la_cdr_all_with_ant_zone_by_uid.sql │ ├── create_la_cdr_all_with_ant_zone_by_uid_od.sql │ ├── create_la_cdr_all_with_ant_zone_by_uid_od_detail.sql │ ├── create_la_cdr_all_with_ant_zone_by_uid_od_sum.sql │ ├── create_la_cdr_uid_home.sql │ ├── insert_la_cdr_all_with_ant_zone_by_old_consolidate.sql │ ├── insert_la_cdr_all_with_ant_zone_by_uid.sql │ ├── insert_la_cdr_all_with_ant_zone_by_uid_od.sql │ ├── insert_la_cdr_all_with_ant_zone_by_uid_od_detail.sql │ ├── insert_la_cdr_all_with_ant_zone_by_uid_od_sum.sql │ ├── insert_la_cdr_uid_home.sql │ └── od_to_csv.sql └── statistics │ ├── graphs │ ├── daily_average_cdrs │ │ └── daily_average_cdrs.sql │ ├── daily_average_unique_locations │ │ ├── daily_average_unique_locations.sql │ │ └── daily_average_unique_locations_old_consolidate.sql │ ├── 
daily_cdrs │ │ └── total_daily_cdrs.sql │ ├── daily_cdrs_by_call_type │ │ └── daily_cdrs_by_call_type.sql │ ├── daily_unique_locations │ │ ├── daily_unique_locations.sql │ │ ├── daily_unique_locations_old_consolidate.sql │ │ ├── total_unique_locations.sql │ │ └── total_unique_locations_old_consolidate.sql │ ├── daily_unique_users │ │ └── total_daily_uids.sql │ └── date_histogram │ │ └── histogram.sql │ ├── reports │ ├── all_statistics │ │ └── data_statistics.sql │ ├── daily_statistics │ │ └── daily_statistics.sql │ ├── frequent_locations │ │ ├── create_frequent_locations.sql │ │ ├── create_frequent_locations_night.sql │ │ ├── frequent_locations.sql │ │ ├── frequent_locations_night.sql │ │ ├── frequent_locations_night_old_consolidate.sql │ │ ├── frequent_locations_old_consolidate.sql │ │ ├── frequent_locations_thresholded.sql │ │ ├── frequent_locations_thresholded_night.sql │ │ ├── frequent_locations_wsum.sql │ │ └── frequent_locations_wsum_night.sql │ ├── monthly_statistics │ │ └── monthly_statistics.sql │ ├── summary │ │ ├── average_daily_admin1.sql │ │ ├── average_daily_sms.sql │ │ ├── average_daily_voice.sql │ │ ├── average_unique_cell_ids.sql │ │ └── total_days.sql │ └── zone_population │ │ └── zone_population.sql │ ├── total_records.sql │ └── total_unique_uids.sql ├── requirements.txt ├── run_interpolation.py ├── run_origin_destination.py ├── run_prepare_cdr_and_mapping.py ├── run_statistics.py └── sample_configs ├── config.json └── config_big.json /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | cdrenv/ 3 | japan.json 4 | configs/config.json 5 | configs/config_big.json 6 | __pycache__ 7 | not_used/ 8 | venv 9 | -------------------------------------------------------------------------------- /Common/cdr_data.py: -------------------------------------------------------------------------------- 1 | class CDRData: 2 | def __init__(self): 3 | pass 4 | -------------------------------------------------------------------------------- /Common/cdr_interpolation.py: -------------------------------------------------------------------------------- 1 | from Common.hive_connection import HiveConnection 2 | import time 3 | from Common.helper import format_two_point_time, sql_to_string 4 | 5 | 6 | class Interpolation: 7 | def __init__(self, config): 8 | self.config = config 9 | self.hc = HiveConnection() 10 | 11 | def calculate_interpolation(self): 12 | self.convert_cdr_to_array_format() 13 | self.create_trip_format() 14 | self.create_trip_24hr_padding() 15 | self.create_poi_relocation() 16 | self.create_route_interpolation() 17 | self.export_to_csv() 18 | 19 | def convert_cdr_to_array_format(self): 20 | provider_prefix = self.config.provider_prefix 21 | cursor = self.hc.cursor 22 | print('########## CREATE CDR BY UID ARRAY FORMAT TABLE ##########') 23 | timer = time.time() 24 | print('Checking and dropping {provider_prefix}_cdr_by_uid table if existing.' 25 | .format(provider_prefix=provider_prefix)) 26 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_cdr_by_uid' 27 | .format(provider_prefix=provider_prefix)) 28 | print('Checked and dropped {provider_prefix}_cdr_by_uid table if existing. 
' 29 | 'Elapsed time: {time} seconds' 30 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 31 | timer = time.time() 32 | print('Creating {provider_prefix}_cdr_by_uid table' 33 | .format(provider_prefix=provider_prefix)) 34 | raw_sql = sql_to_string('interpolation/create_cdr_by_uid.sql') 35 | query = raw_sql.format(provider_prefix=provider_prefix) 36 | cursor.execute(query) 37 | 38 | print('Created {provider_prefix}_cdr_by_uid table. Elapsed time: {time} seconds' 39 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 40 | timer = time.time() 41 | raw_sql = sql_to_string('interpolation/insert_cdr_by_uid.sql') 42 | print('Inserting into {provider_prefix}_cdr_by_uid table' 43 | .format(provider_prefix=provider_prefix)) 44 | query = raw_sql.format(provider_prefix=provider_prefix, max_size_cdr_by_uid=self.config.max_size_cdr_by_uid) 45 | cursor.execute(query) 46 | 47 | print('Inserted into {provider_prefix}_cdr_by_uid table. Elapsed time: {time} seconds' 48 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 49 | print('########## FINISHED CREATING CDR BY UID TABLE ##########') 50 | 51 | def create_trip_format(self): 52 | provider_prefix = self.config.provider_prefix 53 | cursor = self.hc.cursor 54 | print('########## CREATE CDR BY UID ARRAY TRIP FORMAT TABLE ##########') 55 | timer = time.time() 56 | print('Checking and dropping {provider_prefix}_cdr_by_uid_trip table if existing.' 57 | .format(provider_prefix=provider_prefix)) 58 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_cdr_by_uid_trip' 59 | .format(provider_prefix=provider_prefix)) 60 | print('Checked and dropped {provider_prefix}_cdr_by_uid_trip table if existing. ' 61 | 'Elapsed time: {time} seconds' 62 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 63 | timer = time.time() 64 | print('Creating {provider_prefix}_cdr_by_uid_trip table' 65 | .format(provider_prefix=provider_prefix)) 66 | raw_sql = sql_to_string('interpolation/create_trip_format.sql') 67 | query = raw_sql.format(provider_prefix=provider_prefix) 68 | cursor.execute(query) 69 | 70 | print('Created {provider_prefix}_cdr_by_uid_trip table. Elapsed time: {time} seconds' 71 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 72 | timer = time.time() 73 | raw_sql = sql_to_string('interpolation/insert_trip_format.sql') 74 | print('Inserting into {provider_prefix}_cdr_by_uid_trip table' 75 | .format(provider_prefix=provider_prefix)) 76 | query = raw_sql.format(provider_prefix=provider_prefix) 77 | cursor.execute(query) 78 | 79 | print('Inserted into {provider_prefix}_cdr_by_uid_trip table. Elapsed time: {time} seconds' 80 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 81 | print('########## FINISHED CREATING CDR BY UID TRIP FORMAT TABLE ##########') 82 | 83 | def create_trip_24hr_padding(self): 84 | provider_prefix = self.config.provider_prefix 85 | cursor = self.hc.cursor 86 | print('########## CREATE TRIP 24 HR PADDING TABLE ##########') 87 | timer = time.time() 88 | print('Checking and dropping {provider_prefix}_cdr_by_uid_trip_organized_array_apd table if existing.' 
89 | .format(provider_prefix=provider_prefix)) 90 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_cdr_by_uid_trip_organized_array_apd' 91 | .format(provider_prefix=provider_prefix)) 92 | print('Checked and dropped {provider_prefix}_cdr_by_uid_trip_organized_array_apd table if existing. ' 93 | 'Elapsed time: {time} seconds' 94 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 95 | timer = time.time() 96 | print('Creating {provider_prefix}_cdr_by_uid_trip_organized_array_apd table' 97 | .format(provider_prefix=provider_prefix)) 98 | raw_sql = sql_to_string('interpolation/create_trip_24_hr_padding.sql') 99 | query = raw_sql.format(provider_prefix=provider_prefix) 100 | cursor.execute(query) 101 | 102 | print('Created {provider_prefix}_cdr_by_uid_trip_organized_array_apd table. Elapsed time: {time} seconds' 103 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 104 | timer = time.time() 105 | raw_sql = sql_to_string('interpolation/insert_trip_24_hr_padding.sql') 106 | print('Inserting into {provider_prefix}_cdr_by_uid_trip_organized_array_apd table' 107 | .format(provider_prefix=provider_prefix)) 108 | query = raw_sql.format(provider_prefix=provider_prefix) 109 | cursor.execute(query) 110 | 111 | print('Inserted into {provider_prefix}_cdr_by_uid_trip_organized_array_apd table. Elapsed time: {time} seconds' 112 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 113 | print('########## FINISHED TRIP 24 HR PADDING TABLE ##########') 114 | 115 | def create_poi_relocation(self): 116 | provider_prefix = self.config.provider_prefix 117 | cursor = self.hc.cursor 118 | print('########## CREATE POI RELOCATION TABLE ##########') 119 | timer = time.time() 120 | print('Checking and dropping {provider_prefix}_cdr_by_uid_trip_realloc_array_apd table if existing.' 121 | .format(provider_prefix=provider_prefix)) 122 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_cdr_by_uid_trip_realloc_array_apd' 123 | .format(provider_prefix=provider_prefix)) 124 | print('Checked and dropped {provider_prefix}_cdr_by_uid_trip_realloc_array_apd table if existing. ' 125 | 'Elapsed time: {time} seconds' 126 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 127 | timer = time.time() 128 | print('Creating {provider_prefix}_cdr_by_uid_trip_realloc_array_apd table' 129 | .format(provider_prefix=provider_prefix)) 130 | raw_sql = sql_to_string('interpolation/create_poi_relocation.sql') 131 | query = raw_sql.format(provider_prefix=provider_prefix) 132 | cursor.execute(query) 133 | 134 | print('Created {provider_prefix}_cdr_by_uid_trip_realloc_array_apd table. Elapsed time: {time} seconds' 135 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 136 | timer = time.time() 137 | raw_sql = sql_to_string('interpolation/insert_poi_relocation.sql') 138 | print('Inserting into {provider_prefix}_cdr_by_uid_trip_realloc_array_apd table' 139 | .format(provider_prefix=provider_prefix)) 140 | query = raw_sql.format(provider_prefix=provider_prefix, 141 | poi=self.config.interpolation_poi_file_location.split('/')[-1]) 142 | cursor.execute(query) 143 | 144 | print('Inserted into {provider_prefix}_cdr_by_uid_trip_realloc_array_apd table. 
Elapsed time: {time} seconds' 145 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 146 | print('########## FINISHED CREATING POI RELOCATION TABLE ##########') 147 | 148 | def create_route_interpolation(self): 149 | provider_prefix = self.config.provider_prefix 150 | cursor = self.hc.cursor 151 | print('########## CREATE ROUTE INTERPOLATION TABLE ##########') 152 | timer = time.time() 153 | print('Checking and dropping {provider_prefix}_cdr_by_uid_trip_routing_array_apd table if existing.' 154 | .format(provider_prefix=provider_prefix)) 155 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_cdr_by_uid_trip_routing_array_apd' 156 | .format(provider_prefix=provider_prefix)) 157 | print('Checked and dropped {provider_prefix}_cdr_by_uid_trip_routing_array_apd table if existing. ' 158 | 'Elapsed time: {time} seconds' 159 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 160 | timer = time.time() 161 | print('Creating {provider_prefix}_cdr_by_uid_trip_routing_array_apd table' 162 | .format(provider_prefix=provider_prefix)) 163 | raw_sql = sql_to_string('interpolation/create_route_interpolation.sql') 164 | query = raw_sql.format(provider_prefix=provider_prefix) 165 | cursor.execute(query) 166 | 167 | print('Created {provider_prefix}_cdr_by_uid_trip_routing_array_apd table. Elapsed time: {time} seconds' 168 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 169 | timer = time.time() 170 | raw_sql = sql_to_string('interpolation/insert_route_interpolation.sql') 171 | print('Inserting into {provider_prefix}_cdr_by_uid_trip_routing_array_apd table' 172 | .format(provider_prefix=provider_prefix)) 173 | query = raw_sql.format(provider_prefix=provider_prefix, 174 | max_size_interpolation=self.config.max_size_interpolation, 175 | osm=self.config.interpolation_osm_file_location.split('/')[-1], 176 | voronoi=self.config.interpolation_voronoi_file_location.split('/')[-1]) 177 | cursor.execute(query) 178 | 179 | print('Inserted into {provider_prefix}_cdr_by_uid_trip_routing_array_apd table. Elapsed time: {time} seconds' 180 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 181 | print('########## FINISHED ROUTE INTERPOLATION TABLE ##########') 182 | 183 | def export_to_csv(self): 184 | provider_prefix = self.config.provider_prefix 185 | cursor = self.hc.cursor 186 | print('########## Exporting route interpolation to CSV ##########') 187 | timer = time.time() 188 | raw_sql = sql_to_string('interpolation/export_to_gps_format.sql') 189 | query = raw_sql.format(provider_prefix=provider_prefix) 190 | cursor.execute(query) 191 | print('Exported to CSV. 
Elapsed time: {time} seconds' 192 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 193 | print('########## FINISHED EXPORTING, FILE LOCATED IN /tmp/hive/cdr_interpolation ##########') 194 | -------------------------------------------------------------------------------- /Common/cdr_origin_destination.py: -------------------------------------------------------------------------------- 1 | from Common.hive_connection import HiveConnection 2 | import time 3 | from Common.helper import format_two_point_time, sql_to_string 4 | 5 | 6 | class OriginDestination: 7 | def __init__(self, config): 8 | self.config = config 9 | self.hc = HiveConnection() 10 | 11 | def calculate_od(self): 12 | self.cdr_by_uid() 13 | self.create_od() 14 | self.create_od_detail() 15 | self.create_od_sum() 16 | 17 | def cdr_by_uid(self): 18 | provider_prefix = self.config.provider_prefix 19 | od_admin_unit = self.config.od_admin_unit 20 | cursor = self.hc.cursor 21 | print('########## CREATE CDR BY UID TABLE ##########') 22 | timer = time.time() 23 | print('Checking and dropping {provider_prefix}_la_cdr_all_with_ant_zone_by_uid table if existing.' 24 | .format(provider_prefix=provider_prefix)) 25 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_la_cdr_all_with_ant_zone_by_uid' 26 | .format(provider_prefix=provider_prefix)) 27 | print('Checked and dropped {provider_prefix}_la_cdr_all_with_ant_zone_by_uid table if existing. ' 28 | 'Elapsed time: {time} seconds' 29 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 30 | timer = time.time() 31 | print('Creating {provider_prefix}_la_cdr_all_with_ant_zone_by_uid table' 32 | .format(provider_prefix=provider_prefix)) 33 | raw_sql = sql_to_string('origin_destination/create_la_cdr_all_with_ant_zone_by_uid.sql') 34 | query = raw_sql.format(provider_prefix=provider_prefix) 35 | cursor.execute(query) 36 | print('Created {provider_prefix}_la_cdr_all_with_ant_zone_by_uid table. Elapsed time: {time} seconds' 37 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 38 | timer = time.time() 39 | raw_sql = sql_to_string('origin_destination/insert_la_cdr_all_with_ant_zone_by_uid.sql') 40 | print('Inserting into {provider_prefix}_la_cdr_all_with_ant_zone_by_uid table' 41 | .format(provider_prefix=provider_prefix)) 42 | query = raw_sql.format(provider_prefix=provider_prefix, target_admin=od_admin_unit, od_date=self.config.od_date) 43 | cursor.execute(query) 44 | 45 | print('Inserted into {provider_prefix}_la_cdr_all_with_ant_zone_by_uid table. Elapsed time: {time} seconds' 46 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 47 | print('########## FINISHED CREATING CDR BY UID TABLE ##########') 48 | 49 | def create_od(self): 50 | provider_prefix = self.config.provider_prefix 51 | od_admin_unit = self.config.od_admin_unit 52 | cursor = self.hc.cursor 53 | print('########## CREATE OD TABLE ##########') 54 | timer = time.time() 55 | print('Checking and dropping {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od table if existing.' 56 | .format(provider_prefix=provider_prefix)) 57 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od' 58 | .format(provider_prefix=provider_prefix)) 59 | 60 | print('Checked and dropped {provider_prefix}_la_cdr_all_with_ant_zone_by_uid table if existing.' 
61 | ' Elapsed time: {time} seconds' 62 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 63 | print('Creating {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od table'.format( 64 | provider_prefix=provider_prefix)) 65 | timer = time.time() 66 | raw_sql = sql_to_string('origin_destination/create_la_cdr_all_with_ant_zone_by_uid_od.sql') 67 | query = raw_sql.format(provider_prefix=provider_prefix) 68 | cursor.execute(query) 69 | 70 | print('Created {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od table. Elapsed time: {time}' 71 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 72 | timer = time.time() 73 | print('Inserting into {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od table' 74 | .format(provider_prefix=provider_prefix)) 75 | raw_sql = sql_to_string('origin_destination/insert_la_cdr_all_with_ant_zone_by_uid_od.sql') 76 | query = raw_sql.format(provider_prefix=provider_prefix, target_unit=od_admin_unit) 77 | cursor.execute(query) 78 | 79 | print('Inserted into {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od table. Elapsed time: {time} seconds' 80 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 81 | print('########## FINISHED CREATING OD TABLE ##########') 82 | 83 | def create_od_detail(self): 84 | provider_prefix = self.config.provider_prefix 85 | cursor = self.hc.cursor 86 | print('########## CREATING OD DETAIL TABLE ##########') 87 | timer = time.time() 88 | print('Checking and dropping {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail table if existing.' 89 | .format(provider_prefix=provider_prefix)) 90 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail ' 91 | .format(provider_prefix=provider_prefix)) 92 | 93 | print('Checked and dropped {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail table if existing. ' 94 | 'Elapsed time: {time} seconds' 95 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 96 | print('Creating {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail table'.format( 97 | provider_prefix=provider_prefix)) 98 | 99 | raw_sql = sql_to_string('origin_destination/create_la_cdr_all_with_ant_zone_by_uid_od_detail.sql') 100 | query = raw_sql.format(provider_prefix=provider_prefix) 101 | cursor.execute(query) 102 | 103 | print('Created {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail table'.format( 104 | provider_prefix=provider_prefix)) 105 | timer = time.time() 106 | print('Inserting into {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail table'.format( 107 | provider_prefix=provider_prefix)) 108 | raw_sql = sql_to_string('origin_destination/insert_la_cdr_all_with_ant_zone_by_uid_od_detail.sql') 109 | query = raw_sql.format(provider_prefix=provider_prefix) 110 | cursor.execute(query) 111 | 112 | print('Inserted into {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail table. ' 113 | 'Elapsed time: {time} seconds' 114 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 115 | print('########## CREATING OD DETAIL TABLE ##########') 116 | 117 | def create_od_sum(self): 118 | provider_prefix = self.config.provider_prefix 119 | cursor = self.hc.cursor 120 | print('########## CREATING OD SUM TABLE ##########') 121 | timer = time.time() 122 | print('Checking and dropping {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum table if existing.' 
123 | .format(provider_prefix=provider_prefix)) 124 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum ' 125 | .format(provider_prefix=provider_prefix)) 126 | print('Checked and dropped {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum table if existing. ' 127 | 'Elapsed time: {time} seconds' 128 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 129 | print('Creating {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum table'.format( 130 | provider_prefix=provider_prefix)) 131 | raw_sql = sql_to_string('origin_destination/create_la_cdr_all_with_ant_zone_by_uid_od_sum.sql') 132 | query = raw_sql.format(provider_prefix=provider_prefix) 133 | cursor.execute(query) 134 | print('Created {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum table'.format( 135 | provider_prefix=provider_prefix)) 136 | timer = time.time() 137 | 138 | print('Inserting into {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum table'.format( 139 | provider_prefix=provider_prefix)) 140 | raw_sql = sql_to_string('origin_destination/insert_la_cdr_all_with_ant_zone_by_uid_od_sum.sql') 141 | query = raw_sql.format(provider_prefix=provider_prefix) 142 | cursor.execute(query) 143 | print('Inserted into {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum table. ' 144 | 'Elapsed time: {time} seconds' 145 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 146 | raw_sql = sql_to_string('origin_destination/od_to_csv.sql') 147 | query = raw_sql.format(provider_prefix=provider_prefix) 148 | cursor.execute(query) 149 | print('OD Result is stored in /tmp/hive/od_result') 150 | print('########## FINISHED CREATING OD SUM TABLE ##########') 151 | -------------------------------------------------------------------------------- /Common/cdr_statistics.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | import matplotlib.pyplot as plt 4 | import os 5 | from Common.hive_connection import HiveConnection 6 | from Common import helper as hp 7 | import time 8 | from datetime import datetime 9 | 10 | months = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 11 | 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'} 12 | 13 | 14 | class Statistics: 15 | def __init__(self, config): 16 | self.config = config 17 | self.hc = HiveConnection() 18 | 19 | def calculate_data_statistics(self): 20 | cdr_data_layer = self.config.cdr_data_layer 21 | disable = False 22 | for item in cdr_data_layer: 23 | if str.lower(item['name']) == 'call_time' and item['output_no'] == -1 \ 24 | or str.lower(item['name']) == 'uid' and item['output_no'] == -1 \ 25 | or str.lower(item['name']) == 'imei' and item['output_no'] == -1 \ 26 | or str.lower(item['name']) == 'imsi' and item['output_no'] == -1 \ 27 | or str.lower(item['name']) == 'cell_id' and item['output_no'] == -1: 28 | disable = True 29 | 30 | if not disable: 31 | provider_prefix = self.config.provider_prefix 32 | output_report_location = self.config.output_report_location 33 | print('########## CALCULATING DATA STATISTICS ##########') 34 | cursor = self.hc.cursor 35 | imei = "count(distinct IMEI) as unique_imei, " 36 | imsi = "count(distinct IMSI) as unique_imsi, " 37 | raw_sql = hp.sql_to_string('statistics/reports/all_statistics/data_statistics.sql') 38 | query = raw_sql.format(provider_prefix=provider_prefix, imei=imei, imsi=imsi) 39 | 
print('Calculating data statistics') 40 | timer = time.time() 41 | cursor.execute(query) 42 | print('Calculated data statistics. Elapsed time: {} seconds' 43 | .format(hp.format_two_point_time(timer, time.time()))) 44 | print('Writing to {}/css_file_data_stat.csv'.format(output_report_location)) 45 | timer = time.time() 46 | 47 | with open("{}/css_file_data_stat.csv".format(output_report_location), "w", newline='') as outfile: 48 | writer = csv.writer(outfile, quoting=csv.QUOTE_NONNUMERIC) 49 | writer.writerow(col[0] for col in cursor.description) 50 | for row in cursor: 51 | writer.writerow(row) 52 | print('Successfully wrote to {}/css_file_data_stat.csv'.format(output_report_location)) 53 | print('Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 54 | print('########## FINISHED CALCULATING DATA STATISTICS ##########') 55 | else: 56 | print('Mapping for call_time, imsi, imei or uid is not sufficient. Ignored data statistic') 57 | 58 | def calculate_daily_statistics(self): 59 | provider_prefix = self.config.provider_prefix 60 | output_report_location = self.config.output_report_location 61 | cdr_data_layer = self.config.cdr_data_layer 62 | output_graph_location = self.config.output_graph_location 63 | imei = "count(distinct IMEI) as unique_imei, " 64 | imsi = "count(distinct IMSI) as unique_imsi, " 65 | cursor = self.hc.cursor 66 | file_location = '{}/css_file_data_stat.csv'.format(output_report_location) 67 | time_dict = hp.get_time_from_csv(file_location) 68 | start_date, end_date = time_dict['start_date'], time_dict['end_date'] 69 | 70 | disable = False 71 | for item in cdr_data_layer: 72 | if str.lower(item['name']) == 'network_type' and item['output_no'] == -1 \ 73 | or str.lower(item['name']) == 'call_type' and item['output_no'] == -1: 74 | disable = True 75 | if not disable: 76 | print('########## CALCULATING DAILY STATISTICS ##########') 77 | results = [] 78 | timer = time.time() 79 | print('Calculating Daily Statistics') 80 | # FOR CASE ALL 81 | raw_query = hp.sql_to_string('statistics/reports/daily_statistics/daily_statistics.sql') 82 | query = raw_query.format(provider_prefix=provider_prefix, 83 | start_date=start_date, 84 | end_date=end_date, 85 | imei=imei, 86 | imsi=imsi) 87 | cursor.execute(query) 88 | print('Query completed. Time elapsed: {} seconds.'.format(hp.format_two_point_time(timer, time.time()))) 89 | description = cursor.description 90 | rows = [] 91 | for row in cursor: 92 | rows.append(row) 93 | results += rows 94 | print('Writing into the graph for daily statistics') 95 | file_path = '{}/css_provider_data_stat_daily.csv'.format(output_report_location) 96 | if os.path.exists(file_path): 97 | os.remove(file_path) 98 | 99 | with open(file_path, "w", newline='') as outfile: 100 | writer = csv.writer(outfile, quoting=csv.QUOTE_NONNUMERIC) 101 | writer.writerow(col[0][4:] for col in description) 102 | for row in results: 103 | writer.writerow(row) 104 | print('Successfully wrote to file css_provider_data_stat_daily.csv') 105 | print('########## FINISHED CALCULATING DAILY STATISTICS ##########') 106 | 107 | print('########## Querying daily cdr by call_type ##########') 108 | timer = time.time() 109 | raw_sql = hp.sql_to_string('statistics/graphs/daily_cdrs_by_call_type/daily_cdrs_by_call_type.sql') 110 | query = raw_sql.format(provider_prefix=provider_prefix, 111 | start_date=start_date, 112 | end_date=end_date, 113 | imei=imei, imsi=imsi) 114 | cursor.execute(query) 115 | print('Query completed. 
Time elapsed: {} seconds.'.format(hp.format_two_point_time(timer, time.time()))) 116 | 117 | rows = [] 118 | xs_all = set([]) 119 | ys_all = [] 120 | ys_data = [] 121 | ys_voice_or_sms = [] 122 | for row in cursor: 123 | rows.append(row) 124 | xs_all.add(row[0]) 125 | xs_all = list(xs_all) 126 | xs_all.sort(key=lambda date: datetime.strptime(date, '%Y-%m-%d')) 127 | # find the day in rows and match, then extract ALL, DATA and VOICE/SMS 128 | print('Writing into the graph for daily cdr by call type') 129 | for day in xs_all: 130 | c_all = 0 131 | c_data = 0 132 | c_sms_voice = 0 133 | for row in rows: 134 | if row[0] == day: 135 | if row[1] == 'ALL': 136 | c_all += row[3] 137 | elif row[1] == 'DATA': 138 | c_data += row[3] 139 | elif row[1] in ['VOICE', 'SMS']: 140 | c_sms_voice += row[3] 141 | ys_all.append(c_all) 142 | ys_data.append(c_data) 143 | ys_voice_or_sms.append(c_sms_voice) 144 | figure = plt.figure(figsize=(14, 11)) 145 | font_dict = { 146 | 'fontsize': 21, 147 | 'fontweight': 'bold', 148 | } 149 | figure.add_subplot(111) 150 | plt.subplots_adjust(top=0.95) 151 | plt.grid(b=True) 152 | plt.plot(xs_all, ys_all) 153 | plt.plot(xs_all, ys_data) 154 | plt.plot(xs_all, ys_voice_or_sms) 155 | plt.ylabel('Total Records') 156 | plt.xticks(rotation=90) 157 | plt.xlabel('Date') 158 | plt.title('Daily CDR by call type', fontdict=font_dict) 159 | plt.legend(['ALL', 'DATA', 'VOICE and SMS'], loc='upper left') 160 | plt.savefig('{}/daily_cdr_by_call_type'.format(output_graph_location)) 161 | plt.clf() 162 | print('Graph created successfully in {}/daily_cdr_by_call_type'.format(output_graph_location)) 163 | else: 164 | print('Mapping for network_type or call_type is not sufficient. Ignored daily statistics') 165 | 166 | def calculate_monthly_statistics(self): 167 | provider_prefix = self.config.provider_prefix 168 | output_report_location = self.config.output_report_location 169 | cdr_data_layer = self.config.cdr_data_layer 170 | cursor = self.hc.cursor 171 | disable = False 172 | for item in cdr_data_layer: 173 | if str.lower(item['name']) == 'network_type' and item['output_no'] == -1 \ 174 | or str.lower(item['name']) == 'call_type' and item['output_no'] == -1: 175 | disable = True 176 | if not disable: 177 | print('########## CALCULATING MONTHLY STATISTICS ##########') 178 | results = [] 179 | file_location = '{}/css_file_data_stat.csv'.format(output_report_location) 180 | imei = "count(distinct IMEI) as unique_imei, " 181 | imsi = "count(distinct IMSI) as unique_imsi, " 182 | time_dict = hp.get_time_from_csv(file_location) 183 | start_y, start_m, end_y, end_m = time_dict['start_y'], time_dict['start_m'], \ 184 | time_dict['end_y'], time_dict['end_m'] 185 | print('### Calculating Monthly Statistics ###') 186 | # FOR CASE ALL 187 | raw_sql = hp.sql_to_string('statistics/reports/monthly_statistics/monthly_statistics.sql') 188 | query = raw_sql.format(provider_prefix=provider_prefix, 189 | start_year=start_y, 190 | end_year=end_y, 191 | start_month=start_m, 192 | end_month=end_m, 193 | imei=imei, 194 | imsi=imsi) 195 | cursor.execute(query) 196 | description = cursor.description 197 | rows = [] 198 | for row in cursor: 199 | rows.append(row) 200 | 201 | results += rows  # the loop above already consumed the cursor, so fetchall() would return nothing 202 | 203 | file_path = '{}/css_provider_data_stat_monthly.csv'.format(output_report_location) 204 | if os.path.exists(file_path): 205 | os.remove(file_path) 206 | 207 | with open(file_path, "w", newline='') as outfile: 208 | writer = csv.writer(outfile, quoting=csv.QUOTE_NONNUMERIC) 209 | writer.writerow(col[0][4:]
for col in description) 210 | for row in results: 211 | writer.writerow(row) 212 | 213 | print('### Successfully wrote to file css_provider_data_stat_monthly.csv ###') 214 | print('########## FINISHED CALCULATING MONTHLY STATISTICS ##########') 215 | else: 216 | print('Mapping for network_type or call_type is not sufficient. Ignored monthly statistics') 217 | 218 | def calculate_zone_population(self): 219 | print('########## CALCULATING ZONE POPULATION STATISTICS ##########') 220 | provider_prefix = self.config.provider_prefix 221 | output_report_location = self.config.output_report_location 222 | cursor = self.hc.cursor 223 | cdr_cell_tower = self.config.cdr_cell_tower 224 | admin_units = ['ADMIN0', 'ADMIN1', 'ADMIN2', 'ADMIN3', 'ADMIN4', 'ADMIN5'] 225 | admin_units_active = [] 226 | geo_jsons_active = [] 227 | name_columns = [] 228 | geo_json_filename = [] 229 | for col in cdr_cell_tower: 230 | if col['name'] in admin_units: 231 | admin_units_active.append(col['name']) 232 | if col['geojson_filename'] == '': 233 | geo_jsons_active.append('') 234 | else: 235 | geo_jsons_active.append(hp.json_file_to_object(col['geojson_filename'], encoding="utf-8")) 236 | name_columns.append(col['geojson_col_name']) 237 | geo_json_filename.append(col['geojson_filename']) 238 | geo_i = 0 239 | for admin_unit in admin_units_active: 240 | timer = time.time() 241 | print('Calculating zone population for {admin}'.format(admin=admin_unit)) 242 | raw_sql = hp.sql_to_string('statistics/reports/zone_population/zone_population.sql') 243 | query = raw_sql.format(provider_prefix=provider_prefix, level=admin_unit) 244 | cursor.execute(query) 245 | description = cursor.description 246 | print('Successfully calculated zone population for {admin}. Elapsed time: {time} seconds' 247 | .format(admin=admin_unit, time=hp.format_two_point_time(timer, time.time()))) 248 | timer = time.time() 249 | rows = [] 250 | for row in cursor: 251 | rows.append(row) 252 | 253 | file_path = '{output_report_location}/zone_based_aggregations_level_{level}.csv'.format( 254 | output_report_location=output_report_location, level=admin_unit) 255 | if geo_jsons_active[geo_i] != '': 256 | print('Merging dictionary object to geojson') 257 | for f in range(0, len(geo_jsons_active[geo_i]['features'])): 258 | # TODO fix mockup 259 | if geo_jsons_active[geo_i]['features'][f]['properties'][name_columns[geo_i]] == 'Kochi Ken': 260 | geo_jsons_active[geo_i]['features'][f]['properties']['num_population'] = 'Kochi Ken' 261 | print('Merging completed. Time elapsed: {} seconds' 262 | .format(hp.format_two_point_time(timer, time.time()))) 263 | timer = time.time() 264 | else: 265 | print('No geojson file input') 266 | 267 | if os.path.exists(file_path): 268 | os.remove(file_path) 269 | 270 | print('Writing zone population result to {}'.format(file_path)) 271 | with open(file_path, "w", newline='') as outfile: 272 | writer = csv.writer(outfile, quoting=csv.QUOTE_NONNUMERIC) 273 | writer.writerow(col[0] for col in description) 274 | for row in rows: 275 | writer.writerow(row) 276 | print('Writing completed. 
Time elapsed: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 277 | timer = time.time() 278 | 279 | if geo_json_filename[geo_i] != '': 280 | print('Writing into geojson file ' + geo_json_filename[geo_i][:-4] + '_joined_' + admin_unit + '.json') 281 | with open('{}/'.format(output_report_location) + geo_json_filename[geo_i][:-4] + 282 | '_joined_' + admin_unit + '.json', "w", newline='') as outfile: 283 | json.dump(geo_jsons_active[geo_i], outfile) 284 | print('Writing completed. Time elapsed: {} seconds' 285 | .format(hp.format_two_point_time(timer, time.time()))) 286 | geo_i += 1 287 | print('########## FINISHED CALCULATING ZONE POPULATION STATISTICS ##########') 288 | 289 | def calculate_user_date_histogram(self): 290 | print('########## CALCULATING USER DATE HISTOGRAM ##########') 291 | output_graph_location = self.config.output_graph_location 292 | output_report_location = self.config.output_report_location 293 | provider_prefix = self.config.provider_prefix 294 | cursor = self.hc.cursor 295 | raw_sql = hp.sql_to_string('statistics/graphs/date_histogram/histogram.sql') 296 | query = raw_sql.format(provider_prefix=provider_prefix) 297 | 298 | timer = time.time() 299 | print('Calculating data histogram') 300 | cursor.execute(query) 301 | description = cursor.description 302 | rows = cursor.fetchall() 303 | print('Calculating completed. Time elapsed: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 304 | 305 | file_path = '{}/histogram.csv'.format(output_report_location) 306 | print('Writing into {}'.format(file_path)) 307 | if os.path.exists(file_path): 308 | os.remove(file_path) 309 | 310 | with open(file_path, "w", newline='') as outfile: 311 | writer = csv.writer(outfile, quoting=csv.QUOTE_NONNUMERIC) 312 | writer.writerow(col[0] for col in description) 313 | for row in rows: 314 | writer.writerow(row) 315 | print('Writing completed.') 316 | 317 | xs = [] 318 | ys = [] 319 | for row in rows: 320 | json_data = hp.string_to_json(row[0]) 321 | xs.append(json_data['x']) 322 | ys.append(json_data['y']) 323 | 324 | plt.subplots_adjust(left=0.15) 325 | plt.bar(xs, ys, align='center') # A bar chart 326 | plt.xlabel('Active Day Bins') 327 | plt.ylabel('Count No. Unique Ids') 328 | print('Plotting graph and writing into {}/user_data_histogram.png'.format(output_graph_location)) 329 | plt.savefig('{}/user_data_histogram.png'.format(output_graph_location)) 330 | print('Done.') 331 | print('########## CALCULATING USER DATE HISTOGRAM ##########') 332 | 333 | def calculate_summary(self): 334 | output_report_location = self.config.output_report_location 335 | provider_prefix = self.config.provider_prefix 336 | cdr_cell_tower = self.config.cdr_cell_tower 337 | cdr_data_layer = self.config.cdr_data_layer 338 | print('########## CALCULATING SUMMARY ##########') 339 | cursor = self.hc.cursor 340 | tb_1_description = ('All Data', 'Value') 341 | tb_2_description = ('Statistics',) 342 | output_1_rows = [] 343 | 344 | print('Calculating total records') 345 | timer = time.time() 346 | raw_sql = hp.sql_to_string('statistics/total_records.sql') 347 | q_total_records = raw_sql.format(provider_prefix=provider_prefix) 348 | cursor.execute(q_total_records) 349 | 350 | des = cursor.description 351 | row_total_records = cursor.fetchall() 352 | row_total_records = (des[0][0], row_total_records[0][0]) 353 | output_1_rows.append(row_total_records) 354 | total_records = row_total_records[1] 355 | print('Successfully calculated total records. 
Total records: {recs} records \nElapsed time: {time} seconds' 356 | .format(recs=total_records, time=hp.format_two_point_time(timer, time.time()))) 357 | timer = time.time() 358 | 359 | print('Calculating total unique uids') 360 | raw_sql = hp.sql_to_string('statistics/total_unique_uids.sql') 361 | q_total_uids = raw_sql.format(provider_prefix=provider_prefix) 362 | cursor.execute(q_total_uids) 363 | 364 | des = cursor.description 365 | row_total_uids = cursor.fetchall() 366 | row_total_uids = (des[0][0], row_total_uids[0][0]) 367 | output_1_rows.append(row_total_uids) 368 | total_uids = row_total_uids[1] 369 | print('Successfully calculated total unique uids. Total unique ids: {ids} ids \nElapsed time: {time} seconds' 370 | .format(ids=total_uids, time=hp.format_two_point_time(timer, time.time()))) 371 | timer = time.time() 372 | raw_sql = hp.sql_to_string('statistics/reports/summary/total_days.sql') 373 | print('Calculating total days') 374 | query = raw_sql.format(provider_prefix=provider_prefix) 375 | cursor.execute(query) 376 | des = cursor.description 377 | row_total_days = cursor.fetchall() 378 | 379 | total_days = row_total_days[0][0] 380 | start_yyyy_mm_dd = row_total_days[0][1].split('-') 381 | end_yyyy_mm_dd = row_total_days[0][2].split('-') 382 | 383 | start_day = start_yyyy_mm_dd[2] 384 | start_month = start_yyyy_mm_dd[1] 385 | start_year = start_yyyy_mm_dd[0] 386 | 387 | end_day = end_yyyy_mm_dd[2] 388 | end_month = end_yyyy_mm_dd[1] 389 | end_year = end_yyyy_mm_dd[0] 390 | 391 | if int(total_days) == 0: 392 | row_total_days = (des[0][0], row_total_days[0][0]) 393 | elif int(total_days) == 1: 394 | row_total_days = (des[0][0], 395 | str(row_total_days[0][0]) + ' ({} {} {})'.format(int(start_day), months[int(start_month)], 396 | start_year)) 397 | elif int(total_days) >= 2: 398 | if start_year == end_year: 399 | # same year 400 | if start_month == end_month: 401 | # no same day because it is gonna be total_days 1, which is done above 402 | row_total_days = (des[0][0], str(row_total_days[0][0]) + 403 | ' ({}-{} {} {})'.format(int(start_day), int(end_day), 404 | months[int(start_month)], start_year)) 405 | else: 406 | # for different months, same or different day will also be outputted 407 | row_total_days = (des[0][0], str(row_total_days[0][0]) + 408 | ' ({} {}-{} {} {})'.format(int(start_day), months[int(start_month)], 409 | int(end_day), months[int(end_month)], start_year)) 410 | 411 | else: 412 | # for the more-than-one-year case, everything is displayed 413 | row_total_days = (des[0][0], str(row_total_days[0][0]) + 414 | ' ({} {} {}-{} {} {})'.format(int(start_day), months[int(start_month)], 415 | start_year, int(end_day), months[int(end_month)], 416 | end_year)) 417 | 418 | output_1_rows.append(row_total_days) 419 | print('Successfully calculated total days. Total days: {days} \nElapsed time: {time} seconds' 420 | .format(days=row_total_days[1], time=hp.format_two_point_time(timer, time.time()))) 421 | 422 | # average usage per day 423 | print('Calculating average daily usage') 424 | output_2_rows = [] 425 | row_avg_daily_usage = ('average_usage_per_day', round(float(total_records / total_days), 3)) 426 | output_2_rows.append(row_avg_daily_usage) 427 | print('Successfully calculated average daily usage. 
Daily average usages : {uses} ' 428 | '\nElapsed time: {time} seconds' 429 | .format(uses=row_avg_daily_usage[1], time=hp.format_two_point_time(timer, time.time()))) 430 | timer = time.time() 431 | # avg voice call per day 432 | 433 | disable = False 434 | for item in cdr_data_layer: 435 | if str.lower(item['name']) == 'network_type' and item['output_no'] == -1 \ 436 | or str.lower(item['name']) == 'call_type' and item['output_no'] == -1: 437 | disable = True 438 | 439 | if not disable: 440 | print('########## Calculating average daily voice call usage ##########') 441 | raw_sql = hp.sql_to_string('statistics/reports/summary/average_daily_voice.sql') 442 | q_avg_daily_voice = raw_sql.format( 443 | provider_prefix=provider_prefix, total_days=total_days) 444 | cursor.execute(q_avg_daily_voice) 445 | des = cursor.description 446 | row_avg_daily_voice = cursor.fetchall() 447 | row_avg_daily_voice = (des[0][0], round(row_avg_daily_voice[0][0], 3)) 448 | output_2_rows.append(row_avg_daily_voice) 449 | print('Successfully calculated average daily voice call usage. Daily average voice call usages : {uses} ' 450 | '\nElapsed time: {time} seconds' 451 | .format(uses=row_avg_daily_voice[1], time=hp.format_two_point_time(timer, time.time()))) 452 | timer = time.time() 453 | # avg sms per day 454 | print('Calculating average daily sms usage') 455 | raw_sql = hp.sql_to_string('statistics/reports/summary/average_daily_sms.sql') 456 | q_avg_daily_sms = raw_sql.format(provider_prefix=provider_prefix, total_days=total_days) 457 | cursor.execute(q_avg_daily_sms) 458 | des = cursor.description 459 | row_avg_daily_sms = cursor.fetchall() 460 | row_avg_daily_sms = (des[0][0], round(row_avg_daily_sms[0][0], 3)) 461 | output_2_rows.append(row_avg_daily_sms) 462 | print('########## Successfully calculated average daily sms usage. ' 463 | 'Daily average sms usages : {uses} ##########' 464 | '\n########## Elapsed time: {time} seconds ##########' 465 | .format(uses=row_avg_daily_sms[1], time=hp.format_two_point_time(timer, time.time()))) 466 | timer = time.time() 467 | else: 468 | print('call_type or network_type mapping is not complete. Ignored daily usage of sms and voice call') 469 | 470 | # avg unique cell id 471 | disable = False 472 | for item in cdr_data_layer: 473 | if str.lower(item['name']) == 'cell_id' and item['output_no'] == -1: 474 | disable = True 475 | 476 | if not disable: 477 | print('Calculating average daily unique cell id') 478 | raw_sql = hp.sql_to_string('statistics/reports/summary/average_unique_cell_ids.sql') 479 | query = raw_sql.format(provider_prefix=provider_prefix, total_days=total_days) 480 | cursor.execute(query) 481 | 482 | des = cursor.description 483 | row_avg_daily_unique_cell_id = cursor.fetchall() 484 | row_avg_daily_unique_cell_id = (des[0][0], round(row_avg_daily_unique_cell_id[0][0], 3)) 485 | output_2_rows.append(row_avg_daily_unique_cell_id) 486 | print('Successfully calculated average daily unique cell id') 487 | print('Successfully calculated average daily unique cell id.' 
488 | '\nElapsed time: {time} seconds' 489 | .format(time=hp.format_two_point_time(timer, time.time()))) 490 | timer = time.time() 491 | have_district = False 492 | for col in cdr_cell_tower: 493 | if str.lower(col['name']) == 'admin1': 494 | have_district = True 495 | if have_district: 496 | print('Calculating average daily administration level 1') 497 | raw_sql = hp.sql_to_string('statistics/reports/summary/average_daily_admin1.sql') 498 | query = raw_sql.format(provider_prefix=provider_prefix, level='ADMIN1', total_days=total_days) 499 | cursor.execute(query) 500 | 501 | des = cursor.description 502 | row_avg_daily_district = cursor.fetchall() 503 | print(row_avg_daily_district) 504 | row_avg_daily_district = (des[0][0], round(row_avg_daily_district[0][0], 3)) 505 | output_2_rows.append(row_avg_daily_district) 506 | print('Successfully calculated average daily administration level 1. Daily average value : {dists} ' 507 | '\nElapsed time: {time} seconds' 508 | .format(dists=row_avg_daily_district[1], time=hp.format_two_point_time(timer, time.time()))) 509 | timer = time.time() 510 | 511 | else: 512 | print('Skipped due to incomplete cell_id data') 513 | 514 | print('Recording to summary_stats') 515 | with open("{}/summary_stats.csv".format(output_report_location), "w", newline='') as outfile: 516 | writer = csv.writer(outfile, quoting=csv.QUOTE_NONNUMERIC) 517 | writer.writerow(tb_1_description) 518 | for row in output_1_rows: 519 | writer.writerow(row) 520 | 521 | writer.writerow('\n') 522 | 523 | writer.writerow(tb_2_description) 524 | for row in output_2_rows: 525 | writer.writerow(row) 526 | 527 | print('Successfully wrote to summary_stats.csv\nElapsed time: {time} seconds' 528 | .format(time=hp.format_two_point_time(timer, time.time()))) 529 | 530 | print('########## FINISHED CALCULATING SUMMARY ##########') 531 | 532 | def daily_cdrs(self): 533 | timer = time.time() 534 | output_graph_location = self.config.output_graph_location 535 | provider_prefix = self.config.provider_prefix 536 | cursor = self.hc.cursor 537 | print('########## Daily cdrs ##########') 538 | print('Selecting total records') 539 | raw_sql = hp.sql_to_string('statistics/total_records.sql') 540 | query = raw_sql.format(provider_prefix=provider_prefix) 541 | cursor.execute(query) 542 | 543 | des = cursor.description 544 | row_total_records = cursor.fetchall() 545 | row_total_records = (des[0][0], row_total_records[0][0]) 546 | total_records = row_total_records[1] 547 | print('Successfully calculated total records. 
Total records: {recs} records \nElapsed time: {time} seconds' 548 | .format(recs=total_records, time=hp.format_two_point_time(timer, time.time()))) 549 | timer = time.time() 550 | raw_sql = hp.sql_to_string('statistics/graphs/daily_cdrs/total_daily_cdrs.sql') 551 | q_total_daily_cdr = raw_sql.format(provider_prefix=provider_prefix) 552 | cursor.execute(q_total_daily_cdr) 553 | row_total_daily_cdr = cursor.fetchall() 554 | print('Query done' 555 | '\nElapsed time: {time} seconds' 556 | .format(time=hp.format_two_point_time(timer, time.time()))) 557 | timer = time.time() 558 | total_daily_cdr_x = [] 559 | total_daily_cdr_y = [] 560 | for row in row_total_daily_cdr: 561 | total_daily_cdr_x.append(row[0]) 562 | total_daily_cdr_y.append(row[1]) 563 | 564 | print('Querying min, max and avg of total records') 565 | q_total_daily_cdr_all = "select min(total_records), max(total_records), avg(total_records) from ({}) td"\ 566 | .format(q_total_daily_cdr) 567 | cursor.execute(q_total_daily_cdr_all) 568 | 569 | row_total_daily_cdr_all = cursor.fetchall() 570 | 571 | daily_cdr_min, daily_cdr_max, daily_cdr_avg = row_total_daily_cdr_all[0][0],\ 572 | row_total_daily_cdr_all[0][1], row_total_daily_cdr_all[0][2] 573 | print('Done.\nElapsed time: {time} seconds'.format(time=hp.format_two_point_time(timer, time.time()))) 574 | print('Writing into the graph for daily cdrs') 575 | hp.make_graph(total_daily_cdr_x, 'Day', total_daily_cdr_y, 'Total Records', 'Daily CDRs', 576 | '{}/daily_cdrs'.format(output_graph_location), 577 | des_pair_1={'text_x': 0.090, 'text_y': 1.27, 'text': 'MIN', 'value': f"{daily_cdr_min:,.2f}"}, 578 | des_pair_2={'text_x': 0.345, 'text_y': 1.27, 'text': 'MAX', 'value': f"{daily_cdr_max:,.2f}"}, 579 | des_pair_3={'text_x': 0.595, 'text_y': 1.27, 'text': 'AVG', 'value': f"{daily_cdr_avg:,.2f}"}, 580 | des_pair_4={'text_x': 0.83, 'text_y': 1.27, 'text': 'Total Records', 581 | 'value': f"{total_records:,.2f}"}) 582 | print( 583 | '########## Writing completed. File located in {}/daily_cdrs ##########'.format(output_graph_location)) 584 | 585 | def daily_unique_users(self): 586 | output_graph_location = self.config.output_graph_location 587 | provider_prefix = self.config.provider_prefix 588 | cursor = self.hc.cursor 589 | print('########## Daily unique users ###########') 590 | print('Calculating total unique uids') 591 | raw_sql = hp.sql_to_string('statistics/total_unique_uids.sql') 592 | q_total_uids = raw_sql.format(provider_prefix=provider_prefix) 593 | timer = time.time() 594 | cursor.execute(q_total_uids) 595 | des = cursor.description 596 | row_total_uids = cursor.fetchall() 597 | row_total_uids = (des[0][0], row_total_uids[0][0]) 598 | total_uids = row_total_uids[1] 599 | print('Successfully calculated total unique uids. Total unique ids: {ids} ids \nElapsed time: {time} seconds' 600 | .format(ids=total_uids, time=hp.format_two_point_time(timer, time.time()))) 601 | print('Quering date and unique users') 602 | timer = time.time() 603 | raw_sql = hp.sql_to_string('statistics/graphs/daily_unique_users/total_daily_uids.sql') 604 | q_total_daily_uid = raw_sql.format(provider_prefix=provider_prefix) 605 | cursor.execute(q_total_daily_uid) 606 | row_total_daily_uid = cursor.fetchall() 607 | print('Query completed. 
Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 608 | timer = time.time() 609 | total_daily_uid_x = [] 610 | total_daily_uid_y = [] 611 | for row in row_total_daily_uid: 612 | total_daily_uid_x.append(row[0]) 613 | total_daily_uid_y.append(row[1]) 614 | print('Selecing min, max and avg of total users') 615 | q_total_daily_uid_all = "select min(total_users), max(total_users), avg(total_users) from ({}) td".format( 616 | q_total_daily_uid) 617 | cursor.execute(q_total_daily_uid_all) 618 | 619 | row_total_daily_uid_all = cursor.fetchall() 620 | daily_uid_min, daily_uid_max, daily_uid_avg = row_total_daily_uid_all[0][0], \ 621 | row_total_daily_uid_all[0][1], row_total_daily_uid_all[0][2] 622 | print('Query completed. Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 623 | 624 | print('Writing into the graph for daily unique users') 625 | hp.make_graph(total_daily_uid_x, 'Date', total_daily_uid_y, 'Total Users', 'Daily Unique Users', 626 | '{}/daily_unique_users'.format(output_graph_location), 627 | des_pair_1={'text_x': 0.090, 'text_y': 1.27, 'text': 'MIN', 'value': f"{daily_uid_min:,.2f}"}, 628 | des_pair_2={'text_x': 0.345, 'text_y': 1.27, 'text': 'MAX', 'value': f"{daily_uid_max:,.2f}"}, 629 | des_pair_3={'text_x': 0.595, 'text_y': 1.27, 'text': 'AVG', 'value': f"{daily_uid_avg:,.2f}"}, 630 | des_pair_4={'text_x': 0.805, 'text_y': 1.27, 'text': 'Total Unique IDs', 631 | 'value': f"{total_uids:,.2f}"}) 632 | print('########## Writing completed. File located in {}/daily_unique_users ##########' 633 | .format(output_graph_location)) 634 | 635 | def daily_unique_locations(self): 636 | timer = time.time() 637 | output_graph_location = self.config.output_graph_location 638 | provider_prefix = self.config.provider_prefix 639 | cursor = self.hc.cursor 640 | print('########## Daily unique locations ##########') 641 | print('Calculating daily average location name') 642 | raw_sql = hp.sql_to_string('statistics/graphs/daily_unique_locations/total_unique_locations.sql') 643 | query = raw_sql.format(provider_prefix=provider_prefix) 644 | 645 | cursor.execute(query) 646 | des = cursor.description 647 | row_total_locations = cursor.fetchall() 648 | row_total_locations = (des[0][0], row_total_locations[0][0]) 649 | total_unique_locations = row_total_locations[1] 650 | print('Successfully calculated daily average location name. Daily average location names : {locs} ' 651 | '\nElapsed time: {time} seconds' 652 | .format(locs=row_total_locations[1], time=hp.format_two_point_time(timer, time.time()))) 653 | timer = time.time() 654 | 655 | print('Querying daily unique locations') 656 | raw_sql = hp.sql_to_string('statistics/graphs/daily_unique_locations/daily_unique_locations.sql') 657 | q_total_daily_locations = raw_sql.format(provider_prefix=provider_prefix) 658 | cursor.execute(q_total_daily_locations) 659 | row_total_daily_locations = cursor.fetchall() 660 | print('Query completed. 
Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 661 | timer = time.time() 662 | 663 | total_daily_location_x = [] 664 | total_daily_location_y = [] 665 | for row in row_total_daily_locations: 666 | total_daily_location_x.append(row[0]) 667 | total_daily_location_y.append(row[1]) 668 | print('Selecing min, max and avg of unique locations') 669 | q_total_daily_location_all = "select min(unique_locations), max(unique_locations), avg(unique_locations) " \ 670 | "from ({}) td".format(q_total_daily_locations) 671 | cursor.execute(q_total_daily_location_all) 672 | 673 | row_total_daily_location_all = cursor.fetchall() 674 | daily_location_min, daily_location_max, daily_location_avg = row_total_daily_location_all[0][0],\ 675 | row_total_daily_location_all[0][1], row_total_daily_location_all[0][2] 676 | print('Query completed. Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 677 | 678 | print('Writing into the graph for daily unique locations') 679 | hp.make_graph(total_daily_location_x, 'Date', total_daily_location_y, 'Total Locations', 680 | 'Daily Unique Locations', '{}/daily_unique_locations'.format(output_graph_location), 681 | des_pair_1={'text_x': 0.090, 'text_y': 1.27, 'text': 'MIN', 682 | 'value': f"{daily_location_min:,.2f}"}, 683 | des_pair_2={'text_x': 0.345, 'text_y': 1.27, 'text': 'MAX', 684 | 'value': f"{daily_location_max:,.2f}"}, 685 | des_pair_3={'text_x': 0.595, 'text_y': 1.27, 'text': 'AVG', 686 | 'value': f"{daily_location_avg:,.2f}"}, 687 | des_pair_4={'text_x': 0.805, 'text_y': 1.27, 'text': 'Total Unique Locations', 688 | 'value': f"{total_unique_locations:,.2f}"}) 689 | print('########## Writing completed. File located in {}/daily_unique_locations ###########' 690 | .format(output_graph_location)) 691 | 692 | def daily_average_cdrs(self): 693 | output_graph_location = self.config.output_graph_location 694 | provider_prefix = self.config.provider_prefix 695 | cursor = self.hc.cursor 696 | timer = time.time() 697 | print('########## Daily Average CDRs ##########') 698 | print('Querying for average cdr and total unique users') 699 | raw_sql = hp.sql_to_string('statistics/graphs/daily_average_cdrs/daily_average_cdrs.sql') 700 | q_total_daily_avg_cdr = raw_sql.format(provider_prefix=provider_prefix) 701 | cursor.execute(q_total_daily_avg_cdr) 702 | row_total_daily_avg_cdr = cursor.fetchall() 703 | print('Query completed. Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 704 | timer = time.time() 705 | 706 | total_daily_avg_cdr_x = [] 707 | total_daily_avg_cdr_y = [] 708 | for row in row_total_daily_avg_cdr: 709 | total_daily_avg_cdr_x.append(row[0]) 710 | total_daily_avg_cdr_y.append(row[1]) 711 | 712 | print('Querying for average daily cdrs') 713 | q_total_daily_location_all = "select avg(daily_average_cdr) from ({}) td".format( 714 | q_total_daily_avg_cdr) 715 | cursor.execute(q_total_daily_location_all) 716 | 717 | row_total_daily_avg_cdr_all = cursor.fetchall() 718 | print('Query completed. 
Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 719 | daily_avg_cdr = row_total_daily_avg_cdr_all[0][0] 720 | print('########## Writing into the graph for daily average CDRs ##########') 721 | hp.make_graph(total_daily_avg_cdr_x, 'Date', total_daily_avg_cdr_y, 'Total Daily Average CDRs', 722 | 'Daily Average CDRs', '{}/daily_avg_cdr'.format(output_graph_location), 723 | des_pair_1={'text_x': 0.035, 'text_y': 1.27, 'text': 'Total Daily Avg CDRs', 724 | 'value': f"{daily_avg_cdr:,.2f}"}) 725 | 726 | def daily_unique_average_locations(self): 727 | print('########## Daily unique average locations ##########') 728 | output_graph_location = self.config.output_graph_location 729 | provider_prefix = self.config.provider_prefix 730 | cursor = self.hc.cursor 731 | disable = False 732 | for item in self.config.cdr_data_layer: 733 | if str.lower(item['name']) == 'cell_id' and item['output_no'] == -1 \ 734 | or str.lower(item['name']) == 'call_time' and item['output_no'] == -1: 735 | disable = True 736 | if not disable: 737 | print('Querying daily average cell ids and daily average locations') 738 | timer = time.time() 739 | raw_sql = hp.sql_to_string('statistics/graphs/daily_average_unique_locations/' 740 | 'daily_average_unique_locations.sql') 741 | q_total_daily_avg_locations = raw_sql.format(provider_prefix=provider_prefix) 742 | cursor.execute(q_total_daily_avg_locations) 743 | row_total_daily_avg_locations = cursor.fetchall() 744 | print('Query completed. Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 745 | timer = time.time() 746 | total_daily_avg_location_x = [] 747 | total_daily_avg_location_y = [] 748 | for row in row_total_daily_avg_locations: 749 | total_daily_avg_location_x.append(row[0]) 750 | total_daily_avg_location_y.append(row[1]) 751 | print('Querying for average daily avg cell_id and locations') 752 | q_total_daily_avg_location_all = "select avg(td.daily_avg_cell_ids), avg(td.daily_avg_locations) " \ 753 | "from ({}) td".format(q_total_daily_avg_locations) 754 | cursor.execute(q_total_daily_avg_location_all) 755 | row_total_daily_location_all = cursor.fetchall() 756 | print('Query completed. Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 757 | daily_avg_location_cell_ids, daily_avg_location = row_total_daily_location_all[0][0],\ 758 | row_total_daily_location_all[0][1] 759 | 760 | print('Writing into the graph for daily unique average locations') 761 | hp.make_graph(total_daily_avg_location_x, 'Date', total_daily_avg_location_y, 'Total Unique Locations', 762 | 'Daily Unique Average Locations', 763 | '{}/daily_unique_avg_locations'.format(output_graph_location), 764 | des_pair_1={'text_x': 0.00, 'text_y': 1.27, 'text': 'Avg Daily Unique Cell IDs ', 765 | 'value': f"{daily_avg_location_cell_ids:,.2f}"}, 766 | des_pair_2={'text_x': 0.28, 'text_y': 1.27, 'text': 'Avg Daily Unique Locations', 767 | 'value': f"{daily_avg_location:,.2f}"}) 768 | print('########## Writing completed. File located in {}/daily_unique_avg_locations ##########' 769 | .format(output_graph_location)) 770 | else: 771 | print('call_time or cell_id is in incorrect form. 
Ignored output.') 772 | 773 | def frequent_locations(self): 774 | frequent_locations_percentage = self.config.frequent_locations_percentage 775 | provider_prefix = self.config.provider_prefix 776 | cursor = self.hc.cursor 777 | print('########## CREATE FREQUENT LOCATION TABLE ##########') 778 | print('Checking and dropping frequent location table if existing.') 779 | timer = time.time() 780 | admin = self.config.od_admin_unit 781 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_frequent_locations' 782 | .format(provider_prefix=provider_prefix)) 783 | print('Checked and dropped frequent location table if existing. Elapsed time: {} seconds'.format( 784 | hp.format_two_point_time(timer, time.time()))) 785 | timer = time.time() 786 | print('Creating frequent location table') 787 | raw_sql = hp.sql_to_string('statistics/reports/frequent_locations/create_frequent_locations.sql') 788 | query = raw_sql.format(provider_prefix=provider_prefix, admin_params=admin + '_id string') 789 | 790 | cursor.execute(query) 791 | print('Created frequent location table. Elapsed time: {} seconds' 792 | .format(hp.format_two_point_time(timer, time.time()))) 793 | timer = time.time() 794 | print('Inserting into frequent location table') 795 | raw_sql = hp.sql_to_string('statistics/reports/frequent_locations/frequent_locations.sql') 796 | query = raw_sql.format(provider_prefix=provider_prefix, admin_params=admin + '_id', admin=admin) 797 | 798 | cursor.execute(query) 799 | print('Inserted into frequent location table.\nResult are in the table named {provider_prefix}_' 800 | 'frequent_locations\nElapsed time: {time} seconds. ' 801 | .format(provider_prefix=provider_prefix, time=hp.format_two_point_time(timer, time.time()))) 802 | timer = time.time() 803 | print('Dropping freq location with accumulated percentage') 804 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_freq_with_acc_wsum' 805 | .format(provider_prefix=provider_prefix)) 806 | print('Checked and dropped frequent location table with accumulated percentage if existing. ' 807 | 'Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 808 | timer = time.time() 809 | print('Creating and insert freq with acc wsum Table (Frequent Locations) with accumulated percentage') 810 | raw_sql = hp.sql_to_string('statistics/reports/frequent_locations/frequent_locations_wsum.sql') 811 | query = raw_sql.format(provider_prefix=provider_prefix, admin=admin) 812 | cursor.execute(query) 813 | print('Inserted into frequent locations table with accumulated percentage. ' 814 | '\nElapsed time: {time} seconds. '.format(time=hp.format_two_point_time(timer, time.time()))) 815 | timer = time.time() 816 | print('Dropping frequent locations thresholded table') 817 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_frequent_locations_thresholded' 818 | .format(provider_prefix=provider_prefix)) 819 | print('Checked and dropped frequent locations table with accumulated percentage if existing.' 820 | 'Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 821 | timer = time.time() 822 | print('Creating and insert frequent locations thresholded table ') 823 | raw_sql = hp.sql_to_string('statistics/reports/frequent_locations/frequent_locations_thresholded.sql') 824 | query = raw_sql.format(provider_prefix=provider_prefix, admin=admin, threshold=frequent_locations_percentage) 825 | cursor.execute(query) 826 | print('Inserted into frequent locations thresholded table. \nElapsed time: {time} seconds. 
' 827 | .format(time=hp.format_two_point_time(timer, time.time()))) 828 | print('########## FINISHED CREATING FREQUENT LOCATIONS TABLE ##########') 829 | 830 | def frequent_locations_night(self): 831 | frequent_locations_percentage = self.config.frequent_locations_percentage 832 | provider_prefix = self.config.provider_prefix 833 | cursor = self.hc.cursor 834 | print('########## CREATE FREQUENT LOCATIONS NIGHT TABLE ##########') 835 | print('Checking and dropping frequent locations night table if existing.') 836 | timer = time.time() 837 | admin = self.config.od_admin_unit 838 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_frequent_locations_night' 839 | .format(provider_prefix=provider_prefix)) 840 | print('Checked and dropped frequent locations night table if existing. Elapsed time: {} seconds' 841 | .format(hp.format_two_point_time(timer, time.time()))) 842 | timer = time.time() 843 | 844 | print('Creating frequent locations night table') 845 | raw_sql = hp.sql_to_string('statistics/reports/frequent_locations/create_frequent_locations_night.sql') 846 | query = raw_sql.format(provider_prefix=provider_prefix, admin_params=admin + '_id string') 847 | cursor.execute(query) 848 | 849 | print('Created frequent locations night table. Elapsed time: {} seconds' 850 | .format(hp.format_two_point_time(timer, time.time()))) 851 | timer = time.time() 852 | print('Inserting into frequent locations night table') 853 | raw_sql = hp.sql_to_string('statistics/reports/frequent_locations/frequent_locations_night.sql') 854 | query = raw_sql.format(provider_prefix=provider_prefix, admin_params=admin + '_id', admin=admin) 855 | cursor.execute(query) 856 | print('Inserted into frequent locations night table.\n' 857 | 'Result are in the table named {provider_prefix}_frequent_locations_night\nElapsed time: {time} seconds. ' 858 | .format(provider_prefix=provider_prefix, time=hp.format_two_point_time(timer, time.time()))) 859 | timer = time.time() 860 | print('Dropping freq location night with accumulated percentage') 861 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_freq_with_acc_wsum_night' 862 | .format(provider_prefix=provider_prefix)) 863 | print( 864 | 'Checked and dropped frequent locations night table with accumulated percentage if existing. ' 865 | 'Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 866 | timer = time.time() 867 | print('Creating and insert freq night with acc wsum Table ' 868 | '(Frequent Locations Night) with accumulated percentage') 869 | raw_sql = hp.sql_to_string('statistics/reports/frequent_locations/frequent_locations_wsum_night.sql') 870 | query = raw_sql.format(provider_prefix=provider_prefix, admin=admin) 871 | cursor.execute(query) 872 | print( 873 | 'Inserted into frequent locations night table with accumulated percentage. \nElapsed time: {time} seconds. ' 874 | .format(time=hp.format_two_point_time(timer, time.time()))) 875 | timer = time.time() 876 | print('Dropping frequent locations thresholded night table') 877 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_frequent_locations_thresholded_night' 878 | .format(provider_prefix=provider_prefix)) 879 | print('Checked and dropped frequent locations night table with accumulated percentage if existing. 
' 880 | 'Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 881 | timer = time.time() 882 | print('Creating and insert frequent locations thresholded night table ') 883 | raw_sql = hp.sql_to_string('statistics/reports/frequent_locations/frequent_locations_thresholded_night.sql') 884 | query = raw_sql.format(provider_prefix=provider_prefix, admin=admin, threshold=frequent_locations_percentage) 885 | cursor.execute(query) 886 | print('Inserted into frequent locations thresholded night table. \nElapsed time: {time} seconds. ' 887 | .format(time=hp.format_two_point_time(timer, time.time()))) 888 | print('########## FINISHED CREATING FREQUENT LOCATIONS NIGHT TABLE ##########') 889 | 890 | def rank1_frequent_locations(self): 891 | provider_prefix = self.config.provider_prefix 892 | cursor = self.hc.cursor 893 | print('########## CREATE RANK 1 FREQUENT LOCATIONS TABLE ##########') 894 | admin = self.config.od_admin_unit 895 | create_param = admin + '_id string' 896 | timer = time.time() 897 | print('Checking and dropping rank 1 frequent locations table if existing.') 898 | cursor.execute( 899 | 'DROP TABLE IF EXISTS {provider_prefix}_la_cdr_uid_home'.format(provider_prefix=provider_prefix)) 900 | print('Checked and dropped rank 1 frequent locations table if existing. Elapsed time: {} seconds'.format( 901 | hp.format_two_point_time(timer, time.time()))) 902 | timer = time.time() 903 | print('Creating rank 1 frequent locations table') 904 | raw_sql = hp.sql_to_string('origin_destination/create_la_cdr_uid_home.sql') 905 | query = raw_sql.format(provider_prefix=provider_prefix, admin_params=create_param) 906 | cursor.execute(query) 907 | print('Created rank 1 frequent locations table. Elapsed time: {} seconds'.format( 908 | hp.format_two_point_time(timer, time.time()))) 909 | timer = time.time() 910 | print('Inserting into rank 1 frequent locations table') 911 | raw_sql = hp.sql_to_string('origin_destination/insert_la_cdr_uid_home.sql') 912 | query = raw_sql.format(provider_prefix=provider_prefix) 913 | cursor.execute(query) 914 | print('Inserted into rank 1 frequent locations table (located in {provider_prefix}_la_cdr_uid_home). 
' 915 | 'Elapsed time: {time} seconds' 916 | .format(provider_prefix=provider_prefix, time=hp.format_two_point_time(timer, time.time()))) 917 | print('########## FINISHED CREATING RANK 1 FREQUENT LOCATIONS TABLE ##########') 918 | -------------------------------------------------------------------------------- /Common/config_object.py: -------------------------------------------------------------------------------- 1 | from Common.helper import json_file_to_object 2 | 3 | 4 | class Config: 5 | def __init__(self, config_file): 6 | self.__dict__ = json_file_to_object(config_file) 7 | -------------------------------------------------------------------------------- /Common/helper.py: -------------------------------------------------------------------------------- 1 | import json 2 | import matplotlib.pyplot as plt 3 | import csv 4 | import pandas 5 | from matplotlib.widgets import TextBox 6 | from codecs import open 7 | 8 | 9 | mandatory_columns = [["UID", "IMEI", "IMSI", "CALL_TIME", "DURATION", 10 | "CALL_TYPE", "NETWORK_TYPE", "CELL_ID", "LATITUDE", "LONGITUDE"], 11 | ['CELL_ID', 'LATITUDE', 'LONGITUDE', 'ADMIN0', 12 | 'ADMIN1', 'ADMIN2', 'ADMIN3', 'ADMIN4', 'ADMIN5']] 13 | 14 | 15 | def json_file_to_object(json_file, encoding=''): 16 | if encoding == '': 17 | with open(json_file) as jf: 18 | return json.load(jf) 19 | else: 20 | with open(json_file, encoding=encoding) as jf: 21 | return json.load(jf) 22 | 23 | 24 | def string_to_json(str_in): 25 | return json.loads(str_in) 26 | 27 | 28 | def sql_to_string(filename): 29 | path = "queries/" + filename 30 | sql = open(path, mode='r', encoding='utf-8-sig').read() 31 | return sql 32 | 33 | 34 | def get_admin_units_from_mapping(cell_tower_mapping): 35 | admin_units = [] 36 | admins = ['admin0', 'admin1', 'admin2', 'admin3', 'admin4', 'admin5'] 37 | admins.reverse() 38 | for row in cell_tower_mapping: 39 | for admin in admins: 40 | if row['output_no'] != -1 and str.lower(row['name']) == admin: 41 | admin_units.append(row['name']) 42 | 43 | print('Result admin units = {}'.format(', '.join(admin_units))) 44 | return admin_units 45 | 46 | 47 | def format_two_point_time(start, end): 48 | return round(end - start, 2) 49 | 50 | 51 | def get_time_from_csv(file_loc): 52 | with open(file_loc) as csv_file: 53 | csv_reader = csv.reader(csv_file, delimiter=',') 54 | line_count = 0 55 | for row in csv_reader: 56 | if line_count == 1: 57 | start_date = row[6] 58 | end_date = row[7] 59 | break 60 | line_count += 1 61 | 62 | start_date = pandas.Timestamp(start_date) 63 | start_m = start_date.month 64 | start_y = start_date.year 65 | end_date = pandas.Timestamp(end_date) 66 | end_m = end_date.month 67 | end_y = end_date.year 68 | 69 | print(start_date, start_m, start_y, end_date, end_m, end_y) 70 | print(start_date, end_date) 71 | 72 | result = dict() 73 | result['start_date'] = start_date 74 | result['start_m'] = start_m 75 | result['start_y'] = start_y 76 | result['end_date'] = end_date 77 | result['end_m'] = end_date.month 78 | result['end_y'] = end_date.year 79 | 80 | return result 81 | 82 | 83 | def make_graph(xs, x_label, ys, y_label, header, filename, des_pair_1=None, 84 | des_pair_2=None, des_pair_3=None, des_pair_4=None): 85 | figure = plt.figure(figsize=(14, 11)) 86 | 87 | font_dict = { 88 | 'fontsize': 21, 89 | 'fontweight': 'bold', 90 | } 91 | 92 | ax = figure.add_subplot(111) 93 | plt.title(header, fontdict=font_dict) 94 | plt.subplots_adjust(top=0.75) 95 | plt.grid(b=True) 96 | plt.plot(xs, ys) 97 | plt.ylabel(y_label) 98 | plt.xticks(rotation=90) 99 
| plt.xlabel(x_label) 100 | 101 | if des_pair_1 is not None: 102 | plt.text(des_pair_1['text_x'], des_pair_1['text_y'], des_pair_1['text'], transform=ax.transAxes) 103 | axbox = plt.axes([0.1, 0.87, 0.2, 0.04]) 104 | offset = 60 - 2*len(des_pair_1['value']) 105 | text1 = '' 106 | for i in range(0, offset): 107 | text1 += ' ' 108 | text_box = TextBox(axbox, '', initial=text1 + des_pair_1['value'], color='orange', label_pad=0.005) 109 | text_box.disconnect_events() 110 | if des_pair_2 is not None: 111 | offset = 60 - 2*len(des_pair_2['value']) 112 | text2 = '' 113 | for i in range(0, offset): 114 | text2 += ' ' 115 | plt.text(des_pair_2['text_x'], des_pair_2['text_y'], des_pair_2['text'], transform=ax.transAxes) 116 | # plt.text(0.33, 1.27, des_pair_2['text'], transform=ax.transAxes) 117 | axbox = plt.axes([0.3, 0.87, 0.2, 0.04]) 118 | text_box = TextBox(axbox, '', initial=text2 + des_pair_2['value'], color='blue') 119 | text_box.disconnect_events() 120 | if des_pair_3 is not None: 121 | offset = 60 - 2*len(des_pair_3['value']) 122 | text3 = '' 123 | for i in range(0, offset): 124 | text3 += ' ' 125 | # plt.text(0.58, 1.27, des_pair_3['text'], transform=ax.transAxes) 126 | plt.text(des_pair_3['text_x'], des_pair_3['text_y'], des_pair_3['text'], transform=ax.transAxes) 127 | axbox = plt.axes([0.5, 0.87, 0.2, 0.04]) 128 | text_box = TextBox(axbox, '', initial=text3 + des_pair_3['value'], color='green') 129 | text_box.disconnect_events() 130 | if des_pair_4 is not None: 131 | offset = 60 - 2*len(des_pair_4['value']) 132 | text4 = '' 133 | for i in range(0, offset): 134 | text4 += ' ' 135 | # plt.text(0.79, 1.27, des_pair_4['text'], transform=ax.transAxes) 136 | plt.text(des_pair_4['text_x'], des_pair_4['text_y'], des_pair_4['text'], transform=ax.transAxes) 137 | axbox = plt.axes([0.7, 0.87, 0.2, 0.04]) 138 | text_box = TextBox(axbox, '', initial=text4 + des_pair_4['value'], color='red') 139 | text_box.disconnect_events() 140 | 141 | plt.savefig(filename) 142 | 143 | 144 | def extract_mapping_data(config, data): 145 | mappings = [config.cdr_data_layer, config.cdr_cell_tower] 146 | # Extract arguments 147 | for i in range(0, len(mappings)): 148 | arguments_map = [] 149 | arguments_prep = [] 150 | arguments_raw = [] 151 | arguments_con = [] 152 | for argument in mappings[i]: 153 | if str.upper(argument['name']) in mandatory_columns[i]: 154 | arguments_prep.append(argument['name'] + ' ' + argument['data_type']) 155 | arguments_con.append(argument['name']) 156 | if str.lower(argument['name']) == 'uid' and i == 0: 157 | arguments_con.append(argument['input_name']) 158 | arguments_prep.append(argument['input_name'] + ' ' + argument['data_type']) 159 | if argument['output_no'] != -1: 160 | if argument['input_no'] != -1: 161 | arguments_raw.append(argument['input_name'] + ' ' + argument['data_type']) 162 | if 'custom' in argument and argument['custom'] != '': 163 | if str.lower(argument['name']) == 'call_time' and config.input_file_time_format != "": 164 | arguments_map.append("from_unixtime(unix_timestamp({custom} " 165 | ",'{time_format}'), 'yyyy-MM-dd hh:mm:ss') as call_time" 166 | .format(custom=argument['custom'], 167 | time_format=config.input_file_time_format)) 168 | else: 169 | arguments_map.append(argument['custom'] + ' as ' + argument['name']) 170 | if str.lower(argument['name']) == 'uid' and i == 0: 171 | arguments_map.append(argument['input_name'] + ' as ' + argument['input_name']) 172 | else: 173 | if str.lower(argument['name']) == 'call_time' and config.input_file_time_format != "": 174 | 
arguments_map.append("from_unixtime(unix_timestamp({custom} " 175 | ",'{time_format}'), 'yyyy-MM-dd hh:mm:ss') as call_time" 176 | .format(custom=argument['input_name'], 177 | time_format=config.input_file_time_format)) 178 | print(arguments_map) 179 | else: 180 | arguments_map.append(argument['input_name'] + ' as ' + argument['name']) 181 | if str.lower(argument['name']) == 'uid' and i == 0: 182 | arguments_map.append(argument['input_name'] + ' as ' + argument['input_name']) 183 | else: 184 | # input -1 output 1 for custom 185 | if 'custom' in argument and argument['custom'] != '': 186 | arguments_map.append(argument['custom'] + ' as ' + argument['name']) 187 | # else = cdr without custom or cell tower, this case insert -1 if it is a mandatory column 188 | elif str.upper(argument['name']) in mandatory_columns[i]: 189 | arguments_map.append('-1' + ' as ' + argument['name']) 190 | print('Output ' + argument['name'] + ' ignored ') 191 | 192 | elif argument['input_no'] != -1: 193 | arguments_raw.append(argument['input_name'] + ' ' + argument['data_type']) 194 | if str.upper(argument['name']) in mandatory_columns[i]: 195 | arguments_map.append('-1' + ' as ' + argument['name']) 196 | print('Output ' + argument['name'] + ' ignored ') 197 | elif str.upper(argument['name']) in mandatory_columns[i]: 198 | # input -1 output -1 insert -1 if it is a mandatory column 199 | arguments_map.append('-1' + ' as ' + argument['name']) 200 | print('Output ' + argument['name'] + ' ignored ') 201 | 202 | if i == 0: 203 | data.arg_cdr_map, data.arg_cdr_raw, data.arg_cdr_prep, data.arg_cdr_con = \ 204 | arguments_map, arguments_raw, arguments_prep, arguments_con 205 | else: 206 | data.arg_cell_map, data.arg_cell_raw, data.arg_cell_create = \ 207 | arguments_map, arguments_raw, arguments_prep 208 | print(data.arg_cell_map, data.arg_cell_create) 209 | 210 | 211 | if __name__ == '__main__': 212 | make_graph([1, 2, 3, 4], 'x', [1, 2, 3, 4], 'y', 'TEST', 'test') 213 | -------------------------------------------------------------------------------- /Common/hive_connection.py: -------------------------------------------------------------------------------- 1 | from impala.dbapi import connect 2 | 3 | 4 | class Singleton(type): 5 | _instances = {} 6 | 7 | def __call__(cls, *args, **kwargs): 8 | if cls not in cls._instances: 9 | cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) 10 | return cls._instances[cls] 11 | 12 | 13 | class HiveConnection(metaclass=Singleton): 14 | def __init__(self, host='', port='', user=''): 15 | self.conn = connect(host, port, user=user, auth_mechanism='PLAIN') 16 | self.cursor = self.conn.cursor() 17 | self.cursor.set_arraysize(1) 18 | -------------------------------------------------------------------------------- /Common/hive_create_tables.py: -------------------------------------------------------------------------------- 1 | from Common.hive_connection import HiveConnection 2 | from Common.helper import json_file_to_object, get_admin_units_from_mapping, format_two_point_time, sql_to_string 3 | import os 4 | import time 5 | 6 | 7 | class HiveTableCreator: 8 | def __init__(self, config, data=''): 9 | self.config = config 10 | self.data = data 11 | self.hc = HiveConnection() 12 | 13 | def initialize(self, init_cmd_file): 14 | print('########## Initilizing Hive ##########') 15 | timer = time.time() 16 | output_report_location = self.config.output_report_location 17 | output_graph_location = self.config.output_graph_location 18 | cursor = self.hc.cursor 19 | for command in 
json_file_to_object(init_cmd_file)['hive_commands']: 20 | if command.startswith('use'): 21 | command = command.format(db_name=self.config.db_name) 22 | elif '{poi_location}' in command: 23 | command = command.format(poi_location=self.config.interpolation_poi_file_location) 24 | elif '{osm_location}' in command: 25 | command = command.format(osm_location=self.config.interpolation_osm_file_location) 26 | elif '{voronoi_location}' in command: 27 | command = command.format(voronoi_location=self.config.interpolation_voronoi_file_location) 28 | cursor.execute(command) 29 | if not os.path.exists(output_report_location): 30 | os.makedirs(output_report_location) 31 | if not os.path.exists(output_graph_location): 32 | os.makedirs(output_graph_location) 33 | print('########## Done. Time elapsed: {} seconds ##########'.format(format_two_point_time(timer, time.time()))) 34 | 35 | def create_tables(self): 36 | print('########## Creating Tables ##########') 37 | timer = time.time() 38 | self.import_cell_tower_data_raw() 39 | self.preprocess_cell_tower_data() 40 | admins = get_admin_units_from_mapping(self.config.cdr_cell_tower) 41 | for admin in admins: 42 | self.cell_tower_data_admin(admin) 43 | self.import_raw() 44 | self.preprocess_data() 45 | self.consolidate_table() 46 | print('########## Done create all tables. Time elapsed: {} seconds ##########'.format( 47 | format_two_point_time(timer, time.time()))) 48 | 49 | def import_cell_tower_data_raw(self): 50 | provider_prefix = self.config.provider_prefix 51 | arg_cell_raw = self.data.arg_cell_raw 52 | input_cell_tower_delimiter = self.config.input_cell_tower_delimiter 53 | input_cell_tower_have_header = self.config.input_cell_tower_have_header 54 | input_cell_tower_files = self.config.input_cell_tower_files 55 | hadoop_data_path = self.config.hadoop_data_path 56 | cursor = self.hc.cursor 57 | print('########## IMPORT RAW MAPPING TABLE ##########') 58 | print('Checking and dropping raw mapping table if existing.') 59 | timer = time.time() 60 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_cell_tower_data_raw' 61 | .format(provider_prefix=provider_prefix)) 62 | print('Checked and dropped raw mapping table if existing. Elapsed time: {} seconds'.format(format_two_point_time(timer, time.time()))) 63 | timer = time.time() 64 | 65 | print('Creating raw mapping table') 66 | raw_query = sql_to_string('cdr_and_mapping/create_raw_mapping.sql') 67 | query = raw_query.format(provider_prefix=provider_prefix, 68 | arg_raw=', '.join(arg_cell_raw), 69 | field_delimiter=input_cell_tower_delimiter, 70 | have_header=input_cell_tower_have_header) 71 | cursor.execute(query) 72 | print('Created raw mapping table. 
Elapsed time: {} seconds'.format(format_two_point_time(timer, time.time()))) 73 | timer = time.time() 74 | 75 | if len(input_cell_tower_files) < 1: 76 | print('Please check the input_cell_tower_files field in config.json and make sure the file is valid.') 77 | return 78 | elif len(input_cell_tower_files) == 1: 79 | cursor.execute( 80 | "load data local inpath '{hadoop_data_path}{hadoop_data_file}' " 81 | .format(hadoop_data_path=hadoop_data_path, hadoop_data_file=input_cell_tower_files[0]) + 82 | "overwrite into table {provider_prefix}_cell_tower_data_raw".format( 83 | provider_prefix=provider_prefix) 84 | ) 85 | else: 86 | cursor.execute( 87 | "load data local inpath '{hadoop_data_path}{hadoop_data_file}' " 88 | .format(hadoop_data_path=hadoop_data_path, hadoop_data_file=input_cell_tower_files[0]) + 89 | "overwrite into table {provider_prefix}_cell_tower_data_raw".format( 90 | provider_prefix=provider_prefix) 91 | ) 92 | for i in range(1, len(input_cell_tower_files)): 93 | cursor.execute( 94 | "load data local inpath '{hadoop_data_path}{hadoop_data_file}' " 95 | .format(hadoop_data_path=hadoop_data_path, hadoop_data_file=input_cell_tower_files[i]) + 96 | "into table {provider_prefix}_cell_tower_data_raw".format(provider_prefix=provider_prefix) 97 | ) 98 | print('Imported to raw mapping table. Elapsed time: {} seconds'.format(format_two_point_time(timer, time.time()))) 99 | print('########## FINISHED IMPORTING TO RAW MAPPING TABLE ##########') 100 | 101 | def import_raw(self): 102 | provider_prefix = self.config.provider_prefix 103 | hadoop_data_path = self.config.hadoop_data_path 104 | input_cell_tower_have_header = self.config.input_cell_tower_have_header 105 | arg_cdr_raw = self.data.arg_cdr_raw 106 | input_files = self.config.input_files 107 | input_delimiter = self.config.input_delimiter 108 | cursor = self.hc.cursor 109 | print('########## IMPORT RAW TABLE ##########') 110 | print('Checking and dropping raw table if existing.') 111 | timer = time.time() 112 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_raw' 113 | .format(provider_prefix=provider_prefix)) 114 | print('Checked and dropped raw table if existing. Elapsed time: {} seconds'.format(format_two_point_time(timer, time.time()))) 115 | timer = time.time() 116 | 117 | print('Creating raw table') 118 | raw_sql = sql_to_string('cdr_and_mapping/create_raw_cdr.sql') 119 | query = raw_sql.format(cell_tower_header=input_cell_tower_have_header, 120 | provider_prefix=provider_prefix, 121 | arg_raw=', '.join(arg_cdr_raw), 122 | field_delimiter=input_delimiter) 123 | cursor.execute(query) 124 | print('Created raw table. Elapsed time: {} seconds'.format(format_two_point_time(timer, time.time()))) 125 | timer = time.time() 126 | print('Importing to raw table') 127 | if len(input_files) < 1: 128 | print('Please check the input_files field in config.json and make sure the file is valid.') 
129 | return 130 | elif len(input_files) == 1: 131 | cursor.execute( 132 | "load data local inpath '{hadoop_data_path}{hadoop_data_file}' " 133 | .format(hadoop_data_path=hadoop_data_path, hadoop_data_file=input_files[0]) + 134 | "overwrite into table {provider_prefix}_raw".format(provider_prefix=provider_prefix) 135 | ) 136 | else: 137 | cursor.execute( 138 | "load data local inpath '{hadoop_data_path}{hadoop_data_file}' " 139 | .format(hadoop_data_path=hadoop_data_path, hadoop_data_file=input_files[0]) + 140 | "overwrite into table {provider_prefix}_raw".format(provider_prefix=provider_prefix) 141 | ) 142 | for i in range(1, len(input_files)): 143 | cursor.execute( 144 | "load data local inpath '{hadoop_data_path}{hadoop_data_file}' " 145 | .format(hadoop_data_path=hadoop_data_path, hadoop_data_file=input_files[i]) + 146 | "into table {provider_prefix}_raw".format(provider_prefix=provider_prefix) 147 | ) 148 | print('Imported to raw table. Elapsed time: {} seconds'.format(format_two_point_time(timer, time.time()))) 149 | print('########## IMPORT RAW TABLE COMPLETED ##########') 150 | 151 | def cell_tower_data_admin(self, admin): 152 | provider_prefix = self.config.provider_prefix 153 | check_invalid_lat_lng = self.config.check_invalid_lat_lng 154 | cursor = self.hc.cursor 155 | 156 | print('########## CREATE MAPPING ADMIN TABLE ##########') 157 | if check_invalid_lat_lng: 158 | check_lat_lng = 'and (latitude != 0 or longitude != 0) and latitude is not NULL and longitude is not NULL' 159 | else: 160 | check_lat_lng = '' 161 | print('Checking and dropping mapping {admin} table if existing.'.format(admin=admin)) 162 | timer = time.time() 163 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_cell_tower_data_{admin}'.format( 164 | provider_prefix=provider_prefix, admin=admin)) 165 | print('Check and drop mapping {admin} table if existing. Elapsed time: {time} seconds' 166 | .format(admin=admin, time=format_two_point_time(timer, time.time()))) 167 | timer = time.time() 168 | 169 | print('Creating mapping {admin} table'.format(admin=admin)) 170 | raw_sql = sql_to_string('cdr_and_mapping/create_mapping_admin.sql') 171 | query = raw_sql.format(provider_prefix=provider_prefix, admin=admin) 172 | cursor.execute(query) 173 | print('Created mapping {admin} table. Elapsed time: {time} seconds' 174 | .format(admin=admin, time=format_two_point_time(timer, time.time()))) 175 | timer = time.time() 176 | 177 | print('Inserting into mapping {} table'.format(admin)) 178 | raw_sql = sql_to_string('cdr_and_mapping/insert_mapping_admin.sql') 179 | query = raw_sql.format(provider_prefix=provider_prefix, admin=admin, check_lat_lng=check_lat_lng) 180 | cursor.execute(query) 181 | print('Inserted into mapping {admin} table. 
Elapsed time: {time} seconds' 182 | .format(admin=admin, time=format_two_point_time(timer, time.time()))) 183 | print('########## FINISHED CREATING MAPPING ADMIN TABLE ##########') 184 | 185 | def preprocess_cell_tower_data(self): 186 | provider_prefix = self.config.provider_prefix 187 | check_duplicate = self.config.check_duplicate 188 | arg_cell_create = self.data.arg_cell_create 189 | arg_cell_map = self.data.arg_cell_map 190 | cursor = self.hc.cursor 191 | print('########## CREATE PREPROCESS MAPPING TABLE ##########') 192 | if check_duplicate: 193 | distinct = 'distinct' 194 | else: 195 | distinct = '' 196 | print('Checking and dropping preprocess mapping table if existing.') 197 | timer = time.time() 198 | 199 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_cell_tower_data_preprocess'.format( 200 | provider_prefix=provider_prefix)) 201 | print('Checked and dropped preprocess mapping table if existing. Elapsed time: {} seconds'.format( 202 | format_two_point_time(timer, time.time()))) 203 | timer = time.time() 204 | 205 | print('Creating preprocess mapping table') 206 | raw_sql = sql_to_string('cdr_and_mapping/create_preprocess_mapping.sql') 207 | query = raw_sql.format(provider_prefix=provider_prefix, 208 | arg_create=', '.join(arg_cell_create)) 209 | cursor.execute(query) 210 | print('Created mapping preprocess table. Elapsed time: {} seconds' 211 | .format(format_two_point_time(timer, time.time()))) 212 | timer = time.time() 213 | # need username to get privilege 214 | 215 | print('Inserting into preprocess mapping table') 216 | raw_sql = sql_to_string('cdr_and_mapping/insert_preprocess_mapping.sql') 217 | query = raw_sql.format(provider_prefix=provider_prefix, distinct=distinct, arg=', '.join(arg_cell_map)) 218 | cursor.execute(query) 219 | print('Inserted into preprocess mapping table. Elapsed time: {} seconds' 220 | .format(format_two_point_time(timer, time.time()))) 221 | print('########## FINISHED CREATING PREPROCESS MAPPING TABLE ##########') 222 | 223 | def preprocess_data(self): 224 | provider_prefix = self.config.provider_prefix 225 | check_duplicate = self.config.check_duplicate 226 | arg_cdr_prep = self.data.arg_cdr_prep 227 | arg_cdr_map = self.data.arg_cdr_map 228 | cursor = self.hc.cursor 229 | 230 | print('########## CREATE PREPROCESS CDR TABLE ##########') 231 | if check_duplicate: 232 | distinct = 'distinct' 233 | else: 234 | distinct = '' 235 | 236 | print('Checking and dropping preprocess cdr table if existing.') 237 | timer = time.time() 238 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_preprocess'.format(provider_prefix=provider_prefix)) 239 | print('Checked and dropped preprocess cdr table if existing. Elapsed time: {} seconds'.format( 240 | format_two_point_time(timer, time.time()))) 241 | timer = time.time() 242 | 243 | print('Creating preprocess cdr table.') 244 | raw_sql = sql_to_string('cdr_and_mapping/create_preprocess_cdr.sql') 245 | query = raw_sql.format(args=', '.join(arg_cdr_prep), provider_prefix=provider_prefix) 246 | cursor.execute(query) 247 | 248 | print('Created preprocess cdr table. 
Elapsed time: {} seconds' 249 | .format(format_two_point_time(timer, time.time()))) 250 | timer = time.time() 251 | 252 | print('Inserting into preprocess table') 253 | print('Columns in preprocess table mapped: ' + ', '.join(arg_cdr_map)) 254 | raw_sql = sql_to_string('cdr_and_mapping/insert_preprocess_cdr.sql') 255 | query = raw_sql.format(distinct=distinct, arg=', '.join(arg_cdr_map), provider_prefix=provider_prefix) 256 | cursor.execute(query) 257 | print('Inserted into preprocess cdr table. Elapsed time: {} seconds' 258 | .format(format_two_point_time(timer, time.time()))) 259 | print('########## FINISHED CREATING PREPROCESS CDR TABLE ##########') 260 | 261 | def consolidate_table(self): 262 | # TODO join here 263 | provider_prefix = self.config.provider_prefix 264 | arg_cdr_prep = self.data.arg_cdr_prep 265 | arg_cdr_con = self.data.arg_cdr_con 266 | cursor = self.hc.cursor 267 | print('########## CREATE CONSOLIDATE CDR TABLE ##########') 268 | print('Checking and dropping consolidate cdr table if existing.') 269 | 270 | print('Checking latitude and lontitude in the preprocess table') 271 | cursor.execute('select max(latitude), max(longitude) from {provider_prefix}_preprocess' 272 | .format(provider_prefix=provider_prefix)) 273 | res = cursor.fetchall() 274 | 275 | latitude = res[0][0] 276 | longitude = res[0][1] 277 | arg_cdr_con_with_join_cond =[] 278 | if (latitude == -1 and longitude == -1): 279 | print('Join to make consolidate') 280 | for arg in arg_cdr_con: 281 | if str.lower(arg) in ['longitude', 'latitude']: 282 | arg_cdr_con_with_join_cond.append('a2.' + arg + ' as ' + arg) 283 | else: 284 | arg_cdr_con_with_join_cond.append('a1.' + arg + ' as ' + arg) 285 | insert_script_loc = 'cdr_and_mapping/insert_consolidate_cdr_join.sql' 286 | else: 287 | arg_cdr_con_with_join_cond = arg_cdr_con 288 | print('No join') 289 | insert_script_loc = 'cdr_and_mapping/insert_consolidate_cdr.sql' 290 | 291 | timer = time.time() 292 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_consolidate_data_all' 293 | .format(provider_prefix=provider_prefix)) 294 | print('Checked and dropped preprocess cdr table if existing. Elapsed time: {} seconds' 295 | .format(format_two_point_time(timer, time.time()))) 296 | timer = time.time() 297 | 298 | print('Creating consolidate table') 299 | raw_sql = sql_to_string('cdr_and_mapping/create_consolidate_cdr.sql') 300 | query = raw_sql.format(provider_prefix=provider_prefix, arg_prep=' ,'.join(arg_cdr_prep)) 301 | cursor.execute(query) 302 | print('Created consolidate cdr table. Elapsed time: {} seconds' 303 | .format(format_two_point_time(timer, time.time()))) 304 | timer = time.time() 305 | 306 | print('Columns in consolidate table: ' + ', '.join(arg_cdr_con_with_join_cond)) 307 | print('Inserting into the consolidate table') 308 | raw_sql = sql_to_string(insert_script_loc) 309 | query = raw_sql.format(provider_prefix=provider_prefix, arg_con=', '.join(arg_cdr_con_with_join_cond)) 310 | cursor.execute(query) 311 | print('Inserted into consolidate cdr table. 
Elapsed time: {} seconds' 312 | .format(format_two_point_time(timer, time.time()))) 313 | print('########## FINISHED CREATING CONSOLIDATE CDR TABLE ##########') 314 | 315 | -------------------------------------------------------------------------------- /Interpolation/README.md: -------------------------------------------------------------------------------- 1 | # Interpolation 2 | The tool will perform the following operations: 3 | * Trip segmentation 4 | * Stay Point Reallocation 5 | * Route Interpolation 6 | 7 | The results are the route interpolation of the CDR data. 8 | ## Prerequisites 9 | * CDR Consolidate Data Table ({provider_prefix}_consolidate_data_all) 10 | * obtained from the script [run_prepare_cdr_and_mapping.py](../run_prepare_cdr_and_mapping.py). 11 | See the [first page](../README.md) for how to prepare a CDR file and a cell tower mapping file. The following are the columns 12 | of the CDR consolidate data table. * means the column is required to have some valid values. 13 | ``` 14 | *UID : Unique Identifier of each user 15 | IMEI : International Mobile Equipment Identity (IMEI) of Caller 16 | IMSI : International Mobile Subscriber Identity (IMSI) of Caller 17 | *CALL_TIME : Activity Time (Start Time) in “YYYY-MM-DD HH:mm:ss” format 18 | *DURATION : Call Duration in seconds 19 | *CELL_ID : Unique Cell Tower ID (LAC+CellID) 20 | CALL_TYPE : Type of the call (Data, Voice or SMS) 21 | NETWORK_TYPE : Type of the network (2G, 3G, 4G, 5G) 22 | *Longitude : Real Number (decimal degree) in WGS84 23 | *Latitude : Real Number (decimal degree) in WGS84 24 | ``` 25 | * Building/POIs for Reallocation 26 | * OSM road network data 27 | * Voronoi data of Cell tower/Base Station location 28 | 29 | # Configuration 30 | The configuration of the Hadoop server connection and data selection needs to be set prior to this. 31 | See the [first page](../README.md) in the configuration section. 32 | Then, in the config file, set the following five fields: 33 | * interpolation_poi_file_location: set to be the local path of the poi file 34 | * interpolation_osm_file_location: set to be the local path of the osm file 35 | * interpolation_voronoi_file_location: set to be the local path of the voronoi file 36 | * max_size_cdr_by_uid: set to be the maximum array size of CDRs for each particular user 37 | * max_size_interpolation: set to be the max size of interpolation 38 | according to what you have in the cell tower mapping raw data 39 | 40 | For example, see [config_big.json](../sample_configs/config_big.json) in lines 34 to 38 41 | 42 | # Route Interpolation 43 | Run the following command: 44 | * python3 [run_interpolation.py](../run_interpolation.py) -c {config_file} 45 | 46 | Example 47 | 48 | * run python3 run_interpolation.py -c sample_configs/config_big.json 49 | 50 | To edit further, the user can go to [cdr_interpolation.py](../Common/cdr_interpolation.py) in 51 | calculate_interpolation() 52 | 53 | If only some of the operations are needed, you can comment out the unneeded ones here (ex. steps that are already finished) 54 | 55 | ``` 56 | self.convert_cdr_to_array_format() 57 | self.create_trip_format() 58 | self.create_trip_24hr_padding() 59 | self.create_poi_relocation() 60 | self.create_route_interpolation() 61 | self.export_to_csv() 62 | ``` 63 | 64 | 65 | # Route Interpolation Output 66 | The output will be generated inside the hadoop server in /tmp/hive/csv_interpolation. 67 | The file will have no extension, but it can be renamed with a .csv extension and used without any problem. 
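For instance, once the exported file has been copied to the local machine and renamed, it can be loaded with pandas (already one of the tool's dependencies). The snippet below is only a minimal, hypothetical sketch: it assumes the export is comma-separated with no header row, and the column names are simply taken from the column list that follows.
```
import pandas as pd

# Column layout as documented in the list below; the exported file has no header row.
columns = [
    "user_id", "trip_sequence", "mobility_type", "transportation_mode",
    "total_distance", "total_time", "start_time", "end_time",
    "total_points", "subtrip_sequence", "subtrip_point_start_time",
    "subtrip_point_latitude", "subtrip_point_longitude",
]

# "interpolation.csv" is the renamed local copy of the exported output.
df = pd.read_csv("interpolation.csv", names=columns)
print(df.head())
```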
68 | The following file is the output sample for the [interpolation](output_sample/interpolation.csv) 69 | In the file, each field is separated by a comma, and the columns are as follows 70 | ``` 71 | 1. User Id 72 | • Unique for each device 73 | 2. Trip Sequence 74 | • Order of sub trip in a day, start from 1 75 | 3. Mobility Type 76 | • Value: STAY or MOVE 77 | 4. Transportation Mode 78 | • Indicate mode of transportation of corresponding sub trip 79 | • Value: STAY, WALK, VEHICLE 80 | 5. Total Distance 81 | • Total travel distance of sub trip in meters 82 | 6. Total Time 83 | • Total travel time of sub trip in seconds 84 | 7. Start Time 85 | • Indicate start time of sub trip 86 | • Format: hh:MM:ss 87 | • Example: 8:38:08 88 | 8. End Time 89 | • Indicate end time of sub trip 90 | • Format: hh:MM:ss 91 | • Example: 9:30:20 92 | 9. Total Points 93 | • Indicate total number of point data in sub trip 94 | 10. Subtrip Sequence 95 | • The point number of each sub trip in a day, start from 1 96 | 11. Subtrip Point Start Time 97 | • Indicate start time of the sub trip point 98 | • Format: MM/DD/YYYY hh:MM 99 | • Example: 1/2/2019 8:38 100 | 12. Subtrip Point Latitude 101 | • Indicate the latitude of a particular point in a subtrip 102 | • Format: Real Number (decimal degree) in WGS84 103 | • Example: 23.614079 104 | 13. Subtrip Point Longitude 105 | • Indicate the longitude of a particular point in a subtrip 106 | • Format: Real Number (decimal degree) in WGS84 107 | • Example: 89.361402 108 | ``` 109 | Example (commas replaced by tabs for better illustration) 110 | 111 | ``` 112 | 1031073514 1 STAY STAY 31088 0 0:00:00 8:38:08 1 1 1/2/2019 0:00 23.614079 89.361402 113 | 1031073514 2 MOVE WALK 3132 4330.51 8:38:08 9:30:20 53 1 1/2/2019 8:38 23.619423 89.367677 114 | 1031073514 2 MOVE WALK 3132 4330.51 8:38:08 9:30:20 53 2 1/2/2019 8:39 23.618943 89.368309 115 | 1031073514 2 MOVE WALK 3132 4330.51 8:38:08 9:30:20 53 3 1/2/2019 8:40 23.618462 89.368942 116 | 1031073514 2 MOVE WALK 3132 4330.51 8:38:08 9:30:20 53 4 1/2/2019 8:41 23.617982 89.369574 117 | 118 | ``` -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Spatial Data Commons 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Origin_Destination/README.md: -------------------------------------------------------------------------------- 1 | # Origin Destination 2 | The tool calculates origin-destination movements based on the data provided in the form of 3 | the consolidate data table. 4 | The results are the origin-destination data of a selected date, stored in a tsv file. 5 | 6 | ## Prerequisites 7 | Tables obtained from the script [run_prepare_cdr_and_mapping.py](../run_prepare_cdr_and_mapping.py). 8 | See the [first page](../README.md) for how to prepare a CDR file and a cell tower mapping file. The following are the columns 9 | of the CDR consolidate data table. 10 | * CDR Consolidate Data Table ({provider_prefix}_consolidate_data_all) 11 | ``` 12 | UID : Unique Identifier of each user 13 | IMEI : International Mobile Equipment Identity (IMEI) of Caller 14 | IMSI : International Mobile Subscriber Identity (IMSI) of Caller 15 | CALL_TIME : Activity Time (Start Time) in “YYYY-MM-DD HH:mm:ss” format 16 | DURATION : Call Duration in seconds 17 | CELL_ID : Unique Cell Tower ID (LAC+CellID) 18 | CALL_TYPE : Type of the call (Data, Voice or SMS) 19 | NETWORK_TYPE : Type of the network (2G, 3G, 4G, 5G) 20 | Longitude : Real Number (decimal degree) in WGS84 21 | Latitude : Real Number (decimal degree) in WGS84 22 | ``` 23 | * Cell Tower Mapping Preprocess Table ({provider_prefix}_cell_tower_data_preprocess) 24 | ``` 25 | CELL_ID : Unique Cell Tower ID (LAC+CellID) 26 | Longitude : Real Number (decimal degree) in WGS84 27 | Latitude : Real Number (decimal degree) in WGS84 28 | Admin1 : Administration Unit 1 name (if any) 29 | Admin2 : Administration Unit 2 name (if any) 30 | . 31 | . 32 | . 33 | AdminN : Administration Unit N name (if any) 34 | ``` 35 | * Cell Tower Data Admin X Table ({provider_prefix}_cell_tower_data_adminX) 36 | * for generating sequence numbers of an administration unit in case of duplication 37 | ``` 38 | AdminX_ID : Administration Unit X name 39 | AdminX_Name : Name of the Administration Unit X 40 | CELL_ID : Unique Cell Tower ID 41 | Longitude : Real Number (decimal degree) in WGS84 42 | Latitude : Real Number (decimal degree) in WGS84 43 | ``` 44 | 45 | * CDR Home Location Table ({provider_prefix}_la_cdr_uid_home) 46 | * Obtained from finding rank1_frequent_location when calculating data statistics ([run_statistics.py](../run_statistics.py)) 47 | 48 | ``` 49 | UID : Unique Identifier of each user 50 | SITE_ID : Unique Concatenation of Latitude and Longitude 51 | TCOUNT : Number of records found in a particular SITE_ID 52 | TRANK : The rank of the SITE_ID by how frequently the user appears in the SITE_ID area 53 | PPERCENT : The percentage of the user's CDR records at this SITE_ID among all places at which the user used a mobile device 54 | LONGITUDE : Real Number (decimal degree) in WGS84 55 | LATITUDE : Real Number (decimal degree) in WGS84 56 | ADMINX_ID : Administration Unit X name 57 | ``` 58 | 59 | # Configuration 60 | The configuration of the Hadoop server connection and data selection needs to be set prior to this. 61 | See the [first page](../README.md) in the configuration section. 
62 | Then, in the config file, set the following two fields: 63 | * od_admin_unit: set to be a value in ("admin0", "admin1", "admin2", "admin3", "admin4", "admin5") 64 | according to what you have in the cell tower mapping raw data 65 | * od_date: set to the date you want to perform origin-destination on (format "yyyy-mm-dd") 66 | 67 | For example, see [config_big.json](../sample_configs/config_big.json) in lines 31 and 32 68 | 69 | # Origin Destination 70 | Run the following command: 71 | * python3 [run_origin_destination.py](../run_origin_destination.py) -c {config_file} 72 | 73 | Example 74 | 75 | * run python3 run_origin_destination.py -c [sample_configs/config_big.json](../sample_configs/config_big.json) 76 | 77 | To edit further, the user can go to [cdr_origin_destination.py](../Common/cdr_origin_destination.py) in 78 | calculate_od() 79 | 80 | If only some of the operations are needed, you can comment out the unneeded ones here (ex. steps that are already finished) 81 | 82 | ``` 83 | self.cdr_by_uid() 84 | self.create_od() 85 | self.create_od_detail() 86 | self.create_od_sum() 87 | ``` 88 | 89 | 90 | # Origin Destination Output 91 | The output will be generated inside the hadoop server in /tmp/hive/od_result. 92 | The file will have no extension, but it can be renamed with a .tsv extension and used without any problem. 93 | The following file is the output sample for the [origin-destination](output_sample/origin_destination.tsv) 94 | In the file, each field is separated by a comma, and the columns are as follows 95 | 96 | (raw column expressions: pdt, origin, 97 | destination, cast(tcount as string), cast(tusercount as string)) 98 | 99 | ``` 100 | 1. Date 101 | • The date of the origin destination data (as indicated in the config file in the field "od_date") 102 | 2. Origin Admin X ID 103 | • The origin place in the form of Admin X ID 104 | 3. Destination Admin X ID 105 | • The destination place in the form of Admin X ID 106 | 4. Count 107 | • Number of records of the movement from the origin to the destination 108 | 5. User Count 109 | • Total users in a particular movement 110 | ``` 111 | Example 112 | 113 | ``` 114 | 2016-03-01 0 805 1.0 1.0 115 | 2016-03-01 0 937 4.0 4.0 116 | 2016-03-01 0 938 1.0 1.0 117 | 2016-03-01 0 940 4.0 4.0 118 | 2016-03-01 1001 1062 6.0 6.0 119 | 2016-03-01 1001 1064 7.0 7.0 120 | 2016-03-01 1001 1065 4.0 4.0 121 | 2016-03-01 1001 1082 1.0 1.0 122 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CDR-analysis-tools-hadoop 2 | 3 | Like the standalone version, this repository is a set of tools written in Python for analyzing Call Detail Record (CDR) data, additionally based on the Hadoop platform, which supports a large amount of data. The analysis includes Visualization (with reports and processed data compatible with other visualization platforms), Origin-Destination (OD) and Interpolation. 4 | 5 | This repository will be incrementally updated from time to time. Kindly visit the repository and have a look at this file. 6 | 7 | 8 | ## Getting Started 9 | 10 | These instructions will get a copy of the software package up and running on your local machine. 11 | It can be run on both Windows and Linux. The tool dependencies are listed in the requirements.txt file 12 | and can be installed with a single command (see the Installation section below). 
13 | 14 | Structure of this package 15 | 16 | ``` 17 | ├─Statistics Report: 18 | │ Generating csv reports and graph reports including 19 | | - summary statistics (average usage, voice calls, etc.) 20 | │ - whole data statistics (ex. total cdrs, total days and locations) 21 | | - daily and monthly statistics of users 22 | | - frequent locations 23 | | - zone based aggregation 24 | | - graphical daily data 25 | | - usage histogram 26 | | 27 | ├─Origin-Destination (OD): 28 | │ Generating Origin-Destination file indicating the movement of humans 29 | │ 30 | ├─Interpolation: 31 | | A set of software for route interpolation including 32 | | - Extracting stay points 33 | | - Extracting trip segments 34 | | - PoI relocation 35 | | - Route interpolation with the transportation network 36 | ``` 37 | After preparing the data, see: 38 | 39 | * [Statistics Report](../master/Statistics/) 40 | * [Origin Destination](../master/Origin_Destination/) 41 | * [Interpolation](../master/Interpolation) 42 | 43 | ## Data preparation 44 | The user needs 2 files for the tool: a cdr file and a location mapping file. Both of them come with different column names and formats. To process CDR data, the data needs to be in a format that is compatible with the tools. The mapping json file maps your prepared raw csv files to Hive tables ready for processing, and a mapping scheme for each file has to be done by the user. 45 | 46 | ### a CSV file for CDR records 47 | To analyze the CDR data, the user needs to provide the tools with a CDR file in csv format. The files include the de-identified CDR where any personally identifiable information (such as IMSI and IMEI) is encrypted in an irreversible manner. Hereinafter, IMSI and IMEI mean de-identified IMSI and IMEI. The file needs to contain the following data items: 48 | ``` 49 | UID : Unique Identifier of each user 50 | IMEI : International Mobile Equipment Identity (IMEI) of Caller 51 | IMSI : International Mobile Subscriber Identity (IMSI) of Caller 52 | CALL_TIME : Activity Time (Start Time) in “YYYY-MM-DD HH:mm:ss” format 53 | DURATION : Call Duration in seconds 54 | CELL_ID : Unique Cell Tower ID (LAC+CellID) 55 | CALL_TYPE : Type of the call (Data, Voice or SMS) 56 | NETWORK_TYPE : Type of the network (2G, 3G, 4G, 5G) 57 | Longitude : Real Number (decimal degree) in WGS84 58 | Latitude : Real Number (decimal degree) in WGS84 59 | ``` 60 | 61 | For items 9 and 10, if they do not exist, they will be mapped from the cell tower mapping file by cell_id 62 | 63 | #### Mapping 64 | Given a CDR file, the mapping in the key "cdr_data_layer" in the file config.json is shown below 65 | 66 | ``` 67 | "cdr_data_layer":[ 68 | {"input_no":1, "input_name":"SUBID", "data_type":"String", "output_no":1, "name":"UID", "custom": ""}, 69 | {"input_no":-1, "input_name":"IMEI", "data_type":"String", "output_no":2, "name":"IMEI", "custom": ""}, 70 | {"input_no":-1, "input_name":"IMSI", "data_type":"String", "output_no":3, "name":"IMSI", "custom": ""}, 71 | {"input_no":2, "input_name":"CDATE", "data_type":"String", "output_no":-1, "name":"CDATE", "custom": ""}, 72 | {"input_no":3, "input_name":"CTIME", "data_type":"String", "output_no":4, "name":"CALL_TIME", "custom": "CONCAT(CDATE,' ',CTIME)"}, 73 | {"input_no":4, "input_name":"DURATION", "data_type":"String", "output_no":5, "name":"DURATION", "custom": ""}, 74 | {"input_no":5, "input_name":"CELLID", "data_type":"String", "output_no":6, "name":"CELL_ID", "custom": ""}, 75 | {"input_no":6, "input_name":"LATITUDE", "data_type":"String", "output_no":7, "name":"LATITUDE", "custom": ""},
76 | {"input_no":7, "input_name":"LONGITUDE", "data_type":"String", "output_no":8, "name":"LONGITUDE", "custom": ""}, 77 | {"input_no":9, "input_name":"NETWORK_TYPE", "data_type":"String", "output_no":9, "name":"NETWORK_TYPE", "custom": ""}, 78 | {"input_no":10, "input_name":"CALL_TYPE", "data_type":"String", "output_no":10, "name":"CALL_TYPE", "custom": ""} 79 | ], 80 | ``` 81 | * To map, if a column in the raw file reflects one of the required columns for CDR (ex. UID), then put it in the configuration item 82 | whose "name" field contains "UID" (ex. "SUBID"), and both "input_no" and "output_no" will not be "-1" 83 | * Do not remove any configuration item. If there is no raw column corresponding to the required column, set "input_no" to -1, 84 | meaning that the column does not exist in the raw file 85 | * For example, if you don't have a call_type column in your raw cdr file, the configuration will be 86 | * {"input_no":-1, "input_name":"CALL_TYPE", "data_type":"String", "output_no":-1, "name":"CALL_TYPE", "custom": ""} 87 | * All the columns in the raw file need to be indicated even if they are not mapped to a required column; simply add more configuration items. If a column is not mapped to any required column, 88 | set "output_no" to -1 (ex. "CDATE"), meaning that the column "CDATE" in the raw file does not reflect any required column 89 | * Some columns may need a function to convert them into a desirable format; you can indicate it in the "custom" field 90 | * For example, in the fifth item, "custom" contains CONCAT(CDATE, ' ', CTIME). The column CTIME is in the raw file, so its "input_no" is not -1, 91 | and the operands must come from items whose "input_no" is not -1 (i.e. they exist in the raw file) 92 | 93 | ### a CSV location mapping file for administration units 94 | The previous csv file will be joined with this cell id file to calculate zone-based statistics. It should supply 95 | 1. Cell ID (will be joined with the Cell ID in the CDR record file) 96 | 2. At least one Administration Unit (ex. province or district) name 97 | 3. Latitude 98 | 4. Longitude 99 | 100 | ``` 101 | CELL_ID : Unique Cell Tower ID (LAC+CellID) 102 | Longitude : Real Number (decimal degree) in WGS84 103 | Latitude : Real Number (decimal degree) in WGS84 104 | At least one Administration Unit (ex. province or district) Name 105 | ``` 106 | 107 | #### Mapping 108 | The mapping in this file needs to be done in the same way as previously mentioned in the CDR raw file. 
109 | 
110 | ```
111 | "cdr_cell_tower":[
112 | {"input_no":1, "input_name":"bs_seq", "data_type":"String", "output_no":-1, "name":"BS_SEQ"},
113 | {"input_no":2, "input_name":"cell_seq", "data_type":"String", "output_no":1, "name":"CELL_ID" },
114 | {"input_no":3, "input_name":"name", "data_type":"String", "output_no":-1, "name":"NAME"},
115 | {"input_no":4, "input_name":"lac", "data_type":"String", "output_no":-1, "name":"CELLNAME" },
116 | {"input_no":5, "input_name":"cell", "data_type":"String", "output_no":-1, "name":"CI" },
117 | {"input_no":6, "input_name":"lon", "data_type":"String", "output_no":2, "name":"LATITUDE" },
118 | {"input_no":7, "input_name":"lat", "data_type":"String", "output_no":3, "name":"LONGITUDE" },
119 | {"input_no":8, "input_name":"ISO2", "data_type":"String", "output_no":-1, "name":"ISO2" },
120 | {"input_no":9, "input_name":"NAME_0_2", "data_type":"String", "output_no":-1, "name":"NAME_0_2"},
121 | {"input_no":10, "input_name":"ID_1_2", "data_type":"String", "output_no":-1, "name":"ID_1_2" },
122 | {"input_no":11, "input_name":"NAME_1_2", "data_type":"String", "output_no":4, "name":"ADMIN0", "geojson_filename": "", "geojson_col_name": "" },
123 | {"input_no":12, "input_name":"ID_2", "data_type":"String", "output_no":-1, "name":"ID2" },
124 | {"input_no":13, "input_name":"NAME_2", "data_type":"String", "output_no":5, "name":"ADMIN1", "geojson_filename": "", "geojson_col_name": "" },
125 | {"input_no":14, "input_name":"ENGTYPE_2", "data_type":"String", "output_no":-1, "name":"ENGTYPE_2" }
126 | ]
127 | ```
128 | 
129 | One difference is that you need to supply at least one administration unit (or other location of interest) to calculate the zone population. For example, the item with "input_no" 11 contains administration unit data
130 | and is mapped to "ADMIN0" (administration unit 0). The name needs to be in the format ADMIN[0-5] for the tool to work (you may, for example, have shopping complex names in "input_name" and still call the unit "ADMIN0").
131 | If you want to visualize the result, put the location of your geojson file in "geojson_filename"; the data will then be joined with the zone population data and can be visualized in [kepler.gl](https://kepler.gl)
132 | 
133 | ## Configuration
134 | In the config.json file, you need to assign the correct paths, prefix, locations and so on. Here is an example of a config.json file with explanations
135 | 
136 | 
137 | "hadoop_data_path":"/path/to/cdr/and/celltower/file",
138 | 
139 | "provider_prefix":"pref1", **any prefix you'd like to use (useful in case you want to apply this tool to different datasets)**
140 | 
141 | "db_name" : "cdrproject",
142 | 
143 | "input_delimiter":",", **raw file delimiter (ex. comma "," or tab "\t")**
144 | 
145 | "input_files" :["cdr.csv"], **raw cdr file(s)**
146 | 
147 | "input_file_time_format": "yyyyMMdd hh:mm:ss", **time format in your data (if it is ambiguous, ex.
there is no separator between month and year, you need to put the format here; otherwise leave it blank if the date is dash- or slash-separated and the time colon-separated)**
148 | 
149 | "input_file_have_header": 1, **put 1 if the file has a header row (column names), otherwise 0**
150 | 
151 | "input_cell_tower_files" : ["cdr_cell_tower.csv"], **cell tower mapping data**
152 | 
153 | "input_cell_tower_delimiter":",",
154 | 
155 | "input_cell_tower_have_header": 1,
156 | 
157 | "check_duplicate": true, **filter duplicate rows or not**
158 | 
159 | "check_invalid_lat_lng": true, **filter invalid lat and lng**
160 | 
161 | "host": "host_name", **hostname of the hadoop server**
162 | 
163 | "port": 10000, **hive2 server port**
164 | 
165 | "frequent_location_percentage": 80, **percentage threshold applied to the frequent locations of a particular uid**
166 | 
167 | "csv_location": "csv_reports", **directory of the output csv reports**
168 | 
169 | "graph_location": "graphical_reports", **directory of the graph reports**
170 | 
171 | "od_admin_unit": "admin1", **administration unit used for calculating origin-destination**
172 | 
173 | "od_date": "2016-03-01", **date selected for origin-destination**
174 | 
175 | "interpolation_poi_file_location": "/path/to/poi", **a poi file for interpolation**
176 | 
177 | "interpolation_osm_file_location": "/path/to/osm", **an osm file for interpolation**
178 | 
179 | "interpolation_voronoi_file_location": "path/to/voronoi", **a voronoi file for interpolation**
180 | 
181 | 
182 | "cdr_data_layer": [...], **the mapping scheme from the cdr raw file to the table used for processing**
183 | 
184 | "cdr_cell_tower": [...], **the mapping scheme from the cell tower mapping raw file to the table used for processing**
185 | 
186 | ## Prerequisites
187 | * Hadoop server with Hive installed
188 | * Python 3 or above
189 | * pip3 (a Python package installer)
190 | 
191 | ## Installation
192 | Clone the repository and then
193 | install all required packages from requirements.txt using the command
194 | * pip install -r requirements.txt
195 | 
196 | ## Preparing tables
197 | 
198 | Go to [config.json](sample_configs/config.json) or [config_big.json](sample_configs/config_big.json) to see the configuration files, set up the variables and
199 | start mapping your data.
200 | 
201 | Then go to [run_prepare_cdr_and_mapping.py](run_prepare_cdr_and_mapping.py) in the user section and run
202 | 
203 | * python3 run_prepare_cdr_and_mapping.py -c {config_file}
204 | 
205 | Example
206 | 
207 | * python3 run_prepare_cdr_and_mapping.py -c sample_configs/config_big.json
208 | 
209 | You may get errors due to the mapping; after fixing them, you can continue from the function where the run stopped. If you do not want the tables to be dropped and created again, go to [hive_create_tables.py](Common/hive_create_tables.py) and comment out the line calling the create_tables function in the \_\_init\_\_ function
210 | 
211 | 
212 | There are mainly 3 sections you may want to customize.
213 | 
214 | **main() in [run_prepare_cdr_and_mapping.py](run_prepare_cdr_and_mapping.py)**
215 | ```
216 | def main():
217 | # argument parser
218 | start = time.time()
219 | parser = argparse.ArgumentParser(description='Argument indicating the configuration file')
220 | 
221 | # add configuration argument
222 | parser.add_argument("-c", "--config", help="add a configuration file you would like to process the cdr data"
223 | " \n ex.
py py_hive_connect.py -c config.json", 224 | action="store") 225 | 226 | # parse config to args.config 227 | args = parser.parse_args() 228 | 229 | config = Config(args.config) 230 | HiveConnection(host=config.host, port=config.port, user=config.user) 231 | cdr_data = extract_mapping_data(config) 232 | 233 | # initialize hive and create tables 234 | table_creator = HiveTableCreator(config, cdr_data) 235 | table_creator.initialize('hive_init_commands/initial_hive_commands_stats.json') # init hive 236 | table_creator.create_tables() 237 | 238 | print('Overall time elapsed: {} seconds'.format(format_two_point_time(start, time.time()))) 239 | 240 | ``` 241 | **[hive_create_tables.py](Common/hive_create_tables.py) in \__init\__** 242 | If you don't want tables to be created again (maybe after some errors but tables created), you can comment it in **\__init\__** function 243 | 244 | ``` 245 | def __init__(self, config, data): 246 | self.__dict__ = config.__dict__ 247 | self.hive = HiveConnector(config) 248 | timer = time.time() 249 | print('########## Initilizing Hive ##########') 250 | self.hive.initialize(config) 251 | print('########## Done. Time elapsed: {} seconds ##########'.format(hp.format_two_point_time(timer, time.time()))) 252 | timer = time.time() 253 | print('########## Creating Tables ##########') 254 | #self.hive.create_tables(config, data) <<<< COMMENT HERE 255 | print('########## Done create all tables. Time elapsed: {} seconds ##########'.format(hp.format_two_point_time(timer, time.time()))) 256 | ``` 257 | ## License 258 | Free to use and distribute with acknowledgement. 259 | -------------------------------------------------------------------------------- /Sample_Inputs/cdr_sample.csv: -------------------------------------------------------------------------------- 1 | SUBID,CDATE,CTIME,DURATION,CELLID,LATITUDE,LONGITUDE,network_type,call_type 2 | 3594716203,20160501,8:43:47,96,10011,13.44845,-16.57612,2G,VOICE 3 | 3594716203,20160501,8:46:10,80,10021,13.45218,-16.57419,3G,DATA 4 | 3594716203,20160502,9:46:10,43,10021,13.45218,-16.57419,3G,DATA 5 | 3594716203,20160601,10:46:10,43,10022,13.45218,-16.57419,3G,DATA 6 | 3498343785,20160501,8:42:06,201,10011,13.44845,-16.57612,3G,DATA 7 | 3498191359,20160501,8:45:00,36,10012,13.44845,-16.57612,2G,DATA 8 | 3463089753,20160501,8:45:08,9,10012,13.44845,-16.57612,3G,VOICE 9 | 3589765737,20160501,8:45:07,10,10013,13.44845,-16.57612,2G,SMS 10 | 3587544575,20160501,8:45:10,8,10015,13.44845,-16.57612,3G,VOICE 11 | 3496971123,20160501,8:45:09,9,10016,13.44845,-16.57612,2G,DATA 12 | 3473003603,20160501,8:45:13,5,10021,13.45218,-16.57419,3G,DATA 13 | 3456069605,20160501,8:45:11,7,10021,13.45218,-16.57419,4G,DATA 14 | 3473003603,20160503,8:45:13,5,10021,13.45218,-16.57419,3G,DATA 15 | 3456069605,20160504,8:45:11,7,10021,13.45218,-16.57419,4G,DATA 16 | 3456069605,20160601,8:45:11,7,10021,13.45218,-16.57419,4G,DATA 17 | 3473003603,20160703,8:45:13,5,10021,13.45218,-16.57419,3G,DATA 18 | 3456069605,20200504,8:45:11,7,10021,13.45218,-16.57419,4G,DATA 19 | -------------------------------------------------------------------------------- /Sample_Inputs/mapping_sample.csv: -------------------------------------------------------------------------------- 1 | BTSID,SITE NAME,Longitude,Latitude,CELLID,CELLNAME,CI,Azimuth,District,Province 2 | 1001,Q001-Dobson Street Roof Top,140.8694,38.2682,10011,Q001-Dobson Street Roof Top-1,10011,30,Sendai,Miyagi 3 | 1001,Q001-Dobson Street Roof Top,140.8694,38.2682,10012,Q001-Dobson Street Roof 
Top-2,10012,160,Sendai,Miyagi
4 | 1001,Q001-Dobson Street Roof Top,140.8694,38.2682,10013,Q001-Dobson Street Roof Top-3,10013,290,Sendai,Miyagi
5 | 1001,Q001-Dobson Street Roof Top,140.8694,38.2682,10014,Q001-Dobson Street Roof Top_DCS-1,10014,30,Sendai,Miyagi
6 | 1001,Q001-Dobson Street Roof Top,140.8694,38.2682,10015,Q001-Dobson Street Roof Top_DCS-2,10015,160,Sendai,Miyagi
7 | 1001,Q001-Dobson Street Roof Top,140.8694,38.2682,10016,Q001-Dobson Street Roof Top_DCS-3,10016,290,Sendai,Miyagi
8 | 1002,Q002-ECOBANK,140.8694,38.2682,10021,Q002-ECOBANK-1,10021,20,Sendai,Miyagi
9 | 1002,Q002-ECOBANK,140.8694,38.2682,10022,Q002-ECOBANK-2,10022,160,Sendai,Miyagi
10 | -------------------------------------------------------------------------------- /Statistics/README.md: --------------------------------------------------------------------------------
1 | # Statistics
2 | This section concerns generating statistical reports and graphs from the CDR data. The output will be in the
3 | form of CSV files and graph (png) files.
4 | To illustrate, simple csv files for the cdr and the mapping are used; they are located in [Sample_Inputs](../Sample_Inputs).
5 | ## Prerequisites
6 | Tables obtained from the script [run_prepare_cdr_and_mapping.py](../run_prepare_cdr_and_mapping.py).
7 | See the [first page](../README.md) for how to prepare a CDR file and a cell tower mapping file. The following are the columns
8 | of the CDR consolidate data table.
9 | * CDR Consolidate Data Table ({provider_prefix}_consolidate_data_all)
10 | ```
11 | UID : Unique Identifier of each user
12 | IMEI : International Mobile Equipment Identity (IMEI) of Caller
13 | IMSI : International Mobile Subscriber Identity (IMSI) of Caller
14 | CALL_TIME : Activity Time (Start Time) in “YYYY-MM-DD HH:mm:ss” format
15 | DURATION : Call Duration in seconds
16 | CELL_ID : Unique Cell Tower ID (LAC+CellID)
17 | CALL_TYPE : Type of the call (Data, Voice or SMS)
18 | NETWORK_TYPE : Type of the network (2G, 3G, 4G, 5G)
19 | Longitude : Real Number (decimal degree) in WGS84
20 | Latitude : Real Number (decimal degree) in WGS84
21 | ```
22 | * Cell Tower Mapping Preprocess Table ({provider_prefix}_cell_tower_data_preprocess)
23 | ```
24 | CELL_ID : Unique Cell Tower ID (LAC+CellID)
25 | Longitude : Real Number (decimal degree) in WGS84
26 | Latitude : Real Number (decimal degree) in WGS84
27 | Admin1 : Administration Unit 1 name (if any)
28 | Admin2 : Administration Unit 2 name (if any)
29 | .
30 | .
31 | .
32 | AdminN : Administration Unit N name (if any)
33 | ```
34 | * Cell Tower Data Admin X Table ({provider_prefix}_cell_tower_data_adminX)
35 |   * for generating sequence numbers of an administration unit in case of duplication
36 | ```
37 | AdminX_ID : Generated sequence ID of the Administration Unit X
38 | AdminX_Name : Name of the Administration Unit X
39 | CELL_ID : Unique Cell Tower ID
40 | Longitude : Real Number (decimal degree) in WGS84
41 | Latitude : Real Number (decimal degree) in WGS84
42 | ```
43 | ## Usage
44 | Note: the configuration needs to be set first. See the configuration section on the [first page](../README.md).
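If you want to verify the configuration before launching a run, a minimal self-check along the following lines can help. This snippet is only a sketch (it is not part of the repository) and it assumes the keys shown in the configuration section of the first page:

```
import json

# keys that the statistics run relies on (see the configuration section of the first page)
required = ["provider_prefix", "db_name", "host", "port", "csv_location", "graph_location"]

with open("sample_configs/config_big.json") as f:
    cfg = json.load(f)

missing = [key for key in required if key not in cfg]
if missing:
    raise SystemExit("config is missing keys: {}".format(", ".join(missing)))
print("config looks complete; csv reports go to '{}', graphs to '{}'".format(
    cfg["csv_location"], cfg["graph_location"]))
```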
45 | 46 | run command 47 | * python3 [run_statistics.py](../run_statistics.py) -c {config_file} 48 | 49 | Example 50 | * python3 run_statistics.py -c sample_configs/[config_big.json](../sample_configs/config_big.json) 51 | 52 | If you wish to execute some of the features, you can comment some lines in the file in main() of [run_statistics.py](../run_statistics.py) 53 | in the user section 54 | 55 | ``` 56 | def main(): 57 | # argument parser 58 | start = time.time() 59 | parser = argparse.ArgumentParser(description='Argument indicating the configuration file') 60 | 61 | # add configuration argument 62 | parser.add_argument("-c", "--config", help="add a configuration file you would like to process the cdr data" 63 | " \n ex. py py_hive_connect.py -c config.json", 64 | action="store") 65 | 66 | # parse config to args.config 67 | args = parser.parse_args() 68 | 69 | config = Config(args.config) 70 | HiveConnection(host=config.host, port=config.port, user=config.user) 71 | 72 | table_creator = HiveTableCreator(config) 73 | table_creator.initialize('hive_init_commands/initial_hive_commands_stats.json') # mandatory (init hive) 74 | 75 | # init stats generators 76 | st = Statistics(config) 77 | 78 | # user section here 79 | # reports 80 | st.calculate_data_statistics() 81 | st.calculate_daily_statistics() 82 | st.calculate_monthly_statistics() 83 | st.calculate_zone_population() 84 | st.calculate_summary() 85 | st.calculate_user_date_histogram() 86 | # graphs 87 | st.daily_cdrs() 88 | st.daily_unique_users() 89 | st.daily_unique_locations() 90 | st.daily_average_cdrs() 91 | st.daily_unique_average_locations() 92 | 93 | # frequent locations (Report) 94 | st.frequent_locations() 95 | st.frequent_locations_night() 96 | 97 | # Prerequisite for Origin-Destination, if not wishing to calculate OD, kindly comment the code 98 | st.rank1_frequent_locations() # Require frequent_locations() in run_statistics.py 99 | 100 | print('Overall time elapsed: {} seconds'.format(format_two_point_time(start, time.time()))) 101 | ``` 102 | 103 | ## Output Reports 104 | The implementation is in [cdr_statistics.py](../Common/cdr_statistics.py) 105 | ### Data statistics 106 | Located in calculate_data_statistics(). In the data statistics, the result output (see the [Data statistics](../Statistics/output_reports/css_file_data_stat.csv)) provided will be: 107 | 108 | ``` 109 | Total Records : the total cdr usage data 110 | Total Days : the total days that have usage 111 | Unique id : the total unique ids of the data (it could be imei, imsi or another identifier given) 112 | Unique imei : the total unique imeis (will be omitted if it is the unique id already) 113 | Unique imsi : the total unique imsis (will be omitted if it is the unique id already) 114 | Unique loc name : the total unique latitude and longitude of the cdr data 115 | Start date : the starting date of the data 116 | End date : the end date of the data 117 | ``` 118 | ### Daily and Monthly Statistics 119 | Located in calculate_daily_statistics() and calculate_monthly_statistics (see the [daily](../Statistics/output_reports/css_provider_data_stat_daily.csv) and [monthly](../Statistics/output_reports/css_provider_data_stat_monthly.csv) output). Calculating some properties order by date first and then the type of call type and network type. 
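If you want to inspect the daily report outside Hive, a small pandas sketch like the one below can be used. It is only an illustration (not part of the tool) and assumes "csv_location" is set to csv_reports and the column layout shown in [css_provider_data_stat_daily.csv](../Statistics/output_reports/css_provider_data_stat_daily.csv):

```
import pandas as pd

# load the daily report produced by calculate_daily_statistics()
daily = pd.read_csv("csv_reports/css_provider_data_stat_daily.csv")

# keep only the overall rows (call_type and network_type both 'ALL')
overall = daily[(daily["call_type"] == "ALL") & (daily["network_type"] == "ALL")]
print(overall[["date", "total_records", "unique_id"]].to_string(index=False))
```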
120 | 
121 | Each field in the daily statistics is given per date (or per year and month for the monthly statistics), call type and network type, including:
122 | ```
123 | Date : the dates that have cdr records
124 | Call Type : the call type of the cdr data (VOICE, DATA, SMS)
125 | Network Type : the network type of the cdr data (2G, 3G, 4G)
126 | Total Records : the total records
127 | Total Days : the total days
128 | Unique id : the total unique ids of the data (it could be imei, imsi or another identifier given)
129 | Unique imei : the total unique imeis
130 | Unique imsi : the total unique imsis
131 | Unique loc name : the total unique latitude and longitude of the cdr data
132 | ```
133 | 
134 | ### Zone population
135 | At least one administration level is needed to calculate the zone population.
136 | It is indicated in the "cdr_cell_tower" mapping in config.json (the items marked USED below are the administration-unit columns used for the zone population)
137 | ```
138 | "cdr_cell_tower":[
139 | {"input_no":1, "input_name":"BTSID", "data_type":"String", "output_no":1, "name":"UID"},
140 | {"input_no":2, "input_name":"SITE_NAME", "data_type":"String", "output_no":2, "name":"SITE_NAME"},
141 | {"input_no":3, "input_name":"LONGITUDE", "data_type":"String", "output_no":3, "name":"LONGITUDE"},
142 | {"input_no":4, "input_name":"LATITUDE", "data_type":"String", "output_no":4, "name":"LATITUDE"},
143 | {"input_no":5, "input_name":"CELLID", "data_type":"String", "output_no":5, "name":"CELL_ID" },
144 | {"input_no":6, "input_name":"CELLNAME", "data_type":"String", "output_no":-1, "name":"CELLNAME" },
145 | {"input_no":7, "input_name":"CI", "data_type":"String", "output_no":-1, "name":"CI" },
146 | {"input_no":8, "input_name":"AZIMUTH", "data_type":"String", "output_no":-1, "name":"AZIMUTH" },
147 | USED {"input_no":9, "input_name":"DISTRICT", "data_type":"String", "output_no":6, "name":"ADMIN1", "geojson_filename": "japan.json", "geojson_col_name": "nam"},
148 | USED {"input_no":10, "input_name":"PROVINCE", "data_type":"String", "output_no":7, "name":"ADMIN2", "geojson_filename": "", "geojson_col_name": ""}
149 | ]
150 | ```
151 | You need to indicate which administration level each place-name column corresponds to. For example,
152 | the DISTRICT column is ADMIN1.
153 | 
154 | If you can provide a geojson file for an administration level, put its location in the key "geojson_filename" and the name of the corresponding geojson field in "geojson_col_name",
155 | so the tool can join the attributes correctly.
156 | 
157 | The result is zone_based_aggregations_level_ADMIN{X}.csv; see the sample output [zone_based_aggregations_level_ADMIN1](../Statistics/output_reports/zone_based_aggregations_level_ADMIN1.csv),
158 | 
159 | which includes:
160 | ```
161 | Administration Level x : Your input administration level x
162 | Count Activities : Total CDR Records that are in the administration level
163 | Count Unique IDs : Total Unique Ids
164 | ```
165 | as well as {geojson_file_name}_joined_ADMIN{X}.json (you can load this into a visualization service such as [kepler.gl](https://kepler.gl))
166 | 
167 | ### Summary Data
168 | The output summary is written to [summary_stats.csv](../Statistics/output_reports/summary_stats.csv).
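As a quick sanity check on the summary values, the daily averages are simply totals divided by the number of days. For example, using the numbers from the sample [summary_stats.csv](../Statistics/output_reports/summary_stats.csv):

```
# values taken from the sample summary_stats.csv
total_records = 17
total_days = 7

average_usage_per_day = round(total_records / total_days, 3)
print(average_usage_per_day)  # 2.429, matching the report
```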
The summary contains overall statistics of the cdr data including:
169 | ```
170 | Total records
171 | Total unique IDs
172 | Total days
173 | Average daily usage
174 | Average daily voice
175 | Average daily sms
176 | Average daily unique cell id
177 | Average Daily Admin Level 1
178 | ```
179 | ### Frequent Locations
180 | The frequent locations output (all-day and night) lists the most popular cell_ids at which a user makes calls during a day or a night.
181 | The output is written to tables named {your_prefix}_frequent_location_thresholded and {your_prefix}_frequent_location_thresholded_night.
182 | Example output
183 | 
184 | ![alt text](https://github.com/shibalab/CDR-analysis-tools-hadoop/blob/master/Statistics/output_reports/frequent_location_output_sample.png "Frequent Locations")
185 | 
186 | 
187 | ## Output Graphs
188 | The implementation is in [cdr_statistics.py](../Common/cdr_statistics.py)
189 | ### User Date Histogram
190 | The output histogram shows the number of active days (x) against the number of unique user ids who made calls on that many days (y).
191 | ![alt text](https://github.com/shibalab/CDR-analysis-tools-hadoop/blob/master/Statistics/output_graphs/user_data_histogram.png "User Date Histogram")
192 | ### Daily CDRs
193 | The graph reports the daily cdr usage per day, with the minimum, maximum, average and total number of cdrs
194 | ![alt text](https://github.com/shibalab/CDR-analysis-tools-hadoop/blob/master/Statistics/output_graphs/daily_cdrs.png "Daily CDRs")
195 | ### Daily CDRs by call type
196 | This graph reports the daily cdr usage per day, broken down by call type (multiple lines)
197 | ![alt text](https://github.com/shibalab/CDR-analysis-tools-hadoop/blob/master/Statistics/output_graphs/daily_cdr_by_call_type.png "Daily CDRs by call type")
198 | ### Daily Unique Users
199 | Daily unique uids are reported per day in the graph together with their statistics (minimum, maximum, average and total)
200 | ![alt text](https://github.com/shibalab/CDR-analysis-tools-hadoop/blob/master/Statistics/output_graphs/daily_unique_users.png "Daily Unique Users")
201 | ### Daily Unique Locations
202 | Daily unique locations (unique latitude and longitude pairs) are reported per day in the graph together with their statistics (minimum, maximum, average and total)
203 | ![alt text](https://github.com/shibalab/CDR-analysis-tools-hadoop/blob/master/Statistics/output_graphs/daily_unique_locations.png "Daily Unique Locations")
204 | ### Daily average CDRs
205 | Daily average CDRs are the average number of cdrs per user per day, with the overall average across days displayed on top
206 | ![alt text](https://github.com/shibalab/CDR-analysis-tools-hadoop/blob/master/Statistics/output_graphs/daily_avg_cdr.png "Daily Average CDRs")
207 | 
208 | ### Daily Unique Average Locations
209 | This graph represents the daily average number of unique locations per user, with the overall average across days shown at the top of the graph
210 | ![alt text](https://github.com/shibalab/CDR-analysis-tools-hadoop/blob/master/Statistics/output_graphs/daily_unique_avg_locations.png "Daily Unique Average Locations")
211 | 
212 | -------------------------------------------------------------------------------- /Statistics/output_graphs/daily_avg_cdr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/Statistics/output_graphs/daily_avg_cdr.png
-------------------------------------------------------------------------------- /Statistics/output_graphs/daily_cdr_by_call_type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/Statistics/output_graphs/daily_cdr_by_call_type.png -------------------------------------------------------------------------------- /Statistics/output_graphs/daily_cdrs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/Statistics/output_graphs/daily_cdrs.png -------------------------------------------------------------------------------- /Statistics/output_graphs/daily_unique_avg_locations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/Statistics/output_graphs/daily_unique_avg_locations.png -------------------------------------------------------------------------------- /Statistics/output_graphs/daily_unique_locations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/Statistics/output_graphs/daily_unique_locations.png -------------------------------------------------------------------------------- /Statistics/output_graphs/daily_unique_users.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/Statistics/output_graphs/daily_unique_users.png -------------------------------------------------------------------------------- /Statistics/output_graphs/user_data_histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/Statistics/output_graphs/user_data_histogram.png -------------------------------------------------------------------------------- /Statistics/output_reports/css_file_data_stat.csv: -------------------------------------------------------------------------------- 1 | "total_records","total_days","unique_id","unique_imei","unique_imsi","unique_location_name","start_date","end_date" 2 | 17,7,9,1,1,7,"2016-05-01","2020-05-04" 3 | -------------------------------------------------------------------------------- /Statistics/output_reports/css_provider_data_stat_daily.csv: -------------------------------------------------------------------------------- 1 | "date","call_type","network_type","total_records","total_days","unique_id","unique_imei","unique_imsi","unique_location_name" 2 | "2016-05-01","ALL","ALL",10,1,9,1,1,6 3 | "2016-05-01","ALL","4G",1,1,1,1,1,1 4 | "2016-05-01","ALL","3G",5,1,5,1,1,4 5 | "2016-05-01","ALL","2G",4,1,4,1,1,4 6 | "2016-05-01","DATA","ALL",6,1,6,1,1,4 7 | "2016-05-01","DATA","4G",1,1,1,1,1,1 8 | "2016-05-01","DATA","3G",3,1,3,1,1,2 9 | "2016-05-01","DATA","2G",2,1,2,1,1,2 10 | "2016-05-01","SMS","ALL",1,1,1,1,1,1 11 | "2016-05-01","SMS","2G",1,1,1,1,1,1 12 | "2016-05-01","VOICE","ALL",3,1,3,1,1,3 13 | "2016-05-01","VOICE","3G",2,1,2,1,1,2 14 | 
"2016-05-01","VOICE","2G",1,1,1,1,1,1 15 | "2016-05-02","ALL","ALL",1,1,1,1,1,1 16 | "2016-05-02","ALL","3G",1,1,1,1,1,1 17 | "2016-05-02","DATA","ALL",1,1,1,1,1,1 18 | "2016-05-02","DATA","3G",1,1,1,1,1,1 19 | "2016-05-03","ALL","ALL",1,1,1,1,1,1 20 | "2016-05-03","ALL","3G",1,1,1,1,1,1 21 | "2016-05-03","DATA","ALL",1,1,1,1,1,1 22 | "2016-05-03","DATA","3G",1,1,1,1,1,1 23 | "2016-05-04","ALL","ALL",1,1,1,1,1,1 24 | "2016-05-04","ALL","4G",1,1,1,1,1,1 25 | "2016-05-04","DATA","ALL",1,1,1,1,1,1 26 | "2016-05-04","DATA","4G",1,1,1,1,1,1 27 | "2016-06-01","ALL","ALL",2,1,2,1,1,2 28 | "2016-06-01","ALL","4G",1,1,1,1,1,1 29 | "2016-06-01","ALL","3G",1,1,1,1,1,1 30 | "2016-06-01","DATA","ALL",2,1,2,1,1,2 31 | "2016-06-01","DATA","4G",1,1,1,1,1,1 32 | "2016-06-01","DATA","3G",1,1,1,1,1,1 33 | "2016-07-03","ALL","ALL",1,1,1,1,1,1 34 | "2016-07-03","ALL","3G",1,1,1,1,1,1 35 | "2016-07-03","DATA","ALL",1,1,1,1,1,1 36 | "2016-07-03","DATA","3G",1,1,1,1,1,1 37 | "2020-05-04","ALL","ALL",1,1,1,1,1,1 38 | "2020-05-04","ALL","4G",1,1,1,1,1,1 39 | "2020-05-04","DATA","ALL",1,1,1,1,1,1 40 | "2020-05-04","DATA","4G",1,1,1,1,1,1 41 | -------------------------------------------------------------------------------- /Statistics/output_reports/css_provider_data_stat_monthly.csv: -------------------------------------------------------------------------------- 1 | "year","month","call_type","network_type","total_records","total_days","unique_id","unique_imei","unique_imsi","unique_location_name" 2 | 2016,5,"ALL","ALL",13,4,9,1,1,6 3 | 2016,5,"ALL","4G",2,2,1,1,1,1 4 | 2016,5,"ALL","3G",7,3,5,1,1,4 5 | 2016,5,"ALL","2G",4,1,4,1,1,4 6 | 2016,5,"DATA","ALL",9,4,6,1,1,4 7 | 2016,5,"DATA","4G",2,2,1,1,1,1 8 | 2016,5,"DATA","3G",5,3,3,1,1,2 9 | 2016,5,"DATA","2G",2,1,2,1,1,2 10 | 2016,5,"SMS","ALL",1,1,1,1,1,1 11 | 2016,5,"SMS","2G",1,1,1,1,1,1 12 | 2016,5,"VOICE","ALL",3,1,3,1,1,3 13 | 2016,5,"VOICE","3G",2,1,2,1,1,2 14 | 2016,5,"VOICE","2G",1,1,1,1,1,1 15 | 2020,5,"ALL","ALL",1,1,1,1,1,1 16 | 2020,5,"ALL","4G",1,1,1,1,1,1 17 | 2020,5,"DATA","ALL",1,1,1,1,1,1 18 | 2020,5,"DATA","4G",1,1,1,1,1,1 19 | -------------------------------------------------------------------------------- /Statistics/output_reports/frequent_location_output_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/Statistics/output_reports/frequent_location_output_sample.png -------------------------------------------------------------------------------- /Statistics/output_reports/summary_stats.csv: -------------------------------------------------------------------------------- 1 | "All Data","Value" 2 | "total_records",17 3 | "total_uids",9 4 | "total_days","7 (1 May 2016-4 May 2020)" 5 | " 6 | " 7 | "Statistics" 8 | "average_usage_per_day",2.429 9 | "average_daily_voice",0.429 10 | "average_daily_sms",0.143 11 | "average_daily_unique_cell_id",1.857 12 | "average_admin1_per_day",1.0 13 | -------------------------------------------------------------------------------- /Statistics/output_reports/zone_based_aggregations_level_ADMIN1.csv: -------------------------------------------------------------------------------- 1 | "admin1","count_activities","count_unique_ids" 2 | "Sendai",17,9 3 | -------------------------------------------------------------------------------- /Statistics/output_reports/zone_based_aggregations_level_ADMIN2.csv: 
-------------------------------------------------------------------------------- 1 | "admin2","count_activities","count_unique_ids" 2 | "Miyagi",17,9 3 | -------------------------------------------------------------------------------- /hive_init_commands/initial_hive_commands_interpolation.json: -------------------------------------------------------------------------------- 1 | { 2 | "hive_commands": [ 3 | "ADD JAR /hadoop/hive/lib/cdrmobilitylib.jar", 4 | "ADD JAR /hadoop/hive/lib/cdrmobilitylibge.jar", 5 | "ADD JAR /hadoop/hive/lib/cdrmobilitylibjica.jar", 6 | "ADD JAR /hadoop/hive/lib/jts-1.12.jar", 7 | "ADD JAR /hadoop/hive/lib/jtsio-1.12.jar", 8 | "ADD JAR /hadoop/hive/lib/javaml-0.1.6.jar", 9 | "ADD JAR /hadoop/hive/lib/weka.jar", 10 | "ADD JAR /hadoop/hive/lib/libsvm.jar", 11 | "ADD JAR /hadoop/hive/lib/ajt-2.5.jar", 12 | "ADD JAR /hadoop/hive/lib/Jama-1.0.2.jar", 13 | "create temporary function FindPolygon as 'com.apichon.mobility.hive.udf.FindPolygon'", 14 | "create temporary function FindPolygon2 as 'com.apichon.mobility.hive.udf.FindPolygon2'", 15 | "create temporary function CreateTrajectoriesCDR as 'com.apichon.mobility.hive.udf.ge.CreateTrajectoriesCDR'", 16 | "create temporary function TripSegmentationCDR as 'com.apichon.mobility.hive.udf.ge.TripSegmentationCDR'", 17 | "create temporary function CreateTrajectoriesJICAWithZone as 'com.apichon.mobility.hive.udf.jica.CreateTrajectoriesJICAWithZone'", 18 | "create temporary function TripOD as 'com.apichon.mobility.hive.udf.jica.TripOD'", 19 | "create temporary function TripODStay as 'com.apichon.mobility.hive.udf.jica.TripODStay'", 20 | "set hive.exec.dynamic.partition.mode=nonstrict", 21 | "set hive.exec.dynamic.partition=true", 22 | "set hive.exec.max.dynamic.partitions.pernode=4000", 23 | "set hive.exec.max.created.files=150000", 24 | "set hive.enforce.bucketing=true", 25 | "set mapred.reduce.slowstart.completed.maps=0.98", 26 | "set mapred.job.reuse.jvm.num.tasks=50", 27 | "set mapred.child.java.opts=-Xmx2048m -XX:-UseGCOverheadLimit", 28 | "set hive.map.aggr.hash.percentmemory = 0.25", 29 | "ADD jar /hadoop/hive/lib/commons-pool-1.5.4.jar", 30 | "ADD jar /hadoop/hive/lib/commons-dbcp-1.4.jar", 31 | "ADD jar /hadoop/hive/lib/commons-lang-2.6.jar", 32 | "ADD jar /hadoop/hive/lib/commons-logging-1.1.3.jar", 33 | "ADD jar /hadoop/hive/lib/commons-math-2.1.jar", 34 | "ADD jar /hadoop/hive/lib/gt-api-9.3.jar", 35 | "ADD jar /hadoop/hive/lib/gt-data-9.3.jar", 36 | "ADD jar /hadoop/hive/lib/gt-epsg-hsql-12.2.jar", 37 | "ADD jar /hadoop/hive/lib/gt-main-9.3.jar", 38 | "ADD jar /hadoop/hive/lib/gt-metadata-9.3.jar", 39 | "ADD jar /hadoop/hive/lib/gt-opengis-9.3.jar", 40 | "ADD jar /hadoop/hive/lib/gt-referencing-9.3.jar", 41 | "ADD jar /hadoop/hive/lib/jsr-275-1.0-beta-2.jar", 42 | "ADD jar /hadoop/hive/lib/jts-1.13.jar", 43 | "ADD jar /hadoop/hive/lib/vecmath-1.3.2.jar", 44 | "ADD jar /hadoop/hive/lib/postgresql-9.3-1102.jdbc4.jar", 45 | "ADD jar /hadoop/hive/lib/postgis-jdbc-2.1.0SVN.jar", 46 | 47 | "ADD jar /hadoop/hive/lib/cdrinterpolationlib.jar", 48 | 49 | "create temporary function f_turkcellarray as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.TurkcellCdrArrayUDAF'", 50 | "create temporary function f_organizearray as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.TurkcellOrganizeTripUDF'", 51 | "create temporary function f_reallocation as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.TurkcellReallocationUDF'", 52 | "create temporary function f_routing as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.TurkcellRoutingUDF'", 53 | "create temporary function 
f_routing2 as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.TurkcellRoutingUDF2'", 54 | "create temporary function f_hmesh as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.HMeshUDF'", 55 | "create temporary function f_dumptrip as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.DumpTripUDF'", 56 | "create temporary function f_hmesharray as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.HMeshArrayUDF'", 57 | "create temporary function DumpTripUDF as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.DumpTripUDF'", 58 | "create temporary function sp_distance as 'com.apichon.mobility.hive.udf.sp_distance'", 59 | "set hive.support.sql11.reserved.keywords=false", 60 | "set hive.fetch.task.conversion=minimal", 61 | 62 | "add file {poi_location}", 63 | "add file {osm_location}", 64 | "add file {voronoi_location}", 65 | "use {db_name}" 66 | ] 67 | } -------------------------------------------------------------------------------- /hive_init_commands/initial_hive_commands_od.json: -------------------------------------------------------------------------------- 1 | { 2 | "hive_commands": [ 3 | "ADD JAR /hadoop/hive/lib/cdrmobilitylib.jar", 4 | "ADD JAR /hadoop/hive/lib/cdrmobilitylibge.jar", 5 | "ADD JAR /hadoop/hive/lib/cdrmobilitylibjica.jar", 6 | "ADD JAR /hadoop/hive/lib/jts-1.12.jar", 7 | "ADD JAR /hadoop/hive/lib/jtsio-1.12.jar", 8 | "ADD JAR /hadoop/hive/lib/javaml-0.1.6.jar", 9 | "ADD JAR /hadoop/hive/lib/weka.jar", 10 | "ADD JAR /hadoop/hive/lib/libsvm.jar", 11 | "ADD JAR /hadoop/hive/lib/ajt-2.5.jar", 12 | "ADD JAR /hadoop/hive/lib/Jama-1.0.2.jar", 13 | "create temporary function FindPolygon as 'com.apichon.mobility.hive.udf.FindPolygon'", 14 | "create temporary function FindPolygon2 as 'com.apichon.mobility.hive.udf.FindPolygon2'", 15 | "create temporary function CreateTrajectoriesCDR as 'com.apichon.mobility.hive.udf.ge.CreateTrajectoriesCDR'", 16 | "create temporary function TripSegmentationCDR as 'com.apichon.mobility.hive.udf.ge.TripSegmentationCDR'", 17 | "create temporary function CreateTrajectoriesJICAWithZone as 'com.apichon.mobility.hive.udf.jica.CreateTrajectoriesJICAWithZone'", 18 | "create temporary function TripOD as 'com.apichon.mobility.hive.udf.jica.TripOD'", 19 | "create temporary function TripODStay as 'com.apichon.mobility.hive.udf.jica.TripODStay'", 20 | "set hive.exec.dynamic.partition.mode=nonstrict", 21 | "set hive.exec.dynamic.partition=true", 22 | "set hive.exec.max.dynamic.partitions.pernode=4000", 23 | "set hive.exec.max.created.files=150000", 24 | "set hive.enforce.bucketing=true", 25 | "set mapred.map.tasks.speculative.execution=false", 26 | "set mapred.reduce.tasks.speculative.execution=false", 27 | "set hive.mapred.reduce.tasks.speculative.execution=false", 28 | "set hive.map.aggr=false", 29 | "set mapred.reduce.slowstart.completed.maps=0.98", 30 | "set mapred.job.reuse.jvm.num.tasks=50", 31 | "set mapred.child.java.opts=-Xmx8096m -XX:-UseGCOverheadLimit -XX:+UseConcMarkSweepGC", 32 | "set hive.tez.container.size=4096", 33 | "set hive.map.aggr.hash.percentmemory = 0.25", 34 | "ADD jar /hadoop/hive/lib/commons-pool-1.5.4.jar", 35 | "ADD jar /hadoop/hive/lib/commons-dbcp-1.4.jar", 36 | "ADD jar /hadoop/hive/lib/commons-lang-2.6.jar", 37 | "ADD jar /hadoop/hive/lib/commons-logging-1.1.3.jar", 38 | "ADD jar /hadoop/hive/lib/commons-math-2.1.jar", 39 | "ADD jar /hadoop/hive/lib/jts-1.13.jar", 40 | "ADD jar /hadoop/hive/lib/vecmath-1.3.2.jar", 41 | "create temporary function sp_distance as 'com.apichon.mobility.hive.udf.sp_distance'", 42 | "set hive.support.sql11.reserved.keywords=false", 43 | "use 
{db_name}" 44 | ] 45 | } -------------------------------------------------------------------------------- /hive_init_commands/initial_hive_commands_stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "hive_commands": [ 3 | "set hive.exec.dynamic.partition.mode=nonstrict", 4 | "set hive.exec.dynamic.partition=true", 5 | "set hive.exec.max.dynamic.partitions.pernode=4000", 6 | "set hive.exec.max.created.files=150000", 7 | "set hive.enforce.bucketing=true", 8 | "set mapred.map.tasks.speculative.execution=false", 9 | "set mapred.reduce.tasks.speculative.execution=false", 10 | "set hive.mapred.reduce.tasks.speculative.execution=false", 11 | "set hive.map.aggr=false", 12 | "set mapred.reduce.slowstart.completed.maps=0.98", 13 | "set mapred.job.reuse.jvm.num.tasks=50", 14 | "set hive.support.sql11.reserved.keywords=false", 15 | "use {db_name}" 16 | ] 17 | } -------------------------------------------------------------------------------- /lib/Jama-1.0.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/Jama-1.0.2.jar -------------------------------------------------------------------------------- /lib/ajt-2.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/ajt-2.5.jar -------------------------------------------------------------------------------- /lib/cdrinterpolationlib.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/cdrinterpolationlib.jar -------------------------------------------------------------------------------- /lib/cdrlibindicator.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/cdrlibindicator.jar -------------------------------------------------------------------------------- /lib/cdrmobilitylib.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/cdrmobilitylib.jar -------------------------------------------------------------------------------- /lib/cdrmobilitylibge.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/cdrmobilitylibge.jar -------------------------------------------------------------------------------- /lib/cdrmobilitylibjica.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/cdrmobilitylibjica.jar -------------------------------------------------------------------------------- /lib/commons-dbcp-1.4.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/commons-dbcp-1.4.jar -------------------------------------------------------------------------------- /lib/commons-lang-2.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/commons-lang-2.6.jar -------------------------------------------------------------------------------- /lib/commons-logging-1.1.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/commons-logging-1.1.3.jar -------------------------------------------------------------------------------- /lib/commons-math-2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/commons-math-2.1.jar -------------------------------------------------------------------------------- /lib/commons-pool-1.5.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/commons-pool-1.5.4.jar -------------------------------------------------------------------------------- /lib/gt-api-9.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-api-9.3.jar -------------------------------------------------------------------------------- /lib/gt-data-9.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-data-9.3.jar -------------------------------------------------------------------------------- /lib/gt-epsg-hsql-12.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-epsg-hsql-12.2.jar -------------------------------------------------------------------------------- /lib/gt-main-9.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-main-9.3.jar -------------------------------------------------------------------------------- /lib/gt-metadata-2.6.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-metadata-2.6.5.jar -------------------------------------------------------------------------------- /lib/gt-metadata-9.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-metadata-9.3.jar -------------------------------------------------------------------------------- 
/lib/gt-opengis-9.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-opengis-9.3.jar -------------------------------------------------------------------------------- /lib/gt-referencing-2.6.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-referencing-2.6.5.jar -------------------------------------------------------------------------------- /lib/gt-referencing-9.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-referencing-9.3.jar -------------------------------------------------------------------------------- /lib/gt-shapefile-2.6.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-shapefile-2.6.5.jar -------------------------------------------------------------------------------- /lib/jahmm-0.6.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/jahmm-0.6.2.jar -------------------------------------------------------------------------------- /lib/javaml-0.1.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/javaml-0.1.6.jar -------------------------------------------------------------------------------- /lib/jsr-275-1.0-beta-2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/jsr-275-1.0-beta-2.jar -------------------------------------------------------------------------------- /lib/jts-1.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/jts-1.12.jar -------------------------------------------------------------------------------- /lib/jts-1.13.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/jts-1.13.jar -------------------------------------------------------------------------------- /lib/jtsio-1.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/jtsio-1.12.jar -------------------------------------------------------------------------------- /lib/libsvm.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/libsvm.jar 
-------------------------------------------------------------------------------- /lib/pflow-hiveUDF.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/pflow-hiveUDF.jar -------------------------------------------------------------------------------- /lib/postgis-jdbc-2.1.0SVN.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/postgis-jdbc-2.1.0SVN.jar -------------------------------------------------------------------------------- /lib/postgresql-9.3-1102.jdbc4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/postgresql-9.3-1102.jdbc4.jar -------------------------------------------------------------------------------- /lib/vecmath-1.3.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/vecmath-1.3.2.jar -------------------------------------------------------------------------------- /lib/weka.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/weka.jar -------------------------------------------------------------------------------- /queries/cdr_and_mapping/create_consolidate_cdr.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_consolidate_data_all({arg_prep}) 2 | PARTITIONED BY (pdt string) ROW FORMAT DELIMITED 3 | FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/cdr_and_mapping/create_mapping_admin.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {provider_prefix}_cell_tower_data_{admin} 2 | ({admin}_id string, {admin}_name string, latitude string, longitude string) 3 | ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' 4 | STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/cdr_and_mapping/create_preprocess_cdr.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_preprocess({args}) ROW FORMAT DELIMITED 2 | FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/cdr_and_mapping/create_preprocess_mapping.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {provider_prefix}_cell_tower_data_preprocess({arg_create}) 2 | ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/cdr_and_mapping/create_raw_cdr.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE 
{provider_prefix}_raw ({arg_raw}) 2 | ROW FORMAT DELIMITED FIELDS TERMINATED BY "{field_delimiter}" 3 | LINES TERMINATED BY '\n' STORED AS TEXTFILE 4 | tblproperties ("skip.header.line.count"="{cell_tower_header}") -------------------------------------------------------------------------------- /queries/cdr_and_mapping/create_raw_mapping.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_cell_tower_data_raw ({arg_raw}) ROW FORMAT DELIMITED 2 | FIELDS TERMINATED BY "{field_delimiter}" LINES TERMINATED BY '\n' STORED AS TEXTFILE 3 | tblproperties ("skip.header.line.count"="{have_header}") -------------------------------------------------------------------------------- /queries/cdr_and_mapping/insert_consolidate_cdr.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO TABLE {provider_prefix}_consolidate_data_all 2 | PARTITION (pdt) select {arg_con}, to_date(call_time) as pdt 3 | from {provider_prefix}_preprocess 4 | 5 | -------------------------------------------------------------------------------- /queries/cdr_and_mapping/insert_consolidate_cdr_join.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO TABLE {provider_prefix}_consolidate_data_all 2 | PARTITION (pdt) select {arg_con}, to_date(a1.call_time) as pdt 3 | from {provider_prefix}_preprocess a1 join 4 | {provider_prefix}_cell_tower_data_preprocess a2 5 | on(a1.cell_id = a2.cell_id) 6 | 7 | -------------------------------------------------------------------------------- /queries/cdr_and_mapping/insert_mapping_admin.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_cell_tower_data_{admin} 2 | select row_number() OVER () - 1 as rowidx, {admin}, latitude, longitude 3 | from {provider_prefix}_cell_tower_data_preprocess where translate({admin},' ',' ') != '' 4 | {check_lat_lng} group by {admin}, latitude, longitude order by rowidx -------------------------------------------------------------------------------- /queries/cdr_and_mapping/insert_preprocess_cdr.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_preprocess 2 | select {distinct} {arg} from {provider_prefix}_raw 3 | -------------------------------------------------------------------------------- /queries/cdr_and_mapping/insert_preprocess_mapping.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO TABLE {provider_prefix}_cell_tower_data_preprocess 2 | select {distinct} {arg} from {provider_prefix}_cell_tower_data_raw -------------------------------------------------------------------------------- /queries/interpolation/create_cdr_by_uid.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_cdr_by_uid (uid string, arr ARRAY>) 2 | PARTITIONED BY (pdt string) 3 | ROW FORMAT DELIMITED 4 | FIELDS TERMINATED BY '\t' 5 | COLLECTION ITEMS TERMINATED BY ',' 6 | MAP KEYS TERMINATED BY '!' 
7 | LINES TERMINATED BY '\n' 8 | STORED AS ORC 9 | -------------------------------------------------------------------------------- /queries/interpolation/create_poi_relocation.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_cdr_by_uid_trip_realloc_array_apd (uid string, arr ARRAY>) 2 | PARTITIONED BY (pdt string) 3 | ROW FORMAT DELIMITED 4 | FIELDS TERMINATED BY '\t' 5 | COLLECTION ITEMS TERMINATED BY ',' 6 | MAP KEYS TERMINATED BY '!' 7 | LINES TERMINATED BY '\n' 8 | STORED AS ORC 9 | -------------------------------------------------------------------------------- /queries/interpolation/create_route_interpolation.sql: -------------------------------------------------------------------------------- 1 | create table {provider_prefix}_cdr_by_uid_trip_routing_array_apd 2 | (uid string, route_arr ARRAY>) 3 | partitioned by (pdt string) 4 | row format delimited fields terminated by '\t' 5 | collection items terminated by ',' 6 | map keys terminated by '!' 7 | lines terminated by '\n' 8 | stored as ORC 9 | 10 | -------------------------------------------------------------------------------- /queries/interpolation/create_trip_24_hr_padding.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_cdr_by_uid_trip_organized_array_apd (uid string, arr ARRAY>) 2 | PARTITIONED BY (pdt string) 3 | ROW FORMAT DELIMITED 4 | FIELDS TERMINATED BY '\t' 5 | COLLECTION ITEMS TERMINATED BY ',' 6 | MAP KEYS TERMINATED BY '!' 7 | LINES TERMINATED BY '\n' 8 | STORED AS ORC 9 | -------------------------------------------------------------------------------- /queries/interpolation/create_trip_format.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_cdr_by_uid_trip (uid string, arr ARRAY>) 2 | PARTITIONED BY (pdt string) 3 | ROW FORMAT DELIMITED 4 | FIELDS TERMINATED BY '\t' 5 | COLLECTION ITEMS TERMINATED BY ',' 6 | MAP KEYS TERMINATED BY '!' 
7 | LINES TERMINATED BY '\n' 8 | STORED AS ORC 9 | -------------------------------------------------------------------------------- /queries/interpolation/export_to_gps_format.sql: -------------------------------------------------------------------------------- 1 | insert overwrite local directory '/tmp/hive/csv_interpolation' 2 | ROW FORMAT DELIMITED 3 | FIELDS TERMINATED BY ',' 4 | select CONCAT_WS(',',uid,trip_seq, mobilitytype, mode, totaldistance, totaltime, starttime,endtime, totalpoints, regexp_replace(m,'\\\|',',')) 5 | from (select uid ,m[1] as trip_seq,m[2] as mobilitytype,m[3] as mode,m[4] totaldistance ,m[5] as 6 | totaltime,m[6] as starttime,m[7] as endtime,m[8] as totalpoints,m[9] as pointlist 7 | from (select * from {provider_prefix}_cdr_by_uid_trip_routing_array_apd where size(route_arr)>1) t1 LATERAL 8 | VIEW explode(t1.route_arr) myTable1 AS m) t1 LATERAL VIEW explode(split(t1.pointlist,'\;')) myTable1 AS m 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /queries/interpolation/insert_cdr_by_uid.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_cdr_by_uid PARTITION(pdt) 2 | select uid as uid, CreateTrajectoriesCDR(time, duration,cell_id,longitude,latitude) as arr, pdate as pdt 3 | from cdr_test_interpolation group by pdate, uid 4 | having count(*) <= {max_size_cdr_by_uid} -------------------------------------------------------------------------------- /queries/interpolation/insert_poi_relocation.sql: -------------------------------------------------------------------------------- 1 | insert overwrite table {provider_prefix}_cdr_by_uid_trip_realloc_array_apd partition (pdt) 2 | select uid,f_reallocation(uid,arr,pdt, "{poi}"),pdt 3 | from {provider_prefix}_cdr_by_uid_trip_organized_array_apd -------------------------------------------------------------------------------- /queries/interpolation/insert_route_interpolation.sql: -------------------------------------------------------------------------------- 1 | insert overwrite table {provider_prefix}_cdr_by_uid_trip_routing_array_apd partition (pdt) 2 | select uid,f_routing(uid,arr,"{osm}","{voronoi}"), pdt 3 | from {provider_prefix}_cdr_by_uid_trip_realloc_array_apd 4 | where (size(arr)>0 and size(arr)<={max_size_interpolation}) 5 | -------------------------------------------------------------------------------- /queries/interpolation/insert_trip_24_hr_padding.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_cdr_by_uid_trip_organized_array_apd PARTITION(pdt) 2 | select uid, f_organizearray(uid,arr) as arr,pdt 3 | from {provider_prefix}_cdr_by_uid_trip where size(arr) > 0 4 | -------------------------------------------------------------------------------- /queries/interpolation/insert_trip_format.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_cdr_by_uid_trip PARTITION(pdt) 2 | select uid, TripSegmentationCDR(arr,uid) as arr,pdt from {provider_prefix}_cdr_by_uid 3 | -------------------------------------------------------------------------------- /queries/origin_destination/create_la_cdr_all_with_ant_zone_by_uid.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid(uid string, arr ARRAY < ARRAY < string >>) PARTITIONED 2 | BY(pdt 
string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY ',' 3 | MAP KEYS TERMINATED BY '!' LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/origin_destination/create_la_cdr_all_with_ant_zone_by_uid_od.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od 2 | (uid string, home_site_id string,home_zone string, arr ARRAY<ARRAY<string>>) 3 | PARTITIONED BY (pdt string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 4 | COLLECTION ITEMS TERMINATED BY ',' MAP KEYS TERMINATED BY '!' 5 | LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE 6 | 7 | -------------------------------------------------------------------------------- /queries/origin_destination/create_la_cdr_all_with_ant_zone_by_uid_od_detail.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail 2 | (uid string, home_site_id string,home_zone string, arr ARRAY<string>) 3 | PARTITIONED BY (pdt string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 4 | COLLECTION ITEMS TERMINATED BY ',' MAP KEYS TERMINATED BY '!' 5 | LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/origin_destination/create_la_cdr_all_with_ant_zone_by_uid_od_sum.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum 2 | (origin string,destination string,tcount double, tusercount double) 3 | PARTITIONED BY (pdt string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 4 | COLLECTION ITEMS TERMINATED BY ',' MAP KEYS TERMINATED BY '!' 5 | LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/origin_destination/create_la_cdr_uid_home.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_la_cdr_uid_home (uid string, site_id string, tcount 2 | int, trank int, ppercent double, LONGITUDE string, LATITUDE string, {admin_params}) 3 | ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY ',' 4 | MAP KEYS TERMINATED BY '!' 
LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /queries/origin_destination/insert_la_cdr_all_with_ant_zone_by_old_consolidate.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid PARTITION (pdt) 2 | select a1.uid, CreateTrajectoriesJICAWithZone (a1.uid, call_time, duration, a2.longitude, a2.latitude, a1.cell_id, a3.{target_admin}_id) 3 | as arr, pdt from {provider_prefix}_consolidate_data_all a1 join {provider_prefix}_cell_tower_data_preprocess a2 4 | on (a1.cell_id = a2.cell_id) 5 | join {provider_prefix}_cell_tower_data_{target_admin} a3 on (a2.latitude = a3.latitude and a2.longitude = a3.longitude) 6 | where to_date(pdt) = "{od_date}" group by a1.uid, pdt 7 | -------------------------------------------------------------------------------- /queries/origin_destination/insert_la_cdr_all_with_ant_zone_by_uid.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid PARTITION (pdt) 2 | select a1.uid, CreateTrajectoriesJICAWithZone 3 | (a1.uid, call_time, duration, a1.longitude, a1.latitude, concat(a1.latitude, ' : ', a1.longitude), a2.{target_admin}_id) 4 | as arr, pdt from {provider_prefix}_consolidate_data_all a1 5 | join {provider_prefix}_cell_tower_data_{target_admin} a2 on (a1.latitude = a2.latitude and a1.longitude = a2.longitude) 6 | where to_date(pdt) = "{od_date}" group by a1.uid, pdt -------------------------------------------------------------------------------- /queries/origin_destination/insert_la_cdr_all_with_ant_zone_by_uid_od.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od PARTITION (pdt) 2 | select t1.uid,t2.site_id as home_site_id, t2.{target_unit}_id as home_zone, 3 | TripOD(arr, t1.uid, t2.site_id, t2.{target_unit}_id, t2.LONGITUDE, t2.LATITUDE), pdt 4 | from {provider_prefix}_la_cdr_all_with_ant_zone_by_uid t1 5 | inner join {provider_prefix}_la_cdr_uid_home t2 on t1.uid = t2.uid 6 | where size(arr) <= 500 7 | -------------------------------------------------------------------------------- /queries/origin_destination/insert_la_cdr_all_with_ant_zone_by_uid_od_detail.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail PARTITION (pdt) 2 | select uid ,home_site_id,home_zone,m as arr,pdt 3 | from {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od t1 4 | LATERAL VIEW explode(t1.arr) myTable1 AS m -------------------------------------------------------------------------------- /queries/origin_destination/insert_la_cdr_all_with_ant_zone_by_uid_od_sum.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum PARTITION(pdt) 2 | select arr[2] as origin, arr[3] as destination, count(*) as tcount, count(distinct uid) as tusercount, pdt 3 | from {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail where ((arr[2] != '-1' and arr[3] != '-1' ) ) 4 | group by pdt,arr[2],arr[3] -------------------------------------------------------------------------------- /queries/origin_destination/insert_la_cdr_uid_home.sql: 
-------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_la_cdr_uid_home 2 | select uid, unique_location, tcount, trank, ppercent, latitude, longitude, admin1_id 3 | from {provider_prefix}_frequent_locations where trank = 1 -------------------------------------------------------------------------------- /queries/origin_destination/od_to_csv.sql: -------------------------------------------------------------------------------- 1 | insert overwrite local directory '/tmp/hive/od_result' 2 | ROW FORMAT DELIMITED 3 | FIELDS TERMINATED BY '\t' 4 | select CONCAT_WS('\t',pdt,origin , 5 | destination,cast(tcount as string),cast(tusercount as string)) 6 | from {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum t1 -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_average_cdrs/daily_average_cdrs.sql: -------------------------------------------------------------------------------- 1 | select date, total_records/total_uids as daily_average_cdr 2 | from(select to_date(call_time) as date, count(distinct uid) as total_uids, 3 | count(*) as total_records from {provider_prefix}_consolidate_data_all a1 4 | group by to_date(call_time) order by date)td1 -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_average_unique_locations/daily_average_unique_locations.sql: -------------------------------------------------------------------------------- 1 | select date, unique_locations/unique_users as daily_avg_locations, unique_cell_ids/unique_users as daily_avg_cell_ids 2 | from (select to_date(call_time) as date, count(distinct latitude, longitude) as unique_locations, 3 | count(distinct uid) as unique_users, count(distinct cell_id) as unique_cell_ids 4 | from {provider_prefix}_consolidate_data_all group by to_date(call_time) order by date) td1 -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_average_unique_locations/daily_average_unique_locations_old_consolidate.sql: -------------------------------------------------------------------------------- 1 | select date, unique_locations/unique_users as daily_avg_locations, unique_cell_ids/unique_users as daily_avg_cell_ids 2 | from (select to_date(call_time) as date, count(distinct a2.latitude, a2.longitude) as unique_locations, 3 | count(distinct a1.uid) as unique_users, count(distinct a1.cell_id) as unique_cell_ids 4 | from {provider_prefix}_consolidate_data_all a1 join {provider_prefix}_cell_tower_data_preprocess a2 5 | on(a1.cell_id = a2.cell_id) group by to_date(call_time) order by date) td1 -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_cdrs/total_daily_cdrs.sql: -------------------------------------------------------------------------------- 1 | select to_date(call_time) as date, count(*) as total_records 2 | from {provider_prefix}_consolidate_data_all group by to_date(call_time) 3 | order by date -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_cdrs_by_call_type/daily_cdrs_by_call_type.sql: -------------------------------------------------------------------------------- 1 | SELECT to_date(call_time) as date, 'ALL' as call_type, 'ALL' as network_type, COUNT(*) as total_records, 2 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 3 | 
COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all where to_date(pdt) 4 | between to_date('{start_date}') and to_date('{end_date}') GROUP BY to_date(call_time) 5 | UNION 6 | SELECT to_date(call_time) as date, call_type, 'ALL' as network_type, COUNT(*) as total_records, 7 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 8 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all where to_date(pdt) 9 | between to_date('{start_date}') and to_date('{end_date}') GROUP BY to_date(call_time), call_type 10 | ORDER BY call_type ASC, network_type DESC 11 | -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_unique_locations/daily_unique_locations.sql: -------------------------------------------------------------------------------- 1 | select to_date(call_time) as date, count(distinct latitude, longitude) as unique_locations 2 | from {provider_prefix}_consolidate_data_all group by to_date(call_time) order by date -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_unique_locations/daily_unique_locations_old_consolidate.sql: -------------------------------------------------------------------------------- 1 | select to_date(call_time) as date, count(distinct a2.latitude, a2.longitude) as unique_locations 2 | from {provider_prefix}_consolidate_data_all a1 join {provider_prefix}_cell_tower_data_preprocess a2 3 | on(a1.cell_id = a2.cell_id) group by to_date(call_time) order by date -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_unique_locations/total_unique_locations.sql: -------------------------------------------------------------------------------- 1 | select count(*) as count_unique_locations from (select distinct latitude, longitude 2 | from {provider_prefix}_consolidate_data_all) td -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_unique_locations/total_unique_locations_old_consolidate.sql: -------------------------------------------------------------------------------- 1 | select count(*) as count_unique_locations from (select distinct a2.latitude, a2.longitude 2 | from {provider_prefix}_consolidate_data_all a1 3 | join {provider_prefix}_cell_tower_data_preprocess a2 4 | on(a1.cell_id = a2.cell_id)) td -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_unique_users/total_daily_uids.sql: -------------------------------------------------------------------------------- 1 | select to_date(call_time) as date, count(distinct uid) as total_users 2 | from {provider_prefix}_consolidate_data_all group by to_date(call_time) order by date -------------------------------------------------------------------------------- /queries/statistics/graphs/date_histogram/histogram.sql: -------------------------------------------------------------------------------- 1 | select explode(histogram_numeric(active_days, 10)) as active_day_bins from 2 | (select count(*) as active_days, td.uid from (select year(to_date(call_time)) as year, 3 | month(to_date(call_time)) as month, day(to_date(call_time)) as day, uid 4 | from {provider_prefix}_consolidate_data_all group by uid, year(to_date(call_time)), 5 | month(to_date(call_time)), day(to_date(call_time)) order by year, month, day, uid) td 6 
| group by td.uid) td2 -------------------------------------------------------------------------------- /queries/statistics/reports/all_statistics/data_statistics.sql: -------------------------------------------------------------------------------- 1 | select count(*) as total_records, count(distinct to_date(call_time)) as total_days, 2 | count(distinct uid) as unique_id, {imei} {imsi} count(distinct cell_id) as unique_location_name, 3 | min(to_date(call_time)) as start_date, max(to_date(call_time)) as end_date from {provider_prefix}_consolidate_data_all -------------------------------------------------------------------------------- /queries/statistics/reports/daily_statistics/daily_statistics.sql: -------------------------------------------------------------------------------- 1 | SELECT to_date(call_time) as date, 'ALL' as call_type, 'ALL' as network_type, COUNT(*) as total_records, 2 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 3 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all where to_date(pdt) 4 | between to_date('{start_date}') and to_date('{end_date}') GROUP BY to_date(call_time) 5 | UNION 6 | SELECT to_date(call_time) as date, call_type, 'ALL' as network_type, COUNT(*) as total_records, 7 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 8 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all where to_date(pdt) 9 | between to_date('{start_date}') and to_date('{end_date}') GROUP BY to_date(call_time), call_type 10 | UNION 11 | SELECT to_date(call_time) as date, 'ALL' as call_type, network_type, COUNT(*) as total_records, 12 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 13 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all where to_date(pdt) 14 | between to_date('{start_date}') and to_date('{end_date}') GROUP BY to_date(call_time), network_type 15 | UNION 16 | SELECT to_date(call_time) as date, call_type, network_type, COUNT(*) as total_records, 17 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 18 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all where to_date(pdt) 19 | between to_date('{start_date}') and to_date('{end_date}') GROUP BY to_date(call_time), call_type, network_type 20 | ORDER BY date ASC, call_type ASC, network_type DESC 21 | -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/create_frequent_locations.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_frequent_locations (uid string, tcount int,trank int,ppercent double, 2 | unique_location string, latitude string, longitude string, {admin_params}) 3 | ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY ',' 4 | MAP KEYS TERMINATED BY '!' 
LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/create_frequent_locations_night.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_frequent_locations_night (uid string, tcount int,trank int,ppercent double, 2 | unique_location string, latitude string, longitude string, {admin_params}) 3 | ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY ',' 4 | MAP KEYS TERMINATED BY '!' LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/frequent_locations.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO TABLE {provider_prefix}_frequent_locations SELECT a1.uid, 2 | count(a1.uid) as tcount, ROW_NUMBER() OVER(PARTITION BY a1.uid order by count(a1.uid) DESC) as rank, 3 | count(a1.uid)/SUM(count(a1.uid)) OVER(partition by a1.uid) * 100 as ppercent 4 | , concat(a1.latitude, ' : ', a1.longitude) as unique_location, a1.latitude, a1.longitude, 5 | a2.{admin_params} from {provider_prefix}_consolidate_data_all a1 6 | JOIN {provider_prefix}_cell_tower_data_{admin} a2 on(a1.latitude = a2.latitude and a1.longitude = a2.longitude) 7 | group by a1.uid, concat(a2.latitude, ' : ', a2.longitude), a1.latitude, a1.longitude, a2.{admin_params} 8 | order by a1.uid, rank -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/frequent_locations_night.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO TABLE {provider_prefix}_frequent_locations_night SELECT a1.uid, 2 | count(a1.uid) as tcount, ROW_NUMBER() OVER(PARTITION BY a1.uid order by count(a1.uid) DESC) as rank, 3 | count(a1.uid)/SUM(count(a1.uid)) OVER(partition by a1.uid) * 100 as ppercent 4 | , concat(a1.latitude, ' : ', a1.longitude) as unique_location, a1.latitude, a1.longitude, 5 | a2.{admin_params} from {provider_prefix}_consolidate_data_all a1 6 | JOIN {provider_prefix}_cell_tower_data_{admin} a2 on(a1.latitude = a2.latitude and a1.longitude = a2.longitude) 7 | where hour(a1.call_time) in (0,1,2,3,4,5,6,7,20,21,22,23) 8 | group by a1.uid, concat(a2.latitude, ' : ', a2.longitude), a1.latitude, a1.longitude, a2.{admin_params} 9 | order by a1.uid, rank ASC -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/frequent_locations_night_old_consolidate.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO TABLE {provider_prefix}_frequent_locations_night SELECT a1.uid, a2.cell_id, 2 | count(a1.uid) as tcount, ROW_NUMBER() OVER(PARTITION BY a1.uid, a2.cell_id order by count(a1.uid) DESC) as rank, 3 | count(a1.uid)/SUM(count(a1.uid)) OVER(partition by a1.uid, a2.cell_id) * 100 as percentage 4 | , a2.longitude, a2.latitude, a3.{admin_params} from {provider_prefix}_consolidate_data_all a1 5 | JOIN {provider_prefix}_cell_tower_data_preprocess a2 ON(a1.cell_id = a2.cell_id) 6 | JOIN {provider_prefix}_cell_tower_data_{admin} a3 on(a2.latitude = a3.latitude and a2.longitude = a3.longitude) 7 | where hour(a1.call_time) in (0,1,2,3,4,5,6,7,20,21,22,23) group by a1.uid, a2.latitude, a2.longitude , a2.cell_id, a3.{admin_params} 8 | order by a1.uid, 
rank ASC -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/frequent_locations_old_consolidate.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO TABLE {provider_prefix}_frequent_locations SELECT a1.uid, a2.cell_id, 2 | count(a1.uid) as tcount, ROW_NUMBER() OVER(PARTITION BY a1.uid, a2.cell_id order by count(a1.uid) DESC) as rank, 3 | count(a1.uid)/SUM(count(a1.uid)) OVER(partition by a1.uid, a2.cell_id) * 100 as percentage 4 | , a2.longitude, a2.latitude, a3.{admin_params} from {provider_prefix}_consolidate_data_all a1 5 | JOIN {provider_prefix}_cell_tower_data_preprocess a2 ON(a1.cell_id = a2.cell_id) 6 | JOIN {provider_prefix}_cell_tower_data_{admin} a3 on(a2.latitude = a3.latitude and a2.longitude = a3.longitude) 7 | group by a1.uid, a2.latitude, a2.longitude , a2.cell_id, a3.{admin_params} order by a1.uid, rank ASC 8 | -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/frequent_locations_thresholded.sql: -------------------------------------------------------------------------------- 1 | create table {provider_prefix}_frequent_locations_thresholded as select td3.uid as uid, td3.tcount 2 | as tcount, td3.trank as trank, td3.ppercent as ppercent, td3.unique_location as unique_location, 3 | td3.longitude as longitude, td3.latitude as latitude, 4 | td3.{admin}_id as {admin}_id, td3.acc_wsum as acc_wsum, td3.min_acc_wsum as min_acc_wsum from 5 | (select a1.uid as uid, a1.tcount as tcount, a1.trank as trank, 6 | a1.ppercent as ppercent, a1.unique_location as unique_location, 7 | a1.longitude as longitude, a1.latitude as latitude, 8 | a1.{admin}_id as {admin}_id, a1.acc_wsum as acc_wsum, td2.min_acc_wsum as min_acc_wsum 9 | from {provider_prefix}_freq_with_acc_wsum a1 10 | join (select td.uid as uid, min(td.acc_wsum) as min_acc_wsum from ( 11 | select uid, acc_wsum from {provider_prefix}_freq_with_acc_wsum 12 | where acc_wsum >= {threshold} group by uid, acc_wsum) td group by td.uid) td2 13 | on (a1.uid = td2.uid)) td3 where acc_wsum <= min_acc_wsum -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/frequent_locations_thresholded_night.sql: -------------------------------------------------------------------------------- 1 | create table {provider_prefix}_frequent_locations_thresholded_night as select td3.uid as uid, td3.tcount 2 | as tcount, td3.trank as trank, td3.ppercent as ppercent, td3.unique_location as unique_location, 3 | td3.longitude as longitude, td3.latitude as latitude, 4 | td3.{admin}_id as {admin}_id, td3.acc_wsum as acc_wsum, td3.min_acc_wsum as min_acc_wsum from 5 | (select a1.uid as uid, a1.tcount as tcount, a1.trank as trank, 6 | a1.ppercent as ppercent, a1.unique_location as unique_location, 7 | a1.longitude as longitude, a1.latitude as latitude, 8 | a1.{admin}_id as {admin}_id, a1.acc_wsum as acc_wsum, td2.min_acc_wsum as min_acc_wsum 9 | from {provider_prefix}_freq_with_acc_wsum_night a1 10 | join (select td.uid as uid, min(td.acc_wsum) as min_acc_wsum from ( 11 | select uid, acc_wsum from {provider_prefix}_freq_with_acc_wsum_night 12 | where acc_wsum >= {threshold} group by uid, acc_wsum) td group by td.uid) td2 13 | on (a1.uid = td2.uid)) td3 where acc_wsum <= min_acc_wsum -------------------------------------------------------------------------------- 
/queries/statistics/reports/frequent_locations/frequent_locations_wsum.sql: -------------------------------------------------------------------------------- 1 | CREATE table {provider_prefix}_freq_with_acc_wsum as select uid, tcount, 2 | trank, ppercent, unique_location, longitude, latitude , {admin}_id, 3 | sum(ppercent) over (partition by uid order by trank asc) 4 | as acc_wsum from {provider_prefix}_frequent_locations 5 | order by uid, trank -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/frequent_locations_wsum_night.sql: -------------------------------------------------------------------------------- 1 | CREATE table {provider_prefix}_freq_with_acc_wsum_night as select uid, tcount, 2 | trank, ppercent, unique_location, longitude, latitude , {admin}_id, 3 | sum(ppercent) over (partition by uid order by trank asc) 4 | as acc_wsum from {provider_prefix}_frequent_locations_night 5 | order by uid, trank -------------------------------------------------------------------------------- /queries/statistics/reports/monthly_statistics/monthly_statistics.sql: -------------------------------------------------------------------------------- 1 | SELECT YEAR(call_time) as year, MONTH(call_time) as month , 'ALL' as call_type, 'ALL' as network_type, COUNT(*) as total_records, 2 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 3 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all where 4 | (year(pdt) between {start_year} and {end_year}) and (MONTH(pdt) between {start_month} and {end_month}) 5 | GROUP BY YEAR(call_time), MONTH(call_time) 6 | UNION 7 | SELECT YEAR(call_time) as year, MONTH(call_time) as month, call_type, 'ALL' as network_type, COUNT(*) as total_records, 8 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 9 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all 10 | where (year(pdt) between {start_year} and {end_year}) and (MONTH(pdt) between {start_month} and {end_month}) 11 | GROUP BY YEAR(call_time), MONTH(call_time), call_type 12 | UNION 13 | SELECT YEAR(call_time) as year, MONTH(call_time) as month, 'ALL' as call_type, network_type, COUNT(*) as total_records, 14 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 15 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all where (year(pdt) between {start_year} and {end_year}) 16 | and (MONTH(pdt) between {start_month} and {end_month}) GROUP BY YEAR(call_time), MONTH(call_time), network_type 17 | UNION 18 | SELECT YEAR(call_time) as year, MONTH(call_time) as month , call_type, network_type, COUNT(*) as total_records, 19 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 20 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all 21 | where (year(pdt) between {start_year} and {end_year}) and (MONTH(pdt) between {start_month} and {end_month}) 22 | GROUP BY YEAR(call_time), MONTH(call_time), call_type, network_type ORDER BY year ASC, month ASC, call_type ASC, network_type DESC 23 | -------------------------------------------------------------------------------- /queries/statistics/reports/summary/average_daily_admin1.sql: 
-------------------------------------------------------------------------------- 1 | select sum(td.count)/{total_days} as average_{level}_per_day 2 | from ( select count(distinct a1.{level}) as count from {provider_prefix}_cell_tower_data_preprocess a1 3 | JOIN {provider_prefix}_consolidate_data_all a2 4 | on (a1.cell_id = a2.cell_id) group by to_date(call_time))td -------------------------------------------------------------------------------- /queries/statistics/reports/summary/average_daily_sms.sql: -------------------------------------------------------------------------------- 1 | select count(*)/{total_days} as average_daily_sms 2 | from {provider_prefix}_consolidate_data_all 3 | where call_type = 'SMS' -------------------------------------------------------------------------------- /queries/statistics/reports/summary/average_daily_voice.sql: -------------------------------------------------------------------------------- 1 | select count(*)/{total_days} as average_daily_voice from {provider_prefix}_consolidate_data_all where call_type = 'VOICE' -------------------------------------------------------------------------------- /queries/statistics/reports/summary/average_unique_cell_ids.sql: -------------------------------------------------------------------------------- 1 | select sum(td.count)/{total_days} as average_daily_unique_cell_id from 2 | (select count(distinct cell_id) as count from {provider_prefix}_consolidate_data_all 3 | group by to_date(call_time)) td 4 | -------------------------------------------------------------------------------- /queries/statistics/reports/summary/total_days.sql: -------------------------------------------------------------------------------- 1 | select count(*) as total_days, min(dates) as start_date, max(dates) 2 | as end_date from (select to_date(call_time) as dates from 3 | {provider_prefix}_consolidate_data_all group by to_date(call_time)) td -------------------------------------------------------------------------------- /queries/statistics/reports/zone_population/zone_population.sql: -------------------------------------------------------------------------------- 1 | select lv as {level}, sum(td.count) as count_activities, count(td.uid) as count_unique_ids from 2 | (select a1.{level} as lv, count(a1.{level}) as count, a2.uid as uid 3 | from {provider_prefix}_cell_tower_data_preprocess a1 JOIN {provider_prefix}_consolidate_data_all a2 4 | on (a1.cell_id = a2.cell_id) group by a1.{level}, a2.uid) td group by lv -------------------------------------------------------------------------------- /queries/statistics/total_records.sql: -------------------------------------------------------------------------------- 1 | select count(*) as total_records from {provider_prefix}_consolidate_data_all -------------------------------------------------------------------------------- /queries/statistics/total_unique_uids.sql: -------------------------------------------------------------------------------- 1 | select count(*) as total_uids from 2 | (select distinct uid from {provider_prefix}_consolidate_data_all) td -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bit-array==0.1.0 2 | bitarray==1.2.1 3 | cycler==0.10.0 4 | impyla==0.16.2 5 | kiwisolver==1.1.0 6 | matplotlib==3.2.0 7 | numpy==1.18.1 8 | pandas==1.0.1 9 | ply==3.11 10 | pure-sasl==0.6.2 11 | pyparsing==2.4.6 12 | python-dateutil==2.8.1 13 | pytz==2019.3 14 | 
six==1.14.0 15 | thrift==0.13.0 16 | thrift-sasl==0.4.1 17 | thriftpy==0.3.9 18 | thriftpy2==0.4.11 -------------------------------------------------------------------------------- /run_interpolation.py: -------------------------------------------------------------------------------- 1 | from Common.config_object import Config 2 | from Common.cdr_interpolation import Interpolation 3 | from Common.hive_create_tables import HiveTableCreator 4 | from Common.hive_connection import HiveConnection 5 | from Common.helper import format_two_point_time 6 | import argparse 7 | import time 8 | 9 | 10 | def main(): 11 | # argument parser 12 | start = time.time() 13 | parser = argparse.ArgumentParser(description='Argument indicating the configuration file') 14 | 15 | # add configuration argument 16 | parser.add_argument("-c", "--config", help="path to the configuration file used to process the CDR data" 17 | " \n ex. python run_interpolation.py -c config.json", 18 | action="store") 19 | 20 | # parse config to args.config 21 | args = parser.parse_args() 22 | 23 | config = Config(args.config) 24 | hc = HiveConnection(host=config.host, port=config.port, user=config.user) 25 | 26 | # initialize hive and create tables 27 | table_creator = HiveTableCreator(config) 28 | table_creator.initialize('hive_init_commands/initial_hive_commands_interpolation.json') # mandatory (init hive) 29 | 30 | # init interpolation generators 31 | it = Interpolation(config) 32 | 33 | # interpolation 34 | it.calculate_interpolation() 35 | 36 | print('Overall time elapsed: {} seconds'.format(format_two_point_time(start, time.time()))) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /run_origin_destination.py: -------------------------------------------------------------------------------- 1 | from Common.config_object import Config 2 | from Common.cdr_origin_destination import OriginDestination 3 | from Common.hive_create_tables import HiveTableCreator 4 | from Common.hive_connection import HiveConnection 5 | from Common.helper import format_two_point_time 6 | import argparse 7 | import time 8 | 9 | 10 | def main(): 11 | # argument parser 12 | start = time.time() 13 | parser = argparse.ArgumentParser(description='Argument indicating the configuration file') 14 | 15 | # add configuration argument 16 | parser.add_argument("-c", "--config", help="path to the configuration file used to process the CDR data" 17 | " \n ex. 
python run_origin_destination.py -c config.json", 18 | action="store") 19 | 20 | # parse config to args.config 21 | args = parser.parse_args() 22 | 23 | config = Config(args.config) 24 | HiveConnection(host=config.host, port=config.port, user=config.user) 25 | 26 | # initialize hive and create tables 27 | table_creator = HiveTableCreator(config) 28 | table_creator.initialize('hive_init_commands/initial_hive_commands_od.json') # mandatory (init hive) 29 | 30 | # init od generator 31 | od = OriginDestination(config) 32 | 33 | # origin destination 34 | od.calculate_od() # Require rank1_frequent_locations 35 | 36 | print('Overall time elapsed: {} seconds'.format(format_two_point_time(start, time.time()))) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /run_prepare_cdr_and_mapping.py: -------------------------------------------------------------------------------- 1 | from Common.config_object import Config 2 | from Common.hive_create_tables import HiveTableCreator 3 | from Common.hive_connection import HiveConnection 4 | from Common.helper import extract_mapping_data, format_two_point_time 5 | from Common.cdr_data import CDRData 6 | import argparse 7 | import time 8 | 9 | 10 | def main(): 11 | # argument parser 12 | start = time.time() 13 | parser = argparse.ArgumentParser(description='Argument indicating the configuration file') 14 | 15 | # add configuration argument 16 | parser.add_argument("-c", "--config", help="path to the configuration file used to process the CDR data" 17 | " \n ex. python run_prepare_cdr_and_mapping.py -c config.json", 18 | action="store") 19 | 20 | # parse config to args.config 21 | args = parser.parse_args() 22 | 23 | config = Config(args.config) 24 | HiveConnection(host=config.host, port=config.port, user=config.user) 25 | cdr_data = CDRData() 26 | extract_mapping_data(config, cdr_data) 27 | 28 | # initialize hive and create tables 29 | 30 | table_creator = HiveTableCreator(config, cdr_data) 31 | table_creator.initialize('hive_init_commands/initial_hive_commands_stats.json') # init hive 32 | table_creator.create_tables() 33 | 34 | print('Overall time elapsed: {} seconds'.format(format_two_point_time(start, time.time()))) 35 | 36 | 37 | if __name__ == '__main__': 38 | main() 39 | -------------------------------------------------------------------------------- /run_statistics.py: -------------------------------------------------------------------------------- 1 | from Common.config_object import Config 2 | from Common.cdr_statistics import Statistics 3 | from Common.hive_create_tables import HiveTableCreator 4 | from Common.hive_connection import HiveConnection 5 | from Common.helper import format_two_point_time 6 | import argparse 7 | import time 8 | 9 | 10 | def main(): 11 | # argument parser 12 | start = time.time() 13 | parser = argparse.ArgumentParser(description='Argument indicating the configuration file') 14 | 15 | # add configuration argument 16 | parser.add_argument("-c", "--config", help="path to the configuration file used to process the CDR data" 17 | " \n ex. 
python run_statistics.py -c config.json", 18 | action="store") 19 | 20 | # parse config to args.config 21 | args = parser.parse_args() 22 | 23 | config = Config(args.config) 24 | HiveConnection(host=config.host, port=config.port, user=config.user) 25 | 26 | table_creator = HiveTableCreator(config) 27 | table_creator.initialize('hive_init_commands/initial_hive_commands_stats.json') # mandatory (init hive) 28 | 29 | # init stats generators 30 | st = Statistics(config) 31 | 32 | # user section here 33 | # reports 34 | st.calculate_data_statistics() 35 | st.calculate_daily_statistics() 36 | st.calculate_monthly_statistics() 37 | st.calculate_zone_population() 38 | st.calculate_summary() 39 | st.calculate_user_date_histogram() 40 | # graphs 41 | st.daily_cdrs() 42 | st.daily_unique_users() 43 | st.daily_unique_locations() 44 | st.daily_average_cdrs() 45 | st.daily_unique_average_locations() 46 | 47 | # frequent locations (Report) 48 | st.frequent_locations() 49 | st.frequent_locations_night() 50 | 51 | # Prerequisite for Origin-Destination; comment this line out if OD is not needed 52 | st.rank1_frequent_locations() # Require frequent_locations() in run_statistics.py 53 | 54 | print('Overall time elapsed: {} seconds'.format(format_two_point_time(start, time.time()))) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() -------------------------------------------------------------------------------- /sample_configs/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "hadoop_data_path":"/disk2/data/Chaichan/", 3 | "provider_prefix":"DTAC3", 4 | "db_name" : "cdrproject", 5 | 6 | "input_delimiter":",", 7 | "input_files" :["ais.csv"], 8 | "_comment": ["if time format is well formed (like yyyy/mm/dd or dd-mm-yyyy and colons(:) separating time, then leave it blank", 9 | "if the time format has no separator then indicate it (ex. 
yyyyMMdd (can't be automatically recognized) hh:mm:ss)"], 10 | "input_file_time_format": "yyyyMMdd hh:mm:ss", 11 | "input_file_have_header_comment_": "put 1 if there is a header row, otherwise 0", 12 | "input_file_have_header": 1, 13 | 14 | "input_cell_tower_files" : ["cdr_cell_tower.csv"], 15 | "input_cell_tower_delimiter":",", 16 | "input_cell_tower_have_header": 1, 17 | 18 | "check_duplicate_comment_": "will check duplicate in the cdr raw datafile and cell tower file", 19 | "check_duplicate": true, 20 | "check_invalid_lat_lng_comment_": "filter invalid lat_lng (both 0 or one of it is null)", 21 | "check_invalid_lat_lng": true, 22 | "host": "hadoopmaster.apichon.com", 23 | "port": 10000, 24 | "user": "rsstudent", 25 | 26 | "frequent_locations_percentage": 80, 27 | 28 | "output_report_location": "output_reports_small_latest", 29 | "output_graph_location": "graphical_reports_small_latest", 30 | 31 | "od_admin_unit": "ADMIN1", 32 | "od_date": "2016-05-01", 33 | 34 | "interpolation_poi_file_location": "/hadoop/hive/data/bangladesh/bangladesh.landscan2010_poi.tsv", 35 | "interpolation_osm_file_location": "/hadoop/hive/data/bangladesh/bangladesh.osm_road_with_waterway.tsv", 36 | "interpolation_voronoi_file_location": "/hadoop/hive/data/bangladesh/gp_voronoi.tsv", 37 | "max_size_cdr_by_uid": 500, 38 | "max_size_interpolation": 50, 39 | 40 | "cdr_data_layer_comment_": ["do not remove or change the name of the mandatory output columns", 41 | "if you don't have anything to map to the output, put -1 in the output_no", 42 | "if you have input that is not used and not mapped, insert a row with input_no = -1 and output_no = -1", 43 | "If a column is in your raw table, put input_no non-negative in the order of your column order", 44 | "network_type must be in 2G, 3G, 4G, 5G or put -1 to the output_no of both network_type and call_type", 45 | "call_type must be in Voice, Data, Call and SMS or put -1 to the output_no of both network_type and call_type", 46 | "note that in 1-1 mapping both input_no and output_no are non negative", 47 | "you need to import all the columns of your raw file and the output_no may be -1"], 48 | 49 | "cdr_data_layer":[ 50 | {"input_no":1, "input_name":"SUBID", "data_type":"String", "output_no":1, "name":"UID", "custom": ""}, 51 | {"input_no":-1, "input_name":"IMEI", "data_type":"String", "output_no":2, "name":"IMEI", "custom": ""}, 52 | {"input_no":-1, "input_name":"IMSI", "data_type":"String", "output_no":3, "name":"IMSI", "custom": ""}, 53 | {"input_no":2, "input_name":"CDATE", "data_type":"String", "output_no":-1, "name":"CALL_DATE", "custom": ""}, 54 | {"input_no":3, "input_name":"CTIME", "data_type":"String", "output_no":4, "name":"CALL_TIME", "custom": "CONCAT(CDATE,' ',CTIME)"}, 55 | {"input_no":4, "input_name":"DURATION", "data_type":"String", "output_no":5, "name":"DURATION", "custom": ""}, 56 | {"input_no":5, "input_name":"CELLID", "data_type":"String", "output_no":6, "name":"CELL_ID", "custom": ""}, 57 | {"input_no":6, "input_name":"LATITUDE", "data_type":"String", "output_no":7, "name":"LATITUDE", "custom": ""}, 58 | {"input_no":7, "input_name":"LONGITUDE", "data_type":"String", "output_no":8, "name":"LONGITUDE", "custom": ""}, 59 | {"input_no":9, "input_name":"NETWORK_TYPE", "data_type":"String", "output_no":10, "name":"NETWORK_TYPE", "custom": ""}, 60 | {"input_no":8, "input_name":"CALL_TYPE", "data_type":"String", "output_no":9, "name":"CALL_TYPE", "custom": ""} 61 | ], 62 | 63 | "cdr_cell_tower":[ 64 | {"input_no":1, "input_name":"BTSID", "data_type":"String", 
"output_no":-1, "name":"UID"}, 65 | {"input_no":2, "input_name":"SITE_NAME", "data_type":"String", "output_no":-1, "name":"SITE_NAME"}, 66 | {"input_no":3, "input_name":"LONGITUDE", "data_type":"String", "output_no":3, "name":"LONGITUDE"}, 67 | {"input_no":4, "input_name":"LATITUDE", "data_type":"String", "output_no":4, "name":"LATITUDE"}, 68 | {"input_no":5, "input_name":"CELLID", "data_type":"String", "output_no":5, "name":"CELL_ID" }, 69 | {"input_no":6, "input_name":"CELLNAME", "data_type":"String", "output_no":-1, "name":"CELLNAME" }, 70 | {"input_no":7, "input_name":"CI", "data_type":"String", "output_no":-1, "name":"CI" }, 71 | {"input_no":8, "input_name":"AZIMUTH", "data_type":"String", "output_no":-1, "name":"AZIMUTH" }, 72 | {"input_no":9, "input_name":"DISTRICT", "data_type":"String", "output_no":6, "name":"ADMIN1", "geojson_filename": "japan.json", "geojson_col_name": "nam"}, 73 | {"input_no":10, "input_name":"PROVINCE", "data_type":"String", "output_no":7, "name":"ADMIN2", "geojson_filename": "", "geojson_col_name": ""} 74 | ] 75 | 76 | } 77 | -------------------------------------------------------------------------------- /sample_configs/config_big.json: -------------------------------------------------------------------------------- 1 | { 2 | "hadoop_data_path":"/disk2/data/Chaichan/", 3 | "provider_prefix":"big7", 4 | "db_name" : "cdrproject", 5 | 6 | "input_delimiter":",", 7 | "input_files" :["2016-03-01.csv","2016-03-02.csv","2016-03-03.csv","2016-03-04.csv","2016-03-05.csv"], 8 | "_comment": ["if time format is well formed (like yyyy/mm/dd or dd-mm-yyyy and colons(:) separating time, then leave it blank", 9 | "if the time format has no separator then indicate it (ex. yyyyMMdd (can't be automatically recognized) hh:mm:ss)"], 10 | "input_file_time_format": "", 11 | "input_file_have_header_comment_": "put 1 if there is a header row, otherwise 0", 12 | "input_file_have_header": 0, 13 | 14 | "input_cell_tower_files" : ["moz_cell_adm1_adm2.csv"], 15 | "input_cell_tower_delimiter":",", 16 | "input_cell_tower_have_header": 1, 17 | 18 | "check_duplicate_comment_": "will check duplicate in the cdr raw datafile and cell tower file", 19 | "check_duplicate": true, 20 | "check_invalid_lat_lng_comment_": "filter invalid lat_lng (both 0 or one of it is null)", 21 | "check_invalid_lat_lng": true, 22 | "host": "hadoopmaster.apichon.com", 23 | "port": 10000, 24 | "user": "rsstudent", 25 | 26 | "frequent_locations_percentage": 80, 27 | 28 | "output_report_location": "big7", 29 | "output_graph_location": "big7", 30 | 31 | "od_admin_unit": "admin1", 32 | "od_date": "2016-03-01", 33 | 34 | "interpolation_poi_file_location": "/hadoop/hive/data/bangladesh/bangladesh.landscan2010_poi.tsv", 35 | "interpolation_osm_file_location": "/hadoop/hive/data/bangladesh/bangladesh.osm_road_with_waterway.tsv", 36 | "interpolation_voronoi_file_location": "/hadoop/hive/data/bangladesh/gp_voronoi.tsv", 37 | "max_size_cdr_by_uid": 500, 38 | "max_size_interpolation": 50, 39 | 40 | "cdr_data_layer_comment_": ["do not remove or change the name of the mandatory output columns", 41 | "if you don't have anything to map to the output, put -1 in the output_no", 42 | "if you have input that is not used and not mapped, insert a row with input_no = -1 and output_no = -1", 43 | "If a column is in your raw table, put input_no non-negative in the order of your column order", 44 | "network_type must be in 2G, 3G, 4G, 5G or put -1 to the output_no of both network_type and call_type", 45 | "call_type must be in Voice, Data, Call 
and SMS or put -1 to the output_no of both network_type and call_type", 46 | "note that in 1-1 mapping both input_no and output_no are non negative", 47 | "you need to import all the columns of your raw file and the output_no may be -1"], 48 | 49 | "cdr_data_layer":[ 50 | {"input_no":1, "input_name":"IMEI", "data_type":"String", "output_no":1, "name":"UID", "custom": ""}, 51 | {"input_no":2, "input_name":"IMSI", "data_type":"String", "output_no":2, "name":"IMSI", "custom": ""}, 52 | {"input_no":3, "input_name":"startTime", "data_type":"String", "output_no":3, "name":"CALL_TIME", "custom": ""}, 53 | {"input_no":4, "input_name":"endTime", "data_type":"String", "output_no":-1, "name":"END_TIME", "custom": ""}, 54 | {"input_no":5, "input_name":"DURATION", "data_type":"String", "output_no":4, "name":"DURATION", "custom": ""}, 55 | {"input_no":6, "input_name":"bs_seq", "data_type":"String", "output_no":-1, "name":"BS_SEQ", "custom": ""}, 56 | {"input_no":7, "input_name":"CELLID", "data_type":"String", "output_no":5, "name":"CELL_ID", "custom": ""}, 57 | {"input_no":8, "input_name":"call_type", "data_type":"String", "output_no":-1, "name":"CALL_TYPE", "custom": ""}, 58 | {"input_no":-1, "input_name":"network_type", "data_type":"String", "output_no":-1, "name":"NETWORK_TYPE", "custom": ""}, 59 | {"input_no":9, "input_name":"lon", "data_type":"String", "output_no":-1, "name":"LATITUDE", "custom": ""}, 60 | {"input_no":10, "input_name":"lat", "data_type":"String", "output_no":-1, "name":"LONGITUDE", "custom": ""} 61 | 62 | ], 63 | 64 | "cdr_cell_tower_comment_": "if you don't have a geojson file leave the field of geojson_filename blank but still preserve the key", 65 | 66 | "cdr_cell_tower":[ 67 | {"input_no":1, "input_name":"bs_seq", "data_type":"String", "output_no":-1, "name":"BS_SEQ"}, 68 | {"input_no":2, "input_name":"cell_seq", "data_type":"String", "output_no":1, "name":"CELL_ID" }, 69 | {"input_no":3, "input_name":"name", "data_type":"String", "output_no":-1, "name":"NAME"}, 70 | {"input_no":4, "input_name":"lac", "data_type":"String", "output_no":-1, "name":"CELLNAME" }, 71 | {"input_no":5, "input_name":"cell", "data_type":"String", "output_no":-1, "name":"CI" }, 72 | {"input_no":6, "input_name":"lon", "data_type":"String", "output_no":2, "name":"LATITUDE" }, 73 | {"input_no":7, "input_name":"lat", "data_type":"String", "output_no":3, "name":"LONGITUDE" }, 74 | {"input_no":8, "input_name":"ISO2", "data_type":"String", "output_no":-1, "name":"ISO2" }, 75 | {"input_no":9, "input_name":"NAME_0_2", "data_type":"String", "output_no":-1, "name":"NAME_0_2"}, 76 | {"input_no":10, "input_name":"ID_1_2", "data_type":"String", "output_no":-1, "name":"ID_1_2" }, 77 | {"input_no":11, "input_name":"NAME_1_2", "data_type":"String", "output_no":4, "name":"ADMIN0", "geojson_filename": "", "geojson_col_name": "" }, 78 | {"input_no":12, "input_name":"ID_2", "data_type":"String", "output_no":-1, "name":"ID2" }, 79 | {"input_no":13, "input_name":"NAME_2", "data_type":"String", "output_no":5, "name":"ADMIN1", "geojson_filename": "", "geojson_col_name": "" }, 80 | {"input_no":14, "input_name":"ENGTYPE_2", "data_type":"String", "output_no":-1, "name":"ENGTYPE_2" } 81 | ] 82 | 83 | } 84 | --------------------------------------------------------------------------------
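A note on run order, inferred from the entry-point scripts above: run_prepare_cdr_and_mapping.py creates and loads the Hive tables, run_statistics.py produces the reports and the rank-1 frequent locations that origin-destination requires, and run_origin_destination.py and run_interpolation.py can then be run as needed. Below is a minimal driver sketch in Python; the config path sample_configs/config.json and invoking the scripts with "python" are assumptions made for illustration, not something the repository mandates.

import subprocess

# Run the pipeline stages in dependency order:
# 1) prepare tables, 2) statistics (includes rank-1 frequent locations),
# 3) origin-destination (needs step 2), 4) interpolation.
CONFIG = "sample_configs/config.json"  # assumed path; point this at your own config

for script in [
    "run_prepare_cdr_and_mapping.py",
    "run_statistics.py",
    "run_origin_destination.py",
    "run_interpolation.py",
]:
    subprocess.run(["python", script, "-c", CONFIG], check=True)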