├── .gitignore ├── Common ├── cdr_data.py ├── cdr_interpolation.py ├── cdr_origin_destination.py ├── cdr_statistics.py ├── config_object.py ├── helper.py ├── hive_connection.py └── hive_create_tables.py ├── Interpolation ├── README.md └── output_sample │ └── interpolation.csv ├── LICENSE ├── Origin_Destination ├── README.md └── output_sample │ └── origin_destination.tsv ├── README.md ├── Sample_Inputs ├── cdr_sample.csv └── mapping_sample.csv ├── Statistics ├── README.md ├── output_graphs │ ├── daily_avg_cdr.png │ ├── daily_cdr_by_call_type.png │ ├── daily_cdrs.png │ ├── daily_unique_avg_locations.png │ ├── daily_unique_locations.png │ ├── daily_unique_users.png │ └── user_data_histogram.png └── output_reports │ ├── css_file_data_stat.csv │ ├── css_provider_data_stat_daily.csv │ ├── css_provider_data_stat_monthly.csv │ ├── frequent_location_output_sample.png │ ├── japan._joined_ADMIN1.json │ ├── summary_stats.csv │ ├── zone_based_aggregations_level_ADMIN1.csv │ └── zone_based_aggregations_level_ADMIN2.csv ├── hive_init_commands ├── initial_hive_commands_interpolation.json ├── initial_hive_commands_od.json └── initial_hive_commands_stats.json ├── lib ├── Jama-1.0.2.jar ├── ajt-2.5.jar ├── cdrinterpolationlib.jar ├── cdrlibindicator.jar ├── cdrmobilitylib.jar ├── cdrmobilitylibge.jar ├── cdrmobilitylibjica.jar ├── commons-dbcp-1.4.jar ├── commons-lang-2.6.jar ├── commons-logging-1.1.3.jar ├── commons-math-2.1.jar ├── commons-pool-1.5.4.jar ├── gt-api-9.3.jar ├── gt-data-9.3.jar ├── gt-epsg-hsql-12.2.jar ├── gt-main-9.3.jar ├── gt-metadata-2.6.5.jar ├── gt-metadata-9.3.jar ├── gt-opengis-9.3.jar ├── gt-referencing-2.6.5.jar ├── gt-referencing-9.3.jar ├── gt-shapefile-2.6.5.jar ├── jahmm-0.6.2.jar ├── javaml-0.1.6.jar ├── jsr-275-1.0-beta-2.jar ├── jts-1.12.jar ├── jts-1.13.jar ├── jtsio-1.12.jar ├── libsvm.jar ├── pflow-hiveUDF.jar ├── postgis-jdbc-2.1.0SVN.jar ├── postgresql-9.3-1102.jdbc4.jar ├── vecmath-1.3.2.jar └── weka.jar ├── queries ├── cdr_and_mapping │ ├── create_consolidate_cdr.sql │ ├── create_mapping_admin.sql │ ├── create_preprocess_cdr.sql │ ├── create_preprocess_mapping.sql │ ├── create_raw_cdr.sql │ ├── create_raw_mapping.sql │ ├── insert_consolidate_cdr.sql │ ├── insert_consolidate_cdr_join.sql │ ├── insert_mapping_admin.sql │ ├── insert_preprocess_cdr.sql │ └── insert_preprocess_mapping.sql ├── interpolation │ ├── create_cdr_by_uid.sql │ ├── create_poi_relocation.sql │ ├── create_route_interpolation.sql │ ├── create_trip_24_hr_padding.sql │ ├── create_trip_format.sql │ ├── export_to_gps_format.sql │ ├── insert_cdr_by_uid.sql │ ├── insert_poi_relocation.sql │ ├── insert_route_interpolation.sql │ ├── insert_trip_24_hr_padding.sql │ └── insert_trip_format.sql ├── origin_destination │ ├── create_la_cdr_all_with_ant_zone_by_uid.sql │ ├── create_la_cdr_all_with_ant_zone_by_uid_od.sql │ ├── create_la_cdr_all_with_ant_zone_by_uid_od_detail.sql │ ├── create_la_cdr_all_with_ant_zone_by_uid_od_sum.sql │ ├── create_la_cdr_uid_home.sql │ ├── insert_la_cdr_all_with_ant_zone_by_old_consolidate.sql │ ├── insert_la_cdr_all_with_ant_zone_by_uid.sql │ ├── insert_la_cdr_all_with_ant_zone_by_uid_od.sql │ ├── insert_la_cdr_all_with_ant_zone_by_uid_od_detail.sql │ ├── insert_la_cdr_all_with_ant_zone_by_uid_od_sum.sql │ ├── insert_la_cdr_uid_home.sql │ └── od_to_csv.sql └── statistics │ ├── graphs │ ├── daily_average_cdrs │ │ └── daily_average_cdrs.sql │ ├── daily_average_unique_locations │ │ ├── daily_average_unique_locations.sql │ │ └── daily_average_unique_locations_old_consolidate.sql │ ├── 
daily_cdrs │ │ └── total_daily_cdrs.sql │ ├── daily_cdrs_by_call_type │ │ └── daily_cdrs_by_call_type.sql │ ├── daily_unique_locations │ │ ├── daily_unique_locations.sql │ │ ├── daily_unique_locations_old_consolidate.sql │ │ ├── total_unique_locations.sql │ │ └── total_unique_locations_old_consolidate.sql │ ├── daily_unique_users │ │ └── total_daily_uids.sql │ └── date_histogram │ │ └── histogram.sql │ ├── reports │ ├── all_statistics │ │ └── data_statistics.sql │ ├── daily_statistics │ │ └── daily_statistics.sql │ ├── frequent_locations │ │ ├── create_frequent_locations.sql │ │ ├── create_frequent_locations_night.sql │ │ ├── frequent_locations.sql │ │ ├── frequent_locations_night.sql │ │ ├── frequent_locations_night_old_consolidate.sql │ │ ├── frequent_locations_old_consolidate.sql │ │ ├── frequent_locations_thresholded.sql │ │ ├── frequent_locations_thresholded_night.sql │ │ ├── frequent_locations_wsum.sql │ │ └── frequent_locations_wsum_night.sql │ ├── monthly_statistics │ │ └── monthly_statistics.sql │ ├── summary │ │ ├── average_daily_admin1.sql │ │ ├── average_daily_sms.sql │ │ ├── average_daily_voice.sql │ │ ├── average_unique_cell_ids.sql │ │ └── total_days.sql │ └── zone_population │ │ └── zone_population.sql │ ├── total_records.sql │ └── total_unique_uids.sql ├── requirements.txt ├── run_interpolation.py ├── run_origin_destination.py ├── run_prepare_cdr_and_mapping.py ├── run_statistics.py └── sample_configs ├── config.json └── config_big.json /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | cdrenv/ 3 | japan.json 4 | configs/config.json 5 | configs/config_big.json 6 | __pycache__ 7 | not_used/ 8 | venv 9 | -------------------------------------------------------------------------------- /Common/cdr_data.py: -------------------------------------------------------------------------------- 1 | class CDRData: 2 | def __init__(self): 3 | pass 4 | -------------------------------------------------------------------------------- /Common/cdr_interpolation.py: -------------------------------------------------------------------------------- 1 | from Common.hive_connection import HiveConnection 2 | import time 3 | from Common.helper import format_two_point_time, sql_to_string 4 | 5 | 6 | class Interpolation: 7 | def __init__(self, config): 8 | self.config = config 9 | self.hc = HiveConnection() 10 | 11 | def calculate_interpolation(self): 12 | self.convert_cdr_to_array_format() 13 | self.create_trip_format() 14 | self.create_trip_24hr_padding() 15 | self.create_poi_relocation() 16 | self.create_route_interpolation() 17 | self.export_to_csv() 18 | 19 | def convert_cdr_to_array_format(self): 20 | provider_prefix = self.config.provider_prefix 21 | cursor = self.hc.cursor 22 | print('########## CREATE CDR BY UID ARRAY FORMAT TABLE ##########') 23 | timer = time.time() 24 | print('Checking and dropping {provider_prefix}_cdr_by_uid table if existing.' 25 | .format(provider_prefix=provider_prefix)) 26 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_cdr_by_uid' 27 | .format(provider_prefix=provider_prefix)) 28 | print('Checked and dropped {provider_prefix}_cdr_by_uid table if existing. 
' 29 | 'Elapsed time: {time} seconds' 30 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 31 | timer = time.time() 32 | print('Creating {provider_prefix}_cdr_by_uid table' 33 | .format(provider_prefix=provider_prefix)) 34 | raw_sql = sql_to_string('interpolation/create_cdr_by_uid.sql') 35 | query = raw_sql.format(provider_prefix=provider_prefix) 36 | cursor.execute(query) 37 | 38 | print('Created {provider_prefix}_cdr_by_uid table. Elapsed time: {time} seconds' 39 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 40 | timer = time.time() 41 | raw_sql = sql_to_string('interpolation/insert_cdr_by_uid.sql') 42 | print('Inserting into {provider_prefix}_cdr_by_uid table' 43 | .format(provider_prefix=provider_prefix)) 44 | query = raw_sql.format(provider_prefix=provider_prefix, max_size_cdr_by_uid=self.config.max_size_cdr_by_uid) 45 | cursor.execute(query) 46 | 47 | print('Inserted into {provider_prefix}_cdr_by_uid table. Elapsed time: {time} seconds' 48 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 49 | print('########## FINISHED CREATING CDR BY UID TABLE ##########') 50 | 51 | def create_trip_format(self): 52 | provider_prefix = self.config.provider_prefix 53 | cursor = self.hc.cursor 54 | print('########## CREATE CDR BY UID ARRAY TRIP FORMAT TABLE ##########') 55 | timer = time.time() 56 | print('Checking and dropping {provider_prefix}_cdr_by_uid_trip table if existing.' 57 | .format(provider_prefix=provider_prefix)) 58 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_cdr_by_uid_trip' 59 | .format(provider_prefix=provider_prefix)) 60 | print('Checked and dropped {provider_prefix}_cdr_by_uid_trip table if existing. ' 61 | 'Elapsed time: {time} seconds' 62 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 63 | timer = time.time() 64 | print('Creating {provider_prefix}_cdr_by_uid_trip table' 65 | .format(provider_prefix=provider_prefix)) 66 | raw_sql = sql_to_string('interpolation/create_trip_format.sql') 67 | query = raw_sql.format(provider_prefix=provider_prefix) 68 | cursor.execute(query) 69 | 70 | print('Created {provider_prefix}_cdr_by_uid_trip table. Elapsed time: {time} seconds' 71 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 72 | timer = time.time() 73 | raw_sql = sql_to_string('interpolation/insert_trip_format.sql') 74 | print('Inserting into {provider_prefix}_cdr_by_uid_trip table' 75 | .format(provider_prefix=provider_prefix)) 76 | query = raw_sql.format(provider_prefix=provider_prefix) 77 | cursor.execute(query) 78 | 79 | print('Inserted into {provider_prefix}_cdr_by_uid_trip table. Elapsed time: {time} seconds' 80 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 81 | print('########## FINISHED CREATING CDR BY UID TRIP FORMAT TABLE ##########') 82 | 83 | def create_trip_24hr_padding(self): 84 | provider_prefix = self.config.provider_prefix 85 | cursor = self.hc.cursor 86 | print('########## CREATE TRIP 24 HR PADDING TABLE ##########') 87 | timer = time.time() 88 | print('Checking and dropping {provider_prefix}_cdr_by_uid_trip_organized_array_apd table if existing.' 
89 | .format(provider_prefix=provider_prefix)) 90 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_cdr_by_uid_trip_organized_array_apd' 91 | .format(provider_prefix=provider_prefix)) 92 | print('Checked and dropped {provider_prefix}_cdr_by_uid_trip_organized_array_apd table if existing. ' 93 | 'Elapsed time: {time} seconds' 94 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 95 | timer = time.time() 96 | print('Creating {provider_prefix}_cdr_by_uid_trip_organized_array_apd table' 97 | .format(provider_prefix=provider_prefix)) 98 | raw_sql = sql_to_string('interpolation/create_trip_24_hr_padding.sql') 99 | query = raw_sql.format(provider_prefix=provider_prefix) 100 | cursor.execute(query) 101 | 102 | print('Created {provider_prefix}_cdr_by_uid_trip_organized_array_apd table. Elapsed time: {time} seconds' 103 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 104 | timer = time.time() 105 | raw_sql = sql_to_string('interpolation/insert_trip_24_hr_padding.sql') 106 | print('Inserting into {provider_prefix}_cdr_by_uid_trip_organized_array_apd table' 107 | .format(provider_prefix=provider_prefix)) 108 | query = raw_sql.format(provider_prefix=provider_prefix) 109 | cursor.execute(query) 110 | 111 | print('Inserted into {provider_prefix}_cdr_by_uid_trip_organized_array_apd table. Elapsed time: {time} seconds' 112 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 113 | print('########## FINISHED TRIP 24 HR PADDING TABLE ##########') 114 | 115 | def create_poi_relocation(self): 116 | provider_prefix = self.config.provider_prefix 117 | cursor = self.hc.cursor 118 | print('########## CREATE POI RELOCATION TABLE ##########') 119 | timer = time.time() 120 | print('Checking and dropping {provider_prefix}_cdr_by_uid_trip_realloc_array_apd table if existing.' 121 | .format(provider_prefix=provider_prefix)) 122 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_cdr_by_uid_trip_realloc_array_apd' 123 | .format(provider_prefix=provider_prefix)) 124 | print('Checked and dropped {provider_prefix}_cdr_by_uid_trip_realloc_array_apd table if existing. ' 125 | 'Elapsed time: {time} seconds' 126 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 127 | timer = time.time() 128 | print('Creating {provider_prefix}_cdr_by_uid_trip_realloc_array_apd table' 129 | .format(provider_prefix=provider_prefix)) 130 | raw_sql = sql_to_string('interpolation/create_poi_relocation.sql') 131 | query = raw_sql.format(provider_prefix=provider_prefix) 132 | cursor.execute(query) 133 | 134 | print('Created {provider_prefix}_cdr_by_uid_trip_realloc_array_apd table. Elapsed time: {time} seconds' 135 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 136 | timer = time.time() 137 | raw_sql = sql_to_string('interpolation/insert_poi_relocation.sql') 138 | print('Inserting into {provider_prefix}_cdr_by_uid_trip_realloc_array_apd table' 139 | .format(provider_prefix=provider_prefix)) 140 | query = raw_sql.format(provider_prefix=provider_prefix, 141 | poi=self.config.interpolation_poi_file_location.split('/')[-1]) 142 | cursor.execute(query) 143 | 144 | print('Inserted into {provider_prefix}_cdr_by_uid_trip_realloc_array_apd table. 
Elapsed time: {time} seconds' 145 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 146 | print('########## FINISHED CREATING POI RELOCATION TABLE ##########') 147 | 148 | def create_route_interpolation(self): 149 | provider_prefix = self.config.provider_prefix 150 | cursor = self.hc.cursor 151 | print('########## CREATE ROUTE INTERPOLATION TABLE ##########') 152 | timer = time.time() 153 | print('Checking and dropping {provider_prefix}_cdr_by_uid_trip_routing_array_apd table if existing.' 154 | .format(provider_prefix=provider_prefix)) 155 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_cdr_by_uid_trip_routing_array_apd' 156 | .format(provider_prefix=provider_prefix)) 157 | print('Checked and dropped {provider_prefix}_cdr_by_uid_trip_routing_array_apd table if existing. ' 158 | 'Elapsed time: {time} seconds' 159 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 160 | timer = time.time() 161 | print('Creating {provider_prefix}_cdr_by_uid_trip_routing_array_apd table' 162 | .format(provider_prefix=provider_prefix)) 163 | raw_sql = sql_to_string('interpolation/create_route_interpolation.sql') 164 | query = raw_sql.format(provider_prefix=provider_prefix) 165 | cursor.execute(query) 166 | 167 | print('Created {provider_prefix}_cdr_by_uid_trip_routing_array_apd table. Elapsed time: {time} seconds' 168 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 169 | timer = time.time() 170 | raw_sql = sql_to_string('interpolation/insert_route_interpolation.sql') 171 | print('Inserting into {provider_prefix}_cdr_by_uid_trip_routing_array_apd table' 172 | .format(provider_prefix=provider_prefix)) 173 | query = raw_sql.format(provider_prefix=provider_prefix, 174 | max_size_interpolation=self.config.max_size_interpolation, 175 | osm=self.config.interpolation_osm_file_location.split('/')[-1], 176 | voronoi=self.config.interpolation_voronoi_file_location.split('/')[-1]) 177 | cursor.execute(query) 178 | 179 | print('Inserted into {provider_prefix}_cdr_by_uid_trip_routing_array_apd table. Elapsed time: {time} seconds' 180 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 181 | print('########## FINISHED ROUTE INTERPOLATION TABLE ##########') 182 | 183 | def export_to_csv(self): 184 | provider_prefix = self.config.provider_prefix 185 | cursor = self.hc.cursor 186 | print('########## Exporting route interpolation to CSV ##########') 187 | timer = time.time() 188 | raw_sql = sql_to_string('interpolation/export_to_gps_format.sql') 189 | query = raw_sql.format(provider_prefix=provider_prefix) 190 | cursor.execute(query) 191 | print('Exported to CSV. 
Elapsed time: {time} seconds' 192 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 193 | print('########## FINISHED EXPORTING, FILE LOCATED IN /tmp/hive/cdr_interpolation ##########') 194 | -------------------------------------------------------------------------------- /Common/cdr_origin_destination.py: -------------------------------------------------------------------------------- 1 | from Common.hive_connection import HiveConnection 2 | import time 3 | from Common.helper import format_two_point_time, sql_to_string 4 | 5 | 6 | class OriginDestination: 7 | def __init__(self, config): 8 | self.config = config 9 | self.hc = HiveConnection() 10 | 11 | def calculate_od(self): 12 | self.cdr_by_uid() 13 | self.create_od() 14 | self.create_od_detail() 15 | self.create_od_sum() 16 | 17 | def cdr_by_uid(self): 18 | provider_prefix = self.config.provider_prefix 19 | od_admin_unit = self.config.od_admin_unit 20 | cursor = self.hc.cursor 21 | print('########## CREATE CDR BY UID TABLE ##########') 22 | timer = time.time() 23 | print('Checking and dropping {provider_prefix}_la_cdr_all_with_ant_zone_by_uid table if existing.' 24 | .format(provider_prefix=provider_prefix)) 25 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_la_cdr_all_with_ant_zone_by_uid' 26 | .format(provider_prefix=provider_prefix)) 27 | print('Checked and dropped {provider_prefix}_la_cdr_all_with_ant_zone_by_uid table if existing. ' 28 | 'Elapsed time: {time} seconds' 29 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 30 | timer = time.time() 31 | print('Creating {provider_prefix}_la_cdr_all_with_ant_zone_by_uid table' 32 | .format(provider_prefix=provider_prefix)) 33 | raw_sql = sql_to_string('origin_destination/create_la_cdr_all_with_ant_zone_by_uid.sql') 34 | query = raw_sql.format(provider_prefix=provider_prefix) 35 | cursor.execute(query) 36 | print('Created {provider_prefix}_la_cdr_all_with_ant_zone_by_uid table. Elapsed time: {time} seconds' 37 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 38 | timer = time.time() 39 | raw_sql = sql_to_string('origin_destination/insert_la_cdr_all_with_ant_zone_by_uid.sql') 40 | print('Inserting into {provider_prefix}_la_cdr_all_with_ant_zone_by_uid table' 41 | .format(provider_prefix=provider_prefix)) 42 | query = raw_sql.format(provider_prefix=provider_prefix, target_admin=od_admin_unit, od_date=self.config.od_date) 43 | cursor.execute(query) 44 | 45 | print('Inserted into {provider_prefix}_la_cdr_all_with_ant_zone_by_uid table. Elapsed time: {time} seconds' 46 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 47 | print('########## FINISHED CREATING CDR BY UID TABLE ##########') 48 | 49 | def create_od(self): 50 | provider_prefix = self.config.provider_prefix 51 | od_admin_unit = self.config.od_admin_unit 52 | cursor = self.hc.cursor 53 | print('########## CREATE OD TABLE ##########') 54 | timer = time.time() 55 | print('Checking and dropping {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od table if existing.' 56 | .format(provider_prefix=provider_prefix)) 57 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od' 58 | .format(provider_prefix=provider_prefix)) 59 | 60 | print('Checked and dropped {provider_prefix}_la_cdr_all_with_ant_zone_by_uid table if existing.' 
61 | ' Elapsed time: {time} seconds' 62 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 63 | print('Creating {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od table'.format( 64 | provider_prefix=provider_prefix)) 65 | timer = time.time() 66 | raw_sql = sql_to_string('origin_destination/create_la_cdr_all_with_ant_zone_by_uid_od.sql') 67 | query = raw_sql.format(provider_prefix=provider_prefix) 68 | cursor.execute(query) 69 | 70 | print('Created {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od table. Elapsed time: {time}' 71 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 72 | timer = time.time() 73 | print('Inserting into {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od table' 74 | .format(provider_prefix=provider_prefix)) 75 | raw_sql = sql_to_string('origin_destination/insert_la_cdr_all_with_ant_zone_by_uid_od.sql') 76 | query = raw_sql.format(provider_prefix=provider_prefix, target_unit=od_admin_unit) 77 | cursor.execute(query) 78 | 79 | print('Inserted into {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od table. Elapsed time: {time} seconds' 80 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 81 | print('########## FINISHED CREATING OD TABLE ##########') 82 | 83 | def create_od_detail(self): 84 | provider_prefix = self.config.provider_prefix 85 | cursor = self.hc.cursor 86 | print('########## CREATING OD DETAIL TABLE ##########') 87 | timer = time.time() 88 | print('Checking and dropping {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail table if existing.' 89 | .format(provider_prefix=provider_prefix)) 90 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail ' 91 | .format(provider_prefix=provider_prefix)) 92 | 93 | print('Checked and dropped {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail table if existing. ' 94 | 'Elapsed time: {time} seconds' 95 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 96 | print('Creating {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail table'.format( 97 | provider_prefix=provider_prefix)) 98 | 99 | raw_sql = sql_to_string('origin_destination/create_la_cdr_all_with_ant_zone_by_uid_od_detail.sql') 100 | query = raw_sql.format(provider_prefix=provider_prefix) 101 | cursor.execute(query) 102 | 103 | print('Created {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail table'.format( 104 | provider_prefix=provider_prefix)) 105 | timer = time.time() 106 | print('Inserting into {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail table'.format( 107 | provider_prefix=provider_prefix)) 108 | raw_sql = sql_to_string('origin_destination/insert_la_cdr_all_with_ant_zone_by_uid_od_detail.sql') 109 | query = raw_sql.format(provider_prefix=provider_prefix) 110 | cursor.execute(query) 111 | 112 | print('Inserted into {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail table. ' 113 | 'Elapsed time: {time} seconds' 114 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 115 | print('########## CREATING OD DETAIL TABLE ##########') 116 | 117 | def create_od_sum(self): 118 | provider_prefix = self.config.provider_prefix 119 | cursor = self.hc.cursor 120 | print('########## CREATING OD SUM TABLE ##########') 121 | timer = time.time() 122 | print('Checking and dropping {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum table if existing.' 
123 | .format(provider_prefix=provider_prefix)) 124 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum ' 125 | .format(provider_prefix=provider_prefix)) 126 | print('Checked and dropped {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum table if existing. ' 127 | 'Elapsed time: {time} seconds' 128 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 129 | print('Creating {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum table'.format( 130 | provider_prefix=provider_prefix)) 131 | raw_sql = sql_to_string('origin_destination/create_la_cdr_all_with_ant_zone_by_uid_od_sum.sql') 132 | query = raw_sql.format(provider_prefix=provider_prefix) 133 | cursor.execute(query) 134 | print('Created {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum table'.format( 135 | provider_prefix=provider_prefix)) 136 | timer = time.time() 137 | 138 | print('Inserting into {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum table'.format( 139 | provider_prefix=provider_prefix)) 140 | raw_sql = sql_to_string('origin_destination/insert_la_cdr_all_with_ant_zone_by_uid_od_sum.sql') 141 | query = raw_sql.format(provider_prefix=provider_prefix) 142 | cursor.execute(query) 143 | print('Inserted into {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum table. ' 144 | 'Elapsed time: {time} seconds' 145 | .format(provider_prefix=provider_prefix, time=format_two_point_time(timer, time.time()))) 146 | raw_sql = sql_to_string('origin_destination/od_to_csv.sql') 147 | query = raw_sql.format(provider_prefix=provider_prefix) 148 | cursor.execute(query) 149 | print('OD Result is stored in /tmp/hive/od_result') 150 | print('########## FINISHED CREATING OD SUM TABLE ##########') 151 | -------------------------------------------------------------------------------- /Common/cdr_statistics.py: -------------------------------------------------------------------------------- 1 | import json 2 | import csv 3 | import matplotlib.pyplot as plt 4 | import os 5 | from Common.hive_connection import HiveConnection 6 | from Common import helper as hp 7 | import time 8 | from datetime import datetime 9 | 10 | months = {1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 11 | 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'} 12 | 13 | 14 | class Statistics: 15 | def __init__(self, config): 16 | self.config = config 17 | self.hc = HiveConnection() 18 | 19 | def calculate_data_statistics(self): 20 | cdr_data_layer = self.config.cdr_data_layer 21 | disable = False 22 | for item in cdr_data_layer: 23 | if str.lower(item['name']) == 'call_time' and item['output_no'] == -1 \ 24 | or str.lower(item['name']) == 'uid' and item['output_no'] == -1 \ 25 | or str.lower(item['name']) == 'imei' and item['output_no'] == -1 \ 26 | or str.lower(item['name']) == 'imsi' and item['output_no'] == -1 \ 27 | or str.lower(item['name']) == 'cell_id' and item['output_no'] == -1: 28 | disable = True 29 | 30 | if not disable: 31 | provider_prefix = self.config.provider_prefix 32 | output_report_location = self.config.output_report_location 33 | print('########## CALCULATING DATA STATISTICS ##########') 34 | cursor = self.hc.cursor 35 | imei = "count(distinct IMEI) as unique_imei, " 36 | imsi = "count(distinct IMSI) as unique_imsi, " 37 | raw_sql = hp.sql_to_string('statistics/reports/all_statistics/data_statistics.sql') 38 | query = raw_sql.format(provider_prefix=provider_prefix, imei=imei, imsi=imsi) 39 | 
print('Calculating data statistics') 40 | timer = time.time() 41 | cursor.execute(query) 42 | print('Calculated data statistics. Elapsed time: {} seconds' 43 | .format(hp.format_two_point_time(timer, time.time()))) 44 | print('Writing to {}/css_file_data_stat.csv'.format(output_report_location)) 45 | timer = time.time() 46 | 47 | with open("{}/css_file_data_stat.csv".format(output_report_location), "w", newline='') as outfile: 48 | writer = csv.writer(outfile, quoting=csv.QUOTE_NONNUMERIC) 49 | writer.writerow(col[0] for col in cursor.description) 50 | for row in cursor: 51 | writer.writerow(row) 52 | print('Successfully wrote to {}/css_file_data_stat.csv'.format(output_report_location)) 53 | print('Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 54 | print('########## FINISHED CALCULATING DATA STATISTICS ##########') 55 | else: 56 | print('Mapping for call_time, imsi, imei or uid is not sufficient. Ignored data statistic') 57 | 58 | def calculate_daily_statistics(self): 59 | provider_prefix = self.config.provider_prefix 60 | output_report_location = self.config.output_report_location 61 | cdr_data_layer = self.config.cdr_data_layer 62 | output_graph_location = self.config.output_graph_location 63 | imei = "count(distinct IMEI) as unique_imei, " 64 | imsi = "count(distinct IMSI) as unique_imsi, " 65 | cursor = self.hc.cursor 66 | file_location = '{}/css_file_data_stat.csv'.format(output_report_location) 67 | time_dict = hp.get_time_from_csv(file_location) 68 | start_date, end_date = time_dict['start_date'], time_dict['end_date'] 69 | 70 | disable = False 71 | for item in cdr_data_layer: 72 | if str.lower(item['name']) == 'network_type' and item['output_no'] == -1 \ 73 | or str.lower(item['name']) == 'call_type' and item['output_no'] == -1: 74 | disable = True 75 | if not disable: 76 | print('########## CALCULATING DAILY STATISTICS ##########') 77 | results = [] 78 | timer = time.time() 79 | print('Calculating Daily Statistics') 80 | # FOR CASE ALL 81 | raw_query = hp.sql_to_string('statistics/reports/daily_statistics/daily_statistics.sql') 82 | query = raw_query.format(provider_prefix=provider_prefix, 83 | start_date=start_date, 84 | end_date=end_date, 85 | imei=imei, 86 | imsi=imsi) 87 | cursor.execute(query) 88 | print('Query completed. Time elapsed: {} seconds.'.format(hp.format_two_point_time(timer, time.time()))) 89 | description = cursor.description 90 | rows = [] 91 | for row in cursor: 92 | rows.append(row) 93 | results += rows 94 | print('Writing into the graph for daily statistics') 95 | file_path = '{}/css_provider_data_stat_daily.csv'.format(output_report_location) 96 | if os.path.exists(file_path): 97 | os.remove(file_path) 98 | 99 | with open(file_path, "w", newline='') as outfile: 100 | writer = csv.writer(outfile, quoting=csv.QUOTE_NONNUMERIC) 101 | writer.writerow(col[0][4:] for col in description) 102 | for row in results: 103 | writer.writerow(row) 104 | print('Successfully wrote to file css_provider_data_stat_daily.csv') 105 | print('########## FINISHED CALCULATING DAILY STATISTICS ##########') 106 | 107 | print('########## Querying daily cdr by call_type ##########') 108 | timer = time.time() 109 | raw_sql = hp.sql_to_string('statistics/graphs/daily_cdrs_by_call_type/daily_cdrs_by_call_type.sql') 110 | query = raw_sql.format(provider_prefix=provider_prefix, 111 | start_date=start_date, 112 | end_date=end_date, 113 | imei=imei, imsi=imsi) 114 | cursor.execute(query) 115 | print('Query completed. 
Time elapsed: {} seconds.'.format(hp.format_two_point_time(timer, time.time()))) 116 | 117 | rows = [] 118 | xs_all = set([]) 119 | ys_all = [] 120 | ys_data = [] 121 | ys_voice_or_sms = [] 122 | for row in cursor: 123 | rows.append(row) 124 | xs_all.add(row[0]) 125 | xs_all = list(xs_all) 126 | xs_all.sort(key=lambda date: datetime.strptime(date, '%Y-%m-%d')) 127 | # find the day in rows and match, then extract ALL, DATA and VOICE/SMS 128 | print('Writing into the graph for daily cdr by call type') 129 | for day in xs_all: 130 | c_all = 0 131 | c_data = 0 132 | c_sms_voice = 0 133 | for row in rows: 134 | if row[0] == day: 135 | if row[1] == 'ALL': 136 | c_all += row[3] 137 | elif row[1] == 'DATA': 138 | c_data += row[3] 139 | elif row[1] in ['VOICE', 'SMS']: 140 | c_sms_voice += row[3] 141 | ys_all.append(c_all) 142 | ys_data.append(c_data) 143 | ys_voice_or_sms.append(c_sms_voice) 144 | figure = plt.figure(figsize=(14, 11)) 145 | font_dict = { 146 | 'fontsize': 21, 147 | 'fontweight': 'bold', 148 | } 149 | figure.add_subplot(111) 150 | plt.subplots_adjust(top=0.95) 151 | plt.grid(b=True) 152 | plt.plot(xs_all, ys_all) 153 | plt.plot(xs_all, ys_data) 154 | plt.plot(xs_all, ys_voice_or_sms) 155 | plt.ylabel('Total Records') 156 | plt.xticks(rotation=90) 157 | plt.xlabel('Date') 158 | plt.title('Daily CDR by call type', fontdict=font_dict) 159 | plt.legend(['ALL', 'DATA', 'VOICE and SMS'], loc='upper left') 160 | plt.savefig('{}/daily_cdr_by_call_type'.format(output_graph_location)) 161 | plt.clf() 162 | print('Graph created successfully in {}/daily_cdr_by_call_type'.format(output_graph_location)) 163 | else: 164 | print('Mapping for network_type or call_type is not sufficient. Ignored daily statistics') 165 | 166 | def calculate_monthly_statistics(self): 167 | provider_prefix = self.config.provider_prefix 168 | output_report_location = self.config.output_report_location 169 | cdr_data_layer = self.config.cdr_data_layer 170 | cursor = self.hc.cursor 171 | disable = False 172 | for item in cdr_data_layer: 173 | if str.lower(item['name']) == 'network_type' and item['output_no'] == -1 \ 174 | or str.lower(item['name']) == 'call_type' and item['output_no'] == -1: 175 | disable = True 176 | if not disable: 177 | print('########## CALCULATING MONTHLY STATISTICS ##########') 178 | results = [] 179 | file_location = '{}/css_file_data_stat.csv'.format(output_report_location) 180 | imei = "count(distinct IMEI) as unique_imei, " 181 | imsi = "count(distinct IMSI) as unique_imsi, " 182 | time_dict = hp.get_time_from_csv(file_location) 183 | start_y, start_m, end_y, end_m = time_dict['start_y'], time_dict['start_m'], \ 184 | time_dict['end_y'], time_dict['end_m'] 185 | print('### Calculating Monthly Statistics ###') 186 | # FOR CASE ALL 187 | raw_sql = hp.sql_to_string('statistics/reports/monthly_statistics/monthly_statistics.sql') 188 | query = raw_sql.format(provider_prefix=provider_prefix, 189 | start_year=start_y, 190 | end_year=end_y, 191 | start_month=start_m, 192 | end_month=end_m, 193 | imei=imei, 194 | imsi=imsi) 195 | cursor.execute(query) 196 | description = cursor.description 197 | rows = [] 198 | for row in cursor: 199 | rows.append(row) 200 | 201 | results += rows  # the loop above already consumed the cursor, so fetchall() would return nothing 202 | 203 | file_path = '{}/css_provider_data_stat_monthly.csv'.format(output_report_location) 204 | if os.path.exists(file_path): 205 | os.remove(file_path) 206 | 207 | with open(file_path, "w", newline='') as outfile: 208 | writer = csv.writer(outfile, quoting=csv.QUOTE_NONNUMERIC) 209 | writer.writerow(col[0][4:]
for col in description) 210 | for row in results: 211 | writer.writerow(row) 212 | 213 | print('### Successfully wrote to file css_provider_data_stat_monthly.csv ###') 214 | print('########## FINISHED CALCULATING MONTHLY STATISTICS ##########') 215 | else: 216 | print('Mapping for network_type or call_type is not sufficient. Ignored monthly statistics') 217 | 218 | def calculate_zone_population(self): 219 | print('########## CALCULATING ZONE POPULATION STATISTICS ##########') 220 | provider_prefix = self.config.provider_prefix 221 | output_report_location = self.config.output_report_location 222 | cursor = self.hc.cursor 223 | cdr_cell_tower = self.config.cdr_cell_tower 224 | admin_units = ['ADMIN0', 'ADMIN1', 'ADMIN2', 'ADMIN3', 'ADMIN4', 'ADMIN5'] 225 | admin_units_active = [] 226 | geo_jsons_active = [] 227 | name_columns = [] 228 | geo_json_filename = [] 229 | for col in cdr_cell_tower: 230 | if col['name'] in admin_units: 231 | admin_units_active.append(col['name']) 232 | if col['geojson_filename'] == '': 233 | geo_jsons_active.append('') 234 | else: 235 | geo_jsons_active.append(hp.json_file_to_object(col['geojson_filename'], encoding="utf-8")) 236 | name_columns.append(col['geojson_col_name']) 237 | geo_json_filename.append(col['geojson_filename']) 238 | geo_i = 0 239 | for admin_unit in admin_units_active: 240 | timer = time.time() 241 | print('Calculating zone population for {admin}'.format(admin=admin_unit)) 242 | raw_sql = hp.sql_to_string('statistics/reports/zone_population/zone_population.sql') 243 | query = raw_sql.format(provider_prefix=provider_prefix, level=admin_unit) 244 | cursor.execute(query) 245 | description = cursor.description 246 | print('Successfully calculated zone population for {admin}. Elapsed time: {time} seconds' 247 | .format(admin=admin_unit, time=hp.format_two_point_time(timer, time.time()))) 248 | timer = time.time() 249 | rows = [] 250 | for row in cursor: 251 | rows.append(row) 252 | 253 | file_path = '{output_report_location}/zone_based_aggregations_level_{level}.csv'.format( 254 | output_report_location=output_report_location, level=admin_unit) 255 | if geo_jsons_active[geo_i] != '': 256 | print('Merging dictionary object to geojson') 257 | for f in range(0, len(geo_jsons_active[geo_i]['features'])): 258 | # TODO fix mockup 259 | if geo_jsons_active[geo_i]['features'][f]['properties'][name_columns[geo_i]] == 'Kochi Ken': 260 | geo_jsons_active[geo_i]['features'][f]['properties']['num_population'] = 'Kochi Ken' 261 | print('Merging completed. Time elapsed: {} seconds' 262 | .format(hp.format_two_point_time(timer, time.time()))) 263 | timer = time.time() 264 | else: 265 | print('No geojson file input') 266 | 267 | if os.path.exists(file_path): 268 | os.remove(file_path) 269 | 270 | print('Writing zone population result to {}'.format(file_path)) 271 | with open(file_path, "w", newline='') as outfile: 272 | writer = csv.writer(outfile, quoting=csv.QUOTE_NONNUMERIC) 273 | writer.writerow(col[0] for col in description) 274 | for row in rows: 275 | writer.writerow(row) 276 | print('Writing completed. 
Time elapsed: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 277 | timer = time.time() 278 | 279 | if geo_json_filename[geo_i] != '': 280 | print('Writing into geojson file ' + geo_json_filename[geo_i][:-4] + '_joined_' + admin_unit + '.json') 281 | with open('{}/'.format(output_report_location) + geo_json_filename[geo_i][:-4] + 282 | '_joined_' + admin_unit + '.json', "w", newline='') as outfile: 283 | json.dump(geo_jsons_active[geo_i], outfile) 284 | print('Writing completed. Time elapsed: {} seconds' 285 | .format(hp.format_two_point_time(timer, time.time()))) 286 | geo_i += 1 287 | print('########## FINISHED CALCULATING ZONE POPULATION STATISTICS ##########') 288 | 289 | def calculate_user_date_histogram(self): 290 | print('########## CALCULATING USER DATE HISTOGRAM ##########') 291 | output_graph_location = self.config.output_graph_location 292 | output_report_location = self.config.output_report_location 293 | provider_prefix = self.config.provider_prefix 294 | cursor = self.hc.cursor 295 | raw_sql = hp.sql_to_string('statistics/graphs/date_histogram/histogram.sql') 296 | query = raw_sql.format(provider_prefix=provider_prefix) 297 | 298 | timer = time.time() 299 | print('Calculating data histogram') 300 | cursor.execute(query) 301 | description = cursor.description 302 | rows = cursor.fetchall() 303 | print('Calculating completed. Time elapsed: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 304 | 305 | file_path = '{}/histogram.csv'.format(output_report_location) 306 | print('Writing into {}'.format(file_path)) 307 | if os.path.exists(file_path): 308 | os.remove(file_path) 309 | 310 | with open(file_path, "w", newline='') as outfile: 311 | writer = csv.writer(outfile, quoting=csv.QUOTE_NONNUMERIC) 312 | writer.writerow(col[0] for col in description) 313 | for row in rows: 314 | writer.writerow(row) 315 | print('Writing completed.') 316 | 317 | xs = [] 318 | ys = [] 319 | for row in rows: 320 | json_data = hp.string_to_json(row[0]) 321 | xs.append(json_data['x']) 322 | ys.append(json_data['y']) 323 | 324 | plt.subplots_adjust(left=0.15) 325 | plt.bar(xs, ys, align='center') # A bar chart 326 | plt.xlabel('Active Day Bins') 327 | plt.ylabel('Count No. Unique Ids') 328 | print('Plotting graph and writing into {}/user_data_histogram.png'.format(output_graph_location)) 329 | plt.savefig('{}/user_data_histogram.png'.format(output_graph_location)) 330 | print('Done.') 331 | print('########## CALCULATING USER DATE HISTOGRAM ##########') 332 | 333 | def calculate_summary(self): 334 | output_report_location = self.config.output_report_location 335 | provider_prefix = self.config.provider_prefix 336 | cdr_cell_tower = self.config.cdr_cell_tower 337 | cdr_data_layer = self.config.cdr_data_layer 338 | print('########## CALCULATING SUMMARY ##########') 339 | cursor = self.hc.cursor 340 | tb_1_description = ('All Data', 'Value') 341 | tb_2_description = ('Statistics',) 342 | output_1_rows = [] 343 | 344 | print('Calculating total records') 345 | timer = time.time() 346 | raw_sql = hp.sql_to_string('statistics/total_records.sql') 347 | q_total_records = raw_sql.format(provider_prefix=provider_prefix) 348 | cursor.execute(q_total_records) 349 | 350 | des = cursor.description 351 | row_total_records = cursor.fetchall() 352 | row_total_records = (des[0][0], row_total_records[0][0]) 353 | output_1_rows.append(row_total_records) 354 | total_records = row_total_records[1] 355 | print('Successfully calculated total records. 
Total records: {recs} records \nElapsed time: {time} seconds' 356 | .format(recs=total_records, time=hp.format_two_point_time(timer, time.time()))) 357 | timer = time.time() 358 | 359 | print('Calculating total unique uids') 360 | raw_sql = hp.sql_to_string('statistics/total_unique_uids.sql') 361 | q_total_uids = raw_sql.format(provider_prefix=provider_prefix) 362 | cursor.execute(q_total_uids) 363 | 364 | des = cursor.description 365 | row_total_uids = cursor.fetchall() 366 | row_total_uids = (des[0][0], row_total_uids[0][0]) 367 | output_1_rows.append(row_total_uids) 368 | total_uids = row_total_uids[1] 369 | print('Successfully calculated total unique uids. Total unique ids: {ids} ids \nElapsed time: {time} seconds' 370 | .format(ids=total_uids, time=hp.format_two_point_time(timer, time.time()))) 371 | timer = time.time() 372 | raw_sql = hp.sql_to_string('statistics/reports/summary/total_days.sql') 373 | print('Calculating total days') 374 | query = raw_sql.format(provider_prefix=provider_prefix) 375 | cursor.execute(query) 376 | des = cursor.description 377 | row_total_days = cursor.fetchall() 378 | 379 | total_days = row_total_days[0][0] 380 | start_yyyy_mm_dd = row_total_days[0][1].split('-') 381 | end_yyyy_mm_dd = row_total_days[0][2].split('-') 382 | 383 | start_day = start_yyyy_mm_dd[2] 384 | start_month = start_yyyy_mm_dd[1] 385 | start_year = start_yyyy_mm_dd[0] 386 | 387 | end_day = end_yyyy_mm_dd[2] 388 | end_month = end_yyyy_mm_dd[1] 389 | end_year = end_yyyy_mm_dd[0] 390 | 391 | if int(total_days) == 0: 392 | row_total_days = (des[0][0], row_total_days[0][0]) 393 | elif int(total_days) == 1: 394 | row_total_days = (des[0][0], 395 | str(row_total_days[0][0]) + ' ({} {} {})'.format(int(start_day), months[int(start_month)], 396 | start_year)) 397 | elif int(total_days) >= 2: 398 | if start_year == end_year: 399 | # same year 400 | if start_month == end_month: 401 | # no same day because it is gonna be total_days 1, which is done above 402 | row_total_days = (des[0][0], str(row_total_days[0][0]) + 403 | ' ({}-{} {} {})'.format(int(start_day), int(end_day), 404 | months[int(start_month)], start_year)) 405 | else: 406 | # for different months, same or different day will also be outputted 407 | row_total_days = (des[0][0], str(row_total_days[0][0]) + 408 | ' ({} {}-{} {} {})'.format(int(start_day), months[int(start_month)], 409 | int(end_day), months[int(end_month)], start_year)) 410 | 411 | else: 412 | # for the more-than-one-year case, everything is displayed 413 | row_total_days = (des[0][0], str(row_total_days[0][0]) + 414 | ' ({} {} {}-{} {} {})'.format(int(start_day), months[int(start_month)], 415 | start_year, int(end_day), months[int(end_month)], 416 | end_year)) 417 | 418 | output_1_rows.append(row_total_days) 419 | print('Successfully calculated total days. Total days: {days} \nElapsed time: {time} seconds' 420 | .format(days=row_total_days[1], time=hp.format_two_point_time(timer, time.time()))) 421 | 422 | # average usage per day 423 | print('Calculating average daily usage') 424 | output_2_rows = [] 425 | row_avg_daily_usage = ('average_usage_per_day', round(float(total_records / total_days), 3)) 426 | output_2_rows.append(row_avg_daily_usage) 427 | print('Successfully calculated average daily usage. 
Daily average usages : {uses} ' 428 | '\nElapsed time: {time} seconds' 429 | .format(uses=row_avg_daily_usage[1], time=hp.format_two_point_time(timer, time.time()))) 430 | timer = time.time() 431 | # avg voice call per day 432 | 433 | disable = False 434 | for item in cdr_data_layer: 435 | if str.lower(item['name']) == 'network_type' and item['output_no'] == -1 \ 436 | or str.lower(item['name']) == 'call_type' and item['output_no'] == -1: 437 | disable = True 438 | 439 | if not disable: 440 | print('########## Calculating average daily voice call usage ##########') 441 | raw_sql = hp.sql_to_string('statistics/reports/summary/average_daily_voice.sql') 442 | q_avg_daily_voice = raw_sql.format( 443 | provider_prefix=provider_prefix, total_days=total_days) 444 | cursor.execute(q_avg_daily_voice) 445 | des = cursor.description 446 | row_avg_daily_voice = cursor.fetchall() 447 | row_avg_daily_voice = (des[0][0], round(row_avg_daily_voice[0][0], 3)) 448 | output_2_rows.append(row_avg_daily_voice) 449 | print('Successfully calculated average daily voice call usage. Daily average voice call usages : {uses} ' 450 | '\nElapsed time: {time} seconds' 451 | .format(uses=row_avg_daily_voice[1], time=hp.format_two_point_time(timer, time.time()))) 452 | timer = time.time() 453 | # avg sms per day 454 | print('Calculating average daily sms usage') 455 | raw_sql = hp.sql_to_string('statistics/reports/summary/average_daily_sms.sql') 456 | q_avg_daily_sms = raw_sql.format(provider_prefix=provider_prefix, total_days=total_days) 457 | cursor.execute(q_avg_daily_sms) 458 | des = cursor.description 459 | row_avg_daily_sms = cursor.fetchall() 460 | row_avg_daily_sms = (des[0][0], round(row_avg_daily_sms[0][0], 3)) 461 | output_2_rows.append(row_avg_daily_sms) 462 | print('########## Successfully calculated average daily sms usage. ' 463 | 'Daily average sms usages : {uses} ##########' 464 | '\n########## Elapsed time: {time} seconds ##########' 465 | .format(uses=row_avg_daily_sms[1], time=hp.format_two_point_time(timer, time.time()))) 466 | timer = time.time() 467 | else: 468 | print('call_type or network_type mapping is not complete. Ignored daily usage of sms and voice call') 469 | 470 | # avg unique cell id 471 | disable = False 472 | for item in cdr_data_layer: 473 | if str.lower(item['name']) == 'cell_id' and item['output_no'] == -1: 474 | disable = True 475 | 476 | if not disable: 477 | print('Calculating average daily unique cell id') 478 | raw_sql = hp.sql_to_string('statistics/reports/summary/average_unique_cell_ids.sql') 479 | query = raw_sql.format(provider_prefix=provider_prefix, total_days=total_days) 480 | cursor.execute(query) 481 | 482 | des = cursor.description 483 | row_avg_daily_unique_cell_id = cursor.fetchall() 484 | row_avg_daily_unique_cell_id = (des[0][0], round(row_avg_daily_unique_cell_id[0][0], 3)) 485 | output_2_rows.append(row_avg_daily_unique_cell_id) 486 | print('Successfully calculated average daily unique cell id') 487 | print('Successfully calculated average daily unique cell id.' 
488 | '\nElapsed time: {time} seconds' 489 | .format(time=hp.format_two_point_time(timer, time.time()))) 490 | timer = time.time() 491 | have_district = False 492 | for col in cdr_cell_tower: 493 | if str.lower(col['name']) == 'admin1': 494 | have_district = True 495 | if have_district: 496 | print('Calculating average daily administration level 1') 497 | raw_sql = hp.sql_to_string('statistics/reports/summary/average_daily_admin1.sql') 498 | query = raw_sql.format(provider_prefix=provider_prefix, level='ADMIN1', total_days=total_days) 499 | cursor.execute(query) 500 | 501 | des = cursor.description 502 | row_avg_daily_district = cursor.fetchall() 503 | print(row_avg_daily_district) 504 | row_avg_daily_district = (des[0][0], round(row_avg_daily_district[0][0], 3)) 505 | output_2_rows.append(row_avg_daily_district) 506 | print('Successfully calculated average daily administration level 1. Daily average value : {dists} ' 507 | '\nElapsed time: {time} seconds' 508 | .format(dists=row_avg_daily_district[1], time=hp.format_two_point_time(timer, time.time()))) 509 | timer = time.time() 510 | 511 | else: 512 | print('Skipped due to incomplete cell_id data') 513 | 514 | print('Recording to summary_stats') 515 | with open("{}/summary_stats.csv".format(output_report_location), "w", newline='') as outfile: 516 | writer = csv.writer(outfile, quoting=csv.QUOTE_NONNUMERIC) 517 | writer.writerow(tb_1_description) 518 | for row in output_1_rows: 519 | writer.writerow(row) 520 | 521 | writer.writerow('\n') 522 | 523 | writer.writerow(tb_2_description) 524 | for row in output_2_rows: 525 | writer.writerow(row) 526 | 527 | print('Successfully wrote to summary_stats.csv\nElapsed time: {time} seconds' 528 | .format(time=hp.format_two_point_time(timer, time.time()))) 529 | 530 | print('########## FINISHED CALCULATING SUMMARY ##########') 531 | 532 | def daily_cdrs(self): 533 | timer = time.time() 534 | output_graph_location = self.config.output_graph_location 535 | provider_prefix = self.config.provider_prefix 536 | cursor = self.hc.cursor 537 | print('########## Daily cdrs ##########') 538 | print('Selecting total records') 539 | raw_sql = hp.sql_to_string('statistics/total_records.sql') 540 | query = raw_sql.format(provider_prefix=provider_prefix) 541 | cursor.execute(query) 542 | 543 | des = cursor.description 544 | row_total_records = cursor.fetchall() 545 | row_total_records = (des[0][0], row_total_records[0][0]) 546 | total_records = row_total_records[1] 547 | print('Successfully calculated total records. 
Total records: {recs} records \nElapsed time: {time} seconds' 548 | .format(recs=total_records, time=hp.format_two_point_time(timer, time.time()))) 549 | timer = time.time() 550 | raw_sql = hp.sql_to_string('statistics/graphs/daily_cdrs/total_daily_cdrs.sql') 551 | q_total_daily_cdr = raw_sql.format(provider_prefix=provider_prefix) 552 | cursor.execute(q_total_daily_cdr) 553 | row_total_daily_cdr = cursor.fetchall() 554 | print('Query done' 555 | '\nElapsed time: {time} seconds' 556 | .format(time=hp.format_two_point_time(timer, time.time()))) 557 | timer = time.time() 558 | total_daily_cdr_x = [] 559 | total_daily_cdr_y = [] 560 | for row in row_total_daily_cdr: 561 | total_daily_cdr_x.append(row[0]) 562 | total_daily_cdr_y.append(row[1]) 563 | 564 | print('Querying min, max and avg of total records') 565 | q_total_daily_cdr_all = "select min(total_records), max(total_records), avg(total_records) from ({}) td"\ 566 | .format(q_total_daily_cdr) 567 | cursor.execute(q_total_daily_cdr_all) 568 | 569 | row_total_daily_cdr_all = cursor.fetchall() 570 | 571 | daily_cdr_min, daily_cdr_max, daily_cdr_avg = row_total_daily_cdr_all[0][0],\ 572 | row_total_daily_cdr_all[0][1], row_total_daily_cdr_all[0][2] 573 | print('Done.\nElapsed time: {time} seconds'.format(time=hp.format_two_point_time(timer, time.time()))) 574 | print('Writing into the graph for daily cdrs') 575 | hp.make_graph(total_daily_cdr_x, 'Day', total_daily_cdr_y, 'Total Records', 'Daily CDRs', 576 | '{}/daily_cdrs'.format(output_graph_location), 577 | des_pair_1={'text_x': 0.090, 'text_y': 1.27, 'text': 'MIN', 'value': f"{daily_cdr_min:,.2f}"}, 578 | des_pair_2={'text_x': 0.345, 'text_y': 1.27, 'text': 'MAX', 'value': f"{daily_cdr_max:,.2f}"}, 579 | des_pair_3={'text_x': 0.595, 'text_y': 1.27, 'text': 'AVG', 'value': f"{daily_cdr_avg:,.2f}"}, 580 | des_pair_4={'text_x': 0.83, 'text_y': 1.27, 'text': 'Total Records', 581 | 'value': f"{total_records:,.2f}"}) 582 | print( 583 | '########## Writing completed. File located in {}/daily_cdrs ##########'.format(output_graph_location)) 584 | 585 | def daily_unique_users(self): 586 | output_graph_location = self.config.output_graph_location 587 | provider_prefix = self.config.provider_prefix 588 | cursor = self.hc.cursor 589 | print('########## Daily unique users ###########') 590 | print('Calculating total unique uids') 591 | raw_sql = hp.sql_to_string('statistics/total_unique_uids.sql') 592 | q_total_uids = raw_sql.format(provider_prefix=provider_prefix) 593 | timer = time.time() 594 | cursor.execute(q_total_uids) 595 | des = cursor.description 596 | row_total_uids = cursor.fetchall() 597 | row_total_uids = (des[0][0], row_total_uids[0][0]) 598 | total_uids = row_total_uids[1] 599 | print('Successfully calculated total unique uids. Total unique ids: {ids} ids \nElapsed time: {time} seconds' 600 | .format(ids=total_uids, time=hp.format_two_point_time(timer, time.time()))) 601 | print('Quering date and unique users') 602 | timer = time.time() 603 | raw_sql = hp.sql_to_string('statistics/graphs/daily_unique_users/total_daily_uids.sql') 604 | q_total_daily_uid = raw_sql.format(provider_prefix=provider_prefix) 605 | cursor.execute(q_total_daily_uid) 606 | row_total_daily_uid = cursor.fetchall() 607 | print('Query completed. 
Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 608 | timer = time.time() 609 | total_daily_uid_x = [] 610 | total_daily_uid_y = [] 611 | for row in row_total_daily_uid: 612 | total_daily_uid_x.append(row[0]) 613 | total_daily_uid_y.append(row[1]) 614 | print('Selecing min, max and avg of total users') 615 | q_total_daily_uid_all = "select min(total_users), max(total_users), avg(total_users) from ({}) td".format( 616 | q_total_daily_uid) 617 | cursor.execute(q_total_daily_uid_all) 618 | 619 | row_total_daily_uid_all = cursor.fetchall() 620 | daily_uid_min, daily_uid_max, daily_uid_avg = row_total_daily_uid_all[0][0], \ 621 | row_total_daily_uid_all[0][1], row_total_daily_uid_all[0][2] 622 | print('Query completed. Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 623 | 624 | print('Writing into the graph for daily unique users') 625 | hp.make_graph(total_daily_uid_x, 'Date', total_daily_uid_y, 'Total Users', 'Daily Unique Users', 626 | '{}/daily_unique_users'.format(output_graph_location), 627 | des_pair_1={'text_x': 0.090, 'text_y': 1.27, 'text': 'MIN', 'value': f"{daily_uid_min:,.2f}"}, 628 | des_pair_2={'text_x': 0.345, 'text_y': 1.27, 'text': 'MAX', 'value': f"{daily_uid_max:,.2f}"}, 629 | des_pair_3={'text_x': 0.595, 'text_y': 1.27, 'text': 'AVG', 'value': f"{daily_uid_avg:,.2f}"}, 630 | des_pair_4={'text_x': 0.805, 'text_y': 1.27, 'text': 'Total Unique IDs', 631 | 'value': f"{total_uids:,.2f}"}) 632 | print('########## Writing completed. File located in {}/daily_unique_users ##########' 633 | .format(output_graph_location)) 634 | 635 | def daily_unique_locations(self): 636 | timer = time.time() 637 | output_graph_location = self.config.output_graph_location 638 | provider_prefix = self.config.provider_prefix 639 | cursor = self.hc.cursor 640 | print('########## Daily unique locations ##########') 641 | print('Calculating daily average location name') 642 | raw_sql = hp.sql_to_string('statistics/graphs/daily_unique_locations/total_unique_locations.sql') 643 | query = raw_sql.format(provider_prefix=provider_prefix) 644 | 645 | cursor.execute(query) 646 | des = cursor.description 647 | row_total_locations = cursor.fetchall() 648 | row_total_locations = (des[0][0], row_total_locations[0][0]) 649 | total_unique_locations = row_total_locations[1] 650 | print('Successfully calculated daily average location name. Daily average location names : {locs} ' 651 | '\nElapsed time: {time} seconds' 652 | .format(locs=row_total_locations[1], time=hp.format_two_point_time(timer, time.time()))) 653 | timer = time.time() 654 | 655 | print('Querying daily unique locations') 656 | raw_sql = hp.sql_to_string('statistics/graphs/daily_unique_locations/daily_unique_locations.sql') 657 | q_total_daily_locations = raw_sql.format(provider_prefix=provider_prefix) 658 | cursor.execute(q_total_daily_locations) 659 | row_total_daily_locations = cursor.fetchall() 660 | print('Query completed. 
Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 661 | timer = time.time() 662 | 663 | total_daily_location_x = [] 664 | total_daily_location_y = [] 665 | for row in row_total_daily_locations: 666 | total_daily_location_x.append(row[0]) 667 | total_daily_location_y.append(row[1]) 668 | print('Selecing min, max and avg of unique locations') 669 | q_total_daily_location_all = "select min(unique_locations), max(unique_locations), avg(unique_locations) " \ 670 | "from ({}) td".format(q_total_daily_locations) 671 | cursor.execute(q_total_daily_location_all) 672 | 673 | row_total_daily_location_all = cursor.fetchall() 674 | daily_location_min, daily_location_max, daily_location_avg = row_total_daily_location_all[0][0],\ 675 | row_total_daily_location_all[0][1], row_total_daily_location_all[0][2] 676 | print('Query completed. Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 677 | 678 | print('Writing into the graph for daily unique locations') 679 | hp.make_graph(total_daily_location_x, 'Date', total_daily_location_y, 'Total Locations', 680 | 'Daily Unique Locations', '{}/daily_unique_locations'.format(output_graph_location), 681 | des_pair_1={'text_x': 0.090, 'text_y': 1.27, 'text': 'MIN', 682 | 'value': f"{daily_location_min:,.2f}"}, 683 | des_pair_2={'text_x': 0.345, 'text_y': 1.27, 'text': 'MAX', 684 | 'value': f"{daily_location_max:,.2f}"}, 685 | des_pair_3={'text_x': 0.595, 'text_y': 1.27, 'text': 'AVG', 686 | 'value': f"{daily_location_avg:,.2f}"}, 687 | des_pair_4={'text_x': 0.805, 'text_y': 1.27, 'text': 'Total Unique Locations', 688 | 'value': f"{total_unique_locations:,.2f}"}) 689 | print('########## Writing completed. File located in {}/daily_unique_locations ###########' 690 | .format(output_graph_location)) 691 | 692 | def daily_average_cdrs(self): 693 | output_graph_location = self.config.output_graph_location 694 | provider_prefix = self.config.provider_prefix 695 | cursor = self.hc.cursor 696 | timer = time.time() 697 | print('########## Daily Average CDRs ##########') 698 | print('Querying for average cdr and total unique users') 699 | raw_sql = hp.sql_to_string('statistics/graphs/daily_average_cdrs/daily_average_cdrs.sql') 700 | q_total_daily_avg_cdr = raw_sql.format(provider_prefix=provider_prefix) 701 | cursor.execute(q_total_daily_avg_cdr) 702 | row_total_daily_avg_cdr = cursor.fetchall() 703 | print('Query completed. Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 704 | timer = time.time() 705 | 706 | total_daily_avg_cdr_x = [] 707 | total_daily_avg_cdr_y = [] 708 | for row in row_total_daily_avg_cdr: 709 | total_daily_avg_cdr_x.append(row[0]) 710 | total_daily_avg_cdr_y.append(row[1]) 711 | 712 | print('Querying for average daily cdrs') 713 | q_total_daily_location_all = "select avg(daily_average_cdr) from ({}) td".format( 714 | q_total_daily_avg_cdr) 715 | cursor.execute(q_total_daily_location_all) 716 | 717 | row_total_daily_avg_cdr_all = cursor.fetchall() 718 | print('Query completed. 
Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 719 | daily_avg_cdr = row_total_daily_avg_cdr_all[0][0] 720 | print('########## Writing into the graph for daily average CDRs ##########') 721 | hp.make_graph(total_daily_avg_cdr_x, 'Date', total_daily_avg_cdr_y, 'Total Daily Average CDRs', 722 | 'Daily Average CDRs', '{}/daily_avg_cdr'.format(output_graph_location), 723 | des_pair_1={'text_x': 0.035, 'text_y': 1.27, 'text': 'Total Daily Avg CDRs', 724 | 'value': f"{daily_avg_cdr:,.2f}"}) 725 | 726 | def daily_unique_average_locations(self): 727 | print('########## Daily unique average locations ##########') 728 | output_graph_location = self.config.output_graph_location 729 | provider_prefix = self.config.provider_prefix 730 | cursor = self.hc.cursor 731 | disable = False 732 | for item in self.config.cdr_data_layer: 733 | if str.lower(item['name']) == 'cell_id' and item['output_no'] == -1 \ 734 | or str.lower(item['name']) == 'call_time' and item['output_no'] == -1: 735 | disable = True 736 | if not disable: 737 | print('Querying daily average cell ids and daily average locations') 738 | timer = time.time() 739 | raw_sql = hp.sql_to_string('statistics/graphs/daily_average_unique_locations/' 740 | 'daily_average_unique_locations.sql') 741 | q_total_daily_avg_locations = raw_sql.format(provider_prefix=provider_prefix) 742 | cursor.execute(q_total_daily_avg_locations) 743 | row_total_daily_avg_locations = cursor.fetchall() 744 | print('Query completed. Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 745 | timer = time.time() 746 | total_daily_avg_location_x = [] 747 | total_daily_avg_location_y = [] 748 | for row in row_total_daily_avg_locations: 749 | total_daily_avg_location_x.append(row[0]) 750 | total_daily_avg_location_y.append(row[1]) 751 | print('Querying for average daily avg cell_id and locations') 752 | q_total_daily_avg_location_all = "select avg(td.daily_avg_cell_ids), avg(td.daily_avg_locations) " \ 753 | "from ({}) td".format(q_total_daily_avg_locations) 754 | cursor.execute(q_total_daily_avg_location_all) 755 | row_total_daily_location_all = cursor.fetchall() 756 | print('Query completed. Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 757 | daily_avg_location_cell_ids, daily_avg_location = row_total_daily_location_all[0][0],\ 758 | row_total_daily_location_all[0][1] 759 | 760 | print('Writing into the graph for daily unique average locations') 761 | hp.make_graph(total_daily_avg_location_x, 'Date', total_daily_avg_location_y, 'Total Unique Locations', 762 | 'Daily Unique Average Locations', 763 | '{}/daily_unique_avg_locations'.format(output_graph_location), 764 | des_pair_1={'text_x': 0.00, 'text_y': 1.27, 'text': 'Avg Daily Unique Cell IDs ', 765 | 'value': f"{daily_avg_location_cell_ids:,.2f}"}, 766 | des_pair_2={'text_x': 0.28, 'text_y': 1.27, 'text': 'Avg Daily Unique Locations', 767 | 'value': f"{daily_avg_location:,.2f}"}) 768 | print('########## Writing completed. File located in {}/daily_unique_avg_locations ##########' 769 | .format(output_graph_location)) 770 | else: 771 | print('call_time or cell_id is in incorrect form. 
Ignored output.') 772 | 773 | def frequent_locations(self): 774 | frequent_locations_percentage = self.config.frequent_locations_percentage 775 | provider_prefix = self.config.provider_prefix 776 | cursor = self.hc.cursor 777 | print('########## CREATE FREQUENT LOCATION TABLE ##########') 778 | print('Checking and dropping frequent location table if existing.') 779 | timer = time.time() 780 | admin = self.config.od_admin_unit 781 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_frequent_locations' 782 | .format(provider_prefix=provider_prefix)) 783 | print('Checked and dropped frequent location table if existing. Elapsed time: {} seconds'.format( 784 | hp.format_two_point_time(timer, time.time()))) 785 | timer = time.time() 786 | print('Creating frequent location table') 787 | raw_sql = hp.sql_to_string('statistics/reports/frequent_locations/create_frequent_locations.sql') 788 | query = raw_sql.format(provider_prefix=provider_prefix, admin_params=admin + '_id string') 789 | 790 | cursor.execute(query) 791 | print('Created frequent location table. Elapsed time: {} seconds' 792 | .format(hp.format_two_point_time(timer, time.time()))) 793 | timer = time.time() 794 | print('Inserting into frequent location table') 795 | raw_sql = hp.sql_to_string('statistics/reports/frequent_locations/frequent_locations.sql') 796 | query = raw_sql.format(provider_prefix=provider_prefix, admin_params=admin + '_id', admin=admin) 797 | 798 | cursor.execute(query) 799 | print('Inserted into frequent location table.\nResult are in the table named {provider_prefix}_' 800 | 'frequent_locations\nElapsed time: {time} seconds. ' 801 | .format(provider_prefix=provider_prefix, time=hp.format_two_point_time(timer, time.time()))) 802 | timer = time.time() 803 | print('Dropping freq location with accumulated percentage') 804 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_freq_with_acc_wsum' 805 | .format(provider_prefix=provider_prefix)) 806 | print('Checked and dropped frequent location table with accumulated percentage if existing. ' 807 | 'Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 808 | timer = time.time() 809 | print('Creating and insert freq with acc wsum Table (Frequent Locations) with accumulated percentage') 810 | raw_sql = hp.sql_to_string('statistics/reports/frequent_locations/frequent_locations_wsum.sql') 811 | query = raw_sql.format(provider_prefix=provider_prefix, admin=admin) 812 | cursor.execute(query) 813 | print('Inserted into frequent locations table with accumulated percentage. ' 814 | '\nElapsed time: {time} seconds. '.format(time=hp.format_two_point_time(timer, time.time()))) 815 | timer = time.time() 816 | print('Dropping frequent locations thresholded table') 817 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_frequent_locations_thresholded' 818 | .format(provider_prefix=provider_prefix)) 819 | print('Checked and dropped frequent locations table with accumulated percentage if existing.' 820 | 'Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 821 | timer = time.time() 822 | print('Creating and insert frequent locations thresholded table ') 823 | raw_sql = hp.sql_to_string('statistics/reports/frequent_locations/frequent_locations_thresholded.sql') 824 | query = raw_sql.format(provider_prefix=provider_prefix, admin=admin, threshold=frequent_locations_percentage) 825 | cursor.execute(query) 826 | print('Inserted into frequent locations thresholded table. \nElapsed time: {time} seconds. 
' 827 | .format(time=hp.format_two_point_time(timer, time.time()))) 828 | print('########## FINISHED CREATING FREQUENT LOCATIONS TABLE ##########') 829 | 830 | def frequent_locations_night(self): 831 | frequent_locations_percentage = self.config.frequent_locations_percentage 832 | provider_prefix = self.config.provider_prefix 833 | cursor = self.hc.cursor 834 | print('########## CREATE FREQUENT LOCATIONS NIGHT TABLE ##########') 835 | print('Checking and dropping frequent locations night table if existing.') 836 | timer = time.time() 837 | admin = self.config.od_admin_unit 838 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_frequent_locations_night' 839 | .format(provider_prefix=provider_prefix)) 840 | print('Checked and dropped frequent locations night table if existing. Elapsed time: {} seconds' 841 | .format(hp.format_two_point_time(timer, time.time()))) 842 | timer = time.time() 843 | 844 | print('Creating frequent locations night table') 845 | raw_sql = hp.sql_to_string('statistics/reports/frequent_locations/create_frequent_locations_night.sql') 846 | query = raw_sql.format(provider_prefix=provider_prefix, admin_params=admin + '_id string') 847 | cursor.execute(query) 848 | 849 | print('Created frequent locations night table. Elapsed time: {} seconds' 850 | .format(hp.format_two_point_time(timer, time.time()))) 851 | timer = time.time() 852 | print('Inserting into frequent locations night table') 853 | raw_sql = hp.sql_to_string('statistics/reports/frequent_locations/frequent_locations_night.sql') 854 | query = raw_sql.format(provider_prefix=provider_prefix, admin_params=admin + '_id', admin=admin) 855 | cursor.execute(query) 856 | print('Inserted into frequent locations night table.\n' 857 | 'Result are in the table named {provider_prefix}_frequent_locations_night\nElapsed time: {time} seconds. ' 858 | .format(provider_prefix=provider_prefix, time=hp.format_two_point_time(timer, time.time()))) 859 | timer = time.time() 860 | print('Dropping freq location night with accumulated percentage') 861 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_freq_with_acc_wsum_night' 862 | .format(provider_prefix=provider_prefix)) 863 | print( 864 | 'Checked and dropped frequent locations night table with accumulated percentage if existing. ' 865 | 'Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 866 | timer = time.time() 867 | print('Creating and insert freq night with acc wsum Table ' 868 | '(Frequent Locations Night) with accumulated percentage') 869 | raw_sql = hp.sql_to_string('statistics/reports/frequent_locations/frequent_locations_wsum_night.sql') 870 | query = raw_sql.format(provider_prefix=provider_prefix, admin=admin) 871 | cursor.execute(query) 872 | print( 873 | 'Inserted into frequent locations night table with accumulated percentage. \nElapsed time: {time} seconds. ' 874 | .format(time=hp.format_two_point_time(timer, time.time()))) 875 | timer = time.time() 876 | print('Dropping frequent locations thresholded night table') 877 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_frequent_locations_thresholded_night' 878 | .format(provider_prefix=provider_prefix)) 879 | print('Checked and dropped frequent locations night table with accumulated percentage if existing. 
' 880 | 'Elapsed time: {} seconds'.format(hp.format_two_point_time(timer, time.time()))) 881 | timer = time.time() 882 | print('Creating and insert frequent locations thresholded night table ') 883 | raw_sql = hp.sql_to_string('statistics/reports/frequent_locations/frequent_locations_thresholded_night.sql') 884 | query = raw_sql.format(provider_prefix=provider_prefix, admin=admin, threshold=frequent_locations_percentage) 885 | cursor.execute(query) 886 | print('Inserted into frequent locations thresholded night table. \nElapsed time: {time} seconds. ' 887 | .format(time=hp.format_two_point_time(timer, time.time()))) 888 | print('########## FINISHED CREATING FREQUENT LOCATIONS NIGHT TABLE ##########') 889 | 890 | def rank1_frequent_locations(self): 891 | provider_prefix = self.config.provider_prefix 892 | cursor = self.hc.cursor 893 | print('########## CREATE RANK 1 FREQUENT LOCATIONS TABLE ##########') 894 | admin = self.config.od_admin_unit 895 | create_param = admin + '_id string' 896 | timer = time.time() 897 | print('Checking and dropping rank 1 frequent locations table if existing.') 898 | cursor.execute( 899 | 'DROP TABLE IF EXISTS {provider_prefix}_la_cdr_uid_home'.format(provider_prefix=provider_prefix)) 900 | print('Checked and dropped rank 1 frequent locations table if existing. Elapsed time: {} seconds'.format( 901 | hp.format_two_point_time(timer, time.time()))) 902 | timer = time.time() 903 | print('Creating rank 1 frequent locations table') 904 | raw_sql = hp.sql_to_string('origin_destination/create_la_cdr_uid_home.sql') 905 | query = raw_sql.format(provider_prefix=provider_prefix, admin_params=create_param) 906 | cursor.execute(query) 907 | print('Created rank 1 frequent locations table. Elapsed time: {} seconds'.format( 908 | hp.format_two_point_time(timer, time.time()))) 909 | timer = time.time() 910 | print('Inserting into rank 1 frequent locations table') 911 | raw_sql = hp.sql_to_string('origin_destination/insert_la_cdr_uid_home.sql') 912 | query = raw_sql.format(provider_prefix=provider_prefix) 913 | cursor.execute(query) 914 | print('Inserted into rank 1 frequent locations table (located in {provider_prefix}_la_cdr_uid_home). 
' 915 | 'Elapsed time: {time} seconds' 916 | .format(provider_prefix=provider_prefix, time=hp.format_two_point_time(timer, time.time()))) 917 | print('########## FINISHED CREATING RANK 1 FREQUENT LOCATIONS TABLE ##########') 918 | -------------------------------------------------------------------------------- /Common/config_object.py: -------------------------------------------------------------------------------- 1 | from Common.helper import json_file_to_object 2 | 3 | 4 | class Config: 5 | def __init__(self, config_file): 6 | self.__dict__ = json_file_to_object(config_file) 7 | -------------------------------------------------------------------------------- /Common/helper.py: -------------------------------------------------------------------------------- 1 | import json 2 | import matplotlib.pyplot as plt 3 | import csv 4 | import pandas 5 | from matplotlib.widgets import TextBox 6 | from codecs import open 7 | 8 | 9 | mandatory_columns = [["UID", "IMEI", "IMSI", "CALL_TIME", "DURATION", 10 | "CALL_TYPE", "NETWORK_TYPE", "CELL_ID", "LATITUDE", "LONGITUDE"], 11 | ['CELL_ID', 'LATITUDE', 'LONGITUDE', 'ADMIN0', 12 | 'ADMIN1', 'ADMIN2', 'ADMIN3', 'ADMIN4', 'ADMIN5']] 13 | 14 | 15 | def json_file_to_object(json_file, encoding=''): 16 | if encoding == '': 17 | with open(json_file) as jf: 18 | return json.load(jf) 19 | else: 20 | with open(json_file, encoding=encoding) as jf: 21 | return json.load(jf) 22 | 23 | 24 | def string_to_json(str_in): 25 | return json.loads(str_in) 26 | 27 | 28 | def sql_to_string(filename): 29 | path = "queries/" + filename 30 | sql = open(path, mode='r', encoding='utf-8-sig').read() 31 | return sql 32 | 33 | 34 | def get_admin_units_from_mapping(cell_tower_mapping): 35 | admin_units = [] 36 | admins = ['admin0', 'admin1', 'admin2', 'admin3', 'admin4', 'admin5'] 37 | admins.reverse() 38 | for row in cell_tower_mapping: 39 | for admin in admins: 40 | if row['output_no'] != -1 and str.lower(row['name']) == admin: 41 | admin_units.append(row['name']) 42 | 43 | print('Result admin units = {}'.format(', '.join(admin_units))) 44 | return admin_units 45 | 46 | 47 | def format_two_point_time(start, end): 48 | return round(end - start, 2) 49 | 50 | 51 | def get_time_from_csv(file_loc): 52 | with open(file_loc) as csv_file: 53 | csv_reader = csv.reader(csv_file, delimiter=',') 54 | line_count = 0 55 | for row in csv_reader: 56 | if line_count == 1: 57 | start_date = row[6] 58 | end_date = row[7] 59 | break 60 | line_count += 1 61 | 62 | start_date = pandas.Timestamp(start_date) 63 | start_m = start_date.month 64 | start_y = start_date.year 65 | end_date = pandas.Timestamp(end_date) 66 | end_m = end_date.month 67 | end_y = end_date.year 68 | 69 | print(start_date, start_m, start_y, end_date, end_m, end_y) 70 | print(start_date, end_date) 71 | 72 | result = dict() 73 | result['start_date'] = start_date 74 | result['start_m'] = start_m 75 | result['start_y'] = start_y 76 | result['end_date'] = end_date 77 | result['end_m'] = end_date.month 78 | result['end_y'] = end_date.year 79 | 80 | return result 81 | 82 | 83 | def make_graph(xs, x_label, ys, y_label, header, filename, des_pair_1=None, 84 | des_pair_2=None, des_pair_3=None, des_pair_4=None): 85 | figure = plt.figure(figsize=(14, 11)) 86 | 87 | font_dict = { 88 | 'fontsize': 21, 89 | 'fontweight': 'bold', 90 | } 91 | 92 | ax = figure.add_subplot(111) 93 | plt.title(header, fontdict=font_dict) 94 | plt.subplots_adjust(top=0.75) 95 | plt.grid(b=True) 96 | plt.plot(xs, ys) 97 | plt.ylabel(y_label) 98 | plt.xticks(rotation=90) 99 
| plt.xlabel(x_label) 100 | 101 | if des_pair_1 is not None: 102 | plt.text(des_pair_1['text_x'], des_pair_1['text_y'], des_pair_1['text'], transform=ax.transAxes) 103 | axbox = plt.axes([0.1, 0.87, 0.2, 0.04]) 104 | offset = 60 - 2*len(des_pair_1['value']) 105 | text1 = '' 106 | for i in range(0, offset): 107 | text1 += ' ' 108 | text_box = TextBox(axbox, '', initial=text1 + des_pair_1['value'], color='orange', label_pad=0.005) 109 | text_box.disconnect_events() 110 | if des_pair_2 is not None: 111 | offset = 60 - 2*len(des_pair_2['value']) 112 | text2 = '' 113 | for i in range(0, offset): 114 | text2 += ' ' 115 | plt.text(des_pair_2['text_x'], des_pair_2['text_y'], des_pair_2['text'], transform=ax.transAxes) 116 | # plt.text(0.33, 1.27, des_pair_2['text'], transform=ax.transAxes) 117 | axbox = plt.axes([0.3, 0.87, 0.2, 0.04]) 118 | text_box = TextBox(axbox, '', initial=text2 + des_pair_2['value'], color='blue') 119 | text_box.disconnect_events() 120 | if des_pair_3 is not None: 121 | offset = 60 - 2*len(des_pair_3['value']) 122 | text3 = '' 123 | for i in range(0, offset): 124 | text3 += ' ' 125 | # plt.text(0.58, 1.27, des_pair_3['text'], transform=ax.transAxes) 126 | plt.text(des_pair_3['text_x'], des_pair_3['text_y'], des_pair_3['text'], transform=ax.transAxes) 127 | axbox = plt.axes([0.5, 0.87, 0.2, 0.04]) 128 | text_box = TextBox(axbox, '', initial=text3 + des_pair_3['value'], color='green') 129 | text_box.disconnect_events() 130 | if des_pair_4 is not None: 131 | offset = 60 - 2*len(des_pair_4['value']) 132 | text4 = '' 133 | for i in range(0, offset): 134 | text4 += ' ' 135 | # plt.text(0.79, 1.27, des_pair_4['text'], transform=ax.transAxes) 136 | plt.text(des_pair_4['text_x'], des_pair_4['text_y'], des_pair_4['text'], transform=ax.transAxes) 137 | axbox = plt.axes([0.7, 0.87, 0.2, 0.04]) 138 | text_box = TextBox(axbox, '', initial=text4 + des_pair_4['value'], color='red') 139 | text_box.disconnect_events() 140 | 141 | plt.savefig(filename) 142 | 143 | 144 | def extract_mapping_data(config, data): 145 | mappings = [config.cdr_data_layer, config.cdr_cell_tower] 146 | # Extract arguments 147 | for i in range(0, len(mappings)): 148 | arguments_map = [] 149 | arguments_prep = [] 150 | arguments_raw = [] 151 | arguments_con = [] 152 | for argument in mappings[i]: 153 | if str.upper(argument['name']) in mandatory_columns[i]: 154 | arguments_prep.append(argument['name'] + ' ' + argument['data_type']) 155 | arguments_con.append(argument['name']) 156 | if str.lower(argument['name']) == 'uid' and i == 0: 157 | arguments_con.append(argument['input_name']) 158 | arguments_prep.append(argument['input_name'] + ' ' + argument['data_type']) 159 | if argument['output_no'] != -1: 160 | if argument['input_no'] != -1: 161 | arguments_raw.append(argument['input_name'] + ' ' + argument['data_type']) 162 | if 'custom' in argument and argument['custom'] != '': 163 | if str.lower(argument['name']) == 'call_time' and config.input_file_time_format != "": 164 | arguments_map.append("from_unixtime(unix_timestamp({custom} " 165 | ",'{time_format}'), 'yyyy-MM-dd hh:mm:ss') as call_time" 166 | .format(custom=argument['custom'], 167 | time_format=config.input_file_time_format)) 168 | else: 169 | arguments_map.append(argument['custom'] + ' as ' + argument['name']) 170 | if str.lower(argument['name']) == 'uid' and i == 0: 171 | arguments_map.append(argument['input_name'] + ' as ' + argument['input_name']) 172 | else: 173 | if str.lower(argument['name']) == 'call_time' and config.input_file_time_format != "": 174 | 
arguments_map.append("from_unixtime(unix_timestamp({custom} " 175 | ",'{time_format}'), 'yyyy-MM-dd hh:mm:ss') as call_time" 176 | .format(custom=argument['input_name'], 177 | time_format=config.input_file_time_format)) 178 | print(arguments_map) 179 | else: 180 | arguments_map.append(argument['input_name'] + ' as ' + argument['name']) 181 | if str.lower(argument['name']) == 'uid' and i == 0: 182 | arguments_map.append(argument['input_name'] + ' as ' + argument['input_name']) 183 | else: 184 | # input -1 output 1 for custom 185 | if 'custom' in argument and argument['custom'] != '': 186 | arguments_map.append(argument['custom'] + ' as ' + argument['name']) 187 | # else = cdr without custom or cell tower, this case insert -1 if it is a mandatory column 188 | elif str.upper(argument['name']) in mandatory_columns[i]: 189 | arguments_map.append('-1' + ' as ' + argument['name']) 190 | print('Output ' + argument['name'] + ' ignored ') 191 | 192 | elif argument['input_no'] != -1: 193 | arguments_raw.append(argument['input_name'] + ' ' + argument['data_type']) 194 | if str.upper(argument['name']) in mandatory_columns[i]: 195 | arguments_map.append('-1' + ' as ' + argument['name']) 196 | print('Output ' + argument['name'] + ' ignored ') 197 | elif str.upper(argument['name']) in mandatory_columns[i]: 198 | # input -1 output -1 insert -1 if it is a mandatory column 199 | arguments_map.append('-1' + ' as ' + argument['name']) 200 | print('Output ' + argument['name'] + ' ignored ') 201 | 202 | if i == 0: 203 | data.arg_cdr_map, data.arg_cdr_raw, data.arg_cdr_prep, data.arg_cdr_con = \ 204 | arguments_map, arguments_raw, arguments_prep, arguments_con 205 | else: 206 | data.arg_cell_map, data.arg_cell_raw, data.arg_cell_create = \ 207 | arguments_map, arguments_raw, arguments_prep 208 | print(data.arg_cell_map, data.arg_cell_create) 209 | 210 | 211 | if __name__ == '__main__': 212 | make_graph([1, 2, 3, 4], 'x', [1, 2, 3, 4], 'y', 'TEST', 'test') 213 | -------------------------------------------------------------------------------- /Common/hive_connection.py: -------------------------------------------------------------------------------- 1 | from impala.dbapi import connect 2 | 3 | 4 | class Singleton(type): 5 | _instances = {} 6 | 7 | def __call__(cls, *args, **kwargs): 8 | if cls not in cls._instances: 9 | cls._instances[cls] = super(Singleton, cls).__call__(*args, **kwargs) 10 | return cls._instances[cls] 11 | 12 | 13 | class HiveConnection(metaclass=Singleton): 14 | def __init__(self, host='', port='', user=''): 15 | self.conn = connect(host, port, user=user, auth_mechanism='PLAIN') 16 | self.cursor = self.conn.cursor() 17 | self.cursor.set_arraysize(1) 18 | -------------------------------------------------------------------------------- /Common/hive_create_tables.py: -------------------------------------------------------------------------------- 1 | from Common.hive_connection import HiveConnection 2 | from Common.helper import json_file_to_object, get_admin_units_from_mapping, format_two_point_time, sql_to_string 3 | import os 4 | import time 5 | 6 | 7 | class HiveTableCreator: 8 | def __init__(self, config, data=''): 9 | self.config = config 10 | self.data = data 11 | self.hc = HiveConnection() 12 | 13 | def initialize(self, init_cmd_file): 14 | print('########## Initilizing Hive ##########') 15 | timer = time.time() 16 | output_report_location = self.config.output_report_location 17 | output_graph_location = self.config.output_graph_location 18 | cursor = self.hc.cursor 19 | for command in 
json_file_to_object(init_cmd_file)['hive_commands']: 20 | if command.startswith('use'): 21 | command = command.format(db_name=self.config.db_name) 22 | elif '{poi_location}' in command: 23 | command = command.format(poi_location=self.config.interpolation_poi_file_location) 24 | elif '{osm_location}' in command: 25 | command = command.format(osm_location=self.config.interpolation_osm_file_location) 26 | elif '{voronoi_location}' in command: 27 | command = command.format(voronoi_location=self.config.interpolation_voronoi_file_location) 28 | cursor.execute(command) 29 | if not os.path.exists(output_report_location): 30 | os.makedirs(output_report_location) 31 | if not os.path.exists(output_graph_location): 32 | os.makedirs(output_graph_location) 33 | print('########## Done. Time elapsed: {} seconds ##########'.format(format_two_point_time(timer, time.time()))) 34 | 35 | def create_tables(self): 36 | print('########## Creating Tables ##########') 37 | timer = time.time() 38 | self.import_cell_tower_data_raw() 39 | self.preprocess_cell_tower_data() 40 | admins = get_admin_units_from_mapping(self.config.cdr_cell_tower) 41 | for admin in admins: 42 | self.cell_tower_data_admin(admin) 43 | self.import_raw() 44 | self.preprocess_data() 45 | self.consolidate_table() 46 | print('########## Done create all tables. Time elapsed: {} seconds ##########'.format( 47 | format_two_point_time(timer, time.time()))) 48 | 49 | def import_cell_tower_data_raw(self): 50 | provider_prefix = self.config.provider_prefix 51 | arg_cell_raw = self.data.arg_cell_raw 52 | input_cell_tower_delimiter = self.config.input_cell_tower_delimiter 53 | input_cell_tower_have_header = self.config.input_cell_tower_have_header 54 | input_cell_tower_files = self.config.input_cell_tower_files 55 | hadoop_data_path = self.config.hadoop_data_path 56 | cursor = self.hc.cursor 57 | print('########## IMPORT RAW MAPPING TABLE ##########') 58 | print('Checking and dropping raw mapping table if existing.') 59 | timer = time.time() 60 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_cell_tower_data_raw' 61 | .format(provider_prefix=provider_prefix)) 62 | print('Checked and dropped raw mapping table if existing. Elapsed time: {} seconds'.format(format_two_point_time(timer, time.time()))) 63 | timer = time.time() 64 | 65 | print('Creating raw mapping table') 66 | raw_query = sql_to_string('cdr_and_mapping/create_raw_mapping.sql') 67 | query = raw_query.format(provider_prefix=provider_prefix, 68 | arg_raw=', '.join(arg_cell_raw), 69 | field_delimiter=input_cell_tower_delimiter, 70 | have_header=input_cell_tower_have_header) 71 | cursor.execute(query) 72 | print('Created raw mapping table. 
Elapsed time: {} seconds'.format(format_two_point_time(timer, time.time()))) 73 | timer = time.time() 74 | 75 | if len(input_cell_tower_files) < 1: 76 | print('Please check the input_cell_tower_files field in config.json and make sure the file is valid.') 77 | return 78 | elif len(input_cell_tower_files) == 1: 79 | cursor.execute( 80 | "load data local inpath '{hadoop_data_path}{hadoop_data_file}' " 81 | .format(hadoop_data_path=hadoop_data_path, hadoop_data_file=input_cell_tower_files[0]) + 82 | "overwrite into table {provider_prefix}_cell_tower_data_raw".format( 83 | provider_prefix=provider_prefix) 84 | ) 85 | else: 86 | cursor.execute( 87 | "load data local inpath '{hadoop_data_path}{hadoop_data_file}' " 88 | .format(hadoop_data_path=hadoop_data_path, hadoop_data_file=input_cell_tower_files[0]) + 89 | "overwrite into table {provider_prefix}_cell_tower_data_raw".format( 90 | provider_prefix=provider_prefix) 91 | ) 92 | for i in range(1, len(input_cell_tower_files)): 93 | cursor.execute( 94 | "load data local inpath '{hadoop_data_path}{hadoop_data_file}' " 95 | .format(hadoop_data_path=hadoop_data_path, hadoop_data_file=input_cell_tower_files[i]) + 96 | "into table {provider_prefix}_cell_tower_data_raw".format(provider_prefix=provider_prefix) 97 | ) 98 | print('Imported to raw mapping table. Elapsed time: {} seconds'.format(format_two_point_time(timer, time.time()))) 99 | print('########## FINISHED IMPORTING TO RAW MAPPING TABLE ##########') 100 | 101 | def import_raw(self): 102 | provider_prefix = self.config.provider_prefix 103 | hadoop_data_path = self.config.hadoop_data_path 104 | input_cell_tower_have_header = self.config.input_cell_tower_have_header 105 | arg_cdr_raw = self.data.arg_cdr_raw 106 | input_files = self.config.input_files 107 | input_delimiter = self.config.input_delimiter 108 | cursor = self.hc.cursor 109 | print('########## IMPORT RAW TABLE ##########') 110 | print('Checking and dropping raw table if existing.') 111 | timer = time.time() 112 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_raw' 113 | .format(provider_prefix=provider_prefix)) 114 | print('Checked and dropped raw table if existing. Elapsed time: {} seconds'.format(format_two_point_time(timer, time.time()))) 115 | timer = time.time() 116 | 117 | print('Creating raw table') 118 | raw_sql = sql_to_string('cdr_and_mapping/create_raw_cdr.sql') 119 | query = raw_sql.format(cell_tower_header=input_cell_tower_have_header, 120 | provider_prefix=provider_prefix, 121 | arg_raw=', '.join(arg_cdr_raw), 122 | field_delimiter=input_delimiter) 123 | cursor.execute(query) 124 | print('Created raw table. Elapsed time: {} seconds'.format(format_two_point_time(timer, time.time()))) 125 | timer = time.time() 126 | print('Importing to raw table') 127 | if len(input_files) < 1: 128 | print('Please check the input_files field in config.json and make sure the file is valid.') 
129 | return 130 | elif len(input_files) == 1: 131 | cursor.execute( 132 | "load data local inpath '{hadoop_data_path}{hadoop_data_file}' " 133 | .format(hadoop_data_path=hadoop_data_path, hadoop_data_file=input_files[0]) + 134 | "overwrite into table {provider_prefix}_raw".format(provider_prefix=provider_prefix) 135 | ) 136 | else: 137 | cursor.execute( 138 | "load data local inpath '{hadoop_data_path}{hadoop_data_file}' " 139 | .format(hadoop_data_path=hadoop_data_path, hadoop_data_file=input_files[0]) + 140 | "overwrite into table {provider_prefix}_raw".format(provider_prefix=provider_prefix) 141 | ) 142 | for i in range(1, len(input_files)): 143 | cursor.execute( 144 | "load data local inpath '{hadoop_data_path}{hadoop_data_file}' " 145 | .format(hadoop_data_path=hadoop_data_path, hadoop_data_file=input_files[i]) + 146 | "into table {provider_prefix}_raw".format(provider_prefix=provider_prefix) 147 | ) 148 | print('Imported to raw table. Elapsed time: {} seconds'.format(format_two_point_time(timer, time.time()))) 149 | print('########## IMPORT RAW TABLE COMPLETED ##########') 150 | 151 | def cell_tower_data_admin(self, admin): 152 | provider_prefix = self.config.provider_prefix 153 | check_invalid_lat_lng = self.config.check_invalid_lat_lng 154 | cursor = self.hc.cursor 155 | 156 | print('########## CREATE MAPPING ADMIN TABLE ##########') 157 | if check_invalid_lat_lng: 158 | check_lat_lng = 'and (latitude != 0 or longitude != 0) and latitude is not NULL and longitude is not NULL' 159 | else: 160 | check_lat_lng = '' 161 | print('Checking and dropping mapping {admin} table if existing.'.format(admin=admin)) 162 | timer = time.time() 163 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_cell_tower_data_{admin}'.format( 164 | provider_prefix=provider_prefix, admin=admin)) 165 | print('Check and drop mapping {admin} table if existing. Elapsed time: {time} seconds' 166 | .format(admin=admin, time=format_two_point_time(timer, time.time()))) 167 | timer = time.time() 168 | 169 | print('Creating mapping {admin} table'.format(admin=admin)) 170 | raw_sql = sql_to_string('cdr_and_mapping/create_mapping_admin.sql') 171 | query = raw_sql.format(provider_prefix=provider_prefix, admin=admin) 172 | cursor.execute(query) 173 | print('Created mapping {admin} table. Elapsed time: {time} seconds' 174 | .format(admin=admin, time=format_two_point_time(timer, time.time()))) 175 | timer = time.time() 176 | 177 | print('Inserting into mapping {} table'.format(admin)) 178 | raw_sql = sql_to_string('cdr_and_mapping/insert_mapping_admin.sql') 179 | query = raw_sql.format(provider_prefix=provider_prefix, admin=admin, check_lat_lng=check_lat_lng) 180 | cursor.execute(query) 181 | print('Inserted into mapping {admin} table. 
Elapsed time: {time} seconds' 182 | .format(admin=admin, time=format_two_point_time(timer, time.time()))) 183 | print('########## FINISHED CREATING MAPPING ADMIN TABLE ##########') 184 | 185 | def preprocess_cell_tower_data(self): 186 | provider_prefix = self.config.provider_prefix 187 | check_duplicate = self.config.check_duplicate 188 | arg_cell_create = self.data.arg_cell_create 189 | arg_cell_map = self.data.arg_cell_map 190 | cursor = self.hc.cursor 191 | print('########## CREATE PREPROCESS MAPPING TABLE ##########') 192 | if check_duplicate: 193 | distinct = 'distinct' 194 | else: 195 | distinct = '' 196 | print('Checking and dropping preprocess mapping table if existing.') 197 | timer = time.time() 198 | 199 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_cell_tower_data_preprocess'.format( 200 | provider_prefix=provider_prefix)) 201 | print('Checked and dropped preprocess mapping table if existing. Elapsed time: {} seconds'.format( 202 | format_two_point_time(timer, time.time()))) 203 | timer = time.time() 204 | 205 | print('Creating preprocess mapping table') 206 | raw_sql = sql_to_string('cdr_and_mapping/create_preprocess_mapping.sql') 207 | query = raw_sql.format(provider_prefix=provider_prefix, 208 | arg_create=', '.join(arg_cell_create)) 209 | cursor.execute(query) 210 | print('Created mapping preprocess table. Elapsed time: {} seconds' 211 | .format(format_two_point_time(timer, time.time()))) 212 | timer = time.time() 213 | # need username to get privilege 214 | 215 | print('Inserting into preprocess mapping table') 216 | raw_sql = sql_to_string('cdr_and_mapping/insert_preprocess_mapping.sql') 217 | query = raw_sql.format(provider_prefix=provider_prefix, distinct=distinct, arg=', '.join(arg_cell_map)) 218 | cursor.execute(query) 219 | print('Inserted into preprocess mapping table. Elapsed time: {} seconds' 220 | .format(format_two_point_time(timer, time.time()))) 221 | print('########## FINISHED CREATING PREPROCESS MAPPING TABLE ##########') 222 | 223 | def preprocess_data(self): 224 | provider_prefix = self.config.provider_prefix 225 | check_duplicate = self.config.check_duplicate 226 | arg_cdr_prep = self.data.arg_cdr_prep 227 | arg_cdr_map = self.data.arg_cdr_map 228 | cursor = self.hc.cursor 229 | 230 | print('########## CREATE PREPROCESS CDR TABLE ##########') 231 | if check_duplicate: 232 | distinct = 'distinct' 233 | else: 234 | distinct = '' 235 | 236 | print('Checking and dropping preprocess cdr table if existing.') 237 | timer = time.time() 238 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_preprocess'.format(provider_prefix=provider_prefix)) 239 | print('Checked and dropped preprocess cdr table if existing. Elapsed time: {} seconds'.format( 240 | format_two_point_time(timer, time.time()))) 241 | timer = time.time() 242 | 243 | print('Creating preprocess cdr table.') 244 | raw_sql = sql_to_string('cdr_and_mapping/create_preprocess_cdr.sql') 245 | query = raw_sql.format(args=', '.join(arg_cdr_prep), provider_prefix=provider_prefix) 246 | cursor.execute(query) 247 | 248 | print('Created preprocess cdr table. 
Elapsed time: {} seconds' 249 | .format(format_two_point_time(timer, time.time()))) 250 | timer = time.time() 251 | 252 | print('Inserting into preprocess table') 253 | print('Columns in preprocess table mapped: ' + ', '.join(arg_cdr_map)) 254 | raw_sql = sql_to_string('cdr_and_mapping/insert_preprocess_cdr.sql') 255 | query = raw_sql.format(distinct=distinct, arg=', '.join(arg_cdr_map), provider_prefix=provider_prefix) 256 | cursor.execute(query) 257 | print('Inserted into preprocess cdr table. Elapsed time: {} seconds' 258 | .format(format_two_point_time(timer, time.time()))) 259 | print('########## FINISHED CREATING PREPROCESS CDR TABLE ##########') 260 | 261 | def consolidate_table(self): 262 | # TODO join here 263 | provider_prefix = self.config.provider_prefix 264 | arg_cdr_prep = self.data.arg_cdr_prep 265 | arg_cdr_con = self.data.arg_cdr_con 266 | cursor = self.hc.cursor 267 | print('########## CREATE CONSOLIDATE CDR TABLE ##########') 268 | print('Checking and dropping consolidate cdr table if existing.') 269 | 270 | print('Checking latitude and lontitude in the preprocess table') 271 | cursor.execute('select max(latitude), max(longitude) from {provider_prefix}_preprocess' 272 | .format(provider_prefix=provider_prefix)) 273 | res = cursor.fetchall() 274 | 275 | latitude = res[0][0] 276 | longitude = res[0][1] 277 | arg_cdr_con_with_join_cond =[] 278 | if (latitude == -1 and longitude == -1): 279 | print('Join to make consolidate') 280 | for arg in arg_cdr_con: 281 | if str.lower(arg) in ['longitude', 'latitude']: 282 | arg_cdr_con_with_join_cond.append('a2.' + arg + ' as ' + arg) 283 | else: 284 | arg_cdr_con_with_join_cond.append('a1.' + arg + ' as ' + arg) 285 | insert_script_loc = 'cdr_and_mapping/insert_consolidate_cdr_join.sql' 286 | else: 287 | arg_cdr_con_with_join_cond = arg_cdr_con 288 | print('No join') 289 | insert_script_loc = 'cdr_and_mapping/insert_consolidate_cdr.sql' 290 | 291 | timer = time.time() 292 | cursor.execute('DROP TABLE IF EXISTS {provider_prefix}_consolidate_data_all' 293 | .format(provider_prefix=provider_prefix)) 294 | print('Checked and dropped preprocess cdr table if existing. Elapsed time: {} seconds' 295 | .format(format_two_point_time(timer, time.time()))) 296 | timer = time.time() 297 | 298 | print('Creating consolidate table') 299 | raw_sql = sql_to_string('cdr_and_mapping/create_consolidate_cdr.sql') 300 | query = raw_sql.format(provider_prefix=provider_prefix, arg_prep=' ,'.join(arg_cdr_prep)) 301 | cursor.execute(query) 302 | print('Created consolidate cdr table. Elapsed time: {} seconds' 303 | .format(format_two_point_time(timer, time.time()))) 304 | timer = time.time() 305 | 306 | print('Columns in consolidate table: ' + ', '.join(arg_cdr_con_with_join_cond)) 307 | print('Inserting into the consolidate table') 308 | raw_sql = sql_to_string(insert_script_loc) 309 | query = raw_sql.format(provider_prefix=provider_prefix, arg_con=', '.join(arg_cdr_con_with_join_cond)) 310 | cursor.execute(query) 311 | print('Inserted into consolidate cdr table. 
Elapsed time: {} seconds' 312 | .format(format_two_point_time(timer, time.time()))) 313 | print('########## FINISHED CREATING CONSOLIDATE CDR TABLE ##########') 314 | 315 | -------------------------------------------------------------------------------- /Interpolation/README.md: -------------------------------------------------------------------------------- 1 | # Interpolation 2 | The tool will perform the following operations: 3 | * Trip segmentation 4 | * Stay Point Reallocation 5 | * Route Interpolation 6 | 7 | The results are the route interpolation of the CDR data. 8 | ## Prerequisites 9 | * CDR Consolidate Data Table ({provider_prefix}_consolidate_data_all) 10 | * obtained from the script [run_prepare_cdr_and_mapping.py](../run_prepare_cdr_and_mapping.py). 11 | See the [first page](../README.md) for how to prepare a CDR file and a cell tower mapping file. The following are the columns 12 | of the CDR consolidate data table. * means the column is required to have some valid values. 13 | ``` 14 | *UID : Unique Identifier of each user 15 | IMEI : International Mobile Equipment Identity (IMEI) of Caller 16 | IMSI : International Mobile Subscriber Identity (IMSI) of Caller 17 | *CALL_TIME : Activity Time (Start Time) in “YYYY-MM-DD HH:mm:ss” format 18 | *DURATION : Call Duration in seconds 19 | *CELL_ID : Unique Cell Tower ID (LAC+CellID) 20 | CALL_TYPE : Type of the call (Data, Voice or SMS) 21 | NETWORK_TYPE : Type of the network (2G, 3G, 4G, 5G) 22 | *Longitude : Real Number (decimal degree) in WGS84 23 | *Latitude : Real Number (decimal degree) in WGS84 24 | ``` 25 | * Building/POIs for Reallocation 26 | * OSM road network data 27 | * Voronoi data of Cell tower/Base Station location 28 | 29 | # Configuration 30 | The configuration of the Hadoop server connection and data selection needs to be set prior to this. 31 | See the [first page](../README.md) in the configuration section. 32 | Then, in the config file, set the following five fields: 33 | * interpolation_poi_file_location: set to be the local path of the poi file 34 | * interpolation_osm_file_location: set to be the local path of the osm file 35 | * interpolation_voronoi_file_location: set to be the local path of the voronoi file 36 | * max_size_cdr_by_uid: set to be the maximum array size of CDRs for each particular user 37 | * max_size_interpolation: set to be the max size of interpolation 38 | according to what you have in the cell tower mapping raw data 39 | 40 | For example, see [config_big.json](../sample_configs/config_big.json) in lines 34 to 38 41 | 42 | # Route Interpolation 43 | Run the following command: 44 | * python3 [run_interpolation.py](../run_interpolation.py) -c {config_file} 45 | 46 | Example 47 | 48 | * run python3 run_interpolation.py -c sample_configs/config_big.json 49 | 50 | To edit further, the user can go to [cdr_interpolation.py](../Common/cdr_interpolation.py) in 51 | calculate_interpolation() 52 | 53 | If only some of the operations are needed, you can comment out the unneeded ones here (ex. steps that are already finished) 54 | 55 | ``` 56 | self.convert_cdr_to_array_format() 57 | self.create_trip_format() 58 | self.create_trip_24hr_padding() 59 | self.create_poi_relocation() 60 | self.create_route_interpolation() 61 | self.export_to_csv() 62 | ``` 63 | 64 | 65 | # Route Interpolation Output 66 | The output will be generated inside the hadoop server in /tmp/hive/csv_interpolation. 67 | The file will have no extension, but it can be renamed with a .csv extension and used without any problem. 
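For instance, once the exported file has been copied to the local machine and renamed, it can be loaded with pandas (already one of the tool's dependencies). The snippet below is only a minimal, hypothetical sketch: it assumes the export is comma-separated with no header row, and the column names are simply taken from the column list that follows.
```
import pandas as pd

# Column layout as documented in the list below; the exported file has no header row.
columns = [
    "user_id", "trip_sequence", "mobility_type", "transportation_mode",
    "total_distance", "total_time", "start_time", "end_time",
    "total_points", "subtrip_sequence", "subtrip_point_start_time",
    "subtrip_point_latitude", "subtrip_point_longitude",
]

# "interpolation.csv" is the renamed local copy of the exported output.
df = pd.read_csv("interpolation.csv", names=columns)
print(df.head())
```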
68 | The following file is the output sample for the [interpolation](output_sample/interpolation.csv) 69 | In the file, each field is separated by a comma, and the columns are as follows 70 | ``` 71 | 1. User Id 72 | • Unique for each device 73 | 2. Trip Sequence 74 | • Order of sub trip in a day, start from 1 75 | 3. Mobility Type 76 | • Value: STAY or MOVE 77 | 4. Transportation Mode 78 | • Indicate mode of transportation of corresponding sub trip 79 | • Value: STAY, WALK, VEHICLE 80 | 5. Total Distance 81 | • Total travel distance of sub trip in meters 82 | 6. Total Time 83 | • Total travel time of sub trip in seconds 84 | 7. Start Time 85 | • Indicate start time of sub trip 86 | • Format: hh:MM:ss 87 | • Example: 8:38:08 88 | 8. End Time 89 | • Indicate end time of sub trip 90 | • Format: hh:MM:ss 91 | • Example: 9:30:20 92 | 9. Total Points 93 | • Indicate total number of point data in sub trip 94 | 10. Subtrip Sequence 95 | • The point number of each sub trip in a day, start from 1 96 | 11. Subtrip Point Start Time 97 | • Indicate start time of the sub trip point 98 | • Format: MM/DD/YYYY hh:MM 99 | • Example: 1/2/2019 8:38 100 | 12. Subtrip Point Latitude 101 | • Indicate the latitude of a particular point in a subtrip 102 | • Format: Real Number (decimal degree) in WGS84 103 | • Example: 23.614079 104 | 13. Subtrip Point Longitude 105 | • Indicate the longitude of a particular point in a subtrip 106 | • Format: Real Number (decimal degree) in WGS84 107 | • Example: 89.361402 108 | ``` 109 | Example (commas replaced by tabs for better illustration) 110 | 111 | ``` 112 | 1031073514 1 STAY STAY 31088 0 0:00:00 8:38:08 1 1 1/2/2019 0:00 23.614079 89.361402 113 | 1031073514 2 MOVE WALK 3132 4330.51 8:38:08 9:30:20 53 1 1/2/2019 8:38 23.619423 89.367677 114 | 1031073514 2 MOVE WALK 3132 4330.51 8:38:08 9:30:20 53 2 1/2/2019 8:39 23.618943 89.368309 115 | 1031073514 2 MOVE WALK 3132 4330.51 8:38:08 9:30:20 53 3 1/2/2019 8:40 23.618462 89.368942 116 | 1031073514 2 MOVE WALK 3132 4330.51 8:38:08 9:30:20 53 4 1/2/2019 8:41 23.617982 89.369574 117 | 118 | ``` -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Spatial Data Commons 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Origin_Destination/README.md: -------------------------------------------------------------------------------- 1 | # Origin Destination 2 | The tool calculates origin-destination movements based on the data provided in the form of 3 | the consolidate data table. 4 | The results are the origin-destination data of a selected date, stored in a tsv file. 5 | 6 | ## Prerequisites 7 | Tables obtained from the script [run_prepare_cdr_and_mapping.py](../run_prepare_cdr_and_mapping.py). 8 | See the [first page](../README.md) for how to prepare a CDR file and a cell tower mapping file. The following are the columns 9 | of the CDR consolidate data table. 10 | * CDR Consolidate Data Table ({provider_prefix}_consolidate_data_all) 11 | ``` 12 | UID : Unique Identifier of each user 13 | IMEI : International Mobile Equipment Identity (IMEI) of Caller 14 | IMSI : International Mobile Subscriber Identity (IMSI) of Caller 15 | CALL_TIME : Activity Time (Start Time) in “YYYY-MM-DD HH:mm:ss” format 16 | DURATION : Call Duration in seconds 17 | CELL_ID : Unique Cell Tower ID (LAC+CellID) 18 | CALL_TYPE : Type of the call (Data, Voice or SMS) 19 | NETWORK_TYPE : Type of the network (2G, 3G, 4G, 5G) 20 | Longitude : Real Number (decimal degree) in WGS84 21 | Latitude : Real Number (decimal degree) in WGS84 22 | ``` 23 | * Cell Tower Mapping Preprocess Table ({provider_prefix}_cell_tower_data_preprocess) 24 | ``` 25 | CELL_ID : Unique Cell Tower ID (LAC+CellID) 26 | Longitude : Real Number (decimal degree) in WGS84 27 | Latitude : Real Number (decimal degree) in WGS84 28 | Admin1 : Administration Unit 1 name (if any) 29 | Admin2 : Administration Unit 2 name (if any) 30 | . 31 | . 32 | . 33 | AdminN : Administration Unit N name (if any) 34 | ``` 35 | * Cell Tower Data Admin X Table ({provider_prefix}_cell_tower_data_adminX) 36 | * for generating sequence numbers of an administration unit in case of duplication 37 | ``` 38 | AdminX_ID : Administration Unit X name 39 | AdminX_Name : Name of the Administration Unit X 40 | CELL_ID : Unique Cell Tower ID 41 | Longitude : Real Number (decimal degree) in WGS84 42 | Latitude : Real Number (decimal degree) in WGS84 43 | ``` 44 | 45 | * CDR Home Location Table ({provider_prefix}_la_cdr_uid_home) 46 | * Obtained from finding rank1_frequent_location when calculating data statistics ([run_statistics.py](../run_statistics.py)) 47 | 48 | ``` 49 | UID : Unique Identifier of each user 50 | SITE_ID : Unique Concatenation of Latitude and Longitude 51 | TCOUNT : Number of records found in a particular SITE_ID 52 | TRANK : The rank of the SITE_ID by how frequently the user appears in the SITE_ID area 53 | PPERCENT : The percentage of the user's CDR records at this SITE_ID among all places at which the user used a mobile device 54 | LONGITUDE : Real Number (decimal degree) in WGS84 55 | LATITUDE : Real Number (decimal degree) in WGS84 56 | ADMINX_ID : Administration Unit X name 57 | ``` 58 | 59 | # Configuration 60 | The configuration of the Hadoop server connection and data selection needs to be set prior to this. 61 | See the [first page](../README.md) in the configuration section. 
62 | Then, in the config file, set the following two fields: 63 | * od_admin_unit: set to be a value in ("admin0", "admin1", "admin2", "admin3", "admin4", "admin5") 64 | according to what you have in the cell tower mapping raw data 65 | * od_date: set to the date you want to perform origin-destination on (format "yyyy-mm-dd") 66 | 67 | For example, see [config_big.json](../sample_configs/config_big.json) in lines 31 and 32 68 | 69 | # Origin Destination 70 | Run the following command: 71 | * python3 [run_origin_destination.py](../run_origin_destination.py) -c {config_file} 72 | 73 | Example 74 | 75 | * run python3 run_origin_destination.py -c [sample_configs/config_big.json](../sample_configs/config_big.json) 76 | 77 | To edit further, the user can go to [cdr_origin_destination.py](../Common/cdr_origin_destination.py) in 78 | calculate_od() 79 | 80 | If only some of the operations are needed, you can comment out the unneeded ones here (ex. steps that are already finished) 81 | 82 | ``` 83 | self.cdr_by_uid() 84 | self.create_od() 85 | self.create_od_detail() 86 | self.create_od_sum() 87 | ``` 88 | 89 | 90 | # Origin Destination Output 91 | The output will be generated inside the hadoop server in /tmp/hive/od_result. 92 | The file will have no extension, but it can be renamed with a .tsv extension and used without any problem. 93 | The following file is the output sample for the [origin-destination](output_sample/origin_destination.tsv) 94 | In the file, each field is separated by a comma, and the columns are as follows 95 | 96 | (raw column expressions: pdt, origin, 97 | destination, cast(tcount as string), cast(tusercount as string)) 98 | 99 | ``` 100 | 1. Date 101 | • The date of the origin destination data (as indicated in the config file in the field "od_date") 102 | 2. Origin Admin X ID 103 | • The origin place in the form of Admin X ID 104 | 3. Destination Admin X ID 105 | • The destination place in the form of Admin X ID 106 | 4. Count 107 | • Number of records of the movement from the origin to the destination 108 | 5. User Count 109 | • Total users in a particular movement 110 | ``` 111 | Example 112 | 113 | ``` 114 | 2016-03-01 0 805 1.0 1.0 115 | 2016-03-01 0 937 4.0 4.0 116 | 2016-03-01 0 938 1.0 1.0 117 | 2016-03-01 0 940 4.0 4.0 118 | 2016-03-01 1001 1062 6.0 6.0 119 | 2016-03-01 1001 1064 7.0 7.0 120 | 2016-03-01 1001 1065 4.0 4.0 121 | 2016-03-01 1001 1082 1.0 1.0 122 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CDR-analysis-tools-hadoop 2 | 3 | Like the standalone version, this repository is a set of tools written in Python for analyzing Call Detail Record (CDR) data, additionally based on the Hadoop platform, which supports a large amount of data. The analysis includes Visualization (with reports and processed data compatible with other visualization platforms), Origin-Destination (OD) and Interpolation. 4 | 5 | This repository will be incrementally updated from time to time. Kindly visit the repository and have a look at this file. 6 | 7 | 8 | ## Getting Started 9 | 10 | These instructions will get a copy of the software package up and running on your local machine. 11 | It can be run on both Windows and Linux. The tool dependencies are listed in the requirements.txt file 12 | and can be installed with a single command (see the Installation section below). 
13 | 14 | Structure of this package 15 | 16 | ``` 17 | ├─Statistics Report: 18 | │ Generating csv reports and graph reports including 19 | | - summary statistics (average usage, voice calls, etc.) 20 | │ - whole data statistics (ex. total cdrs, total days and locations) 21 | | - daily and monthly statistics of users 22 | | - frequent locations 23 | | - zone based aggregation 24 | | - graphical daily data 25 | | - usage histogram 26 | | 27 | ├─Origin-Destination (OD): 28 | │ Generating Origin-Destination file indicating the movement of humans 29 | │ 30 | ├─Interpolation: 31 | | A set of software for route interpolation including 32 | | - Extracting stay points 33 | | - Extracting trip segments 34 | | - PoI relocation 35 | | - Route interpolation with the transportation network 36 | ``` 37 | After preparing the data, see: 38 | 39 | * [Statistics Report](../master/Statistics/) 40 | * [Origin Destination](../master/Origin_Destination/) 41 | * [Interpolation](../master/Interpolation) 42 | 43 | ## Data preparation 44 | The user needs 2 files for the tool: a cdr file and a location mapping file. Both of them come with different column names and formats. To process CDR data, the data needs to be in a format that is compatible with the tools. The mapping json file maps your prepared raw csv files to Hive tables ready for processing, and a mapping scheme for each file has to be done by the user. 45 | 46 | ### a CSV file for CDR records 47 | To analyze the CDR data, the user needs to provide the tools with a CDR file in csv format. The files include the de-identified CDR where any personally identifiable information (such as IMSI and IMEI) is encrypted in an irreversible manner. Hereinafter, IMSI and IMEI mean de-identified IMSI and IMEI. The file needs to contain the following data items: 48 | ``` 49 | UID : Unique Identifier of each user 50 | IMEI : International Mobile Equipment Identity (IMEI) of Caller 51 | IMSI : International Mobile Subscriber Identity (IMSI) of Caller 52 | CALL_TIME : Activity Time (Start Time) in “YYYY-MM-DD HH:mm:ss” format 53 | DURATION : Call Duration in seconds 54 | CELL_ID : Unique Cell Tower ID (LAC+CellID) 55 | CALL_TYPE : Type of the call (Data, Voice or SMS) 56 | NETWORK_TYPE : Type of the network (2G, 3G, 4G, 5G) 57 | Longitude : Real Number (decimal degree) in WGS84 58 | Latitude : Real Number (decimal degree) in WGS84 59 | ``` 60 | 61 | For items 9 and 10, if they do not exist, they will be mapped from the cell tower mapping file by cell_id 62 | 63 | #### Mapping 64 | Given a CDR file, the mapping in the key "cdr_data_layer" in the file config.json is shown below 65 | 66 | ``` 67 | "cdr_data_layer":[ 68 | {"input_no":1, "input_name":"SUBID", "data_type":"String", "output_no":1, "name":"UID", "custom": ""}, 69 | {"input_no":-1, "input_name":"IMEI", "data_type":"String", "output_no":2, "name":"IMEI", "custom": ""}, 70 | {"input_no":-1, "input_name":"IMSI", "data_type":"String", "output_no":3, "name":"IMSI", "custom": ""}, 71 | {"input_no":2, "input_name":"CDATE", "data_type":"String", "output_no":-1, "name":"CDATE", "custom": ""}, 72 | {"input_no":3, "input_name":"CTIME", "data_type":"String", "output_no":4, "name":"CALL_TIME", "custom": "CONCAT(CDATE,' ',CTIME)"}, 73 | {"input_no":4, "input_name":"DURATION", "data_type":"String", "output_no":5, "name":"DURATION", "custom": ""}, 74 | {"input_no":5, "input_name":"CELLID", "data_type":"String", "output_no":6, "name":"CELL_ID", "custom": ""}, 75 | {"input_no":6, "input_name":"LATITUDE", "data_type":"String", "output_no":7, "name":"LATITUDE", "custom": ""},
76 | {"input_no":7, "input_name":"LONGITUDE", "data_type":"String", "output_no":8, "name":"LONGITUDE", "custom": ""}, 77 | {"input_no":9, "input_name":"NETWORK_TYPE", "data_type":"String", "output_no":9, "name":"NETWORK_TYPE", "custom": ""}, 78 | {"input_no":10, "input_name":"CALL_TYPE", "data_type":"String", "output_no":10, "name":"CALL_TYPE", "custom": ""} 79 | ], 80 | ``` 81 | * To map, if a column in the raw file reflects one of the required columns for CDR (ex. UID), then put it in the configuration item 82 | whose "name" field contains "UID" (ex. "SUBID"), and both "input_no" and "output_no" will not be "-1" 83 | * Do not remove any configuration item. If there is no raw column corresponding to the required column, set "input_no" to -1, 84 | meaning that the column does not exist in the raw file 85 | * For example, if you don't have a call_type column in your raw cdr file, the configuration will be 86 | * {"input_no":-1, "input_name":"CALL_TYPE", "data_type":"String", "output_no":-1, "name":"CALL_TYPE", "custom": ""} 87 | * All the columns in the raw file need to be indicated even if they are not mapped to a required column; simply add more configuration items. If a column is not mapped to any required column, 88 | set "output_no" to -1 (ex. "CDATE"), meaning that the column "CDATE" in the raw file does not reflect any required column 89 | * Some columns may need a function to convert them into a desirable format; you can indicate it in the "custom" field 90 | * For example, in the fifth item, "custom" contains CONCAT(CDATE, ' ', CTIME). The column CTIME is in the raw file, so its "input_no" is not -1, 91 | and the operands must come from items whose "input_no" is not -1 (i.e. they exist in the raw file) 92 | 93 | ### a CSV location mapping file for administration units 94 | The previous csv file will be joined with this cell id file to calculate zone-based statistics. It should supply 95 | 1. Cell ID (will be joined with the Cell ID in the CDR record file) 96 | 2. At least one Administration Unit (ex. province or district) name 97 | 3. Latitude 98 | 4. Longitude 99 | 100 | ``` 101 | CELL_ID : Unique Cell Tower ID (LAC+CellID) 102 | Longitude : Real Number (decimal degree) in WGS84 103 | Latitude : Real Number (decimal degree) in WGS84 104 | At least one Administration Unit (ex. province or district) Name 105 | ``` 106 | 107 | #### Mapping 108 | The mapping in this file needs to be done in the same way as previously mentioned in the CDR raw file. 
109 | 
110 | ```
111 | "cdr_cell_tower":[
112 | {"input_no":1, "input_name":"bs_seq", "data_type":"String", "output_no":-1, "name":"BS_SEQ"},
113 | {"input_no":2, "input_name":"cell_seq", "data_type":"String", "output_no":1, "name":"CELL_ID" },
114 | {"input_no":3, "input_name":"name", "data_type":"String", "output_no":-1, "name":"NAME"},
115 | {"input_no":4, "input_name":"lac", "data_type":"String", "output_no":-1, "name":"CELLNAME" },
116 | {"input_no":5, "input_name":"cell", "data_type":"String", "output_no":-1, "name":"CI" },
117 | {"input_no":6, "input_name":"lon", "data_type":"String", "output_no":2, "name":"LATITUDE" },
118 | {"input_no":7, "input_name":"lat", "data_type":"String", "output_no":3, "name":"LONGITUDE" },
119 | {"input_no":8, "input_name":"ISO2", "data_type":"String", "output_no":-1, "name":"ISO2" },
120 | {"input_no":9, "input_name":"NAME_0_2", "data_type":"String", "output_no":-1, "name":"NAME_0_2"},
121 | {"input_no":10, "input_name":"ID_1_2", "data_type":"String", "output_no":-1, "name":"ID_1_2" },
122 | {"input_no":11, "input_name":"NAME_1_2", "data_type":"String", "output_no":4, "name":"ADMIN0", "geojson_filename": "", "geojson_col_name": "" },
123 | {"input_no":12, "input_name":"ID_2", "data_type":"String", "output_no":-1, "name":"ID2" },
124 | {"input_no":13, "input_name":"NAME_2", "data_type":"String", "output_no":5, "name":"ADMIN1", "geojson_filename": "", "geojson_col_name": "" },
125 | {"input_no":14, "input_name":"ENGTYPE_2", "data_type":"String", "output_no":-1, "name":"ENGTYPE_2" }
126 | ]
127 | ```
128 | 
129 | One difference is that you need to supply at least one administration unit (or other location of interest) to calculate the zone population. For example, the item with "input_no" 11 contains administration unit data
130 | and is mapped to "ADMIN0" (administration unit 0). The name needs to be in the format ADMIN[0-5] for the tool to work (you may, for example, have shopping complex names in "input_name" and still call the unit "ADMIN0").
131 | If you want to visualize the result, put the location of your geojson file in "geojson_filename"; the data will then be joined with the zone population data and can be visualized in [kepler.gl](https://kepler.gl)
132 | 
133 | ## Configuration
134 | In the config.json file, you need to assign the correct paths, prefix, locations and so on. Here is an example of a config.json file with explanations
135 | 
136 | 
137 | "hadoop_data_path":"/path/to/cdr/and/celltower/file",
138 | 
139 | "provider_prefix":"pref1", **any prefix you'd like to use (useful in case you want to apply this tool to different datasets)**
140 | 
141 | "db_name" : "cdrproject",
142 | 
143 | "input_delimiter":",", **raw file delimiter (ex. comma "," or tab "\t")**
144 | 
145 | "input_files" :["cdr.csv"], **raw cdr file(s)**
146 | 
147 | "input_file_time_format": "yyyyMMdd hh:mm:ss", **time format in your data (if it is ambiguous, ex.
there is no separator between month and year, you need to put the format here; otherwise leave it blank if the date is dash- or slash-separated and the time colon-separated)**
148 | 
149 | "input_file_have_header": 1, **put 1 if the file has a header row (column names), otherwise 0**
150 | 
151 | "input_cell_tower_files" : ["cdr_cell_tower.csv"], **cell tower mapping data**
152 | 
153 | "input_cell_tower_delimiter":",",
154 | 
155 | "input_cell_tower_have_header": 1,
156 | 
157 | "check_duplicate": true, **filter duplicate rows or not**
158 | 
159 | "check_invalid_lat_lng": true, **filter invalid lat and lng**
160 | 
161 | "host": "host_name", **hostname of the hadoop server**
162 | 
163 | "port": 10000, **hive2 server port**
164 | 
165 | "frequent_location_percentage": 80, **percentage threshold applied to the frequent locations of a particular uid**
166 | 
167 | "csv_location": "csv_reports", **directory of the output csv reports**
168 | 
169 | "graph_location": "graphical_reports", **directory of the graph reports**
170 | 
171 | "od_admin_unit": "admin1", **administration unit used for calculating origin-destination**
172 | 
173 | "od_date": "2016-03-01", **date selected for origin-destination**
174 | 
175 | "interpolation_poi_file_location": "/path/to/poi", **a poi file for interpolation**
176 | 
177 | "interpolation_osm_file_location": "/path/to/osm", **an osm file for interpolation**
178 | 
179 | "interpolation_voronoi_file_location": "path/to/voronoi", **a voronoi file for interpolation**
180 | 
181 | 
182 | "cdr_data_layer": [...], **the mapping scheme from the cdr raw file to the table used for processing**
183 | 
184 | "cdr_cell_tower": [...], **the mapping scheme from the cell tower mapping raw file to the table used for processing**
185 | 
186 | ## Prerequisites
187 | * Hadoop server with Hive installed
188 | * Python 3 or above
189 | * pip3 (a Python package installer)
190 | 
191 | ## Installation
192 | Clone the repository and then
193 | install all required packages from requirements.txt using the command
194 | * pip install -r requirements.txt
195 | 
196 | ## Preparing tables
197 | 
198 | Go to [config.json](sample_configs/config.json) or [config_big.json](sample_configs/config_big.json) to see the configuration files, set up the variables and
199 | start mapping your data.
200 | 
201 | Then go to [run_prepare_cdr_and_mapping.py](run_prepare_cdr_and_mapping.py) in the user section and run
202 | 
203 | * python3 run_prepare_cdr_and_mapping.py -c {config_file}
204 | 
205 | Example
206 | 
207 | * python3 run_prepare_cdr_and_mapping.py -c sample_configs/config_big.json
208 | 
209 | You may get errors due to the mapping; after fixing them, you can continue from the function where the run stopped. If you do not want the tables to be dropped and created again, go to [hive_create_tables.py](Common/hive_create_tables.py) and comment out the line calling the create_tables function in the \_\_init\_\_ function
210 | 
211 | 
212 | There are mainly 3 sections you may want to customize.
213 | 
214 | **main() in [run_prepare_cdr_and_mapping.py](run_prepare_cdr_and_mapping.py)**
215 | ```
216 | def main():
217 | # argument parser
218 | start = time.time()
219 | parser = argparse.ArgumentParser(description='Argument indicating the configuration file')
220 | 
221 | # add configuration argument
222 | parser.add_argument("-c", "--config", help="add a configuration file you would like to process the cdr data"
223 | " \n ex.
py py_hive_connect.py -c config.json", 224 | action="store") 225 | 226 | # parse config to args.config 227 | args = parser.parse_args() 228 | 229 | config = Config(args.config) 230 | HiveConnection(host=config.host, port=config.port, user=config.user) 231 | cdr_data = extract_mapping_data(config) 232 | 233 | # initialize hive and create tables 234 | table_creator = HiveTableCreator(config, cdr_data) 235 | table_creator.initialize('hive_init_commands/initial_hive_commands_stats.json') # init hive 236 | table_creator.create_tables() 237 | 238 | print('Overall time elapsed: {} seconds'.format(format_two_point_time(start, time.time()))) 239 | 240 | ``` 241 | **[hive_create_tables.py](Common/hive_create_tables.py) in \__init\__** 242 | If you don't want tables to be created again (maybe after some errors but tables created), you can comment it in **\__init\__** function 243 | 244 | ``` 245 | def __init__(self, config, data): 246 | self.__dict__ = config.__dict__ 247 | self.hive = HiveConnector(config) 248 | timer = time.time() 249 | print('########## Initilizing Hive ##########') 250 | self.hive.initialize(config) 251 | print('########## Done. Time elapsed: {} seconds ##########'.format(hp.format_two_point_time(timer, time.time()))) 252 | timer = time.time() 253 | print('########## Creating Tables ##########') 254 | #self.hive.create_tables(config, data) <<<< COMMENT HERE 255 | print('########## Done create all tables. Time elapsed: {} seconds ##########'.format(hp.format_two_point_time(timer, time.time()))) 256 | ``` 257 | ## License 258 | Free to use and distribute with acknowledgement. 259 | -------------------------------------------------------------------------------- /Sample_Inputs/cdr_sample.csv: -------------------------------------------------------------------------------- 1 | SUBID,CDATE,CTIME,DURATION,CELLID,LATITUDE,LONGITUDE,network_type,call_type 2 | 3594716203,20160501,8:43:47,96,10011,13.44845,-16.57612,2G,VOICE 3 | 3594716203,20160501,8:46:10,80,10021,13.45218,-16.57419,3G,DATA 4 | 3594716203,20160502,9:46:10,43,10021,13.45218,-16.57419,3G,DATA 5 | 3594716203,20160601,10:46:10,43,10022,13.45218,-16.57419,3G,DATA 6 | 3498343785,20160501,8:42:06,201,10011,13.44845,-16.57612,3G,DATA 7 | 3498191359,20160501,8:45:00,36,10012,13.44845,-16.57612,2G,DATA 8 | 3463089753,20160501,8:45:08,9,10012,13.44845,-16.57612,3G,VOICE 9 | 3589765737,20160501,8:45:07,10,10013,13.44845,-16.57612,2G,SMS 10 | 3587544575,20160501,8:45:10,8,10015,13.44845,-16.57612,3G,VOICE 11 | 3496971123,20160501,8:45:09,9,10016,13.44845,-16.57612,2G,DATA 12 | 3473003603,20160501,8:45:13,5,10021,13.45218,-16.57419,3G,DATA 13 | 3456069605,20160501,8:45:11,7,10021,13.45218,-16.57419,4G,DATA 14 | 3473003603,20160503,8:45:13,5,10021,13.45218,-16.57419,3G,DATA 15 | 3456069605,20160504,8:45:11,7,10021,13.45218,-16.57419,4G,DATA 16 | 3456069605,20160601,8:45:11,7,10021,13.45218,-16.57419,4G,DATA 17 | 3473003603,20160703,8:45:13,5,10021,13.45218,-16.57419,3G,DATA 18 | 3456069605,20200504,8:45:11,7,10021,13.45218,-16.57419,4G,DATA 19 | -------------------------------------------------------------------------------- /Sample_Inputs/mapping_sample.csv: -------------------------------------------------------------------------------- 1 | BTSID,SITE NAME,Longitude,Latitude,CELLID,CELLNAME,CI,Azimuth,District,Province 2 | 1001,Q001-Dobson Street Roof Top,140.8694,38.2682,10011,Q001-Dobson Street Roof Top-1,10011,30,Sendai,Miyagi 3 | 1001,Q001-Dobson Street Roof Top,140.8694,38.2682,10012,Q001-Dobson Street Roof 
Top-2,10012,160,Sendai,Miyagi
4 | 1001,Q001-Dobson Street Roof Top,140.8694,38.2682,10013,Q001-Dobson Street Roof Top-3,10013,290,Sendai,Miyagi
5 | 1001,Q001-Dobson Street Roof Top,140.8694,38.2682,10014,Q001-Dobson Street Roof Top_DCS-1,10014,30,Sendai,Miyagi
6 | 1001,Q001-Dobson Street Roof Top,140.8694,38.2682,10015,Q001-Dobson Street Roof Top_DCS-2,10015,160,Sendai,Miyagi
7 | 1001,Q001-Dobson Street Roof Top,140.8694,38.2682,10016,Q001-Dobson Street Roof Top_DCS-3,10016,290,Sendai,Miyagi
8 | 1002,Q002-ECOBANK,140.8694,38.2682,10021,Q002-ECOBANK-1,10021,20,Sendai,Miyagi
9 | 1002,Q002-ECOBANK,140.8694,38.2682,10022,Q002-ECOBANK-2,10022,160,Sendai,Miyagi
10 | -------------------------------------------------------------------------------- /Statistics/README.md: --------------------------------------------------------------------------------
1 | # Statistics
2 | This section concerns generating statistical reports and graphs from the CDR data. The output will be in the
3 | form of CSV files and graph (png) files.
4 | To illustrate, simple csv files for the cdr and the mapping are used; they are located in [Sample_Inputs](../Sample_Inputs).
5 | ## Prerequisites
6 | Tables obtained from the script [run_prepare_cdr_and_mapping.py](../run_prepare_cdr_and_mapping.py).
7 | See the [first page](../README.md) for how to prepare a CDR file and a cell tower mapping file. The following are the columns
8 | of the CDR consolidate data table.
9 | * CDR Consolidate Data Table ({provider_prefix}_consolidate_data_all)
10 | ```
11 | UID : Unique Identifier of each user
12 | IMEI : International Mobile Equipment Identity (IMEI) of Caller
13 | IMSI : International Mobile Subscriber Identity (IMSI) of Caller
14 | CALL_TIME : Activity Time (Start Time) in “YYYY-MM-DD HH:mm:ss” format
15 | DURATION : Call Duration in seconds
16 | CELL_ID : Unique Cell Tower ID (LAC+CellID)
17 | CALL_TYPE : Type of the call (Data, Voice or SMS)
18 | NETWORK_TYPE : Type of the network (2G, 3G, 4G, 5G)
19 | Longitude : Real Number (decimal degree) in WGS84
20 | Latitude : Real Number (decimal degree) in WGS84
21 | ```
22 | * Cell Tower Mapping Preprocess Table ({provider_prefix}_cell_tower_data_preprocess)
23 | ```
24 | CELL_ID : Unique Cell Tower ID (LAC+CellID)
25 | Longitude : Real Number (decimal degree) in WGS84
26 | Latitude : Real Number (decimal degree) in WGS84
27 | Admin1 : Administration Unit 1 name (if any)
28 | Admin2 : Administration Unit 2 name (if any)
29 | .
30 | .
31 | .
32 | AdminN : Administration Unit N name (if any)
33 | ```
34 | * Cell Tower Data Admin X Table ({provider_prefix}_cell_tower_data_adminX)
35 |   * for generating sequence numbers of an administration unit in case of duplication
36 | ```
37 | AdminX_ID : Generated sequence ID of the Administration Unit X
38 | AdminX_Name : Name of the Administration Unit X
39 | CELL_ID : Unique Cell Tower ID
40 | Longitude : Real Number (decimal degree) in WGS84
41 | Latitude : Real Number (decimal degree) in WGS84
42 | ```
43 | ## Usage
44 | Note: the configuration needs to be set first. See the configuration section on the [first page](../README.md).
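If you want to verify the configuration before launching a run, a minimal self-check along the following lines can help. This snippet is only a sketch (it is not part of the repository) and it assumes the keys shown in the configuration section of the first page:

```
import json

# keys that the statistics run relies on (see the configuration section of the first page)
required = ["provider_prefix", "db_name", "host", "port", "csv_location", "graph_location"]

with open("sample_configs/config_big.json") as f:
    cfg = json.load(f)

missing = [key for key in required if key not in cfg]
if missing:
    raise SystemExit("config is missing keys: {}".format(", ".join(missing)))
print("config looks complete; csv reports go to '{}', graphs to '{}'".format(
    cfg["csv_location"], cfg["graph_location"]))
```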
45 | 46 | run command 47 | * python3 [run_statistics.py](../run_statistics.py) -c {config_file} 48 | 49 | Example 50 | * python3 run_statistics.py -c sample_configs/[config_big.json](../sample_configs/config_big.json) 51 | 52 | If you wish to execute some of the features, you can comment some lines in the file in main() of [run_statistics.py](../run_statistics.py) 53 | in the user section 54 | 55 | ``` 56 | def main(): 57 | # argument parser 58 | start = time.time() 59 | parser = argparse.ArgumentParser(description='Argument indicating the configuration file') 60 | 61 | # add configuration argument 62 | parser.add_argument("-c", "--config", help="add a configuration file you would like to process the cdr data" 63 | " \n ex. py py_hive_connect.py -c config.json", 64 | action="store") 65 | 66 | # parse config to args.config 67 | args = parser.parse_args() 68 | 69 | config = Config(args.config) 70 | HiveConnection(host=config.host, port=config.port, user=config.user) 71 | 72 | table_creator = HiveTableCreator(config) 73 | table_creator.initialize('hive_init_commands/initial_hive_commands_stats.json') # mandatory (init hive) 74 | 75 | # init stats generators 76 | st = Statistics(config) 77 | 78 | # user section here 79 | # reports 80 | st.calculate_data_statistics() 81 | st.calculate_daily_statistics() 82 | st.calculate_monthly_statistics() 83 | st.calculate_zone_population() 84 | st.calculate_summary() 85 | st.calculate_user_date_histogram() 86 | # graphs 87 | st.daily_cdrs() 88 | st.daily_unique_users() 89 | st.daily_unique_locations() 90 | st.daily_average_cdrs() 91 | st.daily_unique_average_locations() 92 | 93 | # frequent locations (Report) 94 | st.frequent_locations() 95 | st.frequent_locations_night() 96 | 97 | # Prerequisite for Origin-Destination, if not wishing to calculate OD, kindly comment the code 98 | st.rank1_frequent_locations() # Require frequent_locations() in run_statistics.py 99 | 100 | print('Overall time elapsed: {} seconds'.format(format_two_point_time(start, time.time()))) 101 | ``` 102 | 103 | ## Output Reports 104 | The implementation is in [cdr_statistics.py](../Common/cdr_statistics.py) 105 | ### Data statistics 106 | Located in calculate_data_statistics(). In the data statistics, the result output (see the [Data statistics](../Statistics/output_reports/css_file_data_stat.csv)) provided will be: 107 | 108 | ``` 109 | Total Records : the total cdr usage data 110 | Total Days : the total days that have usage 111 | Unique id : the total unique ids of the data (it could be imei, imsi or another identifier given) 112 | Unique imei : the total unique imeis (will be omitted if it is the unique id already) 113 | Unique imsi : the total unique imsis (will be omitted if it is the unique id already) 114 | Unique loc name : the total unique latitude and longitude of the cdr data 115 | Start date : the starting date of the data 116 | End date : the end date of the data 117 | ``` 118 | ### Daily and Monthly Statistics 119 | Located in calculate_daily_statistics() and calculate_monthly_statistics (see the [daily](../Statistics/output_reports/css_provider_data_stat_daily.csv) and [monthly](../Statistics/output_reports/css_provider_data_stat_monthly.csv) output). Calculating some properties order by date first and then the type of call type and network type. 
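If you want to inspect the daily report outside Hive, a small pandas sketch like the one below can be used. It is only an illustration (not part of the tool) and assumes "csv_location" is set to csv_reports and the column layout shown in [css_provider_data_stat_daily.csv](../Statistics/output_reports/css_provider_data_stat_daily.csv):

```
import pandas as pd

# load the daily report produced by calculate_daily_statistics()
daily = pd.read_csv("csv_reports/css_provider_data_stat_daily.csv")

# keep only the overall rows (call_type and network_type both 'ALL')
overall = daily[(daily["call_type"] == "ALL") & (daily["network_type"] == "ALL")]
print(overall[["date", "total_records", "unique_id"]].to_string(index=False))
```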
120 | 
121 | Each field in the daily statistics is given per date (or per year and month for the monthly statistics), call type and network type, including:
122 | ```
123 | Date : the dates that have cdr records
124 | Call Type : the call type of the cdr data (VOICE, DATA, SMS)
125 | Network Type : the network type of the cdr data (2G, 3G, 4G)
126 | Total Records : the total records
127 | Total Days : the total days
128 | Unique id : the total unique ids of the data (it could be imei, imsi or another identifier given)
129 | Unique imei : the total unique imeis
130 | Unique imsi : the total unique imsis
131 | Unique loc name : the total unique latitude and longitude of the cdr data
132 | ```
133 | 
134 | ### Zone population
135 | At least one administration level is needed to calculate the zone population.
136 | It is indicated in the "cdr_cell_tower" mapping in config.json (the items marked USED below are the administration-unit columns used for the zone population)
137 | ```
138 | "cdr_cell_tower":[
139 | {"input_no":1, "input_name":"BTSID", "data_type":"String", "output_no":1, "name":"UID"},
140 | {"input_no":2, "input_name":"SITE_NAME", "data_type":"String", "output_no":2, "name":"SITE_NAME"},
141 | {"input_no":3, "input_name":"LONGITUDE", "data_type":"String", "output_no":3, "name":"LONGITUDE"},
142 | {"input_no":4, "input_name":"LATITUDE", "data_type":"String", "output_no":4, "name":"LATITUDE"},
143 | {"input_no":5, "input_name":"CELLID", "data_type":"String", "output_no":5, "name":"CELL_ID" },
144 | {"input_no":6, "input_name":"CELLNAME", "data_type":"String", "output_no":-1, "name":"CELLNAME" },
145 | {"input_no":7, "input_name":"CI", "data_type":"String", "output_no":-1, "name":"CI" },
146 | {"input_no":8, "input_name":"AZIMUTH", "data_type":"String", "output_no":-1, "name":"AZIMUTH" },
147 | USED {"input_no":9, "input_name":"DISTRICT", "data_type":"String", "output_no":6, "name":"ADMIN1", "geojson_filename": "japan.json", "geojson_col_name": "nam"},
148 | USED {"input_no":10, "input_name":"PROVINCE", "data_type":"String", "output_no":7, "name":"ADMIN2", "geojson_filename": "", "geojson_col_name": ""}
149 | ]
150 | ```
151 | You need to indicate which administration level each place-name column corresponds to. For example,
152 | the DISTRICT column is ADMIN1.
153 | 
154 | If you can provide a geojson file for an administration level, put its location in the key "geojson_filename" and the name of the corresponding geojson field in "geojson_col_name",
155 | so the tool can join the attributes correctly.
156 | 
157 | The result is zone_based_aggregations_level_ADMIN{X}.csv; see the sample output [zone_based_aggregations_level_ADMIN1](../Statistics/output_reports/zone_based_aggregations_level_ADMIN1.csv),
158 | 
159 | which includes:
160 | ```
161 | Administration Level x : Your input administration level x
162 | Count Activities : Total CDR Records that are in the administration level
163 | Count Unique IDs : Total Unique Ids
164 | ```
165 | as well as {geojson_file_name}_joined_ADMIN{X}.json (you can load this into a visualization service such as [kepler.gl](https://kepler.gl))
166 | 
167 | ### Summary Data
168 | The output summary is written to [summary_stats.csv](../Statistics/output_reports/summary_stats.csv).
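As a quick sanity check on the summary values, the daily averages are simply totals divided by the number of days. For example, using the numbers from the sample [summary_stats.csv](../Statistics/output_reports/summary_stats.csv):

```
# values taken from the sample summary_stats.csv
total_records = 17
total_days = 7

average_usage_per_day = round(total_records / total_days, 3)
print(average_usage_per_day)  # 2.429, matching the report
```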
The summary contains overall statistics of the cdr data including:
169 | ```
170 | Total records
171 | Total unique IDs
172 | Total days
173 | Average daily usage
174 | Average daily voice
175 | Average daily sms
176 | Average daily unique cell id
177 | Average Daily Admin Level 1
178 | ```
179 | ### Frequent Locations
180 | The frequent locations output (all-day and night) lists the most popular cell_ids at which a user makes calls during a day or a night.
181 | The output is written to tables named {your_prefix}_frequent_location_thresholded and {your_prefix}_frequent_location_thresholded_night.
182 | Example output
183 | 
184 | ![alt text](https://github.com/shibalab/CDR-analysis-tools-hadoop/blob/master/Statistics/output_reports/frequent_location_output_sample.png "Frequent Locations")
185 | 
186 | 
187 | ## Output Graphs
188 | The implementation is in [cdr_statistics.py](../Common/cdr_statistics.py)
189 | ### User Date Histogram
190 | The output histogram shows the number of active days (x) against the number of unique user ids who made calls on that many days (y).
191 | ![alt text](https://github.com/shibalab/CDR-analysis-tools-hadoop/blob/master/Statistics/output_graphs/user_data_histogram.png "User Date Histogram")
192 | ### Daily CDRs
193 | The graph reports the daily cdr usage per day, with the minimum, maximum, average and total number of cdrs
194 | ![alt text](https://github.com/shibalab/CDR-analysis-tools-hadoop/blob/master/Statistics/output_graphs/daily_cdrs.png "Daily CDRs")
195 | ### Daily CDRs by call type
196 | This graph reports the daily cdr usage per day, broken down by call type (multiple lines)
197 | ![alt text](https://github.com/shibalab/CDR-analysis-tools-hadoop/blob/master/Statistics/output_graphs/daily_cdr_by_call_type.png "Daily CDRs by call type")
198 | ### Daily Unique Users
199 | Daily unique uids are reported per day in the graph together with their statistics (minimum, maximum, average and total)
200 | ![alt text](https://github.com/shibalab/CDR-analysis-tools-hadoop/blob/master/Statistics/output_graphs/daily_unique_users.png "Daily Unique Users")
201 | ### Daily Unique Locations
202 | Daily unique locations (unique latitude and longitude pairs) are reported per day in the graph together with their statistics (minimum, maximum, average and total)
203 | ![alt text](https://github.com/shibalab/CDR-analysis-tools-hadoop/blob/master/Statistics/output_graphs/daily_unique_locations.png "Daily Unique Locations")
204 | ### Daily average CDRs
205 | Daily average CDRs are the average number of cdrs per user per day, with the overall average across days displayed on top
206 | ![alt text](https://github.com/shibalab/CDR-analysis-tools-hadoop/blob/master/Statistics/output_graphs/daily_avg_cdr.png "Daily Average CDRs")
207 | 
208 | ### Daily Unique Average Locations
209 | This graph represents the daily average number of unique locations per user, with the overall average across days shown at the top of the graph
210 | ![alt text](https://github.com/shibalab/CDR-analysis-tools-hadoop/blob/master/Statistics/output_graphs/daily_unique_avg_locations.png "Daily Unique Average Locations")
211 | 
212 | -------------------------------------------------------------------------------- /Statistics/output_graphs/daily_avg_cdr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/Statistics/output_graphs/daily_avg_cdr.png
-------------------------------------------------------------------------------- /Statistics/output_graphs/daily_cdr_by_call_type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/Statistics/output_graphs/daily_cdr_by_call_type.png -------------------------------------------------------------------------------- /Statistics/output_graphs/daily_cdrs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/Statistics/output_graphs/daily_cdrs.png -------------------------------------------------------------------------------- /Statistics/output_graphs/daily_unique_avg_locations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/Statistics/output_graphs/daily_unique_avg_locations.png -------------------------------------------------------------------------------- /Statistics/output_graphs/daily_unique_locations.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/Statistics/output_graphs/daily_unique_locations.png -------------------------------------------------------------------------------- /Statistics/output_graphs/daily_unique_users.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/Statistics/output_graphs/daily_unique_users.png -------------------------------------------------------------------------------- /Statistics/output_graphs/user_data_histogram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/Statistics/output_graphs/user_data_histogram.png -------------------------------------------------------------------------------- /Statistics/output_reports/css_file_data_stat.csv: -------------------------------------------------------------------------------- 1 | "total_records","total_days","unique_id","unique_imei","unique_imsi","unique_location_name","start_date","end_date" 2 | 17,7,9,1,1,7,"2016-05-01","2020-05-04" 3 | -------------------------------------------------------------------------------- /Statistics/output_reports/css_provider_data_stat_daily.csv: -------------------------------------------------------------------------------- 1 | "date","call_type","network_type","total_records","total_days","unique_id","unique_imei","unique_imsi","unique_location_name" 2 | "2016-05-01","ALL","ALL",10,1,9,1,1,6 3 | "2016-05-01","ALL","4G",1,1,1,1,1,1 4 | "2016-05-01","ALL","3G",5,1,5,1,1,4 5 | "2016-05-01","ALL","2G",4,1,4,1,1,4 6 | "2016-05-01","DATA","ALL",6,1,6,1,1,4 7 | "2016-05-01","DATA","4G",1,1,1,1,1,1 8 | "2016-05-01","DATA","3G",3,1,3,1,1,2 9 | "2016-05-01","DATA","2G",2,1,2,1,1,2 10 | "2016-05-01","SMS","ALL",1,1,1,1,1,1 11 | "2016-05-01","SMS","2G",1,1,1,1,1,1 12 | "2016-05-01","VOICE","ALL",3,1,3,1,1,3 13 | "2016-05-01","VOICE","3G",2,1,2,1,1,2 14 | 
"2016-05-01","VOICE","2G",1,1,1,1,1,1 15 | "2016-05-02","ALL","ALL",1,1,1,1,1,1 16 | "2016-05-02","ALL","3G",1,1,1,1,1,1 17 | "2016-05-02","DATA","ALL",1,1,1,1,1,1 18 | "2016-05-02","DATA","3G",1,1,1,1,1,1 19 | "2016-05-03","ALL","ALL",1,1,1,1,1,1 20 | "2016-05-03","ALL","3G",1,1,1,1,1,1 21 | "2016-05-03","DATA","ALL",1,1,1,1,1,1 22 | "2016-05-03","DATA","3G",1,1,1,1,1,1 23 | "2016-05-04","ALL","ALL",1,1,1,1,1,1 24 | "2016-05-04","ALL","4G",1,1,1,1,1,1 25 | "2016-05-04","DATA","ALL",1,1,1,1,1,1 26 | "2016-05-04","DATA","4G",1,1,1,1,1,1 27 | "2016-06-01","ALL","ALL",2,1,2,1,1,2 28 | "2016-06-01","ALL","4G",1,1,1,1,1,1 29 | "2016-06-01","ALL","3G",1,1,1,1,1,1 30 | "2016-06-01","DATA","ALL",2,1,2,1,1,2 31 | "2016-06-01","DATA","4G",1,1,1,1,1,1 32 | "2016-06-01","DATA","3G",1,1,1,1,1,1 33 | "2016-07-03","ALL","ALL",1,1,1,1,1,1 34 | "2016-07-03","ALL","3G",1,1,1,1,1,1 35 | "2016-07-03","DATA","ALL",1,1,1,1,1,1 36 | "2016-07-03","DATA","3G",1,1,1,1,1,1 37 | "2020-05-04","ALL","ALL",1,1,1,1,1,1 38 | "2020-05-04","ALL","4G",1,1,1,1,1,1 39 | "2020-05-04","DATA","ALL",1,1,1,1,1,1 40 | "2020-05-04","DATA","4G",1,1,1,1,1,1 41 | -------------------------------------------------------------------------------- /Statistics/output_reports/css_provider_data_stat_monthly.csv: -------------------------------------------------------------------------------- 1 | "year","month","call_type","network_type","total_records","total_days","unique_id","unique_imei","unique_imsi","unique_location_name" 2 | 2016,5,"ALL","ALL",13,4,9,1,1,6 3 | 2016,5,"ALL","4G",2,2,1,1,1,1 4 | 2016,5,"ALL","3G",7,3,5,1,1,4 5 | 2016,5,"ALL","2G",4,1,4,1,1,4 6 | 2016,5,"DATA","ALL",9,4,6,1,1,4 7 | 2016,5,"DATA","4G",2,2,1,1,1,1 8 | 2016,5,"DATA","3G",5,3,3,1,1,2 9 | 2016,5,"DATA","2G",2,1,2,1,1,2 10 | 2016,5,"SMS","ALL",1,1,1,1,1,1 11 | 2016,5,"SMS","2G",1,1,1,1,1,1 12 | 2016,5,"VOICE","ALL",3,1,3,1,1,3 13 | 2016,5,"VOICE","3G",2,1,2,1,1,2 14 | 2016,5,"VOICE","2G",1,1,1,1,1,1 15 | 2020,5,"ALL","ALL",1,1,1,1,1,1 16 | 2020,5,"ALL","4G",1,1,1,1,1,1 17 | 2020,5,"DATA","ALL",1,1,1,1,1,1 18 | 2020,5,"DATA","4G",1,1,1,1,1,1 19 | -------------------------------------------------------------------------------- /Statistics/output_reports/frequent_location_output_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/Statistics/output_reports/frequent_location_output_sample.png -------------------------------------------------------------------------------- /Statistics/output_reports/summary_stats.csv: -------------------------------------------------------------------------------- 1 | "All Data","Value" 2 | "total_records",17 3 | "total_uids",9 4 | "total_days","7 (1 May 2016-4 May 2020)" 5 | " 6 | " 7 | "Statistics" 8 | "average_usage_per_day",2.429 9 | "average_daily_voice",0.429 10 | "average_daily_sms",0.143 11 | "average_daily_unique_cell_id",1.857 12 | "average_admin1_per_day",1.0 13 | -------------------------------------------------------------------------------- /Statistics/output_reports/zone_based_aggregations_level_ADMIN1.csv: -------------------------------------------------------------------------------- 1 | "admin1","count_activities","count_unique_ids" 2 | "Sendai",17,9 3 | -------------------------------------------------------------------------------- /Statistics/output_reports/zone_based_aggregations_level_ADMIN2.csv: 
-------------------------------------------------------------------------------- 1 | "admin2","count_activities","count_unique_ids" 2 | "Miyagi",17,9 3 | -------------------------------------------------------------------------------- /hive_init_commands/initial_hive_commands_interpolation.json: -------------------------------------------------------------------------------- 1 | { 2 | "hive_commands": [ 3 | "ADD JAR /hadoop/hive/lib/cdrmobilitylib.jar", 4 | "ADD JAR /hadoop/hive/lib/cdrmobilitylibge.jar", 5 | "ADD JAR /hadoop/hive/lib/cdrmobilitylibjica.jar", 6 | "ADD JAR /hadoop/hive/lib/jts-1.12.jar", 7 | "ADD JAR /hadoop/hive/lib/jtsio-1.12.jar", 8 | "ADD JAR /hadoop/hive/lib/javaml-0.1.6.jar", 9 | "ADD JAR /hadoop/hive/lib/weka.jar", 10 | "ADD JAR /hadoop/hive/lib/libsvm.jar", 11 | "ADD JAR /hadoop/hive/lib/ajt-2.5.jar", 12 | "ADD JAR /hadoop/hive/lib/Jama-1.0.2.jar", 13 | "create temporary function FindPolygon as 'com.apichon.mobility.hive.udf.FindPolygon'", 14 | "create temporary function FindPolygon2 as 'com.apichon.mobility.hive.udf.FindPolygon2'", 15 | "create temporary function CreateTrajectoriesCDR as 'com.apichon.mobility.hive.udf.ge.CreateTrajectoriesCDR'", 16 | "create temporary function TripSegmentationCDR as 'com.apichon.mobility.hive.udf.ge.TripSegmentationCDR'", 17 | "create temporary function CreateTrajectoriesJICAWithZone as 'com.apichon.mobility.hive.udf.jica.CreateTrajectoriesJICAWithZone'", 18 | "create temporary function TripOD as 'com.apichon.mobility.hive.udf.jica.TripOD'", 19 | "create temporary function TripODStay as 'com.apichon.mobility.hive.udf.jica.TripODStay'", 20 | "set hive.exec.dynamic.partition.mode=nonstrict", 21 | "set hive.exec.dynamic.partition=true", 22 | "set hive.exec.max.dynamic.partitions.pernode=4000", 23 | "set hive.exec.max.created.files=150000", 24 | "set hive.enforce.bucketing=true", 25 | "set mapred.reduce.slowstart.completed.maps=0.98", 26 | "set mapred.job.reuse.jvm.num.tasks=50", 27 | "set mapred.child.java.opts=-Xmx2048m -XX:-UseGCOverheadLimit", 28 | "set hive.map.aggr.hash.percentmemory = 0.25", 29 | "ADD jar /hadoop/hive/lib/commons-pool-1.5.4.jar", 30 | "ADD jar /hadoop/hive/lib/commons-dbcp-1.4.jar", 31 | "ADD jar /hadoop/hive/lib/commons-lang-2.6.jar", 32 | "ADD jar /hadoop/hive/lib/commons-logging-1.1.3.jar", 33 | "ADD jar /hadoop/hive/lib/commons-math-2.1.jar", 34 | "ADD jar /hadoop/hive/lib/gt-api-9.3.jar", 35 | "ADD jar /hadoop/hive/lib/gt-data-9.3.jar", 36 | "ADD jar /hadoop/hive/lib/gt-epsg-hsql-12.2.jar", 37 | "ADD jar /hadoop/hive/lib/gt-main-9.3.jar", 38 | "ADD jar /hadoop/hive/lib/gt-metadata-9.3.jar", 39 | "ADD jar /hadoop/hive/lib/gt-opengis-9.3.jar", 40 | "ADD jar /hadoop/hive/lib/gt-referencing-9.3.jar", 41 | "ADD jar /hadoop/hive/lib/jsr-275-1.0-beta-2.jar", 42 | "ADD jar /hadoop/hive/lib/jts-1.13.jar", 43 | "ADD jar /hadoop/hive/lib/vecmath-1.3.2.jar", 44 | "ADD jar /hadoop/hive/lib/postgresql-9.3-1102.jdbc4.jar", 45 | "ADD jar /hadoop/hive/lib/postgis-jdbc-2.1.0SVN.jar", 46 | 47 | "ADD jar /hadoop/hive/lib/cdrinterpolationlib.jar", 48 | 49 | "create temporary function f_turkcellarray as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.TurkcellCdrArrayUDAF'", 50 | "create temporary function f_organizearray as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.TurkcellOrganizeTripUDF'", 51 | "create temporary function f_reallocation as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.TurkcellReallocationUDF'", 52 | "create temporary function f_routing as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.TurkcellRoutingUDF'", 53 | "create temporary function 
f_routing2 as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.TurkcellRoutingUDF2'", 54 | "create temporary function f_hmesh as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.HMeshUDF'", 55 | "create temporary function f_dumptrip as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.DumpTripUDF'", 56 | "create temporary function f_hmesharray as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.HMeshArrayUDF'", 57 | "create temporary function DumpTripUDF as 'jp.ac.ut.csis.pflow.turkcell.hive.udf.DumpTripUDF'", 58 | "create temporary function sp_distance as 'com.apichon.mobility.hive.udf.sp_distance'", 59 | "set hive.support.sql11.reserved.keywords=false", 60 | "set hive.fetch.task.conversion=minimal", 61 | 62 | "add file {poi_location}", 63 | "add file {osm_location}", 64 | "add file {voronoi_location}", 65 | "use {db_name}" 66 | ] 67 | } -------------------------------------------------------------------------------- /hive_init_commands/initial_hive_commands_od.json: -------------------------------------------------------------------------------- 1 | { 2 | "hive_commands": [ 3 | "ADD JAR /hadoop/hive/lib/cdrmobilitylib.jar", 4 | "ADD JAR /hadoop/hive/lib/cdrmobilitylibge.jar", 5 | "ADD JAR /hadoop/hive/lib/cdrmobilitylibjica.jar", 6 | "ADD JAR /hadoop/hive/lib/jts-1.12.jar", 7 | "ADD JAR /hadoop/hive/lib/jtsio-1.12.jar", 8 | "ADD JAR /hadoop/hive/lib/javaml-0.1.6.jar", 9 | "ADD JAR /hadoop/hive/lib/weka.jar", 10 | "ADD JAR /hadoop/hive/lib/libsvm.jar", 11 | "ADD JAR /hadoop/hive/lib/ajt-2.5.jar", 12 | "ADD JAR /hadoop/hive/lib/Jama-1.0.2.jar", 13 | "create temporary function FindPolygon as 'com.apichon.mobility.hive.udf.FindPolygon'", 14 | "create temporary function FindPolygon2 as 'com.apichon.mobility.hive.udf.FindPolygon2'", 15 | "create temporary function CreateTrajectoriesCDR as 'com.apichon.mobility.hive.udf.ge.CreateTrajectoriesCDR'", 16 | "create temporary function TripSegmentationCDR as 'com.apichon.mobility.hive.udf.ge.TripSegmentationCDR'", 17 | "create temporary function CreateTrajectoriesJICAWithZone as 'com.apichon.mobility.hive.udf.jica.CreateTrajectoriesJICAWithZone'", 18 | "create temporary function TripOD as 'com.apichon.mobility.hive.udf.jica.TripOD'", 19 | "create temporary function TripODStay as 'com.apichon.mobility.hive.udf.jica.TripODStay'", 20 | "set hive.exec.dynamic.partition.mode=nonstrict", 21 | "set hive.exec.dynamic.partition=true", 22 | "set hive.exec.max.dynamic.partitions.pernode=4000", 23 | "set hive.exec.max.created.files=150000", 24 | "set hive.enforce.bucketing=true", 25 | "set mapred.map.tasks.speculative.execution=false", 26 | "set mapred.reduce.tasks.speculative.execution=false", 27 | "set hive.mapred.reduce.tasks.speculative.execution=false", 28 | "set hive.map.aggr=false", 29 | "set mapred.reduce.slowstart.completed.maps=0.98", 30 | "set mapred.job.reuse.jvm.num.tasks=50", 31 | "set mapred.child.java.opts=-Xmx8096m -XX:-UseGCOverheadLimit -XX:+UseConcMarkSweepGC", 32 | "set hive.tez.container.size=4096", 33 | "set hive.map.aggr.hash.percentmemory = 0.25", 34 | "ADD jar /hadoop/hive/lib/commons-pool-1.5.4.jar", 35 | "ADD jar /hadoop/hive/lib/commons-dbcp-1.4.jar", 36 | "ADD jar /hadoop/hive/lib/commons-lang-2.6.jar", 37 | "ADD jar /hadoop/hive/lib/commons-logging-1.1.3.jar", 38 | "ADD jar /hadoop/hive/lib/commons-math-2.1.jar", 39 | "ADD jar /hadoop/hive/lib/jts-1.13.jar", 40 | "ADD jar /hadoop/hive/lib/vecmath-1.3.2.jar", 41 | "create temporary function sp_distance as 'com.apichon.mobility.hive.udf.sp_distance'", 42 | "set hive.support.sql11.reserved.keywords=false", 43 | "use 
{db_name}" 44 | ] 45 | } -------------------------------------------------------------------------------- /hive_init_commands/initial_hive_commands_stats.json: -------------------------------------------------------------------------------- 1 | { 2 | "hive_commands": [ 3 | "set hive.exec.dynamic.partition.mode=nonstrict", 4 | "set hive.exec.dynamic.partition=true", 5 | "set hive.exec.max.dynamic.partitions.pernode=4000", 6 | "set hive.exec.max.created.files=150000", 7 | "set hive.enforce.bucketing=true", 8 | "set mapred.map.tasks.speculative.execution=false", 9 | "set mapred.reduce.tasks.speculative.execution=false", 10 | "set hive.mapred.reduce.tasks.speculative.execution=false", 11 | "set hive.map.aggr=false", 12 | "set mapred.reduce.slowstart.completed.maps=0.98", 13 | "set mapred.job.reuse.jvm.num.tasks=50", 14 | "set hive.support.sql11.reserved.keywords=false", 15 | "use {db_name}" 16 | ] 17 | } -------------------------------------------------------------------------------- /lib/Jama-1.0.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/Jama-1.0.2.jar -------------------------------------------------------------------------------- /lib/ajt-2.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/ajt-2.5.jar -------------------------------------------------------------------------------- /lib/cdrinterpolationlib.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/cdrinterpolationlib.jar -------------------------------------------------------------------------------- /lib/cdrlibindicator.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/cdrlibindicator.jar -------------------------------------------------------------------------------- /lib/cdrmobilitylib.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/cdrmobilitylib.jar -------------------------------------------------------------------------------- /lib/cdrmobilitylibge.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/cdrmobilitylibge.jar -------------------------------------------------------------------------------- /lib/cdrmobilitylibjica.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/cdrmobilitylibjica.jar -------------------------------------------------------------------------------- /lib/commons-dbcp-1.4.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/commons-dbcp-1.4.jar -------------------------------------------------------------------------------- /lib/commons-lang-2.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/commons-lang-2.6.jar -------------------------------------------------------------------------------- /lib/commons-logging-1.1.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/commons-logging-1.1.3.jar -------------------------------------------------------------------------------- /lib/commons-math-2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/commons-math-2.1.jar -------------------------------------------------------------------------------- /lib/commons-pool-1.5.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/commons-pool-1.5.4.jar -------------------------------------------------------------------------------- /lib/gt-api-9.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-api-9.3.jar -------------------------------------------------------------------------------- /lib/gt-data-9.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-data-9.3.jar -------------------------------------------------------------------------------- /lib/gt-epsg-hsql-12.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-epsg-hsql-12.2.jar -------------------------------------------------------------------------------- /lib/gt-main-9.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-main-9.3.jar -------------------------------------------------------------------------------- /lib/gt-metadata-2.6.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-metadata-2.6.5.jar -------------------------------------------------------------------------------- /lib/gt-metadata-9.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-metadata-9.3.jar -------------------------------------------------------------------------------- 
/lib/gt-opengis-9.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-opengis-9.3.jar -------------------------------------------------------------------------------- /lib/gt-referencing-2.6.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-referencing-2.6.5.jar -------------------------------------------------------------------------------- /lib/gt-referencing-9.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-referencing-9.3.jar -------------------------------------------------------------------------------- /lib/gt-shapefile-2.6.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/gt-shapefile-2.6.5.jar -------------------------------------------------------------------------------- /lib/jahmm-0.6.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/jahmm-0.6.2.jar -------------------------------------------------------------------------------- /lib/javaml-0.1.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/javaml-0.1.6.jar -------------------------------------------------------------------------------- /lib/jsr-275-1.0-beta-2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/jsr-275-1.0-beta-2.jar -------------------------------------------------------------------------------- /lib/jts-1.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/jts-1.12.jar -------------------------------------------------------------------------------- /lib/jts-1.13.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/jts-1.13.jar -------------------------------------------------------------------------------- /lib/jtsio-1.12.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/jtsio-1.12.jar -------------------------------------------------------------------------------- /lib/libsvm.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/libsvm.jar 
-------------------------------------------------------------------------------- /lib/pflow-hiveUDF.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/pflow-hiveUDF.jar -------------------------------------------------------------------------------- /lib/postgis-jdbc-2.1.0SVN.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/postgis-jdbc-2.1.0SVN.jar -------------------------------------------------------------------------------- /lib/postgresql-9.3-1102.jdbc4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/postgresql-9.3-1102.jdbc4.jar -------------------------------------------------------------------------------- /lib/vecmath-1.3.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/vecmath-1.3.2.jar -------------------------------------------------------------------------------- /lib/weka.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpatialDataCommons/CDR-analysis-tools-hadoop/8e02b72cbcfe031481659d75eb224b52437fae8e/lib/weka.jar -------------------------------------------------------------------------------- /queries/cdr_and_mapping/create_consolidate_cdr.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_consolidate_data_all({arg_prep}) 2 | PARTITIONED BY (pdt string) ROW FORMAT DELIMITED 3 | FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/cdr_and_mapping/create_mapping_admin.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {provider_prefix}_cell_tower_data_{admin} 2 | ({admin}_id string, {admin}_name string, latitude string, longitude string) 3 | ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' 4 | STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/cdr_and_mapping/create_preprocess_cdr.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_preprocess({args}) ROW FORMAT DELIMITED 2 | FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/cdr_and_mapping/create_preprocess_mapping.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS {provider_prefix}_cell_tower_data_preprocess({arg_create}) 2 | ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/cdr_and_mapping/create_raw_cdr.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE 
{provider_prefix}_raw ({arg_raw}) 2 | ROW FORMAT DELIMITED FIELDS TERMINATED BY "{field_delimiter}" 3 | LINES TERMINATED BY '\n' STORED AS TEXTFILE 4 | tblproperties ("skip.header.line.count"="{cell_tower_header}") -------------------------------------------------------------------------------- /queries/cdr_and_mapping/create_raw_mapping.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_cell_tower_data_raw ({arg_raw}) ROW FORMAT DELIMITED 2 | FIELDS TERMINATED BY "{field_delimiter}" LINES TERMINATED BY '\n' STORED AS TEXTFILE 3 | tblproperties ("skip.header.line.count"="{have_header}") -------------------------------------------------------------------------------- /queries/cdr_and_mapping/insert_consolidate_cdr.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO TABLE {provider_prefix}_consolidate_data_all 2 | PARTITION (pdt) select {arg_con}, to_date(call_time) as pdt 3 | from {provider_prefix}_preprocess 4 | 5 | -------------------------------------------------------------------------------- /queries/cdr_and_mapping/insert_consolidate_cdr_join.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO TABLE {provider_prefix}_consolidate_data_all 2 | PARTITION (pdt) select {arg_con}, to_date(a1.call_time) as pdt 3 | from {provider_prefix}_preprocess a1 join 4 | {provider_prefix}_cell_tower_data_preprocess a2 5 | on(a1.cell_id = a2.cell_id) 6 | 7 | -------------------------------------------------------------------------------- /queries/cdr_and_mapping/insert_mapping_admin.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_cell_tower_data_{admin} 2 | select row_number() OVER () - 1 as rowidx, {admin}, latitude, longitude 3 | from {provider_prefix}_cell_tower_data_preprocess where translate({admin},' ',' ') != '' 4 | {check_lat_lng} group by {admin}, latitude, longitude order by rowidx -------------------------------------------------------------------------------- /queries/cdr_and_mapping/insert_preprocess_cdr.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_preprocess 2 | select {distinct} {arg} from {provider_prefix}_raw 3 | -------------------------------------------------------------------------------- /queries/cdr_and_mapping/insert_preprocess_mapping.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO TABLE {provider_prefix}_cell_tower_data_preprocess 2 | select {distinct} {arg} from {provider_prefix}_cell_tower_data_raw -------------------------------------------------------------------------------- /queries/interpolation/create_cdr_by_uid.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_cdr_by_uid (uid string, arr ARRAY>) 2 | PARTITIONED BY (pdt string) 3 | ROW FORMAT DELIMITED 4 | FIELDS TERMINATED BY '\t' 5 | COLLECTION ITEMS TERMINATED BY ',' 6 | MAP KEYS TERMINATED BY '!' 
7 | LINES TERMINATED BY '\n' 8 | STORED AS ORC 9 | -------------------------------------------------------------------------------- /queries/interpolation/create_poi_relocation.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_cdr_by_uid_trip_realloc_array_apd (uid string, arr ARRAY>) 2 | PARTITIONED BY (pdt string) 3 | ROW FORMAT DELIMITED 4 | FIELDS TERMINATED BY '\t' 5 | COLLECTION ITEMS TERMINATED BY ',' 6 | MAP KEYS TERMINATED BY '!' 7 | LINES TERMINATED BY '\n' 8 | STORED AS ORC 9 | -------------------------------------------------------------------------------- /queries/interpolation/create_route_interpolation.sql: -------------------------------------------------------------------------------- 1 | create table {provider_prefix}_cdr_by_uid_trip_routing_array_apd 2 | (uid string, route_arr ARRAY>) 3 | partitioned by (pdt string) 4 | row format delimited fields terminated by '\t' 5 | collection items terminated by ',' 6 | map keys terminated by '!' 7 | lines terminated by '\n' 8 | stored as ORC 9 | 10 | -------------------------------------------------------------------------------- /queries/interpolation/create_trip_24_hr_padding.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_cdr_by_uid_trip_organized_array_apd (uid string, arr ARRAY>) 2 | PARTITIONED BY (pdt string) 3 | ROW FORMAT DELIMITED 4 | FIELDS TERMINATED BY '\t' 5 | COLLECTION ITEMS TERMINATED BY ',' 6 | MAP KEYS TERMINATED BY '!' 7 | LINES TERMINATED BY '\n' 8 | STORED AS ORC 9 | -------------------------------------------------------------------------------- /queries/interpolation/create_trip_format.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_cdr_by_uid_trip (uid string, arr ARRAY>) 2 | PARTITIONED BY (pdt string) 3 | ROW FORMAT DELIMITED 4 | FIELDS TERMINATED BY '\t' 5 | COLLECTION ITEMS TERMINATED BY ',' 6 | MAP KEYS TERMINATED BY '!' 
7 | LINES TERMINATED BY '\n' 8 | STORED AS ORC 9 | -------------------------------------------------------------------------------- /queries/interpolation/export_to_gps_format.sql: -------------------------------------------------------------------------------- 1 | insert overwrite local directory '/tmp/hive/csv_interpolation' 2 | ROW FORMAT DELIMITED 3 | FIELDS TERMINATED BY ',' 4 | select CONCAT_WS(',',uid,trip_seq, mobilitytype, mode, totaldistance, totaltime, starttime,endtime, totalpoints, regexp_replace(m,'\\\|',',')) 5 | from (select uid ,m[1] as trip_seq,m[2] as mobilitytype,m[3] as mode,m[4] totaldistance ,m[5] as 6 | totaltime,m[6] as starttime,m[7] as endtime,m[8] as totalpoints,m[9] as pointlist 7 | from (select * from {provider_prefix}_cdr_by_uid_trip_routing_array_apd where size(route_arr)>1) t1 LATERAL 8 | VIEW explode(t1.route_arr) myTable1 AS m) t1 LATERAL VIEW explode(split(t1.pointlist,'\;')) myTable1 AS m 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /queries/interpolation/insert_cdr_by_uid.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_cdr_by_uid PARTITION(pdt) 2 | select uid as uid, CreateTrajectoriesCDR(time, duration,cell_id,longitude,latitude) as arr, pdate as pdt 3 | from cdr_test_interpolation group by pdate, uid 4 | having count(*) <= {max_size_cdr_by_uid} -------------------------------------------------------------------------------- /queries/interpolation/insert_poi_relocation.sql: -------------------------------------------------------------------------------- 1 | insert overwrite table {provider_prefix}_cdr_by_uid_trip_realloc_array_apd partition (pdt) 2 | select uid,f_reallocation(uid,arr,pdt, "{poi}"),pdt 3 | from {provider_prefix}_cdr_by_uid_trip_organized_array_apd -------------------------------------------------------------------------------- /queries/interpolation/insert_route_interpolation.sql: -------------------------------------------------------------------------------- 1 | insert overwrite table {provider_prefix}_cdr_by_uid_trip_routing_array_apd partition (pdt) 2 | select uid,f_routing(uid,arr,"{osm}","{voronoi}"), pdt 3 | from {provider_prefix}_cdr_by_uid_trip_realloc_array_apd 4 | where (size(arr)>0 and size(arr)<={max_size_interpolation}) 5 | -------------------------------------------------------------------------------- /queries/interpolation/insert_trip_24_hr_padding.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_cdr_by_uid_trip_organized_array_apd PARTITION(pdt) 2 | select uid, f_organizearray(uid,arr) as arr,pdt 3 | from {provider_prefix}_cdr_by_uid_trip where size(arr) > 0 4 | -------------------------------------------------------------------------------- /queries/interpolation/insert_trip_format.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_cdr_by_uid_trip PARTITION(pdt) 2 | select uid, TripSegmentationCDR(arr,uid) as arr,pdt from {provider_prefix}_cdr_by_uid 3 | -------------------------------------------------------------------------------- /queries/origin_destination/create_la_cdr_all_with_ant_zone_by_uid.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid(uid string, arr ARRAY < ARRAY < string >>) PARTITIONED 2 | BY(pdt 
string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY ',' 3 | MAP KEYS TERMINATED BY '!' LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/origin_destination/create_la_cdr_all_with_ant_zone_by_uid_od.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od 2 | (uid string, home_site_id string,home_zone string, arr ARRAY<ARRAY<string>>) 3 | PARTITIONED BY (pdt string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 4 | COLLECTION ITEMS TERMINATED BY ',' MAP KEYS TERMINATED BY '!' 5 | LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE 6 | 7 | -------------------------------------------------------------------------------- /queries/origin_destination/create_la_cdr_all_with_ant_zone_by_uid_od_detail.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail 2 | (uid string, home_site_id string,home_zone string, arr ARRAY<string>) 3 | PARTITIONED BY (pdt string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 4 | COLLECTION ITEMS TERMINATED BY ',' MAP KEYS TERMINATED BY '!' 5 | LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/origin_destination/create_la_cdr_all_with_ant_zone_by_uid_od_sum.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum 2 | (origin string,destination string,tcount double, tusercount double) 3 | PARTITIONED BY (pdt string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 4 | COLLECTION ITEMS TERMINATED BY ',' MAP KEYS TERMINATED BY '!' 5 | LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/origin_destination/create_la_cdr_uid_home.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_la_cdr_uid_home (uid string, site_id string, tcount 2 | int, trank int, ppercent double, LONGITUDE string, LATITUDE string, {admin_params}) 3 | ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY ',' 4 | MAP KEYS TERMINATED BY '!' 
LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /queries/origin_destination/insert_la_cdr_all_with_ant_zone_by_old_consolidate.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid PARTITION (pdt) 2 | select a1.uid, CreateTrajectoriesJICAWithZone (a1.uid, call_time, duration, a2.longitude, a2.latitude, a1.cell_id, a3.{target_admin}_id) 3 | as arr, pdt from {provider_prefix}_consolidate_data_all a1 join {provider_prefix}_cell_tower_data_preprocess a2 4 | on (a1.cell_id = a2.cell_id) 5 | join {provider_prefix}_cell_tower_data_{target_admin} a3 on (a2.latitude = a3.latitude and a2.longitude = a3.longitude) 6 | where to_date(pdt) = "{od_date}" group by a1.uid, pdt 7 | -------------------------------------------------------------------------------- /queries/origin_destination/insert_la_cdr_all_with_ant_zone_by_uid.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid PARTITION (pdt) 2 | select a1.uid, CreateTrajectoriesJICAWithZone 3 | (a1.uid, call_time, duration, a1.longitude, a1.latitude, concat(a1.latitude, ' : ', a1.longitude), a2.{target_admin}_id) 4 | as arr, pdt from {provider_prefix}_consolidate_data_all a1 5 | join {provider_prefix}_cell_tower_data_{target_admin} a2 on (a1.latitude = a2.latitude and a1.longitude = a2.longitude) 6 | where to_date(pdt) = "{od_date}" group by a1.uid, pdt -------------------------------------------------------------------------------- /queries/origin_destination/insert_la_cdr_all_with_ant_zone_by_uid_od.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od PARTITION (pdt) 2 | select t1.uid,t2.site_id as home_site_id, t2.{target_unit}_id as home_zone, 3 | TripOD(arr, t1.uid, t2.site_id, t2.{target_unit}_id, t2.LONGITUDE, t2.LATITUDE), pdt 4 | from {provider_prefix}_la_cdr_all_with_ant_zone_by_uid t1 5 | inner join {provider_prefix}_la_cdr_uid_home t2 on t1.uid = t2.uid 6 | where size(arr) <= 500 7 | -------------------------------------------------------------------------------- /queries/origin_destination/insert_la_cdr_all_with_ant_zone_by_uid_od_detail.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail PARTITION (pdt) 2 | select uid ,home_site_id,home_zone,m as arr,pdt 3 | from {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od t1 4 | LATERAL VIEW explode(t1.arr) myTable1 AS m -------------------------------------------------------------------------------- /queries/origin_destination/insert_la_cdr_all_with_ant_zone_by_uid_od_sum.sql: -------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum PARTITION(pdt) 2 | select arr[2] as origin, arr[3] as destination, count(*) as tcount, count(distinct uid) as tusercount, pdt 3 | from {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_detail where ((arr[2] != '-1' and arr[3] != '-1' ) ) 4 | group by pdt,arr[2],arr[3] -------------------------------------------------------------------------------- /queries/origin_destination/insert_la_cdr_uid_home.sql: 
-------------------------------------------------------------------------------- 1 | INSERT OVERWRITE TABLE {provider_prefix}_la_cdr_uid_home 2 | select uid, unique_location, tcount, trank, ppercent, latitude, longitude, admin1_id 3 | from {provider_prefix}_frequent_locations where trank = 1 -------------------------------------------------------------------------------- /queries/origin_destination/od_to_csv.sql: -------------------------------------------------------------------------------- 1 | insert overwrite local directory '/tmp/hive/od_result' 2 | ROW FORMAT DELIMITED 3 | FIELDS TERMINATED BY '\t' 4 | select CONCAT_WS('\t',pdt,origin , 5 | destination,cast(tcount as string),cast(tusercount as string)) 6 | from {provider_prefix}_la_cdr_all_with_ant_zone_by_uid_od_sum t1 -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_average_cdrs/daily_average_cdrs.sql: -------------------------------------------------------------------------------- 1 | select date, total_records/total_uids as daily_average_cdr 2 | from(select to_date(call_time) as date, count(distinct uid) as total_uids, 3 | count(*) as total_records from {provider_prefix}_consolidate_data_all a1 4 | group by to_date(call_time) order by date)td1 -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_average_unique_locations/daily_average_unique_locations.sql: -------------------------------------------------------------------------------- 1 | select date, unique_locations/unique_users as daily_avg_locations, unique_cell_ids/unique_users as daily_avg_cell_ids 2 | from (select to_date(call_time) as date, count(distinct latitude, longitude) as unique_locations, 3 | count(distinct uid) as unique_users, count(distinct cell_id) as unique_cell_ids 4 | from {provider_prefix}_consolidate_data_all group by to_date(call_time) order by date) td1 -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_average_unique_locations/daily_average_unique_locations_old_consolidate.sql: -------------------------------------------------------------------------------- 1 | select date, unique_locations/unique_users as daily_avg_locations, unique_cell_ids/unique_users as daily_avg_cell_ids 2 | from (select to_date(call_time) as date, count(distinct a2.latitude, a2.longitude) as unique_locations, 3 | count(distinct a1.uid) as unique_users, count(distinct a1.cell_id) as unique_cell_ids 4 | from {provider_prefix}_consolidate_data_all a1 join {provider_prefix}_cell_tower_data_preprocess a2 5 | on(a1.cell_id = a2.cell_id) group by to_date(call_time) order by date) td1 -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_cdrs/total_daily_cdrs.sql: -------------------------------------------------------------------------------- 1 | select to_date(call_time) as date, count(*) as total_records 2 | from {provider_prefix}_consolidate_data_all group by to_date(call_time) 3 | order by date -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_cdrs_by_call_type/daily_cdrs_by_call_type.sql: -------------------------------------------------------------------------------- 1 | SELECT to_date(call_time) as date, 'ALL' as call_type, 'ALL' as network_type, COUNT(*) as total_records, 2 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 3 | 
COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all where to_date(pdt) 4 | between to_date('{start_date}') and to_date('{end_date}') GROUP BY to_date(call_time) 5 | UNION 6 | SELECT to_date(call_time) as date, call_type, 'ALL' as network_type, COUNT(*) as total_records, 7 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 8 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all where to_date(pdt) 9 | between to_date('{start_date}') and to_date('{end_date}') GROUP BY to_date(call_time), call_type 10 | ORDER BY call_type ASC, network_type DESC 11 | -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_unique_locations/daily_unique_locations.sql: -------------------------------------------------------------------------------- 1 | select to_date(call_time) as date, count(distinct latitude, longitude) as unique_locations 2 | from {provider_prefix}_consolidate_data_all group by to_date(call_time) order by date -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_unique_locations/daily_unique_locations_old_consolidate.sql: -------------------------------------------------------------------------------- 1 | select to_date(call_time) as date, count(distinct a2.latitude, a2.longitude) as unique_locations 2 | from {provider_prefix}_consolidate_data_all a1 join {provider_prefix}_cell_tower_data_preprocess a2 3 | on(a1.cell_id = a2.cell_id) group by to_date(call_time) order by date -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_unique_locations/total_unique_locations.sql: -------------------------------------------------------------------------------- 1 | select count(*) as count_unique_locations from (select distinct latitude, longitude 2 | from {provider_prefix}_consolidate_data_all) td -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_unique_locations/total_unique_locations_old_consolidate.sql: -------------------------------------------------------------------------------- 1 | select count(*) as count_unique_locations from (select distinct a2.latitude, a2.longitude 2 | from {provider_prefix}_consolidate_data_all a1 3 | join {provider_prefix}_cell_tower_data_preprocess a2 4 | on(a1.cell_id = a2.cell_id)) td -------------------------------------------------------------------------------- /queries/statistics/graphs/daily_unique_users/total_daily_uids.sql: -------------------------------------------------------------------------------- 1 | select to_date(call_time) as date, count(distinct uid) as total_users 2 | from {provider_prefix}_consolidate_data_all group by to_date(call_time) order by date -------------------------------------------------------------------------------- /queries/statistics/graphs/date_histogram/histogram.sql: -------------------------------------------------------------------------------- 1 | select explode(histogram_numeric(active_days, 10)) as active_day_bins from 2 | (select count(*) as active_days, td.uid from (select year(to_date(call_time)) as year, 3 | month(to_date(call_time)) as month, day(to_date(call_time)) as day, uid 4 | from {provider_prefix}_consolidate_data_all group by uid, year(to_date(call_time)), 5 | month(to_date(call_time)), day(to_date(call_time)) order by year, month, day, uid) td 6 
| group by td.uid) td2 -------------------------------------------------------------------------------- /queries/statistics/reports/all_statistics/data_statistics.sql: -------------------------------------------------------------------------------- 1 | select count(*) as total_records, count(distinct to_date(call_time)) as total_days, 2 | count(distinct uid) as unique_id, {imei} {imsi} count(distinct cell_id) as unique_location_name, 3 | min(to_date(call_time)) as start_date, max(to_date(call_time)) as end_date from {provider_prefix}_consolidate_data_all -------------------------------------------------------------------------------- /queries/statistics/reports/daily_statistics/daily_statistics.sql: -------------------------------------------------------------------------------- 1 | SELECT to_date(call_time) as date, 'ALL' as call_type, 'ALL' as network_type, COUNT(*) as total_records, 2 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 3 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all where to_date(pdt) 4 | between to_date('{start_date}') and to_date('{end_date}') GROUP BY to_date(call_time) 5 | UNION 6 | SELECT to_date(call_time) as date, call_type, 'ALL' as network_type, COUNT(*) as total_records, 7 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 8 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all where to_date(pdt) 9 | between to_date('{start_date}') and to_date('{end_date}') GROUP BY to_date(call_time), call_type 10 | UNION 11 | SELECT to_date(call_time) as date, 'ALL' as call_type, network_type, COUNT(*) as total_records, 12 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 13 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all where to_date(pdt) 14 | between to_date('{start_date}') and to_date('{end_date}') GROUP BY to_date(call_time), network_type 15 | UNION 16 | SELECT to_date(call_time) as date, call_type, network_type, COUNT(*) as total_records, 17 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 18 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all where to_date(pdt) 19 | between to_date('{start_date}') and to_date('{end_date}') GROUP BY to_date(call_time), call_type, network_type 20 | ORDER BY date ASC, call_type ASC, network_type DESC 21 | -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/create_frequent_locations.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_frequent_locations (uid string, tcount int,trank int,ppercent double, 2 | unique_location string, latitude string, longitude string, {admin_params}) 3 | ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY ',' 4 | MAP KEYS TERMINATED BY '!' 
LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/create_frequent_locations_night.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE {provider_prefix}_frequent_locations_night (uid string, tcount int,trank int,ppercent double, 2 | unique_location string, latitude string, longitude string, {admin_params}) 3 | ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' COLLECTION ITEMS TERMINATED BY ',' 4 | MAP KEYS TERMINATED BY '!' LINES TERMINATED BY '\n' STORED AS SEQUENCEFILE -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/frequent_locations.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO TABLE {provider_prefix}_frequent_locations SELECT a1.uid, 2 | count(a1.uid) as tcount, ROW_NUMBER() OVER(PARTITION BY a1.uid order by count(a1.uid) DESC) as rank, 3 | count(a1.uid)/SUM(count(a1.uid)) OVER(partition by a1.uid) * 100 as ppercent 4 | , concat(a1.latitude, ' : ', a1.longitude) as unique_location, a1.latitude, a1.longitude, 5 | a2.{admin_params} from {provider_prefix}_consolidate_data_all a1 6 | JOIN {provider_prefix}_cell_tower_data_{admin} a2 on(a1.latitude = a2.latitude and a1.longitude = a2.longitude) 7 | group by a1.uid, concat(a2.latitude, ' : ', a2.longitude), a1.latitude, a1.longitude, a2.{admin_params} 8 | order by a1.uid, rank -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/frequent_locations_night.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO TABLE {provider_prefix}_frequent_locations_night SELECT a1.uid, 2 | count(a1.uid) as tcount, ROW_NUMBER() OVER(PARTITION BY a1.uid order by count(a1.uid) DESC) as rank, 3 | count(a1.uid)/SUM(count(a1.uid)) OVER(partition by a1.uid) * 100 as ppercent 4 | , concat(a1.latitude, ' : ', a1.longitude) as unique_location, a1.latitude, a1.longitude, 5 | a2.{admin_params} from {provider_prefix}_consolidate_data_all a1 6 | JOIN {provider_prefix}_cell_tower_data_{admin} a2 on(a1.latitude = a2.latitude and a1.longitude = a2.longitude) 7 | where hour(a1.call_time) in (0,1,2,3,4,5,6,7,20,21,22,23) 8 | group by a1.uid, concat(a2.latitude, ' : ', a2.longitude), a1.latitude, a1.longitude, a2.{admin_params} 9 | order by a1.uid, rank ASC -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/frequent_locations_night_old_consolidate.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO TABLE {provider_prefix}_frequent_locations_night SELECT a1.uid, a2.cell_id, 2 | count(a1.uid) as tcount, ROW_NUMBER() OVER(PARTITION BY a1.uid, a2.cell_id order by count(a1.uid) DESC) as rank, 3 | count(a1.uid)/SUM(count(a1.uid)) OVER(partition by a1.uid, a2.cell_id) * 100 as percentage 4 | , a2.longitude, a2.latitude, a3.{admin_params} from {provider_prefix}_consolidate_data_all a1 5 | JOIN {provider_prefix}_cell_tower_data_preprocess a2 ON(a1.cell_id = a2.cell_id) 6 | JOIN {provider_prefix}_cell_tower_data_{admin} a3 on(a2.latitude = a3.latitude and a2.longitude = a3.longitude) 7 | where hour(a1.call_time) in (0,1,2,3,4,5,6,7,20,21,22,23) group by a1.uid, a2.latitude, a2.longitude , a2.cell_id, a3.{admin_params} 8 | order by a1.uid, 
rank ASC -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/frequent_locations_old_consolidate.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO TABLE {provider_prefix}_frequent_locations SELECT a1.uid, a2.cell_id, 2 | count(a1.uid) as tcount, ROW_NUMBER() OVER(PARTITION BY a1.uid, a2.cell_id order by count(a1.uid) DESC) as rank, 3 | count(a1.uid)/SUM(count(a1.uid)) OVER(partition by a1.uid, a2.cell_id) * 100 as percentage 4 | , a2.longitude, a2.latitude, a3.{admin_params} from {provider_prefix}_consolidate_data_all a1 5 | JOIN {provider_prefix}_cell_tower_data_preprocess a2 ON(a1.cell_id = a2.cell_id) 6 | JOIN {provider_prefix}_cell_tower_data_{admin} a3 on(a2.latitude = a3.latitude and a2.longitude = a3.longitude) 7 | group by a1.uid, a2.latitude, a2.longitude , a2.cell_id, a3.{admin_params} order by a1.uid, rank ASC 8 | -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/frequent_locations_thresholded.sql: -------------------------------------------------------------------------------- 1 | create table {provider_prefix}_frequent_locations_thresholded as select td3.uid as uid, td3.tcount 2 | as tcount, td3.trank as trank, td3.ppercent as ppercent, td3.unique_location as unique_location, 3 | td3.longitude as longitude, td3.latitude as latitude, 4 | td3.{admin}_id as {admin}_id, td3.acc_wsum as acc_wsum, td3.min_acc_wsum as min_acc_wsum from 5 | (select a1.uid as uid, a1.tcount as tcount, a1.trank as trank, 6 | a1.ppercent as ppercent, a1.unique_location as unique_location, 7 | a1.longitude as longitude, a1.latitude as latitude, 8 | a1.{admin}_id as {admin}_id, a1.acc_wsum as acc_wsum, td2.min_acc_wsum as min_acc_wsum 9 | from {provider_prefix}_freq_with_acc_wsum a1 10 | join (select td.uid as uid, min(td.acc_wsum) as min_acc_wsum from ( 11 | select uid, acc_wsum from {provider_prefix}_freq_with_acc_wsum 12 | where acc_wsum >= {threshold} group by uid, acc_wsum) td group by td.uid) td2 13 | on (a1.uid = td2.uid)) td3 where acc_wsum <= min_acc_wsum -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/frequent_locations_thresholded_night.sql: -------------------------------------------------------------------------------- 1 | create table {provider_prefix}_frequent_locations_thresholded_night as select td3.uid as uid, td3.tcount 2 | as tcount, td3.trank as trank, td3.ppercent as ppercent, td3.unique_location as unique_location, 3 | td3.longitude as longitude, td3.latitude as latitude, 4 | td3.{admin}_id as {admin}_id, td3.acc_wsum as acc_wsum, td3.min_acc_wsum as min_acc_wsum from 5 | (select a1.uid as uid, a1.tcount as tcount, a1.trank as trank, 6 | a1.ppercent as ppercent, a1.unique_location as unique_location, 7 | a1.longitude as longitude, a1.latitude as latitude, 8 | a1.{admin}_id as {admin}_id, a1.acc_wsum as acc_wsum, td2.min_acc_wsum as min_acc_wsum 9 | from {provider_prefix}_freq_with_acc_wsum_night a1 10 | join (select td.uid as uid, min(td.acc_wsum) as min_acc_wsum from ( 11 | select uid, acc_wsum from {provider_prefix}_freq_with_acc_wsum_night 12 | where acc_wsum >= {threshold} group by uid, acc_wsum) td group by td.uid) td2 13 | on (a1.uid = td2.uid)) td3 where acc_wsum <= min_acc_wsum -------------------------------------------------------------------------------- 
/queries/statistics/reports/frequent_locations/frequent_locations_wsum.sql: -------------------------------------------------------------------------------- 1 | CREATE table {provider_prefix}_freq_with_acc_wsum as select uid, tcount, 2 | trank, ppercent, unique_location, longitude, latitude , {admin}_id, 3 | sum(ppercent) over (partition by uid order by trank asc) 4 | as acc_wsum from {provider_prefix}_frequent_locations 5 | order by uid, trank -------------------------------------------------------------------------------- /queries/statistics/reports/frequent_locations/frequent_locations_wsum_night.sql: -------------------------------------------------------------------------------- 1 | CREATE table {provider_prefix}_freq_with_acc_wsum_night as select uid, tcount, 2 | trank, ppercent, unique_location, longitude, latitude , {admin}_id, 3 | sum(ppercent) over (partition by uid order by trank asc) 4 | as acc_wsum from {provider_prefix}_frequent_locations_night 5 | order by uid, trank -------------------------------------------------------------------------------- /queries/statistics/reports/monthly_statistics/monthly_statistics.sql: -------------------------------------------------------------------------------- 1 | SELECT YEAR(call_time) as year, MONTH(call_time) as month , 'ALL' as call_type, 'ALL' as network_type, COUNT(*) as total_records, 2 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 3 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all where 4 | (year(pdt) between {start_year} and {end_year}) and (MONTH(pdt) between {start_month} and {end_month}) 5 | GROUP BY YEAR(call_time), MONTH(call_time) 6 | UNION 7 | SELECT YEAR(call_time) as year, MONTH(call_time) as month, call_type, 'ALL' as network_type, COUNT(*) as total_records, 8 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 9 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all 10 | where (year(pdt) between {start_year} and {end_year}) and (MONTH(pdt) between {start_month} and {end_month}) 11 | GROUP BY YEAR(call_time), MONTH(call_time), call_type 12 | UNION 13 | SELECT YEAR(call_time) as year, MONTH(call_time) as month, 'ALL' as call_type, network_type, COUNT(*) as total_records, 14 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 15 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all where (year(pdt) between {start_year} and {end_year}) 16 | and (MONTH(pdt) between {start_month} and {end_month}) GROUP BY YEAR(call_time), MONTH(call_time), network_type 17 | UNION 18 | SELECT YEAR(call_time) as year, MONTH(call_time) as month , call_type, network_type, COUNT(*) as total_records, 19 | COUNT(DISTINCT TO_DATE(call_time)) as total_days, COUNT(DISTINCT uid) as unique_id, {imei} {imsi} 20 | COUNT(DISTINCT cell_id) as unique_location_name FROM {provider_prefix}_consolidate_data_all 21 | where (year(pdt) between {start_year} and {end_year}) and (MONTH(pdt) between {start_month} and {end_month}) 22 | GROUP BY YEAR(call_time), MONTH(call_time), call_type, network_type ORDER BY year ASC, month ASC, call_type ASC, network_type DESC 23 | -------------------------------------------------------------------------------- /queries/statistics/reports/summary/average_daily_admin1.sql: 
-------------------------------------------------------------------------------- 1 | select sum(td.count)/{total_days} as average_{level}_per_day 2 | from ( select count(distinct a1.{level}) as count from {provider_prefix}_cell_tower_data_preprocess a1 3 | JOIN {provider_prefix}_consolidate_data_all a2 4 | on (a1.cell_id = a2.cell_id) group by to_date(call_time))td -------------------------------------------------------------------------------- /queries/statistics/reports/summary/average_daily_sms.sql: -------------------------------------------------------------------------------- 1 | select count(*)/{total_days} as average_daily_sms 2 | from {provider_prefix}_consolidate_data_all 3 | where call_type = 'SMS' -------------------------------------------------------------------------------- /queries/statistics/reports/summary/average_daily_voice.sql: -------------------------------------------------------------------------------- 1 | select count(*)/{total_days} as average_daily_voice from {provider_prefix}_consolidate_data_all where call_type = 'VOICE' -------------------------------------------------------------------------------- /queries/statistics/reports/summary/average_unique_cell_ids.sql: -------------------------------------------------------------------------------- 1 | select sum(td.count)/{total_days} as average_daily_unique_cell_id from 2 | (select count(distinct cell_id) as count from {provider_prefix}_consolidate_data_all 3 | group by to_date(call_time)) td 4 | -------------------------------------------------------------------------------- /queries/statistics/reports/summary/total_days.sql: -------------------------------------------------------------------------------- 1 | select count(*) as total_days, min(dates) as start_date, max(dates) 2 | as end_date from (select to_date(call_time) as dates from 3 | {provider_prefix}_consolidate_data_all group by to_date(call_time)) td -------------------------------------------------------------------------------- /queries/statistics/reports/zone_population/zone_population.sql: -------------------------------------------------------------------------------- 1 | select lv as {level}, sum(td.count) as count_activities, count(td.uid) as count_unique_ids from 2 | (select a1.{level} as lv, count(a1.{level}) as count, a2.uid as uid 3 | from {provider_prefix}_cell_tower_data_preprocess a1 JOIN {provider_prefix}_consolidate_data_all a2 4 | on (a1.cell_id = a2.cell_id) group by a1.{level}, a2.uid) td group by lv -------------------------------------------------------------------------------- /queries/statistics/total_records.sql: -------------------------------------------------------------------------------- 1 | select count(*) as total_records from {provider_prefix}_consolidate_data_all -------------------------------------------------------------------------------- /queries/statistics/total_unique_uids.sql: -------------------------------------------------------------------------------- 1 | select count(*) as total_uids from 2 | (select distinct uid from {provider_prefix}_consolidate_data_all) td -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bit-array==0.1.0 2 | bitarray==1.2.1 3 | cycler==0.10.0 4 | impyla==0.16.2 5 | kiwisolver==1.1.0 6 | matplotlib==3.2.0 7 | numpy==1.18.1 8 | pandas==1.0.1 9 | ply==3.11 10 | pure-sasl==0.6.2 11 | pyparsing==2.4.6 12 | python-dateutil==2.8.1 13 | pytz==2019.3 14 | 
six==1.14.0 15 | thrift==0.13.0 16 | thrift-sasl==0.4.1 17 | thriftpy==0.3.9 18 | thriftpy2==0.4.11 -------------------------------------------------------------------------------- /run_interpolation.py: -------------------------------------------------------------------------------- 1 | from Common.config_object import Config 2 | from Common.cdr_interpolation import Interpolation 3 | from Common.hive_create_tables import HiveTableCreator 4 | from Common.hive_connection import HiveConnection 5 | from Common.helper import format_two_point_time 6 | import argparse 7 | import time 8 | 9 | 10 | def main(): 11 | # argument parser 12 | start = time.time() 13 | parser = argparse.ArgumentParser(description='Argument indicating the configuration file') 14 | 15 | # add configuration argument 16 | parser.add_argument("-c", "--config", help="path to the configuration file used to process the CDR data" 17 | " \n ex. python run_interpolation.py -c config.json", 18 | action="store") 19 | 20 | # parse config to args.config 21 | args = parser.parse_args() 22 | 23 | config = Config(args.config) 24 | hc = HiveConnection(host=config.host, port=config.port, user=config.user) 25 | 26 | # initialize hive and create tables 27 | table_creator = HiveTableCreator(config) 28 | table_creator.initialize('hive_init_commands/initial_hive_commands_interpolation.json') # mandatory (init hive) 29 | 30 | # init interpolation generators 31 | it = Interpolation(config) 32 | 33 | # interpolation 34 | it.calculate_interpolation() 35 | 36 | print('Overall time elapsed: {} seconds'.format(format_two_point_time(start, time.time()))) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /run_origin_destination.py: -------------------------------------------------------------------------------- 1 | from Common.config_object import Config 2 | from Common.cdr_origin_destination import OriginDestination 3 | from Common.hive_create_tables import HiveTableCreator 4 | from Common.hive_connection import HiveConnection 5 | from Common.helper import format_two_point_time 6 | import argparse 7 | import time 8 | 9 | 10 | def main(): 11 | # argument parser 12 | start = time.time() 13 | parser = argparse.ArgumentParser(description='Argument indicating the configuration file') 14 | 15 | # add configuration argument 16 | parser.add_argument("-c", "--config", help="path to the configuration file used to process the CDR data" 17 | " \n ex. 
python run_origin_destination.py -c config.json", 18 | action="store") 19 | 20 | # parse config to args.config 21 | args = parser.parse_args() 22 | 23 | config = Config(args.config) 24 | HiveConnection(host=config.host, port=config.port, user=config.user) 25 | 26 | # initialize hive and create tables 27 | table_creator = HiveTableCreator(config) 28 | table_creator.initialize('hive_init_commands/initial_hive_commands_od.json') # mandatory (init hive) 29 | 30 | # init od generator 31 | od = OriginDestination(config) 32 | 33 | # origin destination 34 | od.calculate_od() # Require rank1_frequent_locations 35 | 36 | print('Overall time elapsed: {} seconds'.format(format_two_point_time(start, time.time()))) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /run_prepare_cdr_and_mapping.py: -------------------------------------------------------------------------------- 1 | from Common.config_object import Config 2 | from Common.hive_create_tables import HiveTableCreator 3 | from Common.hive_connection import HiveConnection 4 | from Common.helper import extract_mapping_data, format_two_point_time 5 | from Common.cdr_data import CDRData 6 | import argparse 7 | import time 8 | 9 | 10 | def main(): 11 | # argument parser 12 | start = time.time() 13 | parser = argparse.ArgumentParser(description='Argument indicating the configuration file') 14 | 15 | # add configuration argument 16 | parser.add_argument("-c", "--config", help="path to the configuration file used to process the CDR data" 17 | " \n ex. python run_prepare_cdr_and_mapping.py -c config.json", 18 | action="store") 19 | 20 | # parse config to args.config 21 | args = parser.parse_args() 22 | 23 | config = Config(args.config) 24 | HiveConnection(host=config.host, port=config.port, user=config.user) 25 | cdr_data = CDRData() 26 | extract_mapping_data(config, cdr_data) 27 | 28 | # initialize hive and create tables 29 | 30 | table_creator = HiveTableCreator(config, cdr_data) 31 | table_creator.initialize('hive_init_commands/initial_hive_commands_stats.json') # init hive 32 | table_creator.create_tables() 33 | 34 | print('Overall time elapsed: {} seconds'.format(format_two_point_time(start, time.time()))) 35 | 36 | 37 | if __name__ == '__main__': 38 | main() 39 | -------------------------------------------------------------------------------- /run_statistics.py: -------------------------------------------------------------------------------- 1 | from Common.config_object import Config 2 | from Common.cdr_statistics import Statistics 3 | from Common.hive_create_tables import HiveTableCreator 4 | from Common.hive_connection import HiveConnection 5 | from Common.helper import format_two_point_time 6 | import argparse 7 | import time 8 | 9 | 10 | def main(): 11 | # argument parser 12 | start = time.time() 13 | parser = argparse.ArgumentParser(description='Argument indicating the configuration file') 14 | 15 | # add configuration argument 16 | parser.add_argument("-c", "--config", help="path to the configuration file used to process the CDR data" 17 | " \n ex. 
python run_statistics.py -c config.json", 18 | action="store") 19 | 20 | # parse config to args.config 21 | args = parser.parse_args() 22 | 23 | config = Config(args.config) 24 | HiveConnection(host=config.host, port=config.port, user=config.user) 25 | 26 | table_creator = HiveTableCreator(config) 27 | table_creator.initialize('hive_init_commands/initial_hive_commands_stats.json') # mandatory (init hive) 28 | 29 | # init stats generators 30 | st = Statistics(config) 31 | 32 | # user section here 33 | # reports 34 | st.calculate_data_statistics() 35 | st.calculate_daily_statistics() 36 | st.calculate_monthly_statistics() 37 | st.calculate_zone_population() 38 | st.calculate_summary() 39 | st.calculate_user_date_histogram() 40 | # graphs 41 | st.daily_cdrs() 42 | st.daily_unique_users() 43 | st.daily_unique_locations() 44 | st.daily_average_cdrs() 45 | st.daily_unique_average_locations() 46 | 47 | # frequent locations (Report) 48 | st.frequent_locations() 49 | st.frequent_locations_night() 50 | 51 | # Prerequisite for Origin-Destination; comment this line out if OD is not needed 52 | st.rank1_frequent_locations() # Require frequent_locations() in run_statistics.py 53 | 54 | print('Overall time elapsed: {} seconds'.format(format_two_point_time(start, time.time()))) 55 | 56 | 57 | if __name__ == '__main__': 58 | main() -------------------------------------------------------------------------------- /sample_configs/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "hadoop_data_path":"/disk2/data/Chaichan/", 3 | "provider_prefix":"DTAC3", 4 | "db_name" : "cdrproject", 5 | 6 | "input_delimiter":",", 7 | "input_files" :["ais.csv"], 8 | "_comment": ["if time format is well formed (like yyyy/mm/dd or dd-mm-yyyy and colons(:) separating time, then leave it blank", 9 | "if the time format has no separator then indicate it (ex. 
yyyyMMdd (can't be automatically recognized) hh:mm:ss)"], 10 | "input_file_time_format": "yyyyMMdd hh:mm:ss", 11 | "input_file_have_header_comment_": "put 1 if there is a header row, otherwise 0", 12 | "input_file_have_header": 1, 13 | 14 | "input_cell_tower_files" : ["cdr_cell_tower.csv"], 15 | "input_cell_tower_delimiter":",", 16 | "input_cell_tower_have_header": 1, 17 | 18 | "check_duplicate_comment_": "will check duplicate in the cdr raw datafile and cell tower file", 19 | "check_duplicate": true, 20 | "check_invalid_lat_lng_comment_": "filter invalid lat_lng (both 0 or one of it is null)", 21 | "check_invalid_lat_lng": true, 22 | "host": "hadoopmaster.apichon.com", 23 | "port": 10000, 24 | "user": "rsstudent", 25 | 26 | "frequent_locations_percentage": 80, 27 | 28 | "output_report_location": "output_reports_small_latest", 29 | "output_graph_location": "graphical_reports_small_latest", 30 | 31 | "od_admin_unit": "ADMIN1", 32 | "od_date": "2016-05-01", 33 | 34 | "interpolation_poi_file_location": "/hadoop/hive/data/bangladesh/bangladesh.landscan2010_poi.tsv", 35 | "interpolation_osm_file_location": "/hadoop/hive/data/bangladesh/bangladesh.osm_road_with_waterway.tsv", 36 | "interpolation_voronoi_file_location": "/hadoop/hive/data/bangladesh/gp_voronoi.tsv", 37 | "max_size_cdr_by_uid": 500, 38 | "max_size_interpolation": 50, 39 | 40 | "cdr_data_layer_comment_": ["do not remove or change the name of the mandatory output columns", 41 | "if you don't have anything to map to the output, put -1 in the output_no", 42 | "if you have input that is not used and not mapped, insert a row with input_no = -1 and output_no = -1", 43 | "If a column is in your raw table, put input_no non-negative in the order of your column order", 44 | "network_type must be in 2G, 3G, 4G, 5G or put -1 to the output_no of both network_type and call_type", 45 | "call_type must be in Voice, Data, Call and SMS or put -1 to the output_no of both network_type and call_type", 46 | "note that in 1-1 mapping both input_no and output_no are non negative", 47 | "you need to import all the columns of your raw file and the output_no may be -1"], 48 | 49 | "cdr_data_layer":[ 50 | {"input_no":1, "input_name":"SUBID", "data_type":"String", "output_no":1, "name":"UID", "custom": ""}, 51 | {"input_no":-1, "input_name":"IMEI", "data_type":"String", "output_no":2, "name":"IMEI", "custom": ""}, 52 | {"input_no":-1, "input_name":"IMSI", "data_type":"String", "output_no":3, "name":"IMSI", "custom": ""}, 53 | {"input_no":2, "input_name":"CDATE", "data_type":"String", "output_no":-1, "name":"CALL_DATE", "custom": ""}, 54 | {"input_no":3, "input_name":"CTIME", "data_type":"String", "output_no":4, "name":"CALL_TIME", "custom": "CONCAT(CDATE,' ',CTIME)"}, 55 | {"input_no":4, "input_name":"DURATION", "data_type":"String", "output_no":5, "name":"DURATION", "custom": ""}, 56 | {"input_no":5, "input_name":"CELLID", "data_type":"String", "output_no":6, "name":"CELL_ID", "custom": ""}, 57 | {"input_no":6, "input_name":"LATITUDE", "data_type":"String", "output_no":7, "name":"LATITUDE", "custom": ""}, 58 | {"input_no":7, "input_name":"LONGITUDE", "data_type":"String", "output_no":8, "name":"LONGITUDE", "custom": ""}, 59 | {"input_no":9, "input_name":"NETWORK_TYPE", "data_type":"String", "output_no":10, "name":"NETWORK_TYPE", "custom": ""}, 60 | {"input_no":8, "input_name":"CALL_TYPE", "data_type":"String", "output_no":9, "name":"CALL_TYPE", "custom": ""} 61 | ], 62 | 63 | "cdr_cell_tower":[ 64 | {"input_no":1, "input_name":"BTSID", "data_type":"String", 
"output_no":-1, "name":"UID"}, 65 | {"input_no":2, "input_name":"SITE_NAME", "data_type":"String", "output_no":-1, "name":"SITE_NAME"}, 66 | {"input_no":3, "input_name":"LONGITUDE", "data_type":"String", "output_no":3, "name":"LONGITUDE"}, 67 | {"input_no":4, "input_name":"LATITUDE", "data_type":"String", "output_no":4, "name":"LATITUDE"}, 68 | {"input_no":5, "input_name":"CELLID", "data_type":"String", "output_no":5, "name":"CELL_ID" }, 69 | {"input_no":6, "input_name":"CELLNAME", "data_type":"String", "output_no":-1, "name":"CELLNAME" }, 70 | {"input_no":7, "input_name":"CI", "data_type":"String", "output_no":-1, "name":"CI" }, 71 | {"input_no":8, "input_name":"AZIMUTH", "data_type":"String", "output_no":-1, "name":"AZIMUTH" }, 72 | {"input_no":9, "input_name":"DISTRICT", "data_type":"String", "output_no":6, "name":"ADMIN1", "geojson_filename": "japan.json", "geojson_col_name": "nam"}, 73 | {"input_no":10, "input_name":"PROVINCE", "data_type":"String", "output_no":7, "name":"ADMIN2", "geojson_filename": "", "geojson_col_name": ""} 74 | ] 75 | 76 | } 77 | -------------------------------------------------------------------------------- /sample_configs/config_big.json: -------------------------------------------------------------------------------- 1 | { 2 | "hadoop_data_path":"/disk2/data/Chaichan/", 3 | "provider_prefix":"big7", 4 | "db_name" : "cdrproject", 5 | 6 | "input_delimiter":",", 7 | "input_files" :["2016-03-01.csv","2016-03-02.csv","2016-03-03.csv","2016-03-04.csv","2016-03-05.csv"], 8 | "_comment": ["if time format is well formed (like yyyy/mm/dd or dd-mm-yyyy and colons(:) separating time, then leave it blank", 9 | "if the time format has no separator then indicate it (ex. yyyyMMdd (can't be automatically recognized) hh:mm:ss)"], 10 | "input_file_time_format": "", 11 | "input_file_have_header_comment_": "put 1 if there is a header row, otherwise 0", 12 | "input_file_have_header": 0, 13 | 14 | "input_cell_tower_files" : ["moz_cell_adm1_adm2.csv"], 15 | "input_cell_tower_delimiter":",", 16 | "input_cell_tower_have_header": 1, 17 | 18 | "check_duplicate_comment_": "will check duplicate in the cdr raw datafile and cell tower file", 19 | "check_duplicate": true, 20 | "check_invalid_lat_lng_comment_": "filter invalid lat_lng (both 0 or one of it is null)", 21 | "check_invalid_lat_lng": true, 22 | "host": "hadoopmaster.apichon.com", 23 | "port": 10000, 24 | "user": "rsstudent", 25 | 26 | "frequent_locations_percentage": 80, 27 | 28 | "output_report_location": "big7", 29 | "output_graph_location": "big7", 30 | 31 | "od_admin_unit": "admin1", 32 | "od_date": "2016-03-01", 33 | 34 | "interpolation_poi_file_location": "/hadoop/hive/data/bangladesh/bangladesh.landscan2010_poi.tsv", 35 | "interpolation_osm_file_location": "/hadoop/hive/data/bangladesh/bangladesh.osm_road_with_waterway.tsv", 36 | "interpolation_voronoi_file_location": "/hadoop/hive/data/bangladesh/gp_voronoi.tsv", 37 | "max_size_cdr_by_uid": 500, 38 | "max_size_interpolation": 50, 39 | 40 | "cdr_data_layer_comment_": ["do not remove or change the name of the mandatory output columns", 41 | "if you don't have anything to map to the output, put -1 in the output_no", 42 | "if you have input that is not used and not mapped, insert a row with input_no = -1 and output_no = -1", 43 | "If a column is in your raw table, put input_no non-negative in the order of your column order", 44 | "network_type must be in 2G, 3G, 4G, 5G or put -1 to the output_no of both network_type and call_type", 45 | "call_type must be in Voice, Data, Call 
and SMS or put -1 to the output_no of both network_type and call_type", 46 | "note that in 1-1 mapping both input_no and output_no are non negative", 47 | "you need to import all the columns of your raw file and the output_no may be -1"], 48 | 49 | "cdr_data_layer":[ 50 | {"input_no":1, "input_name":"IMEI", "data_type":"String", "output_no":1, "name":"UID", "custom": ""}, 51 | {"input_no":2, "input_name":"IMSI", "data_type":"String", "output_no":2, "name":"IMSI", "custom": ""}, 52 | {"input_no":3, "input_name":"startTime", "data_type":"String", "output_no":3, "name":"CALL_TIME", "custom": ""}, 53 | {"input_no":4, "input_name":"endTime", "data_type":"String", "output_no":-1, "name":"END_TIME", "custom": ""}, 54 | {"input_no":5, "input_name":"DURATION", "data_type":"String", "output_no":4, "name":"DURATION", "custom": ""}, 55 | {"input_no":6, "input_name":"bs_seq", "data_type":"String", "output_no":-1, "name":"BS_SEQ", "custom": ""}, 56 | {"input_no":7, "input_name":"CELLID", "data_type":"String", "output_no":5, "name":"CELL_ID", "custom": ""}, 57 | {"input_no":8, "input_name":"call_type", "data_type":"String", "output_no":-1, "name":"CALL_TYPE", "custom": ""}, 58 | {"input_no":-1, "input_name":"network_type", "data_type":"String", "output_no":-1, "name":"NETWORK_TYPE", "custom": ""}, 59 | {"input_no":9, "input_name":"lon", "data_type":"String", "output_no":-1, "name":"LATITUDE", "custom": ""}, 60 | {"input_no":10, "input_name":"lat", "data_type":"String", "output_no":-1, "name":"LONGITUDE", "custom": ""} 61 | 62 | ], 63 | 64 | "cdr_cell_tower_comment_": "if you don't have a geojson file leave the field of geojson_filename blank but still preserve the key", 65 | 66 | "cdr_cell_tower":[ 67 | {"input_no":1, "input_name":"bs_seq", "data_type":"String", "output_no":-1, "name":"BS_SEQ"}, 68 | {"input_no":2, "input_name":"cell_seq", "data_type":"String", "output_no":1, "name":"CELL_ID" }, 69 | {"input_no":3, "input_name":"name", "data_type":"String", "output_no":-1, "name":"NAME"}, 70 | {"input_no":4, "input_name":"lac", "data_type":"String", "output_no":-1, "name":"CELLNAME" }, 71 | {"input_no":5, "input_name":"cell", "data_type":"String", "output_no":-1, "name":"CI" }, 72 | {"input_no":6, "input_name":"lon", "data_type":"String", "output_no":2, "name":"LATITUDE" }, 73 | {"input_no":7, "input_name":"lat", "data_type":"String", "output_no":3, "name":"LONGITUDE" }, 74 | {"input_no":8, "input_name":"ISO2", "data_type":"String", "output_no":-1, "name":"ISO2" }, 75 | {"input_no":9, "input_name":"NAME_0_2", "data_type":"String", "output_no":-1, "name":"NAME_0_2"}, 76 | {"input_no":10, "input_name":"ID_1_2", "data_type":"String", "output_no":-1, "name":"ID_1_2" }, 77 | {"input_no":11, "input_name":"NAME_1_2", "data_type":"String", "output_no":4, "name":"ADMIN0", "geojson_filename": "", "geojson_col_name": "" }, 78 | {"input_no":12, "input_name":"ID_2", "data_type":"String", "output_no":-1, "name":"ID2" }, 79 | {"input_no":13, "input_name":"NAME_2", "data_type":"String", "output_no":5, "name":"ADMIN1", "geojson_filename": "", "geojson_col_name": "" }, 80 | {"input_no":14, "input_name":"ENGTYPE_2", "data_type":"String", "output_no":-1, "name":"ENGTYPE_2" } 81 | ] 82 | 83 | } 84 | --------------------------------------------------------------------------------
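A note on run order, inferred from the entry-point scripts above: run_prepare_cdr_and_mapping.py creates and loads the Hive tables, run_statistics.py produces the reports and the rank-1 frequent locations that origin-destination requires, and run_origin_destination.py and run_interpolation.py can then be run as needed. Below is a minimal driver sketch in Python; the config path sample_configs/config.json and invoking the scripts with "python" are assumptions made for illustration, not something the repository mandates.

import subprocess

# Run the pipeline stages in dependency order:
# 1) prepare tables, 2) statistics (includes rank-1 frequent locations),
# 3) origin-destination (needs step 2), 4) interpolation.
CONFIG = "sample_configs/config.json"  # assumed path; point this at your own config

for script in [
    "run_prepare_cdr_and_mapping.py",
    "run_statistics.py",
    "run_origin_destination.py",
    "run_interpolation.py",
]:
    subprocess.run(["python", script, "-c", CONFIG], check=True)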