141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
--------------------------------------------------------------------------------
/FeatureExtraction/CommonFunctions/converters.py:
--------------------------------------------------------------------------------
1 | """
2 | Developer : Naveen Kambham
Description: This file contains the basic data converters; the methods are self-explanatory
4 | """
5 |
6 |
def ConvertTime(string):
    """Convert an "HH:MM[:SS]" time string to fractional hours (float)."""
    # Only the hour and minute fields are used; a seconds field, when
    # present, is ignored.
    pieces = string.split(":")
    hour_part = int(pieces[0])
    minute_part = int(pieces[1])
    # Minutes are scaled by 1/60 so the result is hours as a float.
    return hour_part + minute_part * (1 / 60)
10 |
def ConvertToIntList(Values):
    """
    Return the underlying values of a pandas Series as a numpy array.

    BUG FIX: ``Series.get_values()`` was deprecated in pandas 0.25 and
    removed in pandas 1.0, so the original raised AttributeError on any
    modern install. ``to_numpy()`` is the supported equivalent and returns
    the same array of values.
    """
    return Values.to_numpy()
13 |
def ConvertDate(string):
    """Convert a "YYYY-MM-DD" date string into an approximate day ordinal.

    Every month is weighted at a flat 30 days and the year component is
    ignored, so the result is month*30 + day.
    """
    fields = string.split("-")
    month_number = int(fields[1])
    day_number = int(fields[2])
    return month_number * 30 + day_number
17 |
def ConvertPercent(number):
    """
    Convert a percent value such as "85%" (or a bare number like 85) to int.

    BUG FIX: the original branched on ``len(number) >= 1`` after the split,
    but ``str.split`` always returns at least one element, so the ``else``
    branch was unreachable dead code (and would have crashed by calling
    ``int`` on a list). Only the text before the "%" sign is kept.
    """
    return int(str(number).split("%")[0])
25 |
--------------------------------------------------------------------------------
/FeatureExtraction/CommonFunctions/dataprocessing_helper.py:
--------------------------------------------------------------------------------
1 | """
2 | Developer : Naveen Kambham
3 | Description: This file contains data processing methods such as merging two data frames and filtering data frames.
4 | """
5 |
6 | import functools as funs
7 |
8 | import pandas as pd
9 |
10 | import FeatureExtraction.CommonFunctions.converters as converters
11 |
12 |
def merge(df_list,merge_param_list):
    """
    Merge a list of data frames on the given key columns.

    An inner join is folded pairwise across the list, so the result keeps
    only rows whose keys appear in every frame (intersection of values).
    :param df_list: list of pandas DataFrames
    :param merge_param_list: column name(s) to join on
    :return merged data frame:
    """
    def _inner_join(accumulated, next_frame):
        # Pairwise inner join; funs.reduce folds this across the whole list.
        return pd.merge(accumulated, next_frame, how='inner', on=merge_param_list)

    return funs.reduce(_inner_join, df_list)
24 |
def merge_outer(df_list,merge_param_list):
    """
    Merge a list of data frames on the given key columns, keeping all rows.

    An outer join is folded pairwise across the list, so the result keeps
    every key seen in any frame (union of values); missing cells become NaN.
    :param df_list: list of pandas DataFrames
    :param merge_param_list: column name(s) to join on
    :return merged data frame:
    """
    def _outer_join(accumulated, next_frame):
        # Pairwise outer join; funs.reduce folds this across the whole list.
        return pd.merge(accumulated, next_frame, how='outer', on=merge_param_list)

    return funs.reduce(_outer_join, df_list)
36 |
37 |
38 |
def filter_data_on_complaince(folder_Path,complaince_rate):
    """
    Filter every sensor CSV so it only keeps participants whose study
    compliance percentage is at least ``complaince_rate``.

    Reads complaince.csv, normalizes its Percent column, collects the IDs of
    compliant participants, then rewrites each sensor file under the
    Filtered/ sub-folder with only those participants' rows.
    :param folder_Path: folder containing the raw sensor CSV files
    :param complaince_rate: minimum acceptable compliance percentage
    :return: None (filtered files are written to disk)
    """
    complaince_df = pd.read_csv(folder_Path + "complaince.csv")
    complaince_df['Percent'] = complaince_df['Percent'].apply(converters.ConvertPercent)
    complaince_df = complaince_df.loc[complaince_df.Percent >= complaince_rate]
    IDs = complaince_df.ID.unique()

    # The per-sensor filtering was the same block copy-pasted five times;
    # loop over the sensor file names instead, with identical behaviour
    # (read, keep compliant user_ids, re-index, write to Filtered/).
    for sensor_file in ("app_usage.csv", "battery_events.csv", "bluetooth.csv",
                        "screenstate.csv", "wifi.csv"):
        df_sensor = pd.read_csv(folder_Path + sensor_file)
        df_sensor = df_sensor.loc[df_sensor.user_id.isin(IDs)]
        df_sensor = df_sensor.reset_index(drop=True)
        df_sensor.to_csv(folder_Path + "Filtered/" + sensor_file)
78 |
--------------------------------------------------------------------------------
/FeatureExtraction/DataSet.py:
--------------------------------------------------------------------------------
1 | """
2 | Developer : Naveen Kambham
3 | Description: This file contains the code to create a data set from a folder path with multiple sensor data.
4 | It extracts data from sensors and then merges multiple features into a single sample.
5 | """
6 |
7 | import FeatureExtraction.CommonFunctions.dataprocessing_helper as dataprocess
8 | import FeatureExtraction.battery_sensor_features_extractor as battery
9 | import FeatureExtraction.bluetooth_sensor_features_extractor as bluetooth
10 | import FeatureExtraction.screenstate_sensor_features_extractor as screen
11 | import FeatureExtraction.wifi_sensor_features_extractor as wifi
12 |
13 |
class DataSet(object):
    """Represents one study's sensor-data folder and drives feature extraction."""

    def __init__(self, path, balance=0.0):
        """Return object for Dataset.

        :param path: folder containing the filtered sensor CSV files
        :param balance: currently unused; kept for backward compatibility
        """
        self.path = path

    def extract_features(self):
        """
        Extract features from the battery, bluetooth, screen-state and wifi
        sensors, merge them into one sample per (ID, Date), save a copy to
        FeaturesExtraction.csv and return the merged data frame.
        """

        folder_Path = self.path
        if folder_Path=="":
            print("need to set the file name to create data set")
            return
        ################ Battery Sensor #################

        print("Extracting Battery Sensor Features ------------------------5%")
        df_battery= battery.extract(folder_Path+ "battery_events.csv")
        df_battery.to_csv(folder_Path+'battery_events_processed.csv')
        # NOTE(review): len(df) is the number of rows (samples), not columns.
        print("Number of Features extracted from Battery Sensor are:",len(df_battery))

        ################ Bluetooth Sensor #################

        print("Extracting Bluetooth Sensor Features ------------------------20%")
        df_bluetooth = bluetooth.extract(folder_Path + "bluetooth.csv")
        df_bluetooth.to_csv(folder_Path+'bluetooth_processed.csv')
        print("Number of Features extracted from Bluetooth Sensor are",len(df_bluetooth))

        ################ Screen Sensor #################

        print("Extracting Screen State Sensor Features ------------------------40%")
        # BUG FIX: the screen-state module exposes extract(), not main();
        # screen.main(...) raised AttributeError at runtime.
        df_screen = screen.extract(folder_Path + "screenstate.csv")
        df_screen.to_csv(folder_Path+'screenstate_processed.csv')
        print("Number of Features extracted from Screen State Sensor are:",len(df_screen))

        ################ WiFi Sensor #################

        print("Extracting WiFi Features ------------------------60%")
        # BUG FIX: likewise, the wifi module exposes extract(), not main().
        df_wifi = wifi.extract(folder_Path + "wifi.csv")
        df_wifi.to_csv(folder_Path+'wifi_processed.csv')
        print("Number of Features extracted from WiFi Sensor are:",len(df_wifi))

        #Merging the features on ID and Date
        print("Extracting Features Done, Merging the Features ------------------------90%")
        dfs = [df_battery,df_bluetooth,df_screen,df_wifi]
        merged_features= dataprocess.merge(dfs, ['ID', 'Date'])

        print("Feature Extraction Finished, total number of samples are:",len(merged_features))

        print("Saving a copy at",folder_Path+'FeaturesExtraction.csv')
        merged_features.to_csv(folder_Path+'FeaturesExtraction.csv')

        return merged_features
72 |
73 |
74 |
--------------------------------------------------------------------------------
/FeatureExtraction/battery_sensor_features_extractor.py:
--------------------------------------------------------------------------------
1 | """
2 | Developer : Naveen Kambham
3 | Description: Based on the Battery sensor Data, charger plug in time and duration of plug in time are extracted on a daily basis.
4 | """
5 | #Importing the required libraries.
6 | import collections as col
7 | import functools
8 | from collections import Counter
9 | import pandas as pd
10 | import FeatureExtraction.CommonFunctions.converters as converters
11 | from FeatureExtraction.CommonFunctions import dataprocessing_helper
12 |
def TakeMostProbableTimeInStudy(study_values,day_values):
    """
    Method to get the most probable charge time based on the given data.
    Sometimes a participant can charge the mobile multiple times in a day;
    in such cases we take the time that occurred most often for that
    participant over the entire study period.
    :param study_values: charge time values for the whole study (iterable)
    :param day_values: charge time values for a given day (pandas Series —
        assumed; a plain list would need len() instead — TODO confirm)
    :return: the single day value, or the most frequent study value
    """

    # BUG FIX: the original wrote ``day_values.count == 1``, comparing the
    # bound method object itself to an int — always False, so the
    # single-value shortcut could never fire. Call the method instead.
    if day_values.count() == 1:
        return day_values

    #more than one time found, hence use the frequencies over the study
    else:
        counter = Counter(study_values)
        # NOTE(review): on ties this keeps the *last* value with the maximal
        # count (the comparison is strict), matching the original reduce.
        return functools.reduce(
            lambda max_key, current_key: max_key
            if counter[max_key] > counter[current_key]
            else current_key,
            study_values)
34 |
35 |
36 |
def get_charger_plugintime_daily(file):
    """
    Method to compute the battery charger plug-in time per participant per day.
    :param file: path of the battery_events csv
    :return data frame: columns ID, Date, CharginTimeDaily
    """

    #read the data in to a dataframe
    df= pd.read_csv(file)

    #splitting datetime in to date and time columns
    df['Date'], df['Time'] = zip(*df['start_time'].map(lambda x: x.split(' ')))

    #removing rows with battery plugged status 0 (unplugged) and converting
    #the time to a number for easy calculations
    df= df[df.plugged !=0]
    df['Time'] =df['Time'].apply(converters.ConvertTime)
    # BUG FIX: the original called converters.ConvertToInt, which does not
    # exist in the converters module (AttributeError at runtime). Truncating
    # to whole hours — presumably the intent, bucketing plug-in times by
    # hour — TODO confirm against downstream use.
    df['Time'] = df['Time'].astype(int)

    #getting all plug-in times for a participant over the entire study
    tempdf = df
    tempgrouping = tempdf.groupby(['user_id'])
    batterychargeTimePerStudy= [(key,col.Counter(converters.ConvertToIntList(value['Time']))) for (key, value) in tempgrouping.__iter__()]
    batterychargeTimePerStudydf= pd.DataFrame(batterychargeTimePerStudy,columns=['ID','Values'])

    #grouping by date and userid
    grouping = df.groupby(['user_id','Date'])

    #Get battery time for each day by taking the most probable time in the
    #entire study if there is more than one record
    batterychargeTime_perDay= [(key[0],key[1],TakeMostProbableTimeInStudy(batterychargeTimePerStudydf[batterychargeTimePerStudydf.ID ==key[0]],value['Time'])) for (key,value) in grouping.__iter__()]
    outputdf= pd.DataFrame(batterychargeTime_perDay,columns=['ID','Date','CharginTimeDaily'])

    return outputdf
70 |
71 |
def max_battery_plugin_time_daily(file):

    """
    Computes the maximum charger plug-in duration (in hours) per participant
    per day.
    :param file: path of the battery_events csv
    :return: data frame with columns ID, Date, Battery_Charging_Duration
    """

    #read the data in to data frame
    df= pd.read_csv(file)


    #create new df columns for start,end date and time columns and convert the values for math advantages
    df['StartDate'],df['StartTime'] = zip(*df['start_time'].map(lambda x:x.split(' ')))
    df['ConvertedStartTime'] = df['StartTime'].apply(converters.ConvertTime)
    df['ConvertedStartDate'] = df['StartDate'].apply(converters.ConvertDate)
    df['EndDate'],df['EndTime'] = zip(*df['end_time'].map(lambda x:x.split(' ')))
    df['ConvertedEndTime'] = df['EndTime'].apply(converters.ConvertTime)
    df['ConvertedEndDate'] = df['EndDate'].apply(converters.ConvertDate)



    userIds= df.user_id.unique()
    outputlist=[]

    # Since this depends on continuous data records we need to iterate the records, smart aggregation doesn't help much
    #processing for corresponding participant
    for user in userIds:
        tempdf = df.loc[df.user_id == user]
        Dates = tempdf.StartDate.unique()

        #processing for each day
        for date in Dates:
            # BUG FIX: the original masked with ``df.StartDate == date`` — a
            # full-frame boolean mask applied to the row subset, which modern
            # pandas rejects as unalignable. Build the mask from tempdf.
            tmpdf = tempdf.loc[tempdf.StartDate == date]
            tmpdf= tmpdf.sort_values(['ConvertedStartTime'],ascending=(True))
            # keep only charging records (1 = AC, 2 = USB — TODO confirm codes)
            tmpdf= tmpdf[tmpdf.plugged.isin([1,2])]
            durations =[0]

            for index,value in tmpdf.iterrows():
                if (tmpdf.loc[index,'StartDate'] == tmpdf.loc[index,'EndDate']):
                    durations.append(tmpdf.loc[index,'ConvertedEndTime'] - tmpdf.loc[index,'ConvertedStartTime'])
                else:
                    # charging session crossed midnight
                    durations.append((24.0 - tmpdf.loc[index,'ConvertedStartTime']) + tmpdf.loc[index,'ConvertedEndTime'])

            # BUG FIX: the original never appended anything to outputlist,
            # so the function always returned an empty data frame. Record
            # the longest charging session of the day (0 if none).
            outputlist.append([user, date, max(durations)])

    output_dataFrame = pd.DataFrame(outputlist,columns=['ID','Date','Battery_Charging_Duration',])
    return output_dataFrame
118 |
def extract(path):
    """
    Extract the battery-sensor features for the csv path given.
    """
    # Daily charger plug-in clock times, one row per participant per day.
    plugin_times = get_charger_plugintime_daily(path)

    # Longest charging duration observed for each participant each day.
    plugin_durations = max_battery_plugin_time_daily(path)

    # Inner-join both feature frames so each sample carries both features.
    return dataprocessing_helper.merge([plugin_times, plugin_durations],
                                       ['ID', 'Date'])
135 |
136 |
137 |
138 | #Code to test the functionality independently
139 | # df_battery=extract(r"/home/naveen/Data/Shed10/Filtered/battery_events.csv")
140 | # print((df_battery))
141 |
--------------------------------------------------------------------------------
/FeatureExtraction/bluetooth_sensor_features_extractor.py:
--------------------------------------------------------------------------------
1 | """
2 | Developer : Naveen Kambham
Description: Using the Bluetooth sensor data, the number of contacts a participant made with other individuals is calculated
4 | """
5 | #Importing the required libraries.
6 |
7 | import pandas as pd
8 | import numpy as np
9 |
10 |
def CountDistinctStrings(strings):
    """
    Count the distinct non-empty strings in the given sequence.

    FIX: the parameter was named ``list``, shadowing the builtin. All
    callers in this module pass it positionally, so the rename is backward
    compatible. ``np.unique`` drops duplicates and ``count_nonzero`` treats
    the empty string as falsy, so "" entries are not counted.
    """
    return np.count_nonzero(np.unique(strings))
14 |
def find_contactrate_perday(file):
    """
    Find the number of contacts a person made with other people per day
    using bluetooth sensor data.
    :param file: path of the bluetooth csv
    :return df: columns ID, Date, ContactRatePerDay
    """


    #reading the data in to data frame
    df= pd.read_csv(file)

    #bluetooth can be connected to any type of devices such as printer, computer, smartphones etc. But we need
    # only smartphones as it is carried by participants
    smartphone_class_Id_List=['01020c','50020c','52020c','58020c','5a020c','62020c','70020c','72020c','78020c','7a020c']
    df_smartPhones = df.loc[df.dev_class.isin(smartphone_class_Id_List)]

    # Of all smartphones sensed, keep only nearby ones based on signal
    # strength. BUG FIX: take an explicit .copy() — the original assigned
    # new columns to a slice of df_smartPhones, which triggers pandas'
    # SettingWithCopyWarning and can silently fail to write.
    df_nearby_smartphones= df_smartPhones[df_smartPhones.rssi > -80 ].copy()
    df_nearby_smartphones['Date'],df_nearby_smartphones['Time'] = zip(*df_nearby_smartphones['record_time'].map(lambda x:x.split(' ')))

    #Grouping By ID, Date so that we can get for each individual on a given day
    grouped = df_nearby_smartphones.groupby(['user_id','Date'])

    #get contact rate each day by counting the distinct nearby smartphone MACs
    ContactRateDailyList=[(key[0],key[1],CountDistinctStrings(value['mac'])) for (key,value) in grouped.__iter__()]
    outputdf= pd.DataFrame(ContactRateDailyList,columns=['ID','Date','ContactRatePerDay'])
    return outputdf
42 |
43 |
def extract(path):
    """
    Extract the bluetooth-sensor features for the csv path given.
    """
    # The daily contact rate is the only bluetooth-derived feature for now.
    contact_rate_frame = find_contactrate_perday(path)
    return contact_rate_frame
50 |
--------------------------------------------------------------------------------
/FeatureExtraction/fetch_data_from_db.py:
--------------------------------------------------------------------------------
1 | #Author: Naveen Kambham
2 | #Purpose: Thesis Project
#Main Functionality: This python file contains functionality to fetch data from a remote server
4 |
5 | '''Importing the required libraries'''
6 | import pandas as pd
7 | import numpy as np
8 | import mysql.connector
9 |
10 |
def ConnectToDb_Return_Df_table(id,pwd,host,db_name,table_name):
    """
    This method will Connect to Data base and return the requested table in the form of a dataframe
    Better to make it a singleton to ensure multiple db connections are not spawned
    :param id: database user name
    :param pwd: database password
    :param host: database host
    :param db_name: database (schema) name
    :param table_name: table to fetch — must come from trusted code, since
        identifiers cannot be bound as query parameters and are concatenated
        into the SQL text
    :return: pandas DataFrame with the full table contents
    """
    #making connection object
    conn = mysql.connector.connect(
        user=id,
        password=pwd,
        host=host,
        database=db_name)
    try:
        #Preparing query with the given table name
        query = ("SELECT * FROM "+ table_name)

        #Reading the query result to a dataframe
        df = pd.read_sql_query(query, conn)
    finally:
        # BUG FIX: close the connection even when the query raises — the
        # original leaked the connection on any exception. The unused cursor
        # the original opened (and never closed) has been removed.
        conn.close()
    return df
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/FeatureExtraction/main.py:
--------------------------------------------------------------------------------
1 | """
2 | Developer : Naveen Kambham
3 | Description: This file contains the code to drive the feature extraction step. This assumes that data is located at /home/naveen/Data/ folders
4 | """
5 | import pandas as pd
6 |
7 | import FeatureExtraction.CommonFunctions.dataprocessing_helper as dataprocessor
8 | from FeatureExtraction.DataSet import DataSet
# Folders holding each SHED study's filtered sensor data
DataFolders=["/home/naveen/Data/Shed8/Filtered/","/home/naveen/Data/Shed9/Filtered/","/home/naveen/Data/Shed10/Filtered/"]

for filepath in DataFolders:

    # Extract per-sensor features for this study; extract_features also
    # persists FeaturesExtraction.csv into the study folder.
    study_dataset = DataSet(filepath)
    study_dataset.extract_features()

    # Join the extracted features with the Big Five ground truth
    # (pre-survey) on participant ID and save the final modelling data set.
    presurvey = pd.read_csv(filepath+'/PreSurvey_Processed.csv')
    features = pd.read_csv(filepath+'/FeaturesExtraction.csv')
    df_data = dataprocessor.merge([features,presurvey],['ID'])
    df_data.to_csv(filepath+'/Final_DataSet.csv')
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
--------------------------------------------------------------------------------
/FeatureExtraction/screenstate_sensor_features_extractor.py:
--------------------------------------------------------------------------------
1 | """
2 | Developer : Naveen Kambham
3 | Description: Code to find the daily(active) smartphone usage feature from Smartphone Screen Sensor data.
4 | Continuous screen ON and OFF states in a single day are caliculated and
5 | collectively these gave us the active mobile duration of each participant in a day.
6 | """
7 |
8 | #Importing the required libraries.
9 | import pandas as pd
10 | from FeatureExtraction.CommonFunctions import converters as converters
11 |
12 |
def get_activephone_usage(file):
    """
    Find the active phone usage of participants: total hours per day spent
    in each screen state, accumulated over consecutive state transitions.
    :param file: path of the screenstate csv
    :return dataframe: columns ID, Date, ScreenState_ON, ScreenState_OFF
    """

    #read the data in to dataframe
    df= pd.read_csv(file)


    #split the time record in to date and time. convert the values for easy math
    df['Date'],df['Time'] = zip(*df['record_time'].map(lambda x:x.split(' ')))
    df['ConvertedTime'] = df['Time'].apply(converters.ConvertTime)

    #Loop through each user.
    #Since this depends on continous data records we need to iterate the records aggregation doesn't help much
    userIds= df.user_id.unique()
    outputlist=[]
    for user in userIds:
        tempdf = df.loc[df.user_id == user]
        dates = tempdf.Date.unique()

        #looping through each day
        for date in dates:
            # BUG FIX: the original masked with ``df.Date == date`` — a
            # full-frame boolean mask applied to the row subset tempdf,
            # which modern pandas rejects as unalignable. Use tempdf's own
            # column to build the mask.
            tmpdf = tempdf.loc[tempdf.Date == date]
            tmpdf= tmpdf.sort_values(['ConvertedTime'],ascending=(True))
            tmpdf = tmpdf.reset_index(drop=False)

            #dictionary to accumulate time spent in each screen state
            #(assumes state is coded 0/1 — any other code raises KeyError;
            # TODO confirm against the sensor schema)
            dict_screenstates={}
            dict_screenstates[1]=0
            dict_screenstates[0]=0
            for index,value in tmpdf.iterrows():

                if (index < len(tmpdf)-1):

                    #if the next state differs, credit the elapsed time to the
                    #current state; otherwise skip (duplicate state records)
                    if (tmpdf.loc[index,'state'] != tmpdf.loc[index+1,'state']):
                        dict_screenstates[tmpdf.loc[index,'state']] += tmpdf.loc[index+1,'ConvertedTime'] - tmpdf.loc[index,'ConvertedTime']
                else:
                    continue

            #append the data record for each user and each day
            outputlist.append([user,date,dict_screenstates[0],dict_screenstates[1]])



    output_dataFrame = pd.DataFrame(outputlist,columns=['ID','Date','ScreenState_ON','ScreenState_OFF',])

    return output_dataFrame
64 |
65 |
66 |
def extract(path):
    """
    Extract the screen-state features from the smartphone sensor csv.
    """
    # Thin wrapper so this module exposes the same entry point as the other
    # sensor extractors.
    daily_usage = get_activephone_usage(path)
    return daily_usage
72 |
73 |
74 | #stand alone code to test
75 | # pd.set_option('display.max_rows', 5000)
76 | # print(main(r'/home/naveen/Data/Shed9/Filtered/screenstate.csv'))
77 |
--------------------------------------------------------------------------------
/FeatureExtraction/wifi_sensor_features_extractor.py:
--------------------------------------------------------------------------------
1 | """
2 | Developer : Naveen Kambham
3 | Description: Code to find campus arrival time, campus departure time, time spent in campus, number of unique WiFi routers visited
4 | around the city for each day using the known MAC addresses of University and geographic coordinates for each campus WiFirouter.
5 | """
6 | #Importing the required libraries.
7 | import numpy as np
8 | import pandas as pd
9 | import FeatureExtraction.CommonFunctions.dataprocessing_helper as dataprocessor
10 |
11 |
def CountDistinctStrings(strings):
    """
    Count the distinct non-empty strings in the given sequence.

    FIX: the parameter was named ``list``, shadowing the builtin. All
    callers in this module pass it positionally, so the rename is backward
    compatible. ``np.unique`` drops duplicates and ``count_nonzero`` treats
    the empty string as falsy, so "" entries are not counted.
    """
    return np.count_nonzero(np.unique(strings))
14 |
def get_campus_entry_leave_times(file):
    """
    Find the campus entry time, leave time and time on campus.
    The first time in a day a phone sees a campus router is the entry time
    of the participant/phone and the last time is the leave time; their
    difference gives the time spent on campus (in fractional hours).
    :param file: path of the wifi csv
    :return dataframe: columns ID, Date, EntryTime, LeavingTime, Time_In_School
    """
    #Read the data in to data frame
    df = pd.read_csv(file)

    # Keep only university router records. BUG FIX: .copy() avoids pandas'
    # SettingWithCopyWarning (and possible silent no-op) when the Date/Time
    # columns are added to this row subset below.
    df_with_UofS_Wifi = df.loc[df.ssid.isin(['uofs-secure','uofs-public','uofs-guest'])].copy()
    df_with_UofS_Wifi['Date'],df_with_UofS_Wifi['Time'] = zip(*df_with_UofS_Wifi['record_time'].map(lambda x:x.split(' ')))

    #Group by Id, Date
    grouped= df_with_UofS_Wifi.groupby(['user_id','Date'])

    #From the aggregation get the min, max times i.e campus entry/leave times
    #(zero-padded "HH:MM:SS" strings sort correctly lexicographically)
    lst_campus_entry_leaving_timings = [(key[0],key[1], min(value['Time']), max(value['Time'])) for (key, value) in grouped.__iter__()]

    # create data frame out of three features.
    df = pd.DataFrame(lst_campus_entry_leaving_timings, columns=['ID','Date', 'EntryTime','LeavingTime'])

    def _clock_to_hours(time_string):
        # "HH:MM[:SS]" -> fractional hours; seconds are ignored, matching
        # the granularity used by converters.ConvertTime elsewhere.
        parts = time_string.split(":")
        return int(parts[0]) + int(parts[1]) * (1 / 60)

    # BUG FIX: the original computed EntryTime - LeavingTime on the raw
    # "HH:MM:SS" strings, which raises TypeError (strings cannot be
    # subtracted) and is also the wrong way round. Convert both to hours
    # and take leave - entry so the duration is positive.
    df['Time_In_School'] = df['LeavingTime'].apply(_clock_to_hours) - df['EntryTime'].apply(_clock_to_hours)

    return df
43 |
def get_diff_wifi_seen(file):
    """
    Count the distinct WiFi SSIDs (routers) each participant's phone saw per
    day. Routers are recorded by the sensor even without a hand-shake
    connection, so this approximates mobility around the city.
    :param file: path of the wifi csv
    :return df: one row per (ID, Date) with the daily distinct-SSID count
    """

    # Load the data and normalise the columns we rely on to strings.
    readings = pd.read_csv(file)
    readings['ssid'] = readings['ssid'].astype(str)
    readings['record_time'] = readings['record_time'].astype(str)
    readings['Date'], readings['Time'] = zip(*readings['record_time'].map(lambda stamp: stamp.split(' ')))

    # One group per participant per day; count the distinct SSID strings.
    rows = [(user_day[0], user_day[1], CountDistinctStrings(group['ssid']))
            for (user_day, group) in readings.groupby(['user_id', 'Date'])]

    df_wifis_seen = pd.DataFrame(rows, columns=['ID', 'Date', 'WifiCountPerDay'])

    return df_wifis_seen
67 |
68 |
69 |
def extract(path):
    """
    Extract campus entry time, leave time, time spent in campus and the
    count of different wifi routers seen per day.
    """
    # Campus timing features and the city-wide router count are produced
    # separately, then inner-joined on participant and day.
    campus_times = get_campus_entry_leave_times(path)
    routers_seen = get_diff_wifi_seen(path)

    return dataprocessor.merge([campus_times, routers_seen], ['ID', 'Date'])
83 |
84 |
85 |
86 |
87 | #stand alone code to test the data
88 | # df_wifi =main(r"/home/naveen/Data/Shed10/wifi.csv")
89 | # print(len(df_wifi))
90 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 naveenkambham
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/MachineLearning_Models/Classification_Models/SVM_Classification.py:
--------------------------------------------------------------------------------
1 | """
2 | Developer : Naveen Kambham
3 | Description: Classification Model based on Support Vector Machine using Scikit.
4 | This generalizes the data and then apply cross validation to evaluate it.
5 | Hyper parameters are tuned using GridsearchCV and set.
6 | """
7 | #Importing the required libraries.
8 | import numpy
9 | import pandas
10 | from sklearn.model_selection import KFold
11 | from sklearn.preprocessing import StandardScaler
12 | from sklearn.pipeline import Pipeline
13 | from sklearn.metrics import confusion_matrix
14 | from sklearn.model_selection import cross_val_predict
15 | from sklearn import svm
16 | from sklearn.metrics import precision_score
17 |
# Load the feature data set once at module import time and replace missing
# values with 0 so the estimators below never see NaNs.
# NOTE(review): hard-coded absolute path — this script only runs on the
# original author's machine as-is.
dataframe = pandas.read_csv(r'/home/naveen/Documents/DataSet1_1.csv')
dataframe = dataframe.fillna(value=0)
21 |
22 |
23 |
def Labelclasses(value):
    """
    Method to bin the value ranges. Since the trait values range over [0, 1],
    divide them into three bins: [0, 0.3] -> 0, (0.3, 0.6] -> 1, (0.6, 1] -> 2.
    """
    # Chained comparisons express the same three bins as the original
    # ladder; values outside [0, 1] fall through and yield None, exactly as
    # before.
    if 0 <= value <= 0.3:
        return 0
    if 0.3 < value <= 0.6:
        return 1
    if 0.6 < value <= 1:
        return 2
34 |
35 |
def SVCModel(trait):
    """
    Model the continuous trait as a 3-class classification problem using an
    RBF-kernel Support Vector Machine, with 4-fold cross-validation.

    :param trait: name of the Big Five trait column to predict
    :return: macro-averaged precision of the cross-validated predictions
    """

    # Bin the continuous values locally instead of writing them back into
    # the shared module-level dataframe: the original mutated the global,
    # so calling this twice for the same trait re-binned already-binned
    # labels (1 -> 2) and corrupted them.
    labels = dataframe[trait].apply(Labelclasses)
    X = dataframe.loc[:,'mediaUsage':'Scheduling_OfficeTools_Weather'].values
    Y = labels.values

    # applying scaling and modelling in pipeline
    seed=7
    numpy.random.seed(seed)
    estimators = []
    estimators.append(('standardize', StandardScaler()))
    model = svm.SVC(kernel='rbf',C=0.8,gamma='auto')
    estimators.append(('mlp',model ))
    pipeline = Pipeline(estimators)

    # Cross validation. BUG FIX: the original passed random_state without
    # shuffle=True, which scikit-learn >= 0.24 rejects with ValueError;
    # plain sequential folds preserve the original (unshuffled) behaviour.
    kfold = KFold(n_splits=4)
    predicted = cross_val_predict(pipeline,X,Y,cv=kfold)

    #confusion matrix
    print(confusion_matrix(Y,predicted))

    # NOTE(review): the metric returned is macro precision, although the
    # original comment called it accuracy.
    return (precision_score(Y, predicted, average='macro'))
64 |
#modeling each Big Five trait.
traits=['Openness','Conscientiousness','Extraversion','Agreeableness','Neuroticism']

# Run the SVM classifier once per trait and print its macro precision.
for trait in traits:
    print(trait,SVCModel(trait))
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
/MachineLearning_Models/Regression_Models/NeuralNetworks.py:
--------------------------------------------------------------------------------
1 | """
2 | Developer : Naveen Kambham
3 | Description: Neural Networks Regression Model using Tensor Flow. This generalizes the data and cross validation is used
4 | to evaluate it with RMSE as metric. Hyper parameters are tuned using GridsearchCV and set.
5 | """
6 | #Importing the required libraries.
7 | import numpy
8 | import pandas
9 | from keras.models import Sequential
10 | from keras.layers import Dense
11 | from keras.wrappers.scikit_learn import KerasRegressor
12 | from sklearn.model_selection import KFold
13 | from sklearn.preprocessing import StandardScaler
14 | from sklearn.pipeline import Pipeline
15 | from math import sqrt
16 | from sklearn.model_selection import cross_val_predict
17 | import os
18 | from matplotlib import pyplot as plt
19 | from sklearn.metrics import mean_squared_error
20 |
21 |
22 |
#Enable the GPU, read data and fill null values
# NOTE(review): setting CUDA_VISIBLE_DEVICES to "" actually *hides* all
# GPUs and forces CPU execution — the comment above says "enable" but the
# observable effect of this value is to disable GPU use. Confirm intent.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# Load the data set once at import time; missing values become 0 so the
# network never sees NaNs. Hard-coded absolute path.
dataframe = pandas.read_csv(r'/home/naveen/Documents/DataSet1_1.csv')
dataframe = dataframe.fillna(value=0)
28 |
29 |
def model(input_dim=13):
    """
    Build the regression network: one hidden relu layer of 30 units and a
    single linear output unit, compiled with MSE loss and the adam optimizer.

    :param input_dim: number of input features. Defaults to 13, the width
        the original hard-coded, so existing callers (KerasRegressor's
        zero-argument build_fn call) are unaffected.
    :return model: the compiled keras Sequential model
    """
    network = Sequential()
    network.add(Dense(30, input_dim=input_dim, kernel_initializer='normal', activation='relu'))
    network.add(Dense(1, kernel_initializer='normal'))
    network.compile(loss='mean_squared_error', optimizer='adam')

    return network
41 |
def NeuralNets(trait):
    """
    Cross-validate the neural-network regressor for one Big Five trait,
    save an actual-vs-predicted scatter plot, and return RMSE * 100.

    :param trait: trait column name to predict
    :return: root mean squared error of the CV predictions, scaled by 100
    """
    #get the dependent and independent variables
    X = dataframe.loc[:,'mediaUsage':'Scheduling_OfficeTools_Weather'].values
    Y = dataframe.loc[:,trait].values


    #adding seed and standardizing the data
    seed=7
    numpy.random.seed(seed)
    estimators = []
    estimators.append(('standardize', StandardScaler()))
    # BUG FIX: the wrapper keyword is ``epochs``; the old ``nb_epoch``
    # spelling is ignored by modern Keras, so the network silently trained
    # for the default single epoch instead of the intended 300.
    estimators.append(('mlp', KerasRegressor(build_fn=model, epochs=300, batch_size=50, verbose=0)))



    #pipeline to run standardization and model
    pipeline = Pipeline(estimators)
    # BUG FIX: scikit-learn >= 0.24 raises if random_state is supplied
    # without shuffle=True; plain sequential folds keep the original
    # (unshuffled) split behaviour.
    kfold = KFold(n_splits=4)


    predicted = cross_val_predict(pipeline,X,Y,cv=kfold)

    #Scatter Plot for results and actual values
    fig, ax = plt.subplots()
    ax.scatter(Y,predicted, edgecolors=(0, 0, 0))
    ax.plot([0, 1], [0, 1], 'k--', lw=2)
    ax.set_xlabel('Actual Values (Big Five PreSurvey)')
    ax.set_ylabel('Predicted Values')
    plt.title("Neural Networks - "+trait)
    # plt.show()# enable for debugging
    plt.savefig('/home/naveen/Desktop/Plots1/'+trait+'.png')

    # RMSE scaled by 100 (percentage-style reporting since traits are in [0,1])
    return (sqrt(mean_squared_error(Y,predicted))*100)
79 |
80 |
# Evaluate the neural-network model for every Big Five trait and print
# the trait name next to its cross-validated RMSE (scaled by 100).
traits=['Openness','Conscientiousness','Extraversion','Agreeableness','Neuroticism']

for big_five_trait in traits:
    rmse = NeuralNets(big_five_trait)
    print(big_five_trait, rmse)
86 |
--------------------------------------------------------------------------------
/MachineLearning_Models/Regression_Models/RandomForest.py:
--------------------------------------------------------------------------------
1 | """
2 | Developer : Naveen Kambham
3 | Description: Random Forests Regression Model using Scikit. This generalizes the data and then apply cross validation
4 | to evaluate it with RMSE as metric. Hyper parameters are tuned and set.
5 | """
6 | #Importing the required libraries.
7 | from sklearn.ensemble import RandomForestRegressor
8 | import pandas
9 | import numpy
10 | from sklearn.model_selection import cross_val_score
11 | from sklearn.model_selection import KFold
12 | from sklearn.pipeline import Pipeline
13 | from math import sqrt
14 | from sklearn.model_selection import cross_val_predict
15 | from sklearn.metrics import mean_squared_error
16 | from matplotlib import pyplot as plt
17 | from sklearn.preprocessing import StandardScaler
18 |
19 |
20 |
21 |
def RandomForestsModel(trait,estimator,min_split,max_depth,max_feature):
    """
    Cross-validate a Random Forest regressor for one Big Five trait,
    save a predicted-vs-actual scatter plot, and return the error.

    :param trait: dependent column name (e.g. 'Openness')
    :param estimator: number of trees (n_estimators)
    :param min_split: min_samples_split for each tree
    :param max_depth: maximum tree depth
    :param max_feature: max_features considered per split
    :return: cross-validated RMSE scaled by 100
    """
    #reading the data and filling missing values
    df = pandas.read_csv(r'/home/naveen/Data/DataSet1.csv')
    df= df.fillna(value=0)

    #getting the dependent variables, independent variables
    X = df.loc[:,'mediaUsage':'Scheduling_OfficeTools_Weather']
    Y = df.loc[:,trait]

    #creating the model and pipeling along with scaler
    # BUG FIX: scikit-learn 1.0 renamed criterion 'mse' to 'squared_error'
    # (same loss function) and removed the old spelling in 1.2.
    model = RandomForestRegressor(n_estimators=estimator,
                min_samples_split=min_split, max_features=max_feature,
                criterion='squared_error', max_depth=max_depth)
    seed=7
    numpy.random.seed(seed)
    estimators = []
    estimators.append(('standardize', StandardScaler()))
    estimators.append(('RandomForest', model))
    pipeline = Pipeline(estimators)
    # BUG FIX: recent scikit-learn raises ValueError when random_state is
    # supplied without shuffle=True (older versions silently ignored it).
    kfold = KFold(n_splits=4, shuffle=True, random_state=seed)
    predicted = cross_val_predict(pipeline,X,Y,cv=kfold)


    #fit line for results and actual values
    fig, ax = plt.subplots()
    ax.scatter(Y,predicted, edgecolors=(0, 0, 0))
    ax.plot([0, 1], [0, 1], 'k--', lw=2)
    ax.set_xlabel('Actual Values (Big Five PreSurvey)')
    ax.set_ylabel('Predicted Values')
    plt.title("Random Forests - "+trait)
    plt.savefig('/home/naveen/Desktop/Plots1/'+trait+'.png')
    # BUG FIX: close the figure; the module-level loop otherwise leaked
    # one open figure per trait.
    plt.close(fig)

    return (sqrt(mean_squared_error(Y,predicted))*100)
55 |
56 |
57 |
58 |
59 |
# Run the tuned forest (20 trees, min_samples_split=50, max_depth=20,
# max_features=5) for each Big Five trait and print its RMSE.
traits=['Openness','Conscientiousness','Extraversion','Agreeableness','Neuroticism']

for current_trait in traits:
    rmse = RandomForestsModel(current_trait, 20, 50, 20, 5)
    print(current_trait, rmse)
63 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Big Five personality modeling
2 | Human behavior is complex and often resists straightforward modeling with traditional mathematical approaches. To simplify this task, researchers frequently employ intermediate psychological models that capture specific facets of human behavior. These models, like the Big Five personality framework, are typically validated using survey instruments and are known to correlate with certain behavioral tendencies. Traditionally, these constructs have been used to predict stylized behaviors; however, advances in sensing technologies have opened up new possibilities to infer these psychological constructs directly from observed behavior.
3 |
4 | Modern smartphones are equipped with a variety of sensors that can be leveraged to capture abstract measures of human behavior. This raises the question: can we reliably infer psychological profiles from passive smartphone data alone? The ability to derive a personality profile from unobtrusive, sensor-derived data has promising applications, ranging from personalized marketing to targeted social or health interventions.
5 |
6 | In this study, we developed a model to infer personality traits based on the Big Five personality inventory. By analyzing daily routines captured via smartphone sensors, we applied supervised machine learning to predict individuals’ personality traits. Our evaluation, using cross-validation, showed that the model achieved a sufficiently low root mean squared error to provide actionable predictions for most individuals, though it struggled with personality outliers.
7 |
8 | This project demonstrates the feasibility of using mobile sensor data to approximate personality traits, suggesting potential for real-world applications in fields requiring adaptive and personalized approaches.
9 |
--------------------------------------------------------------------------------
/UnitTests/test_BigFiveFeatureExtraction.py:
--------------------------------------------------------------------------------
1 | """
2 | Developer : Naveen Kambham
3 | Description: code to test the whole feature extraction phase by plotting the distributions. This plots univariate and bivariate distributions.
4 | Helpful for a quick assement of whole data set including input/output feature relations.
5 | """
6 | #Importing the required libraries.
7 | import unittest
8 | import numpy as np
9 | import pandas as pd
10 | import matplotlib.pyplot as plt
11 | import seaborn as sns
12 | from FeatureExtraction import battery_sensor_features_extractor
13 |
14 |
def test_BigFiveFeaturesPhase(file):
    """
    Draw visualisations for the extracted features: univariate density,
    histogram and area plots, plus scatter-plot matrices relating the
    input features to the Big Five output traits.

    :param file: path to the extracted-features CSV
    :return: nothing; the plots are saved as PNG files
    """

    #read the data in to a data frame
    dataframe = pd.read_csv(file)

    ###################### Univariate Analysis/plots ##################################
    #plotting a density plot with 3 plots in each row
    dataframe.plot(kind='density', subplots=True, layout=(5, 3), sharex=False, sharey=False)
    plt.savefig(r'density.png')

    # plotting a histogram plot with 3 plots in each row
    dataframe.plot(kind='hist', subplots=True, layout=(5, 3), sharex=False, sharey=False)
    plt.savefig(r'histogram.png')

    # plotting a area plot with 3 plots in each row
    dataframe.plot(kind='area', subplots=True, layout=(5, 3), sharex=False, sharey=False)
    # BUG FIX: this previously saved to 'histogram.png', overwriting the
    # histogram figure produced just above.
    plt.savefig(r'area.png')



    ################ Bivariate Analysis/plots #####################################
    #plotting scatter plot matrix for dependent and independent features
    #scatter plot for app usage
    xvars_appusage=["Camera, Maps, Internet apps","Scheduling, OfficeTools, Weather apps","Media apps usage","Other apps usage"]
    yvars=['Openness','Conscientiousness','Extraversion','Agreeableness','Neuroticism']

    xvars_entry_leavetimes = ['Charge start time', 'Battery charging duration', 'Campus entry time', 'Campus leaving time',
                              'Time spent in campus']

    g = sns.pairplot(dataframe,x_vars=xvars_appusage,y_vars=yvars)
    g.savefig(r'scatterplot_appusage.png')

    #scatter plot for time related features
    g = sns.pairplot(dataframe, x_vars=xvars_entry_leavetimes, y_vars=yvars)
    g.savefig(r'scatterplot_entryleavetimes.png')
    print("Plots are saved please check for anomolies")
56 |
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/UnitTests/test_battery_sensor_features_extractor.py:
--------------------------------------------------------------------------------
1 | """
2 | Developer : Naveen Kambham
3 | Description: Unit testing for battery sensor feature extractor code. Majority of the data extraction code has to be tested visually by looking at the plots distributions.
4 | """
5 | #Importing the required libraries.
6 | import unittest
7 | import numpy as np
8 | from FeatureExtraction import battery_sensor_features_extractor
9 |
10 |
11 |
class BatterySensorTestCase(unittest.TestCase):
    """
    Tests for battery_sensor_features_extractor.py
    """
    def test_TakeMostProbableTimeInStudy(self):
        """
        Exercise TakeMostProbableTimeInStudy with day samples drawn from a
        fixed history of previously seen study times.
        :return: None
        """
        # case 1: several candidate times in one day -> expect the history's
        # dominant value (3 appears 8 times in the first list).
        result= battery_sensor_features_extractor.TakeMostProbableTimeInStudy([1,1,1,1,2,2,3,3,3,3,3,3,3,3],[1,2,0])
        self.assertEqual(result,3)

        # case 2: only one value in the day.
        # NOTE(review): the expected value 4 is not derivable from these
        # inputs alone - confirm against the extractor's definition.
        result = battery_sensor_features_extractor.TakeMostProbableTimeInStudy(
            [1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3], [1])
        self.assertEqual(result, 4)

        # case 3: only one value in the day and it does not exist in the
        # study times seen so far -> expected to come back unchanged (0).
        result = battery_sensor_features_extractor.TakeMostProbableTimeInStudy(
            [1, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3], [0])
        self.assertEqual(result, 0)

    def test_extract(self):
        """
        Smoke-test the battery feature extractor on a local SHED10 dump and
        sanity-check the ranges of the extracted columns.
        :return: None
        """
        # extracting the features (requires the local data file)
        df_battery=battery_sensor_features_extractor.extract(r"/home/naveen/Data/Shed10/Filtered/battery_events.csv")

        # charging duration must be non-negative; daily charging time must
        # fall inside a 24-hour clock.
        # NOTE(review): np.min(<boolean Series>) is truthy only when every
        # element is True, so these behave like (...).all() - but np.min
        # raises on an empty frame; confirm emptiness cannot occur.
        self.assertTrue(np.min(df_battery['Battery_Charging_Duration'] >=0))
        self.assertTrue(np.min(df_battery['CharginTimeDaily'] >=0) and np.max(df_battery['CharginTimeDaily'] <=24))


if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/UnitTests/test_bluetooth_sensor_features_extractor.py:
--------------------------------------------------------------------------------
1 | """
2 | Developer : Naveen Kambham
3 | Description: Unit testing for bluetooth sensor feature extractor code. Majority of the data extraction code has to be tested visually by looking at the plots distributions.
4 | """
5 | #Importing the required libraries.
6 | import unittest
7 | import numpy as np
8 | from FeatureExtraction import bluetooth_sensor_features_extractor
class BluetoothSensorTestCase(unittest.TestCase):
    """
    Tests for bluetooth_sensor_features_extractor.py
    """
    def test_find_contactrate_perday(self):
        """
        Smoke-test the bluetooth extractor and sanity-check the range of the
        daily contact-rate feature.
        :return: None
        """
        # extracting the features (requires the local data file)
        df_bluetooth=bluetooth_sensor_features_extractor.extract(r"/home/naveen/Data/Shed10/Filtered/bluetooth.csv")

        # contact rate has to be >= 0 and <= 100, the maximum number of
        # participants in the study.
        # NOTE(review): np.min/np.max over a boolean Series acts like
        # (...).all() here but raises on an empty frame.
        self.assertTrue(np.min(df_bluetooth['ContactRatePerDay'] >=0) and np.max(df_bluetooth['ContactRatePerDay'] <=100))


if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/UnitTests/test_screenstate_sensor_features_extractor.py:
--------------------------------------------------------------------------------
1 | """
2 | Developer : Naveen Kambham
Description: Unit testing for screen state sensor feature extractor code. Majority of the data extraction code has to be tested visually by looking at the plots distributions.
4 | """
5 | #Importing the required libraries.
6 | import unittest
7 | import numpy as np
8 | from FeatureExtraction import screenstate_sensor_features_extractor
class ScreenStateSensorTestCase(unittest.TestCase):
    """
    Tests for screenstate_sensor_features_extractor.py
    """
    def test_get_activephone_usage(self):
        """
        Smoke-test the screen-state extractor and sanity-check the ON/OFF
        usage proportions it produces.
        :return: None
        """
        #extracting the features
        # NOTE(review): the extractor is pointed at battery_events.csv -
        # confirm this should not be the screen-state input file.
        df_get_activephone_usage=screenstate_sensor_features_extractor.extract(r"/home/naveen/Data/Shed10/Filtered/battery_events.csv")

        # ON and OFF should be between 0 and 100
        # BUG FIX: assertTrue(<Series>) raises "truth value is ambiguous"
        # for any frame with more than one row; reduce with .all() instead.
        self.assertTrue((df_get_activephone_usage['ScreenState_ON'] / (df_get_activephone_usage['ScreenState_ON'] +df_get_activephone_usage['ScreenState_OFF']) >=0).all())
        self.assertTrue((df_get_activephone_usage['ScreenState_OFF'] / (
            df_get_activephone_usage['ScreenState_ON'] + df_get_activephone_usage['ScreenState_OFF']) <= 100).all())


        #assert for some random participants
        # BUG FIX: the original tested len(df >= 100), which is just the
        # row count of a boolean frame; the intent is "at least 100 rows".
        if len(df_get_activephone_usage) >= 100:
            self.assertTrue((df_get_activephone_usage.loc[50,'ScreenState_ON'] == (
                df_get_activephone_usage.loc[50, 'ScreenState_ON'] + df_get_activephone_usage.loc[50,'ScreenState_OFF'] - df_get_activephone_usage.loc[50,'ScreenState_OFF']) >= 0))

            self.assertTrue((df_get_activephone_usage.loc[25, 'ScreenState_ON'] == (
                df_get_activephone_usage.loc[25, 'ScreenState_ON'] + df_get_activephone_usage.loc[
                25, 'ScreenState_OFF'] - df_get_activephone_usage.loc[25, 'ScreenState_OFF']) >= 0))


if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/UnitTests/test_wifi_sensor_features_extractor.py:
--------------------------------------------------------------------------------
1 | """
2 | Developer : Naveen Kambham
3 | Description: Unit testing for wifi sensor feature extractor code. Majority of the data extraction code has to be tested visually by looking at the plots distributions.
4 | """
5 | #Importing the required libraries.
6 | import unittest
7 | import numpy as np
8 | from FeatureExtraction import wifi_sensor_features_extractor
class WiFiSensorTestCase(unittest.TestCase):
    """
    Tests for wifi_sensor_features_extractor.py
    """
    def test_find_get_campus_entry_leave_times(self):
        """
        Smoke-test the campus entry/leave extractor and check that all time
        features lie within a 24-hour clock and are mutually consistent.
        :return: None
        """
        # extracting the features (requires the local data file)
        df_entry_leave_timeinschool=wifi_sensor_features_extractor.get_campus_entry_leave_times(r"/home/naveen/Data/Shed10/Filtered/wifi.csv")

        # every time feature has to be >= 0 and <= 24.0 hours
        # NOTE(review): np.min/np.max over a boolean Series acts like
        # (...).all() here but raises on an empty frame.
        self.assertTrue(np.min(df_entry_leave_timeinschool['EntryTime'] >=0) and np.max(df_entry_leave_timeinschool['EntryTime'] <=24))
        self.assertTrue(np.min(df_entry_leave_timeinschool['LeavingTime'] >= 0) and np.max( df_entry_leave_timeinschool['LeavingTime'] <= 24))
        self.assertTrue(np.min(df_entry_leave_timeinschool['Time_In_School'] >= 0) and np.max(df_entry_leave_timeinschool['Time_In_School'] <= 24))

        # Take some random participants and check time_in_school == leavetime - entrytime
        # NOTE(review): exact float equality; fine if the extractor computes
        # the difference itself, otherwise consider assertAlmostEqual.
        if(len(df_entry_leave_timeinschool) >=100):
            self.assertTrue(df_entry_leave_timeinschool.loc[100,'Time_In_School'] == df_entry_leave_timeinschool.loc[100,'LeavingTime'] - df_entry_leave_timeinschool.loc[100,'EntryTime'])
            self.assertTrue(df_entry_leave_timeinschool.loc[75, 'Time_In_School'] == df_entry_leave_timeinschool.loc[
                75, 'LeavingTime'] - df_entry_leave_timeinschool.loc[75, 'EntryTime'])
            self.assertTrue(df_entry_leave_timeinschool.loc[25, 'Time_In_School'] == df_entry_leave_timeinschool.loc[
                25, 'LeavingTime'] - df_entry_leave_timeinschool.loc[25, 'EntryTime'])
            self.assertTrue(df_entry_leave_timeinschool.loc[0, 'Time_In_School'] == df_entry_leave_timeinschool.loc[
                0, 'LeavingTime'] - df_entry_leave_timeinschool.loc[0, 'EntryTime'])
            self.assertTrue(df_entry_leave_timeinschool.loc[50, 'Time_In_School'] == df_entry_leave_timeinschool.loc[
                50, 'LeavingTime'] - df_entry_leave_timeinschool.loc[50, 'EntryTime'])

    def test_get_diff_wifi_seen(self):
        """
        Smoke-test the distinct-wifi-count extractor and sanity-check its
        per-day range.
        :return: None
        """
        # extracting the features
        # NOTE(review): this calls a module function literally named
        # test_get_diff_wifi_seen - confirm that is the intended extractor
        # entry point and not a misnamed helper.
        df_test_get_diff_wifi_seen = wifi_sensor_features_extractor.test_get_diff_wifi_seen(
            r"/home/naveen/Data/Shed10/Filtered/wifi.csv")

        # wifi count per day has to be >= 0 and below an assumed ceiling of
        # 500 distinct access points.
        self.assertTrue(np.min(df_test_get_diff_wifi_seen['WifiCountPerDay'] >= 0) and np.max(
            df_test_get_diff_wifi_seen['WifiCountPerDay'] <= 500))




if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------