├── demo video (is from an earlier version of the project ) (1).mp4
├── BACKGROUNDdata_3.csv
├── TEST_MODEL.py
├── README.md
├── TRAINER_MODEL.py
├── DATA_TO_USB.py
└── ai-nose-dataset-curation.ipynb

/demo video (is from an earlier version of the project ) (1).mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aditya56h/SMELL_SENSING_MODEL_V1/HEAD/demo video (is from an earlier version of the project ) (1).mp4
--------------------------------------------------------------------------------
/BACKGROUNDdata_3.csv:
--------------------------------------------------------------------------------
1 | Alcohol PPM,LPG GAS,CH4 PPM,Propane,H2PPM,Humidity,VOC,Temperature
2 | 29.83,3.51,16.32,3.51,52.54,36.00,87,14.60
3 | 29.83,3.51,16.32,3.51,52.54,36.00,87,14.60
4 | 29.83,3.51,16.32,3.51,52.54,36.00,87,14.60
5 | 29.83,3.51,16.32,3.51,52.54,36.00,86,14.60
6 | 29.83,3.51,16.32,3.51,52.54,36.00,87,14.60
7 | 29.83,3.51,16.32,3.51,52.54,36.00,87,14.60
8 | 29.83,3.51,16.32,3.51,52.54,36.00,87,14.60
9 | 29.83,3.51,16.32,3.51,52.54,36.00,87,14.60
10 | 29.83,3.51,16.32,3.51,52.54,36.00,87,14.60
11 | 29.83,3.51,16.32,3.51,52.54,36.00,86,14.60
12 | 
--------------------------------------------------------------------------------
/TEST_MODEL.py:
--------------------------------------------------------------------------------
1 | # Author: Aditya Sharma
2 | # Project: Smell Sensing Model Version 1 - Data Transfer and Prediction
3 | import os
4 | import pandas as pd
5 | import numpy as np
6 | import tensorflow as tf
7 | from tensorflow.keras.models import load_model
8 | import serial
9 | import time
10 | 
11 | # Load the trained model saved by TRAINER_MODEL.py
12 | model = load_model("my_model.h5")
13 | 
14 | # Establish communication with Arduino
15 | ser = serial.Serial('COM9', 9600)
16 | time.sleep(2)
17 | 
18 | # Define the per-column means and standard deviations for normalization
19 | means = np.array([36.6395, 2.9867, 123.5196, 2.9867, 54.1952, 28.449, 242.1697, 22.4664])
20 | std_devs = np.array([8.0384, 1.0675, 290.0955, 1.0675, 5.6318, 16.2025, 238.1397, 4.8474])
21 | 
22 | # Continuously read data from Arduino and make predictions
23 | while True:
24 |     try:
25 |         # Read 10 lines from the serial port and drop the first, which may be partial
26 |         lines = [ser.readline().decode('utf-8').strip() for _ in range(10)][1:]
27 | 
28 |         # Convert the lines to a DataFrame
29 |         data = pd.DataFrame([[float(x) for x in line.split(',')] for line in lines])
30 | 
31 |         # Normalize the data
32 |         data = (data - means) / std_devs
33 | 
34 |         # The model was trained without the Temperature column, so drop it before predicting
35 |         features = data.iloc[:, :-1]
36 | 
37 |         # Make a prediction using the model
38 |         prediction = model.predict(features)
39 | 
40 |         # Print the prediction
41 |         print(f'Prediction: {prediction}')
42 |     except KeyboardInterrupt:
43 |         # Break the loop if there is a KeyboardInterrupt
44 |         break
45 | 
46 | # Close the serial port
47 | ser.close()
48 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Smell Sensing Model Version 1
2 | 
3 | ## Author
4 | Aditya Sharma
5 | 
6 | ## Introduction
7 | This project builds a smell-sensing model using an array of gas sensors. Data from the sensors is collected with an Arduino-based system and stored in CSV files, which are then used to train a neural network for smell-identification tasks.
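
The normalization constants hard-coded in `TEST_MODEL.py` (the `means` and `std_devs` arrays) have to match the statistics of the logged training data. Below is a minimal sketch of how they could be recomputed from the background CSV files; it assumes the files sit at the same `E:/BACKGROUNDdata_*.csv` path used by the other scripts, so adjust the glob pattern as needed:

```python
import glob
import pandas as pd

# Load every logged background CSV (path assumed, as in TRAINER_MODEL.py / DATA_TO_USB.py)
files = glob.glob('E:/BACKGROUNDdata_*.csv')
data = pd.concat([pd.read_csv(f) for f in files], ignore_index=True)

# Per-column statistics, in the same column order TEST_MODEL.py expects
means = data.mean().to_numpy()
std_devs = data.std(ddof=0).to_numpy()  # population std dev, matching numpy.std

print("means    =", means.round(4).tolist())
print("std_devs =", std_devs.round(4).tolist())
```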
8 | 
9 | ## Sensors Used
10 | - DHT11 for temperature and humidity
11 | - MQ-3 for ethanol detection
12 | - MQ-2 for LPG and propane detection
13 | - MQ-4 for methane and natural gas detection
14 | - MQ-8 for hydrogen detection
15 | - A VOC sensor for volatile organic compound detection
16 | 
17 | ## Data Acquisition
18 | The sensors are read by an Arduino UNO, which streams the readings over serial (see `data_acquisition.ino`). The host-side script `DATA_TO_USB.py` logs these readings to CSV files, with each file containing 10 readings.
19 | 
20 | ## Data Processing and Model Training
21 | The data from the CSV files is loaded into a Python script, where it is split into features and a target variable. The data is then split into training and testing sets, and a neural network is defined and trained on the training set. The model uses L1 and L2 regularization to reduce overfitting. The code for this part can be found in `TRAINER_MODEL.py`.
22 | 
23 | ## Installation
24 | To run the project, you will need the following installed:
25 | - Arduino IDE
26 | - Python 3
27 | - TensorFlow
28 | - Keras
29 | - scikit-learn
30 | - pandas
31 | - NumPy
32 | - pyserial
33 | 
34 | ## Usage
35 | 1. Upload the `data_acquisition.ino` sketch to your Arduino UNO.
36 | 2. Run `DATA_TO_USB.py` to log sensor readings to CSV files.
37 | 3. Run `TRAINER_MODEL.py` to train the model on the logged data.
38 | 4. Run `TEST_MODEL.py` to stream live readings from the Arduino and print predictions.
39 | 
40 | ## Contact
41 | For any queries, you can reach out to me at adduabbu2005sharma@gmail.com
42 | 
--------------------------------------------------------------------------------
/TRAINER_MODEL.py:
--------------------------------------------------------------------------------
1 | # Author: Aditya Sharma
2 | # Project: Smell Sensing Model Version 1 - Neural Network Training
3 | 
4 | import tensorflow as tf
5 | from tensorflow.keras.models import Sequential
6 | from tensorflow.keras.layers import Dense
7 | from tensorflow.keras import regularizers
8 | from sklearn.model_selection import train_test_split
9 | import pandas as pd
10 | import glob
11 | 
12 | # Function to load data from CSV files
13 | def load_data_from_csv():
14 |     # Get a list of all CSV files
15 |     files = glob.glob('E:/BACKGROUNDdata_*.csv')  # replace 'E:/' with your USB drive's path
16 | 
17 |     # Read the CSV files into a DataFrame
18 |     data = pd.concat([pd.read_csv(file) for file in files])
19 | 
20 |     # Split the data into X and y
21 |     X = data.drop('Temperature', axis=1)
22 |     y = data['Temperature']
23 | 
24 |     return X, y
25 | 
26 | # Load the data
27 | X, y = load_data_from_csv()
28 | 
29 | # Split the data into training and testing sets
30 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
31 | 
32 | # Define the model
33 | model = Sequential([
34 |     Dense(32, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4)),
35 |     Dense(32, activation='relu', kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4)),
36 |     Dense(1),  # linear output, since Temperature is a continuous target
37 | ])
38 | 
39 | # Compile the model for regression
40 | model.compile(optimizer='adam', loss='mse', metrics=['mae'])
41 | 
42 | # Train the model
43 | model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
44 | 
45 | # Save the model
46 | model.save('my_model.h5')
47 | 
--------------------------------------------------------------------------------
/DATA_TO_USB.py:
--------------------------------------------------------------------------------
1 | # Author: Aditya Sharma
2 | # Project: Smell Sensing Model Version 1 - Data Acquisition and Storage
3 | 
4 | import serial
5 | import csv
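# Assumed serial line format from the Arduino sketch: one comma-separated reading per
# line, in the same order as the CSV fieldnames defined below, e.g. the rows seen in
# BACKGROUNDdata_3.csv:
#   29.83,3.51,16.32,3.51,52.54,36.00,87,14.60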
6 | import datetime 7 | 8 | # Function to write data to a CSV file 9 | def write_to_csv(filename, data, fieldnames): 10 | # Open the file in append mode 11 | with open(filename, mode='a', newline='', encoding='utf-8') as file: 12 | # Create a CSV writer 13 | writer = csv.DictWriter(file, fieldnames=fieldnames) 14 | 15 | # Write the header if the file is empty 16 | if file.tell() == 0: 17 | writer.writeheader() 18 | 19 | # Write the data row 20 | writer.writerow(data) 21 | 22 | # Main function 23 | def main(): 24 | # Establish communication with Arduino 25 | ser = serial.Serial('COM9', 9600) 26 | 27 | count = 0 28 | file_num = 1 29 | 30 | # Define the fieldnames for the CSV file 31 | fieldnames = ['Alcohol PPM', 'LPG GAS', 'CH4 PPM', 'Propane', 'H2PPM', 'Humidity', 'VOC', 'Temperature'] 32 | 33 | # Continuously read data from Arduino 34 | while True: 35 | if ser.inWaiting(): 36 | # Read a line from Arduino and remove trailing whitespace 37 | line = ser.readline().decode('utf-8').strip() 38 | 39 | # Remove commas from the data 40 | line = line.replace(',', ' ') 41 | 42 | # Split the line into values and create a dictionary with the fieldnames 43 | data = dict(zip(fieldnames, line.split())) 44 | 45 | # Define the filename 46 | filename = f'E:/BACKGROUNDdata_{file_num}.csv' # replace 'E:\\' with your USB drive's path 47 | 48 | # Write the data to the CSV file 49 | write_to_csv(filename, data, fieldnames) 50 | 51 | count += 1 52 | 53 | # If 10 lines have been written, start a new file 54 | if count == 10: 55 | count = 0 56 | file_num += 1 57 | 58 | # Run the main function 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /ai-nose-dataset-curation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "id": "AvxbZv9hQk2S" 17 | }, 18 | "source": [ 19 | "# Dataset curation - Feature scaling for time series data\n", 20 | "\n", 21 | "[![Open In Colab <](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ShawnHymel/ai-nose/blob/master/ai-nose-dataset-curation.ipynb)\n", 22 | "\n", 23 | "In the paper \"Efficient BackProp\" [1], LeCun et al. shows that we can achieve a more accurate model (e.g. artificial neural network) in less time by standarizing (i.e. to a mean of 0 and unit variance) and decorrelating our input data.\n", 24 | "\n", 25 | "However, the process of standarization assumes that the data is normally distributed (i.e. Gaussian). If our data does not follow a Gaussian distribution, we should perform normalization [2], where we divide by the range to produce a set of values between 0 and 1.\n", 26 | "\n", 27 | "Create a directory */content/dataset* and upload your entire dataset there. Run through the cells in this notebook, following all of the directions to analyze the data and create a curated dataset. If you perform normalization or standarization for any dimension, you will need to copy the mean, standard deviation, minimum, and range arrays for use in your inference code (i.e. 
preprocessing the data before running inference).\n", 28 | "\n", 29 | "[1] http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf\n", 30 | "\n", 31 | "[2] https://becominghuman.ai/what-does-feature-scaling-mean-when-to-normalize-data-and-when-to-standardize-data-c3de654405ed" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "id": "xnALAnX4Ml61" 38 | }, 39 | "source": [ 40 | "## Step 1: Analyze the data" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "id": "ASsQkrDSFmt6" 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "import csv\n", 52 | "import os\n", 53 | "import shutil\n", 54 | "\n", 55 | "import numpy as np\n", 56 | "import matplotlib.pyplot as plt\n", 57 | "import pandas as pd" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": { 64 | "id": "gpwB4SCFIuES" 65 | }, 66 | "outputs": [], 67 | "source": [ 68 | "### Settings\n", 69 | "HOME_PATH = \"/content\" # Location of the working directory\n", 70 | "DATASET_PATH = \"/content/dataset\" # Upload your .csv samples to this directory\n", 71 | "OUT_PATH = \"/content/out\" # Where output files go (will be deleted and recreated)\n", 72 | "OUT_ZIP = \"/content/out.zip\" # Where to store the zipped output files\n", 73 | "\n", 74 | "# Do not change these settings!\n", 75 | "PREP_DROP = -1 # Drop a column\n", 76 | "PREP_NONE = 0 # Perform no preprocessing on column of data\n", 77 | "PREP_STD = 1 # Perform standardization on column of data\n", 78 | "PREP_NORM = 2 # Perform normalization on column of data" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "id": "9x1FvOLJI2vX" 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "### Read in .csv files to construct one long multi-axis, time series data\n", 90 | "\n", 91 | "# Store header, raw data, and number of lines found in each .csv file\n", 92 | "header = None\n", 93 | "raw_data = []\n", 94 | "num_lines = []\n", 95 | "filenames = []\n", 96 | "\n", 97 | "# Read each CSV file\n", 98 | "for filename in os.listdir(DATASET_PATH):\n", 99 | "\n", 100 | " # Check if the path is a file\n", 101 | " filepath = os.path.join(DATASET_PATH, filename)\n", 102 | " if not os.path.isfile(filepath):\n", 103 | " continue\n", 104 | "\n", 105 | " # Read the .csv file\n", 106 | " with open(filepath) as f:\n", 107 | " csv_reader = csv.reader(f, delimiter=',')\n", 108 | "\n", 109 | " # Read each line\n", 110 | " valid_line_counter = 0\n", 111 | " for line_count, line in enumerate(csv_reader):\n", 112 | "\n", 113 | " # Check header\n", 114 | " if line_count == 0:\n", 115 | "\n", 116 | " # Record first header as our official header for all the data\n", 117 | " if header == None:\n", 118 | " header = line\n", 119 | "\n", 120 | " # Check to make sure subsequent headers match the original header\n", 121 | " if header == line:\n", 122 | " num_lines.append(0)\n", 123 | " filenames.append(filename)\n", 124 | " else:\n", 125 | " print(\"Error: Headers do not match. Skipping\", filename)\n", 126 | " break\n", 127 | "\n", 128 | " # Construct raw data array, make sure number of elements match number of header labels\n", 129 | " else:\n", 130 | " if len(line) == len(header):\n", 131 | " raw_data.append(line)\n", 132 | " num_lines[-1] += 1\n", 133 | " else:\n", 134 | " print(\"Error: Data length does not match header length. 
Skipping line.\")\n", 135 | " continue\n", 136 | "\n", 137 | "# Convert our raw data into a numpy array\n", 138 | "raw_data = np.array(raw_data).astype(float)\n", 139 | "\n", 140 | "# Print out our results\n", 141 | "print(\"Dataset array shape:\", raw_data.shape)\n", 142 | "print(\"Number of elements in num_lines:\", len(num_lines))\n", 143 | "print(\"Number of filenames:\", len(filenames))\n", 144 | "assert(len(num_lines) == len(filenames))" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": { 151 | "id": "EPXyymW9LtPB" 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "### Plot scatter matrix to look for correlation\n", 156 | "\n", 157 | "# Convert NumPy array to Pandas DataFrame\n", 158 | "df = pd.DataFrame(raw_data, columns=header)\n", 159 | "\n", 160 | "# Create scatter matrix\n", 161 | "sm = pd.plotting.scatter_matrix(df, figsize=(15, 15))" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": { 167 | "id": "2GuTOWqkYo9M" 168 | }, 169 | "source": [ 170 | "Notice the wide range of input values! We need to get those to be close to the same range so that the correlation plots will make more sense. Before we do that, we should plot the histograms to see how the data is distributed." 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": { 177 | "id": "u1JIjUVT5qwl" 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "### Show correlation matrix as colors\n", 182 | "\n", 183 | "# Create plot\n", 184 | "fig = plt.figure(figsize=(10, 10))\n", 185 | "ax = fig.add_subplot(111)\n", 186 | "im = ax.matshow(df.corr())\n", 187 | "\n", 188 | "# Add legend\n", 189 | "fig.colorbar(im)\n", 190 | "\n", 191 | "# Add x and y labels\n", 192 | "_ = ax.set_xticks(np.arange(len(header)))\n", 193 | "_ = ax.set_xticklabels(header)\n", 194 | "_ = ax.set_yticks(np.arange(len(header)))\n", 195 | "_ = ax.set_yticklabels(header)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": { 202 | "id": "qt9D2F_PMjdY" 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "### Examine the histograms of all the data\n", 207 | "\n", 208 | "# Create subplots\n", 209 | "num_hists = len(header)\n", 210 | "fig, axs = plt.subplots(1, num_hists, figsize=(20,3))\n", 211 | "\n", 212 | "# Create histogram for each category of data\n", 213 | "for i in range(num_hists):\n", 214 | " _ = axs[i].hist(raw_data[:, i])\n", 215 | " axs[i].title.set_text(header[i])" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": { 222 | "id": "x1JHQ3zbZTuP" 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "### Analyze the data\n", 227 | "\n", 228 | "# Calculate means, standard deviations, and ranges\n", 229 | "means = np.mean(raw_data, axis=0)\n", 230 | "std_devs = np.std(raw_data, axis=0)\n", 231 | "maxes = np.max(raw_data, axis=0)\n", 232 | "mins = np.min(raw_data, axis=0)\n", 233 | "ranges = np.ptp(raw_data, axis=0)\n", 234 | "\n", 235 | "# Print results\n", 236 | "for i, name in enumerate(header):\n", 237 | " print(name)\n", 238 | " print(\" mean:\", means[i])\n", 239 | " print(\" std dev:\", std_devs[i])\n", 240 | " print(\" max:\", maxes[i])\n", 241 | " print(\" min:\", mins[i])\n", 242 | " print(\" range:\", ranges[i])" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": { 248 | "id": "sWwyVZEv4M6B" 249 | }, 250 | "source": [ 251 | "## Step 2: Choose how to preprocess the data" 252 | ] 253 | }, 254 | { 255 
| "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "id": "0nb5ZVm_QeqD" 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "### Choose preprocessing method for each column\n", 263 | "# PREP_DROP: Drop column\n", 264 | "# PREP_NONE: no preprocessing\n", 265 | "# PREP_STD: standardization (if data is Gaussian)\n", 266 | "# PREP_NORM: normalization (if data is non-Gaussian)\n", 267 | "\n", 268 | "# Change this to match your picks!\n", 269 | "preproc = [PREP_NONE, # Timestamp\n", 270 | " PREP_NORM, # Temperature\n", 271 | " PREP_NORM, # Humidity\n", 272 | " PREP_DROP, # Pressure\n", 273 | " PREP_NORM, # CO2\n", 274 | " PREP_NORM, # VOC1\n", 275 | " PREP_NORM, # VOC2\n", 276 | " PREP_NORM, # NO2\n", 277 | " PREP_NORM, # Ethanol\n", 278 | " PREP_NORM] # CO\n", 279 | "\n", 280 | "# Check to make sure we have the correct number of preprocessing request elements\n", 281 | "assert(len(preproc) == len(header))\n", 282 | "assert(len(preproc) == raw_data.shape[1])\n", 283 | "\n", 284 | "# ### If we do not need the timestamp column, drop it from the data\n", 285 | "# if not KEEP_TIMESTAMP:\n", 286 | "# header = header[1:]\n", 287 | "# raw_data = raw_data[:,1:]\n", 288 | "# print(\"Array shape without timestamp:\", data_without_time.shape)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": { 294 | "id": "KWxMyht0CQUK" 295 | }, 296 | "source": [ 297 | "## Step 3: Perform data preprocessing" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": { 304 | "id": "BqrCbErD7vC6" 305 | }, 306 | "outputs": [], 307 | "source": [ 308 | "### Perform preprocessing steps as requested\n", 309 | "\n", 310 | "# Figure out how many columns we plan to keep\n", 311 | "num_cols = sum(1 for x in preproc if x != PREP_DROP)\n", 312 | "\n", 313 | "# Create empty numpy array and header for preprocessed data\n", 314 | "prep_data = np.zeros((raw_data.shape[0], num_cols))\n", 315 | "prep_header = []\n", 316 | "prep_means = []\n", 317 | "prep_std_devs = []\n", 318 | "prep_mins = []\n", 319 | "prep_ranges = []\n", 320 | "\n", 321 | "# Go through each column to preprocess the data\n", 322 | "prep_c = 0\n", 323 | "for raw_c in range(len(header)):\n", 324 | "\n", 325 | " # Drop column if requested\n", 326 | " if preproc[raw_c] == PREP_DROP:\n", 327 | " print(\"Dropping\", header[raw_c])\n", 328 | " continue\n", 329 | "\n", 330 | " # Perform data standardization\n", 331 | " if preproc[raw_c] == PREP_STD:\n", 332 | " prep_data[:, prep_c] = (raw_data[:, raw_c] - means[raw_c]) / std_devs[raw_c]\n", 333 | "\n", 334 | " # Perform data normalization\n", 335 | " elif preproc[raw_c] == PREP_NORM:\n", 336 | " prep_data[:, prep_c] = (raw_data[:, raw_c] - mins[raw_c]) / ranges[raw_c]\n", 337 | "\n", 338 | " # Copy data over if no preprocessing is requested\n", 339 | " elif preproc[raw_c] == PREP_NONE:\n", 340 | " prep_data[:, raw_c] = raw_data[:, raw_c]\n", 341 | "\n", 342 | " # Error if code not recognized\n", 343 | " else:\n", 344 | " raise Exception(\"Preprocessing code not recognized\")\n", 345 | "\n", 346 | " # Copy header (and preprocessing constants) and increment preprocessing column index\n", 347 | " prep_header.append(header[raw_c])\n", 348 | " prep_means.append(means[raw_c])\n", 349 | " prep_std_devs.append(std_devs[raw_c])\n", 350 | " prep_mins.append(mins[raw_c])\n", 351 | " prep_ranges.append(ranges[raw_c])\n", 352 | " prep_c += 1\n", 353 | "\n", 354 | "# Show new data header and shape\n", 355 | "print(prep_header)\n", 356 | 
"print(\"New data shape:\", prep_data.shape)\n", 357 | "print(\"Means:\", [float(\"{:.4f}\".format(x)) for x in prep_means])\n", 358 | "print(\"Std devs:\", [float(\"{:.4f}\".format(x)) for x in prep_std_devs])\n", 359 | "print(\"Mins:\", [float(\"{:.4f}\".format(x)) for x in prep_mins])\n", 360 | "print(\"Ranges:\", [float(\"{:.4f}\".format(x)) for x in prep_ranges])" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": { 366 | "id": "K-E0CfJaCSNc" 367 | }, 368 | "source": [ 369 | "## Step 4: Analyze newly preprocessed data" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "id": "Nfxoua7-hG1e" 377 | }, 378 | "outputs": [], 379 | "source": [ 380 | "### Recreate the scatter matrix to look for correlation\n", 381 | "\n", 382 | "# Convert NumPy array to Pandas DataFrame\n", 383 | "df = pd.DataFrame(prep_data, columns=prep_header)\n", 384 | "\n", 385 | "# Create scatter matrix\n", 386 | "sm = pd.plotting.scatter_matrix(df, figsize=(15, 15))" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": { 393 | "id": "mMbAP3tShNuQ" 394 | }, 395 | "outputs": [], 396 | "source": [ 397 | "### Show correlation matrix as colors\n", 398 | "\n", 399 | "# Create plot\n", 400 | "fig = plt.figure(figsize=(10, 10))\n", 401 | "ax = fig.add_subplot(111)\n", 402 | "im = ax.matshow(df.corr())\n", 403 | "\n", 404 | "# Add legend\n", 405 | "fig.colorbar(im)\n", 406 | "\n", 407 | "# Add x and y labels\n", 408 | "_ = ax.set_xticks(np.arange(len(prep_header)))\n", 409 | "_ = ax.set_xticklabels(prep_header)\n", 410 | "_ = ax.set_yticks(np.arange(len(prep_header)))\n", 411 | "_ = ax.set_yticklabels(prep_header)" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": { 417 | "id": "ytiLtULjDPEq" 418 | }, 419 | "source": [ 420 | "## Step 5: Store preprocessed data in CSV files" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": { 427 | "id": "j-zn9JjSl0si" 428 | }, 429 | "outputs": [], 430 | "source": [ 431 | "### Delete output directory (if it exists) and recreate it\n", 432 | "if os.path.exists(OUT_PATH):\n", 433 | " shutil.rmtree(OUT_PATH)\n", 434 | "os.makedirs(OUT_PATH)" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": { 441 | "id": "HbIvoyilljEx" 442 | }, 443 | "outputs": [], 444 | "source": [ 445 | "### Write out data to .csv files\n", 446 | "\n", 447 | "# Go through all the original filenames\n", 448 | "row_index = 0\n", 449 | "for file_num, filename in enumerate(filenames):\n", 450 | "\n", 451 | " # Open .csv file\n", 452 | " file_path = os.path.join(OUT_PATH, filename)\n", 453 | " with open(file_path, 'w') as f:\n", 454 | " csv_writer = csv.writer(f, delimiter=',')\n", 455 | "\n", 456 | " # Write header\n", 457 | " csv_writer.writerow(prep_header)\n", 458 | "\n", 459 | " # Write contents\n", 460 | " for _ in range(num_lines[file_num]):\n", 461 | " csv_writer.writerow(prep_data[row_index])\n", 462 | " row_index += 1" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": { 469 | "id": "WHwvKD94pOao" 470 | }, 471 | "outputs": [], 472 | "source": [ 473 | "### Zip output directory\n", 474 | "%cd {OUT_PATH}\n", 475 | "!zip -FS -r -q {OUT_ZIP} *\n", 476 | "%cd {HOME_PATH}" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": { 483 | "id": "8FB-mGIGquk0" 484 | }, 485 | "outputs": [], 
486 | "source": [] 487 | } 488 | ], 489 | "metadata": { 490 | "colab": { 491 | "name": "gas-sensor-dataset-curation.ipynb", 492 | "provenance": [], 493 | "include_colab_link": true 494 | }, 495 | "kernelspec": { 496 | "display_name": "Python 3", 497 | "name": "python3" 498 | }, 499 | "language_info": { 500 | "name": "python" 501 | } 502 | }, 503 | "nbformat": 4, 504 | "nbformat_minor": 0 505 | } --------------------------------------------------------------------------------