├── MySQL Queries.sql ├── PSQL Queries.sql ├── README.md ├── Walmart Business Problems.pdf ├── Walmart Project.png ├── Walmart.csv ├── project.ipynb ├── requirements.txt ├── walmart_clean_data.csv └── walmart_project-piplelines.png
/MySQL Queries.sql:
--------------------------------------------------------------------------------
-- Walmart Project Queries - MySQL

SELECT * FROM walmart;

-- DROP TABLE walmart;

-- Count total records
SELECT COUNT(*) FROM walmart;

-- Count payment methods and number of transactions by payment method
SELECT
    payment_method,
    COUNT(*) AS no_payments
FROM walmart
GROUP BY payment_method;

-- Count distinct branches
SELECT COUNT(DISTINCT branch) FROM walmart;

-- Find the minimum quantity sold
SELECT MIN(quantity) FROM walmart;

-- Q1: Find different payment methods, number of transactions, and quantity sold by payment method
SELECT
    payment_method,
    COUNT(*) AS no_payments,
    SUM(quantity) AS no_qty_sold
FROM walmart
GROUP BY payment_method;

-- Q2: Identify the highest-rated category in each branch
-- Display the branch, category, and avg rating
-- NOTE: RANK is a reserved keyword in MySQL 8.0+, so the alias is `rnk`.
SELECT branch, category, avg_rating
FROM (
    SELECT
        branch,
        category,
        AVG(rating) AS avg_rating,
        RANK() OVER (PARTITION BY branch ORDER BY AVG(rating) DESC) AS rnk
    FROM walmart
    GROUP BY branch, category
) AS ranked
WHERE rnk = 1;

-- Q3: Identify the busiest day for each branch based on the number of transactions
-- Dates are stored as text in DD/MM/YY form (e.g. '05/01/19'),
-- so the format string must use %y (2-digit year), not %Y.
SELECT branch, day_name, no_transactions
FROM (
    SELECT
        branch,
        DAYNAME(STR_TO_DATE(date, '%d/%m/%y')) AS day_name,
        COUNT(*) AS no_transactions,
        RANK() OVER (PARTITION BY branch ORDER BY COUNT(*) DESC) AS rnk
    FROM walmart
    GROUP BY branch, day_name
) AS ranked
WHERE rnk = 1;

-- Q4: Calculate the total quantity of items sold per payment method
SELECT
    payment_method,
    SUM(quantity) AS no_qty_sold
FROM walmart
GROUP BY payment_method;

-- Q5: Determine the average, minimum, and maximum rating of categories for each city
SELECT
    city,
    category,
    MIN(rating) AS min_rating,
    MAX(rating) AS max_rating,
    AVG(rating) AS avg_rating
FROM walmart
GROUP BY city, category;

-- Q6: Calculate the total profit for each category
-- total_profit = unit_price * quantity * profit_margin
SELECT
    category,
    SUM(unit_price * quantity * profit_margin) AS total_profit
FROM walmart
GROUP BY category
ORDER BY total_profit DESC;

-- Q7: Determine the most common payment method for each branch
WITH cte AS (
    SELECT
        branch,
        payment_method,
        COUNT(*) AS total_trans,
        RANK() OVER (PARTITION BY branch ORDER BY COUNT(*) DESC) AS rnk
    FROM walmart
    GROUP BY branch, payment_method
)
SELECT branch, payment_method AS preferred_payment_method
FROM cte
WHERE rnk = 1;

-- Q8: Categorize sales into Morning (<12), Afternoon (12-17), and Evening (>17) shifts
SELECT
    branch,
    CASE
        WHEN HOUR(TIME(time)) < 12 THEN 'Morning'
        WHEN HOUR(TIME(time)) BETWEEN 12 AND 17 THEN 'Afternoon'
        ELSE 'Evening'
    END AS shift,
    COUNT(*) AS num_invoices
FROM walmart
GROUP BY branch, shift
ORDER BY branch, num_invoices DESC;

-- Q9: Identify the 5 branches with the highest revenue decrease ratio
-- from last year to current year (e.g., 2022 to 2023)
WITH revenue_2022 AS (
    SELECT
        branch,
        SUM(total) AS revenue
    FROM walmart
    WHERE YEAR(STR_TO_DATE(date, '%d/%m/%y')) = 2022
    GROUP BY branch
),
revenue_2023 AS (
    SELECT
        branch,
        SUM(total) AS revenue
    FROM walmart
    WHERE YEAR(STR_TO_DATE(date, '%d/%m/%y')) = 2023
    GROUP BY branch
)
SELECT
    r2022.branch,
    r2022.revenue AS last_year_revenue,
    r2023.revenue AS current_year_revenue,
    -- decrease ratio in percent, relative to last year's revenue
    ROUND(((r2022.revenue - r2023.revenue) / r2022.revenue) * 100, 2) AS revenue_decrease_ratio
FROM revenue_2022 AS r2022
JOIN revenue_2023 AS r2023 ON r2022.branch = r2023.branch
WHERE r2022.revenue > r2023.revenue
ORDER BY revenue_decrease_ratio DESC
LIMIT 5;
--------------------------------------------------------------------------------
/PSQL Queries.sql:
--------------------------------------------------------------------------------
-- Walmart Project Queries - PostgreSQL

SELECT * FROM walmart;

-- DROP TABLE walmart;

-- Count total records
SELECT COUNT(*) FROM walmart;

-- Count transactions by payment method
SELECT
    payment_method,
    COUNT(*) AS no_payments
FROM walmart
GROUP BY payment_method;

-- Count distinct branches
SELECT COUNT(DISTINCT branch) FROM walmart;

-- Find the minimum quantity sold
SELECT MIN(quantity) FROM walmart;

-- Business Problems
-- Q.1 Find the different payment methods, number of transactions, and quantity sold
SELECT
    payment_method,
    COUNT(*) AS no_payments,
    SUM(quantity) AS no_qty_sold
FROM walmart
GROUP BY payment_method;

-- Q.2 Identify the highest-rated category in each branch,
-- displaying the branch, category, and average rating
SELECT branch, category, avg_rating
FROM (
    SELECT
        branch,
        category,
        AVG(rating) AS avg_rating,
        RANK() OVER (PARTITION BY branch ORDER BY AVG(rating) DESC) AS rnk
    FROM walmart
    GROUP BY branch, category
) AS ranked
WHERE rnk = 1;

-- Q.3 Identify the busiest day for each branch based on the number of transactions
-- Dates are stored as text in DD/MM/YY form, hence TO_DATE(date, 'DD/MM/YY').
SELECT branch, day_name, no_transactions
FROM (
    SELECT
        branch,
        TO_CHAR(TO_DATE(date, 'DD/MM/YY'), 'Day') AS day_name,
        COUNT(*) AS no_transactions,
        RANK() OVER (PARTITION BY branch ORDER BY COUNT(*) DESC) AS rnk
    FROM walmart
    GROUP BY branch, day_name
) AS ranked
WHERE rnk = 1;

-- Q.4 Calculate the total quantity of items sold per payment method.
-- List payment_method and total_quantity.
SELECT
    payment_method,
    SUM(quantity) AS no_qty_sold
FROM walmart
GROUP BY payment_method;

-- Q.5 Determine the average, minimum, and maximum rating of category for each city.
-- List the city, average_rating, min_rating, and max_rating.
SELECT
    city,
    category,
    MIN(rating) AS min_rating,
    MAX(rating) AS max_rating,
    AVG(rating) AS avg_rating
FROM walmart
GROUP BY city, category;

-- Q.6 Calculate the total profit for each category by considering
-- total_profit = (unit_price * quantity * profit_margin) = total * profit_margin
-- (the cleaned data has total = unit_price * quantity precomputed).
-- List category and total_profit, ordered from highest to lowest profit.
SELECT
    category,
    SUM(total) AS total_revenue,
    SUM(total * profit_margin) AS profit
FROM walmart
GROUP BY category
ORDER BY profit DESC;

-- Q.7 Determine the most common payment method for each branch.
-- Display branch and the preferred_payment_method.
WITH cte AS (
    SELECT
        branch,
        payment_method,
        COUNT(*) AS total_trans,
        RANK() OVER (PARTITION BY branch ORDER BY COUNT(*) DESC) AS rnk
    FROM walmart
    GROUP BY branch, payment_method
)
SELECT branch, payment_method AS preferred_payment_method
FROM cte
WHERE rnk = 1;

-- Q.8 Categorize sales into Morning (<12), Afternoon (12-17), and Evening (>17)
-- shifts and count the invoices per branch and shift.
SELECT
    branch,
    CASE
        WHEN EXTRACT(HOUR FROM (time::time)) < 12 THEN 'Morning'
        WHEN EXTRACT(HOUR FROM (time::time)) BETWEEN 12 AND 17 THEN 'Afternoon'
        ELSE 'Evening'
    END AS day_time,
    COUNT(*) AS num_invoices
FROM walmart
GROUP BY branch, day_time
ORDER BY branch, num_invoices DESC;

-- Q.9 Identify the 5 branches with the highest decrease ratio in revenue
-- compared to last year (current year 2023, last year 2022).
-- revenue_decrease_ratio = (last_rev - curr_rev) / last_rev * 100
WITH revenue_2022 AS (
    SELECT
        branch,
        SUM(total) AS revenue
    FROM walmart
    WHERE EXTRACT(YEAR FROM TO_DATE(date, 'DD/MM/YY')) = 2022
    GROUP BY branch
),
revenue_2023 AS (
    SELECT
        branch,
        SUM(total) AS revenue
    FROM walmart
    WHERE EXTRACT(YEAR FROM TO_DATE(date, 'DD/MM/YY')) = 2023
    GROUP BY branch
)
SELECT
    ls.branch,
    ls.revenue AS last_year_revenue,
    cs.revenue AS cr_year_revenue,
    ROUND(
        (ls.revenue - cs.revenue)::numeric
        / ls.revenue::numeric * 100,
        2
    ) AS rev_dec_ratio
FROM revenue_2022 AS ls
JOIN revenue_2023 AS cs ON ls.branch = cs.branch
WHERE ls.revenue > cs.revenue
ORDER BY rev_dec_ratio DESC
LIMIT 5;
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Walmart Data Analysis: End-to-End SQL + Python Project P-9 2 | 3 | ## Project Overview 4 | 5 | ![Project Pipeline](https://github.com/najirh/Walmart_SQL_Python/blob/main/walmart_project-piplelines.png) 6 | 7 | 8 | This project is an end-to-end data analysis solution designed to extract critical business insights from Walmart sales data. We utilize Python for data processing and analysis, SQL for advanced querying, and structured problem-solving techniques to solve key business questions. The project is ideal for data analysts looking to develop skills in data manipulation, SQL querying, and data pipeline creation. 9 | 10 | --- 11 | 12 | ## Project Steps 13 | 14 | ### 1. Set Up the Environment 15 | - **Tools Used**: Visual Studio Code (VS Code), Python, SQL (MySQL and PostgreSQL) 16 | - **Goal**: Create a structured workspace within VS Code and organize project folders for smooth development and data handling. 17 | 18 | ### 2. Set Up Kaggle API 19 | - **API Setup**: Obtain your Kaggle API token from [Kaggle](https://www.kaggle.com/) by navigating to your profile settings and downloading the JSON file. 20 | - **Configure Kaggle**: 21 | - Place the downloaded `kaggle.json` file in your local `.kaggle` folder. 22 | - Use the command `kaggle datasets download -d ` to pull datasets directly into your project. 23 | 24 | ### 3. Download Walmart Sales Data 25 | - **Data Source**: Use the Kaggle API to download the Walmart sales datasets from Kaggle. 26 | - **Dataset Link**: [Walmart Sales Dataset](https://www.kaggle.com/najir0123/walmart-10k-sales-datasets) 27 | - **Storage**: Save the data in the `data/` folder for easy reference and access. 28 | 29 | ### 4. 
Install Required Libraries and Load Data 30 | - **Libraries**: Install necessary Python libraries using: 31 | ```bash 32 | pip install pandas numpy sqlalchemy mysql-connector-python psycopg2 33 | ``` 34 | - **Loading Data**: Read the data into a Pandas DataFrame for initial analysis and transformations. 35 | 36 | ### 5. Explore the Data 37 | - **Goal**: Conduct an initial data exploration to understand data distribution, check column names, types, and identify potential issues. 38 | - **Analysis**: Use functions like `.info()`, `.describe()`, and `.head()` to get a quick overview of the data structure and statistics. 39 | 40 | ### 6. Data Cleaning 41 | - **Remove Duplicates**: Identify and remove duplicate entries to avoid skewed results. 42 | - **Handle Missing Values**: Drop rows or columns with missing values if they are insignificant; fill values where essential. 43 | - **Fix Data Types**: Ensure all columns have consistent data types (e.g., dates as `datetime`, prices as `float`). 44 | - **Currency Formatting**: Use `.replace()` to handle and format currency values for analysis. 45 | - **Validation**: Check for any remaining inconsistencies and verify the cleaned data. 46 | 47 | ### 7. Feature Engineering 48 | - **Create New Columns**: Calculate the `Total Amount` for each transaction by multiplying `unit_price` by `quantity` and adding this as a new column. 49 | - **Enhance Dataset**: Adding this calculated field will streamline further SQL analysis and aggregation tasks. 50 | 51 | ### 8. Load Data into MySQL and PostgreSQL 52 | - **Set Up Connections**: Connect to MySQL and PostgreSQL using `sqlalchemy` and load the cleaned data into each database. 53 | - **Table Creation**: Set up tables in both MySQL and PostgreSQL using Python SQLAlchemy to automate table creation and data insertion. 54 | - **Verification**: Run initial SQL queries to confirm that the data has been loaded accurately. 55 | 56 | ### 9. 
SQL Analysis: Complex Queries and Business Problem Solving 57 | - **Business Problem-Solving**: Write and execute complex SQL queries to answer critical business questions, such as: 58 | - Revenue trends across branches and categories. 59 | - Identifying best-selling product categories. 60 | - Sales performance by time, city, and payment method. 61 | - Analyzing peak sales periods and customer buying patterns. 62 | - Profit margin analysis by branch and category. 63 | - **Documentation**: Keep clear notes of each query's objective, approach, and results. 64 | 65 | ### 10. Project Publishing and Documentation 66 | - **Documentation**: Maintain well-structured documentation of the entire process in Markdown or a Jupyter Notebook. 67 | - **Project Publishing**: Publish the completed project on GitHub or any other version control platform, including: 68 | - The `README.md` file (this document). 69 | - Jupyter Notebooks (if applicable). 70 | - SQL query scripts. 71 | - Data files (if possible) or steps to access them. 72 | 73 | --- 74 | 75 | ## Requirements 76 | 77 | - **Python 3.8+** 78 | - **SQL Databases**: MySQL, PostgreSQL 79 | - **Python Libraries**: 80 | - `pandas`, `numpy`, `sqlalchemy`, `mysql-connector-python`, `psycopg2` 81 | - **Kaggle API Key** (for data downloading) 82 | 83 | ## Getting Started 84 | 85 | 1. Clone the repository: 86 | ```bash 87 | git clone 88 | ``` 89 | 2. Install Python libraries: 90 | ```bash 91 | pip install -r requirements.txt 92 | ``` 93 | 3. Set up your Kaggle API, download the data, and follow the steps to load and analyze. 
94 | 95 | --- 96 | 97 | ## Project Structure 98 | 99 | ```plaintext 100 | |-- data/ # Raw data and transformed data 101 | |-- sql_queries/ # SQL scripts for analysis and queries 102 | |-- notebooks/ # Jupyter notebooks for Python analysis 103 | |-- README.md # Project documentation 104 | |-- requirements.txt # List of required Python libraries 105 | |-- main.py # Main script for loading, cleaning, and processing data 106 | ``` 107 | --- 108 | 109 | ## Results and Insights 110 | 111 | This section will include your analysis findings: 112 | - **Sales Insights**: Key categories, branches with highest sales, and preferred payment methods. 113 | - **Profitability**: Insights into the most profitable product categories and locations. 114 | - **Customer Behavior**: Trends in ratings, payment preferences, and peak shopping hours. 115 | 116 | ## Future Enhancements 117 | 118 | Possible extensions to this project: 119 | - Integration with a dashboard tool (e.g., Power BI or Tableau) for interactive visualization. 120 | - Additional data sources to enhance analysis depth. 121 | - Automation of the data pipeline for real-time data ingestion and analysis. 122 | 123 | --- 124 | 125 | ## License 126 | 127 | This project is licensed under the MIT License. 128 | 129 | --- 130 | 131 | ## Acknowledgments 132 | 133 | - **Data Source**: Kaggle’s Walmart Sales Dataset 134 | - **Inspiration**: Walmart’s business case studies on sales and supply chain optimization. 
135 | 136 | --- 137 | -------------------------------------------------------------------------------- /Walmart Business Problems.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/najirh/Walmart_SQL_Python/af425b2ddc63f298ab0b91707779ea0d5ce78465/Walmart Business Problems.pdf -------------------------------------------------------------------------------- /Walmart Project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/najirh/Walmart_SQL_Python/af425b2ddc63f298ab0b91707779ea0d5ce78465/Walmart Project.png -------------------------------------------------------------------------------- /project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Hello World\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "print(\"Hello World\")" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "**Step 1 Data Exploration & Leading**" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "#importing dependencies\n", 34 | "\n", 35 | "import pandas as pd\n", 36 | "\n", 37 | "#mysql toolkit\n", 38 | "import pymysql #this will work as adapter\n", 39 | "from sqlalchemy import create_engine\n", 40 | "\n", 41 | "#psql\n", 42 | "import psycopg2" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 4, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "2.2.3\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "print(pd.__version__)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 5, 65 | "metadata": 
{}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "(10051, 11)" 71 | ] 72 | }, 73 | "execution_count": 5, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "df = pd.read_csv('Walmart.csv', encoding_errors='ignore')\n", 80 | "\n", 81 | "df.shape" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 6, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/html": [ 92 | "
\n", 93 | "\n", 106 | "\n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | "
invoice_idBranchCitycategoryunit_pricequantitydatetimepayment_methodratingprofit_margin
01WALM003San AntonioHealth and beauty$74.697.005/01/1913:08:00Ewallet9.10.48
12WALM048HarlingenElectronic accessories$15.285.008/03/1910:29:00Cash9.60.48
23WALM067Haltom CityHome and lifestyle$46.337.003/03/1913:23:00Credit card7.40.33
34WALM064BedfordHealth and beauty$58.228.027/01/1920:33:00Ewallet8.40.33
45WALM013IrvingSports and travel$86.317.008/02/1910:37:00Ewallet5.30.48
\n", 196 | "
" 197 | ], 198 | "text/plain": [ 199 | " invoice_id Branch City category unit_price \\\n", 200 | "0 1 WALM003 San Antonio Health and beauty $74.69 \n", 201 | "1 2 WALM048 Harlingen Electronic accessories $15.28 \n", 202 | "2 3 WALM067 Haltom City Home and lifestyle $46.33 \n", 203 | "3 4 WALM064 Bedford Health and beauty $58.22 \n", 204 | "4 5 WALM013 Irving Sports and travel $86.31 \n", 205 | "\n", 206 | " quantity date time payment_method rating profit_margin \n", 207 | "0 7.0 05/01/19 13:08:00 Ewallet 9.1 0.48 \n", 208 | "1 5.0 08/03/19 10:29:00 Cash 9.6 0.48 \n", 209 | "2 7.0 03/03/19 13:23:00 Credit card 7.4 0.33 \n", 210 | "3 8.0 27/01/19 20:33:00 Ewallet 8.4 0.33 \n", 211 | "4 7.0 08/02/19 10:37:00 Ewallet 5.3 0.48 " 212 | ] 213 | }, 214 | "execution_count": 6, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "df.head()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 7, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/html": [ 231 | "
\n", 232 | "\n", 245 | "\n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | "
invoice_idquantityratingprofit_margin
count10051.00000010020.00000010051.00000010051.000000
mean5025.7412202.3534935.8256590.393791
std2901.1743721.6026581.7639910.090669
min1.0000001.0000003.0000000.180000
25%2513.5000001.0000004.0000000.330000
50%5026.0000002.0000006.0000000.330000
75%7538.5000003.0000007.0000000.480000
max10000.00000010.00000010.0000000.570000
\n", 314 | "
" 315 | ], 316 | "text/plain": [ 317 | " invoice_id quantity rating profit_margin\n", 318 | "count 10051.000000 10020.000000 10051.000000 10051.000000\n", 319 | "mean 5025.741220 2.353493 5.825659 0.393791\n", 320 | "std 2901.174372 1.602658 1.763991 0.090669\n", 321 | "min 1.000000 1.000000 3.000000 0.180000\n", 322 | "25% 2513.500000 1.000000 4.000000 0.330000\n", 323 | "50% 5026.000000 2.000000 6.000000 0.330000\n", 324 | "75% 7538.500000 3.000000 7.000000 0.480000\n", 325 | "max 10000.000000 10.000000 10.000000 0.570000" 326 | ] 327 | }, 328 | "execution_count": 7, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "df.describe()" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 8, 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "name": "stdout", 344 | "output_type": "stream", 345 | "text": [ 346 | "\n", 347 | "RangeIndex: 10051 entries, 0 to 10050\n", 348 | "Data columns (total 11 columns):\n", 349 | " # Column Non-Null Count Dtype \n", 350 | "--- ------ -------------- ----- \n", 351 | " 0 invoice_id 10051 non-null int64 \n", 352 | " 1 Branch 10051 non-null object \n", 353 | " 2 City 10051 non-null object \n", 354 | " 3 category 10051 non-null object \n", 355 | " 4 unit_price 10020 non-null object \n", 356 | " 5 quantity 10020 non-null float64\n", 357 | " 6 date 10051 non-null object \n", 358 | " 7 time 10051 non-null object \n", 359 | " 8 payment_method 10051 non-null object \n", 360 | " 9 rating 10051 non-null float64\n", 361 | " 10 profit_margin 10051 non-null float64\n", 362 | "dtypes: float64(3), int64(1), object(7)\n", 363 | "memory usage: 863.9+ KB\n" 364 | ] 365 | } 366 | ], 367 | "source": [ 368 | "df.info()" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 9, 374 | "metadata": {}, 375 | "outputs": [ 376 | { 377 | "data": { 378 | "text/plain": [ 379 | "np.int64(51)" 380 | ] 381 | }, 382 | "execution_count": 9, 383 | "metadata": {}, 384 | 
"output_type": "execute_result" 385 | } 386 | ], 387 | "source": [ 388 | "#all duplicates\n", 389 | "df.duplicated().sum()" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 10, 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "data": { 399 | "text/plain": [ 400 | "np.int64(0)" 401 | ] 402 | }, 403 | "execution_count": 10, 404 | "metadata": {}, 405 | "output_type": "execute_result" 406 | } 407 | ], 408 | "source": [ 409 | "df.drop_duplicates(inplace=True)\n", 410 | "df.duplicated().sum()" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 11, 416 | "metadata": {}, 417 | "outputs": [ 418 | { 419 | "data": { 420 | "text/plain": [ 421 | "(10000, 11)" 422 | ] 423 | }, 424 | "execution_count": 11, 425 | "metadata": {}, 426 | "output_type": "execute_result" 427 | } 428 | ], 429 | "source": [ 430 | "df.shape" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 12, 436 | "metadata": {}, 437 | "outputs": [ 438 | { 439 | "data": { 440 | "text/plain": [ 441 | "invoice_id 0\n", 442 | "Branch 0\n", 443 | "City 0\n", 444 | "category 0\n", 445 | "unit_price 31\n", 446 | "quantity 31\n", 447 | "date 0\n", 448 | "time 0\n", 449 | "payment_method 0\n", 450 | "rating 0\n", 451 | "profit_margin 0\n", 452 | "dtype: int64" 453 | ] 454 | }, 455 | "execution_count": 12, 456 | "metadata": {}, 457 | "output_type": "execute_result" 458 | } 459 | ], 460 | "source": [ 461 | "df.isnull().sum()" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 13, 467 | "metadata": {}, 468 | "outputs": [ 469 | { 470 | "data": { 471 | "text/plain": [ 472 | "invoice_id 0\n", 473 | "Branch 0\n", 474 | "City 0\n", 475 | "category 0\n", 476 | "unit_price 0\n", 477 | "quantity 0\n", 478 | "date 0\n", 479 | "time 0\n", 480 | "payment_method 0\n", 481 | "rating 0\n", 482 | "profit_margin 0\n", 483 | "dtype: int64" 484 | ] 485 | }, 486 | "execution_count": 13, 487 | "metadata": {}, 488 | "output_type": 
"execute_result" 489 | } 490 | ], 491 | "source": [ 492 | "#droppping all rows with missing records\n", 493 | "df.dropna(inplace=True)\n", 494 | "\n", 495 | "# verify\n", 496 | "df.isnull().sum()" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 14, 502 | "metadata": {}, 503 | "outputs": [ 504 | { 505 | "data": { 506 | "text/plain": [ 507 | "(9969, 11)" 508 | ] 509 | }, 510 | "execution_count": 14, 511 | "metadata": {}, 512 | "output_type": "execute_result" 513 | } 514 | ], 515 | "source": [ 516 | "df.shape" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 15, 522 | "metadata": {}, 523 | "outputs": [ 524 | { 525 | "data": { 526 | "text/plain": [ 527 | "invoice_id int64\n", 528 | "Branch object\n", 529 | "City object\n", 530 | "category object\n", 531 | "unit_price object\n", 532 | "quantity float64\n", 533 | "date object\n", 534 | "time object\n", 535 | "payment_method object\n", 536 | "rating float64\n", 537 | "profit_margin float64\n", 538 | "dtype: object" 539 | ] 540 | }, 541 | "execution_count": 15, 542 | "metadata": {}, 543 | "output_type": "execute_result" 544 | } 545 | ], 546 | "source": [ 547 | "df.dtypes" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "df['unit_price'].astype(float)" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": {}, 563 | "outputs": [ 564 | { 565 | "data": { 566 | "text/html": [ 567 | "
\n", 568 | "\n", 581 | "\n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | "
invoice_idbranchcitycategoryunit_pricequantitydatetimepayment_methodratingprofit_margin
01WALM003San AntonioHealth and beauty74.697.005/01/1913:08:00Ewallet9.10.48
12WALM048HarlingenElectronic accessories15.285.008/03/1910:29:00Cash9.60.48
23WALM067Haltom CityHome and lifestyle46.337.003/03/1913:23:00Credit card7.40.33
34WALM064BedfordHealth and beauty58.228.027/01/1920:33:00Ewallet8.40.33
45WALM013IrvingSports and travel86.317.008/02/1910:37:00Ewallet5.30.48
\n", 671 | "
" 672 | ], 673 | "text/plain": [ 674 | " invoice_id branch city category unit_price \\\n", 675 | "0 1 WALM003 San Antonio Health and beauty 74.69 \n", 676 | "1 2 WALM048 Harlingen Electronic accessories 15.28 \n", 677 | "2 3 WALM067 Haltom City Home and lifestyle 46.33 \n", 678 | "3 4 WALM064 Bedford Health and beauty 58.22 \n", 679 | "4 5 WALM013 Irving Sports and travel 86.31 \n", 680 | "\n", 681 | " quantity date time payment_method rating profit_margin \n", 682 | "0 7.0 05/01/19 13:08:00 Ewallet 9.1 0.48 \n", 683 | "1 5.0 08/03/19 10:29:00 Cash 9.6 0.48 \n", 684 | "2 7.0 03/03/19 13:23:00 Credit card 7.4 0.33 \n", 685 | "3 8.0 27/01/19 20:33:00 Ewallet 8.4 0.33 \n", 686 | "4 7.0 08/02/19 10:37:00 Ewallet 5.3 0.48 " 687 | ] 688 | }, 689 | "execution_count": 110, 690 | "metadata": {}, 691 | "output_type": "execute_result" 692 | } 693 | ], 694 | "source": [ 695 | "df['unit_price'] = df['unit_price'].str.replace('$', '').astype(float)\n", 696 | "\n", 697 | "df.head()" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": null, 703 | "metadata": {}, 704 | "outputs": [ 705 | { 706 | "name": "stdout", 707 | "output_type": "stream", 708 | "text": [ 709 | "\n", 710 | "Index: 9969 entries, 0 to 9999\n", 711 | "Data columns (total 11 columns):\n", 712 | " # Column Non-Null Count Dtype \n", 713 | "--- ------ -------------- ----- \n", 714 | " 0 invoice_id 9969 non-null int64 \n", 715 | " 1 Branch 9969 non-null object \n", 716 | " 2 City 9969 non-null object \n", 717 | " 3 category 9969 non-null object \n", 718 | " 4 unit_price 9969 non-null float64\n", 719 | " 5 quantity 9969 non-null float64\n", 720 | " 6 date 9969 non-null object \n", 721 | " 7 time 9969 non-null object \n", 722 | " 8 payment_method 9969 non-null object \n", 723 | " 9 rating 9969 non-null float64\n", 724 | " 10 profit_margin 9969 non-null float64\n", 725 | "dtypes: float64(4), int64(1), object(6)\n", 726 | "memory usage: 934.6+ KB\n" 727 | ] 728 | } 729 | ], 730 | "source": [ 731 | 
"df.info()" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": null, 737 | "metadata": {}, 738 | "outputs": [ 739 | { 740 | "data": { 741 | "text/plain": [ 742 | "Index(['invoice_id', 'Branch', 'City', 'category', 'unit_price', 'quantity',\n", 743 | " 'date', 'time', 'payment_method', 'rating', 'profit_margin'],\n", 744 | " dtype='object')" 745 | ] 746 | }, 747 | "execution_count": 29, 748 | "metadata": {}, 749 | "output_type": "execute_result" 750 | } 751 | ], 752 | "source": [ 753 | "df.columns" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": null, 759 | "metadata": {}, 760 | "outputs": [ 761 | { 762 | "data": { 763 | "text/html": [ 764 | "
\n", 765 | "\n", 778 | "\n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | "
invoice_idbranchcitycategoryunit_pricequantitydatetimepayment_methodratingprofit_margintotal
01WALM003San AntonioHealth and beauty74.697.005/01/1913:08:00Ewallet9.10.48522.83
12WALM048HarlingenElectronic accessories15.285.008/03/1910:29:00Cash9.60.4876.40
23WALM067Haltom CityHome and lifestyle46.337.003/03/1913:23:00Credit card7.40.33324.31
34WALM064BedfordHealth and beauty58.228.027/01/1920:33:00Ewallet8.40.33465.76
45WALM013IrvingSports and travel86.317.008/02/1910:37:00Ewallet5.30.48604.17
\n", 874 | "
" 875 | ], 876 | "text/plain": [ 877 | " invoice_id branch city category unit_price \\\n", 878 | "0 1 WALM003 San Antonio Health and beauty 74.69 \n", 879 | "1 2 WALM048 Harlingen Electronic accessories 15.28 \n", 880 | "2 3 WALM067 Haltom City Home and lifestyle 46.33 \n", 881 | "3 4 WALM064 Bedford Health and beauty 58.22 \n", 882 | "4 5 WALM013 Irving Sports and travel 86.31 \n", 883 | "\n", 884 | " quantity date time payment_method rating profit_margin total \n", 885 | "0 7.0 05/01/19 13:08:00 Ewallet 9.1 0.48 522.83 \n", 886 | "1 5.0 08/03/19 10:29:00 Cash 9.6 0.48 76.40 \n", 887 | "2 7.0 03/03/19 13:23:00 Credit card 7.4 0.33 324.31 \n", 888 | "3 8.0 27/01/19 20:33:00 Ewallet 8.4 0.33 465.76 \n", 889 | "4 7.0 08/02/19 10:37:00 Ewallet 5.3 0.48 604.17 " 890 | ] 891 | }, 892 | "execution_count": 111, 893 | "metadata": {}, 894 | "output_type": "execute_result" 895 | } 896 | ], 897 | "source": [ 898 | "df['total'] = df['unit_price'] * df['quantity']\n", 899 | "df.head()" 900 | ] 901 | }, 902 | { 903 | "cell_type": "markdown", 904 | "metadata": {}, 905 | "source": [ 906 | "**Fixing the column name to lower case**" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": null, 912 | "metadata": {}, 913 | "outputs": [ 914 | { 915 | "data": { 916 | "text/plain": [ 917 | "Index(['invoice_id', 'Branch', 'City', 'category', 'unit_price', 'quantity',\n", 918 | " 'date', 'time', 'payment_method', 'rating', 'profit_margin'],\n", 919 | " dtype='object')" 920 | ] 921 | }, 922 | "metadata": {}, 923 | "output_type": "display_data" 924 | } 925 | ], 926 | "source": [ 927 | "df.columns" 928 | ] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": null, 933 | "metadata": {}, 934 | "outputs": [ 935 | { 936 | "data": { 937 | "text/plain": [ 938 | "Index(['invoice_id', 'branch', 'city', 'category', 'unit_price', 'quantity',\n", 939 | " 'date', 'time', 'payment_method', 'rating', 'profit_margin'],\n", 940 | " dtype='object')" 941 | ] 942 | }, 943 | 
"metadata": {}, 944 | "output_type": "display_data" 945 | } 946 | ], 947 | "source": [ 948 | "df.columns = df.columns.str.lower()\n", 949 | "df.columns" 950 | ] 951 | }, 952 | { 953 | "cell_type": "code", 954 | "execution_count": null, 955 | "metadata": {}, 956 | "outputs": [], 957 | "source": [ 958 | "# mysql \n", 959 | "# host = localhost\n", 960 | "# port = 3306\n", 961 | "# user = root\n", 962 | "# password = 'your_password'\n", 963 | "\n", 964 | "# psql\n", 965 | "# host = localhost\n", 966 | "# port = 5432\n", 967 | "# user = postgres\n", 968 | "# password = 'x0000'" 969 | ] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": null, 974 | "metadata": {}, 975 | "outputs": [ 976 | { 977 | "data": { 978 | "text/plain": [ 979 | "(9969, 12)" 980 | ] 981 | }, 982 | "execution_count": 36, 983 | "metadata": {}, 984 | "output_type": "execute_result" 985 | } 986 | ], 987 | "source": [ 988 | "df.shape" 989 | ] 990 | }, 991 | { 992 | "cell_type": "code", 993 | "execution_count": null, 994 | "metadata": {}, 995 | "outputs": [], 996 | "source": [ 997 | "df.to_csv('walmart_clean_data.csv', index=False)" 998 | ] 999 | }, 1000 | { 1001 | "cell_type": "code", 1002 | "execution_count": null, 1003 | "metadata": {}, 1004 | "outputs": [ 1005 | { 1006 | "name": "stdout", 1007 | "output_type": "stream", 1008 | "text": [ 1009 | "Help on function create_engine in module sqlalchemy.engine.create:\n", 1010 | "\n", 1011 | "create_engine(url: 'Union[str, _url.URL]', **kwargs: 'Any') -> 'Engine'\n", 1012 | " Create a new :class:`_engine.Engine` instance.\n", 1013 | "\n", 1014 | " The standard calling form is to send the :ref:`URL ` as the\n", 1015 | " first positional argument, usually a string\n", 1016 | " that indicates database dialect and connection arguments::\n", 1017 | "\n", 1018 | " engine = create_engine(\"postgresql+psycopg2://scott:tiger@localhost/test\")\n", 1019 | "\n", 1020 | " .. 
note::\n", 1021 | "\n", 1022 | " Please review :ref:`database_urls` for general guidelines in composing\n", 1023 | " URL strings. In particular, special characters, such as those often\n", 1024 | " part of passwords, must be URL encoded to be properly parsed.\n", 1025 | "\n", 1026 | " Additional keyword arguments may then follow it which\n", 1027 | " establish various options on the resulting :class:`_engine.Engine`\n", 1028 | " and its underlying :class:`.Dialect` and :class:`_pool.Pool`\n", 1029 | " constructs::\n", 1030 | "\n", 1031 | " engine = create_engine(\"mysql+mysqldb://scott:tiger@hostname/dbname\",\n", 1032 | " pool_recycle=3600, echo=True)\n", 1033 | "\n", 1034 | " The string form of the URL is\n", 1035 | " ``dialect[+driver]://user:password@host/dbname[?key=value..]``, where\n", 1036 | " ``dialect`` is a database name such as ``mysql``, ``oracle``,\n", 1037 | " ``postgresql``, etc., and ``driver`` the name of a DBAPI, such as\n", 1038 | " ``psycopg2``, ``pyodbc``, ``cx_oracle``, etc. Alternatively,\n", 1039 | " the URL can be an instance of :class:`~sqlalchemy.engine.url.URL`.\n", 1040 | "\n", 1041 | " ``**kwargs`` takes a wide variety of options which are routed\n", 1042 | " towards their appropriate components. Arguments may be specific to\n", 1043 | " the :class:`_engine.Engine`, the underlying :class:`.Dialect`,\n", 1044 | " as well as the\n", 1045 | " :class:`_pool.Pool`. Specific dialects also accept keyword arguments that\n", 1046 | " are unique to that dialect. Here, we describe the parameters\n", 1047 | " that are common to most :func:`_sa.create_engine()` usage.\n", 1048 | "\n", 1049 | " Once established, the newly resulting :class:`_engine.Engine` will\n", 1050 | " request a connection from the underlying :class:`_pool.Pool` once\n", 1051 | " :meth:`_engine.Engine.connect` is called, or a method which depends on it\n", 1052 | " such as :meth:`_engine.Engine.execute` is invoked. 
The\n", 1053 | " :class:`_pool.Pool` in turn\n", 1054 | " will establish the first actual DBAPI connection when this request\n", 1055 | " is received. The :func:`_sa.create_engine` call itself does **not**\n", 1056 | " establish any actual DBAPI connections directly.\n", 1057 | "\n", 1058 | " .. seealso::\n", 1059 | "\n", 1060 | " :doc:`/core/engines`\n", 1061 | "\n", 1062 | " :doc:`/dialects/index`\n", 1063 | "\n", 1064 | " :ref:`connections_toplevel`\n", 1065 | "\n", 1066 | " :param connect_args: a dictionary of options which will be\n", 1067 | " passed directly to the DBAPI's ``connect()`` method as\n", 1068 | " additional keyword arguments. See the example\n", 1069 | " at :ref:`custom_dbapi_args`.\n", 1070 | "\n", 1071 | " :param creator: a callable which returns a DBAPI connection.\n", 1072 | " This creation function will be passed to the underlying\n", 1073 | " connection pool and will be used to create all new database\n", 1074 | " connections. Usage of this function causes connection\n", 1075 | " parameters specified in the URL argument to be bypassed.\n", 1076 | "\n", 1077 | " This hook is not as flexible as the newer\n", 1078 | " :meth:`_events.DialectEvents.do_connect` hook which allows complete\n", 1079 | " control over how a connection is made to the database, given the full\n", 1080 | " set of URL arguments and state beforehand.\n", 1081 | "\n", 1082 | " .. seealso::\n", 1083 | "\n", 1084 | " :meth:`_events.DialectEvents.do_connect` - event hook that allows\n", 1085 | " full control over DBAPI connection mechanics.\n", 1086 | "\n", 1087 | " :ref:`custom_dbapi_args`\n", 1088 | "\n", 1089 | " :param echo=False: if True, the Engine will log all statements\n", 1090 | " as well as a ``repr()`` of their parameter lists to the default log\n", 1091 | " handler, which defaults to ``sys.stdout`` for output. If set to the\n", 1092 | " string ``\"debug\"``, result rows will be printed to the standard output\n", 1093 | " as well. 
The ``echo`` attribute of ``Engine`` can be modified at any\n", 1094 | " time to turn logging on and off; direct control of logging is also\n", 1095 | " available using the standard Python ``logging`` module.\n", 1096 | "\n", 1097 | " .. seealso::\n", 1098 | "\n", 1099 | " :ref:`dbengine_logging` - further detail on how to configure\n", 1100 | " logging.\n", 1101 | "\n", 1102 | "\n", 1103 | " :param echo_pool=False: if True, the connection pool will log\n", 1104 | " informational output such as when connections are invalidated\n", 1105 | " as well as when connections are recycled to the default log handler,\n", 1106 | " which defaults to ``sys.stdout`` for output. If set to the string\n", 1107 | " ``\"debug\"``, the logging will include pool checkouts and checkins.\n", 1108 | " Direct control of logging is also available using the standard Python\n", 1109 | " ``logging`` module.\n", 1110 | "\n", 1111 | " .. seealso::\n", 1112 | "\n", 1113 | " :ref:`dbengine_logging` - further detail on how to configure\n", 1114 | " logging.\n", 1115 | "\n", 1116 | "\n", 1117 | " :param empty_in_strategy: No longer used; SQLAlchemy now uses\n", 1118 | " \"empty set\" behavior for IN in all cases.\n", 1119 | "\n", 1120 | " .. deprecated:: 1.4 The :paramref:`_sa.create_engine.empty_in_strategy` keyword is deprecated, and no longer has any effect. All IN expressions are now rendered using the \"expanding parameter\" strategy which renders a set of boundexpressions, or an \"empty set\" SELECT, at statement executiontime.\n", 1121 | "\n", 1122 | "\n", 1123 | "\n", 1124 | " :param enable_from_linting: defaults to True. Will emit a warning\n", 1125 | " if a given SELECT statement is found to have un-linked FROM elements\n", 1126 | " which would cause a cartesian product.\n", 1127 | "\n", 1128 | " .. versionadded:: 1.4\n", 1129 | "\n", 1130 | " .. 
seealso::\n", 1131 | "\n", 1132 | " :ref:`change_4737`\n", 1133 | "\n", 1134 | " :param execution_options: Dictionary execution options which will\n", 1135 | " be applied to all connections. See\n", 1136 | " :meth:`~sqlalchemy.engine.Connection.execution_options`\n", 1137 | "\n", 1138 | " :param future: Use the 2.0 style :class:`_engine.Engine` and\n", 1139 | " :class:`_engine.Connection` API.\n", 1140 | "\n", 1141 | " As of SQLAlchemy 2.0, this parameter is present for backwards\n", 1142 | " compatibility only and must remain at its default value of ``True``.\n", 1143 | "\n", 1144 | " The :paramref:`_sa.create_engine.future` parameter will be\n", 1145 | " deprecated in a subsequent 2.x release and eventually removed.\n", 1146 | "\n", 1147 | " .. versionadded:: 1.4\n", 1148 | "\n", 1149 | " .. versionchanged:: 2.0 All :class:`_engine.Engine` objects are\n", 1150 | " \"future\" style engines and there is no longer a ``future=False``\n", 1151 | " mode of operation.\n", 1152 | "\n", 1153 | " .. seealso::\n", 1154 | "\n", 1155 | " :ref:`migration_20_toplevel`\n", 1156 | "\n", 1157 | " :param hide_parameters: Boolean, when set to True, SQL statement parameters\n", 1158 | " will not be displayed in INFO logging nor will they be formatted into\n", 1159 | " the string representation of :class:`.StatementError` objects.\n", 1160 | "\n", 1161 | " .. versionadded:: 1.3.8\n", 1162 | "\n", 1163 | " .. seealso::\n", 1164 | "\n", 1165 | " :ref:`dbengine_logging` - further detail on how to configure\n", 1166 | " logging.\n", 1167 | "\n", 1168 | " :param implicit_returning=True: Legacy parameter that may only be set\n", 1169 | " to True. In SQLAlchemy 2.0, this parameter does nothing. 
In order to\n", 1170 | " disable \"implicit returning\" for statements invoked by the ORM,\n", 1171 | " configure this on a per-table basis using the\n", 1172 | " :paramref:`.Table.implicit_returning` parameter.\n", 1173 | "\n", 1174 | "\n", 1175 | " :param insertmanyvalues_page_size: number of rows to format into an\n", 1176 | " INSERT statement when the statement uses \"insertmanyvalues\" mode, which is\n", 1177 | " a paged form of bulk insert that is used for many backends when using\n", 1178 | " :term:`executemany` execution typically in conjunction with RETURNING.\n", 1179 | " Defaults to 1000, but may also be subject to dialect-specific limiting\n", 1180 | " factors which may override this value on a per-statement basis.\n", 1181 | "\n", 1182 | " .. versionadded:: 2.0\n", 1183 | "\n", 1184 | " .. seealso::\n", 1185 | "\n", 1186 | " :ref:`engine_insertmanyvalues`\n", 1187 | "\n", 1188 | " :ref:`engine_insertmanyvalues_page_size`\n", 1189 | "\n", 1190 | " :paramref:`_engine.Connection.execution_options.insertmanyvalues_page_size`\n", 1191 | "\n", 1192 | " :param isolation_level: optional string name of an isolation level\n", 1193 | " which will be set on all new connections unconditionally.\n", 1194 | " Isolation levels are typically some subset of the string names\n", 1195 | " ``\"SERIALIZABLE\"``, ``\"REPEATABLE READ\"``,\n", 1196 | " ``\"READ COMMITTED\"``, ``\"READ UNCOMMITTED\"`` and ``\"AUTOCOMMIT\"``\n", 1197 | " based on backend.\n", 1198 | "\n", 1199 | " The :paramref:`_sa.create_engine.isolation_level` parameter is\n", 1200 | " in contrast to the\n", 1201 | " :paramref:`.Connection.execution_options.isolation_level`\n", 1202 | " execution option, which may be set on an individual\n", 1203 | " :class:`.Connection`, as well as the same parameter passed to\n", 1204 | " :meth:`.Engine.execution_options`, where it may be used to create\n", 1205 | " multiple engines with different isolation levels that share a common\n", 1206 | " connection pool and 
dialect.\n", 1207 | "\n", 1208 | " .. versionchanged:: 2.0 The\n", 1209 | " :paramref:`_sa.create_engine.isolation_level`\n", 1210 | " parameter has been generalized to work on all dialects which support\n", 1211 | " the concept of isolation level, and is provided as a more succinct,\n", 1212 | " up front configuration switch in contrast to the execution option\n", 1213 | " which is more of an ad-hoc programmatic option.\n", 1214 | "\n", 1215 | " .. seealso::\n", 1216 | "\n", 1217 | " :ref:`dbapi_autocommit`\n", 1218 | "\n", 1219 | " :param json_deserializer: for dialects that support the\n", 1220 | " :class:`_types.JSON`\n", 1221 | " datatype, this is a Python callable that will convert a JSON string\n", 1222 | " to a Python object. By default, the Python ``json.loads`` function is\n", 1223 | " used.\n", 1224 | "\n", 1225 | " .. versionchanged:: 1.3.7 The SQLite dialect renamed this from\n", 1226 | " ``_json_deserializer``.\n", 1227 | "\n", 1228 | " :param json_serializer: for dialects that support the :class:`_types.JSON`\n", 1229 | " datatype, this is a Python callable that will render a given object\n", 1230 | " as JSON. By default, the Python ``json.dumps`` function is used.\n", 1231 | "\n", 1232 | " .. versionchanged:: 1.3.7 The SQLite dialect renamed this from\n", 1233 | " ``_json_serializer``.\n", 1234 | "\n", 1235 | "\n", 1236 | " :param label_length=None: optional integer value which limits\n", 1237 | " the size of dynamically generated column labels to that many\n", 1238 | " characters. If less than 6, labels are generated as\n", 1239 | " \"_(counter)\". If ``None``, the value of\n", 1240 | " ``dialect.max_identifier_length``, which may be affected via the\n", 1241 | " :paramref:`_sa.create_engine.max_identifier_length` parameter,\n", 1242 | " is used instead. 
The value of\n", 1243 | " :paramref:`_sa.create_engine.label_length`\n", 1244 | " may not be larger than that of\n", 1245 | " :paramref:`_sa.create_engine.max_identfier_length`.\n", 1246 | "\n", 1247 | " .. seealso::\n", 1248 | "\n", 1249 | " :paramref:`_sa.create_engine.max_identifier_length`\n", 1250 | "\n", 1251 | " :param logging_name: String identifier which will be used within\n", 1252 | " the \"name\" field of logging records generated within the\n", 1253 | " \"sqlalchemy.engine\" logger. Defaults to a hexstring of the\n", 1254 | " object's id.\n", 1255 | "\n", 1256 | " .. seealso::\n", 1257 | "\n", 1258 | " :ref:`dbengine_logging` - further detail on how to configure\n", 1259 | " logging.\n", 1260 | "\n", 1261 | " :paramref:`_engine.Connection.execution_options.logging_token`\n", 1262 | "\n", 1263 | " :param max_identifier_length: integer; override the max_identifier_length\n", 1264 | " determined by the dialect. if ``None`` or zero, has no effect. This\n", 1265 | " is the database's configured maximum number of characters that may be\n", 1266 | " used in a SQL identifier such as a table name, column name, or label\n", 1267 | " name. All dialects determine this value automatically, however in the\n", 1268 | " case of a new database version for which this value has changed but\n", 1269 | " SQLAlchemy's dialect has not been adjusted, the value may be passed\n", 1270 | " here.\n", 1271 | "\n", 1272 | " .. versionadded:: 1.3.9\n", 1273 | "\n", 1274 | " .. seealso::\n", 1275 | "\n", 1276 | " :paramref:`_sa.create_engine.label_length`\n", 1277 | "\n", 1278 | " :param max_overflow=10: the number of connections to allow in\n", 1279 | " connection pool \"overflow\", that is connections that can be\n", 1280 | " opened above and beyond the pool_size setting, which defaults\n", 1281 | " to five. 
this is only used with :class:`~sqlalchemy.pool.QueuePool`.\n", 1282 | "\n", 1283 | " :param module=None: reference to a Python module object (the module\n", 1284 | " itself, not its string name). Specifies an alternate DBAPI module to\n", 1285 | " be used by the engine's dialect. Each sub-dialect references a\n", 1286 | " specific DBAPI which will be imported before first connect. This\n", 1287 | " parameter causes the import to be bypassed, and the given module to\n", 1288 | " be used instead. Can be used for testing of DBAPIs as well as to\n", 1289 | " inject \"mock\" DBAPI implementations into the :class:`_engine.Engine`.\n", 1290 | "\n", 1291 | " :param paramstyle=None: The `paramstyle `_\n", 1292 | " to use when rendering bound parameters. This style defaults to the\n", 1293 | " one recommended by the DBAPI itself, which is retrieved from the\n", 1294 | " ``.paramstyle`` attribute of the DBAPI. However, most DBAPIs accept\n", 1295 | " more than one paramstyle, and in particular it may be desirable\n", 1296 | " to change a \"named\" paramstyle into a \"positional\" one, or vice versa.\n", 1297 | " When this attribute is passed, it should be one of the values\n", 1298 | " ``\"qmark\"``, ``\"numeric\"``, ``\"named\"``, ``\"format\"`` or\n", 1299 | " ``\"pyformat\"``, and should correspond to a parameter style known\n", 1300 | " to be supported by the DBAPI in use.\n", 1301 | "\n", 1302 | " :param pool=None: an already-constructed instance of\n", 1303 | " :class:`~sqlalchemy.pool.Pool`, such as a\n", 1304 | " :class:`~sqlalchemy.pool.QueuePool` instance. If non-None, this\n", 1305 | " pool will be used directly as the underlying connection pool\n", 1306 | " for the engine, bypassing whatever connection parameters are\n", 1307 | " present in the URL argument. 
For information on constructing\n", 1308 | " connection pools manually, see :ref:`pooling_toplevel`.\n", 1309 | "\n", 1310 | " :param poolclass=None: a :class:`~sqlalchemy.pool.Pool`\n", 1311 | " subclass, which will be used to create a connection pool\n", 1312 | " instance using the connection parameters given in the URL. Note\n", 1313 | " this differs from ``pool`` in that you don't actually\n", 1314 | " instantiate the pool in this case, you just indicate what type\n", 1315 | " of pool to be used.\n", 1316 | "\n", 1317 | " :param pool_logging_name: String identifier which will be used within\n", 1318 | " the \"name\" field of logging records generated within the\n", 1319 | " \"sqlalchemy.pool\" logger. Defaults to a hexstring of the object's\n", 1320 | " id.\n", 1321 | "\n", 1322 | " .. seealso::\n", 1323 | "\n", 1324 | " :ref:`dbengine_logging` - further detail on how to configure\n", 1325 | " logging.\n", 1326 | "\n", 1327 | " :param pool_pre_ping: boolean, if True will enable the connection pool\n", 1328 | " \"pre-ping\" feature that tests connections for liveness upon\n", 1329 | " each checkout.\n", 1330 | "\n", 1331 | " .. versionadded:: 1.2\n", 1332 | "\n", 1333 | " .. seealso::\n", 1334 | "\n", 1335 | " :ref:`pool_disconnects_pessimistic`\n", 1336 | "\n", 1337 | " :param pool_size=5: the number of connections to keep open\n", 1338 | " inside the connection pool. This used with\n", 1339 | " :class:`~sqlalchemy.pool.QueuePool` as\n", 1340 | " well as :class:`~sqlalchemy.pool.SingletonThreadPool`. With\n", 1341 | " :class:`~sqlalchemy.pool.QueuePool`, a ``pool_size`` setting\n", 1342 | " of 0 indicates no limit; to disable pooling, set ``poolclass`` to\n", 1343 | " :class:`~sqlalchemy.pool.NullPool` instead.\n", 1344 | "\n", 1345 | " :param pool_recycle=-1: this setting causes the pool to recycle\n", 1346 | " connections after the given number of seconds has passed. It\n", 1347 | " defaults to -1, or no timeout. 
For example, setting to 3600\n", 1348 | " means connections will be recycled after one hour. Note that\n", 1349 | " MySQL in particular will disconnect automatically if no\n", 1350 | " activity is detected on a connection for eight hours (although\n", 1351 | " this is configurable with the MySQLDB connection itself and the\n", 1352 | " server configuration as well).\n", 1353 | "\n", 1354 | " .. seealso::\n", 1355 | "\n", 1356 | " :ref:`pool_setting_recycle`\n", 1357 | "\n", 1358 | " :param pool_reset_on_return='rollback': set the\n", 1359 | " :paramref:`_pool.Pool.reset_on_return` parameter of the underlying\n", 1360 | " :class:`_pool.Pool` object, which can be set to the values\n", 1361 | " ``\"rollback\"``, ``\"commit\"``, or ``None``.\n", 1362 | "\n", 1363 | " .. seealso::\n", 1364 | "\n", 1365 | " :ref:`pool_reset_on_return`\n", 1366 | "\n", 1367 | " :param pool_timeout=30: number of seconds to wait before giving\n", 1368 | " up on getting a connection from the pool. This is only used\n", 1369 | " with :class:`~sqlalchemy.pool.QueuePool`. This can be a float but is\n", 1370 | " subject to the limitations of Python time functions which may not be\n", 1371 | " reliable in the tens of milliseconds.\n", 1372 | "\n", 1373 | " .. note: don't use 30.0 above, it seems to break with the :param tag\n", 1374 | "\n", 1375 | " :param pool_use_lifo=False: use LIFO (last-in-first-out) when retrieving\n", 1376 | " connections from :class:`.QueuePool` instead of FIFO\n", 1377 | " (first-in-first-out). Using LIFO, a server-side timeout scheme can\n", 1378 | " reduce the number of connections used during non- peak periods of\n", 1379 | " use. When planning for server-side timeouts, ensure that a recycle or\n", 1380 | " pre-ping strategy is in use to gracefully handle stale connections.\n", 1381 | "\n", 1382 | " .. versionadded:: 1.3\n", 1383 | "\n", 1384 | " .. 
seealso::\n", 1385 | "\n", 1386 | " :ref:`pool_use_lifo`\n", 1387 | "\n", 1388 | " :ref:`pool_disconnects`\n", 1389 | "\n", 1390 | " :param plugins: string list of plugin names to load. See\n", 1391 | " :class:`.CreateEnginePlugin` for background.\n", 1392 | "\n", 1393 | " .. versionadded:: 1.2.3\n", 1394 | "\n", 1395 | " :param query_cache_size: size of the cache used to cache the SQL string\n", 1396 | " form of queries. Set to zero to disable caching.\n", 1397 | "\n", 1398 | " The cache is pruned of its least recently used items when its size reaches\n", 1399 | " N * 1.5. Defaults to 500, meaning the cache will always store at least\n", 1400 | " 500 SQL statements when filled, and will grow up to 750 items at which\n", 1401 | " point it is pruned back down to 500 by removing the 250 least recently\n", 1402 | " used items.\n", 1403 | "\n", 1404 | " Caching is accomplished on a per-statement basis by generating a\n", 1405 | " cache key that represents the statement's structure, then generating\n", 1406 | " string SQL for the current dialect only if that key is not present\n", 1407 | " in the cache. All statements support caching, however some features\n", 1408 | " such as an INSERT with a large set of parameters will intentionally\n", 1409 | " bypass the cache. SQL logging will indicate statistics for each\n", 1410 | " statement whether or not it were pull from the cache.\n", 1411 | "\n", 1412 | " .. note:: some ORM functions related to unit-of-work persistence as well\n", 1413 | " as some attribute loading strategies will make use of individual\n", 1414 | " per-mapper caches outside of the main cache.\n", 1415 | "\n", 1416 | "\n", 1417 | " .. seealso::\n", 1418 | "\n", 1419 | " :ref:`sql_caching`\n", 1420 | "\n", 1421 | " .. versionadded:: 1.4\n", 1422 | "\n", 1423 | " :param use_insertmanyvalues: True by default, use the \"insertmanyvalues\"\n", 1424 | " execution style for INSERT..RETURNING statements by default.\n", 1425 | "\n", 1426 | " .. 
versionadded:: 2.0\n", 1427 | "\n", 1428 | " .. seealso::\n", 1429 | "\n", 1430 | " :ref:`engine_insertmanyvalues`\n", 1431 | "\n" 1432 | ] 1433 | } 1434 | ], 1435 | "source": [ 1436 | "help(create_engine)" 1437 | ] 1438 | }, 1439 | { 1440 | "cell_type": "code", 1441 | "execution_count": null, 1442 | "metadata": {}, 1443 | "outputs": [ 1444 | { 1445 | "name": "stdout", 1446 | "output_type": "stream", 1447 | "text": [ 1448 | "Connection Successed to mysql\n" 1449 | ] 1450 | } 1451 | ], 1452 | "source": [ 1453 | "#mysql connection\n", 1454 | "# \"mysql+pymysql://user:password@localhost:3306/db_name\"\n", 1455 | "engine_mysql = create_engine(\"mysql+pymysql://root@localhost:3306/walmart_db\")\n", 1456 | "\n", 1457 | "try:\n", 1458 | " engine_mysql\n", 1459 | " print(\"Connection Successed to mysql\")\n", 1460 | "except:\n", 1461 | " print(\"Unable to connect\")" 1462 | ] 1463 | }, 1464 | { 1465 | "cell_type": "code", 1466 | "execution_count": null, 1467 | "metadata": {}, 1468 | "outputs": [ 1469 | { 1470 | "data": { 1471 | "text/plain": [ 1472 | "9969" 1473 | ] 1474 | }, 1475 | "execution_count": 44, 1476 | "metadata": {}, 1477 | "output_type": "execute_result" 1478 | } 1479 | ], 1480 | "source": [ 1481 | "df.to_sql(name='walmart', con=engine_mysql, if_exists='append', index=False)" 1482 | ] 1483 | }, 1484 | { 1485 | "cell_type": "code", 1486 | "execution_count": null, 1487 | "metadata": {}, 1488 | "outputs": [ 1489 | { 1490 | "data": { 1491 | "text/plain": [ 1492 | "(9969, 12)" 1493 | ] 1494 | }, 1495 | "execution_count": 45, 1496 | "metadata": {}, 1497 | "output_type": "execute_result" 1498 | } 1499 | ], 1500 | "source": [ 1501 | "df.shape" 1502 | ] 1503 | }, 1504 | { 1505 | "cell_type": "code", 1506 | "execution_count": null, 1507 | "metadata": {}, 1508 | "outputs": [ 1509 | { 1510 | "name": "stdout", 1511 | "output_type": "stream", 1512 | "text": [ 1513 | "Connection Successed to PSQL\n" 1514 | ] 1515 | } 1516 | ], 1517 | "source": [ 1518 | "#psql connection\n", 
1519 | "# \"mysql+pymysql://user:password@localhost:3306/db_name\"\n", 1520 | "engine_psql = create_engine(\"postgresql+psycopg2://postgres:x0000@localhost:5432/walmart_db\")\n", 1521 | "\n", 1522 | "try:\n", 1523 | " engine_psql\n", 1524 | " print(\"Connection Successed to PSQL\")\n", 1525 | "except:\n", 1526 | " print(\"Unable to connect\")" 1527 | ] 1528 | }, 1529 | { 1530 | "cell_type": "code", 1531 | "execution_count": null, 1532 | "metadata": {}, 1533 | "outputs": [ 1534 | { 1535 | "data": { 1536 | "text/plain": [ 1537 | "969" 1538 | ] 1539 | }, 1540 | "execution_count": 113, 1541 | "metadata": {}, 1542 | "output_type": "execute_result" 1543 | } 1544 | ], 1545 | "source": [ 1546 | "df.to_sql(name='walmart', con=engine_psql, if_exists='replace', index=False)" 1547 | ] 1548 | }, 1549 | { 1550 | "cell_type": "code", 1551 | "execution_count": 17, 1552 | "metadata": {}, 1553 | "outputs": [], 1554 | "source": [ 1555 | "df.to_csv('walmart_clean_data.csv', index=False)" 1556 | ] 1557 | } 1558 | ], 1559 | "metadata": { 1560 | "kernelspec": { 1561 | "display_name": "my_env1", 1562 | "language": "python", 1563 | "name": "python3" 1564 | }, 1565 | "language_info": { 1566 | "codemirror_mode": { 1567 | "name": "ipython", 1568 | "version": 3 1569 | }, 1570 | "file_extension": ".py", 1571 | "mimetype": "text/x-python", 1572 | "name": "python", 1573 | "nbconvert_exporter": "python", 1574 | "pygments_lexer": "ipython3", 1575 | "version": "3.12.6" 1576 | } 1577 | }, 1578 | "nbformat": 4, 1579 | "nbformat_minor": 2 1580 | } 1581 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | pymysql 3 | sqlalchemy 4 | psycopg2 5 | 6 | 7 | #Instructions & Termnal Commands 8 | 9 | 10 | # Environment Setup 11 | 12 | ## macOS Terminal Commands 13 | ```bash 14 | # Install Homebrew if not installed 15 | /bin/bash -c "$(curl -fsSL 
https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" 16 | 17 | # Install Python (if not installed) 18 | brew install python 19 | 20 | # Install pip (Python package installer) 21 | sudo easy_install pip 22 | ``` 23 | 24 | ## Windows Command Prompt Commands 25 | ```cmd 26 | :: Install Python from the official website if not installed. 27 | :: Make sure to check "Add Python to PATH" during installation. 28 | 29 | :: Install pip if not installed (comes with Python installations). 30 | python -m ensurepip --upgrade 31 | ``` 32 | 33 | # Kaggle API Setup 34 | 35 | ## Both macOS and Windows Commands 36 | ```bash 37 | # Create a Kaggle account if you don't have one. 38 | # Go to your account settings and click on "Create New API Token". 39 | # This will download a file called kaggle.json. 40 | 41 | # Create a directory for Kaggle configuration 42 | mkdir ~/.kaggle # macOS 43 | mkdir %USERPROFILE%\.kaggle # Windows 44 | 45 | # Move kaggle.json to the Kaggle directory 46 | # Use the following commands to copy the kaggle.json file: 47 | mv ~/Downloads/kaggle.json ~/.kaggle/ # macOS 48 | copy %USERPROFILE%\Downloads\kaggle.json %USERPROFILE%\.kaggle\ # Windows 49 | 50 | # Set the permissions for kaggle.json 51 | chmod 600 ~/.kaggle/kaggle.json # macOS 52 | ``` 53 | 54 | # Download Datasets 55 | 56 | ## Both macOS and Windows Commands 57 | ```bash 58 | # Navigate to the directory where you want to download the dataset 59 | cd path/to/your/directory 60 | 61 | # Example: Download a dataset (replace 'dataset-name' with the actual dataset slug) 62 | kaggle datasets download -d dataset-name 63 | 64 | # Unzip the dataset 65 | unzip dataset-name.zip # This will extract the files in the current directory 66 | ``` 67 | 68 | # Install Required Libraries 69 | 70 | ## Both macOS and Windows Commands 71 | ```bash 72 | # Install the necessary Python libraries 73 | pip install pandas numpy matplotlib seaborn scikit-learn 74 | ``` 75 | 76 | # Summary of Commands 77 | 1. 
**Environment Setup:** Install Homebrew (macOS) or Python (Windows). 78 | 2. **Kaggle API Setup:** Generate API token and configure kaggle.json. 79 | 3. **Download Datasets:** Use Kaggle API commands to download datasets and unzip them. 80 | 4. **Install Libraries:** Use pip to install required libraries for data analysis. 81 | 82 | 83 | -------------------------------------------------------------------------------- /walmart_project-piplelines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/najirh/Walmart_SQL_Python/af425b2ddc63f298ab0b91707779ea0d5ce78465/walmart_project-piplelines.png --------------------------------------------------------------------------------