├── MySQL Queries.sql ├── PSQL Queries.sql ├── README.md ├── Walmart Business Problems.pdf ├── Walmart Project.png ├── Walmart.csv ├── project.ipynb ├── requirements.txt ├── walmart_clean_data.csv └── walmart_project-piplelines.png
/MySQL Queries.sql:
--------------------------------------------------------------------------------
-- Walmart Project Queries - MySQL

SELECT * FROM walmart;

-- DROP TABLE walmart;

-- Count total records
SELECT COUNT(*) FROM walmart;

-- Count payment methods and number of transactions by payment method
SELECT
    payment_method,
    COUNT(*) AS no_payments
FROM walmart
GROUP BY payment_method;

-- Count distinct branches
SELECT COUNT(DISTINCT branch) FROM walmart;

-- Find the minimum quantity sold
SELECT MIN(quantity) FROM walmart;

-- Q1: Find different payment methods, number of transactions, and quantity sold by payment method
SELECT
    payment_method,
    COUNT(*) AS no_payments,
    SUM(quantity) AS no_qty_sold
FROM walmart
GROUP BY payment_method;

-- Q2: Identify the highest-rated category in each branch
-- Display the branch, category, and avg rating
-- NOTE: RANK is a reserved keyword in MySQL 8.0+, so the alias is `rnk`.
SELECT branch, category, avg_rating
FROM (
    SELECT
        branch,
        category,
        AVG(rating) AS avg_rating,
        RANK() OVER (PARTITION BY branch ORDER BY AVG(rating) DESC) AS rnk
    FROM walmart
    GROUP BY branch, category
) AS ranked
WHERE rnk = 1;

-- Q3: Identify the busiest day for each branch based on the number of transactions
-- Dates are stored as text in DD/MM/YY form (e.g. '05/01/19'),
-- so the format string must use %y (2-digit year), not %Y.
SELECT branch, day_name, no_transactions
FROM (
    SELECT
        branch,
        DAYNAME(STR_TO_DATE(date, '%d/%m/%y')) AS day_name,
        COUNT(*) AS no_transactions,
        RANK() OVER (PARTITION BY branch ORDER BY COUNT(*) DESC) AS rnk
    FROM walmart
    GROUP BY branch, day_name
) AS ranked
WHERE rnk = 1;

-- Q4: Calculate the total quantity of items sold per payment method
SELECT
    payment_method,
    SUM(quantity) AS no_qty_sold
FROM walmart
GROUP BY payment_method;

-- Q5: Determine the average, minimum, and maximum rating of categories for each city
SELECT
    city,
    category,
    MIN(rating) AS min_rating,
    MAX(rating) AS max_rating,
    AVG(rating) AS avg_rating
FROM walmart
GROUP BY city, category;

-- Q6: Calculate the total profit for each category
-- total_profit = unit_price * quantity * profit_margin
SELECT
    category,
    SUM(unit_price * quantity * profit_margin) AS total_profit
FROM walmart
GROUP BY category
ORDER BY total_profit DESC;

-- Q7: Determine the most common payment method for each branch
WITH cte AS (
    SELECT
        branch,
        payment_method,
        COUNT(*) AS total_trans,
        RANK() OVER (PARTITION BY branch ORDER BY COUNT(*) DESC) AS rnk
    FROM walmart
    GROUP BY branch, payment_method
)
SELECT branch, payment_method AS preferred_payment_method
FROM cte
WHERE rnk = 1;

-- Q8: Categorize sales into Morning (<12), Afternoon (12-17), and Evening (>17) shifts
SELECT
    branch,
    CASE
        WHEN HOUR(TIME(time)) < 12 THEN 'Morning'
        WHEN HOUR(TIME(time)) BETWEEN 12 AND 17 THEN 'Afternoon'
        ELSE 'Evening'
    END AS shift,
    COUNT(*) AS num_invoices
FROM walmart
GROUP BY branch, shift
ORDER BY branch, num_invoices DESC;

-- Q9: Identify the 5 branches with the highest revenue decrease ratio
-- from last year to current year (e.g., 2022 to 2023)
WITH revenue_2022 AS (
    SELECT
        branch,
        SUM(total) AS revenue
    FROM walmart
    WHERE YEAR(STR_TO_DATE(date, '%d/%m/%y')) = 2022
    GROUP BY branch
),
revenue_2023 AS (
    SELECT
        branch,
        SUM(total) AS revenue
    FROM walmart
    WHERE YEAR(STR_TO_DATE(date, '%d/%m/%y')) = 2023
    GROUP BY branch
)
SELECT
    r2022.branch,
    r2022.revenue AS last_year_revenue,
    r2023.revenue AS current_year_revenue,
    -- decrease ratio in percent, relative to last year's revenue
    ROUND(((r2022.revenue - r2023.revenue) / r2022.revenue) * 100, 2) AS revenue_decrease_ratio
FROM revenue_2022 AS r2022
JOIN revenue_2023 AS r2023 ON r2022.branch = r2023.branch
WHERE r2022.revenue > r2023.revenue
ORDER BY revenue_decrease_ratio DESC
LIMIT 5;
--------------------------------------------------------------------------------
/PSQL Queries.sql:
--------------------------------------------------------------------------------
-- Walmart Project Queries - PostgreSQL

SELECT * FROM walmart;

-- DROP TABLE walmart;

-- Count total records
SELECT COUNT(*) FROM walmart;

-- Count transactions by payment method
SELECT
    payment_method,
    COUNT(*) AS no_payments
FROM walmart
GROUP BY payment_method;

-- Count distinct branches
SELECT COUNT(DISTINCT branch) FROM walmart;

-- Find the minimum quantity sold
SELECT MIN(quantity) FROM walmart;

-- Business Problems
-- Q.1 Find the different payment methods, number of transactions, and quantity sold
SELECT
    payment_method,
    COUNT(*) AS no_payments,
    SUM(quantity) AS no_qty_sold
FROM walmart
GROUP BY payment_method;

-- Q.2 Identify the highest-rated category in each branch,
-- displaying the branch, category, and average rating
SELECT branch, category, avg_rating
FROM (
    SELECT
        branch,
        category,
        AVG(rating) AS avg_rating,
        RANK() OVER (PARTITION BY branch ORDER BY AVG(rating) DESC) AS rnk
    FROM walmart
    GROUP BY branch, category
) AS ranked
WHERE rnk = 1;

-- Q.3 Identify the busiest day for each branch based on the number of transactions
-- Dates are stored as text in DD/MM/YY form, hence TO_DATE(date, 'DD/MM/YY').
SELECT branch, day_name, no_transactions
FROM (
    SELECT
        branch,
        TO_CHAR(TO_DATE(date, 'DD/MM/YY'), 'Day') AS day_name,
        COUNT(*) AS no_transactions,
        RANK() OVER (PARTITION BY branch ORDER BY COUNT(*) DESC) AS rnk
    FROM walmart
    GROUP BY branch, day_name
) AS ranked
WHERE rnk = 1;

-- Q.4 Calculate the total quantity of items sold per payment method.
-- List payment_method and total_quantity.
SELECT
    payment_method,
    SUM(quantity) AS no_qty_sold
FROM walmart
GROUP BY payment_method;

-- Q.5 Determine the average, minimum, and maximum rating of category for each city.
-- List the city, average_rating, min_rating, and max_rating.
SELECT
    city,
    category,
    MIN(rating) AS min_rating,
    MAX(rating) AS max_rating,
    AVG(rating) AS avg_rating
FROM walmart
GROUP BY city, category;

-- Q.6 Calculate the total profit for each category by considering
-- total_profit = (unit_price * quantity * profit_margin) = total * profit_margin
-- (the cleaned data has total = unit_price * quantity precomputed).
-- List category and total_profit, ordered from highest to lowest profit.
SELECT
    category,
    SUM(total) AS total_revenue,
    SUM(total * profit_margin) AS profit
FROM walmart
GROUP BY category
ORDER BY profit DESC;

-- Q.7 Determine the most common payment method for each branch.
-- Display branch and the preferred_payment_method.
WITH cte AS (
    SELECT
        branch,
        payment_method,
        COUNT(*) AS total_trans,
        RANK() OVER (PARTITION BY branch ORDER BY COUNT(*) DESC) AS rnk
    FROM walmart
    GROUP BY branch, payment_method
)
SELECT branch, payment_method AS preferred_payment_method
FROM cte
WHERE rnk = 1;

-- Q.8 Categorize sales into Morning (<12), Afternoon (12-17), and Evening (>17)
-- shifts and count the invoices per branch and shift.
SELECT
    branch,
    CASE
        WHEN EXTRACT(HOUR FROM (time::time)) < 12 THEN 'Morning'
        WHEN EXTRACT(HOUR FROM (time::time)) BETWEEN 12 AND 17 THEN 'Afternoon'
        ELSE 'Evening'
    END AS day_time,
    COUNT(*) AS num_invoices
FROM walmart
GROUP BY branch, day_time
ORDER BY branch, num_invoices DESC;

-- Q.9 Identify the 5 branches with the highest decrease ratio in revenue
-- compared to last year (current year 2023, last year 2022).
-- revenue_decrease_ratio = (last_rev - curr_rev) / last_rev * 100
WITH revenue_2022 AS (
    SELECT
        branch,
        SUM(total) AS revenue
    FROM walmart
    WHERE EXTRACT(YEAR FROM TO_DATE(date, 'DD/MM/YY')) = 2022
    GROUP BY branch
),
revenue_2023 AS (
    SELECT
        branch,
        SUM(total) AS revenue
    FROM walmart
    WHERE EXTRACT(YEAR FROM TO_DATE(date, 'DD/MM/YY')) = 2023
    GROUP BY branch
)
SELECT
    ls.branch,
    ls.revenue AS last_year_revenue,
    cs.revenue AS cr_year_revenue,
    ROUND(
        (ls.revenue - cs.revenue)::numeric
        / ls.revenue::numeric * 100,
        2
    ) AS rev_dec_ratio
FROM revenue_2022 AS ls
JOIN revenue_2023 AS cs ON ls.branch = cs.branch
WHERE ls.revenue > cs.revenue
ORDER BY rev_dec_ratio DESC
LIMIT 5;
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Walmart Data Analysis: End-to-End SQL + Python Project P-9 2 | 3 | ## Project Overview 4 | 5 | ![Project Pipeline](https://github.com/najirh/Walmart_SQL_Python/blob/main/walmart_project-piplelines.png) 6 | 7 | 8 | This project is an end-to-end data analysis solution designed to extract critical business insights from Walmart sales data. We utilize Python for data processing and analysis, SQL for advanced querying, and structured problem-solving techniques to solve key business questions. The project is ideal for data analysts looking to develop skills in data manipulation, SQL querying, and data pipeline creation. 9 | 10 | --- 11 | 12 | ## Project Steps 13 | 14 | ### 1. Set Up the Environment 15 | - **Tools Used**: Visual Studio Code (VS Code), Python, SQL (MySQL and PostgreSQL) 16 | - **Goal**: Create a structured workspace within VS Code and organize project folders for smooth development and data handling. 17 | 18 | ### 2. Set Up Kaggle API 19 | - **API Setup**: Obtain your Kaggle API token from [Kaggle](https://www.kaggle.com/) by navigating to your profile settings and downloading the JSON file. 20 | - **Configure Kaggle**: 21 | - Place the downloaded `kaggle.json` file in your local `.kaggle` folder. 22 | - Use the command `kaggle datasets download -d ` to pull datasets directly into your project. 23 | 24 | ### 3. Download Walmart Sales Data 25 | - **Data Source**: Use the Kaggle API to download the Walmart sales datasets from Kaggle. 26 | - **Dataset Link**: [Walmart Sales Dataset](https://www.kaggle.com/najir0123/walmart-10k-sales-datasets) 27 | - **Storage**: Save the data in the `data/` folder for easy reference and access. 28 | 29 | ### 4. 
Install Required Libraries and Load Data 30 | - **Libraries**: Install necessary Python libraries using: 31 | ```bash 32 | pip install pandas numpy sqlalchemy mysql-connector-python psycopg2 33 | ``` 34 | - **Loading Data**: Read the data into a Pandas DataFrame for initial analysis and transformations. 35 | 36 | ### 5. Explore the Data 37 | - **Goal**: Conduct an initial data exploration to understand data distribution, check column names, types, and identify potential issues. 38 | - **Analysis**: Use functions like `.info()`, `.describe()`, and `.head()` to get a quick overview of the data structure and statistics. 39 | 40 | ### 6. Data Cleaning 41 | - **Remove Duplicates**: Identify and remove duplicate entries to avoid skewed results. 42 | - **Handle Missing Values**: Drop rows or columns with missing values if they are insignificant; fill values where essential. 43 | - **Fix Data Types**: Ensure all columns have consistent data types (e.g., dates as `datetime`, prices as `float`). 44 | - **Currency Formatting**: Use `.replace()` to handle and format currency values for analysis. 45 | - **Validation**: Check for any remaining inconsistencies and verify the cleaned data. 46 | 47 | ### 7. Feature Engineering 48 | - **Create New Columns**: Calculate the `Total Amount` for each transaction by multiplying `unit_price` by `quantity` and adding this as a new column. 49 | - **Enhance Dataset**: Adding this calculated field will streamline further SQL analysis and aggregation tasks. 50 | 51 | ### 8. Load Data into MySQL and PostgreSQL 52 | - **Set Up Connections**: Connect to MySQL and PostgreSQL using `sqlalchemy` and load the cleaned data into each database. 53 | - **Table Creation**: Set up tables in both MySQL and PostgreSQL using Python SQLAlchemy to automate table creation and data insertion. 54 | - **Verification**: Run initial SQL queries to confirm that the data has been loaded accurately. 55 | 56 | ### 9. 
SQL Analysis: Complex Queries and Business Problem Solving 57 | - **Business Problem-Solving**: Write and execute complex SQL queries to answer critical business questions, such as: 58 | - Revenue trends across branches and categories. 59 | - Identifying best-selling product categories. 60 | - Sales performance by time, city, and payment method. 61 | - Analyzing peak sales periods and customer buying patterns. 62 | - Profit margin analysis by branch and category. 63 | - **Documentation**: Keep clear notes of each query's objective, approach, and results. 64 | 65 | ### 10. Project Publishing and Documentation 66 | - **Documentation**: Maintain well-structured documentation of the entire process in Markdown or a Jupyter Notebook. 67 | - **Project Publishing**: Publish the completed project on GitHub or any other version control platform, including: 68 | - The `README.md` file (this document). 69 | - Jupyter Notebooks (if applicable). 70 | - SQL query scripts. 71 | - Data files (if possible) or steps to access them. 72 | 73 | --- 74 | 75 | ## Requirements 76 | 77 | - **Python 3.8+** 78 | - **SQL Databases**: MySQL, PostgreSQL 79 | - **Python Libraries**: 80 | - `pandas`, `numpy`, `sqlalchemy`, `mysql-connector-python`, `psycopg2` 81 | - **Kaggle API Key** (for data downloading) 82 | 83 | ## Getting Started 84 | 85 | 1. Clone the repository: 86 | ```bash 87 | git clone 88 | ``` 89 | 2. Install Python libraries: 90 | ```bash 91 | pip install -r requirements.txt 92 | ``` 93 | 3. Set up your Kaggle API, download the data, and follow the steps to load and analyze. 
94 | 95 | --- 96 | 97 | ## Project Structure 98 | 99 | ```plaintext 100 | |-- data/ # Raw data and transformed data 101 | |-- sql_queries/ # SQL scripts for analysis and queries 102 | |-- notebooks/ # Jupyter notebooks for Python analysis 103 | |-- README.md # Project documentation 104 | |-- requirements.txt # List of required Python libraries 105 | |-- main.py # Main script for loading, cleaning, and processing data 106 | ``` 107 | --- 108 | 109 | ## Results and Insights 110 | 111 | This section will include your analysis findings: 112 | - **Sales Insights**: Key categories, branches with highest sales, and preferred payment methods. 113 | - **Profitability**: Insights into the most profitable product categories and locations. 114 | - **Customer Behavior**: Trends in ratings, payment preferences, and peak shopping hours. 115 | 116 | ## Future Enhancements 117 | 118 | Possible extensions to this project: 119 | - Integration with a dashboard tool (e.g., Power BI or Tableau) for interactive visualization. 120 | - Additional data sources to enhance analysis depth. 121 | - Automation of the data pipeline for real-time data ingestion and analysis. 122 | 123 | --- 124 | 125 | ## License 126 | 127 | This project is licensed under the MIT License. 128 | 129 | --- 130 | 131 | ## Acknowledgments 132 | 133 | - **Data Source**: Kaggle’s Walmart Sales Dataset 134 | - **Inspiration**: Walmart’s business case studies on sales and supply chain optimization. 
135 | 136 | --- 137 | -------------------------------------------------------------------------------- /Walmart Business Problems.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/najirh/Walmart_SQL_Python/af425b2ddc63f298ab0b91707779ea0d5ce78465/Walmart Business Problems.pdf -------------------------------------------------------------------------------- /Walmart Project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/najirh/Walmart_SQL_Python/af425b2ddc63f298ab0b91707779ea0d5ce78465/Walmart Project.png -------------------------------------------------------------------------------- /project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Hello World\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "print(\"Hello World\")" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "**Step 1 Data Exploration & Leading**" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "#importing dependencies\n", 34 | "\n", 35 | "import pandas as pd\n", 36 | "\n", 37 | "#mysql toolkit\n", 38 | "import pymysql #this will work as adapter\n", 39 | "from sqlalchemy import create_engine\n", 40 | "\n", 41 | "#psql\n", 42 | "import psycopg2" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 4, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "2.2.3\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "print(pd.__version__)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 5, 65 | "metadata": 
{}, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "(10051, 11)" 71 | ] 72 | }, 73 | "execution_count": 5, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "df = pd.read_csv('Walmart.csv', encoding_errors='ignore')\n", 80 | "\n", 81 | "df.shape" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 6, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/html": [ 92 | "
\n", 93 | "\n", 106 | "\n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | "
invoice_idBranchCitycategoryunit_pricequantitydatetimepayment_methodratingprofit_margin
01WALM003San AntonioHealth and beauty$74.697.005/01/1913:08:00Ewallet9.10.48
12WALM048HarlingenElectronic accessories$15.285.008/03/1910:29:00Cash9.60.48
23WALM067Haltom CityHome and lifestyle$46.337.003/03/1913:23:00Credit card7.40.33
34WALM064BedfordHealth and beauty$58.228.027/01/1920:33:00Ewallet8.40.33
45WALM013IrvingSports and travel$86.317.008/02/1910:37:00Ewallet5.30.48
\n", 196 | "
" 197 | ], 198 | "text/plain": [ 199 | " invoice_id Branch City category unit_price \\\n", 200 | "0 1 WALM003 San Antonio Health and beauty $74.69 \n", 201 | "1 2 WALM048 Harlingen Electronic accessories $15.28 \n", 202 | "2 3 WALM067 Haltom City Home and lifestyle $46.33 \n", 203 | "3 4 WALM064 Bedford Health and beauty $58.22 \n", 204 | "4 5 WALM013 Irving Sports and travel $86.31 \n", 205 | "\n", 206 | " quantity date time payment_method rating profit_margin \n", 207 | "0 7.0 05/01/19 13:08:00 Ewallet 9.1 0.48 \n", 208 | "1 5.0 08/03/19 10:29:00 Cash 9.6 0.48 \n", 209 | "2 7.0 03/03/19 13:23:00 Credit card 7.4 0.33 \n", 210 | "3 8.0 27/01/19 20:33:00 Ewallet 8.4 0.33 \n", 211 | "4 7.0 08/02/19 10:37:00 Ewallet 5.3 0.48 " 212 | ] 213 | }, 214 | "execution_count": 6, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "df.head()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 7, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/html": [ 231 | "
\n", 232 | "\n", 245 | "\n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | "
invoice_idquantityratingprofit_margin
count10051.00000010020.00000010051.00000010051.000000
mean5025.7412202.3534935.8256590.393791
std2901.1743721.6026581.7639910.090669
min1.0000001.0000003.0000000.180000
25%2513.5000001.0000004.0000000.330000
50%5026.0000002.0000006.0000000.330000
75%7538.5000003.0000007.0000000.480000
max10000.00000010.00000010.0000000.570000
\n", 314 | "
" 315 | ], 316 | "text/plain": [ 317 | " invoice_id quantity rating profit_margin\n", 318 | "count 10051.000000 10020.000000 10051.000000 10051.000000\n", 319 | "mean 5025.741220 2.353493 5.825659 0.393791\n", 320 | "std 2901.174372 1.602658 1.763991 0.090669\n", 321 | "min 1.000000 1.000000 3.000000 0.180000\n", 322 | "25% 2513.500000 1.000000 4.000000 0.330000\n", 323 | "50% 5026.000000 2.000000 6.000000 0.330000\n", 324 | "75% 7538.500000 3.000000 7.000000 0.480000\n", 325 | "max 10000.000000 10.000000 10.000000 0.570000" 326 | ] 327 | }, 328 | "execution_count": 7, 329 | "metadata": {}, 330 | "output_type": "execute_result" 331 | } 332 | ], 333 | "source": [ 334 | "df.describe()" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 8, 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "name": "stdout", 344 | "output_type": "stream", 345 | "text": [ 346 | "\n", 347 | "RangeIndex: 10051 entries, 0 to 10050\n", 348 | "Data columns (total 11 columns):\n", 349 | " # Column Non-Null Count Dtype \n", 350 | "--- ------ -------------- ----- \n", 351 | " 0 invoice_id 10051 non-null int64 \n", 352 | " 1 Branch 10051 non-null object \n", 353 | " 2 City 10051 non-null object \n", 354 | " 3 category 10051 non-null object \n", 355 | " 4 unit_price 10020 non-null object \n", 356 | " 5 quantity 10020 non-null float64\n", 357 | " 6 date 10051 non-null object \n", 358 | " 7 time 10051 non-null object \n", 359 | " 8 payment_method 10051 non-null object \n", 360 | " 9 rating 10051 non-null float64\n", 361 | " 10 profit_margin 10051 non-null float64\n", 362 | "dtypes: float64(3), int64(1), object(7)\n", 363 | "memory usage: 863.9+ KB\n" 364 | ] 365 | } 366 | ], 367 | "source": [ 368 | "df.info()" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 9, 374 | "metadata": {}, 375 | "outputs": [ 376 | { 377 | "data": { 378 | "text/plain": [ 379 | "np.int64(51)" 380 | ] 381 | }, 382 | "execution_count": 9, 383 | "metadata": {}, 384 | 
"output_type": "execute_result" 385 | } 386 | ], 387 | "source": [ 388 | "#all duplicates\n", 389 | "df.duplicated().sum()" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 10, 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "data": { 399 | "text/plain": [ 400 | "np.int64(0)" 401 | ] 402 | }, 403 | "execution_count": 10, 404 | "metadata": {}, 405 | "output_type": "execute_result" 406 | } 407 | ], 408 | "source": [ 409 | "df.drop_duplicates(inplace=True)\n", 410 | "df.duplicated().sum()" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 11, 416 | "metadata": {}, 417 | "outputs": [ 418 | { 419 | "data": { 420 | "text/plain": [ 421 | "(10000, 11)" 422 | ] 423 | }, 424 | "execution_count": 11, 425 | "metadata": {}, 426 | "output_type": "execute_result" 427 | } 428 | ], 429 | "source": [ 430 | "df.shape" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 12, 436 | "metadata": {}, 437 | "outputs": [ 438 | { 439 | "data": { 440 | "text/plain": [ 441 | "invoice_id 0\n", 442 | "Branch 0\n", 443 | "City 0\n", 444 | "category 0\n", 445 | "unit_price 31\n", 446 | "quantity 31\n", 447 | "date 0\n", 448 | "time 0\n", 449 | "payment_method 0\n", 450 | "rating 0\n", 451 | "profit_margin 0\n", 452 | "dtype: int64" 453 | ] 454 | }, 455 | "execution_count": 12, 456 | "metadata": {}, 457 | "output_type": "execute_result" 458 | } 459 | ], 460 | "source": [ 461 | "df.isnull().sum()" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 13, 467 | "metadata": {}, 468 | "outputs": [ 469 | { 470 | "data": { 471 | "text/plain": [ 472 | "invoice_id 0\n", 473 | "Branch 0\n", 474 | "City 0\n", 475 | "category 0\n", 476 | "unit_price 0\n", 477 | "quantity 0\n", 478 | "date 0\n", 479 | "time 0\n", 480 | "payment_method 0\n", 481 | "rating 0\n", 482 | "profit_margin 0\n", 483 | "dtype: int64" 484 | ] 485 | }, 486 | "execution_count": 13, 487 | "metadata": {}, 488 | "output_type": 
"execute_result" 489 | } 490 | ], 491 | "source": [ 492 | "#droppping all rows with missing records\n", 493 | "df.dropna(inplace=True)\n", 494 | "\n", 495 | "# verify\n", 496 | "df.isnull().sum()" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 14, 502 | "metadata": {}, 503 | "outputs": [ 504 | { 505 | "data": { 506 | "text/plain": [ 507 | "(9969, 11)" 508 | ] 509 | }, 510 | "execution_count": 14, 511 | "metadata": {}, 512 | "output_type": "execute_result" 513 | } 514 | ], 515 | "source": [ 516 | "df.shape" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 15, 522 | "metadata": {}, 523 | "outputs": [ 524 | { 525 | "data": { 526 | "text/plain": [ 527 | "invoice_id int64\n", 528 | "Branch object\n", 529 | "City object\n", 530 | "category object\n", 531 | "unit_price object\n", 532 | "quantity float64\n", 533 | "date object\n", 534 | "time object\n", 535 | "payment_method object\n", 536 | "rating float64\n", 537 | "profit_margin float64\n", 538 | "dtype: object" 539 | ] 540 | }, 541 | "execution_count": 15, 542 | "metadata": {}, 543 | "output_type": "execute_result" 544 | } 545 | ], 546 | "source": [ 547 | "df.dtypes" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": null, 553 | "metadata": {}, 554 | "outputs": [], 555 | "source": [ 556 | "df['unit_price'].astype(float)" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": {}, 563 | "outputs": [ 564 | { 565 | "data": { 566 | "text/html": [ 567 | "
\n", 568 | "\n", 581 | "\n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | "
invoice_idbranchcitycategoryunit_pricequantitydatetimepayment_methodratingprofit_margin
01WALM003San AntonioHealth and beauty74.697.005/01/1913:08:00Ewallet9.10.48
12WALM048HarlingenElectronic accessories15.285.008/03/1910:29:00Cash9.60.48
23WALM067Haltom CityHome and lifestyle46.337.003/03/1913:23:00Credit card7.40.33
34WALM064BedfordHealth and beauty58.228.027/01/1920:33:00Ewallet8.40.33
45WALM013IrvingSports and travel86.317.008/02/1910:37:00Ewallet5.30.48
\n", 671 | "
" 672 | ], 673 | "text/plain": [ 674 | " invoice_id branch city category unit_price \\\n", 675 | "0 1 WALM003 San Antonio Health and beauty 74.69 \n", 676 | "1 2 WALM048 Harlingen Electronic accessories 15.28 \n", 677 | "2 3 WALM067 Haltom City Home and lifestyle 46.33 \n", 678 | "3 4 WALM064 Bedford Health and beauty 58.22 \n", 679 | "4 5 WALM013 Irving Sports and travel 86.31 \n", 680 | "\n", 681 | " quantity date time payment_method rating profit_margin \n", 682 | "0 7.0 05/01/19 13:08:00 Ewallet 9.1 0.48 \n", 683 | "1 5.0 08/03/19 10:29:00 Cash 9.6 0.48 \n", 684 | "2 7.0 03/03/19 13:23:00 Credit card 7.4 0.33 \n", 685 | "3 8.0 27/01/19 20:33:00 Ewallet 8.4 0.33 \n", 686 | "4 7.0 08/02/19 10:37:00 Ewallet 5.3 0.48 " 687 | ] 688 | }, 689 | "execution_count": 110, 690 | "metadata": {}, 691 | "output_type": "execute_result" 692 | } 693 | ], 694 | "source": [ 695 | "df['unit_price'] = df['unit_price'].str.replace('$', '').astype(float)\n", 696 | "\n", 697 | "df.head()" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": null, 703 | "metadata": {}, 704 | "outputs": [ 705 | { 706 | "name": "stdout", 707 | "output_type": "stream", 708 | "text": [ 709 | "\n", 710 | "Index: 9969 entries, 0 to 9999\n", 711 | "Data columns (total 11 columns):\n", 712 | " # Column Non-Null Count Dtype \n", 713 | "--- ------ -------------- ----- \n", 714 | " 0 invoice_id 9969 non-null int64 \n", 715 | " 1 Branch 9969 non-null object \n", 716 | " 2 City 9969 non-null object \n", 717 | " 3 category 9969 non-null object \n", 718 | " 4 unit_price 9969 non-null float64\n", 719 | " 5 quantity 9969 non-null float64\n", 720 | " 6 date 9969 non-null object \n", 721 | " 7 time 9969 non-null object \n", 722 | " 8 payment_method 9969 non-null object \n", 723 | " 9 rating 9969 non-null float64\n", 724 | " 10 profit_margin 9969 non-null float64\n", 725 | "dtypes: float64(4), int64(1), object(6)\n", 726 | "memory usage: 934.6+ KB\n" 727 | ] 728 | } 729 | ], 730 | "source": [ 731 | 
"df.info()" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": null, 737 | "metadata": {}, 738 | "outputs": [ 739 | { 740 | "data": { 741 | "text/plain": [ 742 | "Index(['invoice_id', 'Branch', 'City', 'category', 'unit_price', 'quantity',\n", 743 | " 'date', 'time', 'payment_method', 'rating', 'profit_margin'],\n", 744 | " dtype='object')" 745 | ] 746 | }, 747 | "execution_count": 29, 748 | "metadata": {}, 749 | "output_type": "execute_result" 750 | } 751 | ], 752 | "source": [ 753 | "df.columns" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": null, 759 | "metadata": {}, 760 | "outputs": [ 761 | { 762 | "data": { 763 | "text/html": [ 764 | "
\n", 765 | "\n", 778 | "\n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | "
invoice_idbranchcitycategoryunit_pricequantitydatetimepayment_methodratingprofit_margintotal
01WALM003San AntonioHealth and beauty74.697.005/01/1913:08:00Ewallet9.10.48522.83
12WALM048HarlingenElectronic accessories15.285.008/03/1910:29:00Cash9.60.4876.40
23WALM067Haltom CityHome and lifestyle46.337.003/03/1913:23:00Credit card7.40.33324.31
34WALM064BedfordHealth and beauty58.228.027/01/1920:33:00Ewallet8.40.33465.76
45WALM013IrvingSports and travel86.317.008/02/1910:37:00Ewallet5.30.48604.17
\n", 874 | "
" 875 | ], 876 | "text/plain": [ 877 | " invoice_id branch city category unit_price \\\n", 878 | "0 1 WALM003 San Antonio Health and beauty 74.69 \n", 879 | "1 2 WALM048 Harlingen Electronic accessories 15.28 \n", 880 | "2 3 WALM067 Haltom City Home and lifestyle 46.33 \n", 881 | "3 4 WALM064 Bedford Health and beauty 58.22 \n", 882 | "4 5 WALM013 Irving Sports and travel 86.31 \n", 883 | "\n", 884 | " quantity date time payment_method rating profit_margin total \n", 885 | "0 7.0 05/01/19 13:08:00 Ewallet 9.1 0.48 522.83 \n", 886 | "1 5.0 08/03/19 10:29:00 Cash 9.6 0.48 76.40 \n", 887 | "2 7.0 03/03/19 13:23:00 Credit card 7.4 0.33 324.31 \n", 888 | "3 8.0 27/01/19 20:33:00 Ewallet 8.4 0.33 465.76 \n", 889 | "4 7.0 08/02/19 10:37:00 Ewallet 5.3 0.48 604.17 " 890 | ] 891 | }, 892 | "execution_count": 111, 893 | "metadata": {}, 894 | "output_type": "execute_result" 895 | } 896 | ], 897 | "source": [ 898 | "df['total'] = df['unit_price'] * df['quantity']\n", 899 | "df.head()" 900 | ] 901 | }, 902 | { 903 | "cell_type": "markdown", 904 | "metadata": {}, 905 | "source": [ 906 | "**Fixing the column name to lower case**" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": null, 912 | "metadata": {}, 913 | "outputs": [ 914 | { 915 | "data": { 916 | "text/plain": [ 917 | "Index(['invoice_id', 'Branch', 'City', 'category', 'unit_price', 'quantity',\n", 918 | " 'date', 'time', 'payment_method', 'rating', 'profit_margin'],\n", 919 | " dtype='object')" 920 | ] 921 | }, 922 | "metadata": {}, 923 | "output_type": "display_data" 924 | } 925 | ], 926 | "source": [ 927 | "df.columns" 928 | ] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": null, 933 | "metadata": {}, 934 | "outputs": [ 935 | { 936 | "data": { 937 | "text/plain": [ 938 | "Index(['invoice_id', 'branch', 'city', 'category', 'unit_price', 'quantity',\n", 939 | " 'date', 'time', 'payment_method', 'rating', 'profit_margin'],\n", 940 | " dtype='object')" 941 | ] 942 | }, 943 | 
"metadata": {}, 944 | "output_type": "display_data" 945 | } 946 | ], 947 | "source": [ 948 | "df.columns = df.columns.str.lower()\n", 949 | "df.columns" 950 | ] 951 | }, 952 | { 953 | "cell_type": "code", 954 | "execution_count": null, 955 | "metadata": {}, 956 | "outputs": [], 957 | "source": [ 958 | "# mysql \n", 959 | "# host = localhost\n", 960 | "# port = 3306\n", 961 | "# user = root\n", 962 | "# password = 'your_password'\n", 963 | "\n", 964 | "# psql\n", 965 | "# host = localhost\n", 966 | "# port = 5432\n", 967 | "# user = postgres\n", 968 | "# password = 'x0000'" 969 | ] 970 | }, 971 | { 972 | "cell_type": "code", 973 | "execution_count": null, 974 | "metadata": {}, 975 | "outputs": [ 976 | { 977 | "data": { 978 | "text/plain": [ 979 | "(9969, 12)" 980 | ] 981 | }, 982 | "execution_count": 36, 983 | "metadata": {}, 984 | "output_type": "execute_result" 985 | } 986 | ], 987 | "source": [ 988 | "df.shape" 989 | ] 990 | }, 991 | { 992 | "cell_type": "code", 993 | "execution_count": null, 994 | "metadata": {}, 995 | "outputs": [], 996 | "source": [ 997 | "df.to_csv('walmart_clean_data.csv', index=False)" 998 | ] 999 | }, 1000 | { 1001 | "cell_type": "code", 1002 | "execution_count": null, 1003 | "metadata": {}, 1004 | "outputs": [ 1005 | { 1006 | "name": "stdout", 1007 | "output_type": "stream", 1008 | "text": [ 1009 | "Help on function create_engine in module sqlalchemy.engine.create:\n", 1010 | "\n", 1011 | "create_engine(url: 'Union[str, _url.URL]', **kwargs: 'Any') -> 'Engine'\n", 1012 | " Create a new :class:`_engine.Engine` instance.\n", 1013 | "\n", 1014 | " The standard calling form is to send the :ref:`URL ` as the\n", 1015 | " first positional argument, usually a string\n", 1016 | " that indicates database dialect and connection arguments::\n", 1017 | "\n", 1018 | " engine = create_engine(\"postgresql+psycopg2://scott:tiger@localhost/test\")\n", 1019 | "\n", 1020 | " .. 
note::\n", 1021 | "\n", 1022 | " Please review :ref:`database_urls` for general guidelines in composing\n", 1023 | " URL strings. In particular, special characters, such as those often\n", 1024 | " part of passwords, must be URL encoded to be properly parsed.\n", 1025 | "\n", 1026 | " Additional keyword arguments may then follow it which\n", 1027 | " establish various options on the resulting :class:`_engine.Engine`\n", 1028 | " and its underlying :class:`.Dialect` and :class:`_pool.Pool`\n", 1029 | " constructs::\n", 1030 | "\n", 1031 | " engine = create_engine(\"mysql+mysqldb://scott:tiger@hostname/dbname\",\n", 1032 | " pool_recycle=3600, echo=True)\n", 1033 | "\n", 1034 | " The string form of the URL is\n", 1035 | " ``dialect[+driver]://user:password@host/dbname[?key=value..]``, where\n", 1036 | " ``dialect`` is a database name such as ``mysql``, ``oracle``,\n", 1037 | " ``postgresql``, etc., and ``driver`` the name of a DBAPI, such as\n", 1038 | " ``psycopg2``, ``pyodbc``, ``cx_oracle``, etc. Alternatively,\n", 1039 | " the URL can be an instance of :class:`~sqlalchemy.engine.url.URL`.\n", 1040 | "\n", 1041 | " ``**kwargs`` takes a wide variety of options which are routed\n", 1042 | " towards their appropriate components. Arguments may be specific to\n", 1043 | " the :class:`_engine.Engine`, the underlying :class:`.Dialect`,\n", 1044 | " as well as the\n", 1045 | " :class:`_pool.Pool`. Specific dialects also accept keyword arguments that\n", 1046 | " are unique to that dialect. Here, we describe the parameters\n", 1047 | " that are common to most :func:`_sa.create_engine()` usage.\n", 1048 | "\n", 1049 | " Once established, the newly resulting :class:`_engine.Engine` will\n", 1050 | " request a connection from the underlying :class:`_pool.Pool` once\n", 1051 | " :meth:`_engine.Engine.connect` is called, or a method which depends on it\n", 1052 | " such as :meth:`_engine.Engine.execute` is invoked. 
The\n", 1053 | " :class:`_pool.Pool` in turn\n", 1054 | " will establish the first actual DBAPI connection when this request\n", 1055 | " is received. The :func:`_sa.create_engine` call itself does **not**\n", 1056 | " establish any actual DBAPI connections directly.\n", 1057 | "\n", 1058 | " .. seealso::\n", 1059 | "\n", 1060 | " :doc:`/core/engines`\n", 1061 | "\n", 1062 | " :doc:`/dialects/index`\n", 1063 | "\n", 1064 | " :ref:`connections_toplevel`\n", 1065 | "\n", 1066 | " :param connect_args: a dictionary of options which will be\n", 1067 | " passed directly to the DBAPI's ``connect()`` method as\n", 1068 | " additional keyword arguments. See the example\n", 1069 | " at :ref:`custom_dbapi_args`.\n", 1070 | "\n", 1071 | " :param creator: a callable which returns a DBAPI connection.\n", 1072 | " This creation function will be passed to the underlying\n", 1073 | " connection pool and will be used to create all new database\n", 1074 | " connections. Usage of this function causes connection\n", 1075 | " parameters specified in the URL argument to be bypassed.\n", 1076 | "\n", 1077 | " This hook is not as flexible as the newer\n", 1078 | " :meth:`_events.DialectEvents.do_connect` hook which allows complete\n", 1079 | " control over how a connection is made to the database, given the full\n", 1080 | " set of URL arguments and state beforehand.\n", 1081 | "\n", 1082 | " .. seealso::\n", 1083 | "\n", 1084 | " :meth:`_events.DialectEvents.do_connect` - event hook that allows\n", 1085 | " full control over DBAPI connection mechanics.\n", 1086 | "\n", 1087 | " :ref:`custom_dbapi_args`\n", 1088 | "\n", 1089 | " :param echo=False: if True, the Engine will log all statements\n", 1090 | " as well as a ``repr()`` of their parameter lists to the default log\n", 1091 | " handler, which defaults to ``sys.stdout`` for output. If set to the\n", 1092 | " string ``\"debug\"``, result rows will be printed to the standard output\n", 1093 | " as well. 
The ``echo`` attribute of ``Engine`` can be modified at any\n", 1094 | " time to turn logging on and off; direct control of logging is also\n", 1095 | " available using the standard Python ``logging`` module.\n", 1096 | "\n", 1097 | " .. seealso::\n", 1098 | "\n", 1099 | " :ref:`dbengine_logging` - further detail on how to configure\n", 1100 | " logging.\n", 1101 | "\n", 1102 | "\n", 1103 | " :param echo_pool=False: if True, the connection pool will log\n", 1104 | " informational output such as when connections are invalidated\n", 1105 | " as well as when connections are recycled to the default log handler,\n", 1106 | " which defaults to ``sys.stdout`` for output. If set to the string\n", 1107 | " ``\"debug\"``, the logging will include pool checkouts and checkins.\n", 1108 | " Direct control of logging is also available using the standard Python\n", 1109 | " ``logging`` module.\n", 1110 | "\n", 1111 | " .. seealso::\n", 1112 | "\n", 1113 | " :ref:`dbengine_logging` - further detail on how to configure\n", 1114 | " logging.\n", 1115 | "\n", 1116 | "\n", 1117 | " :param empty_in_strategy: No longer used; SQLAlchemy now uses\n", 1118 | " \"empty set\" behavior for IN in all cases.\n", 1119 | "\n", 1120 | " .. deprecated:: 1.4 The :paramref:`_sa.create_engine.empty_in_strategy` keyword is deprecated, and no longer has any effect. All IN expressions are now rendered using the \"expanding parameter\" strategy which renders a set of boundexpressions, or an \"empty set\" SELECT, at statement executiontime.\n", 1121 | "\n", 1122 | "\n", 1123 | "\n", 1124 | " :param enable_from_linting: defaults to True. Will emit a warning\n", 1125 | " if a given SELECT statement is found to have un-linked FROM elements\n", 1126 | " which would cause a cartesian product.\n", 1127 | "\n", 1128 | " .. versionadded:: 1.4\n", 1129 | "\n", 1130 | " .. 
seealso::\n", 1131 | "\n", 1132 | " :ref:`change_4737`\n", 1133 | "\n", 1134 | " :param execution_options: Dictionary execution options which will\n", 1135 | " be applied to all connections. See\n", 1136 | " :meth:`~sqlalchemy.engine.Connection.execution_options`\n", 1137 | "\n", 1138 | " :param future: Use the 2.0 style :class:`_engine.Engine` and\n", 1139 | " :class:`_engine.Connection` API.\n", 1140 | "\n", 1141 | " As of SQLAlchemy 2.0, this parameter is present for backwards\n", 1142 | " compatibility only and must remain at its default value of ``True``.\n", 1143 | "\n", 1144 | " The :paramref:`_sa.create_engine.future` parameter will be\n", 1145 | " deprecated in a subsequent 2.x release and eventually removed.\n", 1146 | "\n", 1147 | " .. versionadded:: 1.4\n", 1148 | "\n", 1149 | " .. versionchanged:: 2.0 All :class:`_engine.Engine` objects are\n", 1150 | " \"future\" style engines and there is no longer a ``future=False``\n", 1151 | " mode of operation.\n", 1152 | "\n", 1153 | " .. seealso::\n", 1154 | "\n", 1155 | " :ref:`migration_20_toplevel`\n", 1156 | "\n", 1157 | " :param hide_parameters: Boolean, when set to True, SQL statement parameters\n", 1158 | " will not be displayed in INFO logging nor will they be formatted into\n", 1159 | " the string representation of :class:`.StatementError` objects.\n", 1160 | "\n", 1161 | " .. versionadded:: 1.3.8\n", 1162 | "\n", 1163 | " .. seealso::\n", 1164 | "\n", 1165 | " :ref:`dbengine_logging` - further detail on how to configure\n", 1166 | " logging.\n", 1167 | "\n", 1168 | " :param implicit_returning=True: Legacy parameter that may only be set\n", 1169 | " to True. In SQLAlchemy 2.0, this parameter does nothing. 
In order to\n", 1170 | " disable \"implicit returning\" for statements invoked by the ORM,\n", 1171 | " configure this on a per-table basis using the\n", 1172 | " :paramref:`.Table.implicit_returning` parameter.\n", 1173 | "\n", 1174 | "\n", 1175 | " :param insertmanyvalues_page_size: number of rows to format into an\n", 1176 | " INSERT statement when the statement uses \"insertmanyvalues\" mode, which is\n", 1177 | " a paged form of bulk insert that is used for many backends when using\n", 1178 | " :term:`executemany` execution typically in conjunction with RETURNING.\n", 1179 | " Defaults to 1000, but may also be subject to dialect-specific limiting\n", 1180 | " factors which may override this value on a per-statement basis.\n", 1181 | "\n", 1182 | " .. versionadded:: 2.0\n", 1183 | "\n", 1184 | " .. seealso::\n", 1185 | "\n", 1186 | " :ref:`engine_insertmanyvalues`\n", 1187 | "\n", 1188 | " :ref:`engine_insertmanyvalues_page_size`\n", 1189 | "\n", 1190 | " :paramref:`_engine.Connection.execution_options.insertmanyvalues_page_size`\n", 1191 | "\n", 1192 | " :param isolation_level: optional string name of an isolation level\n", 1193 | " which will be set on all new connections unconditionally.\n", 1194 | " Isolation levels are typically some subset of the string names\n", 1195 | " ``\"SERIALIZABLE\"``, ``\"REPEATABLE READ\"``,\n", 1196 | " ``\"READ COMMITTED\"``, ``\"READ UNCOMMITTED\"`` and ``\"AUTOCOMMIT\"``\n", 1197 | " based on backend.\n", 1198 | "\n", 1199 | " The :paramref:`_sa.create_engine.isolation_level` parameter is\n", 1200 | " in contrast to the\n", 1201 | " :paramref:`.Connection.execution_options.isolation_level`\n", 1202 | " execution option, which may be set on an individual\n", 1203 | " :class:`.Connection`, as well as the same parameter passed to\n", 1204 | " :meth:`.Engine.execution_options`, where it may be used to create\n", 1205 | " multiple engines with different isolation levels that share a common\n", 1206 | " connection pool and 
dialect.\n", 1207 | "\n", 1208 | " .. versionchanged:: 2.0 The\n", 1209 | " :paramref:`_sa.create_engine.isolation_level`\n", 1210 | " parameter has been generalized to work on all dialects which support\n", 1211 | " the concept of isolation level, and is provided as a more succinct,\n", 1212 | " up front configuration switch in contrast to the execution option\n", 1213 | " which is more of an ad-hoc programmatic option.\n", 1214 | "\n", 1215 | " .. seealso::\n", 1216 | "\n", 1217 | " :ref:`dbapi_autocommit`\n", 1218 | "\n", 1219 | " :param json_deserializer: for dialects that support the\n", 1220 | " :class:`_types.JSON`\n", 1221 | " datatype, this is a Python callable that will convert a JSON string\n", 1222 | " to a Python object. By default, the Python ``json.loads`` function is\n", 1223 | " used.\n", 1224 | "\n", 1225 | " .. versionchanged:: 1.3.7 The SQLite dialect renamed this from\n", 1226 | " ``_json_deserializer``.\n", 1227 | "\n", 1228 | " :param json_serializer: for dialects that support the :class:`_types.JSON`\n", 1229 | " datatype, this is a Python callable that will render a given object\n", 1230 | " as JSON. By default, the Python ``json.dumps`` function is used.\n", 1231 | "\n", 1232 | " .. versionchanged:: 1.3.7 The SQLite dialect renamed this from\n", 1233 | " ``_json_serializer``.\n", 1234 | "\n", 1235 | "\n", 1236 | " :param label_length=None: optional integer value which limits\n", 1237 | " the size of dynamically generated column labels to that many\n", 1238 | " characters. If less than 6, labels are generated as\n", 1239 | " \"_(counter)\". If ``None``, the value of\n", 1240 | " ``dialect.max_identifier_length``, which may be affected via the\n", 1241 | " :paramref:`_sa.create_engine.max_identifier_length` parameter,\n", 1242 | " is used instead. 
The value of\n", 1243 | " :paramref:`_sa.create_engine.label_length`\n", 1244 | " may not be larger than that of\n", 1245 | " :paramref:`_sa.create_engine.max_identfier_length`.\n", 1246 | "\n", 1247 | " .. seealso::\n", 1248 | "\n", 1249 | " :paramref:`_sa.create_engine.max_identifier_length`\n", 1250 | "\n", 1251 | " :param logging_name: String identifier which will be used within\n", 1252 | " the \"name\" field of logging records generated within the\n", 1253 | " \"sqlalchemy.engine\" logger. Defaults to a hexstring of the\n", 1254 | " object's id.\n", 1255 | "\n", 1256 | " .. seealso::\n", 1257 | "\n", 1258 | " :ref:`dbengine_logging` - further detail on how to configure\n", 1259 | " logging.\n", 1260 | "\n", 1261 | " :paramref:`_engine.Connection.execution_options.logging_token`\n", 1262 | "\n", 1263 | " :param max_identifier_length: integer; override the max_identifier_length\n", 1264 | " determined by the dialect. if ``None`` or zero, has no effect. This\n", 1265 | " is the database's configured maximum number of characters that may be\n", 1266 | " used in a SQL identifier such as a table name, column name, or label\n", 1267 | " name. All dialects determine this value automatically, however in the\n", 1268 | " case of a new database version for which this value has changed but\n", 1269 | " SQLAlchemy's dialect has not been adjusted, the value may be passed\n", 1270 | " here.\n", 1271 | "\n", 1272 | " .. versionadded:: 1.3.9\n", 1273 | "\n", 1274 | " .. seealso::\n", 1275 | "\n", 1276 | " :paramref:`_sa.create_engine.label_length`\n", 1277 | "\n", 1278 | " :param max_overflow=10: the number of connections to allow in\n", 1279 | " connection pool \"overflow\", that is connections that can be\n", 1280 | " opened above and beyond the pool_size setting, which defaults\n", 1281 | " to five. 
this is only used with :class:`~sqlalchemy.pool.QueuePool`.\n", 1282 | "\n", 1283 | " :param module=None: reference to a Python module object (the module\n", 1284 | " itself, not its string name). Specifies an alternate DBAPI module to\n", 1285 | " be used by the engine's dialect. Each sub-dialect references a\n", 1286 | " specific DBAPI which will be imported before first connect. This\n", 1287 | " parameter causes the import to be bypassed, and the given module to\n", 1288 | " be used instead. Can be used for testing of DBAPIs as well as to\n", 1289 | " inject \"mock\" DBAPI implementations into the :class:`_engine.Engine`.\n", 1290 | "\n", 1291 | " :param paramstyle=None: The `paramstyle `_\n", 1292 | " to use when rendering bound parameters. This style defaults to the\n", 1293 | " one recommended by the DBAPI itself, which is retrieved from the\n", 1294 | " ``.paramstyle`` attribute of the DBAPI. However, most DBAPIs accept\n", 1295 | " more than one paramstyle, and in particular it may be desirable\n", 1296 | " to change a \"named\" paramstyle into a \"positional\" one, or vice versa.\n", 1297 | " When this attribute is passed, it should be one of the values\n", 1298 | " ``\"qmark\"``, ``\"numeric\"``, ``\"named\"``, ``\"format\"`` or\n", 1299 | " ``\"pyformat\"``, and should correspond to a parameter style known\n", 1300 | " to be supported by the DBAPI in use.\n", 1301 | "\n", 1302 | " :param pool=None: an already-constructed instance of\n", 1303 | " :class:`~sqlalchemy.pool.Pool`, such as a\n", 1304 | " :class:`~sqlalchemy.pool.QueuePool` instance. If non-None, this\n", 1305 | " pool will be used directly as the underlying connection pool\n", 1306 | " for the engine, bypassing whatever connection parameters are\n", 1307 | " present in the URL argument. 
For information on constructing\n", 1308 | " connection pools manually, see :ref:`pooling_toplevel`.\n", 1309 | "\n", 1310 | " :param poolclass=None: a :class:`~sqlalchemy.pool.Pool`\n", 1311 | " subclass, which will be used to create a connection pool\n", 1312 | " instance using the connection parameters given in the URL. Note\n", 1313 | " this differs from ``pool`` in that you don't actually\n", 1314 | " instantiate the pool in this case, you just indicate what type\n", 1315 | " of pool to be used.\n", 1316 | "\n", 1317 | " :param pool_logging_name: String identifier which will be used within\n", 1318 | " the \"name\" field of logging records generated within the\n", 1319 | " \"sqlalchemy.pool\" logger. Defaults to a hexstring of the object's\n", 1320 | " id.\n", 1321 | "\n", 1322 | " .. seealso::\n", 1323 | "\n", 1324 | " :ref:`dbengine_logging` - further detail on how to configure\n", 1325 | " logging.\n", 1326 | "\n", 1327 | " :param pool_pre_ping: boolean, if True will enable the connection pool\n", 1328 | " \"pre-ping\" feature that tests connections for liveness upon\n", 1329 | " each checkout.\n", 1330 | "\n", 1331 | " .. versionadded:: 1.2\n", 1332 | "\n", 1333 | " .. seealso::\n", 1334 | "\n", 1335 | " :ref:`pool_disconnects_pessimistic`\n", 1336 | "\n", 1337 | " :param pool_size=5: the number of connections to keep open\n", 1338 | " inside the connection pool. This used with\n", 1339 | " :class:`~sqlalchemy.pool.QueuePool` as\n", 1340 | " well as :class:`~sqlalchemy.pool.SingletonThreadPool`. With\n", 1341 | " :class:`~sqlalchemy.pool.QueuePool`, a ``pool_size`` setting\n", 1342 | " of 0 indicates no limit; to disable pooling, set ``poolclass`` to\n", 1343 | " :class:`~sqlalchemy.pool.NullPool` instead.\n", 1344 | "\n", 1345 | " :param pool_recycle=-1: this setting causes the pool to recycle\n", 1346 | " connections after the given number of seconds has passed. It\n", 1347 | " defaults to -1, or no timeout. 
For example, setting to 3600\n", 1348 | " means connections will be recycled after one hour. Note that\n", 1349 | " MySQL in particular will disconnect automatically if no\n", 1350 | " activity is detected on a connection for eight hours (although\n", 1351 | " this is configurable with the MySQLDB connection itself and the\n", 1352 | " server configuration as well).\n", 1353 | "\n", 1354 | " .. seealso::\n", 1355 | "\n", 1356 | " :ref:`pool_setting_recycle`\n", 1357 | "\n", 1358 | " :param pool_reset_on_return='rollback': set the\n", 1359 | " :paramref:`_pool.Pool.reset_on_return` parameter of the underlying\n", 1360 | " :class:`_pool.Pool` object, which can be set to the values\n", 1361 | " ``\"rollback\"``, ``\"commit\"``, or ``None``.\n", 1362 | "\n", 1363 | " .. seealso::\n", 1364 | "\n", 1365 | " :ref:`pool_reset_on_return`\n", 1366 | "\n", 1367 | " :param pool_timeout=30: number of seconds to wait before giving\n", 1368 | " up on getting a connection from the pool. This is only used\n", 1369 | " with :class:`~sqlalchemy.pool.QueuePool`. This can be a float but is\n", 1370 | " subject to the limitations of Python time functions which may not be\n", 1371 | " reliable in the tens of milliseconds.\n", 1372 | "\n", 1373 | " .. note: don't use 30.0 above, it seems to break with the :param tag\n", 1374 | "\n", 1375 | " :param pool_use_lifo=False: use LIFO (last-in-first-out) when retrieving\n", 1376 | " connections from :class:`.QueuePool` instead of FIFO\n", 1377 | " (first-in-first-out). Using LIFO, a server-side timeout scheme can\n", 1378 | " reduce the number of connections used during non- peak periods of\n", 1379 | " use. When planning for server-side timeouts, ensure that a recycle or\n", 1380 | " pre-ping strategy is in use to gracefully handle stale connections.\n", 1381 | "\n", 1382 | " .. versionadded:: 1.3\n", 1383 | "\n", 1384 | " .. 
seealso::\n", 1385 | "\n", 1386 | " :ref:`pool_use_lifo`\n", 1387 | "\n", 1388 | " :ref:`pool_disconnects`\n", 1389 | "\n", 1390 | " :param plugins: string list of plugin names to load. See\n", 1391 | " :class:`.CreateEnginePlugin` for background.\n", 1392 | "\n", 1393 | " .. versionadded:: 1.2.3\n", 1394 | "\n", 1395 | " :param query_cache_size: size of the cache used to cache the SQL string\n", 1396 | " form of queries. Set to zero to disable caching.\n", 1397 | "\n", 1398 | " The cache is pruned of its least recently used items when its size reaches\n", 1399 | " N * 1.5. Defaults to 500, meaning the cache will always store at least\n", 1400 | " 500 SQL statements when filled, and will grow up to 750 items at which\n", 1401 | " point it is pruned back down to 500 by removing the 250 least recently\n", 1402 | " used items.\n", 1403 | "\n", 1404 | " Caching is accomplished on a per-statement basis by generating a\n", 1405 | " cache key that represents the statement's structure, then generating\n", 1406 | " string SQL for the current dialect only if that key is not present\n", 1407 | " in the cache. All statements support caching, however some features\n", 1408 | " such as an INSERT with a large set of parameters will intentionally\n", 1409 | " bypass the cache. SQL logging will indicate statistics for each\n", 1410 | " statement whether or not it were pull from the cache.\n", 1411 | "\n", 1412 | " .. note:: some ORM functions related to unit-of-work persistence as well\n", 1413 | " as some attribute loading strategies will make use of individual\n", 1414 | " per-mapper caches outside of the main cache.\n", 1415 | "\n", 1416 | "\n", 1417 | " .. seealso::\n", 1418 | "\n", 1419 | " :ref:`sql_caching`\n", 1420 | "\n", 1421 | " .. versionadded:: 1.4\n", 1422 | "\n", 1423 | " :param use_insertmanyvalues: True by default, use the \"insertmanyvalues\"\n", 1424 | " execution style for INSERT..RETURNING statements by default.\n", 1425 | "\n", 1426 | " .. 
versionadded:: 2.0\n", 1427 | "\n", 1428 | " .. seealso::\n", 1429 | "\n", 1430 | " :ref:`engine_insertmanyvalues`\n", 1431 | "\n" 1432 | ] 1433 | } 1434 | ], 1435 | "source": [ 1436 | "help(create_engine)" 1437 | ] 1438 | }, 1439 | { 1440 | "cell_type": "code", 1441 | "execution_count": null, 1442 | "metadata": {}, 1443 | "outputs": [ 1444 | { 1445 | "name": "stdout", 1446 | "output_type": "stream", 1447 | "text": [ 1448 | "Connection Successed to mysql\n" 1449 | ] 1450 | } 1451 | ], 1452 | "source": [ 1453 | "#mysql connection\n", 1454 | "# \"mysql+pymysql://user:password@localhost:3306/db_name\"\n", 1455 | "engine_mysql = create_engine(\"mysql+pymysql://root@localhost:3306/walmart_db\")\n", 1456 | "\n", 1457 | "try:\n", 1458 | " engine_mysql\n", 1459 | " print(\"Connection Successed to mysql\")\n", 1460 | "except:\n", 1461 | " print(\"Unable to connect\")" 1462 | ] 1463 | }, 1464 | { 1465 | "cell_type": "code", 1466 | "execution_count": null, 1467 | "metadata": {}, 1468 | "outputs": [ 1469 | { 1470 | "data": { 1471 | "text/plain": [ 1472 | "9969" 1473 | ] 1474 | }, 1475 | "execution_count": 44, 1476 | "metadata": {}, 1477 | "output_type": "execute_result" 1478 | } 1479 | ], 1480 | "source": [ 1481 | "df.to_sql(name='walmart', con=engine_mysql, if_exists='append', index=False)" 1482 | ] 1483 | }, 1484 | { 1485 | "cell_type": "code", 1486 | "execution_count": null, 1487 | "metadata": {}, 1488 | "outputs": [ 1489 | { 1490 | "data": { 1491 | "text/plain": [ 1492 | "(9969, 12)" 1493 | ] 1494 | }, 1495 | "execution_count": 45, 1496 | "metadata": {}, 1497 | "output_type": "execute_result" 1498 | } 1499 | ], 1500 | "source": [ 1501 | "df.shape" 1502 | ] 1503 | }, 1504 | { 1505 | "cell_type": "code", 1506 | "execution_count": null, 1507 | "metadata": {}, 1508 | "outputs": [ 1509 | { 1510 | "name": "stdout", 1511 | "output_type": "stream", 1512 | "text": [ 1513 | "Connection Successed to PSQL\n" 1514 | ] 1515 | } 1516 | ], 1517 | "source": [ 1518 | "#psql connection\n", 
1519 | "# \"mysql+pymysql://user:password@localhost:3306/db_name\"\n", 1520 | "engine_psql = create_engine(\"postgresql+psycopg2://postgres:x0000@localhost:5432/walmart_db\")\n", 1521 | "\n", 1522 | "try:\n", 1523 | " engine_psql\n", 1524 | " print(\"Connection Successed to PSQL\")\n", 1525 | "except:\n", 1526 | " print(\"Unable to connect\")" 1527 | ] 1528 | }, 1529 | { 1530 | "cell_type": "code", 1531 | "execution_count": null, 1532 | "metadata": {}, 1533 | "outputs": [ 1534 | { 1535 | "data": { 1536 | "text/plain": [ 1537 | "969" 1538 | ] 1539 | }, 1540 | "execution_count": 113, 1541 | "metadata": {}, 1542 | "output_type": "execute_result" 1543 | } 1544 | ], 1545 | "source": [ 1546 | "df.to_sql(name='walmart', con=engine_psql, if_exists='replace', index=False)" 1547 | ] 1548 | }, 1549 | { 1550 | "cell_type": "code", 1551 | "execution_count": 17, 1552 | "metadata": {}, 1553 | "outputs": [], 1554 | "source": [ 1555 | "df.to_csv('walmart_clean_data.csv', index=False)" 1556 | ] 1557 | } 1558 | ], 1559 | "metadata": { 1560 | "kernelspec": { 1561 | "display_name": "my_env1", 1562 | "language": "python", 1563 | "name": "python3" 1564 | }, 1565 | "language_info": { 1566 | "codemirror_mode": { 1567 | "name": "ipython", 1568 | "version": 3 1569 | }, 1570 | "file_extension": ".py", 1571 | "mimetype": "text/x-python", 1572 | "name": "python", 1573 | "nbconvert_exporter": "python", 1574 | "pygments_lexer": "ipython3", 1575 | "version": "3.12.6" 1576 | } 1577 | }, 1578 | "nbformat": 4, 1579 | "nbformat_minor": 2 1580 | } 1581 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | pymysql 3 | sqlalchemy 4 | psycopg2 5 | 6 | 7 | #Instructions & Termnal Commands 8 | 9 | 10 | # Environment Setup 11 | 12 | ## macOS Terminal Commands 13 | ```bash 14 | # Install Homebrew if not installed 15 | /bin/bash -c "$(curl -fsSL 
https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" 16 | 17 | # Install Python (if not installed) 18 | brew install python 19 | 20 | # Install pip (Python package installer) 21 | sudo easy_install pip 22 | ``` 23 | 24 | ## Windows Command Prompt Commands 25 | ```cmd 26 | :: Install Python from the official website if not installed. 27 | :: Make sure to check "Add Python to PATH" during installation. 28 | 29 | :: Install pip if not installed (comes with Python installations). 30 | python -m ensurepip --upgrade 31 | ``` 32 | 33 | # Kaggle API Setup 34 | 35 | ## Both macOS and Windows Commands 36 | ```bash 37 | # Create a Kaggle account if you don't have one. 38 | # Go to your account settings and click on "Create New API Token". 39 | # This will download a file called kaggle.json. 40 | 41 | # Create a directory for Kaggle configuration 42 | mkdir ~/.kaggle # macOS 43 | mkdir %USERPROFILE%\.kaggle # Windows 44 | 45 | # Move kaggle.json to the Kaggle directory 46 | # Use the following commands to copy the kaggle.json file: 47 | mv ~/Downloads/kaggle.json ~/.kaggle/ # macOS 48 | copy %USERPROFILE%\Downloads\kaggle.json %USERPROFILE%\.kaggle\ # Windows 49 | 50 | # Set the permissions for kaggle.json 51 | chmod 600 ~/.kaggle/kaggle.json # macOS 52 | ``` 53 | 54 | # Download Datasets 55 | 56 | ## Both macOS and Windows Commands 57 | ```bash 58 | # Navigate to the directory where you want to download the dataset 59 | cd path/to/your/directory 60 | 61 | # Example: Download a dataset (replace 'dataset-name' with the actual dataset slug) 62 | kaggle datasets download -d dataset-name 63 | 64 | # Unzip the dataset 65 | unzip dataset-name.zip # This will extract the files in the current directory 66 | ``` 67 | 68 | # Install Required Libraries 69 | 70 | ## Both macOS and Windows Commands 71 | ```bash 72 | # Install the necessary Python libraries 73 | pip install pandas numpy matplotlib seaborn scikit-learn 74 | ``` 75 | 76 | # Summary of Commands 77 | 1. 
**Environment Setup:** Install Homebrew (macOS) or Python (Windows). 78 | 2. **Kaggle API Setup:** Generate API token and configure kaggle.json. 79 | 3. **Download Datasets:** Use Kaggle API commands to download datasets and unzip them. 80 | 4. **Install Libraries:** Use pip to install required libraries for data analysis. 81 | 82 | 83 | -------------------------------------------------------------------------------- /walmart_project-piplelines.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/najirh/Walmart_SQL_Python/af425b2ddc63f298ab0b91707779ea0d5ce78465/walmart_project-piplelines.png --------------------------------------------------------------------------------