├── Procfile
├── logo.jpg
├── requirements.txt
├── .github
│   └── workflows
│       └── jekyll-gh-pages.yml
├── TODO.md
├── styles.css
├── result.html
├── README.md
├── index.html
└── app.py


/Procfile:
--------------------------------------------------------------------------------
1 | web: gunicorn app:app
2 | 


--------------------------------------------------------------------------------
/logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Madhuarvind/Data-preprocessing/HEAD/logo.jpg


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask
2 | pandas
3 | matplotlib
4 | seaborn
5 | plotly
6 | openpyxl
7 | scikit-learn
8 | gunicorn
9 | 


--------------------------------------------------------------------------------
/.github/workflows/jekyll-gh-pages.yml:
--------------------------------------------------------------------------------
 1 | # Builds the repository with Jekyll and publishes the output to GitHub Pages.
 2 | name: Deploy Jekyll with GitHub Pages dependencies preinstalled
 3 | 
 4 | on:
 5 |   # Run whenever the default branch receives a push.
 6 |   push:
 7 |     branches:
 8 |       - main
 9 | 
10 |   # Permit manual runs from the Actions tab.
11 |   workflow_dispatch:
12 | 
13 | # GITHUB_TOKEN scopes required to publish to GitHub Pages.
14 | permissions:
15 |   contents: read
16 |   pages: write
17 |   id-token: write
18 | 
19 | # Keep a single deployment active at a time: runs queued between the one in
20 | # progress and the newest one are skipped, but a run that has already started
21 | # is never cancelled, so production deployments always finish.
22 | concurrency:
23 |   group: pages
24 |   cancel-in-progress: false
25 | 
26 | jobs:
27 |   # Build the site and upload it as a Pages artifact.
28 |   build:
29 |     runs-on: ubuntu-latest
30 |     steps:
31 |       - name: Checkout
32 |         uses: actions/checkout@v4
33 |       - name: Setup Pages
34 |         uses: actions/configure-pages@v5
35 |       - name: Build with Jekyll
36 |         uses: actions/jekyll-build-pages@v1
37 |         with:
38 |           source: ./
39 |           destination: ./_site
40 |       - name: Upload artifact
41 |         uses: actions/upload-pages-artifact@v3
42 | 
43 |   # Publish the artifact produced by the build job.
44 |   deploy:
45 |     needs: build
46 |     runs-on: ubuntu-latest
47 |     environment:
48 |       name: github-pages
49 |       url: ${{ steps.deployment.outputs.page_url }}
50 |     steps:
51 |       - name: Deploy to GitHub Pages
52 |         id: deployment
53 |         uses: actions/deploy-pages@v4
54 | 


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
 1 | # Improvement Plan for Data Preprocessing Project
 2 | 
 3 | ## Logical Steps:
 4 | 
 5 | 1. [x] **Update requirements.txt**: Add new dependencies for advanced features (plotly for interactive charts, openpyxl for Excel support, scikit-learn for preprocessing like outlier removal and scaling).
 6 | 
 7 | 2. [x] **Enhance app.py**:
 8 |    - [x] Add support for multiple file formats (CSV and Excel).
 9 |    - [x] Implement advanced preprocessing options: outlier removal (IQR), normalization (MinMaxScaler), categorical encoding (one-hot), duplicate removal.
10 |    - [x] Generate summary statistics tables for original and cleaned data.
11 |    - [x] Switch chart generation to Plotly for interactive visualizations (comparison subplots).
12 |    - [x] Add file validation (size limit 10MB, check if CSV/Excel).
13 |    - [x] Save both original and cleaned datasets for download.
14 |    - [x] Improve error handling.
15 | 
16 | 3. [x] **Update index.html**: Add form elements for advanced preprocessing options (checkboxes for outliers, normalize, encode, duplicates). Update file input to accept .csv and .xlsx.
17 | 
18 | 4. [x] **Update result.html**: Add sections for summary statistics (HTML tables), download links for original and cleaned files, embed Plotly interactive charts (using plotly.js CDN).
19 | 
20 | 5. [x] **Update styles.css**: Add styles for new UI elements (preprocessing options, summary tables, interactive chart containers).
21 | 
22 | 6. [x] **Update README.md**: Expand with setup instructions, new features list, usage guide.
23 | 
24 | 7. [ ] **Install dependencies**: Run pip install -r requirements.txt.
25 | 
26 | 8. [ ] **Test the application**:
27 |    - [ ] Run Flask app.
28 |    - [ ] Upload sample CSV/Excel files.
29 |    - [ ] Select preprocessing options and charts.
30 |    - [ ] Verify outputs: cleaned data, summaries, interactive charts, downloads.
31 |    - [ ] Use browser to interact and confirm.
32 | 
33 | 9. [ ] **Handle any issues**: Debug errors from testing, update files as needed.
34 | 


--------------------------------------------------------------------------------
/styles.css:
--------------------------------------------------------------------------------
  1 | /* Page background and base text colour */
  2 | body {
  3 |     color: #333333;
  4 |     /* The gradient stands in for the missing background.jpg */
  5 |     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
  6 | }
  7 | 
  8 | /* Hero banner at the top of each page */
  9 | .hero {
 10 |     margin-bottom: 20px;
 11 |     padding: 40px;
 12 |     border-radius: 4px;
 13 |     background-color: rgba(44, 160, 243, 0.8); /* semi-transparent blue */
 14 |     color: white;
 15 |     text-align: center;
 16 | }
 17 | 
 18 | /* Upload form panel */
 19 | .upload-section {
 20 |     margin-top: 20px;
 21 |     padding: 20px;
 22 |     border-radius: 10px;
 23 |     background: rgba(255, 255, 255, 0.1);
 24 |     text-align: center;
 25 | }
 26 | 
 27 | /* Preprocessing checkboxes, stacked vertically */
 28 | .preprocess-options {
 29 |     display: flex;
 30 |     flex-direction: column;
 31 |     gap: 10px;
 32 |     margin-bottom: 20px;
 33 | }
 34 | 
 35 | .preprocess-options .form-check {
 36 |     text-align: left;
 37 | }
 38 | 
 39 | /* Pill-shaped action buttons (submit button and download links) */
 40 | .upload-button,
 41 | .download-button {
 42 |     display: inline-block;
 43 |     padding: 12px 24px;
 44 |     border: none;
 45 |     border-radius: 25px;
 46 |     background-color: #4a90e2;
 47 |     color: white;
 48 |     font-size: 1.1rem;
 49 |     text-decoration: none;
 50 |     transition: background-color 0.3s ease;
 51 | }
 52 | 
 53 | /* Darker shade on hover */
 54 | .upload-button:hover,
 55 | .download-button:hover {
 56 |     background-color: #357ab8;
 57 | }
 58 | 
 59 | /* Download links on the results page */
 60 | .download-section {
 61 |     margin-top: 20px;
 62 |     text-align: center;
 63 | }
 64 | 
 65 | /* Summary statistics panel */
 66 | .summary-section {
 67 |     margin-top: 20px;
 68 |     padding: 20px;
 69 |     border-radius: 10px;
 70 |     background: rgba(255, 255, 255, 0.1);
 71 | }
 72 | 
 73 | /* Summary tables: white cards with rounded, clipped corners */
 74 | .summary-section table {
 75 |     overflow: hidden;
 76 |     border-radius: 8px;
 77 |     box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
 78 |     background: white;
 79 |     color: #333333;
 80 | }
 81 | 
 82 | /* Chart area on the results page */
 83 | .visualization-section {
 84 |     margin-top: 40px;
 85 | }
 86 | 
 87 | /* Fixed-height box that each chart is rendered into */
 88 | .chart-container {
 89 |     width: 100%;
 90 |     height: 400px;
 91 |     border-radius: 12px;
 92 |     box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
 93 | }
 94 | 
 95 | .visualization {
 96 |     margin-bottom: 20px;
 97 |     text-align: center;
 98 | }
 99 | 
100 | /* Page footer */
101 | footer {
102 |     margin-top: 30px;
103 |     padding: 15px 0;
104 |     background-color: rgba(240, 244, 248, 0.8); /* semi-transparent */
105 |     text-align: center;
106 | }
107 | 


--------------------------------------------------------------------------------
/result.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <head>
 4 |     <meta charset="UTF-8">
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 6 |     <title>Preprocessing Results</title>
 7 | 
 8 |     <!-- Bootstrap CSS via CDN (stable release instead of the 5.3.0 alpha) -->
 9 |     <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css">
10 |     <link rel="stylesheet" href="{{ url_for('static', filename='css/styles.css') }}">
11 | </head>
12 | <body>
13 | 
14 |     <div class="container mt-4">
15 |         <!-- Hero Section -->
16 |         <div class="hero shadow-lg">
17 |             <h1>Preprocessing Complete</h1>
18 |             <p>Your data has been successfully processed.</p>
19 |         </div>
20 | 
21 |         <!-- Download Section -->
22 |         <div class="download-section">
23 |             <h2>Download Datasets</h2>
24 |             <a href="{{ original_file }}" class="download-button me-3" download>Download Original Dataset</a>
25 |             <a href="{{ cleaned_file }}" class="download-button" download>Download Cleaned Dataset</a>
26 |         </div>
27 | 
28 |         <!-- Summary Statistics Section -->
29 |         <div class="summary-section mt-5">
30 |             <h2>Summary Statistics</h2>
31 |             <div class="row">
32 |                 <div class="col-md-6">
33 |                     <h3>Original Data</h3>
34 |                     <!-- table-responsive lets wide summaries scroll instead of overflowing the column -->
35 |                     <div class="table-responsive">{{ original_summary | safe }}</div>
36 |                 </div>
37 |                 <div class="col-md-6">
38 |                     <h3>Cleaned Data</h3>
39 |                     <div class="table-responsive">{{ cleaned_summary | safe }}</div>
40 |                 </div>
41 |             </div>
42 |         </div>
43 | 
44 |         <!-- Visualization Section -->
45 |         <div class="visualization-section mt-5">
46 |             {% if charts %}
47 |                 <h2>Generated Charts (Before and After)</h2>
48 |                 <div class="row">
49 |                     {% for chart in charts %}
50 |                     <div class="col-md-6 visualization">
51 |                         <h3>{{ chart.name }}</h3>
52 |                         <div class="chart-container">{{ chart.html | safe }}</div>
53 |                     </div>
54 |                     {% endfor %}
55 |                 </div>
56 |             {% elif charts_selected %}
57 |                 {# Charts were requested but the dataset had no column suitable for any of them #}
58 |                 <p>The selected charts could not be generated for this dataset.</p>
59 |             {% else %}
60 |                 <p>No charts selected for visualization.</p>
61 |             {% endif %}
62 |         </div>
63 |     </div>
64 | 
65 |     <!-- Footer -->
66 |     <footer>
67 |         <p>&copy; 2024 Data Cleaner. All rights reserved.</p>
68 |     </footer>
69 | 
70 |     <!-- Plotly.js is not loaded here: the chart markup above brings its own CDN script tag,
71 |          and a copy placed at the end of the body would only load after the charts' inline scripts. -->
72 |     <!-- Bootstrap JS and dependencies via CDN -->
73 |     <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
74 | 
75 | </body>
76 | </html>
77 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Data Preprocessing Application
 2 | 
 3 | ## Overview
 4 | This is an advanced web application for data preprocessing and visualization built with Flask. It allows users to upload CSV or Excel datasets, apply various preprocessing techniques, generate summary statistics, and create interactive charts for comparison between original and cleaned data.
 5 | 
 6 | ## Features
 7 | - **File Upload Support**: Upload CSV and Excel (.xlsx) files (max 10MB).
 8 | - **Preprocessing Options**:
 9 |   - Remove missing values (default).
10 |   - Remove duplicates.
11 |   - Remove outliers using IQR method for numerical columns.
12 |   - Normalize numerical columns using MinMaxScaler.
13 |   - One-hot encode categorical columns.
14 | - **Summary Statistics**: Generate and display descriptive statistics for original and cleaned datasets.
15 | - **Interactive Visualizations**: Select from Histogram, Box Plot, Scatter Plot, Line Chart, Bar Chart, Pie Chart. Uses Plotly for interactive charts comparing original vs. cleaned data.
16 | - **Downloads**: Download original and cleaned datasets as CSV.
17 | - **User-Friendly UI**: Bootstrap-based interface with custom styling.
18 | 
19 | ## Setup Instructions
20 | 1. **Clone or Download** the project.
21 | 2. **Install Dependencies**:
22 |    ```
23 |    pip install -r requirements.txt
24 |    ```
25 |    Required libraries: Flask, pandas, matplotlib, seaborn, plotly, openpyxl, scikit-learn, gunicorn.
26 | 3. **Run the Application**:
27 |    ```
28 |    python app.py
29 |    ```
30 |    The app will start on http://127.0.0.1:5000.
31 | 4. **Access the App**: Open your browser and go to http://127.0.0.1:5000.
32 | 
33 | ## Usage
34 | 1. **Upload Dataset**: Select a CSV or Excel file.
35 | 2. **Select Preprocessing Options**: Choose from the checkboxes (Remove Missing Values is enabled by default).
36 | 3. **Select Charts**: Check the visualizations you want to generate.
37 | 4. **Submit**: Click "Upload and Generate Charts".
38 | 5. **View Results**: 
39 |    - Download original or cleaned datasets.
40 |    - Review summary statistics tables.
41 |    - Interact with generated charts (before and after preprocessing).
42 | 
43 | ## Project Structure
44 | - `app.py`: Main Flask application with routes, preprocessing logic, and chart generation.
45 | - `templates/index.html`: Upload form with preprocessing and chart options.
46 | - `templates/result.html`: Results page with downloads, summaries, and charts.
47 | - `static/css/styles.css`: Custom styles for the UI.
48 | - `static/charts/`: Directory for chart files (auto-created).
49 | - `requirements.txt`: Python dependencies.
50 | - `logo.jpg`: App logo (optional).
51 | 
52 | ## Notes
53 | - Charts are generated based on the first suitable columns (numerical for most, categorical for pie).
54 | - Scatter plots require at least two numerical columns.
55 | - The app reports an error message for invalid files, empty datasets, and processing errors.
56 | - Interactive charts require an internet connection for Plotly CDN.
57 | 
58 | ## Deployment Instructions
59 | To deploy the application to a cloud platform, follow these steps for Heroku (note: Heroku discontinued its free tier in November 2022, so a paid plan is required):
60 | 
61 | 1. **Install Heroku CLI**: Download and install from https://devcenter.heroku.com/articles/heroku-cli.
62 | 
63 | 2. **Prepare the App for Production**:
64 |    - Add `gunicorn` to `requirements.txt` (already included).
65 |    - Create a `Procfile` in the root directory with the content:
66 |      ```
67 |      web: gunicorn app:app
68 |      ```
69 |    - Ensure `app.py` has the following at the end:
70 |      ```python
71 |      if __name__ == '__main__':
72 |          app.run()
73 |      ```
74 |      (Already present.)
75 | 
76 | 3. **Deploy to Heroku**:
77 |    - Login to Heroku: `heroku login`
78 |    - Create a new app: `heroku create your-app-name`
79 |    - Push the code: `git add . && git commit -m "Initial commit" && git push heroku main`
80 |    - Open the app: `heroku open`
81 | 
82 | For other platforms like Render or Railway, create an account, connect your GitHub repo (push the code to GitHub first), and deploy as a web service.
83 | 
84 | ## Future Improvements
85 | - Support for more file formats (JSON, etc.).
86 | - Advanced preprocessing (feature selection, imputation methods).
87 | - Export charts as images/PDF.
88 | - User authentication and session management.
89 | 
90 | &copy; 2024 Data Cleaner. All rights reserved.
91 | 


--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="en">
 3 | <head>
 4 |     <meta charset="UTF-8">
 5 |     <meta name="viewport" content="width=device-width, initial-scale=1.0">
 6 |     <title>Data Preprocessing Application</title>
 7 | 
 8 |     <!-- Bootstrap CSS via CDN (stable release instead of the 5.3.0 alpha) -->
 9 |     <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css">
10 |     <link rel="stylesheet" href="{{ url_for('static', filename='css/styles.css') }}">
11 | </head>
12 | <body>
13 | 
14 |     <div class="container mt-4">
15 |         <!-- Hero Section with Logo -->
16 |         <div class="hero shadow-lg text-center">
17 |             <img src="{{ url_for('static', filename='logo.jpg') }}" alt="App Logo" class="mb-4" style="width: 150px;">
18 |             <h1>Data Preprocessing Application</h1>
19 |             <p>Upload your dataset and select charts for visualization.</p>
20 |         </div>
21 | 
22 |         <!-- Upload Section -->
23 |         <div class="upload-section">
24 |             <h2>Choose your Dataset (CSV or Excel)</h2>
25 |             {# url_for keeps the form target correct even if the app is served under a URL prefix #}
26 |             <form action="{{ url_for('upload_file') }}" method="post" enctype="multipart/form-data" class="mt-4">
27 |                 <div class="mb-3">
28 |                     <input type="file" class="form-control" id="file" name="file" accept=".csv,.xlsx" required>
29 |                 </div>
30 | 
31 |                 <h2>Preprocessing Options</h2>
32 |                 <div class="preprocess-options mb-4">
33 |                     {# (form value, label, ticked by default) #}
34 |                     {% for value, label, preselected in [
35 |                         ('remove_na', 'Remove Missing Values', true),
36 |                         ('remove_duplicates', 'Remove Duplicates', false),
37 |                         ('remove_outliers', 'Remove Outliers (IQR)', false),
38 |                         ('normalize', 'Normalize Numerical Columns', false),
39 |                         ('encode_categoricals', 'Encode Categorical Columns (One-Hot)', false)
40 |                     ] %}
41 |                     <div class="form-check">
42 |                         <input class="form-check-input" type="checkbox" name="preprocess" value="{{ value }}" id="{{ value }}"{% if preselected %} checked{% endif %}>
43 |                         <label class="form-check-label" for="{{ value }}">{{ label }}</label>
44 |                     </div>
45 |                     {% endfor %}
46 |                 </div>
47 | 
48 |                 <h2>Select Charts to Visualize</h2>
49 |                 <div class="chart-options">
50 |                     {# (form value, label) #}
51 |                     {% for value, label in [
52 |                         ('histogram', 'Histogram'),
53 |                         ('boxplot', 'Box Plot'),
54 |                         ('scatter', 'Scatter Plot'),
55 |                         ('line', 'Line Chart'),
56 |                         ('bar', 'Bar Chart'),
57 |                         ('pie', 'Pie Chart')
58 |                     ] %}
59 |                     <div class="form-check">
60 |                         <input class="form-check-input" type="checkbox" name="charts" value="{{ value }}" id="{{ value }}">
61 |                         <label class="form-check-label" for="{{ value }}">{{ label }}</label>
62 |                     </div>
63 |                     {% endfor %}
64 |                 </div>
65 | 
66 |                 <button type="submit" class="upload-button mt-4">Upload and Generate Charts</button>
67 |             </form>
68 |         </div>
69 |     </div>
70 | 
71 |     <!-- Footer -->
72 |     <footer>
73 |         <p>&copy; 2024 Data Cleaner. All rights reserved.</p>
74 |     </footer>
75 | 
76 |     <!-- Bootstrap JS and dependencies via CDN -->
77 |     <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>
78 | 
79 | </body>
80 | </html>
81 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
  1 | from flask import Flask, request, render_template, url_for
  2 | import pandas as pd
  3 | import os
  4 | import io
  5 | from sklearn.preprocessing import MinMaxScaler
  6 | import plotly.express as px
  7 | import plotly.graph_objects as go
  8 | from plotly.subplots import make_subplots
  9 | 
 10 | app = Flask(__name__)
 11 | 
 12 | # Ensure the static folder is created for saving charts
 13 | os.makedirs('static/charts', exist_ok=True)
 14 | 
 15 | @app.route('/')
 16 | def index():
 17 |     return render_template('index.html')
 18 | 
 19 | @app.route('/upload', methods=['POST'])
 20 | def upload_file():
 21 |     if 'file' not in request.files:
 22 |         return "No file uploaded", 400
 23 |     file = request.files['file']
 24 |     if file.filename == '':
 25 |         return "No file selected", 400
 26 | 
 27 |     # File size limit: 10MB
 28 |     file.seek(0, 2)
 29 |     file_size = file.tell()
 30 |     file.seek(0)
 31 |     if file_size > 10 * 1024 * 1024:
 32 |         return "File too large. Maximum size is 10MB.", 400
 33 | 
 34 |     # Check file extension
 35 |     filename = file.filename.lower()
 36 |     if not (filename.endswith('.csv') or filename.endswith('.xlsx')):
 37 |         return "Unsupported file format. Please upload CSV or Excel files.", 400
 38 | 
 39 |     try:
 40 |         # Load the dataset
 41 |         if filename.endswith('.csv'):
 42 |             df = pd.read_csv(file)
 43 |         else:
 44 |             df = pd.read_excel(file)
 45 | 
 46 |         if df.empty:
 47 |             return "Uploaded file is empty.", 400
 48 | 
 49 |         # Get preprocessing options from form
 50 |         preprocess_options = request.form.getlist('preprocess')
 51 |         selected_charts = request.form.getlist('charts')
 52 | 
 53 |         # Save original dataset
 54 |         original_file_path = 'static/original_dataset.csv'
 55 |         df.to_csv(original_file_path, index=False)
 56 | 
 57 |         # Preprocess the dataset
 58 |         df_cleaned = df.copy()
 59 | 
 60 |         # Default: remove missing values
 61 |         if 'remove_na' in preprocess_options or not preprocess_options:
 62 |             df_cleaned = df_cleaned.dropna()
 63 | 
 64 |         # Remove duplicates
 65 |         if 'remove_duplicates' in preprocess_options:
 66 |             df_cleaned = df_cleaned.drop_duplicates()
 67 | 
 68 |         # Remove outliers (IQR method for numerical columns)
 69 |         if 'remove_outliers' in preprocess_options:
 70 |             numerical_cols = df_cleaned.select_dtypes(include=['number']).columns
 71 |             for col in numerical_cols:
 72 |                 Q1 = df_cleaned[col].quantile(0.25)
 73 |                 Q3 = df_cleaned[col].quantile(0.75)
 74 |                 IQR = Q3 - Q1
 75 |                 lower_bound = Q1 - 1.5 * IQR
 76 |                 upper_bound = Q3 + 1.5 * IQR
 77 |                 df_cleaned = df_cleaned[(df_cleaned[col] >= lower_bound) & (df_cleaned[col] <= upper_bound)]
 78 | 
 79 |         # Normalize numerical columns
 80 |         if 'normalize' in preprocess_options:
 81 |             numerical_cols = df_cleaned.select_dtypes(include=['number']).columns
 82 |             if len(numerical_cols) > 0:
 83 |                 scaler = MinMaxScaler()
 84 |                 df_cleaned[numerical_cols] = scaler.fit_transform(df_cleaned[numerical_cols])
 85 | 
 86 |         # Encode categorical columns (one-hot encoding)
 87 |         if 'encode_categoricals' in preprocess_options:
 88 |             categorical_cols = df_cleaned.select_dtypes(include=['object']).columns
 89 |             for col in categorical_cols:
 90 |                 df_cleaned = pd.get_dummies(df_cleaned, columns=[col], prefix=col, drop_first=True)
 91 | 
 92 |         # Save the cleaned dataset
 93 |         cleaned_file_path = 'static/cleaned_dataset.csv'
 94 |         df_cleaned.to_csv(cleaned_file_path, index=False)
 95 | 
 96 |         # Generate summaries
 97 |         original_summary = df.describe(include='all').to_html(classes='table table-striped')
 98 |         cleaned_summary = df_cleaned.describe(include='all').to_html(classes='table table-striped')
 99 | 
100 |         # Render the results page
101 |         return render_template(
102 |             'result.html',
103 |             original_file=original_file_path,
104 |             cleaned_file=cleaned_file_path,
105 |             original_summary=original_summary,
106 |             cleaned_summary=cleaned_summary,
107 |             charts_selected=bool(selected_charts),
108 |             charts=generate_comparison_charts(df, df_cleaned, selected_charts)
109 |         )
110 |     except Exception as e:
111 |         return f"Error processing file: {str(e)}", 500
112 | 
113 | def generate_comparison_charts(df_original, df_cleaned, selected_charts):
114 |     charts = []
115 | 
116 |     numerical_cols = df_original.select_dtypes(include=['number']).columns.tolist()
117 |     categorical_cols = df_original.select_dtypes(include=['object']).columns.tolist()
118 | 
119 |     # Helper to get first suitable column
120 |     def get_first_num_col():
121 |         return numerical_cols[0] if numerical_cols else None
122 | 
123 |     def get_first_cat_col():
124 |         return categorical_cols[0] if categorical_cols else None
125 | 
126 |     def get_two_num_cols():
127 |         return numerical_cols[:2] if len(numerical_cols) >= 2 else None
128 | 
129 |     # Generate Histogram
130 |     if 'histogram' in selected_charts and numerical_cols:
131 |         col = get_first_num_col()
132 |         fig_orig = px.histogram(df_original, x=col, title=f'Original Data - Histogram ({col})', marginal='rug')
133 |         html_orig = fig_orig.to_html(full_html=False, include_plotlyjs='cdn')
134 |         charts.append({'name': f'Original Histogram ({col})', 'html': html_orig})
135 | 
136 |         fig_clean = px.histogram(df_cleaned, x=col, title=f'Cleaned Data - Histogram ({col})', marginal='rug')
137 |         html_clean = fig_clean.to_html(full_html=False, include_plotlyjs='cdn')
138 |         charts.append({'name': f'Cleaned Histogram ({col})', 'html': html_clean})
139 | 
140 |     # Generate Box Plot
141 |     if 'boxplot' in selected_charts and numerical_cols:
142 |         col = get_first_num_col()
143 |         fig_orig = px.box(df_original, y=col, title=f'Original Data - Box Plot ({col})')
144 |         html_orig = fig_orig.to_html(full_html=False, include_plotlyjs='cdn')
145 |         charts.append({'name': f'Original Box Plot ({col})', 'html': html_orig})
146 | 
147 |         fig_clean = px.box(df_cleaned, y=col, title=f'Cleaned Data - Box Plot ({col})')
148 |         html_clean = fig_clean.to_html(full_html=False, include_plotlyjs='cdn')
149 |         charts.append({'name': f'Cleaned Box Plot ({col})', 'html': html_clean})
150 | 
151 |     # Generate Scatter Plot
152 |     if 'scatter' in selected_charts and len(numerical_cols) >= 2:
153 |         cols = get_two_num_cols()
154 |         fig_orig = px.scatter(df_original, x=cols[0], y=cols[1], title=f'Original Data - Scatter Plot ({cols[0]} vs {cols[1]})')
155 |         html_orig = fig_orig.to_html(full_html=False, include_plotlyjs='cdn')
156 |         charts.append({'name': f'Original Scatter Plot ({cols[0]} vs {cols[1]})', 'html': html_orig})
157 | 
158 |         fig_clean = px.scatter(df_cleaned, x=cols[0], y=cols[1], title=f'Cleaned Data - Scatter Plot ({cols[0]} vs {cols[1]})')
159 |         html_clean = fig_clean.to_html(full_html=False, include_plotlyjs='cdn')
160 |         charts.append({'name': f'Cleaned Scatter Plot ({cols[0]} vs {cols[1]})', 'html': html_clean})
161 | 
162 |     # Generate Line Chart
163 |     if 'line' in selected_charts and numerical_cols:
164 |         col = get_first_num_col()
165 |         fig_orig = px.line(df_original, x=df_original.index, y=col, title=f'Original Data - Line Chart ({col})')
166 |         html_orig = fig_orig.to_html(full_html=False, include_plotlyjs='cdn')
167 |         charts.append({'name': f'Original Line Chart ({col})', 'html': html_orig})
168 | 
169 |         fig_clean = px.line(df_cleaned, x=df_cleaned.index, y=col, title=f'Cleaned Data - Line Chart ({col})')
170 |         html_clean = fig_clean.to_html(full_html=False, include_plotlyjs='cdn')
171 |         charts.append({'name': f'Cleaned Line Chart ({col})', 'html': html_clean})
172 | 
173 |     # Generate Bar Chart
174 |     if 'bar' in selected_charts and numerical_cols:
175 |         col = get_first_num_col()
176 |         fig_orig = px.bar(df_original.head(10), x=df_original.index[:10], y=col, title='Original Data - Bar Chart (First 10 rows)')
177 |         html_orig = fig_orig.to_html(full_html=False, include_plotlyjs='cdn')
178 |         charts.append({'name': 'Original Bar Chart', 'html': html_orig})
179 | 
180 |         fig_clean = px.bar(df_cleaned.head(10), x=df_cleaned.index[:10], y=col, title='Cleaned Data - Bar Chart (First 10 rows)')
181 |         html_clean = fig_clean.to_html(full_html=False, include_plotlyjs='cdn')
182 |         charts.append({'name': 'Cleaned Bar Chart', 'html': html_clean})
183 | 
184 |     # Generate Pie Chart
185 |     if 'pie' in selected_charts and categorical_cols:
186 |         col = get_first_cat_col()
187 |         fig_orig = px.pie(df_original, names=col, title=f'Original Data - Pie Chart ({col})')
188 |         html_orig = fig_orig.to_html(full_html=False, include_plotlyjs='cdn')
189 |         charts.append({'name': f'Original Pie Chart ({col})', 'html': html_orig})
190 | 
191 |         fig_clean = px.pie(df_cleaned, names=col, title=f'Cleaned Data - Pie Chart ({col})')
192 |         html_clean = fig_clean.to_html(full_html=False, include_plotlyjs='cdn')
193 |         charts.append({'name': f'Cleaned Pie Chart ({col})', 'html': html_clean})
194 | 
195 |     return charts
196 | 
197 | if __name__ == '__main__':
198 |     app.run(debug=True)
199 | 


--------------------------------------------------------------------------------