├── Procfile
├── logo.jpg
├── requirements.txt
├── .github
└── workflows
│ └── jekyll-gh-pages.yml
├── TODO.md
├── styles.css
├── result.html
├── README.md
├── index.html
└── app.py
/Procfile:
--------------------------------------------------------------------------------
1 | web: gunicorn app:app
2 |
--------------------------------------------------------------------------------
/logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Madhuarvind/Data-preprocessing/HEAD/logo.jpg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Flask
2 | pandas
3 | matplotlib
4 | seaborn
5 | plotly
6 | openpyxl
7 | scikit-learn
8 | gunicorn
9 |
--------------------------------------------------------------------------------
/.github/workflows/jekyll-gh-pages.yml:
--------------------------------------------------------------------------------
1 | # Sample workflow for building and deploying a Jekyll site to GitHub Pages
2 | name: Deploy Jekyll with GitHub Pages dependencies preinstalled
3 |
4 | on:
5 | # Runs on pushes targeting the default branch
6 | push:
7 | branches: ["main"]
8 |
9 | # Allows you to run this workflow manually from the Actions tab
10 | workflow_dispatch:
11 |
12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
13 | permissions:
14 | contents: read
15 | pages: write
16 | id-token: write
17 |
18 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
19 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
20 | concurrency:
21 | group: "pages"
22 | cancel-in-progress: false
23 |
24 | jobs:
25 | # Build job
26 | build:
27 | runs-on: ubuntu-latest
28 | steps:
29 | - name: Checkout
30 | uses: actions/checkout@v4
31 | - name: Setup Pages
32 | uses: actions/configure-pages@v5
33 | - name: Build with Jekyll
34 | uses: actions/jekyll-build-pages@v1
35 | with:
36 | source: ./
37 | destination: ./_site
38 | - name: Upload artifact
39 | uses: actions/upload-pages-artifact@v3
40 |
41 | # Deployment job
42 | deploy:
43 | environment:
44 | name: github-pages
45 | url: ${{ steps.deployment.outputs.page_url }}
46 | runs-on: ubuntu-latest
47 | needs: build
48 | steps:
49 | - name: Deploy to GitHub Pages
50 | id: deployment
51 | uses: actions/deploy-pages@v4
52 |
--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 | # Improvement Plan for Data Preprocessing Project
2 |
3 | ## Logical Steps:
4 |
5 | 1. [x] **Update requirements.txt**: Add new dependencies for advanced features (plotly for interactive charts, openpyxl for Excel support, scikit-learn for preprocessing like outlier removal and scaling).
6 |
7 | 2. [x] **Enhance app.py**:
8 | - [x] Add support for multiple file formats (CSV and Excel).
9 | - [x] Implement advanced preprocessing options: outlier removal (IQR), normalization (MinMaxScaler), categorical encoding (one-hot), duplicate removal.
10 | - [x] Generate summary statistics tables for original and cleaned data.
11 | - [x] Switch chart generation to Plotly for interactive visualizations (comparison subplots).
12 | - [x] Add file validation (size limit 10MB, check if CSV/Excel).
13 | - [x] Save both original and cleaned datasets for download.
14 | - [x] Improve error handling.
15 |
16 | 3. [x] **Update index.html**: Add form elements for advanced preprocessing options (checkboxes for outliers, normalize, encode, duplicates). Update file input to accept .csv and .xlsx.
17 |
18 | 4. [x] **Update result.html**: Add sections for summary statistics (HTML tables), download links for original and cleaned files, embed Plotly interactive charts (using plotly.js CDN).
19 |
20 | 5. [x] **Update styles.css**: Add styles for new UI elements (preprocessing options, summary tables, interactive chart containers).
21 |
22 | 6. [x] **Update README.md**: Expand with setup instructions, new features list, usage guide.
23 |
24 | 7. [ ] **Install dependencies**: Run pip install -r requirements.txt.
25 |
26 | 8. [ ] **Test the application**:
27 | - [ ] Run Flask app.
28 | - [ ] Upload sample CSV/Excel files.
29 | - [ ] Select preprocessing options and charts.
30 | - [ ] Verify outputs: cleaned data, summaries, interactive charts, downloads.
31 | - [ ] Use browser to interact and confirm.
32 |
33 | 9. [ ] **Handle any issues**: Debug errors from testing, update files as needed.
34 |
--------------------------------------------------------------------------------
/styles.css:
--------------------------------------------------------------------------------
1 | /* General Body Styling */
2 | body {
3 | background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); /* Fallback gradient since background.jpg is missing */
4 | color: #333;
5 | }
6 |
7 | /* Hero Section */
8 | .hero {
9 | background-color: rgba(44, 160, 243, 0.8); /* Use a semi-transparent background color */
10 | color: white;
11 | padding: 40px;
12 | text-align: center;
13 | border-radius: 4px;
14 | margin-bottom: 20px;
15 | }
16 |
17 | /* Upload Section */
18 | .upload-section {
19 | text-align: center;
20 | margin-top: 20px;
21 | background: rgba(255, 255, 255, 0.1);
22 | padding: 20px;
23 | border-radius: 10px;
24 | }
25 |
26 | /* Preprocessing Options */
27 | .preprocess-options {
28 | display: flex;
29 | flex-direction: column;
30 | gap: 10px;
31 | margin-bottom: 20px;
32 | }
33 |
34 | .preprocess-options .form-check {
35 | text-align: left;
36 | }
37 |
38 | /* Button Styling */
39 | .upload-button,
40 | .download-button {
41 | background-color: #4a90e2;
42 | border: none;
43 | color: white;
44 | padding: 12px 24px;
45 | font-size: 1.1rem;
46 | border-radius: 25px;
47 | transition: background-color 0.3s ease;
48 | text-decoration: none;
49 | display: inline-block;
50 | }
51 |
52 | .upload-button:hover,
53 | .download-button:hover {
54 | background-color: #357ab8;
55 | }
56 |
57 | /* Download Section */
58 | .download-section {
59 | text-align: center;
60 | margin-top: 20px;
61 | }
62 |
63 | /* Summary Section */
64 | .summary-section {
65 | background: rgba(255, 255, 255, 0.1);
66 | padding: 20px;
67 | border-radius: 10px;
68 | margin-top: 20px;
69 | }
70 |
71 | .summary-section table {
72 | background: white;
73 | color: #333;
74 | border-radius: 8px;
75 | overflow: hidden;
76 | box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
77 | }
78 |
79 | /* Visualization Section */
80 | .visualization-section {
81 | margin-top: 40px;
82 | }
83 |
84 | .chart-container {
85 | height: 400px;
86 | width: 100%;
87 | border-radius: 12px;
88 | box-shadow: 0 4px 10px rgba(0, 0, 0, 0.1);
89 | }
90 |
91 | .visualization {
92 | text-align: center;
93 | margin-bottom: 20px;
94 | }
95 |
96 | /* Footer Styling */
97 | footer {
98 | margin-top: 30px;
99 | padding: 15px 0;
100 | background-color: rgba(240, 244, 248, 0.8); /* Use a semi-transparent footer */
101 | text-align: center;
102 | }
103 |
--------------------------------------------------------------------------------
/result.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Preprocessing Results
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
Preprocessing Complete
19 |
Your data has been successfully processed.
20 |
21 |
22 |
23 |
28 |
29 |
30 |
31 |
Summary Statistics
32 |
33 |
34 |
Original Data
35 | {{ original_summary | safe }}
36 |
37 |
38 |
Cleaned Data
39 | {{ cleaned_summary | safe }}
40 |
41 |
42 |
43 |
44 |
45 |
46 | {% if charts_selected %}
47 |
Generated Charts (Before and After)
48 |
49 | {% for chart in charts %}
50 |
51 |
{{ chart.name }}
52 |
{{ chart.html | safe }}
53 |
54 | {% endfor %}
55 |
56 | {% else %}
57 |
No charts selected for visualization.
58 | {% endif %}
59 |
60 |
61 |
62 |
63 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Preprocessing Application
2 |
3 | ## Overview
4 | This is an advanced web application for data preprocessing and visualization built with Flask. It allows users to upload CSV or Excel datasets, apply various preprocessing techniques, generate summary statistics, and create interactive charts for comparison between original and cleaned data.
5 |
6 | ## Features
7 | - **File Upload Support**: Upload CSV and Excel (.xlsx) files (max 10MB).
8 | - **Preprocessing Options**:
9 | - Remove missing values (default).
10 | - Remove duplicates.
11 | - Remove outliers using IQR method for numerical columns.
12 | - Normalize numerical columns using MinMaxScaler.
13 | - One-hot encode categorical columns.
14 | - **Summary Statistics**: Generate and display descriptive statistics for original and cleaned datasets.
15 | - **Interactive Visualizations**: Select from Histogram, Box Plot, Scatter Plot, Line Chart, Bar Chart, Pie Chart. Uses Plotly for interactive charts comparing original vs. cleaned data.
16 | - **Downloads**: Download original and cleaned datasets as CSV.
17 | - **User-Friendly UI**: Bootstrap-based interface with custom styling.
18 |
19 | ## Setup Instructions
20 | 1. **Clone or Download** the project.
21 | 2. **Install Dependencies**:
22 | ```
23 | pip install -r requirements.txt
24 | ```
25 | Required libraries: Flask, pandas, matplotlib, seaborn, plotly, openpyxl, scikit-learn.
26 | 3. **Run the Application**:
27 | ```
28 | python app.py
29 | ```
30 | The app will start on http://127.0.0.1:5000.
31 | 4. **Access the App**: Open your browser and go to http://127.0.0.1:5000.
32 |
33 | ## Usage
34 | 1. **Upload Dataset**: Select a CSV or Excel file.
35 | 2. **Select Preprocessing Options**: Choose from the checkboxes (Remove Missing Values is enabled by default).
36 | 3. **Select Charts**: Check the visualizations you want to generate.
37 | 4. **Submit**: Click "Upload and Generate Charts".
38 | 5. **View Results**:
39 | - Download original or cleaned datasets.
40 | - Review summary statistics tables.
41 | - Interact with generated charts (before and after preprocessing).
42 |
43 | ## Project Structure
44 | - `app.py`: Main Flask application with routes, preprocessing logic, and chart generation.
45 | - `templates/index.html`: Upload form with preprocessing and chart options.
46 | - `templates/result.html`: Results page with downloads, summaries, and charts.
47 | - `static/css/styles.css`: Custom styles for the UI.
48 | - `static/charts/`: Directory for chart files (auto-created).
49 | - `requirements.txt`: Python dependencies.
50 | - `logo.jpg`: App logo (optional).
51 |
52 | ## Notes
53 | - Charts are generated based on the first suitable columns (numerical for most, categorical for pie).
54 | - Scatter plots require at least two numerical columns.
55 | - The app reports errors for invalid files, empty datasets, and processing failures.
56 | - Interactive charts require an internet connection to load Plotly from its CDN.
57 |
58 | ## Deployment Instructions
59 | To deploy the application to a cloud platform, follow these steps for Heroku (note: Heroku discontinued its free tier in November 2022, so a paid plan is required):
60 |
61 | 1. **Install Heroku CLI**: Download and install from https://devcenter.heroku.com/articles/heroku-cli.
62 |
63 | 2. **Prepare the App for Production**:
64 | - Add `gunicorn` to `requirements.txt` (already included).
65 | - Create a `Procfile` in the root directory with the content:
66 | ```
67 | web: gunicorn app:app
68 | ```
69 | - Ensure `app.py` has the following at the end:
70 | ```python
71 | if __name__ == '__main__':
72 | app.run()
73 | ```
74 | (Already present.)
75 |
76 | 3. **Deploy to Heroku**:
77 | - Login to Heroku: `heroku login`
78 | - Create a new app: `heroku create your-app-name`
79 | - Push the code: `git add . && git commit -m "Initial commit" && git push heroku main`
80 | - Open the app: `heroku open`
81 |
82 | For other platforms like Render or Railway, create an account, connect your GitHub repo (push the code to GitHub first), and deploy as a web service.
83 |
84 | ## Future Improvements
85 | - Support for more file formats (JSON, etc.).
86 | - Advanced preprocessing (feature selection, imputation methods).
87 | - Export charts as images/PDF.
88 | - User authentication and session management.
89 |
90 | © 2024 Data Cleaner. All rights reserved.
91 |
--------------------------------------------------------------------------------
/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Data Preprocessing Application
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
 }})
20 |
Data Preprocessing Application
21 |
Upload your dataset and select charts for visualization.
22 |
23 |
24 |
25 |
26 |
Choose your Dataset (CSV or Excel)
27 |
86 |
87 |
88 |
89 |
90 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, request, render_template, url_for
2 | import pandas as pd
3 | import os
4 | import io
5 | from sklearn.preprocessing import MinMaxScaler
6 | import plotly.express as px
7 | import plotly.graph_objects as go
8 | from plotly.subplots import make_subplots
9 |
app = Flask(__name__)

# Create the chart output directory (and its parent static/ folder) at import
# time so that later writes under static/ do not fail on a missing directory.
os.makedirs(os.path.join('static', 'charts'), exist_ok=True)
14 |
@app.route('/')
def index():
    """Serve the landing page that hosts the dataset upload form."""
    landing_page = render_template('index.html')
    return landing_page
18 |
@app.route('/upload', methods=['POST'])
def upload_file():
    """Validate an uploaded dataset, preprocess it, and render the results page.

    Reads the upload from the ``file`` form field, the selected preprocessing
    steps from ``preprocess`` and the selected chart types from ``charts``.
    The original and cleaned datasets are written to
    ``static/original_dataset.csv`` and ``static/cleaned_dataset.csv``.

    Returns:
        The rendered ``result.html`` page on success, otherwise a
        ``(message, status)`` tuple: 400 for a missing, oversized,
        unsupported, unreadable or empty upload, and 500 for any other
        processing failure.
    """
    # Local import keeps this block self-contained; used to escape exception
    # text, which can contain content taken from the uploaded file.
    from html import escape

    if 'file' not in request.files:
        return "No file uploaded", 400
    file = request.files['file']
    # The filename is empty (or None) when the form is submitted without a file.
    if not file.filename:
        return "No file selected", 400

    # File size limit: 10MB. Seek to the end to measure, then rewind so the
    # parser reads from the start.
    file.seek(0, os.SEEK_END)
    file_size = file.tell()
    file.seek(0)
    if file_size > 10 * 1024 * 1024:
        return "File too large. Maximum size is 10MB.", 400

    # Check file extension
    filename = file.filename.lower()
    if not filename.endswith(('.csv', '.xlsx')):
        return "Unsupported file format. Please upload CSV or Excel files.", 400

    try:
        try:
            df = _load_dataframe(file, filename)
        except ValueError as exc:
            # pandas' EmptyDataError and ParserError, as well as
            # UnicodeDecodeError, are all ValueError subclasses: the upload
            # itself is malformed, which is a client error rather than a
            # server failure.
            return f"Could not read file: {escape(str(exc))}", 400

        if df.empty:
            return "Uploaded file is empty.", 400

        # Get preprocessing and chart options from the form
        preprocess_options = request.form.getlist('preprocess')
        selected_charts = request.form.getlist('charts')

        # NOTE(review): both output paths are fixed, so concurrent uploads
        # overwrite each other's download files.
        original_file_path = 'static/original_dataset.csv'
        df.to_csv(original_file_path, index=False)

        df_cleaned = _preprocess_dataframe(df, preprocess_options)

        cleaned_file_path = 'static/cleaned_dataset.csv'
        df_cleaned.to_csv(cleaned_file_path, index=False)

        # Summary tables are inserted into the page as raw HTML; ``to_html``
        # escapes cell contents by default.
        original_summary = df.describe(include='all').to_html(classes='table table-striped')
        cleaned_summary = df_cleaned.describe(include='all').to_html(classes='table table-striped')

        return render_template(
            'result.html',
            original_file=original_file_path,
            cleaned_file=cleaned_file_path,
            original_summary=original_summary,
            cleaned_summary=cleaned_summary,
            charts_selected=bool(selected_charts),
            charts=generate_comparison_charts(df, df_cleaned, selected_charts),
        )
    except Exception as exc:
        # Top-level request boundary: record the traceback, then report.
        app.logger.exception("Error processing uploaded file")
        return f"Error processing file: {escape(str(exc))}", 500


def _load_dataframe(file, filename):
    """Parse the uploaded file-like object as CSV or Excel based on *filename*."""
    if filename.endswith('.csv'):
        return pd.read_csv(file)
    return pd.read_excel(file)


def _preprocess_dataframe(df, options):
    """Return a cleaned copy of *df* with the preprocessing steps in *options* applied.

    Recognised options are ``remove_na``, ``remove_duplicates``,
    ``remove_outliers``, ``normalize`` and ``encode_categoricals``. An empty
    selection falls back to removing rows with missing values.
    """
    cleaned = df.copy()

    # Default: remove missing values
    if 'remove_na' in options or not options:
        cleaned = cleaned.dropna()

    if 'remove_duplicates' in options:
        cleaned = cleaned.drop_duplicates()

    # Remove outliers (IQR method), one numerical column at a time; quartiles
    # are recomputed on the rows that survived the previous columns.
    if 'remove_outliers' in options:
        for col in cleaned.select_dtypes(include=['number']).columns:
            q1 = cleaned[col].quantile(0.25)
            q3 = cleaned[col].quantile(0.75)
            iqr = q3 - q1
            in_range = cleaned[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)
            # Comparisons with NaN are False, so without this mask the outlier
            # step would also drop rows with missing values even when the
            # user did not ask for that.
            cleaned = cleaned[in_range | cleaned[col].isna()]

    # Normalize numerical columns. The scaler raises on zero rows, which can
    # happen after the filtering steps above, so skip an empty frame.
    if 'normalize' in options and not cleaned.empty:
        numerical_cols = cleaned.select_dtypes(include=['number']).columns
        if len(numerical_cols) > 0:
            scaler = MinMaxScaler()
            cleaned[numerical_cols] = scaler.fit_transform(cleaned[numerical_cols])

    # One-hot encode categorical columns; each column name is used as the
    # prefix of its dummy columns.
    if 'encode_categoricals' in options:
        categorical_cols = cleaned.select_dtypes(include=['object']).columns.tolist()
        if categorical_cols:
            cleaned = pd.get_dummies(cleaned, columns=categorical_cols, drop_first=True)

    return cleaned
112 |
def generate_comparison_charts(df_original, df_cleaned, selected_charts):
    """Build before/after Plotly charts for the selected chart types.

    Columns are chosen from ``df_original``: the first numerical column for
    the histogram, box plot, line chart and bar chart, the first two
    numerical columns for the scatter plot, and the first object-typed column
    for the pie chart. A chart type is skipped when the original data has no
    suitable column.

    Args:
        df_original: Dataset as uploaded.
        df_cleaned: Dataset after preprocessing.
        selected_charts: Chart keys chosen by the user; recognised keys are
            ``histogram``, ``boxplot``, ``scatter``, ``line``, ``bar`` and
            ``pie``.

    Returns:
        A list of ``{'name': ..., 'html': ...}`` dicts ordered by chart type,
        with the original chart before the cleaned one. ``html`` is an
        embeddable fragment that loads plotly.js from the CDN.
    """
    numerical_cols = df_original.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = df_original.select_dtypes(include=['object']).columns.tolist()

    charts = []

    def add_pair(columns, name, build, title=None):
        """Append the original and cleaned variants of one chart type."""
        title = name if title is None else title
        for prefix, frame in (('Original', df_original), ('Cleaned', df_cleaned)):
            # Preprocessing can remove a column (one-hot encoding replaces the
            # categorical column used by the pie chart). Skip that variant
            # instead of letting Plotly raise and fail the whole request.
            if any(column not in frame.columns for column in columns):
                continue
            fig = build(frame, f'{prefix} Data - {title}')
            charts.append({
                'name': f'{prefix} {name}',
                'html': fig.to_html(full_html=False, include_plotlyjs='cdn'),
            })

    # Generate Histogram
    if 'histogram' in selected_charts and numerical_cols:
        col = numerical_cols[0]
        add_pair(
            [col],
            f'Histogram ({col})',
            lambda frame, title: px.histogram(frame, x=col, title=title, marginal='rug'),
        )

    # Generate Box Plot
    if 'boxplot' in selected_charts and numerical_cols:
        col = numerical_cols[0]
        add_pair(
            [col],
            f'Box Plot ({col})',
            lambda frame, title: px.box(frame, y=col, title=title),
        )

    # Generate Scatter Plot (needs two numerical columns)
    if 'scatter' in selected_charts and len(numerical_cols) >= 2:
        x_col, y_col = numerical_cols[:2]
        add_pair(
            [x_col, y_col],
            f'Scatter Plot ({x_col} vs {y_col})',
            lambda frame, title: px.scatter(frame, x=x_col, y=y_col, title=title),
        )

    # Generate Line Chart (values plotted against the row index)
    if 'line' in selected_charts and numerical_cols:
        col = numerical_cols[0]
        add_pair(
            [col],
            f'Line Chart ({col})',
            lambda frame, title: px.line(frame, x=frame.index, y=col, title=title),
        )

    # Generate Bar Chart (first 10 rows only)
    if 'bar' in selected_charts and numerical_cols:
        col = numerical_cols[0]
        add_pair(
            [col],
            'Bar Chart',
            lambda frame, title: px.bar(frame.head(10), x=frame.index[:10], y=col, title=title),
            title='Bar Chart (First 10 rows)',
        )

    # Generate Pie Chart
    if 'pie' in selected_charts and categorical_cols:
        col = categorical_cols[0]
        add_pair(
            [col],
            f'Pie Chart ({col})',
            lambda frame, title: px.pie(frame, names=col, title=title),
        )

    return charts
196 |
if __name__ == '__main__':
    # Local development entry point; the Procfile starts the app through
    # gunicorn instead. Debug mode enables the Werkzeug interactive debugger,
    # which allows arbitrary code execution, so it is opt-in via FLASK_DEBUG
    # rather than always on.
    app.run(debug=os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true'))
199 |
--------------------------------------------------------------------------------