├── Aurora.py
├── License.txt
├── README.md
├── __pycache__
    ├── Aurora.cpython-38.pyc
    └── Aurora.cpython-39.pyc
├── icon.png
├── images
    ├── logo.png
    ├── ss1.png
    └── ss2.png
├── plugins
    ├── IsolationForest.py
    ├── Mann_Whit.py
    ├── PCA.py
    ├── __pycache__
    │   ├── IsolationForest.cpython-38.pyc
    │   ├── Mann_Whit.cpython-38.pyc
    │   ├── PCA.cpython-38.pyc
    │   ├── anova.cpython-38.pyc
    │   ├── cca.cpython-38.pyc
    │   ├── esm.cpython-38.pyc
    │   ├── example_plugin_a.cpython-38.pyc
    │   ├── plugin_a.cpython-38.pyc
    │   ├── plugin_b.cpython-38.pyc
    │   ├── poisson_probabilities.cpython-38.pyc
    │   └── vine_copula.cpython-38.pyc
    ├── anova.py
    ├── autoencoder.py
    ├── cca.py
    ├── esm.py
    ├── example_plugin_a.py
    ├── histogram.py
    ├── kmeans.py
    ├── knn.py
    ├── pearson.py
    ├── poisson_probabilities.py
    ├── svm.py
    ├── text_classifier.py
    └── xgboost.py
└── requirements.txt


/Aurora.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding: utf-8
  3 | 
  4 | #####################################################
  5 | #### Package: Aurora
  6 | #### Version: 0.2
  7 | #### Author: Marius Neagoe
  8 | #### Copyright: © 2024 Marius Neagoe
  9 | #### Website: https://mariusneagoe.com
 10 | #### Github: https://github.com/MariusNea/Aurora
 11 | #####################################################
 12 | 
 13 | import tkinter as tk
 14 | from tkinter import ttk, StringVar
 15 | from tkinter import messagebox
 16 | from tkinter import filedialog
 17 | import pandas as pd
 18 | from tkinter import simpledialog
 19 | import os
 20 | import importlib.util
 21 | from sklearn.model_selection import train_test_split
 22 | from sklearn.linear_model import LinearRegression, LogisticRegression
 23 | from sklearn.tree import DecisionTreeClassifier
 24 | from statsmodels.tsa.seasonal import STL
 25 | import matplotlib.pyplot as plt
 26 | from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
 27 | from tkinter import Label, Entry, Button
 28 | from PIL import Image, ImageTk
 29 | from io import BytesIO
 30 | from matplotlib.widgets import RectangleSelector
 31 | from sklearn.preprocessing import LabelEncoder
 32 | 
 33 | 
 34 | class DataFrameEditor:
 35 |     def __init__(self, root, dataframe):
 36 |         self.root = root
 37 |         self.root.title("Aurora")
 38 |         self.dataframe = dataframe
 39 |         self.plugins = {}
 40 |         self.selected_columns = []
 41 |         self.menu_bar = tk.Menu(self.root)
 42 |         self.root.config(menu=self.menu_bar)
 43 |         self.create_menu()
 44 |         self.current_page = 0
 45 |         self.rows_per_page = 100  # Set the number of rows per page
 46 |         self.total_pages = (len(self.dataframe) - 1) // self.rows_per_page + 1
 47 |         self.tree = ttk.Treeview(root)
 48 |         self.tree.pack(expand=True, fill='both')
 49 | 
 50 |         self.setup_tree_view()
 51 |         self.add_controls()
 52 |         self.add_pagination_controls()
 53 |         self.target_col = None
 54 |         self.model = None
 55 |         self.input_data = None
 56 |         highlighted1 = []
 57 |         highlighted2 = []
 58 |         self.sel_list = []
 59 |         
 60 |     def register_plugin(self, category, name, menu_text):
 61 |         def decorator(func):
 62 |             if name not in self.plugins:
 63 |                 self.plugins[name] = func
 64 |                 if category == 'statistics':
 65 |                     self.add_plugin_menu_item(self.stats_menu, menu_text, func)
 66 |                 elif category == 'machine_learning':
 67 |                     self.add_plugin_menu_item(self.ml_menu, menu_text, func)
 68 |             else:
 69 |                 print(f"Plugin '{name}' is already registered.")
 70 |             return func
 71 |         return decorator
 72 |     
 73 |     def create_menu(self):
 74 |         # Create a menu bar
 75 |         
 76 | 
 77 |         # File menu
 78 |         file_menu = tk.Menu(self.menu_bar, tearoff=0)
 79 |         self.menu_bar.add_cascade(label="File", menu=file_menu)
 80 |         # Add menu items to the File menu
 81 |         file_menu.add_separator()
 82 |         file_menu.add_command(label="Exit", command=self.root.destroy)
 83 | 
 84 |         # Edit menu
 85 |         #edit_menu = tk.Menu(menu_bar, tearoff=0)
 86 |         #menu_bar.add_cascade(label="Edit", menu=edit_menu)
 87 |         #edit_menu.add_command(label="Cut", command=self.dummy_function)
 88 |         #edit_menu.add_command(label="Copy", command=self.dummy_function)
 89 |         #edit_menu.add_command(label="Paste", command=self.dummy_function)
 90 | 
 91 |         # Statistics menu
 92 |         self.stats_menu = tk.Menu(self.menu_bar, tearoff=0)
 93 |         self.menu_bar.add_cascade(label="Statistics", menu=self.stats_menu)
 94 |         self.stats_menu.add_command(label="Generate Statistics", command=self.dummy_function)
 95 |         self.stats_menu.add_command(label="Statistical Models", command=self.regressions)
 96 |         self.stats_menu.add_command(label="Time Series Decomposition", command=self.decompose_and_plot)
 97 |         #Machine Learning menu
 98 |         self.ml_menu = tk.Menu(self.menu_bar, tearoff=0)
 99 |         self.menu_bar.add_cascade(label="Machine Learning", menu=self.ml_menu)
100 |         
101 |         
102 |         # Help menu
103 |         help_menu = tk.Menu(self.menu_bar, tearoff=0)
104 |         self.menu_bar.add_cascade(label="Help", menu=help_menu)
105 |         help_menu.add_command(label="About", command=self.show_about)
106 |         help_menu.add_command(label="License", command=self.show_license)
107 |         
108 |     def add_plugin_menu_item(self, menu, text, command):
109 |         menu.add_command(label=text, command=command)
110 |         
111 |     def setup_tree_view(self):
112 |         # Clear existing columns and rows in the Treeview
113 |         for i in self.tree.get_children():
114 |             self.tree.delete(i)
115 |         self.tree["columns"] = list(self.dataframe.columns)
116 |         self.tree["show"] = "headings"
117 |         for column in self.dataframe.columns:
118 |             self.tree.heading(column, text=column)
119 |             self.tree.column(column, anchor='center')
120 | 
121 |         # Inserting rows from the current page
122 |         start = self.current_page * self.rows_per_page
123 |         end = start + self.rows_per_page
124 |         display_df = self.dataframe.iloc[start:end]
125 |         for _, row in display_df.iterrows():
126 |             self.tree.insert('', 'end', values=list(row))
127 | 
128 |     def add_pagination_controls(self):
129 |         button_style = {"background": "#4CAF50",  # Green background color
130 |                 "foreground": "white",    # White text color
131 |                 "font": ("Arial", 12),    # Font and size
132 |                 "borderwidth": 2,         # Border width
133 |                 "relief": "groove"}
134 |         pagination_frame = tk.Frame(self.root)
135 |         pagination_frame.pack(fill='x', padx=5, pady=5)
136 |         prev_button = tk.Button(pagination_frame, text="Previous", command=self.prev_page, **button_style)
137 |         prev_button.pack(side='left')
138 |         next_button = tk.Button(pagination_frame, text="Next", command=self.next_page, **button_style)
139 |         next_button.pack(side='left')
140 | 
141 |     def prev_page(self):
142 |         if self.current_page > 0:
143 |             self.current_page -= 1
144 |             self.setup_tree_view()
145 | 
146 |     def next_page(self):
147 |         if self.current_page < self.total_pages - 1:
148 |             self.current_page += 1
149 |             self.setup_tree_view()
150 |             
151 |             
152 |     def add_controls(self):
153 |         add_row_button = tk.Button(self.root, text="Add Row", command=self.add_row)
154 |         add_row_button.pack(side='left')
155 | 
156 |         delete_row_button = tk.Button(self.root, text="Delete Row", command=self.delete_row)
157 |         delete_row_button.pack(side='left')
158 | 
159 |         add_column_button = tk.Button(self.root, text="Add Column", command=self.add_column)
160 |         add_column_button.pack(side='left')
161 | 
162 |         delete_column_button = tk.Button(self.root, text="Delete Column", command=self.delete_column)
163 |         delete_column_button.pack(side='left')
164 |         
165 |         clear_button = tk.Button(self.root, text="Clear Selection", command=self.clear_list)
166 |         clear_button.pack(side='right')
167 |         
168 |         # Button to plot selected columns
169 |         plot_button = tk.Button(self.root, text="Plot or Brush", command=self.int_hig_wrap)
170 |         plot_button.pack(side='right')
171 |         
172 |         # Button to select columns for plotting
173 |         select_button = tk.Button(self.root, text="Select Columns to Plot or Brush", command=self.select_columns)
174 |         select_button.pack(side='right')    
175 |         
176 |         refresh_button = tk.Button(self.root, text="Refresh Dataframe", command=self.update_frame)
177 |         refresh_button.pack(side='right')
178 |         
179 |         save_button = tk.Button(self.root, text="Save Dataframe", command=self.save_df)
180 |         save_button.pack(side='right')
181 |         
182 |         encode_button = tk.Button(self.root, text="Label Encode", command=self.encode_labels)
183 |         encode_button.pack(side='right')
184 |         
185 |         
186 |         self.tree.bind('<Double-1>', self.on_item_double_click)
187 |         
188 |     def clear_list(self):
189 |         self.sel_list.clear()
190 |         
191 |     def dummy_function(self):
192 |         summary = self.dataframe.describe()
193 |         result = "Summary Statistics"
194 | 
195 |         # Create a new window to display the result
196 |         result_window = tk.Toplevel(self.root)
197 |         result_window.title("summary Statistics")
198 | 
199 |         # Create a label to display the result
200 |         result_label = tk.Label(result_window, text=summary, padx=10, pady=10)
201 |         result_label.pack()
202 |         
203 |         
204 |     def decompose_and_plot(self):
205 |     
206 |     # Function to handle plotting with the entered period
207 |         def plot_with_period():
208 |             try:
209 |                 period = int(period_entry.get())
210 |             except ValueError:
211 |                 tk.messagebox.showerror("Error", "Please enter a valid integer for the period. First column hast to be Date and second Series")
212 |                 return
213 | 
214 |         # Assuming the first column is datetime and the second column is values
215 |             time_series = self.dataframe.iloc[:, 1]
216 | 
217 |         # Perform STL decomposition
218 |             decomposition = STL(time_series, period=period).fit()
219 | 
220 |         # Extract components
221 |             original = time_series
222 |             trend = decomposition.trend
223 |             seasonal = decomposition.seasonal
224 |             residual = decomposition.resid
225 | 
226 |         # Plot the components
227 |             root = tk.Toplevel(self.root)
228 |             
229 |             root.title("Time Series Decomposition")
230 | 
231 |             fig, axs = plt.subplots(4, 1, figsize=(8, 10), sharex=True)
232 | 
233 |             axs[0].plot(original, label='Original')
234 |             axs[0].set_ylabel('Original')
235 | 
236 |             axs[1].plot(trend, label='Trend', color='orange')
237 |             axs[1].set_ylabel('Trend')
238 | 
239 |             axs[2].plot(seasonal, label='Seasonal', color='green')
240 |             axs[2].set_ylabel('Seasonal')
241 | 
242 |             axs[3].plot(residual, label='Residual', color='red')
243 |             axs[3].set_ylabel('Residual')
244 | 
245 |             for ax in axs:
246 |                 ax.legend()
247 | 
248 |         # Embed the matplotlib plot into the Tkinter window
249 |             canvas = FigureCanvasTkAgg(fig, master=root)
250 |             canvas_widget = canvas.get_tk_widget()
251 |             canvas_widget.pack(side=tk.TOP, fill=tk.BOTH, expand=1)
252 | 
253 |             tk.mainloop()
254 | 
255 |     # Create a new window for period input
256 |         period_window = tk.Toplevel(self.root)
257 |         period_window.title("Enter Seasonality Period")
258 | 
259 |     # Label and Entry for period input
260 |         label = Label(period_window, text="Enter Seasonality Period:")
261 |         label.pack(pady=10)
262 |         period_entry = Entry(period_window)
263 |         period_entry.pack(pady=10)
264 | 
265 |     # Button to trigger the plot with the entered period
266 |         plot_button = Button(period_window, text="Plot", command=plot_with_period)
267 |         plot_button.pack(pady=10)
268 |         
269 |     def train_linear_regression(self, target_col):
270 |         if len(self.dataframe) == 1:
271 |             return None  # Return None if there's only one sample
272 |         X_train, X_test, y_train, y_test = train_test_split(self.dataframe.drop(columns=[target_col]), self.dataframe[target_col], test_size=0.2, random_state=42)
273 |         model = LinearRegression()
274 |         model.fit(X_train, y_train)
275 |         return model
276 | 
277 |     def train_logistic_regression(self, target_col):
278 |         if len(self.dataframe) == 1:
279 |             return None  # Return None if there's only one sample
280 |         X_train, X_test, y_train, y_test = train_test_split(self.dataframe.drop(columns=[target_col]), self.dataframe[target_col], test_size=0.2, random_state=42)
281 |         model = LogisticRegression()
282 |         model.fit(X_train, y_train)
283 |         return model
284 | 
285 |     def train_decision_tree(self, target_col):
286 |         if len(self.dataframe) == 1:
287 |             return None  # Return None if there's only one sample
288 |         X_train, X_test, y_train, y_test = train_test_split(self.dataframe.drop(columns=[target_col]), self.dataframe[target_col], test_size=0.2, random_state=42)
289 |         model = DecisionTreeClassifier()
290 |         model.fit(X_train, y_train)
291 |         return model
292 | 
293 |     def make_predictions(self, model, input_data):
294 |         if model is None:
295 |             return None  # Return None if the model is not trained
296 |         predictions = model.predict(input_data)
297 |         return predictions
298 |     
299 |     def on_predict_button_click(self, selected_model, entry_features, label_predictions):
300 |     # Get values from entry widgets
301 |         feature_values = [float(entry.get()) for entry in entry_features]
302 | 
303 |     # Create a DataFrame for prediction
304 |         new_data = pd.DataFrame([feature_values], columns=self.dataframe.columns[:-1])
305 | 
306 |     # Train the selected model
307 |         if selected_model == "linear":
308 |             model = self.train_linear_regression(target_col='target')
309 |         elif selected_model == "logistic":
310 |             model = self.train_logistic_regression(target_col='target')
311 |         elif selected_model == "tree":
312 |             model = self.train_decision_tree(target_col='target')
313 |         else:
314 |             model = None
315 | 
316 |     # Make predictions
317 |         if model is not None:
318 |             predictions = self.make_predictions(model, new_data)
319 |         # Display predictions in labels or handle as needed
320 |             label_predictions.config(text=f"Prediction: {predictions}")
321 |         else:
322 |             label_predictions.config(text="Please select a valid model before predicting.")
323 |     
324 |     def regressions(self):
325 |     # Create a Tkinter window
326 |         window = tk.Toplevel(self.root)
327 |         window.title("Machine Learning Predictions")
328 | 
329 |     # Ask the user for the number of features
330 |         num_features = simpledialog.askinteger("Number of Features", "Enter the number of features(number of columns from 1 to n-1). Last column is the predicted column:")
331 | 
332 |     # Create entry widgets for user input features
333 |         entry_features = []
334 |         for i in range(num_features):
335 |             entry = tk.Entry(window, width=10)
336 |             entry.grid(row=i, column=1, padx=10, pady=10)
337 |             entry_features.append(entry)
338 |             label = tk.Label(window, text=f"Feature {i + 1}:")
339 |             label.grid(row=i, column=0, padx=10, pady=10, sticky=tk.E)
340 | 
341 |     # Create radio buttons for selecting the model
342 |         # Create radio buttons for selecting the model
343 |         selected_model = tk.StringVar()
344 |         linear_radio = tk.Radiobutton(window, text="Linear Regression", variable=selected_model, value="linear")
345 |         linear_radio.grid(row=num_features, column=0, columnspan=2, pady=10)
346 |         logistic_radio = tk.Radiobutton(window, text="Logistic Regression", variable=selected_model, value="logistic")
347 |         logistic_radio.grid(row=num_features + 1, column=0, columnspan=2, pady=10)
348 |         decision_tree_radio = tk.Radiobutton(window, text="Decision Tree", variable=selected_model, value="tree")
349 |         decision_tree_radio.grid(row=num_features + 2, column=0, columnspan=2, pady=10)
350 | 
351 |     # Create labels for displaying predictions
352 |         label_predictions = tk.Label(window, text="Predictions:")
353 |         label_predictions.grid(row=num_features + 4, column=0, columnspan=2)
354 | 
355 |     # Create a button to trigger predictions
356 |         predict_button = tk.Button(window, text="Predict", command=lambda: self.on_predict_button_click(selected_model.get(), entry_features, label_predictions))
357 |         predict_button.grid(row=num_features + 3, column=0, columnspan=2, pady=10)
358 | 
359 | 
360 |     # Placeholder DataFrame with an unknown number of columns
361 |         data = {'target': [0]}  
362 |         for i in range(num_features):
363 |             data[f'feature{i + 1}'] = [0.0]  # Initialize with placeholder values
364 | 
365 |     # Start the Tkinter event loop
366 |         window.mainloop()
367 |         
368 |     
369 |     def show_about(self):
370 |         messagebox.showinfo("About", "Aurora \nVersion 0.1\n\nCreated by Marius Neagoe\n\n www.mariusneagoe.com")
371 |     
372 |     def show_license(self):
373 |         license_window = tk.Toplevel()
374 |         license_window.title("License")
375 |         license_window.geometry("500x300")  # You can adjust the size as needed
376 | 
377 |     # Create a Text widget for displaying the license
378 |         license_text_widget = tk.Text(license_window, wrap="word")
379 |         license_text_widget.pack(expand=True, fill="both", padx=10, pady=10)
380 | 
381 |     # License text
382 |         license_text = """ AURORA - Accessible User-friendly Resources for Optimized Research Analytics
383 | Copyright (C) 2024 Marius Neagoe (www.mariusneagoe.com)
384 | 
385 | This program is free software; you can redistribute it and/or
386 | modify it under the terms of the GNU General Public License
387 | as published by the Free Software Foundation; either version 2
388 | of the License, or (at your option) any later version.
389 | 
390 | This program is distributed in the hope that it will be useful,
391 | but WITHOUT ANY WARRANTY; without even the implied warranty of
392 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
393 | GNU General Public License for more details.
394 | 
395 | You should have received a copy of the GNU General Public License
396 | along with this program; if not, write to the Free Software
397 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA. 
398 | 
399 | """
400 |         
401 |     # Insert the license text into the Text widget and disable editing
402 |         license_text_widget.insert(tk.END, license_text)
403 |         license_text_widget.config(state="disabled")
404 |     
405 |     
406 |     def select_columns(self):
407 |         # Use simpledialog to prompt the user for column selection
408 |         selected_columns = simpledialog.askstring("Select Columns", "Enter two column names separated by a comma (e.g., col1, col2):")
409 |         if selected_columns:
410 |             columns = [col.strip() for col in selected_columns.split(',')]
411 |             if len(columns) == 2:
412 |                 self.selected_columns = columns
413 |                 self.sel_list.append(selected_columns)
414 |                 
415 |             else:
416 |                 messagebox.showerror("Error", "Please enter exactly two column names.")
417 |                 self.select_columns()
418 |                 
419 |     def int_hig_wrap(self):
420 |         date1, high = self.sel_list[0].split(', ')
421 |         
422 |         if len(self.sel_list) == 1:
423 |             #messagebox.showinfo("Info", "Press OK to plot. You have to select 2 pairs of columns in order to Brush.")
424 |             plt.figure(figsize=(10, 6))
425 |             plt.scatter(self.dataframe[date1], self.dataframe[high])
426 |             plt.xlabel(date1)
427 |             plt.ylabel(high)
428 |             plt.show()
429 |         else:
430 |             date2, target = self.sel_list[1].split(', ')
431 |             col1 = date1
432 |             col2 = high
433 |             col3 = date2
434 |             col4 = target
435 |             self.interactive_highlight(col1, col2, col3, col4)
436 |             
437 |     def interactive_highlight(self, col1, col2, col3, col4):
438 |     
439 |     # Proceed with the interactive highlight functionality for non-empty col3 and col4
440 |     # Check for identical columns among col1, col2, col3, col4
441 |         cols = [col1, col2, col3, col4]
442 |         distinct_values = []
443 |         seen_values = set()
444 | 
445 |         for value in cols:
446 |             if value not in seen_values:
447 |                 seen_values.add(value)
448 |                 distinct_values.append(value)
449 |                 
450 |         x_col, y1_col, y2_col = distinct_values
451 |     # Plotting both graphs
452 |         fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
453 |         line1, = ax1.plot(self.dataframe[x_col], self.dataframe[y1_col], 'ro', picker=5)
454 |         line2, = ax2.plot(self.dataframe[x_col], self.dataframe[y2_col], 'bo')
455 | 
456 |         highlighted1 = []
457 |         highlighted2 = []
458 | 
459 |         def clear_previous_highlights():
460 |             for hl in highlighted1:
461 |                 hl.remove()
462 |             highlighted1.clear()
463 |             for hl in highlighted2:
464 |                 hl.remove()
465 |             highlighted2.clear()
466 | 
467 |         def onselect(eclick, erelease):
468 |             clear_previous_highlights()
469 |             x1, y1 = eclick.xdata, eclick.ydata
470 |             x2, y2 = erelease.xdata, erelease.ydata
471 |             mask = (self.dataframe[x_col] >= min(x1, x2)) & (self.dataframe[x_col] <= max(x1, x2)) & \
472 |                    (self.dataframe[y1_col] >= min(y1, y2)) & (self.dataframe[y1_col] <= max(y1, y2))
473 |             selected = self.dataframe[mask]
474 |             hl1 = ax1.plot(selected[x_col], selected[y1_col], 'yo', linestyle='None', zorder=5)
475 |             highlighted1.extend(hl1)
476 |             hl2 = ax2.plot(selected[x_col], selected[y2_col], 'yo', linestyle='None', zorder=5)
477 |             highlighted2.extend(hl2)
478 |             fig.canvas.draw_idle()
479 |             
480 |         toggle_selector = RectangleSelector(ax1, onselect, useblit=True,
481 |                                             button=[1],
482 |                                             minspanx=5, minspany=5,
483 |                                             spancoords='pixels',
484 |                                             interactive=True)
485 |         ax1.set_xlabel(x_col)
486 |         ax1.set_ylabel(y1_col)
487 |         ax2.set_xlabel(x_col)
488 |         ax2.set_ylabel(y2_col)
489 |         
490 |         plt.show()       
491 |     
492 | 
493 |     def get_dataframe(self):
494 |         return self.dataframe
495 |     
496 | 
497 |     def on_item_double_click(self, event):
498 |         item = self.tree.selection()[0]  # This gets the ID of the selected item in the Treeview
499 |         column = self.tree.identify_column(event.x)  # Identifies the clicked column
500 |         col_index = int(column.replace('#', '')) - 1  # Convert column ID to index
501 |     
502 |         new_value = simpledialog.askstring("Input", f"Enter new value:", parent=self.root)
503 |         if new_value is not None:
504 |             try:
505 |                 new_value = float(new_value)
506 |                 df_index = self.tree.index(item)  # Assuming direct correspondence between Treeview and DataFrame indices
507 |                 if df_index < len(self.dataframe):
508 |                     self.dataframe.iat[df_index, col_index] = new_value  # Update DataFrame
509 |                     self.tree.set(item, column=col_index, value=new_value)  # Update Treeview
510 |                 else:
511 |                     print(f"Index {df_index} is out of bounds for the DataFrame.")
512 |             except IndexError as e:
513 |                 print(f"Error updating cell: {e}")
514 |     
515 |     
516 |     def encode_labels(self):
517 |         categ_col = simpledialog.askstring("Input", f"Enter column name for Label Encoding:")
518 |         # Check if the column exists in the dataframe
519 |         if categ_col not in self.dataframe.columns:
520 |             raise ValueError(f"The column '{categ_col}' does not exist in the dataframe.")
521 |     
522 |         # Initialize the LabelEncoder
523 |         le = LabelEncoder()
524 |     
525 |         # Fit and transform the data in the column
526 |         self.dataframe[categ_col] = le.fit_transform(self.dataframe[categ_col])
527 |     
528 |         return self.dataframe   
529 |     
530 |     
531 |     def add_row(self):
532 |         new_row_index = len(self.dataframe)  # Next row index
533 |         self.dataframe.loc[new_row_index] = [None] * len(self.dataframe.columns)  # Initialize new row with None or suitable defaults
534 |         self.tree.insert('', 'end', values=([None] * len(self.dataframe.columns)))  # Add new row to Treeview as well
535 | 
536 | 
537 |     def delete_row(self):
538 |         selected_item = self.tree.selection()[0]  # Treeview's selected item ID
539 |         if selected_item:
540 |         # Assuming the order of items in the Treeview matches the DataFrame's index order
541 |             index_in_df = self.tree.index(selected_item)  # Get index of the item in the Treeview
542 |             df_index_to_delete = self.dataframe.index[index_in_df]  # Get corresponding DataFrame index
543 |             self.dataframe.drop(df_index_to_delete, inplace=True)  # Drop the row from the DataFrame
544 |             self.tree.delete(selected_item)  # Delete the item from the Treeview
545 | 
546 | 
547 |     def add_column(self):
548 |         new_column_name = simpledialog.askstring("Input", "Enter new column name:", parent=self.root)
549 |         if new_column_name:
550 |             self.dataframe[new_column_name] = ""
551 |             self.setup_tree_view()
552 | 
553 |     def delete_column(self):
554 |         column_name = simpledialog.askstring("Input", "Enter column name to delete:", parent=self.root)
555 |         if column_name and column_name in self.dataframe.columns:
556 |         # Drop the column from the DataFrame
557 |             self.dataframe.drop(columns=[column_name], inplace=True)
558 |         
559 |         # Rebuild the Treeview to reflect the change
560 |             self.rebuild_treeview()
561 | 
562 |     def rebuild_treeview(self):
563 |     # Clear the existing columns and data in the Treeview
564 |         for col in self.tree['columns']:
565 |             self.tree.delete(*self.tree.get_children())
566 |             self.tree.heading(col, text='')
567 |             self.tree.column(col, width=0, minwidth=0)
568 | 
569 |     # Setup the Treeview again with the updated DataFrame
570 |         self.setup_tree_view()
571 |         
572 |     def update_frame(self):
573 |         self.setup_tree_view()
574 |         
575 |     def save_df(self):
576 |         self.dataframe.to_csv('current_dataframe.csv', encoding='utf-8', index=False)
577 |         messagebox.showinfo("Info", "Your dataframe was saved to current_dataframe.csv in current folder.")     
578 |     
579 | def load_plugins(directory: str, app):
580 |     for filename in os.listdir(directory):
581 |         if filename.endswith('.py') and not filename.startswith('__'):
582 |             plugin_path = os.path.join(directory, filename)
583 |             module_name = os.path.splitext(filename)[0]
584 |             spec = importlib.util.spec_from_file_location(module_name, plugin_path)
585 |             module = importlib.util.module_from_spec(spec)
586 |             spec.loader.exec_module(module)
587 |             # Check if the module has a register function and call it with the app instance
588 |             if hasattr(module, 'register'):
589 |                 module.register(app)        
590 |    
591 | 
592 | if __name__ == "__main__":
593 |     root = tk.Tk()
594 |     root.geometry("1200x680")
595 |     
596 |     root.iconphoto(False, tk.PhotoImage(file='icon.png'))
597 |     # Use a file dialog to get the initial CSV file path
598 |     initial_file_path = filedialog.askopenfilename(filetypes=[("CSV files", "*.csv")])
599 | 
600 |     if not initial_file_path:
601 |         messagebox.showinfo("Info", "No file selected. Exiting.")
602 |         root.destroy()
603 |         quit()
604 |     try:
605 |         # Load the initial CSV file into a DataFrame
606 |         initial_df = pd.read_csv(initial_file_path, engine='python')
607 |     except Exception as e:
608 |         messagebox.showerror("Error", f"Error loading initial CSV file: {e}")
609 |         root.destroy()  # destroy the root window in case of an error
610 |         quit()
611 | 
612 |     app = DataFrameEditor(root, initial_df)
613 |     try:
614 |         load_plugins('plugins', app)
615 |     except RuntimeError as error:
616 |         print(error)
617 |         print("Some plugins did not load correctly and it may not work.")
618 |         pass
619 |     root.mainloop()


--------------------------------------------------------------------------------
/License.txt:
--------------------------------------------------------------------------------
  1 |                     GNU GENERAL PUBLIC LICENSE
  2 |                        Version 2, June 1991
  3 | 
  4 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
  5 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  6 |  Everyone is permitted to copy and distribute verbatim copies
  7 |  of this license document, but changing it is not allowed.
  8 | 
  9 |                             Preamble
 10 | 
 11 |   The licenses for most software are designed to take away your
 12 | freedom to share and change it.  By contrast, the GNU General Public
 13 | License is intended to guarantee your freedom to share and change free
 14 | software--to make sure the software is free for all its users.  This
 15 | General Public License applies to most of the Free Software
 16 | Foundation's software and to any other program whose authors commit to
 17 | using it.  (Some other Free Software Foundation software is covered by
 18 | the GNU Lesser General Public License instead.)  You can apply it to
 19 | your programs, too.
 20 | 
 21 |   When we speak of free software, we are referring to freedom, not
 22 | price.  Our General Public Licenses are designed to make sure that you
 23 | have the freedom to distribute copies of free software (and charge for
 24 | this service if you wish), that you receive source code or can get it
 25 | if you want it, that you can change the software or use pieces of it
 26 | in new free programs; and that you know you can do these things.
 27 | 
 28 |   To protect your rights, we need to make restrictions that forbid
 29 | anyone to deny you these rights or to ask you to surrender the rights.
 30 | These restrictions translate to certain responsibilities for you if you
 31 | distribute copies of the software, or if you modify it.
 32 | 
 33 |   For example, if you distribute copies of such a program, whether
 34 | gratis or for a fee, you must give the recipients all the rights that
 35 | you have.  You must make sure that they, too, receive or can get the
 36 | source code.  And you must show them these terms so they know their
 37 | rights.
 38 | 
 39 |   We protect your rights with two steps: (1) copyright the software, and
 40 | (2) offer you this license which gives you legal permission to copy,
 41 | distribute and/or modify the software.
 42 | 
 43 |   Also, for each author's protection and ours, we want to make certain
 44 | that everyone understands that there is no warranty for this free
 45 | software.  If the software is modified by someone else and passed on, we
 46 | want its recipients to know that what they have is not the original, so
 47 | that any problems introduced by others will not reflect on the original
 48 | authors' reputations.
 49 | 
 50 |   Finally, any free program is threatened constantly by software
 51 | patents.  We wish to avoid the danger that redistributors of a free
 52 | program will individually obtain patent licenses, in effect making the
 53 | program proprietary.  To prevent this, we have made it clear that any
 54 | patent must be licensed for everyone's free use or not licensed at all.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                     GNU GENERAL PUBLIC LICENSE
 60 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 61 | 
 62 |   0. This License applies to any program or other work which contains
 63 | a notice placed by the copyright holder saying it may be distributed
 64 | under the terms of this General Public License.  The "Program", below,
 65 | refers to any such program or work, and a "work based on the Program"
 66 | means either the Program or any derivative work under copyright law:
 67 | that is to say, a work containing the Program or a portion of it,
 68 | either verbatim or with modifications and/or translated into another
 69 | language.  (Hereinafter, translation is included without limitation in
 70 | the term "modification".)  Each licensee is addressed as "you".
 71 | 
 72 | Activities other than copying, distribution and modification are not
 73 | covered by this License; they are outside its scope.  The act of
 74 | running the Program is not restricted, and the output from the Program
 75 | is covered only if its contents constitute a work based on the
 76 | Program (independent of having been made by running the Program).
 77 | Whether that is true depends on what the Program does.
 78 | 
 79 |   1. You may copy and distribute verbatim copies of the Program's
 80 | source code as you receive it, in any medium, provided that you
 81 | conspicuously and appropriately publish on each copy an appropriate
 82 | copyright notice and disclaimer of warranty; keep intact all the
 83 | notices that refer to this License and to the absence of any warranty;
 84 | and give any other recipients of the Program a copy of this License
 85 | along with the Program.
 86 | 
 87 | You may charge a fee for the physical act of transferring a copy, and
 88 | you may at your option offer warranty protection in exchange for a fee.
 89 | 
 90 |   2. You may modify your copy or copies of the Program or any portion
 91 | of it, thus forming a work based on the Program, and copy and
 92 | distribute such modifications or work under the terms of Section 1
 93 | above, provided that you also meet all of these conditions:
 94 | 
 95 |     a) You must cause the modified files to carry prominent notices
 96 |     stating that you changed the files and the date of any change.
 97 | 
 98 |     b) You must cause any work that you distribute or publish, that in
 99 |     whole or in part contains or is derived from the Program or any
100 |     part thereof, to be licensed as a whole at no charge to all third
101 |     parties under the terms of this License.
102 | 
103 |     c) If the modified program normally reads commands interactively
104 |     when run, you must cause it, when started running for such
105 |     interactive use in the most ordinary way, to print or display an
106 |     announcement including an appropriate copyright notice and a
107 |     notice that there is no warranty (or else, saying that you provide
108 |     a warranty) and that users may redistribute the program under
109 |     these conditions, and telling the user how to view a copy of this
110 |     License.  (Exception: if the Program itself is interactive but
111 |     does not normally print such an announcement, your work based on
112 |     the Program is not required to print an announcement.)
113 | 
114 | These requirements apply to the modified work as a whole.  If
115 | identifiable sections of that work are not derived from the Program,
116 | and can be reasonably considered independent and separate works in
117 | themselves, then this License, and its terms, do not apply to those
118 | sections when you distribute them as separate works.  But when you
119 | distribute the same sections as part of a whole which is a work based
120 | on the Program, the distribution of the whole must be on the terms of
121 | this License, whose permissions for other licensees extend to the
122 | entire whole, and thus to each and every part regardless of who wrote it.
123 | 
124 | Thus, it is not the intent of this section to claim rights or contest
125 | your rights to work written entirely by you; rather, the intent is to
126 | exercise the right to control the distribution of derivative or
127 | collective works based on the Program.
128 | 
129 | In addition, mere aggregation of another work not based on the Program
130 | with the Program (or with a work based on the Program) on a volume of
131 | a storage or distribution medium does not bring the other work under
132 | the scope of this License.
133 | 
134 |   3. You may copy and distribute the Program (or a work based on it,
135 | under Section 2) in object code or executable form under the terms of
136 | Sections 1 and 2 above provided that you also do one of the following:
137 | 
138 |     a) Accompany it with the complete corresponding machine-readable
139 |     source code, which must be distributed under the terms of Sections
140 |     1 and 2 above on a medium customarily used for software interchange; or,
141 | 
142 |     b) Accompany it with a written offer, valid for at least three
143 |     years, to give any third party, for a charge no more than your
144 |     cost of physically performing source distribution, a complete
145 |     machine-readable copy of the corresponding source code, to be
146 |     distributed under the terms of Sections 1 and 2 above on a medium
147 |     customarily used for software interchange; or,
148 | 
149 |     c) Accompany it with the information you received as to the offer
150 |     to distribute corresponding source code.  (This alternative is
151 |     allowed only for noncommercial distribution and only if you
152 |     received the program in object code or executable form with such
153 |     an offer, in accord with Subsection b above.)
154 | 
155 | The source code for a work means the preferred form of the work for
156 | making modifications to it.  For an executable work, complete source
157 | code means all the source code for all modules it contains, plus any
158 | associated interface definition files, plus the scripts used to
159 | control compilation and installation of the executable.  However, as a
160 | special exception, the source code distributed need not include
161 | anything that is normally distributed (in either source or binary
162 | form) with the major components (compiler, kernel, and so on) of the
163 | operating system on which the executable runs, unless that component
164 | itself accompanies the executable.
165 | 
166 | If distribution of executable or object code is made by offering
167 | access to copy from a designated place, then offering equivalent
168 | access to copy the source code from the same place counts as
169 | distribution of the source code, even though third parties are not
170 | compelled to copy the source along with the object code.
171 | 
172 |   4. You may not copy, modify, sublicense, or distribute the Program
173 | except as expressly provided under this License.  Any attempt
174 | otherwise to copy, modify, sublicense or distribute the Program is
175 | void, and will automatically terminate your rights under this License.
176 | However, parties who have received copies, or rights, from you under
177 | this License will not have their licenses terminated so long as such
178 | parties remain in full compliance.
179 | 
180 |   5. You are not required to accept this License, since you have not
181 | signed it.  However, nothing else grants you permission to modify or
182 | distribute the Program or its derivative works.  These actions are
183 | prohibited by law if you do not accept this License.  Therefore, by
184 | modifying or distributing the Program (or any work based on the
185 | Program), you indicate your acceptance of this License to do so, and
186 | all its terms and conditions for copying, distributing or modifying
187 | the Program or works based on it.
188 | 
189 |   6. Each time you redistribute the Program (or any work based on the
190 | Program), the recipient automatically receives a license from the
191 | original licensor to copy, distribute or modify the Program subject to
192 | these terms and conditions.  You may not impose any further
193 | restrictions on the recipients' exercise of the rights granted herein.
194 | You are not responsible for enforcing compliance by third parties to
195 | this License.
196 | 
197 |   7. If, as a consequence of a court judgment or allegation of patent
198 | infringement or for any other reason (not limited to patent issues),
199 | conditions are imposed on you (whether by court order, agreement or
200 | otherwise) that contradict the conditions of this License, they do not
201 | excuse you from the conditions of this License.  If you cannot
202 | distribute so as to satisfy simultaneously your obligations under this
203 | License and any other pertinent obligations, then as a consequence you
204 | may not distribute the Program at all.  For example, if a patent
205 | license would not permit royalty-free redistribution of the Program by
206 | all those who receive copies directly or indirectly through you, then
207 | the only way you could satisfy both it and this License would be to
208 | refrain entirely from distribution of the Program.
209 | 
210 | If any portion of this section is held invalid or unenforceable under
211 | any particular circumstance, the balance of the section is intended to
212 | apply and the section as a whole is intended to apply in other
213 | circumstances.
214 | 
215 | It is not the purpose of this section to induce you to infringe any
216 | patents or other property right claims or to contest validity of any
217 | such claims; this section has the sole purpose of protecting the
218 | integrity of the free software distribution system, which is
219 | implemented by public license practices.  Many people have made
220 | generous contributions to the wide range of software distributed
221 | through that system in reliance on consistent application of that
222 | system; it is up to the author/donor to decide if he or she is willing
223 | to distribute software through any other system and a licensee cannot
224 | impose that choice.
225 | 
226 | This section is intended to make thoroughly clear what is believed to
227 | be a consequence of the rest of this License.
228 | 
229 |   8. If the distribution and/or use of the Program is restricted in
230 | certain countries either by patents or by copyrighted interfaces, the
231 | original copyright holder who places the Program under this License
232 | may add an explicit geographical distribution limitation excluding
233 | those countries, so that distribution is permitted only in or among
234 | countries not thus excluded.  In such case, this License incorporates
235 | the limitation as if written in the body of this License.
236 | 
237 |   9. The Free Software Foundation may publish revised and/or new versions
238 | of the General Public License from time to time.  Such new versions will
239 | be similar in spirit to the present version, but may differ in detail to
240 | address new problems or concerns.
241 | 
242 | Each version is given a distinguishing version number.  If the Program
243 | specifies a version number of this License which applies to it and "any
244 | later version", you have the option of following the terms and conditions
245 | either of that version or of any later version published by the Free
246 | Software Foundation.  If the Program does not specify a version number of
247 | this License, you may choose any version ever published by the Free Software
248 | Foundation.
249 | 
250 |   10. If you wish to incorporate parts of the Program into other free
251 | programs whose distribution conditions are different, write to the author
252 | to ask for permission.  For software which is copyrighted by the Free
253 | Software Foundation, write to the Free Software Foundation; we sometimes
254 | make exceptions for this.  Our decision will be guided by the two goals
255 | of preserving the free status of all derivatives of our free software and
256 | of promoting the sharing and reuse of software generally.
257 | 
258 |                             NO WARRANTY
259 | 
260 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
268 | REPAIR OR CORRECTION.
269 | 
270 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
278 | POSSIBILITY OF SUCH DAMAGES.
279 | 
280 |                      END OF TERMS AND CONDITIONS
281 | 
282 |             How to Apply These Terms to Your New Programs
283 | 
284 |   If you develop a new program, and you want it to be of the greatest
285 | possible use to the public, the best way to achieve this is to make it
286 | free software which everyone can redistribute and change under these terms.
287 | 
288 |   To do so, attach the following notices to the program.  It is safest
289 | to attach them to the start of each source file to most effectively
290 | convey the exclusion of warranty; and each file should have at least
291 | the "copyright" line and a pointer to where the full notice is found.
292 | 
293 |     <one line to give the program's name and a brief idea of what it does.>
294 |     Copyright (C) <year>  <name of author>
295 | 
296 |     This program is free software; you can redistribute it and/or modify
297 |     it under the terms of the GNU General Public License as published by
298 |     the Free Software Foundation; either version 2 of the License, or
299 |     (at your option) any later version.
300 | 
301 |     This program is distributed in the hope that it will be useful,
302 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
303 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
304 |     GNU General Public License for more details.
305 | 
306 |     You should have received a copy of the GNU General Public License along
307 |     with this program; if not, write to the Free Software Foundation, Inc.,
308 |     51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
309 | 
310 | Also add information on how to contact you by electronic and paper mail.
311 | 
312 | If the program is interactive, make it output a short notice like this
313 | when it starts in an interactive mode:
314 | 
315 |     Gnomovision version 69, Copyright (C) year name of author
316 |     Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
317 |     This is free software, and you are welcome to redistribute it
318 |     under certain conditions; type `show c' for details.
319 | 
320 | The hypothetical commands `show w' and `show c' should show the appropriate
321 | parts of the General Public License.  Of course, the commands you use may
322 | be called something other than `show w' and `show c'; they could even be
323 | mouse-clicks or menu items--whatever suits your program.
324 | 
325 | You should also get your employer (if you work as a programmer) or your
326 | school, if any, to sign a "copyright disclaimer" for the program, if
327 | necessary.  Here is a sample; alter the names:
328 | 
329 |   Yoyodyne, Inc., hereby disclaims all copyright interest in the program
330 |   `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 
332 |   <signature of Ty Coon>, 1 April 1989
333 |   Ty Coon, President of Vice
334 | 
335 | This General Public License does not permit incorporating your program into
336 | proprietary programs.  If your program is a subroutine library, you may
337 | consider it more useful to permit linking proprietary applications with the
338 | library.  If this is what you want to do, use the GNU Lesser General
339 | Public License instead of this License.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <a name="readme-top"></a>
  2 | 
  3 | ![Python][Python]
  4 | <!-- PROJECT LOGO -->
  5 | <br />
  6 | <div align="center">
  7 |   <a href="https://github.com/MariusNea/Aurora">
  8 |     <img src="images/logo.png" alt="Logo" width="128" height="128">
  9 |   </a>
 10 | 
 11 |   <h3 align="center">Aurora</h3>
 12 | 
 13 |   <p align="center">
 14 |     Problem solving focused statistical and machine learning software toolkit.
 15 |     <br />
 16 |     <a href="https://github.com/MariusNea/Aurora/issues/new?labels=bug&template=bug-report---.md">Report Bug</a>
 17 |     ·
 18 |     <a href="https://github.com/MariusNea/Aurora/issues/new?labels=enhancement&template=feature-request---.md">Request Feature</a>
 19 |   </p>
 20 | </div>
 21 | 
 22 | 
 23 | 
 24 | <!-- ABOUT THE PROJECT -->
 25 | ## About The Project
 26 | 
 27 | In today's world, the fields of statistics and machine learning hold immense potential for solving real-world problems and significantly impacting businesses and daily life. However, the complexity and learning curve associated with these fields can be daunting, making it challenging for those interested to effectively utilize these tools. Recognizing this gap, we've developed AURORA, a software solution crafted to make the power of statistical and machine learning models more accessible to everyone.
 28 | 
 29 | AURORA is designed with the principle that tools that are capable of addressing a diverse range of problems should be within reach of anyone interested in applying scientific methods to their decision-making processes. Our aim is to remove the barriers posed by the need for specialized training, making it easier for individuals to leverage these models in their activities.
 30 | 
 31 | Aurora comprises three main components:
 32 | 
 33 | 1. **Algorithms Component**: This section encompasses various algorithms essential to Aurora's functionality.
 34 | 2. **Data Gathering Module**: This module is responsible for collecting data from multiple sources, including web scraping tools.
 35 | 3. **Automated Problem Solver Module**: Utilizing Natural Language Processing, this module assists users in navigating and applying interactively Aurora's capabilities to address their specific issues effectively.
 36 | <p align="right">(<a href="#readme-top">back to top</a>)</p>
 37 | 
 38 | ### Examples
 39 | 
 40 | Using Text Classifier from Aurora to predict if a message is spam or not
 41 | 
 42 | [![Watch the video](https://img.youtube.com/vi/ntX30JjQB8M/0.jpg)](https://www.youtube.com/watch?v=ntX30JjQB8M)
 43 | 
 44 | Predict employee churn using AURORA
 45 | 
 46 | [![Watch the video](https://img.youtube.com/vi/fY1UBiRSwLg/0.jpg)](https://www.youtube.com/watch?v=fY1UBiRSwLg)
 47 | 
 48 | ### Built With
 49 | 
 50 | * ![Matplotlib][Matplotlib]
 51 | * ![Pandas][Pandas]
 52 | * ![Scikit-learn][scikit-learn]
 53 | 
 54 | <p align="right">(<a href="#readme-top">back to top</a>)</p>
 55 | 
 56 | 
 57 | 
 58 | ### Prerequisites
 59 | 
 60 | Make sure you have Python >=3.9 installed
 61 | 
 62 | ### Installation
 63 | 
 64 | 1. Clone the repo
 65 |    ```sh
 66 |    git clone https://github.com/MariusNea/Aurora.git
 67 |    ```
 68 | 2. Install libraries
 69 |    ```sh
 70 |    pip install -r requirements.txt
 71 |    ```
 72 | 
 73 | <p align="right">(<a href="#readme-top">back to top</a>)</p>
 74 | 
 75 | 
 76 | <!-- USAGE EXAMPLES -->
 77 | ## Usage
 78 | 
 79 |    ```sh
 80 |    python -m Aurora
 81 |    ```
 82 | The process commences with your .csv file containing the requisite information, which is initially imported as a dataframe into AURORA. Subsequently, all models are applied based on this dataframe.
 83 | 
 84 | <h4>Structuring the Dataframe for plugins</h4>
 85 | 
 86 | Every plugin comes with its own documentation except the core plugins which are described here.
 87 | 
 88 | <h5>Regression Algorithms</h5>
 89 | 
 90 | Within the dataframe, all columns except the last one function as features, while the final column represents the predicted variable. The Linear Regression algorithm can accommodate any type of numerical data in the predicted column, whereas Logistic Regression and Decision Trees are suitable for categorical data.
 91 | 
 92 | <h5>Mann-Whitney U Test</h5>
 93 | 
 94 | This test is conducted between two consecutive columns in the dataframe. For instance, if there are four columns named data_1, data_2, data_3, and data_4, the Mann-Whitney U Test is performed between data_1 and data_2, and then between data_3 and data_4, respectively. Consequently, the dataframe must have an even number of columns.
 95 | 
 96 | <h5>ANOVA</h5>
 97 | 
 98 | Firs column of the dataframe must contain your tests categories. All other column must be numeric and represents the results of your tests. If your dataframe contains cells without values, AURORA will clean it automatically.
 99 | 
100 | For a practical example, let's consider a scenario where a researcher wants to analyze the impact of three different types of fertilizer on the growth of plants. The researcher has three groups of plants, each group receiving a different type of fertilizer. The goal is to see if there's a significant difference in the growth of plants (measured in height) across these groups.
101 | 
102 | CSV example:
103 |  |No   |     Fertilizer_Type | Height_After_1_Month | Height_After_2_Months | Height_After_3_Months |
104 |  |-----|---------------------|---------------------|------------------------|-----------------------|
105 | | 0    |     Type_A          |         5.1         |            7.2         |           9.8 |
106 | | 1    |     Type_B          |         4.8         |            7.0         |          10.1 |
107 | | 2    |     Type_C          |         5.3         |            7.9         |          10.5 |
108 | | 3    |     Type_A          |         5.5         |            7.5         |           9.9 |
109 | | 4    |     Type_B          |         4.9         |            7.1         |          10.0 |
110 | | 5    |     Type_C          |         5.0         |            7.8         |          10.2 |
111 | ...
112 | 
113 | 
114 | <h5>Outliers (Anomaly) Detection</h5>
115 | 
116 | This plugin uses Isolation Forest algorithm to detect outliers in timeseries. From your dataframe select column on which you want to apply algorithm. The result will be a plot with both inliers(red) and outliers (blue).
117 | 
118 | <h5>Principal Component Analysis (PCA)</h5>
119 | 
120 | To apply this plugin on your dataframe, the last column must be the target column and others columns must be features columns. The output will be a .csv file with components.
121 | 
122 | <h4>Screenshots from main GUI</h4>
123 | 
124 | ![Product Name Screen Shot][product-screenshot]
125 | ![Product Name Screen Shot2][product-screenshot2]
126 | <p align="right">(<a href="#readme-top">back to top</a>)</p>
127 | 
128 | 
129 | <!-- ROADMAP -->
130 | ## Roadmap
131 | 
132 | - [x] Implement Plot & CrossSelect
133 | - [x] Implement Dataframe Edit 
134 | - [x] Implement Dataframe Pagination for fast loading
135 | - [x] Implement Linear Regression
136 | - [x] Implement Logistic Regression
137 | - [x] Implement Decision Tree
138 | - [x] Implement Time Series Decomposition
139 | - [x] Implement One Way ANOVA
140 | - [x] Implement Canonical Correlation Analysis
141 | - [x] Implement Exponential Smoothing Model
142 | - [x] Implement Mann-Whitney U Test
143 | - [x] Implement Poisson Probabilities
144 | - [x] Implement Anomaly (Outliers) Detection
145 | - [x] Implement Principal Component Analysis
146 | - [x] Implement Support Vector Machines
147 | - [x] Implement K-Nearest Neighbors
148 | - [x] Implement K-Means
149 | - [x] Implement Histogram
150 | - [x] Implement Text Classifier
151 | - [x] Implement Denoising Autoencoder
152 | - [x] Implement XGBoost (Regression and Classification)
153 | - [x] Implement Pearson Correlation
154 | - [ ] Implement Monte Carlo Simulation
155 | - [ ] Implement Interactive Web Scraper
156 | - [ ] Develop multiple methods for interactive data gathering 
157 | - [ ] Implement Automated Problem Solver
158 | 
159 | <p align="right">(<a href="#readme-top">back to top</a>)</p>
160 | 
161 | 
162 | <!-- CONTRIBUTING -->
163 | ## Contributing
164 | 
165 | Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**.
166 | 
167 | If you have a suggestion that would make this better, please fork the repo and create a pull request. You can also simply open an issue with the tag "enhancement".
168 | Don't forget to give the project a star! Thanks again!
169 | 
170 | For contributing to the project follow steps described <a href="https://docs.github.com/en/get-started/exploring-projects-on-github/contributing-to-a-project">here</a>
171 | 
172 | <p align="right">(<a href="#readme-top">back to top</a>)</p>
173 | 
174 | 
175 | 
176 | <!-- LICENSE -->
177 | ## License
178 | 
179 | This project is dual licensed. Distributed under the GPL-2.0 license and a commercial license. See `LICENSE.txt` for GPL-2.0.
180 | 
181 | <p align="right">(<a href="#readme-top">back to top</a>)</p>
182 | 
183 | 
184 | 
185 | <!-- CONTACT -->
186 | ## Contact
187 | 
188 | Find more <a href="https://mariusneagoe.com">here</a>
189 | 
190 | Show your support  [!["Buy Me A Coffee"](https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png)](https://buymeacoffee.com/mariussorid)
191 | <p align="right">(<a href="#readme-top">back to top</a>)</p>
192 | 
193 | 
194 | 
195 | <!-- MARKDOWN LINKS & IMAGES -->
196 | <!-- https://www.markdownguide.org/basic-syntax/#reference-style-links -->
197 | [product-screenshot]: images/ss1.png
198 | [product-screenshot2]: images/ss2.png
199 | [Matplotlib]: https://img.shields.io/badge/Matplotlib-%23ffffff.svg?style=for-the-badge&logo=Matplotlib&logoColor=black
200 | [Pandas]: https://img.shields.io/badge/pandas-%23150458.svg?style=for-the-badge&logo=pandas&logoColor=white
201 | [scikit-learn]: https://img.shields.io/badge/scikit--learn-%23F7931E.svg?style=for-the-badge&logo=scikit-learn&logoColor=white
202 | [Python]: https://ForTheBadge.com/images/badges/made-with-python.svg


--------------------------------------------------------------------------------
/__pycache__/Aurora.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/__pycache__/Aurora.cpython-38.pyc


--------------------------------------------------------------------------------
/__pycache__/Aurora.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/__pycache__/Aurora.cpython-39.pyc


--------------------------------------------------------------------------------
/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/icon.png


--------------------------------------------------------------------------------
/images/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/images/logo.png


--------------------------------------------------------------------------------
/images/ss1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/images/ss1.png


--------------------------------------------------------------------------------
/images/ss2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/images/ss2.png


--------------------------------------------------------------------------------
/plugins/IsolationForest.py:
--------------------------------------------------------------------------------
 1 | # plugins/IsolationForest.py
 2 | 
 3 | 
 4 | #####################################################
 5 | #### Package: Aurora
 6 | #### Plugin: Outliers (Anomaly) Detection
 7 | #### Version: 0.1
 8 | #### Author: Marius Neagoe
 9 | #### Copyright: © 2024 Marius Neagoe
10 | #### Website: https://mariusneagoe.com
11 | #### Github: https://github.com/MariusNea/Aurora
12 | #####################################################
13 |     
14 | import pandas as pd
15 | from sklearn.ensemble import IsolationForest
16 | import matplotlib.pyplot as plt
17 | import tkinter as tk
18 | from tkinter.simpledialog import askstring
19 | from tkinter import messagebox
20 | 
21 | 
22 | def run_isolation_forest(df):
23 |     col = ask_col()
24 |     contamination = ask_contamination()  # Get contamination from the user    
25 |     df_part = df[col]
26 | 
27 |     # Initialize the Isolation Forest model
28 |     model = IsolationForest(contamination=float(contamination), random_state=42)
29 |     
30 |     # Fit the model on the data
31 |     # Note: .values.reshape(-1, 1) reshapes data for a single feature
32 |     model.fit(df_part.values.reshape(-1, 1))
33 |     
34 |     # Predict outliers
35 |     preds = model.predict(df_part.values.reshape(-1, 1))
36 |     
37 |     # Add predictions to the DataFrame
38 |     df['outlier_' + col] = preds
39 |     
40 |     # Filter outliers
41 |     outliers = df[df['outlier_' + col] == -1]
42 |     
43 |     # Plotting
44 |     plt.figure(figsize=(10, 6))
45 |     # Plot all data points, using a scatter plot
46 |     # Y-values are zeros since it's a single dimension, with slight jitter added for visualization
47 |     plt.scatter(df[col], [0 + jitter for jitter in preds * 0.02], c=preds, cmap='coolwarm', edgecolor='k', s=20)
48 |     plt.title('Data Points Classified by Isolation Forest')
49 |     plt.xlabel(col)  # X-axis label as the column name
50 |     plt.yticks([])  # Hide Y-axis ticks since they are arbitrary
51 |     plt.legend(['Inliers', 'Outliers'], loc='lower right')
52 |     plt.savefig('outlier_plot' + '_column_' + col + '.png')
53 |     plt.show()
54 | 
55 | def ask_contamination():
56 |     root = tk.Tk()
57 |     root.withdraw()  # We don't want a full GUI, so keep the root window from appearing
58 |     contamination = askstring("Input", "Enter the contamination factor (e.g., 0.01):", parent=root)
59 |     root.destroy()
60 | 
61 |     return contamination    
62 |     
63 | def ask_col():
64 |     root = tk.Tk()
65 |     root.withdraw()  # We don't want a full GUI, so keep the root window from appearing
66 |     col = askstring("Input", "Enter column name on which tou want to perform outlier detection:", parent=root)
67 |     root.destroy()
68 | 
69 |     return col
70 |     
71 | def register(app):
72 |     @app.register_plugin('machine_learning', 'isolation_forest', 'Outliers (Anomaly) Detection')
73 |     def isolation_forest():
74 |         global df
75 |         df = app.get_dataframe()
76 |         run_isolation_forest(df)
77 |         messagebox.showinfo("Results", "Your data was saved as a image in current folder.")
78 |         
79 | 
80 | 
81 | 


--------------------------------------------------------------------------------
/plugins/Mann_Whit.py:
--------------------------------------------------------------------------------
 1 | # plugins/plugin_a.py
 2 | 
 3 | 
 4 | #####################################################
 5 | #### Package: Aurora
 6 | #### Plugin: Mann-Whitney U Test
 7 | #### Version: 0.1
 8 | #### Author: Marius Neagoe
 9 | #### Copyright: © 2024 Marius Neagoe
10 | #### Website: https://mariusneagoe.com
11 | #### Github: https://github.com/MariusNea/Aurora
12 | #####################################################
13 | 
14 | 
15 | from scipy.stats import mannwhitneyu
16 | from tkinter import messagebox
17 | import tkinter as tk
18 | from tkinter import ttk
19 | import pandas as pd
20 | 
21 | def register(app):
22 |     @app.register_plugin('statistics','mann_whitney_u_test', 'Mann-Whitney U Test')
23 |     def mann_whitney_u_test():
24 |         df = app.get_dataframe()
25 |     # Check if the number of columns is even
26 |         if len(df.columns) % 2 != 0:
27 |             error_message = "Error: The number of columns in the dataframe must be even. The test is done on the columns that are placed one next to another."
28 |             messagebox.showerror("Error", error_message)
29 |             return
30 | 
31 |         # Create tkinter window
32 |         root = tk.Tk()
33 |         root.title("Mann-Whitney U Test Results")
34 | 
35 |         # Create treeview to display results
36 |         tree = ttk.Treeview(root)
37 |         tree["columns"] = ("Column Pair", "U Statistic", "P-Value")
38 | 
39 |         # Define treeview columns
40 |         tree.column("#0", width=0, stretch=tk.NO)
41 |         tree.column("Column Pair", anchor=tk.W, width=100)
42 |         tree.column("U Statistic", anchor=tk.W, width=100)
43 |         tree.column("P-Value", anchor=tk.W, width=100)
44 | 
45 |         # Create treeview headings
46 |         tree.heading("#0", text="", anchor=tk.W)
47 |         tree.heading("Column Pair", text="Column Pair", anchor=tk.W)
48 |         tree.heading("U Statistic", text="U Statistic", anchor=tk.W)
49 |         tree.heading("P-Value", text="P-Value", anchor=tk.W)
50 | 
51 |         # Perform Mann-Whitney U test for adjacent column pairs
52 |         for i in range(0, len(df.columns), 2):
53 |             df['column1_clean'] = pd.to_numeric(df.iloc[:, i], errors='coerce')
54 |             df['column2_clean'] = pd.to_numeric(df.iloc[:, i + 1], errors='coerce')
55 |             df_clean = df.dropna(subset=['column1_clean', 'column2_clean'])
56 |             result = mannwhitneyu(df_clean['column1_clean'], df_clean['column2_clean'])
57 | 
58 |         # Insert result into treeview
59 |             tree.insert("", i, values=(f"{df.columns[i]} - {df.columns[i+1]}", result.statistic, result.pvalue))
60 | 
61 |         # Pack and run tkinter window
62 |         tree.pack(expand=True, fill=tk.BOTH)
63 |         root.mainloop()
64 | 
65 | 


--------------------------------------------------------------------------------
/plugins/PCA.py:
--------------------------------------------------------------------------------
 1 | #plugins/pca.py
 2 | 
 3 | #####################################################
 4 | #### Package: Aurora
 5 | #### Plugin: Principal Component Analysis
 6 | #### Version: 0.1
 7 | #### Author: Marius Neagoe
 8 | #### Copyright: © 2024 Marius Neagoe
 9 | #### Website: https://mariusneagoe.com
10 | #### Github: https://github.com/MariusNea/Aurora
11 | #####################################################
12 | 
13 | import pandas as pd
14 | from sklearn.decomposition import PCA
15 | from sklearn.preprocessing import StandardScaler
16 | 
17 | 
18 | def register(app):
19 |     @app.register_plugin('machine_learning','perform_pca_and_export_csv', 'Principal Component Analysis')
20 |     def perform_pca_and_export_csv():
21 |     # Assuming 'df' is a known dataframe available in the scope
22 |         df = app.get_dataframe()
23 |     
24 |     # Separate features from the target
25 |         features = df.columns[:-1]  # Exclude the last column which is the target
26 |         target_column = df.columns[-1]  # The last column is the target
27 |         X = df.loc[:, features].values
28 |         y = df.loc[:, target_column].values
29 |     
30 |     # Standardize the features
31 |         X = StandardScaler().fit_transform(X)
32 |     
33 |     # Perform PCA
34 |         pca = PCA(n_components=2)  # Adjust n_components as needed
35 |         principalComponents = pca.fit_transform(X)
36 |     
37 |     # Create a DataFrame with the principal components
38 |         principalDf = pd.DataFrame(data=principalComponents, columns=['principal component 1', 'principal component 2'])
39 |     
40 |     # Add the target column to the DataFrame
41 |         principalDf[target_column] = y
42 |     
43 |     # Export to CSV
44 |         principalDf.to_csv('pca.csv', index=False)
45 | 
46 | # introduce in new menu, Machine Learning
47 | 


--------------------------------------------------------------------------------
/plugins/__pycache__/IsolationForest.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/plugins/__pycache__/IsolationForest.cpython-38.pyc


--------------------------------------------------------------------------------
/plugins/__pycache__/Mann_Whit.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/plugins/__pycache__/Mann_Whit.cpython-38.pyc


--------------------------------------------------------------------------------
/plugins/__pycache__/PCA.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/plugins/__pycache__/PCA.cpython-38.pyc


--------------------------------------------------------------------------------
/plugins/__pycache__/anova.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/plugins/__pycache__/anova.cpython-38.pyc


--------------------------------------------------------------------------------
/plugins/__pycache__/cca.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/plugins/__pycache__/cca.cpython-38.pyc


--------------------------------------------------------------------------------
/plugins/__pycache__/esm.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/plugins/__pycache__/esm.cpython-38.pyc


--------------------------------------------------------------------------------
/plugins/__pycache__/example_plugin_a.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/plugins/__pycache__/example_plugin_a.cpython-38.pyc


--------------------------------------------------------------------------------
/plugins/__pycache__/plugin_a.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/plugins/__pycache__/plugin_a.cpython-38.pyc


--------------------------------------------------------------------------------
/plugins/__pycache__/plugin_b.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/plugins/__pycache__/plugin_b.cpython-38.pyc


--------------------------------------------------------------------------------
/plugins/__pycache__/poisson_probabilities.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/plugins/__pycache__/poisson_probabilities.cpython-38.pyc


--------------------------------------------------------------------------------
/plugins/__pycache__/vine_copula.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MariusNea/Aurora/2c9a2594355b2d6f9e63f6e76776121a1e08cec3/plugins/__pycache__/vine_copula.cpython-38.pyc


--------------------------------------------------------------------------------
/plugins/anova.py:
--------------------------------------------------------------------------------
  1 | 
  2 | #####################################################
  3 | #### Package: Aurora
  4 | #### Plugin: One Way ANOVA
  5 | #### Version: 0.1
  6 | #### Author: Marius Neagoe
  7 | #### Copyright: © 2024 Marius Neagoe
  8 | #### Website: https://mariusneagoe.com
  9 | #### Github: https://github.com/MariusNea/Aurora
 10 | #####################################################
 11 | 
 12 | import pandas as pd
 13 | import numpy as np
 14 | from scipy import stats
 15 | import matplotlib.pyplot as plt
 16 | from tkinter import *
 17 | from tkinter import messagebox
 18 | from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
 19 | from statsmodels.stats.multicomp import pairwise_tukeyhsd
 20 | from scipy.stats import f_oneway
 21 | from statsmodels.formula.api import ols
 22 | from statsmodels.stats.anova import anova_lm
 23 | 
 24 | 
 25 | def validate_dataframe(dataframe):
 26 |     """
 27 |     Validates the DataFrame structure. Assumes the first column is categorical and the rest are numeric.
 28 |     Checks for at least one categorical column and at least two numeric columns.
 29 |     """
 30 |     # Check if the DataFrame has at least three columns (one categorical and at least two numeric)
 31 |     if dataframe.shape[1] < 3:
 32 |         raise ValueError("DataFrame must contain at least one categorical column and two numeric columns.")
 33 | 
 34 |     # Check if the first column is categorical (object or category dtype)
 35 |     if dataframe.dtypes[0] not in ['object', 'category']:
 36 |         raise ValueError("The first column must be categorical (type object or category).")
 37 |         exit()
 38 |     # Check if the remaining columns are numeric
 39 |     if not all(dataframe.dtypes[1:].apply(lambda dtype: np.issubdtype(dtype, np.number))):
 40 |         raise ValueError("All columns except the first must be numeric.")
 41 |         exit()
 42 |     # Check for missing values in the DataFrame
 43 |     if dataframe.isnull().any().any():
 44 |         print("Warning: DataFrame contains missing values. They will be handled appropriately.")
 45 | 
 46 | def handle_missing_values(dataframe):
 47 |     """
 48 |     Handles missing values by dropping rows with any missing values.
 49 |     """
 50 |     return dataframe.dropna()
 51 | 
 52 | def perform_anova_and_tukey(dataframe):
 53 |     """
 54 |     Performs ANOVA and Tukey's HSD test on the given DataFrame.
 55 |     Assumes the first column is categorical and the rest are numeric.
 56 |     """
 57 |     group_col = dataframe.columns[0]  # The first column as the categorical column
 58 |     numeric_cols = dataframe.columns[1:]  # The rest as numeric columns
 59 | 
 60 |     for col in numeric_cols:
 61 |         # Preparing groups for ANOVA
 62 |         groups = [dataframe[dataframe[group_col] == group][col].dropna() for group in dataframe[group_col].unique()]
 63 | 
 64 |         # Performing ANOVA
 65 |         f_stat, p_value = f_oneway(*groups)
 66 |         print(f"ANOVA result for {col}: F={f_stat}, p={p_value}")
 67 | 
 68 |         # If the p-value from ANOVA is significant, proceed with Tukey's HSD
 69 |         if p_value < 0.05:
 70 |             # Concatenating all group data into a single series for Tukey's test
 71 |             all_data = pd.concat(groups)
 72 |             all_groups = pd.concat([pd.Series([group] * len(g)) for group, g in zip(dataframe[group_col].unique(), groups)])
 73 | 
 74 |             # Performing Tukey's HSD test
 75 |             tukey = pairwise_tukeyhsd(endog=all_data, groups=all_groups, alpha=0.05)
 76 |             print(f"Tukey's HSD test result for {col}:\n{tukey}")
 77 |         else:
 78 |             print("ANOVA p-value > 0.05; Tukey's test not performed.")
 79 | 
 80 | 
 81 | 
 82 | def display_results(dataframe):
 83 |     """
 84 |     Orchestrates the analysis process, including validations, ANOVA, effect size calculation, 
 85 |     and graphical summary, then displays results and plots in a Tkinter window.
 86 |     """
 87 |     group_col = dataframe.columns[0]  # First column as categorical
 88 |     numeric_cols = dataframe.columns[1:]  # Remaining columns as numeric variables
 89 |     
 90 |     # Data Validation
 91 |     messagebox.showinfo("Preparation", "First column must be populated with categories that will take part on ANOVA. All other columns must be numeric.")
 92 |     validate_dataframe(dataframe)
 93 |     dataframe = handle_missing_values(dataframe)
 94 |     check_equal_variances(dataframe)
 95 |     check_normality(dataframe)
 96 |     calculate_effect_size(dataframe)
 97 |     # Perform ANOVA and Tukey's HSD Test
 98 |     perform_anova_and_tukey(dataframe)
 99 |     
100 | 
101 | def check_equal_variances(dataframe):
102 |     """
103 |     Checks for equal variances among groups using Levene's test for each numeric variable.
104 |     
105 |     :param dataframe: The pandas DataFrame containing the data.
106 |     :param group_col: The name of the column containing the categorical variable.
107 |     """
108 |     group_col = dataframe.columns[0]  # First column as categorical
109 |     numeric_cols = dataframe.columns[1:]  # Remaining columns as numeric variables
110 |     
111 |     for col in numeric_cols:
112 |         print(f"Levene's test for {col}:")
113 |         groups = [dataframe[dataframe[group_col] == group][col].dropna() for group in dataframe[group_col].unique()]
114 |         statistic, p_value = stats.levene(*groups)
115 |         if p_value < 0.05:
116 |             print(f"    Warning: Unequal variances detected (p-value: {p_value:.3f}).")
117 |         else:
118 |             print(f"    Equal variances confirmed (p-value: {p_value:.3f}).")
119 | 
120 | def check_normality(dataframe):
121 |     """
122 |     Checks for normality in each group for each numeric variable using the Shapiro-Wilk test.
123 |     
124 |     :param dataframe: The pandas DataFrame containing the data.
125 |     :param group_col: The name of the column containing the categorical variable.
126 |     """
127 |     group_col = dataframe.columns[0]  # First column as categorical
128 |     numeric_cols = dataframe.columns[1:]  # Remaining columns as numeric variables
129 |     
130 |     for col in numeric_cols:
131 |         print(f"Shapiro-Wilk test for normality in {col}:")
132 |         for group in dataframe[group_col].unique():
133 |             group_data = dataframe[dataframe[group_col] == group][col].dropna()
134 |             statistic, p_value = stats.shapiro(group_data)
135 |             if p_value < 0.05:
136 |                 print(f"    Group {group}: Non-normal distribution detected (p-value: {p_value:.3f}).")
137 |             else:
138 |                 print(f"    Group {group}: Normal distribution confirmed (p-value: {p_value:.3f}).")
139 | 
140 | # Integrate these functions into your existing workflow as needed, calling them before conducting ANOVA.
141 | def calculate_effect_size(dataframe):
142 |     """
143 |     Calculates the effect size (eta squared) for each numeric variable against the categorical variable.
144 |     
145 |     :param dataframe: The pandas DataFrame containing the data.
146 |     :param group_col: The name of the column containing the categorical variable.
147 |     :return: A dictionary with numeric columns as keys and their eta squared values as values.
148 |     """
149 |     eta_squared_values = {}
150 |     group_col = dataframe.columns[0]  # First column as categorical
151 |     numeric_cols = dataframe.columns[1:]  # Remaining columns as numeric variables
152 |     
153 |     for col in numeric_cols:
154 |         formula = f"{col} ~ C({group_col})"
155 |         model = ols(formula, data=dataframe).fit()
156 |         aov_table = anova_lm(model, typ=2)
157 |         
158 |         ss_between = aov_table.sum_sq['C({})'.format(group_col)]  # Corrected access method
159 |         ss_total = sum(aov_table.sum_sq)
160 |         eta_squared = ss_between / ss_total
161 |         eta_squared_values[col] = eta_squared
162 |     print("Effect sizes of groups:")
163 |     print(eta_squared_values)   
164 | 
165 | def register(app):
166 |     @app.register_plugin('statistics', 'anova', 'One Way ANOVA')
167 |     def anova():
168 |         df = app.get_dataframe()
169 | 
170 |         display_results(df)
171 | 


--------------------------------------------------------------------------------
/plugins/autoencoder.py:
--------------------------------------------------------------------------------
  1 | import tkinter as tk
  2 | from tkinter import scrolledtext, messagebox, filedialog, Label, Entry, Button
  3 | import torch
  4 | from torch import nn
  5 | from torch.utils.data import DataLoader, TensorDataset
  6 | import pandas as pd
  7 | import numpy as np
  8 | from sklearn.preprocessing import MinMaxScaler
  9 | 
 10 | # Define the Autoencoder model using PyTorch
 11 | class Autoencoder(nn.Module):
 12 |     def __init__(self, input_dim, encoding_dim):
 13 |         super(Autoencoder, self).__init__()
 14 |         self.encoder = nn.Sequential(
 15 |             nn.Linear(input_dim, encoding_dim),
 16 |             nn.ReLU(True)
 17 |         )
 18 |         self.decoder = nn.Sequential(
 19 |             nn.Linear(encoding_dim, input_dim),
 20 |             nn.Sigmoid()  # Assuming data normalization [0,1]
 21 |         )
 22 | 
 23 |     def forward(self, x):
 24 |         x = self.encoder(x)
 25 |         x = self.decoder(x)
 26 |         return x
 27 | 
 28 | def train_autoencoder(model, dataloader, epochs, device, output_text):
 29 |     criterion = nn.MSELoss()
 30 |     optimizer = torch.optim.Adam(model.parameters())
 31 |     model.train()
 32 |     for epoch in range(epochs):
 33 |         total_loss = 0
 34 |         for data, target in dataloader:
 35 |             data = data.to(device)
 36 |             target = target.to(device)
 37 |             optimizer.zero_grad()
 38 |             output = model(data)
 39 |             loss = criterion(output, target)
 40 |             loss.backward()
 41 |             optimizer.step()
 42 |             total_loss += loss.item()
 43 |         average_loss = total_loss / len(dataloader)
 44 |         output_text.insert(tk.END, f'Epoch {epoch+1}, Loss: {average_loss:.4f}\n')
 45 |     output_text.insert(tk.END, "Training complete!\n")
 46 | 
 47 | def save_model(model, output_text):
 48 |     if model is None:
 49 |         messagebox.showerror("Error", "No model to save.")
 50 |         return
 51 |     save_path = filedialog.asksaveasfilename(filetypes=[("PyTorch Model", "*.pth")], defaultextension=".pth")
 52 |     if save_path:
 53 |         torch.save(model.state_dict(), save_path)
 54 |         output_text.insert(tk.END, f"Model saved to {save_path}\n")
 55 | 
 56 | def load_model(input_dim, encoding_dim, device, output_text):
 57 |     model = Autoencoder(input_dim, encoding_dim).to(device)
 58 |     load_path = filedialog.askopenfilename(filetypes=[("PyTorch Model", "*.pth")])
 59 |     if load_path:
 60 |         model.load_state_dict(torch.load(load_path))
 61 |         model.eval()
 62 |         output_text.insert(tk.END, "Model loaded successfully.\n")
 63 |         return model
 64 |     return None
 65 | 
 66 | # GUI for controlling the autoencoder
 67 | def run_gui(dataframe=None):
 68 |     root = tk.Tk()
 69 |     root.title("Autoencoder Configuration")
 70 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 71 |     model = None
 72 | 
 73 |     def on_train():
 74 |         nonlocal model
 75 |         input_dim = int(input_dim_entry.get())
 76 |         encoding_dim = int(encoding_dim_entry.get())
 77 |         epochs = int(epoch_entry.get())
 78 |         batch_size = int(batch_size_entry.get())
 79 |         model = Autoencoder(input_dim, encoding_dim).to(device)
 80 | 
 81 |         if dataframe is not None:
 82 |             scaler = MinMaxScaler()
 83 |             scaled_data = scaler.fit_transform(dataframe.values)
 84 |             # Assume scaled_data is already noisy and the original data is not accessible
 85 |             data_tensor = torch.tensor(scaled_data, dtype=torch.float32)
 86 |             # Assuming no clean target available, use noisy data as target for unsupervised learning
 87 |             dataloader = DataLoader(TensorDataset(data_tensor, data_tensor), batch_size=batch_size, shuffle=True)
 88 |             train_autoencoder(model, dataloader, epochs, device, output_text)
 89 |         else:
 90 |             messagebox.showerror("Error", "No data loaded for training.")
 91 | 
 92 | 
 93 |     def on_load_model():
 94 |         nonlocal model
 95 |         input_dim = int(input_dim_entry.get())
 96 |         encoding_dim = int(encoding_dim_entry.get())
 97 |         model = load_model(input_dim, encoding_dim, device, output_text)
 98 | 
 99 |     def on_predict():
100 |         nonlocal model
101 |         if model is None:
102 |             messagebox.showerror("Error", "Model not trained or initialized.")
103 |             return
104 |         try:
105 |             scaler = MinMaxScaler()
106 |             data_scaled = scaler.fit_transform(dataframe.values)
107 |             noisy_data = data_scaled + 0.1 * np.random.normal(size=data_scaled.shape)
108 |             noisy_data = np.clip(noisy_data, 0, 1)
109 |             input_tensor = torch.tensor(noisy_data, dtype=torch.float32).to(device)
110 |             model.eval()
111 |             with torch.no_grad():
112 |                 predicted = model(input_tensor)
113 |             clean_predicted = scaler.inverse_transform(predicted.cpu().numpy())
114 |             output_text.insert(tk.END, "Denoised data ready. Check Aurora's directory.\n")
115 |             np.savetxt("denoised_data.csv", clean_predicted, delimiter=",")
116 |         except Exception as e:
117 |             messagebox.showerror("Error", str(e))
118 | 
119 |     # Layout configuration
120 |     frame = tk.Frame(root)
121 |     frame.pack(fill=tk.BOTH, expand=True)
122 | 
123 |     Label(frame, text="Input Dimension:").pack()
124 |     input_dim_entry = Entry(frame)
125 |     input_dim_entry.pack()
126 | 
127 |     Label(frame, text="Encoding Dimension:").pack()
128 |     encoding_dim_entry = Entry(frame)
129 |     encoding_dim_entry.pack()
130 | 
131 |     Label(frame, text="Epochs:").pack()
132 |     epoch_entry = Entry(frame)
133 |     epoch_entry.pack()
134 | 
135 |     Label(frame, text="Batch Size:").pack()
136 |     batch_size_entry = Entry(frame)
137 |     batch_size_entry.pack()
138 | 
139 |     Button(frame, text="Train Model", command=on_train).pack()
140 |     Button(frame, text="Load Model", command=on_load_model).pack()
141 | 
142 |     Button(frame, text="Predict and Save Clean Data", command=on_predict).pack()
143 |     Button(frame, text="Save Model", command=lambda: save_model(model, output_text)).pack()
144 | 
145 |     output_text = scrolledtext.ScrolledText(frame, height=10)
146 |     output_text.pack(fill=tk.BOTH, expand=True)
147 | 
148 |     root.mainloop()
149 | 
150 | 
151 | def register(app):
152 |     @app.register_plugin('machine_learning', 'ae', 'Denoising Autoencoder')
153 |     def ae():
154 |         dataae = app.get_dataframe()
155 |         run_gui(dataae)
156 |     
157 | 


--------------------------------------------------------------------------------
/plugins/cca.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | #####################################################
 4 | #### Package: Aurora
 5 | #### Plugin: Canonical Correlation Analysis
 6 | #### Version: 0.1
 7 | #### Author: Marius Neagoe
 8 | #### Copyright: © 2024 Marius Neagoe
 9 | #### Website: https://mariusneagoe.com
10 | #### Github: https://github.com/MariusNea/Aurora
11 | #####################################################
12 | 
13 | import pandas as pd
14 | import numpy as np
15 | from scipy.linalg import eigh
16 | import matplotlib.pyplot as plt
17 | 
18 | def fill_na_with_mean(df):
19 |     """Fill NaN values with the mean of their respective columns."""
20 |     return df.fillna(df.mean())
21 | 
22 | def standardize_data(df):
23 |     """Standardize DataFrame to have zero mean and unit variance."""
24 |     df_filled = fill_na_with_mean(df)
25 |     return (df_filled - df_filled.mean()) / df_filled.std()
26 | 
27 | def split_dataframe(df):
28 |     """Split DataFrame into two equal halves."""
29 |     mid_point = df.shape[1] // 2
30 |     X = df.iloc[:, :mid_point]
31 |     Y = df.iloc[:, mid_point:]
32 |     return X, Y
33 | 
34 | def canonical_correlation_analysis(df):
35 |     """Perform Canonical Correlation Analysis on a DataFrame."""
36 |     X, Y = split_dataframe(df)
37 |     X_std = standardize_data(X)
38 |     Y_std = standardize_data(Y)
39 |     
40 |     S_xx = np.cov(X_std.T, bias=True)
41 |     S_yy = np.cov(Y_std.T, bias=True)
42 |     S_xy = np.cov(X_std.T, Y_std.T, bias=True)[:X_std.shape[1], X_std.shape[1]:]
43 |     S_yx = S_xy.T
44 | 
45 |     # Ensure matrices are at least two-dimensional
46 |     S_xx = np.atleast_2d(S_xx)
47 |     S_yy = np.atleast_2d(S_yy)
48 |     S_xy = np.atleast_2d(S_xy)
49 |     S_yx = np.atleast_2d(S_yx)
50 |     
51 |     # Solve the generalized eigenvalue problem
52 |     eigvals, eigvecs_x = eigh(S_xy @ np.linalg.inv(S_yy) @ S_yx, S_xx)
53 |     eigvals = np.sqrt(np.maximum(eigvals, 0))  # Ensure non-negative eigenvalues
54 |     
55 |     idx = np.argsort(-eigvals)
56 |     canonical_correlations = eigvals[idx]
57 |     canonical_weights_x = eigvecs_x[:, idx]
58 |     
59 |     U = X_std @ canonical_weights_x
60 |     V = Y_std @ (np.linalg.inv(S_yy) @ S_yx @ canonical_weights_x)
61 |     
62 |     return canonical_correlations, U, V
63 |     
64 | def plot_first_pair_canonical_variables(U, V):
65 |     """
66 |     Plot the first canonical variables from U and V against each other.
67 |     U and V are the matrices of canonical variables, where each column is a canonical variable.
68 |     This function focuses on the first pair, illustrating their relationship.
69 |     """
70 |     plt.figure(figsize=(8, 6))
71 |     plt.scatter(U, V, edgecolor='k', alpha=0.7, label='Canonical Variable Pair')
72 |     plt.title('Scatter Plot of the First Pair of Canonical Variables')
73 |     plt.xlabel('First Canonical Variable from U')
74 |     plt.ylabel('First Canonical Variable from V')
75 |     plt.legend()
76 |     plt.grid(True)
77 |     plt.show()
78 | 
79 | def register(app):
80 |     @app.register_plugin('statistics', 'cca', 'Canonical Correlation Analysis')
81 |     def cca():
82 |         data_cor = app.get_dataframe()
83 | 		# You can add your code here
84 |         canonical_correlations, U, V = canonical_correlation_analysis(data_cor)
85 |         print("Canonical Correlations:", canonical_correlations)
86 |         print(U)
87 |         print(V)
88 |         plot_first_pair_canonical_variables(U, V)


--------------------------------------------------------------------------------
/plugins/esm.py:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #####################################################
  4 | #### Package: Aurora
  5 | #### Plugin: Exponential Smoothing Model
  6 | #### Version: 0.1
  7 | #### Author: Marius Neagoe
  8 | #### Copyright: © 2024 Marius Neagoe
  9 | #### Website: https://mariusneagoe.com
 10 | #### Github: https://github.com/MariusNea/Aurora
 11 | #####################################################
 12 | 
 13 | import tkinter as tk
 14 | from tkinter import ttk
 15 | from tkinter import messagebox
 16 | import pandas as pd
 17 | import numpy as np
 18 | import matplotlib.pyplot as plt
 19 | from statsmodels.tsa.holtwinters import ExponentialSmoothing
 20 | 
 21 | def esf(df, column_name, period, trend, seasonal):
 22 |     """
 23 |     Applies Exponential Smoothing on a DataFrame's specified time series data column and plots the original data and forecast.
 24 |     
 25 |     :param df: DataFrame containing the time series data.
 26 |     :param column_name: Name of the column containing the time series data.
 27 |     :param period: The seasonal period.
 28 |     :param trend: The type of trend component ('additive', 'multiplicative', or None).
 29 |     :param seasonal: The type of seasonal component ('additive', 'multiplicative', or None).
 30 |     """
 31 |     # Validate column name
 32 |     if column_name not in df.columns:
 33 |         messagebox.showerror("Error", f"Column '{column_name}' not found in DataFrame.")
 34 |         return
 35 | 
 36 |     # Convert period to integer
 37 |     try:
 38 |         period = int(period)
 39 |     except Exception as e:
 40 |         # Display an error message box with the description of the exception
 41 |         messagebox.showerror("Error", f"An error occurred: {e}")
 42 |     
 43 |     # Convert 'None' strings to NoneType
 44 |     trend = None if trend == 'None' else trend
 45 |     seasonal = None if seasonal == 'None' else seasonal
 46 |     try:
 47 |     # Fit the model
 48 |         model = ExponentialSmoothing(df[column_name], trend=trend, seasonal=seasonal, seasonal_periods=period)
 49 |         model_fit = model.fit()
 50 |     except Exception as e:
 51 |         # Display an error message box with the description of the exception
 52 |         messagebox.showerror("Error", f"An error occurred: {e}")
 53 |         
 54 |     # Forecast
 55 |     forecast = model_fit.fittedvalues
 56 |     
 57 |     # Plot the original data and the forecast
 58 |     plt.figure(figsize=(10, 6))
 59 |     plt.plot(df.index, df[column_name], label='Original')
 60 |     plt.plot(df.index, forecast, label='Forecast', alpha=0.7)
 61 |     plt.title('Time Series Forecast')
 62 |     plt.xlabel('Time')
 63 |     plt.ylabel('Values')
 64 |     plt.legend()
 65 |     plt.show()
 66 | 
 67 | 
 68 | def register(app):
 69 |     @app.register_plugin('statistics', 'esm', 'Exponential Smoothing Model')
 70 |     def esm():
 71 |         data = app.get_dataframe()
 72 | 		# Create the main window
 73 |         root = tk.Tk()
 74 |         root.title("Exponential Smoothing Parameters")
 75 | 
 76 |         # Column Name Entry
 77 |         tk.Label(root, text="Column Name:").grid(row=0, column=0, padx=10, pady=10, sticky='w')
 78 |         column_entry = tk.Entry(root)
 79 |         column_entry.grid(row=0, column=1, padx=10, pady=10, sticky='ew')
 80 | 
 81 | 
 82 |         # Period Entry
 83 |         tk.Label(root, text="Period:").grid(row=1, column=0, padx=10, pady=10, sticky='w')
 84 |         period_entry = tk.Entry(root)
 85 |         period_entry.grid(row=1, column=1, padx=10, pady=10, sticky='ew')
 86 | 
 87 |         # Trend ComboBox
 88 |         tk.Label(root, text="Trend:").grid(row=2, column=0, padx=10, pady=10, sticky='w')
 89 |         trend_options = ["additive", "multiplicative", "None"]
 90 |         trend_combobox = ttk.Combobox(root, values=trend_options, state="readonly")
 91 |         trend_combobox.grid(row=2, column=1, padx=10, pady=10, sticky='ew')
 92 |         trend_combobox.set("None")
 93 | 
 94 |         # Seasonal ComboBox
 95 |         tk.Label(root, text="Seasonal:").grid(row=3, column=0, padx=10, pady=10, sticky='w')
 96 |         seasonal_options = ["additive", "multiplicative", "None"]
 97 |         seasonal_combobox = ttk.Combobox(root, values=seasonal_options, state="readonly")
 98 |         seasonal_combobox.grid(row=3, column=1, padx=10, pady=10, sticky='ew')
 99 |         seasonal_combobox.set("None")
100 | 
101 |         # Submit Button
102 |         submit_button = tk.Button(root, text="Submit", command=lambda: esf(data, column_entry.get(), period_entry.get(), trend_combobox.get(), seasonal_combobox.get()))
103 | 
104 |         submit_button.grid(row=4, column=0, columnspan=2, pady=10)
105 | 
106 |         # Set the grid expansion properties
107 |         root.grid_columnconfigure(1, weight=1)
108 |         root.grid_rowconfigure(4, weight=1)
109 | 
110 |         root.mainloop()
111 |         


--------------------------------------------------------------------------------
/plugins/example_plugin_a.py:
--------------------------------------------------------------------------------
 1 | # plugins/example_plugin_a.py
 2 | # The MyApp class now supports creating two separate menu categories: "Statistics" and "Machine Learning".
 3 | # The register_plugin method requires a category argument to determine under which menu the plugin should be registered.
 4 | # Plugins need to specify their category ('statistics' or 'machine_learning') when using the register_plugin decorator.
 5 | # The code can use "app.get_dataframe()" method to access the main dataframe from AURORA.
 6 | 
 7 | 
 8 | #####################################################
 9 | #### Package: Aurora
10 | #### Plugin: Test plugin
11 | #### Version: 0.1
12 | #### Author: Marius Neagoe
13 | #### Copyright: © 2024 Marius Neagoe
14 | #### Website: https://mariusneagoe.com
15 | #### Github: https://github.com/MariusNea/Aurora
16 | #####################################################
17 | 
18 | 
19 | def register(app):
20 |     @app.register_plugin('category', 'stats_test', 'Perform Stats Test')
21 |     def stats_test():
22 |        
23 | 		# You can add your code here
24 |         print("Running a statistics or machine learning test...")
25 | 
26 | # category - replace this with 'statistics' or 'machine_learning'	
27 | # stats_test - is function's name.
28 | # "Perform Stats Test" - is the text that will apear in AURORA's GUI
29 | 
30 | 


--------------------------------------------------------------------------------
/plugins/histogram.py:
--------------------------------------------------------------------------------
 1 | #####################################################
 2 | #### Package: Aurora
 3 | #### Plugin: Histogram
 4 | #### Version: 0.1
 5 | #### Author: Marius Neagoe
 6 | #### Copyright: © 2024 Marius Neagoe
 7 | #### Website: https://mariusneagoe.com
 8 | #### Github: https://github.com/MariusNea/Aurora
 9 | #####################################################
10 | 
11 | 
12 | import pandas as pd
13 | import matplotlib.pyplot as plt
14 | 
15 | def plot_histogram(df):
16 |     """
17 |     Plots histograms for each numerical column in the given DataFrame.
18 | 
19 |     Parameters:
20 |     df (pandas.DataFrame): The DataFrame containing the data to plot histograms for.
21 |     """
22 |     # Check if the input is a DataFrame
23 |     if not isinstance(df, pd.DataFrame):
24 |         raise ValueError("The input must be a pandas DataFrame.")
25 | 
26 |     # Get the numerical columns from the DataFrame
27 |     numerical_columns = df.select_dtypes(include='number').columns
28 | 
29 |     # Plot histograms for each numerical column
30 |     for column in numerical_columns:
31 |         plt.figure(figsize=(10, 6))
32 |         plt.hist(df[column], bins=30, edgecolor='black')
33 |         plt.title(f'Histogram of {column}')
34 |         plt.xlabel(column)
35 |         plt.ylabel('Frequency')
36 |         plt.grid(True)
37 |         plt.show()
38 | 
39 | # Example usage
40 | def register(app):
41 |     @app.register_plugin('statistics', 'histogram', 'Histogram')
42 |     def histogram():
43 |         histogram_data = app.get_dataframe()
44 | 
45 |         plot_histogram(histogram_data)
46 | 


--------------------------------------------------------------------------------
/plugins/kmeans.py:
--------------------------------------------------------------------------------
  1 | #####################################################
  2 | #### Package: Aurora
  3 | #### Plugin: K-Means
  4 | #### Version: 0.1
  5 | #### Author: Marius Neagoe
  6 | #### Copyright: © 2024 Marius Neagoe
  7 | #### Website: https://mariusneagoe.com
  8 | #### Github: https://github.com/MariusNea/Aurora
  9 | #####################################################
 10 | 
 11 | import tkinter as tk
 12 | from tkinter import ttk, scrolledtext
 13 | from sklearn.cluster import KMeans
 14 | from sklearn.preprocessing import StandardScaler
 15 | import numpy as np
 16 | import pandas as pd
 17 | 
 18 | # Function to run K-Means clustering
 19 | def run_kmeans(data, n_clusters, init_method, max_iter):
 20 |     print(f"Running KMeans with n_clusters={n_clusters}, init_method='{init_method}', max_iter={max_iter}")
 21 |     # Configure and run the KMeans algorithm
 22 |     kmeans = KMeans(
 23 |         n_clusters=int(n_clusters),
 24 |         init=init_method,
 25 |         max_iter=int(max_iter),
 26 |         algorithm='lloyd',
 27 |         random_state=42,
 28 |         n_init=10
 29 |     )
 30 |     kmeans.fit(data)
 31 |     return kmeans
 32 | 
 33 | # GUI creation function
 34 | def create_gui(data):
 35 |     # Root window
 36 |     root = tk.Tk()
 37 |     root.title("K-Means Clustering")
 38 | 
 39 |     # Entry for Number of Clusters
 40 |     tk.Label(root, text="Number of Clusters:").grid(row=0, column=0)
 41 |     n_clusters_entry = tk.Entry(root)
 42 |     n_clusters_entry.grid(row=0, column=1)
 43 | 
 44 |     # Dropdown for Initialization Methods
 45 |     tk.Label(root, text="Initialization Method:").grid(row=1, column=0)
 46 |     init_method_var = tk.StringVar(root)
 47 |     init_method_dropdown = ttk.Combobox(root, textvariable=init_method_var, state="readonly")
 48 |     init_method_dropdown['values'] = ('k-means++', 'random')
 49 |     init_method_dropdown.grid(row=1, column=1)
 50 |     init_method_dropdown.current(0)
 51 | 
 52 |     # Entry for Maximum Number of Iterations
 53 |     tk.Label(root, text="Max Iterations:").grid(row=2, column=0)
 54 |     max_iter_entry = tk.Entry(root)
 55 |     max_iter_entry.grid(row=2, column=1)
 56 | 
 57 |     # Scrolled Text Area for Output
 58 |     output_area = scrolledtext.ScrolledText(root, width=40, height=10)
 59 |     output_area.grid(row=5, column=0, columnspan=2, pady=10)
 60 | 
 61 |     # Button to Run K-Means
 62 |     def on_run_clicked():
 63 |         n_clusters = n_clusters_entry.get()
 64 |         init_method = init_method_var.get()
 65 |         max_iter = max_iter_entry.get()
 66 |         print(f"Button clicked with init_method='{init_method}'")
 67 | 
 68 |         if init_method not in ['k-means++', 'random']:
 69 |             output_area.delete('1.0', tk.END)
 70 |             output_area.insert(tk.INSERT, f"Invalid init method: {init_method}. Select 'k-means++' or 'random'.\n")
 71 |             return
 72 | 
 73 |         try:
 74 |             global model  # Declare model as global to use in prediction
 75 |             model = run_kmeans(data, int(n_clusters), init_method, int(max_iter))
 76 |             centers = model.cluster_centers_
 77 |             output = "Cluster Centers:\n{}\n".format(centers)
 78 |             output_area.delete('1.0', tk.END)
 79 |             output_area.insert(tk.INSERT, output)
 80 |         except Exception as e:
 81 |             output_area.delete('1.0', tk.END)
 82 |             output_area.insert(tk.INSERT, "Error: {}\n".format(e))
 83 | 
 84 |     run_button = tk.Button(root, text="Run K-Means", command=on_run_clicked)
 85 |     run_button.grid(row=4, column=0, columnspan=2)
 86 | 
 87 |     # Entry for Prediction Data
 88 |     tk.Label(root, text="Enter Prediction Data (comma-separated):").grid(row=6, column=0)
 89 |     prediction_entry = tk.Entry(root)
 90 |     prediction_entry.grid(row=6, column=1)
 91 | 
 92 |     # Button for Making Predictions
 93 |     def on_predict_clicked():
 94 |         prediction_data = prediction_entry.get()
 95 |         try:
 96 |             data_point = np.array([float(x) for x in prediction_data.split(',')]).reshape(1, -1)
 97 |             cluster = model.predict(data_point)
 98 |             output_area.insert(tk.END, "Predicted Cluster: {}\n".format(cluster[0]))
 99 |         except Exception as e:
100 |             output_area.insert(tk.END, "Error in prediction: {}\n".format(e))
101 | 
102 |     predict_button = tk.Button(root, text="Make Prediction", command=on_predict_clicked)
103 |     predict_button.grid(row=7, column=0, columnspan=2)
104 | 
105 |     # Start the GUI
106 |     root.mainloop()
107 | 
108 | def register(app):
109 |     @app.register_plugin('machine_learning', 'kmeans', 'Unsupervised Learning (K Means)')
110 |     def kmeans():
111 |         dateq = app.get_dataframe()
112 |         # Preprocess data: scaling
113 |         scaler = StandardScaler()
114 |         data_scaled = scaler.fit_transform(dateq)
115 |         data_scaled = pd.DataFrame(data_scaled, columns=dateq.columns)
116 |         # Running the GUI
117 |         create_gui(data_scaled)
118 | 


--------------------------------------------------------------------------------
/plugins/knn.py:
--------------------------------------------------------------------------------
  1 | #####################################################
  2 | #### Package: Aurora
  3 | #### Plugin: K Nearest Neighbors
  4 | #### Version: 0.1
  5 | #### Author: Marius Neagoe
  6 | #### Copyright: © 2024 Marius Neagoe
  7 | #### Website: https://mariusneagoe.com
  8 | #### Github: https://github.com/MariusNea/Aurora
  9 | #####################################################
 10 | 
 11 | 
 12 | import tkinter as tk
 13 | from tkinter import simpledialog, messagebox
 14 | import pandas as pd
 15 | from sklearn.neighbors import KNeighborsClassifier
 16 | 
 17 | class KNNApp:
 18 |     def __init__(self, master, df):
 19 |         self.master = master
 20 |         self.df = df
 21 |         self.model = KNeighborsClassifier(n_neighbors=3)
 22 |         self.features = None
 23 |         self.target = None
 24 |         
 25 |         # Text area for displaying information
 26 |         self.text_area = tk.Text(master, height=10, width=50)
 27 |         self.text_area.pack()
 28 | 
 29 |         # Button to train the KNN
 30 |         self.train_btn = tk.Button(master, text="Train KNN", command=self.open_feature_selection)
 31 |         self.train_btn.pack()
 32 | 
 33 |         # Button to make a prediction
 34 |         self.predict_btn = tk.Button(master, text="Make Prediction", command=self.make_prediction)
 35 |         self.predict_btn.pack()
 36 | 
 37 |     def open_feature_selection(self):
 38 |         # Opens a new window to select features
 39 |         self.feature_window = tk.Toplevel(self.master)
 40 |         self.feature_window.title("Select Features")
 41 |         
 42 |         tk.Label(self.feature_window, text="Enter features separated by commas:").pack()
 43 |         
 44 |         self.feature_entry = tk.Entry(self.feature_window, width=50)
 45 |         self.feature_entry.pack(pady=10)
 46 | 
 47 |         submit_btn = tk.Button(self.feature_window, text="Submit", command=self.train_knn)
 48 |         submit_btn.pack()
 49 | 
 50 |     def train_knn(self):
 51 |         features = self.feature_entry.get().replace(' ', '').split(',')
 52 |         if all(feature in self.df.columns for feature in features):
 53 |             self.features = self.df[features]
 54 |             self.target = self.df.iloc[:, -1]  # Assuming the last column is the target
 55 | 
 56 |             self.model.fit(self.features, self.target)
 57 |             self.text_area.insert(tk.END, "Model trained with features: {}\n".format(", ".join(features)))
 58 |             self.feature_window.destroy()
 59 |         else:
 60 |             messagebox.showerror("Error", "One or more features are invalid")
 61 | 
 62 |     def make_prediction(self):
 63 |         # Opens a new window for predictions
 64 |         if self.features is None:
 65 |             messagebox.showerror("Error", "Model is not trained yet")
 66 |             return
 67 | 
 68 |         self.pred_window = tk.Toplevel(self.master)
 69 |         self.pred_window.title("Make Prediction")
 70 |         self.entries = []
 71 | 
 72 |         for feature in self.features.columns:
 73 |             row = tk.Frame(self.pred_window)
 74 |             lbl = tk.Label(row, width=15, text=feature, anchor='w')
 75 |             ent = tk.Entry(row)
 76 |             row.pack(side=tk.TOP, fill=tk.X, padx=5, pady=5)
 77 |             lbl.pack(side=tk.LEFT)
 78 |             ent.pack(side=tk.RIGHT, expand=tk.YES, fill=tk.X)
 79 |             self.entries.append(ent)
 80 |         
 81 |         submit_btn = tk.Button(self.pred_window, text="Submit", command=self.submit_prediction)
 82 |         submit_btn.pack()
 83 | 
 84 |     def submit_prediction(self):
 85 |         try:
 86 |             input_data = [float(entry.get()) for entry in self.entries]
 87 |             prediction = self.model.predict([input_data])[0]
 88 |             self.text_area.insert(tk.END, f"Prediction data: {input_data}\n")
 89 |             self.text_area.insert(tk.END, f"Belonging class: {prediction}\n")
 90 |         except ValueError:
 91 |             messagebox.showerror("Error", "Please enter valid numbers")
 92 |         finally:
 93 |             self.pred_window.destroy()
 94 | 
 95 | 
 96 | def register(app):
 97 |     @app.register_plugin('machine_learning', 'knn', 'K Nearest Neighbors')
 98 |     def knn():
 99 |         datas = app.get_dataframe()
100 |         root = tk.Tk()
101 |         appl = KNNApp(root, datas)
102 |         root.mainloop()


--------------------------------------------------------------------------------
/plugins/pearson.py:
--------------------------------------------------------------------------------
  1 | #####################################################
  2 | #### Package: Aurora
  3 | #### Plugin: Pearson correlation
  4 | #### Version: 0.1
  5 | #### Author: Marius Neagoe
  6 | #### Copyright: © 2024 Marius Neagoe
  7 | #### Website: https://mariusneagoe.com
  8 | #### Github: https://github.com/MariusNea/Aurora
  9 | #####################################################
 10 | 
 11 | import pandas as pd
 12 | import numpy as np
 13 | import tkinter as tk
 14 | from tkinter import messagebox
 15 | from tkinter import filedialog
 16 | 
 17 | def calculate_pearson_correlation(df, col1, col2):
 18 |     """Calculate the Pearson correlation coefficient between two columns."""
 19 |     if col1 not in df.columns or col2 not in df.columns:
 20 |         raise ValueError(f"One or both columns '{col1}' and '{col2}' are not in the DataFrame.")
 21 |     
 22 |     return df[col1].corr(df[col2])
 23 | 
 24 | def create_tkinter_ui(dataframe):
 25 |     """Create a Tkinter UI for inputting column names and calculating Pearson correlation."""
 26 |     def on_calculate():
 27 |         """Handler for the calculate button."""
 28 |         cols = entry.get().strip()
 29 |         try:
 30 |             if '-' in cols:
 31 |                 parts = cols.split(',')
 32 |                 if len(parts) != 2:
 33 |                     raise ValueError("Incorrect format. Use either 'col1,col2', 'col1,col2-col10', or 'col1-col100,col105-col200'.")
 34 | 
 35 |                 range1 = parts[0].strip()
 36 |                 range2 = parts[1].strip()
 37 | 
 38 |                 if '-' in range1 and '-' in range2:
 39 |                     start_col1, end_col1 = range1.split('-')
 40 |                     start_col1 = start_col1.strip()
 41 |                     end_col1 = end_col1.strip()
 42 | 
 43 |                     start_col2, end_col2 = range2.split('-')
 44 |                     start_col2 = start_col2.strip()
 45 |                     end_col2 = end_col2.strip()
 46 | 
 47 |                     if start_col1 not in dataframe.columns or end_col1 not in dataframe.columns or start_col2 not in dataframe.columns or end_col2 not in dataframe.columns:
 48 |                         raise ValueError("One or more columns are not in the DataFrame.")
 49 |                     
 50 |                     # Get the range of columns for both parts
 51 |                     col_range1 = dataframe.loc[:, start_col1:end_col1].columns
 52 |                     col_range2 = dataframe.loc[:, start_col2:end_col2].columns
 53 | 
 54 |                     results = []
 55 |                     for col1 in col_range1:
 56 |                         for col2 in col_range2:
 57 |                             correlation = calculate_pearson_correlation(dataframe, col1, col2)
 58 |                             results.append((col1, col2, correlation))
 59 |                 
 60 |                     # Convert results to a DataFrame for export
 61 |                     result_df = pd.DataFrame(results, columns=['Column 1', 'Column 2', 'Correlation'])
 62 |                     
 63 |                     # Save to CSV
 64 |                     save_results_to_csv(result_df)
 65 |                 elif '-' in range1 or '-' in range2:
 66 |                     raise ValueError("Invalid range format. Both parts should be ranges if '-' is present in both.")
 67 |                 else:
 68 |                     base_col = range1
 69 |                     other_col = range2
 70 |                     correlation = calculate_pearson_correlation(dataframe, base_col, other_col)
 71 |                     result_df = pd.DataFrame([(base_col, other_col, correlation)], columns=['Base Column', 'Compared Column', 'Correlation'])
 72 |                     
 73 |                     # Show the result in a message box
 74 |                     messagebox.showinfo("Pearson Correlation", f"The correlation between '{base_col}' and '{other_col}' is: {correlation:.4f}")
 75 |                     
 76 |                     # Save to CSV
 77 |                     save_results_to_csv(result_df)
 78 |             else:
 79 |                 col1, col2 = cols.split(',')
 80 |                 col1 = col1.strip()
 81 |                 col2 = col2.strip()
 82 |                 correlation = calculate_pearson_correlation(dataframe, col1, col2)
 83 |                 result_df = pd.DataFrame([(col1, col2, correlation)], columns=['Base Column', 'Compared Column', 'Correlation'])
 84 |                 
 85 |                 # Show the result in a message box
 86 |                 messagebox.showinfo("Pearson Correlation", f"The correlation between '{col1}' and '{col2}' is: {correlation:.4f}")
 87 |                 
 88 |                 # Save to CSV
 89 |                 save_results_to_csv(result_df)
 90 |         except ValueError as ve:
 91 |             messagebox.showerror("Input Error", str(ve))
 92 |         except Exception as e:
 93 |             messagebox.showerror("Error", str(e))
 94 | 
 95 |     def save_results_to_csv(result_df):
 96 |         """Save the correlation results to a CSV file."""
 97 |         file_path = filedialog.asksaveasfilename(defaultextension=".csv", filetypes=[("CSV files", "*.csv")])
 98 |         if file_path:
 99 |             result_df.to_csv(file_path, index=False)
100 |             messagebox.showinfo("Save Successful", f"Results saved to {file_path}")
101 | 
102 |     # Tkinter setup
103 |     root = tk.Tk()
104 |     root.title("Pearson Correlation Calculator")
105 | 
106 |     label = tk.Label(root, text="Enter column names (e.g., Col1,Col2 or Col1,Col2-Col10 or Col1-Col100,Col105-Col200):")
107 |     label.pack()
108 | 
109 |     global entry  # Declare entry as global to access it within on_calculate
110 |     entry = tk.Entry(root, width=50)
111 |     entry.pack()
112 | 
113 |     button = tk.Button(root, text="Calculate Correlation", command=on_calculate)
114 |     button.pack()
115 | 
116 |     root.mainloop()
117 | 
118 | def register(app):
119 |     @app.register_plugin('statistics', 'pearson', 'Pearson Correlation')
120 |     def pearson():      
121 |         datafr = app.get_dataframe()
122 |         create_tkinter_ui(datafr)
123 | 


--------------------------------------------------------------------------------
/plugins/poisson_probabilities.py:
--------------------------------------------------------------------------------
 1 | #plugins/poisson_probabilities.py
 2 | import tkinter as tk
 3 | from tkinter import simpledialog, messagebox
 4 | import pandas as pd
 5 | import math
 6 | 
 7 | #####################################################
 8 | #### Package: Aurora
 9 | #### Plugin: Poisson Probabilities
10 | #### Version: 0.1
11 | #### Author: Marius Neagoe
12 | #### Copyright: © 2024 Marius Neagoe
13 | #### Website: https://mariusneagoe.com
14 | #### Github: https://github.com/MariusNea/Aurora
15 | #####################################################
16 | 
17 | ## The dataframemust contain only one column which represents the number of events on a given period of time.
18 | ## Plugin outputh takes one argument, the numer of events for that period of time.
19 | ## Outputs 3 probabilities: the exact probability for exact that number of events to take place in the next period of time,
20 | ## the probability that < x number of events to take place in the next period of time,
21 | ## the probability that > x number of events to take place in the next period of time.
22 | 
23 | def poisson_probability(x, mu):
24 |     return (math.exp(-mu) * (mu ** x)) / math.factorial(x)
25 | 
26 | def cumulative_poisson_probability(x, mu, cumulative=False):
27 |     if cumulative:
28 |         return sum(poisson_probability(i, mu) for i in range(x + 1))
29 |     else:
30 |         return poisson_probability(x, mu)
31 | 
32 | def real_world_poisson(mu, parameter, calculation_type='exact', **kwargs):
33 |     if calculation_type == 'exact':
34 |         return poisson_probability(parameter, mu)
35 |     elif calculation_type == 'cumulative':
36 |         return cumulative_poisson_probability(parameter, mu, cumulative=True)
37 |     elif calculation_type == 'greater_than':
38 |         return 1 - cumulative_poisson_probability(parameter, mu, cumulative=True)
39 |     else:
40 |         raise ValueError("Unsupported calculation type")
41 | 
42 | def calculate_and_display_results():
43 |     # Calculate the average number of events per interval from the DataFrame
44 |     mu = df[df.columns[0]].mean()
45 |     
46 |     x = simpledialog.askstring("Input", "Enter the specific number of events (x):")
47 |     if x is not None:
48 |         try:
49 |             x = int(x)
50 |         except ValueError:
51 |             messagebox.showerror("Error", "Please enter a valid integer.")
52 |             return
53 |         
54 |         # Calculate probabilities
55 |         exact_probability = real_world_poisson(mu, x, 'exact')
56 |         cumulative_probability = real_world_poisson(mu, x, 'cumulative')
57 |         greater_than_probability = real_world_poisson(mu, x, 'greater_than')
58 |         
59 |         # Display results in a messagebox
60 |         result_message = f"Exact Probability (x={x}): {exact_probability*100:.4f} %\n" \
61 |                          f"Cumulative Probability (<=x): {cumulative_probability*100:.4f} %\n" \
62 |                          f"Greater Than Probability (>x): {greater_than_probability*100:.4f} %"
63 |         messagebox.showinfo("Probability Results", result_message)
64 | 
65 | def register(app):
66 |     @app.register_plugin('statistics','poisson', 'Poisson Probabilities')
67 |     def poisson():
68 |         global df
69 |         df = app.get_dataframe()
70 |     # Check if the number of columns is even
71 |         if len(df.columns) > 1:
72 |             error_message = "Error: The number of columns in the dataframe must be 1. Each row represents the number of events on a period of time."
73 |             messagebox.showerror("Error", error_message)
74 |             return
75 |         root = tk.Tk()
76 |         root.withdraw()  # Hide the main window
77 |         calculate_and_display_results()
78 | 
79 | 


--------------------------------------------------------------------------------
/plugins/svm.py:
--------------------------------------------------------------------------------
  1 | #####################################################
  2 | #### Package: Aurora
  3 | #### Plugin: Support Vector Machines Classifier
  4 | #### Version: 0.1
  5 | #### Author: Marius Neagoe
  6 | #### Copyright: © 2024 Marius Neagoe
  7 | #### Website: https://mariusneagoe.com
  8 | #### Github: https://github.com/MariusNea/Aurora
  9 | #####################################################
 10 | 
 11 | import pandas as pd
 12 | import numpy as np
 13 | import tkinter as tk
 14 | from tkinter import ttk, simpledialog, messagebox, scrolledtext
 15 | from sklearn.model_selection import train_test_split
 16 | from sklearn.preprocessing import StandardScaler
 17 | from sklearn.svm import SVC
 18 | from sklearn.metrics import classification_report, accuracy_score
 19 | import threading
 20 | 
 21 | 
 22 | def preprocess_data(df, features, target):
 23 |     X = df[features]
 24 |     y = df[target]
 25 |     scaler = StandardScaler()
 26 |     X_scaled = scaler.fit_transform(X)
 27 |     return X_scaled, y, scaler
 28 | 
 29 | def split_data(X, y, test_size=0.2, random_state=42):
 30 |     return train_test_split(X, y, test_size=test_size, random_state=random_state)
 31 | 
 32 | def train_svm(X_train, y_train, kernel='rbf', C=1.0):
 33 |     model = SVC(kernel=kernel, C=C)
 34 |     model.fit(X_train, y_train)
 35 |     return model
 36 | 
 37 | def evaluate_model(model, X_test, y_test):
 38 |     predictions = model.predict(X_test)
 39 |     accuracy = accuracy_score(y_test, predictions)
 40 |     report = classification_report(y_test, predictions, zero_division=0)
 41 |     return accuracy, report
 42 | 
 43 | def create_gui(df):
 44 |     window = tk.Tk()
 45 |     window.title("SVM Classifier")
 46 | 
 47 |     # Determine initial values for features and target from DataFrame
 48 |     initial_features = ", ".join(df.columns[:-1])  # All columns except the last one
 49 |     initial_target = df.columns[-1]  # Last column
 50 | 
 51 |     kernel_var = tk.StringVar(window)
 52 |     kernel_options = ['linear', 'poly', 'rbf', 'sigmoid']
 53 |     kernel_var.set(kernel_options[2])  # default to RBF
 54 | 
 55 |     ttk.Label(window, text="Select Kernel:").pack(pady=5)
 56 |     kernel_dropdown = ttk.OptionMenu(window, kernel_var, kernel_options[2], *kernel_options)
 57 |     kernel_dropdown.pack(pady=5)
 58 | 
 59 |     results_text = scrolledtext.ScrolledText(window, width=60, height=10)
 60 |     results_text.pack(pady=10)
 61 | 
 62 |     global model, scaler
 63 |     model = None
 64 |     scaler = None
 65 | 
 66 |     def run_svm(features, target, kernel, C):
 67 |         global model, scaler
 68 |         try:
 69 |             features_list = [f.strip() for f in features.split(',')]
 70 |             X_scaled, y, scaler = preprocess_data(df, features_list, target.strip())
 71 |             X_train, X_test, y_train, y_test = split_data(X_scaled, y)
 72 |             model = train_svm(X_train, y_train, kernel, C)
 73 |             accuracy, report = evaluate_model(model, X_test, y_test)
 74 |             results_text.delete('1.0', tk.END)
 75 |             results_text.insert(tk.INSERT, f"Classification Report:\n{report}\n")
 76 |             results_text.insert(tk.INSERT, f"Accuracy: {accuracy:.2f}\n")
 77 |         except Exception as e:
 78 |             messagebox.showerror("Error", f"An error occurred: {e}")
 79 | 
 80 |     def get_input():
 81 |         features = simpledialog.askstring("Input", "Enter feature column names separated by comma:",
 82 |                                           initialvalue=initial_features)
 83 |         target = simpledialog.askstring("Input", "Enter target column name:", initialvalue=initial_target)
 84 |         kernel = kernel_var.get()
 85 |         C = float(simpledialog.askstring("Input", "Enter C parameter (e.g., 1.0):"))
 86 |         threading.Thread(target=run_svm, args=(features, target, kernel, C)).start()
 87 | 
 88 |     def make_prediction():
 89 |         if model is not None and scaler is not None:
 90 |             try:
 91 |                 feature_inputs = simpledialog.askstring("Predict", "Enter values for {} separated by commas:".format(", ".join(df.columns[:-1])))
 92 |                 if not feature_inputs:
 93 |                     messagebox.showwarning("Warning", "Input was cancelled or empty. Please provide valid numbers.")
 94 |                     return
 95 |                 feature_values = [float(v.strip()) for v in feature_inputs.split(',')]
 96 |                 if len(feature_values) != len(df.columns[:-1]):
 97 |                     messagebox.showerror("Error", "The number of input values must match the number of features.")
 98 |                     return
 99 |                 data = pd.DataFrame([feature_values], columns=df.columns[:-1])
100 |                 scaled_data = scaler.transform(data)
101 |                 prediction = model.predict(scaled_data)
102 |                 messagebox.showinfo("Prediction Result", f"The predicted class is: {prediction[0]}")
103 |             except ValueError:
104 |                 messagebox.showerror("Error", "Invalid input. Please enter valid numbers.")
105 |             except Exception as e:
106 |                 messagebox.showerror("Error", f"An unexpected error occurred: {e}")
107 |         else:
108 |             messagebox.showerror("Error", "Model is not trained yet. Please train the model first.")
109 | 
110 |     btn_run = tk.Button(window, text="Train SVM", command=get_input)
111 |     btn_run.pack(pady=10)
112 | 
113 |     btn_predict = tk.Button(window, text="Make Prediction", command=make_prediction)
114 |     btn_predict.pack(pady=10)
115 | 
116 |     window.mainloop()
117 | 
118 | def register(app):
119 |     @app.register_plugin('machine_learning', 'svm', 'Support Vector Machines')
120 |     def svm():
121 |         data = app.get_dataframe()
122 |         create_gui(data)
123 | 


--------------------------------------------------------------------------------
/plugins/text_classifier.py:
--------------------------------------------------------------------------------
  1 | #####################################################
  2 | #### Package: Aurora
  3 | #### Plugin: Text Classifier
  4 | #### Version: 0.1
  5 | #### Author: Marius Neagoe
  6 | #### Copyright: © 2024 Marius Neagoe
  7 | #### Website: https://mariusneagoe.com
  8 | #### Github: https://github.com/MariusNea/Aurora
  9 | #####################################################
 10 | 
 11 | import tkinter as tk
 12 | from tkinter import simpledialog, messagebox
 13 | from tkinter.scrolledtext import ScrolledText  # Corrected import
 14 | from sklearn.model_selection import train_test_split
 15 | from sklearn.feature_extraction.text import CountVectorizer
 16 | from sklearn.naive_bayes import MultinomialNB
 17 | from sklearn.pipeline import make_pipeline
 18 | from sklearn.metrics import classification_report
 19 | import pandas as pd
 20 | 
 21 | class TextClassifierPlugin:
 22 |     def __init__(self, dataframe):
 23 |         self.dataframe = dataframe
 24 |         self.parameters = {}
 25 |         self.status_text = None
 26 |         self.model = None
 27 | 
 28 |     def get_parameters_window(self):
 29 |         def submit():
 30 |             try:
 31 |                 self.parameters['test_size'] = float(test_size_entry.get())
 32 |                 self.parameters['random_state'] = int(random_state_entry.get())
 33 |                 self.parameters['text_column'] = text_column_entry.get()
 34 |                 self.parameters['label_column'] = label_column_entry.get()
 35 |                 param_window.destroy()
 36 |             except ValueError:
 37 |                 self.show_status("Invalid input. Please enter valid numbers for test size and random state, and column names.")
 38 | 
 39 |         param_window = tk.Toplevel(root)
 40 |         param_window.title("Set Parameters")
 41 | 
 42 |         tk.Label(param_window, text="Test Size (0-1):").grid(row=0, column=0)
 43 |         tk.Label(param_window, text="Random State:").grid(row=1, column=0)
 44 |         tk.Label(param_window, text="Text Column Name:").grid(row=2, column=0)
 45 |         tk.Label(param_window, text="Label Column Name:").grid(row=3, column=0)
 46 | 
 47 |         test_size_entry = tk.Entry(param_window)
 48 |         random_state_entry = tk.Entry(param_window)
 49 |         text_column_entry = tk.Entry(param_window)
 50 |         label_column_entry = tk.Entry(param_window)
 51 | 
 52 |         test_size_entry.grid(row=0, column=1)
 53 |         random_state_entry.grid(row=1, column=1)
 54 |         text_column_entry.grid(row=2, column=1)
 55 |         label_column_entry.grid(row=3, column=1)
 56 | 
 57 |         text_column_entry.insert(0, "Text")  # Default column name for text
 58 |         label_column_entry.insert(0, "Label")  # Default column name for labels
 59 | 
 60 |         submit_button = tk.Button(param_window, text="Submit", command=submit)
 61 |         submit_button.grid(row=4, columnspan=2)
 62 | 
 63 |         param_window.transient(root)
 64 |         param_window.grab_set()
 65 |         root.wait_window(param_window)
 66 | 
 67 |     def load_data(self):
 68 |         X = self.dataframe[self.parameters['text_column']]
 69 |         y = self.dataframe[self.parameters['label_column']]
 70 |         return X, y
 71 | 
 72 |     def train_model(self):
 73 |         self.show_status("Loading data...")
 74 |         X, y = self.load_data()
 75 | 
 76 |         self.show_status("Splitting data...")
 77 |         X_train, X_test, y_train, y_test = train_test_split(
 78 |             X, y, test_size=self.parameters['test_size'], random_state=self.parameters['random_state'])
 79 | 
 80 |         self.show_status("Training model...")
 81 |         self.model = make_pipeline(CountVectorizer(), MultinomialNB())
 82 |         self.model.fit(X_train, y_train)
 83 | 
 84 |         self.show_status("Evaluating model...")
 85 |         y_pred = self.model.predict(X_test)
 86 |         report = classification_report(y_test, y_pred)
 87 |         self.show_status("Model trained. \n\n" + report)
 88 | 
 89 |     def make_prediction(self):
 90 |         input_text = self.prediction_entry.get()
 91 |         if self.model is not None:
 92 |             prediction = self.model.predict([input_text])
 93 |             self.show_status(f"Prediction for '{input_text}': {prediction[0]}")
 94 |         else:
 95 |             self.show_status("Model is not trained yet.")
 96 | 
 97 |     def show_status(self, message):
 98 |         if self.status_text:
 99 |             self.status_text.config(state=tk.NORMAL)
100 |             self.status_text.insert(tk.END, message + "\n")
101 |             self.status_text.config(state=tk.DISABLED)
102 | 
103 |     def main(self):
104 |         global root
105 |         root = tk.Tk()
106 |         root.title("Text Classifier")
107 | 
108 |         self.status_text = ScrolledText(root, wrap=tk.WORD, state=tk.DISABLED)
109 |         self.status_text.pack(expand=True, fill='both')
110 | 
111 |         self.get_parameters_window()
112 | 
113 |         start_button = tk.Button(root, text="Start Training", command=self.train_model)
114 |         start_button.pack()
115 | 
116 |         tk.Label(root, text="Enter text for prediction:").pack()
117 |         self.prediction_entry = tk.Entry(root)
118 |         self.prediction_entry.pack()
119 | 
120 |         predict_button = tk.Button(root, text="Make Prediction", command=self.make_prediction)
121 |         predict_button.pack()
122 | 
123 |         root.mainloop()
124 | 
125 | def register(app):
126 |     @app.register_plugin('machine_learning', 'text_classifier', 'Text Classifier')
127 |     def kmeans():
128 |         text = app.get_dataframe()
129 | 
130 |         classifier_plugin = TextClassifierPlugin(text)
131 |         classifier_plugin.main()
132 | 


--------------------------------------------------------------------------------
/plugins/xgboost.py:
--------------------------------------------------------------------------------
  1 | #####################################################
  2 | #### Package: Aurora
  3 | #### Plugin: XGBoost (Regression and Classification)
  4 | #### Version: 0.1
  5 | #### Author: Marius Neagoe
  6 | #### Copyright: © 2024 Marius Neagoe
  7 | #### Website: https://mariusneagoe.com
  8 | #### Github: https://github.com/MariusNea/Aurora
  9 | #####################################################
 10 | 
 11 | import pandas as pd
 12 | import xgboost as xgb
 13 | from sklearn.model_selection import train_test_split
 14 | from sklearn.metrics import mean_squared_error
 15 | import tkinter as tk
 16 | from tkinter import messagebox
 17 | 
 18 | def xgboost_trainer_gui(data: pd.DataFrame):
 19 | 
 20 |     model = None
 21 |     feature_cols = None
 22 |     target_col = None
 23 |     prediction_feature_names = None
 24 | 
 25 |     def train_xgboost_model(data: pd.DataFrame, feature_cols=None, target_col=None, n_estimators=100, learning_rate=0.1, max_depth=5, early_stopping_rounds=10):
 26 |         
 27 |         if feature_cols is None or target_col is None:
 28 |             X = data.iloc[:, :-1]
 29 |             y = data.iloc[:, -1]
 30 |         else:
 31 |             X = data[feature_cols]
 32 |             y = data[target_col]
 33 |         
 34 |         # Split the data into training and validation sets
 35 |         X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
 36 |         
 37 |         # Define the model
 38 |         model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)
 39 | 
 40 |         # Track early stopping manually
 41 |         min_val_error = float("inf")
 42 |         rounds_without_improvement = 0
 43 | 
 44 |         for i in range(n_estimators):
 45 |             model.n_estimators = i + 1  # Incrementally increase n_estimators
 46 |             model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
 47 | 
 48 |             val_predictions = model.predict(X_val)
 49 |             val_error = mean_squared_error(y_val, val_predictions)
 50 | 
 51 |             if val_error < min_val_error:
 52 |                 min_val_error = val_error
 53 |                 rounds_without_improvement = 0
 54 |             else:
 55 |                 rounds_without_improvement += 1
 56 | 
 57 |             if rounds_without_improvement >= early_stopping_rounds:
 58 |                 print(f"Early stopping after {i+1} rounds.")
 59 |                 break
 60 |         
 61 |         # Feature importance
 62 |         feature_importance = pd.DataFrame({
 63 |             'Feature': X.columns,
 64 |             'Importance': model.feature_importances_
 65 |         }).sort_values(by='Importance', ascending=False)
 66 |         
 67 |         return model, feature_importance, X.columns.tolist()
 68 | 
 69 |     def train_model():
 70 |         nonlocal model, feature_cols, target_col, prediction_feature_names
 71 | 
 72 |         if not entry_estimators.get().strip() or not entry_lr.get().strip() or not entry_depth.get().strip() or not entry_early_stopping_rounds.get().strip():
 73 |             messagebox.showwarning("Invalid Input", "All fields must not be empty.")
 74 |             return
 75 | 
 76 |         feature_cols = entry_features.get().strip()
 77 |         target_col = entry_target.get().strip()
 78 | 
 79 |         if not feature_cols:
 80 |             feature_cols = None
 81 |         else:
 82 |             feature_cols = [col.strip() for col in feature_cols.split(",")]
 83 | 
 84 |         if not target_col:
 85 |             target_col = None
 86 | 
 87 |         try:
 88 |             n_estimators = int(entry_estimators.get().strip())
 89 |             learning_rate = float(entry_lr.get().strip())
 90 |             max_depth = int(entry_depth.get().strip())
 91 |             early_stopping_rounds = int(entry_early_stopping_rounds.get().strip())
 92 |         except ValueError:
 93 |             messagebox.showwarning("Invalid Input", "Please enter valid hyperparameters.")
 94 |             return
 95 | 
 96 |         model, feature_importance, feature_names = train_xgboost_model(data, feature_cols, target_col, n_estimators, learning_rate, max_depth, early_stopping_rounds)
 97 |         print("Model trained successfully!")
 98 |         print(feature_importance)
 99 | 
100 |         # Store feature names for prediction
101 |         prediction_feature_names = feature_names
102 | 
103 |         # Enable prediction inputs after training
104 |         for entry in prediction_entries:
105 |             entry.config(state='normal')
106 |         btn_predict.config(state='normal')
107 | 
108 |     def predict():
109 |         if model is None:
110 |             messagebox.showwarning("Model Not Trained", "Please train the model before making predictions.")
111 |             return
112 | 
113 |         try:
114 |             input_data = [float(entry.get().strip()) for entry in prediction_entries]
115 |         except ValueError:
116 |             messagebox.showwarning("Invalid Input", "Please enter valid numbers for predictions.")
117 |             return
118 | 
119 |         input_df = pd.DataFrame([input_data], columns=prediction_feature_names)
120 |         prediction = model.predict(input_df)[0]
121 |         lbl_prediction_result.config(text=f"Predicted Value: {prediction:.2f}")
122 | 
123 |     # Tkinter GUI setup
124 |     root = tk.Tk()
125 |     root.title("XGBoost Model Trainer and Predictor")
126 | 
127 |     tk.Label(root, text="Feature Columns (comma separated):").pack()
128 |     entry_features = tk.Entry(root, width=80)
129 |     entry_features.pack(pady=5)
130 | 
131 |     tk.Label(root, text="Target Column:").pack()
132 |     entry_target = tk.Entry(root, width=80)
133 |     entry_target.pack(pady=5)
134 | 
135 |     tk.Label(root, text="Number of Estimators:").pack()
136 |     entry_estimators = tk.Entry(root, width=20)
137 |     entry_estimators.insert(0, "100")  # Default value
138 |     entry_estimators.pack(pady=5)
139 | 
140 |     tk.Label(root, text="Learning Rate:").pack()
141 |     entry_lr = tk.Entry(root, width=20)
142 |     entry_lr.insert(0, "0.1")  # Default value
143 |     entry_lr.pack(pady=5)
144 | 
145 |     tk.Label(root, text="Max Depth:").pack()
146 |     entry_depth = tk.Entry(root, width=20)
147 |     entry_depth.insert(0, "5")  # Default value
148 |     entry_depth.pack(pady=5)
149 | 
150 |     tk.Label(root, text="Early Stopping Rounds:").pack()
151 |     entry_early_stopping_rounds = tk.Entry(root, width=20)
152 |     entry_early_stopping_rounds.insert(0, "10")  # Default value
153 |     entry_early_stopping_rounds.pack(pady=5)
154 | 
155 |     btn_train_model = tk.Button(root, text="Train Model", command=train_model)
156 |     btn_train_model.pack(pady=20)
157 | 
158 |     # Prediction Section
159 |     tk.Label(root, text="Enter Values for Prediction:").pack(pady=10)
160 |     prediction_entries = []
161 |     for i in range(data.shape[1] - 1):  # Number of features
162 |         entry = tk.Entry(root, width=20, state='disabled')
163 |         entry.pack(pady=2)
164 |         prediction_entries.append(entry)
165 | 
166 |     btn_predict = tk.Button(root, text="Predict", command=predict, state='disabled')
167 |     btn_predict.pack(pady=20)
168 | 
169 |     lbl_prediction_result = tk.Label(root, text="")
170 |     lbl_prediction_result.pack(pady=5)
171 | 
172 |     root.mainloop()
173 | 
174 | 
175 | def register(app):
176 |     @app.register_plugin('machine_learning', 'xgboost', 'XGBoost (Regression and Classification)')
177 |     def xgboost():      
178 |         date = app.get_dataframe()
179 |         xgboost_trainer_gui(date)


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
  1 | altgraph==0.17.4
  2 | annotated-types==0.6.0
  3 | anyio==3.7.0
  4 | appdirs==1.4.4
  5 | argon2-cffi==21.3.0
  6 | argon2-cffi-bindings==21.2.0
  7 | arrow==1.2.3
  8 | astor==0.8.1
  9 | asttokens==2.2.1
 10 | attrs==23.1.0
 11 | autocommand==2.2.2
 12 | backcall==0.2.0
 13 | backports.csv==1.0.7
 14 | beautifulsoup4==4.11.1
 15 | bleach==6.0.0
 16 | certifi==2022.9.24
 17 | cffi==1.15.1
 18 | charset-normalizer==2.1.1
 19 | cheroot==10.0.0
 20 | CherryPy==18.9.0
 21 | click==8.1.7
 22 | cloudpickle==3.0.0
 23 | colorama==0.4.6
 24 | comm==0.1.3
 25 | configparser==6.0.0
 26 | contourpy==1.1.0
 27 | cryptography==41.0.1
 28 | cycler==0.11.0
 29 | Cython==3.0.4
 30 | dask==2023.5.0
 31 | debugpy==1.6.7
 32 | decorator==5.1.1
 33 | defusedxml==0.7.1
 34 | distributed==2023.5.0
 35 | docopt==0.6.2
 36 | entrypoints==0.4
 37 | exceptiongroup==1.1.1
 38 | executing==1.2.0
 39 | extruct==0.14.0
 40 | fastdtw==0.3.4
 41 | fastjsonschema==2.17.1
 42 | feedparser==6.0.11
 43 | fonttools==4.42.1
 44 | fqdn==1.5.1
 45 | frozendict==2.3.8
 46 | fsspec==2024.2.0
 47 | future==1.0.0
 48 | futures==3.0.5
 49 | html-text==0.5.2
 50 | html5lib==1.1
 51 | idna==3.4
 52 | importlib-metadata==6.6.0
 53 | importlib-resources==5.12.0
 54 | inflect==7.0.0
 55 | ipykernel==6.23.2
 56 | ipython==8.12.2
 57 | ipython-genutils==0.2.0
 58 | ipywidgets==8.1.2
 59 | isodate==0.6.1
 60 | isoduration==20.11.0
 61 | jaraco.collections==5.0.0
 62 | jaraco.context==4.3.0
 63 | jaraco.functools==4.0.0
 64 | jaraco.text==3.12.0
 65 | jedi==0.18.2
 66 | Jinja2==3.1.2
 67 | joblib==1.3.2
 68 | jsonpointer==2.4
 69 | jsonschema==4.17.3
 70 | jstyleson==0.0.2
 71 | jupyter-events==0.6.3
 72 | jupyter_client==8.2.0
 73 | jupyter_core==5.3.1
 74 | jupyter_server==2.6.0
 75 | jupyter_server_terminals==0.4.4
 76 | jupyterlab-pygments==0.2.2
 77 | jupyterlab_widgets==3.0.10
 78 | kiwisolver==1.4.5
 79 | locket==1.0.0
 80 | lxml==4.9.1
 81 | MarkupSafe==2.1.3
 82 | matplotlib==3.7.2
 83 | matplotlib-inline==0.1.6
 84 | mechanize==0.4.8
 85 | mf2py==1.1.2
 86 | mistune==2.0.5
 87 | more-itertools==10.2.0
 88 | mplcursors==0.5.3
 89 | msgpack==1.0.7
 90 | multitasking==0.0.11
 91 | mysqlclient==2.2.4
 92 | nbclassic==1.0.0
 93 | nbclient==0.8.0
 94 | nbconvert==7.5.0
 95 | nbformat==5.9.0
 96 | nest-asyncio==1.5.6
 97 | networkx==3.1
 98 | nltk==3.8.1
 99 | notebook==6.5.4
100 | notebook_shim==0.2.3
101 | numpy==1.24.4
102 | overrides==7.3.1
103 | packaging==23.1
104 | pandas==2.0.3
105 | pandasgui==0.2.14
106 | pandocfilters==1.5.0
107 | parso==0.8.3
108 | partd==1.4.1
109 | patsy==0.5.6
110 | Pattern==3.6
111 | pdfminer.six==20231228
112 | pefile==2023.2.7
113 | pickleshare==0.7.5
114 | Pillow==10.0.0
115 | pipreqs==0.4.13
116 | pkgutil_resolve_name==1.3.10
117 | platformdirs==3.5.3
118 | plotly==5.16.1
119 | portend==3.2.0
120 | portpicker==1.6.0
121 | prometheus-client==0.17.0
122 | prompt-toolkit==3.0.38
123 | protobuf==4.25.2
124 | psutil==5.9.5
125 | pure-eval==0.2.2
126 | pyaes==1.6.1
127 | pyarrow==15.0.0
128 | pyasn1==0.5.0
129 | pycparser==2.21
130 | pydantic==2.6.4
131 | pydantic_core==2.16.3
132 | Pygments==2.15.1
133 | pyinstaller==6.3.0
134 | pyinstaller-hooks-contrib==2024.0
135 | pynput==1.7.6
136 | pyparsing==3.0.9
137 | PyQt5==5.15.10
138 | PyQt5-Qt5==5.15.2
139 | PyQt5-sip==12.13.0
140 | PyQtWebEngine==5.15.6
141 | PyQtWebEngine-Qt5==5.15.2
142 | pyRdfa3==3.5.3
143 | pyrsistent==0.19.3
144 | python-dateutil==2.8.2
145 | python-docx==1.1.0
146 | python-json-logger==2.0.7
147 | pytz==2023.3
148 | pywin32==306
149 | pywin32-ctypes==0.2.2
150 | pywinpty==2.0.10
151 | PyYAML==6.0
152 | pyzmq==25.1.0
153 | qtstylish==0.1.5
154 | rdflib==6.2.0
155 | recipe-scrapers==14.23.0
156 | regex==2023.12.25
157 | requests==2.28.1
158 | rfc3339-validator==0.1.4
159 | rfc3986-validator==0.1.1
160 | rsa==4.9
161 | scikit-learn==1.3.0
162 | scipy==1.10.1
163 | seaborn==0.13.2
164 | Send2Trash==1.8.2
165 | sgmllib3k==1.0.0
166 | six==1.16.0
167 | sklearn==0.0.post7
168 | sniffio==1.3.0
169 | sortedcontainers==2.4.0
170 | soupsieve==2.3.2.post1
171 | stack-data==0.6.2
172 | statsmodels==0.14.1
173 | tblib==3.0.0
174 | telegram==0.0.1
175 | Telethon==1.31.1
176 | tempora==5.5.1
177 | tenacity==8.2.3
178 | terminado==0.17.1
179 | threadpoolctl==3.2.0
180 | tinycss2==1.2.1
181 | tk==0.1.0
182 | toolz==0.12.1
183 | tornado==6.3.2
184 | tqdm==4.66.2
185 | traitlets==5.9.0
186 | ttkthemes==3.2.2
187 | typing_extensions==4.6.3
188 | tzdata==2023.3
189 | uri-template==1.2.0
190 | urllib3==1.26.12
191 | vl-convert-python==1.2.3
192 | w3lib==2.0.1
193 | wcwidth==0.2.6
194 | webcolors==1.13
195 | webencodings==0.5.1
196 | websocket-client==1.5.3
197 | widgetsnbextension==4.0.10
198 | wordcloud==1.9.3
199 | yarg==0.1.9
200 | yfinance==0.2.20
201 | zc.lockfile==3.0.post1
202 | zict==3.0.0
203 | zipp==3.15.0
204 | torch==2.3.0
205 | xgboost==2.1.1


--------------------------------------------------------------------------------