├── .github
    └── workflows
    │   └── python-package.yml
├── README.md
├── assets
    └── result.png
├── cluster_ga.egg-info
    ├── PKG-INFO
    ├── SOURCES.txt
    ├── dependency_links.txt
    └── top_level.txt
├── cluster_ga
    ├── __pycache__
    │   ├── cluster.cpython-313.pyc
    │   └── genetic.cpython-313.pyc
    ├── cluster.py
    └── genetic.py
├── dist
    ├── cluster_ga-0.1-py3-none-any.whl
    ├── cluster_ga-0.1.tar.gz
    ├── cluster_ga-0.2-py3-none-any.whl
    └── cluster_ga-0.2.tar.gz
├── requirements.txt
└── setup.py


/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 3 | 
 4 | name: Python package
 5 | 
 6 | on:
 7 |   push:
 8 |     branches: [ "main" ]
 9 |   pull_request:
10 |     branches: [ "main" ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       fail-fast: false
18 |       matrix:
19 |         python-version: ["3.9", "3.10", "3.11"]
20 | 
21 |     steps:
22 |     - uses: actions/checkout@v3
23 |     - name: Set up Python ${{ matrix.python-version }}
24 |       uses: actions/setup-python@v3
25 |       with:
26 |         python-version: ${{ matrix.python-version }}
27 |     - name: Install dependencies
28 |       run: |
29 |         python -m pip install --upgrade pip
30 |         python -m pip install flake8 pytest
31 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
32 |     - name: Lint with flake8
33 |       run: |
34 |         # stop the build if there are Python syntax errors or undefined names
35 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
36 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
37 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
38 |     - name: Test with pytest
39 |       run: |
40 |         pytest
41 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Genetic Clustering Algorithm
 2 | 
 3 | This Python script implements a genetic algorithm for clustering data. The algorithm optimizes the cluster assignments of data points using a genetic approach, aiming to improve the silhouette score. The silhouette score is a measure of how well-defined the clusters are in the data.
 4 | 
 5 | ## Table of Contents
 6 | - [Installation](#installation)
 7 | - [Usage](#usage)
 8 | - [Algorithm Overview](#algorithm-overview)
 9 |   - [Genetic Class](#genetic-class)
10 |   - [Cluster Class](#cluster-class)
11 | - [Parameters](#parameters)
12 | - [Results](#results)
13 | - [License](#license)
14 | - [Acknowledgments](#acknowledgments)
15 | 
16 | 
17 | 
18 | ## Installation
19 | 
20 | 
21 | 
22 | 
23 | ```bash
24 | pip install cluster_ga
25 | ```
26 | 
27 | ## Usage
28 | 
29 | 
30 | 
31 | ```python
32 | from sklearn import datasets
33 | import numpy as np
34 | import pandas as pd
35 | from cluster_ga.cluster import cluster
36 | 
 37 | # this is a test example
38 | 
39 | iris = datasets.load_iris()
40 | iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
41 | x = np.array(iris_df[["petal length (cm)", "petal width (cm)"]])
42 | y = iris.target
43 | 
44 | # Instantiate and fit the model
45 | model = cluster(x, y, 500, 0.9,150) 
46 | model.fit()
47 | 
48 | 
49 | # show fitness plot
50 | model.show_plot()
51 | 
52 | ```
53 | 
54 | ## Algorithm Overview
55 | 
56 | The genetic clustering algorithm consists of the following components:
57 | 
58 | ### Genetic Class
59 | 
60 | Defines the genetic operations such as mutation, generation, and fitness calculation.
61 | 
62 | ### Cluster Class
63 | 
64 | Manages the clustering process, including the initialization of populations, evolution, and convergence.
65 | 
66 | 
67 | 
68 | ## Parameters
69 | 
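 70 | - `X`, `y`: Data points to cluster and their reference labels; only the unique values of `y` are used, as the set of available cluster labels.
 71 | - `size_population`: Number of individuals in the population (default `200`).
 72 | - `goal`: The desired fitness (silhouette) score; the search stops once the best individual reaches it (default `0.9`).
 73 | - `iters`: Maximum number of generations to run the algorithm (default `200`).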
74 | 
75 | ## Results
76 | 
77 | The script outputs the progress of the algorithm, including the generation number and the fitness score achieved. Additionally, a plot of the fitness scores over generations is displayed at the end of the execution.
78 | 
79 | ![result](./assets/result.png)
80 | 
81 | ## License
82 | 
83 | This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details.
84 | 
85 | ## Acknowledgments
86 | 
87 | - This implementation is inspired by genetic algorithms and clustering techniques.
88 | - Special thanks to the scikit-learn library for providing the silhouette score metric.
89 | 


--------------------------------------------------------------------------------
/assets/result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parvvaresh/clustering-with-genetic/1d29b0ae0ea81e54641ab7fdc32ac466448770f3/assets/result.png


--------------------------------------------------------------------------------
/cluster_ga.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
  1 | Metadata-Version: 2.1
  2 | Name: cluster_ga
  3 | Version: 0.2
  4 | Summary: This Python script implements a genetic algorithm for clustering data. The algorithm optimizes the cluster assignments of data points using a genetic approach, aiming to improve the silhouette score. The silhouette score is a measure of how well-defined the clusters are in the data.
  5 | Home-page: https://github.com/parvvaresh/clustering-with-genetic
  6 | Author: Alireza Parvaresh
  7 | Author-email: parvvaresh@gmail.com
  8 | Classifier: Programming Language :: Python :: 3
  9 | Classifier: License :: OSI Approved :: MIT License
 10 | Classifier: Operating System :: OS Independent
 11 | Requires-Python: >=3.6
 12 | Description-Content-Type: text/markdown
 13 | 
 14 | # Genetic Clustering Algorithm
 15 | 
 16 | This Python script implements a genetic algorithm for clustering data. The algorithm optimizes the cluster assignments of data points using a genetic approach, aiming to improve the silhouette score. The silhouette score is a measure of how well-defined the clusters are in the data.
 17 | 
 18 | ## Table of Contents
 19 | - [Getting Started](#getting-started)
 20 |   - [Installation](#installation)
 21 | - [Usage](#usage)
 22 | - [Algorithm Overview](#algorithm-overview)
 23 |   - [Genetic Class](#genetic-class)
 24 |   - [Cluster Class](#cluster-class)
 25 | - [Parameters](#parameters)
 26 | - [Results](#results)
 27 | - [License](#license)
 28 | - [Acknowledgments](#acknowledgments)
 29 | 
 30 | ## Getting Started
 31 | 
 32 | 
 33 | ### Installation
 34 | 
 35 | 
 36 | 
 37 | 2. **Install the required dependencies:**
 38 | 
 39 | ```bash
 40 | pip install cluster_ga
 41 | ```
 42 | 
 43 | ## Usage
 44 | 
 45 | 
 46 | 
 47 | ```python
 48 | from sklearn import datasets
 49 | import numpy as np
 50 | import pandas as pd
 51 | from cluster_ga.cluster import cluster
 52 | 
 53 | # this is a for test
 54 | 
 55 | iris = datasets.load_iris()
 56 | iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
 57 | x = np.array(iris_df[["petal length (cm)", "petal width (cm)"]])
 58 | y = iris.target
 59 | 
 60 | # Instantiate and fit the model
 61 | model = cluster(x, y, 500, 0.9,150) 
 62 | model.fit()
 63 | 
 64 | 
 65 | # show fitness plot
 66 | model.show_plot()
 67 | 
 68 | ```
 69 | 
 70 | ## Algorithm Overview
 71 | 
 72 | The genetic clustering algorithm consists of the following components:
 73 | 
 74 | ### Genetic Class
 75 | 
 76 | Defines the genetic operations such as mutation, generation, and fitness calculation.
 77 | 
 78 | ### Cluster Class
 79 | 
 80 | Manages the clustering process, including the initialization of populations, evolution, and convergence.
 81 | 
 82 | 
 83 | 
 84 | ## Parameters
 85 | 
 86 | - `size_population`: Number of individuals in the population.
 87 | - `goal`: The desired fitness score to achieve.
 88 | - `repeat`: Number of generations to run the algorithm.
 89 | - `is_mutation`: Boolean flag to enable or disable mutation.
 90 | 
 91 | ## Results
 92 | 
 93 | The script outputs the progress of the algorithm, including the generation number and the fitness score achieved. Additionally, a plot of the fitness scores over generations is displayed at the end of the execution.
 94 | 
 95 | ![result](./assets/result.png)
 96 | 
 97 | ## License
 98 | 
 99 | This project is licensed under the MIT License - see the [LICENSE.md](LICENSE.md) file for details.
100 | 
101 | ## Acknowledgments
102 | 
103 | - This implementation is inspired by genetic algorithms and clustering techniques.
104 | - Special thanks to the scikit-learn library for providing the silhouette score metric.
105 | 


--------------------------------------------------------------------------------
/cluster_ga.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
1 | README.md
2 | setup.py
3 | cluster_ga.egg-info/PKG-INFO
4 | cluster_ga.egg-info/SOURCES.txt
5 | cluster_ga.egg-info/dependency_links.txt
6 | cluster_ga.egg-info/top_level.txt


--------------------------------------------------------------------------------
/cluster_ga.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/cluster_ga.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/cluster_ga/__pycache__/cluster.cpython-313.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parvvaresh/clustering-with-genetic/1d29b0ae0ea81e54641ab7fdc32ac466448770f3/cluster_ga/__pycache__/cluster.cpython-313.pyc


--------------------------------------------------------------------------------
/cluster_ga/__pycache__/genetic.cpython-313.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parvvaresh/clustering-with-genetic/1d29b0ae0ea81e54641ab7fdc32ac466448770f3/cluster_ga/__pycache__/genetic.cpython-313.pyc


--------------------------------------------------------------------------------
/cluster_ga/cluster.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import matplotlib.pyplot as plt
 4 | from sklearn import datasets
 5 | import random
 6 | from .genetic import GeneticClustering 
 7 | 
 8 | 
 9 | class cluster:
10 |     def __init__(self, X: np.array, y: np.array, size_population: int = 200, goal: float = 0.9, iters: int = 200) -> None:
11 |         self.X = X
12 |         self.y = y
13 |         self.size_population = size_population
14 |         self.goal = goal
15 |         self.iters = iters
16 | 
17 |         self.n_samples = self.X.shape[0]
18 |         self.genom = list(np.unique(y))
19 | 
20 |         self.population = []
21 |         self.fitness = []
22 | 
23 |     def fit(self) -> None:
24 |         for _ in range(self.size_population):
25 |             chromosome = self._create_random_chromosome()
26 |             self.population.append(GeneticClustering(chromosome, self.X, self.genom))
27 | 
28 |         self.counter = 1
29 | 
30 |         while True:
31 |             # Sort by fitness scores in descending order
32 |             self.population = sorted(self.population, key=lambda chromosome: chromosome.fitness_scores, reverse=True)
33 | 
34 |             if self.counter >= self.iters or (self.goal <= self.population[0].fitness_scores <= 1):
35 |                 break
36 | 
37 |             new_generation = []
38 | 
39 |             # Retain top 10% of the population
40 |             size_best_people = int((10 * self.size_population) / 100)
41 |             new_generation.extend(self.population[:size_best_people])
42 | 
43 |             # Generate children for the next generation (90% of population)
44 |             for _ in range(int((90 * self.size_population) / 100)):
45 |                 parent1 = random.choice(self.population[:50])
46 |                 parent2 = random.choice(self.population[:50])
47 |                 child = parent1.generate(parent2)  # Assuming 'generate' exists in the 'genetic' class
48 |                 new_generation.append(child)
49 | 
50 |             self.population = new_generation
51 | 
52 |             # Apply mutation to the new population
53 |             for index in range(self.size_population):
54 |                 self.population[index] = GeneticClustering(self.population[index].mutate(), self.X, self.genom)
55 | 
56 |             # Update fitness scores
57 |             self.fitness.append(self.population[0].fitness_scores)
58 |             self.counter += 1
59 |             self.show()
60 | 
61 |     def _create_random_chromosome(self) -> dict:
62 |         chromosome = {index: random.choice(self.genom) for index in range(self.n_samples)}
63 |         return chromosome
64 | 
65 |     def show(self) -> None:
66 |         print(f"Loop: {self.counter}")
67 |         print(f"===== >> Best Chromosome: {self.population[0]} \tFitness: {self.population[0].fitness_scores}")
68 | 
69 |     def show_plot(self) -> None:
70 |         plt.plot(self.fitness)
71 |         plt.title("Fitness Over Generations")
72 |         plt.xlabel("Generation")
73 |         plt.ylabel("Fitness Score")
74 |         plt.show()
75 | 


--------------------------------------------------------------------------------
/cluster_ga/genetic.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import random
 3 | from sklearn.metrics import silhouette_score
 4 | 
 5 | class GeneticClustering:
 6 |     def __init__(self, chromosome: dict, points: np.array, list_class: list) -> None:
 7 |         self.chromosome = chromosome
 8 |         self.points = points
 9 |         self.list_class = list_class
10 |         self.fitness_scores = self.get_fitness()
11 | 
12 |     def get_fitness(self):
13 |         labels = np.array(list(self.chromosome.values()))
14 |         unique_labels = np.unique(labels)
15 | 
16 |         if len(unique_labels) < 2:
17 |             return -1
18 | 
19 |         return silhouette_score(self.points, labels)
20 | 
21 |     def mutate(self):
22 |         if random.random() >= 0.95:
23 |             self._change_labels_to_nearest_center()
24 |             self._change_random_label()
25 |             self._assign_nearest_cluster_label()
26 |         return self.chromosome
27 | 
28 |     def generate(self, parent):
29 |         new_generation = {}
30 |         for index in range(len(self.chromosome)):
31 |             prob = random.random()
32 | 
33 |             if prob < 0.45:
34 |                 new_generation[index] = self.chromosome[index]
35 |             elif prob < 0.90:
36 |                 new_generation[index] = parent.chromosome[index]
37 |             else:
38 |                 new_generation[index] = random.choice(self.list_class)
39 | 
40 |         return GeneticClustering(new_generation, self.points, self.list_class)
41 | 
42 |     def _change_labels_to_nearest_center(self):
43 |         sample_cluster_label = random.choice(self.list_class)
44 |         nearest_cluster_label = self._find_nearest_cluster(sample_cluster_label)
45 | 
46 |         sample_cluster_indices = [index for index, label in self.chromosome.items() if label == sample_cluster_label]
47 |         num_changes = len(sample_cluster_indices) // 3
48 | 
49 |         for idx in sample_cluster_indices[:num_changes]:
50 |             self.chromosome[idx] = nearest_cluster_label
51 | 
52 |     def _find_nearest_cluster(self, sample_cluster_label):
53 |         sample_points = self.points[list(self.chromosome.values()) == sample_cluster_label]
54 |         sample_center = np.mean(sample_points, axis=0)
55 | 
56 |         # Get centers of all other clusters
57 |         other_clusters = {label: [] for label in self.list_class if label != sample_cluster_label}
58 |         for index, label in self.chromosome.items():
59 |             if label != sample_cluster_label:
60 |                 other_clusters[label].append(self.points[index])
61 | 
62 |         cluster_centers = {label: np.mean(np.array(points), axis=0) for label, points in other_clusters.items()}
63 |         
64 |         distances = {label: np.linalg.norm(sample_center - center) for label, center in cluster_centers.items()}
65 |         nearest_label = min(distances, key=distances.get)
66 |         return nearest_label
67 | 
68 |     def _assign_nearest_cluster_label(self):
69 |         sample_index = random.choice(list(self.chromosome.keys()))
70 |         distances = np.linalg.norm(self.points - self.points[sample_index], axis=1)
71 |         nearest_index = np.argsort(distances)[1]  # 1st closest point
72 |         self.chromosome[sample_index] = self.chromosome[nearest_index]
73 | 
74 |     def _change_random_label(self):
75 |         sample_index = random.choice(list(self.chromosome.keys()))
76 |         new_label = random.choice(self.list_class)
77 | 
78 |         while new_label == self.chromosome[sample_index]:
79 |             new_label = random.choice(self.list_class)
80 | 
81 |         self.chromosome[sample_index] = new_label
82 | 


--------------------------------------------------------------------------------
/dist/cluster_ga-0.1-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parvvaresh/clustering-with-genetic/1d29b0ae0ea81e54641ab7fdc32ac466448770f3/dist/cluster_ga-0.1-py3-none-any.whl


--------------------------------------------------------------------------------
/dist/cluster_ga-0.1.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parvvaresh/clustering-with-genetic/1d29b0ae0ea81e54641ab7fdc32ac466448770f3/dist/cluster_ga-0.1.tar.gz


--------------------------------------------------------------------------------
/dist/cluster_ga-0.2-py3-none-any.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parvvaresh/clustering-with-genetic/1d29b0ae0ea81e54641ab7fdc32ac466448770f3/dist/cluster_ga-0.2-py3-none-any.whl


--------------------------------------------------------------------------------
/dist/cluster_ga-0.2.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/parvvaresh/clustering-with-genetic/1d29b0ae0ea81e54641ab7fdc32ac466448770f3/dist/cluster_ga-0.2.tar.gz


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | scikit-learn
4 | matplotlib
5 | 
6 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(
 4 |     name="cluster_ga",  # Name of your package
 5 |     version="0.2",      # Version of your package
 6 |     author="Alireza Parvaresh",
 7 |     author_email="parvvaresh@gmail.com",
 8 |     description="This Python script implements a genetic algorithm for clustering data. The algorithm optimizes the cluster assignments of data points using a genetic approach, aiming to improve the silhouette score. The silhouette score is a measure of how well-defined the clusters are in the data.",
 9 |     long_description=open("README.md").read(),
10 |     long_description_content_type="text/markdown",  # This is optional
11 |     url="https://github.com/parvvaresh/clustering-with-genetic",  # Link to your package’s homepage
12 |     packages=find_packages(),  # Automatically find and include packages in your directory
13 |     classifiers=[
14 |         "Programming Language :: Python :: 3",
15 |         "License :: OSI Approved :: MIT License",
16 |         "Operating System :: OS Independent",
17 |     ],
18 |     python_requires=">=3.6",  # Adjust Python version as needed
19 | )
20 | 


--------------------------------------------------------------------------------