import math

import pandas as pd


class DecisionTree:
    """One node of an ID3 decision tree over a pandas DataFrame.

    The target column is treated as binary: rows whose target equals
    ``positive`` are the positive class, every other row is the negative
    class. Calling :meth:`update_nodes` on the root recursively grows the
    whole tree by splitting on the feature with the highest information gain.

    Attributes set by ``__init__``:
        data: the DataFrame slice this node covers.
        target: name of the label column.
        positive: label value counted as the positive class.
        parent_val: value of the parent's split feature that leads here.
        parent: name of the feature the parent node split on.
        childs: list of child ``DecisionTree`` nodes (empty for a leaf).
        decision: for a leaf, the most frequent target value; otherwise "".

    Attributes set by ``update_nodes``: ``features``, ``entropy`` and, for
    internal nodes only, ``gains`` and ``splitter``.
    """

    def __init__(self, df, target, positive, parent_val, parent):
        self.data = df
        self.target = target
        self.positive = positive
        self.parent_val = parent_val
        self.parent = parent
        self.childs = []
        self.decision = ""

    def _get_entropy(self, data):
        """Return the binary entropy (in bits) of the target column of *data*.

        An empty frame has entropy 0.
        """
        total = data.shape[0]
        if total == 0:
            return 0.0
        # Vectorised count of positive rows; int() keeps plain Python numbers.
        p_ratio = int((data[self.target] == self.positive).sum()) / total
        n_ratio = 1 - p_ratio
        # A ratio of 0 contributes nothing (and log2(0) is undefined).
        return sum(
            -ratio * math.log2(ratio)
            for ratio in (p_ratio, n_ratio)
            if ratio != 0
        )

    def _get_gain(self, feat):
        """Return the information gain obtained by splitting on *feat*."""
        total = self.data.shape[0]
        column = self.data[feat]
        avg_info = 0.0
        for val in column.unique():
            # Build the row mask once and reuse it for subset and weight.
            mask = column == val
            avg_info += (
                self._get_entropy(self.data[mask]) * int(mask.sum()) / total
            )
        return self._get_entropy(self.data) - avg_info

    def _get_splitter(self):
        """Store in ``self.splitter`` the feature with the highest gain."""
        self.splitter = max(self.gains, key=lambda x: x[1])[0]

    def _set_leaf_decision(self):
        """Record the most frequent target value of this node as its decision.

        ``decision`` stays "" when the node has no (non-null) target values.
        """
        counts = self.data[self.target].value_counts()
        self.decision = counts.idxmax() if not counts.empty else ""

    def update_nodes(self):
        """Grow the subtree rooted at this node.

        The node becomes a leaf when it is pure (entropy 0) or when no
        feature columns remain to split on; otherwise it splits on the
        best feature and recurses into one child per observed value.
        """
        # Start from a clean slate so repeated calls do not duplicate children.
        self.childs = []
        self.features = [col for col in self.data.columns if col != self.target]
        self.entropy = self._get_entropy(self.data)

        # Without the "no features" guard, an impure node whose features are
        # exhausted would call max() on an empty list and raise ValueError.
        if self.entropy == 0 or not self.features:
            self._set_leaf_decision()
            return

        self.gains = [(feat, self._get_gain(feat)) for feat in self.features]
        self._get_splitter()
        # Children no longer see the column that was just split on.
        residual_columns = [k for k in self.data.columns if k != self.splitter]
        for val in self.data[self.splitter].unique():
            df_tmp = self.data.loc[self.data[self.splitter] == val, residual_columns]
            tmp_node = DecisionTree(df_tmp, self.target, self.positive, val, self.splitter)
            tmp_node.update_nodes()
            self.childs.append(tmp_node)

    # Deliberately a plain function taking the node explicitly, so that both
    # ``DecisionTree.print_tree(root)`` and ``root.print_tree()`` work.
    def print_tree(node, depth=0):
        """Print *node* and its descendants, indented by one space per level."""
        if not node:
            return
        print(f"{' ' * depth}Parent: {node.parent} | Parent Value: {node.parent_val}")
        for child in node.childs:
            DecisionTree.print_tree(child, depth + 1)


# Guarded so the class can be imported without touching the file system;
# inside a notebook __name__ is "__main__", so the cell still runs as before.
if __name__ == "__main__":
    # Assuming the CSV file is in the same directory as the notebook
    df = pd.read_csv("governors_county.csv")
    # NOTE(review): the positive label "100" is compared against the "state"
    # column and the root parent_val is "," - confirm these arguments match
    # the CSV; if no row equals "100" the root is pure and no tree is built.
    dt = DecisionTree(df, "state", "100", ",", None)
    dt.update_nodes()
    DecisionTree.print_tree(dt)