Added DecisionTree

Mayurji · web-flow · commit f9094ffa523c · 2021-07-04T17:45:24.000+05:30
diff --git a/Decision Tree.ipynb b/Decision Tree.ipynb
@@ -0,0 +1,162 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from collections import Counter\n",
+    "\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def entropy(y):\n",
+    "    \"\"\"Return the base-2 Shannon entropy of the labels in ``y``.\n",
+    "\n",
+    "    ``y`` is handed to ``np.bincount``, so it has to hold non-negative integers.\n",
+    "    \"\"\"\n",
+    "    counts = np.bincount(y)\n",
+    "    total = len(y)\n",
+    "    terms = []\n",
+    "    for count in counts:\n",
+    "        p = count / total\n",
+    "        # Labels that never occur are skipped so log2(0) is never evaluated.\n",
+    "        if p > 0:\n",
+    "            terms.append(p * np.log2(p))\n",
+    "    return -np.sum(terms)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Node:\n",
+    "    \"\"\"One node of a binary tree: either a split or a stored value.\n",
+    "\n",
+    "    A node is considered a leaf whenever ``value`` is not ``None``.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, feature=None, left=None, right=None, threshold=None, value=None):\n",
+    "        # Split description: which feature is compared against which threshold.\n",
+    "        self.feature, self.threshold = feature, threshold\n",
+    "        # Child nodes reached from this node.\n",
+    "        self.left, self.right = left, right\n",
+    "        # Payload stored on the node; None marks a non-leaf.\n",
+    "        self.value = value\n",
+    "\n",
+    "    def is_leaf_node(self):\n",
+    "        \"\"\"Return True when this node stores a value.\"\"\"\n",
+    "        return not (self.value is None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class DecisionTree:\n",
+    "    \"\"\"Binary decision-tree classifier that picks splits by information gain.\n",
+    "\n",
+    "    Labels reach ``entropy`` (``np.bincount``), so they must be non-negative\n",
+    "    integers. Samples with ``feature <= threshold`` go to the left child and\n",
+    "    all others to the right; ``split`` and ``traverseTree`` share that rule.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    min_sample_split : int\n",
+    "        A node holding fewer samples than this becomes a leaf.\n",
+    "    max_depth : int\n",
+    "        A node at this depth (root is depth 0) becomes a leaf.\n",
+    "    n_feature : int or None\n",
+    "        How many features are drawn, without replacement, as split candidates\n",
+    "        at each node. A falsy value means all features; ``fit`` caps it at the\n",
+    "        number of columns in ``X``.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, min_sample_split=2, max_depth=100, n_feature=None):\n",
+    "        self.n_feature = n_feature\n",
+    "        self.min_sample_split = min_sample_split\n",
+    "        self.max_depth = max_depth\n",
+    "        # Root node of the grown tree; stays None until fit() is called.\n",
+    "        self.root = None\n",
+    "\n",
+    "    def fit(self, X, y):\n",
+    "        \"\"\"Grow the tree from a 2-D sample matrix ``X`` and label vector ``y``.\"\"\"\n",
+    "        X, y = np.asarray(X), np.asarray(y)\n",
+    "        n_features = X.shape[1]\n",
+    "        self.n_feature = n_features if not self.n_feature else min(self.n_feature, n_features)\n",
+    "        self.root = self.growTree(X, y)\n",
+    "\n",
+    "    def predict(self, X):\n",
+    "        \"\"\"Return an array with one predicted label per row of ``X``.\"\"\"\n",
+    "        return np.array([self.traverseTree(x, self.root) for x in np.asarray(X)])\n",
+    "\n",
+    "    def growTree(self, X, y, depth=0):\n",
+    "        \"\"\"Recursively build the subtree for ``X``/``y`` and return its root Node.\"\"\"\n",
+    "        n_samples, n_features = X.shape\n",
+    "        # Number of distinct labels, not the array of labels itself.\n",
+    "        n_labels = len(np.unique(y))\n",
+    "\n",
+    "        if (depth >= self.max_depth or n_samples < self.min_sample_split\n",
+    "                or n_labels == 1):\n",
+    "            return Node(value=self.most_common_label(y))\n",
+    "\n",
+    "        feature_idxs = np.random.choice(n_features, self.n_feature, replace=False)\n",
+    "        bestFeature, bestThreshold = self.bestCriteria(X, y, feature_idxs)\n",
+    "        if bestFeature is None:\n",
+    "            # No candidate threshold was available at all.\n",
+    "            return Node(value=self.most_common_label(y))\n",
+    "\n",
+    "        left_idxs, right_idxs = self.split(X[:, bestFeature], bestThreshold)\n",
+    "        if len(left_idxs) == 0 or len(right_idxs) == 0:\n",
+    "            # The best split leaves one side empty (e.g. constant features);\n",
+    "            # recursing would hand an empty label set to most_common_label.\n",
+    "            return Node(value=self.most_common_label(y))\n",
+    "\n",
+    "        left = self.growTree(X[left_idxs, :], y[left_idxs], depth + 1)\n",
+    "        right = self.growTree(X[right_idxs, :], y[right_idxs], depth + 1)\n",
+    "        return Node(feature=bestFeature, left=left, right=right, threshold=bestThreshold)\n",
+    "\n",
+    "    def bestCriteria(self, X, y, feature_idxs):\n",
+    "        \"\"\"Return ``(feature index, threshold)`` with the highest information gain.\n",
+    "\n",
+    "        Every unique value of every candidate column is tried as a threshold.\n",
+    "        Returns ``(None, None)`` when there is nothing to try.\n",
+    "        \"\"\"\n",
+    "        best_gain = -1\n",
+    "        split_idx, split_thresh = None, None\n",
+    "\n",
+    "        for f in feature_idxs:\n",
+    "            X_column = X[:, f]\n",
+    "            for t in np.unique(X_column):\n",
+    "                gain = self.infoGain(X_column, y, t)\n",
+    "                # Strict comparison: the first candidate wins ties.\n",
+    "                if gain > best_gain:\n",
+    "                    best_gain, split_idx, split_thresh = gain, f, t\n",
+    "\n",
+    "        return split_idx, split_thresh\n",
+    "\n",
+    "    def infoGain(self, X, y, t):\n",
+    "        \"\"\"Return the entropy reduction from splitting column ``X`` at ``t``.\n",
+    "\n",
+    "        A split that leaves either side empty has a gain of 0.\n",
+    "        \"\"\"\n",
+    "        leftIdx, rightIdx = self.split(X, t)\n",
+    "        if len(leftIdx) == 0 or len(rightIdx) == 0:\n",
+    "            return 0\n",
+    "\n",
+    "        n = len(y)\n",
+    "        n_l, n_r = len(leftIdx), len(rightIdx)\n",
+    "        e_l, e_r = entropy(y[leftIdx]), entropy(y[rightIdx])\n",
+    "\n",
+    "        # Child entropies weighted by the share of samples in each child.\n",
+    "        child_entropy = (n_l * e_l) / n + (n_r * e_r) / n\n",
+    "        return entropy(y) - child_entropy\n",
+    "\n",
+    "    def split(self, X, threshold):\n",
+    "        \"\"\"Return ``(left, right)`` index arrays for a 1-D column ``X``.\n",
+    "\n",
+    "        Left holds positions with ``X <= threshold``, right the rest, which is\n",
+    "        the same comparison ``traverseTree`` applies at prediction time.\n",
+    "        \"\"\"\n",
+    "        leftIdxs = np.argwhere(X <= threshold).flatten()\n",
+    "        rightIdxs = np.argwhere(X > threshold).flatten()\n",
+    "        return leftIdxs, rightIdxs\n",
+    "\n",
+    "    def traverseTree(self, X, node):\n",
+    "        \"\"\"Walk from ``node`` to a leaf for the single sample ``X``; return its value.\"\"\"\n",
+    "        if node.is_leaf_node():\n",
+    "            return node.value\n",
+    "\n",
+    "        if X[node.feature] <= node.threshold:\n",
+    "            return self.traverseTree(X, node.left)\n",
+    "        return self.traverseTree(X, node.right)\n",
+    "\n",
+    "    def most_common_label(self, label):\n",
+    "        \"\"\"Return the most frequent entry of ``label`` (which must be non-empty).\"\"\"\n",
+    "        return Counter(label).most_common(1)[0][0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}