
Commit ffc1f42

KNN explained and implemented!
1 parent af54712 commit ffc1f42

4 files changed

Lines changed: 105 additions & 0 deletions


Machine Learning/KNN/Knn.py

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
import numpy as np
from collections import Counter


# Euclidean distance between two feature vectors
def distform(a, b):
    return np.sqrt(np.sum((a - b) ** 2))


class KNN:
    def __init__(self, k=4):
        # store the k value
        self.k = k

    # follow the scikit-learn convention: the fit method stores the training samples
    def fit(self, X, y):
        # storing the values
        self.X_train = X
        self.y_train = y

    # predict labels for new samples
    def predict(self, testSamps):
        # delegate each sample to a helper method and collect the results in a list
        predictedLabs = [self.__predict(x) for x in testSamps]
        # convert the list into a numpy array
        return np.array(predictedLabs)

    # helper method which classifies one sample: compute the distances,
    # find the nearest neighbors and their labels, then do a majority vote
    # and choose the most common class label
    def __predict(self, newsamp):
        # 1. compute the distance to every training sample
        dist = [distform(newsamp, x_train) for x_train in self.X_train]

        # 2. get the indices of the k nearest neighbors and their labels
        k_indices = np.argsort(dist)[:self.k]
        k_nearest_labels = [self.y_train[elem] for elem in k_indices]

        # 3. perform a majority vote
        most_comm = Counter(k_nearest_labels).most_common(1)
        return most_comm[0][0]
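
The distance helper is the piece most worth verifying, since the original had the square applied outside the sum. A minimal sketch, assuming Knn.py is importable from the working directory; the vectors p and q are made-up illustrative values:

import numpy as np
from Knn import distform

# two made-up 4-dimensional feature vectors (hypothetical values,
# chosen to match the iris feature count)
p = np.array([5.1, 3.5, 1.4, 0.2])
q = np.array([6.2, 2.9, 4.3, 1.3])

# the hand-rolled helper should agree with NumPy's built-in Euclidean norm
print(distform(p, q))         # sqrt of the sum of squared differences
print(np.linalg.norm(p - q))  # built-in equivalent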

Machine Learning/KNN/Knntest.py

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from Knn import KNN

colormp = ListedColormap(["#FF0000", "#00FF00", "#0000FF"])


iris = datasets.load_iris()
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

'''
print(X_train.shape)

# result is (120, 4), where 120 is the number of samples and 4 is the number of features per sample

print(X_train[0])

print(y_train.shape)
print(y_train)

plt.figure()

plt.scatter(X[:, 0], X[:, 1], c=y, cmap=colormp, edgecolor="k", s=20)
plt.show()

alpha = [1, 1, 1, 2, 2, 3, 4, 5, 6]

from collections import Counter

# print only the single most common item
most_common = Counter(alpha).most_common(1)
print(most_common)
'''

myclassifier = KNN(k=3)
myclassifier.fit(X_train, y_train)

# predicting my test samples
predict = myclassifier.predict(X_test)

# computing the accuracy of my model, i.e. the fraction of my predictions that are correctly classified
how_accurate = np.sum(predict == y_test) / len(y_test)

print(how_accurate)
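
As a sanity check, one could compare against scikit-learn's built-in KNeighborsClassifier on the same split. The sketch below reuses X_train, X_test, y_train, and y_test from Knntest.py; it is an assumed extension, not something the commit includes:

from sklearn.neighbors import KNeighborsClassifier

sk_classifier = KNeighborsClassifier(n_neighbors=3)  # same k as above
sk_classifier.fit(X_train, y_train)
sk_predict = sk_classifier.predict(X_test)

sk_accuracy = np.sum(sk_predict == y_test) / len(y_test)
print(sk_accuracy)  # should match how_accurate up to tie-breaking differences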

Machine Learning/KNN/README.md

Lines changed: 12 additions & 0 deletions
@@ -0,0 +1,12 @@
### K Nearest Neighbor

1. I have two classes, and my feature vectors are 2D
2. $x_1$ is the 1st axis and $x_2$ is the 2nd axis
3. I have a set of training samples
4. For each new sample I want to classify, I calculate the distance between that sample and each of the training samples
5. I take a look at the nearest neighbors
6. I give the new sample the label of the most common class among those neighbors
7. For example, with 2 green neighbors and 1 red neighbor, the label will be the green class
8. To calculate the distance between the samples I use the Euclidean distance, aka the distance formula

### More general case formula:

$$d(p, q) = \sqrt{\sum_{i=1}^{n} (q_i - p_i)^2}$$

where $n$ is the number of dimensions.
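
As a concrete worked example (the numbers are arbitrary): for $p = (1, 2)$ and $q = (4, 6)$ with $n = 2$,

$$d(p, q) = \sqrt{(4 - 1)^2 + (6 - 2)^2} = \sqrt{9 + 16} = \sqrt{25} = 5$$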
Binary file (1.64 KB) not shown.
