'''
# Spot-checking: pick out a handful of candidate algorithms for a quick examination, to find the high-quality ones (here: classification)

Spot-Checking on Classification

Spot-checking algorithms is about getting a quick assessment of a bunch of different
algorithms on a machine learning problem, so you know which algorithms to focus on
and which to discard.

*********

Benefits of spot-checking algorithms on machine learning problems:
 - Speed
 - Objective
 - Results

*********

Dataset used           : Pima Indians onset of Diabetes
Test harness           : 10-fold cross validation [to demonstrate how to spot-check ML algorithms]
Performance evaluation : mean classification accuracy
'''

''' # Algorithm Overview
## Start with 2 linear ML algorithms:
 - Logistic Regression                 : LogisticRegression()
 - Linear Discriminant Analysis        : LinearDiscriminantAnalysis()

## Then look at 4 non-linear machine learning algorithms:
 - kNN                                 : KNeighborsClassifier()
 - Naive Bayes                         : GaussianNB()
 - Classification and Regression Trees : DecisionTreeClassifier()
 - SVM                                 : SVC()
'''

from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# linear algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# non-linear algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


# load the Pima Indians onset of diabetes dataset
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]

# 10-fold cross-validation test harness
# (recent scikit-learn versions require shuffle=True whenever random_state is set;
#  the accuracy values quoted in the comments below come from the original run and
#  will vary slightly with the exact split and library version)
num_folds = 10
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)


''' 1. Logistic Regression '''
#
# LR ==> assumes a Gaussian distribution for the numeric input variables and can model binary classification problems
#
model = LogisticRegression(solver='liblinear')   # liblinear handles this small dataset without convergence warnings
results = cross_val_score(model, X, Y, cv=kfold)

print(results.mean())
# 0.76951469583
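
# --- optional sketch (not in the original walkthrough) ----------------------
# Because logistic regression models class probabilities directly, a fitted
# model can report P(class=0) and P(class=1) per sample via predict_proba.
# Reuses the X, Y arrays loaded above; shown purely for illustration.
lr = LogisticRegression(solver='liblinear')
lr.fit(X, Y)
print(lr.predict_proba(X[:3]))   # probability of each class for the first 3 rows
# -----------------------------------------------------------------------------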


''' 2. Linear Discriminant Analysis '''
# LDA ==> a statistical technique for binary and multiclass classification
model = LinearDiscriminantAnalysis()
results = cross_val_score(model, X, Y, cv=kfold)

print(results.mean())
# 0.773462064252
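
# --- optional sketch (not in the original walkthrough) ----------------------
# Besides classification, LinearDiscriminantAnalysis can project the inputs
# onto the discriminant axis (one component for a two-class problem) via
# fit_transform. Reuses the X, Y arrays loaded above.
lda = LinearDiscriminantAnalysis(n_components=1)
X_proj = lda.fit_transform(X, Y)
print(X_proj.shape)   # (n_samples, 1): each sample reduced to its discriminant score
# -----------------------------------------------------------------------------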


''' 3. kNN '''
# KNN ==> uses a distance metric to find the k most similar instances in the training data for a new instance
#         and takes the most common class among those neighbors as the prediction
model = KNeighborsClassifier()
results = cross_val_score(model, X, Y, cv=kfold)

print(results.mean())
# 0.726555023923
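
# --- optional sketch (not in the original walkthrough) ----------------------
# KNeighborsClassifier defaults to k=5; the number of neighbours is the main
# tuning knob (a larger k smooths the decision boundary). The k values below
# are illustrative only; reuses X, Y and the kfold harness defined above.
for k in [3, 7, 11]:
    knn = KNeighborsClassifier(n_neighbors=k)
    score = cross_val_score(knn, X, Y, cv=kfold).mean()
    print('k=%d  mean accuracy=%.3f' % (k, score))
# -----------------------------------------------------------------------------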


''' 4. Naive Bayes '''
#
# NB ==> calculates the probability of each class and the conditional probability of each class given each input value
#        These probabilities are estimated for new data and multiplied together, assuming they are all independent (the naive assumption)
#
model = GaussianNB()

results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
# 0.75517771702
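
# --- optional sketch (not in the original walkthrough) ----------------------
# Illustration of the naive assumption with made-up numbers: a class score is
# its prior times the product of per-feature Gaussian likelihoods, each feature
# treated as independent of the others. All values here are hypothetical.
from scipy.stats import norm

prior = 0.35                                   # hypothetical P(class=1)
feature_values = [6.0, 148.0]                  # hypothetical inputs for one sample
means, stds = [4.9, 120.0], [3.4, 32.0]        # hypothetical per-class Gaussians
score = prior
for x, m, s in zip(feature_values, means, stds):
    score *= norm.pdf(x, loc=m, scale=s)       # multiply independent likelihoods
print(score)                                   # unnormalised class score
# -----------------------------------------------------------------------------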


''' 5. CART / decision tree '''
#
# DT ==> constructs a binary tree from the training data, picking split points greedily to best separate the classes
#
model = DecisionTreeClassifier()

results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
# 0.686056049214
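
# --- optional sketch (not in the original walkthrough) ----------------------
# A fully grown tree tends to overfit; max_depth is a simple way to limit its
# complexity. The depth values below are illustrative; reuses X, Y and kfold.
for depth in [3, 5, None]:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=7)
    score = cross_val_score(tree, X, Y, cv=kfold).mean()
    print('max_depth=%s  mean accuracy=%.3f' % (depth, score))
# -----------------------------------------------------------------------------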


''' 6. Support Vector Machine '''
#
# SVM ==> seeks a boundary that best separates the two classes
# The choice of kernel function, via the kernel parameter, is important
# By default, the Radial Basis Function (RBF) kernel is used
#
model = SVC()

results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
# 0.651025290499
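
# --- optional sketch (not in the original walkthrough) ----------------------
# The kernel is the key SVC parameter: 'rbf' is the default, but linear and
# polynomial kernels can be spot-checked the same way. The kernels listed are
# illustrative; reuses X, Y and the kfold harness defined above.
for kernel in ['linear', 'poly', 'rbf']:
    svc = SVC(kernel=kernel)
    score = cross_val_score(svc, X, Y, cv=kfold).mean()
    print('kernel=%s  mean accuracy=%.3f' % (kernel, score))
# -----------------------------------------------------------------------------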