'''
# Spot-checking: pick out a handful of candidate algorithms for a quick examination, to find the high-quality ones (here: classification)

Spot-Checking on Classification

Spot-checking algorithms is about getting a quick assessment of a bunch of different
algorithms on a machine learning problem, so you know which algorithms to focus on
and which to discard.

*********

Benefits of spot-checking algorithms on machine learning problems:
 - Speed
 - Objective
 - Results

*********

Dataset used           : Pima Indians onset of Diabetes
Test harness           : 10-fold cross validation [to demonstrate how to spot-check ML algorithms]
Performance evaluation : mean classification accuracy
'''

''' # Algorithm Overview
## Start with 2 linear ML algorithms:
 - Logistic Regression                 : LogisticRegression()
 - Linear Discriminant Analysis        : LinearDiscriminantAnalysis()

## Then look at 4 non-linear machine learning algorithms:
 - kNN                                 : KNeighborsClassifier()
 - Naive Bayes                         : GaussianNB()
 - Classification and Regression Trees : DecisionTreeClassifier()
 - SVM                                 : SVC()
'''

from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# linear algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# non-linear algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


# load the Pima Indians onset of diabetes dataset
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:, 0:8]
Y = array[:, 8]

# 10-fold cross-validation test harness
# (recent scikit-learn versions require shuffle=True whenever random_state is set;
#  the accuracy values quoted in the comments below come from the original run and
#  will vary slightly with the exact split and library version)
num_folds = 10
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)


''' 1. Logistic Regression '''
#
# LR ==> assumes a Gaussian distribution for the numeric input variables and can model binary classification problems
#
model = LogisticRegression(solver='liblinear')   # liblinear handles this small dataset without convergence warnings
results = cross_val_score(model, X, Y, cv=kfold)

print(results.mean())
# 0.76951469583
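
# --- optional sketch (not in the original walkthrough) ----------------------
# Because logistic regression models class probabilities directly, a fitted
# model can report P(class=0) and P(class=1) per sample via predict_proba.
# Reuses the X, Y arrays loaded above; shown purely for illustration.
lr = LogisticRegression(solver='liblinear')
lr.fit(X, Y)
print(lr.predict_proba(X[:3]))   # probability of each class for the first 3 rows
# -----------------------------------------------------------------------------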


''' 2. Linear Discriminant Analysis '''
# LDA ==> a statistical technique for binary and multiclass classification
model = LinearDiscriminantAnalysis()
results = cross_val_score(model, X, Y, cv=kfold)

print(results.mean())
# 0.773462064252
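
# --- optional sketch (not in the original walkthrough) ----------------------
# Besides classification, LinearDiscriminantAnalysis can project the inputs
# onto the discriminant axis (one component for a two-class problem) via
# fit_transform. Reuses the X, Y arrays loaded above.
lda = LinearDiscriminantAnalysis(n_components=1)
X_proj = lda.fit_transform(X, Y)
print(X_proj.shape)   # (n_samples, 1): each sample reduced to its discriminant score
# -----------------------------------------------------------------------------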


''' 3. kNN '''
# KNN ==> uses a distance metric to find the k most similar instances in the training data for a new instance
#         and takes the most common class among those neighbors as the prediction
model = KNeighborsClassifier()
results = cross_val_score(model, X, Y, cv=kfold)

print(results.mean())
# 0.726555023923
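
# --- optional sketch (not in the original walkthrough) ----------------------
# KNeighborsClassifier defaults to k=5; the number of neighbours is the main
# tuning knob (a larger k smooths the decision boundary). The k values below
# are illustrative only; reuses X, Y and the kfold harness defined above.
for k in [3, 7, 11]:
    knn = KNeighborsClassifier(n_neighbors=k)
    score = cross_val_score(knn, X, Y, cv=kfold).mean()
    print('k=%d  mean accuracy=%.3f' % (k, score))
# -----------------------------------------------------------------------------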


''' 4. Naive Bayes '''
#
# NB ==> calculates the probability of each class and the conditional probability of each class given each input value
#        These probabilities are estimated for new data and multiplied together, assuming they are all independent (the naive assumption)
#
model = GaussianNB()

results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
# 0.75517771702
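
# --- optional sketch (not in the original walkthrough) ----------------------
# Illustration of the naive assumption with made-up numbers: a class score is
# its prior times the product of per-feature Gaussian likelihoods, each feature
# treated as independent of the others. All values here are hypothetical.
from scipy.stats import norm

prior = 0.35                                   # hypothetical P(class=1)
feature_values = [6.0, 148.0]                  # hypothetical inputs for one sample
means, stds = [4.9, 120.0], [3.4, 32.0]        # hypothetical per-class Gaussians
score = prior
for x, m, s in zip(feature_values, means, stds):
    score *= norm.pdf(x, loc=m, scale=s)       # multiply independent likelihoods
print(score)                                   # unnormalised class score
# -----------------------------------------------------------------------------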


''' 5. CART / decision tree '''
#
# DT ==> constructs a binary tree from the training data, picking split points greedily to best separate the classes
#
model = DecisionTreeClassifier()

results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
# 0.686056049214
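
# --- optional sketch (not in the original walkthrough) ----------------------
# A fully grown tree tends to overfit; max_depth is a simple way to limit its
# complexity. The depth values below are illustrative; reuses X, Y and kfold.
for depth in [3, 5, None]:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=7)
    score = cross_val_score(tree, X, Y, cv=kfold).mean()
    print('max_depth=%s  mean accuracy=%.3f' % (depth, score))
# -----------------------------------------------------------------------------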


''' 6. Support Vector Machine '''
#
# SVM ==> seeks a boundary that best separates the two classes
# The choice of kernel function, via the kernel parameter, is important
# By default, the Radial Basis Function (RBF) kernel is used
#
model = SVC()

results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
# 0.651025290499
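
# --- optional sketch (not in the original walkthrough) ----------------------
# The kernel is the key SVC parameter: 'rbf' is the default, but linear and
# polynomial kernels can be spot-checked the same way. The kernels listed are
# illustrative; reuses X, Y and the kfold harness defined above.
for kernel in ['linear', 'poly', 'rbf']:
    svc = SVC(kernel=kernel)
    score = cross_val_score(svc, X, Y, cv=kfold).mean()
    print('kernel=%s  mean accuracy=%.3f' % (kernel, score))
# -----------------------------------------------------------------------------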