
Commit 9b22368

Classification Spot-Checking
1 parent f6ab898 commit 9b22368

File tree

1 file changed: +128 additions, 0 deletions

spot_checking(Classificatinn).py

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
'''
# Spot-check: pick out a sample of algorithms for quick examination, to find high-quality candidates (here, for classification)

Spot-Checking on Classification

Spot-checking algorithms is about getting a quick assessment of a range of different algorithms on a machine learning problem, so you know which algorithms to focus on and which to discard.

*********

Benefits of spot-checking algorithms on machine learning problems:
- Speed
- Objectivity
- Results

*********

Dataset used           : Pima Indians Onset of Diabetes
Test harness           : 10-fold cross validation [to demonstrate how to spot-check ML algorithms]
Performance evaluation : mean classification accuracy across the folds
'''

''' # Algorithm Overview
## Start with 2 linear ML algorithms:
- Logistic Regression : LogisticRegression()
- Linear Discriminant Analysis : LinearDiscriminantAnalysis()

# Then look at 4 non-linear machine learning algorithms:
- kNN : KNeighborsClassifier()
- Naive Bayes : GaussianNB()
- Classification and Regression Trees : DecisionTreeClassifier()
- SVM : SVC()
'''

from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Load the dataset and split it into input columns (X) and the class column (Y)
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data = read_csv(filename, names=names)
array = data.values
X = array[:,0:8]
Y = array[:,8]

num_folds = 10
# shuffle=True is required when a random_state is set in recent scikit-learn
# versions; the accuracies recorded below came from the original run, so
# exact values may vary slightly.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)

''' 1. Logistic Regression '''
#
# LR ==> assumes a Gaussian distribution for the numeric input variables and can model binary classification problems
#
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=kfold)

print(results.mean())
# 0.76951469583
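
# Illustration (a minimal sketch, not part of the original walkthrough): the
# manual loop below is roughly what cross_val_score does with this kfold --
# fit on each training split, score on the held-out fold, then average.
import numpy as np

manual_scores = []
for train_index, test_index in kfold.split(X):
    m = LogisticRegression()
    m.fit(X[train_index], Y[train_index])
    manual_scores.append(m.score(X[test_index], Y[test_index]))
print(np.mean(manual_scores))
# should match cross_val_score(LogisticRegression(), X, Y, cv=kfold).mean()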


''' 2. Linear Discriminant Analysis '''
# LDA ==> a statistical technique for binary and multiclass classification
model = LinearDiscriminantAnalysis()
results = cross_val_score(model, X, Y, cv=kfold)

print(results.mean())
# 0.773462064252

''' 3. kNN '''
# kNN ==> uses a distance metric to find the k most similar instances in the training data for a new instance, and takes the most common class among those neighbours (a majority vote) as the prediction
model = KNeighborsClassifier()
results = cross_val_score(model, X, Y, cv=kfold)

print(results.mean())
# 0.726555023923
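
# Aside (illustrative, not from the original run): KNeighborsClassifier()
# defaults to k=5; other values of k can be spot-checked the same way, e.g.
#   cross_val_score(KNeighborsClassifier(n_neighbors=9), X, Y, cv=kfold).mean()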


''' 4. Naive Bayes '''
#
# NB ==> calculates the probability of each class, and the conditional probability of each input value given each class
# For new data these probabilities are estimated and multiplied together, assuming they are all independent (the "naive" assumption)
#
model = GaussianNB()

results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
# 0.75517771702
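
# Illustration (a sketch, not in the original): GaussianNB exposes the
# per-class probability estimates it computes for new data via predict_proba.
probs = GaussianNB().fit(X, Y).predict_proba(X[:3])
print(probs)  # one row per sample, one column per class; each row sums to 1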


''' 5. CART / decision tree '''
#
# DT ==> constructs a binary tree from the training data, choosing split points greedily to minimise a cost function (e.g. the Gini index)
#
model = DecisionTreeClassifier()

results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
# 0.686056049214


''' 6. Support Vector Machine '''
#
# SVM ==> seeks a hyperplane that best separates the two classes.
# The choice of kernel function (via the kernel parameter) is important;
# by default, the Radial Basis Function (RBF) kernel is used.
#
model = SVC()
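# Aside (illustrative, not from the original run): alternative kernels can be
# spot-checked the same way, e.g. SVC(kernel='linear') or SVC(kernel='poly');
# the model above keeps the default RBF kernel.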

results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
# 0.651025290499
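

# ----------------------------------------------------------------------------
# A consolidated variant (a sketch extending the walkthrough above, not part
# of the original script): spot-check all six algorithms in one loop so their
# mean accuracies can be compared side by side on the same folds.
models = [
    ('LR',   LogisticRegression()),
    ('LDA',  LinearDiscriminantAnalysis()),
    ('kNN',  KNeighborsClassifier()),
    ('NB',   GaussianNB()),
    ('CART', DecisionTreeClassifier()),
    ('SVM',  SVC()),
]
for name, m in models:
    scores = cross_val_score(m, X, Y, cv=kfold)
    print('%s: %.3f (+/- %.3f)' % (name, scores.mean(), scores.std()))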

0 commit comments
