-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataTest.py
129 lines (76 loc) · 3.06 KB
/
dataTest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# Data Processing
# Load the census CSV, drop incomplete rows, integer-encode the categorical
# columns, then fit and score a baseline 3-nearest-neighbour classifier.
print("Retrieving Data and assigning headers...")
headers = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
           "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
           "hours-per-week", "native-country", "goal"]
# The file has no header row, so column names are supplied explicitly.
df = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None, names=headers)
print("Replacing ? with NaN")
# The missing-value marker is matched together with its leading space.
# NOTE(review): presumably the raw file separates fields with ", " - confirm.
df = df.replace(' ?', np.nan)
print(df.shape)
print("Removing rows with missing values")
df = df.dropna()
print("Factorizing....")
# Integer-encode each categorical feature; codes follow order of first appearance.
categorical_columns = ["workclass", "education", "marital-status", "occupation",
                       "relationship", "race", "sex", "native-country"]
for column in categorical_columns:
    df[column] = pd.factorize(df[column])[0]
# <=50K is 0 and >50K is 1
# sort=True assigns codes by sorted label value rather than by whichever label
# happens to appear first in the file, so the mapping above holds regardless
# of row order.
df['goal'] = pd.factorize(df['goal'], sort=True)[0]
print('Saving label...')
labels = df.loc[:, 'goal']
print('Removing label from Dataframe...')
df = df.drop(['goal'], axis=1)
# Start of the model
# Not used by the live code below; only the commented-out KFold experiment at
# the end of the file refers to `neigh`.
neigh = KNeighborsClassifier(n_neighbors=3)
# data and label
X = df.values
y = labels
# create split of test and train (33% held out, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# create learning model object
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
# predict
pred = knn.predict(X_test)
print("ACCURACY OF TEST: K = 3")
print(accuracy_score(y_test, pred))
# Candidate neighbour counts: the odd numbers 1, 3, ..., 49.
odd_k = list(range(1, 50, 2))
def ten_fold_cval(aList):
    """Score a k-nearest-neighbours classifier for each candidate k.

    For every value in ``aList`` a fresh ``KNeighborsClassifier`` is
    evaluated with 10-fold cross-validation (accuracy scoring) on the
    module-level ``X_train`` / ``y_train`` data. Each k and its mean
    accuracy are printed as they are computed.

    Args:
        aList: Iterable of ``n_neighbors`` values to evaluate.

    Returns:
        A list with the mean cross-validated accuracy for each value, in the
        same order as ``aList``.
    """
    mean_accuracies = []
    for n_neighbors in aList:
        fold_scores = cross_val_score(
            KNeighborsClassifier(n_neighbors=n_neighbors),
            X_train,
            y_train,
            cv=10,
            scoring='accuracy',
        )
        # The mean is both printed and recorded, so compute it once.
        mean_accuracy = fold_scores.mean()
        print("K = {}".format(n_neighbors))
        print(mean_accuracy)
        mean_accuracies.append(mean_accuracy)
    return mean_accuracies
# Cross-validate every candidate k, pick the one with the lowest error, and
# plot error against k.
cross_scores = ten_fold_cval(odd_k)
# Misclassification error (1 - mean accuracy) for each k. The name MSE is kept
# for compatibility, but this is NOT a mean squared error.
MSE = [1 - score for score in cross_scores]
# list.index returns the first match, so ties resolve to the smallest k.
lowest_error = min(MSE)
optimal_k = odd_k[MSE.index(lowest_error)]
print("Optimal number of k is " + str(optimal_k))
plt.plot(odd_k, MSE)
plt.xlabel("Number of K")
plt.ylabel("Misclassification Error")
plt.show()
#kay_folds = KFold(n_splits=5)
#kay_folds.get_n_splits(X)
# for train_index, test_index in kay_folds.split(X):
# print("TRAIN:", train_index, "TEST:", test_index)
# X_train, X_test = X[train_index], X[test_index]
# y_train, y_test = y[train_index], y[test_index]
#
# neigh.fit(X_train, y_train)
# accuracy_score(neigh.predict(X_test), y_test)
#