In [8]:
import os
import numpy as np
import pandas as pd
rng = np.random.RandomState(42)
In [3]:
# NOTE(review): the file has an .xls extension but is read with read_csv —
# presumably it is really a comma-separated text file with a misleading
# extension (the Out[4] table confirms it parses); confirm with the data owner.
df = pd.read_csv("diabetes.xls")
In [4]:
# Display the raw frame for a first look (Jupyter renders head/tail; 768 rows x 9 columns).
df
Out[4]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | Pedigree | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 0 |
| 764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 0 |
| 765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0 |
| 766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 1 |
| 767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 0 |
768 rows × 9 columns
In [6]:
# Quick numeric EDA: describe() transposed for readability, plus the five
# standard quantiles of every numeric column.
print("PLOT: распределения, корреляции, 3D scatter трех признаков")
num_df = df.select_dtypes(include=[np.number])
# (Removed an unused `nonnum_df` non-numeric selection — it was never read.)
print("DESCRIBE (numeric):")
print(num_df.describe().T)
q = num_df.quantile([0, 0.25, 0.5, 0.75, 1.0])
print("\nQUANTILES (numeric, 0, 0.25, 0.5, 0.75, 1.0):")
print(q)
PLOT: распределения, корреляции, 3D scatter трех признаков
DESCRIBE (numeric):
count mean std min 25% 50% \
Pregnancies 768.0 3.845052 3.369578 0.000 1.00000 3.0000
Glucose 768.0 120.894531 31.972618 0.000 99.00000 117.0000
BloodPressure 768.0 69.105469 19.355807 0.000 62.00000 72.0000
SkinThickness 768.0 20.536458 15.952218 0.000 0.00000 23.0000
Insulin 768.0 79.799479 115.244002 0.000 0.00000 30.5000
BMI 768.0 31.992578 7.884160 0.000 27.30000 32.0000
Pedigree 768.0 0.471876 0.331329 0.078 0.24375 0.3725
Age 768.0 33.240885 11.760232 21.000 24.00000 29.0000
Outcome 768.0 0.348958 0.476951 0.000 0.00000 0.0000
75% max
Pregnancies 6.00000 17.00
Glucose 140.25000 199.00
BloodPressure 80.00000 122.00
SkinThickness 32.00000 99.00
Insulin 127.25000 846.00
BMI 36.60000 67.10
Pedigree 0.62625 2.42
Age 41.00000 81.00
Outcome 1.00000 1.00
QUANTILES (numeric, 0, 0.25, 0.5, 0.75, 1.0):
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \
0.00 0.0 0.00 0.0 0.0 0.00 0.0
0.25 1.0 99.00 62.0 0.0 0.00 27.3
0.50 3.0 117.00 72.0 23.0 30.50 32.0
0.75 6.0 140.25 80.0 32.0 127.25 36.6
1.00 17.0 199.00 122.0 99.0 846.00 67.1
Pedigree Age Outcome
0.00 0.07800 21.0 0.0
0.25 0.24375 24.0 0.0
0.50 0.37250 29.0 0.0
0.75 0.62625 41.0 1.0
1.00 2.42000 81.0 1.0
In [9]:
# Preprocessing: coerce to numeric, treat physiologically impossible zeros as
# missing, split train/test BEFORE imputing (train-only medians -> no test
# leakage), then z-score standardize with train statistics.
df = df.copy()
features_all = [c for c in df.columns if c != "Outcome"]
target_col = "Outcome"

# Coerce every column to numeric; unparseable entries become NaN.
for c in df.columns:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# A zero in these columns is a recording artifact, not a real measurement.
cols_zero_as_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
for c in cols_zero_as_missing:
    if c in df.columns:
        df.loc[df[c] == 0, c] = np.nan

# Target must be complete and integer-typed; fill any gaps with the mode.
df[target_col] = df[target_col].fillna(df[target_col].mode().iloc[0]).astype(int)

# (Removed a dead `df[c] = df[c]` loop that was here — it was a no-op.
# Feature NaNs are imputed below, after the split, with train-only medians.)

X_all = df[features_all].values.astype(float)
y_all = df[target_col].values.astype(int)

# 80/20 shuffled split using the notebook-level seeded RNG (reproducible).
N = X_all.shape[0]
idx = np.arange(N)
rng.shuffle(idx)
train_size = int(0.8 * N)
train_idx = idx[:train_size]
test_idx = idx[train_size:]
X_train_raw = X_all[train_idx]
y_train = y_all[train_idx]
X_test_raw = X_all[test_idx]
y_test = y_all[test_idx]

train_df = pd.DataFrame(X_train_raw, columns=features_all)
test_df = pd.DataFrame(X_test_raw, columns=features_all)

# Impute the zero-as-missing columns with the TRAIN median, applied to both splits.
for c in cols_zero_as_missing:
    if c in train_df.columns:
        med = np.nanmedian(train_df[c].values)
        train_df[c] = train_df[c].fillna(med)
        test_df[c] = test_df[c].fillna(med)

# Safety net: if any feature still contains non-finite values (NaN/inf),
# fill with the train median so distance computations never see them.
for c in features_all:
    if not np.isfinite(train_df[c]).all():
        med = np.nanmedian(train_df[c].values)
        train_df[c] = train_df[c].fillna(med)
    if not np.isfinite(test_df[c]).all():
        med = np.nanmedian(train_df[c].values)
        test_df[c] = test_df[c].fillna(med)

X_train = train_df.values.astype(float)
X_test = test_df.values.astype(float)

# Standardize with train-set statistics; guard against zero-variance columns.
mu = X_train.mean(axis=0)
sigma = X_train.std(axis=0)
sigma[sigma == 0] = 1.0
X_train_z = (X_train - mu) / sigma
X_test_z = (X_test - mu) / sigma
print("Train shape:", X_train_z.shape, "Test shape:", X_test_z.shape)
Train shape: (614, 8) Test shape: (154, 8)
In [10]:
def knn_predict(X_train, y_train, X_test, k):
    """Plain k-nearest-neighbours classifier (Euclidean distance, 0/1 labels).

    Parameters
    ----------
    X_train : (n_train, d) float array of training features.
    y_train : (n_train,) int array of 0/1 labels.
    X_test : (n_test, d) float array of query points.
    k : int, number of neighbours; 1 <= k <= n_train.

    Returns
    -------
    (n_test,) int array of predicted 0/1 labels. Vote ties (possible for
    even k) are broken by the label of the single nearest neighbour.
    """
    preds = np.zeros(X_test.shape[0], dtype=int)
    for i in range(X_test.shape[0]):
        d = np.sqrt(((X_train - X_test[i]) ** 2).sum(axis=1))
        # Fix: kth=k-1 (not k) selects the same k smallest distances but
        # also makes k == n_train valid; argpartition(d, k) would raise
        # "kth out of bounds" in that case.
        nn_idx = np.argpartition(d, k - 1)[:k]
        votes = y_train[nn_idx]
        s1 = (votes == 1).sum()
        s0 = k - s1
        if s1 > s0:
            preds[i] = 1
        elif s0 > s1:
            preds[i] = 0
        else:
            # Tie: fall back to the closest of the k neighbours.
            tie = nn_idx[np.argmin(d[nn_idx])]
            preds[i] = y_train[tie]
    return preds
def confusion_and_metrics(y_true, y_pred):
    """Binary confusion counts plus accuracy / precision / recall.

    Denominators are clamped to at least 1, so an empty input or a run with
    no positive predictions yields 0 rather than a ZeroDivisionError.

    Returns (tp, tn, fp, fn, accuracy, precision, recall).
    """
    pred_pos = y_pred == 1
    pred_neg = y_pred == 0
    true_pos = y_true == 1
    true_neg = y_true == 0
    tp = int(np.count_nonzero(pred_pos & true_pos))
    tn = int(np.count_nonzero(pred_neg & true_neg))
    fp = int(np.count_nonzero(pred_pos & true_neg))
    fn = int(np.count_nonzero(pred_neg & true_pos))
    acc = (tp + tn) / max(1, len(y_true))
    prec = tp / max(1, tp + fp)
    rec = tp / max(1, tp + fn)
    return tp, tn, fp, fn, acc, prec, rec
In [11]:
# Model 2: a hand-picked feature subset, evaluated at several values of k.
fixed_features = ["Glucose", "BMI", "Age", "Pregnancies"]
fixed_idx = [features_all.index(c) for c in fixed_features]
Xtr_fix = X_train_z[:, fixed_idx]
Xte_fix = X_test_z[:, fixed_idx]
k_list = [3, 5, 10]
results_fixed = []
for k in k_list:
    predictions = knn_predict(Xtr_fix, y_train, Xte_fix, k)
    metrics = confusion_and_metrics(y_test, predictions)
    tp, tn, fp, fn, acc, prec, rec = metrics
    results_fixed.append((k,) + metrics)
    print("Model 2 (fixed), k=", k)
    print(f"Confusion: TN={tn}, FP={fp}, FN={fn}, TP={tp}")
    print("Accuracy:", round(acc,4), "Precision:", round(prec,4), "Recall:", round(rec,4))
Model 2 (fixed), k= 3 Confusion: TN=74, FP=22, FN=26, TP=32 Accuracy: 0.6883 Precision: 0.5926 Recall: 0.5517 Model 2 (fixed), k= 5 Confusion: TN=79, FP=17, FN=25, TP=33 Accuracy: 0.7273 Precision: 0.66 Recall: 0.569 Model 2 (fixed), k= 10 Confusion: TN=82, FP=14, FN=23, TP=35 Accuracy: 0.7597 Precision: 0.7143 Recall: 0.6034
In [12]:
# Model 1: a random feature subset of the same size, as a baseline comparison.
n_feat = len(features_all)
rand_size = 4
# RandomState.choice with an int first argument is equivalent to arange(n).
rand_idx = rng.choice(n_feat, size=rand_size, replace=False)
rand_features = [features_all[i] for i in rand_idx]
Xtr_rand = X_train_z[:, rand_idx]
Xte_rand = X_test_z[:, rand_idx]
results_rand = []
for k in k_list:
    predictions = knn_predict(Xtr_rand, y_train, Xte_rand, k)
    metrics = confusion_and_metrics(y_test, predictions)
    tp, tn, fp, fn, acc, prec, rec = metrics
    results_rand.append((k,) + metrics)
    print("Model 1 (random), features:", rand_features, "k=", k)
    print(f"Confusion: TN={tn}, FP={fp}, FN={fn}, TP={tp}")
    print("Accuracy:", round(acc,4), "Precision:", round(prec,4), "Recall:", round(rec,4))
Model 1 (random), features: ['Pedigree', 'Age', 'Glucose', 'SkinThickness'] k= 3 Confusion: TN=68, FP=28, FN=25, TP=33 Accuracy: 0.6558 Precision: 0.541 Recall: 0.569 Model 1 (random), features: ['Pedigree', 'Age', 'Glucose', 'SkinThickness'] k= 5 Confusion: TN=72, FP=24, FN=28, TP=30 Accuracy: 0.6623 Precision: 0.5556 Recall: 0.5172 Model 1 (random), features: ['Pedigree', 'Age', 'Glucose', 'SkinThickness'] k= 10 Confusion: TN=78, FP=18, FN=22, TP=36 Accuracy: 0.7403 Precision: 0.6667 Recall: 0.6207
In [13]:
# Collect every run into one tidy comparison table (fixed rows first, as before).
records = []
for model_name, feats, result_list in (
    ("Model_2_fixed", fixed_features, results_fixed),
    ("Model_1_random", rand_features, results_rand),
):
    feat_str = ",".join(feats)
    for k, tp, tn, fp, fn, acc, prec, rec in result_list:
        records.append([model_name, feat_str, k, acc, prec, rec, tp, tn, fp, fn])
summary = pd.DataFrame(records, columns=["model","features","k","accuracy","precision","recall","TP","TN","FP","FN"])
print(summary.sort_values(["model","k"]).reset_index(drop=True))
model features k accuracy \ 0 Model_1_random Pedigree,Age,Glucose,SkinThickness 3 0.655844 1 Model_1_random Pedigree,Age,Glucose,SkinThickness 5 0.662338 2 Model_1_random Pedigree,Age,Glucose,SkinThickness 10 0.740260 3 Model_2_fixed Glucose,BMI,Age,Pregnancies 3 0.688312 4 Model_2_fixed Glucose,BMI,Age,Pregnancies 5 0.727273 5 Model_2_fixed Glucose,BMI,Age,Pregnancies 10 0.759740 precision recall TP TN FP FN 0 0.540984 0.568966 33 68 28 25 1 0.555556 0.517241 30 72 24 28 2 0.666667 0.620690 36 78 18 22 3 0.592593 0.551724 32 74 22 26 4 0.660000 0.568966 33 79 17 25 5 0.714286 0.603448 35 82 14 23
In [14]:
import matplotlib.pyplot as plt

# Histograms of every numeric column, laid out three per row.
numeric_df = df.select_dtypes(include=[np.number])
numeric_cols = list(numeric_df.columns)
n_rows = int(np.ceil(len(numeric_cols) / 3))
plt.figure(figsize=(14, 4 * n_rows))
for pos, col in enumerate(numeric_cols, start=1):
    plt.subplot(n_rows, 3, pos)
    plt.hist(numeric_df[col].dropna().values, bins=30)
    plt.title(col)
plt.tight_layout()
plt.show()

# Pairwise correlation heatmap of the same numeric columns.
corr_mat = numeric_df.corr()
plt.figure(figsize=(8,6))
plt.imshow(corr_mat.values, aspect='auto')
plt.colorbar()
plt.xticks(range(len(corr_mat.columns)), corr_mat.columns, rotation=90)
plt.yticks(range(len(corr_mat.index)), corr_mat.index)
plt.title("Correlation matrix")
plt.tight_layout()
plt.show()
In [15]:
# Kept for compatibility: on older matplotlib this import registers the '3d' projection.
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of three features on the test split, coloured by the true outcome.
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    test_df["Glucose"].values,
    test_df["BMI"].values,
    test_df["Age"].values,
    c=y_test,
    s=15,
    alpha=0.7,
)
ax.set_xlabel("Glucose")
ax.set_ylabel("BMI")
ax.set_zlabel("Age")
plt.title("3D scatter (test)")
plt.tight_layout()
plt.show()
In [16]:
# Accuracy-vs-k comparison of the two feature subsets.
# Result tuples are (k, tp, tn, fp, fn, acc, prec, rec): index 0 is k, 5 is acc.
# (Removed a dead `acc_rand = []` initialization that was immediately
# overwritten, and replaced the manual accumulation loop with comprehensions.)
k_vals = [row[0] for row in results_fixed]
acc_fix = [row[5] for row in results_fixed]
# Look the random-model accuracies up by k so the two curves stay aligned
# even if the result lists were built in a different order.
lookup_rand = {row[0]: row[5] for row in results_rand}
acc_rand = [lookup_rand[k] for k in k_vals]
plt.figure(figsize=(7,4))
plt.plot(k_vals, acc_fix, marker='o', label='fixed')
plt.plot(k_vals, acc_rand, marker='o', label='random')
plt.xlabel("k")
plt.ylabel("Accuracy")
plt.title("Accuracy vs k")
plt.legend()
plt.tight_layout()
plt.show()
In [17]:
# Confusion-matrix heatmap for the chosen final model (fixed features, k=5).
k_sel = 5
y_pred_sel = knn_predict(Xtr_fix, y_train, Xte_fix, k_sel)
# Reuse confusion_and_metrics instead of re-deriving the four counts inline
# (the original duplicated that logic and risked drifting out of sync).
tp, tn, fp, fn, _, _, _ = confusion_and_metrics(y_test, y_pred_sel)
cm = np.array([[tn, fp], [fn, tp]])
plt.figure(figsize=(4,4))
plt.imshow(cm, cmap='Blues')
plt.title("Confusion Matrix (Model 2, k=5)")
plt.xticks([0,1], ["Pred 0","Pred 1"])
plt.yticks([0,1], ["True 0","True 1"])
# Annotate each cell with its count so the figure stands alone.
for i in range(2):
    for j in range(2):
        plt.text(j, i, cm[i,j], ha='center', va='center')
plt.tight_layout()
plt.show()
In [ ]: