In [8]:
import os
import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
In [3]:
# NOTE(review): the file is named with an .xls extension but is parsed with
# read_csv, and the read succeeds — presumably it is a plain CSV with a
# misleading extension; confirm with the data provider.
df = pd.read_csv("diabetes.xls")
In [4]:
# Display the loaded frame (pandas truncates the rendered output to head/tail).
df
Out[4]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI Pedigree Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
... ... ... ... ... ... ... ... ... ...
763 10 101 76 48 180 32.9 0.171 63 0
764 2 122 70 27 0 36.8 0.340 27 0
765 5 121 72 23 112 26.2 0.245 30 0
766 1 126 60 0 0 30.1 0.349 47 1
767 1 93 70 31 0 30.4 0.315 23 0

768 rows × 9 columns

In [6]:
# Quick numeric EDA: split off numeric columns, show describe() transposed,
# then a five-point quantile table for every numeric feature.
print("PLOT: распределения, корреляции, 3D scatter трех признаков")

num_df = df.select_dtypes(include=[np.number])
nonnum_df = df.select_dtypes(exclude=[np.number])

print("DESCRIBE (numeric):")
print(num_df.describe().T)

quantile_levels = [0, 0.25, 0.5, 0.75, 1.0]
q = num_df.quantile(quantile_levels)
print("\nQUANTILES (numeric, 0, 0.25, 0.5, 0.75, 1.0):")
print(q)
PLOT: распределения, корреляции, 3D scatter трех признаков
DESCRIBE (numeric):
               count        mean         std     min       25%       50%  \
Pregnancies    768.0    3.845052    3.369578   0.000   1.00000    3.0000   
Glucose        768.0  120.894531   31.972618   0.000  99.00000  117.0000   
BloodPressure  768.0   69.105469   19.355807   0.000  62.00000   72.0000   
SkinThickness  768.0   20.536458   15.952218   0.000   0.00000   23.0000   
Insulin        768.0   79.799479  115.244002   0.000   0.00000   30.5000   
BMI            768.0   31.992578    7.884160   0.000  27.30000   32.0000   
Pedigree       768.0    0.471876    0.331329   0.078   0.24375    0.3725   
Age            768.0   33.240885   11.760232  21.000  24.00000   29.0000   
Outcome        768.0    0.348958    0.476951   0.000   0.00000    0.0000   

                     75%     max  
Pregnancies      6.00000   17.00  
Glucose        140.25000  199.00  
BloodPressure   80.00000  122.00  
SkinThickness   32.00000   99.00  
Insulin        127.25000  846.00  
BMI             36.60000   67.10  
Pedigree         0.62625    2.42  
Age             41.00000   81.00  
Outcome          1.00000    1.00  

QUANTILES (numeric, 0, 0.25, 0.5, 0.75, 1.0):
      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0.00          0.0     0.00            0.0            0.0     0.00   0.0   
0.25          1.0    99.00           62.0            0.0     0.00  27.3   
0.50          3.0   117.00           72.0           23.0    30.50  32.0   
0.75          6.0   140.25           80.0           32.0   127.25  36.6   
1.00         17.0   199.00          122.0           99.0   846.00  67.1   

      Pedigree   Age  Outcome  
0.00   0.07800  21.0      0.0  
0.25   0.24375  24.0      0.0  
0.50   0.37250  29.0      0.0  
0.75   0.62625  41.0      1.0  
1.00   2.42000  81.0      1.0  
In [9]:
# Preprocessing: coerce everything to numeric, treat physiologically-impossible
# zeros as missing, make an 80/20 shuffled split, impute with TRAIN-only
# medians (no test leakage), and z-score standardize with TRAIN statistics.
df = df.copy()

features_all = [c for c in df.columns if c != "Outcome"]
target_col = "Outcome"

# Force every column numeric; unparseable entries become NaN.
for c in df.columns:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# In this dataset a zero in these columns encodes a missing measurement
# (e.g. BloodPressure == 0 is not a real reading).
cols_zero_as_missing = ["Glucose","BloodPressure","SkinThickness","Insulin","BMI"]
for c in cols_zero_as_missing:
    if c in df.columns:
        df.loc[df[c] == 0, c] = np.nan

# The target must be complete; fill any coercion NaNs with the mode.
# (A former loop here did `df[c] = df[c]` per feature — a no-op — and was removed;
# feature NaNs are imputed below, after the split, from train medians only.)
df[target_col] = df[target_col].fillna(df[target_col].mode().iloc[0]).astype(int)

X_all = df[features_all].values.astype(float)
y_all = df[target_col].values.astype(int)

# Shuffled 80/20 train/test split (rng is seeded in the first cell).
N = X_all.shape[0]
idx = np.arange(N)
rng.shuffle(idx)
train_size = int(0.8 * N)
train_idx = idx[:train_size]
test_idx = idx[train_size:]

X_train_raw = X_all[train_idx]
y_train = y_all[train_idx]
X_test_raw = X_all[test_idx]
y_test = y_all[test_idx]

train_df = pd.DataFrame(X_train_raw, columns=features_all)
test_df = pd.DataFrame(X_test_raw, columns=features_all)

# Impute the zero-as-missing columns with the TRAIN median in both splits.
for c in cols_zero_as_missing:
    if c in train_df.columns:
        med = np.nanmedian(train_df[c].values)
        train_df[c] = train_df[c].fillna(med)
        test_df[c] = test_df[c].fillna(med)

# Safety net: any remaining non-finite value in ANY feature is also filled
# with that feature's train median.
for c in features_all:
    if not np.isfinite(train_df[c]).all():
        med = np.nanmedian(train_df[c].values)
        train_df[c] = train_df[c].fillna(med)
    if not np.isfinite(test_df[c]).all():
        med = np.nanmedian(train_df[c].values)
        test_df[c] = test_df[c].fillna(med)

X_train = train_df.values.astype(float)
X_test = test_df.values.astype(float)

# Standardize with train statistics; guard zero-variance columns against /0.
mu = X_train.mean(axis=0)
sigma = X_train.std(axis=0)
sigma[sigma == 0] = 1.0

X_train_z = (X_train - mu) / sigma
X_test_z = (X_test - mu) / sigma

print("Train shape:", X_train_z.shape, "Test shape:", X_test_z.shape)
Train shape: (614, 8) Test shape: (154, 8)
In [10]:
def knn_predict(X_train, y_train, X_test, k):
    """Binary k-nearest-neighbors classifier (labels 0/1, majority vote).

    Parameters
    ----------
    X_train : (n_train, d) float array of training points.
    y_train : (n_train,) int array of 0/1 labels.
    X_test  : (n_test, d) float array of points to classify.
    k       : number of neighbors; values >= n_train are clamped to n_train
              (the original implementation crashed there, since
              np.argpartition requires kth < n).

    Returns
    -------
    (n_test,) int array of predicted 0/1 labels. Vote ties are broken by
    the label of the single nearest neighbor among the k.
    """
    n_train = X_train.shape[0]
    k_eff = min(k, n_train)
    preds = np.zeros(X_test.shape[0], dtype=int)
    for i in range(X_test.shape[0]):
        # Squared Euclidean distance: same ranking as Euclidean, no sqrt needed.
        d2 = ((X_train - X_test[i]) ** 2).sum(axis=1)
        if k_eff < n_train:
            nn_idx = np.argpartition(d2, k_eff)[:k_eff]
        else:
            nn_idx = np.arange(n_train)
        votes = y_train[nn_idx]
        s1 = int((votes == 1).sum())
        s0 = k_eff - s1
        if s1 > s0:
            preds[i] = 1
        elif s0 > s1:
            preds[i] = 0
        else:
            # Tie: defer to the closest of the k neighbors.
            preds[i] = y_train[nn_idx[np.argmin(d2[nn_idx])]]
    return preds

def confusion_and_metrics(y_true, y_pred):
    """Return (tp, tn, fp, fn, accuracy, precision, recall) for binary 0/1 labels.

    Denominators are clamped to at least 1 so that an empty positive class
    yields 0.0 instead of raising ZeroDivisionError.
    """
    pred_pos = (y_pred == 1)
    pred_neg = (y_pred == 0)
    true_pos = (y_true == 1)
    true_neg = (y_true == 0)

    tp = int(np.count_nonzero(pred_pos & true_pos))
    tn = int(np.count_nonzero(pred_neg & true_neg))
    fp = int(np.count_nonzero(pred_pos & true_neg))
    fn = int(np.count_nonzero(pred_neg & true_pos))

    acc = (tp + tn) / max(1, len(y_true))
    prec = tp / max(1, tp + fp)
    rec = tp / max(1, tp + fn)
    return tp, tn, fp, fn, acc, prec, rec
In [11]:
# Model 2: kNN on a hand-picked, domain-informed feature subset,
# evaluated on the held-out test split for several k.
fixed_features = ["Glucose","BMI","Age","Pregnancies"]
fixed_idx = [features_all.index(c) for c in fixed_features]

Xtr_fix = X_train_z[:, fixed_idx]
Xte_fix = X_test_z[:, fixed_idx]

k_list = [3,5,10]
results_fixed = []
for k in k_list:
    y_pred = knn_predict(Xtr_fix, y_train, Xte_fix, k)
    metrics = confusion_and_metrics(y_test, y_pred)
    tp, tn, fp, fn, acc, prec, rec = metrics
    results_fixed.append((k,) + metrics)
    print("Model 2 (fixed), k=", k)
    print(f"Confusion: TN={tn}, FP={fp}, FN={fn}, TP={tp}")
    print("Accuracy:", round(acc,4), "Precision:", round(prec,4), "Recall:", round(rec,4))
Model 2 (fixed), k= 3
Confusion: TN=74, FP=22, FN=26, TP=32
Accuracy: 0.6883 Precision: 0.5926 Recall: 0.5517
Model 2 (fixed), k= 5
Confusion: TN=79, FP=17, FN=25, TP=33
Accuracy: 0.7273 Precision: 0.66 Recall: 0.569
Model 2 (fixed), k= 10
Confusion: TN=82, FP=14, FN=23, TP=35
Accuracy: 0.7597 Precision: 0.7143 Recall: 0.6034
In [12]:
# Model 1: kNN on a randomly drawn feature subset of the same size,
# as a baseline against the hand-picked set. The rng.choice call is kept
# exactly as before so the random stream (and thus the subset) is unchanged.
n_feat = len(features_all)
rand_size = 4
rand_idx = rng.choice(np.arange(n_feat), size=rand_size, replace=False)
rand_features = [features_all[i] for i in rand_idx]

Xtr_rand = X_train_z[:, rand_idx]
Xte_rand = X_test_z[:, rand_idx]

results_rand = []
for k in k_list:
    y_pred = knn_predict(Xtr_rand, y_train, Xte_rand, k)
    metrics = confusion_and_metrics(y_test, y_pred)
    tp, tn, fp, fn, acc, prec, rec = metrics
    results_rand.append((k,) + metrics)
    print("Model 1 (random), features:", rand_features, "k=", k)
    print(f"Confusion: TN={tn}, FP={fp}, FN={fn}, TP={tp}")
    print("Accuracy:", round(acc,4), "Precision:", round(prec,4), "Recall:", round(rec,4))
Model 1 (random), features: ['Pedigree', 'Age', 'Glucose', 'SkinThickness'] k= 3
Confusion: TN=68, FP=28, FN=25, TP=33
Accuracy: 0.6558 Precision: 0.541 Recall: 0.569
Model 1 (random), features: ['Pedigree', 'Age', 'Glucose', 'SkinThickness'] k= 5
Confusion: TN=72, FP=24, FN=28, TP=30
Accuracy: 0.6623 Precision: 0.5556 Recall: 0.5172
Model 1 (random), features: ['Pedigree', 'Age', 'Glucose', 'SkinThickness'] k= 10
Confusion: TN=78, FP=18, FN=22, TP=36
Accuracy: 0.7403 Precision: 0.6667 Recall: 0.6207
In [13]:
# Fold every (model, k) result into one tidy comparison table.
summary_cols = ["model","features","k","accuracy","precision","recall","TP","TN","FP","FN"]
fixed_label = ",".join(fixed_features)
rand_label = ",".join(rand_features)

rows = []
for k, tp, tn, fp, fn, acc, prec, rec in results_fixed:
    rows.append(["Model_2_fixed", fixed_label, k, acc, prec, rec, tp, tn, fp, fn])
for k, tp, tn, fp, fn, acc, prec, rec in results_rand:
    rows.append(["Model_1_random", rand_label, k, acc, prec, rec, tp, tn, fp, fn])

summary = pd.DataFrame(rows, columns=summary_cols)
print(summary.sort_values(["model","k"]).reset_index(drop=True))
            model                            features   k  accuracy  \
0  Model_1_random  Pedigree,Age,Glucose,SkinThickness   3  0.655844   
1  Model_1_random  Pedigree,Age,Glucose,SkinThickness   5  0.662338   
2  Model_1_random  Pedigree,Age,Glucose,SkinThickness  10  0.740260   
3   Model_2_fixed         Glucose,BMI,Age,Pregnancies   3  0.688312   
4   Model_2_fixed         Glucose,BMI,Age,Pregnancies   5  0.727273   
5   Model_2_fixed         Glucose,BMI,Age,Pregnancies  10  0.759740   

   precision    recall  TP  TN  FP  FN  
0   0.540984  0.568966  33  68  28  25  
1   0.555556  0.517241  30  72  24  28  
2   0.666667  0.620690  36  78  18  22  
3   0.592593  0.551724  32  74  22  26  
4   0.660000  0.568966  33  79  17  25  
5   0.714286  0.603448  35  82  14  23  
In [14]:
import matplotlib.pyplot as plt

# Per-feature histograms: one panel per numeric column, three per row.
num_df = df.select_dtypes(include=[np.number])
cols = list(num_df.columns)
n_rows = int(np.ceil(len(cols) / 3))
plt.figure(figsize=(14, 4 * n_rows))
for panel, col in enumerate(cols, 1):
    plt.subplot(n_rows, 3, panel)
    plt.hist(num_df[col].dropna().values, bins=30)
    plt.title(col)
plt.tight_layout()
plt.show()

# Pairwise correlations rendered as a heatmap with labeled axes.
corr = num_df.corr()
plt.figure(figsize=(8,6))
plt.imshow(corr.values, aspect='auto')
plt.colorbar()
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.index)), corr.index)
plt.title("Correlation matrix")
plt.tight_layout()
plt.show()
No description has been provided for this image
No description has been provided for this image
In [15]:
# 3D scatter of three features on the test split, colored by the true label.
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection on older matplotlib

fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    test_df["Glucose"].values,
    test_df["BMI"].values,
    test_df["Age"].values,
    c=y_test, s=15, alpha=0.7,
)
ax.set_xlabel("Glucose")
ax.set_ylabel("BMI")
ax.set_zlabel("Age")
plt.title("3D scatter (test)")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [16]:
# Accuracy vs k for both feature subsets on one chart.
# Each results_* row is (k, tp, tn, fp, fn, acc, prec, rec); we need fields 0 and 5.
# (The original initialized acc_rand = [] only to overwrite it immediately —
# dead assignment removed, extraction loops replaced by comprehensions.)
k_vals = [row[0] for row in results_fixed]
acc_fix = [row[5] for row in results_fixed]
lookup_rand = {row[0]: row[5] for row in results_rand}
acc_rand = [lookup_rand[k] for k in k_vals]

plt.figure(figsize=(7,4))
plt.plot(k_vals, acc_fix, marker='o', label='fixed')
plt.plot(k_vals, acc_rand, marker='o', label='random')
plt.xlabel("k")
plt.ylabel("Accuracy")
plt.title("Accuracy vs k")
plt.legend()
plt.tight_layout()
plt.show()
No description has been provided for this image
In [17]:
# Final model: Model 2 (fixed features) at k=5; confusion matrix on the test set.
k_sel = 5
y_pred_sel = knn_predict(Xtr_fix, y_train, Xte_fix, k_sel)
# Reuse the shared helper instead of re-deriving tp/tn/fp/fn inline
# (the original duplicated confusion_and_metrics' counting logic here).
tp, tn, fp, fn, acc, prec, rec = confusion_and_metrics(y_test, y_pred_sel)
cm = np.array([[tn, fp],[fn, tp]])

plt.figure(figsize=(4,4))
plt.imshow(cm, cmap='Blues')
plt.title("Confusion Matrix (Model 2, k=5)")
plt.xticks([0,1], ["Pred 0","Pred 1"])
plt.yticks([0,1], ["True 0","True 1"])
for i in range(2):
    for j in range(2):
        plt.text(j, i, cm[i,j], ha='center', va='center')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]: