In [8]:
import os
import numpy as np
import pandas as pd
rng = np.random.RandomState(42)
In [3]:
# NOTE(review): the file has an .xls extension but is read with read_csv —
# presumably it is really a comma-separated text file with a misleading
# extension (the Out[4] table confirms it parses); confirm with the data owner.
df = pd.read_csv("diabetes.xls")
In [4]:
# Display the raw frame for a first look (Jupyter renders head/tail; 768 rows x 9 columns).
df
Out[4]:
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | Pedigree | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 0 |
| 764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 0 |
| 765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0 |
| 766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 1 |
| 767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 0 |
768 rows × 9 columns
In [6]:
# Quick numeric EDA: describe() transposed for readability, plus the five
# standard quantiles of every numeric column.
print("PLOT: распределения, корреляции, 3D scatter трех признаков")
num_df = df.select_dtypes(include=[np.number])
# (Removed an unused `nonnum_df` non-numeric selection — it was never read.)
print("DESCRIBE (numeric):")
print(num_df.describe().T)
q = num_df.quantile([0, 0.25, 0.5, 0.75, 1.0])
print("\nQUANTILES (numeric, 0, 0.25, 0.5, 0.75, 1.0):")
print(q)
PLOT: распределения, корреляции, 3D scatter трех признаков
DESCRIBE (numeric):
count mean std min 25% 50% \
Pregnancies 768.0 3.845052 3.369578 0.000 1.00000 3.0000
Glucose 768.0 120.894531 31.972618 0.000 99.00000 117.0000
BloodPressure 768.0 69.105469 19.355807 0.000 62.00000 72.0000
SkinThickness 768.0 20.536458 15.952218 0.000 0.00000 23.0000
Insulin 768.0 79.799479 115.244002 0.000 0.00000 30.5000
BMI 768.0 31.992578 7.884160 0.000 27.30000 32.0000
Pedigree 768.0 0.471876 0.331329 0.078 0.24375 0.3725
Age 768.0 33.240885 11.760232 21.000 24.00000 29.0000
Outcome 768.0 0.348958 0.476951 0.000 0.00000 0.0000
75% max
Pregnancies 6.00000 17.00
Glucose 140.25000 199.00
BloodPressure 80.00000 122.00
SkinThickness 32.00000 99.00
Insulin 127.25000 846.00
BMI 36.60000 67.10
Pedigree 0.62625 2.42
Age 41.00000 81.00
Outcome 1.00000 1.00
QUANTILES (numeric, 0, 0.25, 0.5, 0.75, 1.0):
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \
0.00 0.0 0.00 0.0 0.0 0.00 0.0
0.25 1.0 99.00 62.0 0.0 0.00 27.3
0.50 3.0 117.00 72.0 23.0 30.50 32.0
0.75 6.0 140.25 80.0 32.0 127.25 36.6
1.00 17.0 199.00 122.0 99.0 846.00 67.1
Pedigree Age Outcome
0.00 0.07800 21.0 0.0
0.25 0.24375 24.0 0.0
0.50 0.37250 29.0 0.0
0.75 0.62625 41.0 1.0
1.00 2.42000 81.0 1.0
In [9]:
# Preprocessing: coerce to numeric, treat physiologically impossible zeros as
# missing, split train/test BEFORE imputing (train-only medians -> no test
# leakage), then z-score standardize with train statistics.
df = df.copy()
features_all = [c for c in df.columns if c != "Outcome"]
target_col = "Outcome"

# Coerce every column to numeric; unparseable entries become NaN.
for c in df.columns:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# A zero in these columns is a recording artifact, not a real measurement.
cols_zero_as_missing = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
for c in cols_zero_as_missing:
    if c in df.columns:
        df.loc[df[c] == 0, c] = np.nan

# Target must be complete and integer-typed; fill any gaps with the mode.
df[target_col] = df[target_col].fillna(df[target_col].mode().iloc[0]).astype(int)

# (Removed a dead `df[c] = df[c]` loop that was here — it was a no-op.
# Feature NaNs are imputed below, after the split, with train-only medians.)

X_all = df[features_all].values.astype(float)
y_all = df[target_col].values.astype(int)

# 80/20 shuffled split using the notebook-level seeded RNG (reproducible).
N = X_all.shape[0]
idx = np.arange(N)
rng.shuffle(idx)
train_size = int(0.8 * N)
train_idx = idx[:train_size]
test_idx = idx[train_size:]
X_train_raw = X_all[train_idx]
y_train = y_all[train_idx]
X_test_raw = X_all[test_idx]
y_test = y_all[test_idx]

train_df = pd.DataFrame(X_train_raw, columns=features_all)
test_df = pd.DataFrame(X_test_raw, columns=features_all)

# Impute the zero-as-missing columns with the TRAIN median, applied to both splits.
for c in cols_zero_as_missing:
    if c in train_df.columns:
        med = np.nanmedian(train_df[c].values)
        train_df[c] = train_df[c].fillna(med)
        test_df[c] = test_df[c].fillna(med)

# Safety net: if any feature still contains non-finite values (NaN/inf),
# fill with the train median so distance computations never see them.
for c in features_all:
    if not np.isfinite(train_df[c]).all():
        med = np.nanmedian(train_df[c].values)
        train_df[c] = train_df[c].fillna(med)
    if not np.isfinite(test_df[c]).all():
        med = np.nanmedian(train_df[c].values)
        test_df[c] = test_df[c].fillna(med)

X_train = train_df.values.astype(float)
X_test = test_df.values.astype(float)

# Standardize with train-set statistics; guard against zero-variance columns.
mu = X_train.mean(axis=0)
sigma = X_train.std(axis=0)
sigma[sigma == 0] = 1.0
X_train_z = (X_train - mu) / sigma
X_test_z = (X_test - mu) / sigma
print("Train shape:", X_train_z.shape, "Test shape:", X_test_z.shape)
Train shape: (614, 8) Test shape: (154, 8)
In [10]:
def knn_predict(X_train, y_train, X_test, k):
    """Plain k-nearest-neighbours classifier (Euclidean distance, 0/1 labels).

    Parameters
    ----------
    X_train : (n_train, d) float array of training features.
    y_train : (n_train,) int array of 0/1 labels.
    X_test : (n_test, d) float array of query points.
    k : int, number of neighbours; 1 <= k <= n_train.

    Returns
    -------
    (n_test,) int array of predicted 0/1 labels. Vote ties (possible for
    even k) are broken by the label of the single nearest neighbour.
    """
    preds = np.zeros(X_test.shape[0], dtype=int)
    for i in range(X_test.shape[0]):
        d = np.sqrt(((X_train - X_test[i]) ** 2).sum(axis=1))
        # Fix: kth=k-1 (not k) selects the same k smallest distances but
        # also makes k == n_train valid; argpartition(d, k) would raise
        # "kth out of bounds" in that case.
        nn_idx = np.argpartition(d, k - 1)[:k]
        votes = y_train[nn_idx]
        s1 = (votes == 1).sum()
        s0 = k - s1
        if s1 > s0:
            preds[i] = 1
        elif s0 > s1:
            preds[i] = 0
        else:
            # Tie: fall back to the closest of the k neighbours.
            tie = nn_idx[np.argmin(d[nn_idx])]
            preds[i] = y_train[tie]
    return preds
def confusion_and_metrics(y_true, y_pred):
    """Binary confusion counts plus accuracy / precision / recall.

    Denominators are clamped to at least 1, so an empty input or a run with
    no positive predictions yields 0 rather than a ZeroDivisionError.

    Returns (tp, tn, fp, fn, accuracy, precision, recall).
    """
    pred_pos = y_pred == 1
    pred_neg = y_pred == 0
    true_pos = y_true == 1
    true_neg = y_true == 0
    tp = int(np.count_nonzero(pred_pos & true_pos))
    tn = int(np.count_nonzero(pred_neg & true_neg))
    fp = int(np.count_nonzero(pred_pos & true_neg))
    fn = int(np.count_nonzero(pred_neg & true_pos))
    acc = (tp + tn) / max(1, len(y_true))
    prec = tp / max(1, tp + fp)
    rec = tp / max(1, tp + fn)
    return tp, tn, fp, fn, acc, prec, rec
In [11]:
# Model 2: a hand-picked feature subset, evaluated at several values of k.
fixed_features = ["Glucose", "BMI", "Age", "Pregnancies"]
fixed_idx = [features_all.index(c) for c in fixed_features]
Xtr_fix = X_train_z[:, fixed_idx]
Xte_fix = X_test_z[:, fixed_idx]
k_list = [3, 5, 10]
results_fixed = []
for k in k_list:
    predictions = knn_predict(Xtr_fix, y_train, Xte_fix, k)
    metrics = confusion_and_metrics(y_test, predictions)
    tp, tn, fp, fn, acc, prec, rec = metrics
    results_fixed.append((k,) + metrics)
    print("Model 2 (fixed), k=", k)
    print(f"Confusion: TN={tn}, FP={fp}, FN={fn}, TP={tp}")
    print("Accuracy:", round(acc,4), "Precision:", round(prec,4), "Recall:", round(rec,4))
Model 2 (fixed), k= 3 Confusion: TN=74, FP=22, FN=26, TP=32 Accuracy: 0.6883 Precision: 0.5926 Recall: 0.5517 Model 2 (fixed), k= 5 Confusion: TN=79, FP=17, FN=25, TP=33 Accuracy: 0.7273 Precision: 0.66 Recall: 0.569 Model 2 (fixed), k= 10 Confusion: TN=82, FP=14, FN=23, TP=35 Accuracy: 0.7597 Precision: 0.7143 Recall: 0.6034
In [12]:
# Model 1: a random feature subset of the same size, as a baseline comparison.
n_feat = len(features_all)
rand_size = 4
# RandomState.choice with an int first argument is equivalent to arange(n).
rand_idx = rng.choice(n_feat, size=rand_size, replace=False)
rand_features = [features_all[i] for i in rand_idx]
Xtr_rand = X_train_z[:, rand_idx]
Xte_rand = X_test_z[:, rand_idx]
results_rand = []
for k in k_list:
    predictions = knn_predict(Xtr_rand, y_train, Xte_rand, k)
    metrics = confusion_and_metrics(y_test, predictions)
    tp, tn, fp, fn, acc, prec, rec = metrics
    results_rand.append((k,) + metrics)
    print("Model 1 (random), features:", rand_features, "k=", k)
    print(f"Confusion: TN={tn}, FP={fp}, FN={fn}, TP={tp}")
    print("Accuracy:", round(acc,4), "Precision:", round(prec,4), "Recall:", round(rec,4))
Model 1 (random), features: ['Pedigree', 'Age', 'Glucose', 'SkinThickness'] k= 3 Confusion: TN=68, FP=28, FN=25, TP=33 Accuracy: 0.6558 Precision: 0.541 Recall: 0.569 Model 1 (random), features: ['Pedigree', 'Age', 'Glucose', 'SkinThickness'] k= 5 Confusion: TN=72, FP=24, FN=28, TP=30 Accuracy: 0.6623 Precision: 0.5556 Recall: 0.5172 Model 1 (random), features: ['Pedigree', 'Age', 'Glucose', 'SkinThickness'] k= 10 Confusion: TN=78, FP=18, FN=22, TP=36 Accuracy: 0.7403 Precision: 0.6667 Recall: 0.6207
In [13]:
# Collect every run into one tidy comparison table (fixed rows first, as before).
records = []
for model_name, feats, result_list in (
    ("Model_2_fixed", fixed_features, results_fixed),
    ("Model_1_random", rand_features, results_rand),
):
    feat_str = ",".join(feats)
    for k, tp, tn, fp, fn, acc, prec, rec in result_list:
        records.append([model_name, feat_str, k, acc, prec, rec, tp, tn, fp, fn])
summary = pd.DataFrame(records, columns=["model","features","k","accuracy","precision","recall","TP","TN","FP","FN"])
print(summary.sort_values(["model","k"]).reset_index(drop=True))
model features k accuracy \ 0 Model_1_random Pedigree,Age,Glucose,SkinThickness 3 0.655844 1 Model_1_random Pedigree,Age,Glucose,SkinThickness 5 0.662338 2 Model_1_random Pedigree,Age,Glucose,SkinThickness 10 0.740260 3 Model_2_fixed Glucose,BMI,Age,Pregnancies 3 0.688312 4 Model_2_fixed Glucose,BMI,Age,Pregnancies 5 0.727273 5 Model_2_fixed Glucose,BMI,Age,Pregnancies 10 0.759740 precision recall TP TN FP FN 0 0.540984 0.568966 33 68 28 25 1 0.555556 0.517241 30 72 24 28 2 0.666667 0.620690 36 78 18 22 3 0.592593 0.551724 32 74 22 26 4 0.660000 0.568966 33 79 17 25 5 0.714286 0.603448 35 82 14 23
In [14]:
import matplotlib.pyplot as plt

# Histograms of every numeric column, laid out three per row.
numeric_df = df.select_dtypes(include=[np.number])
numeric_cols = list(numeric_df.columns)
n_rows = int(np.ceil(len(numeric_cols) / 3))
plt.figure(figsize=(14, 4 * n_rows))
for pos, col in enumerate(numeric_cols, start=1):
    plt.subplot(n_rows, 3, pos)
    plt.hist(numeric_df[col].dropna().values, bins=30)
    plt.title(col)
plt.tight_layout()
plt.show()

# Pairwise correlation heatmap of the same numeric columns.
corr_mat = numeric_df.corr()
plt.figure(figsize=(8,6))
plt.imshow(corr_mat.values, aspect='auto')
plt.colorbar()
plt.xticks(range(len(corr_mat.columns)), corr_mat.columns, rotation=90)
plt.yticks(range(len(corr_mat.index)), corr_mat.index)
plt.title("Correlation matrix")
plt.tight_layout()
plt.show()
In [15]:
# Kept for compatibility: on older matplotlib this import registers the '3d' projection.
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of three features on the test split, coloured by the true outcome.
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(
    test_df["Glucose"].values,
    test_df["BMI"].values,
    test_df["Age"].values,
    c=y_test,
    s=15,
    alpha=0.7,
)
ax.set_xlabel("Glucose")
ax.set_ylabel("BMI")
ax.set_zlabel("Age")
plt.title("3D scatter (test)")
plt.tight_layout()
plt.show()
In [16]:
# Accuracy-vs-k comparison of the two feature subsets.
# Result tuples are (k, tp, tn, fp, fn, acc, prec, rec): index 0 is k, 5 is acc.
# (Removed a dead `acc_rand = []` initialization that was immediately
# overwritten, and replaced the manual accumulation loop with comprehensions.)
k_vals = [row[0] for row in results_fixed]
acc_fix = [row[5] for row in results_fixed]
# Look the random-model accuracies up by k so the two curves stay aligned
# even if the result lists were built in a different order.
lookup_rand = {row[0]: row[5] for row in results_rand}
acc_rand = [lookup_rand[k] for k in k_vals]
plt.figure(figsize=(7,4))
plt.plot(k_vals, acc_fix, marker='o', label='fixed')
plt.plot(k_vals, acc_rand, marker='o', label='random')
plt.xlabel("k")
plt.ylabel("Accuracy")
plt.title("Accuracy vs k")
plt.legend()
plt.tight_layout()
plt.show()
In [17]:
# Confusion-matrix heatmap for the chosen final model (fixed features, k=5).
k_sel = 5
y_pred_sel = knn_predict(Xtr_fix, y_train, Xte_fix, k_sel)
# Reuse confusion_and_metrics instead of re-deriving the four counts inline
# (the original duplicated that logic and risked drifting out of sync).
tp, tn, fp, fn, _, _, _ = confusion_and_metrics(y_test, y_pred_sel)
cm = np.array([[tn, fp], [fn, tp]])
plt.figure(figsize=(4,4))
plt.imshow(cm, cmap='Blues')
plt.title("Confusion Matrix (Model 2, k=5)")
plt.xticks([0,1], ["Pred 0","Pred 1"])
plt.yticks([0,1], ["True 0","True 1"])
# Annotate each cell with its count so the figure stands alone.
for i in range(2):
    for j in range(2):
        plt.text(j, i, cm[i,j], ha='center', va='center')
plt.tight_layout()
plt.show()
In [ ]: