import numpy as np
import pandas as pd

# Seeded generator; it drives the row shuffle of the train/test split below.
rng = np.random.RandomState(42)

# Candidate locations of the dataset; the first one that loads is used.
possible_paths = [
    "Student_Performance.csv",
]
path = None
df = None  # reset explicitly so a stale frame from an earlier run is never reused
load_errors = {}
for p in possible_paths:
    try:
        df = pd.read_csv(p)
    except (OSError, ValueError) as exc:
        # OSError: missing or unreadable file. ValueError: empty or
        # unparsable CSV (pandas' parser errors derive from ValueError).
        # Remember the reason and try the next candidate.
        load_errors[p] = exc
        continue
    path = p
    break

if df is None:
    # Fail right here with the collected reasons instead of hitting a
    # NameError on `df` in the prints below.
    details = "; ".join(
        "{}: {!r}".format(candidate, err) for candidate, err in load_errors.items()
    )
    raise FileNotFoundError(
        "Could not load the dataset from any of {} ({})".format(possible_paths, details)
    )

print("Файл:", path)
print("Загружено:", df.shape)
print("Колонки:", list(df.columns))

Файл: Student_Performance.csv
Загружено: (10000, 6)
Колонки: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced', 'Performance Index']

# Placeholder lines for the two plots (the text says they are to be
# inserted after execution).
print("PLOT: распределения признаков — вставить после выполнения")
print("PLOT: парные графики признаков — вставить после выполнения")

# Partition the columns by dtype: numeric on one side, everything else on the other.
num_df, nonnum_df = (
    df.select_dtypes(include=[np.number]),
    df.select_dtypes(exclude=[np.number]),
)

print("NUMERIC COLUMNS:", num_df.columns.tolist())
print("NON-NUMERIC COLUMNS:", nonnum_df.columns.tolist())

# One row per column, one column per summary statistic.
desc_num = num_df.describe().transpose()
print("DESCRIBE (numeric):\n", desc_num)

# Describe the non-numeric part only when such columns exist.
if len(nonnum_df.columns) > 0:
    desc_non = nonnum_df.describe().transpose()
    print("\nDESCRIBE (non-numeric):\n", desc_non)

# Quartiles of every numeric column.
q = num_df.quantile(q=[0.25, 0.5, 0.75])
print("\nQUANTILES (numeric, 0.25, 0.5, 0.75):\n", q)

PLOT: распределения признаков — вставить после выполнения
PLOT: парные графики признаков — вставить после выполнения
NUMERIC COLUMNS: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced', 'Performance Index']
NON-NUMERIC COLUMNS: []
DESCRIBE (numeric):
                                     count     mean        std   min   25%  \
Hours Studied                     10000.0   4.9929   2.589309   1.0   3.0   
Previous Scores                   10000.0  69.4457  17.343152  40.0  54.0   
Extracurricular Activities        10000.0   0.4948   0.499998   0.0   0.0   
Sleep Hours                       10000.0   6.5306   1.695863   4.0   5.0   
Sample Question Papers Practiced  10000.0   4.5833   2.867348   0.0   2.0   
Performance Index                 10000.0  55.2248  19.212558  10.0  40.0   

                                   50%   75%    max  
Hours Studied                      5.0   7.0    9.0  
Previous Scores                   69.0  85.0   99.0  
Extracurricular Activities         0.0   1.0    1.0  
Sleep Hours                        7.0   8.0    9.0  
Sample Question Papers Practiced   5.0   7.0    9.0  
Performance Index                 55.0  71.0  100.0  

QUANTILES (numeric, 0.25, 0.5, 0.75):
       Hours Studied  Previous Scores  Extracurricular Activities  Sleep Hours  \
0.25            3.0             54.0                         0.0          5.0   
0.50            5.0             69.0                         0.0          7.0   
0.75            7.0             85.0                         1.0          8.0   

      Sample Question Papers Practiced  Performance Index  
0.25                               2.0               40.0  
0.50                               5.0               55.0  
0.75                               7.0               71.0

# Work on a copy so the cleaning below does not modify the original object.
df = df.copy()

if "Extracurricular Activities" in df.columns:
    # Raw flag value -> 0/1. True/False also match 1/0 because they compare
    # and hash equal, so separate integer keys would only be duplicates.
    # String keys are lower-case; raw strings are normalised before lookup.
    map_bool = {
        True: 1, False: 0,
        "true": 1, "false": 0,
        "yes": 1, "no": 0,
    }
    # Make the match case- and whitespace-insensitive ("YES", " No ", ...);
    # non-string values pass through unchanged.
    raw_flags = df["Extracurricular Activities"].map(
        lambda v: v.strip().lower() if isinstance(v, str) else v
    )
    df["Extracurricular Activities"] = raw_flags.map(map_bool)
    if df["Extracurricular Activities"].isna().any():
        # Missing or unrecognised values are treated as 0.
        df["Extracurricular Activities"] = df["Extracurricular Activities"].fillna(0).astype(int)

# Every other column is forced to numeric; unparsable entries become NaN
# and are imputed below.
for c in df.columns:
    if c != "Extracurricular Activities":
        df[c] = pd.to_numeric(df[c], errors="coerce")

num_cols = [c for c in df.columns if c != "Extracurricular Activities" and c != "Performance Index"]
if "Extracurricular Activities" in df.columns:
    all_features = num_cols + ["Extracurricular Activities"]
else:
    all_features = num_cols

# Impute missing values: median for numeric columns, most frequent value
# otherwise.
# NOTE(review): the fill values are computed on the full frame, i.e. before
# the train/test split done later in this file — confirm this is intended.
for c in all_features + ["Performance Index"]:
    if c not in df.columns:
        continue
    if df[c].dtype.kind in "biufc":
        med = df[c].median()
        df[c] = df[c].fillna(med)
    else:
        modes = df[c].mode()
        # mode() is empty when the column holds no non-missing value; there
        # is nothing to fill with in that case, so leave the column as is.
        if not modes.empty:
            mode_val = modes.iloc[0]
            df[c] = df[c].fillna(mode_val)

# Feature columns (this order fixes the column order of the matrices below)
# and the regression target.
features = [
    "Hours Studied",
    "Previous Scores",
    "Extracurricular Activities",
    "Sleep Hours",
    "Sample Question Papers Practiced",
]
target_col = "Performance Index"

X_all = df[features].values.astype(float)
y_all = df[target_col].values.astype(float)

# Shuffle the row indices with the seeded generator, then cut 80/20.
N = len(X_all)
idx = np.arange(N)
rng.shuffle(idx)
train_size = int(0.8 * N)
train_idx, test_idx = idx[:train_size], idx[train_size:]

X_train, y_train = X_all[train_idx], y_all[train_idx]
X_test, y_test = X_all[test_idx], y_all[test_idx]

# Standardise both parts with the training-set mean and std only.
mu = np.mean(X_train, axis=0)
sigma = np.std(X_train, axis=0)
# A constant column has zero spread; divide by 1 instead of by 0.
sigma = np.where(sigma == 0, 1.0, sigma)

X_train_z = (X_train - mu) / sigma
X_test_z = (X_test - mu) / sigma

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

Train shape: (8000, 5) Test shape: (2000, 5)

def add_intercept(X):
    """Return ``X`` with a column of ones prepended (the intercept term).

    ``X`` must be a 2-D array; the result has one more column than ``X``.
    """
    n_rows = X.shape[0]
    ones_col = np.ones((n_rows, 1))
    return np.concatenate((ones_col, X), axis=1)

def fit_ols(X, y):
    """Fit ordinary least squares with an intercept.

    Args:
        X: 2-D feature matrix, one row per observation (without the
            intercept column; it is added here).
        y: Target values, one per row of ``X``.

    Returns:
        Coefficient array with the intercept first, followed by one
        coefficient per column of ``X``.
    """
    X1 = add_intercept(X)
    # Solve min ||X1 @ beta - y|| directly. This yields the same
    # (minimum-norm) solution as pinv(X1.T @ X1) @ X1.T @ y but avoids
    # forming X1.T @ X1, which squares the condition number.
    beta, _, _, _ = np.linalg.lstsq(X1, y, rcond=None)
    return beta

def predict_ols(X, beta):
    """Return predictions for ``X`` from coefficients ``beta``.

    ``beta`` holds the intercept first, then one coefficient per column of
    ``X`` (the layout produced by ``fit_ols``).
    """
    return add_intercept(X) @ beta

def r2_score(y_true, y_pred):
    """Return the coefficient of determination of ``y_pred`` against ``y_true``.

    Computed as ``1 - SS_res / SS_tot``. When ``y_true`` has no variance
    (``SS_tot == 0``) the score is defined here as ``0.0``.
    """
    residuals = y_true - y_pred
    ss_res = np.sum(residuals ** 2)
    centered = y_true - y_true.mean()
    ss_tot = np.sum(centered ** 2)
    if ss_tot == 0:
        return 0.0
    return 1 - ss_res / ss_tot

# Model 1: OLS on two of the standardised features.
m1_cols = ["Hours Studied", "Previous Scores"]
m1_idx = list(map(features.index, m1_cols))

Xtr_m1, Xte_m1 = X_train_z[:, m1_idx], X_test_z[:, m1_idx]

# Fit on the training part, then predict both parts with the same coefficients.
beta_m1 = fit_ols(Xtr_m1, y_train)
pred_tr_m1, pred_te_m1 = (predict_ols(part, beta_m1) for part in (Xtr_m1, Xte_m1))

r2_tr_m1 = r2_score(y_train, pred_tr_m1)
r2_te_m1 = r2_score(y_test, pred_te_m1)

print("Модель 1 — признаки:", m1_cols)
print("beta:", np.round(beta_m1, 6))
print("R2 train:", round(r2_tr_m1, 6), "R2 test:", round(r2_te_m1, 6))

Модель 1 — признаки: ['Hours Studied', 'Previous Scores']
beta: [55.2365    7.391342 17.675357]
R2 train: 0.985798 R2 test: 0.986163

# Model 2: OLS on all five standardised features.
m2_cols = [
    "Hours Studied",
    "Previous Scores",
    "Extracurricular Activities",
    "Sleep Hours",
    "Sample Question Papers Practiced",
]
m2_idx = list(map(features.index, m2_cols))

Xtr_m2, Xte_m2 = X_train_z[:, m2_idx], X_test_z[:, m2_idx]

# Coefficients come from the training part only.
beta_m2 = fit_ols(Xtr_m2, y_train)
pred_tr_m2, pred_te_m2 = (predict_ols(part, beta_m2) for part in (Xtr_m2, Xte_m2))

r2_tr_m2 = r2_score(y_train, pred_tr_m2)
r2_te_m2 = r2_score(y_test, pred_te_m2)

print("Модель 2 — признаки:", m2_cols)
print("beta:", np.round(beta_m2, 6))
print("R2 train:", round(r2_tr_m2, 6), "R2 test:", round(r2_te_m2, 6))

Модель 2 — признаки: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced']
beta: [55.2365    7.381186 17.658739  0.307544  0.812387  0.576902]
R2 train: 0.988733 R2 test: 0.988819

# Model 3: all standardised features plus one synthetic interaction column,
# the product of standardised "Hours Studied" and "Previous Scores".
hs_idx, ps_idx = (features.index(name) for name in ("Hours Studied", "Previous Scores"))

hs_tr, ps_tr = X_train_z[:, hs_idx], X_train_z[:, ps_idx]
hs_te, ps_te = X_test_z[:, hs_idx], X_test_z[:, ps_idx]

# Column vectors so they can be appended to the feature matrices.
syn_tr = (hs_tr * ps_tr)[:, np.newaxis]
syn_te = (hs_te * ps_te)[:, np.newaxis]

Xtr_m3 = np.hstack([X_train_z, syn_tr])
Xte_m3 = np.hstack([X_test_z, syn_te])

beta_m3 = fit_ols(Xtr_m3, y_train)
pred_tr_m3, pred_te_m3 = (predict_ols(part, beta_m3) for part in (Xtr_m3, Xte_m3))

r2_tr_m3 = r2_score(y_train, pred_tr_m3)
r2_te_m3 = r2_score(y_test, pred_te_m3)

all_cols_plus_syn = features + ["HSxPS"]
print("Модель 3 — признаки:", all_cols_plus_syn)
print("beta:", np.round(beta_m3, 6))
print("R2 train:", round(r2_tr_m3, 6), "R2 test:", round(r2_te_m3, 6))

Модель 3 — признаки: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced', 'HSxPS']
beta: [ 5.5236138e+01  7.3809510e+00  1.7658707e+01  3.0760900e-01
  8.1211400e-01  5.7723100e-01 -2.1174000e-02]
R2 train: 0.988734 R2 test: 0.988812

# NOTE(review): this cell repeats the Model 3 cell directly above it
# (same inputs, same fit, same printout) — confirm whether it can be dropped.
hs_idx = features.index("Hours Studied")
ps_idx = features.index("Previous Scores")

# Standardised "Hours Studied" / "Previous Scores" columns of each part.
hs_tr, hs_te = X_train_z[:, hs_idx], X_test_z[:, hs_idx]
ps_tr, ps_te = X_train_z[:, ps_idx], X_test_z[:, ps_idx]

# Interaction term as an (n, 1) column for each part.
syn_tr = np.reshape(hs_tr * ps_tr, (-1, 1))
syn_te = np.reshape(hs_te * ps_te, (-1, 1))

Xtr_m3 = np.hstack((X_train_z, syn_tr))
Xte_m3 = np.hstack((X_test_z, syn_te))

beta_m3 = fit_ols(Xtr_m3, y_train)
pred_tr_m3 = predict_ols(Xtr_m3, beta_m3)
r2_tr_m3 = r2_score(y_train, pred_tr_m3)
pred_te_m3 = predict_ols(Xte_m3, beta_m3)
r2_te_m3 = r2_score(y_test, pred_te_m3)

all_cols_plus_syn = [*features, "HSxPS"]
print("Модель 3 — признаки:", all_cols_plus_syn)
print("beta:", np.round(beta_m3, 6))
print("R2 train:", round(r2_tr_m3, 6), "R2 test:", round(r2_te_m3, 6))

Модель 3 — признаки: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced', 'HSxPS']
beta: [ 5.5236138e+01  7.3809510e+00  1.7658707e+01  3.0760900e-01
  8.1211400e-01  5.7723100e-01 -2.1174000e-02]
R2 train: 0.988734 R2 test: 0.988812

# Side-by-side comparison of the three models. The feature lists are taken
# from the variables the models were actually fitted with, so this table
# cannot drift out of sync with them.
summary = pd.DataFrame({
    "model": ["Model_1", "Model_2", "Model_3"],
    "features": [
        ",".join(m1_cols),
        ",".join(m2_cols),
        ",".join(all_cols_plus_syn),
    ],
    "R2_train": [r2_tr_m1, r2_tr_m2, r2_tr_m3],
    "R2_test": [r2_te_m1, r2_te_m2, r2_te_m3],
})
print(summary)

     model                                           features  R2_train  \
0  Model_1                      Hours Studied,Previous Scores  0.985798   
1  Model_2  Hours Studied,Previous Scores,Extracurricular ...  0.988733   
2  Model_3  Hours Studied,Previous Scores,Extracurricular ...  0.988734   

    R2_test  
0  0.986163  
1  0.988819  
2  0.988812

import matplotlib.pyplot as plt
# Histogram of every numeric column, laid out three per row.
num_df = df.select_dtypes(include=[np.number])
cols = list(num_df.columns)
r = (len(cols) + 2) // 3  # number of rows = ceil(len(cols) / 3)
fig = plt.figure(figsize=(14, 4 * r))
for i, c in enumerate(cols, start=1):
    ax = fig.add_subplot(r, 3, i)
    ax.hist(num_df[c].dropna().values, bins=30)
    ax.set_title(c)
fig.tight_layout()
plt.show()

# Pairwise plot grid: histograms on the diagonal, scatter plots elsewhere.
feat_for_pairs = [
    "Hours Studied",
    "Previous Scores",
    "Extracurricular Activities",
    "Sleep Hours",
    "Sample Question Papers Practiced",
]
d = df[feat_for_pairs].astype(float)
n = len(feat_for_pairs)
fig, axes = plt.subplots(n, n, figsize=(14, 14))
for i, row_name in enumerate(feat_for_pairs):
    for j, col_name in enumerate(feat_for_pairs):
        ax = axes[i, j]
        if i != j:
            # x = column feature, y = row feature
            ax.scatter(d[col_name].values, d[row_name].values, s=5, alpha=0.5)
        else:
            ax.hist(d[col_name].dropna().values, bins=30)
        # Axis names only on the outer edge: bottom row and first column;
        # inner panels get their tick labels blanked.
        if i == n - 1:
            ax.set_xlabel(col_name)
        else:
            ax.set_xticklabels([])
        if j == 0:
            ax.set_ylabel(row_name)
        else:
            ax.set_yticklabels([])
plt.tight_layout()
plt.show()

# Test-set predictions of Model 2 against the true values, with the
# y = x reference line spanning the common value range.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, pred_te_m2, s=8, alpha=0.6)
mn = min(y_test.min(), pred_te_m2.min())
mx = max(y_test.max(), pred_te_m2.max())
ax.plot([mn, mx], [mn, mx])
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted (Model 2)")
ax.set_title("Actual vs Predicted")
fig.tight_layout()
plt.show()

# Distribution of Model 2's test-set residuals (actual minus predicted).
resid = y_test - pred_te_m2
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(resid, bins=40)
ax.set_title("Residuals (Model 2, test)")
fig.tight_layout()
plt.show()