In [19]:
import numpy as np
import pandas as pd

# Seeded legacy generator; the notebook uses it for the train/test shuffle.
rng = np.random.RandomState(42)

# Candidate dataset locations, tried in order; the first one that loads wins.
possible_paths = [
    "Student_Performance.csv",
]
path = None
load_errors = []
for p in possible_paths:
    try:
        df = pd.read_csv(p)
    except Exception as exc:
        # Best effort: remember why this candidate failed and try the next one.
        load_errors.append(f"{p}: {exc!r}")
        continue
    path = p
    break

if path is None:
    # Stop here with the collected reasons. Otherwise the prints below would hit
    # a NameError on `df`, or silently report a stale `df` left in the kernel.
    raise FileNotFoundError(
        "Could not load the dataset from any of "
        + repr(possible_paths)
        + "".join("\n  " + msg for msg in load_errors)
    )

print("Файл:", path)
print("Загружено:", df.shape)
print("Колонки:", list(df.columns))
Файл: Student_Performance.csv
Загружено: (10000, 6)
Колонки: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced', 'Performance Index']
In [14]:
# Placeholder notes marking where the figures belong in the report
# ("feature distributions" / "pair plots of features" - "insert after execution").
# The Russian text is written as readable Cyrillic rather than mis-decoded bytes.
print("PLOT: распределения признаков — вставить после выполнения")
print("PLOT: парные графики признаков — вставить после выполнения")

# Split the frame by dtype so numeric and non-numeric columns are summarised separately.
num_df = df.select_dtypes(include=[np.number])
nonnum_df = df.select_dtypes(exclude=[np.number])

print("NUMERIC COLUMNS:", num_df.columns.tolist())
print("NON-NUMERIC COLUMNS:", nonnum_df.columns.tolist())

# Transposed so that each original column becomes one row of the summary.
desc_num = num_df.describe().transpose()
print("DESCRIBE (numeric):\n", desc_num)

# Only summarise non-numeric columns when there are any.
if len(nonnum_df.columns) > 0:
    desc_non = nonnum_df.describe().transpose()
    print("\nDESCRIBE (non-numeric):\n", desc_non)

# Quartiles of every numeric column (one row per quantile level).
q = num_df.quantile(q=[0.25, 0.5, 0.75])
print("\nQUANTILES (numeric, 0.25, 0.5, 0.75):\n", q)
PLOT: распределения признаков — вставить после выполнения
PLOT: парные графики признаков — вставить после выполнения
NUMERIC COLUMNS: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced', 'Performance Index']
NON-NUMERIC COLUMNS: []
DESCRIBE (numeric):
                                     count     mean        std   min   25%  \
Hours Studied                     10000.0   4.9929   2.589309   1.0   3.0   
Previous Scores                   10000.0  69.4457  17.343152  40.0  54.0   
Extracurricular Activities        10000.0   0.4948   0.499998   0.0   0.0   
Sleep Hours                       10000.0   6.5306   1.695863   4.0   5.0   
Sample Question Papers Practiced  10000.0   4.5833   2.867348   0.0   2.0   
Performance Index                 10000.0  55.2248  19.212558  10.0  40.0   

                                   50%   75%    max  
Hours Studied                      5.0   7.0    9.0  
Previous Scores                   69.0  85.0   99.0  
Extracurricular Activities         0.0   1.0    1.0  
Sleep Hours                        7.0   8.0    9.0  
Sample Question Papers Practiced   5.0   7.0    9.0  
Performance Index                 55.0  71.0  100.0  

QUANTILES (numeric, 0.25, 0.5, 0.75):
       Hours Studied  Previous Scores  Extracurricular Activities  Sleep Hours  \
0.25            3.0             54.0                         0.0          5.0   
0.50            5.0             69.0                         0.0          7.0   
0.75            7.0             85.0                         1.0          8.0   

      Sample Question Papers Practiced  Performance Index  
0.25                               2.0               40.0  
0.50                               5.0               55.0  
0.75                               7.0               71.0  
In [7]:
# Rebind `df` to a copy so the edits below never write through to another
# object that may still reference the loaded frame.
df = df.copy()

if "Extracurricular Activities" in df.columns:
    # Lookup for the yes/no flag. Keys are lower-case because string labels are
    # normalised (stripped, lower-cased) before the lookup. True/False compare
    # and hash equal to 1/0, so already-encoded 0/1 data passes through as is
    # and re-running this cell is harmless.
    map_bool = {
        True: 1, False: 0,
        "true": 1, "false": 0,
        "yes": 1, "no": 0,
    }
    extra_raw = df["Extracurricular Activities"]
    extra_norm = extra_raw.map(lambda v: v.strip().lower() if isinstance(v, str) else v)
    extra_encoded = extra_norm.map(map_bool)

    # Anything missing or not in the lookup is treated as "no activities" (0),
    # but say so instead of doing it silently.
    n_unrecognised = int(extra_encoded.isna().sum())
    if n_unrecognised:
        print(
            "Extracurricular Activities:", n_unrecognised,
            "missing/unrecognised value(s) encoded as 0",
        )
    # Always end up with an integer 0/1 column, whether or not gaps were filled.
    df["Extracurricular Activities"] = extra_encoded.fillna(0).astype(int)

# Every other column is expected to be numeric; unparseable entries become NaN
# and are imputed below.
for c in df.columns:
    if c != "Extracurricular Activities":
        df[c] = pd.to_numeric(df[c], errors="coerce")

num_cols = [
    c for c in df.columns
    if c not in ("Extracurricular Activities", "Performance Index")
]
if "Extracurricular Activities" in df.columns:
    all_features = num_cols + ["Extracurricular Activities"]
else:
    all_features = num_cols

# Median imputation for every feature and for the target. All of these columns
# are numeric at this point (coerced above, or encoded to 0/1), so no separate
# mode-based branch is needed.
# NOTE(review): the medians are computed on the full data set, i.e. before the
# train/test split - confirm that is acceptable for this analysis.
for c in all_features + ["Performance Index"]:
    if c in df.columns:
        med = df[c].median()
        df[c] = df[c].fillna(med)

# Model inputs (this order is the column order of X) and the regression target.
features = [
    "Hours Studied",
    "Previous Scores",
    "Extracurricular Activities",
    "Sleep Hours",
    "Sample Question Papers Practiced"
]
target_col = "Performance Index"

X_all = df[features].values.astype(float)
y_all = df[target_col].values.astype(float)

# 80/20 train/test split over a shuffled index.
# The shuffle uses a generator seeded inside this cell, so the split is the same
# every time the cell runs. Drawing from the notebook-level `rng` made the
# result depend on how often that generator had already been used. A freshly
# seeded RandomState(42) gives the permutation a clean top-to-bottom run gives.
N = X_all.shape[0]
idx = np.arange(N)
split_rng = np.random.RandomState(42)
split_rng.shuffle(idx)
train_size = int(0.8 * N)
train_idx = idx[:train_size]
test_idx = idx[train_size:]

X_train = X_all[train_idx]
y_train = y_all[train_idx]
X_test = X_all[test_idx]
y_test = y_all[test_idx]

# Standardise with statistics of the training rows only, then apply the same
# shift/scale to the test rows.
mu = X_train.mean(axis=0)
sigma = X_train.std(axis=0)
# A constant column has zero spread; divide by 1 instead of 0.
sigma[sigma == 0] = 1.0

X_train_z = (X_train - mu) / sigma
X_test_z = (X_test - mu) / sigma

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
Train shape: (8000, 5) Test shape: (2000, 5)
In [8]:
def add_intercept(X):
    """Return ``X`` with a column of ones prepended (the intercept term).

    ``X`` must expose ``.shape`` and be two-dimensional, ``(n_rows, n_cols)``;
    the result has shape ``(n_rows, n_cols + 1)``.
    """
    n_rows = X.shape[0]
    ones_column = np.ones((n_rows, 1))
    return np.concatenate([ones_column, X], axis=1)

def fit_ols(X, y):
    """Fit ordinary least squares with an intercept.

    Parameters
    ----------
    X : 2-D array, one row per observation and one column per feature.
    y : targets with one entry (or row) per observation.

    Returns
    -------
    beta : coefficient vector; element 0 is the intercept, the rest follow the
        column order of ``X``.
    """
    X1 = add_intercept(X)
    # Solve min ||X1 @ beta - y|| on X1 directly rather than inverting
    # X1.T @ X1: forming the normal equations squares the condition number.
    # For a rank-deficient design lstsq returns the minimum-norm solution,
    # which is also what the pseudo-inverse formulation yields.
    beta, _residuals, _rank, _singular_values = np.linalg.lstsq(X1, y, rcond=None)
    return beta

def predict_ols(X, beta):
    """Predict targets for ``X`` from coefficients ``beta`` (intercept first)."""
    design = add_intercept(X)
    predictions = design @ beta
    return predictions

def r2_score(y_true, y_pred):
    """Coefficient of determination, ``1 - SS_res / SS_tot``.

    Returns ``0.0`` when ``y_true`` has no variance (``SS_tot == 0``), where the
    ratio is undefined.
    """
    residual_ss = np.sum((y_true - y_pred) ** 2)
    total_ss = np.sum((y_true - y_true.mean()) ** 2)
    if total_ss == 0:
        return 0.0
    return 1 - residual_ss / total_ss
In [9]:
# Model 1: baseline regression on the two main predictors only.
m1_cols = ["Hours Studied", "Previous Scores"]
# Positions of those predictors inside the standardised feature matrices.
m1_idx = [features.index(c) for c in m1_cols]

Xtr_m1 = X_train_z[:, m1_idx]
Xte_m1 = X_test_z[:, m1_idx]

# Fit on the training rows, then score both splits with the same coefficients.
beta_m1 = fit_ols(Xtr_m1, y_train)
pred_tr_m1 = predict_ols(Xtr_m1, beta_m1)
pred_te_m1 = predict_ols(Xte_m1, beta_m1)

r2_tr_m1 = r2_score(y_train, pred_tr_m1)
r2_te_m1 = r2_score(y_test, pred_te_m1)

# Label reads "Model 1 - features:" (the word "Модель" was garbled before).
print("Модель 1 — признаки:", m1_cols)
print("beta:", beta_m1.round(6))
print("R2 train:", round(r2_tr_m1, 6), "R2 test:", round(r2_te_m1, 6))
Модель 1 — признаки: ['Hours Studied', 'Previous Scores']
beta: [55.2365    7.391342 17.675357]
R2 train: 0.985798 R2 test: 0.986163
In [10]:
# Model 2: regression on all five original features.
m2_cols = [
    "Hours Studied",
    "Previous Scores",
    "Extracurricular Activities",
    "Sleep Hours",
    "Sample Question Papers Practiced"
]
# Positions of those predictors inside the standardised feature matrices.
m2_idx = [features.index(c) for c in m2_cols]

Xtr_m2 = X_train_z[:, m2_idx]
Xte_m2 = X_test_z[:, m2_idx]

# Fit on the training rows, then score both splits with the same coefficients.
beta_m2 = fit_ols(Xtr_m2, y_train)
pred_tr_m2 = predict_ols(Xtr_m2, beta_m2)
pred_te_m2 = predict_ols(Xte_m2, beta_m2)

r2_tr_m2 = r2_score(y_train, pred_tr_m2)
r2_te_m2 = r2_score(y_test, pred_te_m2)

# Label reads "Model 2 - features:" (the word "Модель" was garbled before).
print("Модель 2 — признаки:", m2_cols)
print("beta:", beta_m2.round(6))
print("R2 train:", round(r2_tr_m2, 6), "R2 test:", round(r2_te_m2, 6))
Модель 2 — признаки: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced']
beta: [55.2365    7.381186 17.658739  0.307544  0.812387  0.576902]
R2 train: 0.988733 R2 test: 0.988819
In [11]:
# Model 3: all five features plus one synthetic interaction term,
# HSxPS = standardised "Hours Studied" * standardised "Previous Scores".
hs_idx = features.index("Hours Studied")
ps_idx = features.index("Previous Scores")

hs_tr = X_train_z[:, hs_idx]
ps_tr = X_train_z[:, ps_idx]
hs_te = X_test_z[:, hs_idx]
ps_te = X_test_z[:, ps_idx]

# Column vectors so the interaction can be appended as an extra feature column.
syn_tr = (hs_tr * ps_tr).reshape(-1, 1)
syn_te = (hs_te * ps_te).reshape(-1, 1)

# hstack builds new arrays, so the standardised matrices are passed directly.
Xtr_m3 = np.hstack([X_train_z, syn_tr])
Xte_m3 = np.hstack([X_test_z, syn_te])

beta_m3 = fit_ols(Xtr_m3, y_train)
pred_tr_m3 = predict_ols(Xtr_m3, beta_m3)
pred_te_m3 = predict_ols(Xte_m3, beta_m3)

r2_tr_m3 = r2_score(y_train, pred_tr_m3)
r2_te_m3 = r2_score(y_test, pred_te_m3)

all_cols_plus_syn = features + ["HSxPS"]
# Label reads "Model 3 - features:" (the word "Модель" was garbled before).
print("Модель 3 — признаки:", all_cols_plus_syn)
print("beta:", beta_m3.round(6))
print("R2 train:", round(r2_tr_m3, 6), "R2 test:", round(r2_te_m3, 6))
Модель 3 — признаки: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced', 'HSxPS']
beta: [ 5.5236138e+01  7.3809510e+00  1.7658707e+01  3.0760900e-01
  8.1211400e-01  5.7723100e-01 -2.1174000e-02]
R2 train: 0.988734 R2 test: 0.988812
In [12]:
# NOTE(review): this cell recomputes Model 3 (all features + the HSxPS
# interaction) exactly as the cell directly before it does. It yields the same
# variables and the same printout, so one of the two cells can be deleted.
hs_idx = features.index("Hours Studied")
ps_idx = features.index("Previous Scores")

hs_tr = X_train_z[:, hs_idx]
ps_tr = X_train_z[:, ps_idx]
hs_te = X_test_z[:, hs_idx]
ps_te = X_test_z[:, ps_idx]

# Interaction of the two standardised predictors, shaped as a single column.
syn_tr = (hs_tr * ps_tr).reshape(-1, 1)
syn_te = (hs_te * ps_te).reshape(-1, 1)

# hstack builds new arrays, so the standardised matrices are passed directly.
Xtr_m3 = np.hstack([X_train_z, syn_tr])
Xte_m3 = np.hstack([X_test_z, syn_te])

beta_m3 = fit_ols(Xtr_m3, y_train)
pred_tr_m3 = predict_ols(Xtr_m3, beta_m3)
pred_te_m3 = predict_ols(Xte_m3, beta_m3)

r2_tr_m3 = r2_score(y_train, pred_tr_m3)
r2_te_m3 = r2_score(y_test, pred_te_m3)

all_cols_plus_syn = features + ["HSxPS"]
# Label reads "Model 3 - features:" (the word "Модель" was garbled before).
print("Модель 3 — признаки:", all_cols_plus_syn)
print("beta:", beta_m3.round(6))
print("R2 train:", round(r2_tr_m3, 6), "R2 test:", round(r2_te_m3, 6))
Модель 3 — признаки: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced', 'HSxPS']
beta: [ 5.5236138e+01  7.3809510e+00  1.7658707e+01  3.0760900e-01
  8.1211400e-01  5.7723100e-01 -2.1174000e-02]
R2 train: 0.988734 R2 test: 0.988812
In [13]:
# One record per fitted model: label, comma-joined feature names, train R2, test R2.
summary_rows = [
    (
        "Model_1",
        ",".join(["Hours Studied","Previous Scores"]),
        r2_tr_m1,
        r2_te_m1,
    ),
    (
        "Model_2",
        ",".join(["Hours Studied","Previous Scores","Extracurricular Activities","Sleep Hours","Sample Question Papers Practiced"]),
        r2_tr_m2,
        r2_te_m2,
    ),
    (
        "Model_3",
        ",".join(features + ["HSxPS"]),
        r2_tr_m3,
        r2_te_m3,
    ),
]
summary = pd.DataFrame(
    summary_rows,
    columns=["model", "features", "R2_train", "R2_test"],
)
print(summary)
     model                                           features  R2_train  \
0  Model_1                      Hours Studied,Previous Scores  0.985798   
1  Model_2  Hours Studied,Previous Scores,Extracurricular ...  0.988733   
2  Model_3  Hours Studied,Previous Scores,Extracurricular ...  0.988734   

    R2_test  
0  0.986163  
1  0.988819  
2  0.988812  
In [15]:
import matplotlib.pyplot as plt
# Histogram grid: one panel per numeric column, three panels per row.
num_df = df.select_dtypes(include=[np.number])
cols = num_df.columns.tolist()
r = int(np.ceil(len(cols) / 3))  # number of grid rows needed
fig = plt.figure(figsize=(14, 4 * r))
for i, c in enumerate(cols, start=1):
    ax = fig.add_subplot(r, 3, i)
    ax.hist(num_df[c].dropna().values, bins=30)
    ax.set_title(c)
fig.tight_layout()
plt.show()
No description has been provided for this image
In [16]:
# Pair plot of the five features: histograms on the diagonal, scatter plots elsewhere.
feat_for_pairs = ["Hours Studied","Previous Scores","Extracurricular Activities","Sleep Hours","Sample Question Papers Practiced"]
d = df[feat_for_pairs].astype(float)
n = len(feat_for_pairs)
fig, axes = plt.subplots(n, n, figsize=(14,14))
# Walk the n x n axes grid row by row; i is the row (y feature), j the column (x feature).
for (i, j), ax in np.ndenumerate(axes):
    x_series = d.iloc[:, j]
    if i != j:
        ax.scatter(x_series.values, d.iloc[:, i].values, s=5, alpha=0.5)
    else:
        ax.hist(x_series.dropna().values, bins=30)

    # Axis names only on the outer edge of the grid; inner tick labels are blanked.
    if i == n - 1:
        ax.set_xlabel(feat_for_pairs[j])
    else:
        ax.set_xticklabels([])
    if j == 0:
        ax.set_ylabel(feat_for_pairs[i])
    else:
        ax.set_yticklabels([])
plt.tight_layout()
plt.show()
No description has been provided for this image
In [17]:
# Common range of actual and predicted values, used to draw the y = x reference line.
mn = min(y_test.min(), pred_te_m2.min())
mx = max(y_test.max(), pred_te_m2.max())

plt.figure(figsize=(6,6))
plt.scatter(y_test, pred_te_m2, s=8, alpha=0.6)
plt.plot([mn,mx],[mn,mx])
plt.title("Actual vs Predicted")
plt.xlabel("Actual")
plt.ylabel("Predicted (Model 2)")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [18]:
# Test-set residuals of Model 2 (actual minus predicted) and their distribution.
resid = np.subtract(y_test, pred_te_m2)
plt.figure(figsize=(8,4))
plt.hist(resid, bins=40)
plt.title("Residuals (Model 2, test)")
plt.tight_layout()
plt.show()
No description has been provided for this image