InĀ [19]:
import numpy as np
import pandas as pd
# Seeded legacy NumPy generator (seed 42) made available to the rest of the notebook.
rng = np.random.RandomState(42)

# Candidate dataset locations, tried in order; the first one that loads wins.
possible_paths = [
    "Student_Performance.csv",
]

# Reset both names up front so a failed load can never fall through to a stale
# `df` left in the kernel by an earlier run of this cell.
path = None
df = None
load_errors = {}
for p in possible_paths:
    try:
        df = pd.read_csv(p)
    except Exception as exc:
        # Remember why this candidate failed and try the next one.
        load_errors[p] = exc
        continue
    path = p
    break

if df is None:
    # Fail loudly with the collected reasons instead of a later NameError.
    details = "; ".join(
        f"{candidate!r}: {type(err).__name__}: {err}"
        for candidate, err in load_errors.items()
    )
    raise RuntimeError(
        f"Could not load the dataset from any of {possible_paths!r} ({details})"
    )

# Labels are Russian: "Файл" = file, "Загружено" = loaded, "Колонки" = columns.
print("Файл:", path)
print("Загружено:", df.shape)
print("Колонки:", list(df.columns))
Файл: Student_Performance.csv Загружено: (10000, 6) Колонки: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced', 'Performance Index']
InĀ [14]:
# Placeholder notes (Russian): "feature distributions — insert after execution"
# and "pairwise feature plots — insert after execution".
print("PLOT: распределения признаков — вставить после выполнения")
print("PLOT: парные графики признаков — вставить после выполнения")

# Split the frame into numeric and non-numeric columns by dtype.
num_df = df.select_dtypes(include=[np.number])
nonnum_df = df.select_dtypes(exclude=[np.number])
print("NUMERIC COLUMNS:", list(num_df.columns))
print("NON-NUMERIC COLUMNS:", list(nonnum_df.columns))

# Summary statistics, transposed so that each feature is one row.
desc_num = num_df.describe().T
print("DESCRIBE (numeric):\n", desc_num)

# Non-numeric columns are described only when at least one exists.
if nonnum_df.shape[1] > 0:
    desc_non = nonnum_df.describe().T
    print("\nDESCRIBE (non-numeric):\n", desc_non)

# Quartiles of every numeric column (rows: 0.25, 0.50, 0.75).
q = num_df.quantile([0.25, 0.5, 0.75])
print("\nQUANTILES (numeric, 0.25, 0.5, 0.75):\n", q)
PLOT: распределения признаков — вставить после выполнения
PLOT: парные графики признаков — вставить после выполнения
NUMERIC COLUMNS: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced', 'Performance Index']
NON-NUMERIC COLUMNS: []
DESCRIBE (numeric):
count mean std min 25% \
Hours Studied 10000.0 4.9929 2.589309 1.0 3.0
Previous Scores 10000.0 69.4457 17.343152 40.0 54.0
Extracurricular Activities 10000.0 0.4948 0.499998 0.0 0.0
Sleep Hours 10000.0 6.5306 1.695863 4.0 5.0
Sample Question Papers Practiced 10000.0 4.5833 2.867348 0.0 2.0
Performance Index 10000.0 55.2248 19.212558 10.0 40.0
50% 75% max
Hours Studied 5.0 7.0 9.0
Previous Scores 69.0 85.0 99.0
Extracurricular Activities 0.0 1.0 1.0
Sleep Hours 7.0 8.0 9.0
Sample Question Papers Practiced 5.0 7.0 9.0
Performance Index 55.0 71.0 100.0
QUANTILES (numeric, 0.25, 0.5, 0.75):
Hours Studied Previous Scores Extracurricular Activities Sleep Hours \
0.25 3.0 54.0 0.0 5.0
0.50 5.0 69.0 0.0 7.0
0.75 7.0 85.0 1.0 8.0
Sample Question Papers Practiced Performance Index
0.25 2.0 40.0
0.50 5.0 55.0
0.75 7.0 71.0
InĀ [7]:
# Rebind `df` to a copy before any column is modified.
df = df.copy()

# --- Encode the binary "Extracurricular Activities" flag as 0/1 -------------
if "Extracurricular Activities" in df.columns:
    # String keys are lower-case because values are normalised (strip + lower)
    # before the lookup. 1/0 hash and compare equal to True/False, so numeric
    # flags map to themselves and re-running the cell is harmless.
    map_bool = {
        True: 1, False: 0,
        "true": 1, "false": 0,
        "yes": 1, "no": 0,
        1: 1, 0: 0
    }
    # Normalise text so "YES", " yes ", "No " etc. are recognised instead of
    # falling through to the fillna(0) default below.
    flag_raw = df["Extracurricular Activities"].map(
        lambda v: v.strip().lower() if isinstance(v, str) else v
    )
    df["Extracurricular Activities"] = flag_raw.map(map_bool)
    # Anything still unmapped (missing / unknown value) is coded as 0.
    if df["Extracurricular Activities"].isna().any():
        df["Extracurricular Activities"] = df["Extracurricular Activities"].fillna(0).astype(int)

# --- Force every other column to numeric; unparseable entries become NaN ----
for c in df.columns:
    if c != "Extracurricular Activities":
        df[c] = pd.to_numeric(df[c], errors="coerce")

num_cols = [c for c in df.columns if c != "Extracurricular Activities" and c != "Performance Index"]
if "Extracurricular Activities" in df.columns:
    all_features = num_cols + ["Extracurricular Activities"]
else:
    all_features = num_cols

# --- Impute missing values: median for numeric dtypes, mode otherwise -------
# NOTE(review): the fill values are computed on the full frame, i.e. before the
# train/test split below, so test rows contribute to them.
for c in all_features + ["Performance Index"]:
    if c in df.columns:
        if df[c].dtype.kind in "biufc":
            med = df[c].median()
            df[c] = df[c].fillna(med)
        else:
            mode_val = df[c].mode().iloc[0]
            df[c] = df[c].fillna(mode_val)

# --- Design matrix and target ----------------------------------------------
features = [
    "Hours Studied",
    "Previous Scores",
    "Extracurricular Activities",
    "Sleep Hours",
    "Sample Question Papers Practiced"
]
target_col = "Performance Index"
X_all = df[features].values.astype(float)
y_all = df[target_col].values.astype(float)

# --- Shuffled 80/20 train/test split ----------------------------------------
N = X_all.shape[0]
idx = np.arange(N)
# Re-seed the shared generator immediately before shuffling so the split does
# not depend on how many times this (or any other) cell has consumed `rng`.
# On a fresh kernel this leaves the generator in exactly the state it would
# have had anyway (it is created with seed 42), so results are unchanged.
rng.seed(42)
rng.shuffle(idx)
train_size = int(0.8 * N)
train_idx = idx[:train_size]
test_idx = idx[train_size:]
X_train = X_all[train_idx]
y_train = y_all[train_idx]
X_test = X_all[test_idx]
y_test = y_all[test_idx]

# --- Standardise with statistics taken from the training rows only ----------
mu = X_train.mean(axis=0)
sigma = X_train.std(axis=0)
sigma[sigma == 0] = 1.0  # constant column: avoid dividing by zero
X_train_z = (X_train - mu) / sigma
X_test_z = (X_test - mu) / sigma
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
Train shape: (8000, 5) Test shape: (2000, 5)
InĀ [8]:
def add_intercept(X):
    """Return a copy of the 2-D array ``X`` with a leading column of ones.

    The extra column lets a linear model carry a bias (intercept) term as its
    first coefficient.
    """
    n_rows = X.shape[0]
    bias_column = np.ones((n_rows, 1))
    return np.concatenate((bias_column, X), axis=1)
def fit_ols(X, y):
    """Fit ordinary least squares with an intercept.

    Parameters
    ----------
    X : ndarray of shape (n_samples, n_features)
        Feature matrix without the intercept column.
    y : ndarray of shape (n_samples,) or (n_samples, k)
        Target values.

    Returns
    -------
    ndarray
        Coefficients; element 0 is the intercept, followed by one weight per
        column of ``X``.

    Notes
    -----
    The system is solved with ``np.linalg.lstsq`` directly on the design
    matrix. Forming ``X.T @ X`` and pseudo-inverting it squares the condition
    number of the problem. ``lstsq`` returns the same minimum-norm solution,
    including for rank-deficient inputs, without that loss of precision.
    """
    X1 = add_intercept(X)
    beta, *_ = np.linalg.lstsq(X1, y, rcond=None)
    return beta
def predict_ols(X, beta):
    """Evaluate a fitted linear model on the feature matrix ``X``.

    ``beta`` holds the intercept first and then one weight per column of
    ``X``; the result is the matrix product of the intercept-augmented design
    matrix with ``beta``.
    """
    design = add_intercept(X)
    return np.matmul(design, beta)
def r2_score(y_true, y_pred):
    """Coefficient of determination, ``1 - SS_res / SS_tot``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values. Lists and other array-likes are
        accepted and converted to float arrays.

    Returns
    -------
    float
        The R² score, or ``0.0`` when ``y_true`` has no variance
        (``SS_tot == 0``), where the ratio is undefined.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    # A flat vector paired with a column vector of the same length, (n,) vs
    # (n, 1), would broadcast to an (n, n) matrix and give a silently wrong
    # score. Flatten both when the element counts match but the shapes differ.
    if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
        y_true = y_true.ravel()
        y_pred = y_pred.ravel()
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - y_true.mean()) ** 2)
    if ss_tot == 0:
        return 0.0
    return 1 - ss_res / ss_tot
InĀ [9]:
# Model 1: OLS on two of the standardized features.
m1_cols = ["Hours Studied", "Previous Scores"]
# Positions of the selected columns inside the standardized matrices.
m1_idx = [features.index(c) for c in m1_cols]
Xtr_m1 = X_train_z[:, m1_idx]
Xte_m1 = X_test_z[:, m1_idx]

# Fit on the training rows, then score on both partitions.
beta_m1 = fit_ols(Xtr_m1, y_train)
pred_tr_m1 = predict_ols(Xtr_m1, beta_m1)
pred_te_m1 = predict_ols(Xte_m1, beta_m1)
r2_tr_m1 = r2_score(y_train, pred_tr_m1)
r2_te_m1 = r2_score(y_test, pred_te_m1)

# Label is Russian for "Model 1 — features:".
print("Модель 1 — признаки:", m1_cols)
print("beta:", beta_m1.round(6))
print("R2 train:", round(r2_tr_m1, 6), "R2 test:", round(r2_te_m1, 6))
Модель 1 — признаки: ['Hours Studied', 'Previous Scores'] beta: [55.2365 7.391342 17.675357] R2 train: 0.985798 R2 test: 0.986163
InĀ [10]:
# Model 2: OLS on all five standardized features.
m2_cols = [
    "Hours Studied",
    "Previous Scores",
    "Extracurricular Activities",
    "Sleep Hours",
    "Sample Question Papers Practiced"
]
# Positions of the selected columns inside the standardized matrices.
m2_idx = [features.index(c) for c in m2_cols]
Xtr_m2 = X_train_z[:, m2_idx]
Xte_m2 = X_test_z[:, m2_idx]

# Fit on the training rows, then score on both partitions.
beta_m2 = fit_ols(Xtr_m2, y_train)
pred_tr_m2 = predict_ols(Xtr_m2, beta_m2)
pred_te_m2 = predict_ols(Xte_m2, beta_m2)
r2_tr_m2 = r2_score(y_train, pred_tr_m2)
r2_te_m2 = r2_score(y_test, pred_te_m2)

# Label is Russian for "Model 2 — features:".
print("Модель 2 — признаки:", m2_cols)
print("beta:", beta_m2.round(6))
print("R2 train:", round(r2_tr_m2, 6), "R2 test:", round(r2_te_m2, 6))
Модель 2 — признаки: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced'] beta: [55.2365 7.381186 17.658739 0.307544 0.812387 0.576902] R2 train: 0.988733 R2 test: 0.988819
InĀ [11]:
# Model 3: all base features plus one synthetic interaction column, "HSxPS",
# the element-wise product of the standardized "Hours Studied" and
# "Previous Scores" columns.
hs_idx = features.index("Hours Studied")
ps_idx = features.index("Previous Scores")
hs_tr = X_train_z[:, hs_idx]
ps_tr = X_train_z[:, ps_idx]
hs_te = X_test_z[:, hs_idx]
ps_te = X_test_z[:, ps_idx]

# Reshape the products to column vectors so they can be appended with hstack.
syn_tr = (hs_tr * ps_tr).reshape(-1, 1)
syn_te = (hs_te * ps_te).reshape(-1, 1)
Xtr_m3 = np.hstack([X_train_z[:, :], syn_tr])
Xte_m3 = np.hstack([X_test_z[:, :], syn_te])

# Fit on the training rows, then score on both partitions.
beta_m3 = fit_ols(Xtr_m3, y_train)
pred_tr_m3 = predict_ols(Xtr_m3, beta_m3)
pred_te_m3 = predict_ols(Xte_m3, beta_m3)
r2_tr_m3 = r2_score(y_train, pred_tr_m3)
r2_te_m3 = r2_score(y_test, pred_te_m3)

all_cols_plus_syn = features + ["HSxPS"]
# Label is Russian for "Model 3 — features:".
print("Модель 3 — признаки:", all_cols_plus_syn)
print("beta:", beta_m3.round(6))
print("R2 train:", round(r2_tr_m3, 6), "R2 test:", round(r2_te_m3, 6))
Модель 3 — признаки: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced', 'HSxPS'] beta: [ 5.5236138e+01 7.3809510e+00 1.7658707e+01 3.0760900e-01 8.1211400e-01 5.7723100e-01 -2.1174000e-02] R2 train: 0.988734 R2 test: 0.988812
InĀ [12]:
# This cell repeated the preceding Model 3 cell line for line: the same
# interaction column, the same fit and the same metrics. Refitting reproduces
# identical numbers, so only the report is kept here; it reuses `beta_m3`,
# `r2_tr_m3` and `r2_te_m3` from the preceding Model 3 cell.
# NOTE(review): the whole cell can be deleted once nothing relies on its output.
all_cols_plus_syn = features + ["HSxPS"]
# Label is Russian for "Model 3 — features:".
print("Модель 3 — признаки:", all_cols_plus_syn)
print("beta:", beta_m3.round(6))
print("R2 train:", round(r2_tr_m3, 6), "R2 test:", round(r2_te_m3, 6))
Модель 3 — признаки: ['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 'Sleep Hours', 'Sample Question Papers Practiced', 'HSxPS'] beta: [ 5.5236138e+01 7.3809510e+00 1.7658707e+01 3.0760900e-01 8.1211400e-01 5.7723100e-01 -2.1174000e-02] R2 train: 0.988734 R2 test: 0.988812
InĀ [13]:
# One record per fitted model: its name, the comma-joined feature list and the
# train / test R² computed in the model cells.
summary = pd.DataFrame(
    [
        {
            "model": "Model_1",
            "features": ",".join(["Hours Studied", "Previous Scores"]),
            "R2_train": r2_tr_m1,
            "R2_test": r2_te_m1,
        },
        {
            "model": "Model_2",
            "features": ",".join([
                "Hours Studied",
                "Previous Scores",
                "Extracurricular Activities",
                "Sleep Hours",
                "Sample Question Papers Practiced",
            ]),
            "R2_train": r2_tr_m2,
            "R2_test": r2_te_m2,
        },
        {
            "model": "Model_3",
            "features": ",".join(features + ["HSxPS"]),
            "R2_train": r2_tr_m3,
            "R2_test": r2_te_m3,
        },
    ],
    columns=["model", "features", "R2_train", "R2_test"],
)
print(summary)
model features R2_train \
0 Model_1 Hours Studied,Previous Scores 0.985798
1 Model_2 Hours Studied,Previous Scores,Extracurricular ... 0.988733
2 Model_3 Hours Studied,Previous Scores,Extracurricular ... 0.988734
R2_test
0 0.986163
1 0.988819
2 0.988812
InĀ [15]:
import matplotlib.pyplot as plt
# Histogram of every numeric column, laid out on a grid three panels wide.
num_df = df.select_dtypes(include=[np.number])
cols = list(num_df.columns)
r = int(np.ceil(len(cols) / 3))  # number of grid rows needed

hist_fig = plt.figure(figsize=(14, 4 * r))
for position, column in enumerate(cols, start=1):
    panel = hist_fig.add_subplot(r, 3, position)
    panel.hist(num_df[column].dropna().values, bins=30)
    panel.set_title(column)
hist_fig.tight_layout()
plt.show()
InĀ [16]:
# Pair-plot grid: histograms on the diagonal, scatter plots elsewhere.
feat_for_pairs = [
    "Hours Studied",
    "Previous Scores",
    "Extracurricular Activities",
    "Sleep Hours",
    "Sample Question Papers Practiced",
]
d = df[feat_for_pairs].astype(float)
n = len(feat_for_pairs)
fig, axes = plt.subplots(n, n, figsize=(14, 14))

# Walk the axes grid row by row; i is the row (y feature), j the column (x feature).
for (i, j), ax in np.ndenumerate(axes):
    x_series = d.iloc[:, j]
    if i != j:
        ax.scatter(x_series.values, d.iloc[:, i].values, s=5, alpha=0.5)
    else:
        ax.hist(x_series.dropna().values, bins=30)

    # Feature names only on the outer edge; inner tick labels are blanked.
    on_bottom_row = i == n - 1
    on_left_column = j == 0
    if on_bottom_row:
        ax.set_xlabel(feat_for_pairs[j])
    else:
        ax.set_xticklabels([])
    if on_left_column:
        ax.set_ylabel(feat_for_pairs[i])
    else:
        ax.set_yticklabels([])

plt.tight_layout()
plt.show()
InĀ [17]:
# Test-set predictions of Model 2 against the observed target, with the
# identity line spanning the joint range of both for reference.
plt.figure(figsize=(6, 6))
plt.scatter(y_test, pred_te_m2, s=8, alpha=0.6)

mn = min(y_test.min(), pred_te_m2.min())
mx = max(y_test.max(), pred_te_m2.max())
plt.plot([mn, mx], [mn, mx])

plt.gca().set(
    xlabel="Actual",
    ylabel="Predicted (Model 2)",
    title="Actual vs Predicted",
)
plt.tight_layout()
plt.show()
InĀ [18]:
# Distribution of Model 2 residuals (observed minus predicted) on the test rows.
resid = np.subtract(y_test, pred_te_m2)

plt.figure(figsize=(8, 4))
plt.hist(resid, bins=40)
plt.gca().set_title("Residuals (Model 2, test)")
plt.tight_layout()
plt.show()