import os
import numpy as np
import pandas as pd

# Random generator with a fixed seed (42), available to the rest of the script.
rng = np.random.RandomState(42)

# Load the raw table; the path is relative to the current working directory.
df = pd.read_csv("DATA.csv")

# Notebook-style display of the frame; a bare expression has no effect in a script.
df

# NOTE: this is an alias, not a copy - changes to df_students are visible via df.
df_students = df

# Remove leading/trailing whitespace from header names before column lookups.
df_students.columns = [c.strip() for c in df_students.columns]

print("Первые строки датасета:")
print(df_students.head())

print("\nИнформация о датасете:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
df_students.info()

print("\nРаспределение оценок (GRADE):")
print(df_students['GRADE'].value_counts().sort_index())

Первые строки датасета:
  STUDENT ID  1  2  3  4  5  6  7  8  9  ...  23  24  25  26  27  28  29  30  \
0   STUDENT1  2  2  3  3  1  2  2  1  1  ...   1   1   3   2   1   2   1   1   
1   STUDENT2  2  2  3  3  1  2  2  1  1  ...   1   1   3   2   3   2   2   3   
2   STUDENT3  2  2  2  3  2  2  2  2  4  ...   1   1   2   2   1   1   2   2   
3   STUDENT4  1  1  1  3  1  2  1  2  1  ...   1   2   3   2   2   1   3   2   
4   STUDENT5  2  2  1  3  2  2  1  3  1  ...   2   1   2   2   2   1   2   2   

   COURSE ID  GRADE  
0          1      1  
1          1      1  
2          1      1  
3          1      1  
4          1      1  

[5 rows x 33 columns]

Информация о датасете:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145 entries, 0 to 144
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   STUDENT ID  145 non-null    object
 1   1           145 non-null    int64 
 2   2           145 non-null    int64 
 3   3           145 non-null    int64 
 4   4           145 non-null    int64 
 5   5           145 non-null    int64 
 6   6           145 non-null    int64 
 7   7           145 non-null    int64 
 8   8           145 non-null    int64 
 9   9           145 non-null    int64 
 10  10          145 non-null    int64 
 11  11          145 non-null    int64 
 12  12          145 non-null    int64 
 13  13          145 non-null    int64 
 14  14          145 non-null    int64 
 15  15          145 non-null    int64 
 16  16          145 non-null    int64 
 17  17          145 non-null    int64 
 18  18          145 non-null    int64 
 19  19          145 non-null    int64 
 20  20          145 non-null    int64 
 21  21          145 non-null    int64 
 22  22          145 non-null    int64 
 23  23          145 non-null    int64 
 24  24          145 non-null    int64 
 25  25          145 non-null    int64 
 26  26          145 non-null    int64 
 27  27          145 non-null    int64 
 28  28          145 non-null    int64 
 29  29          145 non-null    int64 
 30  30          145 non-null    int64 
 31  COURSE ID   145 non-null    int64 
 32  GRADE       145 non-null    int64 
dtypes: int64(32), object(1)
memory usage: 37.5+ KB
None

Распределение оценок (GRADE):
GRADE
0     8
1    35
2    24
3    21
4    10
5    17
6    13
7    17
Name: count, dtype: int64

# Bar chart: number of students per final grade code, ordered by grade code.
grade_counts = df_students['GRADE'].value_counts().sort_index()

plt.figure()
grade_counts.plot(kind='bar')
plt.title("Распределение итоговых оценок (GRADE)")
plt.xlabel("GRADE (0=Fail, 7=AA)")
plt.ylabel("Количество студентов")
plt.show()

# Binary target: 1 when the final grade code is 4 or higher, otherwise 0.
df_students['Success'] = (df_students['GRADE'] >= 4).astype(int)

print("Распределение по Success (0=неуспешный, 1=успешный):")
print(df_students['Success'].value_counts())

plt.figure()
# value_counts() orders rows by frequency, not by label. Reindexing on [0, 1]
# pins bar 0 to class 0 and bar 1 to class 1 (and keeps both bars even if one
# class is absent), which is what the hard-coded tick labels below assume.
df_students['Success'].value_counts().reindex([0, 1], fill_value=0).plot(kind='bar')
plt.xticks([0, 1], ['0 - неуспешный', '1 - успешный'], rotation=0)
plt.ylabel("Количество студентов")
plt.title("Распределение успешности студентов")
plt.show()

Распределение по Success (0=неуспешный, 1=успешный):
Success
0    88
1    57
Name: count, dtype: int64

# Columns that must never be used as model inputs: identifiers and the targets.
# The student identifier column is named 'STUDENT ID' in this dataset, so the
# bare 'ID' entry alone never matched and the identifier leaked into the
# feature list; 'ID' is kept for files that use the short name.
exclude_cols = ['ID', 'STUDENT ID', 'COURSE ID', 'GRADE', 'Success']
feature_cols = [c for c in df_students.columns if c not in exclude_cols]

n_features = len(feature_cols)
print(f"Всего признаков: {n_features}")
print("Все признаки:", feature_cols)

Всего признаков: 31
Все признаки: ['STUDENT ID', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30']

# Number of features to sample: square root of the feature count, rounded up.
sqrt_n = int(np.ceil(np.sqrt(n_features)))

# Draw from the seeded generator `rng` created at the top of the script rather
# than the global NumPy state, so the same features are picked on every run.
# .tolist() turns the NumPy string scalars into plain Python strings.
selected_features = rng.choice(feature_cols, size=sqrt_n, replace=False).tolist()

print(f"\nВыбрано sqrt(n) признаков (ceil): {sqrt_n}")
print("Случайно выбранные признаки:", selected_features)

Выбрано sqrt(n) признаков (ceil): 6
Случайно выбранные признаки: [np.str_('27'), np.str_('15'), np.str_('23'), np.str_('17'), np.str_('8'), np.str_('9')]

# Purely illustrative chart: one unit-height bar per sampled feature, with the
# feature names as tick labels and the (meaningless) y axis hidden.
n_selected = len(selected_features)
bar_positions = range(n_selected)

plt.figure()
plt.bar(bar_positions, np.ones(n_selected))
plt.xticks(bar_positions, selected_features, rotation=90)
plt.yticks([])
plt.title("Случайно выбранные признаки для дерева решений")
plt.tight_layout()
plt.show()

# Feature matrix (only the sampled columns) and the binary target vector.
X = df_students[selected_features].values
y = df_students['Success'].values

# Random row order taken from the seeded generator `rng` (defined at the top of
# the script) instead of the global NumPy state, so the split is reproducible.
indices = rng.permutation(len(X))

# First 80% of the shuffled rows go to training, the remainder to testing.
train_size = int(0.8 * len(X))
train_idx = indices[:train_size]
test_idx = indices[train_size:]

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

print(f"\nРазмер обучающей выборки: {X_train.shape}, тестовой: {X_test.shape}")

Размер обучающей выборки: (116, 6), тестовой: (29, 6)

from collections import Counter

class TreeNode:
    """A single node of a multiway decision tree.

    The constructor stores its arguments unchanged as attributes:

    * ``is_leaf`` - True when the node is a leaf.
    * ``prediction`` - predicted class (0/1) for a leaf.
    * ``probability`` - probability of class 1 in a leaf (used for ROC/PR).
    * ``feature_index`` - index of the feature (column of X) the node splits on.
    * ``children`` - mapping ``feature value -> TreeNode``.
    """

    def __init__(self, is_leaf=False, prediction=None, probability=None,
                 feature_index=None, children=None):
        if children is None:
            # A fresh dict per node, so nodes never share one children mapping.
            children = {}

        self.is_leaf = is_leaf
        self.prediction = prediction
        self.probability = probability
        self.feature_index = feature_index
        self.children = children

def gini_impurity(y):
    """Return the Gini impurity of the label collection ``y``.

    The value is 1 minus the sum of squared class frequencies. An empty
    ``y`` yields 0.0.
    """
    total = len(y)
    if total == 0:
        return 0.0

    impurity = 1.0
    # Subtract the squared share of each class, one class at a time.
    for count in Counter(y).values():
        share = count / total
        impurity -= share ** 2
    return impurity

def best_split(X, y, feature_indices):
    """Pick the feature whose multiway split reduces Gini impurity the most.

    Each candidate feature is split into one branch per distinct value (the
    tree is not binary). The gain of a feature is the impurity of ``y`` minus
    the size-weighted impurity of its branches.

    Returns ``(feature, gain)``. ``feature`` is None (and gain 0.0) when no
    candidate yields a strictly positive gain.
    """
    n_samples = len(y)
    parent_impurity = gini_impurity(y)

    top_feature, top_gain = None, 0.0

    for f in feature_indices:
        column = X[:, f]
        distinct = np.unique(column)
        if len(distinct) == 1:
            # A constant feature cannot separate anything.
            continue

        # Impurity after the split: each branch weighted by its sample share.
        branches_impurity = 0.0
        for value in distinct:
            branch_labels = y[column == value]
            share = len(branch_labels) / n_samples
            branches_impurity += share * gini_impurity(branch_labels)

        gain = parent_impurity - branches_impurity
        if gain > top_gain:
            top_feature, top_gain = f, gain

    return top_feature, top_gain

def _node_stats(y):
    """Return ``(majority class as int, share of labels equal to 1)`` for ``y``."""
    counts = Counter(y)
    majority_class = counts.most_common(1)[0][0]
    return int(majority_class), counts.get(1, 0) / len(y)


def build_tree(X, y, feature_indices, depth=0, max_depth=None, min_samples_split=2):
    """Recursively grow a multiway decision tree and return its root TreeNode.

    Parameters:
        X: 2-D array of feature values, indexed as ``X[:, feature]``.
        y: 1-D array of class labels aligned with the rows of ``X``; the
            stored probability is the share of labels equal to 1.
        feature_indices: column indices of ``X`` that may be used for splits.
            The same set is passed unchanged to every child.
        depth: depth of the node being built (0 for the root).
        max_depth: if not None, nodes at this depth become leaves.
        min_samples_split: nodes with fewer samples than this become leaves.

    A node becomes a leaf when its labels are all equal, the depth limit is
    reached, it has too few samples, there are no candidate features, or
    ``best_split`` finds no split with positive gain. Otherwise one child is
    built per distinct value of the chosen feature.

    Every node, internal ones included, records the majority class and the
    class-1 share of its own samples in ``prediction`` / ``probability``.

    Raises:
        ValueError: if ``y`` is empty.
    """
    if len(y) == 0:
        raise ValueError("build_tree() requires at least one training sample")

    prediction, probability = _node_stats(y)

    is_pure = len(np.unique(y)) == 1
    depth_reached = max_depth is not None and depth >= max_depth
    cannot_split = len(y) < min_samples_split or len(feature_indices) == 0
    if is_pure or depth_reached or cannot_split:
        return TreeNode(is_leaf=True, prediction=prediction, probability=probability)

    best_feature, best_gain = best_split(X, y, feature_indices)
    if best_feature is None or best_gain == 0:
        # No candidate feature gives an informative split.
        return TreeNode(is_leaf=True, prediction=prediction, probability=probability)

    node = TreeNode(
        is_leaf=False,
        prediction=prediction,
        probability=probability,
        feature_index=best_feature,
        children={}
    )

    # One branch per distinct value of the chosen feature. The feature set is
    # not reduced for the children; a feature that is constant inside a child
    # is skipped by best_split anyway.
    column = X[:, best_feature]
    for v in np.unique(column):
        mask = (column == v)
        node.children[v] = build_tree(
            X[mask],
            y[mask],
            feature_indices=feature_indices,
            depth=depth + 1,
            max_depth=max_depth,
            min_samples_split=min_samples_split
        )

    return node

def _unseen_value_fallback(node):
    """Return ``(prediction, probability)`` for an internal node with no matching branch."""
    # TreeNode allows prediction/probability on any node. When the internal
    # node carries them, they are the best available estimate.
    if node.probability is not None:
        prediction = node.prediction
        if prediction is None:
            prediction = int(node.probability >= 0.5)
        return prediction, node.probability

    # Otherwise approximate with the unweighted mean over all leaves below
    # the node, visited in the same order as a recursive depth-first walk.
    leaf_probs = []
    stack = [node]
    while stack:
        current = stack.pop()
        if current.is_leaf:
            leaf_probs.append(current.probability)
        else:
            stack.extend(list(current.children.values())[::-1])

    if not leaf_probs:
        return 0, 0.0
    avg_prob = float(np.mean(leaf_probs))
    return int(round(avg_prob)), avg_prob


def predict_single(node, x):
    """Predict the class and the class-1 probability for one sample.

    Parameters:
        node: root of the (sub)tree to evaluate.
        x: 1-D sequence of feature values, indexed by ``node.feature_index``.

    Returns:
        ``(prediction, probability)`` of the leaf reached by following, at
        every internal node, the child stored under the sample's value of that
        node's feature. If the value has no branch (for example it never
        occurred in training), the result comes from the last node reached:
        its own prediction/probability when set, otherwise the mean
        probability of the leaves below it (``(0, 0.0)`` if there are none).
    """
    while not node.is_leaf:
        child = node.children.get(x[node.feature_index])
        if child is None:
            return _unseen_value_fallback(node)
        node = child
    return node.prediction, node.probability

def predict_tree(node, X):
    """Run ``predict_single`` on every row of ``X``.

    Returns a pair of arrays: the predicted labels and the class-1
    probabilities, in row order.
    """
    outcomes = [predict_single(node, row) for row in X]
    labels = np.array([label for label, _ in outcomes])
    scores = np.array([score for _, score in outcomes])
    return labels, scores

def binary_classification_metrics(y_true, y_pred):
    """Compute accuracy, precision and recall for 0/1 labels.

    Parameters:
        y_true: true labels (array-like of 0/1).
        y_pred: predicted labels (array-like of 0/1), aligned with ``y_true``.

    Returns:
        ``(accuracy, precision, recall, TP, TN, FP, FN)``. Any ratio whose
        denominator is zero is reported as 0.0; this includes accuracy on
        empty input.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Confusion-matrix cells; class 1 is the positive class.
    TP = np.sum((y_true == 1) & (y_pred == 1))
    TN = np.sum((y_true == 0) & (y_pred == 0))
    FP = np.sum((y_true == 0) & (y_pred == 1))
    FN = np.sum((y_true == 1) & (y_pred == 0))

    total = TP + TN + FP + FN
    # Guard every ratio the same way, so empty input gives 0.0 rather than
    # a NaN accompanied by a division warning.
    accuracy = (TP + TN) / total if total > 0 else 0.0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0

    return accuracy, precision, recall, TP, TN, FP, FN

# Every column of the training matrix is a split candidate.
feature_indices = np.arange(X_train.shape[1])

# Grow the tree on the training part; max_depth=None means no depth limit.
tree_root = build_tree(X_train, y_train, feature_indices=feature_indices,
                       max_depth=None, min_samples_split=2)

# Labels and class-1 probabilities for the held-out rows.
y_pred_test, y_prob_test = predict_tree(tree_root, X_test)

(acc, prec, rec,
 TP, TN, FP, FN) = binary_classification_metrics(y_test, y_pred_test)

print("=== Качество на тестовой выборке ===")
print(f"Accuracy:  {acc:.4f}")
print(f"Precision: {prec:.4f}")
print(f"Recall:    {rec:.4f}")
print(f"TP={TP}, TN={TN}, FP={FP}, FN={FN}")

=== Качество на тестовой выборке ===
Accuracy:  0.3793
Precision: 0.3333
Recall:    0.2000
TP=3, TN=8, FP=6, FN=12

# Confusion matrix: rows are true classes (0, 1), columns are predictions.
cm = np.array([[TN, FP], [FN, TP]])

plt.figure()
plt.imshow(cm, cmap='Blues')
plt.colorbar()
plt.title("Матрица ошибок (Confusion Matrix)")
plt.xticks([0, 1], ['Pred 0', 'Pred 1'])
plt.yticks([0, 1], ['True 0', 'True 1'])
# Write each count in the centre of its cell (x is the column, y is the row).
for row_idx, row_counts in enumerate(cm):
    for col_idx, count in enumerate(row_counts):
        plt.text(col_idx, row_idx, str(count), ha='center', va='center', color='black')
plt.tight_layout()
plt.show()

def roc_curve_manual(y_true, y_scores):
    """Compute the points of a ROC curve.

    Parameters:
        y_true: true labels (array-like of 0/1).
        y_scores: scores for class 1 (array-like), aligned with ``y_true``.

    Returns:
        ``(FPR, TPR, thresholds)``. ``thresholds`` holds the distinct scores
        in descending order; a sample counts as predicted positive when its
        score is >= the threshold. ``FPR`` and ``TPR`` contain one point per
        threshold plus the added end points (0, 0) and (1, 1), so they are
        two elements longer than ``thresholds``. A rate whose denominator is
        zero (no positives or no negatives) is reported as 0.0.

    Raises:
        ValueError: if ``y_true`` and ``y_scores`` differ in shape.
    """
    # Accept plain lists as well as arrays.
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)
    if y_true.shape != y_scores.shape:
        raise ValueError("y_true and y_scores must have the same shape")

    # Every distinct score is a candidate threshold, highest first.
    thresholds = np.unique(y_scores)[::-1]

    is_pos = (y_true == 1)
    is_neg = (y_true == 0)
    P = np.count_nonzero(is_pos)
    N = np.count_nonzero(is_neg)

    TPR = []
    FPR = []
    for thr in thresholds:
        predicted_pos = (y_scores >= thr)
        tp = np.count_nonzero(predicted_pos & is_pos)
        fp = np.count_nonzero(predicted_pos & is_neg)
        TPR.append(tp / P if P > 0 else 0.0)
        FPR.append(fp / N if N > 0 else 0.0)

    # Anchor the curve at (0, 0) and (1, 1) so it spans the whole FPR range.
    FPR = np.array([0.0] + FPR + [1.0])
    TPR = np.array([0.0] + TPR + [1.0])

    return FPR, TPR, thresholds

def pr_curve_manual(y_true, y_scores):
    """Compute the points of a precision-recall curve.

    Parameters:
        y_true: true labels (array-like of 0/1).
        y_scores: scores for class 1 (array-like), aligned with ``y_true``.

    Returns:
        ``(Recall, Precision, thresholds)``. ``thresholds`` holds the distinct
        scores in descending order; a sample counts as predicted positive when
        its score is >= the threshold. ``Recall`` and ``Precision`` contain
        one point per threshold, preceded by an added starting point with
        recall 0 and precision equal to the share of positives in ``y_true``,
        so they are one element longer than ``thresholds``. A ratio whose
        denominator is zero is reported as 0.0.

    Raises:
        ValueError: if ``y_true`` and ``y_scores`` differ in shape.
    """
    # Accept plain lists as well as arrays.
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)
    if y_true.shape != y_scores.shape:
        raise ValueError("y_true and y_scores must have the same shape")

    # Every distinct score is a candidate threshold, highest first.
    thresholds = np.unique(y_scores)[::-1]

    is_pos = (y_true == 1)
    is_neg = (y_true == 0)
    P = np.count_nonzero(is_pos)

    PREC = []
    REC = []
    for thr in thresholds:
        predicted_pos = (y_scores >= thr)
        tp = np.count_nonzero(predicted_pos & is_pos)
        fp = np.count_nonzero(predicted_pos & is_neg)
        PREC.append(tp / (tp + fp) if (tp + fp) > 0 else 0.0)
        # TP + FN is the number of positives, i.e. P.
        REC.append(tp / P if P > 0 else 0.0)

    # Starting point at recall 0 with the positive share as precision.
    # NOTE(review): some libraries anchor recall 0 at precision 1.0 instead;
    # this choice affects the area under the curve - confirm it is intended.
    REC = np.array([0.0] + REC)
    baseline_precision = P / len(y_true) if len(y_true) > 0 else 0.0
    PREC = np.array([baseline_precision] + PREC)

    return REC, PREC, thresholds

def auc_trapezoid(x, y):
    """Return the area under the curve ``(x, y)`` using the trapezoid rule.

    Parameters:
        x: abscissas (array-like); they do not have to be sorted.
        y: ordinates (array-like), aligned with ``x``.

    Returns:
        The area as a float; 0.0 for fewer than two points.

    Raises:
        ValueError: if ``x`` and ``y`` differ in shape.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if x.shape != y.shape:
        raise ValueError("x and y must have the same shape")

    # A stable sort keeps points with equal x in their original order. This
    # matters for vertical segments: the y values on either side of a jump
    # determine the neighbouring trapezoids.
    order = np.argsort(x, kind="stable")
    x = x[order]
    y = y[order]

    # Sum of width * mean height over consecutive pairs of points.
    return float(np.sum(np.diff(x) * (y[1:] + y[:-1]) / 2.0))

# Curve points and areas for the held-out predictions.
FPR, TPR, roc_thresholds = roc_curve_manual(y_test, y_prob_test)
REC, PREC, pr_thresholds = pr_curve_manual(y_test, y_prob_test)
auc_roc = auc_trapezoid(FPR, TPR)
auc_pr = auc_trapezoid(REC, PREC)

# ROC curve, with the diagonal of a random classifier for reference.
plt.figure()
plt.plot(FPR, TPR, marker='o', label=f"AUC-ROC = {auc_roc:.3f}")
plt.plot([0, 1], [0, 1], '--', label="Случайный классификатор")
plt.title("ROC-кривая (Decision Tree)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()

# Precision-recall curve.
plt.figure()
plt.plot(REC, PREC, marker='o', label=f"AUC-PR = {auc_pr:.3f}")
plt.title("PR-кривая (Decision Tree)")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()
plt.show()

print(f"AUC-ROC: {auc_roc:.4f}")
print(f"AUC-PR:  {auc_pr:.4f}")

AUC-ROC: 0.4429
AUC-PR:  0.4742

	STUDENT ID	1	2	3	4	5	6	7	8	9	...	23	24	25	26	27	28	29	30	COURSE ID	GRADE
0	STUDENT1	2	2	3	3	1	2	2	1	1	...	1	1	3	2	1	2	1	1	1	1
1	STUDENT2	2	2	3	3	1	2	2	1	1	...	1	1	3	2	3	2	2	3	1	1
2	STUDENT3	2	2	2	3	2	2	2	2	4	...	1	1	2	2	1	1	2	2	1	1
3	STUDENT4	1	1	1	3	1	2	1	2	1	...	1	2	3	2	2	1	3	2	1	1
4	STUDENT5	2	2	1	3	2	2	1	3	1	...	2	1	2	2	2	1	2	2	1	1
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
140	STUDENT141	2	1	2	3	1	1	2	1	1	...	1	1	2	1	2	1	3	3	9	5
141	STUDENT142	1	1	2	4	2	2	2	1	4	...	1	1	3	2	2	1	5	3	9	5
142	STUDENT143	1	1	1	4	2	2	2	1	1	...	1	1	3	3	2	1	4	3	9	1
143	STUDENT144	2	1	2	4	1	1	1	5	2	...	2	1	2	1	2	1	5	3	9	4
144	STUDENT145	1	1	1	5	2	2	2	3	1	...	2	1	3	2	3	1	5	4	9	3