import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Plot defaults shared by every figure created below.
plt.rcParams['figure.figsize'] = (8, 5)
plt.rcParams['axes.grid'] = True

# Seed NumPy's global RNG so the shuffle-based train/test split is reproducible.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Path is relative to the current working directory.
titanic_path = "train.csv"
df_titanic = pd.read_csv(titanic_path)

print("Первые строки титаника:")
print(df_titanic.head())

print("\nИнформация о датасете Titanic:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
df_titanic.info()

print("\nОписательная статистика числовых признаков:")
print(df_titanic.describe())

Первые строки титаника:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  

Информация о датасете Titanic:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None

Описательная статистика числовых признаков:
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200

# Target distribution. value_counts() orders by frequency, not by label, so
# sort by label first: the tick labels below are attached by bar position and
# would be swapped if class 1 ever became the majority class.
plt.figure()
df_titanic['Survived'].value_counts().sort_index().plot(kind='bar')
plt.xticks([0, 1], ['0 - погиб', '1 - выжил'], rotation=0)
plt.ylabel("Количество пассажиров")
plt.title("Распределение целевой переменной Survived")
plt.show()

# Counts per Sex value (bars in frequency order, labelled by pandas).
plt.figure()
df_titanic['Sex'].value_counts().plot(kind='bar')
plt.title("Распределение по полу (Sex)")
plt.show()

# Counts per Pclass value, ordered by class number.
plt.figure()
df_titanic['Pclass'].value_counts().sort_index().plot(kind='bar')
plt.title("Распределение по классу билета (Pclass)")
plt.show()

# Histogram of Age over 30 bins.
plt.figure()
df_titanic['Age'].hist(bins=30)
plt.xlabel("Age")
plt.ylabel("Количество")
plt.title("Распределение возраста (Age)")
plt.show()

# Box plot of Fare.
plt.figure()
df_titanic['Fare'].plot(kind='box')
plt.title("Boxplot тарифов (Fare)")
plt.show()

# Work on a copy so the raw frame loaded above is left unmodified.
df = df_titanic.copy()

# Impute missing values: column median for Age and Fare, most frequent value
# for Embarked.
df = df.fillna({
    'Age': df['Age'].median(),
    'Fare': df['Fare'].median(),
    'Embarked': df['Embarked'].mode()[0],
})

# Binary-encode Sex: 1 where the value is 'male', 0 otherwise.
df['Sex'] = df['Sex'].eq('male').astype(int)

# One-hot encode Embarked and append the indicator columns to df.
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='Embarked')
df = pd.concat((df, embarked_dummies), axis='columns')

# Engineered feature: SibSp + Parch + 1.
df['FamilySize'] = df[['SibSp', 'Parch']].sum(axis=1) + 1

# Feature columns fed to the logistic regression.
# NOTE(review): FamilySize is an exact linear combination of SibSp, Parch and a
# constant, and the Embarked indicators sum to 1 on every row, so together with
# the model's intercept column this design matrix is rank-deficient.
feature_cols_logreg = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize',
    *embarked_dummies.columns,
]

print("Признаки, используемые в логистической регрессии:")
print(feature_cols_logreg)

X_log = df[feature_cols_logreg].to_numpy(dtype=float, copy=True)
y_log = df['Survived'].to_numpy(dtype=int, copy=True)

Признаки, используемые в логистической регрессии:
['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'Embarked_C', 'Embarked_Q', 'Embarked_S']

# Shuffle the row positions in place with NumPy's global RNG, then cut the
# shuffled order into the first 80 % (train) and the remainder (test).
indices = np.arange(X_log.shape[0])
np.random.shuffle(indices)

train_size = int(0.8 * X_log.shape[0])
train_idx, test_idx = indices[:train_size], indices[train_size:]

X_train_log, y_train_log = X_log[train_idx], y_log[train_idx]
X_test_log, y_test_log = X_log[test_idx], y_log[test_idx]

print(f"\nРазмер обучающей выборки (Titanic): {X_train_log.shape}, тестовой: {X_test_log.shape}")

Размер обучающей выборки (Titanic): (712, 10), тестовой: (179, 10)

class LogisticRegressionScratch:
    """Binary logistic regression fitted by gradient descent or Newton's method.

    A column of ones is prepended to the features internally, so
    ``weights[0]`` is the intercept and ``weights[1:]`` are the per-feature
    coefficients.
    """

    _METHODS = ('gd', 'newton')

    def __init__(self, learning_rate=0.01, num_iter=1000, method='gd', verbose=False):
        """
        learning_rate: step size; only used by gradient descent
        num_iter: number of optimisation iterations
        method: 'gd' for gradient descent, 'newton' for Newton's method
        verbose: when true, print the loss roughly ten times during fit()
        """
        self.learning_rate = learning_rate
        self.num_iter = num_iter
        self.method = method
        self.verbose = verbose
        self.weights = None  # includes the bias term; set by fit()
        self.loss_history = []

    def _sigmoid(self, z):
        """Return 1 / (1 + exp(-z)) element-wise without overflowing."""
        z = np.asarray(z, dtype=float)
        # float64 exp() overflows for arguments above ~709.78. Flooring z keeps
        # the exponent finite; the result is identical everywhere the naive
        # formula did not overflow and is ~1e-308 instead of 0 where it did.
        return 1 / (1 + np.exp(-np.maximum(z, -709.0)))

    def _add_bias(self, X):
        """Return X as a float array with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        m = X.shape[0]
        return np.hstack([np.ones((m, 1)), X])

    def _compute_loss(self, y, h):
        """Return the mean log loss of probabilities h against labels y."""
        # Clip away exact 0 and 1 so that log() stays finite.
        eps = 1e-15
        h_clipped = np.clip(h, eps, 1 - eps)
        loss = -np.mean(y * np.log(h_clipped) + (1 - y) * np.log(1 - h_clipped))
        return loss

    def _newton_step(self, Xb, h, gradient):
        """Return the Newton update H^{-1} @ gradient for the current fit."""
        m, n = Xb.shape
        # r holds the diagonal of R, so H = Xb^T R Xb / m without building R.
        r = h * (1 - h)
        H = (Xb.T @ (Xb * r[:, np.newaxis])) / m

        # A small ridge on the diagonal keeps the system solvable when the
        # Hessian is (near-)singular.
        reg = 1e-6
        H_reg = H + reg * np.eye(n)

        try:
            return np.linalg.solve(H_reg, gradient)
        except np.linalg.LinAlgError:
            # Fall back to the pseudo-inverse if the solve still fails.
            return np.linalg.pinv(H_reg) @ gradient

    def fit(self, X, y):
        """Fit the weights to features X and 0/1 labels y.

        Every call starts from zero weights and an empty ``loss_history``, so
        refitting the same instance does not append to a previous run's curve.
        The loss stored for iteration i is the loss *before* that iteration's
        update.

        Raises:
            ValueError: if ``method`` is not 'gd' or 'newton', or if X and y
                do not have the same number of samples.
        """
        # Reject an unknown method before doing any work.
        if self.method not in self._METHODS:
            raise ValueError("method должен быть 'gd' или 'newton'")

        Xb = self._add_bias(X)
        # Flatten y: a column vector would otherwise broadcast (h - y) to (m, m).
        y = np.asarray(y, dtype=float).ravel()
        m, n = Xb.shape
        if y.shape[0] != m:
            raise ValueError("X и y должны содержать одинаковое число объектов")

        # Weights start at zero; history is reset for this run.
        self.weights = np.zeros(n)
        self.loss_history = []

        use_newton = self.method == 'newton'
        log_prefix = "Newton iter" if use_newton else "Iter"
        log_every = max(1, self.num_iter // 10)

        for i in range(self.num_iter):
            h = self._sigmoid(Xb @ self.weights)
            gradient = (Xb.T @ (h - y)) / m

            if use_newton:
                # Newton's method takes the full step; learning_rate is unused.
                self.weights -= self._newton_step(Xb, h, gradient)
            else:
                self.weights -= self.learning_rate * gradient

            loss = self._compute_loss(y, h)
            self.loss_history.append(loss)

            if self.verbose and i % log_every == 0:
                print(f"{log_prefix} {i}, loss = {loss:.4f}")

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for each row of X.

        Raises:
            RuntimeError: if called before fit().
        """
        if self.weights is None:
            raise RuntimeError("Модель не обучена: сначала вызовите fit()")
        Xb = self._add_bias(X)
        z = Xb @ self.weights
        return self._sigmoid(z)

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the predicted probability >= threshold."""
        proba = self.predict_proba(X)
        return (proba >= threshold).astype(int)

# Binary classification metrics, including F1.
def classification_metrics_with_f1(y_true, y_pred):
    """Compute accuracy, precision, recall, F1 and the confusion counts.

    Both inputs are flattened before comparison, so an (n, 1) column and an
    (n,) vector are compared element by element instead of being broadcast
    against each other. Only the labels 0 and 1 are counted.

    Args:
        y_true: array-like of true 0/1 labels.
        y_pred: array-like of predicted 0/1 labels, same number of elements.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, TP, TN, FP, FN)``. Any ratio
        whose denominator is zero (including accuracy on empty input) is 0.0.

    Raises:
        ValueError: if the inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true и y_pred должны быть одинаковой длины")

    # Plain Python ints keep the derived ratios plain Python floats.
    TP = int(np.sum((y_true == 1) & (y_pred == 1)))
    TN = int(np.sum((y_true == 0) & (y_pred == 0)))
    FP = int(np.sum((y_true == 0) & (y_pred == 1)))
    FN = int(np.sum((y_true == 1) & (y_pred == 0)))

    total = TP + TN + FP + FN
    accuracy = (TP + TN) / total if total > 0 else 0.0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0

    return accuracy, precision, recall, f1, TP, TN, FP, FN

# Baseline: gradient descent with learning_rate=0.01 for 1000 iterations.
base_model = LogisticRegressionScratch(learning_rate=0.01, num_iter=1000, method='gd', verbose=False)
base_model.fit(X_train_log, y_train_log)

# Score the baseline on the held-out split.
y_pred_base = base_model.predict(X_test_log)
acc, prec, rec, f1, TP, TN, FP, FN = classification_metrics_with_f1(y_test_log, y_pred_base)

# One report line per metric, followed by the confusion counts.
print(
    "=== Базовая модель логистической регрессии (GD) ===",
    f"Accuracy:  {acc:.4f}",
    f"Precision: {prec:.4f}",
    f"Recall:    {rec:.4f}",
    f"F1-score:  {f1:.4f}",
    f"TP={TP}, TN={TN}, FP={FP}, FN={FN}",
    sep="\n",
)

=== Базовая модель логистической регрессии (GD) ===
Accuracy:  0.6592
Precision: 0.6667
Recall:    0.1231
F1-score:  0.2078
TP=8, TN=110, FP=4, FN=57

# Training log-loss per iteration for the baseline model.
plt.figure()
plt.plot(base_model.loss_history)
plt.gca().set(
    xlabel="Итерация",
    ylabel="Log-loss",
    title="Сходимость градиентного спуска (базовая модель)",
)
plt.show()

# Hyperparameter grid: every (method, learning_rate, num_iter) combination.
learning_rates = [0.001, 0.01, 0.1]
num_iters_list = [100, 300, 1000]
methods = ['gd', 'newton']

results = []

for method in methods:
    for lr in learning_rates:
        for n_iter in num_iters_list:
            # The same grid is swept for both optimizers for illustration,
            # although Newton's method usually needs far fewer iterations.
            # NOTE(review): the Newton update never reads learning_rate, so
            # its rows repeat across the three learning rates.
            model = LogisticRegressionScratch(
                learning_rate=lr, num_iter=n_iter, method=method, verbose=False
            )
            model.fit(X_train_log, y_train_log)
            y_pred = model.predict(X_test_log)

            acc, prec, rec, f1, TP, TN, FP, FN = classification_metrics_with_f1(y_test_log, y_pred)

            # One row per configuration; the confusion counts are not stored.
            results.append(dict(
                method=method,
                learning_rate=lr,
                num_iter=n_iter,
                accuracy=acc,
                precision=prec,
                recall=rec,
                f1=f1,
            ))

results_df = pd.DataFrame(results)
print("Результаты перебора гиперпараметров (первые строки):")
print(results_df.head())

Результаты перебора гиперпараметров (первые строки):
  method  learning_rate  num_iter  accuracy  precision    recall        f1
0     gd          0.001       100  0.648045   0.531250  0.261538  0.350515
1     gd          0.001       300  0.675978   0.629630  0.261538  0.369565
2     gd          0.001      1000  0.664804   0.608696  0.215385  0.318182
3     gd          0.010       100  0.636872   0.000000  0.000000  0.000000
4     gd          0.010       300  0.653631   0.636364  0.107692  0.184211

# Rank all configurations by F1, best first, and show the top ten.
results_sorted = results_df.sort_values('f1', ascending=False)
print("\nТоп-10 комбинаций гиперпараметров по F1-score:")
print(results_sorted.head(n=10))

Топ-10 комбинаций гиперпараметров по F1-score:
    method  learning_rate  num_iter  accuracy  precision    recall        f1
16  newton          0.100       300  0.776536   0.686567  0.707692  0.696970
9   newton          0.001       100  0.776536   0.686567  0.707692  0.696970
17  newton          0.100      1000  0.776536   0.686567  0.707692  0.696970
12  newton          0.010       100  0.776536   0.686567  0.707692  0.696970
15  newton          0.100       100  0.776536   0.686567  0.707692  0.696970
14  newton          0.010      1000  0.776536   0.686567  0.707692  0.696970
13  newton          0.010       300  0.776536   0.686567  0.707692  0.696970
10  newton          0.001       300  0.776536   0.686567  0.707692  0.696970
11  newton          0.001      1000  0.776536   0.686567  0.707692  0.696970
7       gd          0.100       300  0.648045   0.510638  0.738462  0.603774

# 1) Accuracy versus learning_rate for each method, restricted to num_iter=1000.
subset_1000 = results_df[results_df['num_iter'].eq(1000)]

plt.figure()
for method in methods:
    sub = subset_1000[subset_1000['method'].eq(method)]
    plt.plot(sub['learning_rate'], sub['accuracy'], marker='o', label=f"{method} - accuracy")
plt.xscale('log')
plt.gca().set(
    xlabel="learning_rate (log scale)",
    ylabel="Accuracy",
    title="Влияние learning_rate (num_iter=1000)",
)
plt.legend()
plt.show()

# 2) Accuracy versus num_iter for each method, restricted to learning_rate=0.01.
subset_lr = results_df[results_df['learning_rate'].eq(0.01)]

plt.figure()
for method in methods:
    sub = subset_lr[subset_lr['method'].eq(method)]
    plt.plot(sub['num_iter'], sub['accuracy'], marker='o', label=f"{method} - accuracy")
plt.gca().set(
    xlabel="num_iter",
    ylabel="Accuracy",
    title="Влияние количества итераций при learning_rate=0.01",
)
plt.legend()
plt.show()

# 3) Best F1 per method: idxmax picks one row label per method group.
best_by_method = results_df.loc[results_df.groupby('method')['f1'].idxmax()]

plt.figure()
plt.bar(best_by_method['method'], best_by_method['f1'])
plt.gca().set(
    xlabel="Метод оптимизации",
    ylabel="Лучший F1-score",
    title="Сравнение методов (GD vs Newton) по лучшему F1-score",
)
plt.show()

print("\nЛучшие настройки по каждому методу (по F1-score):")
print(best_by_method)

Лучшие настройки по каждому методу (по F1-score):
   method  learning_rate  num_iter  accuracy  precision    recall        f1
7      gd          0.100       300  0.648045   0.510638  0.738462  0.603774
9  newton          0.001       100  0.776536   0.686567  0.707692  0.696970