不均衡データ#

不均衡データ(imbalanced data)とは、目的変数の分布が極端に偏っているデータのことで、その偏りが原因で学習や評価に問題が生じる。

例えば目的変数が\(y\in \{0, 1\}\)の二値分類において1が90%あるデータの場合、全てに対して1を予測するだけのアルゴリズムであっても正解率(accuracy)は90%になる。つまり、正解率だけではモデルの良し悪しを判断できない。

クラス同士が十分に分離していてきちんと分類できるデータであれば、不均衡であっても問題なく分類できるが、そうでない場合(クラスが重なり合っている場合)は不均衡の度合いに影響を受けてしまう。

きちんと分類できない領域については、サンプリングや誤差関数への重み付けなどにより均衡データに近づけて学習させる方法がある。

ただし、その場合は均衡データ(例えば二値分類なら50%/50%)として学習しているので、50%をしきい値とする前提の確率値、つまり少数派クラスについて元の分布よりも高い確率値が出力される。そのため、予測確率をキャリブレーション(calibration)して元の分布に合うように戻す必要がある。

Hide code cell source
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def gen_data(scale: float = 1, p_minority: float = 0.1, n: int = 1000, seed: int = 0):
    """Generate a two-feature, two-class toy dataset with a class imbalance.

    The majority class (label 0) is drawn from a normal distribution centred
    at (-1, -1) and the minority class (label 1) from one centred at (1, 1);
    both use the same standard deviation ``scale``.

    Args:
        scale: Standard deviation of both clusters. Larger values make the
            two classes overlap more.
        p_minority: Fraction of the ``n`` samples given label 1. Must lie in
            [0, 1].
        n: Total number of samples to generate.
        seed: Seed passed to ``np.random.seed``. Note that this reseeds
            NumPy's global random state.

    Returns:
        A tuple ``(X, y)``: ``X`` has shape ``(n, 2)`` with all majority rows
        first, and ``y`` has shape ``(n,)`` holding 0.0 / 1.0 labels.

    Raises:
        ValueError: If ``p_minority`` is outside [0, 1].
    """
    if not 0 <= p_minority <= 1:
        raise ValueError(f"p_minority must be in [0, 1], got {p_minority!r}")

    n_minor = int(n * p_minority)
    # Take the remainder rather than int(n * (1 - p_minority)): truncating
    # both products independently can drop a sample (e.g. 28 + 71 == 99 for
    # n=100, p_minority=0.29).
    n_major = n - n_minor

    np.random.seed(seed)
    # Draw the majority cluster first so the random stream is consumed in a
    # fixed order.
    X_major = np.random.normal(loc=(-1, -1), scale=scale, size=(n_major, 2))
    X_minor = np.random.normal(loc=(1, 1), scale=scale, size=(n_minor, 2))

    X = np.concatenate([X_major, X_minor], axis=0)
    y = np.concatenate([np.zeros(shape=(n_major,)), np.ones(shape=(n_minor,))])
    return X, y


def plot_data(X, y):
    """Scatter-plot the first two columns of ``X``, one series per label.

    Rows where ``y == 0`` and rows where ``y == 1`` are drawn as separate,
    semi-transparent series on a new figure, which is then shown.
    """
    fig, ax = plt.subplots()
    for label in (0, 1):
        members = y == label
        ax.scatter(X[members, 0], X[members, 1], label=f"y == {label}", alpha=.7)
    ax.set_xlabel("x1")
    ax.set_ylabel("x2")
    ax.legend()
    fig.show()

上手く分類できないデータの場合#

Hide code cell source
# Wide clusters (scale=2) with 10% of the samples in the minority class.
X, y = gen_data(scale=2, p_minority=0.1)
plot_data(X, y)
../_images/8c7c9e1fcbe50bdf31a4eb59afb2384fff4c582854897767107c005cdc6222e3.png
Hide code cell source
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit a logistic regression with default settings, then predict hard labels
# for the held-out samples.
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Confusion matrix on the held-out set, with rows/columns ordered by the
# classes the model learned.
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=clf.classes_,
)
disp.plot()
plt.show()
../_images/85a60c1b56d1a2d20a8574dcb0a199a31c835e86dc4e9a7bb534effd219bf1cf.png
Hide code cell source
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    recall_score,
    precision_score,
    f1_score,
)

# Metrics to report, in print order; each is called as scorer(y_test, y_pred).
scorers = {
    "accuracy": accuracy_score,
    "balanced_accuracy": balanced_accuracy_score,
    "recall": recall_score,
    "precision": precision_score,
    "f1_score": f1_score,
}
# One "name: value" line per metric, formatted to 3 significant digits.
report = "\n".join(
    f"{name}: {scorer(y_test, y_pred):.3g}" for name, scorer in scorers.items()
)
# The surrounding newlines frame the report with blank lines.
print(f"\n{report}\n")
accuracy: 0.885
balanced_accuracy: 0.557
recall: 0.12
precision: 0.75
f1_score: 0.207
Hide code cell source
# Never referenced by name below; presumably imported for its side effect of
# enabling Japanese text in the plot title -- TODO confirm.
import japanize_matplotlib  # noqa: F401

# Pad each axis range by 10% of the magnitude of its end points. Moving the
# bounds outward by abs(value) widens the range whatever the sign of the
# min/max, whereas multiplying by 1.1 only does so when min < 0 < max.
x1_min, x1_max = X[:, 0].min(), X[:, 0].max()
x2_min, x2_max = X[:, 1].min(), X[:, 1].max()
x1_lim = (x1_min - 0.1 * abs(x1_min), x1_max + 0.1 * abs(x1_max))
x2_lim = (x2_min - 0.1 * abs(x2_min), x2_max + 0.1 * abs(x2_max))

# 1000 x 1000 grid over the padded range (a complex step such as 1000j asks
# np.mgrid for that many points instead of a step size).
X1, X2 = np.mgrid[x1_lim[0]:x1_lim[1]:1000j, x2_lim[0]:x2_lim[1]:1000j]
# Flatten the grid into an (n_points, 2) array, predict a label for every
# point, and fold the labels back into the grid shape for contouring.
X_range = np.column_stack((X1.ravel(), X2.ravel()))
Y = clf.predict(X_range).reshape(X1.shape)

fig, ax = plt.subplots()
# Filled contours of the predicted label show the classifier's decision regions.
ax.contourf(X1, X2, Y, cmap='Paired', alpha=0.4)

# Overlay the samples, one series per true label.
ax.scatter(X[y == 0, 0], X[y == 0, 1], label="y == 0", alpha=0.7)
ax.scatter(X[y == 1, 0], X[y == 1, 1], label="y == 1", alpha=0.7)
ax.set(xlabel="x1", ylabel="x2", title="不均衡データで、うまく分類できていない場合")
ax.legend()
fig.show()
../_images/a45cc0a895434299644fa2a4809b120213c0162a73f4d5ff4740a1ec8f1f1753.png

分類しやすいデータの場合#

Hide code cell source
# Tight clusters (scale=0.6) with 10% of the samples in the minority class.
X, y = gen_data(scale=0.6, p_minority=0.1)
plot_data(X, y)
../_images/f4b717ae21a8159b48ad690e840e0bf2eea4d7144c42dad1e5f3bc577f2939d5.png
Hide code cell source
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit a logistic regression with default settings, then predict hard labels
# for the held-out samples.
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Confusion matrix on the held-out set, with rows/columns ordered by the
# classes the model learned.
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=clf.classes_,
)
disp.plot()
plt.show()
../_images/3282e5db34b2a5e0cb528283fac1efa160329ecf8a80b5dc9d2e9e415e68d301.png
Hide code cell source
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    recall_score,
    precision_score,
    f1_score,
)

# Metrics to report, in print order; each is called as scorer(y_test, y_pred).
scorers = {
    "accuracy": accuracy_score,
    "balanced_accuracy": balanced_accuracy_score,
    "recall": recall_score,
    "precision": precision_score,
    "f1_score": f1_score,
}
# One "name: value" line per metric, formatted to 3 significant digits.
report = "\n".join(
    f"{name}: {scorer(y_test, y_pred):.3g}" for name, scorer in scorers.items()
)
# The surrounding newlines frame the report with blank lines.
print(f"\n{report}\n")
accuracy: 1
balanced_accuracy: 1
recall: 1
precision: 1
f1_score: 1
Hide code cell source
# Never referenced by name below; presumably imported for its side effect of
# enabling Japanese text in the plot title -- TODO confirm.
import japanize_matplotlib  # noqa: F401

# Pad each axis range by 10% of the magnitude of its end points. Moving the
# bounds outward by abs(value) widens the range whatever the sign of the
# min/max, whereas multiplying by 1.1 only does so when min < 0 < max.
x1_min, x1_max = X[:, 0].min(), X[:, 0].max()
x2_min, x2_max = X[:, 1].min(), X[:, 1].max()
x1_lim = (x1_min - 0.1 * abs(x1_min), x1_max + 0.1 * abs(x1_max))
x2_lim = (x2_min - 0.1 * abs(x2_min), x2_max + 0.1 * abs(x2_max))

# 1000 x 1000 grid over the padded range (a complex step such as 1000j asks
# np.mgrid for that many points instead of a step size).
X1, X2 = np.mgrid[x1_lim[0]:x1_lim[1]:1000j, x2_lim[0]:x2_lim[1]:1000j]
# Flatten the grid into an (n_points, 2) array, predict a label for every
# point, and fold the labels back into the grid shape for contouring.
X_range = np.column_stack((X1.ravel(), X2.ravel()))
Y = clf.predict(X_range).reshape(X1.shape)

fig, ax = plt.subplots()
# Filled contours of the predicted label show the classifier's decision regions.
ax.contourf(X1, X2, Y, cmap='Paired', alpha=0.4)

# Overlay the samples, one series per true label.
ax.scatter(X[y == 0, 0], X[y == 0, 1], label="y == 0", alpha=0.7)
ax.scatter(X[y == 1, 0], X[y == 1, 1], label="y == 1", alpha=0.7)
ax.set(xlabel="x1", ylabel="x2", title="不均衡データだがうまく分類できている場合")
ax.legend()
fig.show()
../_images/d7a0f5f38c82590257adba209d20bebc2a915c42c4adbb08a1945f85459c6b9f.png

参考#