不均衡データ#

不均衡データ(imbalanced data)とは、目的変数の分布が極端に偏っているデータのこと。この偏りが原因で、学習や評価がうまくいかなくなる問題が起こる。

例えば目的変数が\(y\in \{0, 1\}\)の二値分類において1が90%を占めるデータの場合、全てのサンプルに対して1を予測するだけのアルゴリズムであっても正解率(accuracy)は90%になる。そのため、正解率だけではモデルの良し悪しを判断できない。

クラス同士が明確に分離できるデータであれば不均衡であっても問題なく分類できるが、クラスの分布が重なっていて分離しにくいデータの場合は、不均衡の度合いに影響を受けてしまう。

きちんと分類できない領域については、サンプリングや誤差関数への重み付けなどにより均衡データに近づけて学習させる方法がある。

ただし、その場合は均衡データ(例えば二値分類なら50%/50%)を前提として学習しているので、少数クラスの予測確率が元の分布での割合よりも高めに出てくる。元の分布に即した確率値が必要な場合は、確率をカリブレーション(calibration)して補正する必要がある。

Hide code cell source
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def gen_data(scale=1, p_minority=0.1, n=1000, seed=0):
    """Generate a two-feature, two-class toy dataset with a chosen imbalance.

    Majority samples (label 0) are drawn from a normal distribution centred
    at (-1, -1) and minority samples (label 1) from one centred at (1, 1).
    Both clusters use ``scale`` as their standard deviation.

    Args:
        scale: Standard deviation passed to ``np.random.normal`` for both
            clusters.
        p_minority: Fraction of the ``n`` samples that get label 1. Must lie
            in the closed interval [0, 1].
        n: Total number of samples returned.
        seed: Value handed to ``np.random.seed`` before drawing. Note that
            this reseeds NumPy's global random state. The default of 0
            reproduces the data of earlier versions of this function.

    Returns:
        A tuple ``(X, y)``: ``X`` has shape ``(n, 2)`` and ``y`` has shape
        ``(n,)``. All majority rows (``y == 0``) come first, followed by the
        minority rows (``y == 1``).

    Raises:
        ValueError: If ``p_minority`` is outside [0, 1].
    """
    if not 0 <= p_minority <= 1:
        raise ValueError(f"p_minority must be within [0, 1], got {p_minority!r}")

    n_minor = int(n * p_minority)
    # Derive the majority count by subtraction so the two classes always add
    # up to n. Truncating n * (1 - p_minority) separately can lose a sample
    # to floating-point rounding (n=10, p_minority=0.9 gave 0 instead of 1).
    n_major = n - n_minor

    np.random.seed(seed)
    y = np.concatenate([np.zeros(n_major), np.ones(n_minor)])
    # Draw order (majority first, then minority) is part of reproducibility.
    X = np.concatenate(
        [
            np.random.normal(loc=(-1, -1), scale=scale, size=(n_major, 2)),
            np.random.normal(loc=(1, 1), scale=scale, size=(n_minor, 2)),
        ],
        axis=0,
    )
    return X, y


def plot_data(X, y):
    """Scatter-plot the first two columns of ``X``, one series per class.

    Rows where ``y == 0`` and rows where ``y == 1`` are drawn as separate
    series so that each class gets its own colour and legend entry.

    Args:
        X: Array indexed as ``X[mask, 0]`` / ``X[mask, 1]`` (presumably the
            ``(n, 2)`` feature matrix from ``gen_data``; confirm for other
            callers).
        y: Label array compared element-wise against 0 and 1.
    """
    fig, ax = plt.subplots()
    for label in (0, 1):
        members = y == label
        ax.scatter(X[members, 0], X[members, 1], label=f"y == {label}", alpha=.7)
    ax.set_xlabel("x1")
    ax.set_ylabel("x2")
    ax.legend()
    fig.show()

上手く分類できないデータの場合#

Hide code cell source
# Hard case: a standard deviation of 2 is large relative to the cluster
# centres at (-1, -1) and (1, 1); 10% of the samples are the minority class.
X, y = gen_data(p_minority=0.1, scale=2)
plot_data(X, y)
../_images/0a01f5a3ce27e5320f4bcefe4d83cfd2ee5b144082ee885de97e9f25def20d8c.png
Hide code cell source
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit a logistic regression with default settings, then predict the held-out
# labels.
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Show the confusion matrix, with rows/columns ordered by the fitted classes.
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.show()
../_images/31d2039669f16b6295db5d07e098ffacba0c84e6c75440357019a7f98837a359.png
Hide code cell source
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

# Metrics in the order they are reported; each is called as fn(y_true, y_pred).
metric_fns = {
    "accuracy": accuracy_score,
    "balanced_accuracy": balanced_accuracy_score,
    "recall": recall_score,
    "precision": precision_score,
    "f1_score": f1_score,
}
report_lines = [
    f"{name}: {fn(y_test, y_pred):.3g}" for name, fn in metric_fns.items()
]
# Leading and trailing blank lines match the layout of the printed report.
print("\n" + "\n".join(report_lines) + "\n")
accuracy: 0.885
balanced_accuracy: 0.557
recall: 0.12
precision: 0.75
f1_score: 0.207
Hide code cell source
# Imported only for its side effect (presumably enables Japanese glyphs in
# matplotlib text such as the title below; confirm).
import japanize_matplotlib

# Plot range per feature: the observed min/max, each scaled by 1.1.
x1_lim, x2_lim = [(col.min() * 1.1, col.max() * 1.1) for col in X.T]

# Dense 1000 x 1000 grid over the plot range; classify every grid point.
X1, X2 = np.mgrid[
    x1_lim[0]:x1_lim[1]:1000j,
    x2_lim[0]:x2_lim[1]:1000j,
]
X_range = np.column_stack([X1.ravel(), X2.ravel()])
Y = clf.predict(X_range).reshape(X1.shape)

fig, ax = plt.subplots()
# Shaded background shows the predicted class at each grid point.
ax.contourf(X1, X2, Y, cmap="Paired", alpha=0.4)

# Overlay the actual samples, one series per class.
for cls in (0, 1):
    ax.scatter(X[y == cls, 0], X[y == cls, 1], label=f"y == {cls}", alpha=0.7)
ax.set_xlabel("x1")
ax.set_ylabel("x2")
ax.set_title("不均衡データで、うまく分類できていない場合")
ax.legend()
fig.show()
../_images/f1db398cab840ed70a9f13487673530fa6a2c0d80625e29897e0f42751e55839.png

分類しやすいデータの場合#

Hide code cell source
# Easy case: a standard deviation of 0.6 keeps each cluster tight around its
# centre; the minority class is still 10% of the samples.
X, y = gen_data(p_minority=0.1, scale=0.6)
plot_data(X, y)
../_images/127705497cf5f5391edb63a67071cfd18081e75b6980fc7f60ac715315228ccc.png
Hide code cell source
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit a logistic regression with default settings, then predict the held-out
# labels.
clf = LogisticRegression()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Show the confusion matrix, with rows/columns ordered by the fitted classes.
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.show()
../_images/cd1453624a52b54dda5fa563a0fdd7d7ba363a6df2060557b833714f668dfc2a.png
Hide code cell source
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    precision_score,
    recall_score,
)

# Metrics in the order they are reported; each is called as fn(y_true, y_pred).
metric_fns = {
    "accuracy": accuracy_score,
    "balanced_accuracy": balanced_accuracy_score,
    "recall": recall_score,
    "precision": precision_score,
    "f1_score": f1_score,
}
report_lines = [
    f"{name}: {fn(y_test, y_pred):.3g}" for name, fn in metric_fns.items()
]
# Leading and trailing blank lines match the layout of the printed report.
print("\n" + "\n".join(report_lines) + "\n")
accuracy: 1
balanced_accuracy: 1
recall: 1
precision: 1
f1_score: 1
Hide code cell source
# Imported only for its side effect (presumably enables Japanese glyphs in
# matplotlib text such as the title below; confirm).
import japanize_matplotlib

# Plot range per feature: the observed min/max, each scaled by 1.1.
x1_lim, x2_lim = [(col.min() * 1.1, col.max() * 1.1) for col in X.T]

# Dense 1000 x 1000 grid over the plot range; classify every grid point.
X1, X2 = np.mgrid[
    x1_lim[0]:x1_lim[1]:1000j,
    x2_lim[0]:x2_lim[1]:1000j,
]
X_range = np.column_stack([X1.ravel(), X2.ravel()])
Y = clf.predict(X_range).reshape(X1.shape)

fig, ax = plt.subplots()
# Shaded background shows the predicted class at each grid point.
ax.contourf(X1, X2, Y, cmap="Paired", alpha=0.4)

# Overlay the actual samples, one series per class.
for cls in (0, 1):
    ax.scatter(X[y == cls, 0], X[y == cls, 1], label=f"y == {cls}", alpha=0.7)
ax.set_xlabel("x1")
ax.set_ylabel("x2")
ax.set_title("不均衡データだがうまく分類できている場合")
ax.legend()
fig.show()
../_images/dd945c54e8f29c83af7d93872eb2640061eb3fe75d98aa09179c27c892c680f9.png

参考#