Imbalanced data

Imbalanced data refers to the problems that arise when the distribution of the target variable is extremely skewed.

For example, in binary classification with target \(y\in \{0, 1\}\), if 90% of the labels are 1, an algorithm that simply predicts 1 for every sample still achieves 90% accuracy.
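To see this concretely, here is a minimal sketch using scikit-learn's DummyClassifier, which always predicts the most frequent class (the 90/10 split below is constructed purely for illustration):

import numpy as np
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

# 90% of the labels are 1, 10% are 0 (illustrative split)
y = np.array([1] * 900 + [0] * 100)
X = np.zeros((1000, 1))  # the features are irrelevant to this baseline

baseline = DummyClassifier(strategy="most_frequent").fit(X, y)
accuracy_score(y, baseline.predict(X))  # -> 0.9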

If the classes are cleanly separable, the data can be classified well even when imbalanced; if not, performance degrades with the degree of imbalance.

For regions that cannot be classified well, one remedy is to bring the training data closer to balance, for example by resampling or by weighting the loss function.
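As a minimal sketch of the sampling side, the minority class can be oversampled with replacement using plain NumPy (the helper random_oversample is made up here; in practice, imbalanced-learn provides ready-made samplers such as RandomOverSampler and SMOTE):

import numpy as np

def random_oversample(X, y, minority_label=1, random_state=0):
    """Duplicate minority-class rows until both classes have equal counts."""
    rng = np.random.default_rng(random_state)
    minority_idx = np.where(y == minority_label)[0]
    majority_idx = np.where(y != minority_label)[0]
    # draw minority indices with replacement up to the majority count
    resampled = rng.choice(minority_idx, size=len(majority_idx), replace=True)
    idx = np.concatenate([majority_idx, resampled])
    return X[idx], y[idx]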

However, the model is then trained as if the data were balanced (e.g. 50%/50% for binary classification), so the predicted probabilities come out inflated relative to the true base rate, with 50% acting as the implicit threshold; they need to be mapped back to the original scale by probability calibration.
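A sketch of that correction, assuming the model was trained at a 50/50 prior while the true minority rate is known (the name recalibrate and its defaults are illustrative; the formula is the prior-shift correction that follows from Bayes' rule, and scikit-learn's CalibratedClassifierCV is a data-driven alternative):

def recalibrate(p_balanced, pi_true=0.1, pi_train=0.5):
    """Map a probability predicted under prior pi_train back to prior pi_true."""
    # resampling changes the prior odds but not the likelihood ratio,
    # so swapping the prior odds back recovers the original-scale probability
    num = p_balanced * pi_true / pi_train
    den = num + (1 - p_balanced) * (1 - pi_true) / (1 - pi_train)
    return num / den

recalibrate(0.5)  # a "50%" prediction maps back to the 10% base rate -> 0.1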

import matplotlib.pyplot as plt
import numpy as np

def gen_data(scale=1, p_minority=0.1, n=1000):
    """Generate a 2-class 2D dataset with minority rate p_minority."""
    n_minor = int(n * p_minority)
    n_major = int(n * (1 - p_minority))
    np.random.seed(0)
    y = np.append(
        np.zeros(shape=(n_major,)),
        np.ones(shape=(n_minor,)),
    )
    # the classes are centered at (-1, -1) and (1, 1); `scale` controls overlap
    X = np.append(
        np.random.normal(loc=(-1, -1), scale=scale, size=(n_major, 2)),
        np.random.normal(loc=(1, 1), scale=scale, size=(n_minor, 2)),
        axis=0
    )
    return X, y


def plot_data(X, y):
    """Scatter plot of the two classes."""
    fig, ax = plt.subplots()
    ax.scatter(X[y == 0, 0], X[y == 0, 1], label="y == 0", alpha=.7)
    ax.scatter(X[y == 1, 0], X[y == 1, 1], label="y == 1", alpha=.7)
    ax.set(xlabel="x1", ylabel="x2")
    ax.legend()
    plt.show()

When the data cannot be classified well

X, y = gen_data(scale=2, p_minority=0.1)
plot_data(X, y)
(Figure: scatter plot of the generated data; the two classes overlap heavily)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# fit a plain logistic regression as the baseline classifier
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.show()
(Figure: confusion matrix of the test-set predictions)
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, precision_score, f1_score

print(f"""
accuracy: {accuracy_score(y_test, y_pred):.3g}
balanced_accuracy: {balanced_accuracy_score(y_test, y_pred):.3g}
recall: {recall_score(y_test, y_pred):.3g}
precision: {precision_score(y_test, y_pred):.3g}
f1_score: {f1_score(y_test, y_pred):.3g}
""")
accuracy: 0.885
balanced_accuracy: 0.557
recall: 0.12
precision: 0.75
f1_score: 0.207
# plot the decision boundary over a grid covering the data range
x1_lim = (X[:, 0].min() * 1.1, X[:, 0].max() * 1.1)
x2_lim = (X[:, 1].min() * 1.1, X[:, 1].max() * 1.1)

X1, X2 = np.mgrid[x1_lim[0]:x1_lim[1]:1000j, x2_lim[0]:x2_lim[1]:1000j]
X_range = np.append(X1.reshape(-1, 1), X2.reshape(-1, 1), axis=1)
Y = clf.predict(X_range).reshape(X1.shape)

fig, ax = plt.subplots()
ax.contourf(X1, X2, Y, cmap='Paired', alpha=0.4)
ax.scatter(X[y == 0, 0], X[y == 0, 1], label="y == 0", alpha=0.7)
ax.scatter(X[y == 1, 0], X[y == 1, 1], label="y == 1", alpha=0.7)
ax.set(xlabel="x1", ylabel="x2", title="Imbalanced data: not classified well")
ax.legend()
plt.show()
(Figure: decision boundary of the logistic regression over the scatter plot)
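Accuracy looks high (0.885), but recall on the minority class is only 0.12: almost all minority samples fall on the wrong side of the boundary. As a minimal sketch of the loss-weighting remedy mentioned in the introduction, reusing X_train, y_train, X_test, y_test from above (output not shown):

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score

# class_weight="balanced" reweights each class inversely to its frequency,
# which pushes the decision boundary toward the minority class
clf_w = LogisticRegression(class_weight="balanced").fit(X_train, y_train)
print(f"recall: {recall_score(y_test, clf_w.predict(X_test)):.3g}")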

When the data is easy to classify

X, y = gen_data(scale=0.6, p_minority=0.1)
plot_data(X, y)
(Figure: scatter plot of the generated data; the two classes are well separated)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot()
plt.show()
(Figure: confusion matrix of the test-set predictions)
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, precision_score, f1_score

print(f"""
accuracy: {accuracy_score(y_test, y_pred):.3g}
balanced_accuracy: {balanced_accuracy_score(y_test, y_pred):.3g}
recall: {recall_score(y_test, y_pred):.3g}
precision: {precision_score(y_test, y_pred):.3g}
f1_score: {f1_score(y_test, y_pred):.3g}
""")
accuracy: 1
balanced_accuracy: 1
recall: 1
precision: 1
f1_score: 1
# plot the decision boundary over a grid covering the data range
x1_lim = (X[:, 0].min() * 1.1, X[:, 0].max() * 1.1)
x2_lim = (X[:, 1].min() * 1.1, X[:, 1].max() * 1.1)

X1, X2 = np.mgrid[x1_lim[0]:x1_lim[1]:1000j, x2_lim[0]:x2_lim[1]:1000j]
X_range = np.append(X1.reshape(-1, 1), X2.reshape(-1, 1), axis=1)
Y = clf.predict(X_range).reshape(X1.shape)

fig, ax = plt.subplots()
ax.contourf(X1, X2, Y, cmap='Paired', alpha=0.4)
ax.scatter(X[y == 0, 0], X[y == 0, 1], label="y == 0", alpha=0.7)
ax.scatter(X[y == 1, 0], X[y == 1, 1], label="y == 1", alpha=0.7)
ax.set(xlabel="x1", ylabel="x2", title="Imbalanced data: classified well")
ax.legend()
plt.show()
(Figure: decision boundary of the logistic regression over the scatter plot)

References