Simulation#
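The data-generating process below follows the partially linear model

\[
Y = D \theta_0 + g_0(X) + U, \qquad D = m_0(X) + V,
\]

with \(m_0(X) = \log X\), \(g_0(X) = X^2\), \(\theta_0 = 5\) by default, \(X \sim \mathrm{Uniform}(1, 10)\), \(V \sim N(0, 2^2)\), and \(U \sim N(0, 3^2)\).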
import numpy as np

def m(X):
    return np.log(X)

def g(X):
    return X ** 2

def gen_data(n=500, seed=0, theta=5):
    np.random.seed(seed)
    V = np.random.normal(size=n, scale=2)
    U = np.random.normal(size=n, scale=3)
    X = np.random.uniform(size=n, low=1, high=10)
    D = m(X) + V
    Y = D * theta + g(X) + U
    X = X.reshape(-1, 1)
    return Y, D, X
Y, D, X = gen_data(seed=0)
import pandas as pd
import seaborn as sns
sns.pairplot(data=pd.DataFrame(dict(Y=Y, D=D, X=X.reshape(-1))))
<seaborn.axisgrid.PairGrid at 0x7fb408e9a7d0>
A hand-rolled imitation of DML1:

1. Split the sample into \(K\) folds.
2. For each \(k = 1, \dots, K\), let \(I_k\) denote the index set of the samples in the \(k\)-th fold. Estimate the nuisance functions on the samples with \(i \notin I_k\): \(\hat{\eta}_{0,k} = \hat{\eta}_0\big((W_i)_{i \notin I_k}\big)\).
3. Estimate \(\theta\) on the samples with \(i \in I_k\), giving \(\hat{\theta}_{0,k}\).
4. Average the \(\hat{\theta}_{0,k}\) over the \(K\) folds.
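Concretely, writing \(\hat{V}_i = D_i - \hat{m}(X_i)\) for the treatment residual and \(Y_i - \hat{\ell}(X_i)\) for the partialled-out outcome (with \(\hat{\ell}\) an estimate of \(E[Y \mid X]\)), the per-fold estimator implemented below is

\[
\hat{\theta}_{0,k}
= \left( \frac{1}{n_k} \sum_{i \in I_k} \hat{V}_i^2 \right)^{-1}
\frac{1}{n_k} \sum_{i \in I_k} \hat{V}_i \big( Y_i - \hat{\ell}(X_i) \big),
\qquad
\tilde{\theta}_0 = \frac{1}{K} \sum_{k=1}^{K} \hat{\theta}_{0,k}.
\]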
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold

def dml1(Y, D, X):
    kf = KFold(n_splits=5)
    thetas = []
    for train_idx, test_idx in kf.split(X):
        # Estimate the nuisance functions on the out-of-fold samples
        m_hat = LGBMRegressor(verbose=-1).fit(X[train_idx], D[train_idx])
        l_hat = LGBMRegressor(verbose=-1).fit(X[train_idx], Y[train_idx])
        # Compute residuals on the held-out fold
        V_hat = D[test_idx] - m_hat.predict(X[test_idx])
        Y_res = Y[test_idx] - l_hat.predict(X[test_idx])
        # Estimate θ on the held-out fold
        theta_hat = np.mean(V_hat * Y_res) / np.mean(V_hat * V_hat)
        thetas.append(theta_hat)
    return np.mean(thetas)
dml1(Y, D, X)
4.996210951148309
DML#
%%time
theta = 5
n_rep = 1000
estimation_errors = []
for i_rep in range(n_rep):
    Y, D, X = gen_data(seed=i_rep, theta=theta)
    theta_hat = dml1(Y, D, X)
    estimation_error = theta - theta_hat
    estimation_errors.append(estimation_error)

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.hist(estimation_errors, bins=30)
ax.set(xlabel=r"$\theta - \hat{\theta}$", title="Estimation Error")
mean_error = np.mean(estimation_errors)
ax.axvline(mean_error, color="darkblue")
ax.text(mean_error, 10, f" mean={mean_error:.2g}")
fig.show()
(The cell was interrupted partway through: a KeyboardInterrupt was raised inside dml1 during an LGBMRegressor.fit call, so the 1000-replication loop never finished.)
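One way to make the full replication loop tractable is to parallelize it across replications. A minimal sketch using joblib (an assumption about the environment; one_rep is a hypothetical helper, not defined above):

from joblib import Parallel, delayed

def one_rep(i_rep, theta=5):
    # Hypothetical helper: one Monte Carlo replication of the DML1 estimator
    Y, D, X = gen_data(seed=i_rep, theta=theta)
    return theta - dml1(Y, D, X)

# Spread the 1000 replications across all available cores
estimation_errors = Parallel(n_jobs=-1)(
    delayed(one_rep)(i_rep) for i_rep in range(1000)
)

Since LightGBM is itself multithreaded, passing n_jobs=1 to the LGBMRegressor calls inside dml1 helps avoid oversubscribing cores while the replications run in parallel.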
Without Cross Fitting#
def estimate_without_cross_fitting(Y, D, X):
    # Estimate the nuisance functions on the full sample
    m_hat = LGBMRegressor(max_depth=4, verbose=-1).fit(X, D)
    l_hat = LGBMRegressor(max_depth=4, verbose=-1).fit(X, Y)
    # Compute in-sample residuals
    V_hat = D - m_hat.predict(X)
    Y_res = Y - l_hat.predict(X)
    # Estimate θ
    theta_hat = np.mean(V_hat * Y_res) / np.mean(V_hat * V_hat)
    return theta_hat
estimate_without_cross_fitting(Y, D, X)
4.8689337039631475
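The estimate drifts below the true \(\theta = 5\): because the nuisance learners are fit and evaluated on the same observations, overfitting contaminates the residuals and biases \(\hat{\theta}\). The replication below shows this across 1000 draws.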
%%time
n_rep = 1000
estimation_errors = []
for i_rep in range(n_rep):
    Y, D, X = gen_data(seed=i_rep, theta=theta)
    theta_hat = estimate_without_cross_fitting(Y, D, X)
    estimation_error = theta - theta_hat
    estimation_errors.append(estimation_error)
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.hist(estimation_errors, bins=30)
ax.set(xlabel=r"$\theta - \hat{\theta}$", title="Estimation Error (without splitting)")
mean_error = np.mean(estimation_errors)
ax.axvline(mean_error, color="darkblue")
ax.text(mean_error, 10, f" mean={mean_error:.2g}")
fig.show()
CPU times: user 2min 39s, sys: 868 ms, total: 2min 40s
Wall time: 20.3 s
Does it match the library?#
theta = 7
Y, D, X = gen_data(seed=0, theta=theta)
theta_hat = dml1(Y, D, X)
print(f"{theta=:.1f}, {theta_hat=:.3f}")
theta=7.0, theta_hat=7.002
df = pd.DataFrame(X, columns=["x"])
df["y"] = Y
df["d"] = Y
df.head()
|   | x | y | d |
|---|---|---|---|
| 0 | 8.397135 | 99.940053 | 99.940053 |
| 1 | 7.304758 | 67.200958 | 67.200958 |
| 2 | 8.947698 | 104.094708 | 104.094708 |
| 3 | 9.699176 | 127.140504 | 127.140504 |
| 4 | 7.972729 | 91.577762 | 91.577762 |
l = LGBMRegressor(verbose=-1, n_estimators=100, max_depth=4)
m = LGBMRegressor(verbose=-1)
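Here l is the learner for the outcome nuisance \(\ell_0(X) = E[Y \mid X]\) and m the learner for the treatment nuisance \(m_0(X) = E[D \mid X]\); they fill the ml_l and ml_m slots of DoubleMLPLR below.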
from doubleml import DoubleMLData
data = DoubleMLData(df, y_col="y", d_cols="d", x_cols=["x"])
from doubleml import DoubleMLPLR
np.random.seed(1)
dml = DoubleMLPLR(data, l, m)
dml.fit()
print(dml)
================== DoubleMLPLR Object ==================
------------------ Data summary ------------------
Outcome variable: y
Treatment variable(s): ['d']
Covariates: ['x']
Instrument variable(s): None
No. Observations: 500
------------------ Score & algorithm ------------------
Score function: partialling out
------------------ Machine learner ------------------
Learner ml_l: LGBMRegressor(max_depth=4, verbose=-1)
Learner ml_m: LGBMRegressor(verbose=-1)
Out-of-sample Performance:
Regression:
Learner ml_l RMSE: [[10.77148372]]
Learner ml_m RMSE: [[10.86900956]]
------------------ Resampling ------------------
No. folds: 5
No. repeated sample splits: 1
------------------ Fit summary ------------------
coef std err t P>|t| 2.5 % 97.5 %
d 0.989565 0.002293 431.535933 0.0 0.985071 0.99406
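As a second check, the same estimator is applied to a higher-dimensional design with 100 standard-normal covariates, of which only the first three enter the model:

\[
d_i = 5 X_{i1} + 5 X_{i2} + 5 X_{i3} + v_i, \qquad
y_i = \theta d_i + 5 X_{i1} + 5 X_{i2} + 5 X_{i3} + u_i,
\]

with \(\theta = 3\) and \(u_i, v_i \sim N(0, 1)\).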
import numpy as np
np.random.seed(3141)
n_obs = 500
n_vars = 100
theta = 3
X = np.random.normal(size=(n_obs, n_vars))
d = np.dot(X[:, :3], np.array([5, 5, 5])) + np.random.standard_normal(size=(n_obs,))
y = theta * d + np.dot(X[:, :3], np.array([5, 5, 5])) + np.random.standard_normal(size=(n_obs,))
dml_data_sim = DoubleMLData.from_arrays(X, y, d)
print(dml_data_sim)
================== DoubleMLData Object ==================
------------------ Data summary ------------------
Outcome variable: y
Treatment variable(s): ['d']
Covariates: ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39', 'X40', 'X41', 'X42', 'X43', 'X44', 'X45', 'X46', 'X47', 'X48', 'X49', 'X50', 'X51', 'X52', 'X53', 'X54', 'X55', 'X56', 'X57', 'X58', 'X59', 'X60', 'X61', 'X62', 'X63', 'X64', 'X65', 'X66', 'X67', 'X68', 'X69', 'X70', 'X71', 'X72', 'X73', 'X74', 'X75', 'X76', 'X77', 'X78', 'X79', 'X80', 'X81', 'X82', 'X83', 'X84', 'X85', 'X86', 'X87', 'X88', 'X89', 'X90', 'X91', 'X92', 'X93', 'X94', 'X95', 'X96', 'X97', 'X98', 'X99', 'X100']
Instrument variable(s): None
No. Observations: 500
------------------ DataFrame info ------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Columns: 102 entries, X1 to d
dtypes: float64(102)
memory usage: 398.6 KB
from doubleml import DoubleMLPLR
np.random.seed(1)
dml = DoubleMLPLR(dml_data_sim, l, m)
dml.fit()
print(dml)
================== DoubleMLPLR Object ==================
------------------ Data summary ------------------
Outcome variable: y
Treatment variable(s): ['d']
Covariates: ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39', 'X40', 'X41', 'X42', 'X43', 'X44', 'X45', 'X46', 'X47', 'X48', 'X49', 'X50', 'X51', 'X52', 'X53', 'X54', 'X55', 'X56', 'X57', 'X58', 'X59', 'X60', 'X61', 'X62', 'X63', 'X64', 'X65', 'X66', 'X67', 'X68', 'X69', 'X70', 'X71', 'X72', 'X73', 'X74', 'X75', 'X76', 'X77', 'X78', 'X79', 'X80', 'X81', 'X82', 'X83', 'X84', 'X85', 'X86', 'X87', 'X88', 'X89', 'X90', 'X91', 'X92', 'X93', 'X94', 'X95', 'X96', 'X97', 'X98', 'X99', 'X100']
Instrument variable(s): None
No. Observations: 500
------------------ Score & algorithm ------------------
Score function: partialling out
------------------ Machine learner ------------------
Learner ml_l: LGBMRegressor(max_depth=4, verbose=-1)
Learner ml_m: LGBMRegressor(verbose=-1)
Out-of-sample Performance:
Regression:
Learner ml_l RMSE: [[8.69278568]]
Learner ml_m RMSE: [[2.22238082]]
------------------ Resampling ------------------
No. folds: 5
No. repeated sample splits: 1
------------------ Fit summary ------------------
coef std err t P>|t| 2.5 % 97.5 %
d 3.48904 0.108446 32.172982 4.214560e-227 3.276489 3.701591