datatable: memory leak and speed concerns
```python
import numpy as np
import lightgbm_gpu as lgb  # environment-specific GPU build of LightGBM
from sklearn.utils import shuffle
from h2oaicore.metrics import def_rmse
import datatable as dt
def set_dt_col(train_dt, name, value):
    """Overwrite a single column of a datatable Frame with `value` and return the frame."""
    if isinstance(name, int):
        name = train_dt.names[name]
    train_dt[:, name] = dt.Frame(value)
    return train_dt
nrow = 4000
ncol = 5000
X = np.random.randn(nrow, ncol)
y = np.random.randn(nrow)
model = lgb.LGBMRegressor(objective='regression', n_jobs=20)  # 40 very slow
model.fit(X, y)
X_dt = dt.Frame(X)
cols_actual = list(X_dt.names)
do_numpy = False
score_f = def_rmse
preds = model.predict(X)
main_metric = score_f(actual=y, predicted=preds)  # baseline score on the unshuffled data
seed = 1234
def go():
    feature_importances = {}
    for n in range(ncol):
        print(n, flush=True)
        if do_numpy:
            shuf = shuffle(X[:, n].ravel(), random_state=seed)  # seed added for parity with the dt branch
            X_tmp = X  # .copy() -- no copy, so X itself is mutated in place
            X_tmp[:, n] = shuf
            new_preds = model.predict(X_tmp)
            metric = score_f(actual=y, predicted=new_preds)
            col = "C" + str(n)
            feature_importances[col] = main_metric - metric
        else:
            col = cols_actual[n]
            shuf = shuffle(X_dt[:, col].to_numpy().ravel(), random_state=seed)
            # dt.Frame(X_dt) creates a fresh frame every iteration before one column is overwritten
            X_tmp = set_dt_col(dt.Frame(X_dt), col, shuf)
            new_preds = model.predict(X_tmp)
            metric = score_f(actual=y, predicted=new_preds)
            feature_importances[col] = main_metric - metric
    return feature_importances
print(go())
```
Related to permutation variable importance.
If do_numpy = False, so the dt path is used, I see resident memory slowly creep up from about 0.8 GB to 1.6 GB by around n=1800, and to 2.7 GB by n=4000.
If do_numpy = True, so dt is not used at all, resident memory never changes over all n.
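For reference, a minimal way to log the creep per iteration (not part of the original repro; it assumes the optional psutil package is installed) is a small RSS helper:

```python
import os
import psutil  # assumed available; only needed for the memory logging below

def print_rss(tag=""):
    # Resident set size of the current process, in GB.
    rss_gb = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 3
    print(f"{tag} RSS = {rss_gb:.2f} GB", flush=True)
```

Calling print_rss(str(n)) next to the print(n, flush=True) line in go() makes it easy to see whether the growth is steady per iteration or happens in jumps.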
I thought at one point I only saw this with LightGBM and not XGBoost, but I'm not sure.
Unit tests like this numpy-only version by Microsoft suggest LightGBM itself is not leaking: https://github.com/Microsoft/LightGBM/issues/1968
These two cases aren't doing exactly the same thing: the numpy version keeps shuffling the same original X in place, while the dt version, I believe, effectively holds two copies each iteration (the fresh dt.Frame(X_dt) plus the original), although the untouched columns of the original X_dt are never modified. @st-pasha can confirm.
One can add X_tmp = X.copy() to the numpy branch, but that isn't quite a fair comparison: it makes a full copy every iteration, whereas dt should be able to get away with overwriting only a single column.
Perhaps the flaw is in how we are using dt and the frames?
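For what it's worth, here is a minimal sketch of the "only overwrite a single column" idea, reusing X_dt, model, score_f, main_metric, seed, and the set_dt_col helper from above, and restoring each column after scoring so no per-iteration copy of the frame is made. This is only an illustration of the approach, not necessarily the right fix:

```python
def go_single_column():
    """Permutation importance that overwrites one dt column at a time (sketch)."""
    feature_importances = {}
    for col in cols_actual:
        original = X_dt[:, col].to_numpy().ravel().copy()  # keep so the column can be restored
        shuf = shuffle(original, random_state=seed)
        set_dt_col(X_dt, col, shuf)                        # overwrite only this column
        new_preds = model.predict(X_dt)
        metric = score_f(actual=y, predicted=new_preds)
        feature_importances[col] = main_metric - metric
        set_dt_col(X_dt, col, original)                    # put the original column back
    return feature_importances
```

The trade-off is that X_dt is mutated in place during each iteration, so it must not be shared with anything else while this runs.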