xgboost: Training many models with gpu_hist in Optuna yields ‘parallel_for failed: out of memory’

Hi, I am having an issue with XGBClassifier running out of memory on the GPU, and I tried to work around it by saving the model to disk, deleting the in-memory object, and loading it back in:

# Save the fitted model, drop the in-memory object, then reload it from disk
pickle.dump(self.model, open(f'tmp/model_{uid}.pkl', 'wb'))
del self.model
self.model = pickle.load(open(f'tmp/model_{uid}.pkl', 'rb'))
os.remove(f'tmp/model_{uid}.pkl')
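
For completeness, here is the same round-trip written as a standalone helper with an explicit garbage-collection pass after the del, so the old Booster is actually finalised before the reload (just a sketch; reload_model and its arguments are illustrative names, not part of my actual class):

import gc
import os
import pickle


def reload_model(model, uid):
    """Pickle the fitted model, drop it, force a gc pass, and load it back."""
    path = f'tmp/model_{uid}.pkl'
    with open(path, 'wb') as f:
        pickle.dump(model, f)

    # Drop the reference and collect, so the underlying Booster handle can be
    # freed (and, hopefully, its GPU memory released) before reloading.
    del model
    gc.collect()

    with open(path, 'rb') as f:
        model = pickle.load(f)
    os.remove(path)
    return model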

I am on xgboost 1.3.0 and the models are very small. I am running a hyperparameter optimization with Optuna, with a 1000x bootstrapping CV inside each trial. After 50 - 120 Optuna trials, it throws the error:

xgboost.core.XGBoostError: [16:11:48] ../src/tree/updater_gpu_hist.cu:731: Exception in gpu_hist: NCCL failure :unhandled cuda error ../src/common/device_helpers.cu(71)

and

terminate called after throwing an instance of 'thrust::system::system_error'
  what():  parallel_for failed: out of memory

Looking at nvidia-smi, the process only ever uses a constant ~210 MB (RTX TITAN).
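
In case it helps with reproducing this, per-trial GPU memory can be logged with something like the sketch below (log_gpu_memory is an illustrative name; it just shells out to nvidia-smi once per call):

import subprocess


def log_gpu_memory(tag=''):
    """Print the GPU memory currently in use, as reported by nvidia-smi."""
    used = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.used',
         '--format=csv,noheader,nounits'],
        encoding='utf-8',
    ).strip()
    print(f'{tag} GPU memory used: {used} MiB')

Calling it at the start of every trial, e.g. log_gpu_memory(f'trial {trial.number}'), would show whether usage creeps up between trials or really stays flat at ~210 MB.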

My parameter space looks like this:

params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'tree_method': 'gpu_hist',
    'random_state': self.random_state,
    'predictor': 'cpu_predictor',
    'n_estimators': 100,
    'reg_alpha': 0,
    'reg_lambda': 1,
    'min_child_weight': 1,
    'max_depth': trial.suggest_int('max_depth', 2, 6),
    'gamma': trial.suggest_discrete_uniform('gamma', 0, 10, 0.1),
    'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.5),
    'subsample': trial.suggest_discrete_uniform('subsample', 0.3, 1.0, 0.05),
    'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.1, 1.0, 0.1)
}

I thought this was related to https://github.com/dmlc/xgboost/issues/4668, but I am not sure about that anymore.

BTW, everything works fine when I run the same code on the CPU, and other GPU libraries like RAPIDS cuML work fine as well.
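
For reference, switching the same objective between CPU and GPU only needs the tree method changed, along the lines of this sketch (use_gpu is an illustrative flag; 'hist' is one possible CPU counterpart):

use_gpu = True  # flip to False to run the otherwise identical objective on the CPU

params['tree_method'] = 'gpu_hist' if use_gpu else 'hist'
params['predictor'] = 'cpu_predictor'  # prediction stays on the CPU either way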

About this issue

  • State: open
  • Created 4 years ago
  • Comments: 15 (6 by maintainers)

Most upvoted comments

This is my code, which stops after 28 rounds with the errors stated above.

import numpy as np
import pandas as pd
from sklearn.utils import resample
from xgboost import XGBClassifier
import optuna
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

X = pd.DataFrame(np.random.randint(0,200, size=(100, 4000)))
y = pd.Series(np.random.randint(0,2, size=(100)))

class StratifiedBootstrapping():
    """
    """
    def __init__(self, n_iter, n_size, random_state=101):
        self.n_iter = n_iter
        self.n_size = n_size
        self.random_state = random_state
    
    def get_splits(self):
        return self.n_iter

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_iter
    
    def split(self, X, y, group=None):
        X = X.reset_index(drop=True)    
        for i in range(self.n_iter):
            train = resample(
                X.index, 
                n_samples=self.n_size, 
                stratify=list(y),
                random_state=self.random_state+i
            )
            test = np.array([x for x in X.index if x not in train])

            yield train, test

def get_auc(model, X, y, cv, callback=None, tqdm_disable=False):
    """
    """
    aucs = []
    for i, (train, test) in enumerate(cv.split(X, y)):
        fitted_model = model.fit(
            X.iloc[train, :],
            y.iloc[train],
            eval_set = [
                (X.iloc[train, :], y.iloc[train]),
                (X.iloc[test, :], y.iloc[test])
                ],
            eval_metric = ['logloss'],
            callbacks = callback,
            early_stopping_rounds=10,
            verbose=False,
        )
        y_test = np.array(y.iloc[test])
        X_test = X.iloc[test, :]
        y_pred_proba = fitted_model.predict_proba(X_test)
        auc = roc_auc_score(
            y_true=y_test, 
            y_score=y_pred_proba[:, 1],
            )
        aucs.append(auc)

    auc_mean = np.mean(aucs)
    auc_std = np.std(aucs)

    return auc_mean, auc_std

def objective(trial):
    """
    """
    model = XGBClassifier()
    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'tree_method': 'gpu_hist',
        'random_state': 101,
        'n_estimators' : 500,
        'scale_pos_weight' : 1,
        'min_child_weight': trial.suggest_discrete_uniform('min_child_weight', 0, 10, 0.1),
        'reg_alpha': trial.suggest_discrete_uniform('reg_alpha', 0, 1, 0.05),
        'reg_lambda': trial.suggest_discrete_uniform('reg_lambda', 0, 1, 0.05),
        'max_depth': trial.suggest_int('max_depth', 2, 6),
        'gamma': trial.suggest_discrete_uniform('gamma', 0, 10, 0.1),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.005, 0.5),
        'subsample': trial.suggest_discrete_uniform('subsample', 0.3, 1.0, 0.05),
        'colsample_bytree': trial.suggest_discrete_uniform('colsample_bytree', 0.1, 1.0, 0.05),
        'colsample_bylevel': trial.suggest_discrete_uniform('colsample_bylevel', 0.1, 1.0, 0.05),
        'colsample_bynode': trial.suggest_discrete_uniform('colsample_bynode', 0.1, 1.0, 0.05),
    }
    cv = StratifiedBootstrapping(1000, 95, 101)
    model.set_params(**params)
    callback = 'validation_1-logloss'
    pruning_callback = [optuna.integration.XGBoostPruningCallback(trial, callback)]
    auc_mean, auc_std = get_auc(model, X, y, cv, pruning_callback)
    return auc_mean  # Optuna expects a single float from a single-objective study

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000, show_progress_bar=True)
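
For reference, one mitigation I could wrap around the objective is an explicit garbage-collection pass after every trial, so Booster objects from finished trials cannot linger and hold on to device memory. This is only a sketch and my own guess at a workaround, not something the XGBoost docs prescribe:

import functools
import gc


def with_cleanup(objective_fn):
    """Wrap an Optuna objective so a garbage-collection pass runs after every trial."""
    @functools.wraps(objective_fn)
    def wrapped(trial):
        try:
            return objective_fn(trial)
        finally:
            # Give Boosters left over from the finished trial a chance to be
            # finalised and to release their device memory.
            gc.collect()
    return wrapped


# Used in place of the optimize() call above:
# study.optimize(with_cleanup(objective), n_trials=1000, show_progress_bar=True)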