xgboost: xgboost Distributed Calculates Wrong Gradients/Hessians

I’ve tried using a simple example for distributed XGBoost and I’m getting wrong gradients

I’m using the 1.4 release branch of xgboost

In short, I’m using 100 samples from the Iris dataset and trying to train a distributed XGBoost model. The code is below:

import argparse
import logging
import xgboost as xgb
import traceback
import pandas as pd
from sklearn import datasets
import socket

# Fixed two-node setup: the current host must be one of these machines;
# its index in HOSTNAMES doubles as the worker rank in train().
HOST = socket.gethostname()
HOSTNAMES = ['opuscydvux0221.optiver.us', 'opuscydvux1220.optiver.us']
assert HOST in HOSTNAMES

logger = logging.getLogger(__name__)

# Pandas display tweaks so the printed feature frames are readable.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 10000)
pd.set_option('max_colwidth', 100)
pd.set_option('precision', 5)
# NOTE(review): this overrides the 'display.max_rows' value set above —
# the effective limit is 4 rows. Presumably intentional for compact output.
pd.set_option('display.max_rows', 4)


def read_train_data(rank, num_workers, partition):
    """
    Read file based on the rank of worker.
    We use the sklearn.iris data for demonstration
    You can extend this to read distributed data source like HDFS, HIVE etc
    :param rank: the id of each worker
    :param num_workers: total number of workers in this cluster
    :param partition: which of the two data partitions to load; a negative
        value means use the full (unpartitioned) train/test split. Ignored
        (overridden by rank) when num_workers > 1.
    :return: tuple of (train, test) XGBoost DMatrix objects
    """
    iris = datasets.load_iris()
    # NOTE(review): only the feature matrix is loaded; iris.target is never
    # used. The last feature column (petal width) therefore becomes the
    # regression label in get_dmatrix below — confirm this is intended.
    iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
    # In the distributed case each worker reads the partition matching its
    # rank, regardless of the --partition flag.
    if num_workers > 1:
        partition = rank
    assert partition < 2  # only two partitions are supported

    # First 100 rows are the train pool, the remaining rows the test pool.
    separate = 100
    train_df = iris_df.iloc[:separate]
    test_df = iris_df.iloc[separate:]

    if partition < 0:
        print("Using full data")
    else:
        # Split the train pool 60/40 and the test pool 30/20 between the
        # two partitions.
        separate_train = 60
        separate_test = 30
        if partition == 0:
            train_df = train_df.iloc[:separate_train]
            test_df = test_df.iloc[:separate_test]
        else:
            train_df = train_df.iloc[separate_train:]
            test_df = test_df.iloc[separate_test:]

    def get_dmatrix(df):
        # Last column is the label; everything before it is a feature.
        x = df[df.columns[:-1]]
        y = df[df.columns[-1]]
        print(x)
        return xgb.DMatrix(data=x, label=y)

    dtrain = get_dmatrix(train_df)
    dtest = get_dmatrix(test_df)
    return dtrain, dtest

def train(args):
    """
    Run (optionally distributed) XGBoost training.

    When args.world_size > 1, the rank-0 host starts a RabitTracker and
    every worker joins it via xgb.rabit.init(); otherwise training runs
    on a single node.

    :param args: configuration for train job (world_size, n_estimators,
        partition)
    :return: the trained XGBoost Booster
    """

    world_size = args.world_size
    assert world_size <= len(HOSTNAMES)
    port = 9095

    # Rank is derived from this host's position in HOSTNAMES; single-node
    # runs are always rank 0.
    rank = 0 if world_size == 1 else HOSTNAMES.index(HOST)
    # addr = args.hostnames[0]
    # NOTE(review): tracker address is hard-coded; presumably the IP of the
    # rank-0 host — confirm before running in another environment.
    addr = '10.121.4.163'

    print("Rank=", rank, addr)

    rabit_tracker = None

    try:
        """start to build the network"""
        if world_size > 1:
            if rank == 0:
                logger.info("start the master node")

                # rabit = RabitTracker(hostIP="0.0.0.0", nslave=world_size,
                #                     port=port, port_end=port + 1)
                rabit = xgb.tracker.RabitTracker(hostIP=addr, nslave=world_size,
                                                 port=port, port_end=port + 1)
                rabit.start(world_size)
                rabit_tracker = rabit
                logger.info('###### RabitTracker Setup Finished ######')
            # Environment variables every worker needs to find the tracker.
            envs = [
                'DMLC_NUM_WORKER=%d' % world_size,
                'DMLC_TRACKER_URI=%s' % addr,
                'DMLC_TRACKER_PORT=%d' % port,
                'DMLC_TASK_ID=%d' % rank
            ]

            logger.info('##### Rabit rank setup with below envs #####')
            for i, env in enumerate(envs):
                logger.info(env)
                envs[i] = str.encode(env)  # rabit.init expects bytes

            xgb.rabit.init(envs)
            logger.info('##### Rabit rank = %d' % xgb.rabit.get_rank())
            # From here on, use the rank that rabit actually assigned.
            rank = xgb.rabit.get_rank()
            print("RR", rank)
        else:
            world_size = 1
            logging.info("Start the train in a single node")

        dtrain, dtest = read_train_data(rank=rank, num_workers=world_size, partition=args.partition)
        params = {'max_depth': 2,
                  'eta': 1,
                  'silent': 1,
                  "tree_method": "hist",
                  "objective": "reg:linear",
                  }
        logging.info("starting to train xgboost at node with rank %d", rank)
        evals_result = {}
        # NOTE(review): evals/evals_result are commented out below, so
        # evals_result stays empty and the print afterwards shows {}.
        bst = xgb.train(
            dtrain=dtrain,
            num_boost_round=int(args.n_estimators),
            params=params,
            # evals=[(dtrain, 'train'), (dtest, 'test')],
            # evals_result=evals_result
        )
        print(evals_result)

        model = bst

        logging.info("finish xgboost training at node with rank %d", rank)

    except Exception as e:
        logger.error("something wrong happen: %s", traceback.format_exc())
        raise e
    finally:
        # Tear down rabit on every worker, and join the tracker thread on
        # rank 0, even when training failed.
        logger.info("xgboost training job finished!")
        if world_size > 1:
            logging.info('finalizing rabit')
            xgb.rabit.finalize()
        if rabit_tracker:
            rabit_tracker.join()
            logging.info('rabit_tracker join')

    logging.info("Out of here")
    return model


def main(args):
    """Run the training job and print each tree of the resulting model
    together with its split statistics."""
    logging.info("starting the train job")
    model = train(args)

    if model is None:
        return
    for tree_text in model.get_dump(with_stats=True):
        print(tree_text)


if __name__ == '__main__':
    # Command-line interface for the training script.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--n_estimators',
        type=int,
        default=1,
        help='Number of trees in the model',
    )
    parser.add_argument(
        '--world_size',
        type=int,
        default=1,
        help='World Size',
    )
    parser.add_argument(
        '--partition',
        type=int,
        default=-1,
        help='Partition',
    )

    # Root logger: message-only format at INFO level.
    logging.basicConfig(format='%(message)s')
    logging.getLogger().setLevel(logging.INFO)
    main(parser.parse_args())

When I run the non-distributed version with python simple_distributed_xgb.py --world_size 1, I get this tree:

0:[petal length (cm)<3] yes=1,no=2,missing=2,gain=28.5088348,cover=100
        1:[petal length (cm)<1.70000005] yes=3,no=4,missing=4,gain=0.0223081112,cover=50
                3:leaf=-0.25999999,cover=44
                4:leaf=-0.142857149,cover=6
        2:[petal length (cm)<4.19999981] yes=5,no=6,missing=6,gain=0.343410492,cover=50
                5:leaf=0.620000005,cover=19
                6:leaf=0.903124988,cover=31

This seems correct, since I’m using 100 examples. However, when I then run python simple_distributed_xgb.py --world_size 2 on the two machines that I use, I get this tree:

0:[petal length (cm)<3] yes=1,no=2,missing=2,gain=19.2424183,cover=200
        1:[petal length (cm)<1.70000005] yes=3,no=4,missing=4,gain=0.0223081112,cover=50
                3:leaf=-0.25999999,cover=44
                4:leaf=-0.142857149,cover=6
        2:[petal length (cm)<3.5999999] yes=5,no=6,missing=5,gain=9.38419342,cover=150
                5:leaf=0.294339627,cover=105
                6:leaf=0.841304362,cover=45

Notice cover=200. The gain is also initially wrong (and not halved, so the gradients are wrong too — or am I missing some regularization?), which causes wrong leaf values on the right side of the tree.

I’d appreciate your help figuring out what I am doing wrong.

About this issue

  • Original URL
  • State: closed
  • Created 3 years ago
  • Comments: 18 (12 by maintainers)

Most upvoted comments

Thanks for raising the issue, I will look into it and add some tests tomorrow.