xgboost: Distributed XGBoost Calculates Wrong Gradients/Hessians
I’ve tried a simple example for distributed XGBoost and I’m getting wrong gradients.
I’m using the 1.4 release branch of xgboost.
In short, I take 100 samples from the Iris dataset and try to train a distributed XGBoost model. The code is below:
import argparse
import logging
import xgboost as xgb
import traceback
import pandas as pd
from sklearn import datasets
import socket
HOST = socket.gethostname()
HOSTNAMES = ['opuscydvux0221.optiver.us', 'opuscydvux1220.optiver.us']
assert HOST in HOSTNAMES
logger = logging.getLogger(__name__)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 10000)
pd.set_option('max_colwidth', 100)
pd.set_option('precision', 5)
pd.set_option('display.max_rows', 4)
def read_train_data(rank, num_workers, partition):
    """
    Read file based on the rank of worker.
    We use the sklearn.iris data for demonstration.
    You can extend this to read distributed data sources like HDFS, HIVE etc.
    :param rank: the id of each worker
    :param num_workers: total number of workers in this cluster
    :param partition: which partition of the data to load (-1 means the full training slice)
    :return: XGBoost DMatrix pair (dtrain, dtest)
    """
    iris = datasets.load_iris()
    iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
    if num_workers > 1:
        partition = rank
    assert partition < 2
    separate = 100
    train_df = iris_df.iloc[:separate]
    test_df = iris_df.iloc[separate:]
    if partition < 0:
        print("Using full data")
    else:
        separate_train = 60
        separate_test = 30
        if partition == 0:
            train_df = train_df.iloc[:separate_train]
            test_df = test_df.iloc[:separate_test]
        else:
            train_df = train_df.iloc[separate_train:]
            test_df = test_df.iloc[separate_test:]

    def get_dmatrix(df):
        x = df[df.columns[:-1]]
        y = df[df.columns[-1]]
        print(x)
        return xgb.DMatrix(data=x, label=y)

    dtrain = get_dmatrix(train_df)
    dtest = get_dmatrix(test_df)
    return dtrain, dtest
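# Note on the sharding above: with world_size=2, rank 0 trains on rows 0..59 and
# rank 1 on rows 60..99 of the 100-row training slice, so the two shards are
# disjoint and together cover every training row exactly once.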
def train(args):
    """
    :param args: configuration for train job
    :return: XGBoost model
    """
    world_size = args.world_size
    assert world_size <= len(HOSTNAMES)
    port = 9095
    rank = 0 if world_size == 1 else HOSTNAMES.index(HOST)
    # addr = args.hostnames[0]
    addr = '10.121.4.163'
    print("Rank=", rank, addr)
    rabit_tracker = None
    try:
        # start to build the network
        if world_size > 1:
            if rank == 0:
                logger.info("start the master node")
                # rabit = RabitTracker(hostIP="0.0.0.0", nslave=world_size,
                #                      port=port, port_end=port + 1)
                rabit = xgb.tracker.RabitTracker(hostIP=addr, nslave=world_size,
                                                 port=port, port_end=port + 1)
                rabit.start(world_size)
                rabit_tracker = rabit
                logger.info('###### RabitTracker Setup Finished ######')
            envs = [
                'DMLC_NUM_WORKER=%d' % world_size,
                'DMLC_TRACKER_URI=%s' % addr,
                'DMLC_TRACKER_PORT=%d' % port,
                'DMLC_TASK_ID=%d' % rank
            ]
            logger.info('##### Rabit rank setup with below envs #####')
            for i, env in enumerate(envs):
                logger.info(env)
                envs[i] = str.encode(env)
            xgb.rabit.init(envs)
            logger.info('##### Rabit rank = %d' % xgb.rabit.get_rank())
            rank = xgb.rabit.get_rank()
            print("RR", rank)
        else:
            world_size = 1
            logging.info("Start the train in a single node")
        dtrain, dtest = read_train_data(rank=rank, num_workers=world_size, partition=args.partition)
        params = {'max_depth': 2,
                  'eta': 1,
                  'silent': 1,
                  "tree_method": "hist",
                  "objective": "reg:linear",
                  }
        logging.info("starting to train xgboost at node with rank %d", rank)
        evals_result = {}
        bst = xgb.train(
            dtrain=dtrain,
            num_boost_round=int(args.n_estimators),
            params=params,
            # evals=[(dtrain, 'train'), (dtest, 'test')],
            # evals_result=evals_result
        )
        print(evals_result)
        model = bst
        logging.info("finish xgboost training at node with rank %d", rank)
    except Exception as e:
        logger.error("something wrong happen: %s", traceback.format_exc())
        raise e
    finally:
        logger.info("xgboost training job finished!")
        if world_size > 1:
            logging.info('finalizing rabit')
            xgb.rabit.finalize()
            if rabit_tracker:
                rabit_tracker.join()
                logging.info('rabit_tracker join')
        logging.info("Out of here")
    return model
def main(args):
    logging.info("starting the train job")
    model = train(args)
    if model is not None:
        dump = model.get_dump(with_stats=True)
        for i in dump:
            print(i)
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--n_estimators',
        help='Number of trees in the model',
        type=int,
        default=1
    )
    parser.add_argument(
        '--world_size',
        help='World Size',
        default=1,
        type=int
    )
    parser.add_argument(
        '--partition',
        help='Partition',
        default=-1,
        type=int,
    )
    logging.basicConfig(format='%(message)s')
    logging.getLogger().setLevel(logging.INFO)
    main_args = parser.parse_args()
    main(main_args)
When I run the non-distributed version, python simple_distributed_xgb.py --world_size 1, I get this tree:
0:[petal length (cm)<3] yes=1,no=2,missing=2,gain=28.5088348,cover=100
    1:[petal length (cm)<1.70000005] yes=3,no=4,missing=4,gain=0.0223081112,cover=50
        3:leaf=-0.25999999,cover=44
        4:leaf=-0.142857149,cover=6
    2:[petal length (cm)<4.19999981] yes=5,no=6,missing=6,gain=0.343410492,cover=50
        5:leaf=0.620000005,cover=19
        6:leaf=0.903124988,cover=31
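For context, here is a quick sketch of where these numbers come from (my own check, not part of the report): with objective reg:linear, base_score defaults to 0.5, each row's gradient is pred - y and its hessian is 1, so a node's cover is simply its row count, and with eta=1 and the default lambda=1 the leaf weight is (sum(y) - 0.5*n) / (n + 1). In particular, the root cover should just equal the number of training rows.
import numpy as np

def expected_leaf_and_cover(y, base_score=0.5, reg_lambda=1.0):
    # Squared-error gradients/hessians at the constant base_score prediction:
    # g_i = base_score - y_i, h_i = 1, so cover = n and
    # leaf = -sum(g) / (sum(h) + lambda) = (sum(y) - 0.5 * n) / (n + 1).
    y = np.asarray(y, dtype=float)
    grad = base_score - y
    hess = np.ones_like(y)
    return -grad.sum() / (hess.sum() + reg_lambda), hess.sum()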
This seems correct, since I’m training on 100 examples. However, when I then run python simple_distributed_xgb.py --world_size 2 on the two machines, I get this tree:
0:[petal length (cm)<3] yes=1,no=2,missing=2,gain=19.2424183,cover=200
    1:[petal length (cm)<1.70000005] yes=3,no=4,missing=4,gain=0.0223081112,cover=50
        3:leaf=-0.25999999,cover=44
        4:leaf=-0.142857149,cover=6
    2:[petal length (cm)<3.5999999] yes=5,no=6,missing=5,gain=9.38419342,cover=150
        5:leaf=0.294339627,cover=105
        6:leaf=0.841304362,cover=45
Notice cover=200. The root gain is also wrong (and not simply halved, so the gradients must be wrong too, unless I'm missing some regularization term), which leads to wrong leaf values on the right side of the tree.
I’d appreciate your help figuring out what I’m doing wrong.
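For what it's worth, here is a minimal debugging sketch (my addition, not in the script above) that could go right after xgb.rabit.init(...) and the DMatrix construction, to confirm each rank really holds its own 60/40-row shard; if the shards are correct, the distributed root cover should match the single-node value of 100 rather than 200.
# Debugging sketch (my addition): log each rank's shard size after building
# dtrain, to rule out a bad rank/partition mapping before suspecting the
# histogram/gradient aggregation itself.
print("rank %d of %d holds %d training rows" % (
    xgb.rabit.get_rank(), xgb.rabit.get_world_size(), dtrain.num_row()))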
About this issue
- State: closed
- Created 3 years ago
- Comments: 18 (12 by maintainers)
Thanks for raising the issue, I will look into it and add some tests tomorrow.