training-operator: training-operator set scheduler error

training-operator v1.3.0

if spec.Template.Spec.SchedulerName is null, IsGangSchedulerSet will return false, podTemplate.Spec.SchedulerName will not set gangSchedulerName

// pkg/controller.v1/tensorflow/tfjob_controller.go
	if r.Config.EnableGangScheduling {
		if !util.IsGangSchedulerSet(replicas, gangSchedulerName) {
			errMsg := "Another scheduler is specified when gang-scheduling is enabled and it will not be overwritten"
			logger.Warning(errMsg)
			r.Recorder.Event(tfjob, v1.EventTypeWarning, podTemplateSchedulerNameReason, errMsg)
		} else {
			podTemplate.Spec.SchedulerName = gangSchedulerName
		}
         ...
// pkg/common/util/scheduler.go
func IsGangSchedulerSet(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec, schedulerName string) bool {
	if len(schedulerName) == 0 {
		schedulerName = DefaultGangSchedulerName
	}

	for _, spec := range replicas {
		logrus.Info(fmt.Sprintf("Spec %v, SchedulerName %v",spec,spec.Template.Spec.SchedulerName))
		if spec.Template.Spec.SchedulerName != "" && spec.Template.Spec.SchedulerName == schedulerName {
			return true
		}
	}
	return false
}

the same code in tf-operator v1.2.1

// pkg/controller.v1/tensorflow/pod.go
	if isNonGangSchedulerSet(replicas) {
			errMsg := "Another scheduler is specified when gang-scheduling is enabled and it will not be overwritten"
			logger.Warning(errMsg)
			tc.Recorder.Event(tfjob, v1.EventTypeWarning, podTemplateSchedulerNameReason, errMsg)
		} else {
			podTemplate.Spec.SchedulerName = gangSchedulerName
		}
func isNonGangSchedulerSet(replicas map[commonv1.ReplicaType]*commonv1.ReplicaSpec) bool {
	for _, spec := range replicas {
		if spec.Template.Spec.SchedulerName != "" && spec.Template.Spec.SchedulerName != gangSchedulerName {
			return true
		}
	}
	return false
}

About this issue

  • Original URL
  • State: closed
  • Created 3 years ago
  • Comments: 15 (12 by maintainers)

Most upvoted comments

@berlinsaint The patch is merged, you can use the lastest code. The bug should be fixed.

Thanks. In fact , i have compiled them myself this afternoon…

If there’s empty scheduler name, we set to default

I think it should be: If there is empty scheduler name or null