tesseract: Many bugs in training the legacy engine

I doubt anybody have successfully trained custom data with tesseract 5.2.0 and 5.1.0, the latest I can succeed is 5.0.0-alpha-20201224. Below are some BUGs when I’m running tesseract 5.2.0 for custom data training. I can say there are TOO MANY BUGS, thus I was not able to finish the whole training due to limited time at this moment, below are just a few of the found BUGs for reference.

// Deletes all samples with zero features marked by KillSample.
void TrainingSampleSet::DeleteDeadSamples() {
  using namespace std::placeholders; // for _1
  auto old_it = samples_.begin();
  for (; old_it < samples_.end(); ++old_it) {
    if (*old_it == nullptr || (*old_it)->class_id() < 0) {
      break;
    }
  }
  auto new_it = old_it;
  for (; old_it < samples_.end(); ++old_it) {
    if (*old_it == nullptr || (*old_it)->class_id() < 0) {
      delete *old_it;
    } else {
      *new_it = *old_it;
      ++new_it;
    }
  }
  //samples_.resize(new_it - samples_.begin() + 1);      //<------------crash the program when samples_.size() is 0
  samples_.resize(new_it - samples_.begin());
  num_raw_samples_ = samples_.size();
  // Samples must be re-organized now we have deleted a few.
}



INT_TEMPLATES_STRUCT *Classify::CreateIntTemplates(CLASSES FloatProtos,
                                           const UNICHARSET &target_unicharset) {
  CLASS_TYPE FClass;
  INT_CLASS_STRUCT *IClass;
  int ProtoId;
  int ConfigId;

  auto IntTemplates = new INT_TEMPLATES_STRUCT;

  for (unsigned ClassId = 0; ClassId < target_unicharset.size(); ClassId++) {
    FClass = &(FloatProtos[ClassId]);
    if (FClass->NumProtos == 0 && FClass->NumConfigs == 0 &&
        strcmp(target_unicharset.id_to_unichar(ClassId), " ") != 0) {
      tprintf("Warning: no protos/configs for %s in CreateIntTemplates()\n",
              target_unicharset.id_to_unichar(ClassId));
    }
    assert(UnusedClassIdIn(IntTemplates, ClassId));
    IClass = new INT_CLASS_STRUCT(FClass->NumProtos, FClass->NumConfigs);
    //FontSet fs{FClass->font_set.size()};             //<---------------------- it will force to push an element in, not size of the vector
    int fsize = FClass->font_set.size();
    FontSet fs(fsize);
	
	
	
	
/**
 * This routine converts from the old floating point format
 * to the new integer format.
 * @param FloatProtos prototypes in old floating pt format
 * @param target_unicharset the UNICHARSET to use
 * @return New set of training templates in integer format.
 * @note Globals: none
 */
INT_TEMPLATES_STRUCT *Classify::CreateIntTemplates(CLASSES FloatProtos,
                                           const UNICHARSET &target_unicharset) {
  CLASS_TYPE FClass;
  INT_CLASS_STRUCT *IClass;
  int ProtoId;
  int ConfigId;

  auto IntTemplates = new INT_TEMPLATES_STRUCT;

  for (unsigned ClassId = 0; ClassId < target_unicharset.size(); ClassId++) {
    FClass = &(FloatProtos[ClassId]);
    if (FClass->NumProtos == 0 && FClass->NumConfigs == 0 &&
        strcmp(target_unicharset.id_to_unichar(ClassId), " ") != 0) {
      tprintf("Warning: no protos/configs for %s in CreateIntTemplates()\n",
              target_unicharset.id_to_unichar(ClassId));
    }
    assert(UnusedClassIdIn(IntTemplates, ClassId));
    IClass = new INT_CLASS_STRUCT(FClass->NumProtos, FClass->NumConfigs);
    //FontSet fs{FClass->font_set.size()};
    int fsize = FClass->font_set.size();
    FontSet fs(fsize);
    for (unsigned i = 0; i < fs.size(); ++i) {
      fs[i] = FClass->font_set.at(i);
    }
    IClass->font_set_id = this->fontset_table_.push_back(fs);  // <------------------------------ref. to below push_back function
    AddIntClass(IntTemplates, ClassId, IClass);                         

    for (ProtoId = 0; ProtoId < FClass->NumProtos; ProtoId++) {
      AddIntProto(IClass);
      ConvertProto(ProtoIn(FClass, ProtoId), ProtoId, IClass);
      AddProtoToProtoPruner(ProtoIn(FClass, ProtoId), ProtoId, IClass,
                            classify_learning_debug_level >= 2);
      AddProtoToClassPruner(ProtoIn(FClass, ProtoId), ClassId, IntTemplates);
    }

    for (ConfigId = 0; ConfigId < FClass->NumConfigs; ConfigId++) {
      AddIntConfig(IClass);
      ConvertConfig(FClass->Configurations[ConfigId], ConfigId, IClass);
    }
  }
  return (IntTemplates);
} /* CreateIntTemplates */

//ref. unicity_table.h
  /// Add an element in the table
  int push_back(T object)  {
    auto idx = get_index(object);
    if (idx == -1) {
      //table_.push_back(object);  //<----------- it will crash the program since idx will be 1 and when size() is 1; actually index should be 0 for size of 1;
      //idx = size();
      idx = table_.push_back(object);
    }
    return idx;
  }


bool write_set(FILE *f, const FontSet &fs) {
  int size = fs.size();
  //return tesseract::Serialize(f, &size) && tesseract::Serialize(f,  &fs[0], size); //<----------------this will crash the program when fs.size() is 0
  return tesseract::Serialize(f, &size) && tesseract::Serialize(f, (size?&fs[0]:0), size);
}



/*---------------------------------------------------------------------------*/
// TODO(rays) This is now used only by cntraining. Convert cntraining to use
// the new method or get rid of it entirely.
/**
 * This routine reads training samples from a file and
 * places them into a data structure which organizes the
 * samples by FontName and CharName.  It then returns this
 * data structure.
 * @param file open text file to read samples from
 * @param feature_definitions
 * @param feature_name
 * @param max_samples
 * @param unicharset
 * @param training_samples
 */
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name,
                         int max_samples, UNICHARSET *unicharset, FILE *file,
                         LIST *training_samples) {
  char buffer[2048];
  char unichar[UNICHAR_LEN + 1];
  LABELEDLIST char_sample;
  FEATURE_SET feature_samples;
  uint32_t feature_type = ShortNameToFeatureType(feature_definitions, feature_name);

  // Zero out the font_sample_count for all the classes.
  LIST it = *training_samples;
  iterate(it) {
    char_sample = reinterpret_cast<LABELEDLIST>(it->first_node());
    char_sample->font_sample_count = 0;
  }

  while (fgets(buffer, 2048, file) != nullptr) {
    if (buffer[0] == '\n') {
      continue;
    }

    sscanf(buffer, "%*s %s", unichar);
    if (unicharset != nullptr && !unicharset->contains_unichar(unichar)) {
      unicharset->unichar_insert(unichar);
      if (unicharset->size() > MAX_NUM_CLASSES) {
        tprintf(
            "Error: Size of unicharset in training is "
            "greater than MAX_NUM_CLASSES\n");
        exit(1);
      }
    }
    char_sample = FindList(*training_samples, unichar);
    if (char_sample == nullptr) {
      char_sample = new LABELEDLISTNODE(unichar);
      *training_samples = push(*training_samples, char_sample);
    }
    auto char_desc = ReadCharDescription(feature_definitions, file);
    feature_samples = char_desc->FeatureSets[feature_type];
    if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
      char_sample->List = push(char_sample->List, feature_samples);
      char_sample->SampleCount++;
      char_sample->font_sample_count++;
    } else {
      delete feature_samples;
    }
    for (size_t i = 0; i < char_desc->NumFeatureSets; i++) {
      if (feature_type != i) {
        delete char_desc->FeatureSets[i];
        char_desc->FeatureSets[i] = nullptr;  //<--------------newly added, otherwise crash the program on "delete char_desc;" when destruction is forced by char_desc;
      }
    }
    delete char_desc;
  }
} // ReadTrainingSamples

I changed the above code and can get “shapeclustering.exe” and “mftraining.exe” to run smoothly, all training materail such as “inttemp” and “pffmtable” are well generated. Currently the cntraining.exe will crash, but I don’t have any more time to test.