pykeen: IndexError when training model with TextRepresentation

Describe the bug

Trying to train a model with TextRepresentation; while other models work as usual, there seems to be a strange issue with the tokenizer.

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In[17], line 1
----> 1 result = pipeline(
      2     model=ERModel,
      3     training=training,
      4     testing=testing,
      5     validation=testing,
      6     stopper='early',
      7     model_kwargs=dict(
     8         interaction="ermlpe",
     9         interaction_kwargs=dict(
     10             embedding_dim=entity_representations.shape[0],
     11         ),
     12         entity_representations=entity_representations,
     13         relation_representations_kwargs=dict(
     14             shape=entity_representations.shape,
     15         ),
     16     ),
     17     training_kwargs=dict(
     18         batch_size=64,
     19         num_epochs=1,
     20     ),
     21     evaluation_kwargs=dict(
     22         batch_size=11
     23     ),
     24     stopper_kwargs=dict(frequency=3, patience=3, relative_delta=0.0001),
     25     random_seed=42,
     26     negative_sampler_kwargs=dict(
     27         filtered=True,
     28         filterer='python-set',
     29         num_negs_per_pos = 3, 
     30         corruption_scheme=(0)
     31     ),
     32 )

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/pipeline/api.py:1368, in pipeline(dataset, dataset_kwargs, training, testing, validation, evaluation_entity_whitelist, evaluation_relation_whitelist, model, model_kwargs, interaction, interaction_kwargs, dimensions, loss, loss_kwargs, regularizer, regularizer_kwargs, optimizer, optimizer_kwargs, clear_optimizer, lr_scheduler, lr_scheduler_kwargs, training_loop, training_loop_kwargs, negative_sampler, negative_sampler_kwargs, epochs, training_kwargs, stopper, stopper_kwargs, evaluator, evaluator_kwargs, evaluation_kwargs, result_tracker, result_tracker_kwargs, metadata, device, random_seed, use_testing_data, evaluation_fallback, filter_validation_when_testing, use_tqdm)
   1359 _result_tracker.log_params(
   1360     params=dict(
   1361         evaluation_kwargs={
   (...)
   1365     )
   1366 )
   1367 evaluate_start_time = time.time()
-> 1368 metric_results: MetricResults = _safe_evaluate(
   1369     model=model_instance,
   1370     mapped_triples=mapped_triples,
   1371     evaluator=evaluator_instance,
   1372     evaluation_kwargs=evaluation_kwargs,
   1373     evaluation_fallback=evaluation_fallback,
   1374 )
   1375 evaluate_end_time = time.time() - evaluate_start_time
   1376 _result_tracker.log_metrics(metrics=dict(final_evaluation=evaluate_end_time), step=step, prefix="times")

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/pipeline/api.py:1424, in _safe_evaluate(model, mapped_triples, evaluator, evaluation_kwargs, evaluation_fallback)
   1422 while True:
   1423     try:
-> 1424         metric_results: MetricResults = evaluator.evaluate(
   1425             model=model,
   1426             mapped_triples=mapped_triples,
   1427             **evaluation_kwargs,
   1428         )
   1429     except (MemoryError, RuntimeError) as e:
   1430         # If the evaluation still fail using the CPU, the error is raised
   1431         if model.device.type != "cuda" or not evaluation_fallback:

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/evaluation/evaluator.py:208, in Evaluator.evaluate(self, model, mapped_triples, batch_size, slice_size, **kwargs)
    205         # Clear the ranks from the current evaluator
    206         self.finalize()
--> 208 rv = evaluate(
    209     model=model,
    210     mapped_triples=mapped_triples,
    211     evaluator=self,
    212     batch_size=batch_size,
    213     slice_size=slice_size,
    214     **kwargs,
    215 )
    216 # Since squeeze is true, we can expect that evaluate returns a MetricResult, but we need to tell MyPy that
    217 return cast(MetricResults, rv)

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/evaluation/evaluator.py:682, in evaluate(model, mapped_triples, evaluator, only_size_probing, batch_size, slice_size, device, use_tqdm, tqdm_kwargs, restrict_entities_to, restrict_relations_to, do_time_consuming_checks, additional_filter_triples, pre_filtered_triples, targets, mode)
    680 relation_filter = None
    681 for target in targets:
--> 682     relation_filter = _evaluate_batch(
    683         batch=batch,
    684         model=model,
    685         target=target,
    686         evaluator=evaluator,
    687         slice_size=slice_size,
    688         all_pos_triples=all_pos_triples,
    689         relation_filter=relation_filter,
    690         restrict_entities_to=restrict_entities_to,
    691         mode=mode,
    692     )
    694 # If we only probe sizes we do not need more than one batch
    695 if only_size_probing and evaluated_once:

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/evaluation/evaluator.py:755, in _evaluate_batch(batch, model, target, evaluator, slice_size, all_pos_triples, relation_filter, restrict_entities_to, mode)
    715 def _evaluate_batch(
    716     batch: MappedTriples,
    717     model: Model,
   (...)
    725     mode: Optional[InductiveMode],
    726 ) -> torch.BoolTensor:
    727     """
    728     Evaluate ranking for batch.
    729 
   (...)
    753         The relation filter, which can be re-used for the same batch.
    754     """
--> 755     scores = model.predict(hrt_batch=batch, target=target, slice_size=slice_size, mode=mode)
    757     if evaluator.filtered or evaluator.requires_positive_mask:
    758         column = TARGET_TO_INDEX[target]

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/models/base.py:465, in Model.predict(self, hrt_batch, target, full_batch, ids, **kwargs)
    463     if full_batch:
    464         hrt_batch = hrt_batch[:, 0:2]
--> 465     return self.predict_t(hrt_batch, **kwargs, tails=ids)
    467 if target == LABEL_RELATION:
    468     if full_batch:

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/models/base.py:403, in Model.predict_t(self, hr_batch, **kwargs)
    401 self.eval()  # Enforce evaluation mode
    402 hr_batch = self._prepare_batch(batch=hr_batch, index_relation=1)
--> 403 scores = self.score_t(hr_batch, **kwargs)
    404 if self.predict_with_sigmoid:
    405     scores = torch.sigmoid(scores)

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/models/nbase.py:466, in ERModel.score_t(self, hr_batch, slice_size, mode, tails)
    464 # add broadcast dimension
    465 hr_batch = hr_batch.unsqueeze(dim=1)
--> 466 h, r, t = self._get_representations(h=hr_batch[..., 0], r=hr_batch[..., 1], t=tails, mode=mode)
    467 # unsqueeze if necessary
    468 if tails is None or tails.ndimension() == 1:

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/models/nbase.py:570, in ERModel._get_representations(self, h, r, t, mode)
    568 head_representations = [head_representations[i] for i in self.interaction.head_indices()]
    569 tail_representations = [tail_representations[i] for i in self.interaction.tail_indices()]
--> 570 hr, rr, tr = [
    571     [representation(indices=indices) for representation in representations]
    572     for indices, representations in (
    573         (h, head_representations),
    574         (r, self.relation_representations),
    575         (t, tail_representations),
    576     )
    577 ]
    578 # normalization
    579 return cast(
    580     Tuple[HeadRepresentation, RelationRepresentation, TailRepresentation],
    581     tuple(x[0] if len(x) == 1 else x for x in (hr, rr, tr)),
    582 )

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/models/nbase.py:571, in <listcomp>(.0)
    568 head_representations = [head_representations[i] for i in self.interaction.head_indices()]
    569 tail_representations = [tail_representations[i] for i in self.interaction.tail_indices()]
    570 hr, rr, tr = [
--> 571     [representation(indices=indices) for representation in representations]
    572     for indices, representations in (
    573         (h, head_representations),
    574         (r, self.relation_representations),
    575         (t, tail_representations),
    576     )
    577 ]
    578 # normalization
    579 return cast(
    580     Tuple[HeadRepresentation, RelationRepresentation, TailRepresentation],
    581     tuple(x[0] if len(x) == 1 else x for x in (hr, rr, tr)),
    582 )

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/models/nbase.py:571, in <listcomp>(.0)
    568 head_representations = [head_representations[i] for i in self.interaction.head_indices()]
    569 tail_representations = [tail_representations[i] for i in self.interaction.tail_indices()]
    570 hr, rr, tr = [
--> 571     [representation(indices=indices) for representation in representations]
    572     for indices, representations in (
    573         (h, head_representations),
    574         (r, self.relation_representations),
    575         (t, tail_representations),
    576     )
    577 ]
    578 # normalization
    579 return cast(
    580     Tuple[HeadRepresentation, RelationRepresentation, TailRepresentation],
    581     tuple(x[0] if len(x) == 1 else x for x in (hr, rr, tr)),
    582 )

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
   1190 # If we don't have any hooks, we want to skip the rest of the logic in
   1191 # this function, and just call forward.
   1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1193         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194     return forward_call(*input, **kwargs)
   1195 # Do not call functions when jit is used
   1196 full_backward_hooks, non_full_backward_hooks = [], []

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/nn/representation.py:179, in Representation.forward(self, indices)
    177 if indices is not None and self.unique:
    178     indices, inverse = indices.unique(return_inverse=True)
--> 179 x = self._plain_forward(indices=indices)
    180 # normalize *before* repeating
    181 if self.normalizer is not None:

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/nn/representation.py:1046, in TextRepresentation._plain_forward(self, indices)
   1044 else:
   1045     labels = [self.labels[i] for i in indices.tolist()]
-> 1046 return self.encoder(labels=labels)

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
   1190 # If we don't have any hooks, we want to skip the rest of the logic in
   1191 # this function, and just call forward.
   1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1193         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194     return forward_call(*input, **kwargs)
   1195 # Do not call functions when jit is used
   1196 full_backward_hooks, non_full_backward_hooks = [], []

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/nn/text.py:77, in TextEncoder.forward(self, labels)
     75 labels = upgrade_to_sequence(labels)
     76 labels = list(map(str, labels))
---> 77 return self.forward_normalized(texts=labels)

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/nn/text.py:223, in TransformerTextEncoder.forward_normalized(self, texts)
    221 def forward_normalized(self, texts: Sequence[str]) -> torch.FloatTensor:  # noqa: D102
    222     return self.model(
--> 223         **self.tokenizer(
    224             texts,
    225             return_tensors="pt",
    226             padding=True,
    227             truncation=True,
    228             max_length=self.max_length,
    229         ).to(get_preferred_device(self.model))
    230     ).pooler_output

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2520, in PreTrainedTokenizerBase.__call__(self, text, text_pair, text_target, text_pair_target, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
   2518     if not self._in_target_context_manager:
   2519         self._switch_to_input_mode()
-> 2520     encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
   2521 if text_target is not None:
   2522     self._switch_to_target_mode()

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2606, in PreTrainedTokenizerBase._call_one(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
   2601         raise ValueError(
   2602             f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
   2603             f" {len(text_pair)}."
   2604         )
   2605     batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
-> 2606     return self.batch_encode_plus(
   2607         batch_text_or_text_pairs=batch_text_or_text_pairs,
   2608         add_special_tokens=add_special_tokens,
   2609         padding=padding,
   2610         truncation=truncation,
   2611         max_length=max_length,
   2612         stride=stride,
   2613         is_split_into_words=is_split_into_words,
   2614         pad_to_multiple_of=pad_to_multiple_of,
   2615         return_tensors=return_tensors,
   2616         return_token_type_ids=return_token_type_ids,
   2617         return_attention_mask=return_attention_mask,
   2618         return_overflowing_tokens=return_overflowing_tokens,
   2619         return_special_tokens_mask=return_special_tokens_mask,
   2620         return_offsets_mapping=return_offsets_mapping,
   2621         return_length=return_length,
   2622         verbose=verbose,
   2623         **kwargs,
   2624     )
   2625 else:
   2626     return self.encode_plus(
   2627         text=text,
   2628         text_pair=text_pair,
   (...)
   2644         **kwargs,
   2645     )

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2797, in PreTrainedTokenizerBase.batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
   2787 # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
   2788 padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
   2789     padding=padding,
   2790     truncation=truncation,
   (...)
   2794     **kwargs,
   2795 )
-> 2797 return self._batch_encode_plus(
   2798     batch_text_or_text_pairs=batch_text_or_text_pairs,
   2799     add_special_tokens=add_special_tokens,
   2800     padding_strategy=padding_strategy,
   2801     truncation_strategy=truncation_strategy,
   2802     max_length=max_length,
   2803     stride=stride,
   2804     is_split_into_words=is_split_into_words,
   2805     pad_to_multiple_of=pad_to_multiple_of,
   2806     return_tensors=return_tensors,
   2807     return_token_type_ids=return_token_type_ids,
   2808     return_attention_mask=return_attention_mask,
   2809     return_overflowing_tokens=return_overflowing_tokens,
   2810     return_special_tokens_mask=return_special_tokens_mask,
   2811     return_offsets_mapping=return_offsets_mapping,
   2812     return_length=return_length,
   2813     verbose=verbose,
   2814     **kwargs,
   2815 )

File /home/user/conda/envs/python_38/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py:462, in PreTrainedTokenizerFast._batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose)
    455 # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
    456 # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
    457 # (we say ~ because the number of overflow varies with the example in the batch)
    458 #
    459 # To match each overflowing sample with the original sample in the batch
    460 # we add an overflow_to_sample_mapping array (see below)
    461 sanitized_tokens = {}
--> 462 for key in tokens_and_encodings[0][0].keys():
    463     stack = [e for item, _ in tokens_and_encodings for e in item[key]]
    464     sanitized_tokens[key] = stack

IndexError: list index out of range
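
The final frame fails inside HuggingFace's fast tokenizer because tokens_and_encodings is empty, which happens exactly when the tokenizer receives an empty batch of texts. The following isolation sketch (an assumed trigger, not a confirmed diagnosis) raises the identical error:

from transformers import AutoTokenizer

# Hedged check: an empty text batch reaches the same loop in
# tokenization_utils_fast.py (_batch_encode_plus) and raises the same error,
# suggesting the text encoder may be called with an empty label list
# during evaluation.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-uncased")
tokenizer([], return_tensors="pt", padding=True, truncation=True, max_length=512)
# IndexError: list index out of range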

How to reproduce

from pykeen.nn.representation import TextRepresentation
from pykeen.models import ERModel
from pykeen.pipeline import pipeline
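
The triples factories are not included in the report. As an assumption, to make the snippet self-contained, any PyKEEN dataset with textual entity labels can stand in:

from pykeen.datasets import Nations

# Hypothetical stand-in data; the reporter's actual dataset is not given.
dataset = Nations()
training, testing = dataset.training, dataset.testing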


entity_representations = TextRepresentation.from_triples_factory(
    triples_factory=training, 
    encoder="transformer",
    encoder_kwargs=dict(pretrained_model_name_or_path="bert-base-multilingual-uncased", max_length=512),
)

result = pipeline(
    model=ERModel,
    training=training,
    testing=testing,
    validation=testing,
    stopper='early',
    model_kwargs=dict(
        interaction="ermlpe",
        interaction_kwargs=dict(
            embedding_dim=entity_representations.shape[0],
        ),
        entity_representations=entity_representations,
        relation_representations_kwargs=dict(
            shape=entity_representations.shape,
        ),
    ),
    training_kwargs=dict(
        batch_size=64,
        num_epochs=1,
    ),
    evaluation_kwargs=dict(
        batch_size=11
    ),
    stopper_kwargs=dict(frequency=3, patience=3, relative_delta=0.0001),
    random_seed=42,
    negative_sampler_kwargs=dict(
        filtered=True,
        filterer='python-set',
        num_negs_per_pos=3,
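        # note: (0) is just the int 0, not a one-element tuple; a tuple
        # such as (0,) may have been intended here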
        corruption_scheme=(0)
    ),
)
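
A hedged workaround sketch (an assumption, not a maintainer-confirmed fix): since the failure occurs when the text encoder is invoked again at evaluation time, the text embeddings can be precomputed once and frozen in a plain Embedding, so the tokenizer is never called during evaluation:

import torch
from pykeen.nn.representation import Embedding

# Encode all entity labels once; indices=None returns vectors for all entities.
with torch.no_grad():
    precomputed = entity_representations(indices=None)

# Freeze the precomputed vectors; the initializer overwrites the random weights.
entity_embedding = Embedding(
    max_id=precomputed.shape[0],
    shape=precomputed.shape[1:],
    initializer=lambda x: precomputed,
    trainable=False,
)

The frozen embedding could then be passed as entity_representations in the pipeline call above, at the cost of no longer fine-tuning the transformer.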

Environment

Key              Value
OS               posix
Platform         Linux
Release          5.4.0-122-generic
Time             Thu Jan 12 10:11:36 2023
Python           3.8.15
PyKEEN           1.9.0
PyKEEN Hash      UNHASHED
PyKEEN Branch
PyTorch          1.13.1+cu117
CUDA Available?  true
CUDA Version     11.7
cuDNN Version    8500

Additional information

No response

Issue Template Checks

  • This is not a feature request (use a different issue template if it is)
  • This is not a question (use the discussions forum instead)
  • I’ve read the text explaining why including environment information is important and understand if I omit this information that my issue will be dismissed

About this issue

  • State: closed
  • Created a year ago
  • Comments: 23 (21 by maintainers)

Most upvoted comments

I tried it; however, there are now CUDA errors, even though my evaluation batch size is 1 and I have 40 GB of GPU memory:

RuntimeError: CUDA out of memory. Tried to allocate 17.43 GiB (GPU 0; 39.59 GiB total capacity; 28.53 GiB already allocated; 9.13 GiB free; 28.57 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
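
A plausible explanation (an assumption, not confirmed in the thread): even with an evaluation batch size of 1, scoring a batch against all tails pushes every entity label through the transformer in a single forward pass, which would account for the 17+ GiB allocation. A sketch that encodes the labels chunk-by-chunk to bound peak memory, using the .encoder and .labels attributes visible in the traceback:

import torch

@torch.no_grad()
def encode_in_chunks(encoder, labels, chunk_size=128):
    """Encode entity labels chunk-by-chunk, gathering the results on the CPU."""
    parts = []
    for start in range(0, len(labels), chunk_size):
        parts.append(encoder(labels=labels[start:start + chunk_size]).cpu())
    return torch.cat(parts, dim=0)

# e.g.:
# vectors = encode_in_chunks(entity_representations.encoder,
#                            entity_representations.labels)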

I’ll re-open this issue until we receive notification that this also solved the original issue.