pykeen: IndexError when training model with TextRepresentation
Describe the bug
I am trying to train a model with TextRepresentation. While other models work as usual, there seems to be a strange issue with the tokenizer when this representation is used.
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
Cell In[17], line 1
----> 1 result = pipeline(
2 model=ERModel,
3 training=training,
4 testing=testing,
5 validation=testing,
6 stopper='early',
7 model_kwargs=dict(
8 interaction="ermlpe",
9 interaction_kwargs=dict(
10 embedding_dim=entity_representations.shape[0],
11 ),
12 entity_representations=entity_representations,
13 relation_representations_kwargs=dict(
14 shape=entity_representations.shape,
15 ),
16 ),
17 training_kwargs=dict(
18 batch_size=64,
19 num_epochs=1,
20 ),
21 evaluation_kwargs=dict(
22 batch_size=11
23 ),
24 stopper_kwargs=dict(frequency=3, patience=3, relative_delta=0.0001),
25 random_seed=42,
26 negative_sampler_kwargs=dict(
27 filtered=True,
28 filterer='python-set',
29 num_negs_per_pos = 3,
30 corruption_scheme=(0)
31 ),
32 )
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/pipeline/api.py:1368, in pipeline(dataset, dataset_kwargs, training, testing, validation, evaluation_entity_whitelist, evaluation_relation_whitelist, model, model_kwargs, interaction, interaction_kwargs, dimensions, loss, loss_kwargs, regularizer, regularizer_kwargs, optimizer, optimizer_kwargs, clear_optimizer, lr_scheduler, lr_scheduler_kwargs, training_loop, training_loop_kwargs, negative_sampler, negative_sampler_kwargs, epochs, training_kwargs, stopper, stopper_kwargs, evaluator, evaluator_kwargs, evaluation_kwargs, result_tracker, result_tracker_kwargs, metadata, device, random_seed, use_testing_data, evaluation_fallback, filter_validation_when_testing, use_tqdm)
1359 _result_tracker.log_params(
1360 params=dict(
1361 evaluation_kwargs={
(...)
1365 )
1366 )
1367 evaluate_start_time = time.time()
-> 1368 metric_results: MetricResults = _safe_evaluate(
1369 model=model_instance,
1370 mapped_triples=mapped_triples,
1371 evaluator=evaluator_instance,
1372 evaluation_kwargs=evaluation_kwargs,
1373 evaluation_fallback=evaluation_fallback,
1374 )
1375 evaluate_end_time = time.time() - evaluate_start_time
1376 _result_tracker.log_metrics(metrics=dict(final_evaluation=evaluate_end_time), step=step, prefix="times")
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/pipeline/api.py:1424, in _safe_evaluate(model, mapped_triples, evaluator, evaluation_kwargs, evaluation_fallback)
1422 while True:
1423 try:
-> 1424 metric_results: MetricResults = evaluator.evaluate(
1425 model=model,
1426 mapped_triples=mapped_triples,
1427 **evaluation_kwargs,
1428 )
1429 except (MemoryError, RuntimeError) as e:
1430 # If the evaluation still fail using the CPU, the error is raised
1431 if model.device.type != "cuda" or not evaluation_fallback:
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/evaluation/evaluator.py:208, in Evaluator.evaluate(self, model, mapped_triples, batch_size, slice_size, **kwargs)
205 # Clear the ranks from the current evaluator
206 self.finalize()
--> 208 rv = evaluate(
209 model=model,
210 mapped_triples=mapped_triples,
211 evaluator=self,
212 batch_size=batch_size,
213 slice_size=slice_size,
214 **kwargs,
215 )
216 # Since squeeze is true, we can expect that evaluate returns a MetricResult, but we need to tell MyPy that
217 return cast(MetricResults, rv)
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/evaluation/evaluator.py:682, in evaluate(model, mapped_triples, evaluator, only_size_probing, batch_size, slice_size, device, use_tqdm, tqdm_kwargs, restrict_entities_to, restrict_relations_to, do_time_consuming_checks, additional_filter_triples, pre_filtered_triples, targets, mode)
680 relation_filter = None
681 for target in targets:
--> 682 relation_filter = _evaluate_batch(
683 batch=batch,
684 model=model,
685 target=target,
686 evaluator=evaluator,
687 slice_size=slice_size,
688 all_pos_triples=all_pos_triples,
689 relation_filter=relation_filter,
690 restrict_entities_to=restrict_entities_to,
691 mode=mode,
692 )
694 # If we only probe sizes we do not need more than one batch
695 if only_size_probing and evaluated_once:
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/evaluation/evaluator.py:755, in _evaluate_batch(batch, model, target, evaluator, slice_size, all_pos_triples, relation_filter, restrict_entities_to, mode)
715 def _evaluate_batch(
716 batch: MappedTriples,
717 model: Model,
(...)
725 mode: Optional[InductiveMode],
726 ) -> torch.BoolTensor:
727 """
728 Evaluate ranking for batch.
729
(...)
753 The relation filter, which can be re-used for the same batch.
754 """
--> 755 scores = model.predict(hrt_batch=batch, target=target, slice_size=slice_size, mode=mode)
757 if evaluator.filtered or evaluator.requires_positive_mask:
758 column = TARGET_TO_INDEX[target]
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/models/base.py:465, in Model.predict(self, hrt_batch, target, full_batch, ids, **kwargs)
463 if full_batch:
464 hrt_batch = hrt_batch[:, 0:2]
--> 465 return self.predict_t(hrt_batch, **kwargs, tails=ids)
467 if target == LABEL_RELATION:
468 if full_batch:
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/models/base.py:403, in Model.predict_t(self, hr_batch, **kwargs)
401 self.eval() # Enforce evaluation mode
402 hr_batch = self._prepare_batch(batch=hr_batch, index_relation=1)
--> 403 scores = self.score_t(hr_batch, **kwargs)
404 if self.predict_with_sigmoid:
405 scores = torch.sigmoid(scores)
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/models/nbase.py:466, in ERModel.score_t(self, hr_batch, slice_size, mode, tails)
464 # add broadcast dimension
465 hr_batch = hr_batch.unsqueeze(dim=1)
--> 466 h, r, t = self._get_representations(h=hr_batch[..., 0], r=hr_batch[..., 1], t=tails, mode=mode)
467 # unsqueeze if necessary
468 if tails is None or tails.ndimension() == 1:
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/models/nbase.py:570, in ERModel._get_representations(self, h, r, t, mode)
568 head_representations = [head_representations[i] for i in self.interaction.head_indices()]
569 tail_representations = [tail_representations[i] for i in self.interaction.tail_indices()]
--> 570 hr, rr, tr = [
571 [representation(indices=indices) for representation in representations]
572 for indices, representations in (
573 (h, head_representations),
574 (r, self.relation_representations),
575 (t, tail_representations),
576 )
577 ]
578 # normalization
579 return cast(
580 Tuple[HeadRepresentation, RelationRepresentation, TailRepresentation],
581 tuple(x[0] if len(x) == 1 else x for x in (hr, rr, tr)),
582 )
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/models/nbase.py:571, in <listcomp>(.0)
568 head_representations = [head_representations[i] for i in self.interaction.head_indices()]
569 tail_representations = [tail_representations[i] for i in self.interaction.tail_indices()]
570 hr, rr, tr = [
--> 571 [representation(indices=indices) for representation in representations]
572 for indices, representations in (
573 (h, head_representations),
574 (r, self.relation_representations),
575 (t, tail_representations),
576 )
577 ]
578 # normalization
579 return cast(
580 Tuple[HeadRepresentation, RelationRepresentation, TailRepresentation],
581 tuple(x[0] if len(x) == 1 else x for x in (hr, rr, tr)),
582 )
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/models/nbase.py:571, in <listcomp>(.0)
568 head_representations = [head_representations[i] for i in self.interaction.head_indices()]
569 tail_representations = [tail_representations[i] for i in self.interaction.tail_indices()]
570 hr, rr, tr = [
--> 571 [representation(indices=indices) for representation in representations]
572 for indices, representations in (
573 (h, head_representations),
574 (r, self.relation_representations),
575 (t, tail_representations),
576 )
577 ]
578 # normalization
579 return cast(
580 Tuple[HeadRepresentation, RelationRepresentation, TailRepresentation],
581 tuple(x[0] if len(x) == 1 else x for x in (hr, rr, tr)),
582 )
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
1190 # If we don't have any hooks, we want to skip the rest of the logic in
1191 # this function, and just call forward.
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/nn/representation.py:179, in Representation.forward(self, indices)
177 if indices is not None and self.unique:
178 indices, inverse = indices.unique(return_inverse=True)
--> 179 x = self._plain_forward(indices=indices)
180 # normalize *before* repeating
181 if self.normalizer is not None:
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/nn/representation.py:1046, in TextRepresentation._plain_forward(self, indices)
1044 else:
1045 labels = [self.labels[i] for i in indices.tolist()]
-> 1046 return self.encoder(labels=labels)
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/torch/nn/modules/module.py:1194, in Module._call_impl(self, *input, **kwargs)
1190 # If we don't have any hooks, we want to skip the rest of the logic in
1191 # this function, and just call forward.
1192 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1193 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1194 return forward_call(*input, **kwargs)
1195 # Do not call functions when jit is used
1196 full_backward_hooks, non_full_backward_hooks = [], []
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/nn/text.py:77, in TextEncoder.forward(self, labels)
75 labels = upgrade_to_sequence(labels)
76 labels = list(map(str, labels))
---> 77 return self.forward_normalized(texts=labels)
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/pykeen/nn/text.py:223, in TransformerTextEncoder.forward_normalized(self, texts)
221 def forward_normalized(self, texts: Sequence[str]) -> torch.FloatTensor: # noqa: D102
222 return self.model(
--> 223 **self.tokenizer(
224 texts,
225 return_tensors="pt",
226 padding=True,
227 truncation=True,
228 max_length=self.max_length,
229 ).to(get_preferred_device(self.model))
230 ).pooler_output
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2520, in PreTrainedTokenizerBase.__call__(self, text, text_pair, text_target, text_pair_target, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2518 if not self._in_target_context_manager:
2519 self._switch_to_input_mode()
-> 2520 encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
2521 if text_target is not None:
2522 self._switch_to_target_mode()
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2606, in PreTrainedTokenizerBase._call_one(self, text, text_pair, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2601 raise ValueError(
2602 f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
2603 f" {len(text_pair)}."
2604 )
2605 batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
-> 2606 return self.batch_encode_plus(
2607 batch_text_or_text_pairs=batch_text_or_text_pairs,
2608 add_special_tokens=add_special_tokens,
2609 padding=padding,
2610 truncation=truncation,
2611 max_length=max_length,
2612 stride=stride,
2613 is_split_into_words=is_split_into_words,
2614 pad_to_multiple_of=pad_to_multiple_of,
2615 return_tensors=return_tensors,
2616 return_token_type_ids=return_token_type_ids,
2617 return_attention_mask=return_attention_mask,
2618 return_overflowing_tokens=return_overflowing_tokens,
2619 return_special_tokens_mask=return_special_tokens_mask,
2620 return_offsets_mapping=return_offsets_mapping,
2621 return_length=return_length,
2622 verbose=verbose,
2623 **kwargs,
2624 )
2625 else:
2626 return self.encode_plus(
2627 text=text,
2628 text_pair=text_pair,
(...)
2644 **kwargs,
2645 )
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/transformers/tokenization_utils_base.py:2797, in PreTrainedTokenizerBase.batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2787 # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
2788 padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
2789 padding=padding,
2790 truncation=truncation,
(...)
2794 **kwargs,
2795 )
-> 2797 return self._batch_encode_plus(
2798 batch_text_or_text_pairs=batch_text_or_text_pairs,
2799 add_special_tokens=add_special_tokens,
2800 padding_strategy=padding_strategy,
2801 truncation_strategy=truncation_strategy,
2802 max_length=max_length,
2803 stride=stride,
2804 is_split_into_words=is_split_into_words,
2805 pad_to_multiple_of=pad_to_multiple_of,
2806 return_tensors=return_tensors,
2807 return_token_type_ids=return_token_type_ids,
2808 return_attention_mask=return_attention_mask,
2809 return_overflowing_tokens=return_overflowing_tokens,
2810 return_special_tokens_mask=return_special_tokens_mask,
2811 return_offsets_mapping=return_offsets_mapping,
2812 return_length=return_length,
2813 verbose=verbose,
2814 **kwargs,
2815 )
File /home/user/conda/envs/python_38/lib/python3.8/site-packages/transformers/tokenization_utils_fast.py:462, in PreTrainedTokenizerFast._batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose)
455 # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
456 # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
457 # (we say ~ because the number of overflow varies with the example in the batch)
458 #
459 # To match each overflowing sample with the original sample in the batch
460 # we add an overflow_to_sample_mapping array (see below)
461 sanitized_tokens = {}
--> 462 for key in tokens_and_encodings[0][0].keys():
463 stack = [e for item, _ in tokens_and_encodings for e in item[key]]
464 sanitized_tokens[key] = stack
IndexError: list index out of range
How to reproduce
from pykeen.nn.representation import TextRepresentation
from pykeen.models import ERModel
entity_representations = TextRepresentation.from_triples_factory(
triples_factory=training,
encoder="transformer",
encoder_kwargs=dict(pretrained_model_name_or_path="bert-base-multilingual-uncased", max_length=512),
)
result = pipeline(
model=ERModel,
training=training,
testing=testing,
validation=testing,
stopper='early',
model_kwargs=dict(
interaction="ermlpe",
interaction_kwargs=dict(
embedding_dim=entity_representations.shape[0],
),
entity_representations=entity_representations,
relation_representations_kwargs=dict(
shape=entity_representations.shape,
),
),
training_kwargs=dict(
batch_size=64,
num_epochs=1,
),
evaluation_kwargs=dict(
batch_size=11
),
random_seed=42,
negative_sampler_kwargs=dict(
filtered=True,
filterer='python-set',
num_negs_per_pos = 3,
corruption_scheme=(0)
),
)
Environment
| Key | Value |
|---|---|
| OS | posix |
| Platform | Linux |
| Release | 5.4.0-122-generic |
| Time | Thu Jan 12 10:11:36 2023 |
| Python | 3.8.15 |
| PyKEEN | 1.9.0 |
| PyKEEN Hash | UNHASHED |
| PyKEEN Branch | |
| PyTorch | 1.13.1+cu117 |
| CUDA Available? | true |
| CUDA Version | 11.7 |
| cuDNN Version | 8500 |
Additional information
No response
Issue Template Checks
- This is not a feature request (use a different issue template if it is)
- This is not a question (use the discussions forum instead)
- I’ve read the text explaining why including environment information is important, and I understand that my issue will be dismissed if I omit this information
About this issue
- Original URL
- State: closed
- Created a year ago
- Comments: 23 (21 by maintainers)
Commits related to this issue
- 📜🔧 Fix labels not being converted to list (#1209) Fix #1197 --------- Co-authored-by: Charles Tapley Hoyt <cthoyt@gmail.com> — committed to pykeen/pykeen by mberr a year ago
- 🌅🔪 Early Slicing for Lazy Target Representations (#1321) Slice earlier to allow lazy computation of target representations; this is relevant when calculating all target representations is infeasib... — committed to pykeen/pykeen by mberr 9 months ago
I tried it; however, there are now CUDA errors, even though my evaluation batch size is 1 and I have 40 GB of GPU memory.
I’ll re-open this issue until we receive notification that this also solved the original issue.