Module text_embeddings.base
base covers the base classes and functions for the other embedding-based tokenizers.
Expand source code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2021-04-22 20:43:06
# @Author : Chenghao Mou (mouchenghao@gmail.com)
"""base covers all the base classes, functions for other embedding based tokenizers."""
import abc
from typing import List, Optional, Union, Dict
from itertools import zip_longest
import numpy as np
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy, TruncationStrategy, TensorType, BatchEncoding, EncodedInput, is_torch_available, to_py_obj, TextInput
def is_torch(x) -> bool:  # pragma: no cover
"""
Helper function to check whether the input is a torch tensor.
Parameters
----------
x : [type]
Input data
Returns
-------
bool
Boolean value indicating whether the input is a torch tensor
"""
import torch
return isinstance(x, torch.Tensor)
class EmbeddingTokenizer(PreTrainedTokenizerBase):
"""
Embedding based tokenizer. It assumes each token is mapped to a tensor instead of an index number.
This implementation borrows most of its implementation from huggingface's transformers library.
Parameters
----------
model_input_names : Optional[List[str]], optional
Required model input names, by default None
special_tokens : Optional[Dict[str, np.ndarray]], optional
Required model special tokens, by default None
max_length : Optional[int], optional
Maximum sequence length supported by the model, by default 2048
"""
def __init__(
self,
model_input_names: Optional[List[str]] = None,
special_tokens: Optional[Dict[str, np.ndarray]] = None,
max_length: Optional[int] = 2048,
):
self.model_input_names = model_input_names
self.special_tokens = special_tokens
self.max_length = max_length
@abc.abstractmethod
def text2embeddings(self, text: str) -> np.ndarray:
raise NotImplementedError('This function is not implemented')
def __call__(
self,
text: Union[TextInput, List[TextInput]],
text_pair: Optional[Union[TextInput, List[TextInput]]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_length: bool = False,
**kwargs,
) -> BatchEncoding:
"""
Tokenize the text (or text pairs) into a batch of embedding sequences.
Parameters
----------
text : Union[TextInput, List[TextInput]]
A single text or a list of text
text_pair : Optional[Union[TextInput, List[TextInput]]], optional
A single text or a list of text, by default None
add_special_tokens : bool, optional
Whether to add special tokens to the data, by default True
padding : Union[bool, str, PaddingStrategy], optional
The padding strategy, by default False
truncation : Union[bool, str, TruncationStrategy], optional
The truncation strategy, by default False
max_length : Optional[int], optional
Maximum sequence length, overriding the class variable, by default None
pad_to_multiple_of : Optional[int], optional
Padding parameters, by default None
return_tensors : Optional[Union[str, TensorType]], optional
Return tensors as 'pt', 'tf' or 'np', by default None
return_token_type_ids : Optional[bool], optional
Return token type ids, by default None
return_attention_mask : Optional[bool], optional
Return attention mask, by default None
return_overflowing_tokens : bool, optional
Return overflowing tokens, by default False
return_special_tokens_mask : bool, optional
Return special token mask, by default False
return_length : bool, optional
Return length, by default False
Returns
-------
BatchEncoding
A BatchEncoding object
"""
if self.special_tokens is None:
self.special_tokens = {
"CLS": self.text2embeddings("[CLS]"),
"SEP": self.text2embeddings("[SEP]"),
}
if add_special_tokens and text_pair:
actual_max_length = self.max_length - len(self.special_tokens["SEP"]) * 2 - len(self.special_tokens["CLS"])
else:
actual_max_length = self.max_length
batch_outputs = {}
text = text if isinstance(text, list) else [text]
text_pair = text_pair if isinstance(text_pair, list) else [text_pair]
if isinstance(padding, str):
padding = PaddingStrategy(padding)
if isinstance(truncation, str):
truncation = TruncationStrategy(truncation)
for first_text, second_text in zip_longest(text, text_pair, fillvalue=None):
first_embeddings = self.text2embeddings(first_text)
second_embeddings = self.text2embeddings(second_text)
outputs = self.prepare_for_model(
first_embeddings,
second_embeddings,
add_special_tokens=add_special_tokens,
padding=PaddingStrategy.DO_NOT_PAD, # we pad in batch afterward
truncation=truncation,
max_length=max_length or actual_max_length,
pad_to_multiple_of=None, # we pad in batch afterward
return_attention_mask=False, # we pad in batch afterward
return_token_type_ids=return_token_type_ids,
return_overflowing_tokens=return_overflowing_tokens,
return_special_tokens_mask=return_special_tokens_mask,
return_length=return_length,
return_tensors=None, # We convert the whole batch to tensors at the end
prepend_batch_axis=False,
)
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
batch_outputs = self.pad(
batch_outputs,
padding=padding,
max_length=max_length or actual_max_length,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
return batch_outputs
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
if token_ids_1 is None:
return token_ids_0
return np.concatenate(
[
self.special_tokens["CLS"],
token_ids_0,
self.special_tokens["SEP"],
token_ids_1,
self.special_tokens["SEP"],
],
axis=0
)
def prepare_for_model(
self,
ids: List[int],
pair_ids: Optional[List[int]] = None,
add_special_tokens: bool = True,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = False,
max_length: Optional[int] = None,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
return_token_type_ids: Optional[bool] = None,
return_attention_mask: Optional[bool] = None,
return_overflowing_tokens: bool = False,
return_special_tokens_mask: bool = False,
return_length: bool = False,
prepend_batch_axis: bool = False,
**kwargs
):
pair = bool(pair_ids is not None)
len_ids = len(ids)
len_pair_ids = len(pair_ids) if pair else 0
if return_token_type_ids and not add_special_tokens:
raise ValueError(
"Asking to return token_type_ids while setting add_special_tokens to False "
"results in an undefined behavior. Please set add_special_tokens to True or "
"set return_token_type_ids to None."
)
# Load from model defaults
if return_token_type_ids is None:
return_token_type_ids = "token_type_ids" in self.model_input_names
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
encoded_inputs = {}
# Compute the total size of the returned encodings
total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
# Truncation: Handle max sequence length
overflowing_tokens = []
if truncation != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
ids, pair_ids, overflowing_tokens = self.truncate_sequences(
ids,
pair_ids=pair_ids,
num_tokens_to_remove=total_len - max_length,
truncation_strategy=truncation,
stride=stride,
)
if return_overflowing_tokens:
encoded_inputs["overflowing_tokens"] = overflowing_tokens
encoded_inputs["num_truncated_tokens"] = total_len - max_length
# Add special tokens
if add_special_tokens:
sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
else:
sequence = np.concatenate([ids, pair_ids], axis=0) if pair is True else ids
token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
# Build output dictionary
encoded_inputs["input_ids"] = sequence
if return_token_type_ids:
encoded_inputs["token_type_ids"] = token_type_ids
if return_special_tokens_mask:
if add_special_tokens:
encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
else:
encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
# Padding
if padding != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
encoded_inputs = self.pad(
encoded_inputs,
max_length=max_length,
padding=padding,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
if return_length:
encoded_inputs["length"] = len(encoded_inputs["input_ids"])
batch_outputs = BatchEncoding(
encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
)
return batch_outputs
def num_special_tokens_to_add(self, pair: bool = False) -> int:
return 0 if not pair else 3
def get_special_tokens_mask(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
if token_ids_1 is None:
return [0 for _ in token_ids_0]
return [1 for _ in self.special_tokens["CLS"]] + [0 for _ in token_ids_0] + [1 for _ in self.special_tokens["SEP"]] + [0 for _ in token_ids_1] + [1 for _ in self.special_tokens["SEP"]]
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
if token_ids_1 is None:
return len(token_ids_0) * [0]
return [0]*len(self.special_tokens["CLS"]) + [0] * len(token_ids_0) + [0]*len(self.special_tokens["SEP"]) + [1] * len(token_ids_1) + [0]*len(self.special_tokens["SEP"])
def pad(
self,
encoded_inputs: Union[
BatchEncoding,
List[BatchEncoding],
Dict[str, EncodedInput],
Dict[str, List[EncodedInput]],
List[Dict[str, EncodedInput]],
],
padding: Union[bool, str, PaddingStrategy] = True,
max_length: Optional[int] = None,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchEncoding:
# If we have a list of dicts, let's convert it in a dict of lists
# We do this to allow using this method as a collate_fn function in PyTorch Dataloader
if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
# The model's main input name, usually `input_ids`, has to be passed for padding
if self.model_input_names[0] not in encoded_inputs:
raise ValueError(
"You should supply an encoding or a list of encodings to this method"
f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
)
required_input = encoded_inputs[self.model_input_names[0]]
if required_input is None:
if return_attention_mask:
encoded_inputs["attention_mask"] = []
return encoded_inputs
# If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
# and rebuild them afterwards if no return_tensors is specified
# Note that we lose the specific device the tensor may be on for PyTorch
first_element = required_input[0]
if isinstance(first_element, (list, tuple)):
# first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
index = 0
while len(required_input[index]) == 0:
index += 1
if index < len(required_input):
first_element = required_input[index][0]
# At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
if not isinstance(first_element, (int, list, tuple)):
if is_torch_available() and is_torch(first_element):
return_tensors = "pt" if return_tensors is None else return_tensors
elif isinstance(first_element, np.ndarray):
return_tensors = "np" if return_tensors is None else return_tensors
else:
raise ValueError(
f"type of {first_element} unknown: {type(first_element)}. "
f"Should be one of a python, numpy or pytorch object."
)
for key, value in encoded_inputs.items():
encoded_inputs[key] = to_py_obj(value)
required_input = encoded_inputs[self.model_input_names[0]]
if required_input and not isinstance(required_input[0], (list, tuple)):
encoded_inputs = self._pad(
encoded_inputs,
max_length=max_length,
padding_strategy=padding,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
return BatchEncoding(encoded_inputs, tensor_type=return_tensors)
batch_size = len(required_input)
assert all(
len(v) == batch_size for v in encoded_inputs.values()
), "Some items in the output dictionary have a different batch size than others."
if padding == PaddingStrategy.LONGEST:
max_length = max(len(inputs) for inputs in required_input)
padding = PaddingStrategy.MAX_LENGTH
batch_outputs = {}
for i in range(batch_size):
inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
outputs = self._pad(
inputs,
max_length=max_length,
padding_strategy=padding,
pad_to_multiple_of=pad_to_multiple_of,
return_attention_mask=return_attention_mask,
)
for key, value in outputs.items():
if key not in batch_outputs:
batch_outputs[key] = []
batch_outputs[key].append(value)
return BatchEncoding(batch_outputs, tensor_type=return_tensors)
def create_padding_token_embedding(self, input_embeddings=None):
raise NotImplementedError('This function is not implemented')
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
# Load from model defaults
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
required_input = encoded_inputs[self.model_input_names[0]]
if padding_strategy == PaddingStrategy.LONGEST:
max_length = len(required_input)
if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
if needs_to_be_padded:
difference = max_length - len(required_input)
if "token_type_ids" in encoded_inputs and isinstance(encoded_inputs["token_type_ids"], int):
encoded_inputs["token_type_ids"] = [encoded_inputs["token_type_ids"]]
if self.padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = [1] * len(required_input) + [0] * difference
if "token_type_ids" in encoded_inputs:
encoded_inputs["token_type_ids"] = (
encoded_inputs["token_type_ids"] + [1] * difference
)
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
encoded_inputs[self.model_input_names[0]] = required_input + [self.create_padding_token_embedding(input_embeddings=required_input)] * difference
elif self.padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [0] * difference + [1] * len(required_input)
if "token_type_ids" in encoded_inputs:
encoded_inputs["token_type_ids"] = [0] * difference + encoded_inputs[
"token_type_ids"
]
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
encoded_inputs[self.model_input_names[0]] = [self.create_padding_token_embedding(input_embeddings=required_input)] * difference + required_input
else:
raise ValueError("Invalid padding strategy:" + str(self.padding_side))
elif return_attention_mask and "attention_mask" not in encoded_inputs:
encoded_inputs["attention_mask"] = [1] * len(required_input)
return encoded_inputs
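The class is abstract: `text2embeddings` and `create_padding_token_embedding` must be supplied by a subclass. A minimal, hypothetical sketch of how the pieces fit together (the `CharEmbeddingTokenizer` name and its per-character random embeddings are illustrative only and not part of the package):
import numpy as np
from text_embeddings.base import EmbeddingTokenizer

class CharEmbeddingTokenizer(EmbeddingTokenizer):
    """Toy subclass: one fixed random vector per character."""

    def __init__(self, dim: int = 8, **kwargs):
        # model_input_names must be provided: the base __init__ stores None otherwise.
        super().__init__(model_input_names=["input_ids", "attention_mask"], **kwargs)
        self.dim = dim
        self._rng = np.random.default_rng(0)
        self._table = {}

    def text2embeddings(self, text: str) -> np.ndarray:
        if text is None:  # __call__ passes None when no text_pair is given
            return None
        rows = []
        for ch in text:
            if ch not in self._table:
                self._table[ch] = self._rng.normal(size=self.dim)
            rows.append(self._table[ch])
        return np.stack(rows) if rows else np.zeros((0, self.dim))

    def create_padding_token_embedding(self, input_embeddings=None):
        return np.zeros(self.dim)  # zero vector as the padding "token"

tokenizer = CharEmbeddingTokenizer()
batch = tokenizer(["short", "a longer example"], padding="longest", return_tensors="np")
print(batch["input_ids"].shape)   # (2, longest_char_count, 8)
print(batch["attention_mask"])    # 1 for real positions, 0 for padding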
Functions
def is_torch(x) ‑> bool
Helper function to check whether the input is a torch tensor.
Parameters
    x : [type]
        Input data
Returns
    bool
        Boolean value indicating whether the input is a torch tensor
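A small usage sketch (assumes both numpy and torch are installed in the environment):
import numpy as np
import torch
from text_embeddings.base import is_torch

print(is_torch(torch.zeros(2, 3)))  # True
print(is_torch(np.zeros((2, 3))))   # False: a numpy array is not a torch tensor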
Classes
class EmbeddingTokenizer (model_input_names: Union[List[str], NoneType] = None, special_tokens: Union[Dict[str, numpy.ndarray], NoneType] = None, max_length: Union[int, NoneType] = 2048)
Embedding based tokenizer. It assumes each token is mapped to a tensor instead of an index number. This implementation borrows most of its implementation from huggingface's transformers library.
Parameters
    model_input_names : Optional[List[str]], optional
        Required model input names, by default None
    special_tokens : Optional[Dict[str, np.ndarray]], optional
        Required model special tokens, by default None
    max_length : Optional[int], optional
        Maximum sequence length supported by the model, by default 2048
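If the special-token embeddings are known up front they can be passed to the constructor, which lets `__call__` skip deriving them from the literal strings "[CLS]" and "[SEP]". A hedged sketch (`MyTokenizer` is a hypothetical concrete subclass and `dim` an illustrative width):
import numpy as np

dim = 32
tokenizer = MyTokenizer(  # hypothetical EmbeddingTokenizer subclass
    model_input_names=["input_ids", "attention_mask"],
    special_tokens={
        "CLS": np.zeros((1, dim)),  # a single CLS embedding
        "SEP": np.ones((1, dim)),   # a single SEP embedding
    },
    max_length=512,
)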
Ancestors
- transformers.tokenization_utils_base.PreTrainedTokenizerBase
- transformers.tokenization_utils_base.SpecialTokensMixin
- transformers.utils.hub.PushToHubMixin
Subclasses
Class variables
var max_model_input_sizes : Dict[str, Union[int, NoneType]]
var model_input_names : List[str]
var padding_side : str
var pretrained_init_configuration : Dict[str, Dict[str, Any]]
var pretrained_vocab_files_map : Dict[str, Dict[str, str]]
var truncation_side : str
var vocab_files_names : Dict[str, str]
Methods
def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Union[List[int], NoneType] = None) ‑> List[int]
Build the model input from a sequence or a pair of sequences by concatenating the special-token embeddings. A single sequence is returned unchanged; for a pair, the CLS embedding, the first sequence, a SEP embedding, the second sequence and a final SEP embedding are concatenated along the token axis.
Args
    token_ids_0 (List[int]): The first tokenized sequence.
    token_ids_1 (List[int], optional): The second tokenized sequence.
Returns
    List[int]: The model input with special tokens.
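A worked sketch of the pair case (illustrative only; it assumes the CLS/SEP entries in `special_tokens` are single rows of width `d`, and stands in dummy arrays for the two embedded sequences):
import numpy as np

d = 4
cls, sep = np.zeros((1, d)), np.ones((1, d))        # stand-ins for special_tokens["CLS"/"SEP"]
ids_0, ids_1 = np.random.rand(5, d), np.random.rand(3, d)

merged = np.concatenate([cls, ids_0, sep, ids_1, sep], axis=0)
print(merged.shape)  # (1 + 5 + 1 + 3 + 1, 4) == (11, 4)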
def create_padding_token_embedding(self, input_embeddings=None)
Creates the embedding used to pad a sequence up to the target length (called by the padding logic in `_pad`). Not implemented in the base class; concrete subclasses must override it.
def create_token_type_ids_from_sequences(self, token_ids_0: List[int], token_ids_1: Union[List[int], NoneType] = None) ‑> List[int]
Create the token type IDs corresponding to the sequences passed. [What are token type IDs?](../glossary#token-type-ids)
Should be overridden in a subclass if the model has a special way of building those.
Args
    token_ids_0 (List[int]): The first tokenized sequence.
    token_ids_1 (List[int], optional): The second tokenized sequence.
Returns
    List[int]: The token type ids.
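For a pair, the layout assigns 0 to the first segment and every special position (including the trailing SEP) and 1 to the second segment. A sketch with single-row specials and sequence lengths 5 and 3:
cls_len, sep_len, len_0, len_1 = 1, 1, 5, 3
token_type_ids = [0] * cls_len + [0] * len_0 + [0] * sep_len + [1] * len_1 + [0] * sep_len
print(token_type_ids)  # [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0]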
def get_special_tokens_mask(self, token_ids_0: List[int], token_ids_1: Union[List[int], NoneType] = None) ‑> List[int]
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
Args
    token_ids_0 (List[int]): List of ids of the first sequence.
    token_ids_1 (List[int], optional): List of ids of the second sequence.
    already_has_special_tokens (bool, optional, defaults to False): Whether or not the token list is already formatted with special tokens for the model.
Returns
    A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
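With the same illustrative lengths as above, every CLS/SEP position is marked 1 and every content position 0:
cls_len, sep_len, len_0, len_1 = 1, 1, 5, 3
mask = [1] * cls_len + [0] * len_0 + [1] * sep_len + [0] * len_1 + [1] * sep_len
print(mask)  # [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1]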
def num_special_tokens_to_add(self, pair: bool = False) ‑> int
Returns the number of special tokens added to a sequence pair (3: one CLS and two SEP); single sequences get none, matching `build_inputs_with_special_tokens`.
def pad(self, encoded_inputs: Union[transformers.tokenization_utils_base.BatchEncoding, List[transformers.tokenization_utils_base.BatchEncoding], Dict[str, List[int]], Dict[str, List[List[int]]], List[Dict[str, List[int]]]], padding: Union[bool, str, transformers.utils.generic.PaddingStrategy] = True, max_length: Union[int, NoneType] = None, pad_to_multiple_of: Union[int, NoneType] = None, return_attention_mask: Union[bool, NoneType] = None, return_tensors: Union[str, transformers.utils.generic.TensorType, NoneType] = None) ‑> transformers.tokenization_utils_base.BatchEncoding
Pad a single encoded input or a batch of encoded inputs up to a predefined length or to the max sequence length in the batch.
Padding side (left/right) and padding token ids are defined at the tokenizer level (with `self.padding_side`, `self.pad_token_id` and `self.pad_token_type_id`).
If the `encoded_inputs` passed are dictionaries of numpy arrays, PyTorch tensors or TensorFlow tensors, the result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of PyTorch tensors, you will however lose the specific device of your tensors.
Args
    encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`): Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of tokenized inputs (list of [`BatchEncoding`], `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as well as in a PyTorch Dataloader collate function. Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors); see the note above for the return type.
    padding (`bool`, `str` or [`~utils.PaddingStrategy`], optional, defaults to `True`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among:
        - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence is provided).
        - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided.
        - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different lengths).
    max_length (`int`, optional): Maximum length of the returned list and optionally padding length (see above).
    pad_to_multiple_of (`int`, optional): If set, will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
    return_attention_mask (`bool`, optional): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention masks?](../glossary#attention-mask)
    return_tensors (`str` or [`~utils.TensorType`], optional): If set, will return tensors instead of list of python integers. Acceptable values are:
        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return Numpy `np.ndarray` objects.
    verbose (`bool`, optional, defaults to `True`): Whether or not to print more information and warnings.
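Because a list of per-example encodings is accepted, this method can serve as a DataLoader collate function. A hedged sketch (assumes PyTorch is installed, `tokenizer` is a concrete EmbeddingTokenizer subclass, and `examples` is a hypothetical list of dicts such as those produced by `prepare_for_model`):
from torch.utils.data import DataLoader

loader = DataLoader(
    examples,      # hypothetical: e.g. [{"input_ids": ...}, ...]
    batch_size=8,
    collate_fn=lambda batch: tokenizer.pad(
        batch, padding="longest", return_attention_mask=True, return_tensors="pt"
    ),
)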
def prepare_for_model(self, ids: List[int], pair_ids: Union[List[int], NoneType] = None, add_special_tokens: bool = True, padding: Union[bool, str, transformers.utils.generic.PaddingStrategy] = False, truncation: Union[bool, str, transformers.tokenization_utils_base.TruncationStrategy] = False, max_length: Union[int, NoneType] = None, stride: int = 0, pad_to_multiple_of: Union[int, NoneType] = None, return_tensors: Union[str, transformers.utils.generic.TensorType, NoneType] = None, return_token_type_ids: Union[bool, NoneType] = None, return_attention_mask: Union[bool, NoneType] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_length: bool = False, prepend_batch_axis: bool = False, **kwargs)
Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model. It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and manages a moving window (with user defined stride) for overflowing tokens. Please note, for pair_ids different from `None` and truncation_strategy = longest_first or `True`, it is not possible to return overflowing tokens. Such a combination of arguments will raise an error.
Args
    ids (`List[int]`): Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods.
    pair_ids (`List[int]`, optional): Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` and `convert_tokens_to_ids` methods.
    add_special_tokens (`bool`, optional, defaults to `True`): Whether or not to encode the sequences with the special tokens relative to their model.
    padding (`bool`, `str` or [`~utils.PaddingStrategy`], optional, defaults to `False`): Activates and controls padding. Accepts the following values:
        - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single sequence is provided).
        - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided.
        - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different lengths).
    truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], optional, defaults to `False`): Activates and controls truncation. Accepts the following values:
        - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. This will truncate token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is provided.
        - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. This will only truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
        - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. This will only truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
        - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater than the model maximum admissible input size).
    max_length (`int`, optional): Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to `None`, this will use the predefined model maximum length if a maximum length is required by one of the truncation/padding parameters. If the model has no specific maximum input length (like XLNet) truncation/padding to a maximum length will be deactivated.
    stride (`int`, optional, defaults to 0): If set to a number along with `max_length`, the overflowing tokens returned when `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence returned to provide some overlap between truncated and overflowing sequences. The value of this argument defines the number of overlapping tokens.
    is_split_into_words (`bool`, optional, defaults to `False`): Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) which it will tokenize. This is useful for NER or token classification.
    pad_to_multiple_of (`int`, optional): If set, will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
    return_tensors (`str` or [`~utils.TensorType`], optional): If set, will return tensors instead of list of python integers. Acceptable values are:
        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return Numpy `np.ndarray` objects.
    return_token_type_ids (`bool`, optional): Whether to return token type IDs. If left to the default, will return the token type IDs according to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are token type IDs?](../glossary#token-type-ids)
    return_attention_mask (`bool`, optional): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are attention masks?](../glossary#attention-mask)
    return_overflowing_tokens (`bool`, optional, defaults to `False`): Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead of returning overflowing tokens.
    return_special_tokens_mask (`bool`, optional, defaults to `False`): Whether or not to return special tokens mask information.
    return_offsets_mapping (`bool`, optional, defaults to `False`): Whether or not to return `(char_start, char_end)` for each token. This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`]; if using Python's tokenizer, this method will raise `NotImplementedError`.
    return_length (`bool`, optional, defaults to `False`): Whether or not to return the lengths of the encoded inputs.
    verbose (`bool`, optional, defaults to `True`): Whether or not to print more information and warnings.
    **kwargs: passed to the `self.tokenize()` method.
Return
    [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
    - input_ids – List of token ids to be fed to a model.
    - token_type_ids – List of token type ids to be fed to a model (when `return_token_type_ids=True` or if "token_type_ids" is in `self.model_input_names`).
    - attention_mask – List of indices specifying which tokens should be attended to by the model (when `return_attention_mask=True` or if "attention_mask" is in `self.model_input_names`).
    - overflowing_tokens – List of overflowing tokens sequences (when a `max_length` is specified and `return_overflowing_tokens=True`).
    - num_truncated_tokens – Number of tokens truncated (when a `max_length` is specified and `return_overflowing_tokens=True`).
    - special_tokens_mask – List of 0s and 1s, with 1 specifying added special tokens and 0 specifying regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
    - length – The length of the inputs (when `return_length=True`).
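A hedged sketch of preparing a single pair of embedded sequences (assumes `tokenizer` is a concrete subclass with `model_input_names` and `special_tokens` already set, as in the earlier sketches):
first = tokenizer.text2embeddings("first sentence")
second = tokenizer.text2embeddings("second sentence")

encoding = tokenizer.prepare_for_model(
    first,
    second,
    add_special_tokens=True,        # wrap the pair with the CLS/SEP embeddings
    truncation="longest_first",
    max_length=512,
    padding="do_not_pad",           # batch-level padding is applied later via pad()
    return_attention_mask=False,
    return_special_tokens_mask=True,
)
print(encoding["input_ids"].shape)           # (len(CLS) + len(first) + 2 * len(SEP) + len(second), dim)
print(len(encoding["special_tokens_mask"]))  # same length as the merged sequence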
def text2embeddings(self, text: str) ‑> numpy.ndarray
Abstract method that maps a text string to a sequence of embeddings (an `np.ndarray` with one row per token). Not implemented in the base class; concrete subclasses must provide it.