Module text_embeddings.base

base covers the base classes and functions for other embedding-based tokenizers.

Expand source code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2021-04-22 20:43:06
# @Author  : Chenghao Mou (mouchenghao@gmail.com)

"""base covers all the base classes, functions for other embedding based tokenizers."""

import abc
from typing import List, Optional, Union, Dict
from itertools import zip_longest

import numpy as np
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy, TruncationStrategy, TensorType, BatchEncoding, EncodedInput, is_torch_available, to_py_obj, TextInput

def is_torch(x) -> bool:  # pragma: no cover
    """
    Helper function to check whether the input is a torch tensor.

    Parameters
    ----------
    x : Any
        Input data

    Returns
    -------
    bool
        Boolean value indicating whether the input is a torch tensor
    """
    import torch
    return isinstance(x, torch.Tensor)

class EmbeddingTokenizer(PreTrainedTokenizerBase):
    """
    Embedding based tokenizer. It assumes each token is mapped to a tensor instead of an index number.
    This implementation borrows heavily from Hugging Face's transformers library.

    Parameters
    ----------
    model_input_names : Optional[List[str]], optional
        Required model input names, by default None
    special_tokens : Optional[Dict[str, np.ndarray]], optional
        Required model special tokens, by default None
    max_length : Optional[int], optional
        Maximum sequence length supported by the model, by default 2048
    """

    def __init__(
        self,
        model_input_names: Optional[List[str]] = None,
        special_tokens: Optional[Dict[str, np.ndarray]] = None,
        max_length: Optional[int] = 2048,
    ):
        self.model_input_names = model_input_names
        self.special_tokens = special_tokens
        self.max_length = max_length

    @abc.abstractmethod
    def text2embeddings(self, text: str) -> np.ndarray:
        raise NotImplementedError('This function is not implemented')

    def __call__(
        self,
        text: Union[TextInput, List[TextInput]],
        text_pair: Optional[Union[TextInput, List[TextInput]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_length: bool = False,
        **kwargs,
    ) -> BatchEncoding:
        """
        Tokenize the text into a sequence of embeddings.

        Parameters
        ----------
        text : Union[TextInput, List[TextInput]]
            A single text or a list of text
        text_pair : Optional[Union[TextInput, List[TextInput]]], optional
            A single text or a list of text, by default None
        add_special_tokens : bool, optional
            Whether to add special tokens to the data, by default True
        padding : Union[bool, str, PaddingStrategy], optional
            The padding strategy, by default False
        truncation : Union[bool, str, TruncationStrategy], optional
            The truncation strategy, by default False
        max_length : Optional[int], optional
            Maximum sequence length, overriding the class variable, by default None
        pad_to_multiple_of : Optional[int], optional
            Padding parameters, by default None
        return_tensors : Optional[Union[str, TensorType]], optional
            Return tensors in `pt`, `tf` or `np`, by default None
        return_token_type_ids : Optional[bool], optional
            Return token type ids, by default None
        return_attention_mask : Optional[bool], optional
            Return attention mask, by default None
        return_overflowing_tokens : bool, optional
            Return overflowing tokens, by default False
        return_special_tokens_mask : bool, optional
            Return special token mask, by default False
        return_length : bool, optional
            Return length, by default False

        Returns
        -------
        BatchEncoding
            A BatchEncoding object
        """
        if self.special_tokens is None:
            self.special_tokens = {
                "CLS": self.text2embeddings("[CLS]"),
                "SEP": self.text2embeddings("[SEP]"),
            }

        if add_special_tokens and text_pair:
            actual_max_length = self.max_length - len(self.special_tokens["SEP"]) * 2 - len(self.special_tokens["CLS"])
        else:
            actual_max_length = self.max_length

        batch_outputs = {}
        text = text if isinstance(text, list) else [text]
        text_pair = text_pair if isinstance(text_pair, list) else [text_pair]

        if isinstance(padding, str):
            padding = PaddingStrategy(padding)
        
        if isinstance(truncation, str):
            truncation = TruncationStrategy(truncation)

        for first_text, second_text in zip_longest(text, text_pair, fillvalue=None):
            
            first_embeddings = self.text2embeddings(first_text)
            second_embeddings = self.text2embeddings(second_text)

            outputs = self.prepare_for_model(
                first_embeddings,
                second_embeddings,
                add_special_tokens=add_special_tokens,
                padding=PaddingStrategy.DO_NOT_PAD,  # we pad in batch afterward
                truncation=truncation,
                max_length=max_length or actual_max_length,
                pad_to_multiple_of=None,  # we pad in batch afterward
                return_attention_mask=False,  # we pad in batch afterward
                return_token_type_ids=return_token_type_ids,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_length=return_length,
                return_tensors=None,  # We convert the whole batch to tensors at the end
                prepend_batch_axis=False,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        batch_outputs = self.pad(
            batch_outputs,
            padding=padding,
            max_length=max_length or actual_max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)

        return batch_outputs

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        if token_ids_1 is None:
            return token_ids_0
        
        return np.concatenate(
            [
                self.special_tokens["CLS"],
                token_ids_0,
                self.special_tokens["SEP"],
                token_ids_1,
                self.special_tokens["SEP"],
            ],
            axis=0
        )

    def prepare_for_model(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_length: bool = False,
        prepend_batch_axis: bool = False,
        **kwargs
    ):

        pair = bool(pair_ids is not None)
        len_ids = len(ids)
        len_pair_ids = len(pair_ids) if pair else 0
        if return_token_type_ids and not add_special_tokens:
            raise ValueError(
                "Asking to return token_type_ids while setting add_special_tokens to False "
                "results in an undefined behavior. Please set add_special_tokens to True or "
                "set return_token_type_ids to None."
            )

        # Load from model defaults
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        encoded_inputs = {}

        # Compute the total size of the returned encodings
        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)

        # Truncation: Handle max sequence length
        overflowing_tokens = []
        if truncation != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
            ids, pair_ids, overflowing_tokens = self.truncate_sequences(
                ids,
                pair_ids=pair_ids,
                num_tokens_to_remove=total_len - max_length,
                truncation_strategy=truncation,
                stride=stride,
            )

        if return_overflowing_tokens:
            encoded_inputs["overflowing_tokens"] = overflowing_tokens
            encoded_inputs["num_truncated_tokens"] = total_len - max_length

        # Add special tokens
        if add_special_tokens:
            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
        else:
            sequence = np.concatenate([ids, pair_ids], axis=0) if pair is True else ids
            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
        
        # Build output dictionary
        encoded_inputs["input_ids"] = sequence

        if return_token_type_ids:
            encoded_inputs["token_type_ids"] = token_type_ids
        if return_special_tokens_mask:
            if add_special_tokens:
                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
            else:
                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)

        # Padding
        if padding != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
            encoded_inputs = self.pad(
                encoded_inputs,
                max_length=max_length,
                padding=padding,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )

        if return_length:
            encoded_inputs["length"] = len(encoded_inputs["input_ids"])

        batch_outputs = BatchEncoding(
            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
        )
        
        return batch_outputs
    
    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        return 0 if not pair else 3
    
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        if token_ids_1 is None:
            return [0 for _ in token_ids_0]
        return [1 for _ in self.special_tokens["CLS"]] + [0 for _ in token_ids_0] + [1 for _ in self.special_tokens["SEP"]] + [0 for _ in token_ids_1] + [1 for _ in self.special_tokens["SEP"]]
    
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:

        if token_ids_1 is None:
            return len(token_ids_0) * [0]
        return [0]*len(self.special_tokens["CLS"]) + [0] * len(token_ids_0) + [0]*len(self.special_tokens["SEP"]) + [1] * len(token_ids_1) + [0]*len(self.special_tokens["SEP"])

    def pad(
        self,
        encoded_inputs: Union[
            BatchEncoding,
            List[BatchEncoding],
            Dict[str, EncodedInput],
            Dict[str, List[EncodedInput]],
            List[Dict[str, EncodedInput]],
        ],
        padding: Union[bool, str, PaddingStrategy] = True,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
    ) -> BatchEncoding:

        # If we have a list of dicts, let's convert it in a dict of lists
        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
        if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}

        # The model's main input name, usually `input_ids`, has to be passed for padding
        if self.model_input_names[0] not in encoded_inputs:
            raise ValueError(
                "You should supply an encoding or a list of encodings to this method"
                f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
            )

        required_input = encoded_inputs[self.model_input_names[0]]

        if required_input is None:
            if return_attention_mask:
                encoded_inputs["attention_mask"] = []
            return encoded_inputs

        # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
        # and rebuild them afterwards if no return_tensors is specified
        # Note that we lose the specific device the tensor may be on for PyTorch

        first_element = required_input[0]
        if isinstance(first_element, (list, tuple)):
            # first_element might be an empty list/tuple in some edge cases, so we grab the first non-empty element.
            index = 0
            while len(required_input[index]) == 0:
                index += 1
            if index < len(required_input):
                first_element = required_input[index][0]
        # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
        if not isinstance(first_element, (int, list, tuple)):
            if is_torch_available() and is_torch(first_element):
                return_tensors = "pt" if return_tensors is None else return_tensors
            elif isinstance(first_element, np.ndarray):
                return_tensors = "np" if return_tensors is None else return_tensors
            else:
                raise ValueError(
                    f"type of {first_element} unknown: {type(first_element)}. "
                    f"Should be one of a python, numpy or pytorch object."
                )

            for key, value in encoded_inputs.items():
                encoded_inputs[key] = to_py_obj(value)
        
        required_input = encoded_inputs[self.model_input_names[0]]
        if required_input and not isinstance(required_input[0], (list, tuple)):
            encoded_inputs = self._pad(
                encoded_inputs,
                max_length=max_length,
                padding_strategy=padding,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )
            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

        batch_size = len(required_input)
        assert all(
            len(v) == batch_size for v in encoded_inputs.values()
        ), "Some items in the output dictionary have a different batch size than others."

        if padding == PaddingStrategy.LONGEST:
            max_length = max(len(inputs) for inputs in required_input)
            padding = PaddingStrategy.MAX_LENGTH

        batch_outputs = {}
        for i in range(batch_size):
            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
            outputs = self._pad(
                inputs,
                max_length=max_length,
                padding_strategy=padding,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        return BatchEncoding(batch_outputs, tensor_type=return_tensors)
    
    def create_padding_token_embedding(self, input_embeddings=None):
        raise NotImplementedError('This function is not implemented')

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:

        # Load from model defaults
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        required_input = encoded_inputs[self.model_input_names[0]]
        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        if needs_to_be_padded:
            difference = max_length - len(required_input)
            if "token_type_ids" in encoded_inputs and isinstance(encoded_inputs["token_type_ids"], int):
                encoded_inputs["token_type_ids"] = [encoded_inputs["token_type_ids"]]
            if self.padding_side == "right":
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [1] * len(required_input) + [0] * difference
                if "token_type_ids" in encoded_inputs:
                    encoded_inputs["token_type_ids"] = (
                        encoded_inputs["token_type_ids"] + [1] * difference
                    )
                if "special_tokens_mask" in encoded_inputs:
                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                
                encoded_inputs[self.model_input_names[0]] = required_input + [self.create_padding_token_embedding(input_embeddings=required_input)] * difference
            elif self.padding_side == "left":
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [0] * difference + [1] * len(required_input)
                if "token_type_ids" in encoded_inputs:
                    encoded_inputs["token_type_ids"] = [0] * difference + encoded_inputs[
                        "token_type_ids"
                    ]
                if "special_tokens_mask" in encoded_inputs:
                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                encoded_inputs[self.model_input_names[0]] = [self.create_padding_token_embedding(input_embeddings=required_input)] * difference + required_input
            else:
                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
        elif return_attention_mask and "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * len(required_input)

        return encoded_inputs

Functions

def is_torch(x) ‑> bool

Helper function to check whether the input is a torch tensor.

Parameters

x : Any
Input data

Returns

bool
Boolean value indicating whether the input is a torch tensor
Expand source code
def is_torch(x) -> bool:  # pragma: no cover
    """
    Helper function to check whether the input is a torch tensor.

    Parameters
    ----------
    x : Any
        Input data

    Returns
    -------
    bool
        Boolean value indicating whether the input is a torch tensor
    """
    import torch
    return isinstance(x, torch.Tensor)
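
A short usage sketch (illustrative only; it assumes the module is importable as text_embeddings.base and that PyTorch may or may not be installed):

import numpy as np
from transformers.tokenization_utils_base import is_torch_available
from text_embeddings.base import is_torch  # assumed import path

print(is_torch(np.zeros((2, 8))))  # False: a NumPy array is not a torch tensor
if is_torch_available():
    import torch
    print(is_torch(torch.zeros(2, 8)))  # True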

Classes

class EmbeddingTokenizer (model_input_names: Union[List[str], NoneType] = None, special_tokens: Union[Dict[str, numpy.ndarray], NoneType] = None, max_length: Union[int, NoneType] = 2048)

Embedding based tokenizer. It assumes each token is mapped to a tensor instead of an index number. This implementation borrows heavily from Hugging Face's transformers library.

Parameters

model_input_names : Optional[List[str]], optional
Required model input names, by default None
special_tokens : Optional[Dict[str, np.ndarray]], optional
Required model special tokens, by default None
max_length : Optional[int], optional
Maximum sequence length supported by the model, by default 2048
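
Below is a minimal, hypothetical subclass sketch showing the two methods a concrete tokenizer is expected to provide (text2embeddings and create_padding_token_embedding). The character-level embedding scheme is purely illustrative and not part of this library, and the import path text_embeddings.base is assumed:

import numpy as np
from text_embeddings.base import EmbeddingTokenizer  # assumed import path


class ToyEmbeddingTokenizer(EmbeddingTokenizer):
    """Hypothetical subclass: one embedding row per character."""

    def __init__(self, dim: int = 16, max_length: int = 2048):
        super().__init__(
            model_input_names=["input_ids", "attention_mask"],
            special_tokens=None,  # built lazily from text2embeddings on the first call
            max_length=max_length,
        )
        self.dim = dim

    def text2embeddings(self, text: str) -> np.ndarray:
        if not text:
            return np.zeros((0, self.dim))
        # One row per character, derived deterministically from its code point.
        return np.stack([np.full(self.dim, ord(c) / 255.0) for c in text], axis=0)

    def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
        # A single all-zero vector appended (or prepended) when padding a batch.
        return np.zeros(self.dim)


tok = ToyEmbeddingTokenizer()
batch = tok(["hello", "hi"], padding="longest", return_tensors="np")
print(batch["input_ids"].shape)       # (2, padded_length, 16)
print(batch["attention_mask"].shape)  # (2, padded_length)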
Expand source code
class EmbeddingTokenizer(PreTrainedTokenizerBase):
    """
    Embedding based tokenizer. It assumes each token is mapped to a tensor instead of an index number.
    This implementation borrows heavily from Hugging Face's transformers library.

    Parameters
    ----------
    model_input_names : Optional[List[str]], optional
        Required model input names, by default None
    special_tokens : Optional[Dict[str, np.ndarray]], optional
        Required model special tokens, by default None
    max_length : Optional[int], optional
        Maximum sequence length supported by the model, by default 2048
    """

    def __init__(
        self,
        model_input_names: Optional[List[str]] = None,
        special_tokens: Optional[Dict[str, np.ndarray]] = None,
        max_length: Optional[int] = 2048,
    ):
        self.model_input_names = model_input_names
        self.special_tokens = special_tokens
        self.max_length = max_length

    @abc.abstractmethod
    def text2embeddings(self, text: str) -> np.ndarray:
        raise NotImplementedError('This function is not implemented')

    def __call__(
        self,
        text: Union[TextInput, List[TextInput]],
        text_pair: Optional[Union[TextInput, List[TextInput]]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_length: bool = False,
        **kwargs,
    ) -> BatchEncoding:
        """
        Tokenize the text into a sequence of embeddings.

        Parameters
        ----------
        text : Union[TextInput, List[TextInput]]
            A single text or a list of text
        text_pair : Optional[Union[TextInput, List[TextInput]]], optional
            A single text or a list of text, by default None
        add_special_tokens : bool, optional
            Whether to add special tokens to the data, by default True
        padding : Union[bool, str, PaddingStrategy], optional
            The padding strategy, by default False
        truncation : Union[bool, str, TruncationStrategy], optional
            The truncation strategy, by default False
        max_length : Optional[int], optional
            Maximum sequence length, overriding the class variable, by default None
        pad_to_multiple_of : Optional[int], optional
            Padding parameters, by default None
        return_tensors : Optional[Union[str, TensorType]], optional
            Return tensors in `pt`, `tf` or `np`, by default None
        return_token_type_ids : Optional[bool], optional
            Return token type ids, by default None
        return_attention_mask : Optional[bool], optional
            Return attention mask, by default None
        return_overflowing_tokens : bool, optional
            Return overflowing tokens, by default False
        return_special_tokens_mask : bool, optional
            Return special token mask, by default False
        return_length : bool, optional
            Return length, by default False

        Returns
        -------
        BatchEncoding
            A BatchEncoding object
        """
        if self.special_tokens is None:
            self.special_tokens = {
                "CLS": self.text2embeddings("[CLS]"),
                "SEP": self.text2embeddings("[SEP]"),
            }

        if add_special_tokens and text_pair:
            actual_max_length = self.max_length - len(self.special_tokens["SEP"]) * 2 - len(self.special_tokens["CLS"])
        else:
            actual_max_length = self.max_length

        batch_outputs = {}
        text = text if isinstance(text, list) else [text]
        text_pair = text_pair if isinstance(text_pair, list) else [text_pair]

        if isinstance(padding, str):
            padding = PaddingStrategy(padding)
        
        if isinstance(truncation, str):
            truncation = TruncationStrategy(truncation)

        for first_text, second_text in zip_longest(text, text_pair, fillvalue=None):
            
            first_embeddings = self.text2embeddings(first_text)
            second_embeddings = self.text2embeddings(second_text)

            outputs = self.prepare_for_model(
                first_embeddings,
                second_embeddings,
                add_special_tokens=add_special_tokens,
                padding=PaddingStrategy.DO_NOT_PAD,  # we pad in batch afterward
                truncation=truncation,
                max_length=max_length or actual_max_length,
                pad_to_multiple_of=None,  # we pad in batch afterward
                return_attention_mask=False,  # we pad in batch afterward
                return_token_type_ids=return_token_type_ids,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_length=return_length,
                return_tensors=None,  # We convert the whole batch to tensors at the end
                prepend_batch_axis=False,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        batch_outputs = self.pad(
            batch_outputs,
            padding=padding,
            max_length=max_length or actual_max_length,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)

        return batch_outputs

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        if token_ids_1 is None:
            return token_ids_0
        
        return np.concatenate(
            [
                self.special_tokens["CLS"],
                token_ids_0,
                self.special_tokens["SEP"],
                token_ids_1,
                self.special_tokens["SEP"],
            ],
            axis=0
        )

    def prepare_for_model(
        self,
        ids: List[int],
        pair_ids: Optional[List[int]] = None,
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        pad_to_multiple_of: Optional[int] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_length: bool = False,
        prepend_batch_axis: bool = False,
        **kwargs
    ):

        pair = bool(pair_ids is not None)
        len_ids = len(ids)
        len_pair_ids = len(pair_ids) if pair else 0
        if return_token_type_ids and not add_special_tokens:
            raise ValueError(
                "Asking to return token_type_ids while setting add_special_tokens to False "
                "results in an undefined behavior. Please set add_special_tokens to True or "
                "set return_token_type_ids to None."
            )

        # Load from model defaults
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        encoded_inputs = {}

        # Compute the total size of the returned encodings
        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)

        # Truncation: Handle max sequence length
        overflowing_tokens = []
        if truncation != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
            ids, pair_ids, overflowing_tokens = self.truncate_sequences(
                ids,
                pair_ids=pair_ids,
                num_tokens_to_remove=total_len - max_length,
                truncation_strategy=truncation,
                stride=stride,
            )

        if return_overflowing_tokens:
            encoded_inputs["overflowing_tokens"] = overflowing_tokens
            encoded_inputs["num_truncated_tokens"] = total_len - max_length

        # Add special tokens
        if add_special_tokens:
            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
        else:
            sequence = np.concatenate([ids, pair_ids], axis=0) if pair is True else ids
            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
        
        # Build output dictionary
        encoded_inputs["input_ids"] = sequence

        if return_token_type_ids:
            encoded_inputs["token_type_ids"] = token_type_ids
        if return_special_tokens_mask:
            if add_special_tokens:
                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
            else:
                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)

        # Padding
        if padding != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
            encoded_inputs = self.pad(
                encoded_inputs,
                max_length=max_length,
                padding=padding,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )

        if return_length:
            encoded_inputs["length"] = len(encoded_inputs["input_ids"])

        batch_outputs = BatchEncoding(
            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
        )
        
        return batch_outputs
    
    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        return 0 if not pair else 3
    
    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        if token_ids_1 is None:
            return [0 for _ in token_ids_0]
        return [1 for _ in self.special_tokens["CLS"]] + [0 for _ in token_ids_0] + [1 for _ in self.special_tokens["SEP"]] + [0 for _ in token_ids_1] + [1 for _ in self.special_tokens["SEP"]]
    
    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:

        if token_ids_1 is None:
            return len(token_ids_0) * [0]
        return [0]*len(self.special_tokens["CLS"]) + [0] * len(token_ids_0) + [0]*len(self.special_tokens["SEP"]) + [1] * len(token_ids_1) + [0]*len(self.special_tokens["SEP"])

    def pad(
        self,
        encoded_inputs: Union[
            BatchEncoding,
            List[BatchEncoding],
            Dict[str, EncodedInput],
            Dict[str, List[EncodedInput]],
            List[Dict[str, EncodedInput]],
        ],
        padding: Union[bool, str, PaddingStrategy] = True,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
    ) -> BatchEncoding:

        # If we have a list of dicts, let's convert it in a dict of lists
        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
        if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
            encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}

        # The model's main input name, usually `input_ids`, has to be passed for padding
        if self.model_input_names[0] not in encoded_inputs:
            raise ValueError(
                "You should supply an encoding or a list of encodings to this method"
                f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
            )

        required_input = encoded_inputs[self.model_input_names[0]]

        if required_input is None:
            if return_attention_mask:
                encoded_inputs["attention_mask"] = []
            return encoded_inputs

        # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
        # and rebuild them afterwards if no return_tensors is specified
        # Note that we lose the specific device the tensor may be on for PyTorch

        first_element = required_input[0]
        if isinstance(first_element, (list, tuple)):
            # first_element might be an empty list/tuple in some edge cases, so we grab the first non-empty element.
            index = 0
            while len(required_input[index]) == 0:
                index += 1
            if index < len(required_input):
                first_element = required_input[index][0]
        # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
        if not isinstance(first_element, (int, list, tuple)):
            if is_torch_available() and is_torch(first_element):
                return_tensors = "pt" if return_tensors is None else return_tensors
            elif isinstance(first_element, np.ndarray):
                return_tensors = "np" if return_tensors is None else return_tensors
            else:
                raise ValueError(
                    f"type of {first_element} unknown: {type(first_element)}. "
                    f"Should be one of a python, numpy or pytorch object."
                )

            for key, value in encoded_inputs.items():
                encoded_inputs[key] = to_py_obj(value)
        
        required_input = encoded_inputs[self.model_input_names[0]]
        if required_input and not isinstance(required_input[0], (list, tuple)):
            encoded_inputs = self._pad(
                encoded_inputs,
                max_length=max_length,
                padding_strategy=padding,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )
            return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

        batch_size = len(required_input)
        assert all(
            len(v) == batch_size for v in encoded_inputs.values()
        ), "Some items in the output dictionary have a different batch size than others."

        if padding == PaddingStrategy.LONGEST:
            max_length = max(len(inputs) for inputs in required_input)
            padding = PaddingStrategy.MAX_LENGTH

        batch_outputs = {}
        for i in range(batch_size):
            inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
            outputs = self._pad(
                inputs,
                max_length=max_length,
                padding_strategy=padding,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                batch_outputs[key].append(value)

        return BatchEncoding(batch_outputs, tensor_type=return_tensors)
    
    def create_padding_token_embedding(self, input_embeddings=None):
        raise NotImplementedError('This function is not implemented')

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:

        # Load from model defaults
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        required_input = encoded_inputs[self.model_input_names[0]]
        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length

        if needs_to_be_padded:
            difference = max_length - len(required_input)
            if "token_type_ids" in encoded_inputs and isinstance(encoded_inputs["token_type_ids"], int):
                encoded_inputs["token_type_ids"] = [encoded_inputs["token_type_ids"]]
            if self.padding_side == "right":
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [1] * len(required_input) + [0] * difference
                if "token_type_ids" in encoded_inputs:
                    encoded_inputs["token_type_ids"] = (
                        encoded_inputs["token_type_ids"] + [1] * difference
                    )
                if "special_tokens_mask" in encoded_inputs:
                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                
                encoded_inputs[self.model_input_names[0]] = required_input + [self.create_padding_token_embedding(input_embeddings=required_input)] * difference
            elif self.padding_side == "left":
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = [0] * difference + [1] * len(required_input)
                if "token_type_ids" in encoded_inputs:
                    encoded_inputs["token_type_ids"] = [0] * difference + encoded_inputs[
                        "token_type_ids"
                    ]
                if "special_tokens_mask" in encoded_inputs:
                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
                encoded_inputs[self.model_input_names[0]] = [self.create_padding_token_embedding(input_embeddings=required_input)] * difference + required_input
            else:
                raise ValueError("Invalid padding strategy:" + str(self.padding_side))
        elif return_attention_mask and "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * len(required_input)

        return encoded_inputs

Ancestors

  • transformers.tokenization_utils_base.PreTrainedTokenizerBase
  • transformers.tokenization_utils_base.SpecialTokensMixin
  • transformers.utils.hub.PushToHubMixin

Subclasses

Class variables

var max_model_input_sizes : Dict[str, Union[int, NoneType]]
var model_input_names : List[str]
var padding_side : str
var pretrained_init_configuration : Dict[str, Dict[str, Any]]
var pretrained_vocab_files_map : Dict[str, Dict[str, str]]
var truncation_side : str
var vocab_files_names : Dict[str, str]

Methods

def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Union[List[int], NoneType] = None) ‑> List[int]

Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and adding special tokens.

This implementation wraps a pair of sequences with the [CLS] and [SEP] embeddings; a single sequence is returned unchanged.

Args

token_ids_0 (List[int]): The first tokenized sequence.

token_ids_1 (List[int], optional): The second tokenized sequence.

Returns

List[int]: The model input with special tokens.

Expand source code
def build_inputs_with_special_tokens(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    if token_ids_1 is None:
        return token_ids_0
    
    return np.concatenate(
        [
            self.special_tokens["CLS"],
            token_ids_0,
            self.special_tokens["SEP"],
            token_ids_1,
            self.special_tokens["SEP"],
        ],
        axis=0
    )
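
A quick shape check, continuing the hypothetical ToyEmbeddingTokenizer sketch from the class description above (its special_tokens are populated by a first call):

cls, sep = tok.special_tokens["CLS"], tok.special_tokens["SEP"]
a, b = tok.text2embeddings("first"), tok.text2embeddings("second")
pair = tok.build_inputs_with_special_tokens(a, b)
# Pair layout is [CLS] a [SEP] b [SEP]; a single sequence is returned unchanged.
assert pair.shape[0] == len(cls) + len(a) + len(sep) + len(b) + len(sep)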
def create_padding_token_embedding(self, input_embeddings=None)
Expand source code
def create_padding_token_embedding(self, input_embeddings=None):
    raise NotImplementedError('This function is not implemented')
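
Subclasses override this to return a single padding vector with the same width as their token embeddings; a hedged sketch of such an override (self.dim is an attribute of the hypothetical subclass above, not of the base class):

def create_padding_token_embedding(self, input_embeddings=None):
    import numpy as np
    # _pad appends (or prepends) this vector `difference` times when padding a batch.
    if input_embeddings is not None and len(input_embeddings) > 0:
        return np.zeros(len(input_embeddings[0]))  # match the observed embedding width
    return np.zeros(self.dim)  # assumed subclass attribute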
def create_token_type_ids_from_sequences(self, token_ids_0: List[int], token_ids_1: Union[List[int], NoneType] = None) ‑> List[int]

Create the token type IDs corresponding to the sequences passed. What are token type IDs?

Should be overridden in a subclass if the model has a special way of building those.

Args

token_ids_0 (List[int]): The first tokenized sequence.

token_ids_1 (List[int], optional): The second tokenized sequence.

Returns

List[int]: The token type ids.

Expand source code
def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:

    if token_ids_1 is None:
        return len(token_ids_0) * [0]
    return [0]*len(self.special_tokens["CLS"]) + [0] * len(token_ids_0) + [0]*len(self.special_tokens["SEP"]) + [1] * len(token_ids_1) + [0]*len(self.special_tokens["SEP"])
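
For a pair, this implementation assigns 0 to the [CLS] block, the first sequence and both [SEP] blocks, and 1 to the second sequence only; a small check, continuing the hypothetical sketch above:

type_ids = tok.create_token_type_ids_from_sequences(a, b)
assert sum(type_ids) == len(b)          # only the second sequence is marked with 1
assert len(type_ids) == pair.shape[0]   # one id per embedding row, special tokens included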
def get_special_tokens_mask(self, token_ids_0: List[int], token_ids_1: Union[List[int], NoneType] = None) ‑> List[int]

Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding special tokens using the tokenizer prepare_for_model or encode_plus methods.

Args

token_ids_0 (List[int]): List of ids of the first sequence.

token_ids_1 (List[int], optional): List of ids of the second sequence.

Returns

A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
Expand source code
def get_special_tokens_mask(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    if token_ids_1 is None:
        return [0 for _ in token_ids_0]
    return [1 for _ in self.special_tokens["CLS"]] + [0 for _ in token_ids_0] + [1 for _ in self.special_tokens["SEP"]] + [0 for _ in token_ids_1] + [1 for _ in self.special_tokens["SEP"]]
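
The mask marks every [CLS]/[SEP] embedding row with 1 and every sequence row with 0; continuing the same hypothetical sketch:

mask = tok.get_special_tokens_mask(a, b)
assert len(mask) == pair.shape[0]
assert sum(mask) == len(cls) + 2 * len(sep)   # rows contributed by [CLS] and the two [SEP]s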
def num_special_tokens_to_add(self, pair: bool = False) ‑> int
Expand source code
def num_special_tokens_to_add(self, pair: bool = False) -> int:
    return 0 if not pair else 3
def pad(self, encoded_inputs: Union[transformers.tokenization_utils_base.BatchEncoding, List[transformers.tokenization_utils_base.BatchEncoding], Dict[str, List[int]], Dict[str, List[List[int]]], List[Dict[str, List[int]]]], padding: Union[bool, str, transformers.utils.generic.PaddingStrategy] = True, max_length: Union[int, NoneType] = None, pad_to_multiple_of: Union[int, NoneType] = None, return_attention_mask: Union[bool, NoneType] = None, return_tensors: Union[str, transformers.utils.generic.TensorType, NoneType] = None) ‑> transformers.tokenization_utils_base.BatchEncoding

Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length in the batch.

Padding side (left/right) and padding token ids are defined at the tokenizer level (with self.padding_side, self.pad_token_id and self.pad_token_type_id)

If the encoded_inputs passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the result will use the same type unless you provide a different tensor type with return_tensors. In the case of PyTorch tensors, you will lose the specific device of your tensors however.

Args

encoded_inputs ([BatchEncoding], list of [BatchEncoding], Dict[str, List[int]], Dict[str, List[List[int]]] or List[Dict[str, List[int]]]): Tokenized inputs. Can represent one input ([BatchEncoding] or Dict[str, List[int]]) or a batch of tokenized inputs (list of [BatchEncoding], Dict[str, List[List[int]]] or List[Dict[str, List[int]]]) so you can use this method during preprocessing as well as in a PyTorch Dataloader collate function.

Instead of List[int] you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors); see
the note above for the return type.

padding (bool, str or [~utils.PaddingStrategy], optional, defaults to True): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among:

- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
  sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
  acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
  lengths).

max_length (int, optional): Maximum length of the returned list and optionally padding length (see above).

pad_to_multiple_of (int, optional): If set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
>= 7.5 (Volta).

return_attention_mask (bool, optional): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the return_outputs attribute.

[What are attention masks?](../glossary#attention-mask)

return_tensors (str or [~utils.TensorType], optional): If set, will return tensors instead of list of python integers. Acceptable values are:

- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.

verbose (bool, optional, defaults to True): Whether or not to print more information and warnings.

Expand source code
def pad(
    self,
    encoded_inputs: Union[
        BatchEncoding,
        List[BatchEncoding],
        Dict[str, EncodedInput],
        Dict[str, List[EncodedInput]],
        List[Dict[str, EncodedInput]],
    ],
    padding: Union[bool, str, PaddingStrategy] = True,
    max_length: Optional[int] = None,
    pad_to_multiple_of: Optional[int] = None,
    return_attention_mask: Optional[bool] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
) -> BatchEncoding:

    # If we have a list of dicts, let's convert it in a dict of lists
    # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
    if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
        encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}

    # The model's main input name, usually `input_ids`, has to be passed for padding
    if self.model_input_names[0] not in encoded_inputs:
        raise ValueError(
            "You should supply an encoding or a list of encodings to this method"
            f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
        )

    required_input = encoded_inputs[self.model_input_names[0]]

    if required_input is None:
        if return_attention_mask:
            encoded_inputs["attention_mask"] = []
        return encoded_inputs

    # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects
    # and rebuild them afterwards if no return_tensors is specified
    # Note that we lose the specific device the tensor may be on for PyTorch

    first_element = required_input[0]
    if isinstance(first_element, (list, tuple)):
        # first_element might be an empty list/tuple in some edge cases, so we grab the first non-empty element.
        index = 0
        while len(required_input[index]) == 0:
            index += 1
        if index < len(required_input):
            first_element = required_input[index][0]
    # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do.
    if not isinstance(first_element, (int, list, tuple)):
        if is_torch_available() and is_torch(first_element):
            return_tensors = "pt" if return_tensors is None else return_tensors
        elif isinstance(first_element, np.ndarray):
            return_tensors = "np" if return_tensors is None else return_tensors
        else:
            raise ValueError(
                f"type of {first_element} unknown: {type(first_element)}. "
                f"Should be one of a python, numpy or pytorch object."
            )

        for key, value in encoded_inputs.items():
            encoded_inputs[key] = to_py_obj(value)
    
    required_input = encoded_inputs[self.model_input_names[0]]
    if required_input and not isinstance(required_input[0], (list, tuple)):
        encoded_inputs = self._pad(
            encoded_inputs,
            max_length=max_length,
            padding_strategy=padding,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )
        return BatchEncoding(encoded_inputs, tensor_type=return_tensors)

    batch_size = len(required_input)
    assert all(
        len(v) == batch_size for v in encoded_inputs.values()
    ), "Some items in the output dictionary have a different batch size than others."

    if padding == PaddingStrategy.LONGEST:
        max_length = max(len(inputs) for inputs in required_input)
        padding = PaddingStrategy.MAX_LENGTH

    batch_outputs = {}
    for i in range(batch_size):
        inputs = dict((k, v[i]) for k, v in encoded_inputs.items())
        outputs = self._pad(
            inputs,
            max_length=max_length,
            padding_strategy=padding,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

        for key, value in outputs.items():
            if key not in batch_outputs:
                batch_outputs[key] = []
            batch_outputs[key].append(value)

    return BatchEncoding(batch_outputs, tensor_type=return_tensors)
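
Because pad also accepts a list of per-example dicts, the same call can serve as a collate_fn for a PyTorch DataLoader; a minimal sketch reusing the hypothetical ToyEmbeddingTokenizer from the class description (note that this override passes padding through unchanged, so use the explicit `'longest'` string or a PaddingStrategy value rather than the boolean shorthand):

examples = [{"input_ids": tok.text2embeddings(t)} for t in ["short", "a longer text"]]
batch = tok.pad(examples, padding="longest", return_tensors="np")
print(batch["input_ids"].shape)       # (2, longest_length, 16) with the toy embeddings
print(batch["attention_mask"].shape)  # (2, longest_length)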
def prepare_for_model(self, ids: List[int], pair_ids: Union[List[int], NoneType] = None, add_special_tokens: bool = True, padding: Union[bool, str, transformers.utils.generic.PaddingStrategy] = False, truncation: Union[bool, str, transformers.tokenization_utils_base.TruncationStrategy] = False, max_length: Union[int, NoneType] = None, stride: int = 0, pad_to_multiple_of: Union[int, NoneType] = None, return_tensors: Union[str, transformers.utils.generic.TensorType, NoneType] = None, return_token_type_ids: Union[bool, NoneType] = None, return_attention_mask: Union[bool, NoneType] = None, return_overflowing_tokens: bool = False, return_special_tokens_mask: bool = False, return_length: bool = False, prepend_batch_axis: bool = False, **kwargs)

Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model. It adds special tokens, truncates sequences if overflowing while taking into account the special tokens, and manages a moving window (with user-defined stride) for overflowing tokens. Note that for pair_ids different from None and truncation_strategy = longest_first or True, it is not possible to return overflowing tokens; such a combination of arguments will raise an error.

Args

ids (List[int]): Tokenized input ids of the first sequence. Can be obtained from a string by chaining the tokenize and convert_tokens_to_ids methods.

pair_ids (List[int], optional): Tokenized input ids of the second sequence. Can be obtained from a string by chaining the tokenize and convert_tokens_to_ids methods.

add_special_tokens (bool, optional, defaults to True): Whether or not to encode the sequences with the special tokens relative to their model.

padding (bool, str or [~utils.PaddingStrategy], optional, defaults to False): Activates and controls padding. Accepts the following values:

- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
  sequence is provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
  acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
  lengths).

truncation (bool, str or [~tokenization_utils_base.TruncationStrategy], optional, defaults to False): Activates and controls truncation. Accepts the following values:

- `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
  to the maximum acceptable input length for the model if that argument is not provided. This will
  truncate token by token, removing a token from the longest sequence in the pair if a pair of
  sequences (or a batch of pairs) is provided.
- `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
  maximum acceptable input length for the model if that argument is not provided. This will only
  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
  maximum acceptable input length for the model if that argument is not provided. This will only
  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
- `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
  greater than the model maximum admissible input size).

max_length (int, optional): Controls the maximum length to use by one of the truncation/padding parameters.

If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
is required by one of the truncation/padding parameters. If the model has no specific maximum input
length (like XLNet) truncation/padding to a maximum length will be deactivated.

stride (int, optional, defaults to 0): If set to a number along with max_length, the overflowing tokens returned when return_overflowing_tokens=True will contain some tokens from the end of the truncated sequence returned to provide some overlap between truncated and overflowing sequences. The value of this argument defines the number of overlapping tokens.

is_split_into_words (bool, optional, defaults to False): Whether or not the input is already pre-tokenized (e.g., split into words). If set to True, the tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) which it will tokenize. This is useful for NER or token classification.

pad_to_multiple_of (int, optional): If set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).

return_tensors (str or [~utils.TensorType], optional): If set, will return tensors instead of list of python integers. Acceptable values are:

- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.

return_token_type_ids (bool, optional): Whether to return token type IDs. If left to the default, will return the token type IDs according to the specific tokenizer's default, defined by the return_outputs attribute.

[What are token type IDs?](../glossary#token-type-ids)

return_attention_mask (bool, optional): Whether to return the attention mask. If left to the default, will return the attention mask according to the specific tokenizer's default, defined by the return_outputs attribute.

[What are attention masks?](../glossary#attention-mask)

return_overflowing_tokens (bool, optional, defaults to False): Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch of pairs) is provided with truncation_strategy = longest_first or True, an error is raised instead of returning overflowing tokens.

return_special_tokens_mask (bool, optional, defaults to False): Whether or not to return special tokens mask information.

return_offsets_mapping (bool, optional, defaults to False): Whether or not to return (char_start, char_end) for each token. This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`]; if using Python's tokenizer, this method will raise `NotImplementedError`.

return_length (bool, optional, defaults to False): Whether or not to return the lengths of the encoded inputs.

verbose (bool, optional, defaults to True): Whether or not to print more information and warnings.

**kwargs: passed to the self.tokenize() method

Return

[BatchEncoding]: A [BatchEncoding] with the following fields:

  • input_ids – List of token ids to be fed to a model.

What are input IDs?

  • token_type_ids – List of token type ids to be fed to a model (when return_token_type_ids=True or if "token_type_ids" is in self.model_input_names).

What are token type IDs?

  • attention_mask – List of indices specifying which tokens should be attended to by the model (when return_attention_mask=True or if "attention_mask" is in self.model_input_names).

What are attention masks?

  • overflowing_tokens – List of overflowing tokens sequences (when a max_length is specified and return_overflowing_tokens=True).
  • num_truncated_tokens – Number of tokens truncated (when a max_length is specified and return_overflowing_tokens=True).
  • special_tokens_mask – List of 0s and 1s, with 1 specifying added special tokens and 0 specifying regular sequence tokens (when add_special_tokens=True and return_special_tokens_mask=True).
  • length – The length of the inputs (when return_length=True)
Expand source code
def prepare_for_model(
    self,
    ids: List[int],
    pair_ids: Optional[List[int]] = None,
    add_special_tokens: bool = True,
    padding: Union[bool, str, PaddingStrategy] = False,
    truncation: Union[bool, str, TruncationStrategy] = False,
    max_length: Optional[int] = None,
    stride: int = 0,
    pad_to_multiple_of: Optional[int] = None,
    return_tensors: Optional[Union[str, TensorType]] = None,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_length: bool = False,
    prepend_batch_axis: bool = False,
    **kwargs
):

    pair = bool(pair_ids is not None)
    len_ids = len(ids)
    len_pair_ids = len(pair_ids) if pair else 0
    if return_token_type_ids and not add_special_tokens:
        raise ValueError(
            "Asking to return token_type_ids while setting add_special_tokens to False "
            "results in an undefined behavior. Please set add_special_tokens to True or "
            "set return_token_type_ids to None."
        )

    # Load from model defaults
    if return_token_type_ids is None:
        return_token_type_ids = "token_type_ids" in self.model_input_names
    if return_attention_mask is None:
        return_attention_mask = "attention_mask" in self.model_input_names

    encoded_inputs = {}

    # Compute the total size of the returned encodings
    total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)

    # Truncation: Handle max sequence length
    overflowing_tokens = []
    if truncation != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
        ids, pair_ids, overflowing_tokens = self.truncate_sequences(
            ids,
            pair_ids=pair_ids,
            num_tokens_to_remove=total_len - max_length,
            truncation_strategy=truncation,
            stride=stride,
        )

    if return_overflowing_tokens:
        encoded_inputs["overflowing_tokens"] = overflowing_tokens
        encoded_inputs["num_truncated_tokens"] = total_len - max_length

    # Add special tokens
    if add_special_tokens:
        sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
        token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
    else:
        sequence = np.concatenate([ids, pair_ids], axis=0) if pair is True else ids
        token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
    
    # Build output dictionary
    encoded_inputs["input_ids"] = sequence

    if return_token_type_ids:
        encoded_inputs["token_type_ids"] = token_type_ids
    if return_special_tokens_mask:
        if add_special_tokens:
            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
        else:
            encoded_inputs["special_tokens_mask"] = [0] * len(sequence)

    # Padding
    if padding != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
        encoded_inputs = self.pad(
            encoded_inputs,
            max_length=max_length,
            padding=padding,
            pad_to_multiple_of=pad_to_multiple_of,
            return_attention_mask=return_attention_mask,
        )

    if return_length:
        encoded_inputs["length"] = len(encoded_inputs["input_ids"])

    batch_outputs = BatchEncoding(
        encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
    )
    
    return batch_outputs
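
prepare_for_model is normally driven by __call__, which defers padding and attention masks to the batch-level pad call; a minimal direct call in that same style, again using the hypothetical toy tokenizer from the class description:

from transformers.tokenization_utils_base import PaddingStrategy

emb = tok.text2embeddings("hello world")
enc = tok.prepare_for_model(
    emb,
    add_special_tokens=True,
    padding=PaddingStrategy.DO_NOT_PAD,   # padding/attention masks are handled later by pad()
    return_attention_mask=False,
)
print(enc["input_ids"].shape)  # (sequence_length, 16); no [CLS]/[SEP] for a single sequence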
def text2embeddings(self, text: str) ‑> numpy.ndarray
Expand source code
@abc.abstractmethod
def text2embeddings(self, text: str) -> np.ndarray:
    raise NotImplementedError('This function is not implemented')