Module text_embeddings.byte

Expand source code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2021-07-18 14:36:09
# @Author  : Chenghao Mou (mouchenghao@gmail.com)

from text_embeddings.byte.byt5 import ByT5Tokenizer
from text_embeddings.byte.charformer import ByteTokenizer, GBST

__all__ = ['ByT5Tokenizer', 'GBST', 'ByteTokenizer']

Sub-modules

text_embeddings.byte.byt5

From the paper ByT5: Towards a token-free future with pre-trained byte-to-byte models.

text_embeddings.byte.charformer

From the paper Charformer: Fast Character Transformers via Gradient-based Subword Tokenization.
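
A minimal sketch of how the two sub-modules fit together, mirroring the GBST doctest further down (torch is assumed to be installed; the argument values are simply the ones used in that doctest):

import torch
from text_embeddings.byte import ByteTokenizer, GBST

tokenizer = ByteTokenizer()   # raw bytes -> integer ids, padded to max_length (1024 by default)
model = GBST(embed_size=128, max_block_size=4, downsampling_factor=2, score_calibration=True, vocab_size=256)

batch = tokenizer(["Life is like a box of chocolates.", "Coding is fun."], add_special_tokens=True)
hidden = model(torch.tensor(batch["input_ids"]).long())   # torch.Size([2, 512, 128])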

Classes

class ByT5Tokenizer (embed_size: int = 259, model_input_names: Union[List[str], NoneType] = None, special_tokens: Union[Dict[str, numpy.ndarray], NoneType] = None, max_length: Union[int, NoneType] = 1024)

Embed text into byte sequences. This is different from other tokenizers because it still has a small vocabulary where each byte is mapped to an index.

Parameters

embed_size : int, optional
The size of the embedding, by default 259 (256 + 3 special tokens)
model_input_names : Optional[List[str]], optional
Required inputs of the downstream model; by default the same input names as BERT: ["input_ids", "token_type_ids", "attention_mask"]
special_tokens : Optional[Dict[str, np.ndarray]], optional
Special tokens for the downstream model; by default BERT-style CLS and SEP tokens are created as one-hot vectors (index 1 for CLS, index 2 for SEP)
max_length : Optional[int], optional
Maximum character length, by default 1024

Examples

>>> tokenizer = ByT5Tokenizer()
>>> e = tokenizer.text2embeddings("This is a test message")
>>> e.shape
(22, 259)
>>> np.equal(np.max(e, axis=1), np.ones((len(e)))).all()
True
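
The index layout can be read off the source below: index 0 is reserved for padding, index 1 for CLS, index 2 for SEP, and byte value b is one-hot encoded at position b + 3. Continuing the example above:

>>> int(e[0].argmax()) == ord("T") + 3
True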
Expand source code
class ByT5Tokenizer(EmbeddingTokenizer):
    """Embed text into byte sequences. This is different from other tokenizers because it still has a small vocabulary where each byte is mapped to an index.

    Parameters
    ----------
    embed_size : int, optional
        The size of the embedding, by default 259 (256 + 3 special tokens)
    model_input_names : Optional[List[str]], optional
        Required inputs of the downstream model; by default the same input names as BERT: ["input_ids", "token_type_ids", "attention_mask"]
    special_tokens : Optional[Dict[str, np.ndarray]], optional
        Special tokens for the downstream model; by default BERT-style CLS and SEP tokens are created as one-hot vectors (index 1 for CLS, index 2 for SEP)
    max_length : Optional[int], optional
        Maximum character length, by default 1024

    Examples
    --------
    >>> tokenizer = ByT5Tokenizer()
    >>> e = tokenizer.text2embeddings("This is a test message")
    >>> e.shape
    (22, 259)
    >>> np.equal(np.max(e, axis=1), np.ones((len(e)))).all()
    True
    """

    def __init__(
        self,
        embed_size: int = 259,
        model_input_names: Optional[List[str]] = None,
        special_tokens: Optional[Dict[str, np.ndarray]] = None,
        max_length: Optional[int] = 1024,
    ):
        super().__init__(model_input_names, special_tokens, max_length)
        self.embed_size = embed_size
        self.model_input_names = model_input_names
        self.special_tokens = special_tokens
        self.max_length = max_length

        if self.model_input_names is None:
            logger.warning(
                'Using default model_input_names values ["input_ids", "token_type_ids", "attention_mask"]'
            )
            self.model_input_names = ["input_ids", "token_type_ids", "attention_mask"]

        if self.special_tokens is None:
            logger.warning("Using default special_tokens values")
            self.special_tokens = {
                "SEP": np.zeros((self.embed_size,)),
                "CLS": np.zeros((self.embed_size,)),
            }
            self.special_tokens["CLS"][1] = 1
            self.special_tokens["SEP"][2] = 1

        logger.info("Be sure to add an embedding layer when using a ByT5Tokenizer.")

    def text2embeddings(self, text: str) -> np.ndarray:
        """Convert text into an numpy array, in (sequence_length, embed_size) shape.

        Parameters
        ----------
        text : str
            Input text

        Returns
        -------
        np.ndarray
            An array in (sequence_length, embed_size) shape
        """
        if not text:
            return None

        b = text.encode("utf-8", errors="ignore")

        result = np.zeros((len(b), self.embed_size))
        for i, byte in enumerate(b):
            result[i][byte + 3] = 1

        return result

    def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
        """Create a padding token embedding.

        Parameters
        ----------
        input_embeddings : np.ndarray, optional
            Embedded input, by default None

        Returns
        -------
        np.ndarray
            A padding token embedding compatible with the input
        """
        e = np.zeros((self.embed_size,))
        e[0] = 1
        return e
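
The "add an embedding layer" reminder logged in __init__ refers to projecting these one-hot rows into a dense space. A minimal sketch, assuming torch is available and using an arbitrary example width of 128 (not part of the library):

import torch
import torch.nn as nn

tokenizer = ByT5Tokenizer()
e = tokenizer.text2embeddings("This is a test message")   # (22, 259) one-hot rows
project = nn.Linear(259, 128, bias=False)                 # equivalent to an embedding lookup on one-hot input
dense = project(torch.tensor(e).float().unsqueeze(0))     # (1, 22, 128)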

Ancestors

  • EmbeddingTokenizer
  • transformers.tokenization_utils_base.PreTrainedTokenizerBase
  • transformers.tokenization_utils_base.SpecialTokensMixin
  • transformers.utils.hub.PushToHubMixin

Class variables

var max_model_input_sizes : Dict[str, Union[int, NoneType]]
var model_input_names : List[str]
var padding_side : str
var pretrained_init_configuration : Dict[str, Dict[str, Any]]
var pretrained_vocab_files_map : Dict[str, Dict[str, str]]
var truncation_side : str
var vocab_files_names : Dict[str, str]

Methods

def create_padding_token_embedding(self, input_embeddings=None) ‑> numpy.ndarray

Create a padding token embedding.

Parameters

input_embeddings : np.ndarray, optional
Embedded input, by default None

Returns

np.ndarray
A padding token embedding compatible with the input
Expand source code
def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
    """Create a padding token embedding.

    Parameters
    ----------
    input_embeddings : np.ndarray, optional
        Embedded input, by default None

    Returns
    -------
    np.ndarray
        A padding token embedding compatible with the input
    """
    e = np.zeros((self.embed_size,))
    e[0] = 1
    return e
def text2embeddings(self, text: str) ‑> numpy.ndarray

Convert text into a numpy array of shape (sequence_length, embed_size).

Parameters

text : str
Input text

Returns

np.ndarray
An array in (sequence_length, embed_size) shape
Expand source code
def text2embeddings(self, text: str) -> np.ndarray:
    """Convert text into an numpy array, in (sequence_length, embed_size) shape.

    Parameters
    ----------
    text : str
        Input text

    Returns
    -------
    np.ndarray
        An array in (sequence_length, embed_size) shape
    """
    if not text:
        return None

    b = text.encode("utf-8", errors="ignore")

    result = np.zeros((len(b), self.embed_size))
    for i, byte in enumerate(b):
        result[i][byte + 3] = 1

    return result

Inherited members

class ByteTokenizer (model_input_names: Union[List[str], NoneType] = None, special_tokens: Union[Dict[str, numpy.ndarray], NoneType] = None, max_length: Union[int, NoneType] = 1024)

Embed text into byte sequences. This is different from other tokenizers because it still needs a small vocabulary where each byte is mapped to an index.

Parameters

model_input_names : Optional[List[str]], optional
Required inputs of the downstream model; by default the same input names as BERT: ["input_ids", "token_type_ids", "attention_mask"]
special_tokens : Optional[Dict[str, np.ndarray]], optional
Special tokens for the downstream model; by default BERT-style CLS and SEP tokens are assigned the ids 1 and 2
max_length : Optional[int], optional
Maximum character length, by default 1024

Examples

>>> from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy
>>> tokenizer = ByteTokenizer()
>>> e = tokenizer.text2embeddings("This is a test message")
>>> e.shape
(22, 1)
>>> r = tokenizer(["This is a test message", "This is another test message"], padding=PaddingStrategy.LONGEST)
>>> r["input_ids"].shape
(2, 28)
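
The id scheme matches the source below: 0 is padding, 1 is CLS, 2 is SEP, and byte value b maps to b + len(special_tokens) + 1 = b + 3. Continuing the first example above:

>>> int(e[0][0]) == ord("T") + 3
True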
Expand source code
class ByteTokenizer(EmbeddingTokenizer):
    """Embed text into byte sequences. This is different from other tokenizers because it still needs a small vocabulary where each byte is mapped to an index.

    Parameters
    ----------
    model_input_names : Optional[List[str]], optional
        Required inputs of the downstream model; by default the same input names as BERT: ["input_ids", "token_type_ids", "attention_mask"]
    special_tokens : Optional[Dict[str, np.ndarray]], optional
        Special tokens for the downstream model; by default BERT-style CLS and SEP tokens are assigned the ids 1 and 2
    max_length : Optional[int], optional
        Maximum character length, by default 1024

    Examples
    --------
    >>> from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy
    >>> tokenizer = ByteTokenizer()
    >>> e = tokenizer.text2embeddings("This is a test message")
    >>> e.shape
    (22, 1)
    >>> r = tokenizer(["This is a test message", "This is another test message"], padding=PaddingStrategy.LONGEST)
    >>> r["input_ids"].shape
    (2, 28)
    """

    def __init__(
        self,
        model_input_names: Optional[List[str]] = None,
        special_tokens: Optional[Dict[str, np.ndarray]] = None,
        max_length: Optional[int] = 1024,
    ):
        super().__init__(model_input_names, special_tokens, max_length)
        self.embed_size = 1
        self.model_input_names = model_input_names
        self.special_tokens = special_tokens
        self.max_length = max_length

        if self.model_input_names is None:
            logger.warning(
                'Using default model_input_names values ["input_ids", "token_type_ids", "attention_mask"]'
            )
            self.model_input_names = ["input_ids", "token_type_ids", "attention_mask"]

        if self.special_tokens is None:
            logger.warning("Using default special_tokens values")
            self.special_tokens = {
                "SEP": np.zeros((self.embed_size,)),
                "CLS": np.zeros((self.embed_size,)),
            }
            self.special_tokens["CLS"] = 1
            self.special_tokens["SEP"] = 2

        logger.info("Be sure to add an embedding layer when using a ByteTokenizer.")

    def text2embeddings(self, text: str) -> np.ndarray:
        """Convert text into an numpy array, in (sequence_length, embed_size) shape.

        Parameters
        ----------
        text : str
            Input text

        Returns
        -------
        np.ndarray
            An array in (sequence_length, embed_size) shape
        """
        if not text:
            return None

        b = text.encode("utf-8", errors="ignore")

        result = np.zeros((len(b), self.embed_size))
        for i, byte in enumerate(b):
            result[i] = byte + len(self.special_tokens) + 1

        return result

    def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
        """Create a padding token embedding.

        Parameters
        ----------
        input_embeddings : np.ndarray, optional
            Embedded input, by default None

        Returns
        -------
        np.ndarray
            A padding token embedding compatible with the input
        """
        e = np.zeros((self.embed_size,))
        return e

    def __call__(self, *args, **kwargs):
        results = super().__call__(*args, **kwargs)
        results["input_ids"] = np.squeeze(results["input_ids"], axis=-1)
        return results
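
As with ByT5Tokenizer, the logged reminder points at a separate embedding layer; a plain nn.Embedding is one option (GBST below is another). A rough sketch, assuming 259 ids (256 byte values plus the 3 reserved ids) and an arbitrary example width of 128:

import torch
import torch.nn as nn

tokenizer = ByteTokenizer()
ids = torch.tensor(tokenizer(["Coding is fun."], add_special_tokens=True)["input_ids"]).long()   # (1, 1024) with the default max_length
embed = nn.Embedding(256 + 3, 128, padding_idx=0)   # ids 0/1/2 are PAD/CLS/SEP, bytes start at 3
dense = embed(ids)                                  # (1, 1024, 128)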

Ancestors

  • EmbeddingTokenizer
  • transformers.tokenization_utils_base.PreTrainedTokenizerBase
  • transformers.tokenization_utils_base.SpecialTokensMixin
  • transformers.utils.hub.PushToHubMixin

Class variables

var max_model_input_sizes : Dict[str, Union[int, NoneType]]
var model_input_names : List[str]
var padding_side : str
var pretrained_init_configuration : Dict[str, Dict[str, Any]]
var pretrained_vocab_files_map : Dict[str, Dict[str, str]]
var truncation_side : str
var vocab_files_names : Dict[str, str]

Methods

def create_padding_token_embedding(self, input_embeddings=None) ‑> numpy.ndarray

Create a padding token embedding.

Parameters

input_embeddings : np.ndarray, optional
Embedded input, by default None

Returns

np.ndarray
A padding token embedding compatible with the input
Expand source code
def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
    """Create a padding token embedding.

    Parameters
    ----------
    input_embeddings : np.ndarray, optional
        Embedded input, by default None

    Returns
    -------
    np.ndarray
        A padding token embedding compatible with the input
    """
    e = np.zeros((self.embed_size,))
    return e
def text2embeddings(self, text: str) ‑> numpy.ndarray

Convert text into a numpy array of shape (sequence_length, embed_size).

Parameters

text : str
Input text

Returns

np.ndarray
An array in (sequence_length, embed_size) shape
Expand source code
def text2embeddings(self, text: str) -> np.ndarray:
    """Convert text into an numpy array, in (sequence_length, embed_size) shape.

    Parameters
    ----------
    text : str
        Input text

    Returns
    -------
    np.ndarray
        An array in (sequence_length, embed_size) shape
    """
    if not text:
        return None

    b = text.encode("utf-8", errors="ignore")

    result = np.zeros((len(b), self.embed_size))
    for i, byte in enumerate(b):
        result[i] = byte + len(self.special_tokens) + 1

    return result

Inherited members

class GBST (embed_size: int = 256, max_block_size: int = 4, downsampling_factor: int = 2, score_calibration: bool = True, vocab_size: int = 256)

Gradient-based Subword Tokenization module from the paper: Charformer: Fast Character Transformers via Gradient-based Subword Tokenization.

Parameters

embed_size : int, optional
The embedding size for each byte/character, by default 256
max_block_size : int, optional
Candidate subword blocks of every length from 1 to max_block_size are considered, by default 4
downsampling_factor : int, optional
Downsampling rate from the byte sequence to the final sequence, by default 2
score_calibration : bool, optional
Whether to calibrate the block scores with a self-attention-like step, by default True
vocab_size : int, optional
The size of the byte vocabulary, by default 256

Examples

>>> model = GBST(
...     embed_size=128,
...     max_block_size=4,
...     downsampling_factor=2,
...     score_calibration=True,
...     vocab_size=256,
... )
>>> tokenizer = ByteTokenizer()
>>> results = tokenizer(["Life is like a box of chocolates.", "Coding is fun."], add_special_tokens=True)
>>> results["input_ids"].shape
(2, 1024)
>>> hidden = model(torch.tensor(results["input_ids"]).long())
>>> hidden.shape
torch.Size([2, 512, 128])
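
The 512 positions follow from the pooling setup: the tokenizer pads to max_length (1024 by default) and the final AvgPool1d uses ceil_mode, so the output length is ceil(padded_length / downsampling_factor). A quick check of that arithmetic:

import math
assert math.ceil(1024 / 2) == 512   # matches torch.Size([2, 512, 128]) above
# with downsampling_factor=4 the same inputs would give math.ceil(1024 / 4) == 256 positions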

Initializes internal Module state, shared by both nn.Module and ScriptModule.

Expand source code
class GBST(nn.Module):
    """Gradient-based Subword Tokenization module from the paper:
    Charformer: Fast Character Transformers via Gradient-based Subword Tokenization.

    Parameters
    ----------
    embed_size : int, optional
        The embedding size for each byte/character, by default 256
    max_block_size : int, optional
        Candidate subword blocks of every length from 1 to max_block_size are considered, by default 4
    downsampling_factor : int, optional
        Downsampling rate from the byte sequence to the final sequence, by default 2
    score_calibration : bool, optional
        Whether to calibrate the block scores with a self-attention-like step, by default True
    vocab_size : int, optional
        The size of the byte vocabulary, by default 256

    Examples
    --------
    >>> model = GBST(
    ...     embed_size=128,
    ...     max_block_size=4,
    ...     downsampling_factor=2,
    ...     score_calibration=True,
    ...     vocab_size=256,
    ... )
    >>> tokenizer = ByteTokenizer()
    >>> results = tokenizer(["Life is like a box of chocolates.", "Coding is fun."], add_special_tokens=True)
    >>> results["input_ids"].shape
    (2, 1024)
    >>> hidden = model(torch.tensor(results["input_ids"]).long())
    >>> hidden.shape
    torch.Size([2, 512, 128])
    """

    def __init__(
        self,
        embed_size: int = 256,
        max_block_size: int = 4,
        downsampling_factor: int = 2,
        score_calibration: bool = True,
        vocab_size: int = 256,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.max_block_size = max_block_size
        self.score_calibration = score_calibration
        self.downsampling_factor = downsampling_factor
        self.embed_size = embed_size

        self.byte_embedding = nn.Embedding(
            self.vocab_size, self.embed_size, padding_idx=0
        )
        self.block_position_embedding = PositionalEncoding(
            self.embed_size, max_len=self.max_block_size
        )

        self.avg_pools = nn.ModuleDict(
            {
                str(i): nn.AvgPool1d(i, ceil_mode=True)
                for i in range(1, self.max_block_size + 1)
            }
        )
        self.block_scorer = nn.Linear(self.embed_size, 1)
        self.down_sampler = nn.AvgPool1d(self.downsampling_factor, ceil_mode=True)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input):

        byte_embeddings = self.byte_embedding(input)
        sequence_length = byte_embeddings.shape[1]

        Xs = []
        X_scores = []

        for block_size in range(1, self.max_block_size + 1):
            positioned_embeddings = rearrange(byte_embeddings, "b l h -> b h l")
            positioned_embeddings = self.block_position_embedding(positioned_embeddings)

            # b h s
            Xb = self.avg_pools[str(block_size)](positioned_embeddings)
            # b 1 s
            Xb_scores = rearrange(
                self.block_scorer(rearrange(Xb, "b h s -> b s h")), "b s 1 -> b 1 s"
            )
            # b h l
            Xb_ = Xb.repeat_interleave(repeats=block_size, dim=2)
            # b 1 l
            Xb_scores_ = Xb_scores.repeat_interleave(repeats=block_size, dim=2)

            Xs.append(Xb_[:, :, :sequence_length])
            X_scores.append(Xb_scores_[:, :, :sequence_length])

        # b M l
        scores = torch.cat(X_scores, dim=1)
        # b l M 1
        scores = rearrange(torch.softmax(scores, dim=1), "b M l -> b l M 1")

        if self.score_calibration:
            # b l M 1
            scores = (
                torch.softmax(scores @ rearrange(scores, "b l M 1 -> b l 1 M"), dim=-1)
                @ scores
            )

        # b l h M
        Xs = rearrange(torch.stack(Xs, dim=0), "M b h l -> b l h M")
        Xs = rearrange(Xs @ scores, "b l h 1 -> b h l")
        Xs = rearrange(self.down_sampler(Xs), "b h s -> b s h")

        return Xs

Ancestors

  • torch.nn.modules.module.Module

Class variables

var dump_patches : bool
var training : bool

Methods

def forward(self, input) ‑> torch.Tensor

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Although the recipe for the forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this, since the former takes care of running the registered hooks while the latter silently ignores them.
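
In practice that means calling the module object from the class-level example rather than this method directly; input_ids below stands for the tensor built in that doctest:

hidden = model(input_ids)             # preferred: runs any registered hooks
# hidden = model.forward(input_ids)   # computes the same thing but silently skips hooks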

Expand source code
def forward(self, input):

    byte_embeddings = self.byte_embedding(input)
    sequence_length = byte_embeddings.shape[1]

    Xs = []
    X_scores = []

    for block_size in range(1, self.max_block_size + 1):
        positioned_embeddings = rearrange(byte_embeddings, "b l h -> b h l")
        positioned_embeddings = self.block_position_embedding(positioned_embeddings)

        # b h s
        Xb = self.avg_pools[str(block_size)](positioned_embeddings)
        # b 1 s
        Xb_scores = rearrange(
            self.block_scorer(rearrange(Xb, "b h s -> b s h")), "b s 1 -> b 1 s"
        )
        # b h l
        Xb_ = Xb.repeat_interleave(repeats=block_size, dim=2)
        # b 1 l
        Xb_scores_ = Xb_scores.repeat_interleave(repeats=block_size, dim=2)

        Xs.append(Xb_[:, :, :sequence_length])
        X_scores.append(Xb_scores_[:, :, :sequence_length])

    # b M l
    scores = torch.cat(X_scores, dim=1)
    # b l M 1
    scores = rearrange(torch.softmax(scores, dim=1), "b M l -> b l M 1")

    if self.score_calibration:
        # b l M 1
        scores = (
            torch.softmax(scores @ rearrange(scores, "b l M 1 -> b l 1 M"), dim=-1)
            @ scores
        )

    # b l h M
    Xs = rearrange(torch.stack(Xs, dim=0), "M b h l -> b l h M")
    Xs = rearrange(Xs @ scores, "b l h 1 -> b h l")
    Xs = rearrange(self.down_sampler(Xs), "b h s -> b s h")

    return Xs