Module text_embeddings.byte
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2021-07-18 14:36:09
# @Author : Chenghao Mou (mouchenghao@gmail.com)
from text_embeddings.byte.byt5 import ByT5Tokenizer
from text_embeddings.byte.charformer import ByteTokenizer, GBST
__all__ = ['ByT5Tokenizer', 'GBST', 'ByteTokenizer']
Sub-modules
text_embeddings.byte.byt5
    From "ByT5: Towards a token-free future with pre-trained byte-to-byte models".
text_embeddings.byte.charformer
    From the paper "Charformer: Fast Character Transformers via Gradient-based Subword Tokenization".
Classes
class ByT5Tokenizer (embed_size: int = 259, model_input_names: Union[List[str], NoneType] = None, special_tokens: Union[Dict[str, numpy.ndarray], NoneType] = None, max_length: Union[int, NoneType] = 1024)
Embed text into byte sequences. This is different from other tokenizers because it still has a small vocabulary where each byte is mapped to an index.
Parameters
embed_size : int, optional
    The size of the embedding, by default 259 (256 + 3 special tokens)
model_input_names : Optional[List[str]], optional
    Required inputs of the downstream model, by default it uses the same names as BERT: ["input_ids", "token_type_ids", "attention_mask"]
special_tokens : Optional[Dict[str, np.ndarray]], optional
    Special tokens for the downstream model, by default it uses the same special tokens as BERT: {"CLS": "[CLS]", "SEP": "[SEP]"}
max_length : Optional[int], optional
    Maximum character length, by default 1024
Examples
>>> tokenizer = ByT5Tokenizer()
>>> e = tokenizer.text2embeddings("This is a test message")
>>> e.shape
(22, 259)
>>> np.equal(np.max(e, axis=1), np.ones((len(e)))).all()
True
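As a rough sketch of what the one-hot rows look like (assuming the package is importable as text_embeddings.byte, per the __all__ above), each byte is shifted by 3 so that indices 0, 1, and 2 stay reserved for the PAD, CLS, and SEP rows created in __init__:

from text_embeddings.byte import ByT5Tokenizer

tokenizer = ByT5Tokenizer()
e = tokenizer.text2embeddings("hi")

# Two bytes in, two one-hot rows out; the hot index is the byte value + 3.
assert e.shape == (2, 259)
assert e[0].argmax() == ord("h") + 3  # 107
assert e[1].argmax() == ord("i") + 3  # 108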
class ByT5Tokenizer(EmbeddingTokenizer):
    """Embed text into byte sequences. This is different from other tokenizers because it still has a small vocabulary where each byte is mapped to an index.

    Parameters
    ----------
    embed_size : int, optional
        The size of the embedding, by default 259 (256 + 3 special tokens)
    model_input_names : Optional[List[str]], optional
        Required inputs of the downstream model, by default it uses the same names as BERT: ["input_ids", "token_type_ids", "attention_mask"]
    special_tokens : Optional[Dict[str, np.ndarray]], optional
        Special tokens for the downstream model, by default it uses the same special tokens as BERT: {"CLS": "[CLS]", "SEP": "[SEP]"}
    max_length : Optional[int], optional
        Maximum character length, by default 1024

    Examples
    --------
    >>> tokenizer = ByT5Tokenizer()
    >>> e = tokenizer.text2embeddings("This is a test message")
    >>> e.shape
    (22, 259)
    >>> np.equal(np.max(e, axis=1), np.ones((len(e)))).all()
    True
    """

    def __init__(
        self,
        embed_size: int = 259,
        model_input_names: Optional[List[str]] = None,
        special_tokens: Optional[Dict[str, np.ndarray]] = None,
        max_length: Optional[int] = 1024,
    ):
        super().__init__(model_input_names, special_tokens, max_length)
        self.embed_size = embed_size
        self.model_input_names = model_input_names
        self.special_tokens = special_tokens
        self.max_length = max_length

        if self.model_input_names is None:
            logger.warning(
                'Using default model_input_names values ["input_ids", "token_type_ids", "attention_mask"]'
            )
            self.model_input_names = ["input_ids", "token_type_ids", "attention_mask"]

        if self.special_tokens is None:
            logger.warning("Using default special_tokens values")
            self.special_tokens = {
                "SEP": np.zeros((self.embed_size,)),
                "CLS": np.zeros((self.embed_size,)),
            }
            self.special_tokens["CLS"][1] = 1
            self.special_tokens["SEP"][2] = 1

        logger.info("Be sure to add an embedding layer when using a ByT5Tokenizer.")

    def text2embeddings(self, text: str) -> np.ndarray:
        """Convert text into a numpy array, in (sequence_length, embed_size) shape.

        Parameters
        ----------
        text : str
            Input text

        Returns
        -------
        np.ndarray
            An array in (sequence_length, embed_size) shape
        """
        if not text:
            return None

        b = text.encode("utf-8", errors="ignore")
        result = np.zeros((len(b), self.embed_size))
        for i, byte in enumerate(b):
            result[i][byte + 3] = 1

        return result

    def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
        """Create a padding token embedding.

        Parameters
        ----------
        input_embeddings : np.ndarray, optional
            Embedded input, by default None

        Returns
        -------
        np.ndarray
            A padding token embedding compatible with the input
        """
        e = np.zeros((self.embed_size,))
        e[0] = 1
        return e
Ancestors
- EmbeddingTokenizer
- transformers.tokenization_utils_base.PreTrainedTokenizerBase
- transformers.tokenization_utils_base.SpecialTokensMixin
- transformers.utils.hub.PushToHubMixin
Class variables
var max_model_input_sizes : Dict[str, Union[int, NoneType]]
var model_input_names : List[str]
var padding_side : str
var pretrained_init_configuration : Dict[str, Dict[str, Any]]
var pretrained_vocab_files_map : Dict[str, Dict[str, str]]
var truncation_side : str
var vocab_files_names : Dict[str, str]
Methods
def create_padding_token_embedding(self, input_embeddings=None) -> numpy.ndarray
    Create a padding token embedding.
Parameters
input_embeddings : np.ndarray, optional
    Embedded input, by default None
Returns
np.ndarray
    A padding token embedding compatible with the input
def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
    """Create a padding token embedding.

    Parameters
    ----------
    input_embeddings : np.ndarray, optional
        Embedded input, by default None

    Returns
    -------
    np.ndarray
        A padding token embedding compatible with the input
    """
    e = np.zeros((self.embed_size,))
    e[0] = 1
    return e
def text2embeddings(self, text: str) -> numpy.ndarray
    Convert text into a numpy array, in (sequence_length, embed_size) shape.
Parameters
text : str
    Input text
Returns
np.ndarray
    An array in (sequence_length, embed_size) shape
def text2embeddings(self, text: str) -> np.ndarray:
    """Convert text into a numpy array, in (sequence_length, embed_size) shape.

    Parameters
    ----------
    text : str
        Input text

    Returns
    -------
    np.ndarray
        An array in (sequence_length, embed_size) shape
    """
    if not text:
        return None

    b = text.encode("utf-8", errors="ignore")
    result = np.zeros((len(b), self.embed_size))
    for i, byte in enumerate(b):
        result[i][byte + 3] = 1

    return result
Inherited members
class ByteTokenizer (model_input_names: Union[List[str], NoneType] = None, special_tokens: Union[Dict[str, numpy.ndarray], NoneType] = None, max_length: Union[int, NoneType] = 1024)
Embed text into byte sequences. This is different from other tokenizers because it still needs a small vocabulary where each byte is mapped to an index.
Parameters
model_input_names : Optional[List[str]], optional
    Required inputs of the downstream model, by default it uses the same names as BERT: ["input_ids", "token_type_ids", "attention_mask"]
special_tokens : Optional[Dict[str, np.ndarray]], optional
    Special tokens for the downstream model, by default it uses the same special tokens as BERT: {"CLS": "[CLS]", "SEP": "[SEP]"}
max_length : Optional[int], optional
    Maximum character length, by default 1024
Examples
>>> from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy
>>> tokenizer = ByteTokenizer()
>>> e = tokenizer.text2embeddings("This is a test message")
>>> e.shape
(22, 1)
>>> r = tokenizer(["This is a test message", "This is another test message"], padding=PaddingStrategy.LONGEST)
>>> r["input_ids"].shape
(2, 28)
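Since ByteTokenizer emits integer ids rather than vectors (and the logger reminds you to add an embedding layer), a minimal sketch of that downstream step might look like the following; the 260-entry table size and 128-dimensional width are arbitrary illustration choices, not part of this package:

import torch
import torch.nn as nn
from transformers.tokenization_utils_base import PaddingStrategy
from text_embeddings.byte import ByteTokenizer

tokenizer = ByteTokenizer()
batch = tokenizer(
    ["This is a test message", "This is another test message"],
    padding=PaddingStrategy.LONGEST,
)

# Ids are byte value + len(special_tokens) + 1, so the largest possible id is
# 255 + 3 = 258; a 260-entry table covers that, with padding_idx=0 for PAD.
embedding = nn.Embedding(num_embeddings=260, embedding_dim=128, padding_idx=0)
vectors = embedding(torch.tensor(batch["input_ids"]).long())
print(vectors.shape)  # torch.Size([2, 28, 128]), matching the (2, 28) ids above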
class ByteTokenizer(EmbeddingTokenizer):
    """Embed text into byte sequences. This is different from other tokenizers because it still needs a small vocabulary where each byte is mapped to an index.

    Parameters
    ----------
    model_input_names : Optional[List[str]], optional
        Required inputs of the downstream model, by default it uses the same names as BERT: ["input_ids", "token_type_ids", "attention_mask"]
    special_tokens : Optional[Dict[str, np.ndarray]], optional
        Special tokens for the downstream model, by default it uses the same special tokens as BERT: {"CLS": "[CLS]", "SEP": "[SEP]"}
    max_length : Optional[int], optional
        Maximum character length, by default 1024

    Examples
    --------
    >>> from transformers.tokenization_utils_base import PaddingStrategy, TruncationStrategy
    >>> tokenizer = ByteTokenizer()
    >>> e = tokenizer.text2embeddings("This is a test message")
    >>> e.shape
    (22, 1)
    >>> r = tokenizer(["This is a test message", "This is another test message"], padding=PaddingStrategy.LONGEST)
    >>> r["input_ids"].shape
    (2, 28)
    """

    def __init__(
        self,
        model_input_names: Optional[List[str]] = None,
        special_tokens: Optional[Dict[str, np.ndarray]] = None,
        max_length: Optional[int] = 1024,
    ):
        super().__init__(model_input_names, special_tokens, max_length)
        self.embed_size = 1
        self.model_input_names = model_input_names
        self.special_tokens = special_tokens
        self.max_length = max_length

        if self.model_input_names is None:
            logger.warning(
                'Using default model_input_names values ["input_ids", "token_type_ids", "attention_mask"]'
            )
            self.model_input_names = ["input_ids", "token_type_ids", "attention_mask"]

        if self.special_tokens is None:
            logger.warning("Using default special_tokens values")
            self.special_tokens = {
                "SEP": np.zeros((self.embed_size,)),
                "CLS": np.zeros((self.embed_size,)),
            }
            self.special_tokens["CLS"] = 1
            self.special_tokens["SEP"] = 2

        logger.info("Be sure to add an embedding layer when using a ByteTokenizer.")

    def text2embeddings(self, text: str) -> np.ndarray:
        """Convert text into a numpy array, in (sequence_length, embed_size) shape.

        Parameters
        ----------
        text : str
            Input text

        Returns
        -------
        np.ndarray
            An array in (sequence_length, embed_size) shape
        """
        if not text:
            return None

        b = text.encode("utf-8", errors="ignore")
        result = np.zeros((len(b), self.embed_size))
        for i, byte in enumerate(b):
            result[i] = byte + len(self.special_tokens) + 1

        return result

    def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
        """Create a padding token embedding.

        Parameters
        ----------
        input_embeddings : np.ndarray, optional
            Embedded input, by default None

        Returns
        -------
        np.ndarray
            A padding token embedding compatible with the input
        """
        e = np.zeros((self.embed_size,))
        return e

    def __call__(self, *args, **kwargs):
        results = super().__call__(*args, **kwargs)
        results["input_ids"] = np.squeeze(results["input_ids"], axis=-1)
        return results
Ancestors
- EmbeddingTokenizer
- transformers.tokenization_utils_base.PreTrainedTokenizerBase
- transformers.tokenization_utils_base.SpecialTokensMixin
- transformers.utils.hub.PushToHubMixin
Class variables
var max_model_input_sizes : Dict[str, Union[int, NoneType]]
var model_input_names : List[str]
var padding_side : str
var pretrained_init_configuration : Dict[str, Dict[str, Any]]
var pretrained_vocab_files_map : Dict[str, Dict[str, str]]
var truncation_side : str
var vocab_files_names : Dict[str, str]
Methods
def create_padding_token_embedding(self, input_embeddings=None) -> numpy.ndarray
    Create a padding token embedding.
Parameters
input_embeddings : np.ndarray, optional
    Embedded input, by default None
Returns
np.ndarray
    A padding token embedding compatible with the input
def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
    """Create a padding token embedding.

    Parameters
    ----------
    input_embeddings : np.ndarray, optional
        Embedded input, by default None

    Returns
    -------
    np.ndarray
        A padding token embedding compatible with the input
    """
    e = np.zeros((self.embed_size,))
    return e
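Note that this all-zero, length-1 padding row corresponds to token id 0, which lines up with the padding_idx=0 used by GBST's byte embedding table further down. A quick sanity check (a sketch, not part of the documented API):

from text_embeddings.byte import ByteTokenizer

tokenizer = ByteTokenizer()
pad = tokenizer.create_padding_token_embedding()

# embed_size is fixed to 1 for ByteTokenizer, so the pad "embedding" is
# a single zero, i.e. token id 0.
assert pad.shape == (1,)
assert pad[0] == 0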
def text2embeddings(self, text: str) -> numpy.ndarray
    Convert text into a numpy array, in (sequence_length, embed_size) shape.
Parameters
text : str
    Input text
Returns
np.ndarray
    An array in (sequence_length, embed_size) shape
def text2embeddings(self, text: str) -> np.ndarray:
    """Convert text into a numpy array, in (sequence_length, embed_size) shape.

    Parameters
    ----------
    text : str
        Input text

    Returns
    -------
    np.ndarray
        An array in (sequence_length, embed_size) shape
    """
    if not text:
        return None

    b = text.encode("utf-8", errors="ignore")
    result = np.zeros((len(b), self.embed_size))
    for i, byte in enumerate(b):
        result[i] = byte + len(self.special_tokens) + 1

    return result
Inherited members
class GBST (embed_size: int = 256, max_block_size: int = 4, downsampling_factor: int = 2, score_calibration: bool = True, vocab_size: int = 256)
Gradient-based Subword Tokenization module from the paper: Charformer: Fast Character Transformers via Gradient-based Subword Tokenization.
Parameters
embed_size : int, optional
    The embedding size for each byte/character, by default 256
max_block_size : int, optional
    Every subword token of length from 1 to max_block_size is considered, by default 4
downsampling_factor : int, optional
    Downsampling rate from the byte sequence to the final sequence, by default 2
score_calibration : bool, optional
    Whether to calibrate the scores with a self-attention-like step, by default True
vocab_size : int, optional
    The size of the byte vocabulary, by default 256
Examples
>>> model = GBST(
...     embed_size=128,
...     max_block_size=4,
...     downsampling_factor=2,
...     score_calibration=True,
...     vocab_size=256,
... )
>>> tokenizer = ByteTokenizer()
>>> results = tokenizer(["Life is like a box of chocolates.", "Coding is fun."], add_special_tokens=True)
>>> results["input_ids"].shape
(2, 1024)
>>> hidden = model(torch.tensor(results["input_ids"]).long())
>>> hidden.shape
torch.Size([2, 512, 128])
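To show where GBST sits in a model, here is a hedged sketch that chains the doctest above into a standard PyTorch encoder layer; the TransformerEncoderLayer and its hyperparameters are illustrative glue, not something this package provides:

import torch
import torch.nn as nn
from text_embeddings.byte import ByteTokenizer, GBST

tokenizer = ByteTokenizer()
model = GBST(embed_size=128, max_block_size=4, downsampling_factor=2,
             score_calibration=True, vocab_size=256)

batch = tokenizer(["Life is like a box of chocolates.", "Coding is fun."],
                  add_special_tokens=True)
hidden = model(torch.tensor(batch["input_ids"]).long())  # (2, 512, 128), as in the doctest

# GBST halves the 1024-byte sequence (downsampling_factor=2), so the encoder
# below attends over 512 positions instead of 1024 raw bytes.
encoder_layer = nn.TransformerEncoderLayer(d_model=128, nhead=8, batch_first=True)
encoded = encoder_layer(hidden)
print(encoded.shape)  # torch.Size([2, 512, 128])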
Initializes internal Module state, shared by both nn.Module and ScriptModule.
class GBST(nn.Module):
    """Gradient-based Subword Tokenization module from the paper: Charformer: Fast Character Transformers via Gradient-based Subword Tokenization.

    Parameters
    ----------
    embed_size : int, optional
        The embedding size for each byte/character, by default 256
    max_block_size : int, optional
        Every subword token of length from 1 to max_block_size is considered, by default 4
    downsampling_factor : int, optional
        Downsampling rate from the byte sequence to the final sequence, by default 2
    score_calibration : bool, optional
        Whether to calibrate the scores with a self-attention-like step, by default True
    vocab_size : int, optional
        The size of the byte vocabulary, by default 256

    Examples
    --------
    >>> model = GBST(
    ...     embed_size=128,
    ...     max_block_size=4,
    ...     downsampling_factor=2,
    ...     score_calibration=True,
    ...     vocab_size=256,
    ... )
    >>> tokenizer = ByteTokenizer()
    >>> results = tokenizer(["Life is like a box of chocolates.", "Coding is fun."], add_special_tokens=True)
    >>> results["input_ids"].shape
    (2, 1024)
    >>> hidden = model(torch.tensor(results["input_ids"]).long())
    >>> hidden.shape
    torch.Size([2, 512, 128])
    """

    def __init__(
        self,
        embed_size: int = 256,
        max_block_size: int = 4,
        downsampling_factor: int = 2,
        score_calibration: bool = True,
        vocab_size: int = 256,
    ):
        super().__init__()
        self.vocab_size = vocab_size
        self.max_block_size = max_block_size
        self.score_calibration = score_calibration
        self.downsampling_factor = downsampling_factor
        self.embed_size = embed_size

        self.byte_embedding = nn.Embedding(
            self.vocab_size, self.embed_size, padding_idx=0
        )
        self.block_position_embedding = PositionalEncoding(
            self.embed_size, max_len=self.max_block_size
        )
        self.avg_pools = nn.ModuleDict(
            {
                str(i): nn.AvgPool1d(i, ceil_mode=True)
                for i in range(1, self.max_block_size + 1)
            }
        )
        self.block_scorer = nn.Linear(self.embed_size, 1)
        self.down_sampler = nn.AvgPool1d(self.downsampling_factor, ceil_mode=True)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input):
        byte_embeddings = self.byte_embedding(input)
        sequence_length = byte_embeddings.shape[1]

        Xs = []
        X_scores = []
        for block_size in range(1, self.max_block_size + 1):
            positioned_embeddings = rearrange(byte_embeddings, "b l h -> b h l")
            positioned_embeddings = self.block_position_embedding(positioned_embeddings)
            # b h s
            Xb = self.avg_pools[str(block_size)](positioned_embeddings)
            # b 1 s
            Xb_scores = rearrange(
                self.block_scorer(rearrange(Xb, "b h s -> b s h")), "b s 1 -> b 1 s"
            )
            # b h l
            Xb_ = Xb.repeat_interleave(repeats=block_size, dim=2)
            # b 1 l
            Xb_scores_ = Xb_scores.repeat_interleave(repeats=block_size, dim=2)

            Xs.append(Xb_[:, :, :sequence_length])
            X_scores.append(Xb_scores_[:, :, :sequence_length])

        # b M l
        scores = torch.cat(X_scores, dim=1)
        # b l M 1
        scores = rearrange(torch.softmax(scores, dim=1), "b M l -> b l M 1")

        if self.score_calibration:
            # b l M 1
            scores = (
                torch.softmax(scores @ rearrange(scores, "b l M 1 -> b l 1 M"), dim=-1)
                @ scores
            )

        # b l h M
        Xs = rearrange(torch.stack(Xs, dim=0), "M b h l -> b l h M")
        Xs = rearrange(Xs @ scores, "b l h 1 -> b h l")
        Xs = rearrange(self.down_sampler(Xs), "b h s -> b s h")

        return Xs
Ancestors
- torch.nn.modules.module.Module
Class variables
var dump_patches : bool
var training : bool
Methods
def forward(self, input) -> Callable[..., Any]
    Defines the computation performed at every call.
Should be overridden by all subclasses.
Note
Although the recipe for forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this, since the former takes care of running the registered hooks while the latter silently ignores them.
def forward(self, input):
    byte_embeddings = self.byte_embedding(input)
    sequence_length = byte_embeddings.shape[1]

    Xs = []
    X_scores = []
    for block_size in range(1, self.max_block_size + 1):
        positioned_embeddings = rearrange(byte_embeddings, "b l h -> b h l")
        positioned_embeddings = self.block_position_embedding(positioned_embeddings)
        # b h s
        Xb = self.avg_pools[str(block_size)](positioned_embeddings)
        # b 1 s
        Xb_scores = rearrange(
            self.block_scorer(rearrange(Xb, "b h s -> b s h")), "b s 1 -> b 1 s"
        )
        # b h l
        Xb_ = Xb.repeat_interleave(repeats=block_size, dim=2)
        # b 1 l
        Xb_scores_ = Xb_scores.repeat_interleave(repeats=block_size, dim=2)

        Xs.append(Xb_[:, :, :sequence_length])
        X_scores.append(Xb_scores_[:, :, :sequence_length])

    # b M l
    scores = torch.cat(X_scores, dim=1)
    # b l M 1
    scores = rearrange(torch.softmax(scores, dim=1), "b M l -> b l M 1")

    if self.score_calibration:
        # b l M 1
        scores = (
            torch.softmax(scores @ rearrange(scores, "b l M 1 -> b l 1 M"), dim=-1)
            @ scores
        )

    # b l h M
    Xs = rearrange(torch.stack(Xs, dim=0), "M b h l -> b l h M")
    Xs = rearrange(Xs @ scores, "b l h 1 -> b h l")
    Xs = rearrange(self.down_sampler(Xs), "b h s -> b s h")

    return Xs
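Because the final down-sampler is an AvgPool1d with ceil_mode=True, the output length is ceil(sequence_length / downsampling_factor). A small sketch of that arithmetic (the parameter values and random ids here are arbitrary illustration choices):

import math
import torch
from text_embeddings.byte import GBST

model = GBST(embed_size=64, max_block_size=4, downsampling_factor=3, vocab_size=256)
ids = torch.randint(1, 256, (2, 100))  # batch of 2, 100 byte ids each

out = model(ids)
print(out.shape)           # torch.Size([2, 34, 64])
print(math.ceil(100 / 3))  # 34 positions after ceil-mode average pooling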