Module text_embeddings.hash
Hash-related tokenizers.
Source code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2021-04-22 20:58:54
# @Author  : Chenghao Mou (mouchenghao@gmail.com)
"""Hash related tokenizers."""
from .canine import CANINETokenizer
from .pqrnn import PQRNNTokenizer
__all__ = ['PQRNNTokenizer', 'CANINETokenizer']

Sub-modules
- text_embeddings.hash.canine
  From CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation.
- text_embeddings.hash.pqrnn
- text_embeddings.hash.util
  Python translation of the original code in TensorFlow.
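The tokenizers in this module delegate the actual hashing to a murmurhash helper in text_embeddings.hash.util, whose source is not reproduced on this page. As a rough sketch of the idea, and only a sketch: a string is hashed into feature_size bits, and pairs of bits are collapsed into feature_size / 2 signed values, which would be consistent with the feature_size=hash_size * 2 calls in the class sources below. The helper name toy_murmurhash, the use of SHA-256 in place of MurmurHash, and the exact bit-to-value mapping here are illustrative assumptions, not the library's implementation.

import hashlib
import numpy as np

def toy_murmurhash(token: str, feature_size: int = 1536) -> np.ndarray:
    """Hash a string into feature_size bits, then collapse bit pairs into feature_size // 2 values."""
    n_bytes = (feature_size + 7) // 8
    digest = b""
    counter = 0
    while len(digest) < n_bytes:  # chain hashes until we have enough bits
        digest += hashlib.sha256(f"{counter}:{token}".encode()).digest()
        counter += 1
    bits = np.unpackbits(np.frombuffer(digest[:n_bytes], dtype=np.uint8))[:feature_size]
    pairs = bits.reshape(-1, 2)
    # Each bit pair (b0, b1) becomes b0 - b1, i.e. a value in {-1, 0, 1}.
    return (pairs[:, 0].astype(np.int8) - pairs[:, 1].astype(np.int8)).astype(np.float64)

vec = toy_murmurhash("hello", feature_size=2 * 768)
assert vec.shape == (768,)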
Classes
- class CANINETokenizer (hash_size: int = 768, model_input_names: Union[List[str], NoneType] = None, special_tokens: Union[Dict[str, numpy.ndarray], NoneType] = None, max_length: Union[int, NoneType] = 2048)
A character-hashing tokenizer/embedder from CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation.

Parameters
- hash_size : int, optional
  The embedding size of each character, by default 768.
- model_input_names : Optional[List[str]], optional
  Required inputs of the downstream model; by default the same names as BERT: ["input_ids", "token_type_ids", "attention_mask"].
- special_tokens : Optional[Dict[str, np.ndarray]], optional
  Special tokens for the downstream model; by default the same special tokens as BERT: {"CLS": "[CLS]", "SEP": "[SEP]"}.
- max_length : Optional[int], optional
  Maximum character length, by default 2048.
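All four parameters are optional. A brief sketch of overriding the defaults, reusing the call pattern from the example below; the smaller hash_size of 128 is an illustrative choice, assuming the hashing helper accepts any even feature size:

from text_embeddings.hash import CANINETokenizer
from transformers.tokenization_utils_base import PaddingStrategy

# Assumption: non-default hash_size values are supported by the hashing helper.
tokenizer = CANINETokenizer(hash_size=128, max_length=512)
results = tokenizer(
    text=["hello world"],
    padding=PaddingStrategy.LONGEST,
    truncation="longest_first",
    add_special_tokens=False,
)
# "hello world" is 11 characters, each hashed into a 128-dimensional row.
assert results["input_ids"].shape == (1, 11, 128)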
Examples

>>> from text_embeddings.hash import CANINETokenizer
>>> from transformers.tokenization_utils_base import *
>>> tokenizer = CANINETokenizer()
>>> results = tokenizer(text=['This is a sentence.', 'This is another sentence.'], padding=PaddingStrategy.LONGEST, truncation="longest_first", add_special_tokens=False)
>>> assert results['input_ids'].shape == (2, 25, 768), results['input_ids'].shape

Source code

class CANINETokenizer(EmbeddingTokenizer):
    """
    A character-hashing tokenizer/embedder from [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874)

    Parameters
    ----------
    hash_size : int, optional
        The embedding size of each character, by default 768
    model_input_names : Optional[List[str]], optional
        Required inputs of the downstream model, by default it uses the same names as BERT — ["input_ids", "token_type_ids", "attention_mask"]
    special_tokens : Optional[Dict[str, np.ndarray]], optional
        Special tokens for the downstream model, by default it uses the same special tokens as BERT — {"CLS": "[CLS]", "SEP": "[SEP]"}
    max_length : Optional[int], optional
        Maximum character length, by default 2048

    Examples
    --------
    >>> from text_embeddings.hash import CANINETokenizer
    >>> from transformers.tokenization_utils_base import *
    >>> tokenizer = CANINETokenizer()
    >>> results = tokenizer(text=['This is a sentence.', 'This is another sentence.'], padding=PaddingStrategy.LONGEST, truncation="longest_first", add_special_tokens=False)
    >>> assert results['input_ids'].shape == (2, 25, 768), results['input_ids'].shape
    """

    def __init__(
        self,
        hash_size: int = 768,
        model_input_names: Optional[List[str]] = None,
        special_tokens: Optional[Dict[str, np.ndarray]] = None,
        max_length: Optional[int] = 2048,
    ):
        super().__init__(model_input_names, special_tokens, max_length)
        self.hash_size = hash_size
        self.model_input_names = model_input_names
        self.special_tokens = special_tokens
        self.max_length = max_length

        if self.model_input_names is None:
            logger.warning(
                'Using default model_input_names values ["input_ids", "token_type_ids", "attention_mask"]'
            )
            self.model_input_names = ["input_ids", "token_type_ids", "attention_mask"]

    def text2embeddings(self, text: str) -> np.ndarray:
        """Convert text into a numpy array of shape (sequence_length, hash_size).

        Parameters
        ----------
        text : str
            Input text

        Returns
        -------
        np.ndarray
            An array of shape (sequence_length, hash_size)
        """
        if not text:
            return None

        result = np.zeros((len(text), self.hash_size))
        for i, char in enumerate(text):
            result[i] = murmurhash(char, feature_size=self.hash_size * 2)

        return result

    def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
        """Create a padding token embedding.

        Parameters
        ----------
        input_embeddings : [type], optional
            Embeddings already encoded, by default None

        Returns
        -------
        np.ndarray
            An embedding array of shape (hash_size,)
        """
        return np.zeros((self.hash_size,))

Ancestors
- EmbeddingTokenizer
- transformers.tokenization_utils_base.PreTrainedTokenizerBase
- transformers.tokenization_utils_base.SpecialTokensMixin
- transformers.utils.hub.PushToHubMixin
Class variables
- var max_model_input_sizes : Dict[str, Union[int, NoneType]]
- var model_input_names : List[str]
- var padding_side : str
- var pretrained_init_configuration : Dict[str, Dict[str, Any]]
- var pretrained_vocab_files_map : Dict[str, Dict[str, str]]
- var truncation_side : str
- var vocab_files_names : Dict[str, str]
Methods
- def create_padding_token_embedding(self, input_embeddings=None) ‑> numpy.ndarray
Create a padding token embedding.

Parameters
- input_embeddings : [type], optional
  Embeddings already encoded, by default None

Returns
- np.ndarray
  An embedding array of shape (hash_size,)
Source code

def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
    """Create a padding token embedding.

    Parameters
    ----------
    input_embeddings : [type], optional
        Embeddings already encoded, by default None

    Returns
    -------
    np.ndarray
        An embedding array of shape (hash_size,)
    """
    return np.zeros((self.hash_size,))
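The padding embedding is all zeros, so in a LONGEST-padded batch the trailing positions of the shorter sequence should be zero rows. A minimal, self-contained check of that behaviour, under the assumption that the EmbeddingTokenizer base class fills padded positions with the value returned by this method:

import numpy as np
from text_embeddings.hash import CANINETokenizer
from transformers.tokenization_utils_base import PaddingStrategy

tokenizer = CANINETokenizer()
results = tokenizer(
    text=["This is a sentence.", "This is another sentence."],  # 19 and 25 characters
    padding=PaddingStrategy.LONGEST,
    truncation="longest_first",
    add_special_tokens=False,
)
# The shorter sentence is padded from 19 to 25 positions; positions 19..24 are
# expected to hold the all-zero padding embedding (base-class assumption above).
padded_tail = np.asarray(results["input_ids"])[0, 19:, :]
assert padded_tail.shape == (6, 768)
assert np.allclose(padded_tail, 0.0)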
- def text2embeddings(self, text: str) ‑> numpy.ndarray
Convert text into a numpy array of shape (sequence_length, hash_size).

Parameters
- text : str
  Input text

Returns
- np.ndarray
  An array of shape (sequence_length, hash_size)
Source code

def text2embeddings(self, text: str) -> np.ndarray:
    """Convert text into a numpy array of shape (sequence_length, hash_size).

    Parameters
    ----------
    text : str
        Input text

    Returns
    -------
    np.ndarray
        An array of shape (sequence_length, hash_size)
    """
    if not text:
        return None

    result = np.zeros((len(text), self.hash_size))
    for i, char in enumerate(text):
        result[i] = murmurhash(char, feature_size=self.hash_size * 2)

    return result
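As the source shows, each character is hashed independently of its neighbours, so the same character always produces the same row. A short, self-contained check of that property (default hash_size of 768 assumed):

import numpy as np
from text_embeddings.hash import CANINETokenizer

tokenizer = CANINETokenizer()
emb = tokenizer.text2embeddings("abca")
assert emb.shape == (4, 768)
assert np.array_equal(emb[0], emb[3])      # both rows hash the character 'a'
assert not np.array_equal(emb[0], emb[1])  # 'a' and 'b' collide only with negligible probability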
 Inherited members
- class PQRNNTokenizer (hash_size: int = 768, model_input_names: Union[List[str], NoneType] = None, special_tokens: Union[Dict[str, numpy.ndarray], NoneType] = None, max_length: Union[int, NoneType] = 2048)
Boundary-based hashing embeddings based on PQRNN.

Parameters
- hash_size : int, optional
  The size of the hashing embedding, by default 768.
- model_input_names : Optional[List[str]], optional
  Required inputs of the downstream model; by default the same names as BERT: ["input_ids", "token_type_ids", "attention_mask"].
- special_tokens : Optional[Dict[str, np.ndarray]], optional
  Special tokens for the downstream model; by default the same special tokens as BERT: {"CLS": "[CLS]", "SEP": "[SEP]"}.
- max_length : Optional[int], optional
  Maximum token length, by default 2048.
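Unlike CANINETokenizer, which hashes individual characters, PQRNNTokenizer splits the input on whitespace and hashes whole tokens, so the sequence dimension counts words. That is why the doctest below asserts a shape of (2, 4, 768): each example sentence splits into 4 whitespace-delimited tokens. A quick check in plain Python:

>>> len("This is a sentence.".split(" "))
4
>>> len("This is another sentence.".split(" "))
4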
Examples

>>> from text_embeddings.hash import PQRNNTokenizer
>>> from transformers.tokenization_utils_base import *
>>> tokenizer = PQRNNTokenizer()
>>> results = tokenizer(text=['This is a sentence.', 'This is another sentence.'], padding=PaddingStrategy.LONGEST, truncation="longest_first", add_special_tokens=False)
>>> assert results['input_ids'].shape == (2, 4, 768)

Source code

class PQRNNTokenizer(EmbeddingTokenizer):
    """
    Boundary-based hashing embeddings based on [PQRNN](https://ai.googleblog.com/2020/09/advancing-nlp-with-efficient-projection.html)

    Parameters
    ----------
    hash_size : int, optional
        The size of the hashing embedding, by default 768
    model_input_names : Optional[List[str]], optional
        Required inputs of the downstream model, by default it uses the same names as BERT — ["input_ids", "token_type_ids", "attention_mask"]
    special_tokens : Optional[Dict[str, np.ndarray]], optional
        Special tokens for the downstream model, by default it uses the same special tokens as BERT — {"CLS": "[CLS]", "SEP": "[SEP]"}
    max_length : Optional[int], optional
        Maximum token length, by default 2048

    Examples
    --------
    >>> from text_embeddings.hash import PQRNNTokenizer
    >>> from transformers.tokenization_utils_base import *
    >>> tokenizer = PQRNNTokenizer()
    >>> results = tokenizer(text=['This is a sentence.', 'This is another sentence.'], padding=PaddingStrategy.LONGEST, truncation="longest_first", add_special_tokens=False)
    >>> assert results['input_ids'].shape == (2, 4, 768)
    """

    def __init__(
        self,
        hash_size: int = 768,
        model_input_names: Optional[List[str]] = None,
        special_tokens: Optional[Dict[str, np.ndarray]] = None,
        max_length: Optional[int] = 2048,
    ):
        super().__init__(model_input_names, special_tokens, max_length)
        self.hash_size = hash_size
        self.model_input_names = model_input_names
        self.special_tokens = special_tokens
        self.max_length = max_length

        if self.model_input_names is None:
            logger.warning('Using default model_input_names values ["input_ids", "token_type_ids", "attention_mask"]')
            self.model_input_names = ["input_ids", "token_type_ids", "attention_mask"]

    def text2embeddings(self, text: str) -> np.ndarray:
        """Convert text into a numpy array of shape (sequence_length, hash_size).

        Parameters
        ----------
        text : str
            Input text

        Returns
        -------
        np.ndarray
            An array of shape (sequence_length, hash_size)
        """
        if not text:
            return None

        tokens = text.split(" ")
        result = np.zeros((len(tokens), self.hash_size))
        for i, token in enumerate(tokens):
            result[i] = murmurhash(token, feature_size=self.hash_size * 2)

        return result

    def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
        """Create a padding token embedding.

        Parameters
        ----------
        input_embeddings : [type], optional
            Embeddings already encoded, by default None

        Returns
        -------
        np.ndarray
            An empty embedding of shape (hash_size,)
        """
        return np.zeros((self.hash_size,))

Ancestors
- EmbeddingTokenizer
- transformers.tokenization_utils_base.PreTrainedTokenizerBase
- transformers.tokenization_utils_base.SpecialTokensMixin
- transformers.utils.hub.PushToHubMixin
Class variables
- var max_model_input_sizes : Dict[str, Union[int, NoneType]]
- var model_input_names : List[str]
- var padding_side : str
- var pretrained_init_configuration : Dict[str, Dict[str, Any]]
- var pretrained_vocab_files_map : Dict[str, Dict[str, str]]
- var truncation_side : str
- var vocab_files_names : Dict[str, str]
Methods
- def create_padding_token_embedding(self, input_embeddings=None) ‑> numpy.ndarray
Create a padding token embedding.

Parameters
- input_embeddings : [type], optional
  Embeddings already encoded, by default None

Returns
- np.ndarray
  An empty embedding of shape (hash_size,)
Source code

def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
    """Create a padding token embedding.

    Parameters
    ----------
    input_embeddings : [type], optional
        Embeddings already encoded, by default None

    Returns
    -------
    np.ndarray
        An empty embedding of shape (hash_size,)
    """
    return np.zeros((self.hash_size,))
- def text2embeddings(self, text: str) ‑> numpy.ndarray
Convert text into a numpy array of shape (sequence_length, hash_size).

Parameters
- text : str
  Input text

Returns
- np.ndarray
  An array of shape (sequence_length, hash_size)
Source code

def text2embeddings(self, text: str) -> np.ndarray:
    """Convert text into a numpy array of shape (sequence_length, hash_size).

    Parameters
    ----------
    text : str
        Input text

    Returns
    -------
    np.ndarray
        An array of shape (sequence_length, hash_size)
    """
    if not text:
        return None

    tokens = text.split(" ")
    result = np.zeros((len(tokens), self.hash_size))
    for i, token in enumerate(tokens):
        result[i] = murmurhash(token, feature_size=self.hash_size * 2)

    return result
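A short, self-contained usage sketch of this method (default hash_size of 768 assumed); "hello world" splits into two whitespace tokens, so the result has two rows:

>>> from text_embeddings.hash import PQRNNTokenizer
>>> tokenizer = PQRNNTokenizer()
>>> tokenizer.text2embeddings("hello world").shape
(2, 768)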
 Inherited members