Module text_embeddings.visual
Visual-information-based tokenizers.
Source code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2021-04-22 20:59:35
# @Author : Chenghao Mou (mouchenghao@gmail.com)
"""Visual information based tokenizers."""
from .vtr import VTRTokenizer, text2image
__all__ = ['VTRTokenizer', 'text2image']
Sub-modules
text_embeddings.visual.vtr
    Robust Open Vocabulary Translation from Visual Text Representations
Functions
def text2image(text: str, font: str, font_size: int = 14) ‑> Image
    Convert text into an image and return the image. Reference: https://gist.github.com/destan/5540702
Parameters
text : str
    Text to encode
font : str
    Name of the font to use
font_size : int, optional
    Size of the font, by default 14
Returns
Image
    Encoded image
Source code
def text2image(text: str, font: str, font_size: int = 14) -> Image:
    """Convert text into an image and return the image. Reference: https://gist.github.com/destan/5540702

    Parameters
    ----------
    text : str
        Text to encode
    font : str
        Name of the font to use
    font_size : int, optional
        Size of the font, by default 14

    Returns
    -------
    Image
        Encoded image
    """
    # Render at a slightly smaller size than the canvas height so glyphs fit,
    # with a floor of 8 px.
    image_font = ImageFont.truetype(font, max(font_size - 2, 8))
    text = text.replace("\n", " ")  # collapse newlines into a single line
    line_width, _ = image_font.getsize(text)
    # Grayscale ("L") canvas sized to the rendered line; text is drawn in white
    # on the default black background.
    img = Image.new("L", (line_width, font_size))
    draw = ImageDraw.Draw(img)
    draw.text(xy=(0, 0), text=text, fill="#FFFFFF", font=image_font)
    return img
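A minimal usage sketch (not from the library's docs). The font path below is an assumption that mirrors the VTRTokenizer default; point it at any TrueType file on your system:

from text_embeddings.visual import text2image

# Render one line of text as a grayscale strip; the .ttf path is illustrative.
img = text2image("Hello, world!", font="resources/Noto_Sans/NotoSans-Regular.ttf", font_size=14)
print(img.size)  # (rendered_line_width, 14): the height is fixed by font_size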
Classes
class VTRTokenizer (window_size: int = 10, stride: int = 10, font: str = 'resources/Noto_Sans/NotoSans-Regular.ttf', font_size: int = 14, model_input_names: List[str] = None, special_tokens: Union[Dict[str, numpy.ndarray], NoneType] = None, max_length: Union[int, NoneType] = 25)
    Render the text into a series of image blocks. Reference: VTR
Parameters
window_size : int, optional
    The width of the image window, by default 10
stride : int, optional
    The stride used to generate image windows, by default 10
font : str, optional
    Path to the font file, by default "resources/Noto_Sans/NotoSans-Regular.ttf"
font_size : int, optional
    The size of the font in pixels; it may be smaller than the actual image height, by default 14
model_input_names : List[str], optional
    Required inputs of the downstream model; by default it uses the same names as BERT: ["input_ids", "token_type_ids", "attention_mask"]
special_tokens : Optional[Dict[str, np.ndarray]], optional
    Special tokens for the downstream model; by default it uses the same special tokens as BERT: {"CLS": "[CLS]", "SEP": "[SEP]"}
max_length : Optional[int], optional
    Maximum sequence length, by default 25
Examples
>>> from text_embeddings.visual import VTRTokenizer
>>> from transformers.tokenization_utils_base import *
>>> tokenizer = VTRTokenizer()
>>> results = tokenizer(text=['This is a sentence.', 'This is another sentence.'], padding=PaddingStrategy.LONGEST, truncation="longest_first", add_special_tokens=False)
>>> assert results['input_ids'].shape == (2, 13, 14, 10), results['input_ids'].shape
Source code
class VTRTokenizer(EmbeddingTokenizer):
    """
    Render the text into a series of image blocks. Reference [VTR](https://t.co/l9E6rL8O5p?amp=1)

    Parameters
    ----------
    window_size : int, optional
        The width of the image window, by default 10
    stride : int, optional
        The stride used to generate image windows, by default 10
    font : str, optional
        Path to the font file, by default "resources/Noto_Sans/NotoSans-Regular.ttf"
    font_size : int, optional
        The size of the font in pixels; it may be smaller than the actual image height, by default 14
    model_input_names : List[str], optional
        Required inputs of the downstream model; by default it uses the same names as BERT: ["input_ids", "token_type_ids", "attention_mask"]
    special_tokens : Optional[Dict[str, np.ndarray]], optional
        Special tokens for the downstream model; by default it uses the same special tokens as BERT: {"CLS": "[CLS]", "SEP": "[SEP]"}
    max_length : Optional[int], optional
        Maximum sequence length, by default 25

    Examples
    --------
    >>> from text_embeddings.visual import VTRTokenizer
    >>> from transformers.tokenization_utils_base import *
    >>> tokenizer = VTRTokenizer()
    >>> results = tokenizer(text=['This is a sentence.', 'This is another sentence.'], padding=PaddingStrategy.LONGEST, truncation="longest_first", add_special_tokens=False)
    >>> assert results['input_ids'].shape == (2, 13, 14, 10), results['input_ids'].shape
    """

    def __init__(
        self,
        window_size: int = 10,
        stride: int = 10,
        font: str = "resources/Noto_Sans/NotoSans-Regular.ttf",
        font_size: int = 14,
        model_input_names: List[str] = None,
        special_tokens: Optional[Dict[str, np.ndarray]] = None,
        max_length: Optional[int] = 25,
    ):
        super().__init__(model_input_names, special_tokens, max_length)
        self.font_size = font_size
        self.window_size = window_size
        self.stride = stride
        self.font = font

        if self.model_input_names is None:
            logger.warning('Using default model_input_names values ["input_ids", "token_type_ids", "attention_mask"]')
            self.model_input_names = ["input_ids", "token_type_ids", "attention_mask"]

    def text2embeddings(self, text: str) -> np.ndarray:
        """Convert text into a numpy array, in (sequence_length, font_size, window_size) shape.

        Parameters
        ----------
        text : str
            Input text

        Returns
        -------
        np.ndarray
            An array in (sequence_length, height, width) shape
        """
        if not text:
            return None

        # Render the whole line as one image, then slice it into
        # fixed-width windows with the configured stride.
        image = text2image(text, font=self.font, font_size=self.font_size)
        image_array = np.asarray(image)

        return np.squeeze(
            sliding_window_view(image_array, (image_array.shape[0], min(self.window_size, image_array.shape[1]))),
            axis=0,
        )[:: self.stride]

    def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
        """Create a padding token embedding for an empty window.

        Parameters
        ----------
        input_embeddings : np.ndarray, optional
            Embeddings already encoded, by default None

        Returns
        -------
        np.ndarray
            An empty array in (font_size, window_size) shape
        """
        # len(input_embeddings[0]) recovers the image height of the encoded windows.
        return np.zeros((len(input_embeddings[0]), self.window_size))
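To see what the windowing inside text2embeddings produces, here is a self-contained sketch of the same numpy slicing, with an invented 14x125 array standing in for a rendered line (real widths depend on the text and font):

import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

image_array = np.zeros((14, 125), dtype=np.uint8)  # stand-in for a rendered line
window_size, stride = 10, 10

# Full-height windows of width window_size, one per starting column...
windows = np.squeeze(
    sliding_window_view(image_array, (image_array.shape[0], window_size)),
    axis=0,
)
# ...then keep every stride-th window, as VTRTokenizer.text2embeddings does.
print(windows[::stride].shape)  # (12, 14, 10)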
Ancestors
- EmbeddingTokenizer
- transformers.tokenization_utils_base.PreTrainedTokenizerBase
- transformers.tokenization_utils_base.SpecialTokensMixin
- transformers.utils.hub.PushToHubMixin
Class variables
var max_model_input_sizes : Dict[str, Union[int, NoneType]]
var model_input_names : List[str]
var padding_side : str
var pretrained_init_configuration : Dict[str, Dict[str, Any]]
var pretrained_vocab_files_map : Dict[str, Dict[str, str]]
var truncation_side : str
var vocab_files_names : Dict[str, str]
Methods
def create_padding_token_embedding(self, input_embeddings=None) ‑> numpy.ndarray
    Create a padding token embedding for an empty window.
Parameters
input_embeddings : np.ndarray, optional
    Embeddings already encoded, by default None
Returns
np.ndarray
    An empty array in (font_size, window_size) shape
Source code
def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
    """Create a padding token embedding for an empty window.

    Parameters
    ----------
    input_embeddings : np.ndarray, optional
        Embeddings already encoded, by default None

    Returns
    -------
    np.ndarray
        An empty array in (font_size, window_size) shape
    """
    # len(input_embeddings[0]) recovers the image height of the encoded windows.
    return np.zeros((len(input_embeddings[0]), self.window_size))
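A small sketch of the shape logic, using a hypothetical batch of already-encoded 14x10 windows:

import numpy as np

encoded = np.ones((13, 14, 10))        # hypothetical output of text2embeddings
pad = np.zeros((len(encoded[0]), 10))  # window_size = 10
print(pad.shape)                       # (14, 10): one all-zero (blank) window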
def text2embeddings(self, text: str) ‑> numpy.ndarray
    Convert text into a numpy array, in (sequence_length, font_size, window_size) shape.
Parameters
text : str
    Input text
Returns
np.ndarray
    An array in (sequence_length, height, width) shape
Source code
def text2embeddings(self, text: str) -> np.ndarray:
    """Convert text into a numpy array, in (sequence_length, font_size, window_size) shape.

    Parameters
    ----------
    text : str
        Input text

    Returns
    -------
    np.ndarray
        An array in (sequence_length, height, width) shape
    """
    if not text:
        return None

    # Render the whole line as one image, then slice it into
    # fixed-width windows with the configured stride.
    image = text2image(text, font=self.font, font_size=self.font_size)
    image_array = np.asarray(image)

    return np.squeeze(
        sliding_window_view(image_array, (image_array.shape[0], min(self.window_size, image_array.shape[1]))),
        axis=0,
    )[:: self.stride]
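A hedged end-to-end sketch, assuming the default font file ships with the package at the path shown in the constructor signature:

from text_embeddings.visual import VTRTokenizer

tokenizer = VTRTokenizer()
windows = tokenizer.text2embeddings("This is a sentence.")
# Shape: (sequence_length, 14, 10); sequence_length depends on the rendered text width.
print(windows.shape)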
Inherited members