Module text_embeddings.visual.vtr
Robust Open-Vocabulary Translation from Visual Text Representations
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2021-04-17 08:08:04
# @Author : Chenghao Mou (mouchenghao@gmail.com)
# @Description : From Robust Open-Vocabulary Translation from Visual Text Representations
"""Robust Open Vocabulary Translation from Visual Text Representations"""
from typing import List, Optional, Dict
import numpy as np
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from numpy.lib.stride_tricks import sliding_window_view
from loguru import logger
from text_embeddings.base import EmbeddingTokenizer
def text2image(text: str, font: str, font_size: int = 14) -> Image.Image:
    """Convert text into an image and return the image. Reference: https://gist.github.com/destan/5540702

    Parameters
    ----------
    text : str
        Text to encode
    font : str
        Name of the font to use
    font_size : int, optional
        Size of the font, by default 14

    Returns
    -------
    Image.Image
        Encoded image
    """
    image_font = ImageFont.truetype(font, max(font_size - 2, 8))
    text = text.replace("\n", " ")
    # ImageFont.getsize was removed in Pillow 10; getlength returns the rendered text width
    line_width = int(image_font.getlength(text))
    img = Image.new("L", (line_width, font_size))
    draw = ImageDraw.Draw(img)
    draw.text(xy=(0, 0), text=text, fill="#FFFFFF", font=image_font)
    return img
class VTRTokenizer(EmbeddingTokenizer):
    """
    Render the text into a series of image blocks. Reference: [VTR](https://t.co/l9E6rL8O5p?amp=1)

    Parameters
    ----------
    window_size : int, optional
        The width of the image window, by default 10
    stride : int, optional
        The stride used to generate image windows, by default 10
    font : str, optional
        Path to the font file, by default "resources/Noto_Sans/NotoSans-Regular.ttf"
    font_size : int, optional
        The size of the font in pixels; the rendered glyphs may be slightly smaller than the image height, by default 14
    model_input_names : List[str], optional
        Required inputs of the downstream model, by default the same names as BERT: ["input_ids", "token_type_ids", "attention_mask"]
    special_tokens : Optional[Dict[str, np.ndarray]], optional
        Special tokens for the downstream model, by default the same special tokens as BERT: {"CLS": "[CLS]", "SEP": "[SEP]"}
    max_length : Optional[int], optional
        Maximum sequence length, by default 25

    Examples
    --------
    >>> from text_embeddings.visual import VTRTokenizer
    >>> from transformers.tokenization_utils_base import *
    >>> tokenizer = VTRTokenizer()
    >>> results = tokenizer(text=['This is a sentence.', 'This is another sentence.'], padding=PaddingStrategy.LONGEST, truncation="longest_first", add_special_tokens=False)
    >>> assert results['input_ids'].shape == (2, 13, 14, 10), results['input_ids'].shape
    """
    def __init__(
        self,
        window_size: int = 10,
        stride: int = 10,
        font: str = "resources/Noto_Sans/NotoSans-Regular.ttf",
        font_size: int = 14,
        model_input_names: Optional[List[str]] = None,
        special_tokens: Optional[Dict[str, np.ndarray]] = None,
        max_length: Optional[int] = 25,
    ):
        super().__init__(model_input_names, special_tokens, max_length)
        self.font_size = font_size
        self.window_size = window_size
        self.stride = stride
        self.font = font
        if self.model_input_names is None:
            logger.warning('Using default model_input_names values ["input_ids", "token_type_ids", "attention_mask"]')
            self.model_input_names = ["input_ids", "token_type_ids", "attention_mask"]
    def text2embeddings(self, text: str) -> np.ndarray:
        """Convert text into a NumPy array in (sequence_length, font_size, window_size) shape.

        Parameters
        ----------
        text : str
            Input text

        Returns
        -------
        np.ndarray
            An array in (sequence_length, height, width) shape
        """
        if not text:
            return None
        image = text2image(text, font=self.font, font_size=self.font_size)
        image_array = np.asarray(image)
        # Slide a full-height window of width `window_size` across the rendered
        # line, then keep every `stride`-th window
        return np.squeeze(
            sliding_window_view(image_array, (image_array.shape[0], min(self.window_size, image_array.shape[1]))),
            axis=0,
        )[:: self.stride]
    def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
        """Create a padding token embedding for an empty window.

        Parameters
        ----------
        input_embeddings : np.ndarray, optional
            Embeddings already encoded, by default None

        Returns
        -------
        np.ndarray
            An all-zero array in (font_size, window_size) shape
        """
        return np.zeros((len(input_embeddings[0]), self.window_size))
Functions
def text2image(text: str, font: str, font_size: int = 14) ‑> PIL.Image.Image

Convert text into an image and return the image. Reference: https://gist.github.com/destan/5540702

Parameters

text : str
    Text to encode
font : str
    Name of the font to use
font_size : int, optional
    Size of the font, by default 14

Returns

Image.Image
    Encoded image
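As a quick illustration, the sketch below renders a short string and inspects the result. It assumes the repository's default Noto Sans font file resolves at the path shown; any TrueType font path on your system works equally well.

from text_embeddings.visual.vtr import text2image

# Assumes the repo's default font file exists at this path; substitute
# any TrueType font available on your system.
img = text2image("hello world", font="resources/Noto_Sans/NotoSans-Regular.ttf", font_size=14)
print(img.mode, img.size)  # "L" (grayscale); size is (rendered_text_width, 14)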
Classes
class VTRTokenizer (window_size: int = 10, stride: int = 10, font: str = 'resources/Noto_Sans/NotoSans-Regular.ttf', font_size: int = 14, model_input_names: Optional[List[str]] = None, special_tokens: Optional[Dict[str, numpy.ndarray]] = None, max_length: Optional[int] = 25)
Render the text into a series of image blocks. Reference: VTR (https://t.co/l9E6rL8O5p?amp=1)

Parameters

window_size : int, optional
    The width of the image window, by default 10
stride : int, optional
    The stride used to generate image windows, by default 10
font : str, optional
    Path to the font file, by default "resources/Noto_Sans/NotoSans-Regular.ttf"
font_size : int, optional
    The size of the font in pixels; the rendered glyphs may be slightly smaller than the image height, by default 14
model_input_names : List[str], optional
    Required inputs of the downstream model, by default the same names as BERT: ["input_ids", "token_type_ids", "attention_mask"]
special_tokens : Optional[Dict[str, np.ndarray]], optional
    Special tokens for the downstream model, by default the same special tokens as BERT: {"CLS": "[CLS]", "SEP": "[SEP]"}
max_length : Optional[int], optional
    Maximum sequence length, by default 25
Examples
>>> from text_embeddings.visual import VTRTokenizer
>>> from transformers.tokenization_utils_base import *
>>> tokenizer = VTRTokenizer()
>>> results = tokenizer(text=['This is a sentence.', 'This is another sentence.'], padding=PaddingStrategy.LONGEST, truncation="longest_first", add_special_tokens=False)
>>> assert results['input_ids'].shape == (2, 13, 14, 10), results['input_ids'].shape
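The per-sentence window count in the example follows directly from the sliding-window construction: a rendered line of width W yields W - window_size + 1 overlapping views, and keeping every stride-th view leaves ceil((W - window_size + 1) / stride) windows. A minimal sketch of that arithmetic, using a hypothetical rendered line width of 130 pixels (which happens to give the 13 windows seen in the doctest shape):

import numpy as np
from numpy.lib.stride_tricks import sliding_window_view

# Hypothetical rendered line: height 14 (font_size), width 130 pixels
W, window_size, stride = 130, 10, 10
image_array = np.zeros((14, W), dtype=np.uint8)

# Full-height windows of width `window_size`, then every `stride`-th one
views = sliding_window_view(image_array, (14, window_size))  # (1, W - 9, 14, 10)
windows = np.squeeze(views, axis=0)[::stride]

expected = -(-(W - window_size + 1) // stride)  # ceiling division -> 13
assert windows.shape == (expected, 14, window_size)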
Ancestors
- EmbeddingTokenizer
- transformers.tokenization_utils_base.PreTrainedTokenizerBase
- transformers.tokenization_utils_base.SpecialTokensMixin
- transformers.utils.hub.PushToHubMixin
Class variables
var max_model_input_sizes : Dict[str, Union[int, NoneType]]
var model_input_names : List[str]
var padding_side : str
var pretrained_init_configuration : Dict[str, Dict[str, Any]]
var pretrained_vocab_files_map : Dict[str, Dict[str, str]]
var truncation_side : str
var vocab_files_names : Dict[str, str]
Methods
def create_padding_token_embedding(self, input_embeddings=None) ‑> numpy.ndarray
Create a padding token embedding for an empty window.

Parameters

input_embeddings : np.ndarray, optional
    Embeddings already encoded, by default None

Returns

np.ndarray
    An all-zero array in (font_size, window_size) shape
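As a sanity check, a sketch (assuming the default font path resolves) showing that the padding block's shape matches the trailing (height, width) dimensions of a real embedding sequence, which is what lets padded batches stack cleanly:

from text_embeddings.visual import VTRTokenizer

tokenizer = VTRTokenizer()  # requires the default font path to resolve
emb = tokenizer.text2embeddings("padding demo")      # (sequence_length, 14, 10)
pad = tokenizer.create_padding_token_embedding(emb)  # zeros shaped like one window
assert pad.shape == emb.shape[1:]  # (14, 10)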
def text2embeddings(self, text: str) ‑> numpy.ndarray
Convert text into a NumPy array in (sequence_length, font_size, window_size) shape.

Parameters

text : str
    Input text

Returns

np.ndarray
    An array in (sequence_length, height, width) shape
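One subtlety worth knowing: the method clamps the window width to the rendered image width (the min(self.window_size, image_array.shape[1]) term), so very short inputs produce windows narrower than window_size. A sketch under the same font-path assumption:

from text_embeddings.visual import VTRTokenizer

tokenizer = VTRTokenizer(window_size=10, stride=10)
windows = tokenizer.text2embeddings("This is a sentence.")
print(windows.shape)  # (sequence_length, 14, 10)

# A single character renders narrower than window_size, so the last
# dimension is the image width rather than 10
narrow = tokenizer.text2embeddings("i")
assert narrow.shape[0] == 1 and narrow.shape[2] <= 10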
Inherited members