Module text_embeddings.visual

Visual information based tokenizers.

Expand source code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2021-04-22 20:59:35
# @Author  : Chenghao Mou (mouchenghao@gmail.com)

"""Visual information based tokenizers."""

# Re-export the VTR tokenizer and its text-rendering helper as the package's
# public visual-tokenization API.
from .vtr import VTRTokenizer, text2image

__all__ = ['VTRTokenizer', 'text2image']

Sub-modules

text_embeddings.visual.vtr

Robust Open-Vocabulary Translation from Visual Text Representations

Functions

def text2image(text: str, font: str, font_size: int = 14) ‑> Image

Convert text into an image and return the image. Reference: https://gist.github.com/destan/5540702

Parameters

text : str
Text to encode
font : str
Name of the font to use
font_size : int, optional
Size of the font, by default 14

Returns

Image
Encoded image
Expand source code
def text2image(text: str, font: str, font_size: int = 14) -> Image:
    """Convert text into a grayscale image and return the image. Reference: https://gist.github.com/destan/5540702

    Parameters
    ----------
    text : str
        Text to encode
    font : str
        Name of the font to use
    font_size : int, optional
        Size of the font, by default 14

    Returns
    -------
    Image
        Encoded image, ``font_size`` pixels tall and as wide as the rendered text
    """

    # Render slightly smaller than the target height (never below 8 px) so
    # ascenders/descenders still fit inside the font_size-tall image.
    image_font = ImageFont.truetype(font, max(font_size - 2, 8))
    # Newlines would break the single-line rendering assumption below.
    text = text.replace("\n", " ")

    # ImageFont.getsize was deprecated in Pillow 9.2 and removed in Pillow 10;
    # getbbox returns (left, top, right, bottom) and `right` is the old width.
    line_width = image_font.getbbox(text)[2]

    # "L" mode defaults to a black (0) background; text is drawn in white.
    img = Image.new("L", (line_width, font_size))
    draw = ImageDraw.Draw(img)
    draw.text(xy=(0, 0), text=text, fill="#FFFFFF", font=image_font)

    return img

Classes

class VTRTokenizer (window_size: int = 10, stride: int = 10, font: str = 'resources/Noto_Sans/NotoSans-Regular.ttf', font_size: int = 14, model_input_names: List[str] = None, special_tokens: Union[Dict[str, numpy.ndarray], NoneType] = None, max_length: Union[int, NoneType] = 25)

Render the text into a series of image blocks. Reference VTR

Parameters

window_size : int, optional
The width of the image window, by default 10
stride : int, optional
The stride used to generate image windows, by default 10
font : str, optional
Path to the font file, by default "resources/Noto_Sans/NotoSans-Regular.ttf"
font_size : int, optional
The size of the font in pixels, might be smaller than the actual image height, by default 14
model_input_names : List[str], optional
Required inputs of the downstream model, by default it uses the same names as a BERT — ["input_ids", "token_type_ids", "attention_mask"]
special_tokens : Optional[Dict[str, np.ndarray]], optional
Special tokens for the downstream model, by default it uses the same special tokens as a BERT — {"CLS": "[CLS]", "SEP": "[SEP]"}
max_length : Optional[int], optional
Maximum sequence length, by default 25

Examples

>>> from text_embeddings.visual import VTRTokenizer
>>> from transformers.tokenization_utils_base import *
>>> tokenizer = VTRTokenizer()
>>> results = tokenizer(text=['This is a sentence.', 'This is another sentence.'], padding=PaddingStrategy.LONGEST, truncation="longest_first", add_special_tokens=False)
>>> assert results['input_ids'].shape == (2, 13, 14, 10), results['input_ids'].shape
Expand source code
class VTRTokenizer(EmbeddingTokenizer):
    """
    Render the text into a series of image blocks. Reference [VTR](https://t.co/l9E6rL8O5p?amp=1)

    Parameters
    ----------
    window_size : int, optional
        The width of the image window, by default 10
    stride : int, optional
        The stride used to generate image windows, by default 10
    font : str, optional
        Path to the font file, by default "resources/Noto_Sans/NotoSans-Regular.ttf"
    font_size : int, optional
        The size of the font in pixels, might be smaller than the actual image height, by default 14
    model_input_names : Optional[List[str]], optional
        Required inputs of the downstream model, by default it uses the same names as a BERT — ["input_ids", "token_type_ids", "attention_mask"]
    special_tokens : Optional[Dict[str, np.ndarray]], optional
        Special tokens for the downstream model, by default it uses the same special tokens as a BERT — {"CLS": "[CLS]", "SEP": "[SEP]"}
    max_length : Optional[int], optional
        Maximum sequence length, by default 25

    Examples
    --------
    >>> from text_embeddings.visual import VTRTokenizer
    >>> from transformers.tokenization_utils_base import *
    >>> tokenizer = VTRTokenizer()
    >>> results = tokenizer(text=['This is a sentence.', 'This is another sentence.'], padding=PaddingStrategy.LONGEST, truncation="longest_first", add_special_tokens=False)
    >>> assert results['input_ids'].shape == (2, 13, 14, 10), results['input_ids'].shape
    """

    def __init__(
        self,
        window_size: int = 10,
        stride: int = 10,
        font: str = "resources/Noto_Sans/NotoSans-Regular.ttf",
        font_size: int = 14,
        model_input_names: Optional[List[str]] = None,
        special_tokens: Optional[Dict[str, np.ndarray]] = None,
        max_length: Optional[int] = 25,
    ):
        super().__init__(model_input_names, special_tokens, max_length)
        self.font_size = font_size
        self.window_size = window_size
        self.stride = stride
        self.font = font

        # Fall back to BERT-style input names so the outputs plug directly
        # into a standard transformers model.
        if self.model_input_names is None:
            logger.warning('Using default model_input_names values ["input_ids", "token_type_ids", "attention_mask"]')
            self.model_input_names = ["input_ids", "token_type_ids", "attention_mask"]

    def text2embeddings(self, text: str) -> Optional[np.ndarray]:
        """Convert text into a numpy array, in (sequence_length, font_size, window_size) shape.

        Parameters
        ----------
        text : str
            Input text

        Returns
        -------
        Optional[np.ndarray]
            An array in (sequence_length, height, width) shape, or None when
            `text` is empty
        """
        if not text:
            return None

        image = text2image(text, font=self.font, font_size=self.font_size)
        image_array = np.asarray(image)

        # Slide a full-height window across the rendered image; the window
        # width is clamped so it never exceeds the image width. The full-height
        # window leaves a singleton leading axis, which squeeze removes, then
        # every `stride`-th window is kept.
        return np.squeeze(
            sliding_window_view(image_array, (image_array.shape[0], min(self.window_size, image_array.shape[1]))),
            axis=0,
        )[:: self.stride]

    def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
        """Create a padding token embedding for an empty window.

        Parameters
        ----------
        input_embeddings : [type], optional
            Embeddings already encoded, by default None

        Returns
        -------
        np.ndarray
            An all-zero array in (font_size, window_size) shape, matching the
            height of the windows already encoded for this text
        """
        return np.zeros((len(input_embeddings[0]), self.window_size))

Ancestors

  • EmbeddingTokenizer
  • transformers.tokenization_utils_base.PreTrainedTokenizerBase
  • transformers.tokenization_utils_base.SpecialTokensMixin
  • transformers.utils.hub.PushToHubMixin

Class variables

var max_model_input_sizes : Dict[str, Union[int, NoneType]]
var model_input_names : List[str]
var padding_side : str
var pretrained_init_configuration : Dict[str, Dict[str, Any]]
var pretrained_vocab_files_map : Dict[str, Dict[str, str]]
var truncation_side : str
var vocab_files_names : Dict[str, str]

Methods

def create_padding_token_embedding(self, input_embeddings=None) ‑> numpy.ndarray

Create a padding token embedding for an empty window.

Parameters

input_embeddings : [type], optional
Embeddings already encoded, by default None

Returns

np.ndarray
An empty array in (font_size, window_size) shape
Expand source code
def create_padding_token_embedding(self, input_embeddings=None) -> np.ndarray:
    """Build the all-zero block used to pad short sequences.

    Parameters
    ----------
    input_embeddings : [type], optional
        Embeddings already encoded, by default None

    Returns
    -------
    np.ndarray
        An empty array in (font_size, window_size) shape
    """
    # Match the height of the windows already produced for this text so the
    # padding block stacks cleanly alongside them.
    height = len(input_embeddings[0])
    return np.zeros((height, self.window_size))
def text2embeddings(self, text: str) ‑> numpy.ndarray

Convert text into an numpy array, in (sequence_length, font_size, window_size) shape.

Parameters

text : str
Input text

Returns

np.ndarray
An array in (sequence_length, height, width) shape
Expand source code
def text2embeddings(self, text: str) -> np.ndarray:
    """Render *text* as an image and slice it into fixed-width windows.

    Parameters
    ----------
    text : str
        Input text

    Returns
    -------
    np.ndarray
        An array in (sequence_length, height, width) shape, or None for
        empty input
    """
    if not text:
        return None

    rendered = np.asarray(text2image(text, font=self.font, font_size=self.font_size))
    height = rendered.shape[0]
    # Never request a window wider than the rendered image itself.
    width = min(self.window_size, rendered.shape[1])
    windows = sliding_window_view(rendered, (height, width))
    # The full-height window leaves a singleton leading axis; drop it, then
    # keep every `stride`-th window.
    return np.squeeze(windows, axis=0)[:: self.stride]

Inherited members