Module text_embeddings.hash.util

Python translation of the original code in tensorflow.

Expand source code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2021-04-22 21:03:09
# @Author  : Chenghao Mou (mouchenghao@gmail.com)

"""Python translation of the original code in tensorflow."""

from typing import List

import mmh3

# 64-bit multipliers applied by get_more_bits when deriving extra hash codes.
kMul: int = 0xC6A4_A793_5BD1_E995
kMul2: int = 0x9E37_79B9_7F4A_7835

# Lookup from a two-bit group (0..3) to its ternary value; murmurhash indexes
# this table with ``code & 3``.
kMappingTable: List[int] = [
    0,  # 0b00
    1,  # 0b01
    -1,  # 0b10
    0,  # 0b11
]


def shift_mix(val):
    """
    Mix the high bits of an integer into its low bits.

    Returns ``val`` XOR-ed with ``val`` shifted right by 47 bits.
    """
    folded = val >> 47
    return folded ^ val


def get_more_bits(hash1, hash2):
    """
    Derive a fresh pair of 64-bit hash codes from an existing pair.

    Every product is wrapped to 64 bits. Python integers never overflow, so
    without the wrap the values would keep growing (and their high bits would
    leak back in through the right shift in ``shift_mix``) instead of behaving
    like the unsigned 64-bit arithmetic this mixing step is translated from.

    Parameters
    ----------
    hash1 : int
        First hash code of the current pair
    hash2 : int
        Second hash code of the current pair

    Returns
    -------
    tuple of (int, int)
        The new ``(low, high)`` pair, each in ``[0, 2**64)``
    """
    mask = 0xFFFFFFFFFFFFFFFF  # emulate unsigned 64-bit wrap-around

    hash1 = (shift_mix(hash1) * kMul) & mask
    hash2 ^= hash1
    newhigh = shift_mix(hash1)
    newlow = (shift_mix((hash2 * kMul2) & mask) * kMul2) & mask

    return newlow, newhigh


def murmurhash(token: str, feature_size: int = 512) -> List[int]:
    """
    Hash a token into a list of ternary integers (-1, 0, or 1).

    Two hash codes are generated for every started block of 64 in
    ``feature_size``. Each code is treated as a 64-bit value and read two bits
    at a time, so every code contributes exactly 32 values and the result
    always holds ``feature_size // 2`` entries.

    Parameters
    ----------
    token : str
        Input token string
    feature_size : int, optional
        Size parameter of the hash embedding; the returned list has
        ``feature_size // 2`` entries, by default 512

    Returns
    -------
    List[int]
        A list of ``feature_size // 2`` ternary integers
    """
    hash_low = 0
    hash_high = 0
    hash_codes = []

    for i in range(0, feature_size, 64):
        if i == 0:
            hash_low, hash_high = mmh3.hash64(token, signed=False)
        else:
            hash_low, hash_high = get_more_bits(hash_low, hash_high)
        hash_codes.append(hash_low)
        hash_codes.append(hash_high)

    target = feature_size // 2
    projection: List[int] = []
    for code in hash_codes:
        if len(projection) >= target:
            break
        # Read a fixed 32 two-bit groups from the low 64 bits of the code,
        # zero groups included. Stopping once the remaining bits are zero
        # would emit fewer values for codes with zero high bits and shift the
        # position of every later code's values.
        for shift in range(0, 64, 2):
            projection.append(kMappingTable[(code >> shift) & 3])
    return projection[:target]

Functions

def get_more_bits(hash1, hash2)
Expand source code
def get_more_bits(hash1, hash2):
    """
    Derive a fresh pair of 64-bit hash codes from an existing pair.

    Every product is wrapped to 64 bits. Python integers never overflow, so
    without the wrap the values would keep growing (and their high bits would
    leak back in through the right shift in ``shift_mix``) instead of behaving
    like the unsigned 64-bit arithmetic this mixing step is translated from.

    Parameters
    ----------
    hash1 : int
        First hash code of the current pair
    hash2 : int
        Second hash code of the current pair

    Returns
    -------
    tuple of (int, int)
        The new ``(low, high)`` pair, each in ``[0, 2**64)``
    """
    mask = 0xFFFFFFFFFFFFFFFF  # emulate unsigned 64-bit wrap-around

    hash1 = (shift_mix(hash1) * kMul) & mask
    hash2 ^= hash1
    newhigh = shift_mix(hash1)
    newlow = (shift_mix((hash2 * kMul2) & mask) * kMul2) & mask

    return newlow, newhigh
def murmurhash(token: str, feature_size: int = 512) ‑> List[int]

Hash a token into a list of at most feature_size // 2 integers (-1, 0, or 1).

Parameters

token : str
Input token string
feature_size : int, optional
The target size of the hash embedding, by default 512

Returns

List[int]
A list of at most feature_size // 2 ternary integers
Expand source code
def murmurhash(token: str, feature_size: int = 512) -> List[int]:
    """
    Hash a token into a list of ternary integers (-1, 0, or 1).

    Two hash codes are generated for every started block of 64 in
    ``feature_size``. Each code is treated as a 64-bit value and read two bits
    at a time, so every code contributes exactly 32 values and the result
    always holds ``feature_size // 2`` entries.

    Parameters
    ----------
    token : str
        Input token string
    feature_size : int, optional
        Size parameter of the hash embedding; the returned list has
        ``feature_size // 2`` entries, by default 512

    Returns
    -------
    List[int]
        A list of ``feature_size // 2`` ternary integers
    """
    hash_low = 0
    hash_high = 0
    hash_codes = []

    for i in range(0, feature_size, 64):
        if i == 0:
            hash_low, hash_high = mmh3.hash64(token, signed=False)
        else:
            hash_low, hash_high = get_more_bits(hash_low, hash_high)
        hash_codes.append(hash_low)
        hash_codes.append(hash_high)

    target = feature_size // 2
    projection: List[int] = []
    for code in hash_codes:
        if len(projection) >= target:
            break
        # Read a fixed 32 two-bit groups from the low 64 bits of the code,
        # zero groups included. Stopping once the remaining bits are zero
        # would emit fewer values for codes with zero high bits and shift the
        # position of every later code's values.
        for shift in range(0, 64, 2):
            projection.append(kMappingTable[(code >> shift) & 3])
    return projection[:target]
def shift_mix(val)
Expand source code
def shift_mix(val):
    """
    Mix the high bits of an integer into its low bits.

    Returns ``val`` XOR-ed with ``val`` shifted right by 47 bits.
    """
    folded = val >> 47
    return folded ^ val