Module `text_embeddings.x`

X is a Perceiver-based encoder model that incorporates byte hash embeddings, learned token pruning and layer wise adaptive computation (inspired from PonderNet).

Expand source code

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2021-08-19 12:47:53
# @Author  : Chenghao Mou (mouchenghao@gmail.com)

"""X is a Perceiver-based encoder model that incorporates byte hash embeddings, learned token pruning and layer wise adaptive computation (inspired from PonderNet)."""

import math
from typing import Callable

import torch
import torch.nn as nn

from torch import Tensor
from einops import repeat, rearrange
from transformers import CanineModel

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000, batch_first: bool = False):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)
        self.batch_first = batch_first

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        if self.batch_first:
            x = x.transpose(1, 0)
        x = x + self.pe[:x.size(0)]
        return self.dropout(x) if not self.batch_first else self.dropout(x).transpose(0, 1)


class AttentionWrapper(nn.Module):
    def __init__(
        self,
        attention_class: Callable,
        embed_dim: int,
        num_heads: int,
        ff_dim: int,
        dropout: float,
        batch_first: bool,
        is_cross_attention: bool,
    ):
        super().__init__()

        self.is_cross_attention = is_cross_attention
        self.pre_attention_q_norm = nn.LayerNorm(embed_dim)
        self.pre_attention_kv_norm = (
            nn.LayerNorm(embed_dim) if is_cross_attention else None
        )

        self.attention = attention_class(
            embed_dim=embed_dim,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=batch_first,
        )
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, embed_dim),
        )

    def forward(
        self,
        query: Tensor,
        key: Tensor = None,
        value: Tensor = None,
        mask: Tensor = None,
    ):
        query = self.pre_attention_q_norm(query)
        key = (
            self.pre_attention_kv_norm(key)
            if key is not None and self.pre_attention_kv_norm is not None
            else key
        )
        value = (
            self.pre_attention_kv_norm(value)
            if value is not None and self.pre_attention_kv_norm is not None
            else value
        )

        # mask is only useful for cross attention, ignore attention weights
        attn_output, *_ = (
            self.attention(query, key, value, key_padding_mask=mask)
            if self.is_cross_attention
            else self.attention(query, query, query)
        )
        output = attn_output + query
        output = self.ff(output) + output

        return output


class XLayer(nn.Module):
    def __init__(
        self,
        embed_dim: int,
        num_cross_attention_heads: int,
        num_latent_attention_heads: int,
        num_latent_layers: int,
        ff_dim: int,
        dropout: float,
        batch_first: bool,
        latent_attention: Callable,
    ):
        super().__init__()

        self.cross_attention = AttentionWrapper(
            attention_class=nn.MultiheadAttention,
            embed_dim=embed_dim,
            num_heads=num_cross_attention_heads,
            ff_dim=ff_dim,
            dropout=dropout,
            batch_first=batch_first,
            is_cross_attention=True,
        )

        # pesudo transfomer
        self.latent_attentions = nn.ModuleList(
            [
                AttentionWrapper(
                    attention_class=latent_attention,
                    embed_dim=embed_dim,
                    num_heads=num_latent_attention_heads,
                    ff_dim=ff_dim,
                    dropout=dropout,
                    batch_first=batch_first,
                    is_cross_attention=False,
                )
                for _ in range(num_latent_layers)
            ]
        )

    def forward(
        self,
        query: Tensor,
        key: Tensor = None,
        value: Tensor = None,
        mask: Tensor = None,
    ):
        o = self.cross_attention(
            query,
            key,
            value,
            mask=mask,
        )

        for attn in self.latent_attentions:
            o = attn(o)

        return o


class X(nn.Module):
    def __init__(
        self,
        num_classes: int,
        latent_dim: int,
        num_layers: int,
        embed_dim: int,
        num_cross_attention_heads: int,
        num_latent_attention_heads: int,
        num_latent_layers: int,
        ff_dim: int,
        dropout: float,
        batch_first: bool,
        max_length: int,
        latent_attention: Callable,
    ):
        super().__init__()

        self.embedding = CanineModel.from_pretrained('google/canine-s')
        self.embedding_ff = nn.Linear(768, embed_dim)
        self.layers = nn.ModuleList(
            [
                XLayer(
                    embed_dim=embed_dim,
                    num_cross_attention_heads=num_cross_attention_heads,
                    num_latent_attention_heads=num_latent_attention_heads,
                    num_latent_layers=num_latent_layers,
                    ff_dim=ff_dim,
                    dropout=dropout,
                    batch_first=batch_first,
                    latent_attention=latent_attention,
                )
                for _ in range(num_layers)
            ]
        )

        self.num_classes = num_classes
        self.latent = nn.Parameter(torch.rand((latent_dim, embed_dim)))
        self.output_layer = nn.Linear(embed_dim, self.num_classes)
        self.lambda_layer = nn.Sequential(nn.Linear(embed_dim, 1), nn.Sigmoid())

    def forward(
        self,
        **inputs
    ):
        
        mask = inputs.get("attention_mask", None)
        with torch.no_grad():
            outputs = self.embedding(**inputs)
            x = outputs.last_hidden_state
        
        x = self.embedding_ff(x)
        batch_size, *_ = x.shape
        un_halted_prob = x.new_ones((batch_size,))
        halted = x.new_zeros((batch_size,))

        latent = repeat(
            rearrange(self.latent, "N D -> 1 N D"), "1 N D -> B N D", B=batch_size
        )

        probas = []
        preds = []

        p_m = x.new_zeros((batch_size,))
        y_m = x.new_zeros((batch_size, self.num_classes))

        for i, layer in enumerate(self.layers):
            latent = layer(latent, x, x, mask)

            # calculate halting probability for current layer
            layer_lambda = (
                x.new_ones((batch_size,))
                if i == len(self.layers) - 1
                else self.lambda_layer(torch.mean(latent, dim=1))
            )
            # calculate current prediction from current layer
            layer_predictions = self.output_layer(torch.mean(latent, dim=1))

            # conditional halting probability for current layer: previously not halted * halting now
            layer_halted_prob = un_halted_prob * layer_lambda.view(-1)
            un_halted_prob = un_halted_prob * (1 - layer_lambda.view(-1))

            # Halt based on the halting probability
            sampling = torch.bernoulli(layer_lambda.reshape(-1))
            halt = sampling * (1 - halted)

            probas.append(layer_halted_prob)
            preds.append(layer_predictions)

            p_m = p_m * (1 - halt) + layer_halted_prob * halt

            y_m = y_m * repeat(
                1 - halt, "B -> B C", C=self.num_classes
            ) + layer_predictions * repeat(halt, "B -> B C", C=self.num_classes)

            halted = halted + halt

            if not self.training and halted.sum() == batch_size:
                break

        return torch.stack(probas), torch.stack(preds), p_m, y_m


class ReconstructionLoss(nn.Module):
    def __init__(self, loss_fn: Callable):
        super().__init__()
        self.loss_fn = loss_fn

    def forward(self, probas, preds, labels):

        total = preds.new_tensor(0.0)
        for layer_probas, layer_preds in zip(probas, preds):
            layer_loss = layer_probas * self.loss_fn(layer_preds, labels)
            total = total + layer_loss.mean()

        return total


class RegularizationLoss(nn.Module):
    def __init__(self, lambda_p: float, max_layers: int):
        super().__init__()
        p_g = torch.zeros((max_layers,))
        not_halted = 1.0
        for k in range(max_layers):
            p_g[k] = lambda_p * not_halted
            not_halted = not_halted * (1 - lambda_p)

        self.p_g = nn.Parameter(p_g, requires_grad=False)
        self.kl_div = nn.KLDivLoss(reduction="batchmean")

    def forward(self, probas):
        probas = probas.transpose(0, 1)
        p_g = self.p_g[None, : probas.shape[1]].expand_as(probas)

        return self.kl_div(probas.log(), p_g)


class XLoss(nn.Module):
    def __init__(self, loss_fn: Callable, lambda_p: float, max_layers: int):
        super().__init__()
        self.reconstruction_loss = ReconstructionLoss(loss_fn)
        self.regularization_loss = RegularizationLoss(lambda_p, max_layers)

    def forward(self, probas, preds, labels):

        return self.reconstruction_loss(
            probas, preds, labels
        ) + self.regularization_loss(probas)

Classes

class AttentionWrapper (attention_class: Callable, embed_dim: int, num_heads: int, ff_dim: int, dropout: float, batch_first: bool, is_cross_attention: bool)

Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes::

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call :meth:to, etc.

Note

As per the example above, an __init__() call to the parent class must be made before assignment on the child.

:ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool

Initializes internal Module state, shared by both nn.Module and ScriptModule.

Expand source code

class AttentionWrapper(nn.Module):
    def __init__(
        self,
        attention_class: Callable,
        embed_dim: int,
        num_heads: int,
        ff_dim: int,
        dropout: float,
        batch_first: bool,
        is_cross_attention: bool,
    ):
        super().__init__()

        self.is_cross_attention = is_cross_attention
        self.pre_attention_q_norm = nn.LayerNorm(embed_dim)
        self.pre_attention_kv_norm = (
            nn.LayerNorm(embed_dim) if is_cross_attention else None
        )

        self.attention = attention_class(
            embed_dim=embed_dim,
            num_heads=num_heads,
            dropout=dropout,
            batch_first=batch_first,
        )
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, embed_dim),
        )

    def forward(
        self,
        query: Tensor,
        key: Tensor = None,
        value: Tensor = None,
        mask: Tensor = None,
    ):
        query = self.pre_attention_q_norm(query)
        key = (
            self.pre_attention_kv_norm(key)
            if key is not None and self.pre_attention_kv_norm is not None
            else key
        )
        value = (
            self.pre_attention_kv_norm(value)
            if value is not None and self.pre_attention_kv_norm is not None
            else value
        )

        # mask is only useful for cross attention, ignore attention weights
        attn_output, *_ = (
            self.attention(query, key, value, key_padding_mask=mask)
            if self.is_cross_attention
            else self.attention(query, query, query)
        )
        output = attn_output + query
        output = self.ff(output) + output

        return output

Ancestors

torch.nn.modules.module.Module

Class variables

var dump_patches : bool
var training : bool

Methods

def forward(self, query: torch.Tensor, key: torch.Tensor = None, value: torch.Tensor = None, mask: torch.Tensor = None) ‑> Callable[..., Any]

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Although the recipe for forward pass needs to be defined within this function, one should call the :class:Module instance afterwards instead of this since the former takes care of running the registered hooks while the latter silently ignores them.

Expand source code

def forward(
    self,
    query: Tensor,
    key: Tensor = None,
    value: Tensor = None,
    mask: Tensor = None,
):
    query = self.pre_attention_q_norm(query)
    key = (
        self.pre_attention_kv_norm(key)
        if key is not None and self.pre_attention_kv_norm is not None
        else key
    )
    value = (
        self.pre_attention_kv_norm(value)
        if value is not None and self.pre_attention_kv_norm is not None
        else value
    )

    # mask is only useful for cross attention, ignore attention weights
    attn_output, *_ = (
        self.attention(query, key, value, key_padding_mask=mask)
        if self.is_cross_attention
        else self.attention(query, query, query)
    )
    output = attn_output + query
    output = self.ff(output) + output

    return output

class PositionalEncoding (d_model: int, dropout: float = 0.1, max_len: int = 5000, batch_first: bool = False)

Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes::

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call :meth:to, etc.

Note

As per the example above, an __init__() call to the parent class must be made before assignment on the child.

:ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool

Initializes internal Module state, shared by both nn.Module and ScriptModule.

Expand source code

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000, batch_first: bool = False):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)
        self.batch_first = batch_first

    def forward(self, x: Tensor) -> Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        if self.batch_first:
            x = x.transpose(1, 0)
        x = x + self.pe[:x.size(0)]
        return self.dropout(x) if not self.batch_first else self.dropout(x).transpose(0, 1)

Ancestors

torch.nn.modules.module.Module

Class variables

var dump_patches : bool
var training : bool

Methods

def forward(self, x: torch.Tensor) ‑> torch.Tensor

Args

x: Tensor, shape [seq_len, batch_size, embedding_dim]

Expand source code

def forward(self, x: Tensor) -> Tensor:
    """
    Args:
        x: Tensor, shape [seq_len, batch_size, embedding_dim]
    """
    if self.batch_first:
        x = x.transpose(1, 0)
    x = x + self.pe[:x.size(0)]
    return self.dropout(x) if not self.batch_first else self.dropout(x).transpose(0, 1)

class ReconstructionLoss (loss_fn: Callable)

Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes::

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call :meth:to, etc.

Note

As per the example above, an __init__() call to the parent class must be made before assignment on the child.

:ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool

Initializes internal Module state, shared by both nn.Module and ScriptModule.

Expand source code

class ReconstructionLoss(nn.Module):
    def __init__(self, loss_fn: Callable):
        super().__init__()
        self.loss_fn = loss_fn

    def forward(self, probas, preds, labels):

        total = preds.new_tensor(0.0)
        for layer_probas, layer_preds in zip(probas, preds):
            layer_loss = layer_probas * self.loss_fn(layer_preds, labels)
            total = total + layer_loss.mean()

        return total

Ancestors

torch.nn.modules.module.Module

Class variables

var dump_patches : bool
var training : bool

Methods

def forward(self, probas, preds, labels) ‑> Callable[..., Any]

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Expand source code

def forward(self, probas, preds, labels):

    total = preds.new_tensor(0.0)
    for layer_probas, layer_preds in zip(probas, preds):
        layer_loss = layer_probas * self.loss_fn(layer_preds, labels)
        total = total + layer_loss.mean()

    return total

class RegularizationLoss (lambda_p: float, max_layers: int)

Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes::

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call :meth:to, etc.

Note

As per the example above, an __init__() call to the parent class must be made before assignment on the child.

:ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool

Initializes internal Module state, shared by both nn.Module and ScriptModule.

Expand source code

class RegularizationLoss(nn.Module):
    def __init__(self, lambda_p: float, max_layers: int):
        super().__init__()
        p_g = torch.zeros((max_layers,))
        not_halted = 1.0
        for k in range(max_layers):
            p_g[k] = lambda_p * not_halted
            not_halted = not_halted * (1 - lambda_p)

        self.p_g = nn.Parameter(p_g, requires_grad=False)
        self.kl_div = nn.KLDivLoss(reduction="batchmean")

    def forward(self, probas):
        probas = probas.transpose(0, 1)
        p_g = self.p_g[None, : probas.shape[1]].expand_as(probas)

        return self.kl_div(probas.log(), p_g)

Ancestors

torch.nn.modules.module.Module

Class variables

var dump_patches : bool
var training : bool

Methods

def forward(self, probas) ‑> Callable[..., Any]

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Expand source code

def forward(self, probas):
    probas = probas.transpose(0, 1)
    p_g = self.p_g[None, : probas.shape[1]].expand_as(probas)

    return self.kl_div(probas.log(), p_g)

class X (num_classes: int, latent_dim: int, num_layers: int, embed_dim: int, num_cross_attention_heads: int, num_latent_attention_heads: int, num_latent_layers: int, ff_dim: int, dropout: float, batch_first: bool, max_length: int, latent_attention: Callable)

Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes::

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call :meth:to, etc.

Note

As per the example above, an __init__() call to the parent class must be made before assignment on the child.

:ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool

Initializes internal Module state, shared by both nn.Module and ScriptModule.

Expand source code

class X(nn.Module):
    def __init__(
        self,
        num_classes: int,
        latent_dim: int,
        num_layers: int,
        embed_dim: int,
        num_cross_attention_heads: int,
        num_latent_attention_heads: int,
        num_latent_layers: int,
        ff_dim: int,
        dropout: float,
        batch_first: bool,
        max_length: int,
        latent_attention: Callable,
    ):
        super().__init__()

        self.embedding = CanineModel.from_pretrained('google/canine-s')
        self.embedding_ff = nn.Linear(768, embed_dim)
        self.layers = nn.ModuleList(
            [
                XLayer(
                    embed_dim=embed_dim,
                    num_cross_attention_heads=num_cross_attention_heads,
                    num_latent_attention_heads=num_latent_attention_heads,
                    num_latent_layers=num_latent_layers,
                    ff_dim=ff_dim,
                    dropout=dropout,
                    batch_first=batch_first,
                    latent_attention=latent_attention,
                )
                for _ in range(num_layers)
            ]
        )

        self.num_classes = num_classes
        self.latent = nn.Parameter(torch.rand((latent_dim, embed_dim)))
        self.output_layer = nn.Linear(embed_dim, self.num_classes)
        self.lambda_layer = nn.Sequential(nn.Linear(embed_dim, 1), nn.Sigmoid())

    def forward(
        self,
        **inputs
    ):
        
        mask = inputs.get("attention_mask", None)
        with torch.no_grad():
            outputs = self.embedding(**inputs)
            x = outputs.last_hidden_state
        
        x = self.embedding_ff(x)
        batch_size, *_ = x.shape
        un_halted_prob = x.new_ones((batch_size,))
        halted = x.new_zeros((batch_size,))

        latent = repeat(
            rearrange(self.latent, "N D -> 1 N D"), "1 N D -> B N D", B=batch_size
        )

        probas = []
        preds = []

        p_m = x.new_zeros((batch_size,))
        y_m = x.new_zeros((batch_size, self.num_classes))

        for i, layer in enumerate(self.layers):
            latent = layer(latent, x, x, mask)

            # calculate halting probability for current layer
            layer_lambda = (
                x.new_ones((batch_size,))
                if i == len(self.layers) - 1
                else self.lambda_layer(torch.mean(latent, dim=1))
            )
            # calculate current prediction from current layer
            layer_predictions = self.output_layer(torch.mean(latent, dim=1))

            # conditional halting probability for current layer: previously not halted * halting now
            layer_halted_prob = un_halted_prob * layer_lambda.view(-1)
            un_halted_prob = un_halted_prob * (1 - layer_lambda.view(-1))

            # Halt based on the halting probability
            sampling = torch.bernoulli(layer_lambda.reshape(-1))
            halt = sampling * (1 - halted)

            probas.append(layer_halted_prob)
            preds.append(layer_predictions)

            p_m = p_m * (1 - halt) + layer_halted_prob * halt

            y_m = y_m * repeat(
                1 - halt, "B -> B C", C=self.num_classes
            ) + layer_predictions * repeat(halt, "B -> B C", C=self.num_classes)

            halted = halted + halt

            if not self.training and halted.sum() == batch_size:
                break

        return torch.stack(probas), torch.stack(preds), p_m, y_m

Ancestors

torch.nn.modules.module.Module

Class variables

var dump_patches : bool
var training : bool

Methods

def forward(self, **inputs) ‑> Callable[..., Any]

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Expand source code

def forward(
    self,
    **inputs
):
    
    mask = inputs.get("attention_mask", None)
    with torch.no_grad():
        outputs = self.embedding(**inputs)
        x = outputs.last_hidden_state
    
    x = self.embedding_ff(x)
    batch_size, *_ = x.shape
    un_halted_prob = x.new_ones((batch_size,))
    halted = x.new_zeros((batch_size,))

    latent = repeat(
        rearrange(self.latent, "N D -> 1 N D"), "1 N D -> B N D", B=batch_size
    )

    probas = []
    preds = []

    p_m = x.new_zeros((batch_size,))
    y_m = x.new_zeros((batch_size, self.num_classes))

    for i, layer in enumerate(self.layers):
        latent = layer(latent, x, x, mask)

        # calculate halting probability for current layer
        layer_lambda = (
            x.new_ones((batch_size,))
            if i == len(self.layers) - 1
            else self.lambda_layer(torch.mean(latent, dim=1))
        )
        # calculate current prediction from current layer
        layer_predictions = self.output_layer(torch.mean(latent, dim=1))

        # conditional halting probability for current layer: previously not halted * halting now
        layer_halted_prob = un_halted_prob * layer_lambda.view(-1)
        un_halted_prob = un_halted_prob * (1 - layer_lambda.view(-1))

        # Halt based on the halting probability
        sampling = torch.bernoulli(layer_lambda.reshape(-1))
        halt = sampling * (1 - halted)

        probas.append(layer_halted_prob)
        preds.append(layer_predictions)

        p_m = p_m * (1 - halt) + layer_halted_prob * halt

        y_m = y_m * repeat(
            1 - halt, "B -> B C", C=self.num_classes
        ) + layer_predictions * repeat(halt, "B -> B C", C=self.num_classes)

        halted = halted + halt

        if not self.training and halted.sum() == batch_size:
            break

    return torch.stack(probas), torch.stack(preds), p_m, y_m

class XLayer (embed_dim: int, num_cross_attention_heads: int, num_latent_attention_heads: int, num_latent_layers: int, ff_dim: int, dropout: float, batch_first: bool, latent_attention: Callable)

Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes::

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call :meth:to, etc.

Note

As per the example above, an __init__() call to the parent class must be made before assignment on the child.

:ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool

Initializes internal Module state, shared by both nn.Module and ScriptModule.

Expand source code

class XLayer(nn.Module):
    def __init__(
        self,
        embed_dim: int,
        num_cross_attention_heads: int,
        num_latent_attention_heads: int,
        num_latent_layers: int,
        ff_dim: int,
        dropout: float,
        batch_first: bool,
        latent_attention: Callable,
    ):
        super().__init__()

        self.cross_attention = AttentionWrapper(
            attention_class=nn.MultiheadAttention,
            embed_dim=embed_dim,
            num_heads=num_cross_attention_heads,
            ff_dim=ff_dim,
            dropout=dropout,
            batch_first=batch_first,
            is_cross_attention=True,
        )

        # pesudo transfomer
        self.latent_attentions = nn.ModuleList(
            [
                AttentionWrapper(
                    attention_class=latent_attention,
                    embed_dim=embed_dim,
                    num_heads=num_latent_attention_heads,
                    ff_dim=ff_dim,
                    dropout=dropout,
                    batch_first=batch_first,
                    is_cross_attention=False,
                )
                for _ in range(num_latent_layers)
            ]
        )

    def forward(
        self,
        query: Tensor,
        key: Tensor = None,
        value: Tensor = None,
        mask: Tensor = None,
    ):
        o = self.cross_attention(
            query,
            key,
            value,
            mask=mask,
        )

        for attn in self.latent_attentions:
            o = attn(o)

        return o

Ancestors

torch.nn.modules.module.Module

Class variables

var dump_patches : bool
var training : bool

Methods

def forward(self, query: torch.Tensor, key: torch.Tensor = None, value: torch.Tensor = None, mask: torch.Tensor = None) ‑> Callable[..., Any]

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Expand source code

def forward(
    self,
    query: Tensor,
    key: Tensor = None,
    value: Tensor = None,
    mask: Tensor = None,
):
    o = self.cross_attention(
        query,
        key,
        value,
        mask=mask,
    )

    for attn in self.latent_attentions:
        o = attn(o)

    return o

class XLoss (loss_fn: Callable, lambda_p: float, max_layers: int)

Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in a tree structure. You can assign the submodules as regular attributes::

import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, 5)
        self.conv2 = nn.Conv2d(20, 20, 5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their parameters converted too when you call :meth:to, etc.

Note

As per the example above, an __init__() call to the parent class must be made before assignment on the child.

:ivar training: Boolean represents whether this module is in training or evaluation mode. :vartype training: bool

Initializes internal Module state, shared by both nn.Module and ScriptModule.

Expand source code

class XLoss(nn.Module):
    def __init__(self, loss_fn: Callable, lambda_p: float, max_layers: int):
        super().__init__()
        self.reconstruction_loss = ReconstructionLoss(loss_fn)
        self.regularization_loss = RegularizationLoss(lambda_p, max_layers)

    def forward(self, probas, preds, labels):

        return self.reconstruction_loss(
            probas, preds, labels
        ) + self.regularization_loss(probas)

Ancestors

torch.nn.modules.module.Module

Class variables

var dump_patches : bool
var training : bool

Methods

def forward(self, probas, preds, labels) ‑> Callable[..., Any]

Defines the computation performed at every call.

Should be overridden by all subclasses.

Note

Expand source code

def forward(self, probas, preds, labels):

    return self.reconstruction_loss(
        probas, preds, labels
    ) + self.regularization_loss(probas)