# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging
import math
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from fairseq import checkpoint_utils
from fairseq.incremental_decoding_utils import with_incremental_state
from fairseq.models import (
    CompositeEncoder,
    FairseqDecoder,
    FairseqEncoder,
    FairseqEncoderDecoderModel,
    register_model,
    register_model_architecture,
)
from fairseq.modules import (
    DownsampledMultiHeadAttention,
    FairseqDropout,
    GradMultiply,
    LayerNorm,
    LearnedPositionalEmbedding,
    LinearizedConvolution,
)


logger = logging.getLogger(__name__)

@register_model("fconv_self_att")
|
|
class FConvModelSelfAtt(FairseqEncoderDecoderModel):
|
|
@classmethod
|
|
def hub_models(cls):
|
|
return {
|
|
"conv.stories.pretrained": {
|
|
"path": "https://dl.fbaipublicfiles.com/fairseq/models/stories_checkpoint.tar.gz",
|
|
"checkpoint_file": "pretrained_checkpoint.pt",
|
|
"tokenizer": "nltk",
|
|
},
|
|
"conv.stories": {
|
|
"path": "https://dl.fbaipublicfiles.com/fairseq/models/stories_checkpoint.tar.gz",
|
|
"checkpoint_file": "fusion_checkpoint.pt",
|
|
"tokenizer": "nltk",
|
|
"pretrained": "True",
|
|
"pretrained_checkpoint": "./pretrained_checkpoint.pt",
|
|
},
|
|
# Test set containing dictionaries
|
|
"data.stories": "https://dl.fbaipublicfiles.com/fairseq/data/stories_test.tar.bz2",
|
|
}
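
    # Usage sketch (not part of the original source; exact arguments may differ
    # between fairseq versions): the checkpoints listed in hub_models() can
    # typically be loaded either through torch.hub, e.g.
    #   model = torch.hub.load("pytorch/fairseq", "conv.stories")
    # or via the generic fairseq helper
    #   model = FConvModelSelfAtt.from_pretrained(
    #       "checkpoints/", checkpoint_file="fusion_checkpoint.pt"
    #   )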

    def __init__(self, encoder, decoder, pretrained_encoder=None):
        super().__init__(encoder, decoder)
        self.encoder.num_attention_layers = sum(
            layer is not None for layer in decoder.attention
        )
        self.pretrained_encoder = pretrained_encoder
        if self.pretrained_encoder is None:
            encoders = {"encoder": encoder}
        else:
            encoders = {"encoder": encoder, "pretrained": self.pretrained_encoder}
        # for fusion model, CompositeEncoder contains both pretrained and training encoders
        # these are forwarded and then combined in the decoder
        self.encoder = CompositeEncoder(encoders)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--dropout', type=float, metavar='D',
                            help='dropout probability')
        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                            help='encoder embedding dimension')
        parser.add_argument('--encoder-layers', type=str, metavar='EXPR',
                            help='encoder layers [(dim, kernel_size), ...]')
        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension')
        parser.add_argument('--decoder-layers', type=str, metavar='EXPR',
                            help='decoder layers [(dim, kernel_size), ...]')
        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
                            help='decoder output embedding dimension')
        parser.add_argument('--decoder-attention', type=str, metavar='EXPR',
                            help='decoder attention [True, ...]')
        parser.add_argument('--self-attention', type=str, metavar='EXPR',
                            help='decoder self-attention layers, ex: [True] + [False]*5')
        parser.add_argument('--multihead-attention-nheads', type=int,
                            help='Number of heads to use in attention')
        parser.add_argument('--multihead-self-attention-nheads', type=int,
                            help='Number of heads to use in self-attention')
        parser.add_argument('--encoder-attention', type=str, metavar='EXPR',
                            help='encoder attention [True, ...]')
        parser.add_argument('--encoder-attention-nheads', type=int,
                            help='Number of heads to use in encoder attention')
        parser.add_argument('--project-input', type=str, metavar='EXPR',
                            help='Use projections in self-attention [True, ...]')
        parser.add_argument('--gated-attention', type=str, metavar='EXPR',
                            help='Use GLU layers in self-attention projections [True, ...]')
        parser.add_argument('--downsample', type=str, metavar='EXPR',
                            help='Use downsampling in self-attention [True, ...]')
        parser.add_argument('--pretrained-checkpoint', metavar='DIR',
                            help='path to load checkpoint from pretrained model')
        parser.add_argument('--pretrained', type=str, metavar='EXPR',
                            help='use pretrained model when training [True, ...]')
        # fmt: on
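
    # Arguments whose metavar is EXPR are Python expressions rather than plain
    # values, e.g. --encoder-layers '[(512, 3)] * 3' or
    # --self-attention '[True] + [False]*5'; build_model() below evaluates them
    # with eval() before constructing the encoder and decoder.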

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        trained_encoder, trained_decoder = None, None
        pretrained = eval(args.pretrained)
        if pretrained:
            logger.info("loading pretrained model")
            if not os.path.exists(args.pretrained_checkpoint):
                new_pretrained_checkpoint = os.path.join(
                    args.data, args.pretrained_checkpoint
                )
                if os.path.exists(new_pretrained_checkpoint):
                    args.pretrained_checkpoint = new_pretrained_checkpoint
            trained_model = checkpoint_utils.load_model_ensemble(
                filenames=[args.pretrained_checkpoint],
                task=task,
            )[0][0]
            trained_decoder = list(trained_model.children())[1]
            trained_encoder = list(trained_model.children())[0]

            # freeze pretrained model
            for param in trained_decoder.parameters():
                param.requires_grad = False
            for param in trained_encoder.parameters():
                param.requires_grad = False

        encoder = FConvEncoder(
            task.source_dictionary,
            embed_dim=args.encoder_embed_dim,
            convolutions=eval(args.encoder_layers),
            dropout=args.dropout,
            max_positions=args.max_source_positions,
            attention=eval(args.encoder_attention),
            attention_nheads=args.encoder_attention_nheads,
        )

        decoder = FConvDecoder(
            task.target_dictionary,
            embed_dim=args.decoder_embed_dim,
            convolutions=eval(args.decoder_layers),
            out_embed_dim=args.decoder_out_embed_dim,
            attention=eval(args.decoder_attention),
            dropout=args.dropout,
            max_positions=args.max_target_positions,
            selfattention=eval(args.self_attention),
            attention_nheads=args.multihead_attention_nheads,
            selfattention_nheads=args.multihead_self_attention_nheads,
            project_input=eval(args.project_input),
            gated_attention=eval(args.gated_attention),
            downsample=eval(args.downsample),
            pretrained=pretrained,
            trained_decoder=trained_decoder,
        )
        model = FConvModelSelfAtt(encoder, decoder, trained_encoder)

        return model

    @property
    def pretrained(self):
        return self.pretrained_encoder is not None


class FConvEncoder(FairseqEncoder):
    """Convolutional encoder"""

    def __init__(
        self,
        dictionary,
        embed_dim=512,
        max_positions=1024,
        convolutions=((512, 3),) * 20,
        dropout=0.1,
        attention=False,
        attention_nheads=1,
    ):
        super().__init__(dictionary)
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__
        )
        self.num_attention_layers = None

        num_embeddings = len(dictionary)
        self.padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
        self.embed_positions = PositionalEmbedding(
            max_positions,
            embed_dim,
            self.padding_idx,
        )

        def expand_bool_array(val):
            if isinstance(val, bool):
                # expand True into [True, True, ...] and do the same with False
                return [val] * len(convolutions)
            return val

        attention = expand_bool_array(attention)

        in_channels = convolutions[0][0]
        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.attention = nn.ModuleList()
        self.attproj = nn.ModuleList()
        for i, (out_channels, kernel_size) in enumerate(convolutions):
            self.projections.append(
                Linear(in_channels, out_channels)
                if in_channels != out_channels
                else None
            )
            self.convolutions.append(
                ConvTBC(in_channels, out_channels * 2, kernel_size, dropout=dropout)
            )

            self.attention.append(
                SelfAttention(out_channels, embed_dim, attention_nheads)
                if attention[i]
                else None
            )
            in_channels = out_channels

        self.fc2 = Linear(in_channels, embed_dim)

    def forward(self, src_tokens, src_lengths):
        # embed tokens and positions
        x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
        x = self.dropout_module(x)
        input_embedding = x.transpose(0, 1)

        # project to size of convolution
        x = self.fc1(x)

        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
        if not encoder_padding_mask.any():
            encoder_padding_mask = None

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # temporal convolutions
        for proj, conv, attention in zip(
            self.projections, self.convolutions, self.attention
        ):
            residual = x if proj is None else proj(x)

            if encoder_padding_mask is not None:
                x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

            x = self.dropout_module(x)
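            # pad so the convolution output keeps the input length:
            # (k - 1) // 2 on the left and k // 2 on the right sum to
            # kernel_size - 1 for both odd and even kernels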
            padding_l = (conv.kernel_size[0] - 1) // 2
            padding_r = conv.kernel_size[0] // 2
            x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
            x = conv(x)
            x = F.glu(x, dim=2)
            if attention is not None:
                x = attention(x)
            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        if encoder_padding_mask is not None:
            encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding.transpose(0, 1)) * math.sqrt(0.5)
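
        # x and y follow the ConvS2S convention: the decoder uses x as the
        # attention keys and y (encoder output combined with the input
        # embedding) as the attention values; see FConvDecoder._split_encoder_out
        # and the attention call in FConvDecoder.forward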
        return {
            "encoder_out": (x, y),
            "encoder_padding_mask": encoder_padding_mask,  # B x T
        }

    def reorder_encoder_out(self, encoder_out, new_order):
        encoder_out["encoder_out"] = tuple(
            eo.index_select(0, new_order) for eo in encoder_out["encoder_out"]
        )

        if encoder_out["encoder_padding_mask"] is not None:
            encoder_out["encoder_padding_mask"] = encoder_out[
                "encoder_padding_mask"
            ].index_select(0, new_order)

        if "pretrained" in encoder_out:
            encoder_out["pretrained"]["encoder_out"] = tuple(
                eo.index_select(0, new_order)
                for eo in encoder_out["pretrained"]["encoder_out"]
            )

        return encoder_out

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        return self.embed_positions.max_positions


@with_incremental_state
class FConvDecoder(FairseqDecoder):
    """Convolutional decoder"""

    def __init__(
        self,
        dictionary,
        embed_dim=512,
        out_embed_dim=256,
        max_positions=1024,
        convolutions=((512, 3),) * 8,
        attention=True,
        dropout=0.1,
        selfattention=False,
        attention_nheads=1,
        selfattention_nheads=1,
        project_input=False,
        gated_attention=False,
        downsample=False,
        pretrained=False,
        trained_decoder=None,
    ):
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([2]))
        self.pretrained = pretrained
        self.pretrained_decoder = trained_decoder
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__
        )
        self.need_attn = True
        in_channels = convolutions[0][0]

        def expand_bool_array(val):
            if isinstance(val, bool):
                # expand True into [True, True, ...] and do the same with False
                return [val] * len(convolutions)
            return val

        attention = expand_bool_array(attention)
        selfattention = expand_bool_array(selfattention)

        if not isinstance(attention, list) or len(attention) != len(convolutions):
            raise ValueError(
                "Attention is expected to be a list of booleans of "
                "length equal to the number of layers."
            )

        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)

        self.embed_positions = PositionalEmbedding(
            max_positions,
            embed_dim,
            padding_idx,
        )

        self.fc1 = Linear(embed_dim, in_channels, dropout=dropout)
        self.projections = nn.ModuleList()
        self.convolutions = nn.ModuleList()
        self.attention = nn.ModuleList()
        self.selfattention = nn.ModuleList()
        self.attproj = nn.ModuleList()
        for i, (out_channels, kernel_size) in enumerate(convolutions):
            self.projections.append(
                Linear(in_channels, out_channels)
                if in_channels != out_channels
                else None
            )
            self.convolutions.append(
                LinearizedConv1d(
                    in_channels,
                    out_channels * 2,
                    kernel_size,
                    padding=(kernel_size - 1),
                    dropout=dropout,
                )
            )

            self.attention.append(
                DownsampledMultiHeadAttention(
                    out_channels,
                    embed_dim,
                    attention_nheads,
                    project_input=project_input,
                    gated=False,
                    downsample=False,
                )
                if attention[i]
                else None
            )

            self.attproj.append(
                Linear(out_channels, embed_dim, dropout=dropout)
                if attention[i]
                else None
            )
            self.selfattention.append(
                SelfAttention(
                    out_channels,
                    embed_dim,
                    selfattention_nheads,
                    project_input=project_input,
                    gated=gated_attention,
                    downsample=downsample,
                )
                if selfattention[i]
                else None
            )
            in_channels = out_channels

        self.fc2 = Linear(in_channels, out_embed_dim)
        self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout)

        # model fusion
        if self.pretrained:
            # independent gates are learned from the concatenated input
            self.gate1 = nn.Sequential(
                Linear(out_embed_dim * 2, out_embed_dim), nn.Sigmoid()
            )
            self.gate2 = nn.Sequential(
                Linear(out_embed_dim * 2, out_embed_dim), nn.Sigmoid()
            )
            # pretrained and trained models are joined
            self.joining = nn.Sequential(
                Linear(out_embed_dim * 2, out_embed_dim * 2),
                LayerNorm(out_embed_dim * 2),
                nn.GLU(),
                Linear(out_embed_dim, out_embed_dim * 2),
                LayerNorm(out_embed_dim * 2),
                nn.GLU(),
                Linear(out_embed_dim, out_embed_dim),
                LayerNorm(out_embed_dim),
            )
            # pretrained model contains an output layer that is nhid -> vocab size
            # but the models are combined in their hidden state
            # the hook stores the output of the pretrained model forward
            self.pretrained_outputs = {}

            def save_output():
                def hook(a, b, output):
                    self.pretrained_outputs["out"] = output

                return hook

            self.pretrained_decoder.fc2.register_forward_hook(save_output())

    def forward(self, prev_output_tokens, encoder_out):
        trained_encoder_out = encoder_out["pretrained"] if self.pretrained else None
        encoder_out = encoder_out["encoder"]["encoder_out"]

        encoder_a, encoder_b = self._split_encoder_out(encoder_out)

        # embed positions
        positions = self.embed_positions(prev_output_tokens)

        # embed tokens and positions
        x = self.embed_tokens(prev_output_tokens) + positions
        x = self.dropout_module(x)
        target_embedding = x.transpose(0, 1)

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # temporal convolutions
        avg_attn_scores = None
        for proj, conv, attention, selfattention, attproj in zip(
            self.projections,
            self.convolutions,
            self.attention,
            self.selfattention,
            self.attproj,
        ):
            residual = x if proj is None else proj(x)

            x = self.dropout_module(x)
            x = conv(x)
            x = F.glu(x, dim=2)

            # attention
            if attention is not None:
                r = x
                x, attn_scores = attention(
                    attproj(x) + target_embedding, encoder_a, encoder_b
                )
                x = x + r
                if not self.training and self.need_attn:
                    if avg_attn_scores is None:
                        avg_attn_scores = attn_scores
                    else:
                        avg_attn_scores.add_(attn_scores)

            if selfattention is not None:
                x = selfattention(x)

            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        # project back to size of vocabulary
        x = self.fc2(x)
        x = self.dropout_module(x)
        if not self.pretrained:
            x = self.fc3(x)

        # fusion gating
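        # The frozen pretrained decoder is run on the same inputs; the forward
        # hook registered in __init__ stores its fc2 hidden state in
        # self.pretrained_outputs["out"]. Sigmoid gates computed from the
        # concatenation of both hidden states scale each branch, and the gated
        # states are joined and projected to the vocabulary with fc3.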
        if self.pretrained:
            trained_x, _ = self.pretrained_decoder.forward(
                prev_output_tokens, trained_encoder_out
            )
            y = torch.cat([x, self.pretrained_outputs["out"]], dim=-1)
            gate1 = self.gate1(y)
            gate2 = self.gate2(y)
            gated_x1 = gate1 * x
            gated_x2 = gate2 * self.pretrained_outputs["out"]
            fusion = torch.cat([gated_x1, gated_x2], dim=-1)
            fusion = self.joining(fusion)
            fusion_output = self.fc3(fusion)
            return fusion_output, avg_attn_scores
        else:
            return x, avg_attn_scores

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        return self.embed_positions.max_positions

    def make_generation_fast_(self, need_attn=False, **kwargs):
        self.need_attn = need_attn

    def _split_encoder_out(self, encoder_out):
        """Split and transpose encoder outputs."""
        # transpose only once to speed up attention layers
        encoder_a, encoder_b = encoder_out
        encoder_a = encoder_a.transpose(0, 1).contiguous()
        encoder_b = encoder_b.transpose(0, 1).contiguous()
        result = (encoder_a, encoder_b)
        return result


class SelfAttention(nn.Module):
    def __init__(
        self,
        out_channels,
        embed_dim,
        num_heads,
        project_input=False,
        gated=False,
        downsample=False,
    ):
        super().__init__()
        self.attention = DownsampledMultiHeadAttention(
            out_channels,
            embed_dim,
            num_heads,
            dropout=0,
            bias=True,
            project_input=project_input,
            gated=gated,
            downsample=downsample,
        )
        self.in_proj_q = Linear(out_channels, embed_dim)
        self.in_proj_k = Linear(out_channels, embed_dim)
        self.in_proj_v = Linear(out_channels, embed_dim)
        self.ln = LayerNorm(out_channels)

    def forward(self, x):
        residual = x
        query = self.in_proj_q(x)
        key = self.in_proj_k(x)
        value = self.in_proj_v(x)
        x, _ = self.attention(
            query, key, value, mask_future_timesteps=True, use_scalar_bias=True
        )
        return self.ln(x + residual)


def Embedding(num_embeddings, embedding_dim, padding_idx):
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    m.weight.data.normal_(0, 0.1)
    return m


def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx):
    m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx)
    m.weight.data.normal_(0, 0.1)
    return m


def Linear(in_features, out_features, dropout=0.0):
    """Weight-normalized Linear layer (input: N x T x C)"""
    m = nn.Linear(in_features, out_features)
    m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features))
    m.bias.data.zero_()
    return m


def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0.0, **kwargs):
    """Weight-normalized Conv1d layer optimized for decoding"""
    m = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs)
    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
    m.weight.data.normal_(mean=0, std=std)
    m.bias.data.zero_()
    return m


def ConvTBC(in_channels, out_channels, kernel_size, dropout=0.0, **kwargs):
    """Weight-normalized Conv1d layer"""
    from fairseq.modules import ConvTBC

    m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs)
    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
    m.weight.data.normal_(mean=0, std=std)
    m.bias.data.zero_()
    return m
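

# The std used above, sqrt(4 * (1 - dropout) / (kernel_size * in_channels)),
# matches the initialization described in the convolutional seq2seq paper
# (Gehring et al., 2017): the variance is scaled so activations keep a roughly
# constant magnitude through dropout and the GLU, which halves the channels.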


@register_model_architecture("fconv_self_att", "fconv_self_att")
def base_architecture(args):
    args.dropout = getattr(args, "dropout", 0.1)
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
    args.encoder_layers = getattr(args, "encoder_layers", "[(512, 3)] * 3")
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
    args.decoder_layers = getattr(args, "decoder_layers", "[(512, 3)] * 8")
    args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 256)
    args.decoder_attention = getattr(args, "decoder_attention", "True")
    args.self_attention = getattr(args, "self_attention", "False")
    args.encoder_attention = getattr(args, "encoder_attention", "False")
    args.multihead_attention_nheads = getattr(args, "multihead_attention_nheads", 1)
    args.multihead_self_attention_nheads = getattr(
        args, "multihead_self_attention_nheads", 1
    )
    args.encoder_attention_nheads = getattr(args, "encoder_attention_nheads", 1)
    args.project_input = getattr(args, "project_input", "False")
    args.gated_attention = getattr(args, "gated_attention", "False")
    args.downsample = getattr(args, "downsample", "False")
    args.pretrained_checkpoint = getattr(args, "pretrained_checkpoint", "")
    args.pretrained = getattr(args, "pretrained", "False")


@register_model_architecture("fconv_self_att", "fconv_self_att_wp")
def fconv_self_att_wp(args):
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256)
    args.encoder_layers = getattr(
        args, "encoder_layers", "[(128, 3)] * 2 + [(512,3)] * 1"
    )
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 256)
    args.decoder_layers = getattr(
        args, "decoder_layers", "[(512, 4)] * 4 + [(768, 4)] * 2 + [(1024, 4)] * 1"
    )
    args.decoder_out_embed_dim = getattr(args, "decoder_out_embed_dim", 256)
    args.self_attention = getattr(args, "self_attention", "True")
    args.multihead_self_attention_nheads = getattr(
        args, "multihead_self_attention_nheads", 4
    )
    args.project_input = getattr(args, "project_input", "True")
    args.gated_attention = getattr(args, "gated_attention", "True")
    args.downsample = getattr(args, "downsample", "True")
    base_architecture(args)
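

# Training sketch (based on the fairseq stories example; exact flags and data
# paths are assumptions and may differ between fairseq versions):
#
#   fairseq-train data-bin/writingPrompts \
#       --arch fconv_self_att_wp --optimizer nag --lr 0.25 --clip-norm 0.07 \
#       --max-tokens 1500 --criterion label_smoothed_cross_entropy \
#       --self-attention True --gated-attention True --project-input True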