mirror of
https://github.com/SillyTavern/SillyTavern-Extras.git
synced 2026-03-10 14:00:13 +00:00
307 lines
11 KiB
Python
307 lines
11 KiB
Python
# Copyright (c) Facebook, Inc. and its affiliates.
|
|
#
|
|
# This source code is licensed under the MIT license found in the
|
|
# LICENSE file in the root directory of this source tree.
|
|
|
|
from fairseq import utils
|
|
from fairseq.models import (
|
|
FairseqLanguageModel,
|
|
register_model,
|
|
register_model_architecture,
|
|
)
|
|
from fairseq.models.lightconv import Embedding, LightConvDecoder
|
|
from fairseq.modules import AdaptiveInput, CharacterTokenEmbedder
|
|
|
|
|
|
@register_model("lightconv_lm")
|
|
class LightConvLanguageModel(FairseqLanguageModel):
|
|
def __init__(self, decoder):
|
|
super().__init__(decoder)
|
|
|
|
@staticmethod
|
|
def add_args(parser):
|
|
"""Add model-specific arguments to the parser."""
|
|
parser.add_argument(
|
|
"--dropout",
|
|
default=0.1,
|
|
type=float,
|
|
metavar="D",
|
|
help="dropout probability",
|
|
)
|
|
parser.add_argument(
|
|
"--attention-dropout",
|
|
default=0.0,
|
|
type=float,
|
|
metavar="D",
|
|
help="dropout probability for attention weights",
|
|
)
|
|
parser.add_argument(
|
|
"--relu-dropout",
|
|
default=0.0,
|
|
type=float,
|
|
metavar="D",
|
|
help="dropout probability after ReLU in FFN",
|
|
)
|
|
parser.add_argument(
|
|
"--input-dropout",
|
|
type=float,
|
|
metavar="D",
|
|
help="dropout probability of the inputs",
|
|
)
|
|
parser.add_argument(
|
|
"--decoder-embed-dim",
|
|
type=int,
|
|
metavar="N",
|
|
help="decoder embedding dimension",
|
|
)
|
|
parser.add_argument(
|
|
"--decoder-output-dim",
|
|
type=int,
|
|
metavar="N",
|
|
help="decoder output dimension",
|
|
)
|
|
parser.add_argument(
|
|
"--decoder-input-dim", type=int, metavar="N", help="decoder input dimension"
|
|
)
|
|
parser.add_argument(
|
|
"--decoder-ffn-embed-dim",
|
|
type=int,
|
|
metavar="N",
|
|
help="decoder embedding dimension for FFN",
|
|
)
|
|
parser.add_argument(
|
|
"--decoder-layers", type=int, metavar="N", help="num decoder layers"
|
|
)
|
|
parser.add_argument(
|
|
"--decoder-attention-heads",
|
|
type=int,
|
|
metavar="N",
|
|
help="num decoder attention heads or LightConv/DynamicConv heads",
|
|
)
|
|
parser.add_argument(
|
|
"--decoder-normalize-before",
|
|
default=False,
|
|
action="store_true",
|
|
help="apply layernorm before each decoder block",
|
|
)
|
|
parser.add_argument(
|
|
"--adaptive-softmax-cutoff",
|
|
metavar="EXPR",
|
|
help="comma separated list of adaptive softmax cutoff points. "
|
|
"Must be used with adaptive_loss criterion",
|
|
)
|
|
parser.add_argument(
|
|
"--adaptive-softmax-dropout",
|
|
type=float,
|
|
metavar="D",
|
|
help="sets adaptive softmax dropout for the tail projections",
|
|
)
|
|
parser.add_argument(
|
|
"--adaptive-softmax-factor",
|
|
type=float,
|
|
metavar="N",
|
|
help="adaptive input factor",
|
|
)
|
|
parser.add_argument(
|
|
"--no-token-positional-embeddings",
|
|
default=False,
|
|
action="store_true",
|
|
help="if set, disables positional embeddings (outside self attention)",
|
|
)
|
|
parser.add_argument(
|
|
"--share-decoder-input-output-embed",
|
|
default=False,
|
|
action="store_true",
|
|
help="share decoder input and output embeddings",
|
|
)
|
|
parser.add_argument(
|
|
"--character-embeddings",
|
|
default=False,
|
|
action="store_true",
|
|
help="if set, uses character embedding convolutions to produce token embeddings",
|
|
)
|
|
parser.add_argument(
|
|
"--character-filters",
|
|
type=str,
|
|
metavar="LIST",
|
|
default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
|
|
help="size of character embeddings",
|
|
)
|
|
parser.add_argument(
|
|
"--character-embedding-dim",
|
|
type=int,
|
|
metavar="N",
|
|
default=4,
|
|
help="size of character embeddings",
|
|
)
|
|
parser.add_argument(
|
|
"--char-embedder-highway-layers",
|
|
type=int,
|
|
metavar="N",
|
|
default=2,
|
|
help="number of highway layers for character token embeddder",
|
|
)
|
|
parser.add_argument(
|
|
"--adaptive-input",
|
|
default=False,
|
|
action="store_true",
|
|
help="if set, uses adaptive input",
|
|
)
|
|
parser.add_argument(
|
|
"--adaptive-input-factor",
|
|
type=float,
|
|
metavar="N",
|
|
help="adaptive input factor",
|
|
)
|
|
parser.add_argument(
|
|
"--adaptive-input-cutoff",
|
|
metavar="EXPR",
|
|
help="comma separated list of adaptive input cutoff points.",
|
|
)
|
|
parser.add_argument(
|
|
"--tie-adaptive-weights",
|
|
action="store_true",
|
|
help="if set, ties the weights of adaptive softmax and adaptive input",
|
|
)
|
|
parser.add_argument(
|
|
"--tie-adaptive-proj",
|
|
action="store_true",
|
|
help="if set, ties the projection weights of adaptive softmax and adaptive input",
|
|
)
|
|
parser.add_argument(
|
|
"--decoder-learned-pos",
|
|
action="store_true",
|
|
help="use learned positional embeddings in the decoder",
|
|
)
|
|
|
|
"""LightConv and DynamicConv arguments"""
|
|
parser.add_argument(
|
|
"--decoder-kernel-size-list",
|
|
type=lambda x: utils.eval_str_list(x, int),
|
|
help='list of kernel size (default: "[3,7,15,31,31,31]")',
|
|
)
|
|
parser.add_argument(
|
|
"--decoder-glu", type=utils.eval_bool, help="glu after in proj"
|
|
)
|
|
parser.add_argument(
|
|
"--decoder-conv-type",
|
|
default="dynamic",
|
|
type=str,
|
|
choices=["dynamic", "lightweight"],
|
|
help="type of convolution",
|
|
)
|
|
parser.add_argument("--weight-softmax", default=True, type=utils.eval_bool)
|
|
parser.add_argument(
|
|
"--weight-dropout",
|
|
type=float,
|
|
metavar="D",
|
|
help="dropout probability for conv weights",
|
|
)
|
|
|
|
@classmethod
|
|
def build_model(cls, args, task):
|
|
"""Build a new model instance."""
|
|
|
|
# make sure all arguments are present in older models
|
|
base_lm_architecture(args)
|
|
|
|
if getattr(args, "max_source_positions", None) is None:
|
|
args.max_source_positions = args.tokens_per_sample
|
|
if getattr(args, "max_target_positions", None) is None:
|
|
args.max_target_positions = args.tokens_per_sample
|
|
|
|
if args.character_embeddings:
|
|
embed_tokens = CharacterTokenEmbedder(
|
|
task.dictionary,
|
|
eval(args.character_filters),
|
|
args.character_embedding_dim,
|
|
args.decoder_embed_dim,
|
|
args.char_embedder_highway_layers,
|
|
)
|
|
elif args.adaptive_input:
|
|
embed_tokens = AdaptiveInput(
|
|
len(task.dictionary),
|
|
task.dictionary.pad(),
|
|
args.decoder_input_dim,
|
|
args.adaptive_input_factor,
|
|
args.decoder_embed_dim,
|
|
utils.eval_str_list(args.adaptive_input_cutoff, type=int),
|
|
)
|
|
else:
|
|
embed_tokens = Embedding(
|
|
len(task.dictionary), args.decoder_input_dim, task.dictionary.pad()
|
|
)
|
|
|
|
if args.tie_adaptive_weights:
|
|
assert args.adaptive_input
|
|
assert args.adaptive_input_factor == args.adaptive_softmax_factor
|
|
assert (
|
|
args.adaptive_softmax_cutoff == args.adaptive_input_cutoff
|
|
), "{} != {}".format(
|
|
args.adaptive_softmax_cutoff, args.adaptive_input_cutoff
|
|
)
|
|
assert args.decoder_input_dim == args.decoder_output_dim
|
|
|
|
decoder = LightConvDecoder(
|
|
args,
|
|
task.output_dictionary,
|
|
embed_tokens,
|
|
no_encoder_attn=True,
|
|
final_norm=False,
|
|
)
|
|
return LightConvLanguageModel(decoder)
|
|
|
|
|
|
@register_model_architecture("lightconv_lm", "lightconv_lm")
|
|
def base_lm_architecture(args):
|
|
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
|
|
args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048)
|
|
args.decoder_layers = getattr(args, "decoder_layers", 6)
|
|
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
|
|
args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
|
|
args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
|
|
args.adaptive_softmax_factor = getattr(args, "adaptive_softmax_factor", 4)
|
|
args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
|
|
|
|
args.character_embeddings = getattr(args, "character_embeddings", False)
|
|
|
|
args.decoder_output_dim = getattr(
|
|
args, "decoder_output_dim", args.decoder_embed_dim
|
|
)
|
|
args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
|
|
args.decoder_conv_dim = getattr(args, "decoder_conv_dim", args.decoder_embed_dim)
|
|
|
|
# The model training is not stable without this
|
|
args.decoder_normalize_before = True
|
|
|
|
args.adaptive_input = getattr(args, "adaptive_input", False)
|
|
args.adaptive_input_factor = getattr(args, "adaptive_input_factor", 4)
|
|
args.adaptive_input_cutoff = getattr(args, "adaptive_input_cutoff", None)
|
|
|
|
args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
|
|
args.tie_adaptive_proj = getattr(args, "tie_adaptive_proj", False)
|
|
|
|
args.decoder_kernel_size_list = getattr(
|
|
args, "decoder_kernel_size_list", [3, 7, 15, 31, 31, 31]
|
|
)
|
|
if len(args.decoder_kernel_size_list) == 1:
|
|
args.decoder_kernel_size_list = (
|
|
args.decoder_kernel_size_list * args.decoder_layers
|
|
)
|
|
assert (
|
|
len(args.decoder_kernel_size_list) == args.decoder_layers
|
|
), "decoder_kernel_size_list doesn't match decoder_layers"
|
|
args.decoder_glu = getattr(args, "decoder_glu", True)
|
|
args.input_dropout = getattr(args, "input_dropout", 0.1)
|
|
args.weight_dropout = getattr(args, "weight_dropout", args.attention_dropout)
|
|
|
|
|
|
@register_model_architecture("lightconv_lm", "lightconv_lm_gbw")
|
|
def lightconv_lm_gbw(args):
|
|
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
|
|
args.dropout = getattr(args, "dropout", 0.1)
|
|
args.attention_dropout = getattr(args, "attention_dropout", 0.1)
|
|
args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
|
|
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
|
|
base_lm_architecture(args)
|