Files
SillyTavern-extras/talkinghead/tha3/app/util.py
Juha Jeronen a794ad9dde Fix eye_unimpressed (arity 2; had only one JSON key)
- The "eye_unimpressed" morph had just one key in the emotion JSON, although the
  model has two morphs (left and right) for this. Now it has two, as it should.
- This change breaks backward compatibility for old emotion JSON files.
  - OTOH, not much of an issue, because in all versions prior to this one
    being developed, the emotion JSON system was underutilized (only a bunch
    of pre-made presets, only used internally by the live plugin).
  - Thus it is important to fix this now, before the next release, because the
    improved manual poser makes it easy to generate new emotion JSON files,
    so from the next release on we can assume those to exist in the wild.
2024-01-09 10:44:49 +02:00

220 lines
10 KiB
Python

"""App-level utilities."""
# Public API of this module: the names exported by `from ... import *`.
# Grouped as: posedict key tables, emotion preset loading, pose <-> posedict
# conversion, model installation, image helpers, and the running-average helper.
__all__ = ["posedict_keys", "posedict_key_to_index",
           "load_emotion_presets",
           "posedict_to_pose", "pose_to_posedict",
           "maybe_install_models",
           "torch_image_to_numpy", "to_talkinghead_image",
           "RunningAverage"]
import json
import logging
import os
import shutil
from typing import Dict, List, Tuple

import numpy as np
import PIL
import PIL.Image
import torch

from tha3.util import rgba_to_numpy_image, rgb_to_numpy_image, grid_change_to_numpy_image
# NOTE(review): `basicConfig` runs at import time, so merely importing this module
# configures the root logger (at INFO level, unless handlers were already installed).
# Presumably intentional for the standalone apps - confirm before reusing this module as a library.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# The keys for a pose in the emotion JSON files.
#
# The position of a key in this list is the index of the corresponding morph
# in a flat pose list, so the order here is significant.
posedict_keys = [
    # Eyebrows (left, right)
    "eyebrow_troubled_left_index", "eyebrow_troubled_right_index",
    "eyebrow_angry_left_index", "eyebrow_angry_right_index",
    "eyebrow_lowered_left_index", "eyebrow_lowered_right_index",
    "eyebrow_raised_left_index", "eyebrow_raised_right_index",
    "eyebrow_happy_left_index", "eyebrow_happy_right_index",
    "eyebrow_serious_left_index", "eyebrow_serious_right_index",
    # Eyes (left, right)
    "eye_wink_left_index", "eye_wink_right_index",
    "eye_happy_wink_left_index", "eye_happy_wink_right_index",
    "eye_surprised_left_index", "eye_surprised_right_index",
    "eye_relaxed_left_index", "eye_relaxed_right_index",
    "eye_unimpressed_left_index", "eye_unimpressed_right_index",
    "eye_raised_lower_eyelid_left_index", "eye_raised_lower_eyelid_right_index",
    # Iris size (left, right)
    "iris_small_left_index", "iris_small_right_index",
    # Mouth
    "mouth_aaa_index",
    "mouth_iii_index",
    "mouth_uuu_index",
    "mouth_eee_index",
    "mouth_ooo_index",
    "mouth_delta",
    "mouth_lowered_corner_left_index", "mouth_lowered_corner_right_index",
    "mouth_raised_corner_left_index", "mouth_raised_corner_right_index",
    "mouth_smirk",
    # Iris rotation, head, neck, body
    "iris_rotation_x_index", "iris_rotation_y_index",
    "head_x_index", "head_y_index",
    "neck_z_index",
    "body_y_index", "body_z_index",
    # Breathing
    "breathing_index",
]
assert len(posedict_keys) == 45

# `posedict_keys` maps index -> key; this is the inverse mapping, key -> index.
posedict_key_to_index = dict(zip(posedict_keys, range(len(posedict_keys))))
def load_emotion_presets(directory: str) -> Tuple[Dict[str, Dict[str, float]], List[str]]:
    """Load emotion presets from disk.

    Returns the tuple `(emotions, emotion_names)`, where::

        emotions = {emotion0_name: posedict0, ...}
        emotion_names = [emotion0_name, emotion1_name, ...]

    The dict contains the actual pose data. The list holds the keys of `emotions`, in order:
    first the special entries "[custom]" and "[reset]", then the actual emotion names, sorted.
    It can be used to map a linear index (e.g. the choice index in a GUI dropdown)
    to the corresponding key of `emotions`.

    `directory`: Path to the emotion presets (e.g. "talkinghead/emotions"). Each "<name>.json"
                 directly inside it provides the emotion `<name>`. Subdirectories are not scanned,
                 because presets are only ever loaded from `directory` itself.

    The directory must also contain a "_defaults.json" file, containing factory defaults
    (as a fallback) for the 28 standard emotions (as recognized by distilbert), as well as
    a hidden "zero" preset that represents a neutral pose. (This is separate from the
    "neutral" emotion, which is allowed to be "non-zero".)

    Raises `FileNotFoundError` if "_defaults.json" is missing, and `KeyError` if an emotion
    is found neither in its own JSON file nor in the factory defaults.
    """
    emotion_names = set()
    for _root, _dirs, files in os.walk(directory, topdown=True):
        for filename in files:
            if filename == "_defaults.json":  # skip the repository containing the default fallbacks
                continue
            if filename.lower().endswith(".json"):
                emotion_names.add(filename[:-5])  # drop the ".json"
        # Top level only. The loader below opens files directly inside `directory`, so a JSON file
        # in a subdirectory could never be loaded; picking up its name would only cause a `KeyError`.
        break

    # Load the factory-default emotions as a fallback
    with open(os.path.join(directory, "_defaults.json"), "r", encoding="utf-8") as json_file:
        factory_default_emotions = json.load(json_file)
    # Get keys from here too, in case some emotion files are missing.
    # "zero" is not an actual emotion, but a "reset character" feature.
    emotion_names.update(key for key in factory_default_emotions if key != "zero")
    sorted_emotion_names = sorted(emotion_names)  # the actual emotions

    def load_emotion_with_fallback(emotion_name: str) -> Dict[str, float]:
        """Return the posedict for `emotion_name` from its own JSON file, else from the factory defaults."""
        try:
            with open(os.path.join(directory, f"{emotion_name}.json"), "r", encoding="utf-8") as json_file:
                emotions_from_json = json.load(json_file)  # A single json file may contain presets for multiple emotions.
            posedict = emotions_from_json[emotion_name]
        except (FileNotFoundError, KeyError):
            # No separate json exists for the specified emotion (or it lacks this emotion): use the factory default.
            # If still not found, it's an error, so fail-fast: let the `KeyError` propagate.
            posedict = factory_default_emotions[emotion_name]
        return posedict

    # Dict keeps its keys in insertion order, so define some special states before inserting the actual emotions.
    emotions = {"[custom]": {},  # custom = the user has changed at least one value manually after last loading a preset
                "[reset]": load_emotion_with_fallback("zero")}  # reset = a preset with all sliders in their default positions. Found in "_defaults.json".
    for emotion_name in sorted_emotion_names:
        emotions[emotion_name] = load_emotion_with_fallback(emotion_name)

    return emotions, list(emotions.keys())
def posedict_to_pose(posedict: Dict[str, float]) -> List[float]:
    """Flatten a posedict (from an emotion JSON) into a list of morph values, ordered like `posedict_keys`.

    Keys missing from `posedict` are treated as 0.0, so zero values can simply be omitted.
    Keys that are not in `posedict_keys` are ignored, with a logged warning.
    """
    # Sanity check: report anything we do not know how to map.
    unrecognized_keys = set(posedict).difference(posedict_keys)
    if unrecognized_keys:
        logger.warning(f"posedict_to_pose: ignoring unrecognized keys in posedict: {unrecognized_keys}")

    return [posedict.get(name, 0.0) for name in posedict_keys]
def pose_to_posedict(pose: List[float]) -> Dict[str, float]:
    """Pair each value of `pose` with its key from `posedict_keys`, for saving into an emotion JSON.

    Pairing is positional and stops at the shorter of the two sequences.
    """
    return {name: value for name, value in zip(posedict_keys, pose)}
# --------------------------------------------------------------------------------
def maybe_install_models(hf_reponame: str, modelsdir: str) -> None:
    """Download and install the posing engine (THA3) models into `modelsdir` if the directory does not exist yet. Else do nothing.

    For maximal OS compatibility, symlinks are not used.

    `hf_reponame`: HuggingFace repository to download from, e.g. "OktayAlpk/talking-head-anime-3".
    `modelsdir`: Local path (absolute or relative) to install in.

    Raises `ImportError` if `huggingface_hub` is not installed. Any error raised by the download
    itself propagates to the caller; in that case the newly created `modelsdir` is removed again,
    so that the next call retries the installation instead of mistaking a partial download for
    a completed one.
    """
    if os.path.exists(modelsdir):
        return  # already installed (or at least the directory exists); nothing to do

    # API:
    #   https://huggingface.co/docs/huggingface_hub/en/guides/download
    try:
        from huggingface_hub import snapshot_download
    except ImportError as exc:
        raise ImportError(
            "You need to install huggingface_hub to install talkinghead models automatically. "
            "See https://pypi.org/project/huggingface-hub/ for installation."
        ) from exc

    os.makedirs(modelsdir, exist_ok=True)
    print(f"THA3 models not yet installed. Installing from {hf_reponame} into {modelsdir}.")
    # Installing with symlinks would be generally better, but MS Windows support for symlinks is not optimal,
    # so for maximal compatibility we avoid them. The drawback of installing directly as plain files is that
    # if multiple programs need to download THA3, they will do so separately. But THA3 is rather rare, so in
    # practice this is unlikely to be an issue.
    try:
        snapshot_download(repo_id=hf_reponame, local_dir=modelsdir, local_dir_use_symlinks=False)
    except BaseException:
        # The existence of `modelsdir` is our only "installed" marker, so a failed or interrupted
        # download must not leave it behind. Only `modelsdir` itself is removed, not any parent
        # directories that `makedirs` may have created.
        shutil.rmtree(modelsdir, ignore_errors=True)
        raise
# --------------------------------------------------------------------------------
# TODO: move the image utils to the lower-level `tha3.util`?
def torch_image_to_numpy(image: torch.Tensor) -> np.ndarray:
    """Convert a torch image tensor into a `uint8` numpy array, scaling values by 255 and rounding.

    `image` must have at least three dimensions. The layout is detected from `image.shape`,
    checked in this order:

      - `shape[2] == 2`: treated as `[h, w, c]`, and rearranged to `[c, h, w]`.
      - `shape[0] == 4`: converted by `rgba_to_numpy_image`.
      - `shape[0] == 3`: converted by `rgb_to_numpy_image`.
      - `shape[0] == 1`: the channel is replicated to three channels and mapped as `2 * x - 1`,
                         a channel of ones is appended, and the result is converted by
                         `rgba_to_numpy_image`.
      - `shape[0] == 2`: converted by `grid_change_to_numpy_image` with `num_channels=4`.

    Raises `RuntimeError` (after logging the error) for any other channel count.
    """
    if image.shape[2] == 2:
        h, w, c = image.shape
        channels_first = torch.transpose(image.reshape(h * w, c), 0, 1).reshape(c, h, w)
        # Convert explicitly, instead of relying on numpy to coerce a torch tensor implicitly;
        # this also works for tensors that require grad or live on another device.
        numpy_image = channels_first.detach().cpu().numpy()
    elif image.shape[0] == 4:
        numpy_image = rgba_to_numpy_image(image)
    elif image.shape[0] == 3:
        numpy_image = rgb_to_numpy_image(image)
    elif image.shape[0] == 1:
        c, h, w = image.shape
        # Create the extra channel on the same device as `image`, so that `torch.cat` does not
        # fail with a device mismatch for non-CPU inputs.
        ones = torch.ones(1, h, w, device=image.device)
        alpha_image = torch.cat([image.repeat(3, 1, 1) * 2.0 - 1.0, ones], dim=0)
        numpy_image = rgba_to_numpy_image(alpha_image)
    elif image.shape[0] == 2:
        numpy_image = grid_change_to_numpy_image(image, num_channels=4)
    else:
        msg = f"torch_image_to_numpy: unsupported # image channels: {image.shape[0]}"
        logger.error(msg)
        raise RuntimeError(msg)
    numpy_image = np.uint8(np.rint(numpy_image * 255.0))
    return numpy_image
def to_talkinghead_image(image: PIL.Image.Image, new_size: Tuple[int, int] = (512, 512)) -> PIL.Image.Image:
    """Fit `image` into a new RGBA image of size `new_size`, centered.

    The input `image` is left unmodified; a new image is returned.

    With default `new_size`:

      - Step 1: Make a copy of the image, and shrink it (Lanczos), maintaining the aspect ratio,
                so that it fits within 512x512 pixels. An image that already fits is not enlarged.
      - Step 2: Create a new RGBA image of size 512x512.
      - Step 3: Paste the (possibly shrunk) image into the new image, centered.
    """
    # `thumbnail` resizes in place, so work on a copy to avoid clobbering the caller's image.
    fitted = image.copy()
    fitted.thumbnail(new_size, PIL.Image.LANCZOS)

    new_image = PIL.Image.new("RGBA", new_size)
    offset = ((new_size[0] - fitted.size[0]) // 2,
              (new_size[1] - fitted.size[1]) // 2)
    new_image.paste(fitted, offset)
    return new_image
# --------------------------------------------------------------------------------
class RunningAverage:
    """A simple running average, for things like FPS (frames per second) counters.

    Keeps the most recent `count` datapoints, and averages over them.

    `count`: Maximum number of datapoints to keep; must be at least 1. Defaults to 100.

    Attributes:
      `count`: the maximum number of datapoints kept.
      `data`: list of the datapoints currently kept, oldest first.
    """

    def __init__(self, count: int = 100):
        if count < 1:
            raise ValueError(f"RunningAverage: `count` must be at least 1, got {count}")
        self.count = count
        self.data = []

    def add_datapoint(self, data: float) -> None:
        """Record a new datapoint, discarding the oldest ones beyond the `count` most recent."""
        self.data.append(data)
        excess = len(self.data) - self.count
        if excess > 0:
            del self.data[:excess]  # drop all surplus datapoints in one go

    def average(self) -> float:
        """Return the mean of the stored datapoints, or 0.0 if there are none yet."""
        if not self.data:
            return 0.0
        return sum(self.data) / len(self.data)