mirror of
https://github.com/ostris/ai-toolkit.git
synced 2026-05-11 16:30:40 +00:00
* Base ace step 1.5 xl added. Generating, still wip on training and ui * Base training code done * Fix some issues with caching text embeddings. Update sample cards to show audio * Fix issue with quantizing ace step * Add album artwork to samples with waveform. * Cleanup logs * Add album art endpoint to speed up album art loading * Made a make-video-with-artwork script * Make ui handle basic audio models. Make multi line adjustments to the editor and better syntax highlighting. * Add prompt tagging system for special tagged models. * prompt tagging processing for ui working. * Moved default samples to a special file so we can add more when needed and they can be adjusted for a specific model * Add a captioner job with music captioner that is prepped for use with the ui * Add basic ui setup for captioning modal and handling captioning jobs * Starting captioning job from ui working. Still needs better management for it. * Better filtering of job options in the job view for captioning jobs * Added qwen3 vl as a captioner for images * Have an indicator when a dataset is being captioned. * Adjust the way caption jobs look in the queue * Fix a few issues. Adjust defaults. * Version bump * Added ace step to the readme.
150 lines
4.4 KiB
Python
150 lines
4.4 KiB
Python
import os
import subprocess

import av
import numpy as np
from PIL import Image, ImageDraw
|
|
|
|
|
|
# Directory that contains this script; bundled assets are resolved against it.
ARTWORK_DIR = os.path.dirname(os.path.abspath(__file__))

# Logo image used as the artwork background.
BACKGROUND_PATH = os.path.join(ARTWORK_DIR, "ostris_logo.jpg")

# RGBA fill for the waveform bars: #fbbf24 with alpha 230/255 (~90% opacity).
WAVEFORM_COLOR = (251, 191, 36, 230)

# Default edge length, in pixels, of the square artwork.
ARTWORK_SIZE = 1024
|
|
|
|
|
|
def load_waveform(audio_path: str, num_samples: int = 512) -> np.ndarray:
    """Load audio with PyAV and return a normalized peak envelope.

    The first audio stream of the file is decoded, each decoded frame is
    averaged over its first axis, and the resulting sample sequence is split
    into ``num_samples`` equal-width bins. Each bin is summarised by its
    maximum absolute sample value.

    Args:
        audio_path: Path to an audio file that PyAV can open.
        num_samples: Number of envelope bins to return. Must be positive.

    Returns:
        A 1-D array of length ``num_samples``. When the audio is not silent
        the values are divided by the largest bin so the peak equals 1.
        A stream that decodes to no samples yields an all-zero envelope.

    Raises:
        ValueError: If ``num_samples`` is not positive.
    """
    if num_samples <= 0:
        raise ValueError(f"num_samples must be positive, got {num_samples}")

    chunks = []
    # The context manager closes the container even when decoding raises.
    with av.open(audio_path) as container:
        stream = container.streams.audio[0]
        stream.codec_context.thread_type = "AUTO"

        for frame in container.decode(stream):
            arr = frame.to_ndarray()
            # Mix down to mono by averaging over the first axis.
            # NOTE(review): this assumes the first axis indexes channels,
            # which presumably holds for planar sample formats - confirm for
            # packed formats.
            if arr.ndim > 1:
                arr = arr.mean(axis=0)
            chunks.append(arr)

    # np.concatenate raises on an empty list, so handle "no frames" first.
    if not chunks:
        return np.zeros(num_samples)

    audio = np.concatenate(chunks)
    total = len(audio)
    if total == 0:
        return np.zeros(num_samples)

    if total >= num_samples:
        # Downsample: peak absolute value per bin. Samples past the last
        # full bin are dropped.
        bin_size = total // num_samples
        bins = audio[: bin_size * num_samples].reshape(num_samples, bin_size)
        envelope = np.max(np.abs(bins), axis=1)
    else:
        # Fewer samples than bins: a reshape to (num_samples, 1) would fail,
        # so stretch the clip by repeating the nearest source sample.
        idx = np.arange(num_samples) * total // num_samples
        envelope = np.abs(audio[idx])

    # Normalize to 0-1; silent audio is left as all zeros.
    peak = envelope.max()
    if peak > 0:
        envelope = envelope / peak
    return envelope
|
|
|
|
|
|
def create_artwork(waveform: np.ndarray, size: int = ARTWORK_SIZE) -> Image.Image:
    """Create square album artwork: logo background with a waveform overlay.

    The image at ``BACKGROUND_PATH`` is resized to ``size`` x ``size`` and one
    vertical bar per waveform entry is drawn on top, mirrored around the
    horizontal centre line and filled with ``WAVEFORM_COLOR``.

    Args:
        waveform: Sequence of bar amplitudes. A value of 1 produces a bar
            reaching 85% of the half-height above and below the centre line.
            Values are assumed to be non-negative.
        size: Edge length of the output image in pixels.

    Returns:
        The composited artwork as an RGB image. An empty ``waveform`` returns
        the background with no bars.
    """
    # Image.open keeps the file handle open; convert() produces a loaded
    # copy, so the handle can be released as soon as the block exits.
    with Image.open(BACKGROUND_PATH) as logo:
        bg = logo.convert("RGBA").resize((size, size), Image.LANCZOS)

    num_bars = len(waveform)
    if num_bars == 0:
        # Nothing to draw, and the bar width below would divide by zero.
        return bg.convert("RGB")

    # Draw the waveform on a separate transparent layer so the bars' alpha is
    # blended with the background by alpha_composite.
    wave_overlay = Image.new("RGBA", (size, size), (0, 0, 0, 0))
    draw = ImageDraw.Draw(wave_overlay)

    padding = int(size * 0.02)
    draw_w = size - 2 * padding
    # Not clamped to >= 1: clamping made dense waveforms run past the right
    # padding. Sub-pixel widths are handled per bar below.
    bar_width = draw_w / num_bars
    center_y = size // 2
    max_amp = (size // 2) * 0.85  # leave a little margin

    for i, amp in enumerate(waveform):
        x = padding + i * bar_width
        h = amp * max_amp
        # Keep the right edge at or after the left edge so bars narrower than
        # one pixel still form a valid rectangle.
        x_end = max(x, x + bar_width - 1)
        draw.rectangle(
            [x, center_y - h, x_end, center_y + h],
            fill=WAVEFORM_COLOR,
        )

    return Image.alpha_composite(bg, wave_overlay).convert("RGB")
|
|
|
|
|
|
def make_video(song_path: str, video_size: int = 512) -> str:
    """Create an MP4 that shows static album artwork for the audio's duration.

    The artwork is rendered from the song's waveform, encoded as a 1 fps
    H.264 video with PyAV, and then combined with the original audio
    (re-encoded to AAC) by the ``ffmpeg`` executable, which must be on PATH.

    Args:
        song_path: Path to the input audio file.
        video_size: Width and height of the square output video in pixels.

    Returns:
        Path of the created video: ``song_path`` with its extension replaced
        by ``.mp4``.

    Raises:
        FileNotFoundError: If ``song_path`` does not exist.
        ValueError: If the output path would be the input file itself, or if
            the audio duration cannot be determined.
        subprocess.CalledProcessError: If ffmpeg exits with an error; its
            captured output is available on the exception's ``stderr``.
    """
    if not os.path.isfile(song_path):
        raise FileNotFoundError(f"Audio file not found: {song_path}")

    # Output path: same name as the input but .mp4, in the same directory.
    base, _ = os.path.splitext(song_path)
    output_path = base + ".mp4"
    final_path = base + "_final.mp4"

    # Writing the video would truncate the input before ffmpeg can read its
    # audio, so refuse inputs that already carry the output name.
    if os.path.normcase(os.path.abspath(output_path)) == os.path.normcase(
        os.path.abspath(song_path)
    ):
        raise ValueError(
            f"Output path would overwrite the input file: {song_path}"
        )

    waveform = load_waveform(song_path)
    artwork = create_artwork(waveform)
    artwork = artwork.resize((video_size, video_size), Image.LANCZOS)

    # Get the audio duration in seconds.
    with av.open(song_path) as container:
        if container.duration is not None:
            duration = float(container.duration) / av.time_base
        else:
            # Some containers do not report a duration; fall back to the
            # audio stream's own duration expressed in its time base.
            audio_stream = container.streams.audio[0]
            if audio_stream.duration is None or audio_stream.time_base is None:
                raise ValueError(
                    f"Could not determine audio duration: {song_path}"
                )
            duration = float(audio_stream.duration * audio_stream.time_base)

    fps = 1  # static image, 1 fps is enough
    # Round up so the video is never shorter than the audio; "-shortest"
    # below would otherwise cut off the final fraction of a second of sound.
    total_frames = max(1, int(np.ceil(duration * fps)))

    # Convert the artwork to a numpy array for video encoding.
    frame_data = np.array(artwork)

    try:
        # Step 1: encode the silent video to the final output location.
        with av.open(output_path, mode="w") as out_container:
            video_stream = out_container.add_stream("libx264", rate=fps)
            video_stream.width = video_size
            video_stream.height = video_size
            video_stream.pix_fmt = "yuv420p"

            for _ in range(total_frames):
                frame = av.VideoFrame.from_ndarray(frame_data, format="rgb24")
                for packet in video_stream.encode(frame):
                    out_container.mux(packet)

            # Flush frames still buffered in the encoder.
            for packet in video_stream.encode():
                out_container.mux(packet)

        # Step 2: mux the audio into the video with ffmpeg.
        subprocess.run(
            [
                "ffmpeg", "-y",
                "-i", output_path,
                "-i", song_path,
                # Select streams explicitly so cover art embedded in the
                # song is never picked as the video track.
                "-map", "0:v:0",
                "-map", "1:a:0",
                "-c:v", "copy",
                "-c:a", "aac",
                "-shortest",
                final_path,
            ],
            check=True,
            capture_output=True,
        )

        # Replace the silent video with the final muxed version.
        os.replace(final_path, output_path)
    except Exception:
        # Do not leave a silent or partial video behind on failure.
        for leftover in (final_path, output_path):
            try:
                if os.path.exists(leftover):
                    os.remove(leftover)
            except OSError:
                pass  # best-effort cleanup; surface the original error
        raise

    return output_path
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: takes one audio file and reports where the
    # resulting video was written.
    import argparse

    cli = argparse.ArgumentParser(
        description="Create an MP4 video with album artwork from an audio file"
    )
    cli.add_argument("audio", help="Path to the audio file")
    audio_file = cli.parse_args().audio

    video_path = make_video(audio_file)
    print(f"Created video: {video_path}")
|