Files
ai-toolkit/toolkit/audio/make_video.py
Jaret Burkett 78cf049c29 Add support for ACE-Step 1.5 and ACE-Step 1.5 XL. Also added dataset captioning through the UI. (#785)
* Base ace step 1.5 xl added. Generating, still wip on training and ui

* Base training code done

* Fix some issues with caching text embeddings. Update sample cards to show audio

* Fix issue with quantizing ace step

* Add album artwork to samples with waveform.

* Cleanup logs

* Add album art endpoint to speed up album art loading

* Made a make-video-with-artwork script

* Make UI handle basic audio models. Make multi-line adjustments to the editor and better syntax highlighting.

* Add prompt tagging system for special tagged models.

* prompt tagging processing for ui working.

* Moved default samples to a special file so we can add more when needed and they can be adjusted for a specific model

* Add a captioner job with music captioner that is prepped for use with the ui

* Add basic UI setup for captioning modal and handling captioning jobs

* Starting a captioning job from the UI is working. Still needs better management for it.

* Better filtering of job options in the job view for captioning jobs

* Added qwen3 vl as a captioner for images

* Have an indicator when a dataset is being captioned.

* Adjust the way caption jobs look in the queue

* Fix a few issues. Adjust defaults.

* Version bump

* Added ace step to the readme.
2026-04-09 15:02:03 -06:00

150 lines
4.4 KiB
Python

import os
import numpy as np
import av
from PIL import Image, ImageDraw
# The background logo is looked up in the same directory as this module.
ARTWORK_DIR = os.path.dirname(os.path.abspath(__file__))
BACKGROUND_PATH = os.path.join(ARTWORK_DIR, "ostris_logo.jpg")

# RGBA colour of the waveform bars: #fbbf24 with alpha 230/255 (~90% opacity).
WAVEFORM_COLOR = (251, 191, 36, 230)

# Default side length, in pixels, of the square artwork image.
ARTWORK_SIZE = 1024
def load_waveform(audio_path: str, num_samples: int = 512) -> np.ndarray:
    """Load audio and return a downsampled waveform envelope using PyAV.

    The first audio stream of ``audio_path`` is decoded, every decoded frame
    is averaged over axis 0 of its sample array, and the resulting signal is
    reduced to ``num_samples`` bins, each holding the largest absolute sample
    value inside it. The envelope is then scaled so its largest value is 1.0;
    an all-zero signal is returned unscaled.

    Args:
        audio_path: Path to an audio file that PyAV can open.
        num_samples: Number of envelope bins to produce; must be at least 1.

    Returns:
        A 1-D array of length ``num_samples``. When the audio holds fewer
        samples than ``num_samples`` the trailing bins are zero.

    Raises:
        ValueError: If ``num_samples`` is less than 1, or if no audio samples
            could be decoded from the file.
    """
    if num_samples < 1:
        raise ValueError(f"num_samples must be at least 1, got {num_samples}")

    frames = []
    # The context manager closes the container even when decoding raises.
    with av.open(audio_path) as container:
        stream = container.streams.audio[0]
        stream.codec_context.thread_type = "AUTO"
        for frame in container.decode(stream):
            arr = frame.to_ndarray()
            # Mix down to mono by averaging over axis 0.
            # NOTE(review): presumably axis 0 is the channel axis for planar
            # sample formats, while packed formats arrive as one interleaved
            # row -- confirm against the PyAV AudioFrame documentation.
            if arr.ndim > 1:
                arr = arr.mean(axis=0)
            frames.append(arr)

    audio = np.concatenate(frames) if frames else np.empty(0)
    if audio.size == 0:
        raise ValueError(f"No audio samples decoded from: {audio_path}")

    # Work in floating point so np.abs cannot overflow on the most negative
    # value of an integer sample type.
    if not np.issubdtype(audio.dtype, np.floating):
        audio = audio.astype(np.float64)

    # Downsample to num_samples bins by taking the max absolute value per bin.
    bin_size = len(audio) // num_samples
    if bin_size == 0:
        # Fewer samples than bins: use one sample per bin and zero-pad the
        # tail so the reshape below always has exactly num_samples rows.
        bin_size = 1
        audio = np.pad(audio, (0, num_samples - len(audio)))
    trimmed = audio[: bin_size * num_samples]
    bins = trimmed.reshape(num_samples, bin_size)
    envelope = np.max(np.abs(bins), axis=1)

    # Normalize to 0-1; skipped for pure silence to avoid dividing by zero.
    peak = envelope.max()
    if peak > 0:
        envelope = envelope / peak
    return envelope
def create_artwork(waveform: np.ndarray, size: int = ARTWORK_SIZE) -> Image.Image:
    """Create album artwork with logo background and waveform overlay.

    The image at ``BACKGROUND_PATH`` is resized to ``size`` x ``size`` and one
    vertical bar per waveform value is drawn in ``WAVEFORM_COLOR``, mirrored
    around the horizontal centre line, then alpha-composited onto the
    background.

    Args:
        waveform: Sequence of bar amplitudes. A value of 1 produces a bar
            reaching 85% of the way from the centre line to the image edge.
        size: Side length of the square output image in pixels.

    Returns:
        The composited artwork as an RGB image. An empty ``waveform`` yields
        the plain background.
    """
    # Open inside a context manager so the logo's file handle is released
    # promptly; convert() produces an independent in-memory copy.
    with Image.open(BACKGROUND_PATH) as logo:
        bg = logo.convert("RGBA").resize((size, size), Image.LANCZOS)

    num_bars = len(waveform)
    if num_bars == 0:
        # Nothing to draw; also avoids dividing by zero below.
        return bg.convert("RGB")

    # Draw the waveform on a separate transparent overlay for alpha compositing.
    wave_overlay = Image.new("RGBA", (size, size), (0, 0, 0, 0))
    draw = ImageDraw.Draw(wave_overlay)

    padding = int(size * 0.02)
    draw_w = size - 2 * padding
    # Horizontal distance between bar origins. Bars are drawn at least one
    # pixel wide, but the stride itself is not clamped, so when there are more
    # bars than pixels they overlap instead of running past the right margin.
    step = draw_w / num_bars
    bar_width = max(1, step)
    center_y = size // 2
    max_amp = (size // 2) * 0.85  # leave a little margin

    for i, amp in enumerate(waveform):
        x = padding + i * step
        h = amp * max_amp
        draw.rectangle(
            [x, center_y - h, x + bar_width - 1, center_y + h],
            fill=WAVEFORM_COLOR,
        )

    bg = Image.alpha_composite(bg, wave_overlay)
    return bg.convert("RGB")
def make_video(song_path: str, video_size: int = 512) -> str:
    """Create an MP4 video with album artwork as a static image for the duration of the audio.

    A silent 1 fps H.264 video showing the waveform artwork is encoded with
    PyAV to ``<song name>.mp4`` next to the input, then the ``ffmpeg``
    executable is invoked to mux the original audio (re-encoded as AAC) into
    it. The muxed result replaces the silent video.

    Args:
        song_path: Path to the source audio file.
        video_size: Side length of the square video in pixels.

    Returns:
        Path of the created ``.mp4`` file.

    Raises:
        FileNotFoundError: If ``song_path`` is not an existing file.
        ValueError: If the output path would be the input file itself, or the
            audio duration cannot be determined.
        subprocess.CalledProcessError: If ``ffmpeg`` exits with an error.
    """
    import math
    import subprocess

    if not os.path.isfile(song_path):
        raise FileNotFoundError(f"Audio file not found: {song_path}")

    # Output path: same name as input but .mp4, in the same directory.
    base, _ = os.path.splitext(song_path)
    output_path = base + ".mp4"
    final_path = base + "_final.mp4"

    # An .mp4 input maps onto itself; opening it for writing below would
    # truncate the source audio before it has been read.
    if os.path.exists(output_path) and os.path.samefile(song_path, output_path):
        raise ValueError(
            f"Output video would overwrite the input file: {song_path}"
        )

    waveform = load_waveform(song_path)
    artwork = create_artwork(waveform)
    artwork = artwork.resize((video_size, video_size), Image.LANCZOS)

    # Get the audio duration in seconds. The container-level duration can be
    # unknown (None), in which case the audio stream's own duration is used.
    with av.open(song_path) as container:
        if container.duration is not None:
            duration = float(container.duration) / av.time_base
        else:
            audio_stream = container.streams.audio[0]
            if audio_stream.duration is None or audio_stream.time_base is None:
                raise ValueError(
                    f"Could not determine audio duration: {song_path}"
                )
            duration = float(audio_stream.duration * audio_stream.time_base)

    fps = 1  # static image, 1 fps is enough
    # Round up: ffmpeg's -shortest ends the output with the shortest stream,
    # so a video shorter than the audio would clip the end of the song.
    total_frames = max(1, math.ceil(duration * fps))

    # Convert artwork to a numpy array for video encoding.
    frame_data = np.array(artwork)

    # The context manager finalizes and closes the file even on errors.
    with av.open(output_path, mode="w") as out_container:
        video_stream = out_container.add_stream("libx264", rate=fps)
        video_stream.width = video_size
        video_stream.height = video_size
        video_stream.pix_fmt = "yuv420p"
        for _ in range(total_frames):
            frame = av.VideoFrame.from_ndarray(frame_data, format="rgb24")
            for packet in video_stream.encode(frame):
                out_container.mux(packet)
        # Flush any packets still buffered in the encoder.
        for packet in video_stream.encode():
            out_container.mux(packet)

    # Mux audio into the video using ffmpeg via subprocess.
    try:
        subprocess.run(
            [
                "ffmpeg", "-y",
                "-i", output_path,
                "-i", song_path,
                "-c:v", "copy",
                "-c:a", "aac",
                "-shortest",
                final_path,
            ],
            check=True,
            capture_output=True,
        )
    except subprocess.CalledProcessError:
        # Do not leave a partially written muxed file behind.
        if os.path.exists(final_path):
            os.remove(final_path)
        raise

    # Replace the silent video with the final muxed version.
    os.replace(final_path, output_path)
    return output_path
if __name__ == "__main__":
    import argparse

    # Command-line entry point: build the video for one audio file and report
    # where it was written.
    cli = argparse.ArgumentParser(
        description="Create an MP4 video with album artwork from an audio file"
    )
    cli.add_argument("audio", help="Path to the audio file")
    video_path = make_video(cli.parse_args().audio)
    print(f"Created video: {video_path}")