Add LTX-2 Support (#644)

* WIP, adding support for LTX2

* Training on images working

* Fix loading comfy models

* Handle converting and deconverting lora so it matches original format

* Reworked UI to handle LTX and proper dataset default overwriting.

* Update the way lokr saves so it is more compatible with comfy

* Audio loading and synchronization/resampling is working

* Add audio to training. Does it work? Maybe, still testing.

* Fixed fps default issue for sound

* Have ui set fps for accurate audio mapping on ltx

* Added audio processing options to the UI for LTX

* Clean up requirements
This commit is contained in:
Jaret Burkett
2026-01-13 04:55:30 -07:00
committed by GitHub
parent 6870ab490f
commit 5b5aadadb8
28 changed files with 2180 additions and 71 deletions

View File

@@ -15,7 +15,7 @@ export async function POST(request: Request) {
}
// make sure it is an image
if (!/\.(jpg|jpeg|png|bmp|gif|tiff|webp)$/i.test(imgPath.toLowerCase())) {
if (!/\.(jpg|jpeg|png|bmp|gif|tiff|webp|mp4)$/i.test(imgPath.toLowerCase())) {
return NextResponse.json({ error: 'Not an image' }, { status: 400 });
}

View File

@@ -29,7 +29,7 @@ export async function GET(request: NextRequest, { params }: { params: { jobID: s
const samples = fs
.readdirSync(samplesFolder)
.filter(file => {
return file.endsWith('.png') || file.endsWith('.jpg') || file.endsWith('.jpeg') || file.endsWith('.webp');
return file.endsWith('.png') || file.endsWith('.jpg') || file.endsWith('.jpeg') || file.endsWith('.webp') || file.endsWith('.mp4');
})
.map(file => {
return path.join(samplesFolder, file);

View File

@@ -862,6 +862,48 @@ export default function SimpleJob({
docKey="datasets.do_i2v"
/>
)}
{modelArch?.additionalSections?.includes('datasets.do_audio') && (
<Checkbox
label="Do Audio"
checked={dataset.do_audio || false}
onChange={value => {
if (!value) {
setJobConfig(undefined, `config.process[0].datasets[${i}].do_audio`);
} else {
setJobConfig(value, `config.process[0].datasets[${i}].do_audio`);
}
}}
docKey="datasets.do_audio"
/>
)}
{modelArch?.additionalSections?.includes('datasets.audio_normalize') && (
<Checkbox
label="Audio Normalize"
checked={dataset.audio_normalize || false}
onChange={value => {
if (!value) {
setJobConfig(undefined, `config.process[0].datasets[${i}].audio_normalize`);
} else {
setJobConfig(value, `config.process[0].datasets[${i}].audio_normalize`);
}
}}
docKey="datasets.audio_normalize"
/>
)}
{modelArch?.additionalSections?.includes('datasets.audio_preserve_pitch') && (
<Checkbox
label="Audio Preserve Pitch"
checked={dataset.audio_preserve_pitch || false}
onChange={value => {
if (!value) {
setJobConfig(undefined, `config.process[0].datasets[${i}].audio_preserve_pitch`);
} else {
setJobConfig(value, `config.process[0].datasets[${i}].audio_preserve_pitch`);
}
}}
docKey="datasets.audio_preserve_pitch"
/>
)}
</FormGroup>
<FormGroup label="Flipping" docKey={'datasets.flip'} className="mt-2">
<Checkbox

View File

@@ -14,7 +14,6 @@ export const defaultDatasetConfig: DatasetConfig = {
controls: [],
shrink_video_to_frames: true,
num_frames: 1,
do_i2v: true,
flip_x: false,
flip_y: false,
};

View File

@@ -17,6 +17,9 @@ type AdditionalSections =
| 'datasets.control_path'
| 'datasets.multi_control_paths'
| 'datasets.do_i2v'
| 'datasets.do_audio'
| 'datasets.audio_normalize'
| 'datasets.audio_preserve_pitch'
| 'sample.ctrl_img'
| 'sample.multi_ctrl_imgs'
| 'datasets.num_frames'
@@ -288,6 +291,7 @@ export const modelArchs: ModelArch[] = [
'config.process[0].sample.width': [768, 1024],
'config.process[0].sample.height': [768, 1024],
'config.process[0].train.timestep_type': ['weighted', 'sigmoid'],
'config.process[0].datasets[x].do_i2v': [true, undefined],
},
disableSections: ['network.conv'],
additionalSections: ['sample.ctrl_img', 'datasets.num_frames', 'model.low_vram', 'datasets.do_i2v'],
@@ -601,6 +605,31 @@ export const modelArchs: ModelArch[] = [
disableSections: ['network.conv'],
additionalSections: ['model.low_vram', 'model.layer_offloading'],
},
{
name: 'ltx2',
label: 'LTX-2',
group: 'video',
isVideoModel: true,
defaults: {
// default updates when [selected, unselected] in the UI
'config.process[0].model.name_or_path': ['Lightricks/LTX-2', defaultNameOrPath],
'config.process[0].model.quantize': [true, false],
'config.process[0].model.quantize_te': [true, false],
'config.process[0].model.low_vram': [true, false],
'config.process[0].sample.sampler': ['flowmatch', 'flowmatch'],
'config.process[0].train.noise_scheduler': ['flowmatch', 'flowmatch'],
'config.process[0].sample.num_frames': [121, 1],
'config.process[0].sample.fps': [24, 1],
'config.process[0].sample.width': [768, 1024],
'config.process[0].sample.height': [768, 1024],
'config.process[0].train.timestep_type': ['weighted', 'sigmoid'],
'config.process[0].datasets[x].do_i2v': [false, undefined],
'config.process[0].datasets[x].do_audio': [true, undefined],
'config.process[0].datasets[x].fps': [24, undefined],
},
disableSections: ['network.conv'],
additionalSections: ['datasets.num_frames', 'model.layer_offloading', 'model.low_vram', 'datasets.do_audio', 'datasets.audio_normalize', 'datasets.audio_preserve_pitch'],
},
].sort((a, b) => {
// Sort by label, case-insensitive
return a.label.localeCompare(b.label, undefined, { sensitivity: 'base' });

View File

@@ -2,6 +2,25 @@ import { GroupedSelectOption, JobConfig, SelectOption } from '@/types';
import { modelArchs, ModelArch } from './options';
import { objectCopy } from '@/utils/basic';
// Expand wildcard `datasets[x]` keys in a defaults map into concrete
// per-dataset keys (`datasets[0]`, `datasets[1]`, ...), one entry per dataset.
// The wildcard key itself is removed from the result; each expanded entry
// receives its own copy of the value so datasets never share a reference.
const expandDatasetDefaults = (
  defaults: { [key: string]: any },
  numDatasets: number,
): { [key: string]: any } => {
  const expanded: { [key: string]: any } = { ...defaults };
  Object.keys(defaults)
    .filter(key => key.includes('datasets[x].'))
    .forEach(wildcardKey => {
      for (let idx = 0; idx < numDatasets; idx++) {
        const concreteKey = wildcardKey.replace('datasets[x].', `datasets[${idx}].`);
        const value = defaults[wildcardKey];
        // Copy the value (shallow for arrays, deep-ish via objectCopy otherwise)
        // so each dataset gets an independent instance.
        expanded[concreteKey] = Array.isArray(value) ? [...value] : objectCopy(value);
      }
      delete expanded[wildcardKey];
    });
  return expanded;
};
export const handleModelArchChange = (
currentArchName: string,
newArchName: string,
@@ -39,16 +58,11 @@ export const handleModelArchChange = (
}
}
// revert defaults from previous model
for (const key in currentArch.defaults) {
setJobConfig(currentArch.defaults[key][1], key);
}
const numDatasets = jobConfig.config.process[0].datasets.length;
let currentDefaults = expandDatasetDefaults(currentArch.defaults || {}, numDatasets);
let newDefaults = expandDatasetDefaults(newArch?.defaults || {}, numDatasets);
if (newArch?.defaults) {
for (const key in newArch.defaults) {
setJobConfig(newArch.defaults[key][0], key);
}
}
// set new model
setJobConfig(newArchName, 'config.process[0].model.arch');
@@ -79,27 +93,27 @@ export const handleModelArchChange = (
if (newDataset.control_path_1 && newDataset.control_path_1 !== '') {
newDataset.control_path = newDataset.control_path_1;
}
if (newDataset.control_path_1) {
if ('control_path_1' in newDataset) {
delete newDataset.control_path_1;
}
if (newDataset.control_path_2) {
if ('control_path_2' in newDataset) {
delete newDataset.control_path_2;
}
if (newDataset.control_path_3) {
if ('control_path_3' in newDataset) {
delete newDataset.control_path_3;
}
} else {
// does not have control images
if (newDataset.control_path) {
if ('control_path' in newDataset) {
delete newDataset.control_path;
}
if (newDataset.control_path_1) {
if ('control_path_1' in newDataset) {
delete newDataset.control_path_1;
}
if (newDataset.control_path_2) {
if ('control_path_2' in newDataset) {
delete newDataset.control_path_2;
}
if (newDataset.control_path_3) {
if ('control_path_3' in newDataset) {
delete newDataset.control_path_3;
}
}
@@ -120,4 +134,13 @@ export const handleModelArchChange = (
return newSample;
});
setJobConfig(samples, 'config.process[0].sample.samples');
// revert defaults from previous model
for (const key in currentDefaults) {
setJobConfig(currentDefaults[key][1], key);
}
for (const key in newDefaults) {
setJobConfig(newDefaults[key][0], key);
}
};

View File

@@ -56,18 +56,6 @@ const SampleImageCard: React.FC<SampleImageCardProps> = ({
return () => observer.disconnect();
}, [observerRoot, rootMargin]);
// Pause video when leaving viewport
useEffect(() => {
if (!isVideo(imageUrl)) return;
const v = videoRef.current;
if (!v) return;
if (!isVisible && !v.paused) {
try {
v.pause();
} catch {}
}
}, [isVisible, imageUrl]);
const handleLoad = () => setLoaded(true);
return (
@@ -81,9 +69,11 @@ const SampleImageCard: React.FC<SampleImageCardProps> = ({
src={`/api/img/${encodeURIComponent(imageUrl)}`}
className="w-full h-full object-cover"
preload="none"
onLoad={handleLoad}
playsInline
muted
loop
autoPlay
controls={false}
/>
) : (

View File

@@ -7,6 +7,7 @@ import { Cog } from 'lucide-react';
import { Menu, MenuButton, MenuItem, MenuItems } from '@headlessui/react';
import { openConfirm } from './ConfirmModal';
import { apiClient } from '@/utils/api';
import { isVideo } from '@/utils/basic';
interface Props {
imgPath: string | null; // current image path
@@ -200,13 +201,24 @@ export default function SampleImageViewer({
className="relative transform rounded-lg bg-gray-800 text-left shadow-xl transition-all data-closed:translate-y-4 data-closed:opacity-0 data-enter:duration-300 data-enter:ease-out data-leave:duration-200 data-leave:ease-in max-w-[95%] max-h-[95vh] data-closed:sm:translate-y-0 data-closed:sm:scale-95 flex flex-col overflow-hidden"
>
<div className="overflow-hidden flex items-center justify-center">
{imgPath && (
<img
src={`/api/img/${encodeURIComponent(imgPath)}`}
alt="Sample Image"
className="w-auto h-auto max-w-[95vw] max-h-[82vh] object-contain"
/>
)}
{imgPath &&
(isVideo(imgPath) ? (
<video
src={`/api/img/${encodeURIComponent(imgPath)}`}
className="w-auto h-auto max-w-[95vw] max-h-[82vh] object-contain"
preload="none"
playsInline
loop
autoPlay
controls={true}
/>
) : (
<img
src={`/api/img/${encodeURIComponent(imgPath)}`}
alt="Sample Image"
className="w-auto h-auto max-w-[95vw] max-h-[82vh] object-contain"
/>
))}
</div>
{/* # make full width */}
<div className="bg-gray-950 text-sm flex justify-between items-center px-4 py-2">

View File

@@ -107,6 +107,35 @@ const docs: { [key: string]: ConfigDoc } = {
</>
),
},
'datasets.do_audio': {
title: 'Do Audio',
description: (
<>
For models that support audio with video, this option will load the audio from the video and resize it to match
the video sequence. Since the video is automatically resized, the audio may drop or raise in pitch to match the new
speed of the video. It is important to prep your dataset to have the proper length before training.
</>
),
},
'datasets.audio_normalize': {
title: 'Audio Normalize',
description: (
<>
When loading audio, this will normalize the audio volume to the max peaks. Useful if your dataset has varying audio
volumes. Warning, do not use if you have clips with full silence you want to keep, as it will raise the volume of those clips.
</>
),
},
'datasets.audio_preserve_pitch': {
title: 'Audio Preserve Pitch',
description: (
<>
When loading audio to match the number of frames requested, this option will preserve the pitch of the audio if
the length does not match training target. It is recommended to have a dataset that matches your target length,
as this option can add sound distortions.
</>
),
},
'datasets.flip': {
title: 'Flip X and Flip Y',
description: (

View File

@@ -96,7 +96,11 @@ export interface DatasetConfig {
control_path?: string | null;
num_frames: number;
shrink_video_to_frames: boolean;
do_i2v: boolean;
do_i2v?: boolean;
do_audio?: boolean;
audio_normalize?: boolean;
audio_preserve_pitch?: boolean;
fps?: number;
flip_x: boolean;
flip_y: boolean;
control_path_1?: string | null;

View File

@@ -17,21 +17,14 @@ export function setNestedValue<T, V>(obj: T, value: V, path?: string): T {
}
// Split the path into segments
const pathArray = path.split('.').flatMap(segment => {
// Handle array notation like 'process[0]'
const arrayMatch = segment.match(/^([^\[]+)(\[\d+\])+/);
if (arrayMatch) {
const propName = arrayMatch[1];
const indices = segment
.substring(propName.length)
.match(/\[(\d+)\]/g)
?.map(idx => parseInt(idx.substring(1, idx.length - 1)));
const pathArray: Array<string | number> = [];
const re = /([^[.\]]+)|\[(\d+)\]/g;
let m: RegExpExecArray | null;
// Return property name followed by array indices
return [propName, ...(indices || [])];
}
return segment;
});
while ((m = re.exec(path)) !== null) {
if (m[1] !== undefined) pathArray.push(m[1]);
else pathArray.push(Number(m[2]));
}
// Navigate to the target location
let current: any = result;
@@ -43,8 +36,18 @@ export function setNestedValue<T, V>(obj: T, value: V, path?: string): T {
if (!Array.isArray(current)) {
throw new Error(`Cannot access index ${key} of non-array`);
}
// Create a copy of the array to maintain immutability
current = [...current];
// Ensure the indexed element exists and is copied/created immutably
const nextKey = pathArray[i + 1];
const existing = current[key];
if (existing === undefined) {
current[key] = typeof nextKey === 'number' ? [] : {};
} else if (Array.isArray(existing)) {
current[key] = [...existing];
} else if (typeof existing === 'object' && existing !== null) {
current[key] = { ...existing };
} // else: primitives stay as-is
} else {
// For object properties, create a new object if it doesn't exist
if (current[key] === undefined) {
@@ -63,7 +66,11 @@ export function setNestedValue<T, V>(obj: T, value: V, path?: string): T {
// Set the value at the final path segment
const finalKey = pathArray[pathArray.length - 1];
current[finalKey] = value;
if (value === undefined) {
delete current[finalKey];
} else {
current[finalKey] = value;
}
return result;
}