Files
ai-toolkit/ui/cron/actions/startJob.ts
2025-10-14 12:00:42 -06:00

180 lines
5.5 KiB
TypeScript

import prisma from '../prisma';
import { Job } from '@prisma/client';
import { spawn } from 'child_process';
import path from 'path';
import fs from 'fs';
import { TOOLKIT_ROOT, getTrainingFolder, getHFToken } from '../paths';
const isWindows = process.platform === 'win32';
const startAndWatchJob = (job: Job) => {
// starts and watches the job asynchronously
return new Promise<void>(async (resolve, reject) => {
const jobID = job.id;
// setup the training
const trainingRoot = await getTrainingFolder();
const trainingFolder = path.join(trainingRoot, job.name);
if (!fs.existsSync(trainingFolder)) {
fs.mkdirSync(trainingFolder, { recursive: true });
}
// make the config file
const configPath = path.join(trainingFolder, '.job_config.json');
//log to path
const logPath = path.join(trainingFolder, 'log.txt');
try {
// if the log path exists, move it to a folder called logs and rename it {num}_log.txt, looking for the highest num
// if the log path does not exist, create it
if (fs.existsSync(logPath)) {
const logsFolder = path.join(trainingFolder, 'logs');
if (!fs.existsSync(logsFolder)) {
fs.mkdirSync(logsFolder, { recursive: true });
}
let num = 0;
while (fs.existsSync(path.join(logsFolder, `${num}_log.txt`))) {
num++;
}
fs.renameSync(logPath, path.join(logsFolder, `${num}_log.txt`));
}
} catch (e) {
console.error('Error moving log file:', e);
}
// update the config dataset path
const jobConfig = JSON.parse(job.job_config);
jobConfig.config.process[0].sqlite_db_path = path.join(TOOLKIT_ROOT, 'aitk_db.db');
// write the config file
fs.writeFileSync(configPath, JSON.stringify(jobConfig, null, 2));
let pythonPath = 'python';
// use .venv or venv if it exists
if (fs.existsSync(path.join(TOOLKIT_ROOT, '.venv'))) {
if (isWindows) {
pythonPath = path.join(TOOLKIT_ROOT, '.venv', 'Scripts', 'python.exe');
} else {
pythonPath = path.join(TOOLKIT_ROOT, '.venv', 'bin', 'python');
}
} else if (fs.existsSync(path.join(TOOLKIT_ROOT, 'venv'))) {
if (isWindows) {
pythonPath = path.join(TOOLKIT_ROOT, 'venv', 'Scripts', 'python.exe');
} else {
pythonPath = path.join(TOOLKIT_ROOT, 'venv', 'bin', 'python');
}
}
const runFilePath = path.join(TOOLKIT_ROOT, 'run.py');
if (!fs.existsSync(runFilePath)) {
console.error(`run.py not found at path: ${runFilePath}`);
await prisma.job.update({
where: { id: jobID },
data: {
status: 'error',
info: `Error launching job: run.py not found`,
},
});
return;
}
const additionalEnv: any = {
AITK_JOB_ID: jobID,
CUDA_DEVICE_ORDER: 'PCI_BUS_ID',
CUDA_VISIBLE_DEVICES: `${job.gpu_ids}`,
IS_AI_TOOLKIT_UI: '1',
};
// HF_TOKEN
const hfToken = await getHFToken();
if (hfToken && hfToken.trim() !== '') {
additionalEnv.HF_TOKEN = hfToken;
}
// Add the --log argument to the command
const args = [runFilePath, configPath, '--log', logPath];
try {
let subprocess;
if (isWindows) {
// Spawn Python directly on Windows so the process can survive parent exit
subprocess = spawn(pythonPath, args, {
env: {
...process.env,
...additionalEnv,
},
cwd: TOOLKIT_ROOT,
detached: true,
windowsHide: true,
stdio: 'ignore', // don't tie stdio to parent
});
} else {
// For non-Windows platforms, fully detach and ignore stdio so it survives daemon-like
subprocess = spawn(pythonPath, args, {
detached: true,
stdio: 'ignore',
env: {
...process.env,
...additionalEnv,
},
cwd: TOOLKIT_ROOT,
});
}
// Important: let the child run independently of this Node process.
if (subprocess.unref) {
subprocess.unref();
}
// Optionally write a pid file for future management (stop/inspect) without keeping streams open
try {
fs.writeFileSync(path.join(trainingFolder, 'pid.txt'), String(subprocess.pid ?? ''), { flag: 'w' });
} catch (e) {
console.error('Error writing pid file:', e);
}
// (No stdout/stderr listeners — logging should go to --log handled by your Python)
// (No monitoring loop — the whole point is to let it live past this worker)
} catch (error: any) {
// Handle any exceptions during process launch
console.error('Error launching process:', error);
await prisma.job.update({
where: { id: jobID },
data: {
status: 'error',
info: `Error launching job: ${error?.message || 'Unknown error'}`,
},
});
return;
}
// Resolve the promise immediately after starting the process
resolve();
});
};
export default async function startJob(jobID: string) {
const job: Job | null = await prisma.job.findUnique({
where: { id: jobID },
});
if (!job) {
console.error(`Job with ID ${jobID} not found`);
return;
}
// update job status to 'running', this will run sync so we don't start multiple jobs.
await prisma.job.update({
where: { id: jobID },
data: {
status: 'running',
stop: false,
info: 'Starting job...',
},
});
// start and watch the job asynchronously so the cron can continue
startAndWatchJob(job);
}