Add optional blake3 hashing during asset scanning

- Make blake3 import lazy in hashing.py (only imported when needed)
- Add compute_hashes parameter to AssetSeeder.start(), build_asset_specs(), and seed_assets()
- Fix missing tag clearing: include is_missing states in sync when update_missing_tags=True
- Clear is_missing flag on cache states when files are restored with matching mtime/size
- Fix validation error serialization in routes.py (use json.loads(ve.json()))

Amp-Thread-ID: https://ampcode.com/threads/T-019c3614-56d4-74a8-a717-19922d6dbbee
Co-authored-by: Amp <amp@ampcode.com>
This commit is contained in:
Luke Mino-Altherr
2026-02-06 19:22:56 -08:00
parent 7a75af59ab
commit a2d26dece5
11 changed files with 119 additions and 52 deletions

View File

@@ -272,7 +272,9 @@ def resolve_asset_for_download(
states = list_cache_states_by_asset_id(session, asset_id=asset.id)
abs_path = select_best_live_path(states)
if not abs_path:
raise FileNotFoundError
raise FileNotFoundError(
f"No live path for AssetInfo {asset_info_id} (asset id={asset.id}, name={info.name})"
)
update_asset_info_access_time(session, asset_info_id=asset_info_id)
session.commit()

View File

@@ -36,6 +36,7 @@ class SeedAssetSpec(TypedDict):
tags: list[str]
fname: str
metadata: ExtractedMetadata | None
hash: str | None
class AssetRow(TypedDict):
@@ -163,7 +164,7 @@ def batch_insert_seed_assets(
asset_rows.append(
{
"id": asset_id,
"hash": None,
"hash": spec.get("hash"),
"size_bytes": spec["size_bytes"],
"mime_type": None,
"created_at": current_time,

View File

@@ -23,15 +23,16 @@ def verify_file_unchanged(
Returns True if the file's mtime and size match the database values.
Returns False if mtime_db is None or values don't match.
size_db=None means don't check size; 0 is a valid recorded size.
"""
if mtime_db is None:
return False
actual_mtime_ns = get_mtime_ns(stat_result)
if int(mtime_db) != int(actual_mtime_ns):
return False
sz = int(size_db or 0)
if sz > 0:
return int(stat_result.st_size) == sz
if size_db is not None:
return int(stat_result.st_size) == int(size_db)
return True

View File

@@ -2,10 +2,23 @@ import asyncio
import os
from typing import IO
from blake3 import blake3
DEFAULT_CHUNK = 8 * 1024 * 1024
# Cached reference to the blake3 hasher constructor; stays None until first use.
_blake3 = None


def _get_blake3():
    """Return the ``blake3`` hasher constructor, importing it on first use.

    The import is deferred until hashing is actually requested, and the
    resolved constructor is cached in the module-level ``_blake3`` so that
    subsequent calls skip the import machinery.

    Raises:
        ImportError: If the ``blake3`` package cannot be imported. The
            underlying import failure is attached as ``__cause__``.
    """
    global _blake3
    if _blake3 is None:
        # Keep the try body to the single statement that can fail.
        try:
            from blake3 import blake3 as _b3
        except ImportError as exc:
            # Chain explicitly so the traceback shows the root import failure
            # as the direct cause rather than as an error raised while
            # handling another exception.
            raise ImportError(
                "blake3 is required for asset hashing. Install with: pip install blake3"
            ) from exc
        _blake3 = _b3
    return _blake3
def compute_blake3_hash(
fp: str | IO[bytes],
@@ -42,7 +55,7 @@ def _hash_file_obj(file_obj: IO, chunk_size: int = DEFAULT_CHUNK) -> str:
if orig_pos != 0:
file_obj.seek(0)
h = blake3()
h = _get_blake3()()
while True:
chunk = file_obj.read(chunk_size)
if not chunk: