fix: handle UnicodeEncodeError in URL seeder and strip zero-width chars (#1784)

From PR #1784 by @Br1an67
This commit is contained in:
unclecode
2026-03-07 06:16:41 +00:00
parent 1029815fd4
commit e47e810aca

View File

@@ -450,16 +450,20 @@ class AsyncUrlSeeder:
async def producer():
try:
async for u in gen():
if u in seen:
self._log("debug", "Skipping duplicate URL: {url}",
params={"url": u}, tag="URL_SEED")
try:
if u in seen:
self._log("debug", "Skipping duplicate URL: {url}",
params={"url": u}, tag="URL_SEED")
continue
if stop_event.is_set():
self._log(
"info", "Producer stopping due to max_urls limit.", tag="URL_SEED")
break
seen.add(u)
await queue.put(u) # Will block if queue is full, providing backpressure
except UnicodeEncodeError:
# Skip URLs that cause encoding errors (e.g. on Windows)
continue
if stop_event.is_set():
self._log(
"info", "Producer stopping due to max_urls limit.", tag="URL_SEED")
break
seen.add(u)
await queue.put(u) # Will block if queue is full, providing backpressure
except Exception as e:
self._log("error", "Producer encountered an error: {error}", params={
"error": str(e)}, tag="URL_SEED")
@@ -987,7 +991,8 @@ class AsyncUrlSeeder:
def _normalize_loc(raw: Optional[str]) -> Optional[str]:
    """Clean a raw location string and resolve it against ``base_url``.

    ``base_url`` is read from the enclosing scope.

    Args:
        raw: The location text as extracted, possibly ``None``, padded
            with whitespace, or polluted with invisible characters.

    Returns:
        The absolute URL produced by ``urljoin``, or ``None`` when there
        is nothing usable left after cleaning.
    """
    if not raw:
        return None
    # Remove ZERO WIDTH SPACE (U+200B) and BOM / ZWNBSP (U+FEFF) *before*
    # trimming: str.strip() does not treat them as whitespace, so stripping
    # first would leave any whitespace they were shielding at the edges.
    cleaned = raw.replace("\u200b", "").replace("\ufeff", "").strip()
    if not cleaned:
        # urljoin(base_url, "") returns base_url itself; an effectively
        # empty location must not be reported as the base document.
        return None
    return urljoin(base_url, cleaned) or None
@@ -1107,7 +1112,8 @@ class AsyncUrlSeeder:
def _normalize_loc(raw: Optional[str]) -> Optional[str]:
    """Clean a raw location string and resolve it against ``base_url``.

    ``base_url`` is read from the enclosing scope.

    Args:
        raw: The location text as extracted, possibly ``None``, padded
            with whitespace, or polluted with invisible characters.

    Returns:
        The absolute URL produced by ``urljoin``, or ``None`` when there
        is nothing usable left after cleaning.
    """
    if not raw:
        return None
    # Remove ZERO WIDTH SPACE (U+200B) and BOM / ZWNBSP (U+FEFF) *before*
    # trimming: str.strip() does not treat them as whitespace, so stripping
    # first would leave any whitespace they were shielding at the edges.
    cleaned = raw.replace("\u200b", "").replace("\ufeff", "").strip()
    if not cleaned:
        # urljoin(base_url, "") returns base_url itself; an effectively
        # empty location must not be reported as the base document.
        return None
    return urljoin(base_url, cleaned) or None