Files
vaiola/modules/civit/neofetch.py
2025-10-16 18:42:32 +07:00

464 lines
19 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os.path
import time
from dataclasses import dataclass
from typing import Any
import netifaces
from ping3 import ping
import requests
from requests import Response
from requests.exceptions import ConnectionError, Timeout, SSLError, ProxyError, RequestException
class NetworkError(Exception):
    """Connectivity checks determined the network (or target host) is down."""

class ValidationError(Exception):
    """A retry budget was exhausted without obtaining a valid result."""

class CursorError(Exception):
    """The pagination cursor appears broken and needs repair."""

class ContentTypeMismatchError(Exception):
    """The response Content-Type is not application/json as expected."""

class JsonParseError(Exception):
    """A response body could not be decoded as JSON (defined for callers; not raised in this module)."""

class EmptyDataError(Exception):
    """An API response carried neither items nor metadata."""

class EmptyItemsError(Exception):
    """An API response carried metadata but an empty items list."""

class EmptyMetaWarning(Exception):
    """An API response carried items but no usable metadata."""

class CursorSlipError(Exception):
    """Pagination returned the same page again — the cursor did not advance."""

class DuplicateDataError(Exception):
    """Every fetched item was already known to the collector."""

class PoorDataWarning(Exception):
    """Fewer than half of the fetched items were new."""

class CursorIncrementError(Exception):
    """Incrementing a cursor would overflow its digit width."""
@dataclass
class neofetch_error_counters:
    """Per-category counters for network, HTTP and JSON fetch failures.

    Fix: the original declared the counters without type annotations, so
    ``@dataclass`` generated no fields at all and the decorator was inert
    (counters were shared class attributes).  Annotating them as
    ``int = 0`` makes them real per-instance dataclass fields; no-argument
    construction is unchanged, so existing callers are unaffected.
    """
    # --- transport-level failures (requests exceptions) ---
    network_timeout_error: int = 0
    network_ssl_error: int = 0
    network_proxy_error: int = 0
    network_connection_error: int = 0
    network_request_exception: int = 0
    network_success: int = 0
    # --- HTTP status failures ---
    http_unavailable: int = 0  # 503 responses
    http_other: int = 0        # any other non-2xx status
    http_success: int = 0
    # --- JSON / content-type failures ---
    json_type_mismatch: int = 0
    json_parse_error: int = 0
    json_success: int = 0

    @property
    def network_errors(self) -> int:
        """Total number of transport-level failures."""
        return (self.network_timeout_error + self.network_ssl_error + self.network_proxy_error +
                self.network_connection_error + self.network_request_exception)

    @property
    def network_error_percentage(self) -> float:
        """Transport failure rate in percent; 0 when nothing was attempted."""
        total = self.network_success + self.network_errors
        return float(self.network_errors) / float(total) * 100 if total != 0 else 0  # %

    def reset_network_stats(self) -> None:
        """Zero all transport-level counters."""
        self.network_timeout_error = 0
        self.network_ssl_error = 0
        self.network_proxy_error = 0
        self.network_connection_error = 0
        self.network_request_exception = 0
        self.network_success = 0

    @property
    def http_errors(self) -> int:
        """Total number of HTTP status failures."""
        return self.http_unavailable + self.http_other

    @property
    def http_error_percentage(self) -> float:
        """HTTP failure rate in percent; 0 when nothing was attempted."""
        total = self.http_success + self.http_errors
        return float(self.http_errors) / float(total) * 100 if total != 0 else 0  # %

    def reset_http_stats(self) -> None:
        """Zero all HTTP status counters."""
        self.http_unavailable = 0
        self.http_other = 0
        self.http_success = 0

    @property
    def json_errors(self) -> int:
        """Total number of JSON / content-type failures."""
        return self.json_type_mismatch + self.json_parse_error

    @property
    def json_error_percentage(self) -> float:
        """JSON failure rate in percent; 0 when nothing was attempted."""
        total = self.json_success + self.json_errors
        return float(self.json_errors) / float(total) * 100 if total != 0 else 0  # %

    def reset_json_stats(self) -> None:
        """Zero all JSON / content-type counters."""
        self.json_type_mismatch = 0
        self.json_parse_error = 0
        self.json_success = 0
class neofetch_collector:
def __init__(self, start_number = 0, autosave = False, save_path = '', autosave_chunk_size = 2000):
self.items: dict[str, dict] = dict()
self.pending_items: dict[str, dict] = dict()
self.autosave = autosave
if autosave and save_path == '': raise ValueError('autosave mode is enabled, but path is not specified')
self.save_path = save_path
self.current_number = start_number
self.autosave_chunk_size = autosave_chunk_size
def check_autosave(self):
if len(self.pending_items) < self.autosave_chunk_size: return 0
self.save()
def save(self, path = None, flush = False):
if not path: path = self.save_path
if len(self.pending_items) == 0: return 0
if self.autosave:
pending_items: list = [value for key, value in self.pending_items.items()]
self.pending_items = dict()
else:
pending_items: list = [value for key, value in self.items.items()]
path = os.path.join(self.save_path, f'{self.current_number}-{len(self.items)}.json')
with open(path, "w", encoding="utf-8") as f:
json.dump(pending_items, f, indent=4, ensure_ascii=False)
self.current_number = len(self.items)
if flush: self.flush()
return len(pending_items)
def flush(self):
if self.save_path != '': self.save()
self.items = dict()
self.pending_items = dict()
@staticmethod
def _cast_items(items: dict | list[dict] | set[dict], pk ='id') -> dict[str, dict]:
result: dict[str, dict] = dict()
if isinstance(items, list): pass
elif isinstance(items, dict): items = [items]
elif isinstance(items, set): items = list(items)
for item in items: result[str(item.get(pk, 'None'))] = item
return result
def _compare(self, items: dict[str, dict]) -> int: return len(set(items) - set(self.items))
def compare(self, items: dict | list[dict] | set[dict], pk ='id') -> int:
return len(set(self._cast_items(items, pk)) - set(self.items))
def add(self, items: dict | list[dict] | set[dict], pk ='id') -> int:
items = self._cast_items(items, pk)
new_items_count = self._compare(items)
new_items = set(items) - set(self.items)
self.items.update(items)
if self.autosave:
for key in new_items: self.pending_items[key] = items[key]
self.check_autosave()
return new_items_count
@property
def list(self): return [value for key, value in self.items.items()]
@property
def set(self):
return {value for key, value in self.items.items()}
class neofetch_cursor:
    """Wraps a paginated API URL and supports repairing its ``cursor=`` value."""

    def __init__(self, url: str):
        self.url = url
        self.stripped = True  # whether the cursor is already reduced to its first component

    def update(self, next_page, stripped=False):
        """Replace the URL with the next page and reset the stripped flag."""
        self.url = next_page
        self.stripped = stripped

    def strip(self):
        """Cut the cursor down to everything before its first ``%`` escape."""
        split = self.url.split('cursor=', maxsplit=1)
        if len(split) < 2:
            return self.url  # no cursor parameter present
        prefix = split[0] + 'cursor='
        suffix = split[1].split('%', maxsplit=1)[0]
        self.url = prefix + suffix
        return self.url

    @staticmethod
    def _order(number):
        # Smallest power of ten strictly above ``number`` (currently unused helper).
        mask = 10
        while number > mask:
            mask *= 10
        return mask

    def increment(self, count=1):
        """Add ``count`` to the numeric tail of the cursor in place.

        The numeric tail is the digits after the last ``.`` and before the
        last ``%`` escape of the cursor value.

        Fix: the new value is zero-padded to the cursor's original digit
        width instead of a hard-coded ``:03d`` — the width was already
        captured but never used, so e.g. ``0045`` became ``046``.

        Raises:
            CursorIncrementError: if the result would not fit in that width.
        """
        split = self.url.split('cursor=', maxsplit=1)
        if len(split) < 2:
            return self.url  # nothing to increment
        prefix = split[0] + 'cursor='
        split = split[1].rsplit('%', maxsplit=1)
        suffix = '%' + split[1] if len(split) >= 2 else ''
        split = split[0].rsplit('.', maxsplit=1)
        if len(split) >= 2:
            prefix += split[0] + '.'
            cursor = split[1]
        else:
            cursor = split[0]
        width = len(cursor)  # original digit width, preserved in the output
        incremented = int(cursor) + count
        if incremented >= pow(10, width):
            raise CursorIncrementError(f'cursor has reached bounds: {pow(10, width)}')
        self.url = f'{prefix}{incremented:0{width}d}{suffix}'
        return self.url

    def decrement(self, count=1):
        """Subtract ``count`` from the cursor tail (negative increment)."""
        return self.increment(-count)
class neofetch:
    """Crawler for the civitai.com public API.

    Layered "garant" methods wrap one another, each guaranteeing one aspect
    of a fetch and retrying/repairing on its own failure class:
    network transport -> HTTP status -> JSON body -> API payload -> cursor.
    """

    def __init__(self, path, base_url=None, session=None):
        self.path = path
        self.base_url = base_url or 'https://civitai.com'
        self.session = session
        self.errors = neofetch_error_counters()
        # One autosaving collector per API entity, each in its own folder.
        # (images_collector is created but not used by the methods below —
        # presumably reserved for an images crawler.)
        self.tags_collector = neofetch_collector(autosave=True, save_path=os.path.join(path, 'fetch', 'tags'))
        self.creators_collector = neofetch_collector(autosave=True, save_path=os.path.join(path, 'fetch', 'creators'))
        self.models_collector = neofetch_collector(autosave=True, save_path=os.path.join(path, 'fetch', 'models'))
        self.images_collector = neofetch_collector(autosave=True, save_path=os.path.join(path, 'fetch', 'images'))

    @staticmethod
    def check_network(hostname=None) -> bool:
        """Diagnose connectivity layer by layer: gateway, WAN, DNS, target.

        Returns True when everything checked is healthy, False when WAN or
        DNS is down; raises NetworkError when the target host is
        unreachable or does not answer HTTP.
        """
        # check gateway (informational only — some gateways drop ICMP)
        p = ping(netifaces.gateways()['default'][netifaces.AF_INET][0])
        if p: print('[Network check/INFO] gateway is reachable')
        else: print('[Network check/WARN] gateway unreachable or ping is not allowed')
        # check WAN by IP (no DNS involved)
        p = ping('1.1.1.1')
        if p: print('[Network check/INFO] WAN is reachable')
        else:
            print('[Network check/ERR] WAN is unreachable')
            return False
        # check DNS resolution
        p = ping('google.com')
        if p: print('[Network check/INFO] DNS is working')
        else:
            print('[Network check/ERR] DNS is unreachable')
            return False
        if not hostname:
            print('[Network check/WARN] target not specified. skipping')
            # Fix: previously fell through and pinged ``None``; without a
            # target there is nothing more to verify.
            return True
        # check target host
        p = ping(hostname)
        if p:
            print('[Network check/INFO] site host is up')
        else:
            print('[Network check/ERR] site host is unreachable')
            raise NetworkError('[Network check/ERR] site host is unreachable')
        # check the site actually answers HTTP
        try:
            requests.get('https://' + hostname)
            print('[Network check/INFO] site is responding to HTTP requests')
            return True
        except RequestException as e:
            raise NetworkError from e

    def wait_for_network(self, hostname):
        """Block (polling every 30 s) until check_network reports success."""
        while not self.check_network(hostname):
            print('Waiting for network...')
            time.sleep(30)

    def network_garant(self, url, session=None, headers=None, retries=10) -> requests.models.Response | None:
        """GET ``url``, retrying through transport failures.

        Each failure class is counted separately; after a failure the method
        waits for connectivity to return, then retries (``retries`` budget).

        Raises:
            ValidationError: when the retry budget is exhausted or the
                response is not a ``requests.models.Response``.
        """
        if retries <= 0: raise ValidationError("Network error correction failed")
        exception_occurred = False
        try:
            if session: r = session.get(url)
            elif headers: r = requests.get(url, headers=headers)
            else: r: requests.models.Response = requests.get(url)
            if not isinstance(r, requests.models.Response): raise ValidationError(
                f'response has type {type(r)} but requests.models.Response is required'
            )
            # Fix: success is counted only after validation passes — the old
            # ``finally`` block counted even a ValidationError as success.
            self.errors.network_success += 1
            return r
        except Timeout as e:
            # connect/read timeout
            print("Timeout:", e)
            self.errors.network_timeout_error += 1
            exception_occurred = True
        except SSLError as e:
            # TLS handshake problems
            print("SSL error:", e)
            self.errors.network_ssl_error += 1
            exception_occurred = True
        except ProxyError as e:
            # proxy failure (a ConnectionError subclass, counted separately)
            print("Proxy error:", e)
            self.errors.network_proxy_error += 1
            exception_occurred = True
        except ConnectionError as e:
            # connection errors: DNS, unreachable host, reset, etc.
            print("Connection failed:", e)
            self.errors.network_connection_error += 1
            exception_occurred = True
        except RequestException as e:
            # any other unexpected requests error
            print("General request error:", e)
            self.errors.network_request_exception += 1
            exception_occurred = True
        if exception_occurred:
            # Wait for connectivity (probing the URL's host when parseable),
            # then retry with a reduced budget.
            try: self.wait_for_network(str(url).split('//', maxsplit=1)[1].split('/', maxsplit=1)[0])
            except Exception: self.wait_for_network(hostname=None)
            return self.network_garant(url, session, headers, retries - 1)

    def http_garant(self, url, session=None, headers=None, retries=10, service_available_retries=720):
        """GET ``url`` and guarantee a 2xx response.

        503 is treated as temporary unavailability and waited out (one-minute
        sleeps, up to ``service_available_retries`` of them); any other HTTP
        error is assumed to be caused by a broken cursor and surfaces as
        ``CursorError`` so the caller can repair the URL.

        Fix: the ``service_available_retries`` exhaustion check now guards
        the 503 wait loop — previously it sat in the other-error branch
        where it was effectively unreachable (the counter is only
        decremented on 503s), allowing an endless 503 loop.
        """
        if retries <= 0: raise ValidationError("HTTP error correction failed")
        try:
            response = self.network_garant(url, session, headers)
            response.raise_for_status()
            self.errors.http_success += 1
            return response
        except requests.exceptions.HTTPError as e:
            status = e.response.status_code
            if status == 503:
                self.errors.http_unavailable += 1
                if service_available_retries <= 0:
                    # waited out the whole availability budget — give up
                    raise ValidationError("service unavailable for too long") from e
                print("[http_garant/WARN] HTTP error, waiting availability:", e)
                time.sleep(60)
                return self.http_garant(url, session, headers, retries, service_available_retries - 1)
            self.errors.http_other += 1
            print("[http_garant/ERR] HTTP error:", e)
            time.sleep(10)
            raise CursorError from e

    def json_garant(self, url, session=None, headers=None, retries=10):
        """GET ``url`` and guarantee a parsed JSON body, retrying on mismatch.

        Raises:
            ValidationError: when the retry budget is exhausted.
        """
        if retries <= 0: raise ValidationError("JSON parse error correction failed")
        try:
            response = self.http_garant(url, session, headers)
            ct = response.headers.get("Content-Type", "")
            if not ct.lower().startswith("application/json"): raise ContentTypeMismatchError
            j = response.json()
            self.errors.json_success += 1
            return j
        except ContentTypeMismatchError:
            self.errors.json_type_mismatch += 1
            # Fix: messages previously said "HTTP error" for JSON-layer failures.
            print("[json_garant/ERR] content-type mismatch")
            time.sleep(10)
            return self.json_garant(url, session, headers, retries - 1)
        except ValueError as e:
            # response.json() raises a ValueError subclass on bad JSON
            self.errors.json_parse_error += 1
            print("[json_garant/ERR] JSON parse error:", e)
            time.sleep(10)
            return self.json_garant(url, session, headers, retries - 1)

    def api_data_garant(self, url, collector: neofetch_collector, session=None, headers=None, retries=10):
        """Fetch one API page and validate payload quality.

        Returns ``(items, next_page, total_pages)``.  Empty payloads are
        retried and escalate to ``CursorError``; all-duplicate payloads
        escalate to ``CursorSlipError``; partial problems (missing metadata,
        <50% new items) are tolerated with a warning.
        """
        if retries <= 0: raise ValidationError("API data error correction failed")
        try:
            response = self.json_garant(url, session, headers)
            if 'items' not in response or 'metadata' not in response: raise EmptyDataError
            items = response['items']
            metadata = response['metadata']
            del response  # payloads can be large; drop the extra reference
            if len(items) == 0 and len(metadata) == 0: raise EmptyDataError
            elif len(items) == 0: raise EmptyItemsError
            elif len(metadata) == 0: raise EmptyMetaWarning
            if 'nextPage' not in metadata: raise EmptyMetaWarning('Metadata has not nextPage field')
            else: next_page = metadata['nextPage']
            if 'totalPages' in metadata: total_pages = metadata['totalPages']
            else: total_pages = None
            # The API echoing the same URL back means the cursor did not advance.
            if next_page and next_page == url: raise CursorSlipError
            new_items_count = collector.compare(items)
            new_items_percentage = float(new_items_count) / float(len(items)) * 100
            if new_items_count == 0: raise DuplicateDataError
            elif new_items_percentage < 50: raise PoorDataWarning
            return items, next_page, total_pages
        except EmptyDataError:
            print('[api_data_garant/ERR] EmptyDataError: Empty api response')
            time.sleep(10)
            if retries > 1:
                return self.api_data_garant(url, collector, session, headers, retries - 1)
            else: raise CursorError
        except EmptyItemsError:
            print('[api_data_garant/ERR] EmptyItemsError: Empty api response')
            time.sleep(10)
            if retries > 1:
                return self.api_data_garant(url, collector, session, headers, retries - 1)
            else:
                raise CursorError
        except EmptyMetaWarning:
            print('[api_data_garant/WARN] EmptyMetaWarning')
            # ``items`` is always bound here: both EmptyMetaWarning raises
            # happen after the assignment above.
            return items, None, None
        except DuplicateDataError:
            print('[api_data_garant/ERR] DuplicateDataError')
            if retries > 1:
                return self.api_data_garant(url, collector, session, headers, retries - 1)
            else:
                raise CursorSlipError
        except PoorDataWarning:
            # Mostly-duplicate page: accept it but log the warning.
            print('[api_data_garant/WARN] PoorDataWarning')
            return items, next_page, total_pages

    def cursor_garant(self, cursor: neofetch_cursor, collector: neofetch_collector, session=None, headers=None, retries=10):
        """Fetch a page, repairing the cursor on persistent failures.

        Repair strategy: first strip the cursor to its first component,
        then (once half the budget is gone) nudge it — backwards for
        ``CursorError``, forwards for ``CursorSlipError``.
        """
        if retries <= 0: raise ValidationError("Cursor error correction failed")
        try:
            return self.api_data_garant(cursor.url, collector, session, headers)
        except CursorError:
            print('[cursor_garant/ERR] CursorError')
            if not cursor.stripped:
                time.sleep(10)
                cursor.strip()
                return self.cursor_garant(cursor, collector, session, headers, retries - 1)
            elif retries > 5: return self.cursor_garant(cursor, collector, session, headers, retries - 1)
            else:
                cursor.decrement(2)
                return self.cursor_garant(cursor, collector, session, headers, retries - 1)
        except CursorSlipError:
            print('[cursor_garant/ERR] CursorSlipError')
            if not cursor.stripped:
                time.sleep(10)
                cursor.strip()
                return self.cursor_garant(cursor, collector, session, headers, retries - 1)
            elif retries > 5: return self.cursor_garant(cursor, collector, session, headers, retries - 1)
            else:
                cursor.increment(1)
                return self.cursor_garant(cursor, collector, session, headers, retries - 1)

    def validation_garant(self, cursor: neofetch_cursor, collector: neofetch_collector, session=None, headers=None, retries=10):
        """Outermost retry layer: restart the whole chain on ValidationError.

        Raises:
            RuntimeError: when retries are exhausted or the cursor overflows.
        """
        try: return self.cursor_garant(cursor, collector, session, headers)
        except ValidationError as e:
            # TODO log error
            if retries > 0: return self.validation_garant(cursor, collector, session, headers, retries - 1)
            else: raise RuntimeError from e
        except CursorIncrementError as e: raise RuntimeError from e

    def crawler(self, next_page, collector: neofetch_collector, session, type: str, start_number=0):
        """Follow ``nextPage`` links from ``next_page`` until the API runs dry.

        ``type`` (name kept for caller compatibility) is used only in the
        progress log line.
        """
        cur = neofetch_cursor(next_page)
        collector.current_number = start_number
        total_pages = None
        while next_page:
            # Fix: the original nested a same-quote f-string (SyntaxError on
            # Python < 3.12) and omitted the space before "of".
            progress = f' of {total_pages}' if total_pages else ''
            print(f'Fetching {type}: page {next_page}{progress}')
            try: items, next_page, total_pages = self.validation_garant(cur, collector, session)
            except RuntimeError:
                # TODO log error
                break
            cur.update(next_page)
            collector.add(items)
            collector.save()

    def tags(self, start_number=0):
        """Crawl /api/v1/tags into the tags collector."""
        # Fix: a '/' was missing between base_url and the API path, producing
        # URLs like 'https://civitai.comapi/v1/tags'.
        return self.crawler(next_page=self.base_url.rstrip('/') + '/api/v1/tags?limit=200',
                            collector=self.tags_collector, session=self.session, type='tags',
                            start_number=start_number)

    def creators(self, start_number=0):
        """Crawl /api/v1/creators into the creators collector."""
        # Fix: missing '/' between base_url and the API path (see tags()).
        return self.crawler(next_page=self.base_url.rstrip('/') + '/api/v1/creators?limit=200',
                            collector=self.creators_collector, session=self.session, type='creators',
                            start_number=start_number)

    def models(self, start_number=0):
        """Crawl /api/v1/models (oldest first, nsfw included) into the models collector."""
        # Fix: missing '/' between base_url and the API path (see tags()).
        return self.crawler(next_page=self.base_url.rstrip('/') + '/api/v1/models?period=AllTime&sort=Oldest&nsfw=true&limit=200',
                            collector=self.models_collector, session=self.session, type='models',
                            start_number=start_number)
if __name__ == '__main__':
    # Quick manual check: bump the numeric tail of a sample cursor URL by 10.
    demo_cursor = neofetch_cursor(
        'https://civitai.com/api/v1/models?period=AllTime&sort=Oldest&nsfw=true&cursor=2022-11-16%2023%3A31%3A28.203%7C162'
    )
    demo_cursor.increment(10)