import json
import os.path
import time
from dataclasses import dataclass

import netifaces
from ping3 import ping

import requests
from requests.exceptions import ConnectionError, Timeout, SSLError, ProxyError, RequestException


class NetworkError(Exception): pass
class ValidationError(Exception): pass
class CursorError(Exception): pass

class ContentTypeMismatchError(Exception): pass
class JsonParseError(Exception): pass

class EmptyDataError(Exception): pass
class EmptyItemsError(Exception): pass
class EmptyMetaWarning(Exception): pass
class CursorSlipError(Exception): pass
class DuplicateDataError(Exception): pass
class PoorDataWarning(Exception): pass

class CursorIncrementError(Exception): pass


@dataclass
class neofetch_error_counters:
    network_timeout_error: int = 0
    network_ssl_error: int = 0
    network_proxy_error: int = 0
    network_connection_error: int = 0
    network_request_exception: int = 0
    network_success: int = 0

    @property
    def network_errors(self): return (self.network_timeout_error + self.network_ssl_error + self.network_proxy_error +
                                      self.network_connection_error + self.network_request_exception)

    @property
    def network_error_percentage(self): return float(self.network_errors) / float(self.network_success + self.network_errors) * 100 if self.network_success + self.network_errors != 0 else 0  # %

    def reset_network_stats(self):
        self.network_timeout_error = 0
        self.network_ssl_error = 0
        self.network_proxy_error = 0
        self.network_connection_error = 0
        self.network_request_exception = 0
        self.network_success = 0

    http_unavailable: int = 0
    http_other: int = 0
    http_success: int = 0

    @property
    def http_errors(self): return self.http_unavailable + self.http_other

    @property
    def http_error_percentage(self): return float(self.http_errors) / float(self.http_success + self.http_errors) * 100 if self.http_success + self.http_errors != 0 else 0  # %

    def reset_http_stats(self):
        self.http_unavailable = 0
        self.http_other = 0
        self.http_success = 0

    json_type_mismatch: int = 0
    json_parse_error: int = 0
    json_success: int = 0

    @property
    def json_errors(self): return self.json_type_mismatch + self.json_parse_error

    @property
    def json_error_percentage(self): return float(self.json_errors) / float(self.json_success + self.json_errors) * 100 if self.json_success + self.json_errors != 0 else 0  # %

    def reset_json_stats(self):
        self.json_type_mismatch = 0
        self.json_parse_error = 0
        self.json_success = 0

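# A minimal sketch of how the counters are meant to be read; the numbers here are
# hypothetical, not from a real run:
#
#   counters = neofetch_error_counters()
#   counters.network_timeout_error += 1
#   counters.network_success += 9
#   counters.network_errors            # -> 1
#   counters.network_error_percentage  # -> 10.0 (%)
#   counters.reset_network_stats()
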

class neofetch_collector:
    def __init__(self, start_number=0, autosave=False, save_path='', autosave_chunk_size=2000):
        self.items: dict[str, dict] = dict()
        self.pending_items: dict[str, dict] = dict()
        self.autosave = autosave
        if autosave and save_path == '': raise ValueError('autosave mode is enabled, but path is not specified')
        self.save_path = save_path
        self.current_number = start_number
        self.autosave_chunk_size = autosave_chunk_size

    def check_autosave(self):
        if len(self.pending_items) < self.autosave_chunk_size: return 0
        return self.save()

    def save(self, path=None, flush=False):
        if not path: path = self.save_path
        if self.autosave:
            if len(self.pending_items) == 0: return 0
            pending_items: list = list(self.pending_items.values())
            self.pending_items = dict()
        else:
            if len(self.items) == 0: return 0
            pending_items: list = list(self.items.values())

        os.makedirs(path, exist_ok=True)
        file_path = os.path.join(path, f'{self.current_number}-{len(self.items)}.json')
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(pending_items, f, indent=4, ensure_ascii=False)
        self.current_number = len(self.items)
        if flush:
            self.items = dict()
            self.pending_items = dict()
        return len(pending_items)

    def flush(self):
        if self.save_path != '': self.save()
        self.items = dict()
        self.pending_items = dict()

    @staticmethod
    def _cast_items(items: dict | list[dict] | set[dict], pk='id') -> dict[str, dict]:
        # normalize a single item or a collection of items into a pk -> item mapping
        result: dict[str, dict] = dict()
        if isinstance(items, list): pass
        elif isinstance(items, dict): items = [items]
        elif isinstance(items, set): items = list(items)

        for item in items: result[str(item.get(pk, 'None'))] = item
        return result

    def _compare(self, items: dict[str, dict]) -> int: return len(set(items) - set(self.items))

    def compare(self, items: dict | list[dict] | set[dict], pk='id') -> int:
        return len(set(self._cast_items(items, pk)) - set(self.items))

    def add(self, items: dict | list[dict] | set[dict], pk='id') -> int:
        items = self._cast_items(items, pk)
        new_items_count = self._compare(items)
        new_items = set(items) - set(self.items)
        self.items.update(items)
        if self.autosave:
            for key in new_items: self.pending_items[key] = items[key]
            self.check_autosave()
        return new_items_count

    @property
    def list(self): return list(self.items.values())

    @property
    def set(self):
        # dict values are not hashable, so expose the set of primary keys
        return set(self.items)

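# A minimal usage sketch (hypothetical data): the collector dedupes by primary key
# and, in autosave mode, dumps each chunk of new items to '<start>-<total>.json':
#
#   c = neofetch_collector(autosave=True, save_path='/tmp/fetch', autosave_chunk_size=2)
#   c.add([{'id': 1}, {'id': 2}])   # -> 2 new items, chunk written
#   c.add({'id': 2})                # -> 0, duplicate is ignored
#   c.flush()                       # write whatever is still pending
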

class neofetch_cursor:
    def __init__(self, url: str):
        self.url = url
        self.stripped = True

    def update(self, next_page, stripped=False):
        self.url = next_page
        self.stripped = stripped

    def strip(self):
        # drop everything after the first URL-encoded character of the cursor value
        split = self.url.split('cursor=', maxsplit=1)
        if len(split) < 2: return self.url
        prefix = split[0] + 'cursor='
        suffix = split[1].split('%', maxsplit=1)[0]
        self.url = prefix + suffix
        return self.url

    @staticmethod
    def _order(number):
        mask = 10
        while number > mask: mask *= 10
        return mask

    def increment(self, count=1):
        # shift the numeric fraction of the cursor (the part between the last '.'
        # and the last URL-encoded separator) by `count`, keeping its zero-padded width
        split = self.url.split('cursor=', maxsplit=1)
        if len(split) < 2: return self.url
        prefix = split[0] + 'cursor='
        split = split[1].rsplit('%', maxsplit=1)
        if len(split) >= 2: suffix = '%' + split[1]
        else: suffix = ''
        split = split[0].rsplit('.', maxsplit=1)
        if len(split) >= 2:
            prefix += split[0] + '.'
            cursor = split[1]
        else: cursor = split[0]
        cursor_order = len(cursor)
        cursor = int(cursor)
        incremented_cursor = cursor + count
        if incremented_cursor < pow(10, cursor_order): cursor = incremented_cursor
        else: raise CursorIncrementError(f'cursor has reached bounds: {pow(10, cursor_order)}')
        self.url = f'{prefix}{cursor:0{cursor_order}d}{suffix}'
        return self.url

    def decrement(self, count=1): return self.increment(-count)

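# Worked example (using the sample URL from the __main__ block below): for
# '...cursor=2022-11-16%2023%3A31%3A28.203%7C162', increment(10) bumps the
# fractional-seconds field '203' to '213', yielding
# '...cursor=2022-11-16%2023%3A31%3A28.213%7C162', while strip() cuts the cursor
# back to its first URL-encoded character: '...cursor=2022-11-16'.
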

class neofetch:
    def __init__(self, path, base_url=None, session=None):
        self.path = path
        self.base_url = base_url or 'https://civitai.com'
        self.session = session

        self.errors = neofetch_error_counters()
        self.tags_collector = neofetch_collector(autosave=True, save_path=os.path.join(path, 'fetch', 'tags'))
        self.creators_collector = neofetch_collector(autosave=True, save_path=os.path.join(path, 'fetch', 'creators'))
        self.models_collector = neofetch_collector(autosave=True, save_path=os.path.join(path, 'fetch', 'models'))
        self.images_collector = neofetch_collector(autosave=True, save_path=os.path.join(path, 'fetch', 'images'))

    @staticmethod
    def check_network(hostname=None) -> bool:
        # check gateway
        p = ping(netifaces.gateways()['default'][netifaces.AF_INET][0])
        if p: print('[Network check/INFO] gateway is reachable')
        else: print('[Network check/WARN] gateway unreachable or ping is not allowed')
        # check wan
        p = ping('1.1.1.1')
        if p: print('[Network check/INFO] WAN is reachable')
        else:
            print('[Network check/ERR] WAN is unreachable')
            return False
        # check DNS
        p = ping('google.com')
        if p: print('[Network check/INFO] DNS is working')
        else:
            print('[Network check/ERR] DNS is unreachable')
            return False
        if not hostname:
            print('[Network check/WARN] target not specified. skipping')
            return True

        # check target
        p = ping(hostname)
        if p:
            print('[Network check/INFO] site host is up')
        else:
            print('[Network check/ERR] site host is unreachable')
            raise NetworkError('[Network check/ERR] site host is unreachable')
        # check site working
        try:
            requests.get('https://' + hostname)
            print('[Network check/INFO] site is responding to HTTP requests')
            return True
        except RequestException as e:
            raise NetworkError from e

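    # The checks cascade from the local gateway outwards, so a failure pinpoints
    # the broken hop. A hedged example (hostname is illustrative):
    #
    #   neofetch.check_network('civitai.com')  # gateway -> WAN -> DNS -> host -> HTTP
    #   neofetch.check_network()               # infrastructure only, target skipped
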
    def wait_for_network(self, hostname):
        while not self.check_network(hostname):
            print('Waiting for network...')
            time.sleep(30)

    def network_garant(self, url, session=None, headers=None, retries=10) -> requests.models.Response | None:
        if retries <= 0: raise ValidationError("Network error correction failed")

        try:
            if session: r = session.get(url)
            elif headers: r = requests.get(url, headers=headers)
            else: r = requests.get(url)
            if not isinstance(r, requests.models.Response): raise ValidationError(
                f'response has type {type(r)} but requests.models.Response is required'
            )
            self.errors.network_success += 1
            return r
        except Timeout as e:
            # connection/read timeout
            print("Timeout:", e)
            self.errors.network_timeout_error += 1

        except SSLError as e:
            # TLS handshake problems
            print("SSL error:", e)
            self.errors.network_ssl_error += 1

        except ProxyError as e:
            # proxy error (often a subcase of ConnectionError, counted separately)
            print("Proxy error:", e)
            self.errors.network_proxy_error += 1
        except ConnectionError as e:
            # connection errors: DNS, unreachable host, RST, proxy fail, etc.
            print("Connection failed:", e)
            self.errors.network_connection_error += 1
        except RequestException as e:
            # any other unexpected request error
            print("General request error:", e)
            self.errors.network_request_exception += 1

        # reached only after a handled exception: wait for connectivity, then retry
        try: self.wait_for_network(str(url).split('//', maxsplit=1)[1].split('/', maxsplit=1)[0])
        except Exception: self.wait_for_network(hostname=None)
        return self.network_garant(url, session, headers, retries - 1)

    def http_garant(self, url, session=None, headers=None, retries=10, service_available_retries=720):
        if retries <= 0: raise ValidationError("HTTP error correction failed")
        if service_available_retries <= 0: raise ValidationError("Service availability wait exhausted")

        try:
            response = self.network_garant(url, session, headers)
            response.raise_for_status()
            self.errors.http_success += 1
            return response
        except requests.exceptions.HTTPError as e:
            status = e.response.status_code
            if status == 503:
                self.errors.http_unavailable += 1
                print("[http_garant/WARN] HTTP error, waiting availability:", e)
                time.sleep(60)
                return self.http_garant(url, session, headers, retries, service_available_retries - 1)
            else:
                self.errors.http_other += 1
                print("[http_garant/ERR] HTTP error:", e)
                time.sleep(10)
                # non-503 errors usually mean a broken cursor: retry a few times,
                # then hand the problem to the cursor-correction layer
                if retries > 1: return self.http_garant(url, session, headers, retries - 1, service_available_retries)
                else: raise CursorError from e

    def json_garant(self, url, session=None, headers=None, retries=10):
        if retries <= 0: raise ValidationError("JSON parse error correction failed")

        try:
            response = self.http_garant(url, session, headers)
            ct = response.headers.get("Content-Type", "")
            if not ct.lower().startswith("application/json"): raise ContentTypeMismatchError
            j = response.json()
            self.errors.json_success += 1
            return j
        except ContentTypeMismatchError:
            self.errors.json_type_mismatch += 1
            print("[json_garant/ERR] Content-Type mismatch")
            time.sleep(10)
            return self.json_garant(url, session, headers, retries - 1)

        except ValueError as e:
            # requests raises a ValueError subclass when the body is not valid JSON
            self.errors.json_parse_error += 1
            print("[json_garant/ERR] JSON parse error:", e)
            time.sleep(10)
            return self.json_garant(url, session, headers, retries - 1)

    def api_data_garant(self, url, collector: neofetch_collector, session=None, headers=None, retries=10):
        if retries <= 0: raise ValidationError("API data error correction failed")

        try:
            response = self.json_garant(url, session, headers)
            if 'items' not in response or 'metadata' not in response: raise EmptyDataError
            items = response['items']
            metadata = response['metadata']
            del response

            if len(items) == 0 and len(metadata) == 0: raise EmptyDataError
            elif len(items) == 0: raise EmptyItemsError
            elif len(metadata) == 0: raise EmptyMetaWarning

            if 'nextPage' not in metadata: raise EmptyMetaWarning('Metadata has no nextPage field')
            else: next_page = metadata['nextPage']
            if 'totalPages' in metadata: total_pages = metadata['totalPages']
            else: total_pages = None

            if next_page and next_page == url: raise CursorSlipError
            new_items_count = collector.compare(items)
            new_items_percentage = float(new_items_count) / float(len(items)) * 100
            if new_items_count == 0: raise DuplicateDataError
            elif new_items_percentage < 50: raise PoorDataWarning
            return items, next_page, total_pages

        except EmptyDataError:
            print('[api_data_garant/ERR] EmptyDataError: Empty api response')
            time.sleep(10)
            if retries > 1:
                return self.api_data_garant(url, collector, session, headers, retries - 1)
            else: raise CursorError

        except EmptyItemsError:
            print('[api_data_garant/ERR] EmptyItemsError: api response has no items')
            time.sleep(10)
            if retries > 1:
                return self.api_data_garant(url, collector, session, headers, retries - 1)
            else:
                raise CursorError

        except EmptyMetaWarning:
            print('[api_data_garant/WARN] EmptyMetaWarning')
            return items, None, None

        except DuplicateDataError:
            print('[api_data_garant/ERR] DuplicateDataError')
            if retries > 1:
                return self.api_data_garant(url, collector, session, headers, retries - 1)
            else:
                raise CursorSlipError

        except PoorDataWarning:
            print('[api_data_garant/WARN] PoorDataWarning')
            return items, next_page, total_pages

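    # The response shape this method expects (inferred from the fields it reads;
    # the values are illustrative):
    #
    #   {
    #       "items": [{"id": 162, ...}, ...],
    #       "metadata": {"nextPage": "https://...&cursor=...", "totalPages": 42}
    #   }
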
    def cursor_garant(self, cursor: neofetch_cursor, collector: neofetch_collector, session=None, headers=None, retries=10):
        if retries <= 0: raise ValidationError("Cursor error correction failed")

        try:
            return self.api_data_garant(cursor.url, collector, session, headers)
        except CursorError:
            print('[cursor_garant/ERR] CursorError')
            if not cursor.stripped:
                time.sleep(10)
                cursor.strip()
                return self.cursor_garant(cursor, collector, session, headers, retries - 1)
            elif retries > 5: return self.cursor_garant(cursor, collector, session, headers, retries - 1)
            else:
                # plain retries exhausted: step the cursor back and try again
                cursor.decrement(2)
                return self.cursor_garant(cursor, collector, session, headers, retries - 1)
        except CursorSlipError:
            print('[cursor_garant/ERR] CursorSlipError')
            if not cursor.stripped:
                time.sleep(10)
                cursor.strip()
                return self.cursor_garant(cursor, collector, session, headers, retries - 1)
            elif retries > 5: return self.cursor_garant(cursor, collector, session, headers, retries - 1)
            else:
                # the cursor is stuck on the same page: nudge it forward
                cursor.increment(1)
                return self.cursor_garant(cursor, collector, session, headers, retries - 1)

    def validation_garant(self, cursor: neofetch_cursor, collector: neofetch_collector, session=None, headers=None, retries=10):
        try: return self.cursor_garant(cursor, collector, session, headers)
        except ValidationError as e:
            # TODO log error
            if retries > 0: return self.validation_garant(cursor, collector, session, headers, retries - 1)
            else: raise RuntimeError from e
        except CursorIncrementError as e: raise RuntimeError from e

    def crawler(self, next_page, collector: neofetch_collector, session, type: str, start_number=0):
        cur = neofetch_cursor(next_page)
        collector.current_number = start_number
        total_pages = None
        while next_page:
            print(f"Fetching {type}: page {next_page}{f' of {total_pages}' if total_pages else ''}")
            try: items, next_page, total_pages = self.validation_garant(cur, collector, session)
            except RuntimeError:
                # TODO log error
                break
            cur.update(next_page)
            collector.add(items)
        collector.save()

    def tags(self, start_number=0):
        return self.crawler(next_page=self.base_url + '/api/v1/tags?limit=200', collector=self.tags_collector,
                            session=self.session, type='tags', start_number=start_number)

    def creators(self, start_number=0):
        return self.crawler(next_page=self.base_url + '/api/v1/creators?limit=200', collector=self.creators_collector,
                            session=self.session, type='creators', start_number=start_number)

    def models(self, start_number=0):
        return self.crawler(next_page=self.base_url + '/api/v1/models?period=AllTime&sort=Oldest&nsfw=true&limit=200',
                            collector=self.models_collector, session=self.session, type='models', start_number=start_number)


if __name__ == '__main__':
    n = neofetch_cursor('https://civitai.com/api/v1/models?period=AllTime&sort=Oldest&nsfw=true&cursor=2022-11-16%2023%3A31%3A28.203%7C162')
    print(n.increment(10))
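
# A hedged sketch of a full crawl (the path is illustrative; requires network
# access and a writable output directory):
#
#   fetcher = neofetch('/tmp/civitai')
#   fetcher.tags()
#   fetcher.creators()
#   fetcher.models()
#   print(f'{fetcher.errors.network_error_percentage:.1f}% network errors')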