"""Resilient crawler for the Civitai REST API.

A stack of "garant" (guarantee) wrappers handles one failure domain each --
raw networking, HTTP status codes, JSON decoding, API payload shape, and the
pagination cursor -- retrying or self-correcting before escalating to the
next layer up.
"""

import json
import os.path
import time
from dataclasses import dataclass
from typing import Any

import netifaces
import requests
from ping3 import ping
from requests import Response
from requests.exceptions import ConnectionError, Timeout, SSLError, ProxyError, RequestException


class NetworkError(Exception):
    """Connectivity to the target host could not be established."""


class ValidationError(Exception):
    """A retry budget was exhausted without producing a valid result."""


class CursorError(Exception):
    """The current pagination cursor produced an unrecoverable response."""


class ContentTypeMismatchError(Exception):
    """The response Content-Type was not application/json."""


class JsonParseError(Exception):
    """The response body could not be decoded as JSON."""


class EmptyDataError(Exception):
    """The API returned neither items nor metadata."""


class EmptyItemsError(Exception):
    """The API returned metadata but an empty item list."""


class EmptyMetaWarning(Exception):
    """The API returned items but no (usable) metadata."""


class CursorSlipError(Exception):
    """The advertised next page equals the current page (cursor is stuck)."""


class DuplicateDataError(Exception):
    """Every item on the page has already been collected."""


class PoorDataWarning(Exception):
    """Fewer than half of the page's items are new."""


class CursorIncrementError(Exception):
    """Arithmetic cursor correction ran out of digits."""


@dataclass
class neofetch_error_counters:
    """Running tallies of successes and failures per failure domain.

    FIX: fields are now annotated so @dataclass actually registers them
    (the original un-annotated attributes were invisible to the dataclass
    machinery and existed only as shared class variables).
    """

    # --- network layer (transport-level exceptions from requests) ---
    network_timeout_error: int = 0
    network_ssl_error: int = 0
    network_proxy_error: int = 0
    network_connection_error: int = 0
    network_request_exception: int = 0
    network_success: int = 0

    @property
    def network_errors(self) -> int:
        """Total transport-level failures across all categories."""
        return (self.network_timeout_error
                + self.network_ssl_error
                + self.network_proxy_error
                + self.network_connection_error
                + self.network_request_exception)

    @property
    def network_error_percentage(self) -> float:
        """Failure rate of network attempts, in percent (0 when no attempts)."""
        total = self.network_success + self.network_errors
        return float(self.network_errors) / float(total) * 100 if total != 0 else 0

    def reset_network_stats(self):
        """Zero every network-layer counter."""
        self.network_timeout_error = 0
        self.network_ssl_error = 0
        self.network_proxy_error = 0
        self.network_connection_error = 0
        self.network_request_exception = 0
        self.network_success = 0

    # --- HTTP layer (non-2xx status codes) ---
    http_unavailable: int = 0
    http_other: int = 0
    http_success: int = 0

    @property
    def http_errors(self) -> int:
        """Total non-2xx responses (503s plus everything else)."""
        return self.http_unavailable + self.http_other

    @property
    def http_error_percentage(self) -> float:
        """Failure rate of HTTP attempts, in percent (0 when no attempts)."""
        total = self.http_success + self.http_errors
        return float(self.http_errors) / float(total) * 100 if total != 0 else 0

    def reset_http_stats(self):
        """Zero every HTTP-layer counter."""
        self.http_unavailable = 0
        self.http_other = 0
        self.http_success = 0

    # --- JSON layer (decode / content-type problems) ---
    json_type_mismatch: int = 0
    json_parse_error: int = 0
    json_success: int = 0

    @property
    def json_errors(self) -> int:
        """Total JSON-layer failures."""
        return self.json_type_mismatch + self.json_parse_error

    @property
    def json_error_percentage(self) -> float:
        """Failure rate of JSON decoding attempts, in percent (0 when no attempts)."""
        total = self.json_success + self.json_errors
        return float(self.json_errors) / float(total) * 100 if total != 0 else 0

    def reset_json_stats(self):
        """Zero every JSON-layer counter."""
        self.json_type_mismatch = 0
        self.json_parse_error = 0
        self.json_success = 0


class neofetch_collector:
    """Deduplicating accumulator of API items, keyed by primary key.

    In autosave mode, newly seen items are buffered in ``pending_items`` and
    flushed to numbered JSON chunk files once the buffer reaches
    ``autosave_chunk_size``.
    """

    def __init__(self, start_number=0, autosave=False, save_path='', autosave_chunk_size=2000):
        self.items: dict[str, dict] = dict()           # everything ever collected, keyed by str(pk)
        self.pending_items: dict[str, dict] = dict()   # not-yet-saved subset (autosave mode only)
        self.autosave = autosave
        if autosave and save_path == '':
            raise ValueError('autosave mode is enabled, but path is not specified')
        self.save_path = save_path
        self.current_number = start_number             # item count at the last save; used in filenames
        self.autosave_chunk_size = autosave_chunk_size

    def check_autosave(self):
        """Flush the pending buffer if it has reached the chunk size."""
        if len(self.pending_items) < self.autosave_chunk_size:
            return 0
        self.save()

    def save(self, path=None, flush=False):
        """Write buffered (autosave) or all (non-autosave) items to a JSON chunk.

        Returns the number of items written (0 when nothing was pending).
        NOTE(review): the output path is always rebuilt from ``save_path``
        below, so the ``path`` parameter is effectively ignored -- confirm
        whether that is intended before relying on it.
        """
        if not path:
            path = self.save_path
        if len(self.pending_items) == 0:
            return 0
        if self.autosave:
            pending_items: list = list(self.pending_items.values())
            self.pending_items = dict()
        else:
            pending_items: list = list(self.items.values())
        # filename encodes the item-count range covered by this chunk
        path = os.path.join(self.save_path, f'{self.current_number}-{len(self.items)}.json')
        with open(path, "w", encoding="utf-8") as f:
            json.dump(pending_items, f, indent=4, ensure_ascii=False)
        self.current_number = len(self.items)
        if flush:
            self.flush()
        return len(pending_items)

    def flush(self):
        """Persist anything pending, then drop all in-memory state."""
        if self.save_path != '':
            self.save()
        self.items = dict()
        self.pending_items = dict()

    @staticmethod
    def _cast_items(items: dict | list[dict] | set[dict], pk='id') -> dict[str, dict]:
        """Normalise a dict / list / set of items into {str(pk): item}."""
        if isinstance(items, dict):
            items = [items]
        elif isinstance(items, set):
            items = list(items)
        return {str(item.get(pk, 'None')): item for item in items}

    def _compare(self, items: dict[str, dict]) -> int:
        """Count keys in ``items`` not yet present in the collection."""
        return len(set(items) - set(self.items))

    def compare(self, items: dict | list[dict] | set[dict], pk='id') -> int:
        """Count how many of ``items`` would be new if added."""
        return len(set(self._cast_items(items, pk)) - set(self.items))

    def add(self, items: dict | list[dict] | set[dict], pk='id') -> int:
        """Merge ``items`` into the collection; return the number of new ones."""
        items = self._cast_items(items, pk)
        new_items_count = self._compare(items)
        new_items = set(items) - set(self.items)
        self.items.update(items)
        if self.autosave:
            for key in new_items:
                self.pending_items[key] = items[key]
            self.check_autosave()
        return new_items_count

    @property
    def list(self):
        """All collected items as a list."""
        return [value for key, value in self.items.items()]

    @property
    def set(self):
        """All collected items as a set (items must be hashable)."""
        return {value for key, value in self.items.items()}


class neofetch_cursor:
    """Mutable wrapper around a paginated URL with cursor-correction helpers."""

    def __init__(self, url: str):
        self.url = url
        self.stripped = True  # whether strip() has nothing left to remove

    def update(self, next_page, stripped=False):
        """Replace the URL with the API-provided next page."""
        self.url = next_page
        self.stripped = stripped

    def strip(self):
        """Truncate the cursor value at the first %-escape, keeping its prefix."""
        split = self.url.split('cursor=', maxsplit=1)
        if len(split) < 2:
            return self.url
        prefix = split[0] + 'cursor='
        suffix = split[1].split('%', maxsplit=1)[0]
        self.url = prefix + suffix
        return self.url

    @staticmethod
    def _order(number):
        """Smallest power of ten strictly above ``number`` (>= 10)."""
        mask = 10
        while number > mask:
            mask *= 10
        return mask

    def increment(self, count=1):
        """Arithmetically bump the numeric tail of the cursor by ``count``.

        For a timestamp cursor like ``...28.203%7C162`` the milliseconds
        field (``203``) is the part incremented; the ``%...`` tail is kept.
        Raises CursorIncrementError when the field would overflow its width.
        """
        split = self.url.split('cursor=', maxsplit=1)
        if len(split) < 2:
            return self.url
        prefix = split[0] + 'cursor='
        split = split[1].rsplit('%', maxsplit=1)
        if len(split) >= 2:
            suffix = '%' + split[1]
        else:
            suffix = ''
        split = split[0].rsplit('.', maxsplit=1)
        if len(split) >= 2:
            prefix += split[0] + '.'
            cursor = split[1]
        else:
            cursor = split[0]
        cursor_order = len(cursor)  # digit width to preserve when re-padding
        cursor = int(cursor)
        incremented_cursor = cursor + count
        if incremented_cursor < pow(10, cursor_order):
            cursor = incremented_cursor
        else:
            raise CursorIncrementError(f'cursor has reached bounds: {pow(10, cursor_order)}')
        # FIX: pad to the cursor's original digit count (was hardcoded :03d,
        # which corrupted any cursor field not exactly three digits wide)
        self.url = f'{prefix}{cursor:0{cursor_order}d}{suffix}'
        return self.url

    def decrement(self, count=1):
        """Bump the cursor backwards by ``count``."""
        return self.increment(-count)


class neofetch:
    """Fetches tags, creators and models from the Civitai API with layered retries."""

    def __init__(self, path, base_url=None, session=None):
        self.path = path
        # FIX: normalise the base URL to end with exactly one slash -- the
        # original plain concatenation produced 'https://civitai.comapi/v1/...'
        self.base_url = (base_url or 'https://civitai.com').rstrip('/') + '/'
        self.session = session
        self.errors = neofetch_error_counters()
        self.tags_collector = neofetch_collector(autosave=True, save_path=os.path.join(path, 'fetch', 'tags'))
        self.creators_collector = neofetch_collector(autosave=True, save_path=os.path.join(path, 'fetch', 'creators'))
        self.models_collector = neofetch_collector(autosave=True, save_path=os.path.join(path, 'fetch', 'models'))
        self.images_collector = neofetch_collector(autosave=True, save_path=os.path.join(path, 'fetch', 'images'))

    @staticmethod
    def check_network(hostname=None) -> bool:
        """Diagnose connectivity layer by layer: gateway, WAN, DNS, target host."""
        # check gateway (best effort: many gateways drop ICMP)
        p = ping(netifaces.gateways()['default'][netifaces.AF_INET][0])
        if p:
            print('[Network check/INFO] gateway is reachable')
        else:
            print('[Network check/WARN] gateway unreachable or ping is not allowed')
        # check WAN via a well-known IP (no DNS involved)
        p = ping('1.1.1.1')
        if p:
            print('[Network check/INFO] WAN is reachable')
        else:
            print('[Network check/ERR] WAN is unreachable')
            return False
        # check DNS by pinging a name instead of an address
        p = ping('google.com')
        if p:
            print('[Network check/INFO] DNS is working')
        else:
            print('[Network check/ERR] DNS is unreachable')
            return False
        if not hostname:
            print('[Network check/WARN] target not specified. skipping')
            # FIX: nothing left to check -- previously execution fell
            # through to ping(None) and crashed
            return True
        # check the target host answers ping
        p = ping(hostname)
        if p:
            print('[Network check/INFO] site host is up')
        else:
            print('[Network check/ERR] site host is unreachable')
            raise NetworkError('[Network check/ERR] site host is unreachable')
        # check the site answers HTTP
        try:
            response = requests.get('https://' + hostname)
            print('[Network check/INFO] site is responding to HTTP requests')
            return True
        except RequestException as e:
            raise NetworkError from e

    def wait_for_network(self, hostname):
        """Block until check_network() reports a working path to ``hostname``."""
        while not self.check_network(hostname):
            print('Waiting for network...')
            time.sleep(30)

    def network_garant(self, url, session=None, headers=None, retries=10) -> requests.models.Response | None:
        """Layer 1: GET ``url``, waiting out transport failures and retrying.

        Raises ValidationError once ``retries`` is exhausted.
        """
        if retries <= 0:
            raise ValidationError("Network error correction failed")
        exception_occurred = False
        try:
            if session:
                r = session.get(url)
            elif headers:
                r = requests.get(url, headers=headers)
            else:
                r: requests.models.Response = requests.get(url)
            if not isinstance(r, requests.models.Response):
                raise ValidationError(
                    f'response has type {type(r)} but requests.models.Response is required'
                )
            return r
        except Timeout as e:
            # connection / read timeout
            print("Timeout:", e)
            self.errors.network_timeout_error += 1
            exception_occurred = True
        except SSLError as e:
            # TLS handshake problems
            print("SSL error:", e)
            self.errors.network_ssl_error += 1
            exception_occurred = True
        except ProxyError as e:
            # proxy failure (a ConnectionError subclass, so it must come first)
            print("Proxy error:", e)
            self.errors.network_proxy_error += 1
            exception_occurred = True
        except ConnectionError as e:
            # connection errors: DNS, unreachable host, RST, etc.
            print("Connection failed:", e)
            self.errors.network_connection_error += 1
            exception_occurred = True
        except RequestException as e:
            # any other unexpected request error
            print("General request error:", e)
            self.errors.network_request_exception += 1
            exception_occurred = True
        finally:
            if exception_occurred:
                try:
                    # extract the hostname from the URL for the network probe
                    self.wait_for_network(str(url).split('//', maxsplit=1)[1].split('/', maxsplit=1)[0])
                except Exception as e:
                    self.wait_for_network(hostname=None)
                return self.network_garant(url, session, headers, retries - 1)
            else:
                self.errors.network_success += 1

    def http_garant(self, url, session=None, headers=None, retries=10, service_available_retries=720):
        """Layer 2: retry non-2xx responses; wait out 503s (up to ~12h).

        Raises ValidationError when ``retries`` is exhausted, CursorError for
        other HTTP errors (so the cursor layer can attempt a correction).
        """
        if retries <= 0:
            raise ValidationError("HTTP error correction failed")
        try:
            response = self.network_garant(url, session, headers)
            response.raise_for_status()
            self.errors.http_success += 1
            return response
        except requests.exceptions.HTTPError as e:
            status = e.response.status_code
            if status == 503:
                self.errors.http_unavailable += 1
                print("[http_garant/WARN] HTTP error, waiting availability:", e)
                time.sleep(60)
                return self.http_garant(url, session, headers, retries, service_available_retries - 1)
            else:
                self.errors.http_other += 1
                print("[http_garant/ERR] HTTP error:", e)
                time.sleep(10)
                # NOTE(review): this condition looks inverted -- as written,
                # a fresh call (service_available_retries=720) raises
                # CursorError immediately on any non-503 status, and only
                # retries after the 503 budget has been drained. Confirm
                # intent before changing; cursor_garant relies on receiving
                # CursorError to trigger cursor correction.
                if service_available_retries <= 0:
                    return self.http_garant(url, session, headers, retries - 1, service_available_retries)
                else:
                    raise CursorError from e

    def json_garant(self, url, session=None, headers=None, retries=10):
        """Layer 3: fetch and decode JSON, retrying wrong content types and parse errors."""
        if retries <= 0:
            raise ValidationError("JSON parse error correction failed")
        try:
            response = self.http_garant(url, session, headers)
            ct = response.headers.get("Content-Type", "")
            if not ct.lower().startswith("application/json"):
                raise ContentTypeMismatchError
            j = response.json()
            self.errors.json_success += 1
            return j
        except ContentTypeMismatchError:
            self.errors.json_type_mismatch += 1
            print("[json_garant/ERR] HTTP error")
            time.sleep(10)
            return self.json_garant(url, session, headers, retries - 1)
        except ValueError as e:
            # requests raises a ValueError subclass on undecodable JSON bodies
            self.errors.json_parse_error += 1
            print("[json_garant/ERR] HTTP error:", e)
            time.sleep(10)
            return self.json_garant(url, session, headers, retries - 1)

    def api_data_garant(self, url, collector: neofetch_collector, session=None, headers=None, retries=10, ):
        """Layer 4: validate the payload shape and data quality of one page.

        Returns (items, next_page, total_pages); next_page is None when the
        metadata carries no nextPage field (end of pagination).
        """
        if retries <= 0:
            raise ValidationError("API data error correction failed")
        try:
            response = self.json_garant(url, session, headers)
            if 'items' not in response or 'metadata' not in response:
                raise EmptyDataError
            items = response['items']
            metadata = response['metadata']
            del response  # pages can be large; release early
            if len(items) == 0 and len(metadata) == 0:
                raise EmptyDataError
            elif len(items) == 0:
                raise EmptyItemsError
            elif len(metadata) == 0:
                raise EmptyMetaWarning
            if 'nextPage' not in metadata:
                raise EmptyMetaWarning('Metadata has not nextPage field')
            else:
                next_page = metadata['nextPage']
            if 'totalPages' in metadata:
                total_pages = metadata['totalPages']
            else:
                total_pages = None
            if next_page and next_page == url:
                # the API handed back the page we just fetched
                raise CursorSlipError
            new_items_count = collector.compare(items)
            new_items_percentage = float(new_items_count) / float(len(items)) * 100
            if new_items_count == 0:
                raise DuplicateDataError
            elif new_items_percentage < 50:
                raise PoorDataWarning
            return items, next_page, total_pages
        except EmptyDataError:
            print('[api_data_garant/ERR] EmptyDataError: Empty api response')
            time.sleep(10)
            if retries > 1:
                return self.api_data_garant(url, collector, session, headers, retries - 1)
            else:
                raise CursorError
        except EmptyItemsError:
            print('[api_data_garant/ERR] EmptyItemsError: Empty api response')
            time.sleep(10)
            if retries > 1:
                return self.api_data_garant(url, collector, session, headers, retries - 1)
            else:
                raise CursorError
        except EmptyMetaWarning:
            # items are usable; treat missing metadata as end of pagination
            print('[api_data_garant/WARN] EmptyMetaWarning')
            return items, None, None
        except DuplicateDataError:
            print('[api_data_garant/ERR] DuplicateDataError')
            if retries > 1:
                return self.api_data_garant(url, collector, session, headers, retries - 1)
            else:
                raise CursorSlipError
        except PoorDataWarning:
            # mostly duplicates, but still worth keeping what is new
            print('[api_data_garant/WARN] PoorDataWarning')
            return items, next_page, total_pages

    def cursor_garant(self, cursor: neofetch_cursor, collector: neofetch_collector, session=None, headers=None, retries=10):
        """Layer 5: repair a broken cursor by stripping, then nudging it.

        On CursorError the cursor is moved backwards (re-fetch), on
        CursorSlipError forwards (skip the stuck position).
        """
        if retries <= 0:
            raise ValidationError("Cursor error correction failed")
        try:
            return self.api_data_garant(cursor.url, collector, session, headers)
        except CursorError:
            print('[cursor_garant/ERR] CursorError')
            if not cursor.stripped:
                time.sleep(10)
                cursor.strip()
                return self.cursor_garant(cursor, collector, session, headers, retries - 1)
            elif retries > 5:
                # first half of the budget: plain retries
                return self.cursor_garant(cursor, collector, session, headers, retries - 1)
            else:
                # second half: step the cursor backwards and retry
                cursor.decrement(2)
                return self.cursor_garant(cursor, collector, session, headers, retries - 1)
        except CursorSlipError:
            print('[cursor_garant/ERR] CursorSlipError')
            if not cursor.stripped:
                time.sleep(10)
                cursor.strip()
                return self.cursor_garant(cursor, collector, session, headers, retries - 1)
            elif retries > 5:
                return self.cursor_garant(cursor, collector, session, headers, retries - 1)
            else:
                # skip past the position the API keeps returning
                cursor.increment(1)
                return self.cursor_garant(cursor, collector, session, headers, retries - 1)

    def validation_garant(self, cursor: neofetch_cursor, collector: neofetch_collector, session=None, headers=None, retries=10):
        """Layer 6 (top): restart the whole garant stack a few times before giving up."""
        try:
            return self.cursor_garant(cursor, collector, session, headers)
        except ValidationError as e:
            # TODO log error
            if retries > 0:
                return self.validation_garant(cursor, collector, session, headers, retries - 1)
            else:
                raise RuntimeError from e
        except CursorIncrementError as e:
            # arithmetic correction overflowed -- nothing more we can do
            raise RuntimeError from e

    def crawler(self, next_page, collector: neofetch_collector, session, type: str, start_number=0):
        """Walk a paginated endpoint until next_page runs out, collecting items."""
        cur = neofetch_cursor(next_page)
        collector.current_number = start_number
        total_pages = None
        while next_page:
            # FIX: added the missing space before 'of' (was 'page Xof Y')
            # and un-nested the same-quote f-string (3.12-only syntax)
            print(f'Fetching {type}: page {next_page}' + (f' of {total_pages}' if total_pages else ''))
            try:
                items, next_page, total_pages = self.validation_garant(cur, collector, session)
            except RuntimeError:
                # TODO log error
                break
            cur.update(next_page)
            collector.add(items)
        collector.save()  # persist any remainder below the autosave chunk size

    def tags(self, start_number=0):
        """Crawl all tags."""
        return self.crawler(next_page=self.base_url + 'api/v1/tags?limit=200',
                            collector=self.tags_collector,
                            session=self.session,
                            type='tags',
                            start_number=start_number)

    def creators(self, start_number=0):
        """Crawl all creators."""
        return self.crawler(next_page=self.base_url + 'api/v1/creators?limit=200',
                            collector=self.creators_collector,
                            session=self.session,
                            type='creators',
                            start_number=start_number)

    def models(self, start_number=0):
        """Crawl all models, oldest first, including NSFW."""
        return self.crawler(next_page=self.base_url + 'api/v1/models?period=AllTime&sort=Oldest&nsfw=true&limit=200',
                            collector=self.models_collector,
                            session=self.session,
                            type='models',
                            start_number=start_number)


if __name__ == '__main__':
    n = neofetch_cursor('https://civitai.com/api/v1/models?period=AllTime&sort=Oldest&nsfw=true&cursor=2022-11-16%2023%3A31%3A28.203%7C162')
    n.increment(10)
    pass