Harden image fetch algorithm

Prepare for database integration
This commit is contained in:
2025-09-24 20:07:19 +07:00
parent 88d40c0d99
commit 3a88bdad3a
17 changed files with 600 additions and 26 deletions

View File

@@ -5,7 +5,7 @@ from dataclasses import dataclass, fields
from pathlib import Path from pathlib import Path
from typing import List from typing import List
from pythonapp.Libs.ConfigDataClass import Config from modules.shared.ConfigDataClass import Config
@dataclass @dataclass
@@ -61,7 +61,7 @@ class ModelPackage:
field_value = getattr(package_info, field_name) field_value = getattr(package_info, field_name)
if field_value is not None and field_value != "" and field_name != "filename": if field_value is not None and field_value != "" and field_name != "filename":
current_value = getattr(self.info, field_name) current_value = getattr(self.info, field_name)
if current_value is None or current_value == "" or current_value == 0 or len(current_value) == 0: if current_value is None or current_value == "" or current_value == 0 or len(str(current_value)) == 0:
setattr(self.info, field_name, field_value) setattr(self.info, field_name, field_value)
# Генерируем UUID если он не определен # Генерируем UUID если он не определен

View File

@@ -1,7 +1,7 @@
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from pythonapp.Libs.ConfigDataClass import Config from modules.shared.ConfigDataClass import Config
@dataclass @dataclass

View File

@@ -301,7 +301,7 @@ class ModelPackageSubRepository:
internal = True internal = True
break break
pulled = list() pulled: list[ModelPackage] = list()
for candidate in to_pull: for candidate in to_pull:
package_path = self.path / candidate['uuid'] package_path = self.path / candidate['uuid']
if os.path.exists(str(Path(package_path) / "package.json")): raise RuntimeError("package exists!") if os.path.exists(str(Path(package_path) / "package.json")): raise RuntimeError("package exists!")
@@ -357,13 +357,15 @@ class ModelPackageSubRepository:
pulled.append(package) pulled.append(package)
for package in pulled: for package in pulled:
info = package.info
print('Collections for package:') print('Collections for package:')
print(f' {'N':<{2}} {'version':<{10}} {'type':<{10}} {'release_date':<{25}}' print(f' {'N':<{2}} {'version':<{10}} {'type':<{10}} {'release_date':<{25}}'
f' {'lineage':<{10}} {'quant':<{5}} {'size':<{10}} ') f' {'lineage':<{10}} {'quant':<{5}} {'size':<{10}} ')
# TODO fix it
print( print(
f' {'N':<{2}} {candidate['version']:<{10}} {candidate['package_type']:<{10}} {candidate['release_date']:<{25}}' f' {'N':<{2}} {info.version:<{10}} {info.package_type:<{10}} {info.release_date:<{25}}'
f' {candidate['lineage']:<{10}} {candidate['quantisation']:<{5}} {format_bytes(candidate['size_bytes']):<{10}} ') f' {info.lineage:<{10}} {info.quantization:<{5}} {format_bytes(info.size_bytes):<{10}} ')
self._add_package_to_collections_interactive(package) self._add_package_to_collections_interactive(package)

View File

@@ -4,7 +4,7 @@ from pathlib import Path
from modelspace.ModelPackage import ModelPackage from modelspace.ModelPackage import ModelPackage
from modelspace.ModelPackageSubRepository import ModelPackageSubRepository from modelspace.ModelPackageSubRepository import ModelPackageSubRepository
from pythonapp.Libs.ConfigDataClass import Config from modules.shared.ConfigDataClass import Config
@dataclass @dataclass

View File

@@ -7,7 +7,7 @@ import time
from typing import Optional from typing import Optional
from requests import Session from requests import Session
from pythonapp.Libs.ConfigDataClass import Config from modules.shared.ConfigDataClass import Config
@dataclass @dataclass
class ClientConfig(Config): class ClientConfig(Config):

View File

@@ -2,6 +2,7 @@ import datetime
import json import json
import time import time
import os import os
import warnings
from collections import defaultdict, Counter from collections import defaultdict, Counter
from typing import Dict, List, Any, Tuple, Union from typing import Dict, List, Any, Tuple, Union
from pathlib import Path from pathlib import Path
@@ -315,20 +316,52 @@ class Fetch:
return next_page, next_cursor return next_page, next_cursor
@classmethod @classmethod
def _cursor_crawler_avoid_slip(cls, client: Client, url, path, entity, slip_retries = 5, chill_time = 3): def _cursor_crawler_avoid_slip(cls, client: Client, url, path, entity, slip_retries = 5, get_retries = 50, chill_time = 3):
slip_counter = 0 slip_counter = 0
get_counter = 0
page = None
while True: while True:
page = client.make_get_request(url).json() try:
try: next_page, next_cursor = cls._cursor_crawler_parse_metadata(page) page = client.make_get_request(url)
except RuntimeError: return page if not page: raise ValueError
if slip_counter >= slip_retries: break page = page.json()
if next_page == url: if not page.get('items', None) or len(page.get('items', None)) == 0: raise ValueError
slip_counter = slip_counter + 1 try: next_page, next_cursor = cls._cursor_crawler_parse_metadata(page)
with open(Path(path) / 'slip.log', 'a') as file: file.write(f'{url}\n') except RuntimeError as e: return page
continue if next_page == url: raise TypeError
else: # raise ValueError
return page return page
except ValueError:
get_counter = get_counter + 1
with open(Path(path) / '_get_error.log', 'a') as file:
file.write(f'{url}\n')
if get_counter >= get_retries: return page
if entity == 'images':
print("Trying avoid images get error by decreasing cursor position by 1")
split = url.rsplit('=', maxsplit=1)
prefix = split[0] + '='
split = split[1].rsplit('%', maxsplit=1)
cursor = int(split[0])
cursor = cursor - 1
# suffix = '%' + split[1]
url = prefix + str(cursor) # + suffix
print('get error detected. waiting 30s for retry')
time.sleep(30)
except TypeError:
slip_counter = slip_counter + 1
with open(Path(path) / '_slip.log', 'a') as file:
file.write(f'{url}\n')
if slip_counter >= slip_retries: break
print('slip error detected. waiting 30s for retry')
time.sleep(30)
if entity not in {'models'}: raise RuntimeError("Slip detected! Avoiding failed: NotImplemented") if entity not in {'models'}: raise RuntimeError("Slip detected! Avoiding failed: NotImplemented")
split = url.rsplit('.', 1) split = url.rsplit('.', 1)
@@ -349,7 +382,7 @@ class Fetch:
@classmethod @classmethod
def _cursor_crawler(cls, client: Client, entity: str, params: dict, save = True): def _cursor_crawler(cls, client: Client, entity: str, params: dict, save = True):
print(f"Fetching {entity}...") print(f"{datetime.datetime.now()} Fetching {entity}...")
path = Path(client.path) / ('fetch_' + entity) path = Path(client.path) / ('fetch_' + entity)
items = list() items = list()
url = f'{client.config.base_url}/api/v1/{entity}{client.build_query_string(params)}' url = f'{client.config.base_url}/api/v1/{entity}{client.build_query_string(params)}'
@@ -366,14 +399,17 @@ class Fetch:
except RuntimeError: return items except RuntimeError: return items
while next_page: while next_page:
time.sleep(3) time.sleep(3)
# with open(Path(client.path) / 'bugs.log', 'a') as f:
# f.write(next_page + '\n')
page = cls._cursor_crawler_avoid_slip(client, next_page, path, entity) page = cls._cursor_crawler_avoid_slip(client, next_page, path, entity)
if not page: return items
page_items = page.get('items', None) page_items = page.get('items', None)
if not page_items: if page_items is None:
with open(Path(client.path)/'bugs.log', 'a') as f: f.write(next_page + '\n') with open(Path(client.path)/'bugs.log', 'a') as f: f.write(next_page + '\n')
return items return items
l = len(items) l = len(items)
items.extend(page_items) items.extend(page_items)
print(f"Fetched {len(items) - l}/{len(page_items)} {entity} from page {next_page}") print(f"{datetime.datetime.now()} Fetched {len(items) - l}/{len(page_items)} {entity} from page {next_page}")
if save: cls._save_json(path / f'page_{next_cursor}.json', page_items) if save: cls._save_json(path / f'page_{next_cursor}.json', page_items)
try: next_page, next_cursor = cls._cursor_crawler_parse_metadata(page) try: next_page, next_cursor = cls._cursor_crawler_parse_metadata(page)
@@ -408,15 +444,30 @@ class Fetch:
# for username in ['yonne']: # for username in ['yonne']:
time.sleep(3) time.sleep(3)
if not username: continue if not username: continue
page_items = cls._cursor_crawler(client, 'images', {'period': 'AllTime', 'sort': 'Oldest', 'nsfw':'X', 'username': username, 'limit': '200'}, save=False) page_items = cls._cursor_crawler(client, 'images', {
'period': 'AllTime', 'sort': 'Oldest', 'nsfw':'X', 'username': username, 'limit': '200', 'cursor': 0
}, save=False)
# page_items = cls._cursor_crawler(client, 'images', {
# 'period': 'AllTime', 'sort': 'Most%20Reactions', 'nsfw': 'X', 'username': username, 'limit': '200', 'cursor': 0
# }, save=False)
if len(page_items) >= 1000:
with open(path / '_1k.log', 'a') as f: f.write(username + '\n')
if len(page_items) >= 5000:
with open(path / '_5k.log', 'a') as f: f.write(username + '\n')
if len(page_items) >= 10000:
with open(path / '_10k.log', 'a') as f: f.write(username + '\n')
if len(page_items) >= 25000:
with open(path / '_25k.log', 'a') as f: f.write(username + '\n')
if len(page_items) >= 49000: if len(page_items) >= 49000:
with open(path / '_giants.log', 'a') as f: f.write(username + '\n') with open(path / '_giants_over_50k.log', 'a') as f: f.write(username + '\n')
print(f'Giant {username} has more then {len(page_items)} images, starting deep scan') print(f'Giant {username} has more then {len(page_items)} images, starting deep scan')
page_items_dict = dict() page_items_dict = dict()
for item in page_items: page_items_dict[item['id']] = item for item in page_items: page_items_dict[item['id']] = item
print(f'Transferred {len(page_items_dict)} images of {len(page_items)}') print(f'Transferred {len(page_items_dict)} images of {len(page_items)}')
for sort in ['Newest', 'Most Reactions', 'Most Comments', 'Most Collected']: for sort in ['Newest', 'Most%20Reactions', 'Most%20Comments', 'Most%20Collected', ]:
page_items = cls._cursor_crawler(client, 'images', page_items = cls._cursor_crawler(client, 'images',
{'period': 'AllTime', 'sort': sort, 'nsfw': 'X', {'period': 'AllTime', 'sort': sort, 'nsfw': 'X',
'username': username, 'limit': '200'}, save=False) 'username': username, 'limit': '200'}, save=False)

View File

@@ -0,0 +1,90 @@
from dataclasses import dataclass, fields
from typing import Optional, List
from DataClassJson import DataClassJson
from modules.shared.DatabaseAbstraction import Cursor
# Maps dataclass field annotations to SQLite column types (unknown types
# fall back to TEXT at the call site).
# NOTE(review): the name `types` shadows the stdlib `types` module within
# this file — confirm the stdlib module is never needed here.
types = {bool: 'INTEGER', int: 'INTEGER', float: 'REAL', str: "TEXT",
         Optional[bool]: 'INTEGER', Optional[int]: 'INTEGER', Optional[float]: 'REAL', Optional[str]: "TEXT", }
@dataclass
class DataClassDatabase(DataClassJson):
_main_entity: bool = None
_table_name: str = None
pass
def __post_init__(self):
super().__post_init__()
@classmethod
def get_create_sqls(cls, table_name = None):
result: list[str] = list()
result.append(f'CREATE TABLE IF NOT EXISTS {table_name} (fk TEXT NOT NULL, pk TEXT NOT NULL, PRIMARY KEY(pk, fk));')
tmp_instance = cls()
if not table_name: table_name = tmp_instance._table_name
excluded_fields = {f.name for f in fields(DataClassDatabase)}
all_fields = [f for f in fields(cls) if f.name not in excluded_fields and not f.name.startswith('_')]
for field in all_fields:
if field.name in tmp_instance._forwarding:
inner_type: type = tmp_instance._forwarding[field.name]
try: result.extend(inner_type.get_create_sqls())
except Exception as e: raise RuntimeError('invalid forwarding type') from e
elif field.type in { list, Optional[list], Optional[List] }:
result.append(f'CREATE TABLE IF NOT EXISTS {table_name}_{field.name} (fk TEXT NOT NULL, data TEXT NOT NULL);')
else:
result.append(f'ALTER TABLE {table_name} ADD COLUMN {field.name} {types.get(field.type, 'TEXT')};')
return result
@classmethod
def create(cls, cur: Cursor):
for sql in cls.get_create_sqls(): cur.execute(sql)
if __name__ == '__main__':
    # Smoke test: declare a small entity hierarchy and print the DDL it emits.
    @dataclass
    class ModelStats(DataClassDatabase):
        # Aggregated engagement counters for a model
        downloadCount: Optional[int] = None
        favoriteCount: Optional[int] = None
        thumbsUpCount: Optional[int] = None
        thumbsDownCount: Optional[int] = None
        commentCount: Optional[int] = None
        ratingCount: Optional[int] = None
        rating: Optional[int] = None
        def __post_init__(self):
            super().__post_init__()
            # No nested entities to forward
            self._forwarding = {}
    @dataclass
    class Model(DataClassDatabase):
        # Scalar attributes become table columns; `stats` is a nested entity
        id: Optional[int] = None
        name: Optional[str] = None
        description: Optional[str] = None
        allowNoCredit: Optional[bool] = None
        allowCommercialUse: Optional[list] = None
        allowDerivatives: Optional[bool] = None
        allowDifferentLicense: Optional[bool] = None
        type: Optional[str] = None
        minor: Optional[bool] = None
        sfwOnly: Optional[bool] = None
        poi: Optional[bool] = None
        nsfw: Optional[bool] = None
        nsfwLevel: Optional[int] = None
        availability: Optional[str] = None
        cosmetic: Optional[str] = None
        supportsGeneration: Optional[bool] = None
        stats: Optional[ModelStats] = None
        def __post_init__(self):
            super().__post_init__()
            self._forwarding = {
                'stats': ModelStats,
            }
            self._key_field = 'id'
            self._table_name = 'gagaga'
    # NOTE(review): ModelStats defines no _table_name, so its nested
    # CREATE TABLE statement targets a table literally named "None" —
    # confirm the intended sub-table naming.
    for s in Model.get_create_sqls():
        print(s)

View File

@@ -0,0 +1,185 @@
from dataclasses import dataclass, field, fields
from typing import Dict, Any, Optional
import warnings
# Base class that makes JSON-dict (de)serialization easy to inherit
@dataclass
class DataClassJson:
    """Base dataclass with dict round-tripping via from_dict()/to_dict().

    Subclasses override ``__post_init__`` to set ``_forwarding`` (field
    name -> nested DataClassJson type) and ``_key_field`` (which input key
    feeds the ``key`` property).
    """
    _forwarding: Dict[str, type] = field(default_factory=dict)
    _key_field: str = 'key'  # Field that will be used as the key
    fixed: bool = False
    # Hidden fields for storing data
    _key: Optional[str] = None
    other_data: Optional[Dict[str, Any]] = None
    # Example of a field that will be used via _key_field
    # Must be overridden in child classes
    # NOTE(review): this annotated field is shadowed by the `key` property
    # defined below, so the dataclass default for `key` becomes the
    # property object itself — confirm this interplay is intended.
    key: Optional[str] = None
    def __post_init__(self):
        # Re-apply a pre-set hidden key through the property setter
        if self._key is not None:
            self.key = self._key
    @property
    def key(self) -> Optional[str]:
        # Backed by the hidden `_key` field
        return self._key
    @key.setter
    def key(self, value: str):
        self._key = value
    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'DataClassJson':
        """Build an instance from a plain dict.

        Fields listed in ``_forwarding`` are recursively deserialized
        (dicts and lists of dicts); unknown or malformed entries are
        collected into ``other_data`` with a warning.

        Raises:
            ValueError: when ``fixed`` is True but unknown data landed in
                ``other_data``.
        """
        # Create the class instance
        instance = cls()
        instance.fixed = data.get('fixed', False)
        instance.other_data = None
        # Collect the declared public field names of the subclass
        excluded_fields = {f.name for f in fields(DataClassJson)}
        all_fields = {f.name for f in fields(cls) if f.name not in excluded_fields and not f.name.startswith('_')}
        # Process fields, giving _forwarding entries special handling
        handled_keys = set()
        field_values = {}
        for key, value in data.items():
            if key in handled_keys:
                continue
            if key in instance._forwarding:
                target_type = instance._forwarding[key]
                if isinstance(value, dict):
                    # Deserialize a nested dict into the forwarded type
                    sub_instance = target_type.from_dict(value)
                    field_values[key] = sub_instance
                    handled_keys.add(key)
                elif isinstance(value, list):
                    # Deserialize a list of dicts
                    results = []
                    for item in value:
                        if isinstance(item, dict):
                            sub_instance = target_type.from_dict(item)
                            results.append(sub_instance)
                        else:
                            # Non-dict element: record it in other_data
                            warnings.warn(f"Non-dict value {item} in list for field '{key}' will be added to 'other_data'")
                            if instance.other_data is None:
                                instance.other_data = {}
                            instance.other_data[key] = item  # Keep the original
                    field_values[key] = results
                    handled_keys.add(key)
                else:
                    # Neither dict nor list: also goes to other_data
                    warnings.warn(f"Non-dict/list value {value} for field '{key}' will be added to 'other_data'")
                    if instance.other_data is None:
                        instance.other_data = {}
                    instance.other_data[key] = value
            else:
                # Regular field
                if key in all_fields:
                    field_values[key] = value
                    handled_keys.add(key)
                else:
                    # Unknown field, add to other_data
                    warnings.warn(f"Unknown field '{key}', adding to 'other_data'")
                    if instance.other_data is None:
                        instance.other_data = {}
                    instance.other_data[key] = value
        # Fill in the regular fields
        for key, value in field_values.items():
            setattr(instance, key, value)
        # Set the key, if the configured key field is present
        if hasattr(instance, '_key_field') and instance._key_field in data:
            instance.key = data[instance._key_field]
        # Enforce the fixed flag against collected other_data
        if instance.fixed and instance.other_data is not None:
            raise ValueError("Cannot serialize with fixed=True and non-empty other_data")
        return instance
    def to_dict(self) -> Dict[str, Any]:
        """Serialize back to a plain dict.

        Forwarded fields are converted via their own ``to_dict``; falsy
        fields serialize as None (with a warning); ``other_data`` entries
        are merged back in at the end.
        """
        result = {}
        excluded_fields = {f.name for f in fields(DataClassJson)}
        field_names = [f.name for f in fields(self) if f.name not in excluded_fields and not f.name.startswith('_')]
        for field_name in field_names:
            if not hasattr(self, field_name):
                result[field_name] = None
                warnings.warn(f'object not have field {field_name}, something went wrong')
                continue
            value = getattr(self, field_name)
            if not value:
                # NOTE: every falsy value (0, "", [], False) flattens to None here
                result[field_name] = None
                warnings.warn(f'object not have data in field {field_name}, it may be correct situation')
                continue
            if field_name in self._forwarding:
                target_type = self._forwarding[field_name]
                result[field_name] = list()
                single = False
                if not isinstance(value, list):
                    single = True
                    value = [value]
                for v in value:
                    try:
                        v = v.to_dict()
                    except Exception as e:
                        # Best effort: keep the raw value if to_dict fails
                        warnings.warn(str(e))
                    finally:
                        result[field_name].append(v)
                if single: result[field_name] = result[field_name][0]
                continue
            else: result[field_name] = value
        # Merge other_data back in, if any
        if self.other_data and isinstance(self.other_data, dict):
            for key, value in self.other_data.items():
                if key not in result:
                    result[key] = value
                else:
                    # Key collision: merge both sides into one list
                    if not isinstance(result[key], list): result[key] = [result[key]]
                    if not isinstance(value, list): value = [value]
                    result[key].extend(value)
        return result
# Usage example:
@dataclass
class Person(DataClassJson):
    """Example DataClassJson subclass keyed by the person's name."""
    name: Optional[str] = None
    age: Optional[int] = None
    email: Optional[str] = None

    def __post_init__(self):
        super().__post_init__()
        # No nested dataclass fields to forward
        self._forwarding = dict()
        self._key_field = 'name'
@dataclass
class User(DataClassJson):
    """Example DataClassJson subclass with a forwarded `person` field."""
    id: Optional[list] = None
    username: Optional[str] = None
    person: Optional[Person] = None

    def __post_init__(self):
        super().__post_init__()
        # `person` payloads deserialize into Person instances
        self._forwarding = dict(person=Person)
        self._key_field = 'username'
# Deserialization example:
if __name__ == "__main__":
    payload = {
        "id": [1, 2, 3, 4, 5, 6],
        "username": "user1",
        "person": None,
        "extra_field": "should_be_in_other_data"
    }
    demo_user = User.from_dict(payload)
    round_trip = demo_user.to_dict()
    print(demo_user.to_dict())

View File

@@ -0,0 +1,52 @@
class Cursor:
    """Abstract database-cursor interface.

    Concrete backends (e.g. SQLiteCursor) override every method; this base
    class only fixes the call signatures.
    """
    def __init__(self, cursor):
        # `cursor` is the backend-native cursor object; the base does not store it
        pass
    def execute(self, sql: str, params: list = None) -> None:
        """Run a statement; no rows are returned."""
        pass
    def fetchone(self, sql: str, params: list = None) -> dict:
        """Run `sql` and return the first result row as a dict."""
        pass
    def fetchmany(self, sql: str = None, params: list = None) -> list[dict]:
        """Return the next batch of rows as dicts; `sql` is optional."""
        pass
    def fetchall(self, sql: str, params: list = None) -> list[dict]:
        """Run `sql` and return every result row as a dict."""
        pass
    def lastrowid(self):
        """Return the id of the last inserted row."""
        pass
class Database:
    """Abstract database handle; backends implement commit() and cursor()."""
    def __init__(self, name: str):
        # Logical database name (backends may derive a filename from it)
        self.name = name
        # Set to True once a backend establishes a live connection
        self.connected = False
    def commit(self):
        """Commit the current transaction (no-op in the abstract base)."""
        pass
    def cursor(self) -> Cursor:
        """Create and return a new Cursor (implemented by backends)."""
        pass
class DBContainer:
    """Holds the currently active Database and proxies common operations."""

    def __init__(self, db: Database):
        self.db: Database = db

    def switch_db(self, db: Database):
        """Swap in a new database, committing the outgoing one first."""
        self.db.commit()
        self.db = db

    @property
    def connected(self) -> bool:
        """Connection state of the active database."""
        return self.db.connected

    def commit(self):
        """Commit on the active database."""
        self.db.commit()

    def cursor(self) -> Cursor:
        """Obtain a cursor from the active database."""
        return self.db.cursor()

View File

@@ -0,0 +1,91 @@
from pathlib import Path
from .DatabaseAbstraction import Database, Cursor
import sqlite3 as sq
class SQLiteCursor(Cursor):
    """Cursor adapter that executes SQL and returns rows as plain dicts."""
    def __init__(self, cursor):
        super().__init__(cursor)
        # The underlying sqlite3 cursor
        self._cursor = cursor
    def _run(self, sql: str, params: list = None) -> None:
        # Single home for the execute-with-optional-params branching that
        # was previously duplicated in every public method.
        if params is None:
            self._cursor.execute(sql)
        else:
            self._cursor.execute(sql, params)
    def _rows_to_dicts(self, rows) -> list[dict]:
        # Map each row to {column_name: value} using the cursor description
        columns = [description[0] for description in self._cursor.description]
        return [dict(zip(columns, row)) for row in rows]
    def execute(self, sql: str, params: list = None) -> None:
        """Execute a SQL statement; result rows, if any, are not fetched."""
        self._run(sql, params)
    def fetchone(self, sql: str, params: list = None) -> dict:
        """Execute `sql` and return the first row as a dict, or None."""
        self._run(sql, params)
        row = self._cursor.fetchone()
        if row is None:
            return None
        return self._rows_to_dicts([row])[0]
    def fetchmany(self, sql: str = None, params: list = None) -> list[dict]:
        """Return the next batch of rows as dicts; optionally run `sql` first."""
        if sql is not None:
            self._run(sql, params)
        rows = self._cursor.fetchmany()
        if not rows:
            return []
        return self._rows_to_dicts(rows)
    def fetchall(self, sql: str, params: list = None) -> list[dict]:
        """Execute `sql` and return every result row as a dict."""
        self._run(sql, params)
        rows = self._cursor.fetchall()
        if not rows:
            return []
        return self._rows_to_dicts(rows)
    def lastrowid(self):
        """Return the rowid of the last inserted row."""
        return self._cursor.lastrowid
class SQLiteDatabase(Database):
    """SQLite-backed implementation of the abstract Database."""
    def __init__(self, name: str, path = '.'):
        super().__init__(name)
        # Database file lives at <path>/<name>.db
        self._connection: sq.Connection = sq.connect(Path(path) / (name + '.db'))
        # NOTE(review): Connection.autocommit is a real, writable setting only
        # on Python 3.12+; older interpreters reject this assignment — confirm
        # the minimum supported Python version.
        self._connection.autocommit = True
        self._connection.row_factory = sq.Row  # So rows can be read like mappings
        self.connected = True
    def commit(self):
        """Commit the current transaction."""
        if self.connected and self._connection:
            self._connection.commit()
    def cursor(self) -> Cursor:
        """Create and return a wrapped cursor."""
        return SQLiteCursor(self._connection.cursor())
    def close(self):
        """Close the database connection and mark it disconnected."""
        if self.connected and self._connection:
            self._connection.close()
            self.connected = False

View File

@@ -0,0 +1,23 @@
class ListsDict:
    """Mapping from keys to insertion-ordered lists of unique values."""

    def __init__(self):
        # key -> list of values; each list holds no duplicates
        self._data: dict[str, list] = dict()

    def add(self, key, value):
        """Append value under key, creating the list on first use; skips duplicates."""
        bucket = self._data.setdefault(key, list())
        if value not in bucket:
            bucket.append(value)

    def delete(self, key, value):
        """Remove value from key's list; no-op when the key is absent or empty."""
        bucket = self._data.get(key, None)
        if bucket:
            bucket.remove(value)

    @property
    def index(self):
        """All stored values, deduplicated, in first-seen order."""
        seen = list()
        for bucket in self._data.values():
            for item in bucket:
                if item not in seen:
                    seen.append(item)
        return seen

    def by_key(self, key):
        """Return the list stored under key, or None when absent."""
        return self._data.get(key, None)

View File

@@ -0,0 +1,25 @@
class SetsDict:
    """Mapping from keys to sets of values."""

    def __init__(self):
        # key -> set of values
        self._data: dict[str, set] = dict()

    def add(self, key, value):
        """Insert value into key's set, creating the set on first use."""
        # set.add is idempotent, so no membership pre-check is needed
        self._data.setdefault(key, set()).add(value)

    def delete(self, key, value):
        """Remove value from key's set; no-op when the key is absent or empty."""
        bucket = self._data.get(key, None)
        if bucket:
            bucket.remove(value)

    @property
    def index(self):
        """Union of every stored set."""
        merged = set()
        for bucket in self._data.values():
            merged |= bucket
        return merged

    def by_key(self, key):
        """Return the set stored under key, or None when absent."""
        return self._data.get(key, None)

    @property
    def keys(self):
        """View over all stored keys."""
        return self._data.keys()

View File

View File

@@ -0,0 +1,11 @@
def format_bytes(bytes_size):
    """Render a byte count as a human-readable string.

    Counts under 1024 are shown as whole bytes ("512 B"); larger counts
    are scaled by powers of 1024 with one decimal place ("1.5 KB"),
    topping out at petabytes.
    """
    if bytes_size < 1024:
        # Small counts stay integral, with no decimal point
        return f"{bytes_size} B"
    units = ['B', 'KB', 'MB', 'GB', 'TB']
    size = bytes_size
    step = 0
    while step < len(units):
        if size < 1024.0:
            return f"{size:.1f} {units[step]}"
        size /= 1024.0
        step += 1
    return f"{size:.1f} PB"

View File

@@ -0,0 +1,44 @@
def select_elements(lst, selection_string):
    """Pick elements of ``lst`` according to a selection string.

    Args:
        lst: source list.
        selection_string: space-separated tokens; each token is a 0-based
            index ("4"), an inclusive range in either direction ("4-6" or
            "6-4"), or "all" (selects the whole list).

    Returns:
        New list with the selected elements, ordered by ascending index.
        Out-of-range indices are silently ignored.

    Raises:
        ValueError: when a token is neither "all", an integer, nor a range.
    """
    tokens = selection_string.strip().split()
    if not tokens:
        return []
    # Generalization: "all" anywhere in the selection means the whole list
    # (previously only the exact string "all" was accepted; mixing it with
    # other tokens crashed on int('all')).
    if "all" in tokens:
        return lst.copy()
    selected_indices = set()
    for token in tokens:
        if '-' in token:
            # Inclusive range, accepted in either direction
            start, end = map(int, token.split('-'))
            lo, hi = (start, end) if start <= end else (end, start)
            selected_indices.update(range(lo, hi + 1))
        else:
            # Single index
            selected_indices.add(int(token))
    # Emit selected elements in ascending index order, clamped to bounds
    return [lst[idx] for idx in sorted(selected_indices) if 0 <= idx < len(lst)]

View File

@@ -1,6 +1,6 @@
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from pythonapp.Libs.ConfigDataClass import Config from modules.shared.ConfigDataClass import Config
class InstanceFileNaming: class InstanceFileNaming: