Harden image fetch algorithm

Prepare for database integration
This commit is contained in:
2025-09-24 20:07:19 +07:00
parent 88d40c0d99
commit 3a88bdad3a
17 changed files with 600 additions and 26 deletions

View File

@@ -5,7 +5,7 @@ from dataclasses import dataclass, fields
from pathlib import Path
from typing import List
from pythonapp.Libs.ConfigDataClass import Config
from modules.shared.ConfigDataClass import Config
@dataclass
@@ -61,7 +61,7 @@ class ModelPackage:
field_value = getattr(package_info, field_name)
if field_value is not None and field_value != "" and field_name != "filename":
current_value = getattr(self.info, field_name)
if current_value is None or current_value == "" or current_value == 0 or len(current_value) == 0:
if current_value is None or current_value == "" or current_value == 0 or len(str(current_value)) == 0:
setattr(self.info, field_name, field_value)
# Генерируем UUID если он не определен

View File

@@ -1,7 +1,7 @@
from dataclasses import dataclass
from pathlib import Path
from pythonapp.Libs.ConfigDataClass import Config
from modules.shared.ConfigDataClass import Config
@dataclass

View File

@@ -301,7 +301,7 @@ class ModelPackageSubRepository:
internal = True
break
pulled = list()
pulled: list[ModelPackage] = list()
for candidate in to_pull:
package_path = self.path / candidate['uuid']
if os.path.exists(str(Path(package_path) / "package.json")): raise RuntimeError("package exists!")
@@ -357,13 +357,15 @@ class ModelPackageSubRepository:
pulled.append(package)
for package in pulled:
info = package.info
print('Collections for package:')
print(f' {'N':<{2}} {'version':<{10}} {'type':<{10}} {'release_date':<{25}}'
f' {'lineage':<{10}} {'quant':<{5}} {'size':<{10}} ')
# TODO fix it
print(
f' {'N':<{2}} {candidate['version']:<{10}} {candidate['package_type']:<{10}} {candidate['release_date']:<{25}}'
f' {candidate['lineage']:<{10}} {candidate['quantisation']:<{5}} {format_bytes(candidate['size_bytes']):<{10}} ')
f' {'N':<{2}} {info.version:<{10}} {info.package_type:<{10}} {info.release_date:<{25}}'
f' {info.lineage:<{10}} {info.quantization:<{5}} {format_bytes(info.size_bytes):<{10}} ')
self._add_package_to_collections_interactive(package)

View File

@@ -4,7 +4,7 @@ from pathlib import Path
from modelspace.ModelPackage import ModelPackage
from modelspace.ModelPackageSubRepository import ModelPackageSubRepository
from pythonapp.Libs.ConfigDataClass import Config
from modules.shared.ConfigDataClass import Config
@dataclass

View File

@@ -7,7 +7,7 @@ import time
from typing import Optional
from requests import Session
from pythonapp.Libs.ConfigDataClass import Config
from modules.shared.ConfigDataClass import Config
@dataclass
class ClientConfig(Config):

View File

@@ -2,6 +2,7 @@ import datetime
import json
import time
import os
import warnings
from collections import defaultdict, Counter
from typing import Dict, List, Any, Tuple, Union
from pathlib import Path
@@ -315,20 +316,52 @@ class Fetch:
return next_page, next_cursor
@classmethod
def _cursor_crawler_avoid_slip(cls, client: Client, url, path, entity, slip_retries = 5, chill_time = 3):
def _cursor_crawler_avoid_slip(cls, client: Client, url, path, entity, slip_retries = 5, get_retries = 50, chill_time = 3):
slip_counter = 0
get_counter = 0
page = None
while True:
page = client.make_get_request(url).json()
try: next_page, next_cursor = cls._cursor_crawler_parse_metadata(page)
except RuntimeError: return page
if slip_counter >= slip_retries: break
if next_page == url:
slip_counter = slip_counter + 1
with open(Path(path) / 'slip.log', 'a') as file: file.write(f'{url}\n')
continue
else:
try:
page = client.make_get_request(url)
if not page: raise ValueError
page = page.json()
if not page.get('items', None) or len(page.get('items', None)) == 0: raise ValueError
try: next_page, next_cursor = cls._cursor_crawler_parse_metadata(page)
except RuntimeError as e: return page
if next_page == url: raise TypeError
# raise ValueError
return page
except ValueError:
get_counter = get_counter + 1
with open(Path(path) / '_get_error.log', 'a') as file:
file.write(f'{url}\n')
if get_counter >= get_retries: return page
if entity == 'images':
print("Trying avoid images get error by decreasing cursor position by 1")
split = url.rsplit('=', maxsplit=1)
prefix = split[0] + '='
split = split[1].rsplit('%', maxsplit=1)
cursor = int(split[0])
cursor = cursor - 1
# suffix = '%' + split[1]
url = prefix + str(cursor) # + suffix
print('get error detected. waiting 30s for retry')
time.sleep(30)
except TypeError:
slip_counter = slip_counter + 1
with open(Path(path) / '_slip.log', 'a') as file:
file.write(f'{url}\n')
if slip_counter >= slip_retries: break
print('slip error detected. waiting 30s for retry')
time.sleep(30)
if entity not in {'models'}: raise RuntimeError("Slip detected! Avoiding failed: NotImplemented")
split = url.rsplit('.', 1)
@@ -349,7 +382,7 @@ class Fetch:
@classmethod
def _cursor_crawler(cls, client: Client, entity: str, params: dict, save = True):
print(f"Fetching {entity}...")
print(f"{datetime.datetime.now()} Fetching {entity}...")
path = Path(client.path) / ('fetch_' + entity)
items = list()
url = f'{client.config.base_url}/api/v1/{entity}{client.build_query_string(params)}'
@@ -366,14 +399,17 @@ class Fetch:
except RuntimeError: return items
while next_page:
time.sleep(3)
# with open(Path(client.path) / 'bugs.log', 'a') as f:
# f.write(next_page + '\n')
page = cls._cursor_crawler_avoid_slip(client, next_page, path, entity)
if not page: return items
page_items = page.get('items', None)
if not page_items:
if page_items is None:
with open(Path(client.path)/'bugs.log', 'a') as f: f.write(next_page + '\n')
return items
l = len(items)
items.extend(page_items)
print(f"Fetched {len(items) - l}/{len(page_items)} {entity} from page {next_page}")
print(f"{datetime.datetime.now()} Fetched {len(items) - l}/{len(page_items)} {entity} from page {next_page}")
if save: cls._save_json(path / f'page_{next_cursor}.json', page_items)
try: next_page, next_cursor = cls._cursor_crawler_parse_metadata(page)
@@ -408,15 +444,30 @@ class Fetch:
# for username in ['yonne']:
time.sleep(3)
if not username: continue
page_items = cls._cursor_crawler(client, 'images', {'period': 'AllTime', 'sort': 'Oldest', 'nsfw':'X', 'username': username, 'limit': '200'}, save=False)
page_items = cls._cursor_crawler(client, 'images', {
'period': 'AllTime', 'sort': 'Oldest', 'nsfw':'X', 'username': username, 'limit': '200', 'cursor': 0
}, save=False)
# page_items = cls._cursor_crawler(client, 'images', {
# 'period': 'AllTime', 'sort': 'Most%20Reactions', 'nsfw': 'X', 'username': username, 'limit': '200', 'cursor': 0
# }, save=False)
if len(page_items) >= 1000:
with open(path / '_1k.log', 'a') as f: f.write(username + '\n')
if len(page_items) >= 5000:
with open(path / '_5k.log', 'a') as f: f.write(username + '\n')
if len(page_items) >= 10000:
with open(path / '_10k.log', 'a') as f: f.write(username + '\n')
if len(page_items) >= 25000:
with open(path / '_25k.log', 'a') as f: f.write(username + '\n')
if len(page_items) >= 49000:
with open(path / '_giants.log', 'a') as f: f.write(username + '\n')
with open(path / '_giants_over_50k.log', 'a') as f: f.write(username + '\n')
print(f'Giant {username} has more then {len(page_items)} images, starting deep scan')
page_items_dict = dict()
for item in page_items: page_items_dict[item['id']] = item
print(f'Transferred {len(page_items_dict)} images of {len(page_items)}')
for sort in ['Newest', 'Most Reactions', 'Most Comments', 'Most Collected']:
for sort in ['Newest', 'Most%20Reactions', 'Most%20Comments', 'Most%20Collected', ]:
page_items = cls._cursor_crawler(client, 'images',
{'period': 'AllTime', 'sort': sort, 'nsfw': 'X',
'username': username, 'limit': '200'}, save=False)

View File

@@ -0,0 +1,90 @@
from dataclasses import dataclass, fields
from typing import Optional, List
from DataClassJson import DataClassJson
from modules.shared.DatabaseAbstraction import Cursor
# Python annotation -> SQLite column affinity, used when generating DDL.
# NOTE(review): the name shadows the stdlib `types` module within this file.
types = {bool: 'INTEGER', int: 'INTEGER', float: 'REAL', str: "TEXT",
         Optional[bool]: 'INTEGER', Optional[int]: 'INTEGER', Optional[float]: 'REAL', Optional[str]: "TEXT", }
@dataclass
class DataClassDatabase(DataClassJson):
    """DataClassJson extension that can emit SQLite DDL for its fields.

    Subclasses set ``_table_name`` in ``__post_init__``; entries in
    ``_forwarding`` recurse into nested DataClassDatabase types.
    """
    # Persistence metadata; leading underscore keeps these out of the
    # generated columns (see get_create_sqls).
    _main_entity: Optional[bool] = None
    _table_name: Optional[str] = None

    def __post_init__(self):
        super().__post_init__()

    @classmethod
    def get_create_sqls(cls, table_name=None):
        """Build the CREATE TABLE / ALTER TABLE statements for this class.

        Args:
            table_name: Override for the target table; defaults to the
                class's own ``_table_name``.

        Returns:
            A list of SQL statements: one base CREATE TABLE, one side
            table per list-typed field, one ADD COLUMN per scalar field,
            plus the statements of every forwarded (nested) type.

        Raises:
            RuntimeError: if a ``_forwarding`` target does not provide
                ``get_create_sqls``.
        """
        tmp_instance = cls()
        # Resolve the table name BEFORE emitting any DDL; previously the
        # base CREATE TABLE was built while table_name was still None,
        # yielding "CREATE TABLE IF NOT EXISTS None (...)".
        if not table_name:
            table_name = tmp_instance._table_name
        result: list[str] = [
            f'CREATE TABLE IF NOT EXISTS {table_name} (fk TEXT NOT NULL, pk TEXT NOT NULL, PRIMARY KEY(pk, fk));'
        ]
        excluded_fields = {f.name for f in fields(DataClassDatabase)}
        # `fld`, not `field`: avoid shadowing dataclasses.field.
        all_fields = [f for f in fields(cls) if f.name not in excluded_fields and not f.name.startswith('_')]
        for fld in all_fields:
            if fld.name in tmp_instance._forwarding:
                # Nested dataclass: recurse into its own DDL.
                inner_type: type = tmp_instance._forwarding[fld.name]
                try:
                    result.extend(inner_type.get_create_sqls())
                except Exception as e:
                    raise RuntimeError('invalid forwarding type') from e
            elif fld.type in {list, Optional[list], Optional[List]}:
                # List fields live in a separate fk -> data side table.
                result.append(f'CREATE TABLE IF NOT EXISTS {table_name}_{fld.name} (fk TEXT NOT NULL, data TEXT NOT NULL);')
            else:
                # Scalar field: add a column with the mapped affinity.
                # Hoisted out of the f-string — nested same-quote
                # f-strings require Python 3.12+.
                column_type = types.get(fld.type, 'TEXT')
                result.append(f'ALTER TABLE {table_name} ADD COLUMN {fld.name} {column_type};')
        return result

    @classmethod
    def create(cls, cur: Cursor):
        """Execute every generated DDL statement on *cur*."""
        for sql in cls.get_create_sqls():
            cur.execute(sql)
if __name__ == '__main__':
    # Smoke test: define a nested pair of persistable dataclasses and
    # print the DDL they generate.
    @dataclass
    class ModelStats(DataClassDatabase):
        # Aggregate counters as delivered by the upstream API payload.
        downloadCount: Optional[int] = None
        favoriteCount: Optional[int] = None
        thumbsUpCount: Optional[int] = None
        thumbsDownCount: Optional[int] = None
        commentCount: Optional[int] = None
        ratingCount: Optional[int] = None
        rating: Optional[int] = None

        def __post_init__(self):
            super().__post_init__()
            self._forwarding = {}

    @dataclass
    class Model(DataClassDatabase):
        id: Optional[int] = None
        name: Optional[str] = None
        description: Optional[str] = None
        allowNoCredit: Optional[bool] = None
        allowCommercialUse: Optional[list] = None
        allowDerivatives: Optional[bool] = None
        allowDifferentLicense: Optional[bool] = None
        type: Optional[str] = None
        minor: Optional[bool] = None
        sfwOnly: Optional[bool] = None
        poi: Optional[bool] = None
        nsfw: Optional[bool] = None
        nsfwLevel: Optional[int] = None
        availability: Optional[str] = None
        cosmetic: Optional[str] = None
        supportsGeneration: Optional[bool] = None
        stats: Optional[ModelStats] = None

        def __post_init__(self):
            super().__post_init__()
            # 'stats' recurses into ModelStats when generating DDL.
            self._forwarding = {
                'stats': ModelStats,
            }
            self._key_field = 'id'
            # NOTE(review): placeholder table name — replace before real use.
            self._table_name = 'gagaga'

    for s in Model.get_create_sqls():
        print(s)

View File

@@ -0,0 +1,185 @@
from dataclasses import dataclass, field, fields
from typing import Dict, Any, Optional
import warnings
# Define a base class for convenient inheritance (dict <-> dataclass mapping)
@dataclass
class DataClassJson:
    """Base dataclass providing dict (de)serialization with nesting support.

    Subclasses populate ``_forwarding`` (field name -> nested
    DataClassJson type) and ``_key_field`` (which incoming key also
    serves as this object's key) in ``__post_init__``. Unknown input
    keys are collected into ``other_data`` instead of being dropped.
    """
    # Field name -> nested type, consulted by from_dict/to_dict.
    _forwarding: Dict[str, type] = field(default_factory=dict)
    _key_field: str = 'key'  # The field that will be used as the key
    # When True, from_dict rejects input that left anything in other_data.
    fixed: bool = False
    # Hidden fields for storing the data
    _key: Optional[str] = None
    other_data: Optional[Dict[str, Any]] = None
    # Example of a field that goes through _forwarding;
    # should be overridden in child classes.
    # NOTE(review): the `key` property below is defined after this
    # annotation, so the dataclass default for `key` becomes the property
    # object itself — confirm this interplay is intended.
    key: Optional[str] = None

    def __post_init__(self):
        # Re-apply the key through the property so _key stays in sync.
        if self._key is not None:
            self.key = self._key

    @property
    def key(self) -> Optional[str]:
        """The object's key (backed by the hidden ``_key`` field)."""
        return self._key

    @key.setter
    def key(self, value: str):
        self._key = value

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'DataClassJson':
        """Build an instance from *data*, recursing via ``_forwarding``.

        Unknown keys and non-dict forwarded values end up in
        ``other_data`` (each emits a warning). Raises ValueError when
        ``fixed`` is set and ``other_data`` is non-empty.
        """
        # Create a class instance
        instance = cls()
        instance.fixed = data.get('fixed', False)
        instance.other_data = None
        # Collect the names of all declared public fields
        excluded_fields = {f.name for f in fields(DataClassJson)}
        all_fields = {f.name for f in fields(cls) if f.name not in excluded_fields and not f.name.startswith('_')}
        # Process the fields listed in _forwarding
        handled_keys = set()
        field_values = {}
        for key, value in data.items():
            if key in handled_keys:
                continue
            if key in instance._forwarding:
                target_type = instance._forwarding[key]
                if isinstance(value, dict):
                    # A single nested dict: recurse into the target type
                    sub_instance = target_type.from_dict(value)
                    field_values[key] = sub_instance
                    handled_keys.add(key)
                elif isinstance(value, list):
                    # A list of dicts: recurse element by element
                    results = []
                    for item in value:
                        if isinstance(item, dict):
                            sub_instance = target_type.from_dict(item)
                            results.append(sub_instance)
                        else:
                            # Non-dict list element: record it in other_data
                            warnings.warn(f"Non-dict value {item} in list for field '{key}' will be added to 'other_data'")
                            if instance.other_data is None:
                                instance.other_data = {}
                            instance.other_data[key] = item  # Keep the original value
                    field_values[key] = results
                    handled_keys.add(key)
                else:
                    # Neither dict nor list: also goes to other_data
                    warnings.warn(f"Non-dict/list value {value} for field '{key}' will be added to 'other_data'")
                    if instance.other_data is None:
                        instance.other_data = {}
                    instance.other_data[key] = value
            else:
                # Plain (non-forwarded) field
                if key in all_fields:
                    field_values[key] = value
                    handled_keys.add(key)
                else:
                    # Unknown field: stash it in other_data
                    warnings.warn(f"Unknown field '{key}', adding to 'other_data'")
                    if instance.other_data is None:
                        instance.other_data = {}
                    instance.other_data[key] = value
        # Assign the collected plain fields
        for key, value in field_values.items():
            setattr(instance, key, value)
        # Set the key, if present in the input
        if hasattr(instance, '_key_field') and instance._key_field in data:
            instance.key = data[instance._key_field]
        # Enforce the fixed flag against any leftover other_data
        if instance.fixed and instance.other_data is not None:
            raise ValueError("Cannot serialize with fixed=True and non-empty other_data")
        return instance

    def to_dict(self) -> Dict[str, Any]:
        """Serialize public fields (plus ``other_data``) back to a dict.

        Forwarded fields are serialized via their own ``to_dict``.
        NOTE(review): the ``if not value`` test below nulls out every
        falsy value (0, False, "", []), not only missing ones — confirm
        that is intended.
        """
        result = {}
        excluded_fields = {f.name for f in fields(DataClassJson)}
        field_names = [f.name for f in fields(self) if f.name not in excluded_fields and not f.name.startswith('_')]
        for field_name in field_names:
            if not hasattr(self, field_name):
                result[field_name] = None
                warnings.warn(f'object not have field {field_name}, something went wrong')
                continue
            value = getattr(self, field_name)
            if not value:
                result[field_name] = None
                warnings.warn(f'object not have data in field {field_name}, it may be correct situation')
                continue
            if field_name in self._forwarding:
                target_type = self._forwarding[field_name]
                result[field_name] = list()
                # Normalize to a list so single values and lists share one path
                single = False
                if not isinstance(value, list):
                    single = True
                    value = [value]
                for v in value:
                    try:
                        v = v.to_dict()
                    except Exception as e:
                        # On failure the raw value is appended unchanged
                        warnings.warn(str(e))
                    finally:
                        result[field_name].append(v)
                if single: result[field_name] = result[field_name][0]
                continue
            else: result[field_name] = value
        # Merge other_data back in, if any
        if self.other_data and isinstance(self.other_data, dict):
            for key, value in self.other_data.items():
                if key not in result:
                    result[key] = value
                else:
                    # Key collision: combine both sides into one list
                    if not isinstance(result[key], list): result[key] = [result[key]]
                    if not isinstance(value, list): value = [value]
                    result[key].extend(value)
        return result
# Usage example:
@dataclass
class Person(DataClassJson):
    """Example leaf payload: flat fields only, keyed by 'name'."""
    name: Optional[str] = None
    age: Optional[int] = None
    email: Optional[str] = None

    def __post_init__(self):
        super().__post_init__()
        self._key_field = 'name'
        self._forwarding = dict()
@dataclass
class User(DataClassJson):
    """Example composite payload: forwards 'person' to Person, keyed by 'username'."""
    id: Optional[list] = None
    username: Optional[str] = None
    person: Optional[Person] = None

    def __post_init__(self):
        super().__post_init__()
        self._key_field = 'username'
        self._forwarding = dict(person=Person)
# Deserialization example:
if __name__ == "__main__":
    # 'extra_field' is not declared on User, so from_dict stores it in
    # other_data and to_dict merges it back into the output.
    data = {
        "id": [1,2,3,4,5,6],
        "username": "user1",
        "person": None,
        "extra_field": "should_be_in_other_data"
    }
    user = User.from_dict(data)
    data2 = user.to_dict()
    print(user.to_dict())

View File

@@ -0,0 +1,52 @@
class Cursor:
    """Abstract cursor interface; concrete backends override every method.

    All methods are intentional no-ops returning None in this base.
    """

    def __init__(self, cursor):
        """Wrap a backend-specific cursor object."""

    def execute(self, sql: str, params: list = None) -> None:
        """Run a statement without returning rows."""

    def fetchone(self, sql: str, params: list = None) -> dict:
        """Run *sql* and return a single row as a dict."""

    def fetchmany(self, sql: str = None, params: list = None) -> list[dict]:
        """Return the next batch of rows as dicts, optionally running *sql* first."""

    def fetchall(self, sql: str, params: list = None) -> list[dict]:
        """Run *sql* and return every row as a dict."""

    def lastrowid(self):
        """Return the id of the most recently inserted row."""
class Database:
    """Abstract database handle; subclasses manage a real connection."""

    def __init__(self, name: str):
        """Remember the database name; a fresh handle starts disconnected."""
        self.name = name
        self.connected = False

    def commit(self):
        """Flush pending changes (no-op in the abstract base)."""

    def cursor(self) -> Cursor:
        """Produce a Cursor bound to this database (None in the base)."""
class DBContainer:
    """Mutable holder for the active Database; delegates all operations.

    Lets long-lived code keep one handle while the underlying database
    is swapped at runtime via switch_db().
    """
    def __init__(self, db: Database):
        self.db: Database = db
    def switch_db(self, db: Database):
        # Flush pending work on the outgoing database before replacing it.
        self.db.commit()
        self.db: Database = db
    @property
    def connected(self) -> bool:
        # Reflects the wrapped database's connection state.
        return self.db.connected
    def commit(self):
        self.db.commit()
    def cursor(self) -> Cursor:
        return self.db.cursor()

View File

@@ -0,0 +1,91 @@
from pathlib import Path
from .DatabaseAbstraction import Database, Cursor
import sqlite3 as sq
class SQLiteCursor(Cursor):
    """Cursor adapter that maps sqlite3 result rows to plain dicts."""

    def __init__(self, cursor):
        super().__init__(cursor)
        self._cursor = cursor

    def _run(self, sql, params):
        # Single point for parameter-aware execution; previously this
        # if/else was duplicated in four methods.
        if params is None:
            self._cursor.execute(sql)
        else:
            self._cursor.execute(sql, params)

    def _as_dict(self, row):
        # Pair column names from the cursor description with row values.
        columns = [description[0] for description in self._cursor.description]
        return dict(zip(columns, row))

    def execute(self, sql: str, params: list = None) -> None:
        """Execute an SQL statement without fetching results."""
        self._run(sql, params)

    def fetchone(self, sql: str, params: list = None) -> dict:
        """Execute *sql* and return the first row as a dict, or None."""
        self._run(sql, params)
        row = self._cursor.fetchone()
        if row is None:
            return None
        return self._as_dict(row)

    def fetchmany(self, sql: str = None, params: list = None) -> list[dict]:
        """Fetch the next batch of rows as dicts; optionally run *sql* first."""
        if sql is not None:
            self._run(sql, params)
        rows = self._cursor.fetchmany()
        return [self._as_dict(row) for row in rows]

    def fetchall(self, sql: str, params: list = None) -> list[dict]:
        """Execute *sql* and return every row as a dict."""
        self._run(sql, params)
        return [self._as_dict(row) for row in self._cursor.fetchall()]

    def lastrowid(self):
        """Return the rowid of the last inserted row."""
        return self._cursor.lastrowid
class SQLiteDatabase(Database):
    """SQLite-backed implementation of the Database abstraction."""

    def __init__(self, name: str, path='.'):
        """Open (or create) ``<path>/<name>.db`` in autocommit mode."""
        super().__init__(name)
        # isolation_level=None selects sqlite3's autocommit mode on every
        # supported Python version; the ``Connection.autocommit`` attribute
        # assigned here previously only exists on Python 3.12+ and raises
        # AttributeError before that.
        self._connection: sq.Connection = sq.connect(
            Path(path) / (name + '.db'), isolation_level=None)
        self._connection.row_factory = sq.Row  # rows usable as mappings
        self.connected = True

    def commit(self):
        """Commit the current transaction (harmless while autocommitting)."""
        if self.connected and self._connection:
            self._connection.commit()

    def cursor(self) -> Cursor:
        """Create and return a dict-producing cursor."""
        return SQLiteCursor(self._connection.cursor())

    def close(self):
        """Close the connection and mark the database as disconnected."""
        if self.connected and self._connection:
            self._connection.close()
            self.connected = False

View File

@@ -0,0 +1,23 @@
class ListsDict:
    """Dict of lists: each key maps to an ordered list of unique values."""

    def __init__(self):
        self._data: dict[str, list] = dict()

    def add(self, key, value):
        """Append *value* under *key*, skipping duplicates (by equality)."""
        bucket = self._data.setdefault(key, list())
        if value not in bucket:
            bucket.append(value)

    def delete(self, key, value):
        """Drop *value* from *key*'s list when that list exists and is non-empty."""
        bucket = self._data.get(key, None)
        if bucket:
            bucket.remove(value)

    @property
    def index(self):
        """All distinct values across every key, in first-seen order."""
        seen = list()
        for bucket in self._data.values():
            for element in bucket:
                if element not in seen:
                    seen.append(element)
        return seen

    def by_key(self, key):
        """Return the list stored under *key*, or None if absent."""
        return self._data.get(key, None)

View File

@@ -0,0 +1,25 @@
class SetsDict:
    """Dict of sets: maps each key to the set of values registered under it."""

    def __init__(self):
        self._data: dict[str, set] = dict()

    def add(self, key, value):
        """Register *value* under *key*.

        set.add is idempotent, so the previous explicit membership check
        was redundant and has been dropped.
        """
        self._data.setdefault(key, set()).add(value)

    def delete(self, key, value):
        """Remove *value* from *key*'s set if that set exists and is non-empty.

        Raises KeyError when the set exists but lacks *value*
        (mirrors set.remove, preserving the original behavior).
        """
        if self._data.get(key, None): self._data[key].remove(value)

    @property
    def index(self):
        """Union of every value across all keys."""
        res = set()
        for collection in self._data.values():
            res |= collection
        return res

    def by_key(self, key):
        """Return the set stored under *key*, or None if absent."""
        return self._data.get(key, None)

    @property
    def keys(self):
        """View of all registered keys."""
        return self._data.keys()

View File

View File

@@ -0,0 +1,11 @@
def format_bytes(bytes_size):
    """Convert a byte count into a human-readable string.

    Values under 1024 are returned unscaled as "N B"; larger values are
    scaled to one decimal place with the appropriate unit, up to PB.
    """
    if bytes_size < 1024:
        return f"{bytes_size} B"
    # Already >= 1 KiB: scale down one unit at a time.
    value = bytes_size / 1024.0
    for unit in ('KB', 'MB', 'GB'):
        if value < 1024.0:
            return f"{value:.1f} {unit}"
        value /= 1024.0
    if value < 1024.0:
        return f"{value:.1f} TB"
    return f"{value / 1024.0:.1f} PB"

View File

@@ -0,0 +1,44 @@
def select_elements(lst, selection_string):
    """Select elements from *lst* according to a selection string.

    Args:
        lst: Source list.
        selection_string: Whitespace-separated tokens: single indices
            ("1 2"), inclusive ranges in either direction ("4-6" or
            "6-4"), or the literal "all" for the whole list.

    Returns:
        A new list with the selected elements in ascending index order;
        indices outside the list are silently ignored.

    Raises:
        ValueError: if a token is not an integer or a valid range.
    """
    selection_string = selection_string.strip()
    if not selection_string:
        return []
    if selection_string == "all":
        return lst.copy()
    selected_indices = set()
    for part in selection_string.split():
        if '-' in part:
            start, end = map(int, part.split('-'))
            # Direction does not matter: the same set of indices is
            # produced either way (the old descending branch was dead).
            low, high = min(start, end), max(start, end)
            selected_indices.update(range(low, high + 1))
        else:
            selected_indices.add(int(part))
    # Keep only valid positions, in ascending index order.
    return [lst[idx] for idx in sorted(selected_indices) if 0 <= idx < len(lst)]

View File

@@ -1,6 +1,6 @@
from dataclasses import dataclass
from pathlib import Path
from pythonapp.Libs.ConfigDataClass import Config
from modules.shared.ConfigDataClass import Config
class InstanceFileNaming: