import datetime
import json
import time
import os
import warnings
from collections import defaultdict, Counter
from typing import Dict, List, Any, Tuple, Union
from pathlib import Path


from modules.civit.client import Client


class EntityAnalyzer:
    def __init__(self):
        self.field_analysis = {}

    def _get_json_files(self, directory_path: str) -> List[str]:
        """Returns a list of all JSON files in the directory"""
        json_files = []
        for filename in os.listdir(directory_path):
            if filename.endswith('.json'):
                json_files.append(os.path.join(directory_path, filename))
        return json_files
    def _load_json_data(self, file_path: str) -> List[Dict]:
        """Loads data from a JSON file"""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                if isinstance(data, list):
                    return data
                else:
                    return [data]
        except (json.JSONDecodeError, IOError) as e:
            print(f"Error reading file {file_path}: {e}")
            return []
    def _collect_all_entities(self, directory_path: str) -> List[Dict]:
        """Collects all instances from all JSON files"""
        all_entities = []
        json_files = self._get_json_files(directory_path)

        for file_path in json_files:
            entities = self._load_json_data(file_path)
            all_entities.extend(entities)

        return all_entities
    def _get_field_types(self, value: Any) -> str:
        """Determines the type of a value"""
        if isinstance(value, dict):
            return 'dict'
        elif isinstance(value, list):
            return 'list'
        elif isinstance(value, bool):
            return 'bool'
        elif isinstance(value, int):
            return 'int'
        elif isinstance(value, float):
            return 'float'
        elif isinstance(value, str):
            return 'str'
        else:
            return 'unknown'
    def _get_main_type(self, types: List[str]) -> str:
        """Determines the main type from a list of types"""
        if not types:
            return 'unknown'

        # A dict or list anywhere means a complex structure
        if 'dict' in types or 'list' in types:
            return 'complex'

        # Otherwise return the single type, or 'mixed' if several were seen
        unique_types = set(types)
        if len(unique_types) == 1:
            return types[0]
        else:
            return 'mixed'
    def _is_hashable(self, value: Any) -> bool:
        """Checks whether a value is hashable"""
        try:
            hash(value)
            return True
        except TypeError:
            return False
    def _serialize_value_for_counter(self, value: Any) -> Any:
        """Converts a value into a hashable form usable as a Counter key"""
        if self._is_hashable(value):
            return value
        else:
            # Non-hashable types fall back to their string representation
            return str(value)
    def _analyze_fields_recursive(self, entity: Dict, parent_path: str,
                                  field_types: Dict, field_presence: Dict,
                                  field_values: Dict, top_n: int):
        """Recursively analyzes the fields of an entity"""
        if not isinstance(entity, dict):
            return

        for key, value in entity.items():
            field_path = f"{parent_path}.{key}" if parent_path else key

            # Record the field type
            field_types[field_path].append(self._get_field_types(value))

            # Record the field presence
            field_presence[field_path].append(True)

            # Store the value for frequency counting (handling non-hashable types)
            if value is not None:
                serialized_value = self._serialize_value_for_counter(value)
                field_values[field_path].append(serialized_value)

            # Recursively analyze nested structures
            if isinstance(value, dict):
                self._analyze_fields_recursive(value, field_path, field_types,
                                               field_presence, field_values, top_n)
            elif isinstance(value, list):
                for item in value:
                    if isinstance(item, dict):
                        self._analyze_fields_recursive(item, field_path, field_types,
                                                       field_presence, field_values, top_n)
    def _analyze_entity_structure(self, entities: List[Dict], top_n: int) -> Dict[str, Any]:
        """Analyzes the structure of all entities"""
        if not entities:
            return {}

        # Collect all fields and their types
        field_types = defaultdict(list)
        field_presence = defaultdict(list)
        field_values = defaultdict(list)

        for entity in entities:
            self._analyze_fields_recursive(entity, "", field_types, field_presence,
                                           field_values, top_n)

        # Build the final analysis
        result = {}
        for field_path, types in field_types.items():
            # Determine the main type
            main_type = self._get_main_type(types)

            # Count how often the field is present
            presence_count = len(field_presence[field_path])
            total_count = len(entities)
            always_present = presence_count == total_count

            # Take the top N most frequent values
            top_values = []
            if field_path in field_values:
                try:
                    value_counter = Counter(field_values[field_path])
                    top_values = [item[0] for item in value_counter.most_common(top_n)]
                except Exception:
                    # On error, fall back to an empty list
                    top_values = []

            result[field_path] = {
                'type': main_type,
                'always_present': always_present,
                'top_values': top_values,
                'total_count': total_count,
                'presence_count': presence_count
            }

        return result
    def analyze_directory(self, directory_path: str, top_n: int = 10) -> Dict[str, Any]:
        """
        Main entry point for analyzing a directory

        Args:
            directory_path: Path to the directory with JSON files
            top_n: Number of most frequent values to keep for each field

        Returns:
            Dictionary describing the data structure
        """
        # Step 1: Collect all instances from the JSON files
        entities = self._collect_all_entities(directory_path)

        # Step 2: Analyze the entity structure
        self.field_analysis = self._analyze_entity_structure(entities, top_n)

        return self.field_analysis

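# Usage sketch for EntityAnalyzer (illustrative only; 'fetch_models' is an example
# directory name taken from the Fetch.entities mapping below, not a guaranteed layout):
#
#     analyzer = EntityAnalyzer()
#     schema = analyzer.analyze_directory('fetch_models', top_n=5)
#     for field, info in schema.items():
#         print(field, info['type'], info['always_present'], info['top_values'])
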
class Fetch:

    @staticmethod
    def load_json_dir(directory_path):
        """
        Takes a directory path, finds all JSON files in it,
        reads lists of dicts from them, and returns one list with all the dicts

        Args:
            directory_path (str): Path to the directory with JSON files

        Returns:
            list: List of all dicts from all JSON files
        """
        all_dicts = []
        files = os.listdir(directory_path)

        # Walk over all files in the directory
        for filename in files:
            if filename.endswith('.json'):
                file_path = os.path.join(directory_path, filename)

                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)

                        # The data is a list of dicts
                        if isinstance(data, list):
                            all_dicts.extend(data)
                        # The data is a single dict
                        elif isinstance(data, dict):
                            all_dicts.append(data)

                except (json.JSONDecodeError, IOError) as e:
                    print(f"Error reading file {filename}: {e}")
                    continue

        return all_dicts
    entities = {
        'creator': 'fetch_creators',
        'creators': 'fetch_creators',
        'tag': 'fetch_tags',
        'tags': 'fetch_tags',
        'model': 'fetch_models',
        'models': 'fetch_models',
        'image': 'fetch_images',
        'images': 'fetch_images',
    }

    @classmethod
    def load(cls, client: Client, entity_type: str):
        if entity_type in cls.entities: subdir = cls.entities[entity_type]
        else: raise ValueError(f'Civit doesn\'t have entity type {entity_type}')
        res = cls.load_json_dir(str(Path(client.path) / subdir))
        return res

    @classmethod
    def datamodel(cls, client: Client, subdir, top = None):
        if not top: top = 10
        path = Path(client.path) / subdir
        datamodel = EntityAnalyzer().analyze_directory(path, top_n=top)
        return datamodel
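    # Usage sketch (illustrative; it assumes a configured Client instance whose
    # `path` attribute points at the directory holding the fetch_* subdirectories):
    #
    #     models = Fetch.load(client, 'models')             # all dicts from fetch_models/
    #     schema = Fetch.datamodel(client, 'fetch_models')  # per-field type/frequency summary
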
    @classmethod
    def _save_json(cls, path, items):
        with open(path, 'w') as f:
            json.dump(items, f, indent=2, ensure_ascii=False)

    @classmethod
    def _paginated_crawler_parse_metadata(cls, page):
        metadata = page.get('metadata', None)
        if not metadata: raise RuntimeError("Unable to find metadata")
        total_pages = metadata.get('totalPages', None)
        current_page = metadata.get('currentPage', None)
        if not total_pages or not current_page: raise RuntimeError("Unable to parse metadata")
        print(f"Found! Total pages: {total_pages}")
        return total_pages, current_page
    @classmethod
    def _paginated_crawler(cls, client: Client, entity: str, save = True):
        items = list()
        print(f"Fetching {entity}...")
        path = Path(client.path) / ('fetch_' + entity)
        first_page = client.get_creators_tags_raw(entity)
        if first_page.get('items', None): items.extend(first_page.get('items', None))
        if save:
            path.mkdir(exist_ok=True)
            cls._save_json(path / 'first.json', items)
        total_pages, current_page = cls._paginated_crawler_parse_metadata(first_page)
        for i in range(2, total_pages + 1):
            print(f"Fetching page {i} of {total_pages}")
            page = client.get_creators_tags_raw(entity, page=i)
            time.sleep(3)
            page_items = page.get('items', None)
            if not page_items: continue
            items.extend(page_items)
            if save: cls._save_json(path / f'page_{i}.json', page_items)

        if save: cls._save_json(path / 'all.json', items)
        return items

    @classmethod
    def creators(cls, client: Client, subdir = 'fetch_creators', save = True):
        return cls._paginated_crawler(client, 'creators', save)

    @classmethod
    def tags(cls, client: Client, subdir='fetch_tags', save=True):
        return cls._paginated_crawler(client, 'tags', save)
    @classmethod
    def _cursor_crawler_parse_metadata(cls, page):
        metadata = page.get('metadata', None)
        if not metadata: raise RuntimeError("Unable to find metadata")
        next_page = metadata.get('nextPage', None)
        next_cursor = metadata.get('nextCursor', None)
        if not next_page or not next_cursor: raise RuntimeError("Unable to parse metadata")
        return next_page, next_cursor
    @classmethod
    def _cursor_crawler_avoid_slip(cls, client: Client, url, path, entity, slip_retries = 5, get_retries = 50, chill_time = 3):
        slip_counter = 0
        get_counter = 0
        page = None
        while True:
            try:
                page = client.make_get_request(url)
                if not page: raise ValueError
                page = page.json()
                if not page.get('items', None) or len(page.get('items', None)) == 0: raise ValueError
                try: next_page, next_cursor = cls._cursor_crawler_parse_metadata(page)
                except RuntimeError as e: return page
                if next_page == url: raise TypeError
                # raise ValueError
                return page

            except ValueError:
                get_counter = get_counter + 1
                with open(Path(path) / '_get_error.log', 'a') as file:
                    file.write(f'{url}\n')
                if get_counter >= get_retries: return page
                if entity == 'images':
                    print("Trying to avoid the images get error by decreasing the cursor position by 1")
                    split = url.rsplit('=', maxsplit=1)
                    prefix = split[0] + '='
                    split = split[1].rsplit('%', maxsplit=1)
                    cursor = int(split[0])
                    cursor = cursor - 1

                    # suffix = '%' + split[1]
                    url = prefix + str(cursor)  # + suffix

                print('get error detected. waiting 30s for retry')
                time.sleep(30)

            except TypeError:
                slip_counter = slip_counter + 1
                with open(Path(path) / '_slip.log', 'a') as file:
                    file.write(f'{url}\n')
                if slip_counter >= slip_retries: break
                print('slip error detected. waiting 30s for retry')
                time.sleep(30)

        if entity not in {'models'}: raise RuntimeError("Slip detected! Avoiding failed: NotImplemented")

        split = url.rsplit('.', 1)
        prefix = split[0] + '.'
        split = split[1].split('%', 1)
        suffix = '%' + split[1]
        num = int(split[0])
        if num < 999:
            num = num + 1
        else:
            raise RuntimeError("Slip avoiding failed: Number overflow")
        url = prefix + f'{num:03d}' + suffix
        page = client.make_get_request(url).json()
        next_page, next_cursor = cls._cursor_crawler_parse_metadata(page)
        if next_page != url: return page
        else: raise RuntimeError("Slip avoiding failed: Not effective")
    @classmethod
    def _cursor_crawler(cls, client: Client, entity: str, params: dict, save = True):
        print(f"{datetime.datetime.now()} Fetching {entity}...")
        path = Path(client.path) / ('fetch_' + entity)
        items = list()
        url = f'{client.config.base_url}/api/v1/{entity}{client.build_query_string(params)}'
        first_page = client.make_get_request(url)
        if not first_page:
            with open(Path(client.path) / 'bugs.log', 'a') as f: f.write(url + '\n')
            return items
        first_page = first_page.json()
        if first_page.get('items', None): items.extend(first_page.get('items', None))
        if save:
            path.mkdir(exist_ok=True)
            cls._save_json(path / 'first.json', items)
        try: next_page, next_cursor = cls._cursor_crawler_parse_metadata(first_page)
        except RuntimeError: return items
        while next_page:
            time.sleep(3)
            # with open(Path(client.path) / 'bugs.log', 'a') as f:
            #     f.write(next_page + '\n')
            page = cls._cursor_crawler_avoid_slip(client, next_page, path, entity)
            if not page: return items
            page_items = page.get('items', None)
            if page_items is None:
                with open(Path(client.path)/'bugs.log', 'a') as f: f.write(next_page + '\n')
                return items
            l = len(items)
            items.extend(page_items)
            print(f"{datetime.datetime.now()} Fetched {len(items) - l}/{len(page_items)} {entity} from page {next_page}")

            if save: cls._save_json(path / f'page_{next_cursor}.json', page_items)
            try: next_page, next_cursor = cls._cursor_crawler_parse_metadata(page)
            except RuntimeError: break

        if save: cls._save_json(path / 'all.json', items)
        return items
    @classmethod
    def models(cls, client: Client, subdir='fetch_models', save=True):
        return cls._cursor_crawler(client, 'models', {'period': 'AllTime', 'sort': 'Oldest', 'nsfw': 'true'}, save)
    @classmethod
    def images(cls, client: Client, subdir='fetch_images', save=True, start_with = None):
        items = list()
        if not start_with: start_with = 0
        path = Path(client.path) / ('fetch_' + 'images')
        if save: path.mkdir(exist_ok=True)
        creators = [c.get('username', None) for c in cls.load(client, 'creators')]
        counter = 1 + int(start_with)

        for username in creators[int(start_with):]:
            # for username in ['yonne']:
            time.sleep(3)
            if not username: continue
            page_items = cls._cursor_crawler(client, 'images', {
                'period': 'AllTime', 'sort': 'Oldest', 'nsfw': 'X', 'username': username, 'limit': '200', 'cursor': 0
            }, save=False)

            # page_items = cls._cursor_crawler(client, 'images', {
            #     'period': 'AllTime', 'sort': 'Most%20Reactions', 'nsfw': 'X', 'username': username, 'limit': '200', 'cursor': 0
            # }, save=False)

            if len(page_items) >= 1000:
                with open(path / '_1k.log', 'a') as f: f.write(username + '\n')
            if len(page_items) >= 5000:
                with open(path / '_5k.log', 'a') as f: f.write(username + '\n')
            if len(page_items) >= 10000:
                with open(path / '_10k.log', 'a') as f: f.write(username + '\n')
            if len(page_items) >= 25000:
                with open(path / '_25k.log', 'a') as f: f.write(username + '\n')

            if len(page_items) >= 49000:
                with open(path / '_giants_over_50k.log', 'a') as f: f.write(username + '\n')
                print(f'Giant {username} has more than {len(page_items)} images, starting deep scan')
                page_items_dict = dict()
                for item in page_items: page_items_dict[item['id']] = item
                print(f'Transferred {len(page_items_dict)} images of {len(page_items)}')
                for sort in ['Newest', 'Most%20Reactions', 'Most%20Comments', 'Most%20Collected', ]:
                    page_items = cls._cursor_crawler(client, 'images',
                                                     {'period': 'AllTime', 'sort': sort, 'nsfw': 'X',
                                                      'username': username, 'limit': '200'}, save=False)
                    l = len(page_items_dict)
                    for item in page_items: page_items_dict[item['id']] = item
                    print(f'Added {len(page_items_dict) - l} images by {sort} sort crawl. {len(page_items_dict)} images total')

                page_items = [value for key, value in page_items_dict.items()]

            l = len(items)
            #items.extend(page_items)
            print(f"Fetched {len(page_items)} images by {username} ({counter}/{len(creators)})")
            counter = counter + 1

            if save: cls._save_json(path / f'{username}.json', page_items)

        #if save: cls._save_json(path / 'aaa.json', items)
        return items
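

if __name__ == '__main__':
    # Minimal end-to-end usage sketch, guarded so it never runs on import.
    # How Client is constructed is not shown in this module, so the line below
    # is a placeholder assumption; adapt it to the real Client signature.
    client = Client()  # hypothetical no-argument construction

    Fetch.creators(client)   # paginated crawl    -> <client.path>/fetch_creators/
    Fetch.tags(client)       # paginated crawl    -> <client.path>/fetch_tags/
    Fetch.models(client)     # cursor crawl       -> <client.path>/fetch_models/
    Fetch.images(client)     # per-creator crawl  -> <client.path>/fetch_images/

    # Summarize the field structure of the fetched models
    print(json.dumps(Fetch.datamodel(client, 'fetch_models', top=5), indent=2, ensure_ascii=False))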