140 lines
4.8 KiB
Python
Executable File
140 lines
4.8 KiB
Python
Executable File
"""
|
|
Test script for DvachParser.
|
|
|
|
This script tests the parser on the to_parse.html file.
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
|
|
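# Add the 'Program' directory located next to this script to the import path.
# NOTE(review): the imports below use the package name 'Programm' - confirm
# which spelling matches the directory on disk.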
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'Program'))
|
|
|
|
from Programm.Parsers.Dvach.Parser import Parser
|
|
|
|
|
|
def _print_banner(title, leading_blank=True):
    """Print *title* framed by two 80-character '=' rules.

    Args:
        title: Text printed between the two rules.
        leading_blank: When true, the first rule is preceded by a newline.
    """
    rule = "=" * 80
    print("\n" + rule if leading_blank else rule)
    print(title)
    print(rule)


def _print_thread_overview(thread):
    """Print the headline attributes of a parsed thread object."""
    print(" ✓ Thread parsed successfully")
    print(f" - Thread ID: {thread.id}")
    print(f" - Title: {thread.title}")
    print(f" - Board ID: {thread.board_id}")
    print(f" - Messages: {len(thread.messages)}")
    print(f" - View count: {thread.view_count}")
    print(f" - Reply count: {thread.reply_count}")


def _print_image(index, image):
    """Print the metadata of one image attached to a message."""
    print(f"\n Image #{index}:")
    print(f" - Name: {image.name}")
    print(f" - Type: {image.type} (6=webm, 9=webp, 10=mp4)")
    print(f" - Size: {image.size}KB")
    print(f" - Dimensions: {image.width}x{image.height}")
    print(f" - Duration: {image.duration or 'N/A'}")
    print(f" - URL: {image.url}")


def _print_message(index, message):
    """Print one message: its fields, a text preview, and its images."""
    print(f"\n Message #{index}:")
    print(f" - ID: {message.id}")
    print(f" - Author: {message.author}")
    print(f" - Timestamp: {message.timestamp}")
    print(f" - Content length: {len(message.content)} chars")
    print(f" - Text length: {len(message.text_content)} chars")
    print(f" - Images: {len(message.images)}")
    print(f" - Reply links: {len(message.reply_links)}")
    print(f" - Ref links: {len(message.ref_links)}")

    # Show at most the first 200 characters of the text, marking truncation.
    if len(message.text_content) > 200:
        text_preview = message.text_content[:200] + "..."
    else:
        text_preview = message.text_content
    print(f" - Text preview: {text_preview}")

    for image_index, image in enumerate(message.images, 1):
        _print_image(image_index, image)


def test_parser():
    """Run a manual smoke test of the Dvach parser and print a report.

    Steps, in order:
      1. Read ``to_parse.html`` from the current working directory; if it
         does not exist, print an error and stop.
      2. Parse it with ``parse_thread``; if the result is falsy, print an
         error and stop.
      3. Print every message and image of the parsed thread and a summary.
      4. Create a ``Parser``, call ``download_all_images`` on the thread and
         report which images ended up with ``data``.
      5. Call ``Parser.thread_from_url`` on a fixed URL (per the printed
         note, this is expected to make an HTTP request) and report the
         outcome.

    Returns:
        None. All results are written to standard output.
    """
    _print_banner("TESTING DVACH PARSER", leading_blank=False)

    # Read HTML file
    html_file = "to_parse.html"
    if not os.path.exists(html_file):
        print(f"ERROR: File {html_file} not found!")
        return

    print(f"\n1. Reading HTML file: {html_file}")
    with open(html_file, 'r', encoding='utf-8') as f:
        html_content = f.read()

    # len(html_content) would count decoded characters, which is smaller
    # than the byte size for non-ASCII UTF-8 text; ask the filesystem for
    # the real size so the "bytes" label is accurate.
    print(f" File size: {os.path.getsize(html_file)} bytes")

    # Parse thread
    print("\n2. Parsing thread...")
    # Imported here, as in the original script, so a missing input file is
    # reported before the parser package is needed.
    from Programm.Parsers.Dvach.parse_thread import parse_thread
    thread = parse_thread(html_content)

    if not thread:
        print(" ERROR: Failed to parse thread!")
        return

    _print_thread_overview(thread)

    # Parse each message
    print("\n3. Parsing messages:")
    for message_index, message in enumerate(thread.messages, 1):
        _print_message(message_index, message)

    # Summary
    _print_banner("SUMMARY")
    print(f"✓ Total messages parsed: {len(thread.messages)}")
    print(f"✓ Total images parsed: {sum(len(msg.images) for msg in thread.messages)}")
    print(f"✓ Total reply links: {sum(len(msg.reply_links) for msg in thread.messages)}")
    print(f"✓ Total ref links: {sum(len(msg.ref_links) for msg in thread.messages)}")

    # Test image downloading
    _print_banner("TESTING IMAGE DOWNLOADING")

    # Create parser instance with session
    print("\n1. Creating parser instance with session...")
    parser = Parser()
    print(" ✓ Parser created")

    # Download all images
    print("\n2. Downloading all images from thread...")
    downloaded_count = parser.download_all_images(thread)

    print("\n3. Image download summary:")
    print(f" - Successfully downloaded: {downloaded_count}")

    # Verify downloaded images: an image counts as downloaded when its
    # ``data`` attribute is truthy.
    print("\n4. Verifying downloaded images:")
    for message_index, message in enumerate(thread.messages, 1):
        for image_index, image in enumerate(message.images, 1):
            if image.data:
                print(f" ✓ Message #{message_index}, Image #{image_index}: {image.name} ({len(image.data)} bytes)")
            else:
                print(f" ✗ Message #{message_index}, Image #{image_index}: {image.name} - NOT DOWNLOADED")

    # Test thread_from_url method
    _print_banner("TESTING thread_from_url METHOD")

    # Create a test URL
    test_url = "https://2ch.life/ai/res/1484052.html"
    print(f"\nTesting URL: {test_url}")
    print("Note: This will make an HTTP request. If offline, it will fail.")

    thread_from_url = parser.thread_from_url(test_url)

    if thread_from_url:
        print("✓ Thread fetched from URL successfully")
        print(f" - ID: {thread_from_url.id}")
        print(f" - Title: {thread_from_url.title}")
        print(f" - Messages: {len(thread_from_url.messages)}")
    else:
        print("✗ Failed to fetch thread from URL (expected if offline)")

    _print_banner("TEST COMPLETED")
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the smoke test only when this file is executed as a script,
    # not when it is imported.
    test_parser()