# Source listing: forum-scrapper/test_parser.py
# (repository viewer metadata: 140 lines, 4.8 KiB, Python, executable file)

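"""
Test script for DvachParser.

This script parses the local to_parse.html file and prints a report of the
thread, its messages and their attachments. It then exercises image
downloading and the parser's thread_from_url method; the URL check makes a
live HTTP request.
"""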
import sys
import os
# Add Program directory to path
# NOTE(review): this inserts "<script dir>/Program" into sys.path, while the
# imports in this file use the package name "Programm" (double "m") — confirm
# which spelling matches the directory on disk.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'Program'))
from Programm.Parsers.Dvach.Parser import Parser
_RULE = "=" * 80


def _print_banner(title, *, leading_blank=True):
    """Print *title* framed by two 80-character ``=`` rules.

    When *leading_blank* is true the top rule is preceded by an empty line;
    every section except the very first one is separated that way.
    """
    print(("\n" if leading_blank else "") + _RULE)
    print(title)
    print(_RULE)


def _load_thread(html_file):
    """Read *html_file* and hand its text to ``parse_thread``.

    Returns the parsed thread object, or ``None`` after printing an error
    when the file does not exist or ``parse_thread`` returns a falsy value.
    """
    if not os.path.exists(html_file):
        print(f"ERROR: File {html_file} not found!")
        return None

    print(f"\n1. Reading HTML file: {html_file}")
    with open(html_file, 'r', encoding='utf-8') as f:
        html_content = f.read()
    # Report the on-disk size: len() of the decoded text counts characters,
    # which differs from the byte count for non-ASCII UTF-8 content.
    print(f" File size: {os.path.getsize(html_file)} bytes")

    print("\n2. Parsing thread...")
    # Imported lazily so that a missing input file is reported before the
    # project package has to be importable.
    from Programm.Parsers.Dvach.parse_thread import parse_thread
    thread = parse_thread(html_content)
    if not thread:
        print(" ERROR: Failed to parse thread!")
        return None
    return thread


def _report_message(index, message):
    """Print the fields of one parsed message and of each attached image."""
    print(f"\n Message #{index}:")
    print(f" - ID: {message.id}")
    print(f" - Author: {message.author}")
    print(f" - Timestamp: {message.timestamp}")
    print(f" - Content length: {len(message.content)} chars")
    print(f" - Text length: {len(message.text_content)} chars")
    print(f" - Images: {len(message.images)}")
    print(f" - Reply links: {len(message.reply_links)}")
    print(f" - Ref links: {len(message.ref_links)}")

    # Show at most the first 200 characters of the text content.
    text = message.text_content
    text_preview = (text[:200] + "...") if len(text) > 200 else text
    print(f" - Text preview: {text_preview}")

    for j, image in enumerate(message.images, 1):
        print(f"\n Image #{j}:")
        print(f" - Name: {image.name}")
        print(f" - Type: {image.type} (6=webm, 9=webp, 10=mp4)")
        print(f" - Size: {image.size}KB")
        print(f" - Dimensions: {image.width}x{image.height}")
        print(f" - Duration: {image.duration or 'N/A'}")
        print(f" - URL: {image.url}")


def _report_thread(thread):
    """Print the thread header, every message, and the summary totals."""
    print(" ✓ Thread parsed successfully")
    print(f" - Thread ID: {thread.id}")
    print(f" - Title: {thread.title}")
    print(f" - Board ID: {thread.board_id}")
    print(f" - Messages: {len(thread.messages)}")
    print(f" - View count: {thread.view_count}")
    print(f" - Reply count: {thread.reply_count}")

    print("\n3. Parsing messages:")
    for i, message in enumerate(thread.messages, 1):
        _report_message(i, message)

    _print_banner("SUMMARY")
    print(f"✓ Total messages parsed: {len(thread.messages)}")
    print(f"✓ Total images parsed: {sum(len(msg.images) for msg in thread.messages)}")
    print(f"✓ Total reply links: {sum(len(msg.reply_links) for msg in thread.messages)}")
    print(f"✓ Total ref links: {sum(len(msg.ref_links) for msg in thread.messages)}")


def _check_image_download(parser, thread):
    """Call ``parser.download_all_images`` and list which images carry data."""
    print("\n2. Downloading all images from thread...")
    downloaded_count = parser.download_all_images(thread)
    print("\n3. Image download summary:")
    print(f" - Successfully downloaded: {downloaded_count}")

    # An image is reported as downloaded when its ``data`` attribute is truthy.
    print("\n4. Verifying downloaded images:")
    for i, message in enumerate(thread.messages, 1):
        for j, image in enumerate(message.images, 1):
            if image.data:
                print(f" ✓ Message #{i}, Image #{j}: {image.name} ({len(image.data)} bytes)")
            else:
                print(f" ✗ Message #{i}, Image #{j}: {image.name} - NOT DOWNLOADED")


def _check_thread_from_url(parser, test_url):
    """Call ``parser.thread_from_url`` on *test_url* and print the outcome."""
    print(f"\nTesting URL: {test_url}")
    print("Note: This will make an HTTP request. If offline, it will fail.")
    fetched = parser.thread_from_url(test_url)
    if fetched:
        print("✓ Thread fetched from URL successfully")
        print(f" - ID: {fetched.id}")
        print(f" - Title: {fetched.title}")
        print(f" - Messages: {len(fetched.messages)}")
    else:
        print("✗ Failed to fetch thread from URL (expected if offline)")


def test_parser(html_file="to_parse.html",
                test_url="https://2ch.life/ai/res/1484052.html"):
    """Run the manual end-to-end check of the Dvach parser.

    The check parses *html_file*, prints a report of the thread and its
    messages, downloads the thread's images through a ``Parser`` instance,
    and finally fetches *test_url* with ``Parser.thread_from_url``.

    Args:
        html_file: Path of the saved thread HTML to parse. A relative path
            is resolved against the current working directory.
        test_url: Thread URL passed to ``Parser.thread_from_url``.

    Returns:
        None. All results are printed. The function returns early, after
        printing an error, when the file is missing or cannot be parsed.
    """
    _print_banner("TESTING DVACH PARSER", leading_blank=False)

    thread = _load_thread(html_file)
    if thread is None:
        return

    _report_thread(thread)

    _print_banner("TESTING IMAGE DOWNLOADING")
    print("\n1. Creating parser instance with session...")
    parser = Parser()
    print(" ✓ Parser created")
    _check_image_download(parser, thread)

    _print_banner("TESTING thread_from_url METHOD")
    _check_thread_from_url(parser, test_url)

    _print_banner("TEST COMPLETED")
# Run the manual parser check only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    test_parser()