mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-10 15:58:15 +00:00
- #1520: Preserve trailing slashes in URL normalization (RFC 3986 compliance) - #1489: Preserve query parameter key casing in normalize_url - #1374: Close NamedTemporaryFile handle before reopening (Windows fix) - #1424: Fix CosineStrategy returning empty results (delimiter fallback + at_least_k >= 1) - #1183: Fix extract_xml_data regex matching tag names in prose text - #1354: Make import_knowledge_base async (fix asyncio.run in running loop) - #880: Fix 404 sample_ecommerce.html gist URL in docs (6 occurrences) - #1031: Make Docker playground code editor resizable with overflow-auto - #1251: Add DEFAULT_CONFIG with deep-merge in load_config to prevent KeyError crashes - #1758: Change screenshot stitching format from BMP to PNG
232 lines
8.0 KiB
Python
232 lines
8.0 KiB
Python
"""
Knowledge Base Export and Import

This example demonstrates how to export crawled knowledge bases and
import them for reuse, sharing, or analysis.
"""
|
|
|
|
import asyncio
|
|
import json
|
|
from pathlib import Path
|
|
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
|
|
|
|
|
async def build_knowledge_base():
    """Crawl two httpbin.org entry points and export the collected pages.

    Two ``digest`` calls are made on the same ``AdaptiveCrawler`` instance,
    the result is exported to a JSONL file, and progress plus basic
    statistics about the exported file are printed along the way.

    Returns:
        The path of the exported knowledge-base file.
    """
    banner = "=" * 60
    print(banner)
    print("PHASE 1: Building Knowledge Base")
    print(banner)

    async with AsyncWebCrawler(verbose=False) as crawler:
        adaptive = AdaptiveCrawler(crawler)

        # First digest: HTTP protocol topics.
        print("\n1. Gathering HTTP protocol information...")
        await adaptive.digest(
            start_url="https://httpbin.org",
            query="http methods headers status codes",
        )
        print(f" - Pages crawled: {len(adaptive.state.crawled_urls)}")
        print(f" - Confidence: {adaptive.confidence:.2%}")

        # Second digest on the same crawler: API-related topics.
        print("\n2. Adding API documentation knowledge...")
        await adaptive.digest(
            start_url="https://httpbin.org/anything",
            query="rest api json response request",
        )
        print(f" - Total pages: {len(adaptive.state.crawled_urls)}")
        print(f" - Confidence: {adaptive.confidence:.2%}")

        # Write everything gathered so far to disk.
        kb_file = "web_tech_knowledge.jsonl"
        print(f"\n3. Exporting knowledge base to {kb_file}")
        adaptive.export_knowledge_base(kb_file)

        # Export statistics: size on disk and one document per line.
        size_kb = Path(kb_file).stat().st_size / 1024
        line_count = 0
        with open(kb_file, 'r') as handle:
            for _ in handle:
                line_count += 1

        print(f" - Exported {line_count} documents")
        print(f" - File size: {size_kb:.1f} KB")

        return kb_file
|
|
|
|
|
|
async def analyze_knowledge_base(kb_path):
    """Print summary statistics for an exported JSONL knowledge base.

    Every non-blank line of the file is parsed as one JSON document. The
    function reports the document count, the structure of the first
    document, content-length totals and a per-domain page count.

    Args:
        kb_path: Path of the JSONL file to analyze.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    print("\n" + "="*60)
    print("PHASE 2: Analyzing Exported Knowledge Base")
    print("="*60)

    # Read the JSONL file. Blank lines (for example a stray trailing newline)
    # are skipped instead of crashing json.loads.
    with open(kb_path, 'r', encoding='utf-8') as f:
        documents = [json.loads(line) for line in f if line.strip()]

    print(f"\nKnowledge base contains {len(documents)} documents:")

    # Analyze document properties
    total_content_length = 0
    urls_by_domain = {}

    for position, doc in enumerate(documents):
        # Content analysis; "or ''" also covers an explicit null value.
        content_length = len(doc.get('content') or '')
        total_content_length += content_length

        # URL analysis: the third '/'-separated piece of "scheme://host/..."
        # is the host. Guard the index so a malformed URL such as "http"
        # is counted as 'unknown' rather than raising IndexError.
        url = doc.get('url') or ''
        url_parts = url.split('/')
        if url.startswith('http') and len(url_parts) > 2:
            domain = url_parts[2]
        else:
            domain = 'unknown'
        urls_by_domain[domain] = urls_by_domain.get(domain, 0) + 1

        # Show the first document as a sample. Using the loop position
        # (instead of documents.index) avoids a linear scan per document and
        # avoids re-printing the sample for later duplicates of document 0.
        if position == 0:
            print("\nSample document structure:")
            print(f" - URL: {url}")
            print(f" - Content length: {content_length} chars")
            print(f" - Has metadata: {'metadata' in doc}")
            print(f" - Has links: {len(doc.get('links') or [])} links")
            print(f" - Query: {doc.get('query', 'N/A')}")

    # An empty knowledge base has no meaningful average; report 0 rather
    # than dividing by zero.
    average_length = total_content_length / len(documents) if documents else 0

    print("\nContent statistics:")
    print(f" - Total content: {total_content_length:,} characters")
    print(f" - Average per document: {average_length:,.0f} chars")

    print("\nDomain distribution:")
    for domain, count in urls_by_domain.items():
        print(f" - {domain}: {count} pages")
|
|
|
|
|
|
async def import_and_continue():
    """Load a previously exported knowledge base and crawl further from it.

    A fresh ``AdaptiveCrawler`` imports ``web_tech_knowledge.jsonl``, prints
    its statistics, runs one more ``digest`` with a new query, prints the
    statistics again and exports the result to a second JSONL file.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("PHASE 3: Importing and Extending Knowledge Base")
    print(divider)

    source_file = "web_tech_knowledge.jsonl"
    target_file = "web_tech_knowledge_extended.jsonl"

    async with AsyncWebCrawler(verbose=False) as crawler:
        # Start from a brand-new adaptive crawler and load the saved file.
        resumed = AdaptiveCrawler(crawler)

        print(f"\n1. Importing knowledge base from {source_file}")
        await resumed.import_knowledge_base(source_file)

        imported_docs = len(resumed.state.knowledge_base)
        print(f" - Imported {imported_docs} documents")
        known_urls = len(resumed.state.crawled_urls)
        print(f" - Existing URLs: {known_urls}")

        # Statistics right after the import.
        print("\n2. Checking imported knowledge state:")
        resumed.print_stats(detailed=False)

        # One more digest with a different query.
        print("\n3. Extending knowledge with new query...")
        await resumed.digest(
            start_url="https://httpbin.org/status/200",
            query="error handling retry timeout",
        )

        print("\n4. Final knowledge base state:")
        resumed.print_stats(detailed=False)

        # Save the extended knowledge base under a separate name.
        resumed.export_knowledge_base(target_file)
        print(f"\n5. Extended knowledge base exported to {target_file}")
|
|
|
|
|
|
async def share_knowledge_bases():
    """Build two separate knowledge bases, then merge them into a third.

    "Project A" and "Project B" each run one ``digest`` and export to their
    own JSONL file. A third ``AdaptiveCrawler`` imports both files in turn,
    prints how many documents each import contributed, exports the combined
    result and prints its statistics.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("PHASE 4: Sharing Knowledge Between Projects")
    print(divider)

    # File names standing in for two different projects, plus the merge target.
    file_a = "project_a_knowledge.jsonl"
    file_b = "project_b_knowledge.jsonl"
    merged_file = "merged_knowledge.jsonl"

    async with AsyncWebCrawler(verbose=False) as crawler:
        # Project A: security documentation
        print("\n1. Project A: Building security knowledge...")
        security = AdaptiveCrawler(crawler)
        await security.digest(
            start_url="https://httpbin.org/basic-auth/user/pass",
            query="authentication security headers",
        )
        security.export_knowledge_base(file_a)
        print(f" - Exported {len(security.state.knowledge_base)} documents")

        # Project B: API testing
        print("\n2. Project B: Building testing knowledge...")
        testing = AdaptiveCrawler(crawler)
        await testing.digest(
            start_url="https://httpbin.org/anything",
            query="testing endpoints mocking",
        )
        testing.export_knowledge_base(file_b)
        print(f" - Exported {len(testing.state.knowledge_base)} documents")

        # Merge: import A, record the size, import B, record the size again.
        print("\n3. Merging knowledge bases...")
        combined = AdaptiveCrawler(crawler)

        await combined.import_knowledge_base(file_a)
        size_after_a = len(combined.state.knowledge_base)

        await combined.import_knowledge_base(file_b)
        size_after_b = len(combined.state.knowledge_base)

        added_by_b = size_after_b - size_after_a
        print(f" - Project A documents: {size_after_a}")
        print(f" - Additional from Project B: {added_by_b}")
        print(f" - Total merged documents: {size_after_b}")

        # Export merged knowledge
        combined.export_knowledge_base(merged_file)
        print(f"\n4. Merged knowledge base exported to {merged_file}")

        # Show combined coverage
        print("\n5. Combined knowledge coverage:")
        combined.print_stats(detailed=False)
|
|
|
|
|
|
async def main():
    """Run the four example phases in order, then delete the generated files.

    Cleanup happens in a ``finally`` clause, so the JSONL files are removed
    even when one of the phases raises.
    """
    # Every file the phases may write; removed again once the run ends.
    generated_files = (
        "web_tech_knowledge.jsonl",
        "web_tech_knowledge_extended.jsonl",
        "project_a_knowledge.jsonl",
        "project_b_knowledge.jsonl",
        "merged_knowledge.jsonl",
    )
    divider = "=" * 60

    try:
        # Phase 1: build and export the initial knowledge base.
        exported = await build_knowledge_base()

        # Phase 2: inspect the exported file.
        await analyze_knowledge_base(exported)

        # Phase 3: import it again and extend it.
        await import_and_continue()

        # Phase 4: share and merge between projects.
        await share_knowledge_bases()

        print("\n" + divider)
        print("All examples completed successfully!")
        print(divider)
    finally:
        print("\nCleaning up generated files...")
        for name in generated_files:
            # missing_ok: a phase may have failed before writing its file.
            Path(name).unlink(missing_ok=True)
        print("Cleanup complete.")
|
|
|
|
|
|
# Script entry point: run all example phases only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())