Files
crawl4ai/docs/examples/adaptive_crawling/export_import_kb.py
unclecode 3a75dd3f4c fix: batch fix for 10 open issues (#1520, #1489, #1374, #1424, #1183, #1354, #880, #1031, #1251, #1758)
- #1520: Preserve trailing slashes in URL normalization (RFC 3986 compliance)
- #1489: Preserve query parameter key casing in normalize_url
- #1374: Close NamedTemporaryFile handle before reopening (Windows fix)
- #1424: Fix CosineStrategy returning empty results (delimiter fallback + at_least_k >= 1)
- #1183: Fix extract_xml_data regex matching tag names in prose text
- #1354: Make import_knowledge_base async (fix asyncio.run in running loop)
- #880: Fix 404 sample_ecommerce.html gist URL in docs (6 occurrences)
- #1031: Make Docker playground code editor resizable with overflow-auto
- #1251: Add DEFAULT_CONFIG with deep-merge in load_config to prevent KeyError crashes
- #1758: Change screenshot stitching format from BMP to PNG
2026-03-07 09:47:38 +00:00

232 lines
8.0 KiB
Python

"""
Knowledge Base Export and Import
This example demonstrates how to export crawled knowledge bases and
import them for reuse, sharing, or analysis.
"""
import asyncio
import json
from pathlib import Path
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
async def build_knowledge_base():
"""Build a knowledge base about web technologies"""
print("="*60)
print("PHASE 1: Building Knowledge Base")
print("="*60)
async with AsyncWebCrawler(verbose=False) as crawler:
adaptive = AdaptiveCrawler(crawler)
# Crawl information about HTTP
print("\n1. Gathering HTTP protocol information...")
await adaptive.digest(
start_url="https://httpbin.org",
query="http methods headers status codes"
)
print(f" - Pages crawled: {len(adaptive.state.crawled_urls)}")
print(f" - Confidence: {adaptive.confidence:.2%}")
# Add more information about APIs
print("\n2. Adding API documentation knowledge...")
await adaptive.digest(
start_url="https://httpbin.org/anything",
query="rest api json response request"
)
print(f" - Total pages: {len(adaptive.state.crawled_urls)}")
print(f" - Confidence: {adaptive.confidence:.2%}")
# Export the knowledge base
export_path = "web_tech_knowledge.jsonl"
print(f"\n3. Exporting knowledge base to {export_path}")
adaptive.export_knowledge_base(export_path)
# Show export statistics
export_size = Path(export_path).stat().st_size / 1024
with open(export_path, 'r') as f:
line_count = sum(1 for _ in f)
print(f" - Exported {line_count} documents")
print(f" - File size: {export_size:.1f} KB")
return export_path
async def analyze_knowledge_base(kb_path):
"""Analyze the exported knowledge base"""
print("\n" + "="*60)
print("PHASE 2: Analyzing Exported Knowledge Base")
print("="*60)
# Read and analyze JSONL
documents = []
with open(kb_path, 'r') as f:
for line in f:
documents.append(json.loads(line))
print(f"\nKnowledge base contains {len(documents)} documents:")
# Analyze document properties
total_content_length = 0
urls_by_domain = {}
for doc in documents:
# Content analysis
content_length = len(doc.get('content', ''))
total_content_length += content_length
# URL analysis
url = doc.get('url', '')
domain = url.split('/')[2] if url.startswith('http') else 'unknown'
urls_by_domain[domain] = urls_by_domain.get(domain, 0) + 1
# Show sample document
if documents.index(doc) == 0:
print(f"\nSample document structure:")
print(f" - URL: {url}")
print(f" - Content length: {content_length} chars")
print(f" - Has metadata: {'metadata' in doc}")
print(f" - Has links: {len(doc.get('links', []))} links")
print(f" - Query: {doc.get('query', 'N/A')}")
print(f"\nContent statistics:")
print(f" - Total content: {total_content_length:,} characters")
print(f" - Average per document: {total_content_length/len(documents):,.0f} chars")
print(f"\nDomain distribution:")
for domain, count in urls_by_domain.items():
print(f" - {domain}: {count} pages")
async def import_and_continue():
"""Import a knowledge base and continue crawling"""
print("\n" + "="*60)
print("PHASE 3: Importing and Extending Knowledge Base")
print("="*60)
kb_path = "web_tech_knowledge.jsonl"
async with AsyncWebCrawler(verbose=False) as crawler:
# Create new adaptive crawler
adaptive = AdaptiveCrawler(crawler)
# Import existing knowledge base
print(f"\n1. Importing knowledge base from {kb_path}")
await adaptive.import_knowledge_base(kb_path)
print(f" - Imported {len(adaptive.state.knowledge_base)} documents")
print(f" - Existing URLs: {len(adaptive.state.crawled_urls)}")
# Check current state
print("\n2. Checking imported knowledge state:")
adaptive.print_stats(detailed=False)
# Continue crawling with new query
print("\n3. Extending knowledge with new query...")
await adaptive.digest(
start_url="https://httpbin.org/status/200",
query="error handling retry timeout"
)
print("\n4. Final knowledge base state:")
adaptive.print_stats(detailed=False)
# Export extended knowledge base
extended_path = "web_tech_knowledge_extended.jsonl"
adaptive.export_knowledge_base(extended_path)
print(f"\n5. Extended knowledge base exported to {extended_path}")
async def share_knowledge_bases():
"""Demonstrate sharing knowledge bases between projects"""
print("\n" + "="*60)
print("PHASE 4: Sharing Knowledge Between Projects")
print("="*60)
# Simulate two different projects
project_a_kb = "project_a_knowledge.jsonl"
project_b_kb = "project_b_knowledge.jsonl"
async with AsyncWebCrawler(verbose=False) as crawler:
# Project A: Security documentation
print("\n1. Project A: Building security knowledge...")
crawler_a = AdaptiveCrawler(crawler)
await crawler_a.digest(
start_url="https://httpbin.org/basic-auth/user/pass",
query="authentication security headers"
)
crawler_a.export_knowledge_base(project_a_kb)
print(f" - Exported {len(crawler_a.state.knowledge_base)} documents")
# Project B: API testing
print("\n2. Project B: Building testing knowledge...")
crawler_b = AdaptiveCrawler(crawler)
await crawler_b.digest(
start_url="https://httpbin.org/anything",
query="testing endpoints mocking"
)
crawler_b.export_knowledge_base(project_b_kb)
print(f" - Exported {len(crawler_b.state.knowledge_base)} documents")
# Merge knowledge bases
print("\n3. Merging knowledge bases...")
merged_crawler = AdaptiveCrawler(crawler)
# Import both knowledge bases
await merged_crawler.import_knowledge_base(project_a_kb)
initial_size = len(merged_crawler.state.knowledge_base)
await merged_crawler.import_knowledge_base(project_b_kb)
final_size = len(merged_crawler.state.knowledge_base)
print(f" - Project A documents: {initial_size}")
print(f" - Additional from Project B: {final_size - initial_size}")
print(f" - Total merged documents: {final_size}")
# Export merged knowledge
merged_kb = "merged_knowledge.jsonl"
merged_crawler.export_knowledge_base(merged_kb)
print(f"\n4. Merged knowledge base exported to {merged_kb}")
# Show combined coverage
print("\n5. Combined knowledge coverage:")
merged_crawler.print_stats(detailed=False)
async def main():
"""Run all examples"""
try:
# Build initial knowledge base
kb_path = await build_knowledge_base()
# Analyze the export
await analyze_knowledge_base(kb_path)
# Import and extend
await import_and_continue()
# Demonstrate sharing
await share_knowledge_bases()
print("\n" + "="*60)
print("All examples completed successfully!")
print("="*60)
finally:
# Clean up generated files
print("\nCleaning up generated files...")
for file in [
"web_tech_knowledge.jsonl",
"web_tech_knowledge_extended.jsonl",
"project_a_knowledge.jsonl",
"project_b_knowledge.jsonl",
"merged_knowledge.jsonl"
]:
Path(file).unlink(missing_ok=True)
print("Cleanup complete.")
if __name__ == "__main__":
asyncio.run(main())