mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-10 07:48:50 +00:00
Update docs/examples to use current API: - proxy → proxy_config in BrowserConfig - result.fit_markdown → result.markdown.fit_markdown - result.fit_html → result.markdown.fit_html - markdown_v2 deprecation notes updated - bypass_cache → cache_mode=CacheMode.BYPASS - LLMExtractionStrategy now uses llm_config=LLMConfig(...) - CrawlerConfig → CrawlerRunConfig - cache_mode string values → CacheMode enum - Fix missing CacheMode import in local-files.md - Fix indentation in app-detail.html example - Fix tautological cache mode descriptions in arun.md From PR #1770 by @maksimzayats
56 lines
1.7 KiB
Python
56 lines
1.7 KiB
Python
# File: async_webcrawler_multiple_urls_example.py
|
|
import os, sys
|
|
|
|
# Make the in-repo crawl4ai package importable: take the folder containing
# this script, climb two directory levels, and append the result to sys.path.
script_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(os.path.dirname(script_dir))
sys.path.append(parent_dir)
|
|
|
|
import asyncio
|
|
from crawl4ai import AsyncWebCrawler
|
|
|
|
|
|
async def main():
    """Crawl a fixed list of URLs with ``arun_many`` and print a summary of each.

    For every successful result the page title, the markdown word count, the
    combined number of internal and external links, and the number of images
    are printed. For every failed result the URL and its error message are
    printed instead.
    """
    # Imported locally because the module-level import only brings in
    # AsyncWebCrawler; without this, CacheMode.BYPASS raises NameError.
    from crawl4ai import CacheMode, CrawlerRunConfig

    # List of URLs to crawl
    urls = [
        "https://example.com",
        "https://python.org",
        "https://github.com",
        "https://stackoverflow.com",
        "https://news.ycombinator.com",
    ]

    # Set up crawling parameters. Per-run options are bundled into a
    # CrawlerRunConfig instead of being passed as loose keyword arguments.
    word_count_threshold = 100
    run_config = CrawlerRunConfig(
        word_count_threshold=word_count_threshold,
        cache_mode=CacheMode.BYPASS,
        verbose=True,
    )

    # Initialize the AsyncWebCrawler
    async with AsyncWebCrawler(verbose=True) as crawler:
        # Run the crawling process for multiple URLs
        results = await crawler.arun_many(urls=urls, config=run_config)

        # Process the results
        for result in results:
            if not result.success:
                print(f"Failed to crawl: {result.url}")
                print(f"Error: {result.error_message}")
                print("---")
                continue

            # Guard against a missing metadata mapping so the title lookup
            # falls back to 'N/A' instead of raising AttributeError.
            metadata = result.metadata or {}
            internal_links = result.links.get('internal', [])
            external_links = result.links.get('external', [])

            print(f"Successfully crawled: {result.url}")
            print(f"Title: {metadata.get('title', 'N/A')}")
            print(f"Word count: {len(result.markdown.split())}")
            print(
                f"Number of links: {len(internal_links) + len(external_links)}"
            )
            print(f"Number of images: {len(result.media.get('images', []))}")
            print("---")
|
|
|
|
|
|
# Script entry point: run the async example to completion when this file is
# executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(main())
|