mirror of
https://github.com/unclecode/crawl4ai.git
synced 2026-06-10 07:48:50 +00:00
Update docs/examples to use current API:
- proxy → proxy_config in BrowserConfig
- result.fit_markdown → result.markdown.fit_markdown
- result.fit_html → result.markdown.fit_html
- markdown_v2 deprecation notes updated
- bypass_cache → cache_mode=CacheMode.BYPASS
- LLMExtractionStrategy now uses llm_config=LLMConfig(...)
- CrawlerConfig → CrawlerRunConfig
- cache_mode string values → CacheMode enum
- Fix missing CacheMode import in local-files.md
- Fix indentation in app-detail.html example
- Fix tautological cache mode descriptions in arun.md
From PR #1770 by @maksimzayats
54 lines
2.1 KiB
Python
54 lines
2.1 KiB
Python
import asyncio
|
|
import json
|
|
import os
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
|
from crawl4ai import CacheMode, LLMExtractionStrategy
|
|
|
|
# Page that main() crawls and summarizes (a Visual Studio Marketplace item listing).
url = r"https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot"
|
|
|
|
# Shape of the structured summary the LLM is asked to return for a page.
# Its JSON schema (PageSummary.model_json_schema()) is what gets handed to the
# extraction strategy. Documented with comments instead of a docstring because
# pydantic copies a model docstring into the schema's "description".
class PageSummary(BaseModel):
    title: str = Field(..., description="Title of the page.")
    summary: str = Field(..., description="Summary of the page.")
    brief_summary: str = Field(..., description="Brief summary of the page.")
    # Typed as list[str] so the generated schema states the item type; the
    # extraction prompt and its example both describe a list of strings.
    keywords: list[str] = Field(..., description="Keywords assigned to the page.")
|
|
|
|
|
|
async def main():
    """Crawl ``url`` and save an LLM-generated summary of the page.

    Runs one crawl with the cache bypassed, using an ``LLMExtractionStrategy``
    driven by the ``PageSummary`` JSON schema. The parsed extraction is
    printed and the raw JSON text is written to ``.data/page_summary.json``.

    Raises:
        RuntimeError: If the crawl result carries no extracted content.
    """
    output_path = ".data/page_summary.json"

    llm_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider="openai/gpt-4o",
            api_token=os.getenv("OPENAI_API_KEY"),
        ),
        schema=PageSummary.model_json_schema(),
        extraction_type="schema",
        apply_chunking=False,
        instruction="From the crawled content, extract the following details: "
        "1. Title of the page "
        "2. Summary of the page, which is a detailed summary "
        "3. Brief summary of the page, which is a paragraph text "
        "4. Keywords assigned to the page, which is a list of keywords. "
        'The extracted JSON format should look like this: '
        '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }',
    )
    run_config = CrawlerRunConfig(
        word_count_threshold=1,
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=llm_strategy,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url, config=run_config)

    extracted = result.extracted_content
    if not extracted:
        # json.loads(None) would fail with an opaque TypeError; report the
        # missing extraction explicitly instead.
        detail = getattr(result, "error_message", None) or "no extracted content"
        raise RuntimeError(f"Extraction failed for {url}: {detail}")

    page_summary = json.loads(extracted)
    print(page_summary)

    # open() does not create missing parent directories, so make sure the
    # output directory exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(extracted)
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the async entry point only when this file is executed as a script.
    asyncio.run(main())
|