Files
crawl4ai/docs/examples/summarize_page.py
unclecode 04e83aa3c7 docs: modernize deprecated API usage across shipped docs (#1770)
Update docs/examples to use current API:
- proxy → proxy_config in BrowserConfig
- result.fit_markdown → result.markdown.fit_markdown
- result.fit_html → result.markdown.fit_html
- markdown_v2 deprecation notes updated
- bypass_cache → cache_mode=CacheMode.BYPASS
- LLMExtractionStrategy now uses llm_config=LLMConfig(...)
- CrawlerConfig → CrawlerRunConfig
- cache_mode string values → CacheMode enum
- Fix missing CacheMode import in local-files.md
- Fix indentation in app-detail.html example
- Fix tautological cache mode descriptions in arun.md

From PR #1770 by @maksimzayats
2026-03-07 07:01:06 +00:00

54 lines
2.1 KiB
Python

import asyncio
import json
import os
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai import CacheMode, LLMExtractionStrategy
# Target page to crawl and summarize (a Visual Studio Marketplace item page).
url = r"https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot"
# Schema of the structured summary the LLM is asked to return for a page.
# Its JSON schema (``PageSummary.model_json_schema()``) is handed to the
# extraction strategy, so the field descriptions double as hints for the model.
# NOTE: documented with comments rather than a class docstring so the generated
# schema does not gain an extra top-level "description" entry.
class PageSummary(BaseModel):
    # Title of the page.
    title: str = Field(..., description="Title of the page.")
    # Detailed summary of the page.
    summary: str = Field(..., description="Summary of the page.")
    # Short, paragraph-length summary of the page.
    brief_summary: str = Field(..., description="Brief summary of the page.")
    # Keywords as strings; the element type makes the schema state the item
    # type explicitly instead of allowing arbitrary list contents.
    keywords: list[str] = Field(..., description="Keywords assigned to the page.")
async def main():
    """Crawl ``url``, extract a ``PageSummary`` via an LLM, print and save it.

    The page is fetched with the cache bypassed and processed by an
    ``LLMExtractionStrategy`` configured with the ``PageSummary`` JSON schema.
    The extracted JSON is parsed and printed, then written verbatim to
    ``.data/page_summary.json`` (relative to the current working directory).
    The ``.data`` directory is created if it does not exist.

    The OpenAI API token is read from the ``OPENAI_API_KEY`` environment
    variable.

    Raises:
        RuntimeError: If the crawl returned no extracted content.
        json.JSONDecodeError: If the extracted content is not valid JSON.
    """
    instruction = (
        "From the crawled content, extract the following details: "
        "1. Title of the page "
        "2. Summary of the page, which is a detailed summary "
        "3. Brief summary of the page, which is a paragraph text "
        "4. Keywords assigned to the page, which is a list of keywords. "
        'The extracted JSON format should look like this: '
        '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }'
    )
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider="openai/gpt-4o",
            api_token=os.getenv("OPENAI_API_KEY"),
        ),
        schema=PageSummary.model_json_schema(),
        extraction_type="schema",
        apply_chunking=False,
        instruction=instruction,
    )
    run_config = CrawlerRunConfig(
        word_count_threshold=1,
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=extraction_strategy,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url, config=run_config)

    extracted = result.extracted_content
    if not extracted:
        # Fail with a readable message instead of the opaque TypeError that
        # json.loads(None) would raise. error_message is read defensively in
        # case the result object does not provide it.
        reason = getattr(result, "error_message", None) or "no extracted content returned"
        raise RuntimeError(f"Extraction failed for {url}: {reason}")

    page_summary = json.loads(extracted)
    print(page_summary)

    output_path = ".data/page_summary.json"
    # The output directory is not guaranteed to exist on a fresh checkout;
    # create it so the write cannot fail after the LLM call has been made.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(extracted)
if __name__ == "__main__":
    # Run the async entry point only when executed as a script, not on import.
    asyncio.run(main())