# crawl4ai/docs/examples/summarize_page.py

import asyncio
import json
import os

from pydantic import BaseModel, Field

from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai import CacheMode, LLMExtractionStrategy

# Page that main() crawls and summarizes.
url = "https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot"

class PageSummary(BaseModel):
    # Shape of the structured summary requested from the LLM; its JSON schema
    # (PageSummary.model_json_schema()) is handed to the extraction strategy.
    #
    # NOTE: documented with comments instead of a class docstring on purpose:
    # pydantic copies a model's docstring into the generated schema's
    # "description", which would change what is sent to the LLM.
    title: str = Field(..., description="Title of the page.")
    summary: str = Field(..., description="Summary of the page.")
    brief_summary: str = Field(..., description="Brief summary of the page.")
    # Typed as list[str] so the schema states that every keyword is a string,
    # matching the example JSON given in the extraction instruction.
    keywords: list[str] = Field(..., description="Keywords assigned to the page.")


async def main():
    """Crawl ``url``, extract a structured page summary via an LLM, and save it.

    The page is fetched with ``AsyncWebCrawler`` using ``CacheMode.BYPASS`` and
    an ``LLMExtractionStrategy`` configured with the ``PageSummary`` JSON
    schema. The extracted JSON is parsed and printed, then written verbatim to
    ``.data/page_summary.json`` (the directory is created if it is missing).

    Raises:
        RuntimeError: If the crawl is reported as unsuccessful or no extracted
            content is returned.
    """
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider="openai/gpt-4o",
            # Read from the environment so the key never lives in the source.
            api_token=os.getenv("OPENAI_API_KEY"),
        ),
        schema=PageSummary.model_json_schema(),
        extraction_type="schema",
        apply_chunking=False,
        instruction="From the crawled content, extract the following details: "
        "1. Title of the page "
        "2. Summary of the page, which is a detailed summary "
        "3. Brief summary of the page, which is a paragraph text "
        "4. Keywords assigned to the page, which is a list of keywords. "
        'The extracted JSON format should look like this: '
        '{ "title": "Page Title", "summary": "Detailed summary of the page.", "brief_summary": "Brief summary in a paragraph.", "keywords": ["keyword1", "keyword2", "keyword3"] }',
    )
    run_config = CrawlerRunConfig(
        word_count_threshold=1,
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=extraction_strategy,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url, config=run_config)

    # Fail with a clear message instead of letting json.loads(None) raise an
    # opaque TypeError when the crawl or the LLM extraction did not succeed.
    if not result.success or not result.extracted_content:
        reason = result.error_message or "no extracted content returned"
        raise RuntimeError(f"Extraction failed for {url}: {reason}")

    page_summary = json.loads(result.extracted_content)
    print(page_summary)

    output_path = ".data/page_summary.json"
    # open(..., "w") does not create missing parent directories.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(result.extracted_content)


# Run the example only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())