#!/usr/bin/env python3
"""
Crawl4AI Stats Dashboard Generator

Fetches live data from GitHub API (via gh CLI), PyPI Stats API, and Docker Hub API,
then generates docs/md_v2/stats.md with embedded charts (Chart.js).

Usage:
    python scripts/update_stats.py
"""

import json
import subprocess
import sys
import urllib.request
import urllib.error
from datetime import datetime, timedelta
from pathlib import Path
from collections import defaultdict

# --- Configuration ---

# GitHub "owner/name" slug used for all `gh api repos/...` calls.
REPO = "unclecode/crawl4ai"
# Package name queried on pypistats.org.
PYPI_PACKAGE = "crawl4ai"
# Docker Hub "namespace/name" repository queried for the pull count.
DOCKER_REPO = "unclecode/crawl4ai"
# Generated page: <parent of this script's directory>/docs/md_v2/stats.md
OUTPUT_PATH = Path(__file__).resolve().parent.parent / "docs" / "md_v2" / "stats.md"

# Star history milestones (manually maintained — stargazer API is too slow for 60K+ stars)
# Each entry is (ISO date, star count); keep the list in chronological order.
STAR_MILESTONES = [
    ("2024-02-01", 0),
    ("2024-06-01", 2000),
    ("2024-09-01", 5000),
    ("2024-10-01", 12000),
    ("2024-11-01", 18000),
    ("2024-12-01", 22000),
    ("2025-01-01", 26000),
    ("2025-02-01", 30000),
    ("2025-03-01", 34000),
    ("2025-04-01", 38000),
    ("2025-05-01", 42000),
    ("2025-06-01", 45000),
    ("2025-07-01", 47000),
    ("2025-08-01", 49000),
    ("2025-09-01", 51000),
    ("2025-10-01", 53000),
    ("2025-11-01", 55000),
    ("2025-12-01", 57000),
    ("2026-01-01", 59000),
]

# Historical PyPI monthly downloads (manually maintained).
# The pypistats.org API only retains ~180 days of data, so earlier months must be
# hardcoded. Source: pepy.tech total (8.46M as of Feb 2026) minus API-reported data.
# First PyPI release: Sep 25, 2024 (v0.3.0).
# Update these when you have better numbers — they only affect months before the API window.
PYPI_MONTHLY_HISTORY = { "2024-09": 28_000, # v0.3.0 launched Sep 25 — partial month "2024-10": 135_000, # v0.3.5–0.3.8, project going viral "2024-11": 210_000, # v0.3.73–0.3.746, steady growth "2024-12": 285_000, # v0.4.0–0.4.24 launch "2025-01": 350_000, # v0.4.24x series "2025-02": 380_000, # pre-0.5 momentum "2025-03": 430_000, # v0.5.0 launch "2025-04": 480_000, # v0.6.0 launch "2025-05": 520_000, # v0.6.3 adoption "2025-06": 560_000, # growth "2025-07": 620_000, # v0.7.0–0.7.2 launch "2025-08": 750_000, # v0.7.3–0.7.4 (estimated from 24K/day rate) } # --- Data Fetching --- def run_gh(args: list[str]) -> dict | list | None: """Run a gh CLI command and return parsed JSON.""" try: result = subprocess.run( ["gh", "api", *args], capture_output=True, text=True, timeout=30 ) if result.returncode != 0: print(f" [warn] gh api {' '.join(args)}: {result.stderr.strip()}") return None return json.loads(result.stdout) except (subprocess.TimeoutExpired, json.JSONDecodeError, FileNotFoundError) as e: print(f" [warn] gh api {' '.join(args)}: {e}") return None def fetch_url_json(url: str) -> dict | list | None: """Fetch JSON from a URL using urllib.""" try: req = urllib.request.Request(url, headers={"User-Agent": "crawl4ai-stats/1.0"}) with urllib.request.urlopen(req, timeout=30) as resp: return json.loads(resp.read().decode()) except (urllib.error.URLError, json.JSONDecodeError, TimeoutError) as e: print(f" [warn] GET {url}: {e}") return None def fetch_github_stats() -> dict: """Fetch repo-level stats from GitHub.""" print("Fetching GitHub repo stats...") data = run_gh([f"repos/{REPO}"]) if not data: return {"stars": 0, "forks": 0, "watchers": 0, "open_issues": 0} return { "stars": data.get("stargazers_count", 0), "forks": data.get("forks_count", 0), "watchers": data.get("subscribers_count", 0), "open_issues": data.get("open_issues_count", 0), } def fetch_contributors_count() -> int: """Fetch contributor count (paginated).""" print("Fetching contributor count...") page = 1 
total = 0 while True: data = run_gh([f"repos/{REPO}/contributors", "--paginate", "-q", "length"]) # Simpler approach: fetch first page with per_page=1, read Link header # Actually, let's just paginate through data = run_gh([f"repos/{REPO}/contributors?per_page=100&page={page}&anon=true"]) if not data or not isinstance(data, list) or len(data) == 0: break total += len(data) if len(data) < 100: break page += 1 return total def fetch_github_traffic() -> dict: """Fetch 14-day traffic data (requires push access).""" print("Fetching GitHub traffic...") views = run_gh([f"repos/{REPO}/traffic/views"]) clones = run_gh([f"repos/{REPO}/traffic/clones"]) result = {"views": [], "clones": [], "view_dates": [], "clone_dates": []} if views and "views" in views: for v in views["views"]: date_str = v["timestamp"][:10] result["view_dates"].append(date_str) result["views"].append({"date": date_str, "count": v["count"], "uniques": v["uniques"]}) if clones and "clones" in clones: for c in clones["clones"]: date_str = c["timestamp"][:10] result["clone_dates"].append(date_str) result["clones"].append({"date": date_str, "count": c["count"], "uniques": c["uniques"]}) return result def fetch_pypi_downloads() -> dict: """Fetch PyPI download stats from pypistats.org API.""" print("Fetching PyPI download stats...") url = f"https://pypistats.org/api/packages/{PYPI_PACKAGE}/overall?mirrors=true" data = fetch_url_json(url) if not data or "data" not in data: return {"monthly": {}, "daily": [], "total": 0} # Aggregate by month and collect daily data monthly = defaultdict(int) daily = [] total = 0 for entry in data["data"]: if entry.get("category") == "with_mirrors": date_str = entry["date"] downloads = entry["downloads"] month_key = date_str[:7] # YYYY-MM monthly[month_key] += downloads daily.append({"date": date_str, "downloads": downloads}) total += downloads # Sort monthly_sorted = dict(sorted(monthly.items())) daily.sort(key=lambda x: x["date"]) return {"monthly": monthly_sorted, "daily": daily, 
"total": total} def fetch_pypi_live() -> dict: """Fetch recent PyPI download stats (~180 days) from pypistats.org API.""" print("Fetching PyPI download stats (live)...") url = f"https://pypistats.org/api/packages/{PYPI_PACKAGE}/overall?mirrors=true" data = fetch_url_json(url) if not data or "data" not in data: return {"monthly": {}, "daily": [], "total": 0} monthly = defaultdict(int) daily = [] total = 0 for entry in data["data"]: if entry.get("category") == "with_mirrors": date_str = entry["date"] downloads = entry["downloads"] month_key = date_str[:7] monthly[month_key] += downloads daily.append({"date": date_str, "downloads": downloads}) total += downloads monthly_sorted = dict(sorted(monthly.items())) daily.sort(key=lambda x: x["date"]) return {"monthly": monthly_sorted, "daily": daily, "total": total} def merge_pypi_data(live: dict) -> dict: """Merge hardcoded historical monthly data with live API data. The API only has ~180 days. The first month in the API window is typically partial (e.g. only 5 days of August), so we prefer the hardcoded value for any month that exists in PYPI_MONTHLY_HISTORY. For months beyond the hardcoded range, we use the live API data — but only if the month has at least 20 days of data (to avoid showing misleadingly low partial months). 
""" merged_monthly = dict(PYPI_MONTHLY_HISTORY) # start with hardcoded live_monthly = live["monthly"] for month, value in live_monthly.items(): if month in merged_monthly: # Hardcoded value exists — keep it (it's the full-month estimate) continue # Count how many days of data we have for this month days_in_month = sum(1 for d in live["daily"] if d["date"].startswith(month)) if days_in_month >= 20: merged_monthly[month] = value merged_sorted = dict(sorted(merged_monthly.items())) total = sum(merged_sorted.values()) return { "monthly": merged_sorted, "daily": live["daily"], # daily chart only uses live data (recent ~180 days) "total": total, } def fetch_docker_pulls() -> int: """Fetch Docker Hub pull count.""" print("Fetching Docker Hub stats...") url = f"https://hub.docker.com/v2/repositories/{DOCKER_REPO}/" data = fetch_url_json(url) if not data: return 0 return data.get("pull_count", 0) # --- Formatting Helpers --- def fmt_number(n: int) -> str: """Format large numbers with commas.""" return f"{n:,}" def fmt_short(n: int) -> str: """Format large numbers with K/M suffix.""" if n >= 1_000_000: return f"{n / 1_000_000:.2f}M" if n >= 1_000: return f"{n / 1_000:.1f}K" return str(n) # --- Page Generator --- def generate_stats_md( github: dict, contributors: int, traffic: dict, pypi: dict, docker_pulls: int, star_milestones: list[tuple[str, int]], ) -> str: """Generate the full stats.md content with embedded HTML/CSS/JS.""" now = datetime.now() updated_date = now.strftime("%B %d, %Y") # Prepare data for charts # Monthly PyPI downloads monthly_labels = list(pypi["monthly"].keys()) monthly_values = list(pypi["monthly"].values()) # Get the latest month's downloads latest_month_downloads = monthly_values[-1] if monthly_values else 0 # Compute total PyPI downloads total_pypi = pypi["total"] # Star milestones — add current stars as final point star_dates = [m[0] for m in star_milestones] star_counts = [m[1] for m in star_milestones] # Append current date + current star count 
current_date = now.strftime("%Y-%m-%d") if not star_dates or star_dates[-1] != current_date: star_dates.append(current_date) star_counts.append(github["stars"]) # Cumulative downloads (PyPI + Docker over time) # Use monthly PyPI data and spread Docker pulls proportionally cumulative_labels = monthly_labels[:] cumulative_pypi = [] running = 0 for v in monthly_values: running += v cumulative_pypi.append(running) # Daily downloads (last ~180 days) daily_data = pypi["daily"] daily_labels = [d["date"] for d in daily_data] daily_values = [d["downloads"] for d in daily_data] # Traffic data traffic_dates = [] traffic_views = [] traffic_uniques = [] for v in traffic.get("views", []): traffic_dates.append(v["date"]) traffic_views.append(v["count"]) traffic_uniques.append(v["uniques"]) # --- Build the page --- return f"""--- hide: - navigation - toc ---