#!/usr/bin/env python3

# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT

"""
Example 03: FMHA Benchmark

Performance benchmarking with warmup and repeated iterations across
multiple (batch, sequence length) configurations.

Usage:
    python3 03_benchmark.py
    python3 03_benchmark.py --help
    python3 03_benchmark.py --warmup 5 --repeat 20
    python3 03_benchmark.py --arch gfx942
"""

import sys
import argparse
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent / "python"))
import numpy as np

from fmha_utils import (
    FmhaKernelSpec,
    FmhaProblem,
    detect_gpu_arch,
    setup_fmha_dispatcher,
    spec_to_config,
)


def main():
    """Run the FMHA benchmark suite and print a results table.

    Parses ``--arch``, ``--warmup`` and ``--repeat`` from the command line,
    sets up a dispatcher for an fp16 ``qr_async`` kernel with head dimension
    128, then calls ``runner.run`` over a fixed list of (batch, seqlen)
    configurations. For each configuration it prints the min/avg/max of the
    reported ``time_ms`` values and the derived TFLOPS, followed by a summary
    of the average and peak TFLOPS.

    Returns:
        int: Process exit status. 0 if at least one configuration produced a
        timing; 1 if dispatcher setup failed or no configuration produced a
        timing.
    """
    parser = argparse.ArgumentParser(
        description="FMHA Benchmark Example - performance testing",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python3 03_benchmark.py                     # Default benchmark suite
  python3 03_benchmark.py --warmup 5          # More warmup iterations
  python3 03_benchmark.py --repeat 20         # More benchmark iterations
        """,
    )
    parser.add_argument(
        "--arch",
        # Detection is deferred until after parsing so that an explicit
        # --arch never triggers it.
        default=None,
        help="Target architecture (auto-detected from rocminfo)",
    )
    parser.add_argument(
        "--warmup", type=int, default=3, help="Warmup iterations (default: 3)"
    )
    parser.add_argument(
        "--repeat", type=int, default=10, help="Benchmark iterations (default: 10)"
    )
    args = parser.parse_args()

    # Reject iteration counts that cannot yield a measurement before the
    # dispatcher is set up; parser.error prints usage and exits with status 2.
    if args.warmup < 0:
        parser.error("--warmup must be >= 0")
    if args.repeat < 1:
        parser.error("--repeat must be >= 1")

    arch = args.arch if args.arch is not None else detect_gpu_arch()

    print("=" * 70)
    print("Example 03: FMHA Benchmark")
    print("=" * 70)

    # Step 1: Setup dispatcher with compute-optimized config
    print("\nStep 1: Setup Dispatcher")

    # Single source of truth for the head dimension used by both the kernel
    # spec and every benchmarked problem.
    hdim = 128

    # FmhaKernelSpec fields:
    #   name      -- human-readable kernel identifier
    #   hdim      -- head dimension (hdim_q = hdim_v)
    #   pipeline  -- "qr_async" (async prefetch) or "qr" (synchronous)
    #   tile_m0   -- Stage 0 tile along seqlen_q  (Q*K^T M dimension)
    #   tile_n0   -- Stage 0 tile along seqlen_k  (Q*K^T N dimension)
    #   tile_k0   -- Stage 0 tile along hdim_q    (Q*K^T K dimension)
    spec = FmhaKernelSpec(name="benchmark", hdim=hdim, pipeline="qr_async")
    config = spec_to_config(spec, dtype="fp16", arch=arch)

    setup = setup_fmha_dispatcher(config, verbose=True)
    if not setup.success:
        print(f"  ERROR: {setup.error}")
        return 1

    runner = setup.runner
    print(f"  Library: {setup.library_path}")
    print(f"  Build:   {setup.build_time_s:.1f} s")

    # Step 2: Benchmark
    print("\nStep 2: Benchmark")

    # (batch, seqlen) pairs; seqlen is used for both seqlen_q and seqlen_k.
    bench_configs = [
        (1, 128),
        (1, 256),
        (1, 512),
        (1, 1024),
        (1, 2048),
        (2, 128),
        (2, 256),
        (2, 512),
        (2, 1024),
        (4, 128),
        (4, 256),
        (4, 512),
        (8, 128),
        (8, 256),
    ]

    print(f"  Warmup: {args.warmup}, Repeat: {args.repeat}\n")

    print(
        f"  {'Batch':>5} {'SeqLen':>7} | {'Min(ms)':>10} {'Avg(ms)':>10} {'Max(ms)':>10} | {'TFLOPS':>10}"
    )
    print("  " + "-" * 62)

    all_tflops = []
    num_timed = 0  # configurations with at least one successful timed run

    try:
        for batch, seqlen in bench_configs:
            prob = FmhaProblem(
                batch=batch,
                nhead_q=8,
                nhead_k=8,
                seqlen_q=seqlen,
                seqlen_k=seqlen,
                hdim_q=hdim,
                hdim_v=hdim,
            )

            # Re-seed per configuration so each one gets reproducible inputs.
            np.random.seed(42)
            Q = (np.random.randn(*prob.q_shape()) * 0.1).astype(np.float16)
            K = (np.random.randn(*prob.k_shape()) * 0.1).astype(np.float16)
            V = (np.random.randn(*prob.v_shape()) * 0.1).astype(np.float16)

            # Warmup results are intentionally discarded.
            for _ in range(args.warmup):
                runner.run(Q, K, V, prob)

            # Only successful runs contribute to the statistics.
            times = []
            for _ in range(args.repeat):
                result = runner.run(Q, K, V, prob)
                if result.success:
                    times.append(result.time_ms)

            if not times:
                print(
                    f"  {batch:>5} {seqlen:>7} | {'---':>10} {'---':>10} {'---':>10} | {'FAIL':>10}"
                )
                continue

            num_timed += 1
            min_time = min(times)
            avg_time = sum(times) / len(times)
            max_time = max(times)

            if avg_time > 0:
                # time_ms -> seconds, ops/s -> tera-ops/s.
                tflops = prob.num_ops / (avg_time * 1e-3) / 1e12
                all_tflops.append(tflops)
                tflops_cell = f"{tflops:>10.2f}"
            else:
                # A zero reported time would divide by zero; keep the timing
                # columns but omit throughput for this row.
                tflops_cell = f"{'n/a':>10}"

            print(
                f"  {batch:>5} {seqlen:>7} | {min_time:>10.4f} {avg_time:>10.4f} {max_time:>10.4f} | {tflops_cell}"
            )
    finally:
        # Always release the runner, even if a run raised or was interrupted.
        runner.cleanup()

    # Summary
    print("\n" + "=" * 70)
    print("Summary")
    print("=" * 70)

    if all_tflops:
        print(f"  Average: {sum(all_tflops) / len(all_tflops):.2f} TFLOPS")
        print(f"  Peak:    {max(all_tflops):.2f} TFLOPS")
    elif not num_timed:
        print("  No configuration completed successfully")

    print("=" * 70)

    # Mirror the setup-failure path: a benchmark that measured nothing is a
    # failure for callers that inspect the exit status.
    return 0 if num_timed else 1


if __name__ == "__main__":
    # Run the benchmark only when executed as a script, and hand main()'s
    # return value to the interpreter as the process exit status.
    exit_status = main()
    sys.exit(exit_status)