composable_kernel/script/update_amd_copyright_headers.py

#!/usr/bin/env python3

# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
# SPDX-License-Identifier: MIT

"""
Purpose:
  Normalize and enforce AMD two-line copyright + SPDX headers across files.

Target files:
  - C/C++-style: .cpp, .hpp, .inc  -> uses "//" comment style
  - Hash-style:  .py, .cmake, .sh, and CMakeLists.txt -> uses "#" style

Header formats inserted (top of file, followed by exactly one blank line):
  C/C++  :
    // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
    // SPDX-License-Identifier: MIT
    <blank>
  Hash   :
    (the same two lines, with "#" in place of "//")
    <blank>

Shebang special case (hash-style only):
  - If line 1 starts with "#!", keep shebang, then a blank line, then the
    two hash-style header lines, then a blank line.

Removal rules:
  - Remove any comment lines (anywhere in file) containing the keywords
    "copyright" or "spdx" (case-insensitive). Blank lines are preserved.
  - Remove long-form MIT license block comment when:
      a) The file starts with the block (absolute top), OR
      b) The block appears immediately after the AMD header position
         (i.e., when remainder at insertion point begins with "/*" and
         the first content line is "* The MIT License (MIT)").

Blank-line normalization:
  - Enforce exactly ONE blank line immediately after the AMD header.
    (Drop only the leading blank lines at the insertion point before
     re-inserting the header.)
  - Do not change blank lines between other non-copyright comments.

Preservation:
  - Preserve original newline style: CRLF (\\r\\n) vs LF (\\n).
  - Preserve UTF-8 BOM if present.
  - Do not modify non-comment code lines.

Idempotency:
  - Running this script multiple times does not further modify files.
"""

from __future__ import annotations
import re
import sys
from pathlib import Path
from typing import List, Tuple

# Canonical two-line header for files that use "//" comments. Each entry is
# one line of text without a line terminator; build_header_lines() appends it.
AMD_CPP_HEADER_TEXT = [
    "// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.",
    "// SPDX-License-Identifier: MIT",
]
# The same two-line header for files that use "#" comments.
AMD_HASH_HEADER_TEXT = [
    "# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.",
    "# SPDX-License-Identifier: MIT",
]

# File suffixes handled with the "//" header (process_file() lower-cases the
# suffix before the lookup).
CPP_EXTS = {".cpp", ".hpp", ".inc"}
# File suffixes handled with the "#" header. CMakeLists.txt has no matching
# suffix and is recognized by file name in process_file() instead.
HASH_EXTS = {".py", ".cmake", ".sh"}

# --- Encoding helpers -------------------------------------------------------


def has_bom(raw: bytes) -> bool:
    """Return True when *raw* begins with the three-byte UTF-8 byte order mark."""
    utf8_bom = b"\xef\xbb\xbf"
    return raw[: len(utf8_bom)] == utf8_bom


def decode_text(raw: bytes) -> str:
    """Decode file bytes as UTF-8, dropping a leading BOM if there is one.

    Bytes that are not valid UTF-8 are mapped to lone surrogate code points
    ("surrogateescape") instead of U+FFFD. That keeps the original byte values
    recoverable, so rewriting a file that contains, say, Latin-1 text in a
    comment does not silently destroy it. ``encode_text`` performs the
    inverse mapping.
    """
    return raw.decode("utf-8-sig", errors="surrogateescape")


def encode_text(text: str, bom: bool) -> bytes:
    """Encode *text* as UTF-8, prepending the UTF-8 BOM when *bom* is true.

    Surrogate code points produced by ``decode_text`` for undecodable input
    are turned back into the exact bytes they came from.
    """
    data = text.encode("utf-8", errors="surrogateescape")
    return (b"\xef\xbb\xbf" + data) if bom else data


# --- Newline detection ------------------------------------------------------


def detect_newline_sequence(raw: bytes) -> str:
    """Pick the line terminator to use for lines this script inserts.

    Returns "\\r\\n" if the raw bytes contain at least one CRLF pair, and
    "\\n" in every other case (LF-only files, files with no line break at
    all, and empty files).
    """
    return "\r\n" if b"\r\n" in raw else "\n"


# --- Utilities --------------------------------------------------------------


def is_comment_line(line: str, style: str) -> bool:
    """Tell whether *line* looks like a comment line for the given *style*.

    The check is purely prefix-based on the line with leading whitespace
    removed:
      - "cpp":  starts with "//", "/*" or "*" (which also covers "*/").
      - "hash": starts with "#".
    Any other *style* value yields False.
    """
    body = line.lstrip()
    if style == "hash":
        return body.startswith("#")
    if style == "cpp":
        # A "*/" prefix is a special case of the "*" prefix, so three
        # prefixes are enough.
        return body.startswith(("//", "/*", "*"))
    return False


def has_keywords(line: str) -> bool:
    """Return True if *line* mentions "copyright" or "spdx", ignoring case."""
    folded = line.lower()
    return any(word in folded for word in ("copyright", "spdx"))


# --- MIT License banner detection ------------------------------
MIT_C_FIRST_LINE_RE = re.compile(r"^\s*\*\s*The MIT License \(MIT\)")
MIT_HASH_FIRST_LINE_RE = re.compile(r"^\s*#\s*The MIT License \(MIT\)")


def remove_top_mit_block(lines: List[str]) -> Tuple[List[str], bool]:
    """
    Remove a long-form MIT license banner that starts on the first line.

    Supports:
      - C-style block: the first line opens a "/*" comment that is closed on
        a later line; the block is removed only if one of the lines after the
        opener matches MIT_C_FIRST_LINE_RE.
      - Hash-style banner: the contiguous run of lines starting with '#' at
        the top; removed only if a line in that run matches
        MIT_HASH_FIRST_LINE_RE.

    A "/* ... */" comment that opens and closes on the first line is never
    treated as the banner, so nothing below it is touched.

    Args:
        lines: File content split into lines, each keeping its own EOL.

    Returns:
        (new_lines, removed_flag). When nothing is removed, the input list
        itself is returned together with False. Remaining lines keep their
        EOLs.
    """
    if not lines:
        return lines, False

    first = lines[0].lstrip()

    # C-style block
    if first.startswith("/*"):
        # The comment is already closed on line 1. Scanning further would
        # take the next "*/" found anywhere below as the end of this comment
        # and could delete the unrelated code in between.
        if "*/" in first[2:]:
            return lines, False

        saw_mit = False
        for i, line in enumerate(lines[1:], 1):
            if not saw_mit and MIT_C_FIRST_LINE_RE.match(line):
                saw_mit = True
            s = line.strip()
            if s.startswith("*/") or s.endswith("*/"):
                # The first terminator ends the block; drop the block only
                # if the MIT title line was seen inside it.
                if saw_mit:
                    return lines[i + 1:], True
                return lines, False
        # Unterminated comment: leave the file alone.
        return lines, False

    # Hash-style contiguous banner
    if first.startswith("#"):
        end_idx, saw_mit = 0, False
        for i, line in enumerate(lines):
            if not line.lstrip().startswith("#"):
                break
            if not saw_mit and MIT_HASH_FIRST_LINE_RE.match(line):
                saw_mit = True
            end_idx = i + 1
        if saw_mit:
            return lines[end_idx:], True
        return lines, False

    return lines, False


# --- Removal + normalization helpers ---------------------------------------


def remove_keyword_comment_lines_globally(lines: List[str], style: str) -> List[str]:
    """Remove comment lines that mention a license keyword, anywhere in the file.

    A line is removed when ``is_comment_line`` accepts it for *style* and
    ``has_keywords`` finds a keyword in it. Blank lines and all other lines
    are passed through unchanged.

    For the "cpp" style, a matching line may also carry a block-comment
    delimiter: it may open a comment ("/* <text>" with no closing "*/") or
    close one ("* <text> */"). Dropping such a line whole would unbalance the
    comment and turn the neighbouring comment text into code, so in those two
    cases the bare delimiter is kept in its place, with the line's original
    indentation and line ending.

    Args:
        lines: File content split into lines, each keeping its own EOL.
        style: "cpp" or "hash"; forwarded to ``is_comment_line``.

    Returns:
        A new list with the matching lines removed (or reduced to a bare
        delimiter as described above).
    """
    kept: List[str] = []
    for line in lines:
        if not (is_comment_line(line, style) and has_keywords(line)):
            kept.append(line)
            continue
        if style != "cpp":
            continue

        content = line.rstrip("\r\n")
        eol = line[len(content):]
        indent = content[: len(content) - len(content.lstrip())]
        body = content.strip()

        if body.startswith("//"):
            # Line comment: nothing structural to preserve.
            continue
        if body.startswith("/*"):
            # Opener without a terminator on the same line: keep the "/*" so
            # the following lines stay inside a comment.
            if "*/" not in body[2:]:
                kept.append(indent + "/*" + eol)
        elif body.startswith("*/") or body.endswith("*/"):
            # Continuation line that also terminates the comment: keep the
            # "*/" so the comment is still closed here.
            kept.append(indent + "*/" + eol)
    return kept


def drop_leading_blank_lines(lines: List[str]) -> List[str]:
    """Return a copy of *lines* without its leading run of blank lines.

    A line counts as blank when it contains nothing but whitespace. Blank
    lines after the first non-blank line are kept.
    """
    for position, entry in enumerate(lines):
        if entry.strip():
            return lines[position:]
    # Every line was blank (or there were none): nothing is left.
    return lines[len(lines):]


# --- Header builder ---------------------------------------------------------


def build_header_lines(style: str, nl: str) -> List[str]:
    """Build the header block to insert, as a list of terminated lines.

    Uses the "//" header text when *style* is "cpp" and the "#" header text
    for any other value. The result is the two header lines followed by one
    empty line, each ending in *nl*.
    """
    if style == "cpp":
        template = AMD_CPP_HEADER_TEXT
    else:
        template = AMD_HASH_HEADER_TEXT

    block = [template[0] + nl, template[1] + nl]
    block.append(nl)  # the single blank line that follows the header
    return block


# --- Main transforms --------------------------------------------------------


def process_cpp(text: str, nl: str) -> str:
    """Return *text* rewritten so it starts with the "//"-style header.

    Steps, in order:
      1. Drop an MIT banner block sitting at the very top of the file.
      2. Drop every comment line that mentions a license keyword.
      3. Drop leading blank lines, then drop an MIT banner block that has
         now surfaced at the top (i.e. one that followed the old header).
      4. Prepend the header lines plus one blank line, terminated with *nl*.
    """
    body = text.splitlines(True)

    body = remove_top_mit_block(body)[0]
    body = remove_keyword_comment_lines_globally(body, style="cpp")
    body = remove_top_mit_block(drop_leading_blank_lines(body))[0]

    header = build_header_lines("cpp", nl)
    return "".join(header + body)


def process_hash(text: str, nl: str) -> str:
    """Return *text* rewritten so it carries the "#"-style header.

    If the first line is a shebang ("#!..."), it stays on line 1 and is
    followed by one blank line and then the header; otherwise the header goes
    at the very top. In both cases the rest of the file has its keyword
    comment lines removed, its leading blank lines dropped, and a hash-style
    MIT banner at the insertion point removed.

    Args:
        text: Decoded file content.
        nl: Line terminator to use for inserted lines.

    Returns:
        The rewritten file content.
    """
    lines = text.splitlines(True)
    if not lines:
        return "".join(build_header_lines("hash", nl))

    prefix: List[str] = []
    if lines[0].startswith("#!"):
        shebang = lines[0]
        # A file that consists of a shebang with no trailing newline would
        # otherwise end up with the header directly under the shebang (no
        # blank line), and a second run would then change the file again.
        if not shebang.endswith(("\n", "\r")):
            shebang += nl
        prefix = [shebang, nl]
        lines = lines[1:]

    remainder = remove_keyword_comment_lines_globally(lines, style="hash")
    remainder = drop_leading_blank_lines(remainder)
    remainder, _ = remove_top_mit_block(remainder)  # remove MIT block after header
    return "".join(prefix + build_header_lines("hash", nl) + remainder)


# --- File processing & CLI --------------------------------------------------


def process_file(path: Path) -> bool:
    """Rewrite the header of one file in place.

    The transform is chosen from the lower-cased file suffix (CPP_EXTS or
    HASH_EXTS) or, for CMakeLists.txt, from the file name. Files of any other
    type are not read at all.

    Returns:
        True if the file was rewritten, False if its type is unsupported or
        its content already matches the expected result.
    """
    ext = path.suffix.lower()
    if ext in CPP_EXTS:
        transform = process_cpp
    elif ext in HASH_EXTS or path.name == "CMakeLists.txt":
        transform = process_hash
    else:
        return False

    original_bytes = path.read_bytes()
    original_text = decode_text(original_bytes)
    rewritten = transform(original_text, detect_newline_sequence(original_bytes))

    if rewritten == original_text:
        return False

    # Re-attach the BOM only if the file had one to begin with.
    path.write_bytes(encode_text(rewritten, has_bom(original_bytes)))
    return True


def main(argv: List[str]) -> int:
    """Command-line entry point.

    Args:
        argv: Full argument vector; argv[0] is the program name and every
            following item is a path to one file to process.

    Returns:
        2 if no paths were given (the module docstring is printed as usage),
        1 if at least one path was missing, was a directory, or failed while
        being processed, and 0 otherwise.
    """
    targets = argv[1:]
    if not targets:
        print(__doc__)
        return 2

    updated_count = 0
    skipped_count = 0
    problems: List[str] = []

    for raw_arg in targets:
        target = Path(raw_arg)
        try:
            if not target.exists():
                problems.append(f"Not found: {target}")
            elif target.is_dir():
                problems.append(f"Is a directory (pass specific files): {target}")
            elif process_file(target):
                updated_count += 1
                print(f"Updated: {target}")
            else:
                skipped_count += 1
                print(f"Skipped (no change needed or unsupported type): {target}")
        except Exception as exc:
            # One bad file must not stop the batch; record it and move on.
            problems.append(f"Error processing {target}: {exc}")

    print(f"\nSummary: {updated_count} updated, {skipped_count} skipped, {len(problems)} errors")
    for problem in problems:
        print(f" - {problem}")
    return 1 if problems else 0


if __name__ == "__main__":
    raise SystemExit(main(sys.argv))