From dd0bea4adf1b49bdb9cdaf59bc25309178c0eed1 Mon Sep 17 00:00:00 2001 From: Michal Kulikowski Date: Thu, 21 May 2026 13:48:56 +0200 Subject: [PATCH] [rocm-libraries] direct push (commit c51a0ad) [CK_TILE] Instruction cache POC Copilot review fixes. Signed-off-by: Michal Kulikowski --- cmake/InstPrefetchPatch.cmake | 15 ++++++++++++- script/patch_prefetch_offset.py | 38 +++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/cmake/InstPrefetchPatch.cmake b/cmake/InstPrefetchPatch.cmake index 30d2413732..fe9eb3f547 100644 --- a/cmake/InstPrefetchPatch.cmake +++ b/cmake/InstPrefetchPatch.cmake @@ -24,6 +24,10 @@ if(NOT DEFINED CK_INST_PREFETCH_PATCH_DEFINED) "Dump intermediate files (merged tables, objdump text) during prefetch patching." OFF) + # Capture at include-time; CMAKE_CURRENT_LIST_DIR inside a function() + # resolves to the *caller's* directory, not the defining file's directory. + set(_CK_INST_PREFETCH_SCRIPT_DIR "${CMAKE_CURRENT_LIST_DIR}" CACHE INTERNAL "") + function(ck_inst_prefetch_patch TARGET_NAME) if(NOT ENABLE_INST_PREFETCH_PATCH) return() @@ -36,17 +40,26 @@ if(NOT DEFINED CK_INST_PREFETCH_PATCH_DEFINED) if(INST_PREFETCH_PATCH_DUMP_INTERMEDIATES) list(APPEND _extra_args --dump-intermediates) endif() + # Derive llvm-objdump from the HIP/ROCm compiler's bin directory. + get_filename_component(_compiler_dir "${CMAKE_CXX_COMPILER}" DIRECTORY) + find_program(_llvm_objdump llvm-objdump + HINTS ${_compiler_dir} /opt/rocm/llvm/bin + NO_DEFAULT_PATH) + if(NOT _llvm_objdump) + find_program(_llvm_objdump llvm-objdump) + endif() add_custom_command( TARGET ${TARGET_NAME} PRE_LINK COMMAND ${CMAKE_COMMAND} -E echo "[inst-prefetch-patch] Running patch_prefetch_offset.py for ${TARGET_NAME} (log: ${_log_file})" COMMAND ${Python3_EXECUTABLE} - ${CMAKE_SOURCE_DIR}/script/patch_prefetch_offset.py + ${_CK_INST_PREFETCH_SCRIPT_DIR}/../script/patch_prefetch_offset.py --build-dir ${CMAKE_BINARY_DIR} --target ${TARGET_NAME} --jobs ${_nproc} --skip-build-round1 + --objdump-path ${_llvm_objdump} ${_extra_args} COMMENT "[inst-prefetch-patch] Patching prefetch offsets for ${TARGET_NAME} — log: ${_log_file}" VERBATIM diff --git a/script/patch_prefetch_offset.py b/script/patch_prefetch_offset.py index 6f7e8ea4dc..60251062b9 100644 --- a/script/patch_prefetch_offset.py +++ b/script/patch_prefetch_offset.py @@ -2,6 +2,8 @@ # Copyright (c) Advanced Micro Devices, Inc., or its affiliates. # SPDX-License-Identifier: MIT +from __future__ import annotations + """Two-pass instruction-prefetch offset patcher. Round 1: build with koffset=0 so the compiler emits s_prefetch_inst_pc_rel @@ -35,6 +37,7 @@ CMake PRE_LINK usage (round 1 already done by cmake, only patch the .o): import argparse import multiprocessing import re +import shutil import subprocess import sys from pathlib import Path @@ -174,9 +177,14 @@ def find_asm_file(search_dir: Path, cpp_stem: str, gpu_arch: str = "") -> Path: def find_obj_file(build_dir: Path, target: str) -> Path: - """Find the most recent .o for the given CMake target.""" + """Find the most recent .o for the given CMake target. + + Uses ``**`` under ``{target}.dir/`` so that multi-config generators + (e.g. Ninja Multi-Config, Visual Studio) whose object files live in a + config subdirectory like ``{target}.dir/Release/`` are also found. + """ candidates = sorted( - build_dir.rglob(f"{target}.dir/*.o"), + build_dir.rglob(f"{target}.dir/**/*.o"), key=lambda p: p.stat().st_mtime, reverse=True, ) if not candidates: @@ -451,12 +459,11 @@ def _clamp_prefetch_region( return (koffset, klength) # ── Forward direction ──────────────────────────────────────────────── - koffset = target - pc_next + offset_bytes + prefetch_base = target_aligned + offset_bytes + koffset = prefetch_base - pc_next if koffset < 0: print(f"[warn] {name[:60]!r}: negative koffset — target before prefetch, skipping") return None - - prefetch_base = target_aligned + offset_bytes region_end = prefetch_base + (klength + 1) * CACHELINE_SIZE if region_end > func_end: needed = max(0, (func_end - prefetch_base + CACHELINE_SIZE - 1) // CACHELINE_SIZE) @@ -1020,8 +1027,8 @@ def main() -> None: ap = argparse.ArgumentParser(description=__doc__) ap.add_argument("--build-dir", required=True, type=Path, help="CMake build directory") ap.add_argument("--target", required=True, help="CMake target to build") - ap.add_argument("--objdump-path", default="/opt/rocm/llvm/bin/llvm-objdump", - help="Path to llvm-objdump (default: /opt/rocm/llvm/bin/llvm-objdump)") + ap.add_argument("--objdump-path", default=None, + help="Path to llvm-objdump (auto-detected from PATH / /opt/rocm if omitted)") ap.add_argument("--objdump-mcpu", default="", help="--mcpu value for llvm-objdump/llvm-mc (auto-detected from .s if omitted)") ap.add_argument("--dry-run", action="store_true", @@ -1043,6 +1050,23 @@ def main() -> None: ap.add_argument("--bundler-path", default="", help=argparse.SUPPRESS) args = ap.parse_args() + # Auto-detect llvm-objdump if not provided. + if args.objdump_path is None: + _candidates = [ + shutil.which("llvm-objdump"), + "/opt/rocm/llvm/bin/llvm-objdump", + ] + for _c in _candidates: + if _c and Path(_c).is_file(): + args.objdump_path = _c + break + if args.objdump_path is None: + sys.exit( + "Cannot find llvm-objdump. Pass --objdump-path explicitly or " + "ensure llvm-objdump is on PATH or installed at /opt/rocm/llvm/bin/." + ) + print(f"[auto] Using llvm-objdump: {args.objdump_path}") + # Log setup. log_path: Path | None if args.log_file is None: