mirror of
https://github.com/kvcache-ai/ktransformers.git
synced 2026-03-15 02:47:22 +00:00
同步0.2.4npu的scripts
This commit is contained in:
44
0.sh
Normal file
44
0.sh
Normal file
@@ -0,0 +1,44 @@
|
||||
#!/bin/bash
# Launch a ktransformers local_chat benchmark on Ascend NPU and tee the
# output to a timestamped log file under ../logs.
#set -ex

# export area — silence ASDOPS/ATB logging and tune NPU runtime behaviour
export ASDOPS_LOG_TO_FILE=0
export ASDOPS_LOG_TO_STDOUT=0
export ASDOPS_LOG_LEVEL=ERROR
export ATB_LOG_TO_FILE=0
export ATB_LOG_TO_STDOUT=0
export ATB_LOG_LEVEL=ERROR
export USE_MERGE=0
# export PROF_DECODE=1
#export PYTORCH_NPU_ALLOC_CONF=expandable_segments:False
export INF_NAN_MODE_FORCE_DISABLE=1
export CAPTURE_PLUGIN_PATH=/home/x30058903/pack/npu_graph
export BLAS_NUM_THREADS=1
export TASK_QUEUE_ENABLE=0

# source area — bring the Ascend toolkit and ATB environments into scope
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh

# global vars
LOG_DIR="../logs"
LOG_NAME="test_cpuinfer49"
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
# FIX: quote expansions so paths with spaces cannot word-split.
mkdir -p "${LOG_DIR}"
LOG_FILE="${LOG_DIR}/${LOG_NAME}_${TIMESTAMP}.log"

# /home/x00656918/600-500.txt
# /home/x00656918/600-500.txt

# torchrun and model area
# NOTE(review): --use_cuda_graph is passed the literal value "True" here, while
# serve_test.sh passes it as a bare flag — confirm which form the CLI expects.
torchrun \
    --master-port 41532 \
    --nproc_per_node 1 \
    -m ktransformers.local_chat \
    --cpu_infer 65 \
    --model_path /home/DeepSeek-V3-q4km-w8a8/ \
    --gguf_path /home/DeepSeek-V3-q4km-w8a8/ \
    --max_new_tokens 200 \
    --use_cuda_graph True \
    --optimize_config_path ./ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-800IA2-npu.yaml \
    2>&1 | tee "${LOG_FILE}"

# --gguf_path /mnt/DeepSeek-R1-BF16/ \
|
||||
|
||||
@@ -29,7 +29,7 @@ pip install -r requirements-local_chat.txt
|
||||
pip install -r ktransformers/server/requirements.txt
|
||||
|
||||
echo "Installing ktransformers"
|
||||
KTRANSFORMERS_FORCE_BUILD=TRUE pip install -v . --no-build-isolation
|
||||
KTRANSFORMERS_FORCE_BUILD=TRUE USE_BALANCE_SERVE=1 pip install -vvv . --no-build-isolation
|
||||
|
||||
if [[ "$DEV_BACKEND" == "cuda" ]]; then
|
||||
echo "Installing custom_flashinfer for CUDA backend"
|
||||
|
||||
122
install_for_npu.sh
Normal file
122
install_for_npu.sh
Normal file
@@ -0,0 +1,122 @@
|
||||
#!/bin/bash
# One-time environment initialization for building ktransformers on Ascend NPU.
# On the first run it prepares ARM-specific files and fetches (or unpacks)
# all third-party sources, then records a .INITIALIZED marker; every run
# finishes by delegating to install.sh.
set -e
source /usr/local/Ascend/ascend-toolkit/set_env.sh

INITIALIZED="FALSE"
ROOT_DIR=$(pwd)

# A .INITIALIZED marker in the working directory means init already ran.
if [ -f ".INITIALIZED" ]; then
    INITIALIZED="TRUE"
fi

case "$INITIALIZED" in
TRUE)
    echo "Detect file .INITIALIZED, will not init again."
    ;;
FALSE)
    echo "Not detect file .INITIALIZED, do init."
    # Detect architecture
    ARCH=$(uname -m)
    IS_ARM=false
    if [[ "$ARCH" == "armv7l" || "$ARCH" == "aarch64" ]]; then
        IS_ARM=true
    fi

    # ARM-specific operations
    if $IS_ARM; then
        echo "Processing ARM architecture specific tasks"

        # Copy ARM specific files
        # cp ./for_arm/CMakeLists.txt ./csrc/ktransformers_ext/CMakeLists.txt
        # cp ./for_arm/iqk_mul_mat.inc ./third_party/llamafile/iqk_mul_mat.inc
        # cp ./for_arm/sgemm.cpp ./third_party/llamafile/sgemm.cpp
        # cp ./for_arm/tinyblas_cpu_sgemm.inc ./third_party/llamafile/tinyblas_cpu_sgemm.inc
        cp ./for_arm/requirements-local_chat.txt ./requirements-local_chat.txt
        cp ./for_arm/setup.py ./setup.py
    fi

    # init third_party
    # clone third_party or unzip third_party file
    third_party_file=""
    third_party="$ROOT_DIR/third_party"
    cd "$third_party"

    # Optional --third-party-file=/-f= argument: a pre-packaged archive of all
    # third-party sources, for machines without network access.
    for i in "$@"
    do
    case $i in
        --third-party-file=*|-f=*)
        third_party_file="${i#*=}"
        shift
        ;;
        *)
        echo "Unknown operation: $i"
        exit 1
        ;;
    esac
    done

    if [ -n "$third_party_file" ]; then
        # Resolve relative archive paths against the repo root, since we have
        # already cd'd into third_party.
        if [[ "$third_party_file" != /* ]]; then
            third_party_file="$ROOT_DIR/$third_party_file"
        fi

        if [ ! -f "$third_party_file" ]; then
            echo "Error: file not found on '$third_party_file'"
            exit 1
        fi

        case "${third_party_file}" in
        *.tar.gz|*.tgz)
            tar -xzf "$third_party_file"
            ;;
        *.zip)
            unzip "$third_party_file"
            ;;
        *)
            echo "Error: unsupported file format '$third_party_file'"
            exit 1
            ;;
        esac
        echo "Finish decompress ${third_party_file}"
    else
        # todo update
        # Clone every third-party dependency and pin it (and its submodules)
        # to a known-good revision. Order within each group matters: clone,
        # init submodules, then checkout each submodule's pin.
        git clone https://github.com/kvcache-ai/custom_flashinfer.git -b fix-precision-mla-merge-main && cd custom_flashinfer && git checkout fd94393f
        git submodule init && git submodule update && cd 3rdparty
        cd composable_kernels && git checkout 5055b3bd && cd ..
        cd cutlass && git checkout cc3c29a8 && cd ..
        cd googletest && git checkout 5a37b517 && cd ..
        cd mscclpp && git checkout v0.5.1 && cd ..
        cd nvbench && git checkout 555d628e && cd ..
        cd spdlog && git checkout v1.x && cd ..
        cd "$third_party"

        git clone https://github.com/ggerganov/llama.cpp.git -b master && cd llama.cpp && git checkout b3173
        git submodule init && git submodule update
        cd kompute && git checkout 4565194e && cd ..
        cd "$third_party"

        git clone https://github.com/jupp0r/prometheus-cpp -b master && cd prometheus-cpp && git checkout f13cdd05
        git submodule init && git submodule update && cd 3rdparty
        cd civetweb && git checkout v1.16 && cd ..
        cd googletest && git checkout release-1.11.0 && cd ..
        cd "$third_party"

        git clone https://github.com/pybind/pybind11.git -b master && cd pybind11 && git checkout bb05e081 && cd ..
        git clone https://github.com/gabime/spdlog.git -b v1.x && cd spdlog && git checkout v1.15.2 && cd ..
        git clone https://github.com/Cyan4973/xxHash.git -b dev && cd xxHash && git checkout 953a09ab && cd ..

        echo "Finish clone and checkout third_party"
    fi

    cd "$ROOT_DIR"
    # Marker file so subsequent runs skip the whole FALSE branch.
    touch ./.INITIALIZED
    ;;
*)
    echo "Error"
    exit 1
    ;;
esac

cd "$ROOT_DIR"
# Strip CRLF line endings (repo may have been checked out on Windows),
# then run the actual build/install.
sed -i 's/\r$//' ./install.sh
bash ./install.sh
|
||||
@@ -6,7 +6,7 @@ import sys
|
||||
# sys.path.insert(0, "/home/azure/ktransformers")
|
||||
import argparse
|
||||
import torch
|
||||
from ktransformers.util.custom_loader import GGUFLoader, translate_name_to_gguf
|
||||
from ktransformers.util.custom_gguf import GGUFLoader, translate_name_to_gguf
|
||||
from safetensors import safe_open
|
||||
from safetensors.torch import save_file
|
||||
import re
|
||||
|
||||
62
scripts/test_curl.sh
Normal file
62
scripts/test_curl.sh
Normal file
@@ -0,0 +1,62 @@
|
||||
#!/bin/bash
# Fire several concurrent chat-completion requests at a locally running
# ktransformers server (port 10068), staggered 1 s apart, to exercise
# concurrent serving. Each request runs in the background.

#curl -X 'POST' 'http://localhost:10068/v1/chat/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
#  "messages": [
#    {
#      "content": "Give me a clear and full explanation about the history of ThanksGiving.",
#      "role": "user"
#    }
#  ],
#  "model": "DeepSeek-Coder-V2-Instruct",
#  "stream": false
#}' &
#
#sleep 1

curl -X 'POST' 'http://localhost:10068/v1/chat/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
  "messages": [
    {
      "content": "Who are you?",
      "role": "user"
    }
  ],
  "model": "DeepSeek-Coder-V2-Instruct",
  "stream": false
}' &

sleep 1

curl -X 'POST' 'http://localhost:10068/v1/chat/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
  "messages": [
    {
      "content": "Where is Beijing?",
      "role": "user"
    }
  ],
  "model": "DeepSeek-Coder-V2-Instruct",
  "stream": false
}' &

sleep 1

curl -X 'POST' 'http://localhost:10068/v1/chat/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
  "messages": [
    {
      "content": "What you do?",
      "role": "user"
    }
  ],
  "model": "DeepSeek-Coder-V2-Instruct",
  "stream": false
}' &

sleep 1

curl -X 'POST' 'http://localhost:10068/v1/chat/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{
  "messages": [
    {
      "content": "Where is Beijing?",
      "role": "user"
    }
  ],
  "model": "DeepSeek-Coder-V2-Instruct",
  "stream": false
}' &

# FIX: without this the script exits immediately after launching the last
# request, orphaning the backgrounded curls; wait for all of them to finish.
wait
|
||||
163
scripts/test_generator.py
Normal file
163
scripts/test_generator.py
Normal file
@@ -0,0 +1,163 @@
|
||||
import os
|
||||
import re
|
||||
import socket
|
||||
import random
|
||||
from copy import deepcopy
|
||||
from typing import Sequence
|
||||
|
||||
import yaml
|
||||
|
||||
|
||||
def get_sh_body():
    """Return the shell-script template for one generated test run.

    The ``{placeholder}`` tokens ({export_area}, {source_area},
    {log_file_name}, {master_port}, {torchrun_area}, {model_area}) are later
    substituted via ``str.replace`` — NOT ``str.format`` — so the literal
    shell ``${VAR}`` expansions in the template are left untouched.
    """
    # Raw string: the trailing backslashes are shell line continuations and
    # must survive into the generated script verbatim.
    sh = r'''#!/bin/bash
set -ex

# export area
{export_area}

# source area
{source_area}

# global vars
LOG_DIR="../logs"
{log_file_name}
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
mkdir -p ${LOG_DIR}
LOG_FILE="${LOG_DIR}/${LOG_NAME}_${TIMESTAMP}.log"

# torchrun and model area
torchrun \
    --master-port {master_port} \
{torchrun_area}
{model_area}
    2>&1 | tee "${LOG_FILE}"'''
    return sh
|
||||
|
||||
|
||||
def add_static_pattern_seq(sh: str, data: Sequence, pattern_name: str, replace_line: str):
    """Replace pattern_name in sh with one formatted line per element of data."""
    rendered = "\n".join(replace_line.format(item) for item in data)
    return sh.replace(pattern_name, rendered)


def add_static_pattern_dict(sh: str, data: dict, pattern_name: str, replace_line: str):
    """Replace pattern_name in sh with one formatted line per (key, value) pair."""
    rendered = "\n".join(replace_line.format(k, v) for k, v in data.items())
    return sh.replace(pattern_name, rendered)


def add_export(sh, data: dict):
    """Fill {export_area} with `export KEY=VALUE` lines."""
    return add_static_pattern_dict(sh, data, "{export_area}", "export {0}={1}")


def add_source(sh, data: Sequence):
    """Fill {source_area} with `source <path>` lines."""
    return add_static_pattern_seq(sh, data, "{source_area}", "source {0}")


def add_torchrun(sh, data):
    """Fill {torchrun_area} with long (--opt) then short (-opt) torchrun flags."""
    # Passing the pattern name itself as the template string makes the helper
    # return just the rendered lines ("long".replace("long", lines) == lines).
    rendered_long = add_static_pattern_dict("long", data["long"], "long", "    --{0} {1} \\")
    rendered_short = add_static_pattern_dict("short", data["short"], "short", "    -{0} {1} \\")
    return sh.replace("{torchrun_area}", "\n".join((rendered_long, rendered_short)))


def add_model(sh, data: dict):
    """Fill {model_area} with `--option value \\` model arguments."""
    return add_static_pattern_dict(sh, data, "{model_area}", "    --{0} {1} \\")
|
||||
|
||||
|
||||
def get_valid_file_name_sequence(s: str) -> str:
    """Strip s down to its alphanumeric characters only."""
    return "".join(filter(str.isalnum, s))


def get_valid_file_name_lines(hyper):
    """Build a filesystem-safe test name from the required hyperparam keys.

    Raises ValueError when a required key is missing from `hyper`.
    """
    keys_required = ("cpu_infer", )
    if any(key not in hyper for key in keys_required):
        raise ValueError(f"{', '.join(keys_required)} should be in hyperparams of generator.py to generate file name.")
    parts = [get_valid_file_name_sequence(f"{key}{hyper[key]}") for key in keys_required]
    return "test_" + "_".join(parts)


def add_log_file(sh, hyper):
    """Fill {log_file_name} with a LOG_NAME assignment derived from `hyper`."""
    log_name = get_valid_file_name_lines(hyper)
    return sh.replace("{log_file_name}", "LOG_NAME=\"" + log_name + "\"")
|
||||
|
||||
|
||||
def is_available_port(port):
    """Return True if TCP `port` can be bound on 127.0.0.1, False if in use.

    Any bind error other than "address in use" is re-raised.
    """
    import errno  # local import keeps the module's import block unchanged
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        try:
            s.bind(("127.0.0.1", port))
            return True
        except socket.error as e:
            # FIX: the original compared against the raw Linux errno 98;
            # errno.EADDRINUSE is the portable spelling of the same check.
            if e.errno == errno.EADDRINUSE:
                return False
            raise e


def add_available_port(sh, start=1024, end=65535):
    """Fill {master_port} with a randomly chosen, currently-free TCP port.

    Note: the port is only checked at generation time; it may be taken by
    the time the generated script actually runs.
    """
    while True:
        port = random.randint(start, end)
        if is_available_port(port):
            return sh.replace("{master_port}", f"{port}")
|
||||
|
||||
|
||||
def get_all_hyper_keys(hypers):
    """Return the union of keys used across all hyperparameter dicts."""
    keys = set()
    for hyper in hypers:
        for key in hyper:
            keys.add(key)
    return keys


def replace_hyper(sh: str, hyper, all_hyper_keys):
    """Substitute ``$key`` placeholders in the generated script.

    For each line containing a ``$key`` token:
      - key in `hyper`: substitute its value and keep the line;
      - key known to some other hyper set (in `all_hyper_keys`): drop the
        line silently — this combination simply does not use that option;
      - unknown key: drop the line and print a warning.
    Also warns about entries of `hyper` that were never used.
    """
    lines = []
    hyper_not_used = set(hyper.keys())
    for line in sh.split("\n"):
        # FIX: use a raw string — "\$" in a plain literal is an invalid
        # escape sequence (SyntaxWarning on modern Python).
        match = re.search(r"\$([a-zA-Z0-9_]+)", line)
        if not match:
            lines.append(line)
            continue

        key = match.group(1)
        if key in hyper:
            hyper_not_used.remove(key)
            lines.append(line.replace(f"${key}", str(hyper[key])))
        elif key not in all_hyper_keys:
            print(f"[WARNING] `{key}` not in hyperparams, will skip generating the line.")

    if hyper_not_used:
        print(f"[WARNING] The following hyperparams are not used: {','.join(hyper_not_used)}")
    return "\n".join(lines)
|
||||
|
||||
|
||||
def load_yaml_file(config_path):
    """Parse the YAML file at config_path; raise FileNotFoundError when absent."""
    if os.path.exists(config_path):
        with open(config_path) as handle:
            return yaml.safe_load(handle)
    raise FileNotFoundError("Failed to find config file on {}".format(config_path))


def save_sh_file(sh, fname):
    """Write the generated script to fname and mark it r-x for owner/group."""
    with open(fname, "w") as handle:
        handle.write(sh)
    os.chmod(fname, 0o550)
    print("Generate file: ", fname)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
root_dir = os.path.abspath(os.path.dirname(__file__)) + "/"
|
||||
y = load_yaml_file(root_dir + "test_generator.yaml")
|
||||
hypers = y["hyperparams"]
|
||||
all_hyper_keys = get_all_hyper_keys(hypers)
|
||||
test_cnt = len(hypers)
|
||||
|
||||
sh = get_sh_body()
|
||||
sh = add_export(sh, y["export_area"])
|
||||
sh = add_source(sh, y["source_area"])
|
||||
sh = add_torchrun(sh, y["torchrun_area"])
|
||||
sh = add_model(sh, y["model_area"])
|
||||
|
||||
for hyper in hypers:
|
||||
sh_ = deepcopy(sh)
|
||||
sh_ = add_available_port(sh_)
|
||||
sh_ = add_log_file(sh_, hyper)
|
||||
sh_ = replace_hyper(sh_, hyper, all_hyper_keys)
|
||||
fname = root_dir + get_valid_file_name_lines(hyper) + ".sh"
|
||||
save_sh_file(sh_, fname)
|
||||
41
scripts/test_generator.yaml
Normal file
41
scripts/test_generator.yaml
Normal file
@@ -0,0 +1,41 @@
|
||||
global_vars:
|
||||
q8: &q8 /home/dataset/int8
|
||||
q4: &q4 /home/dataset/int4
|
||||
w8a8: &w8a8 /home/dataset/w8a8
|
||||
ascend_toolkit: &toolkit /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||
atb: &atb /home/ascend/ascend-transformer-boost/output/atb/set_env.sh
|
||||
optimize_config_path: &opt_cfg ./ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-800IA2-npu.yaml
|
||||
|
||||
hyperparams:
|
||||
- cpu_infer: 49
|
||||
w8a8_safetensor_path: *w8a8
|
||||
|
||||
- cpu_infer: 37
|
||||
|
||||
export_area:
|
||||
ASDOPS_LOG_TO_FILE: 0
|
||||
ASDOPS_LOG_TO_STDOUT: 0
|
||||
ASDOPS_LOG_LEVEL: ERROR
|
||||
ATB_LOG_TO_FILE: 0
|
||||
ATB_LOG_TO_STDOUT: 0
|
||||
ATB_LOG_LEVEL: ERROR
|
||||
USE_MERGE: 0
|
||||
|
||||
source_area:
|
||||
- *toolkit
|
||||
- *atb
|
||||
|
||||
torchrun_area:
|
||||
long:
|
||||
nproc_per_node: 1
|
||||
short:
|
||||
m: ktransformers.local_chat
|
||||
|
||||
model_area:
|
||||
cpu_infer: $cpu_infer
|
||||
model_path: $model_path
|
||||
gguf_path: *q8
|
||||
q4_gguf_path: *q4
|
||||
w8a8_safetensor_path: $w8a8_safetensor_path
|
||||
max_new_tokens: 50
|
||||
optimize_config_path: *opt_cfg
|
||||
332
scripts/test_runner.py
Normal file
332
scripts/test_runner.py
Normal file
@@ -0,0 +1,332 @@
|
||||
import os
|
||||
import select
|
||||
import subprocess
|
||||
import time
|
||||
from functools import partial
|
||||
from types import SimpleNamespace
|
||||
from typing import Sequence, Tuple, Dict, Union
|
||||
|
||||
import psutil
|
||||
import yaml
|
||||
|
||||
|
||||
def load_yaml_file(config_path):
    """Parse the runner YAML config; raise FileNotFoundError when absent."""
    if os.path.exists(config_path):
        with open(config_path) as handle:
            return yaml.safe_load(handle)
    raise FileNotFoundError('[Runner] Failed to find config file on {}'.format(config_path))
|
||||
|
||||
|
||||
def store_output_stat(line: str, store: dict[str, list[str]]) -> bool:
    """File `line` into the first stat bucket whose key occurs in its head.

    Only the first 30 characters are searched, so stat keys must appear at
    the start of the line. Returns True when the line was stored.
    """
    head = line[:30]
    for stat_key, bucket in store.items():
        if stat_key in head:
            bucket.append(line)
            return True
    return False
|
||||
|
||||
|
||||
def get_file_line_cnt(line):
    """Count non-empty lines of the file at path `line`; default to 1.

    `line` is either a path to a prompt file (one prompt per line) or a
    literal one-line prompt, hence the fallback count of 1.
    """
    if os.path.isfile(line):
        with open(line, 'r') as f:
            # FIX: the original generator variable shadowed the `line`
            # parameter; use a distinct name.
            non_empty_lines = sum(1 for row in f if row.strip())
    else:
        non_empty_lines = 1
        print('[Runner] Use default line count: 1')
    return non_empty_lines
|
||||
|
||||
|
||||
def print_script_running(proc, script, args, script_id) -> tuple[bool, str | dict] | None:
    """Drive one test-script subprocess to completion and collect its stats.

    Streams `proc`'s stdout non-blockingly, feeds the test prompt when the
    script's interactive 'Chat:' prompt appears, and collects every line
    whose head matches one of args.stat_keys.

    Returns (True, {stat_key: [lines]}) once line_cnt * len(stat_keys) stat
    lines are collected, or (False, reason) on timeout / traceback / NPU OOM.
    """
    is_print_lines = args.is_print_lines
    print_interval = args.print_interval
    test_input = script.get('test_input', 'Hello, world.')
    timeout = script.get('timeout', 20000) # sec

    # One stat line per key is expected for every prompt line of the input file.
    line_cnt = get_file_line_cnt(test_input)
    max_stat_count = line_cnt * len(args.stat_keys)
    print(f'[Runner] The script will exit after the stat info in the {max_stat_count} line is collected.')
    stat_count = 0
    success_collect = False
    test_output = {key: [] for key in args.stat_keys}

    # After a failure marker is seen, keep reading for failure_wait_time
    # seconds so trailing traceback output is still echoed before returning.
    failure_start_time = 0
    failure_wait_time = 3
    failure_ret_info = ''

    buffer = ''
    prompt_detected = False
    last_status_time = start_time = time.time()

    # Non-blocking stdout so the polling loop below never stalls on read.
    os.set_blocking(proc.stdout.fileno(), False)

    read_chunk = partial(os.read, proc.stdout.fileno(), 4096)

    def add_os_linesep(text):
        # Ensure the prompt is newline-terminated so the child sees a full line.
        if not text.endswith(os.linesep):
            text += os.linesep
        return text

    def work_on_detected():
        # The script printed its interactive prompt: send the test input once.
        nonlocal prompt_detected
        print(f'[Runner] Chat detected')
        proc.stdin.write(add_os_linesep(test_input))
        proc.stdin.flush()
        prompt_detected = True

    def update_buffer():
        # Drain everything currently available on the pipe into `buffer`.
        try:
            nonlocal buffer
            while True:
                data = read_chunk().decode('utf-8', errors='replace')
                if not data:
                    break
                buffer += data
        except BlockingIOError:
            pass # Buffer is empty

    def process_lines(lines):
        # Scan each complete line for failure markers, stat lines and the prompt.
        nonlocal failure_start_time, failure_ret_info, stat_count, is_print_lines
        for line in lines:
            if is_print_lines:
                print(line)

            # First traceback wins; force line echoing on so the rest of the
            # traceback (read during failure_wait_time) is visible.
            if 'Traceback' in line and failure_ret_info == '':
                failure_start_time = cur_time
                failure_ret_info = f'Detect traceback, exiting...'
                print(f'[Runner] Detect traceback')
                print(line)
                is_print_lines = True

            if 'NPU out of memory' in line:
                failure_ret_info = f'Detect NPU OOM, exiting...'

            if store_output_stat(line, test_output):
                stat_count += 1

            if stat_count >= max_stat_count:
                nonlocal success_collect
                success_collect = True

            if not prompt_detected and 'Chat:' in line:
                work_on_detected()

    def process_incomplete_line(line):
        # The prompt has no trailing newline, so it only ever shows up as the
        # incomplete tail of the buffer.
        if not prompt_detected and 'Chat:' in line:
            if is_print_lines:
                print(line)
            work_on_detected()
            line = ''
        return line

    while True:
        cur_time = time.time()
        cur_running_time = int(cur_time - start_time)
        # Periodic keep-alive message while the script runs quietly.
        if cur_time - last_status_time > print_interval:
            print(f'[Runner] Script still running (elapsed: {cur_running_time}s), task id: {script_id}')
            last_status_time = cur_time

        if cur_time - start_time > timeout:
            print(f'[Runner] Timeout, exiting...')
            return False, f'Script execution timeout after {cur_running_time}s'

        if success_collect:
            return True, test_output

        # Failure return is delayed by failure_wait_time (see above).
        if failure_ret_info and cur_time - failure_start_time >= failure_wait_time:
            return False, failure_ret_info

        rlist, _, _ = select.select([proc.stdout], [], [], 0.2)

        if rlist:
            update_buffer()

        if buffer:
            lines = buffer.split('\n')

            # All elements but the last are complete lines; the last is the
            # (possibly empty) incomplete tail carried into the next iteration.
            process_lines(lines[:-1])

            buffer = process_incomplete_line(lines[-1])
|
||||
|
||||
|
||||
def kill_process(p):
    """Kill one psutil process; zombies are reaped (non-blocking) instead.

    Processes that vanished or are inaccessible are silently ignored.
    """
    try:
        if p.status() != psutil.STATUS_ZOMBIE:
            p.kill()
        else:
            print(f'[Runner] Found zombie process {p.pid}, waiting parent to reap it')
            os.waitpid(p.pid, os.WNOHANG)
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        pass
|
||||
|
||||
|
||||
def kill_subprocess(args):
    """Kill every descendant process of this runner, retrying up to 10 times.

    Sleeps args.sleep_time between rounds; raises RuntimeError when children
    still remain after the final round.
    """
    me = psutil.Process(os.getpid())

    for _ in range(10):
        descendants = me.children(recursive=True)
        if not descendants:
            print('[Runner] Script exiting successfully.')
            time.sleep(2)
            return

        print(f'[Runner] Killing {len(descendants)} subprocesses...')

        for child in descendants:
            kill_process(child)

        time.sleep(args.sleep_time)

    raise RuntimeError('[Runner] Subprocess exited unsuccessfully!')
|
||||
|
||||
|
||||
def script_runner(script: dict[str, str], args, script_id) -> tuple[bool, Union[str, dict | Sequence]]:
    """Run one configured test script, retrying on failure.

    Launches the script under a clean bash (--noprofile --norc), drives it via
    print_script_running, and always reaps leftover child processes between
    attempts. Returns (True, collected_stats) on success or
    (False, [failure messages]) after max_retries failed attempts (or after
    the first failure when args.stop_on_failure is set).
    """
    max_retries = args.max_retries
    script_path = script['path']
    # Relative script paths are resolved against the runner's own directory.
    if not os.path.isabs(script_path):
        script_path = os.path.join(args.runner_path, script_path)
    retries = 0
    ret_failure = []

    while retries < max_retries:
        print(f'[Runner] Running: {script_path}' + ('' if retries == 0 else f'(attempt {retries + 1}/{max_retries})'))
        # text=True: pipes carry str, matching the string handling in
        # print_script_running; stderr is folded into stdout.
        proc = subprocess.Popen(
            ['/bin/bash', '--noprofile', '--norc', script_path],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            bufsize=4096,
            text=True
        )

        stat, ret = print_script_running(proc, script, args, script_id)

        proc.stdin.close()
        proc.stdout.close()
        proc.wait()

        # Torchrun and friends may leave workers behind; reap them before
        # deciding whether to retry.
        kill_subprocess(args)

        if stat:
            return True, ret

        if not isinstance(ret, str):
            raise AssertionError('ret must be a string when run failure.')
        ret_failure.append(ret)
        # Classify the failure for the console log.
        if 'Script execution timeout after' in ret:
            print('[Runner] Timeout.')
        elif 'Detect traceback, exiting...' in ret:
            print('[Runner] Get traceback.')
        elif 'Detect NPU OOM, exiting...' in ret:
            print('[Runner] Get NPU OOM.')
        else:
            print(f'[Runner] Get error: {ret}')

        if args.stop_on_failure:
            return False, ret_failure

        retries += 1
        if retries < max_retries:
            continue

        return False, ret_failure

    # Only reachable when the while body never ran, i.e. max_retries <= 0.
    return False, ('[RunnerError] max_retries must greater than 0.',)
|
||||
|
||||
|
||||
def print_centered_summary(text, char='=', width=60):
|
||||
if len(text) >= width - 10 - 2:
|
||||
print(f'\n{char * 5} {text} {char * 5}\n', end='')
|
||||
else:
|
||||
padding = (width - 2 - len(text)) // 2
|
||||
left_padding = char * padding
|
||||
right_padding = char * (width - len(text) - padding)
|
||||
print(f'\n{left_padding} {text} {right_padding}\n', end='')
|
||||
|
||||
|
||||
def print_success_output(output):
|
||||
for script_name, vals in output.items():
|
||||
print_centered_summary(script_name, '-')
|
||||
if len(vals) == 0:
|
||||
return
|
||||
len_val = len(next(iter(vals.values())))
|
||||
for idx in range(len_val):
|
||||
for val in vals.values():
|
||||
print(val[idx], end='\n')
|
||||
print()
|
||||
|
||||
|
||||
def print_failure_output(output):
|
||||
for script_name, vals in output.items():
|
||||
print_centered_summary(script_name, '-')
|
||||
if len(vals) == 0:
|
||||
return
|
||||
for val in vals:
|
||||
print(val, end='\n')
|
||||
print()
|
||||
|
||||
|
||||
def make_titles(scripts):
    """Build "<index> <name>" titles, marking repeated names with " (k)"."""
    occurrences = {}
    titles = []
    for position, item in enumerate(scripts, start=1):
        name = item['name']
        occurrences[name] = occurrences.get(name, 0) + 1
        suffix = '' if occurrences[name] == 1 else f' ({occurrences[name]})'
        titles.append(f'{position} {name}{suffix}')
    return titles
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Resolve paths relative to this script: the runner dir holds the config
    # and generated test scripts; root is the repo checkout one level up.
    runner_path = os.path.abspath(os.path.dirname(__file__)) + '/'
    root_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) + '/'

    config = load_yaml_file(runner_path + 'test_runner.yaml')
    settings = config['settings']
    scripts = config['scripts']
    stat_keys = config['stat_keys']
    titles = make_titles(scripts)
    if len(scripts) == 0:
        print('[Runner] No scripts defined.')
        exit(1)

    # Flatten settings (with defaults) into a namespace passed to all helpers.
    args = SimpleNamespace()
    args.stat_keys = stat_keys
    args.max_retries = settings.get('max_retries', 3)
    args.sleep_time = settings.get('sleep_time', 30) # second
    args.is_print_lines = settings.get('is_print_lines', True)
    args.print_interval = settings.get('print_interval', 30) # second
    args.stop_on_failure = settings.get('stop_on_failure', False)
    args.runner_path = runner_path
    args.root_path = root_path

    # Run every script, bucketing results by title for the final summary.
    success_output = {}
    failure_output = {}
    try:
        for i, script in enumerate(scripts):
            stat, ret = script_runner(script, args, i + 1)
            if stat:
                success_output[titles[i]] = ret
            else:
                failure_output[titles[i]] = ret
            if args.stop_on_failure and not stat:
                print(f'[Runner] Running {titles[i]} failed. Exit early now because stop_on_failure is set to True.')
                break
    except Exception as e:
        # Runner-originated errors ('[Runner...' prefix) are swallowed so the
        # partial summary below still prints; anything else propagates.
        if '[Runner' in e.__str__():
            print(f'{e.__str__()} \n[Runner] Detect error, the collected information will be print.')
        else:
            raise e

    print_centered_summary('Summary Begin')
    print_success_output(success_output)
    print_failure_output(failure_output)
    success_failure_stat = []
    if len(success_output) > 0:
        success_failure_stat.append(f'{len(success_output)} scripts execute success')
    if len(failure_output) > 0:
        success_failure_stat.append(f'{len(failure_output)} scripts execute failure')
    print(', '.join(success_failure_stat) + '.')
    print_centered_summary('Summary End')
|
||||
27
scripts/test_runner.yaml
Normal file
27
scripts/test_runner.yaml
Normal file
@@ -0,0 +1,27 @@
|
||||
global_args:
|
||||
8k1k: &8k1k /home/prompts/8k1k.txt
|
||||
|
||||
settings:
|
||||
max_retries: 3
|
||||
stop_on_failure: false
|
||||
is_print_lines: true
|
||||
sleep_time: 8
|
||||
|
||||
stat_keys:
|
||||
- 'prompt eval count'
|
||||
- 'prompt eval duration'
|
||||
- 'prompt eval rate'
|
||||
- 'eval count' # 'eval count' is a substring of 'prompt eval count',
|
||||
- 'eval duration' # so longer keys are listed first and match before their shorter substrings
|
||||
- 'eval rate'
|
||||
|
||||
scripts:
|
||||
- name: 'test_cpuinfer37.sh'
|
||||
path: 'test_cpuinfer37.sh'
|
||||
test_input: *8k1k
|
||||
timeout: 20000
|
||||
|
||||
- name: 'test_cpuinfer49.sh'
|
||||
path: 'test_cpuinfer49.sh'
|
||||
test_input: *8k1k
|
||||
timeout: 20000
|
||||
63
serve_test.sh
Normal file
63
serve_test.sh
Normal file
@@ -0,0 +1,63 @@
|
||||
#!/bin/bash
# Start the ktransformers balance-serve server on Ascend NPU and tee its
# output to a timestamped log file under ../logs.
#set -ex

# export area — silence ASDOPS/ATB logging and tune NPU runtime behaviour
export ASDOPS_LOG_TO_FILE=0
export ASDOPS_LOG_TO_STDOUT=0
export ASDOPS_LOG_LEVEL=ERROR
export ATB_LOG_TO_FILE=0
export ATB_LOG_TO_STDOUT=0
export ATB_LOG_LEVEL=ERROR
export USE_MERGE=0
#export PROF_DECODE=1
#export PYTORCH_NPU_ALLOC_CONF=expandable_segments:False
export INF_NAN_MODE_FORCE_DISABLE=1
export CAPTURE_PLUGIN_PATH=/home/x30058903/pack/npu_graph
export BLAS_NUM_THREADS=1
# export TASK_QUEUE_ENABLE=0

# source area — bring the Ascend toolkit and ATB environments into scope
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh

# global vars
LOG_DIR="../logs"
LOG_NAME="test_cpuinfer49"
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
mkdir -p "${LOG_DIR}"
LOG_FILE="${LOG_DIR}/${LOG_NAME}_${TIMESTAMP}.log"

# torchrun \
#     --master-port 41532 \
#     --nproc_per_node 1 \
#     -m ktransformers.server.main \
#     --cpu_infer 65 \
#     --model_path /home/mount/DeepSeek-R1-q4km-w8a8 \
#     --gguf_path /home/mount/DeepSeek-R1-q4km-w8a8 \
#     --max_new_tokens 200 \
#     --use_cuda_graph \
#     --optimize_config_path ./ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-800IA2-npu.yaml \
#     2>&1 | tee "${LOG_FILE}"

# NOTE(review): "combime_test" looks like a typo for "combine_test"; kept as-is
# since other tooling may expect this log name.
LOG_DIR="../logs"
WS=1
CPUINFER=65
LOG_NAME="combime_test"
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
mkdir -p "$LOG_DIR"
LOG_FILE="${LOG_DIR}/${LOG_NAME}_${TIMESTAMP}.log"
echo "${LOG_FILE}"
# FIX: the original ended with ">&1 | tee", which is a no-op redirect of
# stdout to itself — stderr never reached the log. "2>&1" merges stderr
# into stdout before the pipe, as the commented-out invocation above does.
torchrun --nproc_per_node "$WS" \
    --master_port 6685 -m ktransformers.server.main \
    --cpu_infer "$CPUINFER" \
    --batch_size 1 \
    --chunk_size 16384 \
    --model_path /home/mount/DeepSeek-R1-q4km-w8a8 \
    --gguf_path /home/mount/DeepSeek-R1-q4km-w8a8 \
    --optimize_config_path ./ktransformers/optimize/optimize_rules/DeepSeek-V3-Chat-800IA2-npu.yaml \
    --port 10014 \
    --force_think \
    --use_cuda_graph \
    --max_new_tokens 2048 2>&1 | tee "$LOG_FILE"

# --gguf_path /mnt/DeepSeek-R1-BF16/ \
|
||||
|
||||
60
setup.py
60
setup.py
@@ -41,6 +41,13 @@ except ImportError:
|
||||
MUSA_HOME=None
|
||||
KTRANSFORMERS_BUILD_XPU = torch.xpu.is_available()
|
||||
|
||||
|
||||
try:
|
||||
import torch_npu
|
||||
KTRANSFORMERS_BUILD_NPU = torch_npu.npu.is_available()
|
||||
except:
|
||||
KTRANSFORMERS_BUILD_NPU = False
|
||||
|
||||
# 检测 DEV_BACKEND 环境变量
|
||||
dev_backend = os.environ.get("DEV_BACKEND", "").lower()
|
||||
if dev_backend == "xpu":
|
||||
@@ -179,6 +186,8 @@ class VersionInfo:
|
||||
else:
|
||||
print("Using native cpu instruct")
|
||||
if sys.platform.startswith("linux"):
|
||||
if KTRANSFORMERS_BUILD_NPU:
|
||||
return 'aarch64'
|
||||
with open('/proc/cpuinfo', 'r', encoding="utf-8") as cpu_f:
|
||||
cpuinfo = cpu_f.read()
|
||||
flags_line = [line for line in cpuinfo.split(
|
||||
@@ -237,6 +246,8 @@ class VersionInfo:
|
||||
backend_version = f"rocm{self.get_rocm_bare_metal_version(ROCM_HOME)}"
|
||||
elif torch.xpu.is_available():
|
||||
backend_version = f"xpu"
|
||||
elif KTRANSFORMERS_BUILD_NPU:
|
||||
backend_version = f"npu{torch_npu.__version__}"
|
||||
else:
|
||||
raise ValueError("Unsupported backend: CUDA_HOME MUSA_HOME ROCM_HOME all not set and XPU is not available.")
|
||||
package_version = f"{flash_version}+{backend_version}torch{torch_version}{cpu_instruct}"
|
||||
@@ -509,6 +520,8 @@ class CMakeBuild(BuildExtension):
|
||||
cmake_args += ["-DKTRANSFORMERS_USE_ROCM=ON"]
|
||||
elif KTRANSFORMERS_BUILD_XPU:
|
||||
cmake_args += ["-DKTRANSFORMERS_USE_XPU=ON", "-DKTRANSFORMERS_USE_CUDA=OFF"]
|
||||
elif KTRANSFORMERS_BUILD_NPU:
|
||||
cmake_args += ["-DKTRANSFORMERS_USE_NPU=ON", "-DKTRANSFORMERS_USE_CUDA=OFF"]
|
||||
else:
|
||||
raise ValueError("Unsupported backend: CUDA_HOME, MUSA_HOME, and ROCM_HOME are not set and XPU is not available.")
|
||||
|
||||
@@ -636,10 +649,12 @@ elif MUSA_HOME is not None:
|
||||
)
|
||||
elif torch.xpu.is_available(): #XPUExtension is not available now.
|
||||
ops_module = None
|
||||
elif KTRANSFORMERS_BUILD_NPU:
|
||||
pass
|
||||
else:
|
||||
raise ValueError("Unsupported backend: CUDA_HOME ROCM_HOME MUSA_HOME are not set and XPU is not available.")
|
||||
|
||||
if not torch.xpu.is_available():
|
||||
if not torch.xpu.is_available() and not KTRANSFORMERS_BUILD_NPU:
|
||||
ext_modules = [
|
||||
CMakeExtension("cpuinfer_ext", os.fspath(Path("").resolve() / "csrc" / "ktransformers_ext")),
|
||||
ops_module,
|
||||
@@ -660,15 +675,42 @@ if not torch.xpu.is_available():
|
||||
ext_modules.append(
|
||||
CMakeExtension("balance_serve", os.fspath(Path("").resolve()/ "csrc"/ "balance_serve"))
|
||||
)
|
||||
else:
|
||||
|
||||
setup(
|
||||
name=VersionInfo.PACKAGE_NAME,
|
||||
version=VersionInfo().get_package_version(),
|
||||
install_requires=triton_dep,
|
||||
cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
|
||||
ext_modules=ext_modules
|
||||
)
|
||||
|
||||
|
||||
|
||||
elif torch.xpu.is_available():
|
||||
ext_modules = [
|
||||
CMakeExtension("cpuinfer_ext", os.fspath(Path("").resolve() / "csrc" / "ktransformers_ext")),
|
||||
]
|
||||
setup(
|
||||
name=VersionInfo.PACKAGE_NAME,
|
||||
version=VersionInfo().get_package_version(),
|
||||
install_requires=triton_dep,
|
||||
cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
|
||||
ext_modules=ext_modules
|
||||
)
|
||||
|
||||
setup(
|
||||
name=VersionInfo.PACKAGE_NAME,
|
||||
version=VersionInfo().get_package_version(),
|
||||
install_requires=triton_dep,
|
||||
cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
|
||||
ext_modules=ext_modules
|
||||
)
|
||||
elif KTRANSFORMERS_BUILD_NPU:
|
||||
ext_modules = [
|
||||
CMakeExtension("cpuinfer_ext", os.fspath(Path("").resolve() / "csrc" / "ktransformers_ext")),
|
||||
]
|
||||
if with_balance:
|
||||
print("using balance_serve")
|
||||
ext_modules.append(
|
||||
CMakeExtension("balance_serve", os.fspath(Path("").resolve()/ "csrc"/ "balance_serve"))
|
||||
)
|
||||
|
||||
setup(
|
||||
name=VersionInfo.PACKAGE_NAME,
|
||||
version=VersionInfo().get_package_version(),
|
||||
cmdclass={"bdist_wheel":BuildWheelsCommand ,"build_ext": CMakeBuild},
|
||||
ext_modules=ext_modules
|
||||
)
|
||||
Reference in New Issue
Block a user