"""Load DeepSeek-R1 GGUF weights into kt_kernel and run an interactive CPU chat loop."""
import os, sys
import time

os.environ["BLAS_NUM_THREADS"] = "1"
sys.path.insert(0, os.path.dirname(__file__) + "/../build")

from kt_kernel import kt_kernel_ext
from kt_kernel_ext.kvcache import ggml_type
import torch
import logging
import json
from pathlib import Path
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForCausalLM,
    GenerationConfig,
    TextStreamer,
)
from gguf.gguf_reader import GGUFReader

logger = logging.getLogger("reader")

# Number of decoder layers to load; None loads every layer from config.json.
# load_layers = 6
load_layers = None

CPUInfer = kt_kernel_ext.CPUInfer(304)
max_qlen = 4096
max_kvlen = 4096
page_size = 256
pages_count = 200


def read_gguf_file(gguf_file_path):
    """
    Reads key-value pairs and tensor information from a GGUF file and returns its tensors.

    Parameters:
    - gguf_file_path: Path to the GGUF file.
    """
    reader = GGUFReader(gguf_file_path)

    # List all key-value pairs in a columnized format
    # print("Key-Value Pairs:")  # noqa: NP100
    # max_key_length = max(len(key) for key in reader.fields.keys())
    for key, field in reader.fields.items():
        value = field.parts[field.data[0]]
        # print(f"{key:{max_key_length}} : {value}")  # noqa: NP100
    # print("----")  # noqa: NP100

    # List all tensors
    # print("Tensors:")  # noqa: NP100
    # tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}"
    # print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization"))  # noqa: NP100
    # print("-" * 80)  # noqa: NP100
    re = []
    for tensor in reader.tensors:
        shape_str = "x".join(map(str, tensor.shape))
        size_str = str(tensor.n_elements)
        quantization_str = tensor.tensor_type.name
        # print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str))  # noqa: NP100
        re.append(tensor)
    return re


def read_gguf_directory(directory):
    """
    Reads all GGUF files in a directory and returns their tensors keyed by name.

    Parameters:
    - directory: Path to the directory containing GGUF files.
    """
    if not os.path.isdir(directory):
        logger.error(f"Directory {directory} does not exist.")
        return

    # List all GGUF files in the directory
    files = [f for f in os.listdir(directory) if f.endswith(".gguf")]
    if not files:
        logger.info(f"No GGUF files found in {directory}.")
        return

    re = []
    for file in files:
        file_path = os.path.join(directory, file)
        # print(f"Reading {file_path}:")  # noqa: NP100
        # print("\n")  # noqa: NP100
        re.extend(read_gguf_file(file_path))
    re = {r.name: r for r in re}
    return re


def find_weights(name, weights):
    """
    Finds and returns the weights for a given name from the list of weights.

    Parameters:
    - name: The name of the weights to find.
    - weights: List of weight tensors.

    Returns:
    - The weight tensor if found; raises ValueError otherwise.
""" for weight in weights: if weight.name == name: return weight raise ValueError(f"Weight with name {name} not found in the provided weights list.") def get_torch_tensor_from_gguf(gguf_weights, name): return torch.from_numpy(gguf_weights[name].data).contiguous() def get_torch_tensor_and_type_from_gguf(gguf_weights, name): return torch.from_numpy(gguf_weights[name].data).contiguous(), gguf_weights[name].tensor_type.name def type_to_ggml_type(type): if type == "F32": return ggml_type.FP32 elif type == "F16": return ggml_type.FP16 elif type == "BF16": return ggml_type.BF16 else: raise ValueError(f"Unsupported data type: {type}") def build_mla(layer_idx, json_config, gguf_weights): hidden_size = json_config["hidden_size"] num_heads = json_config["num_attention_heads"] q_lora_rank = json_config["q_lora_rank"] kv_lora_rank = json_config["kv_lora_rank"] nope_size = json_config["qk_nope_head_dim"] rope_size = json_config["qk_rope_head_dim"] max_position_embeddings = json_config["max_position_embeddings"] rope_theta = json_config["rope_theta"] rope_scaling = json_config["rope_scaling"] config = kt_kernel_ext.mla.MLAConfig( hidden_size, q_lora_rank, kv_lora_rank, num_heads, nope_size, rope_size, ) config.max_qlen = max_qlen config.max_kvlen = max_kvlen config.max_position_embeddings = max_position_embeddings config.rope_scaling_factor = rope_scaling["factor"] config.rope_theta = rope_theta config.rope_scaling_beta_fast = rope_scaling["beta_fast"] config.rope_scaling_beta_slow = rope_scaling["beta_slow"] config.rope_scaling_mscale = rope_scaling["mscale"] config.rope_scaling_mscale_all_dim = rope_scaling["mscale_all_dim"] config.rope_scaling_original_max_position_embeddings = rope_scaling["original_max_position_embeddings"] q_a_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_a.weight") config.q_a_proj = q_a_proj_weight.data_ptr() config.q_a_proj_type = type_to_ggml_type(type) q_a_type = type q_a_norm_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_a_norm.weight") config.q_a_norm = q_a_norm_weight.data_ptr() config.q_a_norm_type = type_to_ggml_type(type) q_b_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_q_b.weight") config.q_b_proj = q_b_proj_weight.data_ptr() config.q_b_proj_type = type_to_ggml_type(type) kv_a_proj_with_mqa_weight, type = get_torch_tensor_and_type_from_gguf( gguf_weights, f"blk.{layer_idx}.attn_kv_a_mqa.weight" ) config.kv_a_proj_with_mqa = kv_a_proj_with_mqa_weight.data_ptr() config.kv_a_proj_with_mqa_type = type_to_ggml_type(type) kv_a_norm_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_kv_a_norm.weight") config.kv_a_norm = kv_a_norm_weight.data_ptr() config.kv_a_norm_type = type_to_ggml_type(type) kv_b_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_kv_b.weight") config.kv_b_proj = kv_b_proj_weight.data_ptr() config.kv_b_proj_type = type_to_ggml_type(type) o_proj_weight, type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.attn_output.weight") config.o_proj = o_proj_weight.data_ptr() config.w_o_type = type_to_ggml_type(type) config.layer_idx = layer_idx config.pool = CPUInfer.backend_ config.page_count = pages_count if q_a_type == "F32": mla = kt_kernel_ext.mla.MLA_F32(config) elif q_a_type == "F16": mla = kt_kernel_ext.mla.MLA_F16(config) elif q_a_type == "BF16": # mla = kt_kernel_ext.mla.MLA_F32(config) mla = 
        mla = kt_kernel_ext.mla.MLA_QUAN_F32(config)
    else:
        raise ValueError(f"Unsupported data type: {q_a_type}")

    mla.load_weights()
    mla.set_local_pages(pages_count)
    return mla


def build_ffn(layer_idx, json_config, gguf_weights):
    if f"blk.{layer_idx}.ffn_gate.weight" in gguf_weights:
        # dense
        config = kt_kernel_ext.moe.MOEConfig(
            json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
            json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
            json_config["hidden_size"],
            json_config["moe_intermediate_size"],
        )
        config.layer_idx = layer_idx
        config.max_len = max_qlen
        config.pool = CPUInfer.backend_

        gate, gate_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate.weight")
        up, up_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_up.weight")
        down, down_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_down.weight")

        config.gate_proj = gate.data_ptr()
        config.gate_type = type_to_ggml_type(gate_type)
        config.up_proj = up.data_ptr()
        config.up_type = type_to_ggml_type(up_type)
        config.down_proj = down.data_ptr()
        config.down_type = type_to_ggml_type(down_type)

        moe = kt_kernel_ext.moe.KMLInt8_MOE(config)
        moe.load_weights()
        return moe
    elif f"blk.{layer_idx}.ffn_gate_exps.weight" in gguf_weights:
        config = kt_kernel_ext.moe.MOEConfig(
            json_config["n_routed_experts"] + json_config["n_shared_experts"],
            json_config["num_experts_per_tok"] + json_config["n_shared_experts"],
            json_config["hidden_size"],
            json_config["moe_intermediate_size"],
        )
        config.layer_idx = layer_idx
        config.max_len = max_qlen
        config.pool = CPUInfer.backend_

        gate, gate_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_exps.weight")
        up, up_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_up_exps.weight")
        down, down_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_down_exps.weight")
        gate_sh, gate_sh_type = get_torch_tensor_and_type_from_gguf(
            gguf_weights, f"blk.{layer_idx}.ffn_gate_shexp.weight"
        )
        up_sh, up_sh_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_up_shexp.weight")
        down_sh, down_sh_type = get_torch_tensor_and_type_from_gguf(
            gguf_weights, f"blk.{layer_idx}.ffn_down_shexp.weight"
        )

        # Append the shared expert after the routed experts so the MOE kernel sees a single expert tensor.
        gate_sh_expanded = gate_sh.unsqueeze(0)
        gate = torch.cat([gate, gate_sh_expanded], dim=0).contiguous()
        up_sh_expanded = up_sh.unsqueeze(0)
        up = torch.cat([up, up_sh_expanded], dim=0).contiguous()
        down_sh_expanded = down_sh.unsqueeze(0)
        down = torch.cat([down, down_sh_expanded], dim=0).contiguous()

        config.gate_proj = gate.data_ptr()
        config.gate_type = type_to_ggml_type(gate_type)
        config.up_proj = up.data_ptr()
        config.up_type = type_to_ggml_type(up_type)
        config.down_proj = down.data_ptr()
        config.down_type = type_to_ggml_type(down_type)

        moe = kt_kernel_ext.moe.KMLInt8_MOE(config)
        moe.load_weights()
        return moe
    else:
        raise ValueError(f"Unsupported FFN type for layer {layer_idx}")


def build_moegate(layer_idx, json_config, gguf_weights):
    config = kt_kernel_ext.gate.GateConfig(
        json_config["hidden_size"],
        json_config["num_experts_per_tok"],
        json_config["n_routed_experts"],
        json_config["n_group"],
        json_config["topk_group"],
    )
    config.routed_scaling_factor = json_config["routed_scaling_factor"]
    config.pool = CPUInfer.backend_

    weight, weight_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.ffn_gate_inp.weight")
    config.weight = weight.data_ptr()
    config.weight_type = type_to_ggml_type(weight_type)
    bias, bias_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{layer_idx}.exp_probs_b.bias")
    config.e_score_correction_bias = bias.data_ptr()
    config.e_score_correction_bias_type = type_to_ggml_type(bias_type)

    gate = kt_kernel_ext.gate.MoEGate(config)
    return gate


def build_llm(json_config, gguf_weights):
    general_config = kt_kernel_ext.GeneralConfig()
    general_config.vocab_size = json_config["vocab_size"]
    general_config.hidden_size = json_config["hidden_size"]
    general_config.num_experts_per_tok = json_config["num_experts_per_tok"]
    general_config.n_routed_experts = json_config["n_routed_experts"]
    general_config.n_shared_experts = json_config["n_shared_experts"]
    general_config.max_qlen = max_qlen

    lm_heads, lm_heads_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output.weight")
    general_config.lm_heads_ptr = lm_heads.data_ptr()
    general_config.lm_heads_type = type_to_ggml_type(lm_heads_type)

    output_norm, output_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "output_norm.weight")
    general_config.norm_weights_ptr = output_norm.data_ptr()
    general_config.norm_weights_type = type_to_ggml_type(output_norm_type)

    token_embd, token_embd_type = get_torch_tensor_and_type_from_gguf(gguf_weights, "token_embd.weight")
    general_config.token_embd_ptr = token_embd.data_ptr()
    general_config.token_embd_type = type_to_ggml_type(token_embd_type)

    general_config.pool = CPUInfer.backend_

    llm = kt_kernel_ext.DeepseekV3ForCausalLM(general_config)
    model = kt_kernel_ext.DeepseekV3Model(general_config)
    llm.model = model

    decoder_layers = []
    real_load_layers = json_config["num_hidden_layers"] if load_layers is None else load_layers
    for i in range(real_load_layers):
        layer = kt_kernel_ext.DeepseekV3DecoderLayer(general_config, i)
        attn_norm, attn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.attn_norm.weight")
        ffn_norm, ffn_norm_type = get_torch_tensor_and_type_from_gguf(gguf_weights, f"blk.{i}.ffn_norm.weight")
        layer.load_norm(
            attn_norm.data_ptr(),
            type_to_ggml_type(attn_norm_type),
            ffn_norm.data_ptr(),
            type_to_ggml_type(ffn_norm_type),
        )
        layer.self_attn = build_mla(i, json_config, gguf_weights)
        if f"blk.{i}.ffn_gate_inp.weight" in gguf_weights:
            layer.gate = build_moegate(i, json_config, gguf_weights)
        layer.ffn = build_ffn(i, json_config, gguf_weights)
        decoder_layers.append(layer)
    model.layers = decoder_layers
    return llm


safetensor_path = "/home/bd/models/DeepSeek-R1"
json_path = os.path.join(safetensor_path, "config.json")
json_config = json.load(open(json_path, "r"))
print(json_config)

gguf_path = "/home/bd/models/DeepSeek-R1-BF16"
weights = read_gguf_directory(gguf_path)
weights = dict(sorted(weights.items()))

for name, t in weights.items():
    # Optional filters for inspecting a subset of tensors:
    # if not name.startswith("blk"):
    # if name.startswith("blk.10."):
    # if "ffn_gate." in name:
in name: # print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}") print(f"Found weight: {t.name}, Shape: {t.shape}, Type: {t.tensor_type.name}, Size: {t.n_elements}") print("Building LLM ...") load_start_time = time.perf_counter() llm = build_llm(json_config, weights) load_end_time = time.perf_counter() print(f"Load time: {load_end_time - load_start_time:.4f} seconds") print("Release Weight Tensors ...") weights = None print("Loading Configs ...") tokenizer = AutoTokenizer.from_pretrained(safetensor_path, trust_remote_code=True) config = AutoConfig.from_pretrained(safetensor_path, trust_remote_code=True) force_think = False output_logits = torch.zeros((max_qlen, json_config["vocab_size"]), dtype=torch.float32) def start_chat(content=None): if content is None: content = input("Chat: ") messages = [{"role": "user", "content": content}] input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt") if force_think: token_thinks = torch.tensor( [tokenizer.encode("\\n", add_special_tokens=False)], device=input_tensor.device ) input_tensor = torch.cat([input_tensor, token_thinks], dim=1) input_tensor = input_tensor.squeeze(0) # Add batch dimension print(f"Input tensor: {input_tensor}, type {input_tensor.dtype}, shape {input_tensor.shape}") kvlen = 0 step = 2 while True or step > 0: step -= 1 stream = TextStreamer(tokenizer) qlen = input_tensor.shape[0] qlens = [qlen - kvlen] kvlens = [kvlen] page_tables = [list(range(pages_count))] start_time = time.perf_counter() llm.forward(qlens, page_tables, kvlens, input_tensor[kvlen:].data_ptr(), output_logits.data_ptr()) end_time = time.perf_counter() print( f"Forward time: {end_time - start_time:.4f} seconds, tps: {qlens[0] / (end_time - start_time)} tokens/sec" ) logits = output_logits[0] # print(logits) # sample next_token = torch.argmax(logits).item() # print(f"Next token: {next_token}, {tokenizer.decode(next_token)}") kvlen = input_tensor.shape[0] input_tensor = torch.cat((input_tensor, torch.tensor([next_token])), dim=-1) if next_token == tokenizer.eos_token_id or tokenizer.decode(next_token) == "<|im_end|>": stream.end() break else: stream.put(torch.tensor([next_token])) job_id = 0 while True: try: # ---------- 让用户决定是否继续 ---------- choice = input("\n【回车】开始对话 | 输入 1 读取文件 | 输入 q/quit/exit 退出程序: ").strip().lower() if choice in {"q", "quit", "exit"}: print("收到退出指令,程序结束。") break elif choice == "1": file_path = input("请输入要读取的文件路径:").strip() if not Path(file_path).is_file(): print(f"文件 {file_path} 不存在,请检查路径。") continue with open(file_path, "r", encoding="utf-8") as file: content = file.read() print(f"读取到内容:\n{content}\n") start_chat(content) else: start_chat() except KeyboardInterrupt: # 随时 Ctrl-C:放弃当前任务并重启 print(f"\n检测到 Ctrl-C,已终止对话 #{job_id},马上重启…") except Exception as e: # 其他异常:打印错误信息并重启 print(f"\n发生错误:{e}\n已终止对话 #{job_id},马上重启…") logger.error(f"Error in job {job_id}: {e}", exc_info=True) finally: job_id += 1 # 不管中断与否,都给下一任务换编号