diff --git a/MANIFEST.in b/MANIFEST.in index 54535e5..5807212 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,4 @@ recursive-include exllamav2 * +global-include *.typed global-exclude *.pyc global-exclude dni_* \ No newline at end of file diff --git a/exllamav2/py.typed b/exllamav2/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/setup.py b/setup.py index 903f0ae..c8d8c77 100644 --- a/setup.py +++ b/setup.py @@ -1,15 +1,23 @@ from setuptools import setup, Extension -from torch.utils import cpp_extension -from torch import version as torch_version +import importlib.util import os +if torch := importlib.util.find_spec("torch") is not None: + from torch.utils import cpp_extension + from torch import version as torch_version + extension_name = "exllamav2_ext" -verbose = False -ext_debug = False +precompile = "EXLLAMA_NOCOMPILE" not in os.environ +verbose = "EXLLAMA_VERBOSE" in os.environ +ext_debug = "EXLLAMA_EXT_DEBUG" in os.environ -precompile = 'EXLLAMA_NOCOMPILE' not in os.environ +if precompile and not torch: + print( + "cannot precompile unless torch is installed \ +To explicitly JIT install run EXLLAMA_NOCOMPILE= pip install " + ) -windows = (os.name == "nt") +windows = os.name == "nt" extra_cflags = ["/Ox"] if windows else ["-O3"] @@ -18,7 +26,7 @@ if ext_debug: extra_cuda_cflags = ["-lineinfo", "-O3"] -if torch_version.hip: +if torch and torch_version.hip: extra_cuda_cflags += ["-DHIPBLAS_USE_HIP_HALF"] extra_compile_args = { @@ -26,67 +34,72 @@ extra_compile_args = { "nvcc": extra_cuda_cflags, } -setup_kwargs = { - "ext_modules": [ - cpp_extension.CUDAExtension( - extension_name, - [ - "exllamav2/exllamav2_ext/ext_bindings.cpp", - "exllamav2/exllamav2_ext/ext_cache.cpp", - "exllamav2/exllamav2_ext/ext_gemm.cpp", - "exllamav2/exllamav2_ext/ext_hadamard.cpp", - "exllamav2/exllamav2_ext/ext_norm.cpp", - "exllamav2/exllamav2_ext/ext_qattn.cpp", - "exllamav2/exllamav2_ext/ext_qmatrix.cpp", - "exllamav2/exllamav2_ext/ext_qmlp.cpp", - "exllamav2/exllamav2_ext/ext_quant.cpp", - "exllamav2/exllamav2_ext/ext_rope.cpp", - "exllamav2/exllamav2_ext/ext_stloader.cpp", - "exllamav2/exllamav2_ext/ext_sampling.cpp", - "exllamav2/exllamav2_ext/ext_element.cpp", - "exllamav2/exllamav2_ext/ext_tp.cpp", - "exllamav2/exllamav2_ext/cuda/graph.cu", - "exllamav2/exllamav2_ext/cuda/h_add.cu", - "exllamav2/exllamav2_ext/cuda/h_gemm.cu", - "exllamav2/exllamav2_ext/cuda/lora.cu", - "exllamav2/exllamav2_ext/cuda/pack_tensor.cu", - "exllamav2/exllamav2_ext/cuda/quantize.cu", - "exllamav2/exllamav2_ext/cuda/q_matrix.cu", - "exllamav2/exllamav2_ext/cuda/q_attn.cu", - "exllamav2/exllamav2_ext/cuda/q_mlp.cu", - "exllamav2/exllamav2_ext/cuda/q_gemm.cu", - "exllamav2/exllamav2_ext/cuda/rms_norm.cu", - "exllamav2/exllamav2_ext/cuda/head_norm.cu", - "exllamav2/exllamav2_ext/cuda/layer_norm.cu", - "exllamav2/exllamav2_ext/cuda/rope.cu", - "exllamav2/exllamav2_ext/cuda/cache.cu", - "exllamav2/exllamav2_ext/cuda/util.cu", - "exllamav2/exllamav2_ext/cuda/softcap.cu", - "exllamav2/exllamav2_ext/cuda/tp.cu", - "exllamav2/exllamav2_ext/cuda/comp_units/kernel_select.cu", - "exllamav2/exllamav2_ext/cuda/comp_units/unit_gptq_1.cu", - "exllamav2/exllamav2_ext/cuda/comp_units/unit_gptq_2.cu", - "exllamav2/exllamav2_ext/cuda/comp_units/unit_gptq_3.cu", - "exllamav2/exllamav2_ext/cuda/comp_units/unit_exl2_1a.cu", - "exllamav2/exllamav2_ext/cuda/comp_units/unit_exl2_1b.cu", - "exllamav2/exllamav2_ext/cuda/comp_units/unit_exl2_2a.cu", - "exllamav2/exllamav2_ext/cuda/comp_units/unit_exl2_2b.cu", - "exllamav2/exllamav2_ext/cuda/comp_units/unit_exl2_3a.cu", - "exllamav2/exllamav2_ext/cuda/comp_units/unit_exl2_3b.cu", - "exllamav2/exllamav2_ext/cpp/quantize_func.cpp", - "exllamav2/exllamav2_ext/cpp/profiling.cpp", - "exllamav2/exllamav2_ext/cpp/generator.cpp", - "exllamav2/exllamav2_ext/cpp/sampling.cpp", - "exllamav2/exllamav2_ext/cpp/sampling_avx2.cpp", - ], - extra_compile_args=extra_compile_args, - libraries=["cublas"] if windows else [], - )], - "cmdclass": {"build_ext": cpp_extension.BuildExtension} -} if precompile else {} +setup_kwargs = ( + { + "ext_modules": [ + cpp_extension.CUDAExtension( + extension_name, + [ + "exllamav2/exllamav2_ext/ext_bindings.cpp", + "exllamav2/exllamav2_ext/ext_cache.cpp", + "exllamav2/exllamav2_ext/ext_gemm.cpp", + "exllamav2/exllamav2_ext/ext_hadamard.cpp", + "exllamav2/exllamav2_ext/ext_norm.cpp", + "exllamav2/exllamav2_ext/ext_qattn.cpp", + "exllamav2/exllamav2_ext/ext_qmatrix.cpp", + "exllamav2/exllamav2_ext/ext_qmlp.cpp", + "exllamav2/exllamav2_ext/ext_quant.cpp", + "exllamav2/exllamav2_ext/ext_rope.cpp", + "exllamav2/exllamav2_ext/ext_stloader.cpp", + "exllamav2/exllamav2_ext/ext_sampling.cpp", + "exllamav2/exllamav2_ext/ext_element.cpp", + "exllamav2/exllamav2_ext/ext_tp.cpp", + "exllamav2/exllamav2_ext/cuda/graph.cu", + "exllamav2/exllamav2_ext/cuda/h_add.cu", + "exllamav2/exllamav2_ext/cuda/h_gemm.cu", + "exllamav2/exllamav2_ext/cuda/lora.cu", + "exllamav2/exllamav2_ext/cuda/pack_tensor.cu", + "exllamav2/exllamav2_ext/cuda/quantize.cu", + "exllamav2/exllamav2_ext/cuda/q_matrix.cu", + "exllamav2/exllamav2_ext/cuda/q_attn.cu", + "exllamav2/exllamav2_ext/cuda/q_mlp.cu", + "exllamav2/exllamav2_ext/cuda/q_gemm.cu", + "exllamav2/exllamav2_ext/cuda/rms_norm.cu", + "exllamav2/exllamav2_ext/cuda/head_norm.cu", + "exllamav2/exllamav2_ext/cuda/layer_norm.cu", + "exllamav2/exllamav2_ext/cuda/rope.cu", + "exllamav2/exllamav2_ext/cuda/cache.cu", + "exllamav2/exllamav2_ext/cuda/util.cu", + "exllamav2/exllamav2_ext/cuda/softcap.cu", + "exllamav2/exllamav2_ext/cuda/tp.cu", + "exllamav2/exllamav2_ext/cuda/comp_units/kernel_select.cu", + "exllamav2/exllamav2_ext/cuda/comp_units/unit_gptq_1.cu", + "exllamav2/exllamav2_ext/cuda/comp_units/unit_gptq_2.cu", + "exllamav2/exllamav2_ext/cuda/comp_units/unit_gptq_3.cu", + "exllamav2/exllamav2_ext/cuda/comp_units/unit_exl2_1a.cu", + "exllamav2/exllamav2_ext/cuda/comp_units/unit_exl2_1b.cu", + "exllamav2/exllamav2_ext/cuda/comp_units/unit_exl2_2a.cu", + "exllamav2/exllamav2_ext/cuda/comp_units/unit_exl2_2b.cu", + "exllamav2/exllamav2_ext/cuda/comp_units/unit_exl2_3a.cu", + "exllamav2/exllamav2_ext/cuda/comp_units/unit_exl2_3b.cu", + "exllamav2/exllamav2_ext/cpp/quantize_func.cpp", + "exllamav2/exllamav2_ext/cpp/profiling.cpp", + "exllamav2/exllamav2_ext/cpp/generator.cpp", + "exllamav2/exllamav2_ext/cpp/sampling.cpp", + "exllamav2/exllamav2_ext/cpp/sampling_avx2.cpp", + ], + extra_compile_args=extra_compile_args, + libraries=["cublas"] if windows else [], + ) + ], + "cmdclass": {"build_ext": cpp_extension.BuildExtension}, + } + if precompile and torch + else {} +) version_py = {} -with open("exllamav2/version.py", encoding = "utf8") as fp: +with open("exllamav2/version.py", encoding="utf8") as fp: exec(fp.read(), version_py) version = version_py["__version__"] print("Version:", version) @@ -94,9 +107,9 @@ print("Version:", version) # version = "0.0.5" setup( - name = "exllamav2", - version = version, - packages = [ + name="exllamav2", + version=version, + packages=[ "exllamav2", "exllamav2.generator", # "exllamav2.generator.filters", @@ -106,10 +119,10 @@ setup( # "exllamav2.exllamav2_ext.cuda", # "exllamav2.exllamav2_ext.cuda.quant", ], - url = "https://github.com/turboderp/exllamav2", - license = "MIT", - author = "turboderp", - install_requires = [ + url="https://github.com/turboderp/exllamav2", + license="MIT", + author="turboderp", + install_requires=[ "pandas", "ninja", "fastparquet", @@ -120,9 +133,12 @@ setup( "websockets", "regex", "numpy", - "rich" + "rich", ], - include_package_data = True, - verbose = verbose, + include_package_data=True, + package_data={ + "": ["py.typed"], + }, + verbose=verbose, **setup_kwargs, )