# Build-wide settings: the package name and the `universal` flag (false here).
# NOTE(review): the exact meaning of `universal` is defined by the build tool
# that reads this file, not by anything visible here - confirm against its docs.
| | [general] |
| | name = "quantization" |
| | universal = false |
| |
|
# Torch extension section: the binding source/header under torch-ext/ plus the
# shared core/scalar_type.hpp header. The include path is "." (presumably the
# directory containing this file - confirm).
| | [torch] |
| | include = ["."] |
| | src = [ |
| | "core/scalar_type.hpp", |
| | "torch-ext/torch_binding.cpp", |
| | "torch-ext/torch_binding.h", |
| | ] |
| |
|
# gptq_marlin kernel group (CUDA backend, depends on the torch section).
# Built for compute capabilities 8.0 through 12.0 as listed below.
# Sources: the AWQ/GPTQ repack kernels, gptq_marlin.cu, shared Marlin headers,
# and one kernel_{bf16,fp16}_<type>.cu file for each of the five type suffixes
# (kfe2m1f, kfe4m3fn, ku4, ku4b8, ku8b128).
| | [kernel.gptq_marlin] |
| | backend = "cuda" |
| | cuda-capabilities = [ |
| | "8.0", |
| | "8.6", |
| | "8.7", |
| | "8.9", |
| | "9.0", |
| | "10.0", |
| | "10.1", |
| | "12.0", |
| | ] |
| | depends = ["torch"] |
| | include = ["."] |
| | src = [ |
| | "core/scalar_type.hpp", |
| | "gptq_marlin/awq_marlin_repack.cu", |
| | "gptq_marlin/dequant.h", |
| | "gptq_marlin/gptq_marlin.cu", |
| | "gptq_marlin/gptq_marlin_repack.cu", |
| | "gptq_marlin/kernel.h", |
| | "gptq_marlin/kernel_bf16_kfe2m1f.cu", |
| | "gptq_marlin/kernel_bf16_kfe4m3fn.cu", |
| | "gptq_marlin/kernel_bf16_ku4.cu", |
| | "gptq_marlin/kernel_bf16_ku4b8.cu", |
| | "gptq_marlin/kernel_bf16_ku8b128.cu", |
| | "gptq_marlin/kernel_fp16_kfe2m1f.cu", |
| | "gptq_marlin/kernel_fp16_kfe4m3fn.cu", |
| | "gptq_marlin/kernel_fp16_ku4.cu", |
| | "gptq_marlin/kernel_fp16_ku4b8.cu", |
| | "gptq_marlin/kernel_fp16_ku8b128.cu", |
| | "gptq_marlin/marlin.cuh", |
| | "gptq_marlin/marlin_dtypes.cuh", |
| | "gptq_marlin/marlin_template.h", |
| | ] |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
# int8_common kernel group (CUDA backend, depends on the torch section).
# Built for compute capabilities 7.0 through 12.0 as listed below.
# One .cu file (compressed_tensors/int8_quant_kernels.cu) plus two helper headers.
| | [kernel.int8_common] |
| | backend = "cuda" |
| | cuda-capabilities = [ |
| | "7.0", |
| | "7.2", |
| | "7.5", |
| | "8.0", |
| | "8.6", |
| | "8.7", |
| | "8.9", |
| | "9.0", |
| | "10.0", |
| | "10.1", |
| | "12.0", |
| | ] |
| | depends = ["torch"] |
| | include = ["."] |
| | src = [ |
| | "compressed_tensors/int8_quant_kernels.cu", |
| | "dispatch_utils.h", |
| | "vectorization_utils.cuh", |
| | ] |
| |
|
# fp8_common kernel group (CUDA backend, depends on the torch section).
# Built for compute capabilities 7.0 through 12.0 as listed below.
# One .cu file (fp8/common.cu) with its header and three shared helper headers.
| | [kernel.fp8_common] |
| | backend = "cuda" |
| | cuda-capabilities = [ |
| | "7.0", |
| | "7.2", |
| | "7.5", |
| | "8.0", |
| | "8.6", |
| | "8.7", |
| | "8.9", |
| | "9.0", |
| | "10.0", |
| | "10.1", |
| | "12.0", |
| | ] |
| | depends = ["torch"] |
| | include = ["."] |
| | src = [ |
| | "fp8/common.cu", |
| | "fp8/common.cuh", |
| | "dispatch_utils.h", |
| | "utils.cuh", |
| | "vectorization.cuh", |
| | ] |
| |
|
# cutlass_w8a8_hopper kernel group (CUDA backend).
# Built only for capability 9.0a, requires CUDA >= 12.0 (cuda-minver), and
# depends on cutlass_3_9 and torch.
# Sources: the sm90 c3x scaled_mm kernels (int8, azp int8, fp8, blockwise fp8),
# their dispatch headers, and the cutlass_extensions headers they rely on.
# NOTE(review): cutlass_extensions/common.hpp is listed here without common.cpp,
# whereas [kernel.cutlass_w8a8] lists both - confirm this is intentional.
| | [kernel.cutlass_w8a8_hopper] |
| | backend = "cuda" |
| | cuda-capabilities = ["9.0a"] |
| | cuda-minver = "12.0" |
| | depends = [ |
| | "cutlass_3_9", |
| | "torch", |
| | ] |
| | include = ["."] |
| | src = [ |
| | "cuda_utils.h", |
| | "core/math.hpp", |
| | "cutlass_w8a8/c3x/cutlass_gemm_caller.cuh", |
| | "cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu", |
| | "cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu", |
| | "cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8_dispatch.cuh", |
| | "cutlass_w8a8/c3x/scaled_mm.cuh", |
| | "cutlass_w8a8/c3x/scaled_mm_kernels.hpp", |
| | "cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu", |
| | "cutlass_w8a8/c3x/scaled_mm_sm90_fp8_dispatch.cuh", |
| | "cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu", |
| | "cutlass_w8a8/c3x/scaled_mm_sm90_int8_dispatch.cuh", |
| | "cutlass_w8a8/c3x/scaled_mm_helper.hpp", |
| | "cutlass_w8a8/scaled_mm_c3x_sm90.cu", |
| | "cutlass_extensions/common.hpp", |
| | "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp", |
| | "cutlass_extensions/epilogue/broadcast_load_epilogue_c3x.hpp", |
| | "cutlass_extensions/epilogue/broadcast_load_epilogue_array_c3x.hpp", |
| | "cutlass_extensions/gemm/dispatch_policy.hpp", |
| | "cutlass_extensions/gemm/collective/collective_builder.hpp", |
| | "cutlass_extensions/gemm/collective/fp8_accumulation.hpp", |
| | "cutlass_extensions/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized_fp8_blockwise_scaling.hpp", |
| | ] |
| |
|
| |
|
| |
|
# cutlass_w8a8_blackwell kernel group (CUDA backend).
# Built for capabilities 10.0a, 10.1a and 12.0a, requires CUDA >= 12.9
# (cuda-minver), and depends on cutlass_3_9 and torch.
# Sources: the sm100 c3x scaled_mm fp8 kernels (plain and blockwise) with their
# dispatch headers and shared helpers.
# NOTE(review): 12.0a is targeted but every source file here is named sm100 -
# confirm these files also cover that capability.
| | [kernel.cutlass_w8a8_blackwell] |
| | backend = "cuda" |
| | cuda-capabilities = [ |
| | "10.0a", |
| | "10.1a", |
| | "12.0a", |
| | ] |
| | cuda-minver = "12.9" |
| | depends = [ |
| | "cutlass_3_9", |
| | "torch", |
| | ] |
| | include = ["."] |
| | src = [ |
| | "cuda_utils.h", |
| | "cutlass_w8a8/scaled_mm_c3x_sm100.cu", |
| | "cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu", |
| | "cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8_dispatch.cuh", |
| | "cutlass_w8a8/c3x/scaled_mm_helper.hpp", |
| | "cutlass_w8a8/c3x/scaled_mm_kernels.hpp", |
| | "cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu", |
| | "cutlass_w8a8/c3x/scaled_mm_sm100_fp8_dispatch.cuh", |
| | ] |
| |
|
# cutlass_w8a8 kernel group (CUDA backend, depends on cutlass_3_9 and torch).
# Built for compute capabilities 7.5 through 12.0 as listed below; no
# cuda-minver is set for this group.
# Sources: the c2x scaled_mm kernel with its sm75/sm80/sm89 dispatch headers,
# scaled_mm_entry.cu, and the cutlass_extensions common code and c2x epilogues.
| | [kernel.cutlass_w8a8] |
| | backend = "cuda" |
| | cuda-capabilities = [ |
| | "7.5", |
| | "8.0", |
| | "8.6", |
| | "8.7", |
| | "8.9", |
| | "9.0", |
| | "10.0", |
| | "10.1", |
| | "12.0", |
| | ] |
| | depends = [ |
| | "cutlass_3_9", |
| | "torch", |
| | ] |
| | include = ["."] |
| | src = [ |
| | "core/math.hpp", |
| | "cutlass_w8a8/scaled_mm_c2x.cu", |
| | "cutlass_w8a8/scaled_mm_c2x.cuh", |
| | "cutlass_w8a8/scaled_mm_c2x_sm75_dispatch.cuh", |
| | "cutlass_w8a8/scaled_mm_c2x_sm80_dispatch.cuh", |
| | "cutlass_w8a8/scaled_mm_c2x_sm89_fp8_dispatch.cuh", |
| | "cutlass_w8a8/scaled_mm_c2x_sm89_int8_dispatch.cuh", |
| | "cutlass_w8a8/scaled_mm_entry.cu", |
| | "cutlass_extensions/common.cpp", |
| | "cutlass_extensions/common.hpp", |
| | "cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp", |
| | "cutlass_extensions/epilogue/broadcast_load_epilogue_c2x.hpp", |
| | ] |
| |
|
# marlin kernel group (CUDA backend, depends on the torch section).
# Built for compute capabilities 8.0 through 12.0 as listed below.
# Sources: three kernel files (dense, qqq, and sparse marlin_24) together with
# the common headers under marlin/dense and marlin/sparse.
| | [kernel.marlin] |
| | backend = "cuda" |
| | cuda-capabilities = [ |
| | "8.0", |
| | "8.6", |
| | "8.7", |
| | "8.9", |
| | "9.0", |
| | "10.0", |
| | "10.1", |
| | "12.0", |
| | ] |
| | depends = ["torch"] |
| | include = ["."] |
| | src = [ |
| | "core/scalar_type.hpp", |
| | "marlin/dense/common/base.h", |
| | "marlin/dense/common/mem.h", |
| | "marlin/dense/marlin_cuda_kernel.cu", |
| | "marlin/qqq/marlin_qqq_gemm_kernel.cu", |
| | "marlin/sparse/common/base.h", |
| | "marlin/sparse/common/mem.h", |
| | "marlin/sparse/common/mma.h", |
| | "marlin/sparse/marlin_24_cuda_kernel.cu", |
| | ] |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|