Skip to content

Commit c7941cc

Browse files
authored
Explicitly explain quant method override ordering and ensure all overrides are ordered (#17256)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
1 parent b6dd32a commit c7941cc

File tree

2 files changed

+39
-9
lines changed

2 files changed

+39
-9
lines changed

vllm/config.py

Lines changed: 34 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
2929
from vllm.logger import init_logger
3030
from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
31+
QuantizationMethods,
3132
get_quantization_config)
3233
from vllm.model_executor.models import ModelRegistry
3334
from vllm.platforms import CpuArchEnum, current_platform
@@ -767,12 +768,43 @@ def _verify_quantization(self) -> None:
767768
"compressed-tensors")
768769
quant_cfg["quant_method"] = quant_method
769770

771+
# Quantization methods which are overrides (i.e. they have a
772+
# `override_quantization_method` method) must be checked in order
773+
# of preference (this is particularly important for GPTQ).
774+
overrides = [
775+
"marlin",
776+
"bitblas",
777+
"gptq_marlin_24",
778+
"gptq_marlin",
779+
"gptq_bitblas",
780+
"awq_marlin",
781+
"ipex",
782+
"moe_wna16",
783+
]
784+
quantization_methods = [
785+
q for q in supported_quantization if q not in overrides
786+
]
787+
# Any custom overrides will be in quantization_methods so we place
788+
# them at the start of the list so custom overrides have preference
789+
# over the built-in ones.
790+
quantization_methods = quantization_methods + overrides
791+
770792
# Detect which checkpoint it is
771-
for name in QUANTIZATION_METHODS:
793+
for name in quantization_methods:
772794
method = get_quantization_config(name)
773795
quantization_override = method.override_quantization_method(
774796
quant_cfg, self.quantization)
775-
if quantization_override:
797+
if quantization_override is not None:
798+
# Raise error if the override is not custom (custom would
799+
# be in QUANTIZATION_METHODS but not QuantizationMethods)
800+
# and hasn't been added to the overrides list.
801+
if (name in get_args(QuantizationMethods)
802+
and name not in overrides):
803+
raise ValueError(
804+
f"Quantization method {name} is an override but "
805+
"is has not been added to the `overrides` list "
806+
"above. This is necessary to ensure that the "
807+
"overrides are checked in order of preference.")
776808
quant_method = quantization_override
777809
self.quantization = quantization_override
778810
break

vllm/model_executor/layers/quantization/__init__.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
# SPDX-License-Identifier: Apache-2.0
22

3-
from typing import Dict, List, Type
3+
from typing import Literal, Type, get_args
44

55
from vllm.model_executor.layers.quantization.base_config import (
66
QuantizationConfig)
77

8-
QUANTIZATION_METHODS: List[str] = [
8+
QuantizationMethods = Literal[
99
"aqlm",
1010
"awq",
1111
"deepspeedfp",
@@ -15,8 +15,6 @@
1515
"fbgemm_fp8",
1616
"modelopt",
1717
"nvfp4",
18-
# The order of gptq methods is important for config.py iteration over
19-
# override_quantization_method(..)
2018
"marlin",
2119
"bitblas",
2220
"gguf",
@@ -36,6 +34,7 @@
3634
"moe_wna16",
3735
"torchao",
3836
]
37+
QUANTIZATION_METHODS: list[str] = list(get_args(QuantizationMethods))
3938

4039
# The customized quantization methods which will be added to this dict.
4140
_CUSTOMIZED_METHOD_TO_QUANT_CONFIG = {}
@@ -111,7 +110,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
111110
from .torchao import TorchAOConfig
112111
from .tpu_int8 import Int8TpuConfig
113112

114-
method_to_config: Dict[str, Type[QuantizationConfig]] = {
113+
method_to_config: dict[str, Type[QuantizationConfig]] = {
115114
"aqlm": AQLMConfig,
116115
"awq": AWQConfig,
117116
"deepspeedfp": DeepSpeedFPConfig,
@@ -120,8 +119,6 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
120119
"fbgemm_fp8": FBGEMMFp8Config,
121120
"modelopt": ModelOptFp8Config,
122121
"nvfp4": ModelOptNvFp4Config,
123-
# The order of gptq methods is important for config.py iteration over
124-
# override_quantization_method(..)
125122
"marlin": MarlinConfig,
126123
"bitblas": BitBLASConfig,
127124
"gguf": GGUFConfig,
@@ -150,6 +147,7 @@ def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
150147

151148
__all__ = [
152149
"QuantizationConfig",
150+
"QuantizationMethods",
153151
"get_quantization_config",
154152
"QUANTIZATION_METHODS",
155153
]

0 commit comments

Comments
 (0)