|
28 | 28 | from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
|
29 | 29 | from vllm.logger import init_logger
|
30 | 30 | from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
|
| 31 | + QuantizationMethods, |
31 | 32 | get_quantization_config)
|
32 | 33 | from vllm.model_executor.models import ModelRegistry
|
33 | 34 | from vllm.platforms import CpuArchEnum, current_platform
|
@@ -767,12 +768,43 @@ def _verify_quantization(self) -> None:
|
767 | 768 | "compressed-tensors")
|
768 | 769 | quant_cfg["quant_method"] = quant_method
|
769 | 770 |
|
| 771 | + # Quantization methods which are overrides (i.e. they have a |
| 772 | + # `override_quantization_method` method) must be checked in order |
| 773 | + # of preference (this is particularly important for GPTQ). |
| 774 | + overrides = [ |
| 775 | + "marlin", |
| 776 | + "bitblas", |
| 777 | + "gptq_marlin_24", |
| 778 | + "gptq_marlin", |
| 779 | + "gptq_bitblas", |
| 780 | + "awq_marlin", |
| 781 | + "ipex", |
| 782 | + "moe_wna16", |
| 783 | + ] |
| 784 | + quantization_methods = [ |
| 785 | + q for q in supported_quantization if q not in overrides |
| 786 | + ] |
| 787 | + # Any custom overrides will be in quantization_methods, so we place |
| 788 | + # that list first to give custom overrides preference over the |
| 789 | + # built-in ones. |
| 790 | + quantization_methods = quantization_methods + overrides |
| 791 | + |
770 | 792 | # Detect which checkpoint it is
|
771 |
| - for name in QUANTIZATION_METHODS: |
| 793 | + for name in quantization_methods: |
772 | 794 | method = get_quantization_config(name)
|
773 | 795 | quantization_override = method.override_quantization_method(
|
774 | 796 | quant_cfg, self.quantization)
|
775 |
| - if quantization_override: |
| 797 | + if quantization_override is not None: |
| 798 | + # Raise error if the override is not custom (custom would |
| 799 | + # be in QUANTIZATION_METHODS but not QuantizationMethods) |
| 800 | + # and hasn't been added to the overrides list. |
| 801 | + if (name in get_args(QuantizationMethods) |
| 802 | + and name not in overrides): |
| 803 | + raise ValueError( |
| 804 | + f"Quantization method {name} is an override but " |
| 805 | + "is has not been added to the `overrides` list " |
| 806 | + "above. This is necessary to ensure that the " |
| 807 | + "overrides are checked in order of preference.") |
776 | 808 | quant_method = quantization_override
|
777 | 809 | self.quantization = quantization_override
|
778 | 810 | break
|
|
0 commit comments