
Commit cd48a2c

[temp] serialization stuff
1 parent 88a7542 commit cd48a2c

4 files changed: +144 -23 lines

keras/src/layers/core/dense.py
Lines changed: 40 additions & 10 deletions

@@ -90,6 +90,7 @@ def __init__(
         bias_constraint=None,
         lora_rank=None,
         lora_alpha=None,
+        quantization_config=None,
         **kwargs,
     ):
         super().__init__(activity_regularizer=activity_regularizer, **kwargs)
@@ -107,11 +108,16 @@ def __init__(
         self.lora_enabled = False
         self.input_spec = InputSpec(min_ndim=2)
         self.supports_masking = True
+        self.quantization_config = quantization_config

     def build(self, input_shape):
         kernel_shape = (input_shape[-1], self.units)
         if self.quantization_mode:
-            self.quantized_build(kernel_shape, mode=self.quantization_mode)
+            self.quantized_build(
+                kernel_shape,
+                mode=self.quantization_mode,
+                config=self.quantization_config,
+            )
         if self.quantization_mode not in ("int8", "int4", "gptq"):
             # If the layer is quantized to int8 or int4, `self._kernel` will be
             # added in `self._int8_build` or `_int4_build`. Therefore, we skip
@@ -238,6 +244,11 @@ def save_own_variables(self, store):
                 target_variables.append(self.kernel_amax_history)
                 target_variables.append(self.outputs_grad_scale)
                 target_variables.append(self.outputs_grad_amax_history)
+            elif self.quantization_mode == "gptq":
+                target_variables.append(self.quantized_kernel)
+                target_variables.append(self.kernel_scale)
+                target_variables.append(self.kernel_zero)
+                target_variables.append(self.g_idx)
             else:
                 raise self._quantization_mode_error(self.quantization_mode)
         for i, variable in enumerate(target_variables):
@@ -264,6 +275,11 @@ def load_own_variables(self, store):
                 target_variables.append(self.kernel_amax_history)
                 target_variables.append(self.outputs_grad_scale)
                 target_variables.append(self.outputs_grad_amax_history)
+            elif self.quantization_mode == "gptq":
+                target_variables.append(self.quantized_kernel)
+                target_variables.append(self.kernel_scale)
+                target_variables.append(self.kernel_zero)
+                target_variables.append(self.g_idx)
             else:
                 raise self._quantization_mode_error(self.quantization_mode)
         for i, variable in enumerate(target_variables):
@@ -289,11 +305,25 @@ def get_config(self):
             "kernel_constraint": constraints.serialize(self.kernel_constraint),
             "bias_constraint": constraints.serialize(self.bias_constraint),
         }
+        if self.quantization_config:
+            config["quantization_config"] = self.quantization_config
         if self.lora_rank:
             config["lora_rank"] = self.lora_rank
             config["lora_alpha"] = self.lora_alpha
         return {**base_config, **config}

+    @classmethod
+    def from_config(cls, config, custom_objects=None):
+        config = config.copy()
+        from keras.src.saving import deserialize_keras_object
+
+        if "quantization_config" in config:
+            config["quantization_config"] = deserialize_keras_object(
+                config["quantization_config"],
+                custom_objects=custom_objects,
+            )
+        return cls(**config)
+
     def _check_load_own_variables(self, store):
         all_vars = self._trainable_variables + self._non_trainable_variables
         if len(store.keys()) != len(all_vars):
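
For context on the two methods above: get_config now records the quantization config in the layer's config dict, and from_config passes it back through deserialize_keras_object when rebuilding the layer. A minimal construction-side sketch, assuming the public keras.quantizers.GPTQConfig export and using placeholder calibration arguments:

from keras import layers
from keras.quantizers import GPTQConfig  # public export path assumed here

# Placeholder calibration arguments; a real GPTQ run supplies an actual
# calibration dataset and tokenizer.
gptq_config = GPTQConfig(dataset=["calibration text"], tokenizer=None)

layer = layers.Dense(16, quantization_config=gptq_config)
config = layer.get_config()

# The config dict now carries the quantization config alongside the usual
# Dense arguments; the LoRA keys are still only added when lora_rank is set.
print("quantization_config" in config)  # True
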
@@ -328,19 +358,19 @@ def _check_load_own_variables(self, store):
                 f"Expected: {[v.name for v in all_vars]}"
             )

-    # Quantization-related (int8 and float8) methods
-
-    def quantized_build(self, kernel_shape, mode, config):
+    def quantized_build(self, input_shape, mode, config=None):
         if mode == "int8":
-            self._int8_build(kernel_shape)
+            self._int8_build(input_shape)
         elif mode == "int4":
-            self._int4_build(kernel_shape)
+            self._int4_build(input_shape)
         elif mode == "float8":
             self._float8_build()
         elif mode == "gptq":
-            self._gptq_build(kernel_shape, config)
+            self._gptq_build(input_shape, config)
         else:
             raise self._quantization_mode_error(mode)
+        if config is not None:
+            self.quantization_config = config
         self._is_quantized = True

     def _int8_build(self, kernel_shape):
@@ -371,10 +401,10 @@ def _gptq_build(self, kernel_shape, config):
             trainable=False,
         )

-        if config.group_size == -1:
+        if config["group_size"] == -1:
             n_groups = 1
         else:
-            n_groups = ceil(self.kernel_shape[0] / config.group_size)
+            n_groups = ceil(self.kernel_shape[0] / config["group_size"])
         self.kernel_scale = self.add_weight(
             name="kernel_scale",
             shape=(self.units, n_groups),
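
The group count above follows ceil(rows / group_size), with group_size == -1 collapsing everything into a single group. A standalone sketch of that arithmetic; the helper name is illustrative and not part of the change:

from math import ceil

def n_groups_for(rows, group_size):
    # Mirrors the branch in _gptq_build: -1 means one group spanning all
    # rows, otherwise a partial trailing group still counts as a group.
    return 1 if group_size == -1 else ceil(rows / group_size)

assert n_groups_for(768, -1) == 1
assert n_groups_for(768, 128) == 6
assert n_groups_for(770, 128) == 7
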
@@ -761,7 +791,7 @@ def _get_kernel_with_merged_lora(self):
             `kernel_scale`: The quantization scale for the merged kernel.
                 This is `None` if the layer is not quantized.
         """
-        if self.dtype_policy.quantization_mode is None:
+        if self.dtype_policy.quantization_mode in (None, "gptq"):
             return self.kernel, None

         kernel_value = self._kernel

keras/src/layers/core/einsum_dense.py
Lines changed: 40 additions & 10 deletions

@@ -133,6 +133,7 @@ def __init__(
         bias_constraint=None,
         lora_rank=None,
         lora_alpha=None,
+        quantization_config=None,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -152,6 +153,7 @@ def __init__(
         self.lora_rank = lora_rank
         self.lora_alpha = lora_alpha if lora_alpha is not None else lora_rank
         self.lora_enabled = False
+        self.quantization_config = quantization_config

     def build(self, input_shape):
         shape_data = _analyze_einsum_string(
@@ -164,7 +166,11 @@ def build(self, input_shape):
         self.full_output_shape = tuple(full_output_shape)
         self.input_spec = InputSpec(ndim=len(input_shape))
         if self.quantization_mode is not None:
-            self.quantized_build(kernel_shape, mode=self.quantization_mode)
+            self.quantized_build(
+                kernel_shape,
+                mode=self.quantization_mode,
+                config=self.quantization_config,
+            )
         # Skip creating a duplicate kernel variable when the layer is already
         # quantized to int8 or int4, because `quantized_build` has created the
         # appropriate kernel variable. For other modes (e.g., float8 or no
@@ -297,6 +303,11 @@ def save_own_variables(self, store):
                 target_variables.append(self.kernel_amax_history)
                 target_variables.append(self.outputs_grad_scale)
                 target_variables.append(self.outputs_grad_amax_history)
+            elif self.quantization_mode == "gptq":
+                target_variables.append(self.quantized_kernel)
+                target_variables.append(self.kernel_scale)
+                target_variables.append(self.kernel_zero)
+                target_variables.append(self.g_idx)
             else:
                 raise self._quantization_mode_error(self.quantization_mode)
         for i, variable in enumerate(target_variables):
@@ -323,6 +334,11 @@ def load_own_variables(self, store):
                 target_variables.append(self.kernel_amax_history)
                 target_variables.append(self.outputs_grad_scale)
                 target_variables.append(self.outputs_grad_amax_history)
+            elif self.quantization_mode == "gptq":
+                target_variables.append(self.quantized_kernel)
+                target_variables.append(self.kernel_scale)
+                target_variables.append(self.kernel_zero)
+                target_variables.append(self.g_idx)
             else:
                 raise self._quantization_mode_error(self.quantization_mode)
         for i, variable in enumerate(target_variables):
@@ -352,11 +368,25 @@ def get_config(self):
             "kernel_constraint": constraints.serialize(self.kernel_constraint),
             "bias_constraint": constraints.serialize(self.bias_constraint),
         }
+        if self.quantization_config:
+            config["quantization_config"] = self.quantization_config
         if self.lora_rank:
             config["lora_rank"] = self.lora_rank
             config["lora_alpha"] = self.lora_alpha
         return {**base_config, **config}

+    @classmethod
+    def from_config(cls, config, custom_objects=None):
+        config = config.copy()
+        from keras.src.saving import deserialize_keras_object
+
+        if "quantization_config" in config:
+            config["quantization_config"] = deserialize_keras_object(
+                config["quantization_config"],
+                custom_objects=custom_objects,
+            )
+        return cls(**config)
+
     def _check_load_own_variables(self, store):
         all_vars = self._trainable_variables + self._non_trainable_variables
         if len(store.keys()) != len(all_vars):
@@ -391,19 +421,19 @@ def _check_load_own_variables(self, store):
                 f"Expected: {[v.name for v in all_vars]}"
             )

-    # Quantization-related (int8 and float8) methods
-
-    def quantized_build(self, kernel_shape, mode, config):
+    def quantized_build(self, input_shape, mode, config=None):
         if mode == "int8":
-            self._int8_build(kernel_shape)
+            self._int8_build(input_shape)
         elif mode == "int4":
-            self._int4_build(kernel_shape)
+            self._int4_build(input_shape)
         elif mode == "float8":
             self._float8_build()
         elif mode == "gptq":
-            self._gptq_build(kernel_shape, config=config)
+            self._gptq_build(input_shape, config=config)
         else:
             raise self._quantization_mode_error(mode)
+        if config is not None:
+            self.quantization_config = config
         self._is_quantized = True

     def _int8_build(self, kernel_shape):
@@ -466,10 +496,10 @@ def _gptq_build(self, kernel_shape, config):
         else:
             raise ValueError("Could not determine row/column split.")

-        if config.group_size == -1:
+        if config["group_size"] == -1:
             n_groups = 1
         else:
-            n_groups = ceil(rows / config.group_size)
+            n_groups = ceil(rows / config["group_size"])

         if hasattr(self, "_set_quantization_info"):
             self._set_quantization_info()
@@ -965,7 +995,7 @@ def _get_kernel_with_merged_lora(self):
                 This is `None` if the layer is not quantized.
         """
         # If not a quantized layer, return the full-precision kernel directly.
-        if self.dtype_policy.quantization_mode is None:
+        if self.dtype_policy.quantization_mode in (None, "gptq"):
             return self.kernel, None

         # If quantized but LoRA is not enabled, return the original quantized

keras/src/quantizers/gptq_config.py
Lines changed: 21 additions & 0 deletions

@@ -157,3 +157,24 @@ def __init__(
         self.group_size = group_size
         self.symmetric = symmetric
         self.activation_order = activation_order
+
+    def __getitem__(self, key):
+        return getattr(self, key)
+
+    def get_config(self):
+        return {
+            "dataset": self.dataset,
+            "tokenizer": self.tokenizer,
+            "num_samples": self.num_samples,
+            "per_channel": self.per_channel,
+            "sequence_length": self.sequence_length,
+            "hessian_damping": self.hessian_damping,
+            "weight_bits": self.weight_bits,
+            "group_size": self.group_size,
+            "symmetric": self.symmetric,
+            "activation_order": self.activation_order,
+        }
+
+    @classmethod
+    def from_config(cls, config):
+        return cls(**config)
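
With these additions GPTQConfig behaves like a standard serializable Keras object, and __getitem__ forwards to attribute access, so the config["group_size"] lookups in the layers work on a GPTQConfig instance as well as on a plain dict. A quick round-trip sketch with placeholder constructor arguments:

from keras.quantizers import GPTQConfig  # public export path assumed here

# Placeholder arguments; real usage passes actual calibration data.
cfg = GPTQConfig(dataset=["calibration text"], tokenizer=None, group_size=128)

payload = cfg.get_config()               # plain dict of constructor kwargs
restored = GPTQConfig.from_config(payload)

# Dict-style and attribute access agree because __getitem__ uses getattr.
assert restored["group_size"] == restored.group_size == 128
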

keras/src/quantizers/gptq_test.py
Lines changed: 43 additions & 3 deletions

@@ -1,3 +1,4 @@
+import os
 from collections.abc import Callable

 import numpy as np
@@ -9,6 +10,7 @@
 from keras.src import backend
 from keras.src import layers
 from keras.src import ops
+from keras.src import saving
 from keras.src import testing
 from keras.src.quantizers.gptq import GPTQ
 from keras.src.quantizers.gptq import _stable_permutation
@@ -410,16 +412,31 @@ def _get_sequence_classifier():
     num_heads = 4
     ff_dim = 32

+    @keras.saving.register_keras_serializable(package="GPTQTest")
     class SimpleTransformerBlock(layers.Layer):
         def __init__(self, embed_dim, num_heads, ff_dim, **kwargs):
             super().__init__(**kwargs)
+            self.embed_dim = embed_dim
+            self.num_heads = num_heads
+            self.ff_dim = ff_dim
+
             self.att = layers.MultiHeadAttention(
-                num_heads=num_heads, key_dim=embed_dim // num_heads
+                num_heads=num_heads, key_dim=embed_dim // num_heads, **kwargs
             )
+            sub_kwargs = kwargs.copy()
+            sub_kwargs.pop("name", None)
             self.ffn = models.Sequential(
                 [
-                    layers.Dense(ff_dim, activation="relu", use_bias=True),
-                    layers.Dense(embed_dim, use_bias=True),
+                    layers.Dense(
+                        ff_dim,
+                        activation="relu",
+                        use_bias=True,
+                        name="ffn_dense_1",
+                        **kwargs,
+                    ),
+                    layers.Dense(
+                        embed_dim, use_bias=True, name="ffn_dense_2", **kwargs
+                    ),
                 ]
             )
             self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
@@ -431,6 +448,19 @@ def call(self, inputs):
             ffn_output = self.ffn(out1)
             return self.layernorm2(out1 + ffn_output)

+        def get_config(self):
+            base_config = super().get_config()
+            config = {
+                "embed_dim": self.embed_dim,
+                "num_heads": self.num_heads,
+                "ff_dim": self.ff_dim,
+            }
+            return {**base_config, **config}
+
+        @classmethod
+        def from_config(cls, config):
+            return cls(**config)
+
     inputs = layers.Input(shape=(SEQ_LEN,), dtype="int32")
     x = layers.Embedding(VOCAB_SIZE, embed_dim)(inputs)
     x = SimpleTransformerBlock(embed_dim, num_heads, ff_dim)(x)
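
The registration decorator plus the get_config/from_config pair added to SimpleTransformerBlock are what let a saved model containing the block be reloaded. The same pattern in isolation, on a hypothetical toy layer that is not part of the test file:

import keras
from keras import layers

@keras.saving.register_keras_serializable(package="GPTQTest")
class ScaleLayer(layers.Layer):
    def __init__(self, factor, **kwargs):
        super().__init__(**kwargs)
        self.factor = factor

    def call(self, inputs):
        return inputs * self.factor

    def get_config(self):
        # Record constructor arguments so the layer can be rebuilt on load.
        return {**super().get_config(), "factor": self.factor}

model = keras.Sequential([keras.Input(shape=(4,)), ScaleLayer(2.0)])
model.save("toy_scale.keras")
reloaded = keras.saving.load_model("toy_scale.keras")  # resolved via the registry
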
@@ -617,6 +647,16 @@ def test_quantize_gptq_combinations(self, dataset, config):
         )
         self.assertLessEqual(kl, 0.30, f"KL divergence too high: {kl:.3f}")

+        # Save and load the quantized model
+        temp_filepath = os.path.join(
+            self.get_temp_dir(), "quantized_model.keras"
+        )
+        model.save(temp_filepath)
+        loaded = saving.load_model(temp_filepath)
+        self.assertAllClose(
+            model.predict(x_eval), loaded.predict(x_eval), atol=1e-6
+        )
+
     @parameterized.named_parameters(
         {
             "testcase_name": "gptq_with_invalid_config",

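End to end, the new test lines cover quantize-then-save-then-reload: GPTQ-quantize the model, write a .keras archive, load it back, and check that predictions match. A rough sketch of that flow; model, gptq_config, and x_eval stand for objects built earlier in the test, and the exact keyword for passing the config to Model.quantize is an assumption:

import keras
import numpy as np

# `model`, `gptq_config`, and `x_eval` come from the surrounding test setup.
model.quantize("gptq", config=gptq_config)  # keyword name assumed

model.save("quantized_model.keras")
reloaded = keras.saving.load_model("quantized_model.keras")

# The GPTQ variables (quantized_kernel, kernel_scale, kernel_zero, g_idx)
# are restored by load_own_variables, so outputs should match closely.
np.testing.assert_allclose(
    model.predict(x_eval), reloaded.predict(x_eval), atol=1e-6
)
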