6
6
from typing import TYPE_CHECKING
7
7
if TYPE_CHECKING :
8
8
from exllamav2 .model import ExLlamaV2
9
-
9
+ from exllamav2 import ExLlamaV2Tokenizer
10
10
11
11
class ExLlamaV2CacheBase :
12
12
@@ -204,6 +204,10 @@ def all_tensors(self):
204
204
raise NotImplementedError ()
205
205
206
206
207
def reset(self):
    """
    Rewind the cache to an empty state.

    Only the logical sequence position is cleared; the underlying key/value
    tensors stay allocated and are simply overwritten on the next forward pass.
    """
    self.current_seq_len = 0
207
211
class ExLlamaV2Cache (ExLlamaV2CacheBase ):
208
212
"""
209
213
FP16 cache
@@ -348,27 +352,31 @@ def all_tensors(self):
348
352
return self .key_states + self .value_states
349
353
350
354
351
- class ExLlamaV2Cache_Q4 (ExLlamaV2CacheBase ):
355
+ class ExLlamaV2Cache_Q (ExLlamaV2CacheBase ):
352
356
"""
353
- Q4 cache. Uses grouped RTN quantization for keys/values
357
+ Q cache. Uses grouped RTN quantization for keys/values
354
358
"""
355
359
360
+ wbits : int
361
+
356
362
def __init__ (self ,
357
363
model : ExLlamaV2 ,
358
364
batch_size : int = 1 ,
359
365
max_seq_len : int = - 1 ,
360
366
copy_from : ExLlamaV2Cache_Q4 | None = None ,
361
- lazy : bool = False ):
367
+ lazy : bool = False ,
368
+ weights_per_byte : int = - 1 ):
362
369
363
- super ().__init__ (model , batch_size , max_seq_len , torch .uint8 , 2 , True )
370
+ super ().__init__ (model , batch_size , max_seq_len , torch .uint8 , weights_per_byte , True )
371
+ cfg = self .model .config
364
372
365
373
self .create_state_tensors (copy_from , lazy )
366
374
367
- # Models with odd key/value dims need to to quantize/dequantize in multi-token blocks. Make sure the quant
375
+ # Models with odd key/value dims need to quantize/dequantize in multi-token blocks. Make sure the quant
368
376
# blocksize aligns with a whole number of tokens
369
377
370
378
Q_CACHE_BLOCKSIZE_Q = 512
371
- kv_dim = model . config . num_key_value_heads * model . config .head_dim
379
+ kv_dim = cfg . num_key_value_heads * cfg .head_dim
372
380
self .q_block = 1
373
381
while (kv_dim * self .q_block ) % Q_CACHE_BLOCKSIZE_Q :
374
382
self .q_block += 1
@@ -380,6 +388,14 @@ def __init__(self,
380
388
if not lazy :
381
389
for device in self .model .get_cache_devices (): self .touch_device (device )
382
390
391
+ # Calibration mode
392
+
393
+ self .calibrated = False
394
+ self .calibrating = False
395
+ self .calibration_rows = [0 ] * cfg .num_hidden_layers
396
+ self .calibration_k = {}
397
+ self .calibration_v = {}
398
+
383
399
384
400
def touch_device (self , device ):
385
401
@@ -410,7 +426,7 @@ def get_kv_state(self,
410
426
offset = a
411
427
width = b - a
412
428
413
- ext_c .q4_to_fp16_kv (
429
+ ext_c .q_to_fp16_kv (
414
430
self .key_states [layer_idx ],
415
431
temp_key_state ,
416
432
self .key_scales [layer_idx ],
@@ -422,8 +438,18 @@ def get_kv_state(self,
422
438
width ,
423
439
page_size ,
424
440
cache_seqlens if cache_seqlens is not None else none_tensor ,
425
- block_table if block_table is not None else none_tensor
441
+ block_table if block_table is not None else none_tensor ,
442
+ # none_tensor,
443
+ # none_tensor
444
+ self .calibration_k [layer_idx ] if self .calibrated else none_tensor ,
445
+ self .calibration_v [layer_idx ] if self .calibrated else none_tensor ,
446
+ self .wbits
426
447
)
448
+
449
+ # if self.calibrated:
450
+ # temp_key_state *= self.calibration_k[layer_idx]
451
+ # temp_value_state *= self.calibration_v[layer_idx]
452
+
427
453
return temp_key_state , temp_value_state
428
454
429
455
@@ -448,7 +474,12 @@ def store_kv_state(self,
448
474
449
475
device = self .model .cache_map [layer_idx ]
450
476
temp_key_state , temp_value_state = self .temp_tensors [device ]
451
- ext_c .fp16_to_q4_kv (
477
+
478
+ # if self.calibrated:
479
+ # temp_key_state /= self.calibration_k[layer_idx]
480
+ # temp_value_state /= self.calibration_v[layer_idx]
481
+
482
+ ext_c .fp16_to_q_kv (
452
483
temp_key_state ,
453
484
self .key_states [layer_idx ],
454
485
self .key_scales [layer_idx ],
@@ -460,9 +491,43 @@ def store_kv_state(self,
460
491
width ,
461
492
page_size ,
462
493
cache_seqlens if cache_seqlens is not None else none_tensor ,
463
- block_table if block_table is not None else none_tensor
494
+ block_table if block_table is not None else none_tensor ,
495
+ # none_tensor,
496
+ # none_tensor
497
+ self .calibration_k [layer_idx ] if self .calibrated else none_tensor ,
498
+ self .calibration_v [layer_idx ] if self .calibrated else none_tensor ,
499
+ self .wbits
464
500
)
465
501
502
+ # Collect calibration data
503
+
504
+ if self .calibrating :
505
+
506
+ cfg = self .model .config
507
+
508
+ if layer_idx not in self .calibration_k :
509
+ self .calibration_k [layer_idx ] = torch .zeros (
510
+ (cfg .num_key_value_heads , cfg .head_dim ,),
511
+ dtype = torch .float ,
512
+ device = temp_key_state .device
513
+ )
514
+ self .calibration_v [layer_idx ] = torch .zeros (
515
+ (cfg .num_key_value_heads , cfg .head_dim ,),
516
+ dtype = torch .float ,
517
+ device = temp_key_state .device
518
+ )
519
+
520
+ b , l , h , d = temp_key_state .shape
521
+ cal_k = self .calibration_k [layer_idx ]
522
+ cal_v = self .calibration_v [layer_idx ]
523
+ cal_k_input = temp_key_state [:, offset :offset + width , :, :].view (b * width , h * d )
524
+ cal_v_input = temp_value_state [:, offset :offset + width , :, :].view (b * width , h * d )
525
+ cal_k_sum = torch .norm (cal_k_input , p = 1 , dim = 0 , dtype = torch .float )
526
+ cal_v_sum = torch .norm (cal_v_input , p = 1 , dim = 0 , dtype = torch .float )
527
+ cal_k .add_ (cal_k_sum .view (h , d ))
528
+ cal_v .add_ (cal_v_sum .view (h , d ))
529
+ self .calibration_rows [layer_idx ] += width
530
+
466
531
467
532
def footprint (self ) -> list [int ]:
468
533
@@ -491,3 +556,69 @@ def all_tensors(self):
491
556
return self .key_states + self .value_states + self .key_scales + self .value_scales
492
557
493
558
559
def calibrate(self,
              tokenizer: "ExLlamaV2Tokenizer",
              num_batches = 8,
              num_samples_per_batch = 256
):
    """
    Unfinished / experimental.

    Run a number of forward passes over uniformly random token IDs while collecting per-channel
    L1 statistics of the cached keys/values (accumulated in store_kv_state while
    self.calibrating is True), then reduce them into per-channel scale factors stored in
    self.calibration_k / self.calibration_v.

    self.calibrated is deliberately left False at the end, so the computed scales are not yet
    applied by get_kv_state / store_kv_state.

    :param tokenizer: used only for its vocab size when sampling random input IDs
    :param num_batches: number of calibration forward passes
    :param num_samples_per_batch: tokens per pass; cache max_seq_len must be at least this
    """

    assert self.max_seq_len >= num_samples_per_batch, \
        f"Cache max_seq_len must be at least {num_samples_per_batch} to calibrate."

    # Enable stat collection in store_kv_state(); fixed seed for reproducible calibration
    self.calibrating = True
    torch.manual_seed(123)

    for _ in range(num_batches):

        input_ids = torch.randint(
            low = 0,
            high = tokenizer.get_vocab_size() - 1,
            size = (1, num_samples_per_batch),
            dtype = torch.long
        )

        self.reset()
        self.model.forward(input_ids, preprocess_only = True, cache = self)

    self.calibrating = False

    # Reduce accumulated per-channel L1 sums to scale factors: mean magnitude per channel,
    # heavily dampened by an eighth root.
    for i in range(self.model.config.num_hidden_layers):
        cal_k = self.calibration_k[i] / self.calibration_rows[i]
        cal_v = self.calibration_v[i] / self.calibration_rows[i]
        cal_k = cal_k ** (1 / 8)
        cal_v = cal_v ** (1 / 8)
        # NOTE(review): scales are negated before storing — presumably a sentinel/convention
        # expected by the quant kernels; confirm against ext_c.q_to_fp16_kv
        cal_k = cal_k.half() * (-1)
        cal_v = cal_v.half() * (-1)
        self.calibration_k[i] = cal_k
        self.calibration_v[i] = cal_v
    # self.calibrated = True  # intentionally disabled while this feature is unfinished
599
+
600
+
601
class ExLlamaV2Cache_Q4(ExLlamaV2Cache_Q):
    """
    Q4 cache: grouped RTN-quantized keys/values at four bits per weight,
    i.e. two quantized weights packed into each byte of cache storage.
    """

    def __init__(self,
                 model: ExLlamaV2,
                 batch_size: int = 1,
                 max_seq_len: int = -1,
                 copy_from: ExLlamaV2Cache_Q4 | None = None,
                 lazy: bool = False):

        # Two weights per byte -> 4-bit quantization
        super().__init__(model, batch_size, max_seq_len, copy_from, lazy, weights_per_byte = 2)
        self.wbits = 4
612
+
613
+
614
class ExLlamaV2Cache_Q8(ExLlamaV2Cache_Q):
    """
    Q8 cache: grouped RTN-quantized keys/values at eight bits per weight,
    i.e. one quantized weight per byte of cache storage.
    """

    def __init__(self,
                 model: ExLlamaV2,
                 batch_size: int = 1,
                 max_seq_len: int = -1,
                 # Fixed annotation: was ExLlamaV2Cache_Q4 (copy-paste from the Q4 subclass);
                 # a Q8 cache is copied from another Q8 cache
                 copy_from: ExLlamaV2Cache_Q8 | None = None,
                 lazy: bool = False):

        # One weight per byte -> 8-bit quantization
        super().__init__(model, batch_size, max_seq_len, copy_from, lazy, 1)
        self.wbits = 8