@@ -58,18 +58,10 @@ def __init__(self,
     def get_absorbed(self) -> Tuple[torch.Tensor, torch.Tensor]:
         if not (hasattr(self, 'q_absorb') and hasattr(self, 'out_absorb')):
             kv_b_proj = self.kv_b_proj.weight.view(self.num_heads, -1, self.kv_lora_rank)
-            q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].reshape(-1, self.kv_lora_rank)
-            out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].reshape(-1, self.kv_lora_rank)
-            self.q_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.qk_nope_head_dim,
-                                      bias=False, dtype=q_absorb.dtype, device=q_absorb.device)
-            self.q_absorb.weight.data = q_absorb
-            self.out_absorb = nn.Linear(self.kv_lora_rank, self.num_heads * self.v_head_dim,
-                                        bias=False, dtype=out_absorb.dtype, device=out_absorb.device)
-            self.out_absorb.weight.data = out_absorb
-            #del self.orig_module.kv_b_proj
-        q_absorb = self.q_absorb.weight.view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank)
-        out_absorb = self.out_absorb.weight.view(self.num_heads, self.v_head_dim, self.kv_lora_rank)
-        return q_absorb, out_absorb
+            self.q_absorb = kv_b_proj[:, :self.qk_nope_head_dim, :].view(self.num_heads, self.qk_nope_head_dim, self.kv_lora_rank)
+            self.out_absorb = kv_b_proj[:, self.qk_nope_head_dim:, :].view(self.num_heads, self.v_head_dim, self.kv_lora_rank)
+
+        return self.q_absorb, self.out_absorb
 
     def forward_chunck(
         self,
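
Reviewer note (not part of this diff): the new `get_absorbed()` no longer materializes separate `nn.Linear` modules for the absorption matrices; it slices the fused `kv_b_proj` weight and caches the resulting views, so no extra parameter memory is allocated. A minimal sketch with made-up dimensions, illustrating that the two absorption tensors are views into the same storage:

```python
# Minimal sketch with hypothetical dimensions; names mirror the diff but this
# is illustrative only, not the module's real state.
import torch

num_heads, qk_nope_head_dim, v_head_dim, kv_lora_rank = 4, 32, 32, 64
weight = torch.randn(num_heads * (qk_nope_head_dim + v_head_dim), kv_lora_rank)

kv_b_proj = weight.view(num_heads, -1, kv_lora_rank)
q_absorb = kv_b_proj[:, :qk_nope_head_dim, :]    # [heads, qk_nope_head_dim, kv_lora_rank]
out_absorb = kv_b_proj[:, qk_nope_head_dim:, :]  # [heads, v_head_dim, kv_lora_rank]

# The slices are views: zeroing the fused weight zeroes them too, confirming
# no additional parameter copies were created.
weight.zero_()
assert torch.all(q_absorb == 0) and torch.all(out_absorb == 0)
```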
@@ -105,7 +97,7 @@ def forward_chunck(
         if past_key_value is not None:
             if self.layer_idx is None:
                 raise ValueError(
-                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    f"The cache structure has changed since transformer version v4.36. If you are using {self.__class__.__name__} "
                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                     "with a layer index."
                 )
@@ -129,8 +121,6 @@ def forward_chunck(
         # compressed_kv [pages, page_size, 1, self.kv_lora_rank]
 
         q_absorb, out_absorb = self.get_absorbed()
-        # if hasattr(self.orig_module, 'kv_b_proj'):
-        #     del self.orig_module.kv_b_proj
 
         # q_nope [bsz, self.num_heads, q_len, self.qk_nope_head_dim]
         # q_pe [bsz, self.num_heads, q_len, self.qk_rope_head_dim]
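
Reviewer note (hypothetical sketch, not from this commit): the shape comments above hint at how `q_absorb` / `out_absorb` are consumed in the absorbed MLA attention path. The sketch below fills in that step with made-up dimensions; the RoPE component (`q_pe`), softmax scaling, masking, and the paged cache layout are all omitted, and every name here is an assumption rather than the module's actual code:

```python
# Illustrative only: dimensions are invented, and the real forward pass also
# handles the rotary (q_pe / k_pe) part plus the paged KV cache.
import torch

bsz, num_heads, q_len, kv_len = 1, 4, 8, 16
qk_nope_head_dim, v_head_dim, kv_lora_rank = 32, 32, 64

q_nope = torch.randn(bsz, num_heads, q_len, qk_nope_head_dim)
compressed_kv = torch.randn(bsz, kv_len, kv_lora_rank)            # latent KV entries
q_absorb = torch.randn(num_heads, qk_nope_head_dim, kv_lora_rank)
out_absorb = torch.randn(num_heads, v_head_dim, kv_lora_rank)

# Absorb the up-projection into the query so scores are computed in latent space:
# [bsz, heads, q_len, nope] @ [heads, nope, rank] -> [bsz, heads, q_len, rank]
q_latent = torch.matmul(q_nope, q_absorb)

# Scores against the compressed (latent) KV entries; scaling and causal mask omitted.
attn = torch.matmul(q_latent, compressed_kv.unsqueeze(1).transpose(-1, -2))
attn = torch.softmax(attn, dim=-1)

# Weighted sum in latent space, then project back out through out_absorb:
# [bsz, heads, q_len, kv_len] @ [bsz, 1, kv_len, rank] -> [bsz, heads, q_len, rank]
latent_out = torch.matmul(attn, compressed_kv.unsqueeze(1))
# [bsz, heads, q_len, rank] @ [heads, rank, v_head_dim] -> [bsz, heads, q_len, v_head_dim]
attn_output = torch.matmul(latent_out, out_absorb.transpose(-1, -2))
print(attn_output.shape)  # torch.Size([1, 4, 8, 32])
```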
@@ -227,7 +217,7 @@ def forward_linux_triton(
         if past_key_value is not None:
             if self.layer_idx is None:
                 raise ValueError(
-                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    f"The cache structure has changed since transformer version v4.36. If you are using {self.__class__.__name__} "
                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                     "with a layer index."
                 )
@@ -379,7 +369,7 @@ def forward_linux_flashinfer(
         if past_key_value is not None:
             if self.layer_idx is None:
                 raise ValueError(
-                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    f"The cache structure has changed since transformer version v4.36. If you are using {self.__class__.__name__} "
                     "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                     "with a layer index."
                 )