fix: Zero division in inverse estimator functions (#65)

maxdebayser · web-flow · commit 08573c054f67 · 2024-03-20T20:55:27.000-07:00
#### Motivation

Fixes this warning:

```
packages/text_generation_server/utils/memory_characterizer.py:71: RuntimeWarning: invalid value encountered in scalar divide
Shard 0:   return (np.sqrt(c0**2 + 4*c1*(mem/batch)) - c0)/(2*c1)
```

#### Modifications

When the memory characterizer doesn't find linear or quadratic
behavior, the coefficients are set to zero, resulting in
division-by-zero errors in the inverse functions.
    
This commit detects that situation and returns the maximum float
(`sys.float_info.max`) instead, to be consistent with the semantics of
the memory estimator.

Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
diff --git a/server/text_generation_server/utils/memory_characterizer.py b/server/text_generation_server/utils/memory_characterizer.py
@@ -72,17 +72,21 @@ def inverse_quadratic_prefill(self, batch, mem):
         return (np.sqrt(c0**2 + 4*c1*(mem/batch)) - c0)/(2*c1)
 
     def inverse_prefill(self,batch, mem):
-        linear = self.inverse_linear_prefill(batch,mem)
-        quad   = self.inverse_quadratic_prefill(batch, mem)
+        linear = self.inverse_linear_prefill(batch,mem)     if self.linear_fit_params[0]    != 0.0 else sys.float_info.max
+        quad   = self.inverse_quadratic_prefill(batch, mem) if self.quadratic_fit_params[1] != 0.0 else sys.float_info.max
         return min(linear, quad)
 
     def nt_memory_usage(self, batch_size, input_len, output_len):
         return batch_size * self.next_token_params[0] * input_len + batch_size * self.next_token_params[1] * output_len
 
     def inverse_next_token_output(self, batch, in_seq, mem):
+        if self.next_token_params[1] == 0.0:
+            return sys.float_info.max
         return (mem - self.next_token_params[0]*batch*in_seq)/(batch*self.next_token_params[1])
 
     def inverse_next_token_input(self, batch, out_seq, mem):
+        if self.next_token_params[0] == 0.0:
+            return sys.float_info.max
         return (mem - self.next_token_params[1]*batch*out_seq)/(batch*self.next_token_params[0])
     
     def max_input_len_for_prefill(self, batch_size, max_input_len):