This repository was archived by the owner on Aug 30, 2024. It is now read-only.

Commit d355d70

[DOC] Fix cont-batching doc (#280)
* fix cont-batching doc

Signed-off-by: Yu, Zhentao <[email protected]>

* update

Signed-off-by: Yu, Zhentao <[email protected]>

---------

Signed-off-by: Yu, Zhentao <[email protected]>
1 parent: acfbc40 · commit: d355d70

File tree: 1 file changed, +3 -0 lines changed


docs/continuous_batching.md (+3)
@@ -50,6 +50,7 @@ You can use below codes to get the `token/second` metric if you care about the t
 ```python
 from transformers import AutoTokenizer
 from neural_speed import Model
+import time
 
 model_name = "meta-llama/Llama-2-7b-hf"
 prompts = [
@@ -64,6 +65,8 @@ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, pa
 # if the tokenizer has no pad_token, you can specify it.
 tokenizer.pad_token = tokenizer.eos_token
 pad_token_id = tokenizer.pad_token_id
+bs = len(prompts)
+print("batch_size is {}.".format(bs))
 inputs = tokenizer(prompts, padding=True, return_tensors='pt').input_ids
 
 model = Model()
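For readers following the doc change, below is a minimal, self-contained sketch of how the three added lines fit into the surrounding `docs/continuous_batching.md` example and support the `token/second` metric the section describes. The prompt strings and the `Model.init()`/`Model.generate()` arguments are illustrative assumptions, not taken from this diff; only the lines marked "added in this commit" come from the change above.

```python
# Sketch of the continuous-batching timing example after this commit.
# Prompts and Model.init()/Model.generate() arguments are assumed for
# illustration; the real values are elided or outside this diff.
import time

from transformers import AutoTokenizer
from neural_speed import Model

model_name = "meta-llama/Llama-2-7b-hf"
prompts = [
    "What is the meaning of life?",   # placeholder prompts; the real
    "Tell me a story about llamas.",  # list is elided in the diff
]

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side="left")
# if the tokenizer has no pad_token, you can specify it.
tokenizer.pad_token = tokenizer.eos_token
pad_token_id = tokenizer.pad_token_id
bs = len(prompts)                       # added in this commit
print("batch_size is {}.".format(bs))   # added in this commit
inputs = tokenizer(prompts, padding=True, return_tensors='pt').input_ids

model = Model()
model.init(model_name, use_quant=True, weight_dtype="int4", compute_dtype="int8")  # assumed args

# The `import time` added in this commit enables the token/second
# metric the doc section describes: time the batched generate call.
start = time.time()
outputs = model.generate(inputs, max_new_tokens=128, pad_token=pad_token_id)  # assumed args
elapsed = time.time() - start

generated = sum(len(seq) for seq in outputs)  # assumed output shape: list of token-id sequences
print("token/second: {:.2f}".format(generated / elapsed))
```

The commit's point is small but practical: `import time` makes the timing code in the example actually runnable, and printing `bs` makes the effective batch size visible when comparing throughput across prompt sets.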
