Problem
I am using the Llama 3.1 model to produce structured output, and I want to count the output tokens using the exllama library. Can you suggest a method for this task?
The code snippet is shared below:
import json
import os
from typing import List, Tuple

from exllamav2 import ExLlamaV2, ExLlamaV2Cache_Q4, ExLlamaV2Config, ExLlamaV2Tokenizer
from exllamav2.generator import ExLlamaV2DynamicGenerator, ExLlamaV2Sampler
from exllamav2.generator.filters import ExLlamaV2PrefixFilter
# Assumed import path for the JSON-schema filter (lm-format-enforcer integration);
# adjust to match your project.
from lmformatenforcer.integrations.exllamav2 import ExLlamaV2TokenEnforcerFilter

# format_transcript_text, format_llama3_prompt, get_llama3_stop_conditions and
# mihup_system_prompt are project-specific helpers defined elsewhere.


class MihupExllamaLLM:
    def __init__(self, efs_model_path, llm_model_name):
        model_dir = efs_model_path + "/" + llm_model_name
        if not os.path.exists(model_dir):
            raise FileNotFoundError(f"Model directory not found at {model_dir}")

        config = ExLlamaV2Config(model_dir)
        self.model = ExLlamaV2(config)

        # Q4 KV cache; observed memory at other lengths: 32768 -> 8200 MB, 24576 -> 7900 MB
        self.cache = ExLlamaV2Cache_Q4(self.model, max_seq_len=256 * 96, lazy=True)
        self.model.load_autosplit(self.cache, progress=True)

        print("Loading tokenizer...")
        self.tokenizer = ExLlamaV2Tokenizer(config)

        self.generator = ExLlamaV2DynamicGenerator(
            model=self.model,
            cache=self.cache,
            tokenizer=self.tokenizer,
        )
        self.gen_settings = ExLlamaV2Sampler.Settings(
            temperature=0.0,               # set to 0 for deterministic output
            top_k=1,                       # only consider the most likely token
            top_p=1.0,                     # no nucleus sampling
            token_repetition_penalty=1.0,  # no repetition penalty
        )
        self.generator.warmup()

    def run_mihup_llm_inference(self, call_transcript: str, prompt_tuples: List[Tuple]) -> List[dict]:
        self.cache.reset()
        common_transcript = format_transcript_text(call_transcript)

        prompts = []
        filters = []
        use_case_ids = []

        for upper_tuple in prompt_tuples:
            use_case_id = upper_tuple[1]
            use_case_ids.append(use_case_id)

            p = upper_tuple[0]
            prompt_str = p[0]
            prompt_question_combined = format_llama3_prompt(mihup_system_prompt, common_transcript + prompt_str)
            prompts.append(prompt_question_combined)

            # One filter list per prompt: enforce the JSON schema and require the
            # completion to start with an opening brace.
            filter_schema_parser = p[1]
            filters.append([
                ExLlamaV2TokenEnforcerFilter(filter_schema_parser, self.tokenizer),
                ExLlamaV2PrefixFilter(self.model, self.tokenizer, ["{", " {"]),
            ])

        outputs = self.generator.generate(
            prompt=prompts,
            filters=filters,
            filter_prefer_eos=True,
            max_new_tokens=2048,
            add_bos=True,
            stop_conditions=get_llama3_stop_conditions(self.tokenizer),
            gen_settings=self.gen_settings,
            completion_only=True,
            encode_special_tokens=True,
        )

        # Parse each completion as JSON, skipping any that fail to parse.
        final_output = []
        use_case_ids_to_be_considered = []
        for i in range(len(outputs)):
            try:
                output_json = json.loads(outputs[i])
                final_output.append(output_json)
                use_case_ids_to_be_considered.append(use_case_ids[i])
            except ValueError:
                print("error: {0} , use_case_id :{1}".format(outputs[i], use_case_ids[i]))

        # Attach the originating use_case_id to each parsed result.
        use_case_id_key = "use_case_id"
        for idx in range(len(final_output)):
            final_output[idx][use_case_id_key] = use_case_ids_to_be_considered[idx]

        return final_output
Solution
Please suggest an existing method for counting the generated tokens.
Alternatives
No response
Explanation
Since response time depends largely on the number of output tokens, I need this feature.
Examples
No response
Additional context
No response
Acknowledgements
I have looked for similar requests before submitting this one.
I understand that the developers have lives and my issue will be answered when possible.
I understand the developers of this program are human, and I will make my requests politely.
Not sure if this is what you're after, but if you add return_last_results = True to the call to generator.generate, you'll get some extra feedback including the number of tokens generated.
results will be a dictionary (or list of dictionaries, if you're batching), with the following information:
{
    "job": ExLlamaV2DynamicJob          # reference to job, not valid after generation has ended
    "stage": str                        # "streaming"
    "identifier": object                # optional identifier used in streaming mode
    "serial": int                       # job serial number
    "eos": bool                         # True since generation is complete
    "eos_reason": str                   # either "stop_token", "stop_string", "max_new_tokens", "end_filter"
    "eos_triggering_token_id": int      # present if eos_reason == "stop_token"
    "eos_triggering_token_str": str     # present if eos_reason == "stop_token"
    "eos_triggering_string": str        # present if eos_reason == "stop_string"
    "full_completion": str              # full text completion
    "new_tokens": int                   # number of tokens generated  <----
    "time_enqueued": float              # time from job was enqueued until it started, in seconds
    "time_prefill": float               # time to first token, in seconds
    "time_generate": float              # time to last token, in seconds
    "accepted_draft_tokens": int        # SD metrics, present if SD enabled
    "rejected_draft_tokens": int        # SD metrics, present if SD enabled
    "text": str                         # last chunk of decoded text
    "token_ids": torch.Tensor           # last chunk of sampled token IDs
    "token_probs": torch.Tensor         # last chunk of sampled token probabilities, if return_probs == True
    "top_k_tokens": torch.Tensor        # last chunk of top-K token IDs, if return_top_tokens > 0
    "top_k_probs": torch.Tensor         # last chunk of top-K probabilities, if return_top_tokens > 0
    "logits": torch.Tensor              # last chunk of logits, if return_logits == True
}
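A minimal sketch of how the generate() call in run_mihup_llm_inference above might be adapted to read new_tokens per prompt, assuming generate() returns a (completions, results) pair when return_last_results=True, with one result dict per batched prompt:

# Hedged sketch: the same call as above, with return_last_results=True added.
# Assumes the return value becomes a (completions, results) pair, with one
# result dict per batched prompt.
outputs, results = self.generator.generate(
    prompt=prompts,
    filters=filters,
    filter_prefer_eos=True,
    max_new_tokens=2048,
    add_bos=True,
    stop_conditions=get_llama3_stop_conditions(self.tokenizer),
    gen_settings=self.gen_settings,
    completion_only=True,
    encode_special_tokens=True,
    return_last_results=True,
)

# Output-token count per use case, taken from the "new_tokens" field.
for use_case_id, result in zip(use_case_ids, results):
    print(f"use_case_id {use_case_id}: {result['new_tokens']} output tokens "
          f"in {result['time_generate']:.2f} s")

If changing the call is not an option, re-encoding each completion with self.tokenizer.encode(outputs[i]) and taking the length of the returned tensor should give a comparable count, although special-token handling may make it differ slightly from new_tokens.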