
Commit 8833d45

Implemented logit bias (#385)

* implemented logit bias
* fixed comment
* moved logit bias generation to `openai.py`
* removed unused code from config
* formatting
* formatting
* Revert "formatting". This reverts commit 5418326.
1 parent b49ff19 · commit 8833d45

File tree: 2 files changed, +43 −0 lines


src/autolabel/configs/config.py (+5)
```diff
@@ -29,6 +29,7 @@ class AutolabelConfig(BaseConfig):
     MODEL_NAME_KEY = "name"
     MODEL_PARAMS_KEY = "params"
     COMPUTE_CONFIDENCE_KEY = "compute_confidence"
+    LOGIT_BIAS_KEY = "logit_bias"
 
     # Embedding config keys (config["embedding"][<key>])
     EMBEDDING_PROVIDER_KEY = "provider"
@@ -124,6 +125,10 @@ def confidence(self) -> bool:
         """Returns true if the model is able to return a confidence score along with its predictions"""
         return self._model_config.get(self.COMPUTE_CONFIDENCE_KEY, False)
 
+    def logit_bias(self) -> bool:
+        """Returns true if the model is configured to use a logit bias"""
+        return self._model_config.get(self.LOGIT_BIAS_KEY, False)
+
     # Embedding config
     def embedding_provider(self) -> str:
         """Returns the name of the entity that provides the model used for computing embeddings"""
```

src/autolabel/models/openai.py (+38)
```diff
@@ -1,5 +1,6 @@
 from functools import cached_property
 from typing import List, Optional
+import logging
 
 from langchain.chat_models import ChatOpenAI
 from langchain.llms import OpenAI
@@ -11,6 +12,9 @@
 from autolabel.cache import BaseCache
 
 
+logger = logging.getLogger(__name__)
+
+
 class OpenAILLM(BaseModel):
     CHAT_ENGINE_MODELS = [
         "gpt-3.5-turbo",
```
```diff
@@ -76,6 +80,14 @@ def __init__(self, config: AutolabelConfig, cache: BaseCache = None) -> None:
 
         # populate model params and initialize the LLM
         model_params = config.model_params()
+        if config.logit_bias():
+            logit_bias = self._generate_logit_bias(config)
+            # if logit_bias or max_tokens is specified already, we don't want to overwrite it
+            model_params = {
+                **logit_bias,
+                **model_params,
+            }
+
         if self._engine == "chat":
             self.model_params = {**self.DEFAULT_PARAMS_CHAT_ENGINE, **model_params}
             self.llm = ChatOpenAI(model_name=self.model_name, **self.model_params)
```
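
Note the merge order here: because `model_params` is spread last, any `logit_bias` or `max_tokens` the user set explicitly in the config wins over the generated values, which is what the inline comment is guarding. A minimal standalone sketch of that precedence, with made-up values:

```python
# Later keys win in a dict merge, so user-supplied params override
# the generated ones. The values below are illustrative only.
generated = {"logit_bias": {15: 100}, "max_tokens": 1}
user_params = {"max_tokens": 4}

merged = {**generated, **user_params}
print(merged)  # {'logit_bias': {15: 100}, 'max_tokens': 4}
```

The generation itself happens in `_generate_logit_bias`, added below: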
```diff
@@ -86,6 +98,32 @@ def __init__(self, config: AutolabelConfig, cache: BaseCache = None) -> None:
             }
             self.llm = OpenAI(model_name=self.model_name, **self.model_params)
 
+    def _generate_logit_bias(self, config: AutolabelConfig) -> dict:
+        """Generates logit bias for the labels specified in the config
+
+        Args:
+            config (AutolabelConfig): AutolabelConfig object
+
+        Returns:
+            Dict: logit bias and max tokens
+        """
+        if len(config.labels_list()) == 0:
+            logger.warning(
+                "No labels specified in the config. Skipping logit bias generation."
+            )
+            return {}
+        encoding = tiktoken.encoding_for_model(self.model_name)
+        logit_bias = {}
+        max_tokens = 0
+        for label in config.labels_list():
+            if label not in logit_bias:
+                tokens = encoding.encode(label)
+                for token in tokens:
+                    logit_bias[token] = 100
+                max_tokens = max(max_tokens, len(tokens))
+
+        return {"logit_bias": logit_bias, "max_tokens": max_tokens}
+
     def _label(self, prompts: List[str]) -> LLMResult:
         if self._engine == "chat":
             # Need to convert list[prompts] -> list[messages]
```
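
To see what `_generate_logit_bias` actually builds, here is a standalone sketch of the same construction, assuming the hypothetical labels `["positive", "negative"]` and the `gpt-3.5-turbo` tokenizer (requires the `tiktoken` package). A bias of +100 is the maximum the OpenAI API accepts and effectively restricts sampling to the labels' tokens, while `max_tokens` is capped at the longest label's token count:

```python
# Standalone sketch of the bias construction above, using hypothetical labels.
import tiktoken

labels = ["positive", "negative"]
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

logit_bias = {}
max_tokens = 0
for label in labels:
    tokens = encoding.encode(label)
    for token in tokens:
        logit_bias[token] = 100  # +100 is the maximum bias the API allows
    max_tokens = max(max_tokens, len(tokens))

print({"logit_bias": logit_bias, "max_tokens": max_tokens})
```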
