TablewareBox
diff --git a/‎evals/elsuite/basic/match.py
Lines changed: 7 additions & 0 deletions b/‎evals/elsuite/basic/match.py
Lines changed: 7 additions & 0 deletions
diff --git a/‎evals/elsuite/rag_match.py
Lines changed: 56 additions & 14 deletions b/‎evals/elsuite/rag_match.py
Lines changed: 56 additions & 14 deletions
diff --git a/‎evals/elsuite/utils.py
Lines changed: 139 additions & 76 deletions b/‎evals/elsuite/utils.py
Lines changed: 139 additions & 76 deletions
@@ -49,10 +49,17 @@ def eval_sample(self, sample: Any, *_):
         )
         sampled = result.get_completions()[0]
 
+        extras = {}
+        if hasattr(result, "extras"):
+            if "extracted_answer" in result.extras:
+                sampled = result.extras["extracted_answer"].rstrip(".")
+            extras = result.extras
+
         return evals.record_and_check_match(
             prompt=prompt,
             sampled=sampled,
             expected=sample["ideal"],
+            **extras
         )
 
     def run(self, recorder):
 
@@ -9,6 +9,7 @@
 import evals.metrics
 from evals.api import CompletionFn
 from evals.prompt.base import is_chat_prompt
+from evals.utils.misc import make_object
 
 
 def init_oss():
@@ -69,6 +70,9 @@ def __init__(
         max_tokens: int = 500,
         num_few_shot: int = 0,
         few_shot_jsonl: str = None,
+        func_postprocess_answer: str = None,
+        func_comparison: str = None,
+        record_match_threshold: float = -1,
         **kwargs,
     ):
         super().__init__(completion_fns, *args, **kwargs)
@@ -81,6 +85,10 @@ def __init__(
             self.few_shot_jsonl = few_shot_jsonl
             self.few_shot = evals.get_jsonl(self._prefix_registry_path(self.few_shot_jsonl))
 
+        self.func_postprocess_answer = make_object(func_postprocess_answer) if func_postprocess_answer else None
+        self.func_comparison = make_object(func_comparison) if func_comparison else None
+        self.record_match_threshold = record_match_threshold
+
     def eval_sample(self, sample: Any, *_):
         assert isinstance(sample, dict), "sample must be a dict"
         assert "input" in sample, "sample must have an 'input' key"
@@ -102,27 +110,61 @@ def eval_sample(self, sample: Any, *_):
             temperature=0.0,
             **{k: v for k, v in sample.items() if k not in ["input", "ideal"]}
         )
-        sampled = result.get_completions()[0]
+        sampled = result.get_completions()[0].strip()
 
-        extras = {}
+        extras = {"file_name": sample["file_name"], "file_link": sample["file_link"]} if "file_name" in sample else {}
         if hasattr(result, "extras"):
             if "extracted_answer" in result.extras:
                 sampled = result.extras["extracted_answer"].rstrip(".")
             extras = result.extras
-
-        return evals.record_and_check_match(
-            prompt=prompt,
-            sampled=sampled,
-            expected=sample["ideal"],
-            file_name=sample["file_name"],
-            **extras
-        )
+        else:
+            extras["answer"] = sampled
+
+        if self.func_postprocess_answer:
+            extras["answer"] = sampled
+            sampled = extras["extracted_answer"] = self.func_postprocess_answer(sampled)
+        
+        if self.func_comparison:
+            metrics = self.func_comparison(sampled, sample["ideal"][0])
+            if type(metrics) == bool:
+                evals.record.record_match(correct=metrics,
+                                          expected=sample["ideal"],
+                                          picked=sampled, sampled=extras["answer"],
+                                          prompt=prompt,
+                                          **extras)
+            else:
+                evals.record.record_metrics(**metrics)
+                if self.record_match_threshold > 0:
+                    evals.record.record_match(correct=metrics["score"] >= self.record_match_threshold,
+                                              **metrics,
+                                              expected=sample["ideal"],
+                                              picked=sampled, sampled=extras["answer"],
+                                              prompt=prompt,
+                                              **extras)
+        else:
+            return evals.record_and_check_match(
+                prompt=prompt,
+                sampled=sampled,
+                expected=sample["ideal"],
+                **extras
+            )
 
     def run(self, recorder):
+        """Evaluate all samples and aggregate the recorded events into a report.
+
+        ``accuracy`` and ``bootstrap_std`` are reported only when at least one
+        "match" event was recorded. ``score`` is the mean of every per-sample
+        ``score`` metric that is not ``None`` and is reported only when at
+        least one such metric exists.
+        """
         samples = get_rag_dataset(self._prefix_registry_path(self.samples_jsonl).as_posix())
         self.eval_all_samples(recorder, samples)
+
         events = recorder.get_events("match")
-        return {
-            "accuracy": evals.metrics.get_accuracy(events),
-            "boostrap_std": evals.metrics.get_bootstrap_accuracy_std(events),
-        }
+        record_metrics = {}
+        # Accuracy statistics are only computed when match events were recorded.
+        if len(events) > 0:
+            record_metrics["accuracy"] = evals.metrics.get_accuracy(events)
+            record_metrics["bootstrap_std"] = evals.metrics.get_bootstrap_accuracy_std(events)
+
+        # Average the per-sample "score" metrics, skipping entries without one.
+        per_sample_scores = [
+            entry["score"]
+            for entry in recorder.get_metrics()
+            if entry.get("score") is not None
+        ]
+        if per_sample_scores:
+            record_metrics["score"] = sum(per_sample_scores) / len(per_sample_scores)
+
+        return record_metrics
@@ -123,6 +123,8 @@
 }
 
 
+# Highlight: Part 1. Post-Processing Functions for LLM Outputs
+
 def get_answer(text, answer_prompt, ignore_case=False):
     if ignore_case:
         idx = text.lower().rfind(answer_prompt.lower())
@@ -142,22 +144,6 @@ def get_consensus(answers):
     return max(counts, key=counts.get)
 
 
-def compare_molecule(smi1, smi2) -> bool:
-    from rdkit import Chem
-    from rdkit.Chem import AllChem
-
-    mol1 = Chem.MolFromSmiles(smi1)
-    mol2 = Chem.MolFromSmiles(smi2)
-    if mol1 is None or mol2 is None:
-        return False
-    else:
-        return Chem.MolToSmiles(Chem.RemoveHs(mol1)) == Chem.MolToSmiles(Chem.RemoveHs(mol2))
-    # return False
-    # fp1 = AllChem.GetMorganFingerprint(mol1, 2)
-    # fp2 = AllChem.GetMorganFingerprint(mol2, 2)
-    # return DataStructs.TanimotoSimilarity(fp1, fp2)
-
-
 def normalize(s: str) -> str:
     """Lower text and remove punctuation, articles and extra whitespace."""
     s = s.lower()
@@ -168,6 +154,87 @@ def normalize(s: str) -> str:
     return s
 
 
+def fuzzy_normalize_name(s):
+    """Normalize a name string for fuzzy comparison.
+
+    Names that start with ``"Unnamed"`` normalize to the empty string.
+    Otherwise every character that is not a word character, whitespace, or
+    one of ``% . - ( )`` is removed, the cleaned text is replaced by its
+    entry in the module-level ``synonyms`` lookup when it has one, and the
+    measurement keywords listed below are moved to the end of the
+    whitespace-separated word list.
+    """
+    if s.startswith("Unnamed"):
+        return ""
+
+    # Measurement keywords that get moved to the end of the name.
+    keywords = ["pIC50", "IC50", "EC50", "TC50", "GI50", "Ki", "Kd", "Kb", "pKb"]
+
+    # Strip every character except word characters, whitespace and % . - ( ).
+    cleaned = re.sub(r'[^\w\s%.\-\(\)]', '', s)
+    if cleaned in synonyms:
+        cleaned = synonyms[cleaned]
+
+    # Partition the words in one pass, keeping the original relative order
+    # inside each group, then put the keywords last.
+    leading, trailing = [], []
+    for word in cleaned.split():
+        (trailing if word in keywords else leading).append(word)
+    return ' '.join(leading + trailing)
+
+
+def fuzzy_normalize_value(vi):
+    """Normalize a table cell value for fuzzy comparison.
+
+    The value is stringified and lower-cased, then:
+
+    * text containing ``"bal"``, ``"remainder"`` or ``"bas"`` becomes ``"bal"``;
+    * missing-value markers (``"nan"``, ``"n/a"``, ``"/"``, the empty string,
+      or any text containing ``"na"``) become ``"0"``;
+    * exactly two numbers in the text are joined as ``"a-b"``; exactly one
+      number replaces the whole text;
+    * ``<`` and ``>`` are removed.
+
+    Returns:
+        ``"bal"``, a float rounded to 3 decimals when the result parses as a
+        number, or the processed string otherwise. If the input cannot be
+        converted to a string it is returned unchanged.
+    """
+    try:
+        text = str(vi).lower()
+    except Exception:
+        # Best effort: a value that cannot be stringified is passed through.
+        return vi
+
+    if any(marker in text for marker in ("bal", "remainder", "bas")):
+        return "bal"
+
+    # NOTE(review): `"na" in text` is a substring test, so it also fires for
+    # any text that merely contains "na" and makes the "nan without dash"
+    # clause redundant. Kept as-is to preserve behaviour; confirm whether an
+    # exact `text == "na"` comparison was intended.
+    if ("nan" in text and "–" not in text) or text == "/" or "n/a" in text or "na" in text or text == "":
+        text = "0"
+    text = text.replace("nan", "–").replace("~", "-")
+
+    numbers = re.findall(r"\d+(?:\.\d+)?", text)
+    if len(numbers) == 2:
+        text = f"{numbers[0]}-{numbers[1]}"
+    elif len(numbers) == 1:
+        text = numbers[0]
+
+    text = text.replace("<", "").replace(">", "")
+
+    try:
+        return round(float(text), 3)
+    except ValueError:
+        # Not a plain number (e.g. a range such as "5-10"): keep the text.
+        return text
+
+
+def extract_choice_and_value(sampled):
+    """Extract the first ``<choice>) <value>`` answer from a model completion.
+
+    The pattern looks for a single word character followed by ``)``, one
+    whitespace character and a number, optionally followed by ``: <number>``
+    and an optional ``°``/``C``/``K`` suffix.
+
+    Returns:
+        The first match with ``°`` replaced by a space and double spaces
+        collapsed once, or the string ``"No answer."`` when nothing matches.
+    """
+    pattern = re.compile(r'\w\)\s\d+(?:\.\d+)?(?:\s?:\s?\d+(?:\.\d+)?)?\s?[°]?[CK]?')
+    # The pattern has no capturing groups, so the first search hit is exactly
+    # what findall()[0] would return; a single scan is enough.
+    found = pattern.search(sampled)
+    if found is None:
+        return "No answer."
+    return found.group(0).replace("°", " ").replace("  ", " ")
+
+# Part 2. Comparison Functions for Post-Processed LLM Outputs
+
 def fuzzy_match(s1: str, s2: str) -> bool:
     s1 = normalize(s1)
     s2 = normalize(s2)
@@ -264,69 +331,32 @@ def is_float(str):
             pass
 
 
-def fuzzy_normalize_name(s):
-    if s.startswith("Unnamed"):
-        return ""
-    else:
-        """ 标准化字符串 """
-        # # 定义需要移除的单位和符号
-        # units = ["µM", "µg/mL", "nM", "%", "wt.%", "at.%", "at%", "wt%"]
-        # for unit in units:
-        #     s = s.replace(unit, "")
-
-        # 定义特定关键字
-        keywords = ["pIC50", "IC50", "EC50", "TC50", "GI50", "Ki", "Kd", "Kb", "pKb"]
-
-        # 移除非字母数字的字符，除了空格
-        s = re.sub(r'[^\w\s%.\-\(\)]', '', s)
-        if s in synonyms:
-            s = synonyms[s]
-
-        # 分割字符串为单词列表
-        words = s.split()
-
-        # 将关键字移到末尾
-        reordered_words = [word for word in words if word not in keywords]
-        keywords_in_string = [word for word in words if word in keywords]
-        reordered_words.extend(keywords_in_string)
-        # 重新组合为字符串
-        return ' '.join(reordered_words)
-
-
-def fuzzy_normalize_value(vi):
-    try:
-        vi = str(vi).lower()
-
-        if "bal" in vi or "remainder" in vi or "bas" in vi:
-            vi = "bal"
-            return "bal"
+def compare_molecule_similarity(smi1, smi2) -> dict:
+    """Score two SMILES strings by Morgan-fingerprint Tanimoto similarity.
+
+    Each input is stringified, stripped of surrounding backticks and of any
+    ``<...>`` tags before being parsed with RDKit.
+
+    Returns:
+        ``{"score": sim}`` where ``sim`` is the Tanimoto similarity of the two
+        fingerprints, or ``0.0`` when either string fails to parse.
+    """
+    from rdkit import Chem
+    from rdkit.Chem import AllChem
+    from rdkit import DataStructs
 
-        if ("nan" in vi and not "–" in vi) or "/" == vi or "n/a" in vi or "na" in vi or vi == "":
-            vi = "0"
-        vi = vi.replace("nan", "–").replace("~", "-")
+    # Non-greedy match: remove each <...> tag on its own. A greedy `<.*>`
+    # would also delete the text between an opening and a closing tag.
+    mol1 = Chem.MolFromSmiles(re.sub(r'<.*?>', '', str(smi1).strip("`")))
+    mol2 = Chem.MolFromSmiles(re.sub(r'<.*?>', '', str(smi2).strip("`")))
 
-        pattern = r"\d+(?:\.\d+)?"
-        matches = re.findall(pattern, vi)
-        if len(matches) == 2:
-            vi = f"{matches[0]}-{matches[1]}"
-        elif len(matches) == 1:
-            vi = matches[0]
+    if mol1 is None or mol2 is None:
+        # An unparsable molecule counts as completely dissimilar.
+        sim = 0.0
+    else:
+        fp1 = AllChem.GetMorganFingerprint(mol1, 2)
+        fp2 = AllChem.GetMorganFingerprint(mol2, 2)
+        sim = DataStructs.TanimotoSimilarity(fp1, fp2)
+    return {"score": sim}
 
-        if "<" in vi:
-            vi = vi.replace("<", "")
-        if ">" in vi:
-            vi = vi.replace(">", "")
 
-        try:
-            vi = float(vi)
-            vi = round(vi, 3)
-        except:
-            # print(vi)
-            pass
-    except:
-        pass
+def compare_molecule_strict(smi1, smi2) -> bool:
+    """Return True when both SMILES parse and share one canonical SMILES.
+
+    Each molecule has its hydrogens removed before it is written back to
+    SMILES for the comparison. Returns False when either input fails to parse.
+    """
+    from rdkit import Chem
 
-    return vi
+    parsed = [Chem.MolFromSmiles(smiles) for smiles in (smi1, smi2)]
+    if any(mol is None for mol in parsed):
+        return False
+    canonical = [Chem.MolToSmiles(Chem.RemoveHs(mol)) for mol in parsed]
+    return canonical[0] == canonical[1]
 
 
 def tableMatching(df_ref, df_prompt, index='Compound', compare_fields=[], record=True, file_name=None):
@@ -350,7 +380,7 @@ def match_indices(ind0, ind1, threshold=0.9) -> dict:
         Match the indices of two dataframes.
         """
         renames = {}
-        name2query = lambda name: name if type(name) != tuple else name[0] if len(name) == 1 or name[1] == "" else name[1]
+        name2query = lambda name: name if type(name) != tuple else name[0] if len(name) == 1 or name[-1] == "" else name[-1]
         similarities = np.array(np.ones([len(ind0) + 15, len(ind1) + 15]), dtype=np.float64)
         querys0 = [name2query(name) for name in ind0]
         querys1 = [name2query(name) for name in ind1]
@@ -434,7 +464,7 @@ def match_indices(ind0, ind1, threshold=0.9) -> dict:
             except:
                 p = 'not found'
 
-            _is_matching = fuzzy_compare_name(gt, p, compare_value=True) if col != "SMILES" else compare_molecule(gt, p)
+            _is_matching = fuzzy_compare_name(gt, p, compare_value=True) if col != "SMILES" else compare_molecule_strict(gt, p)
             if col == "SMILES":
                 smiles_match_score += float(_is_matching)
             if record:
@@ -558,6 +588,38 @@ def count_leaves(d, count=0):
         return 0
     ratio = total_diff_leaves / total_leaves_dict1
 
+    if total_diff_leaves == total_leaves_dict1 and len(list(dict_ref.keys())) == len(list(dict_prompt.keys())):
+        values1 = list(dict_ref.values())
+        values2 = list(dict_prompt.values())
+
+        # Initialize containers for differences
+        differences = []
+
+        # The maximum length to iterate over
+        max_length = max(len(values1), len(values2))
+
+        total_diff_leaves = 0
+
+        for i in range(max_length):
+            try:
+                value1 = values1[i]
+                value2 = values2[i]
+            except IndexError:
+                # Handle cases where the lists have different lengths
+                differences.append('Different number of elements.')
+                break
+
+            # If both values are dictionaries, use DeepDiff to compare them deeply
+            if isinstance(value1, dict) and isinstance(value2, dict):
+                diff = DeepDiff(value1, value2, ignore_order=True, report_repetition=True)
+                if diff:
+                    total_diff_leaves += sum(len(diff.get(key, {})) for key in diff_keys)
+                    differences.append(diff)
+            elif value1 != value2:
+                total_diff_leaves += 1
+                # For non-dictionary values, just compare them directly
+                differences.append({'different_values': (value1, value2)})
+
     return 1.0 - ratio, diff
 
 
@@ -863,6 +925,7 @@ def macro_f1_score_3(model, prediction: List[List[Any]], answers: List[List[Any]
     except:
         return 0.0
 
+
 def scrub_formatting_from_prompt(prompt):
     scrubbed_prompt = copy.copy(prompt)