From 681b925c95b55a341bd1e469945148b8f2288a2d Mon Sep 17 00:00:00 2001
From: stantonius
Date: Mon, 6 Jan 2025 17:19:23 -0500
Subject: [PATCH 1/2] Added ReAct function/tool default args

Addresses TOPIC 02 in react.py about handling default arguments in the Tool
class

Not super elegant, but seems effective in my testing
---
 dspy/predict/react.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/dspy/predict/react.py b/dspy/predict/react.py
index 2690d066af..53d5195a59 100644
--- a/dspy/predict/react.py
+++ b/dspy/predict/react.py
@@ -11,7 +11,15 @@


 class Tool:
-    def __init__(self, func: Callable, name: str = None, desc: str = None, args: dict[str, Any] = None):
+
+    def __init__(
+        self,
+        func: Callable,
+        name: str = None,
+        desc: str = None,
+        args: dict[str, Any] = None,
+        defaults: dict[str, Any] = None,
+    ):
         annotations_func = func if inspect.isfunction(func) or inspect.ismethod(func) else func.__call__
         self.func = func
         self.name = name or getattr(func, "__name__", type(func).__name__)
@@ -23,6 +31,7 @@ def __init__(self, func: Callable, name: str = None, desc: str = None, args: dic
             for k, v in (args or get_type_hints(annotations_func)).items()
             if k != "return"
         }
+        self.defaults = defaults

     @with_callbacks
     def __call__(self, *args, **kwargs):
@@ -63,6 +72,8 @@ def __init__(self, signature, tools: list[Callable], max_iters=5):
             args = tool.args if hasattr(tool, "args") else str({tool.input_variable: str})
             desc = (f", whose description is {tool.desc}." if tool.desc else ".").replace("\n", " ")
             desc += f" It takes arguments {args} in JSON format."
+            if tool.defaults:
+                desc += f" Default arguments are {tool.defaults}."
             instr.append(f"({idx+1}) {tool.name}{desc}")

         react_signature = (

From 06cdd437204ef30620d7328e548c235f26adcaed Mon Sep 17 00:00:00 2001
From: stantonius
Date: Tue, 7 Jan 2025 09:24:55 -0500
Subject: [PATCH 2/2] sync with remote main update

---
 .github/workflows/run_tests.yml            | 29 +++----
 docs/docs/tutorials/observability/index.md |  2 +-
 dspy/evaluate/evaluate.py                  | 94 ++++++++++++++++------
 dspy/predict/react.py                      | 19 ++++-
 4 files changed, 99 insertions(+), 45 deletions(-)

diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 2ffc80f95d..1aaf9236c8 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -41,6 +41,9 @@ jobs:
         python-version: ["3.9"]
     steps:
       - uses: actions/checkout@v4
+      - uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
       - name: Install Deno
         run: |
           curl -fsSL https://deno.land/install.sh | sh
@@ -59,13 +62,7 @@
           path: ~/.local
           key: poetry-${{ env.POETRY_VERSION }}-${{ hashFiles('**/poetry.lock') }}
       - name: Install Poetry
-        if: steps.cached-poetry.outputs.cache-hit != 'true'
-        uses: snok/install-poetry@v1
-      - name: Set up python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python-version }}
-          cache: "poetry"
+        run: python -m pip install --upgrade "poetry==${{ env.POETRY_VERSION }}"
       - name: Install dependencies
         run: poetry install --no-interaction
       - name: Run lint with tests
@@ -89,14 +86,11 @@
         with:
           path: ~/.local
           key: poetry-${{ env.POETRY_VERSION }}-${{ hashFiles('**/poetry.lock') }}
-      - name: Install Poetry
-        if: steps.cached-poetry.outputs.cache-hit != 'true'
-        uses: snok/install-poetry@v1
-      - name: Set up python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
+      - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
-          cache: "poetry"
+      - name: Install Poetry
+        run: python -m pip install --upgrade "poetry==${{ env.POETRY_VERSION }}"
       - name: Build
         run: poetry build
       - name: Install built package
@@ -118,13 +112,10 @@
         with:
           path: ~/.local
           key: poetry-${{ env.POETRY_VERSION }}-${{ hashFiles('**/poetry.lock') }}
-      - name: Install Poetry
-        if: steps.cached-poetry.outputs.cache-hit != 'true'
-        uses: snok/install-poetry@v1
-      - name: Set up python ${{ matrix.python-version }}
-        uses: actions/setup-python@v5
+      - uses: actions/setup-python@v5
         with:
           python-version: ${{ matrix.python-version }}
-          cache: "poetry"
+      - name: Install Poetry
+        run: python -m pip install --upgrade "poetry==${{ env.POETRY_VERSION }}"
       - name: Run setup.py build
         run: python setup.py build
diff --git a/docs/docs/tutorials/observability/index.md b/docs/docs/tutorials/observability/index.md
index 0b540f042f..e7568107fc 100644
--- a/docs/docs/tutorials/observability/index.md
+++ b/docs/docs/tutorials/observability/index.md
@@ -1,6 +1,6 @@
 # Tutorial: Debugging and Observability in DSPy

-This guide demonstrates how to debug problems and improve observability in DSPy. Modern AI programs often involve multiple components, such as language models, retrievers, and tools. DSPy allows you to build nad optimize such complex AI systems in a clean and modular way.
+This guide demonstrates how to debug problems and improve observability in DSPy. Modern AI programs often involve multiple components, such as language models, retrievers, and tools. DSPy allows you to build and optimize such complex AI systems in a clean and modular way.

 However, as systems grow more sophisticated, the ability to **understand what your system is doing** becomes critical. Without transparency, the prediction process can easily become a black box, making failures or quality issues difficult to diagnose and production maintenance challenging.

diff --git a/dspy/evaluate/evaluate.py b/dspy/evaluate/evaluate.py
index 9ef0bf7334..89be568937 100644
--- a/dspy/evaluate/evaluate.py
+++ b/dspy/evaluate/evaluate.py
@@ -1,6 +1,6 @@
 import logging
 import types
-from typing import Any
+from typing import Any, Callable, List, Optional

 import pandas as pd
 import tqdm
@@ -38,25 +38,40 @@ def HTML(x: str) -> str:

 logger = logging.getLogger(__name__)

-logger = logging.getLogger(__name__)
-
-
 class Evaluate:
+    """DSPy Evaluate class.
+
+    This class is used to evaluate the performance of a DSPy program. Users need to provide an evaluation dataset and
+    a metric function in order to use this class. This class supports parallel evaluation on the provided dataset.
+    """
     def __init__(
         self,
         *,
-        devset,
-        metric=None,
-        num_threads=1,
-        display_progress=False,
-        display_table=False,
-        max_errors=5,
-        return_all_scores=False,
-        return_outputs=False,
-        provide_traceback=False,
-        failure_score=0.0,
-        **_kwargs,
+        devset: List["dspy.Example"],
+        metric: Optional[Callable] = None,
+        num_threads: int = 1,
+        display_progress: bool = False,
+        display_table: bool = False,
+        max_errors: int = 5,
+        return_all_scores: bool = False,
+        return_outputs: bool = False,
+        provide_traceback: bool = False,
+        failure_score: float = 0.0,
+        **kwargs,
     ):
+        """
+        Args:
+            devset (List[dspy.Example]): The evaluation dataset.
+            metric (Callable): The metric function to use for evaluation.
+            num_threads (int): The number of threads to use for parallel evaluation.
+            display_progress (bool): Whether to display progress during evaluation.
+            display_table (bool): Whether to display the evaluation results in a table.
+            max_errors (int): The maximum number of errors to allow before stopping evaluation.
+            return_all_scores (bool): Whether to return scores for every data record in `devset`.
+            return_outputs (bool): Whether to return the dspy program's outputs for every example in `devset`.
+            provide_traceback (bool): Whether to provide traceback information during evaluation.
+            failure_score (float): The default score to use if evaluation fails due to an exception.
+        """
         self.devset = devset
         self.metric = metric
         self.num_threads = num_threads
@@ -70,15 +85,48 @@ def __init__(

     def __call__(
         self,
-        program,
-        metric=None,
-        devset=None,
-        num_threads=None,
-        display_progress=None,
-        display_table=None,
-        return_all_scores=None,
-        return_outputs=None,
+        program: "dspy.Module",
+        metric: Optional[Callable] = None,
+        devset: Optional[List["dspy.Example"]] = None,
+        num_threads: Optional[int] = None,
+        display_progress: Optional[bool] = None,
+        display_table: Optional[bool] = None,
+        return_all_scores: Optional[bool] = None,
+        return_outputs: Optional[bool] = None,
     ):
+        """
+        Args:
+            program (dspy.Module): The DSPy program to evaluate.
+            metric (Callable): The metric function to use for evaluation. If not provided, use `self.metric`.
+            devset (List[dspy.Example]): The evaluation dataset. If not provided, use `self.devset`.
+            num_threads (int): The number of threads to use for parallel evaluation. If not provided, use
+                `self.num_threads`.
+            display_progress (bool): Whether to display progress during evaluation. If not provided, use
+                `self.display_progress`.
+            display_table (bool): Whether to display the evaluation results in a table. If not provided, use
+                `self.display_table`.
+            return_all_scores (bool): Whether to return scores for every data record in `devset`. If not provided,
+                use `self.return_all_scores`.
+            return_outputs (bool): Whether to return the dspy program's outputs for every example in `devset`. If not
+                provided, use `self.return_outputs`.
+
+        Returns:
+            The evaluation results are returned in different formats based on the flags:
+
+            - Base return: A float percentage score (e.g., 67.30) representing overall performance
+
+            - With `return_all_scores=True`:
+              Returns (overall_score, individual_scores) where individual_scores is a list of
+              float scores for each example in devset
+
+            - With `return_outputs=True`:
+              Returns (overall_score, result_triples) where result_triples is a list of
+              (example, prediction, score) tuples for each example in devset
+
+            - With both flags=True:
+              Returns (overall_score, result_triples, individual_scores)
+
+        """
         metric = metric if metric is not None else self.metric
         devset = devset if devset is not None else self.devset
         num_threads = num_threads if num_threads is not None else self.num_threads
diff --git a/dspy/predict/react.py b/dspy/predict/react.py
index 53d5195a59..e116544d68 100644
--- a/dspy/predict/react.py
+++ b/dspy/predict/react.py
@@ -9,7 +9,6 @@
 from dspy.signatures.signature import ensure_signature
 from dspy.utils.callback import with_callbacks

-
 class Tool:

     def __init__(
@@ -19,6 +18,7 @@ def __init__(
         desc: str = None,
         args: dict[str, Any] = None,
         defaults: dict[str, Any] = None,
+        private_defaults: dict[str, Any] = None,
     ):
         annotations_func = func if inspect.isfunction(func) or inspect.ismethod(func) else func.__call__
         self.func = func
@@ -32,6 +32,7 @@ def __init__(
             if k != "return"
         }
         self.defaults = defaults
+        self.private_defaults = private_defaults

     @with_callbacks
     def __call__(self, *args, **kwargs):
@@ -74,6 +75,8 @@ def __init__(self, signature, tools: list[Callable], max_iters=5):
             desc += f" It takes arguments {args} in JSON format."
             if tool.defaults:
                 desc += f" Default arguments are {tool.defaults}."
+            if tool.private_defaults:
+                desc += f" Assume the following function arguments will be provided at function execution time: {tool.private_defaults.keys()}. Therefore do not propose these arguments in the `next_tool_args`."
             instr.append(f"({idx+1}) {tool.name}{desc}")

         react_signature = (
@@ -102,13 +105,25 @@ def format(trajectory: dict[str, Any], last_iteration: bool):

         for idx in range(self.max_iters):
             pred = self.react(**input_args, trajectory=format(trajectory, last_iteration=(idx == self.max_iters - 1)))

+            # extract private defaults from the tool and supply them to the next tool call
+            # do not assign the private defaults to the next_tool_args as this will be captured in the trajectory logs, which is not what we want
+            private_defaults = (
+                self.tools[pred.next_tool_name].private_defaults
+                if pred.next_tool_name in self.tools
+                and self.tools[pred.next_tool_name].private_defaults
+                else {}
+            )
+
             trajectory[f"thought_{idx}"] = pred.next_thought
             trajectory[f"tool_name_{idx}"] = pred.next_tool_name
             trajectory[f"tool_args_{idx}"] = pred.next_tool_args

             try:
-                trajectory[f"observation_{idx}"] = self.tools[pred.next_tool_name](**pred.next_tool_args)
+                trajectory[f"observation_{idx}"] = self.tools[pred.next_tool_name](
+                    **pred.next_tool_args, **private_defaults
+                )
             except Exception as e:
+                # risk that the error log will capture the private defaults?
                 trajectory[f"observation_{idx}"] = f"Failed to execute: {e}"

             if pred.next_tool_name == "finish":
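
Usage sketch: assuming the `Tool` and `ReAct` signatures shown in this diff (and that `ReAct` accepts pre-built `Tool` instances, as the modified `dspy/predict/react.py` suggests), a tool with visible `defaults` and call-time-only `private_defaults` might be wired up as below. The `get_weather` function, its arguments, and the placeholder API key are hypothetical and used only for illustration.

import dspy
from dspy.predict.react import Tool


def get_weather(city: str, units: str = "metric", api_key: str = "") -> str:
    """Hypothetical tool function: return a short weather report for `city`."""
    return f"It is 3 degrees ({units}) in {city}."


# `defaults` are appended to the tool description so the LM knows about them;
# `private_defaults` are never shown to the LM or written to the trajectory,
# and are only injected as keyword arguments when the tool is executed.
weather = Tool(
    get_weather,
    desc="Looks up the current weather for a city.",
    defaults={"units": "metric"},
    private_defaults={"api_key": "<secret-key>"},
)

react = dspy.ReAct("question -> answer", tools=[weather], max_iters=5)
# After configuring an LM, e.g. dspy.configure(lm=...), the module can be called:
# prediction = react(question="What is the weather in Berlin?")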