Multiple Selectable Approaches for Text (Chain of Thought with Prompt Chaining) #350

Merged: 9 commits, Nov 29, 2024
Changes from 5 commits
3 changes: 3 additions & 0 deletions llm_core/llm_core/models/__init__.py
@@ -6,6 +6,7 @@


DefaultModelConfig: Type[ModelConfig]
MiniModelConfig: ModelConfig
default_model_name = os.environ.get("LLM_DEFAULT_MODEL")
evaluation_model_name = os.environ.get("LLM_EVALUATION_MODEL")

@@ -18,6 +19,8 @@
types.append(openai_config.OpenAIModelConfig)
if default_model_name in openai_config.available_models:
DefaultModelConfig = openai_config.OpenAIModelConfig
if "openai_gpt-4o-mini" in openai_config.available_models:
MiniModelConfig = openai_config.OpenAIModelConfig(model_name="openai_gpt-4o-mini", max_tokens=3000, temperature=0, top_p=0.9, presence_penalty=0, frequency_penalty=0)
if evaluation_model_name in openai_config.available_models:
evaluation_model = openai_config.available_models[evaluation_model_name]
except AttributeError:
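
A minimal consumption sketch for the new MiniModelConfig export (not part of this diff): if "openai_gpt-4o-mini" is missing from available_models, the annotated module-level name is never assigned, so importing it raises ImportError at runtime. The fallback to DefaultModelConfig below is an assumption for illustration.

# Sketch only: defensive import of the optional mini-model configuration.
try:
    from llm_core.models import MiniModelConfig as cheap_model_config
except ImportError:
    # Assumed fallback: reuse the default model configuration instead.
    from llm_core.models import DefaultModelConfig
    cheap_model_config = DefaultModelConfig()
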
4 changes: 2 additions & 2 deletions llm_core/llm_core/utils/llm_utils.py
@@ -1,7 +1,7 @@
from typing import Type, TypeVar, List
from pydantic import BaseModel
import tiktoken
from langchain.chat_models import ChatOpenAI
from langchain_openai import AzureChatOpenAI, ChatOpenAI
from langchain.base_language import BaseLanguageModel
from langchain.prompts import (
ChatPromptTemplate,
@@ -75,7 +75,7 @@ def supports_function_calling(model: BaseLanguageModel):
Returns:
boolean: True if the model supports function calling, False otherwise
"""
return isinstance(model, ChatOpenAI)
return isinstance(model, ChatOpenAI) or isinstance(model, AzureChatOpenAI)


def get_chat_prompt_with_formatting_instructions(
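
A quick sketch of what the widened check buys, assuming the usual OpenAI and Azure credentials are supplied via environment variables; the model and deployment names are placeholders, not values from this PR.

# Sketch: supports_function_calling now also accepts Azure-hosted chat models.
from langchain_openai import AzureChatOpenAI, ChatOpenAI
from llm_core.utils.llm_utils import supports_function_calling

openai_model = ChatOpenAI(model="gpt-4o-mini")  # placeholder model name
azure_model = AzureChatOpenAI(azure_deployment="my-gpt-4o", api_version="2024-02-01")  # placeholder deployment

assert supports_function_calling(openai_model)
assert supports_function_calling(azure_model)  # returned False before this change
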
13 changes: 7 additions & 6 deletions llm_core/llm_core/utils/predict_and_parse.py
@@ -36,13 +36,14 @@ async def predict_and_parse(
if experiment.run_id is not None:
tags.append(f"run-{experiment.run_id}")

structured_output_llm = model.with_structured_output(pydantic_object, method="json_mode")
chain = RunnableSequence(
chat_prompt,
structured_output_llm
)
structured_output_llm = model.with_structured_output(pydantic_object)
chain = chat_prompt | structured_output_llm

try:
return await chain.ainvoke(prompt_input, config={"tags": tags})
return await chain.ainvoke(prompt_input, config={"tags": tags})  # type: ignore
except ValidationError as e:
raise ValueError(f"Could not parse output: {e}") from e
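
The pipe operator builds the same runnable that RunnableSequence(chat_prompt, structured_output_llm) did, and dropping method="json_mode" means with_structured_output falls back to its default (tool/function-calling) mode on OpenAI chat models. A standalone sketch of the resulting chain shape, with a hypothetical Verdict schema and a placeholder model name:

# Sketch: prompt | model.with_structured_output(...) returns parsed Pydantic objects.
from pydantic import BaseModel, Field
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

class Verdict(BaseModel):
    credits: float = Field(description="Points awarded")
    comment: str = Field(description="One-sentence justification")

prompt = ChatPromptTemplate.from_messages([
    ("system", "You grade short answers."),
    ("human", "{submission}"),
])
chain = prompt | ChatOpenAI(model="gpt-4o-mini").with_structured_output(Verdict)
# parsed = await chain.ainvoke({"submission": "..."}, config={"tags": ["run-1"]})
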
6 changes: 1 addition & 5 deletions modules/text/module_text_llm/module_text_llm/__main__.py
@@ -1,19 +1,16 @@
import json
import os
from typing import List, Any

import nltk
import tiktoken

from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider, evaluation_provider
from athena.text import Exercise, Submission, Feedback
from athena.logger import logger

from module_text_llm.config import Configuration
from module_text_llm.evaluation import get_feedback_statistics, get_llm_statistics
from module_text_llm.generate_suggestions import generate_suggestions
from module_text_llm.generate_evaluation import generate_evaluation

from module_text_llm.approaches.approach_controller import generate_suggestions

@submissions_consumer
def receive_submissions(exercise: Exercise, submissions: List[Submission]):
@@ -30,7 +27,6 @@ def select_submission(exercise: Exercise, submissions: List[Submission]) -> Subm
def process_incoming_feedback(exercise: Exercise, submission: Submission, feedbacks: List[Feedback]):
logger.info("process_feedback: Received %d feedbacks for submission %d of exercise %d.", len(feedbacks), submission.id, exercise.id)


@feedback_provider
async def suggest_feedback(exercise: Exercise, submission: Submission, is_graded: bool, module_config: Configuration) -> List[Feedback]:
logger.info("suggest_feedback: %s suggestions for submission %d of exercise %d were requested",
@@ -0,0 +1,16 @@
from abc import ABC
from pydantic import BaseModel, Field
from llm_core.models import ModelConfigType, DefaultModelConfig
from enum import Enum

class ApproachType(str, Enum):
basic = "BasicApproach"
chain_of_thought = "ChainOfThought"

class ApproachConfig(BaseModel, ABC):
max_input_tokens: int = Field(default=3000, description="Maximum number of tokens in the input prompt.")
model: ModelConfigType = Field(default=DefaultModelConfig()) # type: ignore
type: ApproachType = Field(..., description="The type of approach config")

class Config:
use_enum_values = True
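
The type field, narrowed to a Literal in each concrete approach below, makes the configs usable as a tagged union. A hypothetical sketch of how a module-level configuration could let the caller pick an approach; the real Configuration class in module_text_llm.config is not shown in this diff, and discriminated unions assume a Pydantic version that supports Field(discriminator=...).

# Sketch only: selecting an approach via a discriminated union on `type`.
from typing import Union
from pydantic import BaseModel, Field
from module_text_llm.approaches.basic_approach.config import BasicApproachConfig
from module_text_llm.approaches.chain_of_thought_approach.config import ChainOfThoughtConfig

class ExampleConfiguration(BaseModel):
    debug: bool = Field(default=False, description="Emit debug metadata")
    approach: Union[BasicApproachConfig, ChainOfThoughtConfig] = Field(
        default_factory=BasicApproachConfig,
        discriminator="type",  # dispatches on the Literal `type` of each config
    )
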
@@ -0,0 +1,18 @@

from typing import List
from athena.text import Exercise, Submission, Feedback
from module_text_llm.approaches.basic_approach.config import BasicApproachConfig
from module_text_llm.approaches.chain_of_thought_approach.config import ChainOfThoughtConfig
from module_text_llm.approaches.approach_config import ApproachConfig
from athena.logger import logger


from module_text_llm.approaches.basic_approach.generate_suggestions import generate_suggestions as generate_suggestions_basic
from module_text_llm.approaches.chain_of_thought_approach.generate_suggestions import generate_suggestions as generate_cot_suggestions

async def generate_suggestions(exercise: Exercise, submission: Submission, config: ApproachConfig, debug: bool) -> List[Feedback]:
if isinstance(config, BasicApproachConfig):
return await generate_suggestions_basic(exercise, submission, config, debug)
elif isinstance(config, ChainOfThoughtConfig):
return await generate_cot_suggestions(exercise, submission, config, debug)
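
A sketch of how the feedback provider in __main__.py presumably hands off to this dispatcher; config.approach and config.debug are assumptions about the Configuration class, which is not part of this diff.

# Sketch only: assumed wiring from the @feedback_provider endpoint.
from typing import List
from athena.text import Exercise, Submission, Feedback
from module_text_llm.approaches.approach_controller import generate_suggestions

async def suggest_feedback_sketch(exercise: Exercise, submission: Submission, config) -> List[Feedback]:
    # Assumed attributes: `approach` selects Basic vs Chain-of-Thought,
    # `debug` toggles emit_meta output in the approach implementations.
    return await generate_suggestions(exercise, submission, config.approach, debug=config.debug)
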

@@ -0,0 +1,26 @@
from module_text_llm.approaches.approach_config import ApproachConfig
from pydantic import Field, BaseModel
from typing import Literal


from module_text_llm.approaches.basic_approach.prompts.generate_suggestions import (
system_message as generate_suggestions_system_message,
human_message as generate_suggestions_human_message
)

class GenerateSuggestionsPrompt(BaseModel):
"""\
Features available: **{problem_statement}**, **{example_solution}**, **{grading_instructions}**, **{max_points}**, **{bonus_points}**, **{submission}**

_Note: **{problem_statement}**, **{example_solution}**, or **{grading_instructions}** might be omitted if the input is too long._\
"""
system_message: str = Field(default=generate_suggestions_system_message,
description="Message for priming AI behavior and instructing it what to do.")
human_message: str = Field(default=generate_suggestions_human_message,
description="Message from a human. The input on which the AI is supposed to act.")


class BasicApproachConfig(ApproachConfig):
type: Literal['basic'] = 'basic'
generate_suggestions_prompt: GenerateSuggestionsPrompt = Field(default=GenerateSuggestionsPrompt())

@@ -4,15 +4,13 @@
from athena import emit_meta
from athena.text import Exercise, Submission, Feedback
from athena.logger import logger

from module_text_llm.config import BasicApproachConfig
from llm_core.utils.llm_utils import (
get_chat_prompt_with_formatting_instructions,
check_prompt_length_and_omit_features_if_necessary,
num_tokens_from_prompt,
)
from llm_core.utils.predict_and_parse import predict_and_parse

from module_text_llm.config import BasicApproachConfig
from module_text_llm.helpers.utils import add_sentence_numbers, get_index_range_from_line_range, format_grading_instructions

class FeedbackModel(BaseModel):
@@ -25,18 +23,11 @@ class FeedbackModel(BaseModel):
description="ID of the grading instruction that was used to generate this feedback, or empty if no grading instruction was used"
)

class Config:
title = "Feedback"


class AssessmentModel(BaseModel):
"""Collection of feedbacks making up an assessment"""

feedbacks: Sequence[FeedbackModel] = Field(description="Assessment feedbacks")

class Config:
title = "Assessment"

feedbacks: List[FeedbackModel] = Field(description="Assessment feedbacks")

async def generate_suggestions(exercise: Exercise, submission: Submission, config: BasicApproachConfig, debug: bool) -> List[Feedback]:
model = config.model.get_model() # type: ignore[attr-defined]
@@ -17,10 +17,14 @@
# Grading instructions
{grading_instructions}
Max points: {max_points}, bonus points: {bonus_points}\

Respond in JSON format.
"""

human_message = """\
Student\'s submission to grade (with sentence numbers <number>: <sentence>):
Respond in JSON format.

\"\"\"
{submission}
\"\"\"\
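
For reference, a minimal sketch of how the placeholders in these messages are filled in; the values are made up, and it assumes the full messages use exactly the placeholders documented in the approach config. The real code goes through get_chat_prompt_with_formatting_instructions, which also appends format instructions.

# Sketch: formatting the basic-approach prompt with example values.
from langchain.prompts import ChatPromptTemplate
from module_text_llm.approaches.basic_approach.prompts.generate_suggestions import (
    system_message,
    human_message,
)

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_message),
    ("human", human_message),
])
messages = chat_prompt.format_messages(
    problem_statement="Explain the observer pattern.",
    example_solution="The observer pattern defines a one-to-many dependency ...",
    grading_instructions="1.0 points: names the correct pattern",
    max_points=10,
    bonus_points=0,
    submission="1: First sentence. 2: Second sentence.",
)
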
@@ -0,0 +1,36 @@
from pydantic import BaseModel, Field
from typing import Literal
from llm_core.models import ModelConfigType, MiniModelConfig

from module_text_llm.approaches.chain_of_thought_approach.prompts.cot_suggestions import (
system_message as generate_cot_suggestions_system_message,
human_message as generate_cot_suggestions_human_message
)

from module_text_llm.approaches.chain_of_thought_approach.prompts.refined_cot_suggestions import (
system_message as generate_refined_cot_suggestions_system_message,
human_message as generate_refined_cot_suggestions_human_message
)

from module_text_llm.approaches.approach_config import ApproachConfig

class CoTGenerateSuggestionsPrompt(BaseModel):
"""\
Features available: **{problem_statement}**, **{example_solution}**, **{grading_instructions}**, **{max_points}**, **{bonus_points}**, **{submission}**

_Note: **{problem_statement}**, **{example_solution}**, or **{grading_instructions}** might be omitted if the input is too long._\
"""
system_message: str = Field(default=generate_cot_suggestions_system_message,
description="Message for priming AI behavior and instructing it what to do.")
human_message: str = Field(default=generate_cot_suggestions_human_message,
description="Message from a human. The input on which the AI is supposed to act.")
second_system_message: str = Field(default=generate_refined_cot_suggestions_system_message,
description="Message for priming AI behavior and instructing it what to do.")
answer_message: str = Field(default=generate_refined_cot_suggestions_human_message,
description="Message from a human. The input on which the AI is supposed to act.")

class ChainOfThoughtConfig(ApproachConfig):
# Defaults to the cheaper mini 4o model
type: Literal['chain_of_thought'] = 'chain_of_thought'
model: ModelConfigType = Field(default=MiniModelConfig) # type: ignore
generate_suggestions_prompt: CoTGenerateSuggestionsPrompt = Field(default=CoTGenerateSuggestionsPrompt())
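
A small usage sketch: the chain-of-thought approach defaults to the cheaper mini model but still accepts any model config; overriding with DefaultModelConfig below is an illustrative assumption, not something this PR does.

# Sketch only: default vs. overridden model for the CoT approach.
from llm_core.models import DefaultModelConfig
from module_text_llm.approaches.chain_of_thought_approach.config import ChainOfThoughtConfig

cot = ChainOfThoughtConfig()                               # uses MiniModelConfig (gpt-4o-mini)
custom = ChainOfThoughtConfig(model=DefaultModelConfig())  # opt back into the default model
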
@@ -0,0 +1,152 @@
from typing import List, Optional
from pydantic import BaseModel, Field

from athena import emit_meta
from athena.text import Exercise, Submission, Feedback
from athena.logger import logger

from module_text_llm.approaches.chain_of_thought_approach.config import ChainOfThoughtConfig

from llm_core.utils.llm_utils import (
get_chat_prompt_with_formatting_instructions,
check_prompt_length_and_omit_features_if_necessary,
num_tokens_from_prompt,
)
from llm_core.utils.predict_and_parse import predict_and_parse

from module_text_llm.helpers.utils import add_sentence_numbers, get_index_range_from_line_range, format_grading_instructions

class FeedbackModel(BaseModel):
title: str = Field(description="Very short title, i.e. feedback category or similar", example="Logic Error")
description: str = Field(description="Feedback description")
line_start: Optional[int] = Field(description="Referenced line number start, or empty if unreferenced")
line_end: Optional[int] = Field(description="Referenced line number end, or empty if unreferenced")
credits: float = Field(0.0, description="Number of points received/deducted")
grading_instruction_id: Optional[int] = Field(
description="ID of the grading instruction that was used to generate this feedback, or empty if no grading instruction was used"
)

class AssessmentModel(BaseModel):
"""Collection of feedbacks making up an assessment"""

feedbacks: List[FeedbackModel] = Field(description="Assessment feedbacks")

class InitialAssessment(BaseModel):
title: str = Field(description="Very short title, i.e. feedback category or similar", example="Logic Error")
description: str = Field(description="Feedback description")
line_start: Optional[int] = Field(description="Referenced line number start, or empty if unreferenced")
line_end: Optional[int] = Field(description="Referenced line number end, or empty if unreferenced")
credits: float = Field(0.0, description="Number of points received/deducted")
reasoning: str = Field(description="Reasoning why the feedback was given")
improvement_suggestion: str = Field(description="Suggestion for improvement for the student")

class InitialAssessmentModel(BaseModel):
"""Collection of feedbacks making up an assessment"""

feedbacks: List[InitialAssessment] = Field(description="Assessment feedbacks")

async def generate_suggestions(exercise: Exercise, submission: Submission, config: ChainOfThoughtConfig, debug: bool) -> List[Feedback]:
model = config.model.get_model() # type: ignore[attr-defined]

prompt_input = {
"max_points": exercise.max_points,
"bonus_points": exercise.bonus_points,
"grading_instructions": format_grading_instructions(exercise.grading_instructions, exercise.grading_criteria),
"problem_statement": exercise.problem_statement or "No problem statement.",
"example_solution": exercise.example_solution,
"submission": add_sentence_numbers(submission.text)
}

chat_prompt = get_chat_prompt_with_formatting_instructions(
model=model,
system_message=config.generate_suggestions_prompt.system_message,
human_message=config.generate_suggestions_prompt.human_message,
pydantic_object=InitialAssessmentModel
)



# Check if the prompt is too long and omit features if necessary (in order of importance)
omittable_features = ["example_solution", "problem_statement", "grading_instructions"]
prompt_input, should_run = check_prompt_length_and_omit_features_if_necessary(
prompt=chat_prompt,
prompt_input=prompt_input,
max_input_tokens=config.max_input_tokens,
omittable_features=omittable_features,
debug=debug
)

# Skip if the prompt is too long
if not should_run:
logger.warning("Input too long. Skipping.")
if debug:
emit_meta("prompt", chat_prompt.format(**prompt_input))
emit_meta("error", f"Input too long {num_tokens_from_prompt(chat_prompt, prompt_input)} > {config.max_input_tokens}")
return []

initial_result = await predict_and_parse(
model=model,
chat_prompt=chat_prompt,
prompt_input=prompt_input,
pydantic_object=InitialAssessmentModel,
tags=[
f"exercise-{exercise.id}",
f"submission-{submission.id}",
]
)

second_prompt_input = {
"answer": initial_result.dict(),
"submission": add_sentence_numbers(submission.text)
}

second_chat_prompt = get_chat_prompt_with_formatting_instructions(
model=model,
system_message=config.generate_suggestions_prompt.second_system_message,
human_message=config.generate_suggestions_prompt.answer_message,
pydantic_object=AssessmentModel)

result = await predict_and_parse(
model=model,
chat_prompt=second_chat_prompt,
prompt_input=second_prompt_input,
pydantic_object=AssessmentModel,
tags=[
f"exercise-{exercise.id}",
f"submission-{submission.id}",
]
)

if debug:
emit_meta("generate_suggestions", {
"prompt": chat_prompt.format(**prompt_input),
"result": result.dict() if result is not None else None
})


if result is None:
return []

grading_instruction_ids = set(
grading_instruction.id
for criterion in exercise.grading_criteria or []
for grading_instruction in criterion.structured_grading_instructions
)

feedbacks = []
for feedback in result.feedbacks:
index_start, index_end = get_index_range_from_line_range(feedback.line_start, feedback.line_end, submission.text)
grading_instruction_id = feedback.grading_instruction_id if feedback.grading_instruction_id in grading_instruction_ids else None
feedbacks.append(Feedback(
exercise_id=exercise.id,
submission_id=submission.id,
title=feedback.title,
description=feedback.description,
index_start=index_start,
index_end=index_end,
credits=feedback.credits,
structured_grading_instruction_id=grading_instruction_id,
meta={}
))

return feedbacks
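
Stripped of the length checks, debug metadata, and feedback mapping, the prompt chaining above boils down to two structured predictions where the first result is serialized into the second prompt. A minimal sketch reusing the models defined in this file; the prompt strings are hypothetical placeholders, not the PR's prompts.

# Sketch only: the two-step chain (draft with reasoning, then refine).
from llm_core.utils.llm_utils import get_chat_prompt_with_formatting_instructions
from llm_core.utils.predict_and_parse import predict_and_parse

async def prompt_chaining_sketch(model, submission_text: str) -> AssessmentModel:
    # Step 1: draft feedback with explicit reasoning and improvement suggestions.
    draft_prompt = get_chat_prompt_with_formatting_instructions(
        model=model,
        system_message="Think step by step and draft feedback with reasoning.",
        human_message="{submission}",
        pydantic_object=InitialAssessmentModel,
    )
    draft = await predict_and_parse(
        model=model,
        chat_prompt=draft_prompt,
        prompt_input={"submission": submission_text},
        pydantic_object=InitialAssessmentModel,
        tags=["sketch"],
    )
    # Step 2: feed the serialized draft back in and refine it into the final schema.
    refine_prompt = get_chat_prompt_with_formatting_instructions(
        model=model,
        system_message="Refine the draft feedback into a final assessment.",
        human_message="{answer}",
        pydantic_object=AssessmentModel,
    )
    return await predict_and_parse(
        model=model,
        chat_prompt=refine_prompt,
        prompt_input={"answer": draft.dict() if draft else {}},
        pydantic_object=AssessmentModel,
        tags=["sketch"],
    )
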