Commit

DC for student, RAG ICL for tutor
Enea_Gore committed Jan 28, 2025
1 parent 63ea116 commit e93b942
Showing 9 changed files with 235 additions and 264 deletions.
27 changes: 15 additions & 12 deletions modules/text/module_text_llm/module_text_llm/__main__.py
@@ -13,7 +13,8 @@
from module_text_llm.approach_controller import generate_suggestions
from module_text_llm.helpers.detect_suspicios_submission import hybrid_suspicion_score, llm_check
from module_text_llm.helpers.feedback_icl.store_feedback_icl import store_feedback_icl
from module_text_llm.few_shot_chain_of_thought_approach import FewShotChainOfThoughtConfig
from module_text_llm.divide_and_conquer import DivideAndConquerConfig
from module_text_llm.icl_rag import ICLRAGConfig
#Test Demo
from module_text_llm.analytics.compile import compile

@@ -51,17 +52,19 @@ async def suggest_feedback(exercise: Exercise, submission: Submission, is_graded
logger.info("suggest_feedback: %s suggestions for submission %d of exercise %d were requested, with approach: %s and model: %s",
"Graded" if is_graded else "Non-graded", submission.id, exercise.id, module_config.approach.__class__.__name__, module_config.approach.model.model_name)

# if not is_graded:
# is_sus, score = hybrid_suspicion_score(submission.text, threshold=0.8)
# if is_sus:
# logger.info("Suspicious submission detected with score %f", score)
# is_suspicious,suspicios_text = await llm_check(submission.text)
# if is_suspicious:
# logger.info("Suspicious submission detected by LLM with text %s", suspicios_text)
# return [Feedback(title="Instructors need to review this submission", description="This Submission potentially violates the content policy!", credits=-1.0, exercise_id=exercise.id, submission_id=submission.id, is_graded=is_graded)]
# module_config.approach = FewShotChainOfThoughtConfig()
# return await generate_suggestions(exercise, submission, module_config.approach, module_config.debug, is_graded)
# module_config.approach = FewShotChainOfThoughtConfig()
# STUDENT
if not is_graded:
is_sus, score = hybrid_suspicion_score(submission.text, threshold=0.8)
if is_sus:
logger.info("Suspicious submission detected with score %f", score)
            is_suspicious, suspicious_text = await llm_check(submission.text)
            if is_suspicious:
                logger.info("Suspicious submission detected by LLM with text %s", suspicious_text)
                return [Feedback(title="Instructors need to review this submission", description="This submission potentially violates the content policy!", credits=-1.0, exercise_id=exercise.id, submission_id=submission.id, is_graded=is_graded)]
module_config.approach = FewShotChainOfThoughtConfig()
return await generate_suggestions(exercise, submission, module_config.approach, module_config.debug, is_graded)
# TUTOR
module_config.approach = ICLRAGConfig()
return await generate_suggestions(exercise, submission, module_config.approach, module_config.debug, is_graded)


3 changes: 2 additions & 1 deletion modules/text/module_text_llm/module_text_llm/config.py
@@ -10,8 +10,9 @@
from module_text_llm.basic_COT import BasicCOTApproachConfig
from module_text_llm.icl_rag import ICLRAGConfig
from module_text_llm.few_shot_COT import FewShotCOT
from module_text_llm.divide_and_conquer import DivideAndConquerConfig

ApproachConfigUnion = Union[FewShotCOT,ICLRAGConfig, BasicApproachConfig, FewShotChainOfThoughtConfig,BasicCOTApproachConfig, BestApproachConfig]
ApproachConfigUnion = Union[DivideAndConquerConfig,FewShotCOT,ICLRAGConfig, BasicApproachConfig, FewShotChainOfThoughtConfig,BasicCOTApproachConfig, BestApproachConfig]

@config_schema_provider
class Configuration(BaseModel):
@@ -0,0 +1,11 @@
from module_text_llm.approach_config import ApproachConfig
from typing import Literal
from athena.text import Exercise, Submission
from module_text_llm.divide_and_conquer.generate_suggestions import generate_suggestions

class DivideAndConquerConfig(ApproachConfig):
type: Literal['divide_and_conquer'] = 'divide_and_conquer'
# Prompts are generated at run time.
async def generate_suggestions(self, exercise: Exercise, submission: Submission, config,*, debug: bool, is_graded: bool):
return await generate_suggestions(exercise, submission, config, debug, is_graded)
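
A minimal dispatch sketch (illustrative, not part of this commit), assuming the same call pattern that suggest_feedback in __main__.py uses for the other approach configs:

# Hypothetical usage inside an async endpoint; exercise and submission are Athena Exercise/Submission objects.
config = DivideAndConquerConfig()
feedbacks = await config.generate_suggestions(exercise, submission, config, debug=False, is_graded=True)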

@@ -0,0 +1,114 @@
from athena.text import Exercise, Submission, Feedback
from athena.logger import logger
from llm_core.utils.llm_utils import get_chat_prompt_with_formatting_instructions
from llm_core.utils.predict_and_parse import predict_and_parse
from module_text_llm.divide_and_conquer.prompt_generate_suggestions import AssessmentModel, FeedbackModel, double_curly_braces, get_system_prompt, get_human_message
from module_text_llm.approach_config import ApproachConfig
from module_text_llm.helpers.utils import add_sentence_numbers, get_index_range_from_line_range
import asyncio

# Generates feedback suggestions by assessing the submission against each grading criterion in parallel.
async def generate_suggestions(exercise: Exercise, submission: Submission, config: ApproachConfig, debug: bool, is_graded: bool):
submission_text = double_curly_braces(submission.text)
model = config.model.get_model() # type: ignore[attr-defined]
prompt_input = {
"submission": add_sentence_numbers(submission_text)
}

grading_criteria = exercise.grading_criteria
feedbacks = []
grading_instruction_ids = set(
grading_instruction.id
for criterion in exercise.grading_criteria or []
for grading_instruction in criterion.structured_grading_instructions
)
tasks = []

for idx, criteria in enumerate(grading_criteria):
processing_inputs = {
"model": model,
"prompt_input": prompt_input,
"exercise": exercise,
"submission": submission,
"grading_instruction_ids": grading_instruction_ids,
"is_graded": is_graded,
"criteria_title": criteria.title
}
if ("plagiarism" in criteria.title.lower()): # Exclude plagarism because the model cannot know and it hallucinates
continue
        usage_count, system_prompt = get_system_prompt(idx, exercise, criteria)
        if usage_count == 1:
            chat_prompt = get_chat_prompt_with_formatting_instructions(model=model, system_message=system_prompt, human_message=get_human_message(), pydantic_object=FeedbackModel)
            processing_inputs["pydantic_object"] = FeedbackModel
        else:
            chat_prompt = get_chat_prompt_with_formatting_instructions(model=model, system_message=system_prompt, human_message=get_human_message(), pydantic_object=AssessmentModel)
            processing_inputs["pydantic_object"] = AssessmentModel
        processing_inputs["chat_prompt"] = chat_prompt
tasks.append(process_criteria(processing_inputs))

results = await asyncio.gather(*tasks)

# Flatten the list of feedbacks
for feedback_list in results:
feedbacks += feedback_list
return feedbacks

async def process_criteria(processing_inputs):

# Call the predict_and_parse method
result = await predict_and_parse(
model=processing_inputs["model"],
chat_prompt=processing_inputs["chat_prompt"],
prompt_input=processing_inputs["prompt_input"],
pydantic_object=processing_inputs["pydantic_object"],
tags=[
f"exercise-{processing_inputs['exercise'].id}",
f"submission-{processing_inputs['submission'].id}",
],
use_function_calling=True
)

if processing_inputs["pydantic_object"] is AssessmentModel:
try:
return parse_assessment_result(result, processing_inputs['exercise'], processing_inputs['submission'], processing_inputs["grading_instruction_ids"], processing_inputs["is_graded"])
except Exception as e:
logger.info("Failed to parse assessment result")
return []
else:
try:
return parse_feedback_result(result, processing_inputs['exercise'], processing_inputs['submission'], processing_inputs["grading_instruction_ids"], processing_inputs["is_graded"])
except Exception as e:
logger.info("Failed to parse feedback result")
return []

def parse_assessment_result(result, exercise, submission, grading_instruction_ids, is_graded):
result_feedbacks = []
for feedback in result.assessment:
result_feedbacks += parse_feedback_result(feedback, exercise, submission, grading_instruction_ids, is_graded)
return result_feedbacks

def parse_feedback_result(feedback, exercise, submission, grading_instruction_ids, is_graded):
result_feedbacks = []

index_start, index_end = get_index_range_from_line_range(
feedback.line_start, feedback.line_end, submission.text
)
assessment_instruction_id = (
feedback.assessment_instruction_id
if feedback.assessment_instruction_id in grading_instruction_ids
else None
)
result_feedbacks.append(Feedback(
exercise_id=exercise.id,
submission_id=submission.id,
title=feedback.criteria,
description=feedback.feedback,
index_start=index_start,
index_end=index_end,
credits=feedback.credits,
is_graded=is_graded,
structured_grading_instruction_id=assessment_instruction_id,
meta={}
))
return result_feedbacks
@@ -0,0 +1,93 @@
from pydantic import Field, BaseModel
from typing import List, Optional
from athena.schemas.grading_criterion import GradingCriterion

def get_human_message():
return """
Now you must assess the following student submission and respond in JSON. The student submission to assess (with sentence numbers <number>: <sentence>):
\"\"\"
{submission}
\"\"\"\
"""

def double_curly_braces(input_str):
    # Curly braces are used as placeholders in the prompt, so strip them out if they appear in the submission text
    return input_str.replace("{", " ").replace("}", " ")
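
# Illustrative example of the behaviour as implemented (not part of the diff):
#   double_curly_braces("Return {result} as JSON")  ->  "Return  result  as JSON"
# Despite the name, braces are replaced with spaces rather than doubled, which is enough
# to keep stray braces in a submission from being read as prompt placeholders.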

# Prompts are generated at run time.
def get_system_prompt(index, exercise, criteria: GradingCriterion):
    system_prompt = """You are an AI assistant TUTOR at a prestigious university, tasked with assessing a text submission from a student. The problem statement is:"""
    usage_count, formatted_criterion = format_divide_and_conquer_criteria(index, exercise, criteria)
    return usage_count, system_prompt + formatted_criterion

def format_divide_and_conquer_criteria(index, exercise, criteria: GradingCriterion):
criteria_explanation_prompt = ""
problem_statement = f"""
# Problem Statement
{double_curly_braces(exercise.problem_statement)}.
# End Problem Statement
A sample solution to the problem statement is:
# Example Solution
{double_curly_braces(exercise.example_solution)}
# End Example Solution
# General Instructions
You do not have access to the lecture materials, the exercise sheet, or any other materials, so do not make assumptions.
# End General Instructions"""

criteria_explanation_prompt += problem_statement
    # Handle criteria that can be applied arbitrarily often (denoted by a usage count of 0). Careful with this one.

criteria_explanation_prompt += f"""
You have to assess the submission based on the criterion with the title: "{criteria.title}". There are
{len(criteria.structured_grading_instructions)} structured assessment instruction options for this criterion.
"""
    usage_counts = [instruction.usage_count for instruction in criteria.structured_grading_instructions]
    use_same_usage_count = len(set(usage_counts)) == 1
    if use_same_usage_count:
criteria_explanation_prompt += f"""
{get_criteria_application(usage_counts)}.
The structured assessment instructions are as follows: \n"""
for idx,instruction in enumerate(criteria.structured_grading_instructions):
criteria_explanation_prompt += f"""
Instruction Number {idx+1}: Apply {instruction.credits} credits if the following description fits the student's submission: "{instruction.instruction_description}". A possible feedback could be along the lines of "{instruction.feedback}", but you may adjust it as you see fit; however, stay focused only on this criterion in your feedback. Apply assessment instruction id {instruction.id} to this segment of the submission. \n
"""
return usage_counts[0] ,criteria_explanation_prompt

def get_criteria_application(usage_counts):
    usage_count_prompt = ""
    if usage_counts[0] == 0:
        usage_count_prompt = "You may apply this criterion as many times as needed if it fits the submission."
    elif usage_counts[0] == 1:
        usage_count_prompt = "You may only apply this criterion ONCE. You must pick the instruction that best fits the submission. "
    else:
        usage_count_prompt = f"You may apply this criterion {usage_counts[0]} times. Each time, you must pick the instruction that best fits the submission."

    usage_count_prompt += """ For this criterion you have different levels of assessment to give, based on the structured assessment instructions."""
    usage_count_prompt += """For different segments of the submission you may apply a different assessment instruction that fits that segment and give it its respective deserved credits.
    Identify all segments of the submission that relate to this criterion and its instructions and apply the correct feedback as described by the instructions.
    Keep in mind that the student might spread their answers throughout the whole submission.
    """ if usage_counts[0] != 1 else "You may apply this criterion only once and choose only a SINGLE assessment instruction that best fits the submission!"
    return usage_count_prompt

# Output Object
# Names have been redefined here, to be consistent with the prompt
# Local LLMs do better with these names. GoatPT does not care and does everything!
class FeedbackModel(BaseModel):
""" A Feedback object consisting of the criteria title, the feedback text, a line_start and line_end to depict
a reference to the text, creidts to depcit the credit amount given and an assessment_instruction_id to depict the assessment instruction ID used"""
criteria: str = Field(description="Short Criteria title!")
feedback: str = Field(description="The feedback in text form.")
line_start: Optional[int] = Field(description="Referenced line number start, or empty if unreferenced")
line_end: Optional[int] = Field(description="Referenced line number end, or empty if unreferenced")
credits: float = Field(0.0, description="Number of credits received/deducted")
assessment_instruction_id: Optional[int] = Field(
description="ID of the assessment instruction that was used to generate this feedback, or empty if no assessment instruction was used"
)

class AssessmentModel(BaseModel):
"""Collection of feedbacks making up an assessment"""
assessment: List[FeedbackModel] = Field(description="Assessment feedbacks")
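
# Rough illustration (not part of the diff) of the object shape the model is asked to return;
# the values below are made up for demonstration only.
#
#   AssessmentModel(assessment=[
#       FeedbackModel(criteria="Structure", feedback="The introduction states the thesis clearly.",
#                     line_start=1, line_end=2, credits=1.0, assessment_instruction_id=101),
#   ])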

This file was deleted.

