diff --git a/athena/athena/__init__.py b/athena/athena/__init__.py
index f7eee0079..104e5a862 100644
--- a/athena/athena/__init__.py
+++ b/athena/athena/__init__.py
@@ -7,7 +7,7 @@
from .schemas import ExerciseType, GradingCriterion, StructuredGradingInstruction, StructuredGradingCriterion
from .metadata import emit_meta, get_meta
from .experiment import get_experiment_environment
-from .endpoints import submission_selector, submissions_consumer, feedback_consumer, feedback_provider, config_schema_provider, evaluation_provider # type: ignore
+from .endpoints import submission_selector, submissions_consumer, generate_statistics, feedback_consumer, feedback_provider, config_schema_provider, evaluation_provider # type: ignore
@app.get("/")
def module_health():
@@ -37,5 +37,6 @@ def run_module():
"ExerciseType",
"GradingCriterion",
"StructuredGradingInstruction",
- "StructuredGradingCriterion"
+ "StructuredGradingCriterion",
+ "generate_statistics"
]
diff --git a/athena/athena/endpoints.py b/athena/athena/endpoints.py
index 6d259a2a2..ec02dc83e 100644
--- a/athena/athena/endpoints.py
+++ b/athena/athena/endpoints.py
@@ -1,9 +1,10 @@
# type: ignore # too much weird behavior of mypy with decorators
import inspect
-from fastapi import Depends, BackgroundTasks, Body
+from fastapi import Depends, BackgroundTasks, Body, Request
from pydantic import BaseModel, ValidationError
from typing import TypeVar, Callable, List, Union, Any, Coroutine, Type
+from fastapi.responses import HTMLResponse
from athena.app import app
from athena.authenticate import authenticated
from athena.metadata import with_meta
@@ -196,6 +197,16 @@ async def wrapper(request: SubmissionSelectorRequest):
return wrapper
+def generate_statistics(func):
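+    """Decorator that registers ``func`` as the handler for ``POST /generate_statistics``.
+
+    The request body is parsed as JSON and passed to ``func``; the value returned by
+    ``func`` is sent back to the caller as an HTML page.
+    """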
+ @app.post("/generate_statistics", response_class=HTMLResponse)
+ async def wrapper(request: Request):
+ try:
+ results = await request.json()
+ return await func(results)
+ except Exception as e:
+ return {"error": str(e)}
+
+ return wrapper
def feedback_consumer(func: Union[
Callable[[E, S, List[F]], None],
@@ -234,7 +245,6 @@ def feedback_consumer(func: Union[
submission_type = inspect.signature(func).parameters["submission"].annotation
feedback_type = inspect.signature(func).parameters["feedbacks"].annotation.__args__[0]
module_config_type = inspect.signature(func).parameters["module_config"].annotation if "module_config" in inspect.signature(func).parameters else None
-
@app.post("/feedbacks", responses=module_responses)
@authenticated
@with_meta
diff --git a/llm_core/llm_core/models/openai.py b/llm_core/llm_core/models/openai.py
index 7bcc0f11f..ad33d4374 100644
--- a/llm_core/llm_core/models/openai.py
+++ b/llm_core/llm_core/models/openai.py
@@ -81,7 +81,7 @@ class OpenAIModelConfig(ModelConfig):
We generally recommend altering this or `top_p` but not both.\
""")
- top_p: float = Field(default=1, ge=0, le=1, description="""\
+ top_p: float = Field(default=0, ge=0, le=1, description="""\
An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. \
So 0.1 means only the tokens comprising the top 10% probability mass are considered.
diff --git a/modules/text/module_text_llm/module_text_llm/__main__.py b/modules/text/module_text_llm/module_text_llm/__main__.py
index 0bfc6e41d..6380e356f 100644
--- a/modules/text/module_text_llm/module_text_llm/__main__.py
+++ b/modules/text/module_text_llm/module_text_llm/__main__.py
@@ -3,14 +3,14 @@
import nltk
import tiktoken
-from athena import app, submission_selector, submissions_consumer, feedback_consumer, feedback_provider, evaluation_provider
+from athena import app, submission_selector, submissions_consumer, generate_statistics, feedback_consumer, feedback_provider, evaluation_provider
from athena.text import Exercise, Submission, Feedback
from athena.logger import logger
-
from module_text_llm.config import Configuration
from module_text_llm.evaluation import get_feedback_statistics, get_llm_statistics
from module_text_llm.generate_evaluation import generate_evaluation
from module_text_llm.approach_controller import generate_suggestions
+from module_text_llm.analytics.compile import compile
@submissions_consumer
def receive_submissions(exercise: Exercise, submissions: List[Submission]):
@@ -27,6 +27,11 @@ def select_submission(exercise: Exercise, submissions: List[Submission]) -> Subm
def process_incoming_feedback(exercise: Exercise, submission: Submission, feedbacks: List[Feedback]):
logger.info("process_feedback: Received %d feedbacks for submission %d of exercise %d.", len(feedbacks), submission.id, exercise.id)
+@generate_statistics
+async def compile_analytics(results: dict):
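+    """Compile the received results into an interactive HTML analytics report."""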
+ logger.info("generate_statistics: Generating statistics")
+ return compile(results)
+
@feedback_provider
async def suggest_feedback(exercise: Exercise, submission: Submission, is_graded: bool, module_config: Configuration) -> List[Feedback]:
logger.info("suggest_feedback: %s suggestions for submission %d of exercise %d were requested, with approach: %s",
diff --git a/modules/text/module_text_llm/module_text_llm/analytics/analytics.py b/modules/text/module_text_llm/module_text_llm/analytics/analytics.py
new file mode 100644
index 000000000..178d89efe
--- /dev/null
+++ b/modules/text/module_text_llm/module_text_llm/analytics/analytics.py
@@ -0,0 +1,350 @@
+import plotly.express as px
+import plotly.graph_objects as go
+import numpy as np
+from collections import Counter
+
+def failure_success(credits_per_submission,failures,submission_ids):
+    failures_per_model = {}
+    list_of_models = failures.keys()
+    total_runs = len(submission_ids)
+    for submission_id, approaches in credits_per_submission.items():
+        for model in list_of_models:
+            if model not in failures_per_model:
+                failures_per_model[model] = 0
+            if model not in approaches:
+                failures_per_model[model] += 1
+
+    successes_per_model = {model: total_runs - failures for model, failures in failures_per_model.items()}
+
+    models = list(failures_per_model.keys())
+    failures = list(failures_per_model.values())
+    successes = list(successes_per_model.values())
+
+    fig = go.Figure()
+
+    fig.add_trace(go.Bar(
+        x=models,
+        y=failures,
+        name='Failures',
+        marker_color='red',
+        hovertemplate='%{y} failures
+def visualize_histogram_kde_percentages(credit_data, max_points):
+    html_explanation = """
+    <p>
+    A histogram of the total credits awarded per submission for each approach, capped at the maximum number of points.
+    </p>
+    """
+    x = []
+    group_labels = []
+    approach_credits = {}
+    for submission_id, approaches in credit_data.items():
+        for approach, credits in approaches.items():
+            if approach not in approach_credits:
+                approach_credits[approach] = []
+            if (sum(credits) > max_points):
+                approach_credits[approach].append(max_points)
+            else:
+                approach_credits[approach].append(sum(credits)) # /max_points*100
+    for approach, credits in approach_credits.items():
+        x.append(credits)
+        group_labels.append(approach)
+
+    fig = go.Figure()
+    for approach, credits in approach_credits.items():
+        fig.add_trace(go.Histogram(x=credits, name=approach, xbins={"size": 0.5}))
+    fig.update_layout(
+        title='Histogram of Total Credits Given',
+        xaxis_title='Total Credits',
+        yaxis_title='Count')
+    fig.update_traces(opacity=0.7)
+    return {"fig": fig, "html_explanation": html_explanation}
+
+def visualize_differences_histogram(credit_data, max_points):
+    html_explanation = """
+    <p>
+    This graph represents the distribution of score differences between the LLM and the tutor. Negative values indicate
+    that the LLM has scored the submission lower than the tutor, while positive values suggest the opposite.
+    </p>
+    <p>
+    The chart provides insights into the consistency and bias of the LLM's grading compared to the tutor. Viewers
+    should look for patterns such as a strong concentration of values near zero, which would indicate agreement, or
+    significant skew towards negative or positive values, highlighting systematic under- or over-grading by the LLM.
+    </p>
+    <p>
+    This visualization can help identify discrepancies and areas where the LLM may need calibration or adjustment
+    to align more closely with tutor assessments.
+    </p>
+    """
+    differences_data = differences(credit_data)
+
+    fig = go.Figure()
+    for approach, credits in differences_data.items():
+        fig.add_trace(go.Histogram(x=credits, name=approach, xbins={"size": 0.5}))
+    fig.update_layout(
+        title='Histogram of differences',
+        xaxis_title='Difference LLM - Tutor',
+        yaxis_title='Count')
+    fig.update_traces(opacity=0.8)
+    return {"fig": fig, "html_explanation": html_explanation}
+
+def normalized_absolute_difference(credits, max_points):
+    """Plots the normalized absolute difference between the LLM and the other approaches in a sorted bar plot.
+
+    Args:
+        credits (dict): A dictionary with approaches and their score differences.
+        max_points (float): Maximum possible credits for normalization.
+    """
+    differences_data = differences(credits)
+    normalized_differences = {
+        approach: sum(abs(d) for d in diff_list) / len(diff_list) / max_points
+        for approach, diff_list in differences_data.items()
+    }
+
+    sorted_differences = dict(sorted(normalized_differences.items(), key=lambda x: x[1], reverse=True))
+
+    fig = go.Figure()
+    fig.add_trace(
+        go.Bar(
+            x=list(sorted_differences.keys()),
+            y=list(sorted_differences.values()),
+            marker_color='cornflowerblue'
+        )
+    )
+
+    fig.update_layout(
+        title='Normalized Absolute Differences Between LLM and Tutor Score',
+        xaxis_title='Approaches',
+        yaxis_title='Normalized Absolute Difference',
+        xaxis={"categoryorder": 'total descending'},
+        yaxis={"range": [0, 1]},
+        template='plotly_white'
+    )
+    html_explanation = """
+    <p>
+    This bar plot visualizes the normalized absolute differences in scores between the LLM and other approaches.
+    Each bar represents an approach, sorted from the highest to the lowest difference, and normalized by dividing the average
+    absolute difference by the maximum possible score.
+    </p>
+    <p>Note: this plot is not an accurate representation of alignment with tutor feedback.</p>
+    <p>Note: refer to the next plot for a better representation of alignment.</p>
+    """
+    return {"fig": fig, "html_explanation": html_explanation}
+
+def differences(credits):
+    """Calculates the literal differences between the tutor and the other approaches.
+    Removes the submission id but keeps the credit differences in order, so that
+    values at index 0 belong to the same submission, and so on.
+    The calculation is LLM - Tutor, so a negative value means that the LLM has awarded fewer credits.
+    The resulting form is:
+    {approach: [differences], ...}
+    """
+    differences_data = {}
+    for submission_id, approaches in credits.items():
+        for approach, credit_list in approaches.items():
+            if approach != "Tutor":
+                if approach not in differences_data:
+                    differences_data[approach] = []
+                differences_data[approach].append(sum(credit_list) - sum(approaches["Tutor"]))
+    return differences_data
+
+def getAbsoluteDifferences(differences):
+    abs_diff = {}
+    for approach, diff_list in differences.items():
+        abs_diff[approach] = np.abs(diff_list)
+    return abs_diff
+
+def analyze_grading_instruction_usage(grading_instructions_used):
+    """
+    Analyze grading instruction usage for each approach and plot matching vs. non-matching counts.
+
+    Parameters:
+    - grading_instructions_used: dict, where keys are submission IDs, and values are dicts with approaches and lists of grading instruction IDs.
+
+    Returns:
+    - A Plotly figure object with analytics on matching vs. non-matching grading instruction IDs.
+    - An HTML string explanation.
+    """
+    approach_stats = {}
+
+    for submission_id, approaches in grading_instructions_used.items():
+        if "Tutor" not in approaches:
+            continue
+
+        tutor_instructions = Counter(approaches["Tutor"])
+
+        for approach, instructions in approaches.items():
+            if approach == "Tutor":
+                continue
+
+            if approach not in approach_stats:
+                approach_stats[approach] = {"matches": 0, "non_matches": 0}
+
+            approach_instructions = Counter(instructions)
+
+            matches = 0
+            non_matches = 0
+
+            for instruction, count in approach_instructions.items():
+                if instruction in tutor_instructions:
+                    matches += min(count, tutor_instructions[instruction])
+                else:
+                    non_matches += count
+
+            approach_stats[approach]["matches"] += matches
+            approach_stats[approach]["non_matches"] += non_matches
+
+    approaches = list(approach_stats.keys())
+    matches = [approach_stats[approach]["matches"] for approach in approaches]
+    non_matches = [approach_stats[approach]["non_matches"] for approach in approaches]
+
+    fig = go.Figure()
+    fig.add_trace(go.Bar(
+        x=approaches, y=matches, name="Matching Instructions",
+        marker_color="green"
+    ))
+    fig.add_trace(go.Bar(
+        x=approaches, y=non_matches, name="Non-Matching Instructions",
+        marker_color="red"
+    ))
+
+    fig.update_layout(
+        barmode="group",
+        title="Matching vs. Non-Matching Grading Instructions by Approach",
+        xaxis_title="Approach",
+        yaxis_title="Count",
+        template="plotly_white",
+    )
+
+    html_explanation = """
+    <p>
+    This visualization compares the grading instructions used by different approaches
+    against the "Tutor" approach. The green bars represent the count of grading instructions
+    that match those of the Tutor approach, while the red bars show the count of non-matching
+    instructions. This analysis highlights alignment and deviations between approaches.
+    </p>
+    """
+
+    return fig, html_explanation
+
+def create_threshold_bar_plot(data, max_points):
+    thresholds = [0, 0.1, 0.15, 0.2, 0.25, 0.3]
+    data_dicts = []
+    for threshold in thresholds:
+        data_dicts.append(percentage_within_range(data, max_points, threshold))
+    fig = go.Figure()
+
+    for approach in data_dicts[0].keys():
+        fig.add_trace(go.Bar(
+            name=approach,
+            x=[f"{threshold*100}%" for threshold in thresholds],
+            y=[data[approach] for data in data_dicts],
+            text=[f"{v}%" for v in [data[approach] for data in data_dicts]],
+            textposition='auto'
+        ))
+
+    fig.update_layout(
+        title="Percentage of Counts Within Thresholds by Approach",
+        xaxis_title="Thresholds",
+        yaxis_title="Percentage (%)",
+        barmode='group',
+        legend_title="Approaches",
+        template='plotly'
+    )
+    html_explanation = """
+    <p>
+    For example, with a 10% threshold and a maximum of 5 points, only LLM results within 0.5 points of the tutor feedback are counted.
+    A bar at 20% therefore means that 20% of the LLM results lie within 0.5 points of the tutor feedback.
+    </p>
+    """
+    return {"fig": fig, "html_explanation": html_explanation}
+
+def percentage_within_range(data, max_points, threshold):
+    """Shows, per approach, the percentage of results whose total credits fall within threshold * max_points of the tutor's total credits.
+    Args:
+        data (_type_): the credits data
+    """
+    approach_credits = {}
+    for submission_id, approaches in data.items():
+        for approach, credits in approaches.items():
+            if approach not in approach_credits:
+                approach_credits[approach] = []
+            approach_credits[approach].append(sum(credits))
+
+    results = {}
+    tutor_credits = approach_credits["Tutor"]
+    for approach, credit_total in approach_credits.items():
+        if approach != "Tutor":
+            if approach not in results:
+                results[approach] = 0
+            for idx, credit in enumerate(credit_total):
+                within_range = calculate_within_cutoff(tutor_credits[idx], credit, max_points, threshold)
+                if within_range:
+                    results[approach] += 1
+    for approach, count in results.items():
+        results[approach] = round(count/len(tutor_credits)*100, 2)
+    return results
+
+def calculate_within_cutoff(tutor_value, llm_value, max_points, threshold):
+    upper_credit_cutoff = tutor_value + max_points * threshold
+    lower_credit_cutoff = tutor_value - max_points * threshold
+    within_range = lower_credit_cutoff <= llm_value <= upper_credit_cutoff
+    return within_range
diff --git a/modules/text/module_text_llm/module_text_llm/analytics/compile.py b/modules/text/module_text_llm/module_text_llm/analytics/compile.py
new file mode 100644
index 000000000..37137ad91
--- /dev/null
+++ b/modules/text/module_text_llm/module_text_llm/analytics/compile.py
@@ -0,0 +1,134 @@
+from module_text_llm.analytics.pre_processing import pre_processing
+from module_text_llm.analytics.analytics import create_threshold_bar_plot, total_credit_per_submission, failure_success, analyze_grading_instruction_usage, visualize_differences_histogram, normalized_absolute_difference, visualize_histogram_kde_percentages
+import os
+import traceback
+
+
+def compile(results):
+    """Compiles the analytics for the given results.
+    It first preprocesses the data and then calls multiple functions to generate the analytics.
+    All of these are put together in an HTML file, which is then returned as a string.
+    Through Plotly, the figures are embedded in the HTML file and are fully interactive.
+    """
+    try:
+        credits_per_submission,grading_instructions_used,exercise_id,grading_criteria,max_points,experiment_id,failures,submission_ids,title,problem_statement = pre_processing(results)
+        directory = "module_text_llm/analytics/created_analytics"
+        ensure_directory_exists(directory)
+        output_file = f"{directory}/analytics_{experiment_id}.html"
+
+        if file_exists(output_file):
+            return get_html_content(output_file)
+
+        ############################# CREDIT BASED ANALYTICS #############################
+        # Define them here, must return a dict of type {"fig":fig,"html_explanation":html_explanation}
+        creditPSub = total_credit_per_submission(credits_per_submission)
+        histo = visualize_differences_histogram(credits_per_submission, max_points)
+        kde_percent = visualize_histogram_kde_percentages(credits_per_submission, max_points)
+        nmda = normalized_absolute_difference(credits_per_submission, max_points)
+        fail = failure_success(credits_per_submission, failures, submission_ids)
+        threshold_bar_plot = create_threshold_bar_plot(credits_per_submission, max_points)
+
+        with open(output_file, "w", encoding="utf-8") as f:
+            f.write(get_introduction())
+
+            f.write("""