From d3208c74fb8ab52cd46db9fe519da26489a6e8b9 Mon Sep 17 00:00:00 2001 From: psaegert Date: Sun, 10 Mar 2024 10:54:26 +0100 Subject: [PATCH] Fix and Improve Docstrings - Fix typo in system prompt --- src/llmcoder/analyze/gpt_score_analyzer.py | 9 +- .../analyze/hallucination_analyzer.py | 15 ++- src/llmcoder/analyze/mypy_analyzer.py | 33 +++---- src/llmcoder/analyze/signature_analyzer.py | 17 ++-- src/llmcoder/conversation/conversation.py | 24 +++++ src/llmcoder/conversation/priority_queue.py | 10 ++ src/llmcoder/data/preprocessor.py | 45 +++++---- src/llmcoder/data/scraper.py | 39 ++++---- src/llmcoder/eval/evaluate.py | 91 ++++++++++--------- src/llmcoder/eval/metrics/extrinsic.py | 59 ++++++------ src/llmcoder/llmcoder.py | 38 ++++---- src/llmcoder/utils.py | 12 +-- system_prompts/2023-12-09_Scorer_v1.1.txt | 2 +- 13 files changed, 221 insertions(+), 173 deletions(-) diff --git a/src/llmcoder/analyze/gpt_score_analyzer.py b/src/llmcoder/analyze/gpt_score_analyzer.py index 4290bd4..4598c28 100644 --- a/src/llmcoder/analyze/gpt_score_analyzer.py +++ b/src/llmcoder/analyze/gpt_score_analyzer.py @@ -9,7 +9,7 @@ class GPTScoreAnalyzer(Analyzer): """ - Create a new GPTScoreAnalyzer + Analyzer that scores code using GPT-3.5 with a scoring prompt. Parameters ---------- @@ -18,7 +18,7 @@ class GPTScoreAnalyzer(Analyzer): scoring_prompt : str The scoring prompt to use reduction : str | None, optional - The reduction method to use, by default "geo" + The reduction method to use, by default "geo" (geometric mean) verbose : bool, optional Whether to print verbose output, by default False """ @@ -30,7 +30,8 @@ def __init__(self, client: OpenAI | None = None, scoring_prompt: str | None = No self.verbose = verbose def score_prompt(self, code_list: list[str]) -> str: - """Concatenates the code snippets with the scoring prompt in the following format: + """ + Concatenates the code snippets with the scoring prompt in the following format: Code snippet 1: ```python @@ -156,6 +157,8 @@ def analyze(self, input: str, completion: str, context: dict[str, dict[str, floa The input code completion : str The completion to analyze + context : dict[str, dict[str, float | int | str]] | None, optional + Ignored. The context of previous analyzers of the completion, by default None. reduction : str | None, optional The reduction method to use, by default "geo" diff --git a/src/llmcoder/analyze/hallucination_analyzer.py b/src/llmcoder/analyze/hallucination_analyzer.py index b324715..23db2a6 100644 --- a/src/llmcoder/analyze/hallucination_analyzer.py +++ b/src/llmcoder/analyze/hallucination_analyzer.py @@ -8,17 +8,14 @@ class HallucinationAnalyzer(Analyzer): """ Analyzer that checks mypy errors for hallucinations. + + Parameters + ---------- + verbose : bool + Whether to print debug messages. """ def __init__(self, verbose: bool = False) -> None: - """ - Initialize the SignatureAnalyzer. - - Parameters - ---------- - verbose : bool - Whether to print debug messages. - """ super().__init__(verbose) def analyze(self, input: str, completion: str, context: dict[str, dict[str, float | int | str]] | None = None) -> dict: @@ -31,7 +28,7 @@ def analyze(self, input: str, completion: str, context: dict[str, dict[str, floa The input code. completion : str The completion code. - context : dict[str, dict[str, float | int | str]] | None + context : dict[str, dict[str, float | int | str]] | None, optional The context from the previous analyzers. 
Returns diff --git a/src/llmcoder/analyze/mypy_analyzer.py b/src/llmcoder/analyze/mypy_analyzer.py index 38e2b12..67bb08c 100644 --- a/src/llmcoder/analyze/mypy_analyzer.py +++ b/src/llmcoder/analyze/mypy_analyzer.py @@ -9,25 +9,22 @@ class MypyAnalyzer(Analyzer): """ Analyzer that runs mypy on the code with the completion and returns the result. - """ + Parameters + ---------- + verbose : bool, optional + Whether to print verbose output, by default False. + """ def __init__(self, verbose: bool = False): - """ - Initializes the analyzer. - - Parameters - ---------- - verbose : bool, optional - Whether to print verbose output, by default False. - """ super().__init__(verbose=verbose) - def analyze(self, - input: str, - completion: str, - install_stubs: bool = True, - mypy_args: list[str] | None = None, - context: dict[str, dict[str, float | int | str]] | None = None) -> dict: + def analyze( + self, + input: str, + completion: str, + install_stubs: bool = True, + mypy_args: list[str] | None = None, + context: dict[str, dict[str, float | int | str]] | None = None) -> dict: """ Analyzes the completion using mypy. @@ -40,10 +37,10 @@ def analyze(self, The completion to analyze. install_stubs : bool, optional Whether to install missing stubs, by default True. - mypy_args : list[str], optional + mypy_args : list[str] | None, optional Additional arguments to pass to mypy, by default None. - context : dict[str, dict[str, float | int | str]], optional - The context of the completion, by default None. + context : dict[str, dict[str, float | int | str]] | None, optional + Ignored. The context of previous analyzers of the completion, by default None. Returns ------- diff --git a/src/llmcoder/analyze/signature_analyzer.py b/src/llmcoder/analyze/signature_analyzer.py index 4b77973..6f512f1 100644 --- a/src/llmcoder/analyze/signature_analyzer.py +++ b/src/llmcoder/analyze/signature_analyzer.py @@ -15,17 +15,14 @@ class SignatureAnalyzer(Analyzer): """ Analyzer that fetches the signatures and documentations of functions and classes in the code. + + Parameters + ---------- + verbose : bool + Whether to print debug messages. """ def __init__(self, verbose: bool = False) -> None: - """ - Initialize the SignatureAnalyzer. - - Parameters - ---------- - verbose : bool - Whether to print debug messages. - """ super().__init__(verbose) def get_imports(self, path: str, query: str | list[str] | None = None) -> Generator: @@ -309,8 +306,8 @@ def analyze(self, input: str, completion: str, context: dict[str, dict[str, floa The input code. completion : str The completion code. - context : dict[str, dict[str, float | int | str]] | None - The context from the previous analyzers. + context : dict[str, dict[str, float | int | str]] | None, optional + The context of previous analyzers of the completion. Returns ------- diff --git a/src/llmcoder/conversation/conversation.py b/src/llmcoder/conversation/conversation.py index 2bc8ffe..af8d640 100644 --- a/src/llmcoder/conversation/conversation.py +++ b/src/llmcoder/conversation/conversation.py @@ -2,6 +2,22 @@ class Conversation: + """ + A class to represent a conversation, which contains a list of messages, a score, and a list of analyses. 
+ + Parameters + ---------- + score : int + The score of the conversation + messages : list[dict[str, str]] + The list of messages in the conversation + analyses : list[dict[str, dict[str, float | int | str | bool]]] | None, optional + The list of analyses in the conversation, by default None + path : list[Any] | None, optional + The path of the conversation in the conversation tree, by default None + passing : bool, optional + Whether the conversation has passed all critical analyzers, by default False + """ def __init__( self, score: int, @@ -43,6 +59,14 @@ def add_to_path(self, choice: Any) -> "Conversation": return self def update_passing(self) -> "Conversation": + """ + Update the passing status of the conversation based on the critical analyzers + + Returns + ------- + Conversation + The conversation with the updated passing status + """ # Print how many critical analyzers have passed n_passed = sum(results['pass'] for results in self.analyses[-1].values() if (results['type'] == "critical" and type(results['pass']) is bool)) n_total = len([results for results in self.analyses[-1].values() if results['type'] == "critical" and type(results['pass']) is bool]) diff --git a/src/llmcoder/conversation/priority_queue.py b/src/llmcoder/conversation/priority_queue.py index 76b5898..dd37f63 100644 --- a/src/llmcoder/conversation/priority_queue.py +++ b/src/llmcoder/conversation/priority_queue.py @@ -6,6 +6,16 @@ class PriorityQueue: + """ + A priority queue for conversations, which sorts the conversations based on their scores. + + Parameters + ---------- + conversations : Conversation | list[Conversation] | None, optional + The conversations to be added to the priority queue, by default None + backtracking : bool, optional + Whether to allow re-considering previous conversations, by default True + """ def __init__(self, conversations: Conversation | list[Conversation] | None = None, backtracking: bool = True): self.queue: list[Conversation] = [] self.backtracking = backtracking diff --git a/src/llmcoder/data/preprocessor.py b/src/llmcoder/data/preprocessor.py index b0991dd..44d10e8 100644 --- a/src/llmcoder/data/preprocessor.py +++ b/src/llmcoder/data/preprocessor.py @@ -61,6 +61,11 @@ def split_file(file_contents: str, min_pos: int = 1, max_pos: int = None) -> tup The minimum position to split the file at, by default 1 max_pos : int, optional The maximum position to split the file at, by default None + + Returns + ------- + tuple[str, str] + A tuple containing the first and second part of the file. """ if max_pos is None: max_pos = len(file_contents) - 1 @@ -122,27 +127,27 @@ def sample_files_from_dir(repo_dir: str, n_samples: int = 4, file_extensions: li class Preprocessor: - def __init__(self, dataset_name: str, tokenizer: str = "p50k_base", scraped_files_dir: str | None = None, save_pairs_dir: str | None = None, save_data_dir: str | None = None, system_prompt: str | None = None, disallowed_special_tokens: list[str] | None = None) -> None: - """ - A preprocessor for the fine-tuning data which samples files from scraped repositories, splits them into two parts and saves them in a format that can be used for fine-tuning. + """ + A preprocessor for the fine-tuning data which samples files from scraped repositories, splits them into two parts and saves them in a format that can be used for fine-tuning. - Parameters - ---------- - dataset_name : str - The name of the dataset. 
- tokenizer : str, optional - The tokenizer to use, by default "p50k_base" for gpt-3.5-turbo - scraped_files_dir : str - The directory to store the scraped files in, defaults to 'scraped_repos'. - save_pairs_dir : str - The directory to store the sampled files in, defaults to 'pairs'. - save_data_dir : str - The directory to store the preprocessed data in, defaults to 'github_mix'. - system_prompt : str - The system prompt to use, defaults to the default system prompt. - disallowed_special_tokens : list[str] - A list of disallowed special tokens, defaults to the default disallowed special tokens. - """ + Parameters + ---------- + dataset_name : str + The name of the dataset. + tokenizer : str, optional + The tokenizer to use, by default "p50k_base" for gpt-3.5-turbo + scraped_files_dir : str + The directory to store the scraped files in, defaults to 'scraped_repos'. + save_pairs_dir : str + The directory to store the sampled files in, defaults to 'pairs'. + save_data_dir : str + The directory to store the preprocessed data in, defaults to 'github_mix'. + system_prompt : str + The system prompt to use, defaults to the default system prompt. + disallowed_special_tokens : list[str] + A list of disallowed special tokens, defaults to the default disallowed special tokens. + """ + def __init__(self, dataset_name: str, tokenizer: str = "p50k_base", scraped_files_dir: str | None = None, save_pairs_dir: str | None = None, save_data_dir: str | None = None, system_prompt: str | None = None, disallowed_special_tokens: list[str] | None = None) -> None: self.name = dataset_name self.enc = tiktoken.get_encoding(tokenizer) diff --git a/src/llmcoder/data/scraper.py b/src/llmcoder/data/scraper.py index 799600b..dbee637 100644 --- a/src/llmcoder/data/scraper.py +++ b/src/llmcoder/data/scraper.py @@ -14,20 +14,17 @@ class GitHubScraper: """ A class for scraping GitHub repositories and storing them in a flat structure. + + Parameters + ---------- + dataset_name : str + The name of the dataset to scrape repositories for. + access_token : str + A GitHub access token for authenticating with the GitHub API. + scraped_files_dir : str + The directory to store the scraped files in, defaults to 'scraped_repos'. """ def __init__(self, dataset_name: str, access_token: str | None = None, scraped_files_dir: str | None = None) -> None: - """ - Initialize the GitHubScraper class with a GitHub access token. - - Parameters - ---------- - dataset_name : str - The name of the dataset to scrape repositories for. - access_token : str - A GitHub access token for authenticating with the GitHub API. - scraped_files_dir : str - The directory to store the scraped files in, defaults to 'scraped_repos'. - """ self.name = dataset_name self.access_token = access_token @@ -47,9 +44,13 @@ def get_repos_with_query(self, query: str, num_repos: int = 1) -> list: ---------- query : str A GitHub API query. - num_repos : int The number of repositories to fetch. + + Returns + ------- + list + A list of repositories. """ if self.access_token is not None: headers = {'Authorization': f'token {self.access_token}'} @@ -150,13 +151,13 @@ def accumulate_repositories(self, repository_sets: list[list[str]] | None = None Parameters ---------- - repository_sets : list[list[str]] - A list of lists of repository URLs to scrape. Each list represents a set of repositories to scrape relating to a specific topic. + repository_sets : list[list[str]] | None, optional + A list of lists of repository URLs to scrape. 
Each list represents a set of repositories to scrape relating to a specific topic. If None, a default set of repositories will be used, by default None.

         Returns
         -------
-        list[str]
-            A list of repository URLs to scrape.
+        list[tuple[str, str]]
+            A list of tuples of (repo_url, repo_name).
         """
         if repository_sets is None:
             # Get the top 10 Python repositories by stars
@@ -230,8 +231,8 @@ def scrape_repositories(self, repos: list[tuple[str, str]] | None = None, max_n_

         Parameters
         ----------
-        repos : list[tuple[str, str]]
-            A list of tuples of (repo_url, repo_name).
+        repos : list[tuple[str, str]] | None, optional
+            A list of tuples of (repo_url, repo_name). If None, a default set of repositories will be used, by default None.
         max_n_repositories : int
             The maximum number of repositories to scrape.
         verbose : bool
diff --git a/src/llmcoder/eval/evaluate.py b/src/llmcoder/eval/evaluate.py
index e760c8b..6dff332 100644
--- a/src/llmcoder/eval/evaluate.py
+++ b/src/llmcoder/eval/evaluate.py
@@ -4,6 +4,7 @@
 import time
 from contextlib import redirect_stdout
 from datetime import datetime
+from typing import Any

 import pandas as pd
 from dynaconf import Dynaconf
@@ -15,6 +16,19 @@


 def check_config(config: Dynaconf) -> bool:
+    """
+    Check if the configuration is correctly formatted.
+
+    Parameters
+    ----------
+    config : Dynaconf
+        The configuration object from Dynaconf.
+
+    Returns
+    -------
+    bool
+        Whether the configuration is correctly formatted.
+    """
     # Return if all the required keys are present and the types are correct
     if not isinstance(config.get('analyzers'), list):
@@ -44,15 +58,13 @@


 class Evaluation:
+    """
+    Parameters
+    ----------
+    configs : Dynaconf | list[Dynaconf] | str | list[str] | None, optional
+        The configuration object(s) from Dynaconf or path(s) to configuration file(s).
+    """
     def __init__(self, configs: Dynaconf | list[Dynaconf] | str | list[str] | None = None):
-        """
-        Initialize the Evaluation with a configuration or list of configurations.
-
-        Parameters
-        ----------
-        configs : Dynaconf | list[Dynaconf] | str | list[str] | None, optional
-            The configuration object(s) from Dynaconf or path(s) to configuration file(s).
-        """
         if configs is None:
             self.configs = [
                 Dynaconf(settings_files=[os.path.join(get_config_dir(), config)])
@@ -89,14 +101,14 @@ def __init__(self, configs: Dynaconf | list[Dynaconf] | str | list[str] | None =

     def run(self, store: bool = True, n_repeat: int = 1, verbose: bool = False) -> dict[str, list]:
         """
-        Run the evaluation end to end (reading inputs from the database and writing results back)
+        Run the evaluation for the configurations

         Parameters
         ----------
         store : bool, optional
             Whether to store the results in the database, by default True
         n_repeat : int, optional
-            The number of times to repeat the evaluation, by default 1
+            The number of times to repeat the evaluation for better statistics about non-deterministic methods, by default 1
         verbose : bool, optional
             Whether to print the results to the console, by default False

@@ -126,7 +138,7 @@ def run(self, store: bool = True, n_repeat: int = 1, verbose: bool = False) -> d

     def predict(self, config: Dynaconf, store: bool = False, verbose: bool = False) -> dict:
         """
-        Run the LLMCoder on the provided files and write the results to the database.
+ Run the LLMCoder on the provided files Parameters ---------- @@ -159,7 +171,7 @@ def predict(self, config: Dynaconf, store: bool = False, verbose: bool = False) # Return the results return results - def run_llmcoder(self, config: Dynaconf, inputs: dict, verbose: bool = False) -> dict: + def run_llmcoder(self, config: Dynaconf, inputs: dict[Any, str], verbose: bool = False) -> dict: """ Run the LLMCoder on the provided files and return the results. @@ -167,17 +179,17 @@ def run_llmcoder(self, config: Dynaconf, inputs: dict, verbose: bool = False) -> ---------- config : Dynaconf The configuration object from Dynaconf. - inputs : List[str] - A list of inputs to complete with the LLMCoder. + inputs : dict[Any, str] + The inputs to run the LLMCoder on with unique identifiers as keys. verbose : bool, optional - Whether to print the results to the console, by default False + Whether to print the results to the standard output, by default False Returns ------- dict The results from the evaluation. """ - results: dict[str, dict] = {} + results: dict[Any, dict] = {} # Run the LLMCoder on each input for input_id, input in tqdm(inputs.items(), desc='Prediction', total=len(inputs), disable=not verbose): @@ -201,6 +213,7 @@ def run_llmcoder(self, config: Dynaconf, inputs: dict, verbose: bool = False) -> verbose=True ) + # In case of errors, print verbose output try: _ = llmcoder.complete(input, n=config.get('n_choices')) except TypeError as e: @@ -226,12 +239,14 @@ def run_llmcoder(self, config: Dynaconf, inputs: dict, verbose: bool = False) -> def _write_results(self, config: Dynaconf, results: dict) -> None: """ - Write the results back to the database. + Write the results to a file. Parameters ---------- + config : Dynaconf + The configuration object from Dynaconf. Used to identify the dataset for naming the file. results : dict - The results to write back to the database. + The results to write to a file. """ # Get the current time current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") @@ -246,15 +261,13 @@ def _write_results(self, config: Dynaconf, results: dict) -> None: class Metrics: + """ + Parameters + ---------- + configs : Dynaconf | list[Dynaconf] | str | list[str], optional + The configuration object(s) from Dynaconf, or path(s) to configuration files. + """ def __init__(self, configs: Dynaconf | list[Dynaconf] | str | list[str] | None = None): - """ - Initialize the Metrics with a Dynaconf configuration or path to configurations. - - Parameters - ---------- - configs : Dynaconf | list[Dynaconf] | str | list[str], optional - The configuration object(s) from Dynaconf, or path(s) to configuration files. - """ if configs is None: # Load all configurations from the config directory self.configs = [ @@ -293,25 +306,23 @@ def __init__(self, configs: Dynaconf | list[Dynaconf] | str | list[str] | None = for config in self.configs: print(f'\t- {config.settings_file_for_dynaconf[0]}') - def run(self, store: bool = False, index: int | None = None, verbose: bool = False, force: bool = False) -> dict[str, dict[str, dict[str, dict]]]: + def run(self, store: bool = True, index: int | None = None, verbose: bool = False) -> dict[str, dict[str, dict[str, dict]]]: """ - Analyze the results from the database given the configuration and store it back in the database. + Analyze the results from the database given the configuration. 
Parameters
         ----------
         store : bool, optional
-            Whether to store the analysis in the database, by default False
+            Whether to store the analysis in the database, by default True
         index : int, optional
             The index of the results to analyze, by default None (analyze all)
         verbose : bool, optional
             Whether to print the results to the console, by default False
-        force : bool, optional
-            Whether to force the analysis, by default False

         Returns
         -------
         dict[str, dict[str, dict[str, dict]]]
-            The analysis results.
+            The analysis results with the configuration as the key.
         """
         metrics = {}
@@ -347,11 +358,7 @@ def compute_metrics(self, config: Dynaconf, results_dict: dict[str, dict], targe
         results_dict : dict[str, dict]
             The results to compute the metrics for.
         targets : dict
-            The target completions to compute with the LLMCoder.
-        intrinsic_score_functions : list[callable]
-            The intrinsic score functions to use.
-        extrinsic_score_functions : list[callable]
-            The extrinsic score functions to use.
+            The target completions to compare the LLMCoder results to.
         store : bool, optional
             Whether to store the analysis in the database, by default False
         verbose : bool, optional
@@ -360,7 +367,7 @@
         Returns
         -------
         dict[str, dict[str, dict]]
-            The metrics for each result.
+            The metrics for each result with the repetition id as the key.
         """
         metrics_dict = {}
@@ -394,7 +401,7 @@ def load_results(self, config: Dynaconf, index: int | list[int] | None = None) -> dict[str, dict]:
         """
-        Read the results for a tuple from the database.
+        Read the evaluation results for the given configuration from a directory named after the configuration.

         Parameters
         ----------
@@ -406,7 +413,7 @@
         Returns
         -------
         dict[str, dict]
-            The results from the database.
+            The selected results from the evaluation.
         """
         config_name = os.path.splitext(os.path.split(config.settings_file_for_dynaconf[0])[-1])[0]
@@ -427,7 +434,7 @@ def _write_metrics(self, config: Dynaconf, result_repitition_id: str, metrics: d
         """
-        Write the analysis results back to the database.
+        Write the analysis results to a file.

         Parameters
         ----------
@@ -436,7 +443,7 @@
         result_repitition_id : str
             The id of the result repitition.
         metrics : dict
-            The analysis results to write back to the database.
+            The analysis results to write to a file.
         """
         # Create a dataframe from the analysis results
         df = pd.DataFrame.from_dict(metrics, orient='index')
diff --git a/src/llmcoder/eval/metrics/extrinsic.py b/src/llmcoder/eval/metrics/extrinsic.py
index 2caf62c..341a49b 100644
--- a/src/llmcoder/eval/metrics/extrinsic.py
+++ b/src/llmcoder/eval/metrics/extrinsic.py
@@ -1,3 +1,4 @@
+import textwrap
 import warnings
 from difflib import SequenceMatcher

@@ -181,18 +182,19 @@ def sequence_matcher_score(ground_truth: str, llmcoder_result: dict | str) -> fl

 def _user_prompt_template(code_1: str, code_2: str, qualities_list: list[str] | None) -> str:
     quality_list_string = '\n'.join([f'- {q}' for q in qualities_list]) if qualities_list is not None else ''

-    return f"""Assess and compare these two code snippets and evaluate the completions. Do your own analysis and also consider the following criteria:
-{quality_list_string}
-CODE 1:
-```python
-{code_1}
-```
-CODE 2:
-```python
-{code_2}
-```"""
+    return textwrap.dedent("""\
+        Assess and compare these two code snippets and evaluate the completions. Do your own analysis and also consider the following criteria:
+        {quality_list_string}
+        CODE 1:
+        ```python
+        {code_1}
+        ```
+        CODE 2:
+        ```python
+        {code_2}
+        ```""").format(quality_list_string=quality_list_string, code_1=code_1, code_2=code_2)


 def _get_scores(ground_truth: str, completion: str, system_prompt_compare: str, qualities_list: list[str] | None = None, model: str = "gpt-3.5-turbo", max_iter: int = 5) -> float:
@@ -249,24 +251,25 @@ def gpt_reviewer_score(ground_truth: str, llmcoder_result: dict | str, model: st
         The similarity between the two strings. Positive if the completion is better than the ground truth, negative otherwise.
     """
-    system_prompt_compare = """You are a data scientist tasked with comparing and evaluating code completions made by a language model.
-The user will submit two code snippets with the same beginning but different completions.
-Given these snippets, you evaluate the completions in a concise way, and give each a score between 0 and 10, with 0 being the worst (unusable completion that would make a developer frustrated) and 10 being the best (perfect completion that would make a developer happy).
-The user may ask you to prioritize different qualities of the code.
-Take these priorities into account when scoring the completions.
-Your output must always have the following format:
-```
-COMPARISON:
-
-SCORE 1:
-SCORE 2:
-```
-
-Do not include any other information in your output.
-It is very important that the output following "SCORE 1: " and "SCORE 2: " is a single integer between 0 and 10, with no other characters or spaces since scores will later be parsed at this exact location.
-Therefore, make sure to keep your comparison (the text after COMPARISON:) concise, and adhere to the score format exactly.
-"""
+    system_prompt_compare = textwrap.dedent("""\
+        You are a data scientist tasked with comparing and evaluating code completions made by a language model.
+        The user will submit two code snippets with the same beginning but different completions.
+        Given these snippets, you evaluate the completions in a concise way, and give each a score between 0 and 10, with 0 being the worst (unusable completion that would make a developer frustrated) and 10 being the best (perfect completion that would make a developer happy).
+        The user may ask you to prioritize different qualities of the code.
+        Take these priorities into account when scoring the completions.
+        Your output must always have the following format:
+        ```
+        COMPARISON:
+
+        SCORE 1:
+        SCORE 2:
+        ```
+
+        Do not include any other information in your output.
+        It is very important that the output following "SCORE 1: " and "SCORE 2: " is a single integer between 0 and 10, with no other characters or spaces since scores will later be parsed at this exact location.
+        Therefore, make sure to keep your comparison (the text after COMPARISON:) concise, and adhere to the score format exactly.
+ """) if qualities_list is None: qualities_list = [ diff --git a/src/llmcoder/llmcoder.py b/src/llmcoder/llmcoder.py index 1193c16..2f4ee4b 100644 --- a/src/llmcoder/llmcoder.py +++ b/src/llmcoder/llmcoder.py @@ -24,7 +24,7 @@ class LLMCoder: model_feedback : str, optional The model to use for the feedback loop, by default "ft:gpt-3.5-turbo-1106:personal::8LCi9Q0d" feedback_variant : str, optional - The feedback variant to use, by default "coworker" + The feedback variant to use, one of ["separate", "coworker"], by default "coworker", which enables a shared context for the analyzers system_prompt : str, optional The system prompt to use, by default the one used for preprocessing and fine-tuning max_iter : int, optional @@ -47,7 +47,7 @@ def __init__( system_prompt: str | None = None, max_iter: int = 10, backtracking: bool = True, - log_conversation: bool = True, + log_conversation: bool = False, n_procs: int = 1, verbose: bool = True) -> None: @@ -102,7 +102,7 @@ def __init__( def _get_best_completion(self, conversations: list[Conversation]) -> str: """ - Get the best completion from the provided conversations + Get the best completion from the provided conversations measured by their `score` Parameters ---------- @@ -128,14 +128,14 @@ def complete(self, code: str, temperature: float = 0.7, meta_temperature: float temperature : float, optional The temperature to use for the completion, by default 0.7 meta_temperature : float, optional - The temperature to use for choosing the most promising conversation, by default 0.1 + The temperature to use for choosing the most promising conversation, by default 0.0 n : int, optional The number of choices to generate, by default 1 Returns ------- str - The completed code + The code completion """ # Reset the feedback loop and internal variables self._reset_loop() @@ -182,6 +182,7 @@ def _create_conversation_file(cls) -> str: str The path to the conversation file """ + # FIXME: Add support for backtracking and graph mode return os.path.join(get_conversations_dir(create=True), f"{datetime.now()}.jsonl") def _is_bad_completion(self, completion: str) -> bool: @@ -192,7 +193,7 @@ def _is_bad_completion(self, completion: str) -> bool: Parameters ---------- completion : str - The completion to check + The completion to check against the existing conversations Returns ------- @@ -209,7 +210,7 @@ def _is_bad_completion(self, completion: str) -> bool: def _get_completions_for( self, conversation: Conversation, - model: str = 'gpt-3.5-turbo', + model: str = 'ft:gpt-3.5-turbo-1106:personal::8LCi9Q0d', temperature: float = 0.7, n: int = 1, max_retries: int = 5, @@ -223,15 +224,15 @@ def _get_completions_for( Parameters ---------- conversation: Conversation - Tuple in the priority queue. Contains the completion/code over which the model will complete. + The conversation to get completions for. 
Usually the most promising conversation from the priority queue model : str, optional - The model to use for the completion, by default 'gpt-3.5-turbo' + The model to use for the completion, by default 'ft:gpt-3.5-turbo-1106:personal::8LCi9Q0d' temperature : float, optional The temperature to use for the completion, by default 0.7 n : int, optional The number of choices to generate, by default 1 max_retries : int, optional - The maximum number of retries to get a valid completion, by default 5 + The maximum number of retries to get a valid completion due to repeated mistakes or duplicates, by default 5 delta_temperature : float, optional The amount to increase the temperature in case of repeated mistakes, by default 0.2 max_temperature : float, optional @@ -368,14 +369,14 @@ def _run_analyzers(self, code: str, completion: str) -> dict[str, dict]: Parameters ---------- code : str - The code to analyze + The beginning of the code completion : str - The completion to analyze + The completion of the code to analyze Returns ------- dict[str, dict] - The analyzer results + The analyzer results with the analyzer names as keys and the results as values """ analyzer_results: dict[str, dict] = {} @@ -407,7 +408,7 @@ def _feedback_prompt_template(self, result_messages: list[str]) -> str: Parameters ---------- result_messages : list[str] - The analyzer result messages + The analyzer result messages, typically obtained by concatenating the `message` field of the analyzer results Returns ------- @@ -418,8 +419,11 @@ def _feedback_prompt_template(self, result_messages: list[str]) -> str: def _step(self, code: str, temperature: float = 0.7, meta_temperature: float = 0.0, n: int = 1) -> None: """ - Complete the provided code with the OpenAI model and feedback, if available - Make choice on highest scored snippet through PriorityQueue.pop(). + Run one step of the feedback loop, including + - getting and duplicating the most promising conversation from the priority queue + - adding the user's code to the conversation + - getting completions for the conversation with `LLMCoder._get_completions_for` + - adding the completions to the priority queue Parameters ---------- @@ -428,7 +432,7 @@ def _step(self, code: str, temperature: float = 0.7, meta_temperature: float = 0 temperature : float, optional The temperature to use for the completion, by default 0.7 meta_temperature : float, optional - The temperature to use for choosing the most promising conversation, by default 0.1 + The temperature to use for choosing the most promising conversation, by default 0.0 n : int, optional The number of choices to generate, by default 1 """ diff --git a/src/llmcoder/utils.py b/src/llmcoder/utils.py index 9a6ef35..c5c0f84 100644 --- a/src/llmcoder/utils.py +++ b/src/llmcoder/utils.py @@ -8,7 +8,7 @@ def get_data_dir(*args: str, create: bool = False) -> str: Parameters ---------- args : str - The path to the data directory. + The path to the data directory in /data. create : bool Whether to create the directory if it does not exist. @@ -34,7 +34,7 @@ def get_config_dir(*args: str, create: bool = False) -> str: Parameters ---------- args : str - The path to the configs directory. + The path to the configs directory in /configs. create : bool Whether to create the directory if it does not exist. @@ -55,7 +55,7 @@ def get_config_dir(*args: str, create: bool = False) -> str: def get_openai_key(key: str = "") -> str: """ - Get OpenAI API key. Try to interpret the key as a key first, then as a path to a file containing the key. 
+    Get OpenAI API key. Try to interpret the `key` parameter as a key first, then as a path to a file containing the key.
     Finally, fall back to the default key.txt file or the OPENAI_KEY environment variable.

     Parameters
     ----------
@@ -92,7 +92,7 @@ def get_github_access_token(token: str = "") -> str:
     """
-    Get GitHub access token. Try to interpret the token as a token first, then as a path to a file containing the token.
+    Get GitHub access token. Try to interpret the `token` parameter as a token first, then as a path to a file containing the token.
     Finally, fall back to the default token.txt file or the GITHUB_ACCESS_TOKEN environment variable.

     Parameters
     ----------
@@ -151,7 +151,7 @@ def get_system_prompt(name: str = "2023-11-15_GPT-Builder.txt") -> str:

 def get_system_prompt_dir(*args: str, create: bool = False) -> str:
     """
-    Get the path to the system prompts directory.
+    Get the path to the system prompts directory in /system_prompts.

     Parameters
     ----------
@@ -177,7 +177,7 @@ def get_conversations_dir(*args: str, create: bool = False) -> str:
     """
-    Get the path to the log directory.
+    Get the path to the log directory in /conversations.

     Parameters
     ----------
diff --git a/system_prompts/2023-12-09_Scorer_v1.1.txt b/system_prompts/2023-12-09_Scorer_v1.1.txt
index ed02602..c905f66 100644
--- a/system_prompts/2023-12-09_Scorer_v1.1.txt
+++ b/system_prompts/2023-12-09_Scorer_v1.1.txt
@@ -1,7 +1,7 @@
 "You are a scoring system for python code that meticulously analyzes a code snippet and judges its quality in given categories.
 You are given a python code snippet and asked to score it based on the following categories:
 - the code quality: How well the code conforms to the python style guide
-- the plausibliity of the last few lines: How much sense the last few lines make
+- the plausibility of the last few lines: How much sense the last few lines make
 - the consistency of the last few lines: How good the last few lines fit to the beginning and middle of the code
 - the readability of the code: How easy it is to understand the code
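For reference, the prompt templates rewritten above rely on `textwrap.dedent`, which only strips whitespace that is common to every non-blank line: if the first line of the triple-quoted string carries no indentation, nothing is removed at all. The sketch below is illustrative only (the snippet and variable names are hypothetical, not taken from the repository); it shows why the first line is escaped with a backslash and why a multi-line value is substituted via `str.format` only after dedenting.

```python
import textwrap

# Hypothetical value used only for this illustration.
code_1 = "def add(a, b):\n    return a + b"

# The first line has no margin, so the common leading whitespace is "" and
# dedent() returns the string unchanged: the 4-space indentation leaks into
# the prompt.
broken = textwrap.dedent(f"""Compare the snippets below.
    CODE 1:
    {code_1}
    """)

# Escaping the first newline and indenting every line gives dedent() a real
# margin to strip; substituting the multi-line value afterwards keeps it from
# breaking the margin detection.
fixed = textwrap.dedent("""\
    Compare the snippets below.
    CODE 1:
    {code_1}
    """).format(code_1=code_1)

print(repr(broken))
print(repr(fixed))
```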