From d3208c74fb8ab52cd46db9fe519da26489a6e8b9 Mon Sep 17 00:00:00 2001 From: psaegert Date: Sun, 10 Mar 2024 10:54:26 +0100 Subject: [PATCH] Fix and Improve Docstrings - Fix typo in system prompt --- src/llmcoder/analyze/gpt_score_analyzer.py | 9 +- .../analyze/hallucination_analyzer.py | 15 ++- src/llmcoder/analyze/mypy_analyzer.py | 33 +++---- src/llmcoder/analyze/signature_analyzer.py | 17 ++-- src/llmcoder/conversation/conversation.py | 24 +++++ src/llmcoder/conversation/priority_queue.py | 10 ++ src/llmcoder/data/preprocessor.py | 45 +++++---- src/llmcoder/data/scraper.py | 39 ++++---- src/llmcoder/eval/evaluate.py | 91 ++++++++++--------- src/llmcoder/eval/metrics/extrinsic.py | 59 ++++++------ src/llmcoder/llmcoder.py | 38 ++++---- src/llmcoder/utils.py | 12 +-- system_prompts/2023-12-09_Scorer_v1.1.txt | 2 +- 13 files changed, 221 insertions(+), 173 deletions(-) diff --git a/src/llmcoder/analyze/gpt_score_analyzer.py b/src/llmcoder/analyze/gpt_score_analyzer.py index 4290bd4..4598c28 100644 --- a/src/llmcoder/analyze/gpt_score_analyzer.py +++ b/src/llmcoder/analyze/gpt_score_analyzer.py @@ -9,7 +9,7 @@ class GPTScoreAnalyzer(Analyzer): """ - Create a new GPTScoreAnalyzer + Analyzer that scores code using GPT-3.5 with a scoring prompt. Parameters ---------- @@ -18,7 +18,7 @@ class GPTScoreAnalyzer(Analyzer): scoring_prompt : str The scoring prompt to use reduction : str | None, optional - The reduction method to use, by default "geo" + The reduction method to use, by default "geo" (geometric mean) verbose : bool, optional Whether to print verbose output, by default False """ @@ -30,7 +30,8 @@ def __init__(self, client: OpenAI | None = None, scoring_prompt: str | None = No self.verbose = verbose def score_prompt(self, code_list: list[str]) -> str: - """Concatenates the code snippets with the scoring prompt in the following format: + """ + Concatenates the code snippets with the scoring prompt in the following format: Code snippet 1: ```python @@ -156,6 +157,8 @@ def analyze(self, input: str, completion: str, context: dict[str, dict[str, floa The input code completion : str The completion to analyze + context : dict[str, dict[str, float | int | str]] | None, optional + Ignored. The context of previous analyzers of the completion, by default None. reduction : str | None, optional The reduction method to use, by default "geo" diff --git a/src/llmcoder/analyze/hallucination_analyzer.py b/src/llmcoder/analyze/hallucination_analyzer.py index b324715..23db2a6 100644 --- a/src/llmcoder/analyze/hallucination_analyzer.py +++ b/src/llmcoder/analyze/hallucination_analyzer.py @@ -8,17 +8,14 @@ class HallucinationAnalyzer(Analyzer): """ Analyzer that checks mypy errors for hallucinations. + + Parameters + ---------- + verbose : bool + Whether to print debug messages. """ def __init__(self, verbose: bool = False) -> None: - """ - Initialize the SignatureAnalyzer. - - Parameters - ---------- - verbose : bool - Whether to print debug messages. - """ super().__init__(verbose) def analyze(self, input: str, completion: str, context: dict[str, dict[str, float | int | str]] | None = None) -> dict: @@ -31,7 +28,7 @@ def analyze(self, input: str, completion: str, context: dict[str, dict[str, floa The input code. completion : str The completion code. - context : dict[str, dict[str, float | int | str]] | None + context : dict[str, dict[str, float | int | str]] | None, optional The context from the previous analyzers. 
Returns diff --git a/src/llmcoder/analyze/mypy_analyzer.py b/src/llmcoder/analyze/mypy_analyzer.py index 38e2b12..67bb08c 100644 --- a/src/llmcoder/analyze/mypy_analyzer.py +++ b/src/llmcoder/analyze/mypy_analyzer.py @@ -9,25 +9,22 @@ class MypyAnalyzer(Analyzer): """ Analyzer that runs mypy on the code with the completion and returns the result. - """ + Parameters + ---------- + verbose : bool, optional + Whether to print verbose output, by default False. + """ def __init__(self, verbose: bool = False): - """ - Initializes the analyzer. - - Parameters - ---------- - verbose : bool, optional - Whether to print verbose output, by default False. - """ super().__init__(verbose=verbose) - def analyze(self, - input: str, - completion: str, - install_stubs: bool = True, - mypy_args: list[str] | None = None, - context: dict[str, dict[str, float | int | str]] | None = None) -> dict: + def analyze( + self, + input: str, + completion: str, + install_stubs: bool = True, + mypy_args: list[str] | None = None, + context: dict[str, dict[str, float | int | str]] | None = None) -> dict: """ Analyzes the completion using mypy. @@ -40,10 +37,10 @@ def analyze(self, The completion to analyze. install_stubs : bool, optional Whether to install missing stubs, by default True. - mypy_args : list[str], optional + mypy_args : list[str] | None, optional Additional arguments to pass to mypy, by default None. - context : dict[str, dict[str, float | int | str]], optional - The context of the completion, by default None. + context : dict[str, dict[str, float | int | str]] | None, optional + Ignored. The context of previous analyzers of the completion, by default None. Returns ------- diff --git a/src/llmcoder/analyze/signature_analyzer.py b/src/llmcoder/analyze/signature_analyzer.py index 4b77973..6f512f1 100644 --- a/src/llmcoder/analyze/signature_analyzer.py +++ b/src/llmcoder/analyze/signature_analyzer.py @@ -15,17 +15,14 @@ class SignatureAnalyzer(Analyzer): """ Analyzer that fetches the signatures and documentations of functions and classes in the code. + + Parameters + ---------- + verbose : bool + Whether to print debug messages. """ def __init__(self, verbose: bool = False) -> None: - """ - Initialize the SignatureAnalyzer. - - Parameters - ---------- - verbose : bool - Whether to print debug messages. - """ super().__init__(verbose) def get_imports(self, path: str, query: str | list[str] | None = None) -> Generator: @@ -309,8 +306,8 @@ def analyze(self, input: str, completion: str, context: dict[str, dict[str, floa The input code. completion : str The completion code. - context : dict[str, dict[str, float | int | str]] | None - The context from the previous analyzers. + context : dict[str, dict[str, float | int | str]] | None, optional + The context of previous analyzers of the completion. Returns ------- diff --git a/src/llmcoder/conversation/conversation.py b/src/llmcoder/conversation/conversation.py index 2bc8ffe..af8d640 100644 --- a/src/llmcoder/conversation/conversation.py +++ b/src/llmcoder/conversation/conversation.py @@ -2,6 +2,22 @@ class Conversation: + """ + A class to represent a conversation, which contains a list of messages, a score, and a list of analyses. 
+ + Parameters + ---------- + score : int + The score of the conversation + messages : list[dict[str, str]] + The list of messages in the conversation + analyses : list[dict[str, dict[str, float | int | str | bool]]] | None, optional + The list of analyses in the conversation, by default None + path : list[Any] | None, optional + The path of the conversation in the conversation tree, by default None + passing : bool, optional + Whether the conversation has passed all critical analyzers, by default False + """ def __init__( self, score: int, @@ -43,6 +59,14 @@ def add_to_path(self, choice: Any) -> "Conversation": return self def update_passing(self) -> "Conversation": + """ + Update the passing status of the conversation based on the critical analyzers + + Returns + ------- + Conversation + The conversation with the updated passing status + """ # Print how many critical analyzers have passed n_passed = sum(results['pass'] for results in self.analyses[-1].values() if (results['type'] == "critical" and type(results['pass']) is bool)) n_total = len([results for results in self.analyses[-1].values() if results['type'] == "critical" and type(results['pass']) is bool]) diff --git a/src/llmcoder/conversation/priority_queue.py b/src/llmcoder/conversation/priority_queue.py index 76b5898..dd37f63 100644 --- a/src/llmcoder/conversation/priority_queue.py +++ b/src/llmcoder/conversation/priority_queue.py @@ -6,6 +6,16 @@ class PriorityQueue: + """ + A priority queue for conversations, which sorts the conversations based on their scores. + + Parameters + ---------- + conversations : Conversation | list[Conversation] | None, optional + The conversations to be added to the priority queue, by default None + backtracking : bool, optional + Whether to allow re-considering previous conversations, by default True + """ def __init__(self, conversations: Conversation | list[Conversation] | None = None, backtracking: bool = True): self.queue: list[Conversation] = [] self.backtracking = backtracking diff --git a/src/llmcoder/data/preprocessor.py b/src/llmcoder/data/preprocessor.py index b0991dd..44d10e8 100644 --- a/src/llmcoder/data/preprocessor.py +++ b/src/llmcoder/data/preprocessor.py @@ -61,6 +61,11 @@ def split_file(file_contents: str, min_pos: int = 1, max_pos: int = None) -> tup The minimum position to split the file at, by default 1 max_pos : int, optional The maximum position to split the file at, by default None + + Returns + ------- + tuple[str, str] + A tuple containing the first and second part of the file. """ if max_pos is None: max_pos = len(file_contents) - 1 @@ -122,27 +127,27 @@ def sample_files_from_dir(repo_dir: str, n_samples: int = 4, file_extensions: li class Preprocessor: - def __init__(self, dataset_name: str, tokenizer: str = "p50k_base", scraped_files_dir: str | None = None, save_pairs_dir: str | None = None, save_data_dir: str | None = None, system_prompt: str | None = None, disallowed_special_tokens: list[str] | None = None) -> None: - """ - A preprocessor for the fine-tuning data which samples files from scraped repositories, splits them into two parts and saves them in a format that can be used for fine-tuning. + """ + A preprocessor for the fine-tuning data which samples files from scraped repositories, splits them into two parts and saves them in a format that can be used for fine-tuning. - Parameters - ---------- - dataset_name : str - The name of the dataset. 
- tokenizer : str, optional - The tokenizer to use, by default "p50k_base" for gpt-3.5-turbo - scraped_files_dir : str - The directory to store the scraped files in, defaults to 'scraped_repos'. - save_pairs_dir : str - The directory to store the sampled files in, defaults to 'pairs'. - save_data_dir : str - The directory to store the preprocessed data in, defaults to 'github_mix'. - system_prompt : str - The system prompt to use, defaults to the default system prompt. - disallowed_special_tokens : list[str] - A list of disallowed special tokens, defaults to the default disallowed special tokens. - """ + Parameters + ---------- + dataset_name : str + The name of the dataset. + tokenizer : str, optional + The tokenizer to use, by default "p50k_base" for gpt-3.5-turbo + scraped_files_dir : str + The directory to store the scraped files in, defaults to 'scraped_repos'. + save_pairs_dir : str + The directory to store the sampled files in, defaults to 'pairs'. + save_data_dir : str + The directory to store the preprocessed data in, defaults to 'github_mix'. + system_prompt : str + The system prompt to use, defaults to the default system prompt. + disallowed_special_tokens : list[str] + A list of disallowed special tokens, defaults to the default disallowed special tokens. + """ + def __init__(self, dataset_name: str, tokenizer: str = "p50k_base", scraped_files_dir: str | None = None, save_pairs_dir: str | None = None, save_data_dir: str | None = None, system_prompt: str | None = None, disallowed_special_tokens: list[str] | None = None) -> None: self.name = dataset_name self.enc = tiktoken.get_encoding(tokenizer) diff --git a/src/llmcoder/data/scraper.py b/src/llmcoder/data/scraper.py index 799600b..dbee637 100644 --- a/src/llmcoder/data/scraper.py +++ b/src/llmcoder/data/scraper.py @@ -14,20 +14,17 @@ class GitHubScraper: """ A class for scraping GitHub repositories and storing them in a flat structure. + + Parameters + ---------- + dataset_name : str + The name of the dataset to scrape repositories for. + access_token : str + A GitHub access token for authenticating with the GitHub API. + scraped_files_dir : str + The directory to store the scraped files in, defaults to 'scraped_repos'. """ def __init__(self, dataset_name: str, access_token: str | None = None, scraped_files_dir: str | None = None) -> None: - """ - Initialize the GitHubScraper class with a GitHub access token. - - Parameters - ---------- - dataset_name : str - The name of the dataset to scrape repositories for. - access_token : str - A GitHub access token for authenticating with the GitHub API. - scraped_files_dir : str - The directory to store the scraped files in, defaults to 'scraped_repos'. - """ self.name = dataset_name self.access_token = access_token @@ -47,9 +44,13 @@ def get_repos_with_query(self, query: str, num_repos: int = 1) -> list: ---------- query : str A GitHub API query. - num_repos : int The number of repositories to fetch. + + Returns + ------- + list + A list of repositories. """ if self.access_token is not None: headers = {'Authorization': f'token {self.access_token}'} @@ -150,13 +151,13 @@ def accumulate_repositories(self, repository_sets: list[list[str]] | None = None Parameters ---------- - repository_sets : list[list[str]] - A list of lists of repository URLs to scrape. Each list represents a set of repositories to scrape relating to a specific topic. + repository_sets : list[list[str]] | None, optional + A list of lists of repository URLs to scrape. 
Each list represents a set of repositories to scrape relating to a specific topic. If None, a default set of repositories will be used, by default None.

         Returns
         -------
-        list[str]
-            A list of repository URLs to scrape.
+        list[tuple[str, str]]
+            A list of tuples of (repo_url, repo_name).
         """
         if repository_sets is None:
             # Get the top 10 Python repositories by stars
@@ -230,8 +231,8 @@ def scrape_repositories(self, repos: list[tuple[str, str]] | None = None, max_n_

         Parameters
         ----------
-        repos : list[tuple[str, str]]
-            A list of tuples of (repo_url, repo_name).
+        repos : list[tuple[str, str]] | None, optional
+            A list of tuples of (repo_url, repo_name). If None, a default set of repositories will be used, by default None.
         max_n_repositories : int
             The maximum number of repositories to scrape.
         verbose : bool
diff --git a/src/llmcoder/eval/evaluate.py b/src/llmcoder/eval/evaluate.py
index e760c8b..6dff332 100644
--- a/src/llmcoder/eval/evaluate.py
+++ b/src/llmcoder/eval/evaluate.py
@@ -4,6 +4,7 @@
 import time
 from contextlib import redirect_stdout
 from datetime import datetime
+from typing import Any

 import pandas as pd
 from dynaconf import Dynaconf
@@ -15,6 +16,19 @@


 def check_config(config: Dynaconf) -> bool:
+    """
+    Check if the configuration is correctly formatted.
+
+    Parameters
+    ----------
+    config : Dynaconf
+        The configuration object from Dynaconf.
+
+    Returns
+    -------
+    bool
+        Whether the configuration is correctly formatted.
+    """
     # Return if all the required keys are present and the types are correct
     if not isinstance(config.get('analyzers'), list):
@@ -44,15 +58,13 @@


 class Evaluation:
+    """
+    Parameters
+    ----------
+    configs : Dynaconf | list[Dynaconf] | str | list[str] | None, optional
+        The configuration object(s) from Dynaconf or path(s) to configuration file(s).
+    """
     def __init__(self, configs: Dynaconf | list[Dynaconf] | str | list[str] | None = None):
-        """
-        Initialize the Evaluation with a configuration or list of configurations.
-
-        Parameters
-        ----------
-        configs : Dynaconf | list[Dynaconf] | str | list[str] | None, optional
-            The configuration object(s) from Dynaconf or path(s) to configuration file(s).
-        """
         if configs is None:
             self.configs = [
                 Dynaconf(settings_files=[os.path.join(get_config_dir(), config)])
@@ -89,14 +101,14 @@ def __init__(self, configs: Dynaconf | list[Dynaconf] | str | list[str] | None =

     def run(self, store: bool = True, n_repeat: int = 1, verbose: bool = False) -> dict[str, list]:
         """
-        Run the evaluation end to end (reading inputs from the database and writing results back)
+        Run the evaluation for the configurations

         Parameters
         ----------
         store : bool, optional
             Whether to store the results in the database, by default True
         n_repeat : int, optional
-            The number of times to repeat the evaluation, by default 1
+            The number of times to repeat the evaluation for better statistics about non-deterministic methods, by default 1
         verbose : bool, optional
             Whether to print the results to the console, by default False

@@ -126,7 +138,7 @@ def run(self, store: bool = True, n_repeat: int = 1, verbose: bool = False) -> d

     def predict(self, config: Dynaconf, store: bool = False, verbose: bool = False) -> dict:
         """
-        Run the LLMCoder on the provided files and write the results to the database.
+ Run the LLMCoder on the provided files Parameters ---------- @@ -159,7 +171,7 @@ def predict(self, config: Dynaconf, store: bool = False, verbose: bool = False) # Return the results return results - def run_llmcoder(self, config: Dynaconf, inputs: dict, verbose: bool = False) -> dict: + def run_llmcoder(self, config: Dynaconf, inputs: dict[Any, str], verbose: bool = False) -> dict: """ Run the LLMCoder on the provided files and return the results. @@ -167,17 +179,17 @@ def run_llmcoder(self, config: Dynaconf, inputs: dict, verbose: bool = False) -> ---------- config : Dynaconf The configuration object from Dynaconf. - inputs : List[str] - A list of inputs to complete with the LLMCoder. + inputs : dict[Any, str] + The inputs to run the LLMCoder on with unique identifiers as keys. verbose : bool, optional - Whether to print the results to the console, by default False + Whether to print the results to the standard output, by default False Returns ------- dict The results from the evaluation. """ - results: dict[str, dict] = {} + results: dict[Any, dict] = {} # Run the LLMCoder on each input for input_id, input in tqdm(inputs.items(), desc='Prediction', total=len(inputs), disable=not verbose): @@ -201,6 +213,7 @@ def run_llmcoder(self, config: Dynaconf, inputs: dict, verbose: bool = False) -> verbose=True ) + # In case of errors, print verbose output try: _ = llmcoder.complete(input, n=config.get('n_choices')) except TypeError as e: @@ -226,12 +239,14 @@ def run_llmcoder(self, config: Dynaconf, inputs: dict, verbose: bool = False) -> def _write_results(self, config: Dynaconf, results: dict) -> None: """ - Write the results back to the database. + Write the results to a file. Parameters ---------- + config : Dynaconf + The configuration object from Dynaconf. Used to identify the dataset for naming the file. results : dict - The results to write back to the database. + The results to write to a file. """ # Get the current time current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") @@ -246,15 +261,13 @@ def _write_results(self, config: Dynaconf, results: dict) -> None: class Metrics: + """ + Parameters + ---------- + configs : Dynaconf | list[Dynaconf] | str | list[str], optional + The configuration object(s) from Dynaconf, or path(s) to configuration files. + """ def __init__(self, configs: Dynaconf | list[Dynaconf] | str | list[str] | None = None): - """ - Initialize the Metrics with a Dynaconf configuration or path to configurations. - - Parameters - ---------- - configs : Dynaconf | list[Dynaconf] | str | list[str], optional - The configuration object(s) from Dynaconf, or path(s) to configuration files. - """ if configs is None: # Load all configurations from the config directory self.configs = [ @@ -293,25 +306,23 @@ def __init__(self, configs: Dynaconf | list[Dynaconf] | str | list[str] | None = for config in self.configs: print(f'\t- {config.settings_file_for_dynaconf[0]}') - def run(self, store: bool = False, index: int | None = None, verbose: bool = False, force: bool = False) -> dict[str, dict[str, dict[str, dict]]]: + def run(self, store: bool = True, index: int | None = None, verbose: bool = False) -> dict[str, dict[str, dict[str, dict]]]: """ - Analyze the results from the database given the configuration and store it back in the database. + Analyze the results from the database given the configuration. 
Parameters
         ----------
         store : bool, optional
-            Whether to store the analysis in the database, by default False
+            Whether to store the analysis in the database, by default True
         index : int, optional
             The index of the results to analyze, by default None (analyze all)
         verbose : bool, optional
             Whether to print the results to the console, by default False
-        force : bool, optional
-            Whether to force the analysis, by default False

         Returns
         -------
         dict[str, dict[str, dict[str, dict]]]
-            The analysis results.
+            The analysis results with the configuration as the key.
         """
         metrics = {}
@@ -347,11 +358,7 @@ def compute_metrics(self, config: Dynaconf, results_dict: dict[str, dict], targe
         results_dict : dict[str, dict]
             The results to compute the metrics for.
         targets : dict
-            The target completions to compute with the LLMCoder.
-        intrinsic_score_functions : list[callable]
-            The intrinsic score functions to use.
-        extrinsic_score_functions : list[callable]
-            The extrinsic score functions to use.
+            The target completions to compare the LLMCoder results to.
         store : bool, optional
             Whether to store the analysis in the database, by default False
         verbose : bool, optional
@@ -360,7 +367,7 @@
         Returns
         -------
         dict[str, dict[str, dict]]
-            The metrics for each result.
+            The metrics for each result with the repetition id as the key.
         """
         metrics_dict = {}
@@ -394,7 +401,7 @@ def load_results(self, config: Dynaconf, index: int | list[int] | None = None) -> dict[str, dict]:
         """
-        Read the results for a tuple from the database.
+        Read the evaluation results for the given configuration from a directory named after the configuration.

         Parameters
         ----------
@@ -406,7 +413,7 @@
         Returns
         -------
         dict[str, dict]
-            The results from the database.
+            The selected results from the evaluation.
         """
         config_name = os.path.splitext(os.path.split(config.settings_file_for_dynaconf[0])[-1])[0]
@@ -427,7 +434,7 @@ def _write_metrics(self, config: Dynaconf, result_repitition_id: str, metrics: d
         """
-        Write the analysis results back to the database.
+        Write the analysis results to a file.

         Parameters
         ----------
@@ -436,7 +443,7 @@
         result_repitition_id : str
             The id of the result repitition.
         metrics : dict
-            The analysis results to write back to the database.
+            The analysis results to write to a file.
         """
         # Create a dataframe from the analysis results
         df = pd.DataFrame.from_dict(metrics, orient='index')
diff --git a/src/llmcoder/eval/metrics/extrinsic.py b/src/llmcoder/eval/metrics/extrinsic.py
index 2caf62c..341a49b 100644
--- a/src/llmcoder/eval/metrics/extrinsic.py
+++ b/src/llmcoder/eval/metrics/extrinsic.py
@@ -1,3 +1,4 @@
+import textwrap
 import warnings
 from difflib import SequenceMatcher

@@ -181,18 +182,19 @@ def sequence_matcher_score(ground_truth: str, llmcoder_result: dict | str) -> fl

 def _user_prompt_template(code_1: str, code_2: str, qualities_list: list[str] | None) -> str:
     quality_list_string = '\n'.join([f'- {q}' for q in qualities_list]) if qualities_list is not None else ''

-    return f"""Assess and compare these two code snippets and evaluate the completions. Do your own analysis and also consider the following criteria:
-{quality_list_string}
-CODE 1:
-```python
-{code_1}
-```
-CODE 2:
-```python
-{code_2}
-```"""
+    return textwrap.dedent("""\
+        Assess and compare these two code snippets and evaluate the completions. Do your own analysis and also consider the following criteria:
+        {quality_list_string}
+        CODE 1:
+        ```python
+        {code_1}
+        ```
+        CODE 2:
+        ```python
+        {code_2}
+        ```""").format(quality_list_string=quality_list_string, code_1=code_1, code_2=code_2)


 def _get_scores(ground_truth: str, completion: str, system_prompt_compare: str, qualities_list: list[str] | None = None, model: str = "gpt-3.5-turbo", max_iter: int = 5) -> float:
@@ -249,24 +251,25 @@ def gpt_reviewer_score(ground_truth: str, llmcoder_result: dict | str, model: st
         The similarity between the two strings. Positive if the completion is better than the ground truth, negative otherwise.
     """
-    system_prompt_compare = """You are a data scientist tasked with comparing and evaluating code completions made by a language model.
-The user will submit two code snippets with the same beginning but different completions.
-Given these snippets, you evaluate the completions in a concise way, and give each a score between 0 and 10, with 0 being the worst (unusable completion that would make a developer frustrated) and 10 being the best (perfect completion that would make a developer happy).
-The user may ask you to prioritize different qualities of the code.
-Take these priorities into account when scoring the completions.
-Your output must always have the following format:
-```
-COMPARISON:
-
-SCORE 1:
-SCORE 2:
-```
-
-Do not include any other information in your output.
-It is very important that the output following "SCORE 1: " and "SCORE 2: " is a single integer between 0 and 10, with no other characters or spaces since scores will later be parsed at this exact location.
-Therefore, make sure to keep your comparison (the text after COMPARISON:) concise, and adhere to the score format exactly.
-"""
+    system_prompt_compare = textwrap.dedent("""\
+        You are a data scientist tasked with comparing and evaluating code completions made by a language model.
+        The user will submit two code snippets with the same beginning but different completions.
+        Given these snippets, you evaluate the completions in a concise way, and give each a score between 0 and 10, with 0 being the worst (unusable completion that would make a developer frustrated) and 10 being the best (perfect completion that would make a developer happy).
+        The user may ask you to prioritize different qualities of the code.
+        Take these priorities into account when scoring the completions.
+        Your output must always have the following format:
+        ```
+        COMPARISON:
+
+        SCORE 1:
+        SCORE 2:
+        ```
+
+        Do not include any other information in your output.
+        It is very important that the output following "SCORE 1: " and "SCORE 2: " is a single integer between 0 and 10, with no other characters or spaces since scores will later be parsed at this exact location.
+        Therefore, make sure to keep your comparison (the text after COMPARISON:) concise, and adhere to the score format exactly.
+ """) if qualities_list is None: qualities_list = [ diff --git a/src/llmcoder/llmcoder.py b/src/llmcoder/llmcoder.py index 1193c16..2f4ee4b 100644 --- a/src/llmcoder/llmcoder.py +++ b/src/llmcoder/llmcoder.py @@ -24,7 +24,7 @@ class LLMCoder: model_feedback : str, optional The model to use for the feedback loop, by default "ft:gpt-3.5-turbo-1106:personal::8LCi9Q0d" feedback_variant : str, optional - The feedback variant to use, by default "coworker" + The feedback variant to use, one of ["separate", "coworker"], by default "coworker", which enables a shared context for the analyzers system_prompt : str, optional The system prompt to use, by default the one used for preprocessing and fine-tuning max_iter : int, optional @@ -47,7 +47,7 @@ def __init__( system_prompt: str | None = None, max_iter: int = 10, backtracking: bool = True, - log_conversation: bool = True, + log_conversation: bool = False, n_procs: int = 1, verbose: bool = True) -> None: @@ -102,7 +102,7 @@ def __init__( def _get_best_completion(self, conversations: list[Conversation]) -> str: """ - Get the best completion from the provided conversations + Get the best completion from the provided conversations measured by their `score` Parameters ---------- @@ -128,14 +128,14 @@ def complete(self, code: str, temperature: float = 0.7, meta_temperature: float temperature : float, optional The temperature to use for the completion, by default 0.7 meta_temperature : float, optional - The temperature to use for choosing the most promising conversation, by default 0.1 + The temperature to use for choosing the most promising conversation, by default 0.0 n : int, optional The number of choices to generate, by default 1 Returns ------- str - The completed code + The code completion """ # Reset the feedback loop and internal variables self._reset_loop() @@ -182,6 +182,7 @@ def _create_conversation_file(cls) -> str: str The path to the conversation file """ + # FIXME: Add support for backtracking and graph mode return os.path.join(get_conversations_dir(create=True), f"{datetime.now()}.jsonl") def _is_bad_completion(self, completion: str) -> bool: @@ -192,7 +193,7 @@ def _is_bad_completion(self, completion: str) -> bool: Parameters ---------- completion : str - The completion to check + The completion to check against the existing conversations Returns ------- @@ -209,7 +210,7 @@ def _is_bad_completion(self, completion: str) -> bool: def _get_completions_for( self, conversation: Conversation, - model: str = 'gpt-3.5-turbo', + model: str = 'ft:gpt-3.5-turbo-1106:personal::8LCi9Q0d', temperature: float = 0.7, n: int = 1, max_retries: int = 5, @@ -223,15 +224,15 @@ def _get_completions_for( Parameters ---------- conversation: Conversation - Tuple in the priority queue. Contains the completion/code over which the model will complete. + The conversation to get completions for. 
Usually the most promising conversation from the priority queue model : str, optional - The model to use for the completion, by default 'gpt-3.5-turbo' + The model to use for the completion, by default 'ft:gpt-3.5-turbo-1106:personal::8LCi9Q0d' temperature : float, optional The temperature to use for the completion, by default 0.7 n : int, optional The number of choices to generate, by default 1 max_retries : int, optional - The maximum number of retries to get a valid completion, by default 5 + The maximum number of retries to get a valid completion due to repeated mistakes or duplicates, by default 5 delta_temperature : float, optional The amount to increase the temperature in case of repeated mistakes, by default 0.2 max_temperature : float, optional @@ -368,14 +369,14 @@ def _run_analyzers(self, code: str, completion: str) -> dict[str, dict]: Parameters ---------- code : str - The code to analyze + The beginning of the code completion : str - The completion to analyze + The completion of the code to analyze Returns ------- dict[str, dict] - The analyzer results + The analyzer results with the analyzer names as keys and the results as values """ analyzer_results: dict[str, dict] = {} @@ -407,7 +408,7 @@ def _feedback_prompt_template(self, result_messages: list[str]) -> str: Parameters ---------- result_messages : list[str] - The analyzer result messages + The analyzer result messages, typically obtained by concatenating the `message` field of the analyzer results Returns ------- @@ -418,8 +419,11 @@ def _feedback_prompt_template(self, result_messages: list[str]) -> str: def _step(self, code: str, temperature: float = 0.7, meta_temperature: float = 0.0, n: int = 1) -> None: """ - Complete the provided code with the OpenAI model and feedback, if available - Make choice on highest scored snippet through PriorityQueue.pop(). + Run one step of the feedback loop, including + - getting and duplicating the most promising conversation from the priority queue + - adding the user's code to the conversation + - getting completions for the conversation with `LLMCoder._get_completions_for` + - adding the completions to the priority queue Parameters ---------- @@ -428,7 +432,7 @@ def _step(self, code: str, temperature: float = 0.7, meta_temperature: float = 0 temperature : float, optional The temperature to use for the completion, by default 0.7 meta_temperature : float, optional - The temperature to use for choosing the most promising conversation, by default 0.1 + The temperature to use for choosing the most promising conversation, by default 0.0 n : int, optional The number of choices to generate, by default 1 """ diff --git a/src/llmcoder/utils.py b/src/llmcoder/utils.py index 9a6ef35..c5c0f84 100644 --- a/src/llmcoder/utils.py +++ b/src/llmcoder/utils.py @@ -8,7 +8,7 @@ def get_data_dir(*args: str, create: bool = False) -> str: Parameters ---------- args : str - The path to the data directory. + The path to the data directory in /data. create : bool Whether to create the directory if it does not exist. @@ -34,7 +34,7 @@ def get_config_dir(*args: str, create: bool = False) -> str: Parameters ---------- args : str - The path to the configs directory. + The path to the configs directory in /configs. create : bool Whether to create the directory if it does not exist. @@ -55,7 +55,7 @@ def get_config_dir(*args: str, create: bool = False) -> str: def get_openai_key(key: str = "") -> str: """ - Get OpenAI API key. Try to interpret the key as a key first, then as a path to a file containing the key. 
+    Get OpenAI API key. Try to interpret the `key` parameter as a key first, then as a path to a file containing the key.
     Finally, fall back to the default key.txt file or the OPENAI_KEY environment variable.

     Parameters
     ----------
@@ -92,7 +92,7 @@ def get_github_access_token(token: str = "") -> str:
     """
-    Get GitHub access token. Try to interpret the token as a token first, then as a path to a file containing the token.
+    Get GitHub access token. Try to interpret the `token` parameter as a token first, then as a path to a file containing the token.
     Finally, fall back to the default token.txt file or the GITHUB_ACCESS_TOKEN environment variable.

     Parameters
     ----------
@@ -151,7 +151,7 @@ def get_system_prompt(name: str = "2023-11-15_GPT-Builder.txt") -> str:

 def get_system_prompt_dir(*args: str, create: bool = False) -> str:
     """
-    Get the path to the system prompts directory.
+    Get the path to the system prompts directory in /system_prompts.

     Parameters
     ----------
@@ -177,7 +177,7 @@ def get_conversations_dir(*args: str, create: bool = False) -> str:
     """
-    Get the path to the log directory.
+    Get the path to the log directory in /conversations.

     Parameters
     ----------
diff --git a/system_prompts/2023-12-09_Scorer_v1.1.txt b/system_prompts/2023-12-09_Scorer_v1.1.txt
index ed02602..c905f66 100644
--- a/system_prompts/2023-12-09_Scorer_v1.1.txt
+++ b/system_prompts/2023-12-09_Scorer_v1.1.txt
@@ -1,7 +1,7 @@
 "You are a scoring system for python code that meticulously analyzes a code snippet and judges its quality in given categories.
 You are given a python code snippet and asked to score it based on the following categories:
 - the code quality: How well the code conforms to the python style guide
-- the plausibliity of the last few lines: How much sense the last few lines make
+- the plausibility of the last few lines: How much sense the last few lines make
 - the consistency of the last few lines: How good the last few lines fit to the beginning and middle of the code
 - the readability of the code: How easy it is to understand the code
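For reference, the prompt templates rewritten above rely on `textwrap.dedent`, which only strips whitespace that is common to every non-blank line: if the first line of the triple-quoted string carries no indentation, nothing is removed at all. The sketch below is illustrative only (the snippet and variable names are hypothetical, not taken from the repository); it shows why the first line is escaped with a backslash and why a multi-line value is substituted via `str.format` only after dedenting.

```python
import textwrap

# Hypothetical value used only for this illustration.
code_1 = "def add(a, b):\n    return a + b"

# The first line has no margin, so the common leading whitespace is "" and
# dedent() returns the string unchanged: the 4-space indentation leaks into
# the prompt.
broken = textwrap.dedent(f"""Compare the snippets below.
    CODE 1:
    {code_1}
    """)

# Escaping the first newline and indenting every line gives dedent() a real
# margin to strip; substituting the multi-line value afterwards keeps it from
# breaking the margin detection.
fixed = textwrap.dedent("""\
    Compare the snippets below.
    CODE 1:
    {code_1}
    """).format(code_1=code_1)

print(repr(broken))
print(repr(fixed))
```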